1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain
47 #define NLBLOCK md /* Block containing newline information */
48 #define PSSTART start_subject /* Field containing processed string start */
49 #define PSEND end_subject /* Field containing processed string end */
51 #include "pcre_internal.h"
54 /* For use to indent debugging output */
60 /*************************************************
61 * Code parameters and static tables *
62 *************************************************/
64 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
65 into others, under special conditions. A gap of 20 between the blocks should be
66 enough. The resulting opcodes don't have to be less than 256 because they are
67 never stored, so we push them well clear of the normal opcodes. */
69 #define OP_PROP_EXTRA 300 /* Added to codevalue when a Type-repeat's argument is OP_PROP (the negated form appears to share it - confirm) */
70 #define OP_EXTUNI_EXTRA 320 /* Added to codevalue when a Type-repeat's argument is OP_EXTUNI */
71 #define OP_ANYNL_EXTRA 340 /* Added to codevalue when a Type-repeat's argument is OP_ANYNL */
72 #define OP_HSPACE_EXTRA 360 /* Added to codevalue when a Type-repeat's argument is OP_HSPACE */
73 #define OP_VSPACE_EXTRA 380 /* Added to codevalue when a Type-repeat's argument is OP_VSPACE */
76 /* This table identifies those opcodes that are followed immediately by a
77 character that is to be tested in some way. This makes it possible to
78 centralize the loading of these characters. In the case of Type * etc, the
79 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
80 small value. ***NOTE*** If the start of this table is modified, the two tables
81 that follow must also be modified. */
83 static uschar coptable[] = {
85 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
86 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
87 0, 0, /* Any, Anybyte */
88 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
89 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
90 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
94 /* Positive single-char repeats */
95 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
96 3, 3, 3, /* upto, minupto, exact */
97 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
98 /* Negative single-char repeats - only for chars < 256 */
99 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
100 3, 3, 3, /* NOT upto, minupto, exact */
101 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
102 /* Positive type repeats */
103 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
104 3, 3, 3, /* Type upto, minupto, exact */
105 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
106 /* Character class & ref repeats */
107 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
108 0, 0, /* CRRANGE, CRMINRANGE */
111 0, /* XCLASS - variable length */
121 0, /* Assert behind */
122 0, /* Assert behind not */
124 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
125 0, 0, 0, /* SBRA, SCBRA, SCOND */
129 0, 0 /* BRAZERO, BRAMINZERO */
132 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
135 static uschar toptable1[] = {
137 ctype_digit, ctype_digit,
138 ctype_space, ctype_space,
139 ctype_word, ctype_word,
143 static uschar toptable2[] = {
152 /* Structure for holding data about a particular state, which is in effect the
153 current data for an active path through the match tree. It must consist
154 entirely of ints because the working vector we are passed, and which we put
155 these structures in, is a vector of ints. */
157 typedef struct stateblock {
158 int offset; /* Offset to opcode */
159 int count; /* Count for repeats */
160 int ims; /* ims flag bits */
161 int data; /* Some use extra data */
164 #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int)) /* Number of ints one stateblock occupies; used to convert wscount from ints to states per vector */
168 /*************************************************
169 * Print character string *
170 *************************************************/
172 /* Character string printing function for debugging.
176 length number of bytes
183 pchars(unsigned char *p, int length, FILE *f)
188 if (isprint(c = *(p++)))
191 fprintf(f, "\\x%02x", c);
198 /*************************************************
199 * Execute a Regular Expression - DFA engine *
200 *************************************************/
202 /* This internal function applies a compiled pattern to a subject string,
203 starting at a given point, using a DFA engine. This function is called from the
204 external one, possibly multiple times if the pattern is not anchored. The
205 function calls itself recursively for some kinds of subpattern.
208 md the match_data block with fixed information
209 this_start_code the opening bracket of this subexpression's code
210 current_subject where we currently are in the subject string
211 start_offset start offset in the subject string
212 offsets vector to contain the matching string offsets
213 offsetcount size of same
214 workspace vector of workspace
216 ims the current ims flags
217 rlevel function call recursion level
218 recursing regex recursive call level
222 -1 => failed to match
223 < -1 => some kind of unexpected problem
225 The following macros are used for adding states to the two state vectors (one
226 for the current character, one for the following character). */
228 #define ADD_ACTIVE(x,y) \
229 if (active_count++ < wscount) \
231 next_active_state->offset = (x); \
232 next_active_state->count = (y); \
233 next_active_state->ims = ims; \
234 next_active_state++; \
235 DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
237 else return PCRE_ERROR_DFA_WSSIZE
239 #define ADD_ACTIVE_DATA(x,y,z) \
240 if (active_count++ < wscount) \
242 next_active_state->offset = (x); \
243 next_active_state->count = (y); \
244 next_active_state->ims = ims; \
245 next_active_state->data = (z); \
246 next_active_state++; \
247 DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
249 else return PCRE_ERROR_DFA_WSSIZE
251 #define ADD_NEW(x,y) \
252 if (new_count++ < wscount) \
254 next_new_state->offset = (x); \
255 next_new_state->count = (y); \
256 next_new_state->ims = ims; \
258 DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
260 else return PCRE_ERROR_DFA_WSSIZE
262 #define ADD_NEW_DATA(x,y,z) \
263 if (new_count++ < wscount) \
265 next_new_state->offset = (x); \
266 next_new_state->count = (y); \
267 next_new_state->ims = ims; \
268 next_new_state->data = (z); \
270 DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
272 else return PCRE_ERROR_DFA_WSSIZE
274 /* And now, here is the code */
279 const uschar *this_start_code,
280 const uschar *current_subject,
290 stateblock *active_states, *new_states, *temp_states;
291 stateblock *next_active_state, *next_new_state;
293 const uschar *ctypes, *lcc, *fcc;
295 const uschar *end_code, *first_op;
297 int active_count, new_count, match_count;
299 /* Some fields in the md block are frequently referenced, so we load them into
300 independent variables in the hope that this will perform better. */
302 const uschar *start_subject = md->start_subject;
303 const uschar *end_subject = md->end_subject;
304 const uschar *start_code = md->start_code;
307 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
316 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
317 (2 * INTS_PER_STATEBLOCK);
319 DPRINTF(("\n%.*s---------------------\n"
320 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
321 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
323 ctypes = md->tables + ctypes_offset;
324 lcc = md->tables + lcc_offset;
325 fcc = md->tables + fcc_offset;
327 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
329 active_states = (stateblock *)(workspace + 2);
330 next_new_state = new_states = active_states + wscount;
333 first_op = this_start_code + 1 + LINK_SIZE +
334 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
336 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
337 the alternative states onto the list, and find out where the end is. This
338 makes it possible to use this function recursively, when we want to stop at a
339 matching internal ket rather than at the end.
341 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
342 a backward assertion. In that case, we have to find out the maximum amount to
343 move back, and set up each alternative appropriately. */
345 if (*first_op == OP_REVERSE)
350 end_code = this_start_code;
353 int back = GET(end_code, 2+LINK_SIZE);
354 if (back > max_back) max_back = back;
355 end_code += GET(end_code, 1);
357 while (*end_code == OP_ALT);
359 /* If we can't go back the amount required for the longest lookbehind
360 pattern, go back as far as we can; some alternatives may still be viable. */
363 /* In character mode we have to step back character by character */
367 for (gone_back = 0; gone_back < max_back; gone_back++)
369 if (current_subject <= start_subject) break;
371 while (current_subject > start_subject &&
372 (*current_subject & 0xc0) == 0x80)
379 /* In byte-mode we can do this quickly. */
382 gone_back = (current_subject - max_back < start_subject)?
383 current_subject - start_subject : max_back;
384 current_subject -= gone_back;
387 /* Now we can process the individual branches. */
389 end_code = this_start_code;
392 int back = GET(end_code, 2+LINK_SIZE);
393 if (back <= gone_back)
395 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
396 ADD_NEW_DATA(-bstate, 0, gone_back - back);
398 end_code += GET(end_code, 1);
400 while (*end_code == OP_ALT);
403 /* This is the code for a "normal" subpattern (not a backward assertion). The
404 start of a whole pattern is always one of these. If we are at the top level,
405 we may be asked to restart matching from the same point that we reached for a
406 previous partial match. We still have to scan through the top-level branches to
407 find the end state. */
411 end_code = this_start_code;
415 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
417 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
418 new_count = workspace[1];
420 memcpy(new_states, active_states, new_count * sizeof(stateblock));
427 int length = 1 + LINK_SIZE +
428 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
431 ADD_NEW(end_code - start_code + length, 0);
432 end_code += GET(end_code, 1);
433 length = 1 + LINK_SIZE;
435 while (*end_code == OP_ALT);
439 workspace[0] = 0; /* Bit indicating which vector is current */
441 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
443 /* Loop for scanning the subject */
445 ptr = current_subject;
452 /* Make the new state list into the active state list and empty the
455 temp_states = active_states;
456 active_states = new_states;
457 new_states = temp_states;
458 active_count = new_count;
461 workspace[0] ^= 1; /* Remember for the restarting feature */
462 workspace[1] = active_count;
465 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
466 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
469 printf("%.*sActive states: ", rlevel*2-2, SP);
470 for (i = 0; i < active_count; i++)
471 printf("%d/%d ", active_states[i].offset, active_states[i].count);
475 /* Set the pointers for adding new states */
477 next_active_state = active_states + active_count;
478 next_new_state = new_states;
480 /* Load the current character from the subject outside the loop, as many
481 different states may want to look at it, and we assume that at least one
484 if (ptr < end_subject)
486 clen = 1; /* Number of bytes in the character */
488 if (utf8) { GETCHARLEN(c, ptr, clen); } else
489 #endif /* SUPPORT_UTF8 */
494 clen = 0; /* This indicates the end of the subject */
495 c = NOTACHAR; /* This value should never actually be used */
498 /* Scan up the active states and act on each one. The result of an action
499 may be to add more states to the currently active list (e.g. on hitting a
500 parenthesis) or it may be to put states on the new list, for considering
501 when we move the character pointer on. */
503 for (i = 0; i < active_count; i++)
505 stateblock *current_state = active_states + i;
507 int state_offset = current_state->offset;
508 int count, codevalue;
510 int chartype, script;
514 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
515 if (clen == 0) printf("EOL\n");
516 else if (c > 32 && c < 127) printf("'%c'\n", c);
517 else printf("0x%02x\n", c);
520 /* This variable is referred to implicitly in the ADD_xxx macros. */
522 ims = current_state->ims;
524 /* A negative offset is a special case meaning "hold off going to this
525 (negated) state until the number of characters in the data field have
528 if (state_offset < 0)
530 if (current_state->data > 0)
532 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
533 ADD_NEW_DATA(state_offset, current_state->count,
534 current_state->data - 1);
539 current_state->offset = state_offset = -state_offset;
543 /* Check for a duplicate state with the same count, and skip if found. */
545 for (j = 0; j < i; j++)
547 if (active_states[j].offset == state_offset &&
548 active_states[j].count == current_state->count)
550 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
551 goto NEXT_ACTIVE_STATE;
555 /* The state offset is the offset to the opcode */
557 code = start_code + state_offset;
560 /* If this opcode is followed by an inline character, load it. It is
561 tempting to test for the presence of a subject character here, but that
562 is wrong, because sometimes zero repetitions of the subject are
565 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
566 argument that is not a data character - but is always one byte long. We
567 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
568 this case. To keep the other cases fast, convert these ones to new opcodes.
571 if (coptable[codevalue] > 0)
575 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
576 #endif /* SUPPORT_UTF8 */
577 d = code[coptable[codevalue]];
578 if (codevalue >= OP_TYPESTAR)
582 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
584 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
585 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
586 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
588 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
590 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
597 dlen = 0; /* Not strictly necessary, but compilers moan */
598 d = NOTACHAR; /* if these variables are not set. */
602 /* Now process the individual opcodes */
607 /* ========================================================================== */
608 /* Reached a closing bracket. If not at the end of the pattern, carry
609 on with the next opcode. Otherwise, unless we have an empty string and
610 PCRE_NOTEMPTY is set, save the match data, shifting up all previous
611 matches so we always have the longest first. */
616 if (code != end_code)
618 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
619 if (codevalue != OP_KET)
621 ADD_ACTIVE(state_offset - GET(code, 1), 0);
624 else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
626 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
627 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
629 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
630 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
631 if (offsetcount >= 2)
633 offsets[0] = current_subject - start_subject;
634 offsets[1] = ptr - start_subject;
635 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
636 offsets[1] - offsets[0], current_subject));
638 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
640 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
641 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
642 match_count, rlevel*2-2, SP));
648 /* ========================================================================== */
649 /* These opcodes add to the current list of states without looking
650 at the current character. */
652 /*-----------------------------------------------------------------*/
654 do { code += GET(code, 1); } while (*code == OP_ALT);
655 ADD_ACTIVE(code - start_code, 0);
658 /*-----------------------------------------------------------------*/
663 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
664 code += GET(code, 1);
666 while (*code == OP_ALT);
669 /*-----------------------------------------------------------------*/
672 ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
673 code += GET(code, 1);
674 while (*code == OP_ALT)
676 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
677 code += GET(code, 1);
681 /*-----------------------------------------------------------------*/
684 ADD_ACTIVE(state_offset + 1, 0);
685 code += 1 + GET(code, 2);
686 while (*code == OP_ALT) code += GET(code, 1);
687 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
690 /*-----------------------------------------------------------------*/
692 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
693 ((ims & PCRE_MULTILINE) != 0 &&
694 ptr != end_subject &&
696 { ADD_ACTIVE(state_offset + 1, 0); }
699 /*-----------------------------------------------------------------*/
701 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
704 /*-----------------------------------------------------------------*/
707 ADD_ACTIVE(state_offset + 2, 0);
710 /*-----------------------------------------------------------------*/
712 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
715 /*-----------------------------------------------------------------*/
717 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
721 /* ========================================================================== */
722 /* These opcodes inspect the next subject character, and sometimes
723 the previous one as well, but do not have an argument. The variable
724 clen contains the length of the current character and is zero if we are
725 at the end of the subject. */
727 /*-----------------------------------------------------------------*/
729 if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
730 { ADD_NEW(state_offset + 1, 0); }
733 /*-----------------------------------------------------------------*/
735 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
736 { ADD_ACTIVE(state_offset + 1, 0); }
739 /*-----------------------------------------------------------------*/
741 if ((md->moptions & PCRE_NOTEOL) == 0)
745 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
747 { ADD_ACTIVE(state_offset + 1, 0); }
749 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
750 { ADD_ACTIVE(state_offset + 1, 0); }
753 /*-----------------------------------------------------------------*/
758 if (clen > 0 && c < 256 &&
759 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
760 { ADD_NEW(state_offset + 1, 0); }
763 /*-----------------------------------------------------------------*/
765 case OP_NOT_WHITESPACE:
766 case OP_NOT_WORDCHAR:
767 if (clen > 0 && (c >= 256 ||
768 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
769 { ADD_NEW(state_offset + 1, 0); }
772 /*-----------------------------------------------------------------*/
773 case OP_WORD_BOUNDARY:
774 case OP_NOT_WORD_BOUNDARY:
776 int left_word, right_word;
778 if (ptr > start_subject)
780 const uschar *temp = ptr - 1;
782 if (utf8) BACKCHAR(temp);
784 GETCHARTEST(d, temp);
785 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
789 if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
792 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
793 { ADD_ACTIVE(state_offset + 1, 0); }
798 /*-----------------------------------------------------------------*/
799 /* Check the next character by Unicode property. We will get here only
800 if the support is in the binary; otherwise a compile-time error occurs.
809 int category = _pcre_ucp_findprop(c, &chartype, &script);
817 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
821 OK = category == code[2];
825 OK = chartype == code[2];
829 OK = script == code[2];
832 /* Should never occur, but keep compilers from grumbling. */
835 OK = codevalue != OP_PROP;
839 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
846 /* ========================================================================== */
847 /* These opcodes likewise inspect the subject character, but have an
848 argument that is not a data character. It is one of these opcodes:
849 OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE, OP_WORDCHAR,
850 OP_NOT_WORDCHAR. The value is loaded into d. */
855 count = current_state->count; /* Already matched */
856 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
859 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
862 (ims & PCRE_DOTALL) != 0 ||
865 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
867 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
869 active_count--; /* Remove non-match possibility */
873 ADD_NEW(state_offset, count);
878 /*-----------------------------------------------------------------*/
880 case OP_TYPEMINQUERY:
881 case OP_TYPEPOSQUERY:
882 ADD_ACTIVE(state_offset + 2, 0);
885 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
888 (ims & PCRE_DOTALL) != 0 ||
891 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
893 if (codevalue == OP_TYPEPOSQUERY)
895 active_count--; /* Remove non-match possibility */
898 ADD_NEW(state_offset + 2, 0);
903 /*-----------------------------------------------------------------*/
907 ADD_ACTIVE(state_offset + 2, 0);
910 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
913 (ims & PCRE_DOTALL) != 0 ||
916 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
918 if (codevalue == OP_TYPEPOSSTAR)
920 active_count--; /* Remove non-match possibility */
923 ADD_NEW(state_offset, 0);
928 /*-----------------------------------------------------------------*/
930 count = current_state->count; /* Number already matched */
933 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
936 (ims & PCRE_DOTALL) != 0 ||
939 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
941 if (++count >= GET2(code, 1))
942 { ADD_NEW(state_offset + 4, 0); }
944 { ADD_NEW(state_offset, count); }
949 /*-----------------------------------------------------------------*/
953 ADD_ACTIVE(state_offset + 4, 0);
954 count = current_state->count; /* Number already matched */
957 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
960 (ims & PCRE_DOTALL) != 0 ||
963 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
965 if (codevalue == OP_TYPEPOSUPTO)
967 active_count--; /* Remove non-match possibility */
970 if (++count >= GET2(code, 1))
971 { ADD_NEW(state_offset + 4, 0); }
973 { ADD_NEW(state_offset, count); }
978 /* ========================================================================== */
979 /* These are virtual opcodes that are used when something like
980 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or one of the
981 \H, \h, \V, \v opcodes as its argument. This keeps the code above fast for
982 the other cases. The argument is in the d variable. */
985 case OP_PROP_EXTRA + OP_TYPEPLUS:
986 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
987 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
988 count = current_state->count; /* Already matched */
989 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
993 int category = _pcre_ucp_findprop(c, &chartype, &script);
1001 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1005 OK = category == code[3];
1009 OK = chartype == code[3];
1013 OK = script == code[3];
1016 /* Should never occur, but keep compilers from grumbling. */
1019 OK = codevalue != OP_PROP;
1023 if (OK == (d == OP_PROP))
1025 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1027 active_count--; /* Remove non-match possibility */
1028 next_active_state--;
1031 ADD_NEW(state_offset, count);
1036 /*-----------------------------------------------------------------*/
1037 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1038 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1039 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1040 count = current_state->count; /* Already matched */
1041 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1042 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1044 const uschar *nptr = ptr + clen;
1046 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1048 active_count--; /* Remove non-match possibility */
1049 next_active_state--;
1051 while (nptr < end_subject)
1055 GETCHARLEN(nd, nptr, ndlen);
1056 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1061 ADD_NEW_DATA(-state_offset, count, ncount);
1066 /*-----------------------------------------------------------------*/
1067 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1068 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1069 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1070 count = current_state->count; /* Already matched */
1071 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1078 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1086 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1088 active_count--; /* Remove non-match possibility */
1089 next_active_state--;
1092 ADD_NEW_DATA(-state_offset, count, ncount);
1100 /*-----------------------------------------------------------------*/
1101 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1102 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1103 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1104 count = current_state->count; /* Already matched */
1105 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1126 if (OK == (d == OP_VSPACE))
1128 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1130 active_count--; /* Remove non-match possibility */
1131 next_active_state--;
1134 ADD_NEW_DATA(-state_offset, count, 0);
1139 /*-----------------------------------------------------------------*/
1140 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1141 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1142 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1143 count = current_state->count; /* Already matched */
1144 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1151 case 0x20: /* SPACE */
1152 case 0xa0: /* NBSP */
1153 case 0x1680: /* OGHAM SPACE MARK */
1154 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1155 case 0x2000: /* EN QUAD */
1156 case 0x2001: /* EM QUAD */
1157 case 0x2002: /* EN SPACE */
1158 case 0x2003: /* EM SPACE */
1159 case 0x2004: /* THREE-PER-EM SPACE */
1160 case 0x2005: /* FOUR-PER-EM SPACE */
1161 case 0x2006: /* SIX-PER-EM SPACE */
1162 case 0x2007: /* FIGURE SPACE */
1163 case 0x2008: /* PUNCTUATION SPACE */
1164 case 0x2009: /* THIN SPACE */
1165 case 0x200A: /* HAIR SPACE */
1166 case 0x202f: /* NARROW NO-BREAK SPACE */
1167 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1168 case 0x3000: /* IDEOGRAPHIC SPACE */
1177 if (OK == (d == OP_HSPACE))
1179 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1181 active_count--; /* Remove non-match possibility */
1182 next_active_state--;
1185 ADD_NEW_DATA(-state_offset, count, 0);
1190 /*-----------------------------------------------------------------*/
1192 case OP_PROP_EXTRA + OP_TYPEQUERY:
1193 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1194 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1198 case OP_PROP_EXTRA + OP_TYPESTAR:
1199 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1200 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1205 ADD_ACTIVE(state_offset + 4, 0);
1209 int category = _pcre_ucp_findprop(c, &chartype, &script);
1217 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1221 OK = category == code[3];
1225 OK = chartype == code[3];
1229 OK = script == code[3];
1232 /* Should never occur, but keep compilers from grumbling. */
1235 OK = codevalue != OP_PROP;
1239 if (OK == (d == OP_PROP))
1241 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1242 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1244 active_count--; /* Remove non-match possibility */
1245 next_active_state--;
1247 ADD_NEW(state_offset + count, 0);
1252 /*-----------------------------------------------------------------*/
1253 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1254 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1255 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1259 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1260 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1261 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1266 ADD_ACTIVE(state_offset + 2, 0);
1267 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1269 const uschar *nptr = ptr + clen;
1271 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1272 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1274 active_count--; /* Remove non-match possibility */
1275 next_active_state--;
1277 while (nptr < end_subject)
1281 GETCHARLEN(nd, nptr, ndlen);
1282 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1286 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1291 /*-----------------------------------------------------------------*/
1292 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1293 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1294 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1298 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1299 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1300 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1304 ADD_ACTIVE(state_offset + 2, 0);
1311 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1319 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1320 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1322 active_count--; /* Remove non-match possibility */
1323 next_active_state--;
1325 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1333 /*-----------------------------------------------------------------*/
1334 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1335 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1336 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1340 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1341 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1342 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1346 ADD_ACTIVE(state_offset + 2, 0);
1366 if (OK == (d == OP_VSPACE))
1368 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1369 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1371 active_count--; /* Remove non-match possibility */
1372 next_active_state--;
1374 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1379 /*-----------------------------------------------------------------*/
1380 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1381 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1382 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1386 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1387 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1388 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1392 ADD_ACTIVE(state_offset + 2, 0);
1399 case 0x20: /* SPACE */
1400 case 0xa0: /* NBSP */
1401 case 0x1680: /* OGHAM SPACE MARK */
1402 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1403 case 0x2000: /* EN QUAD */
1404 case 0x2001: /* EM QUAD */
1405 case 0x2002: /* EN SPACE */
1406 case 0x2003: /* EM SPACE */
1407 case 0x2004: /* THREE-PER-EM SPACE */
1408 case 0x2005: /* FOUR-PER-EM SPACE */
1409 case 0x2006: /* SIX-PER-EM SPACE */
1410 case 0x2007: /* FIGURE SPACE */
1411 case 0x2008: /* PUNCTUATION SPACE */
1412 case 0x2009: /* THIN SPACE */
1413 case 0x200A: /* HAIR SPACE */
1414 case 0x202f: /* NARROW NO-BREAK SPACE */
1415 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1416 case 0x3000: /* IDEOGRAPHIC SPACE */
1425 if (OK == (d == OP_HSPACE))
1427 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1428 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1430 active_count--; /* Remove non-match possibility */
1431 next_active_state--;
1433 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1438 /*-----------------------------------------------------------------*/
1440 case OP_PROP_EXTRA + OP_TYPEEXACT:
1441 case OP_PROP_EXTRA + OP_TYPEUPTO:
1442 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1443 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1444 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1445 { ADD_ACTIVE(state_offset + 6, 0); }
1446 count = current_state->count; /* Number already matched */
1450 int category = _pcre_ucp_findprop(c, &chartype, &script);
1458 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1462 OK = category == code[5];
1466 OK = chartype == code[5];
1470 OK = script == code[5];
1473 /* Should never occur, but keep compilers from grumbling. */
1476 OK = codevalue != OP_PROP;
1480 if (OK == (d == OP_PROP))
1482 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1484 active_count--; /* Remove non-match possibility */
1485 next_active_state--;
1487 if (++count >= GET2(code, 1))
1488 { ADD_NEW(state_offset + 6, 0); }
1490 { ADD_NEW(state_offset, count); }
1495 /*-----------------------------------------------------------------*/
1496 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1497 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1498 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1499 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1500 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1501 { ADD_ACTIVE(state_offset + 4, 0); }
1502 count = current_state->count; /* Number already matched */
1503 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1505 const uschar *nptr = ptr + clen;
1507 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1509 active_count--; /* Remove non-match possibility */
1510 next_active_state--;
1512 while (nptr < end_subject)
1516 GETCHARLEN(nd, nptr, ndlen);
1517 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1521 if (++count >= GET2(code, 1))
1522 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1524 { ADD_NEW_DATA(-state_offset, count, ncount); }
1529 /*-----------------------------------------------------------------*/
1530 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1531 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1532 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1533 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1534 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1535 { ADD_ACTIVE(state_offset + 4, 0); }
1536 count = current_state->count; /* Number already matched */
1543 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1551 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1553 active_count--; /* Remove non-match possibility */
1554 next_active_state--;
1556 if (++count >= GET2(code, 1))
1557 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1559 { ADD_NEW_DATA(-state_offset, count, ncount); }
1567 /*-----------------------------------------------------------------*/
1568 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1569 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1570 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1571 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1572 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1573 { ADD_ACTIVE(state_offset + 4, 0); }
1574 count = current_state->count; /* Number already matched */
1594 if (OK == (d == OP_VSPACE))
1596 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1598 active_count--; /* Remove non-match possibility */
1599 next_active_state--;
1601 if (++count >= GET2(code, 1))
1602 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1604 { ADD_NEW_DATA(-state_offset, count, 0); }
1609 /*-----------------------------------------------------------------*/
1610 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1611 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1612 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1613 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1614 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1615 { ADD_ACTIVE(state_offset + 4, 0); }
1616 count = current_state->count; /* Number already matched */
1623 case 0x20: /* SPACE */
1624 case 0xa0: /* NBSP */
1625 case 0x1680: /* OGHAM SPACE MARK */
1626 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1627 case 0x2000: /* EN QUAD */
1628 case 0x2001: /* EM QUAD */
1629 case 0x2002: /* EN SPACE */
1630 case 0x2003: /* EM SPACE */
1631 case 0x2004: /* THREE-PER-EM SPACE */
1632 case 0x2005: /* FOUR-PER-EM SPACE */
1633 case 0x2006: /* SIX-PER-EM SPACE */
1634 case 0x2007: /* FIGURE SPACE */
1635 case 0x2008: /* PUNCTUATION SPACE */
1636 case 0x2009: /* THIN SPACE */
1637 case 0x200A: /* HAIR SPACE */
1638 case 0x202f: /* NARROW NO-BREAK SPACE */
1639 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1640 case 0x3000: /* IDEOGRAPHIC SPACE */
1649 if (OK == (d == OP_HSPACE))
1651 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1653 active_count--; /* Remove non-match possibility */
1654 next_active_state--;
1656 if (++count >= GET2(code, 1))
1657 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1659 { ADD_NEW_DATA(-state_offset, count, 0); }
1664 /* ========================================================================== */
1665 /* These opcodes are followed by a character that is usually compared
1666 to the current subject character; it is loaded into d. We still get
1667 here even if there is no subject character, because in some cases zero
1668 repetitions are permitted. */
1670 /*-----------------------------------------------------------------*/
1672 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1675 /*-----------------------------------------------------------------*/
1677 if (clen == 0) break;
1682 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1684 unsigned int othercase;
1685 if (c < 128) othercase = fcc[c]; else
1687 /* If we have Unicode property support, we can use it to test the
1688 other case of the character. */
1691 othercase = _pcre_ucp_othercase(c);
1693 othercase = NOTACHAR;
1696 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1700 #endif /* SUPPORT_UTF8 */
1702 /* Non-UTF-8 mode */
1704 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1710 /*-----------------------------------------------------------------*/
1711 /* This is a tricky one because it can match more than one character.
1712 Find out how many characters to skip, and then set up a negative state
1713 to wait for them to pass before continuing. */
1716 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1718 const uschar *nptr = ptr + clen;
1720 while (nptr < end_subject)
1723 GETCHARLEN(c, nptr, nclen);
1724 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1728 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1733 /*-----------------------------------------------------------------*/
1734 /* This is tricky, like EXTUNI, because it too can match more than one
1735 character (when CR is followed by LF). In this case, set up a negative
1736 state to wait for one character to pass before continuing. */
1739 if (clen > 0) switch(c)
1747 ADD_NEW(state_offset + 1, 0);
1750 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1752 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1756 ADD_NEW(state_offset + 1, 0);
1762 /*-----------------------------------------------------------------*/
1764 if (clen > 0) switch(c)
1776 ADD_NEW(state_offset + 1, 0);
1781 /*-----------------------------------------------------------------*/
1783 if (clen > 0) switch(c)
1792 ADD_NEW(state_offset + 1, 0);
1799 /*-----------------------------------------------------------------*/
1801 if (clen > 0) switch(c)
1804 case 0x20: /* SPACE */
1805 case 0xa0: /* NBSP */
1806 case 0x1680: /* OGHAM SPACE MARK */
1807 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1808 case 0x2000: /* EN QUAD */
1809 case 0x2001: /* EM QUAD */
1810 case 0x2002: /* EN SPACE */
1811 case 0x2003: /* EM SPACE */
1812 case 0x2004: /* THREE-PER-EM SPACE */
1813 case 0x2005: /* FOUR-PER-EM SPACE */
1814 case 0x2006: /* SIX-PER-EM SPACE */
1815 case 0x2007: /* FIGURE SPACE */
1816 case 0x2008: /* PUNCTUATION SPACE */
1817 case 0x2009: /* THIN SPACE */
1818 case 0x200A: /* HAIR SPACE */
1819 case 0x202f: /* NARROW NO-BREAK SPACE */
1820 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1821 case 0x3000: /* IDEOGRAPHIC SPACE */
1825 ADD_NEW(state_offset + 1, 0);
1830 /*-----------------------------------------------------------------*/
1832 if (clen > 0) switch(c)
1835 case 0x20: /* SPACE */
1836 case 0xa0: /* NBSP */
1837 case 0x1680: /* OGHAM SPACE MARK */
1838 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1839 case 0x2000: /* EN QUAD */
1840 case 0x2001: /* EM QUAD */
1841 case 0x2002: /* EN SPACE */
1842 case 0x2003: /* EM SPACE */
1843 case 0x2004: /* THREE-PER-EM SPACE */
1844 case 0x2005: /* FOUR-PER-EM SPACE */
1845 case 0x2006: /* SIX-PER-EM SPACE */
1846 case 0x2007: /* FIGURE SPACE */
1847 case 0x2008: /* PUNCTUATION SPACE */
1848 case 0x2009: /* THIN SPACE */
1849 case 0x200A: /* HAIR SPACE */
1850 case 0x202f: /* NARROW NO-BREAK SPACE */
1851 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1852 case 0x3000: /* IDEOGRAPHIC SPACE */
1853 ADD_NEW(state_offset + 1, 0);
1858 /*-----------------------------------------------------------------*/
1859 /* Match a negated single character. This is only used for one-byte
1860 characters, that is, we know that d < 256. The character we are
1861 checking (c) can be multibyte. */
1866 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1867 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1871 /*-----------------------------------------------------------------*/
1878 count = current_state->count; /* Already matched */
1879 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1882 unsigned int otherd = NOTACHAR;
1883 if ((ims & PCRE_CASELESS) != 0)
1886 if (utf8 && d >= 128)
1889 otherd = _pcre_ucp_othercase(d);
1890 #endif /* SUPPORT_UCP */
1893 #endif /* SUPPORT_UTF8 */
1896 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1899 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1901 active_count--; /* Remove non-match possibility */
1902 next_active_state--;
1905 ADD_NEW(state_offset, count);
1910 /*-----------------------------------------------------------------*/
1915 case OP_NOTMINQUERY:
1916 case OP_NOTPOSQUERY:
1917 ADD_ACTIVE(state_offset + dlen + 1, 0);
1920 unsigned int otherd = NOTACHAR;
1921 if ((ims & PCRE_CASELESS) != 0)
1924 if (utf8 && d >= 128)
1927 otherd = _pcre_ucp_othercase(d);
1928 #endif /* SUPPORT_UCP */
1931 #endif /* SUPPORT_UTF8 */
1934 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1936 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1938 active_count--; /* Remove non-match possibility */
1939 next_active_state--;
1941 ADD_NEW(state_offset + dlen + 1, 0);
1946 /*-----------------------------------------------------------------*/
1953 ADD_ACTIVE(state_offset + dlen + 1, 0);
1956 unsigned int otherd = NOTACHAR;
1957 if ((ims & PCRE_CASELESS) != 0)
1960 if (utf8 && d >= 128)
1963 otherd = _pcre_ucp_othercase(d);
1964 #endif /* SUPPORT_UCP */
1967 #endif /* SUPPORT_UTF8 */
1970 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1972 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1974 active_count--; /* Remove non-match possibility */
1975 next_active_state--;
1977 ADD_NEW(state_offset, 0);
1982 /*-----------------------------------------------------------------*/
1985 count = current_state->count; /* Number already matched */
1988 unsigned int otherd = NOTACHAR;
1989 if ((ims & PCRE_CASELESS) != 0)
1992 if (utf8 && d >= 128)
1995 otherd = _pcre_ucp_othercase(d);
1996 #endif /* SUPPORT_UCP */
1999 #endif /* SUPPORT_UTF8 */
2002 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2004 if (++count >= GET2(code, 1))
2005 { ADD_NEW(state_offset + dlen + 3, 0); }
2007 { ADD_NEW(state_offset, count); }
2012 /*-----------------------------------------------------------------*/
2019 ADD_ACTIVE(state_offset + dlen + 3, 0);
2020 count = current_state->count; /* Number already matched */
2023 unsigned int otherd = NOTACHAR;
2024 if ((ims & PCRE_CASELESS) != 0)
2027 if (utf8 && d >= 128)
2030 otherd = _pcre_ucp_othercase(d);
2031 #endif /* SUPPORT_UCP */
2034 #endif /* SUPPORT_UTF8 */
2037 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2039 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2041 active_count--; /* Remove non-match possibility */
2042 next_active_state--;
2044 if (++count >= GET2(code, 1))
2045 { ADD_NEW(state_offset + dlen + 3, 0); }
2047 { ADD_NEW(state_offset, count); }
2053 /* ========================================================================== */
2054 /* These are the class-handling opcodes */
2060 BOOL isinclass = FALSE;
2061 int next_state_offset;
2062 const uschar *ecode;
2064 /* For a simple class, there is always just a 32-byte table, and we
2065 can set isinclass from it. */
2067 if (codevalue != OP_XCLASS)
2072 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2073 ((code[1 + c/8] & (1 << (c&7))) != 0);
2077 /* An extended class may have a table or a list of single characters,
2078 ranges, or both, and it may be positive or negative. There's a
2079 function that sorts all this out. */
2083 ecode = code + GET(code, 1);
2084 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2087 /* At this point, isinclass is set for all kinds of class, and ecode
2088 points to the byte after the end of the class. If there is a
2089 quantifier, this is where it will be. */
2091 next_state_offset = ecode - start_code;
2097 ADD_ACTIVE(next_state_offset + 1, 0);
2098 if (isinclass) { ADD_NEW(state_offset, 0); }
2103 count = current_state->count; /* Already matched */
2104 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2105 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2110 ADD_ACTIVE(next_state_offset + 1, 0);
2111 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2116 count = current_state->count; /* Already matched */
2117 if (count >= GET2(ecode, 1))
2118 { ADD_ACTIVE(next_state_offset + 5, 0); }
2121 int max = GET2(ecode, 3);
2122 if (++count >= max && max != 0) /* Max 0 => no limit */
2123 { ADD_NEW(next_state_offset + 5, 0); }
2125 { ADD_NEW(state_offset, count); }
2130 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2136 /* ========================================================================== */
2137 /* These are the opcodes for fancy brackets of various kinds. We have
2138 to use recursion in order to handle them. */
2143 case OP_ASSERTBACK_NOT:
2146 int local_offsets[2];
2147 int local_workspace[1000];
2148 const uschar *endasscode = code + GET(code, 1);
2150 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2152 rc = internal_dfa_exec(
2153 md, /* static match data */
2154 code, /* this subexpression's code */
2155 ptr, /* where we currently are */
2156 ptr - start_subject, /* start offset */
2157 local_offsets, /* offset vector */
2158 sizeof(local_offsets)/sizeof(int), /* size of same */
2159 local_workspace, /* workspace vector */
2160 sizeof(local_workspace)/sizeof(int), /* size of same */
2161 ims, /* the current ims flags */
2162 rlevel, /* function recursion level */
2163 recursing); /* pass on regex recursion */
2165 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2166 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2170 /*-----------------------------------------------------------------*/
2174 int local_offsets[1000];
2175 int local_workspace[1000];
2176 int condcode = code[LINK_SIZE+1];
2178 /* Back reference conditions are not supported */
2180 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2182 /* The DEFINE condition is always false */
2184 if (condcode == OP_DEF)
2186 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2189 /* The only supported version of OP_RREF is for the value RREF_ANY,
2190 which means "test if in any recursion". We can't test for specifically
2193 else if (condcode == OP_RREF)
2195 int value = GET2(code, LINK_SIZE+2);
2196 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2197 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2198 else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2201 /* Otherwise, the condition is an assertion */
2206 const uschar *asscode = code + LINK_SIZE + 1;
2207 const uschar *endasscode = asscode + GET(asscode, 1);
2209 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2211 rc = internal_dfa_exec(
2212 md, /* fixed match data */
2213 asscode, /* this subexpression's code */
2214 ptr, /* where we currently are */
2215 ptr - start_subject, /* start offset */
2216 local_offsets, /* offset vector */
2217 sizeof(local_offsets)/sizeof(int), /* size of same */
2218 local_workspace, /* workspace vector */
2219 sizeof(local_workspace)/sizeof(int), /* size of same */
2220 ims, /* the current ims flags */
2221 rlevel, /* function recursion level */
2222 recursing); /* pass on regex recursion */
2225 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2226 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2228 { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2233 /*-----------------------------------------------------------------*/
2236 int local_offsets[1000];
2237 int local_workspace[1000];
2240 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2243 rc = internal_dfa_exec(
2244 md, /* fixed match data */
2245 start_code + GET(code, 1), /* this subexpression's code */
2246 ptr, /* where we currently are */
2247 ptr - start_subject, /* start offset */
2248 local_offsets, /* offset vector */
2249 sizeof(local_offsets)/sizeof(int), /* size of same */
2250 local_workspace, /* workspace vector */
2251 sizeof(local_workspace)/sizeof(int), /* size of same */
2252 ims, /* the current ims flags */
2253 rlevel, /* function recursion level */
2254 recursing + 1); /* regex recurse level */
2256 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2257 recursing + 1, rc));
2259 /* Ran out of internal offsets */
2261 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2263 /* For each successful matched substring, set up the next state with a
2264 count of characters to skip before trying it. Note that the count is in
2265 characters, not bytes. */
2269 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2271 const uschar *p = start_subject + local_offsets[rc];
2272 const uschar *pp = start_subject + local_offsets[rc+1];
2273 int charcount = local_offsets[rc+1] - local_offsets[rc];
2274 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2277 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2281 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2285 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2289 /*-----------------------------------------------------------------*/
2292 int local_offsets[2];
2293 int local_workspace[1000];
2295 int rc = internal_dfa_exec(
2296 md, /* fixed match data */
2297 code, /* this subexpression's code */
2298 ptr, /* where we currently are */
2299 ptr - start_subject, /* start offset */
2300 local_offsets, /* offset vector */
2301 sizeof(local_offsets)/sizeof(int), /* size of same */
2302 local_workspace, /* workspace vector */
2303 sizeof(local_workspace)/sizeof(int), /* size of same */
2304 ims, /* the current ims flags */
2305 rlevel, /* function recursion level */
2306 recursing); /* pass on regex recursion */
2310 const uschar *end_subpattern = code;
2311 int charcount = local_offsets[1] - local_offsets[0];
2312 int next_state_offset, repeat_state_offset;
2314 do { end_subpattern += GET(end_subpattern, 1); }
2315 while (*end_subpattern == OP_ALT);
2316 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2318 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2319 arrange for the repeat state also to be added to the relevant list.
2320 Calculate the offset, or set -1 for no repeat. */
2322 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2323 *end_subpattern == OP_KETRMIN)?
2324 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2326 /* If we have matched an empty string, add the next state at the
2327 current character pointer. This is important so that the duplicate
2328 checking kicks in, which is what breaks infinite loops that match an
2333 ADD_ACTIVE(next_state_offset, 0);
2336 /* Optimization: if there are no more active states, and there
2337 are no new states yet set up, then skip over the subject string
2338 right here, to save looping. Otherwise, set up the new state to swing
2339 into action when the end of the substring is reached. */
2341 else if (i + 1 >= active_count && new_count == 0)
2345 ADD_NEW(next_state_offset, 0);
2347 /* If we are adding a repeat state at the new character position,
2348 we must fudge things so that it is the only current state.
2349 Otherwise, it might be a duplicate of one we processed before, and
2350 that would cause it to be skipped. */
2352 if (repeat_state_offset >= 0)
2354 next_active_state = active_states;
2357 ADD_ACTIVE(repeat_state_offset, 0);
2362 const uschar *p = start_subject + local_offsets[0];
2363 const uschar *pp = start_subject + local_offsets[1];
2364 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2365 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2366 if (repeat_state_offset >= 0)
2367 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2371 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2376 /* ========================================================================== */
2377 /* Handle callouts */
2380 if (pcre_callout != NULL)
2383 pcre_callout_block cb;
2384 cb.version = 1; /* Version 1 of the callout block */
2385 cb.callout_number = code[1];
2386 cb.offset_vector = offsets;
2387 cb.subject = (PCRE_SPTR)start_subject;
2388 cb.subject_length = end_subject - start_subject;
2389 cb.start_match = current_subject - start_subject;
2390 cb.current_position = ptr - start_subject;
2391 cb.pattern_position = GET(code, 2);
2392 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2394 cb.capture_last = -1;
2395 cb.callout_data = md->callout_data;
2396 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2397 if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2402 /* ========================================================================== */
2403 default: /* Unsupported opcode */
2404 return PCRE_ERROR_DFA_UITEM;
2407 NEXT_ACTIVE_STATE: continue;
2409 } /* End of loop scanning active states */
2411 /* We have finished the processing at the current subject character. If no
2412 new states have been set for the next character, we have found all the
2413 matches that we are going to find. If we are at the top level and partial
2414 matching has been requested, check for appropriate conditions. */
2418 if (match_count < 0 && /* No matches found */
2419 rlevel == 1 && /* Top level match function */
2420 (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2421 ptr >= end_subject && /* Reached end of subject */
2422 ptr > current_subject) /* Matched non-empty string */
2424 if (offsetcount >= 2)
2426 offsets[0] = current_subject - start_subject;
2427 offsets[1] = end_subject - start_subject;
2429 match_count = PCRE_ERROR_PARTIAL;
2432 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2433 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2435 break; /* In effect, "return", but see the comment below */
2438 /* One or more states are active for the next character. */
2440 ptr += clen; /* Advance to next subject character */
2441 } /* Loop to move along the subject string */
2443 /* Control gets here from "break" a few lines above. We do it this way because
2444 if we use "return" above, we have compiler trouble. Some compilers warn if
2445 there's nothing here because they think the function doesn't return a value. On
2446 the other hand, if we put a dummy statement here, some more clever compilers
2447 complain that it can't be reached. Sigh. */
2455 /*************************************************
2456 * Execute a Regular Expression - DFA engine *
2457 *************************************************/
2459 /* This external function applies a compiled re to a subject string using a DFA
2460 engine. This function calls the internal function multiple times if the pattern
2464 argument_re points to the compiled expression
2465 extra_data points to extra data or is NULL
2466 subject points to the subject string
2467 length length of subject string (may contain binary zeros)
2468 start_offset where to start in the subject string
2470 offsets vector of match offsets
2471 offsetcount size of same
2472 workspace workspace vector
2473 wscount size of same
2475 Returns: > 0 => number of match offset pairs placed in offsets
2476 = 0 => offsets overflowed; longest matches are present
2477 -1 => failed to match
2478 < -1 => some kind of unexpected problem
2482 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2483 const char *subject, int length, int start_offset, int options, int *offsets,
2484 int offsetcount, int *workspace, int wscount)
2486 real_pcre *re = (real_pcre *)argument_re;
2487 dfa_match_data match_block;
2488 dfa_match_data *md = &match_block;
2489 BOOL utf8, anchored, startline, firstline;
2490 const uschar *current_subject, *end_subject, *lcc;
2492 pcre_study_data internal_study;
2493 const pcre_study_data *study = NULL;
2494 real_pcre internal_re;
2496 const uschar *req_byte_ptr;
2497 const uschar *start_bits = NULL;
2498 BOOL first_byte_caseless = FALSE;
2499 BOOL req_byte_caseless = FALSE;
2500 int first_byte = -1;
2505 /* Plausibility checks */
2507 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2508 if (re == NULL || subject == NULL || workspace == NULL ||
2509 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2510 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2511 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2513 /* We need to find the pointer to any study data before we test for byte
2514 flipping, so we scan the extra_data block first. This may set two fields in the
2515 match block, so we must initialize them beforehand. However, the other fields
2516 in the match block must not be set until after the byte flipping. */
2518 md->tables = re->tables;
2519 md->callout_data = NULL;
2521 if (extra_data != NULL)
2523 unsigned int flags = extra_data->flags;
2524 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2525 study = (const pcre_study_data *)extra_data->study_data;
2526 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2527 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2528 return PCRE_ERROR_DFA_UMLIMIT;
2529 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2530 md->callout_data = extra_data->callout_data;
2531 if ((flags & PCRE_EXTRA_TABLES) != 0)
2532 md->tables = extra_data->tables;
2535 /* Check that the first field in the block is the magic number. If it is not,
2536 test for a regex that was compiled on a host of opposite endianness. If this is
2537 the case, flipped values are put in internal_re and internal_study if there was
2540 if (re->magic_number != MAGIC_NUMBER)
2542 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2543 if (re == NULL) return PCRE_ERROR_BADMAGIC;
2544 if (study != NULL) study = &internal_study;
2547 /* Set some local values */
2549 current_subject = (const unsigned char *)subject + start_offset;
2550 end_subject = (const unsigned char *)subject + length;
2551 req_byte_ptr = current_subject - 1;
2554 utf8 = (re->options & PCRE_UTF8) != 0;
2559 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2560 (re->options & PCRE_ANCHORED) != 0;
2562 /* The remaining fixed data for passing around. */
2564 md->start_code = (const uschar *)argument_re +
2565 re->name_table_offset + re->name_count * re->name_entry_size;
2566 md->start_subject = (const unsigned char *)subject;
2567 md->end_subject = end_subject;
2568 md->moptions = options;
2569 md->poptions = re->options;
2571 /* Handle different types of newline. The three bits give eight cases. If
2572 nothing is set at run time, whatever was used at compile time applies. */
2574 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2577 case 0: newline = NEWLINE; break; /* Compile-time default */
2578 case PCRE_NEWLINE_CR: newline = '\r'; break;
2579 case PCRE_NEWLINE_LF: newline = '\n'; break;
2580 case PCRE_NEWLINE_CR+
2581 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2582 case PCRE_NEWLINE_ANY: newline = -1; break;
2583 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2584 default: return PCRE_ERROR_BADNEWLINE;
2589 md->nltype = NLTYPE_ANYCRLF;
2591 else if (newline < 0)
2593 md->nltype = NLTYPE_ANY;
2597 md->nltype = NLTYPE_FIXED;
2601 md->nl[0] = (newline >> 8) & 255;
2602 md->nl[1] = newline & 255;
2607 md->nl[0] = newline;
2611 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2612 back the character offset. */
2615 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2617 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2618 return PCRE_ERROR_BADUTF8;
2619 if (start_offset > 0 && start_offset < length)
2621 int tb = ((uschar *)subject)[start_offset];
2625 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2631 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2632 is a feature that makes it possible to save compiled regexes and re-use them
2633 in other programs later. */
2635 if (md->tables == NULL) md->tables = _pcre_default_tables;
2637 /* The lower casing table and the "must be at the start of a line" flag are
2638 used in a loop when finding where to start. */
2640 lcc = md->tables + lcc_offset;
2641 startline = (re->options & PCRE_STARTLINE) != 0;
2642 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2644 /* Set up the first character to match, if available. The first_byte value is
2645 never set for an anchored regular expression, but the anchoring may be forced
2646 at run time, so we have to test for anchoring. The first char may be unset for
2647 an unanchored pattern, of course. If there's no first char and the pattern was
2648 studied, there may be a bitmap of possible first characters. */
2652 if ((re->options & PCRE_FIRSTSET) != 0)
2654 first_byte = re->first_byte & 255;
2655 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2656 first_byte = lcc[first_byte];
2660 if (startline && study != NULL &&
2661 (study->options & PCRE_STUDY_MAPPED) != 0)
2662 start_bits = study->start_bits;
2666 /* For anchored or unanchored matches, there may be a "last known required
2669 if ((re->options & PCRE_REQCHSET) != 0)
2671 req_byte = re->req_byte & 255;
2672 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2673 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2676 /* Call the main matching function, looping for a non-anchored regex after a
2677 failed match. Unless restarting, optimize by moving to the first match
2678 character if possible, when not anchored. Then unless wanting a partial match,
2679 check for a required later character. */
2685 if ((options & PCRE_DFA_RESTART) == 0)
2687 const uschar *save_end_subject = end_subject;
2689 /* Advance to a unique first char if possible. If firstline is TRUE, the
2690 start of the match is constrained to the first line of a multiline string.
2691 Implement this by temporarily adjusting end_subject so that we stop
2692 scanning at a newline. If the match fails at the newline, later code breaks out of the loop. */
2697 const uschar *t = current_subject;
2698 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2702 if (first_byte >= 0)
2704 if (first_byte_caseless)
2705 while (current_subject < end_subject &&
2706 lcc[*current_subject] != first_byte)
2709 while (current_subject < end_subject && *current_subject != first_byte)
2713 /* Or to just after a linebreak for a multiline match if possible */
2717 if (current_subject > md->start_subject + start_offset)
2719 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2722 /* If we have just passed a CR and the newline option is ANY or
2723 ANYCRLF, and we are now at a LF, advance the match position by one more character. */
2726 if (current_subject[-1] == '\r' &&
2727 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2728 current_subject < end_subject &&
2729 *current_subject == '\n')
2734 /* Or to a non-unique first char after study */
2736 else if (start_bits != NULL)
2738 while (current_subject < end_subject)
2740 register unsigned int c = *current_subject;
2741 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2746 /* Restore fudged end_subject */
2748 end_subject = save_end_subject;
2751 /* If req_byte is set, we know that that character must appear in the subject
2752 for the match to succeed. If the first character is set, req_byte must be
2753 later in the subject; otherwise the test starts at the match point. This
2754 optimization can save a huge amount of work in patterns with nested unlimited
2755 repeats that aren't going to match. Writing separate code for cased/caseless
2756 versions makes it go faster, as does using an autoincrement and backing off on a match.
2759 HOWEVER: when the subject string is very, very long, searching to its end can
2760 take a long time, and give bad performance on quite ordinary patterns. This
2761 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2762 don't do this when the string is sufficiently long.
2764 ALSO: this processing is disabled when partial matching is requested. */
2767 if (req_byte >= 0 &&
2768 end_subject - current_subject < REQ_BYTE_MAX &&
2769 (options & PCRE_PARTIAL) == 0)
2771 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2773 /* We don't need to repeat the search if we haven't yet reached the
2774 place we found it at last time. */
2776 if (p > req_byte_ptr)
2778 if (req_byte_caseless)
2780 while (p < end_subject)
2782 register int pp = *p++;
2783 if (pp == req_byte || pp == req_byte2) { p--; break; }
2788 while (p < end_subject)
2790 if (*p++ == req_byte) { p--; break; }
2794 /* If we can't find the required character, break the matching loop,
2795 which will cause a return of PCRE_ERROR_NOMATCH. */
2797 if (p >= end_subject) break;
2799 /* If we have found the required character, save the point where we
2800 found it, so that we don't search again next time round the loop if
2801 the start hasn't passed this character yet. */
2807 /* OK, now we can do the business */
2809 rc = internal_dfa_exec(
2810 md, /* fixed match data */
2811 md->start_code, /* this subexpression's code */
2812 current_subject, /* where we currently are */
2813 start_offset, /* start offset in subject */
2814 offsets, /* offset vector */
2815 offsetcount, /* size of same */
2816 workspace, /* workspace vector */
2817 wscount, /* size of same */
2818 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2819 0, /* function recurse level */
2820 0); /* regex recurse level */
2822 /* Anything other than "no match" means we are done, always; otherwise, carry
2823 on only if not anchored. */
2825 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2827 /* Advance to the next subject character unless we are at the end of a line
2828 and firstline is set. */
2830 if (firstline && IS_NEWLINE(current_subject)) break;
2834 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2837 if (current_subject > end_subject) break;
2839 /* If we have just passed a CR and the newline option is CRLF or ANY or
2840 ANYCRLF, and we are now at a LF, advance the match position by one more character. */
2843 if (current_subject[-1] == '\r' &&
2844 (md->nltype == NLTYPE_ANY ||
2845 md->nltype == NLTYPE_ANYCRLF ||
2847 current_subject < end_subject &&
2848 *current_subject == '\n')
2851 } /* "Bumpalong" loop */
2853 return PCRE_ERROR_NOMATCH;
2856 /* End of pcre_dfa_exec.c */