1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain
51 #define NLBLOCK md /* Block containing newline information */
52 #define PSSTART start_subject /* Field containing processed string start */
53 #define PSEND end_subject /* Field containing processed string end */
55 #include "pcre_internal.h"
58 /* For use to indent debugging output */
64 /*************************************************
65 * Code parameters and static tables *
66 *************************************************/
68 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
69 into others, under special conditions. A gap of 20 between the blocks should be
70 enough. The resulting opcodes don't have to be less than 256 because they are
71 never stored, so we push them well clear of the normal opcodes. */
73 #define OP_PROP_EXTRA 300 /* added to codevalue in the OP_PROP case */
74 #define OP_EXTUNI_EXTRA 320 /* added to codevalue in the OP_EXTUNI case */
75 #define OP_ANYNL_EXTRA 340 /* added to codevalue in the OP_ANYNL case */
76 #define OP_HSPACE_EXTRA 360 /* added to codevalue in the OP_HSPACE case */
77 #define OP_VSPACE_EXTRA 380 /* added to codevalue in the OP_VSPACE case */
80 /* This table identifies those opcodes that are followed immediately by a
81 character that is to be tested in some way. This makes it possible to
82 centralize the loading of these characters. In the case of Type * etc, the
83 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84 small value. ***NOTE*** If the start of this table is modified, the two tables
85 that follow must also be modified. */
87 static const uschar coptable[] = {
89 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
90 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
91 0, 0, 0, /* Any, AllAny, Anybyte */
92 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
93 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
94 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
98 /* Positive single-char repeats */
99 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
100 3, 3, 3, /* upto, minupto, exact */
101 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
102 /* Negative single-char repeats - only for chars < 256 */
103 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
104 3, 3, 3, /* NOT upto, minupto, exact */
105 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
106 /* Positive type repeats */
107 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
108 3, 3, 3, /* Type upto, minupto, exact */
109 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
110 /* Character class & ref repeats */
111 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
112 0, 0, /* CRRANGE, CRMINRANGE */
115 0, /* XCLASS - variable length */
125 0, /* Assert behind */
126 0, /* Assert behind not */
128 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
129 0, 0, 0, /* SBRA, SCBRA, SCOND */
133 0, 0, /* BRAZERO, BRAMINZERO */
134 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
135 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
138 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
141 static const uschar toptable1[] = {
143 ctype_digit, ctype_digit,
144 ctype_space, ctype_space,
145 ctype_word, ctype_word,
146 0, 0 /* OP_ANY, OP_ALLANY */
149 static const uschar toptable2[] = {
154 1, 1 /* OP_ANY, OP_ALLANY */
158 /* Structure for holding data about a particular state, which is in effect the
159 current data for an active path through the match tree. It must consist
160 entirely of ints because the working vector we are passed, and which we put
161 these structures in, is a vector of ints. */
163 typedef struct stateblock {
164 int offset; /* Offset to opcode */
165 int count; /* Count for repeats */
166 int ims; /* ims flag bits */
167 int data; /* Some use extra data */
170 #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int)) /* number of ints occupied by one stateblock */
174 /*************************************************
175 * Print character string *
176 *************************************************/
178 /* Character string printing function for debugging.
182 length number of bytes
189 pchars(unsigned char *p, int length, FILE *f)
194 if (isprint(c = *(p++)))
197 fprintf(f, "\\x%02x", c);
204 /*************************************************
205 * Execute a Regular Expression - DFA engine *
206 *************************************************/
208 /* This internal function applies a compiled pattern to a subject string,
209 starting at a given point, using a DFA engine. This function is called from the
210 external one, possibly multiple times if the pattern is not anchored. The
211 function calls itself recursively for some kinds of subpattern.
214 md the match_data block with fixed information
215 this_start_code the opening bracket of this subexpression's code
216 current_subject where we currently are in the subject string
217 start_offset start offset in the subject string
218 offsets vector to contain the matching string offsets
219 offsetcount size of same
220 workspace vector of workspace
222 ims the current ims flags
223 rlevel function call recursion level
224 recursing regex recursive call level
226 Returns: > 0 => number of match offset pairs placed in offsets
227 = 0 => offsets overflowed; longest matches are present
228 -1 => failed to match
229 < -1 => some kind of unexpected problem
231 The following macros are used for adding states to the two state vectors (one
232 for the current character, one for the following character). */
234 #define ADD_ACTIVE(x,y) \
235 if (active_count++ < wscount) \
237 next_active_state->offset = (x); \
238 next_active_state->count = (y); \
239 next_active_state->ims = ims; \
240 next_active_state++; \
241 DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
243 else return PCRE_ERROR_DFA_WSSIZE
245 #define ADD_ACTIVE_DATA(x,y,z) \
246 if (active_count++ < wscount) \
248 next_active_state->offset = (x); \
249 next_active_state->count = (y); \
250 next_active_state->ims = ims; \
251 next_active_state->data = (z); \
252 next_active_state++; \
253 DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
255 else return PCRE_ERROR_DFA_WSSIZE
257 #define ADD_NEW(x,y) \
258 if (new_count++ < wscount) \
260 next_new_state->offset = (x); \
261 next_new_state->count = (y); \
262 next_new_state->ims = ims; \
264 DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
266 else return PCRE_ERROR_DFA_WSSIZE
268 #define ADD_NEW_DATA(x,y,z) \
269 if (new_count++ < wscount) \
271 next_new_state->offset = (x); \
272 next_new_state->count = (y); \
273 next_new_state->ims = ims; \
274 next_new_state->data = (z); \
276 DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
278 else return PCRE_ERROR_DFA_WSSIZE
280 /* And now, here is the code */
285 const uschar *this_start_code,
286 const uschar *current_subject,
296 stateblock *active_states, *new_states, *temp_states;
297 stateblock *next_active_state, *next_new_state;
299 const uschar *ctypes, *lcc, *fcc;
301 const uschar *end_code, *first_op;
303 int active_count, new_count, match_count;
305 /* Some fields in the md block are frequently referenced, so we load them into
306 independent variables in the hope that this will perform better. */
308 const uschar *start_subject = md->start_subject;
309 const uschar *end_subject = md->end_subject;
310 const uschar *start_code = md->start_code;
313 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
322 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
323 (2 * INTS_PER_STATEBLOCK);
325 DPRINTF(("\n%.*s---------------------\n"
326 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
327 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
329 ctypes = md->tables + ctypes_offset;
330 lcc = md->tables + lcc_offset;
331 fcc = md->tables + fcc_offset;
333 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
335 active_states = (stateblock *)(workspace + 2);
336 next_new_state = new_states = active_states + wscount;
339 first_op = this_start_code + 1 + LINK_SIZE +
340 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
342 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
343 the alternative states onto the list, and find out where the end is. This
344 makes it possible to use this function recursively, when we want to stop at a
345 matching internal ket rather than at the end.
347 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
348 a backward assertion. In that case, we have to find out the maximum amount to
349 move back, and set up each alternative appropriately. */
351 if (*first_op == OP_REVERSE)
356 end_code = this_start_code;
359 int back = GET(end_code, 2+LINK_SIZE);
360 if (back > max_back) max_back = back;
361 end_code += GET(end_code, 1);
363 while (*end_code == OP_ALT);
365 /* If we can't go back the amount required for the longest lookbehind
366 pattern, go back as far as we can; some alternatives may still be viable. */
369 /* In character mode we have to step back character by character */
373 for (gone_back = 0; gone_back < max_back; gone_back++)
375 if (current_subject <= start_subject) break;
377 while (current_subject > start_subject &&
378 (*current_subject & 0xc0) == 0x80)
385 /* In byte-mode we can do this quickly. */
388 gone_back = (current_subject - max_back < start_subject)?
389 current_subject - start_subject : max_back;
390 current_subject -= gone_back;
393 /* Now we can process the individual branches. */
395 end_code = this_start_code;
398 int back = GET(end_code, 2+LINK_SIZE);
399 if (back <= gone_back)
401 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
402 ADD_NEW_DATA(-bstate, 0, gone_back - back);
404 end_code += GET(end_code, 1);
406 while (*end_code == OP_ALT);
409 /* This is the code for a "normal" subpattern (not a backward assertion). The
410 start of a whole pattern is always one of these. If we are at the top level,
411 we may be asked to restart matching from the same point that we reached for a
412 previous partial match. We still have to scan through the top-level branches to
413 find the end state. */
417 end_code = this_start_code;
421 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
423 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
424 new_count = workspace[1];
426 memcpy(new_states, active_states, new_count * sizeof(stateblock));
433 int length = 1 + LINK_SIZE +
434 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
437 ADD_NEW(end_code - start_code + length, 0);
438 end_code += GET(end_code, 1);
439 length = 1 + LINK_SIZE;
441 while (*end_code == OP_ALT);
445 workspace[0] = 0; /* Bit indicating which vector is current */
447 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
449 /* Loop for scanning the subject */
451 ptr = current_subject;
458 /* Make the new state list into the active state list and empty the
461 temp_states = active_states;
462 active_states = new_states;
463 new_states = temp_states;
464 active_count = new_count;
467 workspace[0] ^= 1; /* Remember for the restarting feature */
468 workspace[1] = active_count;
471 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
472 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
475 printf("%.*sActive states: ", rlevel*2-2, SP);
476 for (i = 0; i < active_count; i++)
477 printf("%d/%d ", active_states[i].offset, active_states[i].count);
481 /* Set the pointers for adding new states */
483 next_active_state = active_states + active_count;
484 next_new_state = new_states;
486 /* Load the current character from the subject outside the loop, as many
487 different states may want to look at it, and we assume that at least one
490 if (ptr < end_subject)
492 clen = 1; /* Number of bytes in the character */
494 if (utf8) { GETCHARLEN(c, ptr, clen); } else
495 #endif /* SUPPORT_UTF8 */
500 clen = 0; /* This indicates the end of the subject */
501 c = NOTACHAR; /* This value should never actually be used */
504 /* Scan up the active states and act on each one. The result of an action
505 may be to add more states to the currently active list (e.g. on hitting a
506 parenthesis) or it may be to put states on the new list, for considering
507 when we move the character pointer on. */
509 for (i = 0; i < active_count; i++)
511 stateblock *current_state = active_states + i;
513 int state_offset = current_state->offset;
514 int count, codevalue;
516 int chartype, script;
520 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
521 if (clen == 0) printf("EOL\n");
522 else if (c > 32 && c < 127) printf("'%c'\n", c);
523 else printf("0x%02x\n", c);
526 /* This variable is referred to implicitly in the ADD_xxx macros. */
528 ims = current_state->ims;
530 /* A negative offset is a special case meaning "hold off going to this
531 (negated) state until the number of characters in the data field have
534 if (state_offset < 0)
536 if (current_state->data > 0)
538 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
539 ADD_NEW_DATA(state_offset, current_state->count,
540 current_state->data - 1);
545 current_state->offset = state_offset = -state_offset;
549 /* Check for a duplicate state with the same count, and skip if found. */
551 for (j = 0; j < i; j++)
553 if (active_states[j].offset == state_offset &&
554 active_states[j].count == current_state->count)
556 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
557 goto NEXT_ACTIVE_STATE;
561 /* The state offset is the offset to the opcode */
563 code = start_code + state_offset;
566 /* If this opcode is followed by an inline character, load it. It is
567 tempting to test for the presence of a subject character here, but that
568 is wrong, because sometimes zero repetitions of the subject are
571 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
572 argument that is not a data character - but is always one byte long. We
573 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
574 this case. To keep the other cases fast, convert these ones to new opcodes.
577 if (coptable[codevalue] > 0)
581 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
582 #endif /* SUPPORT_UTF8 */
583 d = code[coptable[codevalue]];
584 if (codevalue >= OP_TYPESTAR)
588 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
590 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
591 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
592 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
594 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
596 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
603 dlen = 0; /* Not strictly necessary, but compilers moan */
604 d = NOTACHAR; /* if these variables are not set. */
608 /* Now process the individual opcodes */
613 /* ========================================================================== */
614 /* Reached a closing bracket. If not at the end of the pattern, carry
615 on with the next opcode. Otherwise, unless we have an empty string and
616 PCRE_NOTEMPTY is set, save the match data, shifting up all previous
617 matches so we always have the longest first. */
622 if (code != end_code)
624 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
625 if (codevalue != OP_KET)
627 ADD_ACTIVE(state_offset - GET(code, 1), 0);
630 else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
632 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
633 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
635 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
636 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
637 if (offsetcount >= 2)
639 offsets[0] = current_subject - start_subject;
640 offsets[1] = ptr - start_subject;
641 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
642 offsets[1] - offsets[0], current_subject));
644 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
646 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
647 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
648 match_count, rlevel*2-2, SP));
654 /* ========================================================================== */
655 /* These opcodes add to the current list of states without looking
656 at the current character. */
658 /*-----------------------------------------------------------------*/
660 do { code += GET(code, 1); } while (*code == OP_ALT);
661 ADD_ACTIVE(code - start_code, 0);
664 /*-----------------------------------------------------------------*/
669 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
670 code += GET(code, 1);
672 while (*code == OP_ALT);
675 /*-----------------------------------------------------------------*/
678 ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
679 code += GET(code, 1);
680 while (*code == OP_ALT)
682 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
683 code += GET(code, 1);
687 /*-----------------------------------------------------------------*/
690 ADD_ACTIVE(state_offset + 1, 0);
691 code += 1 + GET(code, 2);
692 while (*code == OP_ALT) code += GET(code, 1);
693 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
696 /*-----------------------------------------------------------------*/
698 code += 1 + GET(code, 2);
699 while (*code == OP_ALT) code += GET(code, 1);
700 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
703 /*-----------------------------------------------------------------*/
705 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
706 ((ims & PCRE_MULTILINE) != 0 &&
707 ptr != end_subject &&
709 { ADD_ACTIVE(state_offset + 1, 0); }
712 /*-----------------------------------------------------------------*/
714 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
717 /*-----------------------------------------------------------------*/
720 ADD_ACTIVE(state_offset + 2, 0);
723 /*-----------------------------------------------------------------*/
725 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
728 /*-----------------------------------------------------------------*/
730 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
734 /* ========================================================================== */
735 /* These opcodes inspect the next subject character, and sometimes
736 the previous one as well, but do not have an argument. The variable
737 clen contains the length of the current character and is zero if we are
738 at the end of the subject. */
740 /*-----------------------------------------------------------------*/
742 if (clen > 0 && !IS_NEWLINE(ptr))
743 { ADD_NEW(state_offset + 1, 0); }
746 /*-----------------------------------------------------------------*/
749 { ADD_NEW(state_offset + 1, 0); }
752 /*-----------------------------------------------------------------*/
754 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
755 { ADD_ACTIVE(state_offset + 1, 0); }
758 /*-----------------------------------------------------------------*/
760 if ((md->moptions & PCRE_NOTEOL) == 0)
764 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
766 { ADD_ACTIVE(state_offset + 1, 0); }
768 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
769 { ADD_ACTIVE(state_offset + 1, 0); }
772 /*-----------------------------------------------------------------*/
777 if (clen > 0 && c < 256 &&
778 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
779 { ADD_NEW(state_offset + 1, 0); }
782 /*-----------------------------------------------------------------*/
784 case OP_NOT_WHITESPACE:
785 case OP_NOT_WORDCHAR:
786 if (clen > 0 && (c >= 256 ||
787 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
788 { ADD_NEW(state_offset + 1, 0); }
791 /*-----------------------------------------------------------------*/
792 case OP_WORD_BOUNDARY:
793 case OP_NOT_WORD_BOUNDARY:
795 int left_word, right_word;
797 if (ptr > start_subject)
799 const uschar *temp = ptr - 1;
801 if (utf8) BACKCHAR(temp);
803 GETCHARTEST(d, temp);
804 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
808 if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
811 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
812 { ADD_ACTIVE(state_offset + 1, 0); }
817 /*-----------------------------------------------------------------*/
818 /* Check the next character by Unicode property. We will get here only
819 if the support is in the binary; otherwise a compile-time error occurs.
828 int category = _pcre_ucp_findprop(c, &chartype, &script);
836 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
840 OK = category == code[2];
844 OK = chartype == code[2];
848 OK = script == code[2];
851 /* Should never occur, but keep compilers from grumbling. */
854 OK = codevalue != OP_PROP;
858 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
865 /* ========================================================================== */
866 /* These opcodes likewise inspect the subject character, but have an
867 argument that is not a data character. It is one of these opcodes:
868 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
869 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
874 count = current_state->count; /* Already matched */
875 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
878 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
880 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
881 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
883 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
885 active_count--; /* Remove non-match possibility */
889 ADD_NEW(state_offset, count);
894 /*-----------------------------------------------------------------*/
896 case OP_TYPEMINQUERY:
897 case OP_TYPEPOSQUERY:
898 ADD_ACTIVE(state_offset + 2, 0);
901 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
903 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
904 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
906 if (codevalue == OP_TYPEPOSQUERY)
908 active_count--; /* Remove non-match possibility */
911 ADD_NEW(state_offset + 2, 0);
916 /*-----------------------------------------------------------------*/
920 ADD_ACTIVE(state_offset + 2, 0);
923 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
925 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
926 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
928 if (codevalue == OP_TYPEPOSSTAR)
930 active_count--; /* Remove non-match possibility */
933 ADD_NEW(state_offset, 0);
938 /*-----------------------------------------------------------------*/
940 count = current_state->count; /* Number already matched */
943 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
945 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
946 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
948 if (++count >= GET2(code, 1))
949 { ADD_NEW(state_offset + 4, 0); }
951 { ADD_NEW(state_offset, count); }
956 /*-----------------------------------------------------------------*/
960 ADD_ACTIVE(state_offset + 4, 0);
961 count = current_state->count; /* Number already matched */
964 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
966 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
967 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
969 if (codevalue == OP_TYPEPOSUPTO)
971 active_count--; /* Remove non-match possibility */
974 if (++count >= GET2(code, 1))
975 { ADD_NEW(state_offset + 4, 0); }
977 { ADD_NEW(state_offset, count); }
982 /* ========================================================================== */
983 /* These are virtual opcodes that are used when something like
984 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
985 argument. It keeps the code above fast for the other cases. The argument
986 is in the d variable. */
989 case OP_PROP_EXTRA + OP_TYPEPLUS:
990 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
991 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
992 count = current_state->count; /* Already matched */
993 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
997 int category = _pcre_ucp_findprop(c, &chartype, &script);
1005 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1009 OK = category == code[3];
1013 OK = chartype == code[3];
1017 OK = script == code[3];
1020 /* Should never occur, but keep compilers from grumbling. */
1023 OK = codevalue != OP_PROP;
1027 if (OK == (d == OP_PROP))
1029 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1031 active_count--; /* Remove non-match possibility */
1032 next_active_state--;
1035 ADD_NEW(state_offset, count);
1040 /*-----------------------------------------------------------------*/
1041 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1042 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1043 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1044 count = current_state->count; /* Already matched */
1045 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1046 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1048 const uschar *nptr = ptr + clen;
1050 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1052 active_count--; /* Remove non-match possibility */
1053 next_active_state--;
1055 while (nptr < end_subject)
1059 GETCHARLEN(nd, nptr, ndlen);
1060 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1065 ADD_NEW_DATA(-state_offset, count, ncount);
1070 /*-----------------------------------------------------------------*/
1071 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1072 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1073 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1074 count = current_state->count; /* Already matched */
1075 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1086 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1090 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1095 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1097 active_count--; /* Remove non-match possibility */
1098 next_active_state--;
1101 ADD_NEW_DATA(-state_offset, count, ncount);
1110 /*-----------------------------------------------------------------*/
1111 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1112 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1113 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1114 count = current_state->count; /* Already matched */
1115 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1136 if (OK == (d == OP_VSPACE))
1138 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1140 active_count--; /* Remove non-match possibility */
1141 next_active_state--;
1144 ADD_NEW_DATA(-state_offset, count, 0);
1149 /*-----------------------------------------------------------------*/
1150 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1151 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1152 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1153 count = current_state->count; /* Already matched */
1154 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1161 case 0x20: /* SPACE */
1162 case 0xa0: /* NBSP */
1163 case 0x1680: /* OGHAM SPACE MARK */
1164 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1165 case 0x2000: /* EN QUAD */
1166 case 0x2001: /* EM QUAD */
1167 case 0x2002: /* EN SPACE */
1168 case 0x2003: /* EM SPACE */
1169 case 0x2004: /* THREE-PER-EM SPACE */
1170 case 0x2005: /* FOUR-PER-EM SPACE */
1171 case 0x2006: /* SIX-PER-EM SPACE */
1172 case 0x2007: /* FIGURE SPACE */
1173 case 0x2008: /* PUNCTUATION SPACE */
1174 case 0x2009: /* THIN SPACE */
1175 case 0x200A: /* HAIR SPACE */
1176 case 0x202f: /* NARROW NO-BREAK SPACE */
1177 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1178 case 0x3000: /* IDEOGRAPHIC SPACE */
1187 if (OK == (d == OP_HSPACE))
1189 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1191 active_count--; /* Remove non-match possibility */
1192 next_active_state--;
1195 ADD_NEW_DATA(-state_offset, count, 0);
1200 /*-----------------------------------------------------------------*/
1202 case OP_PROP_EXTRA + OP_TYPEQUERY:
1203 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1204 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1208 case OP_PROP_EXTRA + OP_TYPESTAR:
1209 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1210 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1215 ADD_ACTIVE(state_offset + 4, 0);
1219 int category = _pcre_ucp_findprop(c, &chartype, &script);
1227 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1231 OK = category == code[3];
1235 OK = chartype == code[3];
1239 OK = script == code[3];
1242 /* Should never occur, but keep compilers from grumbling. */
1245 OK = codevalue != OP_PROP;
1249 if (OK == (d == OP_PROP))
1251 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1252 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1254 active_count--; /* Remove non-match possibility */
1255 next_active_state--;
1257 ADD_NEW(state_offset + count, 0);
1262 /*-----------------------------------------------------------------*/
1263 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1264 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1265 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1269 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1270 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1271 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1276 ADD_ACTIVE(state_offset + 2, 0);
1277 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1279 const uschar *nptr = ptr + clen;
1281 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1282 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1284 active_count--; /* Remove non-match possibility */
1285 next_active_state--;
1287 while (nptr < end_subject)
1291 GETCHARLEN(nd, nptr, ndlen);
1292 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1296 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1301 /*-----------------------------------------------------------------*/
1302 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1303 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1304 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1308 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1309 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1310 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1314 ADD_ACTIVE(state_offset + 2, 0);
1325 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1329 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1334 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1335 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1337 active_count--; /* Remove non-match possibility */
1338 next_active_state--;
1340 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1349 /*-----------------------------------------------------------------*/
1350 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1351 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1352 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1356 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1357 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1358 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1362 ADD_ACTIVE(state_offset + 2, 0);
1382 if (OK == (d == OP_VSPACE))
1384 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1385 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1387 active_count--; /* Remove non-match possibility */
1388 next_active_state--;
1390 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1395 /*-----------------------------------------------------------------*/
1396 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1397 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1398 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1402 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1403 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1404 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1408 ADD_ACTIVE(state_offset + 2, 0);
1415 case 0x20: /* SPACE */
1416 case 0xa0: /* NBSP */
1417 case 0x1680: /* OGHAM SPACE MARK */
1418 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1419 case 0x2000: /* EN QUAD */
1420 case 0x2001: /* EM QUAD */
1421 case 0x2002: /* EN SPACE */
1422 case 0x2003: /* EM SPACE */
1423 case 0x2004: /* THREE-PER-EM SPACE */
1424 case 0x2005: /* FOUR-PER-EM SPACE */
1425 case 0x2006: /* SIX-PER-EM SPACE */
1426 case 0x2007: /* FIGURE SPACE */
1427 case 0x2008: /* PUNCTUATION SPACE */
1428 case 0x2009: /* THIN SPACE */
1429 case 0x200A: /* HAIR SPACE */
1430 case 0x202f: /* NARROW NO-BREAK SPACE */
1431 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1432 case 0x3000: /* IDEOGRAPHIC SPACE */
1441 if (OK == (d == OP_HSPACE))
1443 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1444 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1446 active_count--; /* Remove non-match possibility */
1447 next_active_state--;
1449 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1454 /*-----------------------------------------------------------------*/
1456 case OP_PROP_EXTRA + OP_TYPEEXACT:
1457 case OP_PROP_EXTRA + OP_TYPEUPTO:
1458 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1459 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1460 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1461 { ADD_ACTIVE(state_offset + 6, 0); }
1462 count = current_state->count; /* Number already matched */
1466 int category = _pcre_ucp_findprop(c, &chartype, &script);
1474 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1478 OK = category == code[5];
1482 OK = chartype == code[5];
1486 OK = script == code[5];
1489 /* Should never occur, but keep compilers from grumbling. */
1492 OK = codevalue != OP_PROP;
1496 if (OK == (d == OP_PROP))
1498 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1500 active_count--; /* Remove non-match possibility */
1501 next_active_state--;
1503 if (++count >= GET2(code, 1))
1504 { ADD_NEW(state_offset + 6, 0); }
1506 { ADD_NEW(state_offset, count); }
1511 /*-----------------------------------------------------------------*/
1512 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1513 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1514 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1515 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1516 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1517 { ADD_ACTIVE(state_offset + 4, 0); }
1518 count = current_state->count; /* Number already matched */
1519 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1521 const uschar *nptr = ptr + clen;
1523 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1525 active_count--; /* Remove non-match possibility */
1526 next_active_state--;
1528 while (nptr < end_subject)
1532 GETCHARLEN(nd, nptr, ndlen);
1533 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1537 if (++count >= GET2(code, 1))
1538 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1540 { ADD_NEW_DATA(-state_offset, count, ncount); }
1545 /*-----------------------------------------------------------------*/
1546 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1547 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1548 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1549 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1550 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1551 { ADD_ACTIVE(state_offset + 4, 0); }
1552 count = current_state->count; /* Number already matched */
1563 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1567 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1572 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1574 active_count--; /* Remove non-match possibility */
1575 next_active_state--;
1577 if (++count >= GET2(code, 1))
1578 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1580 { ADD_NEW_DATA(-state_offset, count, ncount); }
1589 /*-----------------------------------------------------------------*/
1590 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1591 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1592 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1593 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1594 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1595 { ADD_ACTIVE(state_offset + 4, 0); }
1596 count = current_state->count; /* Number already matched */
1616 if (OK == (d == OP_VSPACE))
1618 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1620 active_count--; /* Remove non-match possibility */
1621 next_active_state--;
1623 if (++count >= GET2(code, 1))
1624 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1626 { ADD_NEW_DATA(-state_offset, count, 0); }
1631 /*-----------------------------------------------------------------*/
1632 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1633 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1634 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1635 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1636 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1637 { ADD_ACTIVE(state_offset + 4, 0); }
1638 count = current_state->count; /* Number already matched */
1645 case 0x20: /* SPACE */
1646 case 0xa0: /* NBSP */
1647 case 0x1680: /* OGHAM SPACE MARK */
1648 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1649 case 0x2000: /* EN QUAD */
1650 case 0x2001: /* EM QUAD */
1651 case 0x2002: /* EN SPACE */
1652 case 0x2003: /* EM SPACE */
1653 case 0x2004: /* THREE-PER-EM SPACE */
1654 case 0x2005: /* FOUR-PER-EM SPACE */
1655 case 0x2006: /* SIX-PER-EM SPACE */
1656 case 0x2007: /* FIGURE SPACE */
1657 case 0x2008: /* PUNCTUATION SPACE */
1658 case 0x2009: /* THIN SPACE */
1659 case 0x200A: /* HAIR SPACE */
1660 case 0x202f: /* NARROW NO-BREAK SPACE */
1661 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1662 case 0x3000: /* IDEOGRAPHIC SPACE */
1671 if (OK == (d == OP_HSPACE))
1673 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1675 active_count--; /* Remove non-match possibility */
1676 next_active_state--;
1678 if (++count >= GET2(code, 1))
1679 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1681 { ADD_NEW_DATA(-state_offset, count, 0); }
1686 /* ========================================================================== */
1687 /* These opcodes are followed by a character that is usually compared
1688 to the current subject character; it is loaded into d. We still get
1689 here even if there is no subject character, because in some cases zero
1690 repetitions are permitted. */
1692 /*-----------------------------------------------------------------*/
1694 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1697 /*-----------------------------------------------------------------*/
1699 if (clen == 0) break;
1704 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1706 unsigned int othercase;
1707 if (c < 128) othercase = fcc[c]; else
1709 /* If we have Unicode property support, we can use it to test the
1710 other case of the character. */
1713 othercase = _pcre_ucp_othercase(c);
1715 othercase = NOTACHAR;
1718 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1722 #endif /* SUPPORT_UTF8 */
1724 /* Non-UTF-8 mode */
1726 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1732 /*-----------------------------------------------------------------*/
1733 /* This is a tricky one because it can match more than one character.
1734 Find out how many characters to skip, and then set up a negative state
1735 to wait for them to pass before continuing. */
1738 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1740 const uschar *nptr = ptr + clen;
1742 while (nptr < end_subject)
1745 GETCHARLEN(c, nptr, nclen);
1746 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1750 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1755 /*-----------------------------------------------------------------*/
1756 /* This is a tricky one like EXTUNI because it too can match more than one
1757 character (when CR is followed by LF). In this case, set up a negative
1758 state to wait for one character to pass before continuing. */
1761 if (clen > 0) switch(c)
1768 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1771 ADD_NEW(state_offset + 1, 0);
1775 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1777 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1781 ADD_NEW(state_offset + 1, 0);
1787 /*-----------------------------------------------------------------*/
1789 if (clen > 0) switch(c)
1801 ADD_NEW(state_offset + 1, 0);
1806 /*-----------------------------------------------------------------*/
1808 if (clen > 0) switch(c)
1817 ADD_NEW(state_offset + 1, 0);
1824 /*-----------------------------------------------------------------*/
1826 if (clen > 0) switch(c)
1829 case 0x20: /* SPACE */
1830 case 0xa0: /* NBSP */
1831 case 0x1680: /* OGHAM SPACE MARK */
1832 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1833 case 0x2000: /* EN QUAD */
1834 case 0x2001: /* EM QUAD */
1835 case 0x2002: /* EN SPACE */
1836 case 0x2003: /* EM SPACE */
1837 case 0x2004: /* THREE-PER-EM SPACE */
1838 case 0x2005: /* FOUR-PER-EM SPACE */
1839 case 0x2006: /* SIX-PER-EM SPACE */
1840 case 0x2007: /* FIGURE SPACE */
1841 case 0x2008: /* PUNCTUATION SPACE */
1842 case 0x2009: /* THIN SPACE */
1843 case 0x200A: /* HAIR SPACE */
1844 case 0x202f: /* NARROW NO-BREAK SPACE */
1845 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1846 case 0x3000: /* IDEOGRAPHIC SPACE */
1850 ADD_NEW(state_offset + 1, 0);
1855 /*-----------------------------------------------------------------*/
1857 if (clen > 0) switch(c)
1860 case 0x20: /* SPACE */
1861 case 0xa0: /* NBSP */
1862 case 0x1680: /* OGHAM SPACE MARK */
1863 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1864 case 0x2000: /* EN QUAD */
1865 case 0x2001: /* EM QUAD */
1866 case 0x2002: /* EN SPACE */
1867 case 0x2003: /* EM SPACE */
1868 case 0x2004: /* THREE-PER-EM SPACE */
1869 case 0x2005: /* FOUR-PER-EM SPACE */
1870 case 0x2006: /* SIX-PER-EM SPACE */
1871 case 0x2007: /* FIGURE SPACE */
1872 case 0x2008: /* PUNCTUATION SPACE */
1873 case 0x2009: /* THIN SPACE */
1874 case 0x200A: /* HAIR SPACE */
1875 case 0x202f: /* NARROW NO-BREAK SPACE */
1876 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1877 case 0x3000: /* IDEOGRAPHIC SPACE */
1878 ADD_NEW(state_offset + 1, 0);
1883 /*-----------------------------------------------------------------*/
1884 /* Match a negated single character. This is only used for one-byte
1885 characters, that is, we know that d < 256. The character we are
1886 checking (c) can be multibyte. */
1891 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1892 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1896 /*-----------------------------------------------------------------*/
1903 count = current_state->count; /* Already matched */
1904 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1907 unsigned int otherd = NOTACHAR;
1908 if ((ims & PCRE_CASELESS) != 0)
1911 if (utf8 && d >= 128)
1914 otherd = _pcre_ucp_othercase(d);
1915 #endif /* SUPPORT_UCP */
1918 #endif /* SUPPORT_UTF8 */
1921 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1924 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1926 active_count--; /* Remove non-match possibility */
1927 next_active_state--;
1930 ADD_NEW(state_offset, count);
1935 /*-----------------------------------------------------------------*/
1940 case OP_NOTMINQUERY:
1941 case OP_NOTPOSQUERY:
1942 ADD_ACTIVE(state_offset + dlen + 1, 0);
1945 unsigned int otherd = NOTACHAR;
1946 if ((ims & PCRE_CASELESS) != 0)
1949 if (utf8 && d >= 128)
1952 otherd = _pcre_ucp_othercase(d);
1953 #endif /* SUPPORT_UCP */
1956 #endif /* SUPPORT_UTF8 */
1959 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1961 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1963 active_count--; /* Remove non-match possibility */
1964 next_active_state--;
1966 ADD_NEW(state_offset + dlen + 1, 0);
1971 /*-----------------------------------------------------------------*/
1978 ADD_ACTIVE(state_offset + dlen + 1, 0);
1981 unsigned int otherd = NOTACHAR;
1982 if ((ims & PCRE_CASELESS) != 0)
1985 if (utf8 && d >= 128)
1988 otherd = _pcre_ucp_othercase(d);
1989 #endif /* SUPPORT_UCP */
1992 #endif /* SUPPORT_UTF8 */
1995 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1997 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1999 active_count--; /* Remove non-match possibility */
2000 next_active_state--;
2002 ADD_NEW(state_offset, 0);
2007 /*-----------------------------------------------------------------*/
2010 count = current_state->count; /* Number already matched */
2013 unsigned int otherd = NOTACHAR;
2014 if ((ims & PCRE_CASELESS) != 0)
2017 if (utf8 && d >= 128)
2020 otherd = _pcre_ucp_othercase(d);
2021 #endif /* SUPPORT_UCP */
2024 #endif /* SUPPORT_UTF8 */
2027 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2029 if (++count >= GET2(code, 1))
2030 { ADD_NEW(state_offset + dlen + 3, 0); }
2032 { ADD_NEW(state_offset, count); }
2037 /*-----------------------------------------------------------------*/
2044 ADD_ACTIVE(state_offset + dlen + 3, 0);
2045 count = current_state->count; /* Number already matched */
2048 unsigned int otherd = NOTACHAR;
2049 if ((ims & PCRE_CASELESS) != 0)
2052 if (utf8 && d >= 128)
2055 otherd = _pcre_ucp_othercase(d);
2056 #endif /* SUPPORT_UCP */
2059 #endif /* SUPPORT_UTF8 */
2062 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2064 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2066 active_count--; /* Remove non-match possibility */
2067 next_active_state--;
2069 if (++count >= GET2(code, 1))
2070 { ADD_NEW(state_offset + dlen + 3, 0); }
2072 { ADD_NEW(state_offset, count); }
2078 /* ========================================================================== */
2079 /* These are the class-handling opcodes */
2085 BOOL isinclass = FALSE;
2086 int next_state_offset;
2087 const uschar *ecode;
2089 /* For a simple class, there is always just a 32-byte table, and we
2090 can set isinclass from it. */
2092 if (codevalue != OP_XCLASS)
2097 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2098 ((code[1 + c/8] & (1 << (c&7))) != 0);
2102 /* An extended class may have a table or a list of single characters,
2103 ranges, or both, and it may be positive or negative. There's a
2104 function that sorts all this out. */
2108 ecode = code + GET(code, 1);
2109 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2112 /* At this point, isinclass is set for all kinds of class, and ecode
2113 points to the byte after the end of the class. If there is a
2114 quantifier, this is where it will be. */
2116 next_state_offset = ecode - start_code;
2122 ADD_ACTIVE(next_state_offset + 1, 0);
2123 if (isinclass) { ADD_NEW(state_offset, 0); }
2128 count = current_state->count; /* Already matched */
2129 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2130 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2135 ADD_ACTIVE(next_state_offset + 1, 0);
2136 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2141 count = current_state->count; /* Already matched */
2142 if (count >= GET2(ecode, 1))
2143 { ADD_ACTIVE(next_state_offset + 5, 0); }
2146 int max = GET2(ecode, 3);
2147 if (++count >= max && max != 0) /* Max 0 => no limit */
2148 { ADD_NEW(next_state_offset + 5, 0); }
2150 { ADD_NEW(state_offset, count); }
2155 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2161 /* ========================================================================== */
2162 /* These are the opcodes for fancy brackets of various kinds. We have
2163 to use recursion in order to handle them. The "always failing" assertion
2164 (?!) is optimised when compiling to OP_FAIL, so we have to support that,
2165 though the other "backtracking verbs" are not supported. */
2173 case OP_ASSERTBACK_NOT:
2176 int local_offsets[2];
2177 int local_workspace[1000];
2178 const uschar *endasscode = code + GET(code, 1);
2180 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2182 rc = internal_dfa_exec(
2183 md, /* static match data */
2184 code, /* this subexpression's code */
2185 ptr, /* where we currently are */
2186 ptr - start_subject, /* start offset */
2187 local_offsets, /* offset vector */
2188 sizeof(local_offsets)/sizeof(int), /* size of same */
2189 local_workspace, /* workspace vector */
2190 sizeof(local_workspace)/sizeof(int), /* size of same */
2191 ims, /* the current ims flags */
2192 rlevel, /* function recursion level */
2193 recursing); /* pass on regex recursion */
2195 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2196 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2200 /*-----------------------------------------------------------------*/
2204 int local_offsets[1000];
2205 int local_workspace[1000];
2206 int condcode = code[LINK_SIZE+1];
2208 /* Back reference conditions are not supported */
2210 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2212 /* The DEFINE condition is always false */
2214 if (condcode == OP_DEF)
2216 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2219 /* The only supported version of OP_RREF is for the value RREF_ANY,
2220 which means "test if in any recursion". We can't test for specifically
2223 else if (condcode == OP_RREF)
2225 int value = GET2(code, LINK_SIZE+2);
2226 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2227 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2228 else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2231 /* Otherwise, the condition is an assertion */
2236 const uschar *asscode = code + LINK_SIZE + 1;
2237 const uschar *endasscode = asscode + GET(asscode, 1);
2239 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2241 rc = internal_dfa_exec(
2242 md, /* fixed match data */
2243 asscode, /* this subexpression's code */
2244 ptr, /* where we currently are */
2245 ptr - start_subject, /* start offset */
2246 local_offsets, /* offset vector */
2247 sizeof(local_offsets)/sizeof(int), /* size of same */
2248 local_workspace, /* workspace vector */
2249 sizeof(local_workspace)/sizeof(int), /* size of same */
2250 ims, /* the current ims flags */
2251 rlevel, /* function recursion level */
2252 recursing); /* pass on regex recursion */
2255 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2256 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2258 { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2263 /*-----------------------------------------------------------------*/
2266 int local_offsets[1000];
2267 int local_workspace[1000];
2270 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2273 rc = internal_dfa_exec(
2274 md, /* fixed match data */
2275 start_code + GET(code, 1), /* this subexpression's code */
2276 ptr, /* where we currently are */
2277 ptr - start_subject, /* start offset */
2278 local_offsets, /* offset vector */
2279 sizeof(local_offsets)/sizeof(int), /* size of same */
2280 local_workspace, /* workspace vector */
2281 sizeof(local_workspace)/sizeof(int), /* size of same */
2282 ims, /* the current ims flags */
2283 rlevel, /* function recursion level */
2284 recursing + 1); /* regex recurse level */
2286 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2287 recursing + 1, rc));
2289 /* Ran out of internal offsets */
2291 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2293 /* For each successful matched substring, set up the next state with a
2294 count of characters to skip before trying it. Note that the count is in
2295 characters, not bytes. */
2299 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2301 const uschar *p = start_subject + local_offsets[rc];
2302 const uschar *pp = start_subject + local_offsets[rc+1];
2303 int charcount = local_offsets[rc+1] - local_offsets[rc];
2304 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2307 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2311 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2315 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2319 /*-----------------------------------------------------------------*/
2322 int local_offsets[2];
2323 int local_workspace[1000];
2325 int rc = internal_dfa_exec(
2326 md, /* fixed match data */
2327 code, /* this subexpression's code */
2328 ptr, /* where we currently are */
2329 ptr - start_subject, /* start offset */
2330 local_offsets, /* offset vector */
2331 sizeof(local_offsets)/sizeof(int), /* size of same */
2332 local_workspace, /* workspace vector */
2333 sizeof(local_workspace)/sizeof(int), /* size of same */
2334 ims, /* the current ims flags */
2335 rlevel, /* function recursion level */
2336 recursing); /* pass on regex recursion */
2340 const uschar *end_subpattern = code;
2341 int charcount = local_offsets[1] - local_offsets[0];
2342 int next_state_offset, repeat_state_offset;
2344 do { end_subpattern += GET(end_subpattern, 1); }
2345 while (*end_subpattern == OP_ALT);
2346 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2348 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2349 arrange for the repeat state also to be added to the relevant list.
2350 Calculate the offset, or set -1 for no repeat. */
2352 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2353 *end_subpattern == OP_KETRMIN)?
2354 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2356 /* If we have matched an empty string, add the next state at the
2357 current character pointer. This is important so that the duplicate
2358 checking kicks in, which is what breaks infinite loops that match an
2363 ADD_ACTIVE(next_state_offset, 0);
2366 /* Optimization: if there are no more active states, and there
2367 are no new states yet set up, then skip over the subject string
2368 right here, to save looping. Otherwise, set up the new state to swing
2369 into action when the end of the substring is reached. */
2371 else if (i + 1 >= active_count && new_count == 0)
2375 ADD_NEW(next_state_offset, 0);
2377 /* If we are adding a repeat state at the new character position,
2378 we must fudge things so that it is the only current state.
2379 Otherwise, it might be a duplicate of one we processed before, and
2380 that would cause it to be skipped. */
2382 if (repeat_state_offset >= 0)
2384 next_active_state = active_states;
2387 ADD_ACTIVE(repeat_state_offset, 0);
2392 const uschar *p = start_subject + local_offsets[0];
2393 const uschar *pp = start_subject + local_offsets[1];
2394 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2395 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2396 if (repeat_state_offset >= 0)
2397 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2401 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2406 /* ========================================================================== */
2407 /* Handle callouts */
2410 if (pcre_callout != NULL)
2413 pcre_callout_block cb;
2414 cb.version = 1; /* Version 1 of the callout block */
2415 cb.callout_number = code[1];
2416 cb.offset_vector = offsets;
2417 cb.subject = (PCRE_SPTR)start_subject;
2418 cb.subject_length = end_subject - start_subject;
2419 cb.start_match = current_subject - start_subject;
2420 cb.current_position = ptr - start_subject;
2421 cb.pattern_position = GET(code, 2);
2422 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2424 cb.capture_last = -1;
2425 cb.callout_data = md->callout_data;
2426 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2427 if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2432 /* ========================================================================== */
2433 default: /* Unsupported opcode */
2434 return PCRE_ERROR_DFA_UITEM;
2437 NEXT_ACTIVE_STATE: continue;
2439 } /* End of loop scanning active states */
2441 /* We have finished the processing at the current subject character. If no
2442 new states have been set for the next character, we have found all the
2443 matches that we are going to find. If we are at the top level and partial
2444 matching has been requested, check for appropriate conditions. */
2448 if (match_count < 0 && /* No matches found */
2449 rlevel == 1 && /* Top level match function */
2450 (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2451 ptr >= end_subject && /* Reached end of subject */
2452 ptr > current_subject) /* Matched non-empty string */
2454 if (offsetcount >= 2)
2456 offsets[0] = current_subject - start_subject;
2457 offsets[1] = end_subject - start_subject;
2459 match_count = PCRE_ERROR_PARTIAL;
2462 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2463 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2465 break; /* In effect, "return", but see the comment below */
2468 /* One or more states are active for the next character. */
2470 ptr += clen; /* Advance to next subject character */
2471 } /* Loop to move along the subject string */
2473 /* Control gets here from "break" a few lines above. We do it this way because
2474 if we use "return" above, we have compiler trouble. Some compilers warn if
2475 there's nothing here because they think the function doesn't return a value. On
2476 the other hand, if we put a dummy statement here, some more clever compilers
2477 complain that it can't be reached. Sigh. */
2485 /*************************************************
2486 * Execute a Regular Expression - DFA engine *
2487 *************************************************/
2489 /* This external function applies a compiled re to a subject string using a DFA
2490 engine. This function calls the internal function multiple times if the pattern
2494 argument_re points to the compiled expression
2495 extra_data points to extra data or is NULL
2496 subject points to the subject string
2497 length length of subject string (may contain binary zeros)
2498 start_offset where to start in the subject string
2500 offsets vector of match offsets
2501 offsetcount size of same
2502 workspace workspace vector
2503 wscount size of same
2505 Returns: > 0 => number of match offset pairs placed in offsets
2506 = 0 => offsets overflowed; longest matches are present
2507 -1 => failed to match
2508 < -1 => some kind of unexpected problem
2512 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2513 const char *subject, int length, int start_offset, int options, int *offsets,
2514 int offsetcount, int *workspace, int wscount)
2516 real_pcre *re = (real_pcre *)argument_re;
2517 dfa_match_data match_block;
2518 dfa_match_data *md = &match_block;
2519 BOOL utf8, anchored, startline, firstline;
2520 const uschar *current_subject, *end_subject, *lcc;
2522 pcre_study_data internal_study;
2523 const pcre_study_data *study = NULL;
2524 real_pcre internal_re;
2526 const uschar *req_byte_ptr;
2527 const uschar *start_bits = NULL;
2528 BOOL first_byte_caseless = FALSE;
2529 BOOL req_byte_caseless = FALSE;
2530 int first_byte = -1;
2535 /* Plausibility checks */
2537 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2538 if (re == NULL || subject == NULL || workspace == NULL ||
2539 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2540 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2541 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2543 /* We need to find the pointer to any study data before we test for byte
2544 flipping, so we scan the extra_data block first. This may set two fields in the
2545 match block, so we must initialize them beforehand. However, the other fields
2546 in the match block must not be set until after the byte flipping. */
2548 md->tables = re->tables;
2549 md->callout_data = NULL;
2551 if (extra_data != NULL)
2553 unsigned int flags = extra_data->flags;
2554 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2555 study = (const pcre_study_data *)extra_data->study_data;
2556 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2557 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2558 return PCRE_ERROR_DFA_UMLIMIT;
2559 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2560 md->callout_data = extra_data->callout_data;
2561 if ((flags & PCRE_EXTRA_TABLES) != 0)
2562 md->tables = extra_data->tables;
2565 /* Check that the first field in the block is the magic number. If it is not,
2566 test for a regex that was compiled on a host of opposite endianness. If this is
2567 the case, flipped values are put in internal_re and internal_study if there was
2570 if (re->magic_number != MAGIC_NUMBER)
2572 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2573 if (re == NULL) return PCRE_ERROR_BADMAGIC;
2574 if (study != NULL) study = &internal_study;
2577 /* Set some local values */
2579 current_subject = (const unsigned char *)subject + start_offset;
2580 end_subject = (const unsigned char *)subject + length;
2581 req_byte_ptr = current_subject - 1;
2584 utf8 = (re->options & PCRE_UTF8) != 0;
2589 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2590 (re->options & PCRE_ANCHORED) != 0;
2592 /* The remaining fixed data for passing around. */
2594 md->start_code = (const uschar *)argument_re +
2595 re->name_table_offset + re->name_count * re->name_entry_size;
2596 md->start_subject = (const unsigned char *)subject;
2597 md->end_subject = end_subject;
2598 md->moptions = options;
2599 md->poptions = re->options;
2601 /* If the BSR option is not set at match time, copy what was set
2604 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2606 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2607 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2609 else md->moptions |= PCRE_BSR_ANYCRLF;
2613 /* Handle different types of newline. The three bits give eight cases. If
2614 nothing is set at run time, whatever was used at compile time applies. */
2616 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2619 case 0: newline = NEWLINE; break; /* Compile-time default */
2620 case PCRE_NEWLINE_CR: newline = '\r'; break;
2621 case PCRE_NEWLINE_LF: newline = '\n'; break;
2622 case PCRE_NEWLINE_CR+
2623 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2624 case PCRE_NEWLINE_ANY: newline = -1; break;
2625 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2626 default: return PCRE_ERROR_BADNEWLINE;
2631 md->nltype = NLTYPE_ANYCRLF;
2633 else if (newline < 0)
2635 md->nltype = NLTYPE_ANY;
2639 md->nltype = NLTYPE_FIXED;
2643 md->nl[0] = (newline >> 8) & 255;
2644 md->nl[1] = newline & 255;
2649 md->nl[0] = newline;
2653 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2654 back the character offset. */
2657 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2659 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2660 return PCRE_ERROR_BADUTF8;
2661 if (start_offset > 0 && start_offset < length)
2663 int tb = ((uschar *)subject)[start_offset];
2667 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2673 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2674 is a feature that makes it possible to save compiled regexes and re-use them
2675 in other programs later. */
2677 if (md->tables == NULL) md->tables = _pcre_default_tables;
2679 /* The lower casing table and the "must be at the start of a line" flag are
2680 used in a loop when finding where to start. */
2682 lcc = md->tables + lcc_offset;
2683 startline = (re->flags & PCRE_STARTLINE) != 0;
2684 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2686 /* Set up the first character to match, if available. The first_byte value is
2687 never set for an anchored regular expression, but the anchoring may be forced
2688 at run time, so we have to test for anchoring. The first char may be unset for
2689 an unanchored pattern, of course. If there's no first char and the pattern was
2690 studied, there may be a bitmap of possible first characters. */
2694 if ((re->flags & PCRE_FIRSTSET) != 0)
2696 first_byte = re->first_byte & 255;
2697 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2698 first_byte = lcc[first_byte];
2702 if (startline && study != NULL &&
2703 (study->options & PCRE_STUDY_MAPPED) != 0)
2704 start_bits = study->start_bits;
2708 /* For anchored or unanchored matches, there may be a "last known required character" set. */
2711 if ((re->flags & PCRE_REQCHSET) != 0)
2713 req_byte = re->req_byte & 255;
2714 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2715 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2718 /* Call the main matching function, looping for a non-anchored regex after a
2719 failed match. Unless restarting, optimize by moving to the first match
2720 character if possible, when not anchored. Then unless wanting a partial match,
2721 check for a required later character. */
2727 if ((options & PCRE_DFA_RESTART) == 0)
2729 const uschar *save_end_subject = end_subject;
2731 /* Advance to a unique first char if possible. If firstline is TRUE, the
2732 start of the match is constrained to the first line of a multiline string.
2733 Implement this by temporarily adjusting end_subject so that we stop
2734 scanning at a newline. If the match fails at the newline, later code breaks this loop. */
2739 const uschar *t = current_subject;
2740 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2744 if (first_byte >= 0)
2746 if (first_byte_caseless)
2747 while (current_subject < end_subject &&
2748 lcc[*current_subject] != first_byte)
2751 while (current_subject < end_subject && *current_subject != first_byte)
2755 /* Or to just after a linebreak for a multiline match if possible */
2759 if (current_subject > md->start_subject + start_offset)
2761 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2764 /* If we have just passed a CR and the newline option is ANY or
2765 ANYCRLF, and we are now at a LF, advance the match position by one more character. */
2768 if (current_subject[-1] == '\r' &&
2769 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2770 current_subject < end_subject &&
2771 *current_subject == '\n')
2776 /* Or to a non-unique first char after study */
2778 else if (start_bits != NULL)
2780 while (current_subject < end_subject)
2782 register unsigned int c = *current_subject;
2783 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2788 /* Restore fudged end_subject */
2790 end_subject = save_end_subject;
2793 /* If req_byte is set, we know that that character must appear in the subject
2794 for the match to succeed. If the first character is set, req_byte must be
2795 later in the subject; otherwise the test starts at the match point. This
2796 optimization can save a huge amount of work in patterns with nested unlimited
2797 repeats that aren't going to match. Writing separate code for cased/caseless
2798 versions makes it go faster, as does using an autoincrement and backing off on a match.
2801 HOWEVER: when the subject string is very, very long, searching to its end can
2802 take a long time, and give bad performance on quite ordinary patterns. This
2803 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2804 don't do this when the string is sufficiently long.
2806 ALSO: this processing is disabled when partial matching is requested. */
2809 if (req_byte >= 0 &&
2810 end_subject - current_subject < REQ_BYTE_MAX &&
2811 (options & PCRE_PARTIAL) == 0)
2813 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2815 /* We don't need to repeat the search if we haven't yet reached the
2816 place we found it at last time. */
2818 if (p > req_byte_ptr)
2820 if (req_byte_caseless)
2822 while (p < end_subject)
2824 register int pp = *p++;
2825 if (pp == req_byte || pp == req_byte2) { p--; break; }
2830 while (p < end_subject)
2832 if (*p++ == req_byte) { p--; break; }
2836 /* If we can't find the required character, break the matching loop,
2837 which will cause a return of PCRE_ERROR_NOMATCH. */
2839 if (p >= end_subject) break;
2841 /* If we have found the required character, save the point where we
2842 found it, so that we don't search again next time round the loop if
2843 the start hasn't passed this character yet. */
2849 /* OK, now we can do the business */
2851 rc = internal_dfa_exec(
2852 md, /* fixed match data */
2853 md->start_code, /* this subexpression's code */
2854 current_subject, /* where we currently are */
2855 start_offset, /* start offset in subject */
2856 offsets, /* offset vector */
2857 offsetcount, /* size of same */
2858 workspace, /* workspace vector */
2859 wscount, /* size of same */
2860 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2861 0, /* function recurse level */
2862 0); /* regex recurse level */
2864 /* Anything other than "no match" means we are done, always; otherwise, carry
2865 on only if not anchored. */
2867 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2869 /* Advance to the next subject character unless we are at the end of a line
2870 and firstline is set. */
2872 if (firstline && IS_NEWLINE(current_subject)) break;
2876 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2879 if (current_subject > end_subject) break;
2881 /* If we have just passed a CR and we are now at a LF, and the pattern does
2882 not contain any explicit matches for \r or \n, and the newline option is CRLF
2883 or ANY or ANYCRLF, advance the match position by one more character. */
2885 if (current_subject[-1] == '\r' &&
2886 current_subject < end_subject &&
2887 *current_subject == '\n' &&
2888 (re->flags & PCRE_HASCRORLF) == 0 &&
2889 (md->nltype == NLTYPE_ANY ||
2890 md->nltype == NLTYPE_ANYCRLF ||
2894 } /* "Bumpalong" loop */
2896 return PCRE_ERROR_NOMATCH;
2899 /* End of pcre_dfa_exec.c */