1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
9 Written by Philip Hazel
10 Copyright (c) 1997-2012 University of Cambridge
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
42 /* This module contains the external function pcre_dfa_exec(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
/* These three names are defined before "pcre_internal.h" is included.
NOTE(review): presumably they are consumed by macros declared in that header
(such as the newline tests used further down with "md" in scope) - confirm
against pcre_internal.h before renaming or moving them. */
80 #define NLBLOCK md /* Block containing newline information */
81 #define PSSTART start_subject /* Field containing processed string start */
82 #define PSEND end_subject /* Field containing processed string end */
84 #include "pcre_internal.h"
87 /* For use to indent debugging output */
92 /*************************************************
93 * Code parameters and static tables *
94 *************************************************/
96 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 into others, under special conditions. A gap of 20 between the blocks should be
98 enough. The resulting opcodes don't have to be less than 256 because they are
99 never stored, so we push them well clear of the normal opcodes. */
/* Each offset is added to codevalue when a type-repeat opcode (OP_TYPESTAR or
above) has the corresponding item (OP_PROP, OP_EXTUNI, OP_ANYNL, OP_HSPACE,
OP_VSPACE) as its argument. The main switch then uses case labels of the form
OP_xxx_EXTRA + OP_TYPEyyy for those combinations. */
101 #define OP_PROP_EXTRA 300
102 #define OP_EXTUNI_EXTRA 320
103 #define OP_ANYNL_EXTRA 340
104 #define OP_HSPACE_EXTRA 360
105 #define OP_VSPACE_EXTRA 380
108 /* This table identifies those opcodes that are followed immediately by a
109 character that is to be tested in some way. This makes it possible to
110 centralize the loading of these characters. In the case of Type * etc, the
111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 small value. Non-zero values in the table are the offsets from the opcode where
113 the character is to be found. ***NOTE*** If the start of this table is
114 modified, the three tables that follow must also be modified. */
/* Indexed by opcode value (coptable[codevalue]). When an entry is greater than
zero, internal_dfa_exec() loads the item found at that offset from the opcode
into d before dispatching on the opcode; otherwise d is set to NOTACHAR.
sizeof(coptable) is checked against OP_TABLE_LENGTH by a deliberately colliding
pair of case labels in internal_dfa_exec(), so a table of the wrong length is a
compile-time error. */
116 static const pcre_uint8 coptable[] = {
118 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 0, 0, 0, /* Any, AllAny, Anybyte */
122 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
124 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
129 /* Positive single-char repeats */
130 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
132 1+IMM2_SIZE, /* exact */
133 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
134 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
135 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
136 1+IMM2_SIZE, /* exact I */
137 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
138 /* Negative single-char repeats - only for chars < 256 */
139 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
140 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
141 1+IMM2_SIZE, /* NOT exact */
142 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
143 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
144 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
145 1+IMM2_SIZE, /* NOT exact I */
146 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
147 /* Positive type repeats */
148 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
149 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
150 1+IMM2_SIZE, /* Type exact */
151 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
152 /* Character class & ref repeats */
153 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
154 0, 0, /* CRRANGE, CRMINRANGE */
157 0, /* XCLASS - variable length */
170 0, /* Assert behind */
171 0, /* Assert behind not */
172 0, 0, /* ONCE, ONCE_NC */
173 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
174 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
175 0, 0, /* CREF, NCREF */
176 0, 0, /* RREF, NRREF */
178 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
179 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
180 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
181 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
182 0, 0 /* CLOSE, SKIPZERO */
185 /* This table identifies those opcodes that inspect a character. It is used to
186 remember the fact that a character could have been inspected when the end of
187 the subject is reached. ***NOTE*** If the start of this table is modified, the
188 two tables that follow must also be modified. */
/* Indexed by opcode value (poptable[codevalue]). internal_dfa_exec() consults
it when clen == 0, i.e. when there is no current character because the end of
the subject has been reached: a non-zero entry sets could_continue to TRUE.
sizeof(poptable) is checked against OP_TABLE_LENGTH by a deliberately colliding
pair of case labels in internal_dfa_exec(), so a table of the wrong length is a
compile-time error. */
190 static const pcre_uint8 poptable[] = {
192 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
193 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
194 1, 1, 1, /* Any, AllAny, Anybyte */
196 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
198 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
203 /* Positive single-char repeats */
204 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
205 1, 1, 1, /* upto, minupto, exact */
206 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
207 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
208 1, 1, 1, /* upto I, minupto I, exact I */
209 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
210 /* Negative single-char repeats - only for chars < 256 */
211 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
212 1, 1, 1, /* NOT upto, minupto, exact */
213 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
214 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
215 1, 1, 1, /* NOT upto I, minupto I, exact I */
216 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
217 /* Positive type repeats */
218 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
219 1, 1, 1, /* Type upto, minupto, exact */
220 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
221 /* Character class & ref repeats */
222 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
223 1, 1, /* CRRANGE, CRMINRANGE */
226 1, /* XCLASS - variable length */
239 0, /* Assert behind */
240 0, /* Assert behind not */
241 0, 0, /* ONCE, ONCE_NC */
242 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
243 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
244 0, 0, /* CREF, NCREF */
245 0, 0, /* RREF, NRREF */
247 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
248 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
249 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
250 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
251 0, 0 /* CLOSE, SKIPZERO */
254 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
/* toptable1[op] supplies the ctypes[] bit mask used by the character-type test
((ctypes[c] & toptable1[op]) ^ toptable2[op]) != 0. The OP_ANY and OP_ALLANY
entries are 0, so for those two the outcome of the table test is decided by
toptable2 alone. */
257 static const pcre_uint8 toptable1[] = {
259 ctype_digit, ctype_digit,
260 ctype_space, ctype_space,
261 ctype_word, ctype_word,
262 0, 0 /* OP_ANY, OP_ALLANY */
/* toptable2[op] is XORed with the masked ctypes[] bit selected through
toptable1[op]; the character-type test succeeds when the result is non-zero.
The OP_ANY and OP_ALLANY entries are 1 while their toptable1 entries are 0, so
the table test by itself always succeeds for those two opcodes. */
265 static const pcre_uint8 toptable2[] = {
270 1, 1 /* OP_ANY, OP_ALLANY */
274 /* Structure for holding data about a particular state, which is in effect the
275 current data for an active path through the match tree. It must consist
276 entirely of ints because the working vector we are passed, and which we put
277 these structures in, is a vector of ints. */
/* One entry in a state list. internal_dfa_exec() overlays an array of these on
the int workspace it is given. A negative offset marks a state that is being
held back: while data is greater than zero the state is re-queued for the next
character with data decremented, and once data reaches zero the offset is
negated and the state is processed normally. */
279 typedef struct stateblock {
280 int offset; /* Offset to opcode */
281 int count; /* Count for repeats */
282 int data; /* Some use extra data */
/* Number of ints occupied by one stateblock. internal_dfa_exec() rounds the
workspace size down to a multiple of twice this value and divides by it, which
gives the capacity of each of the two state lists held in the workspace. */
285 #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
289 /*************************************************
290 * Print character string *
291 *************************************************/
293 /* Character string printing function for debugging.
297 length number of bytes
304 pchars(const pcre_uchar *p, int length, FILE *f)
/* NOTE(review): each code unit is passed to isprint() unchanged. If pcre_uchar
can be wider than unsigned char in some builds, values above UCHAR_MAX are
outside the domain for which isprint() is defined - confirm, and guard the
call if so. */
309 if (isprint(c = *(p++)))
312 fprintf(f, "\\x%02x", c);
319 /*************************************************
320 * Execute a Regular Expression - DFA engine *
321 *************************************************/
323 /* This internal function applies a compiled pattern to a subject string,
324 starting at a given point, using a DFA engine. This function is called from the
325 external one, possibly multiple times if the pattern is not anchored. The
326 function calls itself recursively for some kinds of subpattern.
329 md the match_data block with fixed information
330 this_start_code the opening bracket of this subexpression's code
331 current_subject where we currently are in the subject string
332 start_offset start offset in the subject string
333 offsets vector to contain the matching string offsets
334 offsetcount size of same
335 workspace vector of workspace
337 rlevel function call recursion level
339 Returns: > 0 => number of match offset pairs placed in offsets
340 = 0 => offsets overflowed; longest matches are present
341 -1 => failed to match
342 < -1 => some kind of unexpected problem
344 The following macros are used for adding states to the two state vectors (one
345 for the current character, one for the following character). */
/* ADD_ACTIVE(x,y): store a state with opcode offset x and repeat count y at
next_active_state (the list for the current character) and advance that
pointer. If the list already holds wscount states, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE instead. active_count is incremented in either
case because the capacity test uses post-increment. The expansion is an
if/else that ends in a bare return statement, so the call site supplies the
final semicolon and should brace the call when it sits inside another if.
Relies on the enclosing function's active_count, wscount and
next_active_state. */
347 #define ADD_ACTIVE(x,y) \
348 if (active_count++ < wscount) \
350 next_active_state->offset = (x); \
351 next_active_state->count = (y); \
352 next_active_state++; \
353 DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
355 else return PCRE_ERROR_DFA_WSSIZE
/* ADD_ACTIVE_DATA(x,y,z): store a state with opcode offset x, repeat count y
and data value z at next_active_state (the list for the current character) and
advance that pointer. If the list already holds wscount states, the enclosing
function returns PCRE_ERROR_DFA_WSSIZE instead. active_count is incremented in
either case because the capacity test uses post-increment. The expansion is an
if/else that ends in a bare return statement, so the call site supplies the
final semicolon. Relies on the enclosing function's active_count, wscount and
next_active_state. */
357 #define ADD_ACTIVE_DATA(x,y,z) \
358 if (active_count++ < wscount) \
360 next_active_state->offset = (x); \
361 next_active_state->count = (y); \
362 next_active_state->data = (z); \
363 next_active_state++; \
364 DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
366 else return PCRE_ERROR_DFA_WSSIZE
/* ADD_NEW(x,y): store a state with opcode offset x and repeat count y in the
slot at next_new_state, i.e. on the list that is considered once the character
pointer has moved on. If that list already holds wscount states, the enclosing
function returns PCRE_ERROR_DFA_WSSIZE instead. new_count is incremented in
either case because the capacity test uses post-increment. The expansion is an
if/else that ends in a bare return statement, so the call site supplies the
final semicolon. Relies on the enclosing function's new_count, wscount and
next_new_state. */
368 #define ADD_NEW(x,y) \
369 if (new_count++ < wscount) \
371 next_new_state->offset = (x); \
372 next_new_state->count = (y); \
374 DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
376 else return PCRE_ERROR_DFA_WSSIZE
/* ADD_NEW_DATA(x,y,z): store a state with opcode offset x, repeat count y and
data value z in the slot at next_new_state, i.e. on the list that is considered
once the character pointer has moved on. If that list already holds wscount
states, the enclosing function returns PCRE_ERROR_DFA_WSSIZE instead. new_count
is incremented in either case because the capacity test uses post-increment.
The expansion is an if/else that ends in a bare return statement, so the call
site supplies the final semicolon. Relies on the enclosing function's
new_count, wscount and next_new_state. */
378 #define ADD_NEW_DATA(x,y,z) \
379 if (new_count++ < wscount) \
381 next_new_state->offset = (x); \
382 next_new_state->count = (y); \
383 next_new_state->data = (z); \
385 DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
387 else return PCRE_ERROR_DFA_WSSIZE
389 /* And now, here is the code */
394 const pcre_uchar *this_start_code,
395 const pcre_uchar *current_subject,
403 stateblock *active_states, *new_states, *temp_states;
404 stateblock *next_active_state, *next_new_state;
406 const pcre_uint8 *ctypes, *lcc, *fcc;
407 const pcre_uchar *ptr;
408 const pcre_uchar *end_code, *first_op;
410 dfa_recursion_info new_recursive;
412 int active_count, new_count, match_count;
414 /* Some fields in the md block are frequently referenced, so we load them into
415 independent variables in the hope that this will perform better. */
417 const pcre_uchar *start_subject = md->start_subject;
418 const pcre_uchar *end_subject = md->end_subject;
419 const pcre_uchar *start_code = md->start_code;
422 BOOL utf = (md->poptions & PCRE_UTF8) != 0;
431 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
432 (2 * INTS_PER_STATEBLOCK);
434 DPRINTF(("\n%.*s---------------------\n"
435 "%.*sCall to internal_dfa_exec f=%d\n",
436 rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
438 ctypes = md->tables + ctypes_offset;
439 lcc = md->tables + lcc_offset;
440 fcc = md->tables + fcc_offset;
442 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
444 active_states = (stateblock *)(workspace + 2);
445 next_new_state = new_states = active_states + wscount;
448 first_op = this_start_code + 1 + LINK_SIZE +
449 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
450 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
453 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
454 the alternative states onto the list, and find out where the end is. This
455 makes it possible to use this function recursively, when we want to stop at a
456 matching internal ket rather than at the end.
458 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
459 a backward assertion. In that case, we have to find out the maximum amount to
460 move back, and set up each alternative appropriately. */
462 if (*first_op == OP_REVERSE)
467 end_code = this_start_code;
470 int back = GET(end_code, 2+LINK_SIZE);
471 if (back > max_back) max_back = back;
472 end_code += GET(end_code, 1);
474 while (*end_code == OP_ALT);
476 /* If we can't go back the amount required for the longest lookbehind
477 pattern, go back as far as we can; some alternatives may still be viable. */
480 /* In character mode we have to step back character by character */
484 for (gone_back = 0; gone_back < max_back; gone_back++)
486 if (current_subject <= start_subject) break;
488 ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
494 /* In byte-mode we can do this quickly. */
497 gone_back = (current_subject - max_back < start_subject)?
498 (int)(current_subject - start_subject) : max_back;
499 current_subject -= gone_back;
502 /* Save the earliest consulted character */
504 if (current_subject < md->start_used_ptr)
505 md->start_used_ptr = current_subject;
507 /* Now we can process the individual branches. */
509 end_code = this_start_code;
512 int back = GET(end_code, 2+LINK_SIZE);
513 if (back <= gone_back)
515 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
516 ADD_NEW_DATA(-bstate, 0, gone_back - back);
518 end_code += GET(end_code, 1);
520 while (*end_code == OP_ALT);
523 /* This is the code for a "normal" subpattern (not a backward assertion). The
524 start of a whole pattern is always one of these. If we are at the top level,
525 we may be asked to restart matching from the same point that we reached for a
526 previous partial match. We still have to scan through the top-level branches to
527 find the end state. */
531 end_code = this_start_code;
535 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
537 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
538 new_count = workspace[1];
540 memcpy(new_states, active_states, new_count * sizeof(stateblock));
547 int length = 1 + LINK_SIZE +
548 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
549 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
553 ADD_NEW((int)(end_code - start_code + length), 0);
554 end_code += GET(end_code, 1);
555 length = 1 + LINK_SIZE;
557 while (*end_code == OP_ALT);
561 workspace[0] = 0; /* Bit indicating which vector is current */
563 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
565 /* Loop for scanning the subject */
567 ptr = current_subject;
574 BOOL could_continue = FALSE;
576 /* Make the new state list into the active state list and empty the
579 temp_states = active_states;
580 active_states = new_states;
581 new_states = temp_states;
582 active_count = new_count;
585 workspace[0] ^= 1; /* Remember for the restarting feature */
586 workspace[1] = active_count;
589 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
590 pchars(ptr, STRLEN_UC(ptr), stdout);
593 printf("%.*sActive states: ", rlevel*2-2, SP);
594 for (i = 0; i < active_count; i++)
595 printf("%d/%d ", active_states[i].offset, active_states[i].count);
599 /* Set the pointers for adding new states */
601 next_active_state = active_states + active_count;
602 next_new_state = new_states;
604 /* Load the current character from the subject outside the loop, as many
605 different states may want to look at it, and we assume that at least one
608 if (ptr < end_subject)
610 clen = 1; /* Number of bytes in the character */
612 if (utf) { GETCHARLEN(c, ptr, clen); } else
613 #endif /* SUPPORT_UTF */
618 clen = 0; /* This indicates the end of the subject */
619 c = NOTACHAR; /* This value should never actually be used */
622 /* Scan up the active states and act on each one. The result of an action
623 may be to add more states to the currently active list (e.g. on hitting a
624 parenthesis) or it may be to put states on the new list, for considering
625 when we move the character pointer on. */
627 for (i = 0; i < active_count; i++)
629 stateblock *current_state = active_states + i;
630 BOOL caseless = FALSE;
631 const pcre_uchar *code;
632 int state_offset = current_state->offset;
633 int count, codevalue, rrc;
636 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
637 if (clen == 0) printf("EOL\n");
638 else if (c > 32 && c < 127) printf("'%c'\n", c);
639 else printf("0x%02x\n", c);
642 /* A negative offset is a special case meaning "hold off going to this
643 (negated) state until the number of characters in the data field have
646 if (state_offset < 0)
648 if (current_state->data > 0)
650 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
651 ADD_NEW_DATA(state_offset, current_state->count,
652 current_state->data - 1);
657 current_state->offset = state_offset = -state_offset;
661 /* Check for a duplicate state with the same count, and skip if found.
662 See the note at the head of this module about the possibility of improving
665 for (j = 0; j < i; j++)
667 if (active_states[j].offset == state_offset &&
668 active_states[j].count == current_state->count)
670 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
671 goto NEXT_ACTIVE_STATE;
675 /* The state offset is the offset to the opcode */
677 code = start_code + state_offset;
680 /* If this opcode inspects a character, but we are at the end of the
681 subject, remember the fact for use when testing for a partial match. */
683 if (clen == 0 && poptable[codevalue] != 0)
684 could_continue = TRUE;
686 /* If this opcode is followed by an inline character, load it. It is
687 tempting to test for the presence of a subject character here, but that
688 is wrong, because sometimes zero repetitions of the subject are
691 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
692 argument that is not a data character - but is always one byte long. We
693 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
694 this case. To keep the other cases fast, convert these ones to new opcodes.
697 if (coptable[codevalue] > 0)
701 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
702 #endif /* SUPPORT_UTF */
703 d = code[coptable[codevalue]];
704 if (codevalue >= OP_TYPESTAR)
708 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
710 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
711 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
712 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
714 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
716 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
723 dlen = 0; /* Not strictly necessary, but compilers moan */
724 d = NOTACHAR; /* if these variables are not set. */
728 /* Now process the individual opcodes */
732 /* ========================================================================== */
733 /* These cases are never obeyed. This is a fudge that causes a compile-
734 time error if the vectors coptable or poptable, which are indexed by
735 opcode, are not the correct length. It seems to be the only way to do
736 such a check at compile time, as the sizeof() operator does not work
737 in the C preprocessor. */
739 case OP_TABLE_LENGTH:
740 case OP_TABLE_LENGTH +
741 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
742 (sizeof(poptable) == OP_TABLE_LENGTH)):
745 /* ========================================================================== */
746 /* Reached a closing bracket. If not at the end of the pattern, carry
747 on with the next opcode. For repeating opcodes, also add the repeat
748 state. Note that KETRPOS will always be encountered at the end of the
749 subpattern, because the possessive subpattern repeats are always handled
750 using recursive calls. Thus, it never adds any new states.
752 At the end of the (sub)pattern, unless we have an empty string and
753 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
754 start of the subject, save the match data, shifting up all previous
755 matches so we always have the longest first. */
761 if (code != end_code)
763 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
764 if (codevalue != OP_KET)
766 ADD_ACTIVE(state_offset - GET(code, 1), 0);
771 if (ptr > current_subject ||
772 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
773 ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
774 current_subject > start_subject + md->start_offset)))
776 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
777 else if (match_count > 0 && ++match_count * 2 > offsetcount)
779 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
780 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
781 if (offsetcount >= 2)
783 offsets[0] = (int)(current_subject - start_subject);
784 offsets[1] = (int)(ptr - start_subject);
785 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
786 offsets[1] - offsets[0], current_subject));
788 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
790 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
791 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
792 match_count, rlevel*2-2, SP));
799 /* ========================================================================== */
800 /* These opcodes add to the current list of states without looking
801 at the current character. */
803 /*-----------------------------------------------------------------*/
805 do { code += GET(code, 1); } while (*code == OP_ALT);
806 ADD_ACTIVE((int)(code - start_code), 0);
809 /*-----------------------------------------------------------------*/
814 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
815 code += GET(code, 1);
817 while (*code == OP_ALT);
820 /*-----------------------------------------------------------------*/
823 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
824 code += GET(code, 1);
825 while (*code == OP_ALT)
827 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
828 code += GET(code, 1);
832 /*-----------------------------------------------------------------*/
835 ADD_ACTIVE(state_offset + 1, 0);
836 code += 1 + GET(code, 2);
837 while (*code == OP_ALT) code += GET(code, 1);
838 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
841 /*-----------------------------------------------------------------*/
843 code += 1 + GET(code, 2);
844 while (*code == OP_ALT) code += GET(code, 1);
845 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
848 /*-----------------------------------------------------------------*/
850 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
851 { ADD_ACTIVE(state_offset + 1, 0); }
854 /*-----------------------------------------------------------------*/
856 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
857 (ptr != end_subject && WAS_NEWLINE(ptr)))
858 { ADD_ACTIVE(state_offset + 1, 0); }
861 /*-----------------------------------------------------------------*/
863 if (ptr >= end_subject)
865 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
866 could_continue = TRUE;
867 else { ADD_ACTIVE(state_offset + 1, 0); }
871 /*-----------------------------------------------------------------*/
873 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
876 /*-----------------------------------------------------------------*/
878 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
882 /* ========================================================================== */
883 /* These opcodes inspect the next subject character, and sometimes
884 the previous one as well, but do not have an argument. The variable
885 clen contains the length of the current character and is zero if we are
886 at the end of the subject. */
888 /*-----------------------------------------------------------------*/
890 if (clen > 0 && !IS_NEWLINE(ptr))
891 { ADD_NEW(state_offset + 1, 0); }
894 /*-----------------------------------------------------------------*/
897 { ADD_NEW(state_offset + 1, 0); }
900 /*-----------------------------------------------------------------*/
902 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
903 could_continue = TRUE;
904 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
905 { ADD_ACTIVE(state_offset + 1, 0); }
908 /*-----------------------------------------------------------------*/
910 if ((md->moptions & PCRE_NOTEOL) == 0)
912 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
913 could_continue = TRUE;
914 else if (clen == 0 ||
915 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
916 (ptr == end_subject - md->nllen)
918 { ADD_ACTIVE(state_offset + 1, 0); }
922 /*-----------------------------------------------------------------*/
924 if ((md->moptions & PCRE_NOTEOL) == 0)
926 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
927 could_continue = TRUE;
928 else if (clen == 0 ||
929 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
930 { ADD_ACTIVE(state_offset + 1, 0); }
932 else if (IS_NEWLINE(ptr))
933 { ADD_ACTIVE(state_offset + 1, 0); }
936 /*-----------------------------------------------------------------*/
941 if (clen > 0 && c < 256 &&
942 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
943 { ADD_NEW(state_offset + 1, 0); }
946 /*-----------------------------------------------------------------*/
948 case OP_NOT_WHITESPACE:
949 case OP_NOT_WORDCHAR:
950 if (clen > 0 && (c >= 256 ||
951 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
952 { ADD_NEW(state_offset + 1, 0); }
955 /*-----------------------------------------------------------------*/
956 case OP_WORD_BOUNDARY:
957 case OP_NOT_WORD_BOUNDARY:
959 int left_word, right_word;
961 if (ptr > start_subject)
963 const pcre_uchar *temp = ptr - 1;
964 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
966 if (utf) { BACKCHAR(temp); }
968 GETCHARTEST(d, temp);
970 if ((md->poptions & PCRE_UCP) != 0)
972 if (d == '_') left_word = TRUE; else
974 int cat = UCD_CATEGORY(d);
975 left_word = (cat == ucp_L || cat == ucp_N);
980 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
982 else left_word = FALSE;
987 if ((md->poptions & PCRE_UCP) != 0)
989 if (c == '_') right_word = TRUE; else
991 int cat = UCD_CATEGORY(c);
992 right_word = (cat == ucp_L || cat == ucp_N);
997 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
999 else right_word = FALSE;
1001 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1002 { ADD_ACTIVE(state_offset + 1, 0); }
1007 /*-----------------------------------------------------------------*/
1008 /* Check the next character by Unicode property. We will get here only
1009 if the support is in the binary; otherwise a compile-time error occurs.
1018 const pcre_uint8 chartype = UCD_CHARTYPE(c);
1026 OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1031 OK = PRIV(ucp_gentype)[chartype] == code[2];
1035 OK = chartype == code[2];
1039 OK = UCD_SCRIPT(c) == code[2];
1042 /* These are specials for combination cases. */
1045 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1046 PRIV(ucp_gentype)[chartype] == ucp_N;
1049 case PT_SPACE: /* Perl space */
1050 OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1051 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1054 case PT_PXSPACE: /* POSIX space */
1055 OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1056 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1057 c == CHAR_FF || c == CHAR_CR;
1061 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1062 PRIV(ucp_gentype)[chartype] == ucp_N ||
1063 c == CHAR_UNDERSCORE;
1066 /* Should never occur, but keep compilers from grumbling. */
1069 OK = codevalue != OP_PROP;
1073 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1080 /* ========================================================================== */
1081 /* These opcodes likewise inspect the subject character, but have an
1082 argument that is not a data character. It is one of these opcodes:
1083 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1084 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1087 case OP_TYPEMINPLUS:
1088 case OP_TYPEPOSPLUS:
1089 count = current_state->count; /* Already matched */
1090 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1093 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1095 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1096 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1098 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1100 active_count--; /* Remove non-match possibility */
1101 next_active_state--;
1104 ADD_NEW(state_offset, count);
1109 /*-----------------------------------------------------------------*/
1111 case OP_TYPEMINQUERY:
1112 case OP_TYPEPOSQUERY:
1113 ADD_ACTIVE(state_offset + 2, 0);
1116 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1118 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1119 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1121 if (codevalue == OP_TYPEPOSQUERY)
1123 active_count--; /* Remove non-match possibility */
1124 next_active_state--;
1126 ADD_NEW(state_offset + 2, 0);
1131 /*-----------------------------------------------------------------*/
1133 case OP_TYPEMINSTAR:
1134 case OP_TYPEPOSSTAR:
1135 ADD_ACTIVE(state_offset + 2, 0);
1138 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1140 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1141 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1143 if (codevalue == OP_TYPEPOSSTAR)
1145 active_count--; /* Remove non-match possibility */
1146 next_active_state--;
1148 ADD_NEW(state_offset, 0);
1153 /*-----------------------------------------------------------------*/
1155 count = current_state->count; /* Number already matched */
1158 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1160 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1161 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1163 if (++count >= GET2(code, 1))
1164 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1166 { ADD_NEW(state_offset, count); }
1171 /*-----------------------------------------------------------------*/
1173 case OP_TYPEMINUPTO:
1174 case OP_TYPEPOSUPTO:
1175 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1176 count = current_state->count; /* Number already matched */
1179 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1181 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1182 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1184 if (codevalue == OP_TYPEPOSUPTO)
1186 active_count--; /* Remove non-match possibility */
1187 next_active_state--;
1189 if (++count >= GET2(code, 1))
1190 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1192 { ADD_NEW(state_offset, count); }
1197 /* ========================================================================== */
1198 /* These are virtual opcodes that are used when something like
1199 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1200 argument. It keeps the code above fast for the other cases. The argument
1201 is in the d variable. */
1204 case OP_PROP_EXTRA + OP_TYPEPLUS:
1205 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1206 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1207 count = current_state->count; /* Already matched */
1208 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1212 const pcre_uint8 chartype = UCD_CHARTYPE(c);
1220 OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1225 OK = PRIV(ucp_gentype)[chartype] == code[3];
1229 OK = chartype == code[3];
1233 OK = UCD_SCRIPT(c) == code[3];
1236 /* These are specials for combination cases. */
1239 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1240 PRIV(ucp_gentype)[chartype] == ucp_N;
1243 case PT_SPACE: /* Perl space */
1244 OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1245 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1248 case PT_PXSPACE: /* POSIX space */
1249 OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1250 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1251 c == CHAR_FF || c == CHAR_CR;
1255 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1256 PRIV(ucp_gentype)[chartype] == ucp_N ||
1257 c == CHAR_UNDERSCORE;
1260 /* Should never occur, but keep compilers from grumbling. */
1263 OK = codevalue != OP_PROP;
1267 if (OK == (d == OP_PROP))
1269 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1271 active_count--; /* Remove non-match possibility */
1272 next_active_state--;
1275 ADD_NEW(state_offset, count);
1280 /*-----------------------------------------------------------------*/
1281 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1282 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1283 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1284 count = current_state->count; /* Already matched */
1285 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1286 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1288 const pcre_uchar *nptr = ptr + clen;
1290 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1292 active_count--; /* Remove non-match possibility */
1293 next_active_state--;
1295 while (nptr < end_subject)
1299 GETCHARLEN(nd, nptr, ndlen);
1300 if (UCD_CATEGORY(nd) != ucp_M) break;
1305 ADD_NEW_DATA(-state_offset, count, ncount);
1310 /*-----------------------------------------------------------------*/
1311 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1312 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1313 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1314 count = current_state->count; /* Already matched */
1315 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1326 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1330 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1335 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1337 active_count--; /* Remove non-match possibility */
1338 next_active_state--;
1341 ADD_NEW_DATA(-state_offset, count, ncount);
1350 /*-----------------------------------------------------------------*/
1351 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1352 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1353 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1354 count = current_state->count; /* Already matched */
1355 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1376 if (OK == (d == OP_VSPACE))
1378 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1380 active_count--; /* Remove non-match possibility */
1381 next_active_state--;
1384 ADD_NEW_DATA(-state_offset, count, 0);
1389 /*-----------------------------------------------------------------*/
1390 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1391 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1392 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1393 count = current_state->count; /* Already matched */
1394 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1401 case 0x20: /* SPACE */
1402 case 0xa0: /* NBSP */
1403 case 0x1680: /* OGHAM SPACE MARK */
1404 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1405 case 0x2000: /* EN QUAD */
1406 case 0x2001: /* EM QUAD */
1407 case 0x2002: /* EN SPACE */
1408 case 0x2003: /* EM SPACE */
1409 case 0x2004: /* THREE-PER-EM SPACE */
1410 case 0x2005: /* FOUR-PER-EM SPACE */
1411 case 0x2006: /* SIX-PER-EM SPACE */
1412 case 0x2007: /* FIGURE SPACE */
1413 case 0x2008: /* PUNCTUATION SPACE */
1414 case 0x2009: /* THIN SPACE */
1415 case 0x200A: /* HAIR SPACE */
1416 case 0x202f: /* NARROW NO-BREAK SPACE */
1417 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1418 case 0x3000: /* IDEOGRAPHIC SPACE */
1427 if (OK == (d == OP_HSPACE))
1429 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1431 active_count--; /* Remove non-match possibility */
1432 next_active_state--;
1435 ADD_NEW_DATA(-state_offset, count, 0);
1440 /*-----------------------------------------------------------------*/
1442 case OP_PROP_EXTRA + OP_TYPEQUERY:
1443 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1444 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1448 case OP_PROP_EXTRA + OP_TYPESTAR:
1449 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1450 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1455 ADD_ACTIVE(state_offset + 4, 0);
1459 const pcre_uint8 chartype = UCD_CHARTYPE(c);
1467 OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1472 OK = PRIV(ucp_gentype)[chartype] == code[3];
1476 OK = chartype == code[3];
1480 OK = UCD_SCRIPT(c) == code[3];
1483 /* These are specials for combination cases. */
1486 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1487 PRIV(ucp_gentype)[chartype] == ucp_N;
1490 case PT_SPACE: /* Perl space */
1491 OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1492 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1495 case PT_PXSPACE: /* POSIX space */
1496 OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1497 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1498 c == CHAR_FF || c == CHAR_CR;
1502 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1503 PRIV(ucp_gentype)[chartype] == ucp_N ||
1504 c == CHAR_UNDERSCORE;
1507 /* Should never occur, but keep compilers from grumbling. */
1510 OK = codevalue != OP_PROP;
1514 if (OK == (d == OP_PROP))
1516 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1517 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1519 active_count--; /* Remove non-match possibility */
1520 next_active_state--;
1522 ADD_NEW(state_offset + count, 0);
1527 /*-----------------------------------------------------------------*/
1528 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1529 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1530 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1534 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1535 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1536 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1541 ADD_ACTIVE(state_offset + 2, 0);
1542 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1544 const pcre_uchar *nptr = ptr + clen;
1546 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1547 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1549 active_count--; /* Remove non-match possibility */
1550 next_active_state--;
1552 while (nptr < end_subject)
1556 GETCHARLEN(nd, nptr, ndlen);
1557 if (UCD_CATEGORY(nd) != ucp_M) break;
1561 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1566 /*-----------------------------------------------------------------*/
1567 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1568 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1569 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1573 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1574 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1575 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1579 ADD_ACTIVE(state_offset + 2, 0);
1590 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1594 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1599 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1600 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1602 active_count--; /* Remove non-match possibility */
1603 next_active_state--;
1605 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1614 /*-----------------------------------------------------------------*/
1615 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1616 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1617 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1621 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1622 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1623 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1627 ADD_ACTIVE(state_offset + 2, 0);
1647 if (OK == (d == OP_VSPACE))
1649 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1650 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1652 active_count--; /* Remove non-match possibility */
1653 next_active_state--;
1655 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1660 /*-----------------------------------------------------------------*/
1661 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1662 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1663 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1667 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1668 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1669 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1673 ADD_ACTIVE(state_offset + 2, 0);
1680 case 0x20: /* SPACE */
1681 case 0xa0: /* NBSP */
1682 case 0x1680: /* OGHAM SPACE MARK */
1683 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1684 case 0x2000: /* EN QUAD */
1685 case 0x2001: /* EM QUAD */
1686 case 0x2002: /* EN SPACE */
1687 case 0x2003: /* EM SPACE */
1688 case 0x2004: /* THREE-PER-EM SPACE */
1689 case 0x2005: /* FOUR-PER-EM SPACE */
1690 case 0x2006: /* SIX-PER-EM SPACE */
1691 case 0x2007: /* FIGURE SPACE */
1692 case 0x2008: /* PUNCTUATION SPACE */
1693 case 0x2009: /* THIN SPACE */
1694 case 0x200A: /* HAIR SPACE */
1695 case 0x202f: /* NARROW NO-BREAK SPACE */
1696 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1697 case 0x3000: /* IDEOGRAPHIC SPACE */
1706 if (OK == (d == OP_HSPACE))
1708 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1709 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1711 active_count--; /* Remove non-match possibility */
1712 next_active_state--;
1714 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1719 /*-----------------------------------------------------------------*/
1721 case OP_PROP_EXTRA + OP_TYPEEXACT:
1722 case OP_PROP_EXTRA + OP_TYPEUPTO:
1723 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1724 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1725 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1726 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1727 count = current_state->count; /* Number already matched */
1731 const pcre_uint8 chartype = UCD_CHARTYPE(c);
1732 switch(code[1 + IMM2_SIZE + 1])
1739 OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1744 OK = PRIV(ucp_gentype)[chartype] == code[1 + IMM2_SIZE + 2];
1748 OK = chartype == code[1 + IMM2_SIZE + 2];
1752 OK = UCD_SCRIPT(c) == code[1 + IMM2_SIZE + 2];
1755 /* These are specials for combination cases. */
1758 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1759 PRIV(ucp_gentype)[chartype] == ucp_N;
1762 case PT_SPACE: /* Perl space */
1763 OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1764 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1767 case PT_PXSPACE: /* POSIX space */
1768 OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1769 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1770 c == CHAR_FF || c == CHAR_CR;
1774 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1775 PRIV(ucp_gentype)[chartype] == ucp_N ||
1776 c == CHAR_UNDERSCORE;
1779 /* Should never occur, but keep compilers from grumbling. */
1782 OK = codevalue != OP_PROP;
1786 if (OK == (d == OP_PROP))
1788 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1790 active_count--; /* Remove non-match possibility */
1791 next_active_state--;
1793 if (++count >= GET2(code, 1))
1794 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1796 { ADD_NEW(state_offset, count); }
1801 /*-----------------------------------------------------------------*/
1802 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1803 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1804 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1805 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1806 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1807 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1808 count = current_state->count; /* Number already matched */
1809 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1811 const pcre_uchar *nptr = ptr + clen;
1813 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1815 active_count--; /* Remove non-match possibility */
1816 next_active_state--;
1818 while (nptr < end_subject)
1822 GETCHARLEN(nd, nptr, ndlen);
1823 if (UCD_CATEGORY(nd) != ucp_M) break;
1827 if (++count >= GET2(code, 1))
1828 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1830 { ADD_NEW_DATA(-state_offset, count, ncount); }
1835 /*-----------------------------------------------------------------*/
1836 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1837 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1838 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1839 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1840 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1841 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1842 count = current_state->count; /* Number already matched */
1853 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1857 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1862 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1864 active_count--; /* Remove non-match possibility */
1865 next_active_state--;
1867 if (++count >= GET2(code, 1))
1868 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1870 { ADD_NEW_DATA(-state_offset, count, ncount); }
1879 /*-----------------------------------------------------------------*/
1880 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1881 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1882 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1883 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1884 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1885 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1886 count = current_state->count; /* Number already matched */
1906 if (OK == (d == OP_VSPACE))
1908 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1910 active_count--; /* Remove non-match possibility */
1911 next_active_state--;
1913 if (++count >= GET2(code, 1))
1914 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1916 { ADD_NEW_DATA(-state_offset, count, 0); }
1921 /*-----------------------------------------------------------------*/
1922 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1923 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1924 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1925 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1926 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1927 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1928 count = current_state->count; /* Number already matched */
1935 case 0x20: /* SPACE */
1936 case 0xa0: /* NBSP */
1937 case 0x1680: /* OGHAM SPACE MARK */
1938 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1939 case 0x2000: /* EN QUAD */
1940 case 0x2001: /* EM QUAD */
1941 case 0x2002: /* EN SPACE */
1942 case 0x2003: /* EM SPACE */
1943 case 0x2004: /* THREE-PER-EM SPACE */
1944 case 0x2005: /* FOUR-PER-EM SPACE */
1945 case 0x2006: /* SIX-PER-EM SPACE */
1946 case 0x2007: /* FIGURE SPACE */
1947 case 0x2008: /* PUNCTUATION SPACE */
1948 case 0x2009: /* THIN SPACE */
1949 case 0x200A: /* HAIR SPACE */
1950 case 0x202f: /* NARROW NO-BREAK SPACE */
1951 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1952 case 0x3000: /* IDEOGRAPHIC SPACE */
1961 if (OK == (d == OP_HSPACE))
1963 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1965 active_count--; /* Remove non-match possibility */
1966 next_active_state--;
1968 if (++count >= GET2(code, 1))
1969 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1971 { ADD_NEW_DATA(-state_offset, count, 0); }
1976 /* ========================================================================== */
1977 /* These opcodes are followed by a character that is usually compared
1978 to the current subject character; it is loaded into d. We still get
1979 here even if there is no subject character, because in some cases zero
1980 repetitions are permitted. */
1982 /*-----------------------------------------------------------------*/
1984 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1987 /*-----------------------------------------------------------------*/
1989 if (clen == 0) break;
1994 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1996 unsigned int othercase;
2000 /* If we have Unicode property support, we can use it to test the
2001 other case of the character. */
2003 othercase = UCD_OTHERCASE(c);
2005 othercase = NOTACHAR;
2008 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2012 #endif /* SUPPORT_UTF */
2015 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2016 { ADD_NEW(state_offset + 2, 0); }
2022 /*-----------------------------------------------------------------*/
2023 /* This is a tricky one because it can match more than one character.
2024 Find out how many characters to skip, and then set up a negative state
2025 to wait for them to pass before continuing. */
2028 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2030 const pcre_uchar *nptr = ptr + clen;
2032 while (nptr < end_subject)
2035 GETCHARLEN(c, nptr, nclen);
2036 if (UCD_CATEGORY(c) != ucp_M) break;
2040 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2045 /*-----------------------------------------------------------------*/
2046 /* This is a tricky one like EXTUNI because it too can match more than one
2047 character (when CR is followed by LF). In this case, set up a negative
2048 state to wait for one character to pass before continuing. */
2051 if (clen > 0) switch(c)
2058 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2061 ADD_NEW(state_offset + 1, 0);
2065 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2067 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2071 ADD_NEW(state_offset + 1, 0);
2077 /*-----------------------------------------------------------------*/
2079 if (clen > 0) switch(c)
2091 ADD_NEW(state_offset + 1, 0);
2096 /*-----------------------------------------------------------------*/
2098 if (clen > 0) switch(c)
2107 ADD_NEW(state_offset + 1, 0);
2114 /*-----------------------------------------------------------------*/
2116 if (clen > 0) switch(c)
2119 case 0x20: /* SPACE */
2120 case 0xa0: /* NBSP */
2121 case 0x1680: /* OGHAM SPACE MARK */
2122 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2123 case 0x2000: /* EN QUAD */
2124 case 0x2001: /* EM QUAD */
2125 case 0x2002: /* EN SPACE */
2126 case 0x2003: /* EM SPACE */
2127 case 0x2004: /* THREE-PER-EM SPACE */
2128 case 0x2005: /* FOUR-PER-EM SPACE */
2129 case 0x2006: /* SIX-PER-EM SPACE */
2130 case 0x2007: /* FIGURE SPACE */
2131 case 0x2008: /* PUNCTUATION SPACE */
2132 case 0x2009: /* THIN SPACE */
2133 case 0x200A: /* HAIR SPACE */
2134 case 0x202f: /* NARROW NO-BREAK SPACE */
2135 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2136 case 0x3000: /* IDEOGRAPHIC SPACE */
2140 ADD_NEW(state_offset + 1, 0);
2145 /*-----------------------------------------------------------------*/
2147 if (clen > 0) switch(c)
2150 case 0x20: /* SPACE */
2151 case 0xa0: /* NBSP */
2152 case 0x1680: /* OGHAM SPACE MARK */
2153 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2154 case 0x2000: /* EN QUAD */
2155 case 0x2001: /* EM QUAD */
2156 case 0x2002: /* EN SPACE */
2157 case 0x2003: /* EM SPACE */
2158 case 0x2004: /* THREE-PER-EM SPACE */
2159 case 0x2005: /* FOUR-PER-EM SPACE */
2160 case 0x2006: /* SIX-PER-EM SPACE */
2161 case 0x2007: /* FIGURE SPACE */
2162 case 0x2008: /* PUNCTUATION SPACE */
2163 case 0x2009: /* THIN SPACE */
2164 case 0x200A: /* HAIR SPACE */
2165 case 0x202f: /* NARROW NO-BREAK SPACE */
2166 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2167 case 0x3000: /* IDEOGRAPHIC SPACE */
2168 ADD_NEW(state_offset + 1, 0);
2173 /*-----------------------------------------------------------------*/
2174 /* Match a negated single character casefully. This is only used for
2175 one-byte characters, that is, we know that d < 256. The character we are
2176 checking (c) can be multibyte. */
2179 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2182 /*-----------------------------------------------------------------*/
2183 /* Match a negated single character caselessly. This is only used for
2184 one-byte characters, that is, we know that d < 256. The character we are
2185 checking (c) can be multibyte. */
2188 if (clen > 0 && c != d && c != fcc[d])
2189 { ADD_NEW(state_offset + dlen + 1, 0); }
2192 /*-----------------------------------------------------------------*/
2197 case OP_NOTMINPLUSI:
2198 case OP_NOTPOSPLUSI:
2200 codevalue -= OP_STARI - OP_STAR;
2209 count = current_state->count; /* Already matched */
2210 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2213 unsigned int otherd = NOTACHAR;
2217 if (utf && d >= 128)
2220 otherd = UCD_OTHERCASE(d);
2221 #endif /* SUPPORT_UCP */
2224 #endif /* SUPPORT_UTF */
2225 otherd = TABLE_GET(d, fcc, d);
2227 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2230 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2232 active_count--; /* Remove non-match possibility */
2233 next_active_state--;
2236 ADD_NEW(state_offset, count);
2241 /*-----------------------------------------------------------------*/
2246 case OP_NOTMINQUERYI:
2247 case OP_NOTPOSQUERYI:
2249 codevalue -= OP_STARI - OP_STAR;
2255 case OP_NOTMINQUERY:
2256 case OP_NOTPOSQUERY:
2257 ADD_ACTIVE(state_offset + dlen + 1, 0);
2260 unsigned int otherd = NOTACHAR;
2264 if (utf && d >= 128)
2267 otherd = UCD_OTHERCASE(d);
2268 #endif /* SUPPORT_UCP */
2271 #endif /* SUPPORT_UTF */
2272 otherd = TABLE_GET(d, fcc, d);
2274 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2276 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2278 active_count--; /* Remove non-match possibility */
2279 next_active_state--;
2281 ADD_NEW(state_offset + dlen + 1, 0);
2286 /*-----------------------------------------------------------------*/
2291 case OP_NOTMINSTARI:
2292 case OP_NOTPOSSTARI:
2294 codevalue -= OP_STARI - OP_STAR;
2302 ADD_ACTIVE(state_offset + dlen + 1, 0);
2305 unsigned int otherd = NOTACHAR;
2309 if (utf && d >= 128)
2312 otherd = UCD_OTHERCASE(d);
2313 #endif /* SUPPORT_UCP */
2316 #endif /* SUPPORT_UTF */
2317 otherd = TABLE_GET(d, fcc, d);
2319 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2321 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2323 active_count--; /* Remove non-match possibility */
2324 next_active_state--;
2326 ADD_NEW(state_offset, 0);
2331 /*-----------------------------------------------------------------*/
2335 codevalue -= OP_STARI - OP_STAR;
2339 count = current_state->count; /* Number already matched */
2342 unsigned int otherd = NOTACHAR;
2346 if (utf && d >= 128)
2349 otherd = UCD_OTHERCASE(d);
2350 #endif /* SUPPORT_UCP */
2353 #endif /* SUPPORT_UTF */
2354 otherd = TABLE_GET(d, fcc, d);
2356 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2358 if (++count >= GET2(code, 1))
2359 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2361 { ADD_NEW(state_offset, count); }
2366 /*-----------------------------------------------------------------*/
2371 case OP_NOTMINUPTOI:
2372 case OP_NOTPOSUPTOI:
2374 codevalue -= OP_STARI - OP_STAR;
2382 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2383 count = current_state->count; /* Number already matched */
2386 unsigned int otherd = NOTACHAR;
2390 if (utf && d >= 128)
2393 otherd = UCD_OTHERCASE(d);
2394 #endif /* SUPPORT_UCP */
2397 #endif /* SUPPORT_UTF */
2398 otherd = TABLE_GET(d, fcc, d);
2400 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2402 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2404 active_count--; /* Remove non-match possibility */
2405 next_active_state--;
2407 if (++count >= GET2(code, 1))
2408 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2410 { ADD_NEW(state_offset, count); }
2416 /* ========================================================================== */
2417 /* These are the class-handling opcodes */
2423 BOOL isinclass = FALSE;
2424 int next_state_offset;
2425 const pcre_uchar *ecode;
2427 /* For a simple class, there is always just a 32-byte table, and we
2428 can set isinclass from it. */
2430 if (codevalue != OP_XCLASS)
2432 ecode = code + 1 + (32 / sizeof(pcre_uchar));
2435 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2436 ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2440 /* An extended class may have a table or a list of single characters,
2441 ranges, or both, and it may be positive or negative. There's a
2442 function that sorts all this out. */
2446 ecode = code + GET(code, 1);
2447 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2450 /* At this point, isinclass is set for all kinds of class, and ecode
2451 points to the byte after the end of the class. If there is a
2452 quantifier, this is where it will be. */
2454 next_state_offset = (int)(ecode - start_code);
2460 ADD_ACTIVE(next_state_offset + 1, 0);
2461 if (isinclass) { ADD_NEW(state_offset, 0); }
2466 count = current_state->count; /* Already matched */
2467 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2468 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2473 ADD_ACTIVE(next_state_offset + 1, 0);
2474 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2479 count = current_state->count; /* Already matched */
2480 if (count >= GET2(ecode, 1))
2481 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2484 int max = GET2(ecode, 1 + IMM2_SIZE);
2485 if (++count >= max && max != 0) /* Max 0 => no limit */
2486 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2488 { ADD_NEW(state_offset, count); }
2493 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2499 /* ========================================================================== */
2500 /* These are the opcodes for fancy brackets of various kinds. We have
2501 to use recursion in order to handle them. The "always failing" assertion
2502 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2503 though the other "backtracking verbs" are not supported. */
2506 forced_fail++; /* Count FAILs for multiple states */
2512 case OP_ASSERTBACK_NOT:
2515 int local_offsets[2];
2516 int local_workspace[1000];
2517 const pcre_uchar *endasscode = code + GET(code, 1);
2519 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2521 rc = internal_dfa_exec(
2522 md, /* static match data */
2523 code, /* this subexpression's code */
2524 ptr, /* where we currently are */
2525 (int)(ptr - start_subject), /* start offset */
2526 local_offsets, /* offset vector */
2527 sizeof(local_offsets)/sizeof(int), /* size of same */
2528 local_workspace, /* workspace vector */
2529 sizeof(local_workspace)/sizeof(int), /* size of same */
2530 rlevel); /* function recursion level */
2532 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2533 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2534 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2538 /*-----------------------------------------------------------------*/
2542 int local_offsets[1000];
2543 int local_workspace[1000];
2544 int codelink = GET(code, 1);
2547 /* Because of the way auto-callout works during compile, a callout item
2548 is inserted between OP_COND and an assertion condition. This does not
2549 happen for the other conditions. */
2551 if (code[LINK_SIZE+1] == OP_CALLOUT)
2554 if (PUBL(callout) != NULL)
2556 PUBL(callout_block) cb;
2557 cb.version = 1; /* Version 1 of the callout block */
2558 cb.callout_number = code[LINK_SIZE+2];
2559 cb.offset_vector = offsets;
2560 #ifdef COMPILE_PCRE8
2561 cb.subject = (PCRE_SPTR)start_subject;
2563 cb.subject = (PCRE_SPTR16)start_subject;
2565 cb.subject_length = (int)(end_subject - start_subject);
2566 cb.start_match = (int)(current_subject - start_subject);
2567 cb.current_position = (int)(ptr - start_subject);
2568 cb.pattern_position = GET(code, LINK_SIZE + 3);
2569 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2571 cb.capture_last = -1;
2572 cb.callout_data = md->callout_data;
2573 cb.mark = NULL; /* No (*MARK) support */
2574 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
2576 if (rrc > 0) break; /* Fail this thread */
2577 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2580 condcode = code[LINK_SIZE+1];
2582 /* Back reference conditions are not supported */
2584 if (condcode == OP_CREF || condcode == OP_NCREF)
2585 return PCRE_ERROR_DFA_UCOND;
2587 /* The DEFINE condition is always false */
2589 if (condcode == OP_DEF)
2590 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2592 /* The only supported version of OP_RREF is for the value RREF_ANY,
2593 which means "test if in any recursion". We can't test for specifically
2596 else if (condcode == OP_RREF || condcode == OP_NRREF)
2598 int value = GET2(code, LINK_SIZE + 2);
2599 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2600 if (md->recursive != NULL)
2601 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2602 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2605 /* Otherwise, the condition is an assertion */
2610 const pcre_uchar *asscode = code + LINK_SIZE + 1;
2611 const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2613 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2615 rc = internal_dfa_exec(
2616 md, /* fixed match data */
2617 asscode, /* this subexpression's code */
2618 ptr, /* where we currently are */
2619 (int)(ptr - start_subject), /* start offset */
2620 local_offsets, /* offset vector */
2621 sizeof(local_offsets)/sizeof(int), /* size of same */
2622 local_workspace, /* workspace vector */
2623 sizeof(local_workspace)/sizeof(int), /* size of same */
2624 rlevel); /* function recursion level */
2626 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2628 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2629 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2631 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2636 /*-----------------------------------------------------------------*/
2639 dfa_recursion_info *ri;
2640 int local_offsets[1000];
2641 int local_workspace[1000];
2642 const pcre_uchar *callpat = start_code + GET(code, 1);
2643 int recno = (callpat == md->start_code)? 0 :
2644 GET2(callpat, 1 + LINK_SIZE);
2647 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2649 /* Check for repeating a recursion without advancing the subject
2650 pointer. This should catch convoluted mutual recursions. (Some simple
2651 cases are caught at compile time.) */
2653 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2654 if (recno == ri->group_num && ptr == ri->subject_position)
2655 return PCRE_ERROR_RECURSELOOP;
2657 /* Remember this recursion and where we started it so as to
2658 catch infinite loops. */
2660 new_recursive.group_num = recno;
2661 new_recursive.subject_position = ptr;
2662 new_recursive.prevrec = md->recursive;
2663 md->recursive = &new_recursive;
2665 rc = internal_dfa_exec(
2666 md, /* fixed match data */
2667 callpat, /* this subexpression's code */
2668 ptr, /* where we currently are */
2669 (int)(ptr - start_subject), /* start offset */
2670 local_offsets, /* offset vector */
2671 sizeof(local_offsets)/sizeof(int), /* size of same */
2672 local_workspace, /* workspace vector */
2673 sizeof(local_workspace)/sizeof(int), /* size of same */
2674 rlevel); /* function recursion level */
2676 md->recursive = new_recursive.prevrec; /* Done this recursion */
2678 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2681 /* Ran out of internal offsets */
2683 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2685 /* For each successful matched substring, set up the next state with a
2686 count of characters to skip before trying it. Note that the count is in
2687 characters, not bytes. */
2691 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2693 int charcount = local_offsets[rc+1] - local_offsets[rc];
2695 const pcre_uchar *p = start_subject + local_offsets[rc];
2696 const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2697 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2701 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2705 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2709 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2713 /*-----------------------------------------------------------------*/
2720 int charcount, matched_count;
2721 const pcre_uchar *local_ptr = ptr;
2724 if (codevalue == OP_BRAPOSZERO)
2727 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2729 else allow_zero = FALSE;
2731 /* Loop to match the subpattern as many times as possible as if it were
2732 a complete pattern. */
2734 for (matched_count = 0;; matched_count++)
2736 int local_offsets[2];
2737 int local_workspace[1000];
2739 int rc = internal_dfa_exec(
2740 md, /* fixed match data */
2741 code, /* this subexpression's code */
2742 local_ptr, /* where we currently are */
2743 (int)(ptr - start_subject), /* start offset */
2744 local_offsets, /* offset vector */
2745 sizeof(local_offsets)/sizeof(int), /* size of same */
2746 local_workspace, /* workspace vector */
2747 sizeof(local_workspace)/sizeof(int), /* size of same */
2748 rlevel); /* function recursion level */
2750 /* Failed to match */
2754 if (rc != PCRE_ERROR_NOMATCH) return rc;
2758 /* Matched: break the loop if zero characters matched. */
2760 charcount = local_offsets[1] - local_offsets[0];
2761 if (charcount == 0) break;
2762 local_ptr += charcount; /* Advance temporary position ptr */
2765 /* At this point we have matched the subpattern matched_count
2766 times, and local_ptr is pointing to the character after the end of the
2769 if (matched_count > 0 || allow_zero)
2771 const pcre_uchar *end_subpattern = code;
2772 int next_state_offset;
2774 do { end_subpattern += GET(end_subpattern, 1); }
2775 while (*end_subpattern == OP_ALT);
2777 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2779 /* Optimization: if there are no more active states, and there
2780 are no new states yet set up, then skip over the subject string
2781 right here, to save looping. Otherwise, set up the new state to swing
2782 into action when the end of the matched substring is reached. */
2784 if (i + 1 >= active_count && new_count == 0)
2788 ADD_NEW(next_state_offset, 0);
2792 const pcre_uchar *p = ptr;
2793 const pcre_uchar *pp = local_ptr;
2794 charcount = (int)(pp - p);
2796 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2798 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2804 /*-----------------------------------------------------------------*/
2808 int local_offsets[2];
2809 int local_workspace[1000];
2811 int rc = internal_dfa_exec(
2812 md, /* fixed match data */
2813 code, /* this subexpression's code */
2814 ptr, /* where we currently are */
2815 (int)(ptr - start_subject), /* start offset */
2816 local_offsets, /* offset vector */
2817 sizeof(local_offsets)/sizeof(int), /* size of same */
2818 local_workspace, /* workspace vector */
2819 sizeof(local_workspace)/sizeof(int), /* size of same */
2820 rlevel); /* function recursion level */
2824 const pcre_uchar *end_subpattern = code;
2825 int charcount = local_offsets[1] - local_offsets[0];
2826 int next_state_offset, repeat_state_offset;
2828 do { end_subpattern += GET(end_subpattern, 1); }
2829 while (*end_subpattern == OP_ALT);
2831 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2833 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2834 arrange for the repeat state also to be added to the relevant list.
2835 Calculate the offset, or set -1 for no repeat. */
2837 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2838 *end_subpattern == OP_KETRMIN)?
2839 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2841 /* If we have matched an empty string, add the next state at the
2842 current character pointer. This is important so that the duplicate
2843 checking kicks in, which is what breaks infinite loops that match an
2848 ADD_ACTIVE(next_state_offset, 0);
2851 /* Optimization: if there are no more active states, and there
2852 are no new states yet set up, then skip over the subject string
2853 right here, to save looping. Otherwise, set up the new state to swing
2854 into action when the end of the matched substring is reached. */
2856 else if (i + 1 >= active_count && new_count == 0)
2860 ADD_NEW(next_state_offset, 0);
2862 /* If we are adding a repeat state at the new character position,
2863 we must fudge things so that it is the only current state.
2864 Otherwise, it might be a duplicate of one we processed before, and
2865 that would cause it to be skipped. */
2867 if (repeat_state_offset >= 0)
2869 next_active_state = active_states;
2872 ADD_ACTIVE(repeat_state_offset, 0);
2878 const pcre_uchar *p = start_subject + local_offsets[0];
2879 const pcre_uchar *pp = start_subject + local_offsets[1];
2880 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2882 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2883 if (repeat_state_offset >= 0)
2884 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2887 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2892 /* ========================================================================== */
2893 /* Handle callouts */
2897 if (PUBL(callout) != NULL)
2899 PUBL(callout_block) cb;
2900 cb.version = 1; /* Version 1 of the callout block */
2901 cb.callout_number = code[1];
2902 cb.offset_vector = offsets;
2903 #ifdef COMPILE_PCRE8
2904 cb.subject = (PCRE_SPTR)start_subject;
2906 cb.subject = (PCRE_SPTR16)start_subject;
2908 cb.subject_length = (int)(end_subject - start_subject);
2909 cb.start_match = (int)(current_subject - start_subject);
2910 cb.current_position = (int)(ptr - start_subject);
2911 cb.pattern_position = GET(code, 2);
2912 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2914 cb.capture_last = -1;
2915 cb.callout_data = md->callout_data;
2916 cb.mark = NULL; /* No (*MARK) support */
2917 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
2920 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
2924 /* ========================================================================== */
2925 default: /* Unsupported opcode */
2926 return PCRE_ERROR_DFA_UITEM;
2929 NEXT_ACTIVE_STATE: continue;
2931 } /* End of loop scanning active states */
2933 /* We have finished the processing at the current subject character. If no
2934 new states have been set for the next character, we have found all the
2935 matches that we are going to find. If we are at the top level and partial
2936 matching has been requested, check for appropriate conditions.
2938 The "forced_fail" variable counts the number of (*F) encountered for the
2939 character. If it is equal to the original active_count (saved in
2940 workspace[1]) it means that (*F) was found on every active state. In this
2941 case we don't want to give a partial match.
2943 The "could_continue" variable is true if a state could have continued but
2944 for the fact that the end of the subject was reached. */
2948 if (rlevel == 1 && /* Top level, and */
2949 could_continue && /* Some could go on */
2950 forced_fail != workspace[1] && /* Not all forced fail & */
2952 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2954 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2955 match_count < 0) /* no matches */
2957 ptr >= end_subject && /* Reached end of subject */
2958 ptr > md->start_used_ptr) /* Inspected non-empty string */
2960 if (offsetcount >= 2)
2962 offsets[0] = (int)(md->start_used_ptr - start_subject);
2963 offsets[1] = (int)(end_subject - start_subject);
2965 match_count = PCRE_ERROR_PARTIAL;
2968 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2969 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2971 break; /* In effect, "return", but see the comment below */
2974 /* One or more states are active for the next character. */
2976 ptr += clen; /* Advance to next subject character */
2977 } /* Loop to move along the subject string */
2979 /* Control gets here from "break" a few lines above. We do it this way because
2980 if we use "return" above, we have compiler trouble. Some compilers warn if
2981 there's nothing here because they think the function doesn't return a value. On
2982 the other hand, if we put a dummy statement here, some more clever compilers
2983 complain that it can't be reached. Sigh. */
2991 /*************************************************
2992 * Execute a Regular Expression - DFA engine *
2993 *************************************************/
2995 /* This external function applies a compiled re to a subject string using a DFA
2996 engine. This function calls the internal function multiple times if the pattern
is not anchored, moving the start position along the subject after each failed
attempt.
Arguments:
3000 argument_re points to the compiled expression
3001 extra_data points to extra data or is NULL
3002 subject points to the subject string
3003 length length of subject string (may contain binary zeros)
3004 start_offset where to start in the subject string
options option bits for this call (checked against PUBLIC_DFA_EXEC_OPTIONS)
3006 offsets vector of match offsets
3007 offsetcount size of same
3008 workspace workspace vector
3009 wscount size of same (fewer than 20 gives PCRE_ERROR_DFA_WSSIZE)
3011 Returns: > 0 => number of match offset pairs placed in offsets
3012 = 0 => offsets overflowed; longest matches are present
3013 -1 => failed to match
3014 < -1 => some kind of unexpected problem
3017 #ifdef COMPILE_PCRE8
3018 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3019 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3020 const char *subject, int length, int start_offset, int options, int *offsets,
3021 int offsetcount, int *workspace, int wscount)
3023 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3024 pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3025 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3026 int offsetcount, int *workspace, int wscount)
3029 REAL_PCRE *re = (REAL_PCRE *)argument_re;
3030 dfa_match_data match_block;
3031 dfa_match_data *md = &match_block;
3032 BOOL utf, anchored, startline, firstline;
3033 const pcre_uchar *current_subject, *end_subject;
3034 const pcre_study_data *study = NULL;
3036 const pcre_uchar *req_char_ptr;
3037 const pcre_uint8 *start_bits = NULL;
3038 BOOL has_first_char = FALSE;
3039 BOOL has_req_char = FALSE;
3040 pcre_uchar first_char = 0;
3041 pcre_uchar first_char2 = 0;
3042 pcre_uchar req_char = 0;
3043 pcre_uchar req_char2 = 0;
3046 /* Plausibility checks: unknown option bits, NULL pointers (offsets may be NULL
only if offsetcount is not positive), a negative offsetcount, a workspace of
fewer than 20 ints, and a start offset outside 0..length are each rejected with
their own error code. */
3048 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3049 if (re == NULL || subject == NULL || workspace == NULL ||
3050 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3051 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3052 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3053 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3055 /* We need to find the pointer to any study data before we test for byte
3056 flipping, so we scan the extra_data block first. This may set two fields in the
3057 match block, so we must initialize them beforehand. However, the other fields
3058 in the match block must not be set until after the byte flipping. */
3060 md->tables = re->tables;
3061 md->callout_data = NULL;
3063 if (extra_data != NULL)
3065 unsigned int flags = extra_data->flags;
3066 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3067 study = (const pcre_study_data *)extra_data->study_data;
/* Either match-limit flag in the extra data makes the call fail at once with
PCRE_ERROR_DFA_UMLIMIT. */
3068 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3069 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3070 return PCRE_ERROR_DFA_UMLIMIT;
3071 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3072 md->callout_data = extra_data->callout_data;
3073 if ((flags & PCRE_EXTRA_TABLES) != 0)
3074 md->tables = extra_data->tables;
3077 /* Check that the first field in the block is the magic number. If it is not,
3078 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3079 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3080 means that the pattern is likely compiled with different endianness. */
3082 if (re->magic_number != MAGIC_NUMBER)
3083 return re->magic_number == REVERSED_MAGIC_NUMBER?
3084 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
/* A pattern whose flags lack the PCRE_MODE bit is rejected with
PCRE_ERROR_BADMODE. NOTE(review): presumably this catches a pattern compiled by
the library of the other code unit width; confirm against the definition of
PCRE_MODE. */
3085 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3087 /* Set some local values */
3089 current_subject = (const pcre_uchar *)subject + start_offset;
3090 end_subject = (const pcre_uchar *)subject + length;
/* req_char_ptr starts one before the match point so that the first
required-character search below (guarded by p > req_char_ptr) is never
skipped. */
3091 req_char_ptr = current_subject - 1;
3094 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3095 utf = (re->options & PCRE_UTF8) != 0;
/* The match is treated as anchored if PCRE_ANCHORED or PCRE_DFA_RESTART is
given for this call, or if PCRE_ANCHORED is set in the compiled pattern. */
3100 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3101 (re->options & PCRE_ANCHORED) != 0;
3103 /* The remaining fixed data for passing around. */
3105 md->start_code = (const pcre_uchar *)argument_re +
3106 re->name_table_offset + re->name_count * re->name_entry_size;
3107 md->start_subject = (const pcre_uchar *)subject;
3108 md->end_subject = end_subject;
3109 md->start_offset = start_offset;
3110 md->moptions = options;
3111 md->poptions = re->options;
3113 /* If the BSR option is not set at match time, copy what was set
at compile time (the BSR bits of re->options).
3116 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3118 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3119 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3121 else md->moptions |= PCRE_BSR_ANYCRLF;
3125 /* Handle different types of newline. The three bits give eight cases. If
3126 nothing is set at run time, whatever was used at compile time applies. */
3128 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3131 case 0: newline = NEWLINE; break; /* Compile-time default */
3132 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3133 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3134 case PCRE_NEWLINE_CR+
3135 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3136 case PCRE_NEWLINE_ANY: newline = -1; break;
3137 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3138 default: return PCRE_ERROR_BADNEWLINE;
/* Record the newline convention in md->nltype (ANYCRLF, ANY or FIXED). For a
fixed newline the value or values are stored in md->nl[]; a two-value newline
is unpacked from the high and low bytes of newline. */
3143 md->nltype = NLTYPE_ANYCRLF;
3145 else if (newline < 0)
3147 md->nltype = NLTYPE_ANY;
3151 md->nltype = NLTYPE_FIXED;
3155 md->nl[0] = (newline >> 8) & 255;
3156 md->nl[1] = newline & 255;
3161 md->nl[0] = newline;
3165 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3166 back the character offset. */
3169 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3172 int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3175 if (offsetcount >= 2)
3177 offsets[0] = erroroffset;
3178 offsets[1] = errorcode;
/* Error codes up to PCRE_UTF8_ERR5 are reported as PCRE_ERROR_SHORTUTF8 when
PCRE_PARTIAL_HARD is set; every other case gives PCRE_ERROR_BADUTF8. */
3180 return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3181 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
/* A start offset strictly inside the subject must land on a code unit for
which NOT_FIRSTCHAR() is false, otherwise the offset is rejected. */
3183 if (start_offset > 0 && start_offset < length &&
3184 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3185 return PCRE_ERROR_BADUTF8_OFFSET;
3189 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3190 is a feature that makes it possible to save compiled regex and re-use them
3191 in other programs later. */
3193 if (md->tables == NULL) md->tables = PRIV(default_tables);
3195 /* The "must be at the start of a line" flags are used in a loop when finding
where to start a match attempt.
3198 startline = (re->flags & PCRE_STARTLINE) != 0;
3199 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3201 /* Set up the first character to match, if available. The first_byte value is
3202 never set for an anchored regular expression, but the anchoring may be forced
3203 at run time, so we have to test for anchoring. The first char may be unset for
3204 an unanchored pattern, of course. If there's no first char and the pattern was
3205 studied, there may be a bitmap of possible first characters. */
3209 if ((re->flags & PCRE_FIRSTSET) != 0)
3211 has_first_char = TRUE;
3212 first_char = first_char2 = (pcre_uchar)(re->first_char);
3213 if ((re->flags & PCRE_FCH_CASELESS) != 0)
3215 first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3216 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3217 if (utf && first_char > 127)
3218 first_char2 = UCD_OTHERCASE(first_char);
3224 if (!startline && study != NULL &&
3225 (study->flags & PCRE_STUDY_MAPPED) != 0)
3226 start_bits = study->start_bits;
3230 /* For anchored or unanchored matches, there may be a "last known required
character" set.
3233 if ((re->flags & PCRE_REQCHSET) != 0)
3235 has_req_char = TRUE;
3236 req_char = req_char2 = (pcre_uchar)(re->req_char);
3237 if ((re->flags & PCRE_RCH_CASELESS) != 0)
3239 req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3240 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3241 if (utf && req_char > 127)
3242 req_char2 = UCD_OTHERCASE(req_char);
3247 /* Call the main matching function, looping for a non-anchored regex after a
3248 failed match. If not restarting, perform certain optimizations at the start of
each match attempt.
3255 if ((options & PCRE_DFA_RESTART) == 0)
3257 const pcre_uchar *save_end_subject = end_subject;
3259 /* If firstline is TRUE, the start of the match is constrained to the first
3260 line of a multiline string. Implement this by temporarily adjusting
3261 end_subject so that we stop scanning at a newline. If the match fails at
3262 the newline, later code breaks this loop. */
3266 PCRE_PUCHAR t = current_subject;
3270 while (t < md->end_subject && !IS_NEWLINE(t))
3273 ACROSSCHAR(t < end_subject, *t, t++);
3278 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3282 /* There are some optimizations that avoid running the match if a known
3283 starting point is not found. However, there is an option that disables
3284 these, for testing and for ensuring that all callouts do actually occur.
3285 The option can be set in the regex by (*NO_START_OPT) or passed in
3286 match-time options. */
3288 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3290 /* Advance to a known first char. */
3294 if (first_char != first_char2)
3295 while (current_subject < end_subject &&
3296 *current_subject != first_char && *current_subject != first_char2)
3299 while (current_subject < end_subject &&
3300 *current_subject != first_char)
3304 /* Or to just after a linebreak for a multiline match if possible */
3308 if (current_subject > md->start_subject + start_offset)
3313 while (current_subject < end_subject &&
3314 !WAS_NEWLINE(current_subject))
3317 ACROSSCHAR(current_subject < end_subject, *current_subject,
3323 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3326 /* If we have just passed a CR and the newline option is ANY or
3327 ANYCRLF, and we are now at a LF, advance the match position by one
more character.
3330 if (current_subject[-1] == CHAR_CR &&
3331 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3332 current_subject < end_subject &&
3333 *current_subject == CHAR_NL)
3338 /* Or to a non-unique first char after study */
3340 else if (start_bits != NULL)
3342 while (current_subject < end_subject)
3344 register unsigned int c = *current_subject;
/* When COMPILE_PCRE8 is not defined, a code unit above 255 is looked up in
the start-bit map as 255. */
3345 #ifndef COMPILE_PCRE8
3346 if (c > 255) c = 255;
3348 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3351 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3352 /* In non 8-bit mode, the iteration will stop for
3353 characters > 255 at the beginning or not stop at all. */
3355 ACROSSCHAR(current_subject < end_subject, *current_subject,
3364 /* Restore fudged end_subject */
3366 end_subject = save_end_subject;
3368 /* The following two optimizations are disabled for partial matching or if
3369 disabling is explicitly requested (and of course, by the test above, this
3370 code is not obeyed when restarting after a partial match). */
3372 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3373 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3375 /* If the pattern was studied, a minimum subject length may be set. This
3376 is a lower bound; no actual string of that length may actually match the
3377 pattern. Although the value is, strictly, in characters, we treat it as
3378 bytes to avoid spending too much time in this optimization. */
3380 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3381 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3382 return PCRE_ERROR_NOMATCH;
3384 /* If req_char is set, we know that that character must appear in the
3385 subject for the match to succeed. If the first character is set, req_char
3386 must be later in the subject; otherwise the test starts at the match
3387 point. This optimization can save a huge amount of work in patterns with
3388 nested unlimited repeats that aren't going to match. Writing separate
3389 code for cased/caseless versions makes it go faster, as does using an
3390 autoincrement and backing off on a match.
3392 HOWEVER: when the subject string is very, very long, searching to its end
3393 can take a long time, and give bad performance on quite ordinary
3394 patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3395 string... so we don't do this when the string is sufficiently long. */
3397 if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3399 register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3401 /* We don't need to repeat the search if we haven't yet reached the
3402 place we found it at last time. */
3404 if (p > req_char_ptr)
3406 if (req_char != req_char2)
3408 while (p < end_subject)
3410 register int pp = *p++;
3411 if (pp == req_char || pp == req_char2) { p--; break; }
3416 while (p < end_subject)
3418 if (*p++ == req_char) { p--; break; }
3422 /* If we can't find the required character, break the matching loop,
3423 which will cause a return of PCRE_ERROR_NOMATCH. */
3425 if (p >= end_subject) break;
3427 /* If we have found the required character, save the point where we
3428 found it, so that we don't search again next time round the loop if
3429 the start hasn't passed this character yet. */
3435 } /* End of optimizations that are done when not restarting */
3437 /* OK, now we can do the business */
3439 md->start_used_ptr = current_subject;
3440 md->recursive = NULL;
3442 rc = internal_dfa_exec(
3443 md, /* fixed match data */
3444 md->start_code, /* this subexpression's code */
3445 current_subject, /* where we currently are */
3446 start_offset, /* start offset in subject */
3447 offsets, /* offset vector */
3448 offsetcount, /* size of same */
3449 workspace, /* workspace vector */
3450 wscount, /* size of same */
3451 0); /* function recurse level */
3453 /* Anything other than "no match" means we are done, always; otherwise, carry
3454 on only if not anchored. */
3456 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3458 /* Advance to the next subject character unless we are at the end of a line
3459 and firstline is set. */
3461 if (firstline && IS_NEWLINE(current_subject)) break;
3466 ACROSSCHAR(current_subject < end_subject, *current_subject,
3470 if (current_subject > end_subject) break;
3472 /* If we have just passed a CR and we are now at a LF, and the pattern does
3473 not contain any explicit matches for \r or \n, and the newline option is CRLF
3474 or ANY or ANYCRLF, advance the match position by one more character. */
3476 if (current_subject[-1] == CHAR_CR &&
3477 current_subject < end_subject &&
3478 *current_subject == CHAR_NL &&
3479 (re->flags & PCRE_HASCRORLF) == 0 &&
3480 (md->nltype == NLTYPE_ANY ||
3481 md->nltype == NLTYPE_ANYCRLF ||
3485 } /* "Bumpalong" loop */
3487 return PCRE_ERROR_NOMATCH;
3490 /* End of pcre_dfa_exec.c */