1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
45 #define NLBLOCK md /* Block containing newline information */
46 #define PSSTART start_subject /* Field containing processed string start */
47 #define PSEND end_subject /* Field containing processed string end */
49 #include "pcre_internal.h"
51 /* Undefine some potentially clashing cpp symbols */
56 /* The chain of eptrblocks for tail recursions uses memory in stack workspace,
57 obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */
59 #define EPTR_WORK_SIZE (1000)
61 /* Flag bits for the match() function */
63 #define match_condassert 0x01 /* Called to check a condition assertion */
64 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
65 #define match_tail_recursed 0x04 /* Tail recursive call */
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
71 #define MATCH_NOMATCH 0
73 /* Maximum number of ints of offset to save on the stack for recursive calls.
74 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
75 because the offset vector is always a multiple of 3 long. */
77 #define REC_STACK_SAVE_MAX 30
79 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
81 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; /* Minimum repeat count for each of the six common repeats */
82 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; /* Maximum repeat count per entry; 0 means no upper limit. NOTE(review): the (min,max) pairs read as *, *?, +, +?, ?, ?? - presumably both tables are indexed by the same repeat-opcode offset; confirm at the use sites */
87 /*************************************************
88 * Debugging function to print chars *
89 *************************************************/
91 /* Print a sequence of chars in printable format, stopping at the end of the
92 subject if requested.
95 p points to characters
96 length number to print
97 is_subject TRUE if printing from within md->start_subject
98 md pointer to matching data block, if is_subject is TRUE
104 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
107 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
109 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
115 /*************************************************
116 * Match a back-reference *
117 *************************************************/
119 /* If a back reference hasn't been set, the length that is passed is greater
120 than the number of characters left in the string, so the match fails.
123 offset index into the offset vector
124 eptr points into the subject
125 length length to be matched
126 md points to match data block
129 Returns: TRUE if matched
133 match_ref(int offset, register USPTR eptr, int length, match_data *md,
134 unsigned long int ims)
136 USPTR p = md->start_subject + md->offset_vector[offset];
139 if (eptr >= md->end_subject)
140 printf("matching subject <null>");
143 printf("matching subject ");
144 pchars(eptr, length, TRUE, md);
146 printf(" against backref ");
147 pchars(p, length, FALSE, md);
151 /* Always fail if not enough characters left */
153 if (length > md->end_subject - eptr) return FALSE;
155 /* Separate the caseless case for speed */
157 if ((ims & PCRE_CASELESS) != 0)
160 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
163 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
170 /***************************************************************************
171 ****************************************************************************
172 RECURSION IN THE match() FUNCTION
174 The match() function is highly recursive, though not every recursive call
175 increases the recursive depth. Nevertheless, some regular expressions can cause
176 it to recurse to a great depth. I was writing for Unix, so I just let it call
177 itself recursively. This uses the stack for saving everything that has to be
178 saved for a recursive call. On Unix, the stack can be large, and this works
181 It turns out that on some non-Unix-like systems there are problems with
182 programs that use a lot of stack. (This despite the fact that every last chip
183 has oodles of memory these days, and techniques for extending the stack have
184 been known for decades.) So....
186 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
187 calls by keeping local variables that need to be preserved in blocks of memory
188 obtained from malloc() instead of on the stack. Macros are used to
189 achieve this so that the actual code doesn't look very different to what it
192 The original heap-recursive code used longjmp(). However, it seems that this
193 can be very slow on some operating systems. Following a suggestion from Stan
194 Switzer, the use of longjmp() has been abolished, at the cost of having to
195 provide a unique number for each call to RMATCH. There is no way of generating
196 a sequence of numbers at compile time in C. I have given them names, to make
197 them stand out more clearly.
199 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
200 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
201 tests. Furthermore, not using longjmp() means that local dynamic variables
202 don't have indeterminate values; this has meant that the frame size can be
203 reduced because the result can be "passed back" by straight setting of the
204 variable instead of being passed in the frame.
205 ****************************************************************************
206 ***************************************************************************/
209 /* Numbers for RMATCH calls */
211 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
212 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
213 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
214 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
215 RM41, RM42, RM43, RM44, RM45, RM46, RM47 }; /* One distinct identifier per RMATCH call site, numbered from 1; passed as RMATCH's final "rw" argument, which the heap-frame version of the macro stores in frame->Xwhere */
218 /* These versions of the macros use the stack, as normal. There are debugging
219 versions and production versions. Note that the "rw" argument of RMATCH isn't
220 actually used in this definition. */
223 #define REGISTER register
226 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
228 printf("match() called in line %d\n", __LINE__); \
229 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
230 printf("to line %d\n", __LINE__); \
232 #define RRETURN(ra) \
234 printf("match() returned %d from line %d ", ra, __LINE__); \
238 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
239 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
240 #define RRETURN(ra) return ra
246 /* These versions of the macros manage a private stack on the heap. Note that
247 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
248 argument of match(), which never changes. */
252 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
254 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
255 frame->Xwhere = rw; \
256 newframe->Xeptr = ra;\
257 newframe->Xecode = rb;\
258 newframe->Xmstart = mstart;\
259 newframe->Xoffset_top = rc;\
260 newframe->Xims = re;\
261 newframe->Xeptrb = rf;\
262 newframe->Xflags = rg;\
263 newframe->Xrdepth = frame->Xrdepth + 1;\
264 newframe->Xprevframe = frame;\
266 DPRINTF(("restarting from line %d\n", __LINE__));\
269 DPRINTF(("jumped back to line %d\n", __LINE__));\
274 heapframe *newframe = frame;\
275 frame = newframe->Xprevframe;\
276 (pcre_stack_free)(newframe);\
286 /* Structure for remembering the local variables in a private frame */
288 typedef struct heapframe {
289 struct heapframe *Xprevframe;
291 /* Function arguments that may change */
294 const uschar *Xecode;
295 const uschar *Xmstart;
300 unsigned int Xrdepth;
302 /* Function local variables */
304 const uschar *Xcallpat;
305 const uschar *Xcharptr;
310 const uschar *Xsaved_eptr;
312 recursion_info Xnew_recursive;
318 unsigned long int Xoriginal_ims;
323 int Xprop_fail_result;
340 int Xsave_capture_last;
341 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
342 int Xstacksave[REC_STACK_SAVE_MAX];
346 /* Where to jump back to */
355 /***************************************************************************
356 ***************************************************************************/
360 /*************************************************
361 * Match from current position *
362 *************************************************/
364 /* This function is called recursively in many circumstances. Whenever it
365 returns a negative (error) response, the outer incarnation must also return the
368 Performance note: It might be tempting to extract commonly used fields from the
369 md structure (e.g. utf8, end_subject) into individual variables to improve
370 performance. Tests using gcc on a SPARC disproved this; in the first case, it
371 made performance worse.
374 eptr pointer to current character in subject
375 ecode pointer to current position in compiled code
376 mstart pointer to the current match start position (can be modified
378 offset_top current top pointer
379 md pointer to "static" info for the match
380 ims current /i, /m, and /s options
381 eptrb pointer to chain of blocks containing eptr at start of
382 brackets - for testing for empty matches
384 match_condassert - this is an assertion condition
385 match_cbegroup - this is the start of an unlimited repeat
386 group that can match an empty string
387 match_tail_recursed - this is a tail_recursed group
388 rdepth the recursion depth
390 Returns: MATCH_MATCH if matched ) these values are >= 0
391 MATCH_NOMATCH if failed to match )
392 a negative PCRE_ERROR_xxx value if aborted by an error condition
393 (e.g. stopped by repeated call or recursion limit)
397 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
398 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
399 int flags, unsigned int rdepth)
401 /* These variables do not need to be preserved over recursion in this function,
402 so they can be ordinary variables in all cases. Mark some of them with
403 "register" because they are used a lot in loops. */
405 register int rrc; /* Returns from recursive calls */
406 register int i; /* Used for loops not involving calls to RMATCH() */
407 register unsigned int c; /* Character values not kept over RMATCH() calls */
408 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
410 BOOL minimize, possessive; /* Quantifier options */
412 /* When recursion is not being used, all "local" variables that have to be
413 preserved over calls to RMATCH() are part of a "frame" which is obtained from
414 heap storage. Set up the top-level frame here; others are obtained from the
415 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
418 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
419 frame->Xprevframe = NULL; /* Marks the top level */
421 /* Copy in the original argument variables */
424 frame->Xecode = ecode;
425 frame->Xmstart = mstart;
426 frame->Xoffset_top = offset_top;
428 frame->Xeptrb = eptrb;
429 frame->Xflags = flags;
430 frame->Xrdepth = rdepth;
432 /* This is where control jumps back to in order to effect "recursion" */
436 /* Macros make the argument variables come from the current frame */
438 #define eptr frame->Xeptr
439 #define ecode frame->Xecode
440 #define mstart frame->Xmstart
441 #define offset_top frame->Xoffset_top
442 #define ims frame->Xims
443 #define eptrb frame->Xeptrb
444 #define flags frame->Xflags
445 #define rdepth frame->Xrdepth
447 /* Ditto for the local variables */
450 #define charptr frame->Xcharptr
452 #define callpat frame->Xcallpat
453 #define data frame->Xdata
454 #define next frame->Xnext
455 #define pp frame->Xpp
456 #define prev frame->Xprev
457 #define saved_eptr frame->Xsaved_eptr
459 #define new_recursive frame->Xnew_recursive
461 #define cur_is_word frame->Xcur_is_word
462 #define condition frame->Xcondition
463 #define prev_is_word frame->Xprev_is_word
465 #define original_ims frame->Xoriginal_ims
468 #define prop_type frame->Xprop_type
469 #define prop_value frame->Xprop_value
470 #define prop_fail_result frame->Xprop_fail_result
471 #define prop_category frame->Xprop_category
472 #define prop_chartype frame->Xprop_chartype
473 #define prop_script frame->Xprop_script
474 #define oclength frame->Xoclength
475 #define occhars frame->Xocchars
478 #define ctype frame->Xctype
479 #define fc frame->Xfc
480 #define fi frame->Xfi
481 #define length frame->Xlength
482 #define max frame->Xmax
483 #define min frame->Xmin
484 #define number frame->Xnumber
485 #define offset frame->Xoffset
486 #define op frame->Xop
487 #define save_capture_last frame->Xsave_capture_last
488 #define save_offset1 frame->Xsave_offset1
489 #define save_offset2 frame->Xsave_offset2
490 #define save_offset3 frame->Xsave_offset3
491 #define stacksave frame->Xstacksave
493 #define newptrb frame->Xnewptrb
495 /* When recursion is being used, local variables are allocated on the stack and
496 get preserved during recursion in the normal way. In this environment, fi and
497 i, and fc and c, can be the same variables. */
499 #else /* NO_RECURSE not defined */
504 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
505 const uschar *charptr; /* in small blocks of the code. My normal */
506 #endif /* style of coding would have declared */
507 const uschar *callpat; /* them within each of those blocks. */
508 const uschar *data; /* However, in order to accommodate the */
509 const uschar *next; /* version of this code that uses an */
510 USPTR pp; /* external "stack" implemented on the */
511 const uschar *prev; /* heap, it is easier to declare them all */
512 USPTR saved_eptr; /* here, so the declarations can be cut */
513 /* out in a block. The only declarations */
514 recursion_info new_recursive; /* within blocks below are for variables */
515 /* that do not have to be preserved over */
516 BOOL cur_is_word; /* a recursive call to RMATCH(). */
520 unsigned long int original_ims;
525 int prop_fail_result;
540 int save_capture_last;
541 int save_offset1, save_offset2, save_offset3;
542 int stacksave[REC_STACK_SAVE_MAX];
545 #endif /* NO_RECURSE */
547 /* These statements are here to stop the compiler complaining about uninitialized
552 prop_fail_result = 0;
556 /* This label is used for tail recursion, which is used in a few cases even
557 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
558 used. Thanks to Ian Taylor for noticing this possibility and sending the
563 /* OK, now we can get on with the real code of the function. Recursive calls
564 are specified by the macro RMATCH and RRETURN is used to return. When
565 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
566 and a "return", respectively (possibly with some debugging if DEBUG is
567 defined). However, RMATCH isn't like a function call because it's quite a
568 complicated macro. It has to be used in one particular way. This shouldn't,
569 however, impact performance when true recursion is being used. */
572 utf8 = md->utf8; /* Local copy of the flag */
577 /* First check that we haven't called match() too many times, or that we
578 haven't exceeded the recursive call limit. */
580 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
581 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
583 original_ims = ims; /* Save for resetting on ')' */
585 /* At the start of a group with an unlimited repeat that may match an empty
586 string, the match_cbegroup flag is set. When this is the case, add the current
587 subject pointer to the chain of such remembered pointers, to be checked when we
588 hit the closing ket, in order to break infinite loops that match no characters.
589 When match() is called in other circumstances, don't add to the chain. If this
590 is a tail recursion, use a block from the workspace, as the one on the stack is
593 if ((flags & match_cbegroup) != 0)
596 if ((flags & match_tail_recursed) != 0)
598 if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);
599 p = md->eptrchain + md->eptrn++;
602 p->epb_saved_eptr = eptr;
607 /* Now start processing the opcodes. */
611 minimize = possessive = FALSE;
614 /* For partial matching, remember if we ever hit the end of the subject after
615 matching at least one subject character. */
618 eptr >= md->end_subject &&
624 /* Handle a capturing bracket. If there is space in the offset vector, save
625 the current subject position in the working slot at the top of the vector.
626 We mustn't change the current values of the data slot, because they may be
627 set from a previous iteration of this group, and be referred to by a
628 reference inside the group.
630 If the bracket fails to match, we need to restore this value and also the
631 values of the final offsets, in case they were set by a previous iteration
634 If there isn't enough space in the offset vector, treat this as if it were
635 a non-capturing bracket. Don't worry about setting the flag for the error
636 case here; that is handled in the code for KET. */
640 number = GET2(ecode, 1+LINK_SIZE);
641 offset = number << 1;
644 printf("start bracket %d\n", number);
646 pchars(eptr, 16, TRUE, md);
650 if (offset < md->offset_max)
652 save_offset1 = md->offset_vector[offset];
653 save_offset2 = md->offset_vector[offset+1];
654 save_offset3 = md->offset_vector[md->offset_end - number];
655 save_capture_last = md->capture_last;
657 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
658 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
660 flags = (op == OP_SCBRA)? match_cbegroup : 0;
663 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
664 ims, eptrb, flags, RM1);
665 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
666 md->capture_last = save_capture_last;
667 ecode += GET(ecode, 1);
669 while (*ecode == OP_ALT);
671 DPRINTF(("bracket %d failed\n", number));
673 md->offset_vector[offset] = save_offset1;
674 md->offset_vector[offset+1] = save_offset2;
675 md->offset_vector[md->offset_end - number] = save_offset3;
677 RRETURN(MATCH_NOMATCH);
680 /* Insufficient room for saving captured contents. Treat as a non-capturing
683 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
685 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
686 final alternative within the brackets, we would return the result of a
687 recursive call to match() whatever happened. We can reduce stack usage by
688 turning this into a tail recursion. */
692 DPRINTF(("start non-capturing bracket\n"));
693 flags = (op >= OP_SBRA)? match_cbegroup : 0;
696 if (ecode[GET(ecode, 1)] != OP_ALT)
698 ecode += _pcre_OP_lengths[*ecode];
699 flags |= match_tail_recursed;
700 DPRINTF(("bracket 0 tail recursion\n"));
704 /* For non-final alternatives, continue the loop for a NOMATCH result;
707 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
709 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
710 ecode += GET(ecode, 1);
712 /* Control never reaches here. */
714 /* Conditional group: compilation checked that there are no more than
715 two branches. If the condition is false, skipping the first branch takes us
716 past the end if there is only one branch, but that's OK because that is
717 exactly what going to the ket would do. As there is only one branch to be
718 obeyed, we can use tail recursion to avoid using another stack frame. */
722 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
724 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
725 condition = md->recursive != NULL &&
726 (offset == RREF_ANY || offset == md->recursive->group_num);
727 ecode += condition? 3 : GET(ecode, 1);
730 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
732 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
733 condition = offset < offset_top && md->offset_vector[offset] >= 0;
734 ecode += condition? 3 : GET(ecode, 1);
737 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
740 ecode += GET(ecode, 1);
743 /* The condition is an assertion. Call match() to evaluate it - setting
744 the final argument match_condassert causes it to stop at the end of an
749 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
750 match_condassert, RM3);
751 if (rrc == MATCH_MATCH)
754 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
755 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
757 else if (rrc != MATCH_NOMATCH)
759 RRETURN(rrc); /* Need braces because of following else */
764 ecode += GET(ecode, 1);
768 /* We are now at the branch that is to be obeyed. As there is only one,
769 we can use tail recursion to avoid using another stack frame. If the second
770 alternative doesn't exist, we can just plough on. */
772 if (condition || *ecode == OP_ALT)
774 ecode += 1 + LINK_SIZE;
775 flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);
780 ecode += 1 + LINK_SIZE;
785 /* End of the pattern. If we are in a top-level recursion, we should
786 restore the offsets appropriately and continue from after the call. */
789 if (md->recursive != NULL && md->recursive->group_num == 0)
791 recursion_info *rec = md->recursive;
792 DPRINTF(("End of pattern in a (?0) recursion\n"));
793 md->recursive = rec->prevrec;
794 memmove(md->offset_vector, rec->offset_save,
795 rec->saved_max * sizeof(int));
796 mstart = rec->save_start;
798 ecode = rec->after_call;
802 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
803 string - backtracking will then try other alternatives, if any. */
805 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
806 md->end_match_ptr = eptr; /* Record where we ended */
807 md->end_offset_top = offset_top; /* and how many extracts were taken */
808 md->start_match_ptr = mstart; /* and the start (\K can modify) */
809 RRETURN(MATCH_MATCH);
811 /* Change option settings */
816 DPRINTF(("ims set to %02lx\n", ims));
819 /* Assertion brackets. Check the alternative branches in turn - the
820 matching won't pass the KET for an assertion. If any one branch matches,
821 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
822 start of each branch to move the current point backwards, so the code at
823 this level is identical to the lookahead case. */
829 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
831 if (rrc == MATCH_MATCH) break;
832 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
833 ecode += GET(ecode, 1);
835 while (*ecode == OP_ALT);
836 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
838 /* If checking an assertion for a condition, return MATCH_MATCH. */
840 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
842 /* Continue from after the assertion, updating the offsets high water
843 mark, since extracts may have been taken during the assertion. */
845 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
846 ecode += 1 + LINK_SIZE;
847 offset_top = md->end_offset_top;
850 /* Negative assertion: all branches must fail to match */
853 case OP_ASSERTBACK_NOT:
856 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
858 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
859 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
860 ecode += GET(ecode,1);
862 while (*ecode == OP_ALT);
864 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
866 ecode += 1 + LINK_SIZE;
869 /* Move the subject pointer back. This occurs only at the start of
870 each branch of a lookbehind assertion. If we are too close to the start to
871 move back, this match function fails. When working with UTF-8 we move
872 back a number of characters, not bytes. */
882 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
889 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
892 eptr -= GET(ecode, 1);
893 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
896 /* Skip to next op code */
898 ecode += 1 + LINK_SIZE;
901 /* The callout item calls an external function, if one is provided, passing
902 details of the match so far. This is mainly for debugging, though the
903 function is able to force a failure. */
906 if (pcre_callout != NULL)
908 pcre_callout_block cb;
909 cb.version = 1; /* Version 1 of the callout block */
910 cb.callout_number = ecode[1];
911 cb.offset_vector = md->offset_vector;
912 cb.subject = (PCRE_SPTR)md->start_subject;
913 cb.subject_length = md->end_subject - md->start_subject;
914 cb.start_match = mstart - md->start_subject;
915 cb.current_position = eptr - md->start_subject;
916 cb.pattern_position = GET(ecode, 2);
917 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
918 cb.capture_top = offset_top/2;
919 cb.capture_last = md->capture_last;
920 cb.callout_data = md->callout_data;
921 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
922 if (rrc < 0) RRETURN(rrc);
924 ecode += 2 + 2*LINK_SIZE;
927 /* Recursion either matches the current regex, or some subexpression. The
928 offset data is the offset to the starting bracket from the start of the
929 whole pattern. (This is so that it works from duplicated subpatterns.)
931 If there are any capturing brackets started but not finished, we have to
932 save their starting points and reinstate them after the recursion. However,
933 we don't know how many such there are (offset_top records the completed
934 total) so we just have to save all the potential data. There may be up to
935 65535 such values, which is too large to put on the stack, but using malloc
936 for small numbers seems expensive. As a compromise, the stack is used when
937 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
938 is used. A problem is what to do if the malloc fails ... there is no way of
939 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
940 values on the stack, and accept that the rest may be wrong.
942 There are also other values that have to be saved. We use a chained
943 sequence of blocks that actually live on the stack. Thanks to Robin Houston
944 for the original version of this logic. */
948 callpat = md->start_code + GET(ecode, 1);
949 new_recursive.group_num = (callpat == md->start_code)? 0 :
950 GET2(callpat, 1 + LINK_SIZE);
952 /* Add to "recursing stack" */
954 new_recursive.prevrec = md->recursive;
955 md->recursive = &new_recursive;
957 /* Find where to continue from afterwards */
959 ecode += 1 + LINK_SIZE;
960 new_recursive.after_call = ecode;
962 /* Now save the offset data. */
964 new_recursive.saved_max = md->offset_end;
965 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
966 new_recursive.offset_save = stacksave;
969 new_recursive.offset_save =
970 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
971 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
974 memcpy(new_recursive.offset_save, md->offset_vector,
975 new_recursive.saved_max * sizeof(int));
976 new_recursive.save_start = mstart;
979 /* OK, now we can do the recursion. For each top-level alternative we
980 restore the offset and recursion data. */
982 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
983 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
986 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
987 md, ims, eptrb, flags, RM6);
988 if (rrc == MATCH_MATCH)
990 DPRINTF(("Recursion matched\n"));
991 md->recursive = new_recursive.prevrec;
992 if (new_recursive.offset_save != stacksave)
993 (pcre_free)(new_recursive.offset_save);
994 RRETURN(MATCH_MATCH);
996 else if (rrc != MATCH_NOMATCH)
998 DPRINTF(("Recursion gave error %d\n", rrc));
1002 md->recursive = &new_recursive;
1003 memcpy(md->offset_vector, new_recursive.offset_save,
1004 new_recursive.saved_max * sizeof(int));
1005 callpat += GET(callpat, 1);
1007 while (*callpat == OP_ALT);
1009 DPRINTF(("Recursion didn't match\n"));
1010 md->recursive = new_recursive.prevrec;
1011 if (new_recursive.offset_save != stacksave)
1012 (pcre_free)(new_recursive.offset_save);
1013 RRETURN(MATCH_NOMATCH);
1015 /* Control never reaches here */
1017 /* "Once" brackets are like assertion brackets except that after a match,
1018 the point in the subject string is not moved back. Thus there can never be
1019 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1020 Check the alternative branches in turn - the matching won't pass the KET
1021 for this kind of subpattern. If any one branch matches, we carry on as at
1022 the end of a normal bracket, leaving the subject pointer. */
1030 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
1032 if (rrc == MATCH_MATCH) break;
1033 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1034 ecode += GET(ecode,1);
1036 while (*ecode == OP_ALT);
1038 /* If hit the end of the group (which could be repeated), fail */
1040 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1042 /* Continue as from after the assertion, updating the offsets high water
1043 mark, since extracts may have been taken. */
1045 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1047 offset_top = md->end_offset_top;
1048 eptr = md->end_match_ptr;
1050 /* For a non-repeating ket, just continue at this level. This also
1051 happens for a repeating ket if no characters were matched in the group.
1052 This is the forcible breaking of infinite loops as implemented in Perl
1053 5.005. If there is an options reset, it will get obeyed in the normal
1054 course of events. */
1056 if (*ecode == OP_KET || eptr == saved_eptr)
1058 ecode += 1+LINK_SIZE;
1062 /* The repeating kets try the rest of the pattern or restart from the
1063 preceding bracket, in the appropriate order. The second "call" of match()
1064 uses tail recursion, to avoid using another stack frame. We need to reset
1065 any options that changed within the bracket before re-running it, so
1066 check the next opcode. */
1068 if (ecode[1+LINK_SIZE] == OP_OPT)
1070 ims = (ims & ~PCRE_IMS) | ecode[4];
1071 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1074 if (*ecode == OP_KETRMIN)
1076 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0,
1078 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1080 flags = match_tail_recursed;
1083 else /* OP_KETRMAX */
1085 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1086 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1087 ecode += 1 + LINK_SIZE;
1088 flags = match_tail_recursed;
1091 /* Control never gets here */
1093 /* An alternation is the end of a branch; scan along to find the end of the
1094 bracketed group and go to there. */
1097 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1100 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1101 that it may occur zero times. It may repeat infinitely, or not at all -
1102 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1103 repeat limits are compiled as a number of copies, with the optional ones
1104 preceded by BRAZERO or BRAMINZERO. */
1109 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1110 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1111 do next += GET(next,1); while (*next == OP_ALT);
1112 ecode = next + 1 + LINK_SIZE;
1119 do next += GET(next, 1); while (*next == OP_ALT);
1120 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1121 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1126 /* End of a group, repeated or non-repeating. */
1131 prev = ecode - GET(ecode, 1);
1133 /* If this was a group that remembered the subject start, in order to break
1134 infinite repeats of empty string matches, retrieve the subject start from
1135 the chain. Otherwise, set it NULL. */
1137 if (*prev >= OP_SBRA)
1139 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1140 eptrb = eptrb->epb_prev; /* Backup to previous group */
1142 else saved_eptr = NULL;
1144 /* If we are at the end of an assertion group, stop matching and return
1145 MATCH_MATCH, but record the current high water mark for use by positive
1146 assertions. Do this also for the "once" (atomic) groups. */
1148 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1149 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1152 md->end_match_ptr = eptr; /* For ONCE */
1153 md->end_offset_top = offset_top;
1154 RRETURN(MATCH_MATCH);
1157 /* For capturing groups we have to check the group number back at the start
1158 and if necessary complete handling an extraction by setting the offsets and
1159 bumping the high water mark. Note that whole-pattern recursion is coded as
1160 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1161 when the OP_END is reached. Other recursion is handled here. */
1163 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1165 number = GET2(prev, 1+LINK_SIZE);
1166 offset = number << 1;
1169 printf("end bracket %d", number);
1173 md->capture_last = number;
1174 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1176 md->offset_vector[offset] =
1177 md->offset_vector[md->offset_end - number];
1178 md->offset_vector[offset+1] = eptr - md->start_subject;
1179 if (offset_top <= offset) offset_top = offset + 2;
1182 /* Handle a recursively called group. Restore the offsets
1183 appropriately and continue from after the call. */
1185 if (md->recursive != NULL && md->recursive->group_num == number)
1187 recursion_info *rec = md->recursive;
1188 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1189 md->recursive = rec->prevrec;
1190 mstart = rec->save_start;
1191 memcpy(md->offset_vector, rec->offset_save,
1192 rec->saved_max * sizeof(int));
1193 ecode = rec->after_call;
1199 /* For both capturing and non-capturing groups, reset the value of the ims
1200 flags, in case they got changed during the group. */
1203 DPRINTF(("ims reset to %02lx\n", ims));
1205 /* For a non-repeating ket, just continue at this level. This also
1206 happens for a repeating ket if no characters were matched in the group.
1207 This is the forcible breaking of infinite loops as implemented in Perl
1208 5.005. If there is an options reset, it will get obeyed in the normal
1209 course of events. */
1211 if (*ecode == OP_KET || eptr == saved_eptr)
1213 ecode += 1 + LINK_SIZE;
1217 /* The repeating kets try the rest of the pattern or restart from the
1218 preceding bracket, in the appropriate order. In the second case, we can use
1219 tail recursion to avoid using another stack frame. */
1221 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1223 if (*ecode == OP_KETRMIN)
1225 RMATCH(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0,
1227 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1229 flags |= match_tail_recursed;
1232 else /* OP_KETRMAX */
1234 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1235 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1236 ecode += 1 + LINK_SIZE;
1237 flags = match_tail_recursed;
1240 /* Control never gets here */
1242 /* Start of subject unless notbol, or after internal newline if multiline */
1245 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1246 if ((ims & PCRE_MULTILINE) != 0)
1248 if (eptr != md->start_subject &&
1249 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1250 RRETURN(MATCH_NOMATCH);
1254 /* ... else fall through */
1256 /* Start of subject assertion */
1259 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1263 /* Start of match assertion */
1266 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1270 /* Reset the start of match point */
1277 /* Assert before internal newline if multiline, or before a terminating
1278 newline unless endonly is set, else end of subject unless noteol is set. */
1281 if ((ims & PCRE_MULTILINE) != 0)
1283 if (eptr < md->end_subject)
1284 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1286 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1292 if (md->noteol) RRETURN(MATCH_NOMATCH);
1295 if (eptr != md->end_subject &&
1296 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1297 RRETURN(MATCH_NOMATCH);
1302 /* ... else fall through for endonly */
1304 /* End of subject assertion (\z) */
1307 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1311 /* End of subject or ending \n assertion (\Z) */
1314 if (eptr != md->end_subject &&
1315 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1316 RRETURN(MATCH_NOMATCH);
1320 /* Word boundary assertions */
1322 case OP_NOT_WORD_BOUNDARY:
1323 case OP_WORD_BOUNDARY:
1326 /* Find out if the previous and current characters are "word" characters.
1327 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1328 be "non-word" characters. */
1333 if (eptr == md->start_subject) prev_is_word = FALSE; else
1335 const uschar *lastptr = eptr - 1;
1336 while((*lastptr & 0xc0) == 0x80) lastptr--;
1337 GETCHAR(c, lastptr);
1338 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1340 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1343 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1349 /* More streamlined when not in UTF-8 mode */
1352 prev_is_word = (eptr != md->start_subject) &&
1353 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1354 cur_is_word = (eptr < md->end_subject) &&
1355 ((md->ctypes[*eptr] & ctype_word) != 0);
1358 /* Now see if the situation is what we want */
1360 if ((*ecode++ == OP_WORD_BOUNDARY)?
1361 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1362 RRETURN(MATCH_NOMATCH);
1366 /* Match a single character type; inline for speed */
1369 if ((ims & PCRE_DOTALL) == 0)
1371 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1373 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1375 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1379 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1380 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1383 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1388 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1389 GETCHARINCTEST(c, eptr);
1394 (md->ctypes[c] & ctype_digit) != 0
1396 RRETURN(MATCH_NOMATCH);
1401 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1402 GETCHARINCTEST(c, eptr);
1407 (md->ctypes[c] & ctype_digit) == 0
1409 RRETURN(MATCH_NOMATCH);
1413 case OP_NOT_WHITESPACE:
1414 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1415 GETCHARINCTEST(c, eptr);
1420 (md->ctypes[c] & ctype_space) != 0
1422 RRETURN(MATCH_NOMATCH);
1427 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1428 GETCHARINCTEST(c, eptr);
1433 (md->ctypes[c] & ctype_space) == 0
1435 RRETURN(MATCH_NOMATCH);
1439 case OP_NOT_WORDCHAR:
1440 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1441 GETCHARINCTEST(c, eptr);
1446 (md->ctypes[c] & ctype_word) != 0
1448 RRETURN(MATCH_NOMATCH);
1453 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1454 GETCHARINCTEST(c, eptr);
1459 (md->ctypes[c] & ctype_word) == 0
1461 RRETURN(MATCH_NOMATCH);
1466 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1467 GETCHARINCTEST(c, eptr);
1470 default: RRETURN(MATCH_NOMATCH);
1472 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1486 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1487 GETCHARINCTEST(c, eptr);
1492 case 0x20: /* SPACE */
1493 case 0xa0: /* NBSP */
1494 case 0x1680: /* OGHAM SPACE MARK */
1495 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1496 case 0x2000: /* EN QUAD */
1497 case 0x2001: /* EM QUAD */
1498 case 0x2002: /* EN SPACE */
1499 case 0x2003: /* EM SPACE */
1500 case 0x2004: /* THREE-PER-EM SPACE */
1501 case 0x2005: /* FOUR-PER-EM SPACE */
1502 case 0x2006: /* SIX-PER-EM SPACE */
1503 case 0x2007: /* FIGURE SPACE */
1504 case 0x2008: /* PUNCTUATION SPACE */
1505 case 0x2009: /* THIN SPACE */
1506 case 0x200A: /* HAIR SPACE */
1507 case 0x202f: /* NARROW NO-BREAK SPACE */
1508 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1509 case 0x3000: /* IDEOGRAPHIC SPACE */
1510 RRETURN(MATCH_NOMATCH);
1516 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1517 GETCHARINCTEST(c, eptr);
1520 default: RRETURN(MATCH_NOMATCH);
1522 case 0x20: /* SPACE */
1523 case 0xa0: /* NBSP */
1524 case 0x1680: /* OGHAM SPACE MARK */
1525 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1526 case 0x2000: /* EN QUAD */
1527 case 0x2001: /* EM QUAD */
1528 case 0x2002: /* EN SPACE */
1529 case 0x2003: /* EM SPACE */
1530 case 0x2004: /* THREE-PER-EM SPACE */
1531 case 0x2005: /* FOUR-PER-EM SPACE */
1532 case 0x2006: /* SIX-PER-EM SPACE */
1533 case 0x2007: /* FIGURE SPACE */
1534 case 0x2008: /* PUNCTUATION SPACE */
1535 case 0x2009: /* THIN SPACE */
1536 case 0x200A: /* HAIR SPACE */
1537 case 0x202f: /* NARROW NO-BREAK SPACE */
1538 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1539 case 0x3000: /* IDEOGRAPHIC SPACE */
1546 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1547 GETCHARINCTEST(c, eptr);
1555 case 0x85: /* NEL */
1556 case 0x2028: /* LINE SEPARATOR */
1557 case 0x2029: /* PARAGRAPH SEPARATOR */
1558 RRETURN(MATCH_NOMATCH);
1564 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1565 GETCHARINCTEST(c, eptr);
1568 default: RRETURN(MATCH_NOMATCH);
1573 case 0x85: /* NEL */
1574 case 0x2028: /* LINE SEPARATOR */
1575 case 0x2029: /* PARAGRAPH SEPARATOR */
1582 /* Check the next character by Unicode property. We will get here only
1583 if the support is in the binary; otherwise a compile-time error occurs. */
1587 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1588 GETCHARINCTEST(c, eptr);
1590 int chartype, script;
1591 int category = _pcre_ucp_findprop(c, &chartype, &script);
1596 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1600 if ((chartype == ucp_Lu ||
1601 chartype == ucp_Ll ||
1602 chartype == ucp_Lt) == (op == OP_NOTPROP))
1603 RRETURN(MATCH_NOMATCH);
1607 if ((ecode[2] != category) == (op == OP_PROP))
1608 RRETURN(MATCH_NOMATCH);
1612 if ((ecode[2] != chartype) == (op == OP_PROP))
1613 RRETURN(MATCH_NOMATCH);
1617 if ((ecode[2] != script) == (op == OP_PROP))
1618 RRETURN(MATCH_NOMATCH);
1622 RRETURN(PCRE_ERROR_INTERNAL);
1629 /* Match an extended Unicode sequence. We will get here only if the support
1630 is in the binary; otherwise a compile-time error occurs. */
1633 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1634 GETCHARINCTEST(c, eptr);
1636 int chartype, script;
1637 int category = _pcre_ucp_findprop(c, &chartype, &script);
1638 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1639 while (eptr < md->end_subject)
1642 if (!utf8) c = *eptr; else
1644 GETCHARLEN(c, eptr, len);
1646 category = _pcre_ucp_findprop(c, &chartype, &script);
1647 if (category != ucp_M) break;
1656 /* Match a back reference, possibly repeatedly. Look past the end of the
1657 item to see if there is repeat information following. The code is similar
1658 to that for character classes, but repeated for efficiency. Then obey
1659 similar code to character type repeats - written out again for speed.
1660 However, if the referenced string is the empty string, always treat
1661 it as matched, any number of times (otherwise there could be infinite
1666 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1667 ecode += 3; /* Advance past item */
1669 /* If the reference is unset, set the length to be longer than the amount
1670 of subject left; this ensures that every attempt at a match fails. We
1671 can't just fail here, because of the possibility of quantifiers with zero
1674 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1675 md->end_subject - eptr + 1 :
1676 md->offset_vector[offset+1] - md->offset_vector[offset];
1678 /* Set up for repetition, or handle the non-repeated case */
1688 c = *ecode++ - OP_CRSTAR;
1689 minimize = (c & 1) != 0;
1690 min = rep_min[c]; /* Pick up values from tables; */
1691 max = rep_max[c]; /* zero for max => infinity */
1692 if (max == 0) max = INT_MAX;
1697 minimize = (*ecode == OP_CRMINRANGE);
1698 min = GET2(ecode, 1);
1699 max = GET2(ecode, 3);
1700 if (max == 0) max = INT_MAX;
1704 default: /* No repeat follows */
1705 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1707 continue; /* With the main loop */
1710 /* If the length of the reference is zero, just continue with the
1713 if (length == 0) continue;
1715 /* First, ensure the minimum number of matches are present. We get back
1716 the length of the reference string explicitly rather than passing the
1717 address of eptr, so that eptr can be a register variable. */
1719 for (i = 1; i <= min; i++)
1721 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1725 /* If min = max, continue at the same level without recursion.
1726 They are not both allowed to be zero. */
1728 if (min == max) continue;
1730 /* If minimizing, keep trying and advancing the pointer */
1734 for (fi = min;; fi++)
1736 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1737 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1738 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1739 RRETURN(MATCH_NOMATCH);
1742 /* Control never gets here */
1745 /* If maximizing, find the longest string and work backwards */
1750 for (i = min; i < max; i++)
1752 if (!match_ref(offset, eptr, length, md, ims)) break;
1757 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1758 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1761 RRETURN(MATCH_NOMATCH);
1764 /* Control never gets here */
1768 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1769 used when all the characters in the class have values in the range 0-255,
1770 and either the matching is caseful, or the characters are in the range
1771 0-127 when UTF-8 processing is enabled. The only difference between
1772 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1775 First, look past the end of the item to see if there is repeat information
1776 following. Then obey similar code to character type repeats - written out
1782 data = ecode + 1; /* Save for matching */
1783 ecode += 33; /* Advance past the item */
1793 c = *ecode++ - OP_CRSTAR;
1794 minimize = (c & 1) != 0;
1795 min = rep_min[c]; /* Pick up values from tables; */
1796 max = rep_max[c]; /* zero for max => infinity */
1797 if (max == 0) max = INT_MAX;
1802 minimize = (*ecode == OP_CRMINRANGE);
1803 min = GET2(ecode, 1);
1804 max = GET2(ecode, 3);
1805 if (max == 0) max = INT_MAX;
1809 default: /* No repeat follows */
1814 /* First, ensure the minimum number of matches are present. */
1820 for (i = 1; i <= min; i++)
1822 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1823 GETCHARINC(c, eptr);
1826 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1830 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1836 /* Not UTF-8 mode */
1838 for (i = 1; i <= min; i++)
1840 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1842 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1846 /* If max == min we can continue with the main loop without the
1849 if (min == max) continue;
1851 /* If minimizing, keep testing the rest of the expression and advancing
1852 the pointer while it matches the class. */
1860 for (fi = min;; fi++)
1862 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1863 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1864 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1865 GETCHARINC(c, eptr);
1868 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1872 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1878 /* Not UTF-8 mode */
1880 for (fi = min;; fi++)
1882 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1883 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1884 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1886 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1889 /* Control never gets here */
1892 /* If maximizing, find the longest possible run, then work backwards. */
1902 for (i = min; i < max; i++)
1905 if (eptr >= md->end_subject) break;
1906 GETCHARLEN(c, eptr, len);
1909 if (op == OP_CLASS) break;
1913 if ((data[c/8] & (1 << (c&7))) == 0) break;
1919 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1920 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1921 if (eptr-- == pp) break; /* Stop if tried at original pos */
1927 /* Not UTF-8 mode */
1929 for (i = min; i < max; i++)
1931 if (eptr >= md->end_subject) break;
1933 if ((data[c/8] & (1 << (c&7))) == 0) break;
1938 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
1939 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1944 RRETURN(MATCH_NOMATCH);
1947 /* Control never gets here */
1950 /* Match an extended character class. This opcode is encountered only
1951 in UTF-8 mode, because that's the only time it is compiled. */
1956 data = ecode + 1 + LINK_SIZE; /* Save for matching */
1957 ecode += GET(ecode, 1); /* Advance past the item */
1967 c = *ecode++ - OP_CRSTAR;
1968 minimize = (c & 1) != 0;
1969 min = rep_min[c]; /* Pick up values from tables; */
1970 max = rep_max[c]; /* zero for max => infinity */
1971 if (max == 0) max = INT_MAX;
1976 minimize = (*ecode == OP_CRMINRANGE);
1977 min = GET2(ecode, 1);
1978 max = GET2(ecode, 3);
1979 if (max == 0) max = INT_MAX;
1983 default: /* No repeat follows */
1988 /* First, ensure the minimum number of matches are present. */
1990 for (i = 1; i <= min; i++)
1992 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1993 GETCHARINC(c, eptr);
1994 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1997 /* If max == min we can continue with the main loop without the
2000 if (min == max) continue;
2002 /* If minimizing, keep testing the rest of the expression and advancing
2003 the pointer while it matches the class. */
2007 for (fi = min;; fi++)
2009 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2011 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2012 GETCHARINC(c, eptr);
2013 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2015 /* Control never gets here */
2018 /* If maximizing, find the longest possible run, then work backwards. */
2023 for (i = min; i < max; i++)
2026 if (eptr >= md->end_subject) break;
2027 GETCHARLEN(c, eptr, len);
2028 if (!_pcre_xclass(c, data)) break;
2033 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2034 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2035 if (eptr-- == pp) break; /* Stop if tried at original pos */
2038 RRETURN(MATCH_NOMATCH);
2041 /* Control never gets here */
2043 #endif /* End of XCLASS */
2045 /* Match a single character, casefully */
2053 GETCHARLEN(fc, ecode, length);
2054 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2055 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2060 /* Non-UTF-8 mode */
2062 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2063 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2068 /* Match a single character, caselessly */
2076 GETCHARLEN(fc, ecode, length);
2078 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2080 /* If the pattern character's value is < 128, we have only one byte, and
2081 can use the fast lookup table. */
2085 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2088 /* Otherwise we must pick up the subject character */
2093 GETCHARINC(dc, eptr);
2096 /* If we have Unicode property support, we can use it to test the other
2097 case of the character, if there is one. */
2102 if (dc != _pcre_ucp_othercase(fc))
2104 RRETURN(MATCH_NOMATCH);
2109 #endif /* SUPPORT_UTF8 */
2111 /* Non-UTF-8 mode */
2113 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2114 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2119 /* Match a single character repeatedly. */
2122 min = max = GET2(ecode, 1);
2133 max = GET2(ecode, 1);
2134 minimize = *ecode == OP_MINUPTO;
2165 c = *ecode++ - OP_STAR;
2166 minimize = (c & 1) != 0;
2167 min = rep_min[c]; /* Pick up values from tables; */
2168 max = rep_max[c]; /* zero for max => infinity */
2169 if (max == 0) max = INT_MAX;
2171 /* Common code for all repeated single-character matches. We can give
2172 up quickly if there are fewer than the minimum number of characters left in
2181 GETCHARLEN(fc, ecode, length);
2182 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2185 /* Handle multibyte character matching specially here. There is
2186 support for caseless matching if UCP support is present. */
2191 unsigned int othercase;
2192 if ((ims & PCRE_CASELESS) != 0 &&
2193 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2194 oclength = _pcre_ord2utf8(othercase, occhars);
2196 #endif /* SUPPORT_UCP */
2198 for (i = 1; i <= min; i++)
2200 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2202 /* Need braces because of following else */
2203 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2206 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2209 #else /* without SUPPORT_UCP */
2210 else { RRETURN(MATCH_NOMATCH); }
2211 #endif /* SUPPORT_UCP */
2214 if (min == max) continue;
2218 for (fi = min;; fi++)
2220 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2221 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2222 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2223 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2225 /* Need braces because of following else */
2226 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2229 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2232 #else /* without SUPPORT_UCP */
2233 else { RRETURN (MATCH_NOMATCH); }
2234 #endif /* SUPPORT_UCP */
2236 /* Control never gets here */
2242 for (i = min; i < max; i++)
2244 if (eptr > md->end_subject - length) break;
2245 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2247 else if (oclength == 0) break;
2250 if (memcmp(eptr, occhars, oclength) != 0) break;
2253 #else /* without SUPPORT_UCP */
2255 #endif /* SUPPORT_UCP */
2258 if (possessive) continue;
2261 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2262 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2263 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2267 #else /* without SUPPORT_UCP */
2269 #endif /* SUPPORT_UCP */
2272 /* Control never gets here */
2275 /* If the length of a UTF-8 character is 1, we fall through here, and
2276 obey the code as for non-UTF-8 characters below, though in this case the
2277 value of fc will always be < 128. */
2280 #endif /* SUPPORT_UTF8 */
2282 /* When not in UTF-8 mode, load a single-byte character. */
2284 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2288 /* The value of fc at this point is always less than 256, though we may or
2289 may not be in UTF-8 mode. The code is duplicated for the caseless and
2290 caseful cases, for speed, since matching characters is likely to be quite
2291 common. First, ensure the minimum number of matches are present. If min =
2292 max, continue at the same level without recursing. Otherwise, if
2293 minimizing, keep trying the rest of the expression and advancing one
2294 matching character if failing, up to the maximum. Alternatively, if
2295 maximizing, find the maximum number of characters and work backwards. */
2297 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2300 if ((ims & PCRE_CASELESS) != 0)
2303 for (i = 1; i <= min; i++)
2304 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2305 if (min == max) continue;
2308 for (fi = min;; fi++)
2310 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2311 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2312 if (fi >= max || eptr >= md->end_subject ||
2313 fc != md->lcc[*eptr++])
2314 RRETURN(MATCH_NOMATCH);
2316 /* Control never gets here */
2321 for (i = min; i < max; i++)
2323 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2326 if (possessive) continue;
2329 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2331 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2333 RRETURN(MATCH_NOMATCH);
2335 /* Control never gets here */
2338 /* Caseful comparisons (includes all multi-byte characters) */
2342 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2343 if (min == max) continue;
2346 for (fi = min;; fi++)
2348 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2349 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2350 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2351 RRETURN(MATCH_NOMATCH);
2353 /* Control never gets here */
2358 for (i = min; i < max; i++)
2360 if (eptr >= md->end_subject || fc != *eptr) break;
2363 if (possessive) continue;
2366 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2368 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2370 RRETURN(MATCH_NOMATCH);
2373 /* Control never gets here */
2375 /* Match a negated single one-byte character. The character we are
2376 checking can be multibyte. */
2379 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2381 GETCHARINCTEST(c, eptr);
2382 if ((ims & PCRE_CASELESS) != 0)
2388 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2392 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2396 /* Match a negated single one-byte character repeatedly. This is almost a
2397 repeat of the code for a repeated single character, but I haven't found a
2398 nice way of commoning these up that doesn't require a test of the
2399 positive/negative option for each character match. Maybe that wouldn't add
2400 very much to the time taken, but character matching *is* what this is all
2404 min = max = GET2(ecode, 1);
2411 max = GET2(ecode, 1);
2412 minimize = *ecode == OP_NOTMINUPTO;
2430 case OP_NOTPOSQUERY:
2440 max = GET2(ecode, 1);
2449 case OP_NOTMINQUERY:
2450 c = *ecode++ - OP_NOTSTAR;
2451 minimize = (c & 1) != 0;
2452 min = rep_min[c]; /* Pick up values from tables; */
2453 max = rep_max[c]; /* zero for max => infinity */
2454 if (max == 0) max = INT_MAX;
2456 /* Common code for all repeated single-byte matches. We can give up quickly
2457 if there are fewer than the minimum number of bytes left in the
2461 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2464 /* The code is duplicated for the caseless and caseful cases, for speed,
2465 since matching characters is likely to be quite common. First, ensure the
2466 minimum number of matches are present. If min = max, continue at the same
2467 level without recursing. Otherwise, if minimizing, keep trying the rest of
2468 the expression and advancing one matching character if failing, up to the
2469 maximum. Alternatively, if maximizing, find the maximum number of
2470 characters and work backwards. */
2472 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2475 if ((ims & PCRE_CASELESS) != 0)
2483 register unsigned int d;
2484 for (i = 1; i <= min; i++)
2486 GETCHARINC(d, eptr);
2487 if (d < 256) d = md->lcc[d];
2488 if (fc == d) RRETURN(MATCH_NOMATCH);
2494 /* Not UTF-8 mode */
2496 for (i = 1; i <= min; i++)
2497 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2500 if (min == max) continue;
2508 register unsigned int d;
2509 for (fi = min;; fi++)
2511 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2512 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2513 GETCHARINC(d, eptr);
2514 if (d < 256) d = md->lcc[d];
2515 if (fi >= max || eptr >= md->end_subject || fc == d)
2516 RRETURN(MATCH_NOMATCH);
2521 /* Not UTF-8 mode */
2523 for (fi = min;; fi++)
2525 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2526 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2527 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2528 RRETURN(MATCH_NOMATCH);
2531 /* Control never gets here */
2544 register unsigned int d;
2545 for (i = min; i < max; i++)
2548 if (eptr >= md->end_subject) break;
2549 GETCHARLEN(d, eptr, len);
2550 if (d < 256) d = md->lcc[d];
2554 if (possessive) continue;
2557 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2558 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2559 if (eptr-- == pp) break; /* Stop if tried at original pos */
2565 /* Not UTF-8 mode */
2567 for (i = min; i < max; i++)
2569 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2572 if (possessive) continue;
2575 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2576 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2581 RRETURN(MATCH_NOMATCH);
2583 /* Control never gets here */
2586 /* Caseful comparisons */
2594 register unsigned int d;
2595 for (i = 1; i <= min; i++)
2597 GETCHARINC(d, eptr);
2598 if (fc == d) RRETURN(MATCH_NOMATCH);
2603 /* Not UTF-8 mode */
2605 for (i = 1; i <= min; i++)
2606 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2609 if (min == max) continue;
2617 register unsigned int d;
2618 for (fi = min;; fi++)
2620 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2621 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2622 GETCHARINC(d, eptr);
2623 if (fi >= max || eptr >= md->end_subject || fc == d)
2624 RRETURN(MATCH_NOMATCH);
2629 /* Not UTF-8 mode */
2631 for (fi = min;; fi++)
2633 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2634 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2635 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2636 RRETURN(MATCH_NOMATCH);
2639 /* Control never gets here */
2652 register unsigned int d;
2653 for (i = min; i < max; i++)
2656 if (eptr >= md->end_subject) break;
2657 GETCHARLEN(d, eptr, len);
2661 if (possessive) continue;
2664 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2665 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2666 if (eptr-- == pp) break; /* Stop if tried at original pos */
2672 /* Not UTF-8 mode */
2674 for (i = min; i < max; i++)
2676 if (eptr >= md->end_subject || fc == *eptr) break;
2679 if (possessive) continue;
2682 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2683 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2688 RRETURN(MATCH_NOMATCH);
2691 /* Control never gets here */
2693 /* Match a single character type repeatedly; several different opcodes
2694 share code. This is very similar to the code for single characters, but we
2695 repeat it in the interests of efficiency. */
2698 min = max = GET2(ecode, 1);
2704 case OP_TYPEMINUPTO:
2706 max = GET2(ecode, 1);
2707 minimize = *ecode == OP_TYPEMINUPTO;
2711 case OP_TYPEPOSSTAR:
2718 case OP_TYPEPOSPLUS:
2725 case OP_TYPEPOSQUERY:
2732 case OP_TYPEPOSUPTO:
2735 max = GET2(ecode, 1);
2740 case OP_TYPEMINSTAR:
2742 case OP_TYPEMINPLUS:
2744 case OP_TYPEMINQUERY:
2745 c = *ecode++ - OP_TYPESTAR;
2746 minimize = (c & 1) != 0;
2747 min = rep_min[c]; /* Pick up values from tables; */
2748 max = rep_max[c]; /* zero for max => infinity */
2749 if (max == 0) max = INT_MAX;
2751 /* Common code for all repeated single character type matches. Note that
2752 in UTF-8 mode, '.' matches a character of any length, but for the other
2753 character types, the valid characters are all one-byte long. */
2756 ctype = *ecode++; /* Code for the character type */
2759 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2761 prop_fail_result = ctype == OP_NOTPROP;
2762 prop_type = *ecode++;
2763 prop_value = *ecode++;
2765 else prop_type = -1;
2768 /* First, ensure the minimum number of matches are present. Use inline
2769 code for maximizing the speed, and do the type test once at the start
2770 (i.e. keep it out of the loop). Also we can test that there are at least
2771 the minimum number of bytes before we start. This isn't as effective in
2772 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2773 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2774 and single-bytes. */
2776 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2785 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2786 for (i = 1; i <= min; i++)
2788 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2789 GETCHARINCTEST(c, eptr);
2794 for (i = 1; i <= min; i++)
2796 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2797 GETCHARINCTEST(c, eptr);
2798 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2799 if ((prop_chartype == ucp_Lu ||
2800 prop_chartype == ucp_Ll ||
2801 prop_chartype == ucp_Lt) == prop_fail_result)
2802 RRETURN(MATCH_NOMATCH);
2807 for (i = 1; i <= min; i++)
2809 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2810 GETCHARINCTEST(c, eptr);
2811 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2812 if ((prop_category == prop_value) == prop_fail_result)
2813 RRETURN(MATCH_NOMATCH);
2818 for (i = 1; i <= min; i++)
2820 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2821 GETCHARINCTEST(c, eptr);
2822 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2823 if ((prop_chartype == prop_value) == prop_fail_result)
2824 RRETURN(MATCH_NOMATCH);
2829 for (i = 1; i <= min; i++)
2831 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2832 GETCHARINCTEST(c, eptr);
2833 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2834 if ((prop_script == prop_value) == prop_fail_result)
2835 RRETURN(MATCH_NOMATCH);
2840 RRETURN(PCRE_ERROR_INTERNAL);
2844 /* Match extended Unicode sequences. We will get here only if the
2845 support is in the binary; otherwise a compile-time error occurs. */
2847 else if (ctype == OP_EXTUNI)
2849 for (i = 1; i <= min; i++)
2851 GETCHARINCTEST(c, eptr);
2852 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2853 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2854 while (eptr < md->end_subject)
2857 if (!utf8) c = *eptr; else
2859 GETCHARLEN(c, eptr, len);
2861 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2862 if (prop_category != ucp_M) break;
2869 #endif /* SUPPORT_UCP */
2871 /* Handle all other cases when the coding is UTF-8 */
2874 if (utf8) switch(ctype)
2877 for (i = 1; i <= min; i++)
2879 if (eptr >= md->end_subject ||
2880 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2881 RRETURN(MATCH_NOMATCH);
2883 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2892 for (i = 1; i <= min; i++)
2894 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2895 GETCHARINC(c, eptr);
2898 default: RRETURN(MATCH_NOMATCH);
2900 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2914 for (i = 1; i <= min; i++)
2916 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2917 GETCHARINC(c, eptr);
2922 case 0x20: /* SPACE */
2923 case 0xa0: /* NBSP */
2924 case 0x1680: /* OGHAM SPACE MARK */
2925 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2926 case 0x2000: /* EN QUAD */
2927 case 0x2001: /* EM QUAD */
2928 case 0x2002: /* EN SPACE */
2929 case 0x2003: /* EM SPACE */
2930 case 0x2004: /* THREE-PER-EM SPACE */
2931 case 0x2005: /* FOUR-PER-EM SPACE */
2932 case 0x2006: /* SIX-PER-EM SPACE */
2933 case 0x2007: /* FIGURE SPACE */
2934 case 0x2008: /* PUNCTUATION SPACE */
2935 case 0x2009: /* THIN SPACE */
2936 case 0x200A: /* HAIR SPACE */
2937 case 0x202f: /* NARROW NO-BREAK SPACE */
2938 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2939 case 0x3000: /* IDEOGRAPHIC SPACE */
2940 RRETURN(MATCH_NOMATCH);
2946 for (i = 1; i <= min; i++)
2948 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2949 GETCHARINC(c, eptr);
2952 default: RRETURN(MATCH_NOMATCH);
2954 case 0x20: /* SPACE */
2955 case 0xa0: /* NBSP */
2956 case 0x1680: /* OGHAM SPACE MARK */
2957 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2958 case 0x2000: /* EN QUAD */
2959 case 0x2001: /* EM QUAD */
2960 case 0x2002: /* EN SPACE */
2961 case 0x2003: /* EM SPACE */
2962 case 0x2004: /* THREE-PER-EM SPACE */
2963 case 0x2005: /* FOUR-PER-EM SPACE */
2964 case 0x2006: /* SIX-PER-EM SPACE */
2965 case 0x2007: /* FIGURE SPACE */
2966 case 0x2008: /* PUNCTUATION SPACE */
2967 case 0x2009: /* THIN SPACE */
2968 case 0x200A: /* HAIR SPACE */
2969 case 0x202f: /* NARROW NO-BREAK SPACE */
2970 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2971 case 0x3000: /* IDEOGRAPHIC SPACE */
2978 for (i = 1; i <= min; i++)
2980 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2981 GETCHARINC(c, eptr);
2989 case 0x85: /* NEL */
2990 case 0x2028: /* LINE SEPARATOR */
2991 case 0x2029: /* PARAGRAPH SEPARATOR */
2992 RRETURN(MATCH_NOMATCH);
2998 for (i = 1; i <= min; i++)
3000 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3001 GETCHARINC(c, eptr);
3004 default: RRETURN(MATCH_NOMATCH);
3009 case 0x85: /* NEL */
3010 case 0x2028: /* LINE SEPARATOR */
3011 case 0x2029: /* PARAGRAPH SEPARATOR */
3018 for (i = 1; i <= min; i++)
3020 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3021 GETCHARINC(c, eptr);
3022 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3023 RRETURN(MATCH_NOMATCH);
3028 for (i = 1; i <= min; i++)
3030 if (eptr >= md->end_subject ||
3031 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3032 RRETURN(MATCH_NOMATCH);
3033 /* No need to skip more bytes - we know it's a 1-byte character */
3037 case OP_NOT_WHITESPACE:
3038 for (i = 1; i <= min; i++)
3040 if (eptr >= md->end_subject ||
3041 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
3042 RRETURN(MATCH_NOMATCH);
3043 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3048 for (i = 1; i <= min; i++)
3050 if (eptr >= md->end_subject ||
3051 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3052 RRETURN(MATCH_NOMATCH);
3053 /* No need to skip more bytes - we know it's a 1-byte character */
3057 case OP_NOT_WORDCHAR:
3058 for (i = 1; i <= min; i++)
3060 if (eptr >= md->end_subject ||
3061 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
3062 RRETURN(MATCH_NOMATCH);
3063 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3068 for (i = 1; i <= min; i++)
3070 if (eptr >= md->end_subject ||
3071 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3072 RRETURN(MATCH_NOMATCH);
3073 /* No need to skip more bytes - we know it's a 1-byte character */
3078 RRETURN(PCRE_ERROR_INTERNAL);
3079 } /* End switch(ctype) */
3082 #endif /* SUPPORT_UTF8 */
3084 /* Code for the non-UTF-8 case for minimum matching of operators other
3085 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3086 number of bytes present, as this was tested above. */
3091 if ((ims & PCRE_DOTALL) == 0)
3093 for (i = 1; i <= min; i++)
3095 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3106 /* Because of the CRLF case, we can't assume the minimum number of
3107 bytes are present in this case. */
3110 for (i = 1; i <= min; i++)
3112 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3115 default: RRETURN(MATCH_NOMATCH);
3117 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3129 for (i = 1; i <= min; i++)
3131 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3136 case 0x20: /* SPACE */
3137 case 0xa0: /* NBSP */
3138 RRETURN(MATCH_NOMATCH);
3144 for (i = 1; i <= min; i++)
3146 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3149 default: RRETURN(MATCH_NOMATCH);
3151 case 0x20: /* SPACE */
3152 case 0xa0: /* NBSP */
3159 for (i = 1; i <= min; i++)
3161 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3169 case 0x85: /* NEL */
3170 RRETURN(MATCH_NOMATCH);
3176 for (i = 1; i <= min; i++)
3178 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3181 default: RRETURN(MATCH_NOMATCH);
3186 case 0x85: /* NEL */
3193 for (i = 1; i <= min; i++)
3194 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3198 for (i = 1; i <= min; i++)
3199 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3202 case OP_NOT_WHITESPACE:
3203 for (i = 1; i <= min; i++)
3204 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3208 for (i = 1; i <= min; i++)
3209 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3212 case OP_NOT_WORDCHAR:
3213 for (i = 1; i <= min; i++)
3214 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3215 RRETURN(MATCH_NOMATCH);
3219 for (i = 1; i <= min; i++)
3220 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3221 RRETURN(MATCH_NOMATCH);
3225 RRETURN(PCRE_ERROR_INTERNAL);
3229 /* If min = max, continue at the same level without recursing */
3231 if (min == max) continue;
3233 /* If minimizing, we have to test the rest of the pattern before each
3234 subsequent match. Again, separate the UTF-8 case for speed, and also
3235 separate the UCP cases. */
3245 for (fi = min;; fi++)
3247 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3248 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3249 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3250 GETCHARINC(c, eptr);
3251 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3253 /* Control never gets here */
3256 for (fi = min;; fi++)
3258 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3259 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3260 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3261 GETCHARINC(c, eptr);
3262 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3263 if ((prop_chartype == ucp_Lu ||
3264 prop_chartype == ucp_Ll ||
3265 prop_chartype == ucp_Lt) == prop_fail_result)
3266 RRETURN(MATCH_NOMATCH);
3268 /* Control never gets here */
3271 for (fi = min;; fi++)
3273 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3274 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3275 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3276 GETCHARINC(c, eptr);
3277 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3278 if ((prop_category == prop_value) == prop_fail_result)
3279 RRETURN(MATCH_NOMATCH);
3281 /* Control never gets here */
3284 for (fi = min;; fi++)
3286 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3287 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3288 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3289 GETCHARINC(c, eptr);
3290 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3291 if ((prop_chartype == prop_value) == prop_fail_result)
3292 RRETURN(MATCH_NOMATCH);
3294 /* Control never gets here */
3297 for (fi = min;; fi++)
3299 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3300 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3301 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3302 GETCHARINC(c, eptr);
3303 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3304 if ((prop_script == prop_value) == prop_fail_result)
3305 RRETURN(MATCH_NOMATCH);
3307 /* Control never gets here */
3310 RRETURN(PCRE_ERROR_INTERNAL);
3314 /* Match extended Unicode sequences. We will get here only if the
3315 support is in the binary; otherwise a compile-time error occurs. */
3317 else if (ctype == OP_EXTUNI)
3319 for (fi = min;; fi++)
3321 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3322 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3323 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3324 GETCHARINCTEST(c, eptr);
3325 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3326 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3327 while (eptr < md->end_subject)
3330 if (!utf8) c = *eptr; else
3332 GETCHARLEN(c, eptr, len);
3334 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3335 if (prop_category != ucp_M) break;
3342 #endif /* SUPPORT_UCP */
3348 for (fi = min;; fi++)
3350 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3351 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3352 if (fi >= max || eptr >= md->end_subject ||
3353 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3355 RRETURN(MATCH_NOMATCH);
3357 GETCHARINC(c, eptr);
3360 case OP_ANY: /* This is the DOTALL case */
3369 default: RRETURN(MATCH_NOMATCH);
3371 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3388 case 0x20: /* SPACE */
3389 case 0xa0: /* NBSP */
3390 case 0x1680: /* OGHAM SPACE MARK */
3391 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3392 case 0x2000: /* EN QUAD */
3393 case 0x2001: /* EM QUAD */
3394 case 0x2002: /* EN SPACE */
3395 case 0x2003: /* EM SPACE */
3396 case 0x2004: /* THREE-PER-EM SPACE */
3397 case 0x2005: /* FOUR-PER-EM SPACE */
3398 case 0x2006: /* SIX-PER-EM SPACE */
3399 case 0x2007: /* FIGURE SPACE */
3400 case 0x2008: /* PUNCTUATION SPACE */
3401 case 0x2009: /* THIN SPACE */
3402 case 0x200A: /* HAIR SPACE */
3403 case 0x202f: /* NARROW NO-BREAK SPACE */
3404 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3405 case 0x3000: /* IDEOGRAPHIC SPACE */
3406 RRETURN(MATCH_NOMATCH);
3413 default: RRETURN(MATCH_NOMATCH);
3415 case 0x20: /* SPACE */
3416 case 0xa0: /* NBSP */
3417 case 0x1680: /* OGHAM SPACE MARK */
3418 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3419 case 0x2000: /* EN QUAD */
3420 case 0x2001: /* EM QUAD */
3421 case 0x2002: /* EN SPACE */
3422 case 0x2003: /* EM SPACE */
3423 case 0x2004: /* THREE-PER-EM SPACE */
3424 case 0x2005: /* FOUR-PER-EM SPACE */
3425 case 0x2006: /* SIX-PER-EM SPACE */
3426 case 0x2007: /* FIGURE SPACE */
3427 case 0x2008: /* PUNCTUATION SPACE */
3428 case 0x2009: /* THIN SPACE */
3429 case 0x200A: /* HAIR SPACE */
3430 case 0x202f: /* NARROW NO-BREAK SPACE */
3431 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3432 case 0x3000: /* IDEOGRAPHIC SPACE */
3445 case 0x85: /* NEL */
3446 case 0x2028: /* LINE SEPARATOR */
3447 case 0x2029: /* PARAGRAPH SEPARATOR */
3448 RRETURN(MATCH_NOMATCH);
3455 default: RRETURN(MATCH_NOMATCH);
3460 case 0x85: /* NEL */
3461 case 0x2028: /* LINE SEPARATOR */
3462 case 0x2029: /* PARAGRAPH SEPARATOR */
3468 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3469 RRETURN(MATCH_NOMATCH);
3473 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3474 RRETURN(MATCH_NOMATCH);
3477 case OP_NOT_WHITESPACE:
3478 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3479 RRETURN(MATCH_NOMATCH);
3483 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3484 RRETURN(MATCH_NOMATCH);
3487 case OP_NOT_WORDCHAR:
3488 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3489 RRETURN(MATCH_NOMATCH);
3493 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3494 RRETURN(MATCH_NOMATCH);
3498 RRETURN(PCRE_ERROR_INTERNAL);
3504 /* Not UTF-8 mode */
3506 for (fi = min;; fi++)
3508 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3509 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3510 if (fi >= max || eptr >= md->end_subject ||
3511 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3512 RRETURN(MATCH_NOMATCH);
3517 case OP_ANY: /* This is the DOTALL case */
3526 default: RRETURN(MATCH_NOMATCH);
3528 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3543 case 0x20: /* SPACE */
3544 case 0xa0: /* NBSP */
3545 RRETURN(MATCH_NOMATCH);
3552 default: RRETURN(MATCH_NOMATCH);
3554 case 0x20: /* SPACE */
3555 case 0xa0: /* NBSP */
3568 case 0x85: /* NEL */
3569 RRETURN(MATCH_NOMATCH);
3576 default: RRETURN(MATCH_NOMATCH);
3581 case 0x85: /* NEL */
3587 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3591 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3594 case OP_NOT_WHITESPACE:
3595 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3599 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3602 case OP_NOT_WORDCHAR:
3603 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3607 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3611 RRETURN(PCRE_ERROR_INTERNAL);
3615 /* Control never gets here */
3618 /* If maximizing, it is worth using inline code for speed, doing the type
3619 test once at the start (i.e. keep it out of the loop). Again, keep the
3620 UTF-8 and UCP stuff separate. */
3624 pp = eptr; /* Remember where we started */
3632 for (i = min; i < max; i++)
3635 if (eptr >= md->end_subject) break;
3636 GETCHARLEN(c, eptr, len);
3637 if (prop_fail_result) break;
3643 for (i = min; i < max; i++)
3646 if (eptr >= md->end_subject) break;
3647 GETCHARLEN(c, eptr, len);
3648 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3649 if ((prop_chartype == ucp_Lu ||
3650 prop_chartype == ucp_Ll ||
3651 prop_chartype == ucp_Lt) == prop_fail_result)
3658 for (i = min; i < max; i++)
3661 if (eptr >= md->end_subject) break;
3662 GETCHARLEN(c, eptr, len);
3663 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3664 if ((prop_category == prop_value) == prop_fail_result)
3671 for (i = min; i < max; i++)
3674 if (eptr >= md->end_subject) break;
3675 GETCHARLEN(c, eptr, len);
3676 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3677 if ((prop_chartype == prop_value) == prop_fail_result)
3684 for (i = min; i < max; i++)
3687 if (eptr >= md->end_subject) break;
3688 GETCHARLEN(c, eptr, len);
3689 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3690 if ((prop_script == prop_value) == prop_fail_result)
3697 /* eptr is now past the end of the maximum run */
3699 if (possessive) continue;
3702 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3703 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3704 if (eptr-- == pp) break; /* Stop if tried at original pos */
3709 /* Match extended Unicode sequences. We will get here only if the
3710 support is in the binary; otherwise a compile-time error occurs. */
3712 else if (ctype == OP_EXTUNI)
3714 for (i = min; i < max; i++)
3716 if (eptr >= md->end_subject) break;
3717 GETCHARINCTEST(c, eptr);
3718 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3719 if (prop_category == ucp_M) break;
3720 while (eptr < md->end_subject)
3723 if (!utf8) c = *eptr; else
3725 GETCHARLEN(c, eptr, len);
3727 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3728 if (prop_category != ucp_M) break;
3733 /* eptr is now past the end of the maximum run */
3735 if (possessive) continue;
3738 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3739 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3740 if (eptr-- == pp) break; /* Stop if tried at original pos */
3741 for (;;) /* Move back over one extended */
3745 if (!utf8) c = *eptr; else
3747 GETCHARLEN(c, eptr, len);
3749 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3750 if (prop_category != ucp_M) break;
3757 #endif /* SUPPORT_UCP */
3768 /* Special code is required for UTF8, but when the maximum is
3769 unlimited we don't need it, so we repeat the non-UTF8 code. This is
3770 probably worth it, because .* is quite a common idiom. */
3774 if ((ims & PCRE_DOTALL) == 0)
3776 for (i = min; i < max; i++)
3778 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3780 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3785 for (i = min; i < max; i++)
3787 if (eptr >= md->end_subject) break;
3789 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3794 /* Handle unlimited UTF-8 repeat */
3798 if ((ims & PCRE_DOTALL) == 0)
3800 for (i = min; i < max; i++)
3802 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3810 if (c > (unsigned int)(md->end_subject - eptr))
3811 c = md->end_subject - eptr;
3817 /* The byte case is the same as non-UTF8 */
3821 if (c > (unsigned int)(md->end_subject - eptr))
3822 c = md->end_subject - eptr;
3827 for (i = min; i < max; i++)
3830 if (eptr >= md->end_subject) break;
3831 GETCHARLEN(c, eptr, len);
3834 if (++eptr >= md->end_subject) break;
3835 if (*eptr == 0x000a) eptr++;
3839 if (c != 0x000a && c != 0x000b && c != 0x000c &&
3840 c != 0x0085 && c != 0x2028 && c != 0x2029)
3849 for (i = min; i < max; i++)
3853 if (eptr >= md->end_subject) break;
3854 GETCHARLEN(c, eptr, len);
3857 default: gotspace = FALSE; break;
3859 case 0x20: /* SPACE */
3860 case 0xa0: /* NBSP */
3861 case 0x1680: /* OGHAM SPACE MARK */
3862 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3863 case 0x2000: /* EN QUAD */
3864 case 0x2001: /* EM QUAD */
3865 case 0x2002: /* EN SPACE */
3866 case 0x2003: /* EM SPACE */
3867 case 0x2004: /* THREE-PER-EM SPACE */
3868 case 0x2005: /* FOUR-PER-EM SPACE */
3869 case 0x2006: /* SIX-PER-EM SPACE */
3870 case 0x2007: /* FIGURE SPACE */
3871 case 0x2008: /* PUNCTUATION SPACE */
3872 case 0x2009: /* THIN SPACE */
3873 case 0x200A: /* HAIR SPACE */
3874 case 0x202f: /* NARROW NO-BREAK SPACE */
3875 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3876 case 0x3000: /* IDEOGRAPHIC SPACE */
3880 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3887 for (i = min; i < max; i++)
3891 if (eptr >= md->end_subject) break;
3892 GETCHARLEN(c, eptr, len);
3895 default: gotspace = FALSE; break;
3900 case 0x85: /* NEL */
3901 case 0x2028: /* LINE SEPARATOR */
3902 case 0x2029: /* PARAGRAPH SEPARATOR */
3906 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3912 for (i = min; i < max; i++)
3915 if (eptr >= md->end_subject) break;
3916 GETCHARLEN(c, eptr, len);
3917 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3923 for (i = min; i < max; i++)
3926 if (eptr >= md->end_subject) break;
3927 GETCHARLEN(c, eptr, len);
3928 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3933 case OP_NOT_WHITESPACE:
3934 for (i = min; i < max; i++)
3937 if (eptr >= md->end_subject) break;
3938 GETCHARLEN(c, eptr, len);
3939 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3945 for (i = min; i < max; i++)
3948 if (eptr >= md->end_subject) break;
3949 GETCHARLEN(c, eptr, len);
3950 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3955 case OP_NOT_WORDCHAR:
3956 for (i = min; i < max; i++)
3959 if (eptr >= md->end_subject) break;
3960 GETCHARLEN(c, eptr, len);
3961 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3967 for (i = min; i < max; i++)
3970 if (eptr >= md->end_subject) break;
3971 GETCHARLEN(c, eptr, len);
3972 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3978 RRETURN(PCRE_ERROR_INTERNAL);
3981 /* eptr is now past the end of the maximum run */
3983 if (possessive) continue;
3986 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
3987 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3988 if (eptr-- == pp) break; /* Stop if tried at original pos */
3995 /* Not UTF-8 mode */
4000 if ((ims & PCRE_DOTALL) == 0)
4002 for (i = min; i < max; i++)
4004 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4009 /* For DOTALL case, fall through and treat as \C */
4013 if (c > (unsigned int)(md->end_subject - eptr))
4014 c = md->end_subject - eptr;
4019 for (i = min; i < max; i++)
4021 if (eptr >= md->end_subject) break;
4025 if (++eptr >= md->end_subject) break;
4026 if (*eptr == 0x000a) eptr++;
4030 if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
4038 for (i = min; i < max; i++)
4040 if (eptr >= md->end_subject) break;
4042 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4048 for (i = min; i < max; i++)
4050 if (eptr >= md->end_subject) break;
4052 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4058 for (i = min; i < max; i++)
4060 if (eptr >= md->end_subject) break;
4062 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4069 for (i = min; i < max; i++)
4071 if (eptr >= md->end_subject) break;
4073 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4080 for (i = min; i < max; i++)
4082 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4089 for (i = min; i < max; i++)
4091 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4097 case OP_NOT_WHITESPACE:
4098 for (i = min; i < max; i++)
4100 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4107 for (i = min; i < max; i++)
4109 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4115 case OP_NOT_WORDCHAR:
4116 for (i = min; i < max; i++)
4118 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4125 for (i = min; i < max; i++)
4127 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4134 RRETURN(PCRE_ERROR_INTERNAL);
4137 /* eptr is now past the end of the maximum run */
4139 if (possessive) continue;
4142 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4144 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4148 /* Get here if we can't make it match with any permitted repetitions */
4150 RRETURN(MATCH_NOMATCH);
4152 /* Control never gets here */
4154 /* There's been some horrible disaster. Arrival here can only mean there is
4155 something seriously wrong in the code above or the OP_xxx definitions. */
4158 DPRINTF(("Unknown opcode %d\n", *ecode));
4159 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4162 /* Do not stick any code in here without much thought; it is assumed
4163 that "continue" in the code above comes out to here to repeat the main
4166 } /* End of main loop */
4167 /* Control never reaches here */
4170 /* When compiling to use the heap rather than the stack for recursive calls to
4171 match(), the RRETURN() macro jumps here. The number that is saved in
4172 frame->Xwhere indicates which label we actually want to return to. */
4175 #define LBL(val) case val: goto L_RM##val;
4177 switch (frame->Xwhere)
4179 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4180 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
4181 LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
4182 LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
4183 LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) LBL(39) LBL(40)
4184 LBL(41) LBL(42) LBL(43) LBL(44) LBL(45) LBL(46) LBL(47)
4186 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4187 return PCRE_ERROR_INTERNAL;
4190 #endif /* NO_RECURSE */
4194 /***************************************************************************
4195 ****************************************************************************
4196 RECURSION IN THE match() FUNCTION
4198 Undefine all the macros that were defined above to handle this. */
4217 #undef new_recursive
4232 #undef save_capture_last
4242 /* These two are defined as macros in both cases */
4247 /***************************************************************************
4248 ***************************************************************************/
4252 /*************************************************
4253 * Execute a Regular Expression *
4254 *************************************************/
4256 /* This function applies a compiled re to a subject string and picks out
4257 portions of the string if it matches. Two elements in the vector are set for
4258 each substring: the offsets to the start and end of the substring.
4261 argument_re points to the compiled expression
4262 extra_data points to extra data or is NULL
4263 subject points to the subject string
4264 length length of subject string (may contain binary zeros)
4265 start_offset where to start in the subject string
4267 offsets points to a vector of ints to be filled in with offsets
4268 offsetcount the number of elements in the vector
4270 Returns: > 0 => success; value is the number of elements filled in
4271 = 0 => success, but offsets is not big enough
4272 -1 => failed to match
4273 < -1 => some kind of unexpected problem
4277 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4278 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4281 int rc, resetcount, ocount;
4282 int first_byte = -1;
4286 unsigned long int ims;
4287 BOOL using_temporary_offsets = FALSE;
4291 BOOL first_byte_caseless = FALSE;
4292 BOOL req_byte_caseless = FALSE;
4294 match_data match_block;
4295 match_data *md = &match_block;
4296 const uschar *tables;
4297 const uschar *start_bits = NULL;
4298 USPTR start_match = (USPTR)subject + start_offset;
4300 USPTR req_byte_ptr = start_match - 1;
4301 eptrblock eptrchain[EPTR_WORK_SIZE];
4303 pcre_study_data internal_study;
4304 const pcre_study_data *study;
4306 real_pcre internal_re;
4307 const real_pcre *external_re = (const real_pcre *)argument_re;
4308 const real_pcre *re = external_re;
4310 /* Plausibility checks */
4312 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4313 if (re == NULL || subject == NULL ||
4314 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4315 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4317 /* Fish out the optional data from the extra_data structure, first setting
4318 the default values. */
4321 md->match_limit = MATCH_LIMIT;
4322 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4323 md->callout_data = NULL;
4325 /* The table pointer is always in native byte order. */
4327 tables = external_re->tables;
4329 if (extra_data != NULL)
4331 register unsigned int flags = extra_data->flags;
4332 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4333 study = (const pcre_study_data *)extra_data->study_data;
4334 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4335 md->match_limit = extra_data->match_limit;
4336 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4337 md->match_limit_recursion = extra_data->match_limit_recursion;
4338 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4339 md->callout_data = extra_data->callout_data;
4340 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4343 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4344 is a feature that makes it possible to save compiled regex and re-use them
4345 in other programs later. */
4347 if (tables == NULL) tables = _pcre_default_tables;
4349 /* Check that the first field in the block is the magic number. If it is not,
4350 test for a regex that was compiled on a host of opposite endianness. If this is
4351 the case, flipped values are put in internal_re and internal_study if there was
4354 if (re->magic_number != MAGIC_NUMBER)
4356 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4357 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4358 if (study != NULL) study = &internal_study;
4361 /* Set up other data */
4363 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4364 startline = (re->options & PCRE_STARTLINE) != 0;
4365 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4367 /* The code starts after the real_pcre block and the capture name table. */
4369 md->start_code = (const uschar *)external_re + re->name_table_offset +
4370 re->name_count * re->name_entry_size;
4372 md->start_subject = (USPTR)subject;
4373 md->start_offset = start_offset;
4374 md->end_subject = md->start_subject + length;
4375 end_subject = md->end_subject;
4377 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4378 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4380 md->notbol = (options & PCRE_NOTBOL) != 0;
4381 md->noteol = (options & PCRE_NOTEOL) != 0;
4382 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4383 md->partial = (options & PCRE_PARTIAL) != 0;
4386 md->recursive = NULL; /* No recursion at top level */
4387 md->eptrchain = eptrchain; /* Make workspace generally available */
4389 md->lcc = tables + lcc_offset;
4390 md->ctypes = tables + ctypes_offset;
4392 /* Handle different types of newline. The three bits give eight cases. If
4393 nothing is set at run time, whatever was used at compile time applies. */
4395 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
4398 case 0: newline = NEWLINE; break; /* Compile-time default */
4399 case PCRE_NEWLINE_CR: newline = '\r'; break;
4400 case PCRE_NEWLINE_LF: newline = '\n'; break;
4401 case PCRE_NEWLINE_CR+
4402 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4403 case PCRE_NEWLINE_ANY: newline = -1; break;
4404 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4405 default: return PCRE_ERROR_BADNEWLINE;
4410 md->nltype = NLTYPE_ANYCRLF;
4412 else if (newline < 0)
4414 md->nltype = NLTYPE_ANY;
4418 md->nltype = NLTYPE_FIXED;
4422 md->nl[0] = (newline >> 8) & 255;
4423 md->nl[1] = newline & 255;
4428 md->nl[0] = newline;
4432 /* Partial matching is supported only for a restricted set of regexes at the
4435 if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
4436 return PCRE_ERROR_BADPARTIAL;
4438 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4439 back the character offset. */
4442 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4444 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4445 return PCRE_ERROR_BADUTF8;
4446 if (start_offset > 0 && start_offset < length)
4448 int tb = ((uschar *)subject)[start_offset];
4452 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4458 /* The ims options can vary during the matching as a result of the presence
4459 of (?ims) items in the pattern. They are kept in a local variable so that
4460 restoring at the exit of a group is easy. */
4462 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4464 /* If the expression has got more back references than the offsets supplied can
4465 hold, we get a temporary chunk of working store to use during the matching.
4466 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4469 ocount = offsetcount - (offsetcount % 3);
4471 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4473 ocount = re->top_backref * 3 + 3;
4474 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4475 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4476 using_temporary_offsets = TRUE;
4477 DPRINTF(("Got memory to hold back references\n"));
4479 else md->offset_vector = offsets;
4481 md->offset_end = ocount;
4482 md->offset_max = (2*ocount)/3;
4483 md->offset_overflow = FALSE;
4484 md->capture_last = -1;
4486 /* Compute the minimum number of offsets that we need to reset each time. Doing
4487 this makes a huge difference to execution time when there aren't many brackets
4490 resetcount = 2 + re->top_bracket * 2;
4491 if (resetcount > offsetcount) resetcount = ocount;
4493 /* Reset the working variable associated with each extraction. These should
4494 never be used unless previously set, but they get saved and restored, and so we
4495 initialize them to avoid reading uninitialized locations. */
4497 if (md->offset_vector != NULL)
4499 register int *iptr = md->offset_vector + ocount;
4500 register int *iend = iptr - resetcount/2 + 1;
4501 while (--iptr >= iend) *iptr = -1;
4504 /* Set up the first character to match, if available. The first_byte value is
4505 never set for an anchored regular expression, but the anchoring may be forced
4506 at run time, so we have to test for anchoring. The first char may be unset for
4507 an unanchored pattern, of course. If there's no first char and the pattern was
4508 studied, there may be a bitmap of possible first characters. */
4512 if ((re->options & PCRE_FIRSTSET) != 0)
4514 first_byte = re->first_byte & 255;
4515 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4516 first_byte = md->lcc[first_byte];
4519 if (!startline && study != NULL &&
4520 (study->options & PCRE_STUDY_MAPPED) != 0)
4521 start_bits = study->start_bits;
4524 /* For anchored or unanchored matches, there may be a "last known required
4527 if ((re->options & PCRE_REQCHSET) != 0)
4529 req_byte = re->req_byte & 255;
4530 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4531 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4535 /* ==========================================================================*/
4537 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4538 the loop runs just once. */
4542 USPTR save_end_subject = end_subject;
4544 /* Reset the maximum number of extractions we might see. */
4546 if (md->offset_vector != NULL)
4548 register int *iptr = md->offset_vector;
4549 register int *iend = iptr + resetcount;
4550 while (iptr < iend) *iptr++ = -1;
4553 /* Advance to a unique first char if possible. If firstline is TRUE, the
4554 start of the match is constrained to the first line of a multiline string.
4555 That is, the match must be before or at the first newline. Implement this by
4556 temporarily adjusting end_subject so that we stop scanning at a newline. If
4557 the match fails at the newline, later code breaks this loop. */
4561 USPTR t = start_match;
4562 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4566 /* Now test for a unique first byte */
4568 if (first_byte >= 0)
4570 if (first_byte_caseless)
4571 while (start_match < end_subject &&
4572 md->lcc[*start_match] != first_byte)
4575 while (start_match < end_subject && *start_match != first_byte)
4579 /* Or to just after a linebreak for a multiline match if possible */
4583 if (start_match > md->start_subject + start_offset)
4585 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4588 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4589 and we are now at a LF, advance the match position by one more character.
4592 if (start_match[-1] == '\r' &&
4593 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4594 start_match < end_subject &&
4595 *start_match == '\n')
4600 /* Or to a non-unique first char after study */
4602 else if (start_bits != NULL)
4604 while (start_match < end_subject)
4606 register unsigned int c = *start_match;
4607 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4611 /* Restore fudged end_subject */
4613 end_subject = save_end_subject;
4615 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4616 printf(">>>> Match against: ");
4617 pchars(start_match, end_subject - start_match, TRUE, md);
4621 /* If req_byte is set, we know that that character must appear in the subject
4622 for the match to succeed. If the first character is set, req_byte must be
4623 later in the subject; otherwise the test starts at the match point. This
4624 optimization can save a huge amount of backtracking in patterns with nested
4625 unlimited repeats that aren't going to match. Writing separate code for
4626 cased/caseless versions makes it go faster, as does using an autoincrement
4627 and backing off on a match.
4629 HOWEVER: when the subject string is very, very long, searching to its end can
4630 take a long time, and give bad performance on quite ordinary patterns. This
4631 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4632 string... so we don't do this when the string is sufficiently long.
4634 ALSO: this processing is disabled when partial matching is requested.
4637 if (req_byte >= 0 &&
4638 end_subject - start_match < REQ_BYTE_MAX &&
4641 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4643 /* We don't need to repeat the search if we haven't yet reached the
4644 place we found it at last time. */
4646 if (p > req_byte_ptr)
4648 if (req_byte_caseless)
4650 while (p < end_subject)
4652 register int pp = *p++;
4653 if (pp == req_byte || pp == req_byte2) { p--; break; }
4658 while (p < end_subject)
4660 if (*p++ == req_byte) { p--; break; }
4664 /* If we can't find the required character, break the matching loop,
4665 forcing a match failure. */
4667 if (p >= end_subject)
4673 /* If we have found the required character, save the point where we
4674 found it, so that we don't search again next time round the loop if
4675 the start hasn't passed this character yet. */
4681 /* OK, we can now run the match. */
4683 md->start_match_ptr = start_match; /* Insurance */
4684 md->match_call_count = 0;
4685 md->eptrn = 0; /* Next free eptrchain slot */
4686 rc = match(start_match, md->start_code, start_match, 2, md,
4689 /* Any return other than MATCH_NOMATCH breaks the loop. */
4691 if (rc != MATCH_NOMATCH) break;
4693 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4694 newline in the subject (though it may continue over the newline). Therefore,
4695 if we have just failed to match, starting at a newline, do not continue. */
4697 if (firstline && IS_NEWLINE(start_match)) break;
4699 /* Advance the match position by one character. */
4704 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4708 /* Break the loop if the pattern is anchored or if we have passed the end of
4711 if (anchored || start_match > end_subject) break;
4713 /* If we have just passed a CR and the newline option is CRLF or ANY or
4714 ANYCRLF, and we are now at a LF, advance the match position by one more
4717 if (start_match[-1] == '\r' &&
4718 (md->nltype == NLTYPE_ANY ||
4719 md->nltype == NLTYPE_ANYCRLF ||
4721 start_match < end_subject &&
4722 *start_match == '\n')
4725 } /* End of for(;;) "bumpalong" loop */
4727 /* ==========================================================================*/
4729 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4732 (1) The pattern is anchored;
4734 (2) We are past the end of the subject;
4736 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4737 this option requests that a match occur at or before the first newline in
4740 When we have a match and the offset vector is big enough to deal with any
4741 backreferences, captured substring offsets will already be set up. In the case
4742 where we had to get some local store to hold offsets for backreference
4743 processing, copy those that we can. In this case there need not be overflow if
4744 certain parts of the pattern were not used, even though there are more
4745 capturing parentheses than vector slots. */
4747 if (rc == MATCH_MATCH)
4749 if (using_temporary_offsets)
4751 if (offsetcount >= 4)
4753 memcpy(offsets + 2, md->offset_vector + 2,
4754 (offsetcount - 2) * sizeof(int));
4755 DPRINTF(("Copied offsets from temporary memory\n"));
4757 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4758 DPRINTF(("Freeing temporary memory\n"));
4759 (pcre_free)(md->offset_vector);
4762 /* Set the return code to the number of captured strings, or 0 if there are
4763 too many to fit into the vector. */
4765 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4767 /* If there is space, set up the whole thing as substring 0. The value of
4768 md->start_match_ptr might be modified if \K was encountered on the success
4771 if (offsetcount < 2) rc = 0; else
4773 offsets[0] = md->start_match_ptr - md->start_subject;
4774 offsets[1] = md->end_match_ptr - md->start_subject;
4777 DPRINTF((">>>> returning %d\n", rc));
4781 /* Control gets here if there has been an error, or if the overall match
4782 attempt has failed at all permitted starting positions. */
4784 if (using_temporary_offsets)
4786 DPRINTF(("Freeing temporary memory\n"));
4787 (pcre_free)(md->offset_vector);
4790 if (rc != MATCH_NOMATCH)
4792 DPRINTF((">>>> error: returning %d\n", rc));
4795 else if (md->partial && md->hitend)
4797 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4798 return PCRE_ERROR_PARTIAL;
4802 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4803 return PCRE_ERROR_NOMATCH;
4807 /* End of pcre_exec.c */