1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
53 #include "pcre_internal.h"
55 /* Undefine some potentially clashing cpp symbols */
60 /* Flag bits for the match() function */
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
69 #define MATCH_NOMATCH 0
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
74 #define MATCH_COMMIT (-999)
75 #define MATCH_PRUNE (-998)
76 #define MATCH_SKIP (-997)
77 #define MATCH_THEN (-996)
79 /* Maximum number of ints of offset to save on the stack for recursive calls.
80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81 because the offset vector is always a multiple of 3 long. */
83 #define REC_STACK_SAVE_MAX 30
85 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
87 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
88 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if requested.
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
135 Returns: TRUE if matched
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
142 USPTR p = md->start_subject + md->offset_vector[offset];
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
157 /* Always fail if not enough characters left */
159 if (length > md->end_subject - eptr) return FALSE;
161 /* Separate the caseless case for speed */
163 if ((ims & PCRE_CASELESS) != 0)
166 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
169 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
176 /***************************************************************************
177 ****************************************************************************
178 RECURSION IN THE match() FUNCTION
180 The match() function is highly recursive, though not every recursive call
181 increases the recursive depth. Nevertheless, some regular expressions can cause
182 it to recurse to a great depth. I was writing for Unix, so I just let it call
183 itself recursively. This uses the stack for saving everything that has to be
184 saved for a recursive call. On Unix, the stack can be large, and this works
187 It turns out that on some non-Unix-like systems there are problems with
188 programs that use a lot of stack. (This despite the fact that every last chip
189 has oodles of memory these days, and techniques for extending the stack have
190 been known for decades.) So....
192 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
193 calls by keeping local variables that need to be preserved in blocks of memory
194 obtained from malloc() instead of on the stack. Macros are used to
195 achieve this so that the actual code doesn't look very different to what it
198 The original heap-recursive code used longjmp(). However, it seems that this
199 can be very slow on some operating systems. Following a suggestion from Stan
200 Switzer, the use of longjmp() has been abolished, at the cost of having to
201 provide a unique number for each call to RMATCH. There is no way of generating
202 a sequence of numbers at compile time in C. I have given them names, to make
203 them stand out more clearly.
205 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
206 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
207 tests. Furthermore, not using longjmp() means that local dynamic variables
208 don't have indeterminate values; this has meant that the frame size can be
209 reduced because the result can be "passed back" by straight setting of the
210 variable instead of being passed in the frame.
211 ****************************************************************************
212 ***************************************************************************/
214 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
215 below must be updated in sync. */
/* Each RMn constant tags one RMATCH() call site: it is passed as the final
("rw") argument of RMATCH. In the heap-frame version of the macro the tag is
stored in the calling frame's Xwhere field before the new frame is set up; the
stack-recursion version does not use it. Numbering starts at 1. */
217 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
218 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
219 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
220 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
221 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
222 RM51, RM52, RM53, RM54 };
224 /* These versions of the macros use the stack, as normal. There are debugging
225 versions and production versions. Note that the "rw" argument of RMATCH isn't
226 actually used in this definition. */
229 #define REGISTER register
232 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
234 printf("match() called in line %d\n", __LINE__); \
235 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
236 printf("to line %d\n", __LINE__); \
238 #define RRETURN(ra) \
240 printf("match() returned %d from line %d ", ra, __LINE__); \
244 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
245 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
246 #define RRETURN(ra) return ra
252 /* These versions of the macros manage a private stack on the heap. Note that
253 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
254 argument of match(), which never changes. */
258 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
260 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
261 frame->Xwhere = rw; \
262 newframe->Xeptr = ra;\
263 newframe->Xecode = rb;\
264 newframe->Xmstart = mstart;\
265 newframe->Xoffset_top = rc;\
266 newframe->Xims = re;\
267 newframe->Xeptrb = rf;\
268 newframe->Xflags = rg;\
269 newframe->Xrdepth = frame->Xrdepth + 1;\
270 newframe->Xprevframe = frame;\
272 DPRINTF(("restarting from line %d\n", __LINE__));\
275 DPRINTF(("jumped back to line %d\n", __LINE__));\
280 heapframe *newframe = frame;\
281 frame = newframe->Xprevframe;\
282 (pcre_stack_free)(newframe);\
292 /* Structure for remembering the local variables in a private frame */
294 typedef struct heapframe {
295 struct heapframe *Xprevframe;
297 /* Function arguments that may change */
300 const uschar *Xecode;
301 const uschar *Xmstart;
306 unsigned int Xrdepth;
308 /* Function local variables */
310 const uschar *Xcallpat;
311 const uschar *Xcharptr;
316 const uschar *Xsaved_eptr;
318 recursion_info Xnew_recursive;
324 unsigned long int Xoriginal_ims;
329 int Xprop_fail_result;
346 int Xsave_capture_last;
347 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
348 int Xstacksave[REC_STACK_SAVE_MAX];
352 /* Where to jump back to */
361 /***************************************************************************
362 ***************************************************************************/
366 /*************************************************
367 * Match from current position *
368 *************************************************/
370 /* This function is called recursively in many circumstances. Whenever it
371 returns a negative (error) response, the outer incarnation must also return the
374 Performance note: It might be tempting to extract commonly used fields from the
375 md structure (e.g. utf8, end_subject) into individual variables to improve
376 performance. Tests using gcc on a SPARC disproved this; in the first case, it
377 made performance worse.
380 eptr pointer to current character in subject
381 ecode pointer to current position in compiled code
382 mstart pointer to the current match start position (can be modified
384 offset_top current top pointer
385 md pointer to "static" info for the match
386 ims current /i, /m, and /s options
387 eptrb pointer to chain of blocks containing eptr at start of
388 brackets - for testing for empty matches
390 match_condassert - this is an assertion condition
391 match_cbegroup - this is the start of an unlimited repeat
392 group that can match an empty string
393 rdepth the recursion depth
395 Returns: MATCH_MATCH if matched ) these values are >= 0
396 MATCH_NOMATCH if failed to match )
397 a negative PCRE_ERROR_xxx value if aborted by an error condition
398 (e.g. stopped by repeated call or recursion limit)
402 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
403 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
404 int flags, unsigned int rdepth)
406 /* These variables do not need to be preserved over recursion in this function,
407 so they can be ordinary variables in all cases. Mark some of them with
408 "register" because they are used a lot in loops. */
410 register int rrc; /* Returns from recursive calls */
411 register int i; /* Used for loops not involving calls to RMATCH() */
412 register unsigned int c; /* Character values not kept over RMATCH() calls */
413 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
415 BOOL minimize, possessive; /* Quantifier options */
417 /* When recursion is not being used, all "local" variables that have to be
418 preserved over calls to RMATCH() are part of a "frame" which is obtained from
419 heap storage. Set up the top-level frame here; others are obtained from the
420 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
423 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
424 frame->Xprevframe = NULL; /* Marks the top level */
426 /* Copy in the original argument variables */
429 frame->Xecode = ecode;
430 frame->Xmstart = mstart;
431 frame->Xoffset_top = offset_top;
433 frame->Xeptrb = eptrb;
434 frame->Xflags = flags;
435 frame->Xrdepth = rdepth;
437 /* This is where control jumps back to in order to effect "recursion" */
441 /* Macros make the argument variables come from the current frame */
443 #define eptr frame->Xeptr
444 #define ecode frame->Xecode
445 #define mstart frame->Xmstart
446 #define offset_top frame->Xoffset_top
447 #define ims frame->Xims
448 #define eptrb frame->Xeptrb
449 #define flags frame->Xflags
450 #define rdepth frame->Xrdepth
452 /* Ditto for the local variables */
455 #define charptr frame->Xcharptr
457 #define callpat frame->Xcallpat
458 #define data frame->Xdata
459 #define next frame->Xnext
460 #define pp frame->Xpp
461 #define prev frame->Xprev
462 #define saved_eptr frame->Xsaved_eptr
464 #define new_recursive frame->Xnew_recursive
466 #define cur_is_word frame->Xcur_is_word
467 #define condition frame->Xcondition
468 #define prev_is_word frame->Xprev_is_word
470 #define original_ims frame->Xoriginal_ims
473 #define prop_type frame->Xprop_type
474 #define prop_value frame->Xprop_value
475 #define prop_fail_result frame->Xprop_fail_result
476 #define prop_category frame->Xprop_category
477 #define prop_chartype frame->Xprop_chartype
478 #define prop_script frame->Xprop_script
479 #define oclength frame->Xoclength
480 #define occhars frame->Xocchars
483 #define ctype frame->Xctype
484 #define fc frame->Xfc
485 #define fi frame->Xfi
486 #define length frame->Xlength
487 #define max frame->Xmax
488 #define min frame->Xmin
489 #define number frame->Xnumber
490 #define offset frame->Xoffset
491 #define op frame->Xop
492 #define save_capture_last frame->Xsave_capture_last
493 #define save_offset1 frame->Xsave_offset1
494 #define save_offset2 frame->Xsave_offset2
495 #define save_offset3 frame->Xsave_offset3
496 #define stacksave frame->Xstacksave
498 #define newptrb frame->Xnewptrb
500 /* When recursion is being used, local variables are allocated on the stack and
501 get preserved during recursion in the normal way. In this environment, fi and
502 i, and fc and c, can be the same variables. */
504 #else /* NO_RECURSE not defined */
509 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
510 const uschar *charptr; /* in small blocks of the code. My normal */
511 #endif /* style of coding would have declared */
512 const uschar *callpat; /* them within each of those blocks. */
513 const uschar *data; /* However, in order to accommodate the */
514 const uschar *next; /* version of this code that uses an */
515 USPTR pp; /* external "stack" implemented on the */
516 const uschar *prev; /* heap, it is easier to declare them all */
517 USPTR saved_eptr; /* here, so the declarations can be cut */
518 /* out in a block. The only declarations */
519 recursion_info new_recursive; /* within blocks below are for variables */
520 /* that do not have to be preserved over */
521 BOOL cur_is_word; /* a recursive call to RMATCH(). */
525 unsigned long int original_ims;
530 int prop_fail_result;
545 int save_capture_last;
546 int save_offset1, save_offset2, save_offset3;
547 int stacksave[REC_STACK_SAVE_MAX];
550 #endif /* NO_RECURSE */
552 /* These statements are here to stop the compiler complaining about uninitialized
557 prop_fail_result = 0;
561 /* This label is used for tail recursion, which is used in a few cases even
562 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
563 used. Thanks to Ian Taylor for noticing this possibility and sending the
568 /* OK, now we can get on with the real code of the function. Recursive calls
569 are specified by the macro RMATCH and RRETURN is used to return. When
570 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
571 and a "return", respectively (possibly with some debugging if DEBUG is
572 defined). However, RMATCH isn't like a function call because it's quite a
573 complicated macro. It has to be used in one particular way. This shouldn't,
574 however, impact performance when true recursion is being used. */
577 utf8 = md->utf8; /* Local copy of the flag */
582 /* First check that we haven't called match() too many times, or that we
583 haven't exceeded the recursive call limit. */
585 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
586 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
588 original_ims = ims; /* Save for resetting on ')' */
590 /* At the start of a group with an unlimited repeat that may match an empty
591 string, the match_cbegroup flag is set. When this is the case, add the current
592 subject pointer to the chain of such remembered pointers, to be checked when we
593 hit the closing ket, in order to break infinite loops that match no characters.
594 When match() is called in other circumstances, don't add to the chain. The
595 match_cbegroup flag must NOT be used with tail recursion, because the memory
596 block that is used is on the stack, so a new one may be required for each
599 if ((flags & match_cbegroup) != 0)
601 newptrb.epb_saved_eptr = eptr;
602 newptrb.epb_prev = eptrb;
606 /* Now start processing the opcodes. */
610 minimize = possessive = FALSE;
613 /* For partial matching, remember if we ever hit the end of the subject after
614 matching at least one subject character. */
617 eptr >= md->end_subject &&
624 RRETURN(MATCH_NOMATCH);
627 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
628 ims, eptrb, flags, RM51);
629 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
630 RRETURN(MATCH_PRUNE);
633 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
634 ims, eptrb, flags, RM52);
635 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
636 RRETURN(MATCH_COMMIT);
639 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
640 ims, eptrb, flags, RM53);
641 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
642 md->start_match_ptr = eptr; /* Pass back current position */
646 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
647 ims, eptrb, flags, RM54);
648 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
651 /* Handle a capturing bracket. If there is space in the offset vector, save
652 the current subject position in the working slot at the top of the vector.
653 We mustn't change the current values of the data slot, because they may be
654 set from a previous iteration of this group, and be referred to by a
655 reference inside the group.
657 If the bracket fails to match, we need to restore this value and also the
658 values of the final offsets, in case they were set by a previous iteration
661 If there isn't enough space in the offset vector, treat this as if it were
662 a non-capturing bracket. Don't worry about setting the flag for the error
663 case here; that is handled in the code for KET. */
667 number = GET2(ecode, 1+LINK_SIZE);
668 offset = number << 1;
671 printf("start bracket %d\n", number);
673 pchars(eptr, 16, TRUE, md);
677 if (offset < md->offset_max)
679 save_offset1 = md->offset_vector[offset];
680 save_offset2 = md->offset_vector[offset+1];
681 save_offset3 = md->offset_vector[md->offset_end - number];
682 save_capture_last = md->capture_last;
684 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
685 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
687 flags = (op == OP_SCBRA)? match_cbegroup : 0;
690 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
691 ims, eptrb, flags, RM1);
692 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
693 md->capture_last = save_capture_last;
694 ecode += GET(ecode, 1);
696 while (*ecode == OP_ALT);
698 DPRINTF(("bracket %d failed\n", number));
700 md->offset_vector[offset] = save_offset1;
701 md->offset_vector[offset+1] = save_offset2;
702 md->offset_vector[md->offset_end - number] = save_offset3;
704 RRETURN(MATCH_NOMATCH);
707 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
708 as a non-capturing bracket. */
710 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
711 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
713 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
715 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
716 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
718 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
719 final alternative within the brackets, we would return the result of a
720 recursive call to match() whatever happened. We can reduce stack usage by
721 turning this into a tail recursion, except in the case when match_cbegroup
726 DPRINTF(("start non-capturing bracket\n"));
727 flags = (op >= OP_SBRA)? match_cbegroup : 0;
730 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
732 if (flags == 0) /* Not a possibly empty group */
734 ecode += _pcre_OP_lengths[*ecode];
735 DPRINTF(("bracket 0 tail recursion\n"));
739 /* Possibly empty group; can't use tail recursion. */
741 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
746 /* For non-final alternatives, continue the loop for a NOMATCH result;
749 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
751 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
752 ecode += GET(ecode, 1);
754 /* Control never reaches here. */
756 /* Conditional group: compilation checked that there are no more than
757 two branches. If the condition is false, skipping the first branch takes us
758 past the end if there is only one branch, but that's OK because that is
759 exactly what going to the ket would do. As there is only one branch to be
760 obeyed, we can use tail recursion to avoid using another stack frame. */
764 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
766 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
767 condition = md->recursive != NULL &&
768 (offset == RREF_ANY || offset == md->recursive->group_num);
769 ecode += condition? 3 : GET(ecode, 1);
772 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
774 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
775 condition = offset < offset_top && md->offset_vector[offset] >= 0;
776 ecode += condition? 3 : GET(ecode, 1);
779 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
782 ecode += GET(ecode, 1);
785 /* The condition is an assertion. Call match() to evaluate it - setting
786 the final argument match_condassert causes it to stop at the end of an
791 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
792 match_condassert, RM3);
793 if (rrc == MATCH_MATCH)
796 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
797 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
799 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
801 RRETURN(rrc); /* Need braces because of following else */
806 ecode += GET(ecode, 1);
810 /* We are now at the branch that is to be obeyed. As there is only one,
811 we can use tail recursion to avoid using another stack frame, except when
812 match_cbegroup is required for an unlimited repeat of a possibly empty
813 group. If the second alternative doesn't exist, we can just plough on. */
815 if (condition || *ecode == OP_ALT)
817 ecode += 1 + LINK_SIZE;
818 if (op == OP_SCOND) /* Possibly empty group */
820 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
823 else /* Group must match something */
829 else /* Condition false & no 2nd alternative */
831 ecode += 1 + LINK_SIZE;
836 /* End of the pattern, either real or forced. If we are in a top-level
837 recursion, we should restore the offsets appropriately and continue from
842 if (md->recursive != NULL && md->recursive->group_num == 0)
844 recursion_info *rec = md->recursive;
845 DPRINTF(("End of pattern in a (?0) recursion\n"));
846 md->recursive = rec->prevrec;
847 memmove(md->offset_vector, rec->offset_save,
848 rec->saved_max * sizeof(int));
849 mstart = rec->save_start;
851 ecode = rec->after_call;
855 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
856 string - backtracking will then try other alternatives, if any. */
858 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
859 md->end_match_ptr = eptr; /* Record where we ended */
860 md->end_offset_top = offset_top; /* and how many extracts were taken */
861 md->start_match_ptr = mstart; /* and the start (\K can modify) */
862 RRETURN(MATCH_MATCH);
864 /* Change option settings */
869 DPRINTF(("ims set to %02lx\n", ims));
872 /* Assertion brackets. Check the alternative branches in turn - the
873 matching won't pass the KET for an assertion. If any one branch matches,
874 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
875 start of each branch to move the current point backwards, so the code at
876 this level is identical to the lookahead case. */
882 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
884 if (rrc == MATCH_MATCH) break;
885 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
886 ecode += GET(ecode, 1);
888 while (*ecode == OP_ALT);
889 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
891 /* If checking an assertion for a condition, return MATCH_MATCH. */
893 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
895 /* Continue from after the assertion, updating the offsets high water
896 mark, since extracts may have been taken during the assertion. */
898 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
899 ecode += 1 + LINK_SIZE;
900 offset_top = md->end_offset_top;
903 /* Negative assertion: all branches must fail to match */
906 case OP_ASSERTBACK_NOT:
909 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
911 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
912 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
913 ecode += GET(ecode,1);
915 while (*ecode == OP_ALT);
917 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
919 ecode += 1 + LINK_SIZE;
922 /* Move the subject pointer back. This occurs only at the start of
923 each branch of a lookbehind assertion. If we are too close to the start to
924 move back, this match function fails. When working with UTF-8 we move
925 back a number of characters, not bytes. */
935 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
942 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
945 eptr -= GET(ecode, 1);
946 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
949 /* Skip to next op code */
951 ecode += 1 + LINK_SIZE;
954 /* The callout item calls an external function, if one is provided, passing
955 details of the match so far. This is mainly for debugging, though the
956 function is able to force a failure. */
959 if (pcre_callout != NULL)
961 pcre_callout_block cb;
962 cb.version = 1; /* Version 1 of the callout block */
963 cb.callout_number = ecode[1];
964 cb.offset_vector = md->offset_vector;
965 cb.subject = (PCRE_SPTR)md->start_subject;
966 cb.subject_length = md->end_subject - md->start_subject;
967 cb.start_match = mstart - md->start_subject;
968 cb.current_position = eptr - md->start_subject;
969 cb.pattern_position = GET(ecode, 2);
970 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
971 cb.capture_top = offset_top/2;
972 cb.capture_last = md->capture_last;
973 cb.callout_data = md->callout_data;
974 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
975 if (rrc < 0) RRETURN(rrc);
977 ecode += 2 + 2*LINK_SIZE;
980 /* Recursion either matches the current regex, or some subexpression. The
981 offset data is the offset to the starting bracket from the start of the
982 whole pattern. (This is so that it works from duplicated subpatterns.)
984 If there are any capturing brackets started but not finished, we have to
985 save their starting points and reinstate them after the recursion. However,
986 we don't know how many such there are (offset_top records the completed
987 total) so we just have to save all the potential data. There may be up to
988 65535 such values, which is too large to put on the stack, but using malloc
989 for small numbers seems expensive. As a compromise, the stack is used when
990 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
991 is used. If the malloc fails, the code below abandons the match by
992 returning PCRE_ERROR_NOMEMORY, which is propagated back up to the top
993 level like any other negative (error) return.
995 There are also other values that have to be saved. We use a chained
996 sequence of blocks that actually live on the stack. Thanks to Robin Houston
997 for the original version of this logic. */
1001 callpat = md->start_code + GET(ecode, 1);
1002 new_recursive.group_num = (callpat == md->start_code)? 0 :
1003 GET2(callpat, 1 + LINK_SIZE);
1005 /* Add to "recursing stack" */
1007 new_recursive.prevrec = md->recursive;
1008 md->recursive = &new_recursive;
1010 /* Find where to continue from afterwards */
1012 ecode += 1 + LINK_SIZE;
1013 new_recursive.after_call = ecode;
1015 /* Now save the offset data. */
1017 new_recursive.saved_max = md->offset_end;
1018 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1019 new_recursive.offset_save = stacksave;
1022 new_recursive.offset_save =
1023 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1024 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1027 memcpy(new_recursive.offset_save, md->offset_vector,
1028 new_recursive.saved_max * sizeof(int));
1029 new_recursive.save_start = mstart;
1032 /* OK, now we can do the recursion. For each top-level alternative we
1033 restore the offset and recursion data. */
1035 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1036 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1039 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1040 md, ims, eptrb, flags, RM6);
1041 if (rrc == MATCH_MATCH)
1043 DPRINTF(("Recursion matched\n"));
1044 md->recursive = new_recursive.prevrec;
1045 if (new_recursive.offset_save != stacksave)
1046 (pcre_free)(new_recursive.offset_save);
1047 RRETURN(MATCH_MATCH);
1049 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1051 DPRINTF(("Recursion gave error %d\n", rrc));
1055 md->recursive = &new_recursive;
1056 memcpy(md->offset_vector, new_recursive.offset_save,
1057 new_recursive.saved_max * sizeof(int));
1058 callpat += GET(callpat, 1);
1060 while (*callpat == OP_ALT);
1062 DPRINTF(("Recursion didn't match\n"));
1063 md->recursive = new_recursive.prevrec;
1064 if (new_recursive.offset_save != stacksave)
1065 (pcre_free)(new_recursive.offset_save);
1066 RRETURN(MATCH_NOMATCH);
1068 /* Control never reaches here */
1070 /* "Once" brackets are like assertion brackets except that after a match,
1071 the point in the subject string is not moved back. Thus there can never be
1072 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1073 Check the alternative branches in turn - the matching won't pass the KET
1074 for this kind of subpattern. If any one branch matches, we carry on as at
1075 the end of a normal bracket, leaving the subject pointer where the match ended. */
1083 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1084 if (rrc == MATCH_MATCH) break;
1085 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1086 ecode += GET(ecode,1);
1088 while (*ecode == OP_ALT);
1090 /* If we hit the end of the group (which could be repeated) without any branch matching, fail */
1092 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1094 /* Continue from after the atomic group, updating the offsets high water
1095 mark, since extracts may have been taken. */
1097 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1099 offset_top = md->end_offset_top;
1100 eptr = md->end_match_ptr;
1102 /* For a non-repeating ket, just continue at this level. This also
1103 happens for a repeating ket if no characters were matched in the group.
1104 This is the forcible breaking of infinite loops as implemented in Perl
1105 5.005. If there is an options reset, it will get obeyed in the normal
1106 course of events. */
1108 if (*ecode == OP_KET || eptr == saved_eptr)
1110 ecode += 1+LINK_SIZE;
1114 /* The repeating kets try the rest of the pattern or restart from the
1115 preceding bracket, in the appropriate order. The second "call" of match()
1116 uses tail recursion, to avoid using another stack frame. We need to reset
1117 any options that changed within the bracket before re-running it, so
1118 check the next opcode. */
1120 if (ecode[1+LINK_SIZE] == OP_OPT)
1122 ims = (ims & ~PCRE_IMS) | ecode[4];
1123 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1126 if (*ecode == OP_KETRMIN)
1128 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1129 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1134 else /* OP_KETRMAX */
1136 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1137 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1138 ecode += 1 + LINK_SIZE;
1142 /* Control never gets here */
1144 /* An alternation is the end of a branch; scan along to find the end of the
1145 bracketed group and go to there. */
1148 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1151 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1152 indicating that it may occur zero times. It may repeat infinitely, or not
1153 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1154 with fixed upper repeat limits are compiled as a number of copies, with the
1155 optional ones preceded by BRAZERO or BRAMINZERO. */
1160 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1161 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1162 do next += GET(next,1); while (*next == OP_ALT);
1163 ecode = next + 1 + LINK_SIZE;
1170 do next += GET(next, 1); while (*next == OP_ALT);
1171 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1172 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1180 do next += GET(next,1); while (*next == OP_ALT);
1181 ecode = next + 1 + LINK_SIZE;
1185 /* End of a group, repeated or non-repeating. */
1190 prev = ecode - GET(ecode, 1);
1192 /* If this was a group that remembered the subject start, in order to break
1193 infinite repeats of empty string matches, retrieve the subject start from
1194 the chain. Otherwise, set it NULL. */
1196 if (*prev >= OP_SBRA)
1198 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1199 eptrb = eptrb->epb_prev; /* Backup to previous group */
1201 else saved_eptr = NULL;
1203 /* If we are at the end of an assertion group, stop matching and return
1204 MATCH_MATCH, but record the current high water mark for use by positive
1205 assertions. Do this also for the "once" (atomic) groups. */
1207 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1208 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1211 md->end_match_ptr = eptr; /* For ONCE */
1212 md->end_offset_top = offset_top;
1213 RRETURN(MATCH_MATCH);
1216 /* For capturing groups we have to check the group number back at the start
1217 and if necessary complete handling an extraction by setting the offsets and
1218 bumping the high water mark. Note that whole-pattern recursion is coded as
1219 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1220 when the OP_END is reached. Other recursion is handled here. */
1222 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1224 number = GET2(prev, 1+LINK_SIZE);
1225 offset = number << 1;
1228 printf("end bracket %d", number);
1232 md->capture_last = number;
1233 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1235 md->offset_vector[offset] =
1236 md->offset_vector[md->offset_end - number];
1237 md->offset_vector[offset+1] = eptr - md->start_subject;
1238 if (offset_top <= offset) offset_top = offset + 2;
1241 /* Handle a recursively called group. Restore the offsets
1242 appropriately and continue from after the call. */
1244 if (md->recursive != NULL && md->recursive->group_num == number)
1246 recursion_info *rec = md->recursive;
1247 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1248 md->recursive = rec->prevrec;
1249 mstart = rec->save_start;
1250 memcpy(md->offset_vector, rec->offset_save,
1251 rec->saved_max * sizeof(int));
1252 ecode = rec->after_call;
1258 /* For both capturing and non-capturing groups, reset the value of the ims
1259 flags, in case they got changed during the group. */
1262 DPRINTF(("ims reset to %02lx\n", ims));
1264 /* For a non-repeating ket, just continue at this level. This also
1265 happens for a repeating ket if no characters were matched in the group.
1266 This is the forcible breaking of infinite loops as implemented in Perl
1267 5.005. If there is an options reset, it will get obeyed in the normal
1268 course of events. */
1270 if (*ecode == OP_KET || eptr == saved_eptr)
1272 ecode += 1 + LINK_SIZE;
1276 /* The repeating kets try the rest of the pattern or restart from the
1277 preceding bracket, in the appropriate order. In the second case, we can use
1278 tail recursion to avoid using another stack frame, unless we have an
1279 unlimited repeat of a group that can match an empty string. */
1281 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1283 if (*ecode == OP_KETRMIN)
1285 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1286 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1287 if (flags != 0) /* Could match an empty string */
1289 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1295 else /* OP_KETRMAX */
1297 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1298 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1299 ecode += 1 + LINK_SIZE;
1303 /* Control never gets here */
1305 /* Start of subject unless notbol, or after internal newline if multiline */
1308 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1309 if ((ims & PCRE_MULTILINE) != 0)
1311 if (eptr != md->start_subject &&
1312 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1313 RRETURN(MATCH_NOMATCH);
1317 /* ... else fall through */
1319 /* Start of subject assertion */
1322 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1326 /* Start of match assertion */
1329 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1333 /* Reset the start of match point */
1340 /* Assert before internal newline if multiline, or before a terminating
1341 newline unless endonly is set, else end of subject unless noteol is set. */
1344 if ((ims & PCRE_MULTILINE) != 0)
1346 if (eptr < md->end_subject)
1347 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1349 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1355 if (md->noteol) RRETURN(MATCH_NOMATCH);
1358 if (eptr != md->end_subject &&
1359 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1360 RRETURN(MATCH_NOMATCH);
1365 /* ... else fall through for endonly */
1367 /* End of subject assertion (\z) */
1370 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1374 /* End of subject or ending \n assertion (\Z) */
1377 if (eptr != md->end_subject &&
1378 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1379 RRETURN(MATCH_NOMATCH);
1383 /* Word boundary assertions */
1385 case OP_NOT_WORD_BOUNDARY:
1386 case OP_WORD_BOUNDARY:
1389 /* Find out if the previous and current characters are "word" characters.
1390 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1391 be "non-word" characters. */
1396 if (eptr == md->start_subject) prev_is_word = FALSE; else
1398 const uschar *lastptr = eptr - 1;
1399 while((*lastptr & 0xc0) == 0x80) lastptr--;
1400 GETCHAR(c, lastptr);
1401 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1403 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1406 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1412 /* More streamlined when not in UTF-8 mode */
1415 prev_is_word = (eptr != md->start_subject) &&
1416 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1417 cur_is_word = (eptr < md->end_subject) &&
1418 ((md->ctypes[*eptr] & ctype_word) != 0);
1421 /* Now see if the situation is what we want */
1423 if ((*ecode++ == OP_WORD_BOUNDARY)?
1424 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1425 RRETURN(MATCH_NOMATCH);
1429 /* Match a single character type; inline for speed */
1432 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1436 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1437 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1441 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1442 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1445 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1450 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1451 GETCHARINCTEST(c, eptr);
1456 (md->ctypes[c] & ctype_digit) != 0
1458 RRETURN(MATCH_NOMATCH);
1463 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1464 GETCHARINCTEST(c, eptr);
1469 (md->ctypes[c] & ctype_digit) == 0
1471 RRETURN(MATCH_NOMATCH);
1475 case OP_NOT_WHITESPACE:
1476 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1477 GETCHARINCTEST(c, eptr);
1482 (md->ctypes[c] & ctype_space) != 0
1484 RRETURN(MATCH_NOMATCH);
1489 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1490 GETCHARINCTEST(c, eptr);
1495 (md->ctypes[c] & ctype_space) == 0
1497 RRETURN(MATCH_NOMATCH);
1501 case OP_NOT_WORDCHAR:
1502 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1503 GETCHARINCTEST(c, eptr);
1508 (md->ctypes[c] & ctype_word) != 0
1510 RRETURN(MATCH_NOMATCH);
1515 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1516 GETCHARINCTEST(c, eptr);
1521 (md->ctypes[c] & ctype_word) == 0
1523 RRETURN(MATCH_NOMATCH);
1528 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1529 GETCHARINCTEST(c, eptr);
1532 default: RRETURN(MATCH_NOMATCH);
1534 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1545 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1552 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1553 GETCHARINCTEST(c, eptr);
1558 case 0x20: /* SPACE */
1559 case 0xa0: /* NBSP */
1560 case 0x1680: /* OGHAM SPACE MARK */
1561 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1562 case 0x2000: /* EN QUAD */
1563 case 0x2001: /* EM QUAD */
1564 case 0x2002: /* EN SPACE */
1565 case 0x2003: /* EM SPACE */
1566 case 0x2004: /* THREE-PER-EM SPACE */
1567 case 0x2005: /* FOUR-PER-EM SPACE */
1568 case 0x2006: /* SIX-PER-EM SPACE */
1569 case 0x2007: /* FIGURE SPACE */
1570 case 0x2008: /* PUNCTUATION SPACE */
1571 case 0x2009: /* THIN SPACE */
1572 case 0x200A: /* HAIR SPACE */
1573 case 0x202f: /* NARROW NO-BREAK SPACE */
1574 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1575 case 0x3000: /* IDEOGRAPHIC SPACE */
1576 RRETURN(MATCH_NOMATCH);
1582 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1583 GETCHARINCTEST(c, eptr);
1586 default: RRETURN(MATCH_NOMATCH);
1588 case 0x20: /* SPACE */
1589 case 0xa0: /* NBSP */
1590 case 0x1680: /* OGHAM SPACE MARK */
1591 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1592 case 0x2000: /* EN QUAD */
1593 case 0x2001: /* EM QUAD */
1594 case 0x2002: /* EN SPACE */
1595 case 0x2003: /* EM SPACE */
1596 case 0x2004: /* THREE-PER-EM SPACE */
1597 case 0x2005: /* FOUR-PER-EM SPACE */
1598 case 0x2006: /* SIX-PER-EM SPACE */
1599 case 0x2007: /* FIGURE SPACE */
1600 case 0x2008: /* PUNCTUATION SPACE */
1601 case 0x2009: /* THIN SPACE */
1602 case 0x200A: /* HAIR SPACE */
1603 case 0x202f: /* NARROW NO-BREAK SPACE */
1604 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1605 case 0x3000: /* IDEOGRAPHIC SPACE */
1612 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1613 GETCHARINCTEST(c, eptr);
1621 case 0x85: /* NEL */
1622 case 0x2028: /* LINE SEPARATOR */
1623 case 0x2029: /* PARAGRAPH SEPARATOR */
1624 RRETURN(MATCH_NOMATCH);
1630 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1631 GETCHARINCTEST(c, eptr);
1634 default: RRETURN(MATCH_NOMATCH);
1639 case 0x85: /* NEL */
1640 case 0x2028: /* LINE SEPARATOR */
1641 case 0x2029: /* PARAGRAPH SEPARATOR */
1648 /* Check the next character by Unicode property. We will get here only
1649 if the support is in the binary; otherwise a compile-time error occurs. */
1653 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1654 GETCHARINCTEST(c, eptr);
1656 int chartype, script;
1657 int category = _pcre_ucp_findprop(c, &chartype, &script);
1662 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1666 if ((chartype == ucp_Lu ||
1667 chartype == ucp_Ll ||
1668 chartype == ucp_Lt) == (op == OP_NOTPROP))
1669 RRETURN(MATCH_NOMATCH);
1673 if ((ecode[2] != category) == (op == OP_PROP))
1674 RRETURN(MATCH_NOMATCH);
1678 if ((ecode[2] != chartype) == (op == OP_PROP))
1679 RRETURN(MATCH_NOMATCH);
1683 if ((ecode[2] != script) == (op == OP_PROP))
1684 RRETURN(MATCH_NOMATCH);
1688 RRETURN(PCRE_ERROR_INTERNAL);
1695 /* Match an extended Unicode sequence. We will get here only if the support
1696 is in the binary; otherwise a compile-time error occurs. */
1699 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1700 GETCHARINCTEST(c, eptr);
1702 int chartype, script;
1703 int category = _pcre_ucp_findprop(c, &chartype, &script);
1704 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1705 while (eptr < md->end_subject)
1708 if (!utf8) c = *eptr; else
1710 GETCHARLEN(c, eptr, len);
1712 category = _pcre_ucp_findprop(c, &chartype, &script);
1713 if (category != ucp_M) break;
1722 /* Match a back reference, possibly repeatedly. Look past the end of the
1723 item to see if there is repeat information following. The code is similar
1724 to that for character classes, but repeated for efficiency. Then obey
1725 similar code to character type repeats - written out again for speed.
1726 However, if the referenced string is the empty string, always treat
1727 it as matched, any number of times (otherwise there could be infinite
1732 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1735 /* If the reference is unset, there are two possibilities:
1737 (a) In the default, Perl-compatible state, set the length to be longer
1738 than the amount of subject left; this ensures that every attempt at a
1739 match fails. We can't just fail here, because of the possibility of
1740 quantifiers with zero minima.
1742 (b) If the JavaScript compatibility flag is set, set the length to zero
1743 so that the back reference matches an empty string.
1745 Otherwise, set the length to the length of what was matched by the
1746 referenced subpattern. */
1748 if (offset >= offset_top || md->offset_vector[offset] < 0)
1749 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1751 length = md->offset_vector[offset+1] - md->offset_vector[offset];
1753 /* Set up for repetition, or handle the non-repeated case */
1763 c = *ecode++ - OP_CRSTAR;
1764 minimize = (c & 1) != 0;
1765 min = rep_min[c]; /* Pick up values from tables; */
1766 max = rep_max[c]; /* zero for max => infinity */
1767 if (max == 0) max = INT_MAX;
1772 minimize = (*ecode == OP_CRMINRANGE);
1773 min = GET2(ecode, 1);
1774 max = GET2(ecode, 3);
1775 if (max == 0) max = INT_MAX;
1779 default: /* No repeat follows */
1780 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1782 continue; /* With the main loop */
1785 /* If the length of the reference is zero, just continue with the
1788 if (length == 0) continue;
1790 /* First, ensure the minimum number of matches are present. We get back
1791 the length of the reference string explicitly rather than passing the
1792 address of eptr, so that eptr can be a register variable. */
1794 for (i = 1; i <= min; i++)
1796 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1800 /* If min = max, continue at the same level without recursion.
1801 They are not both allowed to be zero. */
1803 if (min == max) continue;
1805 /* If minimizing, keep trying and advancing the pointer */
1809 for (fi = min;; fi++)
1811 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1812 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1813 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1814 RRETURN(MATCH_NOMATCH);
1817 /* Control never gets here */
1820 /* If maximizing, find the longest string and work backwards */
1825 for (i = min; i < max; i++)
1827 if (!match_ref(offset, eptr, length, md, ims)) break;
1832 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1833 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1836 RRETURN(MATCH_NOMATCH);
1839 /* Control never gets here */
1843 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1844 used when all the characters in the class have values in the range 0-255,
1845 and either the matching is caseful, or the characters are in the range
1846 0-127 when UTF-8 processing is enabled. The only difference between
1847 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1850 First, look past the end of the item to see if there is repeat information
1851 following. Then obey similar code to character type repeats - written out
1857 data = ecode + 1; /* Save for matching */
1858 ecode += 33; /* Advance past the item */
1868 c = *ecode++ - OP_CRSTAR;
1869 minimize = (c & 1) != 0;
1870 min = rep_min[c]; /* Pick up values from tables; */
1871 max = rep_max[c]; /* zero for max => infinity */
1872 if (max == 0) max = INT_MAX;
1877 minimize = (*ecode == OP_CRMINRANGE);
1878 min = GET2(ecode, 1);
1879 max = GET2(ecode, 3);
1880 if (max == 0) max = INT_MAX;
1884 default: /* No repeat follows */
1889 /* First, ensure the minimum number of matches are present. */
1895 for (i = 1; i <= min; i++)
1897 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1898 GETCHARINC(c, eptr);
1901 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1905 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1911 /* Not UTF-8 mode */
1913 for (i = 1; i <= min; i++)
1915 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1917 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1921 /* If max == min we can continue with the main loop without the
1924 if (min == max) continue;
1926 /* If minimizing, keep testing the rest of the expression and advancing
1927 the pointer while it matches the class. */
1935 for (fi = min;; fi++)
1937 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1938 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1939 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1940 GETCHARINC(c, eptr);
1943 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1947 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1953 /* Not UTF-8 mode */
1955 for (fi = min;; fi++)
1957 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1958 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1959 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1961 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1964 /* Control never gets here */
1967 /* If maximizing, find the longest possible run, then work backwards. */
1977 for (i = min; i < max; i++)
1980 if (eptr >= md->end_subject) break;
1981 GETCHARLEN(c, eptr, len);
1984 if (op == OP_CLASS) break;
1988 if ((data[c/8] & (1 << (c&7))) == 0) break;
1994 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1995 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1996 if (eptr-- == pp) break; /* Stop if tried at original pos */
2002 /* Not UTF-8 mode */
2004 for (i = min; i < max; i++)
2006 if (eptr >= md->end_subject) break;
2008 if ((data[c/8] & (1 << (c&7))) == 0) break;
2013 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2014 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2019 RRETURN(MATCH_NOMATCH);
2022 /* Control never gets here */
2025 /* Match an extended character class. This opcode is encountered only
2026 in UTF-8 mode, because that's the only time it is compiled. */
2031 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2032 ecode += GET(ecode, 1); /* Advance past the item */
2042 c = *ecode++ - OP_CRSTAR;
2043 minimize = (c & 1) != 0;
2044 min = rep_min[c]; /* Pick up values from tables; */
2045 max = rep_max[c]; /* zero for max => infinity */
2046 if (max == 0) max = INT_MAX;
2051 minimize = (*ecode == OP_CRMINRANGE);
2052 min = GET2(ecode, 1);
2053 max = GET2(ecode, 3);
2054 if (max == 0) max = INT_MAX;
2058 default: /* No repeat follows */
2063 /* First, ensure the minimum number of matches are present. */
2065 for (i = 1; i <= min; i++)
2067 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2068 GETCHARINC(c, eptr);
2069 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2072 /* If max == min we can continue with the main loop without the
2075 if (min == max) continue;
2077 /* If minimizing, keep testing the rest of the expression and advancing
2078 the pointer while it matches the class. */
2082 for (fi = min;; fi++)
2084 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2085 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2086 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2087 GETCHARINC(c, eptr);
2088 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2090 /* Control never gets here */
2093 /* If maximizing, find the longest possible run, then work backwards. */
2098 for (i = min; i < max; i++)
2101 if (eptr >= md->end_subject) break;
2102 GETCHARLEN(c, eptr, len);
2103 if (!_pcre_xclass(c, data)) break;
2108 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2109 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2110 if (eptr-- == pp) break; /* Stop if tried at original pos */
2111 if (utf8) BACKCHAR(eptr);
2113 RRETURN(MATCH_NOMATCH);
2116 /* Control never gets here */
2118 #endif /* End of XCLASS */
2120 /* Match a single character, casefully */
2128 GETCHARLEN(fc, ecode, length);
2129 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2130 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2135 /* Non-UTF-8 mode */
2137 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2138 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2143 /* Match a single character, caselessly */
2151 GETCHARLEN(fc, ecode, length);
2153 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2155 /* If the pattern character's value is < 128, we have only one byte, and
2156 can use the fast lookup table. */
2160 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2163 /* Otherwise we must pick up the subject character */
2168 GETCHARINC(dc, eptr);
2171 /* If we have Unicode property support, we can use it to test the other
2172 case of the character, if there is one. */
2177 if (dc != _pcre_ucp_othercase(fc))
2179 RRETURN(MATCH_NOMATCH);
2184 #endif /* SUPPORT_UTF8 */
2186 /* Non-UTF-8 mode */
2188 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2189 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2194 /* Match a single character repeatedly. */
2197 min = max = GET2(ecode, 1);
2208 max = GET2(ecode, 1);
2209 minimize = *ecode == OP_MINUPTO;
2240 c = *ecode++ - OP_STAR;
2241 minimize = (c & 1) != 0;
2242 min = rep_min[c]; /* Pick up values from tables; */
2243 max = rep_max[c]; /* zero for max => infinity */
2244 if (max == 0) max = INT_MAX;
2246 /* Common code for all repeated single-character matches. We can give
2247 up quickly if there are fewer than the minimum number of characters left in
2256 GETCHARLEN(fc, ecode, length);
2257 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2260 /* Handle multibyte character matching specially here. There is
2261 support for caseless matching if UCP support is present. */
2266 unsigned int othercase;
2267 if ((ims & PCRE_CASELESS) != 0 &&
2268 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2269 oclength = _pcre_ord2utf8(othercase, occhars);
2271 #endif /* SUPPORT_UCP */
2273 for (i = 1; i <= min; i++)
2275 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2277 /* Need braces because of following else */
2278 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2281 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2284 #else /* without SUPPORT_UCP */
2285 else { RRETURN(MATCH_NOMATCH); }
2286 #endif /* SUPPORT_UCP */
2289 if (min == max) continue;
2293 for (fi = min;; fi++)
2295 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2296 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2297 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2298 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2300 /* Need braces because of following else */
2301 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2304 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2307 #else /* without SUPPORT_UCP */
2308 else { RRETURN (MATCH_NOMATCH); }
2309 #endif /* SUPPORT_UCP */
2311 /* Control never gets here */
2317 for (i = min; i < max; i++)
2319 if (eptr > md->end_subject - length) break;
2320 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2322 else if (oclength == 0) break;
2325 if (memcmp(eptr, occhars, oclength) != 0) break;
2328 #else /* without SUPPORT_UCP */
2330 #endif /* SUPPORT_UCP */
2333 if (possessive) continue;
2336 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2337 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2338 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2342 #else /* without SUPPORT_UCP */
2344 #endif /* SUPPORT_UCP */
2347 /* Control never gets here */
2350 /* If the length of a UTF-8 character is 1, we fall through here, and
2351 obey the code as for non-UTF-8 characters below, though in this case the
2352 value of fc will always be < 128. */
2355 #endif /* SUPPORT_UTF8 */
2357 /* When not in UTF-8 mode, load a single-byte character. */
2359 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2363 /* The value of fc at this point is always less than 256, though we may or
2364 may not be in UTF-8 mode. The code is duplicated for the caseless and
2365 caseful cases, for speed, since matching characters is likely to be quite
2366 common. First, ensure the minimum number of matches are present. If min =
2367 max, continue at the same level without recursing. Otherwise, if
2368 minimizing, keep trying the rest of the expression and advancing one
2369 matching character if failing, up to the maximum. Alternatively, if
2370 maximizing, find the maximum number of characters and work backwards. */
2372 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2375 if ((ims & PCRE_CASELESS) != 0)
2378 for (i = 1; i <= min; i++)
2379 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2380 if (min == max) continue;
2383 for (fi = min;; fi++)
2385 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2386 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2387 if (fi >= max || eptr >= md->end_subject ||
2388 fc != md->lcc[*eptr++])
2389 RRETURN(MATCH_NOMATCH);
2391 /* Control never gets here */
2396 for (i = min; i < max; i++)
2398 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2401 if (possessive) continue;
2404 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2406 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2408 RRETURN(MATCH_NOMATCH);
2410 /* Control never gets here */
2413 /* Caseful comparisons (includes all multi-byte characters) */
2417 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2418 if (min == max) continue;
2421 for (fi = min;; fi++)
2423 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2424 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2425 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2426 RRETURN(MATCH_NOMATCH);
2428 /* Control never gets here */
2433 for (i = min; i < max; i++)
2435 if (eptr >= md->end_subject || fc != *eptr) break;
2438 if (possessive) continue;
2441 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2443 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2445 RRETURN(MATCH_NOMATCH);
2448 /* Control never gets here */
2450 /* Match a negated single one-byte character. The character we are
2451 checking can be multibyte. */
2454 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2456 GETCHARINCTEST(c, eptr);
2457 if ((ims & PCRE_CASELESS) != 0)
2463 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2467 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2471 /* Match a negated single one-byte character repeatedly. This is almost a
2472 repeat of the code for a repeated single character, but I haven't found a
2473 nice way of commoning these up that doesn't require a test of the
2474 positive/negative option for each character match. Maybe that wouldn't add
2475 very much to the time taken, but character matching *is* what this is all
2479 min = max = GET2(ecode, 1);
2486 max = GET2(ecode, 1);
2487 minimize = *ecode == OP_NOTMINUPTO;
2505 case OP_NOTPOSQUERY:
2515 max = GET2(ecode, 1);
2524 case OP_NOTMINQUERY:
2525 c = *ecode++ - OP_NOTSTAR;
2526 minimize = (c & 1) != 0;
2527 min = rep_min[c]; /* Pick up values from tables; */
2528 max = rep_max[c]; /* zero for max => infinity */
2529 if (max == 0) max = INT_MAX;
2531 /* Common code for all repeated single-byte matches. We can give up quickly
2532 if there are fewer than the minimum number of bytes left in the
2536 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2539 /* The code is duplicated for the caseless and caseful cases, for speed,
2540 since matching characters is likely to be quite common. First, ensure the
2541 minimum number of matches are present. If min = max, continue at the same
2542 level without recursing. Otherwise, if minimizing, keep trying the rest of
2543 the expression and advancing one matching character if failing, up to the
2544 maximum. Alternatively, if maximizing, find the maximum number of
2545 characters and work backwards. */
2547 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2550 if ((ims & PCRE_CASELESS) != 0)
2558 register unsigned int d;
2559 for (i = 1; i <= min; i++)
2561 GETCHARINC(d, eptr);
2562 if (d < 256) d = md->lcc[d];
2563 if (fc == d) RRETURN(MATCH_NOMATCH);
2569 /* Not UTF-8 mode */
2571 for (i = 1; i <= min; i++)
2572 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2575 if (min == max) continue;
2583 register unsigned int d;
2584 for (fi = min;; fi++)
2586 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2587 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2588 GETCHARINC(d, eptr);
2589 if (d < 256) d = md->lcc[d];
2590 if (fi >= max || eptr >= md->end_subject || fc == d)
2591 RRETURN(MATCH_NOMATCH);
2596 /* Not UTF-8 mode */
2598 for (fi = min;; fi++)
2600 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2601 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2602 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2603 RRETURN(MATCH_NOMATCH);
2606 /* Control never gets here */
2619 register unsigned int d;
2620 for (i = min; i < max; i++)
2623 if (eptr >= md->end_subject) break;
2624 GETCHARLEN(d, eptr, len);
2625 if (d < 256) d = md->lcc[d];
2629 if (possessive) continue;
2632 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2633 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2634 if (eptr-- == pp) break; /* Stop if tried at original pos */
2640 /* Not UTF-8 mode */
2642 for (i = min; i < max; i++)
2644 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2647 if (possessive) continue;
2650 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2651 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2656 RRETURN(MATCH_NOMATCH);
2658 /* Control never gets here */
2661 /* Caseful comparisons */
2669 register unsigned int d;
2670 for (i = 1; i <= min; i++)
2672 GETCHARINC(d, eptr);
2673 if (fc == d) RRETURN(MATCH_NOMATCH);
2678 /* Not UTF-8 mode */
2680 for (i = 1; i <= min; i++)
2681 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2684 if (min == max) continue;
2692 register unsigned int d;
2693 for (fi = min;; fi++)
2695 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2696 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2697 GETCHARINC(d, eptr);
2698 if (fi >= max || eptr >= md->end_subject || fc == d)
2699 RRETURN(MATCH_NOMATCH);
2704 /* Not UTF-8 mode */
2706 for (fi = min;; fi++)
2708 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2709 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2710 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2711 RRETURN(MATCH_NOMATCH);
2714 /* Control never gets here */
2727 register unsigned int d;
2728 for (i = min; i < max; i++)
2731 if (eptr >= md->end_subject) break;
2732 GETCHARLEN(d, eptr, len);
2736 if (possessive) continue;
2739 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2740 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2741 if (eptr-- == pp) break; /* Stop if tried at original pos */
2747 /* Not UTF-8 mode */
2749 for (i = min; i < max; i++)
2751 if (eptr >= md->end_subject || fc == *eptr) break;
2754 if (possessive) continue;
2757 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2758 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2763 RRETURN(MATCH_NOMATCH);
2766 /* Control never gets here */
2768 /* Match a single character type repeatedly; several different opcodes
2769 share code. This is very similar to the code for single characters, but we
2770 repeat it in the interests of efficiency. */
2773 min = max = GET2(ecode, 1);
2779 case OP_TYPEMINUPTO:
2781 max = GET2(ecode, 1);
2782 minimize = *ecode == OP_TYPEMINUPTO;
2786 case OP_TYPEPOSSTAR:
2793 case OP_TYPEPOSPLUS:
2800 case OP_TYPEPOSQUERY:
2807 case OP_TYPEPOSUPTO:
2810 max = GET2(ecode, 1);
2815 case OP_TYPEMINSTAR:
2817 case OP_TYPEMINPLUS:
2819 case OP_TYPEMINQUERY:
2820 c = *ecode++ - OP_TYPESTAR;
2821 minimize = (c & 1) != 0;
2822 min = rep_min[c]; /* Pick up values from tables; */
2823 max = rep_max[c]; /* zero for max => infinity */
2824 if (max == 0) max = INT_MAX;
2826 /* Common code for all repeated single character type matches. Note that
2827 in UTF-8 mode, '.' matches a character of any length, but for the other
2828 character types, the valid characters are all one-byte long. */
2831 ctype = *ecode++; /* Code for the character type */
2834 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2836 prop_fail_result = ctype == OP_NOTPROP;
2837 prop_type = *ecode++;
2838 prop_value = *ecode++;
2840 else prop_type = -1;
2843 /* First, ensure the minimum number of matches are present. Use inline
2844 code for maximizing the speed, and do the type test once at the start
2845 (i.e. keep it out of the loop). Also we can test that there are at least
2846 the minimum number of bytes before we start. This isn't as effective in
2847 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2848 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2849 and single-bytes. */
2851 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2860 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2861 for (i = 1; i <= min; i++)
2863 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2864 GETCHARINCTEST(c, eptr);
2869 for (i = 1; i <= min; i++)
2871 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2872 GETCHARINCTEST(c, eptr);
2873 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2874 if ((prop_chartype == ucp_Lu ||
2875 prop_chartype == ucp_Ll ||
2876 prop_chartype == ucp_Lt) == prop_fail_result)
2877 RRETURN(MATCH_NOMATCH);
2882 for (i = 1; i <= min; i++)
2884 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2885 GETCHARINCTEST(c, eptr);
2886 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2887 if ((prop_category == prop_value) == prop_fail_result)
2888 RRETURN(MATCH_NOMATCH);
2893 for (i = 1; i <= min; i++)
2895 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2896 GETCHARINCTEST(c, eptr);
2897 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2898 if ((prop_chartype == prop_value) == prop_fail_result)
2899 RRETURN(MATCH_NOMATCH);
2904 for (i = 1; i <= min; i++)
2906 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2907 GETCHARINCTEST(c, eptr);
2908 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2909 if ((prop_script == prop_value) == prop_fail_result)
2910 RRETURN(MATCH_NOMATCH);
2915 RRETURN(PCRE_ERROR_INTERNAL);
2919 /* Match extended Unicode sequences. We will get here only if the
2920 support is in the binary; otherwise a compile-time error occurs. */
2922 else if (ctype == OP_EXTUNI)
2924 for (i = 1; i <= min; i++)
2926 GETCHARINCTEST(c, eptr);
2927 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2928 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2929 while (eptr < md->end_subject)
2932 if (!utf8) c = *eptr; else
2934 GETCHARLEN(c, eptr, len);
2936 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2937 if (prop_category != ucp_M) break;
2944 #endif /* SUPPORT_UCP */
2946 /* Handle all other cases when the coding is UTF-8 */
2949 if (utf8) switch(ctype)
2952 for (i = 1; i <= min; i++)
2954 if (eptr >= md->end_subject || IS_NEWLINE(eptr))
2955 RRETURN(MATCH_NOMATCH);
2957 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2962 for (i = 1; i <= min; i++)
2964 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2966 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2975 for (i = 1; i <= min; i++)
2977 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2978 GETCHARINC(c, eptr);
2981 default: RRETURN(MATCH_NOMATCH);
2983 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2994 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3001 for (i = 1; i <= min; i++)
3003 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3004 GETCHARINC(c, eptr);
3009 case 0x20: /* SPACE */
3010 case 0xa0: /* NBSP */
3011 case 0x1680: /* OGHAM SPACE MARK */
3012 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3013 case 0x2000: /* EN QUAD */
3014 case 0x2001: /* EM QUAD */
3015 case 0x2002: /* EN SPACE */
3016 case 0x2003: /* EM SPACE */
3017 case 0x2004: /* THREE-PER-EM SPACE */
3018 case 0x2005: /* FOUR-PER-EM SPACE */
3019 case 0x2006: /* SIX-PER-EM SPACE */
3020 case 0x2007: /* FIGURE SPACE */
3021 case 0x2008: /* PUNCTUATION SPACE */
3022 case 0x2009: /* THIN SPACE */
3023 case 0x200A: /* HAIR SPACE */
3024 case 0x202f: /* NARROW NO-BREAK SPACE */
3025 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3026 case 0x3000: /* IDEOGRAPHIC SPACE */
3027 RRETURN(MATCH_NOMATCH);
3033 for (i = 1; i <= min; i++)
3035 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3036 GETCHARINC(c, eptr);
3039 default: RRETURN(MATCH_NOMATCH);
3041 case 0x20: /* SPACE */
3042 case 0xa0: /* NBSP */
3043 case 0x1680: /* OGHAM SPACE MARK */
3044 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3045 case 0x2000: /* EN QUAD */
3046 case 0x2001: /* EM QUAD */
3047 case 0x2002: /* EN SPACE */
3048 case 0x2003: /* EM SPACE */
3049 case 0x2004: /* THREE-PER-EM SPACE */
3050 case 0x2005: /* FOUR-PER-EM SPACE */
3051 case 0x2006: /* SIX-PER-EM SPACE */
3052 case 0x2007: /* FIGURE SPACE */
3053 case 0x2008: /* PUNCTUATION SPACE */
3054 case 0x2009: /* THIN SPACE */
3055 case 0x200A: /* HAIR SPACE */
3056 case 0x202f: /* NARROW NO-BREAK SPACE */
3057 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3058 case 0x3000: /* IDEOGRAPHIC SPACE */
3065 for (i = 1; i <= min; i++)
3067 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3068 GETCHARINC(c, eptr);
3076 case 0x85: /* NEL */
3077 case 0x2028: /* LINE SEPARATOR */
3078 case 0x2029: /* PARAGRAPH SEPARATOR */
3079 RRETURN(MATCH_NOMATCH);
3085 for (i = 1; i <= min; i++)
3087 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3088 GETCHARINC(c, eptr);
3091 default: RRETURN(MATCH_NOMATCH);
3096 case 0x85: /* NEL */
3097 case 0x2028: /* LINE SEPARATOR */
3098 case 0x2029: /* PARAGRAPH SEPARATOR */
3105 for (i = 1; i <= min; i++)
3107 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3108 GETCHARINC(c, eptr);
3109 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3110 RRETURN(MATCH_NOMATCH);
3115 for (i = 1; i <= min; i++)
3117 if (eptr >= md->end_subject ||
3118 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3119 RRETURN(MATCH_NOMATCH);
3120 /* No need to skip more bytes - we know it's a 1-byte character */
3124 case OP_NOT_WHITESPACE:
3125 for (i = 1; i <= min; i++)
3127 if (eptr >= md->end_subject ||
3128 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3129 RRETURN(MATCH_NOMATCH);
3130 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3135 for (i = 1; i <= min; i++)
3137 if (eptr >= md->end_subject ||
3138 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3139 RRETURN(MATCH_NOMATCH);
3140 /* No need to skip more bytes - we know it's a 1-byte character */
3144 case OP_NOT_WORDCHAR:
3145 for (i = 1; i <= min; i++)
3147 if (eptr >= md->end_subject ||
3148 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3149 RRETURN(MATCH_NOMATCH);
3150 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3155 for (i = 1; i <= min; i++)
3157 if (eptr >= md->end_subject ||
3158 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3159 RRETURN(MATCH_NOMATCH);
3160 /* No need to skip more bytes - we know it's a 1-byte character */
3165 RRETURN(PCRE_ERROR_INTERNAL);
3166 } /* End switch(ctype) */
3169 #endif /* SUPPORT_UTF8 */
3171 /* Code for the non-UTF-8 case for minimum matching of operators other
3172 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3173 number of bytes present, as this was tested above. */
3178 for (i = 1; i <= min; i++)
3180 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3193 /* Because of the CRLF case, we can't assume the minimum number of
3194 bytes are present in this case. */
3197 for (i = 1; i <= min; i++)
3199 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3202 default: RRETURN(MATCH_NOMATCH);
3204 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3212 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3219 for (i = 1; i <= min; i++)
3221 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3226 case 0x20: /* SPACE */
3227 case 0xa0: /* NBSP */
3228 RRETURN(MATCH_NOMATCH);
3234 for (i = 1; i <= min; i++)
3236 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3239 default: RRETURN(MATCH_NOMATCH);
3241 case 0x20: /* SPACE */
3242 case 0xa0: /* NBSP */
3249 for (i = 1; i <= min; i++)
3251 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3259 case 0x85: /* NEL */
3260 RRETURN(MATCH_NOMATCH);
3266 for (i = 1; i <= min; i++)
3268 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3271 default: RRETURN(MATCH_NOMATCH);
3276 case 0x85: /* NEL */
3283 for (i = 1; i <= min; i++)
3284 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3288 for (i = 1; i <= min; i++)
3289 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3292 case OP_NOT_WHITESPACE:
3293 for (i = 1; i <= min; i++)
3294 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3298 for (i = 1; i <= min; i++)
3299 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3302 case OP_NOT_WORDCHAR:
3303 for (i = 1; i <= min; i++)
3304 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3305 RRETURN(MATCH_NOMATCH);
3309 for (i = 1; i <= min; i++)
3310 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3311 RRETURN(MATCH_NOMATCH);
3315 RRETURN(PCRE_ERROR_INTERNAL);
3319 /* If min = max, continue at the same level without recursing */
3321 if (min == max) continue;
3323 /* If minimizing, we have to test the rest of the pattern before each
3324 subsequent match. Again, separate the UTF-8 case for speed, and also
3325 separate the UCP cases. */
3335 for (fi = min;; fi++)
3337 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3338 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3339 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3340 GETCHARINC(c, eptr);
3341 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3343 /* Control never gets here */
3346 for (fi = min;; fi++)
3348 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3349 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3350 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3351 GETCHARINC(c, eptr);
3352 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3353 if ((prop_chartype == ucp_Lu ||
3354 prop_chartype == ucp_Ll ||
3355 prop_chartype == ucp_Lt) == prop_fail_result)
3356 RRETURN(MATCH_NOMATCH);
3358 /* Control never gets here */
3361 for (fi = min;; fi++)
3363 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3364 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3365 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3366 GETCHARINC(c, eptr);
3367 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3368 if ((prop_category == prop_value) == prop_fail_result)
3369 RRETURN(MATCH_NOMATCH);
3371 /* Control never gets here */
3374 for (fi = min;; fi++)
3376 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3377 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3378 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3379 GETCHARINC(c, eptr);
3380 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3381 if ((prop_chartype == prop_value) == prop_fail_result)
3382 RRETURN(MATCH_NOMATCH);
3384 /* Control never gets here */
3387 for (fi = min;; fi++)
3389 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3390 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3391 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3392 GETCHARINC(c, eptr);
3393 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3394 if ((prop_script == prop_value) == prop_fail_result)
3395 RRETURN(MATCH_NOMATCH);
3397 /* Control never gets here */
3400 RRETURN(PCRE_ERROR_INTERNAL);
3404 /* Match extended Unicode sequences. We will get here only if the
3405 support is in the binary; otherwise a compile-time error occurs. */
3407 else if (ctype == OP_EXTUNI)
3409 for (fi = min;; fi++)
3411 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3412 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3413 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3414 GETCHARINCTEST(c, eptr);
3415 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3416 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3417 while (eptr < md->end_subject)
3420 if (!utf8) c = *eptr; else
3422 GETCHARLEN(c, eptr, len);
3424 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3425 if (prop_category != ucp_M) break;
3432 #endif /* SUPPORT_UCP */
3438 for (fi = min;; fi++)
3440 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3441 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3442 if (fi >= max || eptr >= md->end_subject ||
3443 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3444 RRETURN(MATCH_NOMATCH);
3446 GETCHARINC(c, eptr);
3449 case OP_ANY: /* This is the non-NL case */
3457 default: RRETURN(MATCH_NOMATCH);
3459 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3469 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3479 case 0x20: /* SPACE */
3480 case 0xa0: /* NBSP */
3481 case 0x1680: /* OGHAM SPACE MARK */
3482 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3483 case 0x2000: /* EN QUAD */
3484 case 0x2001: /* EM QUAD */
3485 case 0x2002: /* EN SPACE */
3486 case 0x2003: /* EM SPACE */
3487 case 0x2004: /* THREE-PER-EM SPACE */
3488 case 0x2005: /* FOUR-PER-EM SPACE */
3489 case 0x2006: /* SIX-PER-EM SPACE */
3490 case 0x2007: /* FIGURE SPACE */
3491 case 0x2008: /* PUNCTUATION SPACE */
3492 case 0x2009: /* THIN SPACE */
3493 case 0x200A: /* HAIR SPACE */
3494 case 0x202f: /* NARROW NO-BREAK SPACE */
3495 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3496 case 0x3000: /* IDEOGRAPHIC SPACE */
3497 RRETURN(MATCH_NOMATCH);
3504 default: RRETURN(MATCH_NOMATCH);
3506 case 0x20: /* SPACE */
3507 case 0xa0: /* NBSP */
3508 case 0x1680: /* OGHAM SPACE MARK */
3509 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3510 case 0x2000: /* EN QUAD */
3511 case 0x2001: /* EM QUAD */
3512 case 0x2002: /* EN SPACE */
3513 case 0x2003: /* EM SPACE */
3514 case 0x2004: /* THREE-PER-EM SPACE */
3515 case 0x2005: /* FOUR-PER-EM SPACE */
3516 case 0x2006: /* SIX-PER-EM SPACE */
3517 case 0x2007: /* FIGURE SPACE */
3518 case 0x2008: /* PUNCTUATION SPACE */
3519 case 0x2009: /* THIN SPACE */
3520 case 0x200A: /* HAIR SPACE */
3521 case 0x202f: /* NARROW NO-BREAK SPACE */
3522 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3523 case 0x3000: /* IDEOGRAPHIC SPACE */
3536 case 0x85: /* NEL */
3537 case 0x2028: /* LINE SEPARATOR */
3538 case 0x2029: /* PARAGRAPH SEPARATOR */
3539 RRETURN(MATCH_NOMATCH);
3546 default: RRETURN(MATCH_NOMATCH);
3551 case 0x85: /* NEL */
3552 case 0x2028: /* LINE SEPARATOR */
3553 case 0x2029: /* PARAGRAPH SEPARATOR */
3559 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3560 RRETURN(MATCH_NOMATCH);
3564 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3565 RRETURN(MATCH_NOMATCH);
3568 case OP_NOT_WHITESPACE:
3569 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3570 RRETURN(MATCH_NOMATCH);
3574 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3575 RRETURN(MATCH_NOMATCH);
3578 case OP_NOT_WORDCHAR:
3579 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3580 RRETURN(MATCH_NOMATCH);
3584 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3585 RRETURN(MATCH_NOMATCH);
3589 RRETURN(PCRE_ERROR_INTERNAL);
3595 /* Not UTF-8 mode */
3597 for (fi = min;; fi++)
3599 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3600 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3601 if (fi >= max || eptr >= md->end_subject ||
3602 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3603 RRETURN(MATCH_NOMATCH);
3608 case OP_ANY: /* This is the non-NL case */
3616 default: RRETURN(MATCH_NOMATCH);
3618 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3627 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3637 case 0x20: /* SPACE */
3638 case 0xa0: /* NBSP */
3639 RRETURN(MATCH_NOMATCH);
3646 default: RRETURN(MATCH_NOMATCH);
3648 case 0x20: /* SPACE */
3649 case 0xa0: /* NBSP */
3662 case 0x85: /* NEL */
3663 RRETURN(MATCH_NOMATCH);
3670 default: RRETURN(MATCH_NOMATCH);
3675 case 0x85: /* NEL */
3681 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3685 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3688 case OP_NOT_WHITESPACE:
3689 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3693 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3696 case OP_NOT_WORDCHAR:
3697 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3701 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3705 RRETURN(PCRE_ERROR_INTERNAL);
3709 /* Control never gets here */
3712 /* If maximizing, it is worth using inline code for speed, doing the type
3713 test once at the start (i.e. keep it out of the loop). Again, keep the
3714 UTF-8 and UCP stuff separate. */
3718 pp = eptr; /* Remember where we started */
3726 for (i = min; i < max; i++)
3729 if (eptr >= md->end_subject) break;
3730 GETCHARLEN(c, eptr, len);
3731 if (prop_fail_result) break;
3737 for (i = min; i < max; i++)
3740 if (eptr >= md->end_subject) break;
3741 GETCHARLEN(c, eptr, len);
3742 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3743 if ((prop_chartype == ucp_Lu ||
3744 prop_chartype == ucp_Ll ||
3745 prop_chartype == ucp_Lt) == prop_fail_result)
3752 for (i = min; i < max; i++)
3755 if (eptr >= md->end_subject) break;
3756 GETCHARLEN(c, eptr, len);
3757 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3758 if ((prop_category == prop_value) == prop_fail_result)
3765 for (i = min; i < max; i++)
3768 if (eptr >= md->end_subject) break;
3769 GETCHARLEN(c, eptr, len);
3770 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3771 if ((prop_chartype == prop_value) == prop_fail_result)
3778 for (i = min; i < max; i++)
3781 if (eptr >= md->end_subject) break;
3782 GETCHARLEN(c, eptr, len);
3783 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3784 if ((prop_script == prop_value) == prop_fail_result)
3791 /* eptr is now past the end of the maximum run */
3793 if (possessive) continue;
3796 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3797 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3798 if (eptr-- == pp) break; /* Stop if tried at original pos */
3799 if (utf8) BACKCHAR(eptr);
3803 /* Match extended Unicode sequences. We will get here only if the
3804 support is in the binary; otherwise a compile-time error occurs. */
3806 else if (ctype == OP_EXTUNI)
3808 for (i = min; i < max; i++)
3810 if (eptr >= md->end_subject) break;
3811 GETCHARINCTEST(c, eptr);
3812 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3813 if (prop_category == ucp_M) break;
3814 while (eptr < md->end_subject)
3817 if (!utf8) c = *eptr; else
3819 GETCHARLEN(c, eptr, len);
3821 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3822 if (prop_category != ucp_M) break;
3827 /* eptr is now past the end of the maximum run */
3829 if (possessive) continue;
3832 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3833 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3834 if (eptr-- == pp) break; /* Stop if tried at original pos */
3835 for (;;) /* Move back over one extended */
3838 if (!utf8) c = *eptr; else
3841 GETCHARLEN(c, eptr, len);
3843 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3844 if (prop_category != ucp_M) break;
3851 #endif /* SUPPORT_UCP */
3863 for (i = min; i < max; i++)
3865 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3867 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3871 /* Handle unlimited UTF-8 repeat */
3875 for (i = min; i < max; i++)
3877 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3879 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3887 for (i = min; i < max; i++)
3889 if (eptr >= md->end_subject) break;
3891 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3894 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3897 /* The byte case is the same as non-UTF8 */
3901 if (c > (unsigned int)(md->end_subject - eptr))
3902 c = md->end_subject - eptr;
3907 for (i = min; i < max; i++)
3910 if (eptr >= md->end_subject) break;
3911 GETCHARLEN(c, eptr, len);
3914 if (++eptr >= md->end_subject) break;
3915 if (*eptr == 0x000a) eptr++;
3921 (c != 0x000b && c != 0x000c &&
3922 c != 0x0085 && c != 0x2028 && c != 0x2029)))
3931 for (i = min; i < max; i++)
3935 if (eptr >= md->end_subject) break;
3936 GETCHARLEN(c, eptr, len);
3939 default: gotspace = FALSE; break;
3941 case 0x20: /* SPACE */
3942 case 0xa0: /* NBSP */
3943 case 0x1680: /* OGHAM SPACE MARK */
3944 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3945 case 0x2000: /* EN QUAD */
3946 case 0x2001: /* EM QUAD */
3947 case 0x2002: /* EN SPACE */
3948 case 0x2003: /* EM SPACE */
3949 case 0x2004: /* THREE-PER-EM SPACE */
3950 case 0x2005: /* FOUR-PER-EM SPACE */
3951 case 0x2006: /* SIX-PER-EM SPACE */
3952 case 0x2007: /* FIGURE SPACE */
3953 case 0x2008: /* PUNCTUATION SPACE */
3954 case 0x2009: /* THIN SPACE */
3955 case 0x200A: /* HAIR SPACE */
3956 case 0x202f: /* NARROW NO-BREAK SPACE */
3957 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3958 case 0x3000: /* IDEOGRAPHIC SPACE */
3962 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3969 for (i = min; i < max; i++)
3973 if (eptr >= md->end_subject) break;
3974 GETCHARLEN(c, eptr, len);
3977 default: gotspace = FALSE; break;
3982 case 0x85: /* NEL */
3983 case 0x2028: /* LINE SEPARATOR */
3984 case 0x2029: /* PARAGRAPH SEPARATOR */
3988 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3994 for (i = min; i < max; i++)
3997 if (eptr >= md->end_subject) break;
3998 GETCHARLEN(c, eptr, len);
3999 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4005 for (i = min; i < max; i++)
4008 if (eptr >= md->end_subject) break;
4009 GETCHARLEN(c, eptr, len);
4010 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4015 case OP_NOT_WHITESPACE:
4016 for (i = min; i < max; i++)
4019 if (eptr >= md->end_subject) break;
4020 GETCHARLEN(c, eptr, len);
4021 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4027 for (i = min; i < max; i++)
4030 if (eptr >= md->end_subject) break;
4031 GETCHARLEN(c, eptr, len);
4032 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4037 case OP_NOT_WORDCHAR:
4038 for (i = min; i < max; i++)
4041 if (eptr >= md->end_subject) break;
4042 GETCHARLEN(c, eptr, len);
4043 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4049 for (i = min; i < max; i++)
4052 if (eptr >= md->end_subject) break;
4053 GETCHARLEN(c, eptr, len);
4054 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4060 RRETURN(PCRE_ERROR_INTERNAL);
4063 /* eptr is now past the end of the maximum run */
4065 if (possessive) continue;
4068 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4069 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4070 if (eptr-- == pp) break; /* Stop if tried at original pos */
4075 #endif /* SUPPORT_UTF8 */
4077 /* Not UTF-8 mode */
4082 for (i = min; i < max; i++)
4084 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4092 if (c > (unsigned int)(md->end_subject - eptr))
4093 c = md->end_subject - eptr;
4098 for (i = min; i < max; i++)
4100 if (eptr >= md->end_subject) break;
4104 if (++eptr >= md->end_subject) break;
4105 if (*eptr == 0x000a) eptr++;
4111 (c != 0x000b && c != 0x000c && c != 0x0085)))
4119 for (i = min; i < max; i++)
4121 if (eptr >= md->end_subject) break;
4123 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4129 for (i = min; i < max; i++)
4131 if (eptr >= md->end_subject) break;
4133 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4139 for (i = min; i < max; i++)
4141 if (eptr >= md->end_subject) break;
4143 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4150 for (i = min; i < max; i++)
4152 if (eptr >= md->end_subject) break;
4154 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4161 for (i = min; i < max; i++)
4163 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4170 for (i = min; i < max; i++)
4172 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4178 case OP_NOT_WHITESPACE:
4179 for (i = min; i < max; i++)
4181 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4188 for (i = min; i < max; i++)
4190 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4196 case OP_NOT_WORDCHAR:
4197 for (i = min; i < max; i++)
4199 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4206 for (i = min; i < max; i++)
4208 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4215 RRETURN(PCRE_ERROR_INTERNAL);
4218 /* eptr is now past the end of the maximum run */
4220 if (possessive) continue;
4223 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4225 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4229 /* Get here if we can't make it match with any permitted repetitions */
4231 RRETURN(MATCH_NOMATCH);
4233 /* Control never gets here */
4235 /* There's been some horrible disaster. Arrival here can only mean there is
4236 something seriously wrong in the code above or the OP_xxx definitions. */
4239 DPRINTF(("Unknown opcode %d\n", *ecode));
4240 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4243 /* Do not stick any code in here without much thought; it is assumed
4244 that "continue" in the code above comes out to here to repeat the main
4245 loop. */
4247 } /* End of main loop */
4248 /* Control never reaches here */
4251 /* When compiling to use the heap rather than the stack for recursive calls to
4252 match(), the RRETURN() macro jumps here. The number that is saved in
4253 frame->Xwhere indicates which label we actually want to return to. */
4256 #define LBL(val) case val: goto L_RM##val;
4258 switch (frame->Xwhere)
4260 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4261 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4262 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4263 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4266 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4267 LBL(32) LBL(34) LBL(42) LBL(46)
4269 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4270 #endif /* SUPPORT_UCP */
4271 #endif /* SUPPORT_UTF8 */
4273 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4274 return PCRE_ERROR_INTERNAL;
4277 #endif /* NO_RECURSE */
4281 /***************************************************************************
4282 ****************************************************************************
4283 RECURSION IN THE match() FUNCTION
4285 Undefine all the macros that were defined above to handle this. */
4304 #undef new_recursive
4319 #undef save_capture_last
4329 /* These two are defined as macros in both cases */
4334 /***************************************************************************
4335 ***************************************************************************/
4339 /*************************************************
4340 * Execute a Regular Expression *
4341 *************************************************/
4343 /* This function applies a compiled re to a subject string and picks out
4344 portions of the string if it matches. Two elements in the vector are set for
4345 each substring: the offsets to the start and end of the substring.
4347 Arguments:
4348 argument_re points to the compiled expression
4349 extra_data points to extra data or is NULL
4350 subject points to the subject string
4351 length length of subject string (may contain binary zeros)
4352 start_offset where to start in the subject string
4353 options option bits
4354 offsets points to a vector of ints to be filled in with offsets
4355 offsetcount the number of elements in the vector
4357 Returns: > 0 => success; value is the number of elements filled in
4358 = 0 => success, but offsets is not big enough
4359 -1 => failed to match
4360 < -1 => some kind of unexpected problem
4361 */
/* pcre_exec(): match a compiled pattern against a subject string.

After validating its arguments and filling in the match_data block, this
function calls match() at successive starting positions (just once for an
anchored pattern), using the first-byte, start-bits and required-byte
information to skip starting positions that cannot match.

Returns, as visible in this function:
  > 0   a match; the value is md->end_offset_top/2
  0     a match, but the offsets vector was too small (or offsetcount < 2)
  PCRE_ERROR_NOMATCH or PCRE_ERROR_PARTIAL when no match was found
  PCRE_ERROR_BADOPTION, _NULL, _BADCOUNT, _BADMAGIC, _BADNEWLINE,
  _BADPARTIAL, _BADUTF8, _BADUTF8_OFFSET, _NOMEMORY from the checks below

NOTE(review): several lines of this function (braces, and the declarations of
names such as req_byte, newline, anchored, utf8 and end_subject) are not
present in this copy of the file -- confirm against the upstream source. */
4364 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4365 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4368 int rc, resetcount, ocount;
4369 int first_byte = -1;
4373 unsigned long int ims;
4374 BOOL using_temporary_offsets = FALSE;
4378 BOOL first_byte_caseless = FALSE;
4379 BOOL req_byte_caseless = FALSE;
4381 match_data match_block;
4382 match_data *md = &match_block;
4383 const uschar *tables;
4384 const uschar *start_bits = NULL;
4385 USPTR start_match = (USPTR)subject + start_offset;
/* Start one byte before start_match so that the first "p > req_byte_ptr" test
in the matching loop is true and the required-byte search is performed. */
4387 USPTR req_byte_ptr = start_match - 1;
4389 pcre_study_data internal_study;
4390 const pcre_study_data *study;
4392 real_pcre internal_re;
4393 const real_pcre *external_re = (const real_pcre *)argument_re;
4394 const real_pcre *re = external_re;
4396 /* Plausibility checks */
4398 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
/* offsets may be NULL only when offsetcount is not positive; a negative count
is rejected separately just below. */
4399 if (re == NULL || subject == NULL ||
4400 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4401 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4403 /* Fish out the optional data from the extra_data structure, first setting
4404 the default values. */
4407 md->match_limit = MATCH_LIMIT;
4408 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4409 md->callout_data = NULL;
4411 /* The table pointer is always in native byte order. */
4413 tables = external_re->tables;
/* Override the defaults with each field whose flag bit is set in
extra_data->flags. */
4415 if (extra_data != NULL)
4417 register unsigned int flags = extra_data->flags;
4418 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4419 study = (const pcre_study_data *)extra_data->study_data;
4420 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4421 md->match_limit = extra_data->match_limit;
4422 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4423 md->match_limit_recursion = extra_data->match_limit_recursion;
4424 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4425 md->callout_data = extra_data->callout_data;
4426 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4429 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4430 is a feature that makes it possible to save compiled regex and re-use them
4431 in other programs later. */
4433 if (tables == NULL) tables = _pcre_default_tables;
4435 /* Check that the first field in the block is the magic number. If it is not,
4436 test for a regex that was compiled on a host of opposite endianness. If this is
4437 the case, flipped values are put in internal_re and internal_study if there was
4440 if (re->magic_number != MAGIC_NUMBER)
4442 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4443 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4444 if (study != NULL) study = &internal_study;
4447 /* Set up other data */
4449 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4450 startline = (re->flags & PCRE_STARTLINE) != 0;
4451 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4453 /* The code starts after the real_pcre block and the capture name table. */
4455 md->start_code = (const uschar *)external_re + re->name_table_offset +
4456 re->name_count * re->name_entry_size;
4458 md->start_subject = (USPTR)subject;
4459 md->start_offset = start_offset;
4460 md->end_subject = md->start_subject + length;
4461 end_subject = md->end_subject;
4463 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4464 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4465 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4467 md->notbol = (options & PCRE_NOTBOL) != 0;
4468 md->noteol = (options & PCRE_NOTEOL) != 0;
4469 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4470 md->partial = (options & PCRE_PARTIAL) != 0;
4473 md->recursive = NULL; /* No recursion at top level */
4475 md->lcc = tables + lcc_offset;
4476 md->ctypes = tables + ctypes_offset;
4478 /* Handle different \R options. */
4480 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4483 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4484 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4487 md->bsr_anycrlf = TRUE;
4489 md->bsr_anycrlf = FALSE;
4493 case PCRE_BSR_ANYCRLF:
4494 md->bsr_anycrlf = TRUE;
4497 case PCRE_BSR_UNICODE:
4498 md->bsr_anycrlf = FALSE;
4501 default: return PCRE_ERROR_BADNEWLINE;
4504 /* Handle different types of newline. The three bits give eight cases. If
4505 nothing is set at run time, whatever was used at compile time applies. */
4507 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4508 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4510 case 0: newline = NEWLINE; break; /* Compile-time default */
4511 case PCRE_NEWLINE_CR: newline = '\r'; break;
4512 case PCRE_NEWLINE_LF: newline = '\n'; break;
/* CRLF is packed into one int: CR in bits 8-15 and LF in bits 0-7. */
4513 case PCRE_NEWLINE_CR+
4514 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
/* Negative values are sentinels rather than characters; they select the
NLTYPE_ANY and NLTYPE_ANYCRLF settings made below. */
4515 case PCRE_NEWLINE_ANY: newline = -1; break;
4516 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4517 default: return PCRE_ERROR_BADNEWLINE;
4522 md->nltype = NLTYPE_ANYCRLF;
4524 else if (newline < 0)
4526 md->nltype = NLTYPE_ANY;
4530 md->nltype = NLTYPE_FIXED;
/* Unpack a two-byte newline that was packed as (first << 8) | second. */
4534 md->nl[0] = (newline >> 8) & 255;
4535 md->nl[1] = newline & 255;
4540 md->nl[0] = newline;
4544 /* Partial matching is supported only for a restricted set of regexes at the
4547 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4548 return PCRE_ERROR_BADPARTIAL;
4550 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4551 back the character offset. */
4554 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4556 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4557 return PCRE_ERROR_BADUTF8;
4558 if (start_offset > 0 && start_offset < length)
4560 int tb = ((uschar *)subject)[start_offset];
/* NOTE(review): the lines between reading tb and this test are not visible in
this copy. Comparing against 0 and 0xc0 suggests tb has been masked with 0xc0
first, so that a byte of the form 10xxxxxx at start_offset is rejected --
confirm against the upstream source. */
4564 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4570 /* The ims options can vary during the matching as a result of the presence
4571 of (?ims) items in the pattern. They are kept in a local variable so that
4572 restoring at the exit of a group is easy. */
4574 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4576 /* If the expression has got more back references than the offsets supplied can
4577 hold, we get a temporary chunk of working store to use during the matching.
4578 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4581 ocount = offsetcount - (offsetcount % 3);
4583 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4585 ocount = re->top_backref * 3 + 3;
4586 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4587 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4588 using_temporary_offsets = TRUE;
4589 DPRINTF(("Got memory to hold back references\n"));
4591 else md->offset_vector = offsets;
4593 md->offset_end = ocount;
4594 md->offset_max = (2*ocount)/3;
4595 md->offset_overflow = FALSE;
4596 md->capture_last = -1;
4598 /* Compute the minimum number of offsets that we need to reset each time. Doing
4599 this makes a huge difference to execution time when there aren't many brackets
4602 resetcount = 2 + re->top_bracket * 2;
4603 if (resetcount > offsetcount) resetcount = ocount;
4605 /* Reset the working variable associated with each extraction. These should
4606 never be used unless previously set, but they get saved and restored, and so we
4607 initialize them to avoid reading uninitialized locations. */
4609 if (md->offset_vector != NULL)
/* Working downwards from the last slot, set the top (resetcount/2 - 1) slots
of the vector to -1. */
4611 register int *iptr = md->offset_vector + ocount;
4612 register int *iend = iptr - resetcount/2 + 1;
4613 while (--iptr >= iend) *iptr = -1;
4616 /* Set up the first character to match, if available. The first_byte value is
4617 never set for an anchored regular expression, but the anchoring may be forced
4618 at run time, so we have to test for anchoring. The first char may be unset for
4619 an unanchored pattern, of course. If there's no first char and the pattern was
4620 studied, there may be a bitmap of possible first characters. */
4624 if ((re->flags & PCRE_FIRSTSET) != 0)
/* The low 8 bits of re->first_byte hold the byte itself; REQ_CASELESS is a
separate flag bit. For a caseless first byte, keep its md->lcc mapping so that
the scan loop can compare mapped subject bytes against it. */
4626 first_byte = re->first_byte & 255;
4627 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4628 first_byte = md->lcc[first_byte];
/* The start-bits map from study data is used only when the pattern is not
flagged PCRE_STARTLINE. */
4631 if (!startline && study != NULL &&
4632 (study->options & PCRE_STUDY_MAPPED) != 0)
4633 start_bits = study->start_bits;
4636 /* For anchored or unanchored matches, there may be a "last known required
4639 if ((re->flags & PCRE_REQCHSET) != 0)
4641 req_byte = re->req_byte & 255;
4642 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4643 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4647 /* ==========================================================================*/
4649 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
4650 the loop runs just once. */
/* end_subject may be shortened temporarily for firstline handling; keep the
real value so that it can be restored before the match is run. */
4654 USPTR save_end_subject = end_subject;
4655 USPTR new_start_match;
4657 /* Reset the maximum number of extractions we might see. */
4659 if (md->offset_vector != NULL)
4661 register int *iptr = md->offset_vector;
4662 register int *iend = iptr + resetcount;
4663 while (iptr < iend) *iptr++ = -1;
4666 /* Advance to a unique first char if possible. If firstline is TRUE, the
4667 start of the match is constrained to the first line of a multiline string.
4668 That is, the match must be before or at the first newline. Implement this by
4669 temporarily adjusting end_subject so that we stop scanning at a newline. If
4670 the match fails at the newline, later code breaks this loop. */
4674 USPTR t = start_match;
4675 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4679 /* Now test for a unique first byte */
4681 if (first_byte >= 0)
4683 if (first_byte_caseless)
4684 while (start_match < end_subject &&
4685 md->lcc[*start_match] != first_byte)
4686 { NEXTCHAR(start_match); }
4688 while (start_match < end_subject && *start_match != first_byte)
4689 { NEXTCHAR(start_match); }
4692 /* Or to just after a linebreak for a multiline match if possible */
4696 if (start_match > md->start_subject + start_offset)
4698 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4699 { NEXTCHAR(start_match); }
4701 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4702 and we are now at a LF, advance the match position by one more character.
4705 if (start_match[-1] == '\r' &&
4706 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4707 start_match < end_subject &&
4708 *start_match == '\n')
4713 /* Or to a non-unique first char after study */
4715 else if (start_bits != NULL)
4717 while (start_match < end_subject)
4719 register unsigned int c = *start_match;
/* start_bits is a bitmap indexed by byte value (byte c/8, bit c&7). A clear
bit means that c cannot start a match, so move past it. */
4720 if ((start_bits[c/8] & (1 << (c&7))) == 0)
4721 { NEXTCHAR(start_match); }
4726 /* Restore fudged end_subject */
4728 end_subject = save_end_subject;
4730 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4731 printf(">>>> Match against: ");
4732 pchars(start_match, end_subject - start_match, TRUE, md);
4736 /* If req_byte is set, we know that that character must appear in the subject
4737 for the match to succeed. If the first character is set, req_byte must be
4738 later in the subject; otherwise the test starts at the match point. This
4739 optimization can save a huge amount of backtracking in patterns with nested
4740 unlimited repeats that aren't going to match. Writing separate code for
4741 cased/caseless versions makes it go faster, as does using an autoincrement
4742 and backing off on a match.
4744 HOWEVER: when the subject string is very, very long, searching to its end can
4745 take a long time, and give bad performance on quite ordinary patterns. This
4746 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4747 string... so we don't do this when the string is sufficiently long.
4749 ALSO: this processing is disabled when partial matching is requested.
4752 if (req_byte >= 0 &&
4753 end_subject - start_match < REQ_BYTE_MAX &&
4756 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4758 /* We don't need to repeat the search if we haven't yet reached the
4759 place we found it at last time. */
4761 if (p > req_byte_ptr)
4763 if (req_byte_caseless)
/* Scan with auto-increment; on a hit, step back one so that p points at the
required byte itself. Either case of the byte is accepted here. */
4765 while (p < end_subject)
4767 register int pp = *p++;
4768 if (pp == req_byte || pp == req_byte2) { p--; break; }
4773 while (p < end_subject)
4775 if (*p++ == req_byte) { p--; break; }
4779 /* If we can't find the required character, break the matching loop,
4780 forcing a match failure. */
4782 if (p >= end_subject)
4788 /* If we have found the required character, save the point where we
4789 found it, so that we don't search again next time round the loop if
4790 the start hasn't passed this character yet. */
4796 /* OK, we can now run the match. */
4798 md->start_match_ptr = start_match;
4799 md->match_call_count = 0;
4800 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4804 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4805 exactly like PRUNE. */
4810 new_start_match = start_match + 1;
/* Step over bytes of the form 10xxxxxx (UTF-8 continuation bytes) so that the
next attempt starts on a character boundary. NOTE(review): presumably this is
done only in UTF-8 mode; the guarding line is not visible in this copy. */
4813 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4818 /* SKIP passes back the next starting point explicitly. */
4821 new_start_match = md->start_match_ptr;
4824 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4830 /* Any other return is some kind of error. */
4836 /* Control reaches here for the various types of "no match at this point"
4837 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4841 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4842 newline in the subject (though it may continue over the newline). Therefore,
4843 if we have just failed to match, starting at a newline, do not continue. */
4845 if (firstline && IS_NEWLINE(start_match)) break;
4847 /* Advance to new matching position */
4849 start_match = new_start_match;
4851 /* Break the loop if the pattern is anchored or if we have passed the end of
4854 if (anchored || start_match > end_subject) break;
4856 /* If we have just passed a CR and we are now at a LF, and the pattern does
4857 not contain any explicit matches for \r or \n, and the newline option is CRLF
4858 or ANY or ANYCRLF, advance the match position by one more character. */
4860 if (start_match[-1] == '\r' &&
4861 start_match < end_subject &&
4862 *start_match == '\n' &&
4863 (re->flags & PCRE_HASCRORLF) == 0 &&
4864 (md->nltype == NLTYPE_ANY ||
4865 md->nltype == NLTYPE_ANYCRLF ||
4869 } /* End of for(;;) "bumpalong" loop */
4871 /* ==========================================================================*/
4873 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4876 (1) The pattern is anchored or the match was failed by (*COMMIT);
4878 (2) We are past the end of the subject;
4880 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4881 this option requests that a match occur at or before the first newline in
4884 When we have a match and the offset vector is big enough to deal with any
4885 backreferences, captured substring offsets will already be set up. In the case
4886 where we had to get some local store to hold offsets for backreference
4887 processing, copy those that we can. In this case there need not be overflow if
4888 certain parts of the pattern were not used, even though there are more
4889 capturing parentheses than vector slots. */
4893 if (rc == MATCH_MATCH)
4895 if (using_temporary_offsets)
/* Slots 0 and 1 are filled in below from start_match_ptr and end_match_ptr, so
only slots from 2 onwards are copied, and only when the caller's vector has
room beyond the first pair. */
4897 if (offsetcount >= 4)
4899 memcpy(offsets + 2, md->offset_vector + 2,
4900 (offsetcount - 2) * sizeof(int));
4901 DPRINTF(("Copied offsets from temporary memory\n"));
/* Flag overflow when more slots were used than the caller's vector can hold. */
4903 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4904 DPRINTF(("Freeing temporary memory\n"));
4905 (pcre_free)(md->offset_vector);
4908 /* Set the return code to the number of captured strings, or 0 if there are
4909 too many to fit into the vector. */
4911 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4913 /* If there is space, set up the whole thing as substring 0. The value of
4914 md->start_match_ptr might be modified if \K was encountered on the success
4917 if (offsetcount < 2) rc = 0; else
4919 offsets[0] = md->start_match_ptr - md->start_subject;
4920 offsets[1] = md->end_match_ptr - md->start_subject;
4923 DPRINTF((">>>> returning %d\n", rc));
4927 /* Control gets here if there has been an error, or if the overall match
4928 attempt has failed at all permitted starting positions. */
4930 if (using_temporary_offsets)
4932 DPRINTF(("Freeing temporary memory\n"));
4933 (pcre_free)(md->offset_vector);
/* A code other than MATCH_NOMATCH is passed back as an error. Otherwise the
result is PCRE_ERROR_PARTIAL when partial matching was requested and
md->hitend is set, and PCRE_ERROR_NOMATCH in all remaining cases. */
4936 if (rc != MATCH_NOMATCH)
4938 DPRINTF((">>>> error: returning %d\n", rc));
4941 else if (md->partial && md->hitend)
4943 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4944 return PCRE_ERROR_PARTIAL;
4948 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4949 return PCRE_ERROR_NOMATCH;
4953 /* End of pcre_exec.c */