1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
53 #include "pcre_internal.h"
55 /* Undefine some potentially clashing cpp symbols */
60 /* Flag bits for the match() function */
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
69 #define MATCH_NOMATCH 0
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
74 #define MATCH_COMMIT (-999)
75 #define MATCH_PRUNE (-998)
76 #define MATCH_SKIP (-997)
77 #define MATCH_THEN (-996)
79 /* Maximum number of ints of offset to save on the stack for recursive calls.
80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81 because the offset vector is always a multiple of 3 long. */
83 #define REC_STACK_SAVE_MAX 30
85 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
87 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
88 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if requested.
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
135 Returns: TRUE if matched
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
142 USPTR p = md->start_subject + md->offset_vector[offset];
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
157 /* Always fail if not enough characters left */
159 if (length > md->end_subject - eptr) return FALSE;
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
165 if ((ims & PCRE_CASELESS) != 0)
171 USPTR endptr = eptr + length;
172 while (eptr < endptr)
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185 is no UCP support. */
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
202 /***************************************************************************
203 ****************************************************************************
204 RECURSION IN THE match() FUNCTION
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
240 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241 below must be updated in sync. */
243 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
244 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
245 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
246 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
247 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
248 RM51, RM52, RM53, RM54 };
250 /* These versions of the macros use the stack, as normal. There are debugging
251 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 actually used in this definition. */
255 #define REGISTER register
258 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
260 printf("match() called in line %d\n", __LINE__); \
261 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); \
262 printf("to line %d\n", __LINE__); \
264 #define RRETURN(ra) \
266 printf("match() returned %d from line %d ", ra, __LINE__); \
270 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
271 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1)
272 #define RRETURN(ra) return ra
278 /* These versions of the macros manage a private stack on the heap. Note that
279 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280 argument of match(), which never changes. */
284 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
286 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
287 frame->Xwhere = rw; \
288 newframe->Xeptr = ra;\
289 newframe->Xecode = rb;\
290 newframe->Xmstart = mstart;\
291 newframe->Xmarkptr = markptr;\
292 newframe->Xoffset_top = rc;\
293 newframe->Xims = re;\
294 newframe->Xeptrb = rf;\
295 newframe->Xflags = rg;\
296 newframe->Xrdepth = frame->Xrdepth + 1;\
297 newframe->Xprevframe = frame;\
299 DPRINTF(("restarting from line %d\n", __LINE__));\
302 DPRINTF(("jumped back to line %d\n", __LINE__));\
307 heapframe *newframe = frame;\
308 frame = newframe->Xprevframe;\
309 (pcre_stack_free)(newframe);\
319 /* Structure for remembering the local variables in a private frame */
321 typedef struct heapframe {
322 struct heapframe *Xprevframe;
324 /* Function arguments that may change */
327 const uschar *Xecode;
334 unsigned int Xrdepth;
336 /* Function local variables */
348 recursion_info Xnew_recursive;
354 unsigned long int Xoriginal_ims;
359 int Xprop_fail_result;
377 int Xsave_capture_last;
378 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
379 int Xstacksave[REC_STACK_SAVE_MAX];
383 /* Where to jump back to */
392 /***************************************************************************
393 ***************************************************************************/
397 /*************************************************
398 * Match from current position *
399 *************************************************/
401 /* This function is called recursively in many circumstances. Whenever it
402 returns a negative (error) response, the outer incarnation must also return the
405 /* These macros pack up tests that are used for partial matching, and which
406 appear several times in the code. We set the "hit end" flag if the pointer is
407 at the end of the subject and also past the start of the subject (i.e.
408 something has been matched). For hard partial matching, we then return
409 immediately. The second one is used when we already know we are past the end of
412 #define CHECK_PARTIAL()\
413 if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
416 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
419 #define SCHECK_PARTIAL()\
420 if (md->partial != 0 && eptr > mstart)\
423 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
427 /* Performance note: It might be tempting to extract commonly used fields from
428 the md structure (e.g. utf8, end_subject) into individual variables to improve
429 performance. Tests using gcc on a SPARC disproved this; in the first case, it
430 made performance worse.
433 eptr pointer to current character in subject
434 ecode pointer to current position in compiled code
435 mstart pointer to the current match start position (can be modified
437 markptr pointer to the most recent MARK name, or NULL
438 offset_top current top pointer
439 md pointer to "static" info for the match
440 ims current /i, /m, and /s options
441 eptrb pointer to chain of blocks containing eptr at start of
442 brackets - for testing for empty matches
444 match_condassert - this is an assertion condition
445 match_cbegroup - this is the start of an unlimited repeat
446 group that can match an empty string
447 rdepth the recursion depth
449 Returns: MATCH_MATCH if matched ) these values are >= 0
450 MATCH_NOMATCH if failed to match )
451 a negative PCRE_ERROR_xxx value if aborted by an error condition
452 (e.g. stopped by repeated call or recursion limit)
456 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart, USPTR
457 markptr, int offset_top, match_data *md, unsigned long int ims,
458 eptrblock *eptrb, int flags, unsigned int rdepth)
460 /* These variables do not need to be preserved over recursion in this function,
461 so they can be ordinary variables in all cases. Mark some of them with
462 "register" because they are used a lot in loops. */
464 register int rrc; /* Returns from recursive calls */
465 register int i; /* Used for loops not involving calls to RMATCH() */
466 register unsigned int c; /* Character values not kept over RMATCH() calls */
467 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
469 BOOL minimize, possessive; /* Quantifier options */
472 /* When recursion is not being used, all "local" variables that have to be
473 preserved over calls to RMATCH() are part of a "frame" which is obtained from
474 heap storage. Set up the top-level frame here; others are obtained from the
475 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
478 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
479 frame->Xprevframe = NULL; /* Marks the top level */
481 /* Copy in the original argument variables */
484 frame->Xecode = ecode;
485 frame->Xmstart = mstart;
486 frame->Xmarkptr = markptr;
487 frame->Xoffset_top = offset_top;
489 frame->Xeptrb = eptrb;
490 frame->Xflags = flags;
491 frame->Xrdepth = rdepth;
493 /* This is where control jumps back to in order to effect "recursion" */
497 /* Macros make the argument variables come from the current frame */
499 #define eptr frame->Xeptr
500 #define ecode frame->Xecode
501 #define mstart frame->Xmstart
502 #define markptr frame->Xmarkptr
503 #define offset_top frame->Xoffset_top
504 #define ims frame->Xims
505 #define eptrb frame->Xeptrb
506 #define flags frame->Xflags
507 #define rdepth frame->Xrdepth
509 /* Ditto for the local variables */
512 #define charptr frame->Xcharptr
514 #define callpat frame->Xcallpat
515 #define codelink frame->Xcodelink
516 #define data frame->Xdata
517 #define next frame->Xnext
518 #define pp frame->Xpp
519 #define prev frame->Xprev
520 #define saved_eptr frame->Xsaved_eptr
522 #define new_recursive frame->Xnew_recursive
524 #define cur_is_word frame->Xcur_is_word
525 #define condition frame->Xcondition
526 #define prev_is_word frame->Xprev_is_word
528 #define original_ims frame->Xoriginal_ims
531 #define prop_type frame->Xprop_type
532 #define prop_value frame->Xprop_value
533 #define prop_fail_result frame->Xprop_fail_result
534 #define prop_category frame->Xprop_category
535 #define prop_chartype frame->Xprop_chartype
536 #define prop_script frame->Xprop_script
537 #define oclength frame->Xoclength
538 #define occhars frame->Xocchars
541 #define ctype frame->Xctype
542 #define fc frame->Xfc
543 #define fi frame->Xfi
544 #define length frame->Xlength
545 #define max frame->Xmax
546 #define min frame->Xmin
547 #define number frame->Xnumber
548 #define offset frame->Xoffset
549 #define op frame->Xop
550 #define save_capture_last frame->Xsave_capture_last
551 #define save_offset1 frame->Xsave_offset1
552 #define save_offset2 frame->Xsave_offset2
553 #define save_offset3 frame->Xsave_offset3
554 #define stacksave frame->Xstacksave
556 #define newptrb frame->Xnewptrb
558 /* When recursion is being used, local variables are allocated on the stack and
559 get preserved during recursion in the normal way. In this environment, fi and
560 i, and fc and c, can be the same variables. */
562 #else /* NO_RECURSE not defined */
567 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
568 const uschar *charptr; /* in small blocks of the code. My normal */
569 #endif /* style of coding would have declared */
570 const uschar *callpat; /* them within each of those blocks. */
571 const uschar *data; /* However, in order to accommodate the */
572 const uschar *next; /* version of this code that uses an */
573 USPTR pp; /* external "stack" implemented on the */
574 const uschar *prev; /* heap, it is easier to declare them all */
575 USPTR saved_eptr; /* here, so the declarations can be cut */
576 /* out in a block. The only declarations */
577 recursion_info new_recursive; /* within blocks below are for variables */
578 /* that do not have to be preserved over */
579 BOOL cur_is_word; /* a recursive call to RMATCH(). */
583 unsigned long int original_ims;
588 int prop_fail_result;
604 int save_capture_last;
605 int save_offset1, save_offset2, save_offset3;
606 int stacksave[REC_STACK_SAVE_MAX];
609 #endif /* NO_RECURSE */
611 /* These statements are here to stop the compiler complaining about uninitialized
616 prop_fail_result = 0;
620 /* This label is used for tail recursion, which is used in a few cases even
621 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
622 used. Thanks to Ian Taylor for noticing this possibility and sending the
627 /* OK, now we can get on with the real code of the function. Recursive calls
628 are specified by the macro RMATCH and RRETURN is used to return. When
629 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
630 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
631 defined). However, RMATCH isn't like a function call because it's quite a
632 complicated macro. It has to be used in one particular way. This shouldn't,
633 however, impact performance when true recursion is being used. */
636 utf8 = md->utf8; /* Local copy of the flag */
641 /* First check that we haven't called match() too many times, or that we
642 haven't exceeded the recursive call limit. */
644 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
645 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
647 original_ims = ims; /* Save for resetting on ')' */
649 /* At the start of a group with an unlimited repeat that may match an empty
650 string, the match_cbegroup flag is set. When this is the case, add the current
651 subject pointer to the chain of such remembered pointers, to be checked when we
652 hit the closing ket, in order to break infinite loops that match no characters.
653 When match() is called in other circumstances, don't add to the chain. The
654 match_cbegroup flag must NOT be used with tail recursion, because the memory
655 block that is used is on the stack, so a new one may be required for each
658 if ((flags & match_cbegroup) != 0)
660 newptrb.epb_saved_eptr = eptr;
661 newptrb.epb_prev = eptrb;
665 /* Now start processing the opcodes. */
669 minimize = possessive = FALSE;
675 RRETURN(MATCH_NOMATCH);
678 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
679 ims, eptrb, flags, RM51);
680 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
681 RRETURN(MATCH_PRUNE);
684 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
685 ims, eptrb, flags, RM52);
686 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
687 RRETURN(MATCH_COMMIT);
690 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
691 ims, eptrb, flags, RM53);
692 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
693 md->start_match_ptr = eptr; /* Pass back current position */
697 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
698 ims, eptrb, flags, RM54);
699 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
702 /* Handle a capturing bracket. If there is space in the offset vector, save
703 the current subject position in the working slot at the top of the vector.
704 We mustn't change the current values of the data slot, because they may be
705 set from a previous iteration of this group, and be referred to by a
706 reference inside the group.
708 If the bracket fails to match, we need to restore this value and also the
709 values of the final offsets, in case they were set by a previous iteration
712 If there isn't enough space in the offset vector, treat this as if it were
713 a non-capturing bracket. Don't worry about setting the flag for the error
714 case here; that is handled in the code for KET. */
718 number = GET2(ecode, 1+LINK_SIZE);
719 offset = number << 1;
722 printf("start bracket %d\n", number);
724 pchars(eptr, 16, TRUE, md);
728 if (offset < md->offset_max)
730 save_offset1 = md->offset_vector[offset];
731 save_offset2 = md->offset_vector[offset+1];
732 save_offset3 = md->offset_vector[md->offset_end - number];
733 save_capture_last = md->capture_last;
735 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
736 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
738 flags = (op == OP_SCBRA)? match_cbegroup : 0;
741 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
742 ims, eptrb, flags, RM1);
743 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
744 md->capture_last = save_capture_last;
745 ecode += GET(ecode, 1);
747 while (*ecode == OP_ALT);
749 DPRINTF(("bracket %d failed\n", number));
751 md->offset_vector[offset] = save_offset1;
752 md->offset_vector[offset+1] = save_offset2;
753 md->offset_vector[md->offset_end - number] = save_offset3;
755 RRETURN(MATCH_NOMATCH);
758 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
759 as a non-capturing bracket. */
761 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
762 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
764 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
766 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
767 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
769 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
770 final alternative within the brackets, we would return the result of a
771 recursive call to match() whatever happened. We can reduce stack usage by
772 turning this into a tail recursion, except in the case when match_cbegroup
777 DPRINTF(("start non-capturing bracket\n"));
778 flags = (op >= OP_SBRA)? match_cbegroup : 0;
781 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
783 if (flags == 0) /* Not a possibly empty group */
785 ecode += _pcre_OP_lengths[*ecode];
786 DPRINTF(("bracket 0 tail recursion\n"));
790 /* Possibly empty group; can't use tail recursion. */
792 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
797 /* For non-final alternatives, continue the loop for a NOMATCH result;
800 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
802 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
803 ecode += GET(ecode, 1);
805 /* Control never reaches here. */
807 /* Conditional group: compilation checked that there are no more than
808 two branches. If the condition is false, skipping the first branch takes us
809 past the end if there is only one branch, but that's OK because that is
810 exactly what going to the ket would do. As there is only one branch to be
811 obeyed, we can use tail recursion to avoid using another stack frame. */
815 codelink= GET(ecode, 1);
817 /* Because of the way auto-callout works during compile, a callout item is
818 inserted between OP_COND and an assertion condition. */
820 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
822 if (pcre_callout != NULL)
824 pcre_callout_block cb;
825 cb.version = 1; /* Version 1 of the callout block */
826 cb.callout_number = ecode[LINK_SIZE+2];
827 cb.offset_vector = md->offset_vector;
828 cb.subject = (PCRE_SPTR)md->start_subject;
829 cb.subject_length = md->end_subject - md->start_subject;
830 cb.start_match = mstart - md->start_subject;
831 cb.current_position = eptr - md->start_subject;
832 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
833 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
834 cb.capture_top = offset_top/2;
835 cb.capture_last = md->capture_last;
836 cb.callout_data = md->callout_data;
837 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
838 if (rrc < 0) RRETURN(rrc);
840 ecode += _pcre_OP_lengths[OP_CALLOUT];
843 condcode = ecode[LINK_SIZE+1];
845 /* Now see what the actual condition is */
847 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
849 if (md->recursive == NULL) /* Not recursing => FALSE */
852 ecode += GET(ecode, 1);
856 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
857 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
859 /* If the test is for recursion into a specific subpattern, and it is
860 false, but the test was set up by name, scan the table to see if the
861 name refers to any other numbers, and test them. The condition is true
862 if any one is set. */
864 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
866 uschar *slotA = md->name_table;
867 for (i = 0; i < md->name_count; i++)
869 if (GET2(slotA, 0) == recno) break;
870 slotA += md->name_entry_size;
873 /* Found a name for the number - there can be only one; duplicate
874 names for different numbers are allowed, but not vice versa. First
875 scan down for duplicates. */
877 if (i < md->name_count)
879 uschar *slotB = slotA;
880 while (slotB > md->name_table)
882 slotB -= md->name_entry_size;
883 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
885 condition = GET2(slotB, 0) == md->recursive->group_num;
886 if (condition) break;
891 /* Scan up for duplicates */
896 for (i++; i < md->name_count; i++)
898 slotB += md->name_entry_size;
899 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
901 condition = GET2(slotB, 0) == md->recursive->group_num;
902 if (condition) break;
910 /* Choose branch according to the condition */
912 ecode += condition? 3 : GET(ecode, 1);
916 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
918 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
919 condition = offset < offset_top && md->offset_vector[offset] >= 0;
921 /* If the numbered capture is unset, but the reference was by name,
922 scan the table to see if the name refers to any other numbers, and test
923 them. The condition is true if any one is set. This is tediously similar
924 to the code above, but not close enough to try to amalgamate. */
926 if (!condition && condcode == OP_NCREF)
928 int refno = offset >> 1;
929 uschar *slotA = md->name_table;
931 for (i = 0; i < md->name_count; i++)
933 if (GET2(slotA, 0) == refno) break;
934 slotA += md->name_entry_size;
937 /* Found a name for the number - there can be only one; duplicate names
938 for different numbers are allowed, but not vice versa. First scan down
941 if (i < md->name_count)
943 uschar *slotB = slotA;
944 while (slotB > md->name_table)
946 slotB -= md->name_entry_size;
947 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
949 offset = GET2(slotB, 0) << 1;
950 condition = offset < offset_top &&
951 md->offset_vector[offset] >= 0;
952 if (condition) break;
957 /* Scan up for duplicates */
962 for (i++; i < md->name_count; i++)
964 slotB += md->name_entry_size;
965 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
967 offset = GET2(slotB, 0) << 1;
968 condition = offset < offset_top &&
969 md->offset_vector[offset] >= 0;
970 if (condition) break;
978 /* Choose branch according to the condition */
980 ecode += condition? 3 : GET(ecode, 1);
983 else if (condcode == OP_DEF) /* DEFINE - always false */
986 ecode += GET(ecode, 1);
989 /* The condition is an assertion. Call match() to evaluate it - setting
990 the final argument match_condassert causes it to stop at the end of an
995 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
996 match_condassert, RM3);
997 if (rrc == MATCH_MATCH)
1000 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1001 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1003 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1005 RRETURN(rrc); /* Need braces because of following else */
1014 /* We are now at the branch that is to be obeyed. As there is only one,
1015 we can use tail recursion to avoid using another stack frame, except when
1016 match_cbegroup is required for an unlimited repeat of a possibly empty
1017 group. If the second alternative doesn't exist, we can just plough on. */
1019 if (condition || *ecode == OP_ALT)
1021 ecode += 1 + LINK_SIZE;
1022 if (op == OP_SCOND) /* Possibly empty group */
1024 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1027 else /* Group must match something */
1033 else /* Condition false & no alternative */
1035 ecode += 1 + LINK_SIZE;
1040 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1041 to close any currently open capturing brackets. */
1044 number = GET2(ecode, 1);
1045 offset = number << 1;
1048 printf("end bracket %d at *ACCEPT", number);
1052 md->capture_last = number;
1053 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1055 md->offset_vector[offset] =
1056 md->offset_vector[md->offset_end - number];
1057 md->offset_vector[offset+1] = eptr - md->start_subject;
1058 if (offset_top <= offset) offset_top = offset + 2;
1064 /* End of the pattern, either real or forced. If we are in a top-level
1065 recursion, we should restore the offsets appropriately and continue from
1070 if (md->recursive != NULL && md->recursive->group_num == 0)
1072 recursion_info *rec = md->recursive;
1073 DPRINTF(("End of pattern in a (?0) recursion\n"));
1074 md->recursive = rec->prevrec;
1075 memmove(md->offset_vector, rec->offset_save,
1076 rec->saved_max * sizeof(int));
1077 offset_top = rec->save_offset_top;
1079 ecode = rec->after_call;
1083 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1084 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1085 the subject. In both cases, backtracking will then try other alternatives,
1088 if (eptr == mstart &&
1090 (md->notempty_atstart &&
1091 mstart == md->start_subject + md->start_offset)))
1092 RRETURN(MATCH_NOMATCH);
1094 /* Otherwise, we have a match. */
1096 md->end_match_ptr = eptr; /* Record where we ended */
1097 md->end_offset_top = offset_top; /* and how many extracts were taken */
1098 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1099 RRETURN(MATCH_MATCH);
1101 /* Change option settings */
1106 DPRINTF(("ims set to %02lx\n", ims));
1109 /* Assertion brackets. Check the alternative branches in turn - the
1110 matching won't pass the KET for an assertion. If any one branch matches,
1111 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1112 start of each branch to move the current point backwards, so the code at
1113 this level is identical to the lookahead case. */
1119 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1121 if (rrc == MATCH_MATCH)
1123 mstart = md->start_match_ptr; /* In case \K reset it */
1126 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1127 ecode += GET(ecode, 1);
1129 while (*ecode == OP_ALT);
1130 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1132 /* If checking an assertion for a condition, return MATCH_MATCH. */
1134 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1136 /* Continue from after the assertion, updating the offsets high water
1137 mark, since extracts may have been taken during the assertion. */
1139 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1140 ecode += 1 + LINK_SIZE;
1141 offset_top = md->end_offset_top;
1144 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1145 PRUNE, or COMMIT means we must assume failure without checking subsequent
1149 case OP_ASSERTBACK_NOT:
1152 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1154 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
1155 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1157 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1160 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1161 ecode += GET(ecode,1);
1163 while (*ecode == OP_ALT);
1165 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1167 ecode += 1 + LINK_SIZE;
1170 /* Move the subject pointer back. This occurs only at the start of
1171 each branch of a lookbehind assertion. If we are too close to the start to
1172 move back, this match function fails. When working with UTF-8 we move
1173 back a number of characters, not bytes. */
1183 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1190 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1193 eptr -= GET(ecode, 1);
1194 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1197 /* Save the earliest consulted character, then skip to next op code */
1199 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1200 ecode += 1 + LINK_SIZE;
1203 /* The callout item calls an external function, if one is provided, passing
1204 details of the match so far. This is mainly for debugging, though the
1205 function is able to force a failure. */
1208 if (pcre_callout != NULL)
1210 pcre_callout_block cb;
1211 cb.version = 1; /* Version 1 of the callout block */
1212 cb.callout_number = ecode[1];
1213 cb.offset_vector = md->offset_vector;
1214 cb.subject = (PCRE_SPTR)md->start_subject;
1215 cb.subject_length = md->end_subject - md->start_subject;
1216 cb.start_match = mstart - md->start_subject;
1217 cb.current_position = eptr - md->start_subject;
1218 cb.pattern_position = GET(ecode, 2);
1219 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1220 cb.capture_top = offset_top/2;
1221 cb.capture_last = md->capture_last;
1222 cb.callout_data = md->callout_data;
1223 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1224 if (rrc < 0) RRETURN(rrc);
1226 ecode += 2 + 2*LINK_SIZE;
1229 /* Recursion either matches the current regex, or some subexpression. The
1230 offset data is the offset to the starting bracket from the start of the
1231 whole pattern. (This is so that it works from duplicated subpatterns.)
1233 If there are any capturing brackets started but not finished, we have to
1234 save their starting points and reinstate them after the recursion. However,
1235 we don't know how many such there are (offset_top records the completed
1236 total) so we just have to save all the potential data. There may be up to
1237 65535 such values, which is too large to put on the stack, but using malloc
1238 for small numbers seems expensive. As a compromise, the stack is used when
1239 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1240 is used. If the malloc fails, the offsets cannot be saved, so the recursion
1241 is not attempted: the code below gives up at that point and returns
1242 PCRE_ERROR_NOMEMORY via RRETURN.
1244 There are also other values that have to be saved. We use a chained
1245 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1246 for the original version of this logic. */
1250 callpat = md->start_code + GET(ecode, 1);
1251 new_recursive.group_num = (callpat == md->start_code)? 0 :
1252 GET2(callpat, 1 + LINK_SIZE);
1254 /* Add to "recursing stack" */
1256 new_recursive.prevrec = md->recursive;
1257 md->recursive = &new_recursive;
1259 /* Find where to continue from afterwards */
1261 ecode += 1 + LINK_SIZE;
1262 new_recursive.after_call = ecode;
1264 /* Now save the offset data. */
1266 new_recursive.saved_max = md->offset_end;
1267 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1268 new_recursive.offset_save = stacksave;
1271 new_recursive.offset_save =
1272 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1273 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1276 memcpy(new_recursive.offset_save, md->offset_vector,
1277 new_recursive.saved_max * sizeof(int));
1278 new_recursive.save_offset_top = offset_top;
1280 /* OK, now we can do the recursion. For each top-level alternative we
1281 restore the offset and recursion data. */
1283 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1284 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1287 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1288 md, ims, eptrb, flags, RM6);
1289 if (rrc == MATCH_MATCH)
1291 DPRINTF(("Recursion matched\n"));
1292 md->recursive = new_recursive.prevrec;
1293 if (new_recursive.offset_save != stacksave)
1294 (pcre_free)(new_recursive.offset_save);
1295 RRETURN(MATCH_MATCH);
1297 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1299 DPRINTF(("Recursion gave error %d\n", rrc));
1300 if (new_recursive.offset_save != stacksave)
1301 (pcre_free)(new_recursive.offset_save);
1305 md->recursive = &new_recursive;
1306 memcpy(md->offset_vector, new_recursive.offset_save,
1307 new_recursive.saved_max * sizeof(int));
1308 callpat += GET(callpat, 1);
1310 while (*callpat == OP_ALT);
1312 DPRINTF(("Recursion didn't match\n"));
1313 md->recursive = new_recursive.prevrec;
1314 if (new_recursive.offset_save != stacksave)
1315 (pcre_free)(new_recursive.offset_save);
1316 RRETURN(MATCH_NOMATCH);
1318 /* Control never reaches here */
1320 /* "Once" brackets are like assertion brackets except that after a match,
1321 the point in the subject string is not moved back. Thus there can never be
1322 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1323 Check the alternative branches in turn - the matching won't pass the KET
1324 for this kind of subpattern. If any one branch matches, we carry on as at
1325 the end of a normal bracket, leaving the subject pointer, but resetting
1326 the start-of-match value in case it was changed by \K. */
1334 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1335 if (rrc == MATCH_MATCH)
1337 mstart = md->start_match_ptr;
1340 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1341 ecode += GET(ecode,1);
1343 while (*ecode == OP_ALT);
1345 /* If we hit the end of the group (which could be repeated), fail */
1347 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1349 /* Continue as from after the assertion, updating the offsets high water
1350 mark, since extracts may have been taken. */
1352 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1354 offset_top = md->end_offset_top;
1355 eptr = md->end_match_ptr;
1357 /* For a non-repeating ket, just continue at this level. This also
1358 happens for a repeating ket if no characters were matched in the group.
1359 This is the forcible breaking of infinite loops as implemented in Perl
1360 5.005. If there is an options reset, it will get obeyed in the normal
1361 course of events. */
1363 if (*ecode == OP_KET || eptr == saved_eptr)
1365 ecode += 1+LINK_SIZE;
1369 /* The repeating kets try the rest of the pattern or restart from the
1370 preceding bracket, in the appropriate order. The second "call" of match()
1371 uses tail recursion, to avoid using another stack frame. We need to reset
1372 any options that changed within the bracket before re-running it, so
1373 check the next opcode. */
1375 if (ecode[1+LINK_SIZE] == OP_OPT)
1377 ims = (ims & ~PCRE_IMS) | ecode[4];
1378 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1381 if (*ecode == OP_KETRMIN)
1383 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1384 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1389 else /* OP_KETRMAX */
1391 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1392 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1393 ecode += 1 + LINK_SIZE;
1397 /* Control never gets here */
1399 /* An alternation is the end of a branch; scan along to find the end of the
1400 bracketed group and go to there. */
1403 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1406 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1407 indicating that it may occur zero times. It may repeat infinitely, or not
1408 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1409 with fixed upper repeat limits are compiled as a number of copies, with the
1410 optional ones preceded by BRAZERO or BRAMINZERO. */
1415 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1416 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1417 do next += GET(next,1); while (*next == OP_ALT);
1418 ecode = next + 1 + LINK_SIZE;
1425 do next += GET(next, 1); while (*next == OP_ALT);
1426 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1427 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1435 do next += GET(next,1); while (*next == OP_ALT);
1436 ecode = next + 1 + LINK_SIZE;
1440 /* End of a group, repeated or non-repeating. */
1445 prev = ecode - GET(ecode, 1);
1447 /* If this was a group that remembered the subject start, in order to break
1448 infinite repeats of empty string matches, retrieve the subject start from
1449 the chain. Otherwise, set it NULL. */
1451 if (*prev >= OP_SBRA)
1453 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1454 eptrb = eptrb->epb_prev; /* Backup to previous group */
1456 else saved_eptr = NULL;
1458 /* If we are at the end of an assertion group or an atomic group, stop
1459 matching and return MATCH_MATCH, but record the current high water mark for
1460 use by positive assertions. We also need to record the match start in case
1461 it was changed by \K. */
1463 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1464 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1467 md->end_match_ptr = eptr; /* For ONCE */
1468 md->end_offset_top = offset_top;
1469 md->start_match_ptr = mstart;
1470 RRETURN(MATCH_MATCH);
1473 /* For capturing groups we have to check the group number back at the start
1474 and if necessary complete handling an extraction by setting the offsets and
1475 bumping the high water mark. Note that whole-pattern recursion is coded as
1476 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1477 when the OP_END is reached. Other recursion is handled here. */
1479 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1481 number = GET2(prev, 1+LINK_SIZE);
1482 offset = number << 1;
1485 printf("end bracket %d", number);
1489 md->capture_last = number;
1490 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1492 md->offset_vector[offset] =
1493 md->offset_vector[md->offset_end - number];
1494 md->offset_vector[offset+1] = eptr - md->start_subject;
1495 if (offset_top <= offset) offset_top = offset + 2;
1498 /* Handle a recursively called group. Restore the offsets
1499 appropriately and continue from after the call. */
1501 if (md->recursive != NULL && md->recursive->group_num == number)
1503 recursion_info *rec = md->recursive;
1504 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1505 md->recursive = rec->prevrec;
1506 memcpy(md->offset_vector, rec->offset_save,
1507 rec->saved_max * sizeof(int));
1508 offset_top = rec->save_offset_top;
1509 ecode = rec->after_call;
1515 /* For both capturing and non-capturing groups, reset the value of the ims
1516 flags, in case they got changed during the group. */
1519 DPRINTF(("ims reset to %02lx\n", ims));
1521 /* For a non-repeating ket, just continue at this level. This also
1522 happens for a repeating ket if no characters were matched in the group.
1523 This is the forcible breaking of infinite loops as implemented in Perl
1524 5.005. If there is an options reset, it will get obeyed in the normal
1525 course of events. */
1527 if (*ecode == OP_KET || eptr == saved_eptr)
1529 ecode += 1 + LINK_SIZE;
1533 /* The repeating kets try the rest of the pattern or restart from the
1534 preceding bracket, in the appropriate order. In the second case, we can use
1535 tail recursion to avoid using another stack frame, unless we have an
1536 unlimited repeat of a group that can match an empty string. */
1538 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1540 if (*ecode == OP_KETRMIN)
1542 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1543 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1544 if (flags != 0) /* Could match an empty string */
1546 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1552 else /* OP_KETRMAX */
1554 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1555 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1556 ecode += 1 + LINK_SIZE;
1560 /* Control never gets here */
1562 /* Start of subject unless notbol, or after internal newline if multiline */
1565 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1566 if ((ims & PCRE_MULTILINE) != 0)
1568 if (eptr != md->start_subject &&
1569 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1570 RRETURN(MATCH_NOMATCH);
1574 /* ... else fall through */
1576 /* Start of subject assertion */
1579 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1583 /* Start of match assertion */
1586 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1590 /* Reset the start of match point */
1597 /* Assert before internal newline if multiline, or before a terminating
1598 newline unless endonly is set, else end of subject unless noteol is set. */
1601 if ((ims & PCRE_MULTILINE) != 0)
1603 if (eptr < md->end_subject)
1604 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1606 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1612 if (md->noteol) RRETURN(MATCH_NOMATCH);
1615 if (eptr != md->end_subject &&
1616 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1617 RRETURN(MATCH_NOMATCH);
1622 /* ... else fall through for endonly */
1624 /* End of subject assertion (\z) */
1627 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1631 /* End of subject or ending \n assertion (\Z) */
1634 if (eptr != md->end_subject &&
1635 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1636 RRETURN(MATCH_NOMATCH);
1640 /* Word boundary assertions */
1642 case OP_NOT_WORD_BOUNDARY:
1643 case OP_WORD_BOUNDARY:
1646 /* Find out if the previous and current characters are "word" characters.
1647 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1648 be "non-word" characters. Remember the earliest consulted character for
1649 partial matching. */
1654 if (eptr == md->start_subject) prev_is_word = FALSE; else
1656 USPTR lastptr = eptr - 1;
1657 while((*lastptr & 0xc0) == 0x80) lastptr--;
1658 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1659 GETCHAR(c, lastptr);
1660 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1662 if (eptr >= md->end_subject)
1665 cur_is_word = FALSE;
1670 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1676 /* Not in UTF-8 mode */
1679 if (eptr == md->start_subject) prev_is_word = FALSE; else
1681 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1682 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1684 if (eptr >= md->end_subject)
1687 cur_is_word = FALSE;
1689 else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1692 /* Now see if the situation is what we want */
1694 if ((*ecode++ == OP_WORD_BOUNDARY)?
1695 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1696 RRETURN(MATCH_NOMATCH);
1700 /* Match a single character type; inline for speed */
1703 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1707 if (eptr++ >= md->end_subject)
1710 RRETURN(MATCH_NOMATCH);
1712 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1716 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1717 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1720 if (eptr++ >= md->end_subject)
1723 RRETURN(MATCH_NOMATCH);
1729 if (eptr >= md->end_subject)
1732 RRETURN(MATCH_NOMATCH);
1734 GETCHARINCTEST(c, eptr);
1739 (md->ctypes[c] & ctype_digit) != 0
1741 RRETURN(MATCH_NOMATCH);
1746 if (eptr >= md->end_subject)
1749 RRETURN(MATCH_NOMATCH);
1751 GETCHARINCTEST(c, eptr);
1756 (md->ctypes[c] & ctype_digit) == 0
1758 RRETURN(MATCH_NOMATCH);
1762 case OP_NOT_WHITESPACE:
1763 if (eptr >= md->end_subject)
1766 RRETURN(MATCH_NOMATCH);
1768 GETCHARINCTEST(c, eptr);
1773 (md->ctypes[c] & ctype_space) != 0
1775 RRETURN(MATCH_NOMATCH);
1780 if (eptr >= md->end_subject)
1783 RRETURN(MATCH_NOMATCH);
1785 GETCHARINCTEST(c, eptr);
1790 (md->ctypes[c] & ctype_space) == 0
1792 RRETURN(MATCH_NOMATCH);
1796 case OP_NOT_WORDCHAR:
1797 if (eptr >= md->end_subject)
1800 RRETURN(MATCH_NOMATCH);
1802 GETCHARINCTEST(c, eptr);
1807 (md->ctypes[c] & ctype_word) != 0
1809 RRETURN(MATCH_NOMATCH);
1814 if (eptr >= md->end_subject)
1817 RRETURN(MATCH_NOMATCH);
1819 GETCHARINCTEST(c, eptr);
1824 (md->ctypes[c] & ctype_word) == 0
1826 RRETURN(MATCH_NOMATCH);
1831 if (eptr >= md->end_subject)
1834 RRETURN(MATCH_NOMATCH);
1836 GETCHARINCTEST(c, eptr);
1839 default: RRETURN(MATCH_NOMATCH);
1841 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1852 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1859 if (eptr >= md->end_subject)
1862 RRETURN(MATCH_NOMATCH);
1864 GETCHARINCTEST(c, eptr);
1869 case 0x20: /* SPACE */
1870 case 0xa0: /* NBSP */
1871 case 0x1680: /* OGHAM SPACE MARK */
1872 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1873 case 0x2000: /* EN QUAD */
1874 case 0x2001: /* EM QUAD */
1875 case 0x2002: /* EN SPACE */
1876 case 0x2003: /* EM SPACE */
1877 case 0x2004: /* THREE-PER-EM SPACE */
1878 case 0x2005: /* FOUR-PER-EM SPACE */
1879 case 0x2006: /* SIX-PER-EM SPACE */
1880 case 0x2007: /* FIGURE SPACE */
1881 case 0x2008: /* PUNCTUATION SPACE */
1882 case 0x2009: /* THIN SPACE */
1883 case 0x200A: /* HAIR SPACE */
1884 case 0x202f: /* NARROW NO-BREAK SPACE */
1885 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1886 case 0x3000: /* IDEOGRAPHIC SPACE */
1887 RRETURN(MATCH_NOMATCH);
1893 if (eptr >= md->end_subject)
1896 RRETURN(MATCH_NOMATCH);
1898 GETCHARINCTEST(c, eptr);
1901 default: RRETURN(MATCH_NOMATCH);
1903 case 0x20: /* SPACE */
1904 case 0xa0: /* NBSP */
1905 case 0x1680: /* OGHAM SPACE MARK */
1906 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1907 case 0x2000: /* EN QUAD */
1908 case 0x2001: /* EM QUAD */
1909 case 0x2002: /* EN SPACE */
1910 case 0x2003: /* EM SPACE */
1911 case 0x2004: /* THREE-PER-EM SPACE */
1912 case 0x2005: /* FOUR-PER-EM SPACE */
1913 case 0x2006: /* SIX-PER-EM SPACE */
1914 case 0x2007: /* FIGURE SPACE */
1915 case 0x2008: /* PUNCTUATION SPACE */
1916 case 0x2009: /* THIN SPACE */
1917 case 0x200A: /* HAIR SPACE */
1918 case 0x202f: /* NARROW NO-BREAK SPACE */
1919 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1920 case 0x3000: /* IDEOGRAPHIC SPACE */
1927 if (eptr >= md->end_subject)
1930 RRETURN(MATCH_NOMATCH);
1932 GETCHARINCTEST(c, eptr);
1940 case 0x85: /* NEL */
1941 case 0x2028: /* LINE SEPARATOR */
1942 case 0x2029: /* PARAGRAPH SEPARATOR */
1943 RRETURN(MATCH_NOMATCH);
1949 if (eptr >= md->end_subject)
1952 RRETURN(MATCH_NOMATCH);
1954 GETCHARINCTEST(c, eptr);
1957 default: RRETURN(MATCH_NOMATCH);
1962 case 0x85: /* NEL */
1963 case 0x2028: /* LINE SEPARATOR */
1964 case 0x2029: /* PARAGRAPH SEPARATOR */
1971 /* Check the next character by Unicode property. We will get here only
1972 if the support is in the binary; otherwise a compile-time error occurs. */
1976 if (eptr >= md->end_subject)
1979 RRETURN(MATCH_NOMATCH);
1981 GETCHARINCTEST(c, eptr);
1983 int chartype = UCD_CHARTYPE(c);
1987 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1991 if ((chartype == ucp_Lu ||
1992 chartype == ucp_Ll ||
1993 chartype == ucp_Lt) == (op == OP_NOTPROP))
1994 RRETURN(MATCH_NOMATCH);
1998 if ((ecode[2] != _pcre_ucp_gentype[chartype]) == (op == OP_PROP))
1999 RRETURN(MATCH_NOMATCH);
2003 if ((ecode[2] != chartype) == (op == OP_PROP))
2004 RRETURN(MATCH_NOMATCH);
2008 if ((ecode[2] != UCD_SCRIPT(c)) == (op == OP_PROP))
2009 RRETURN(MATCH_NOMATCH);
2013 RRETURN(PCRE_ERROR_INTERNAL);
2020 /* Match an extended Unicode sequence. We will get here only if the support
2021 is in the binary; otherwise a compile-time error occurs. */
2024 if (eptr >= md->end_subject)
2027 RRETURN(MATCH_NOMATCH);
2029 GETCHARINCTEST(c, eptr);
2031 int category = UCD_CATEGORY(c);
2032 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
2033 while (eptr < md->end_subject)
2036 if (!utf8) c = *eptr; else
2038 GETCHARLEN(c, eptr, len);
2040 category = UCD_CATEGORY(c);
2041 if (category != ucp_M) break;
2050 /* Match a back reference, possibly repeatedly. Look past the end of the
2051 item to see if there is repeat information following. The code is similar
2052 to that for character classes, but repeated for efficiency. Then obey
2053 similar code to character type repeats - written out again for speed.
2054 However, if the referenced string is the empty string, always treat
2055 it as matched, any number of times (otherwise there could be infinite
2060 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2063 /* If the reference is unset, there are two possibilities:
2065 (a) In the default, Perl-compatible state, set the length to be longer
2066 than the amount of subject left; this ensures that every attempt at a
2067 match fails. We can't just fail here, because of the possibility of
2068 quantifiers with zero minima.
2070 (b) If the JavaScript compatibility flag is set, set the length to zero
2071 so that the back reference matches an empty string.
2073 Otherwise, set the length to the length of what was matched by the
2074 referenced subpattern. */
2076 if (offset >= offset_top || md->offset_vector[offset] < 0)
2077 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
2079 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2081 /* Set up for repetition, or handle the non-repeated case */
2091 c = *ecode++ - OP_CRSTAR;
2092 minimize = (c & 1) != 0;
2093 min = rep_min[c]; /* Pick up values from tables; */
2094 max = rep_max[c]; /* zero for max => infinity */
2095 if (max == 0) max = INT_MAX;
2100 minimize = (*ecode == OP_CRMINRANGE);
2101 min = GET2(ecode, 1);
2102 max = GET2(ecode, 3);
2103 if (max == 0) max = INT_MAX;
2107 default: /* No repeat follows */
2108 if (!match_ref(offset, eptr, length, md, ims))
2111 RRETURN(MATCH_NOMATCH);
2114 continue; /* With the main loop */
2117 /* If the length of the reference is zero, just continue with the
2120 if (length == 0) continue;
2122 /* First, ensure the minimum number of matches are present. We get back
2123 the length of the reference string explicitly rather than passing the
2124 address of eptr, so that eptr can be a register variable. */
2126 for (i = 1; i <= min; i++)
2128 if (!match_ref(offset, eptr, length, md, ims))
2131 RRETURN(MATCH_NOMATCH);
2136 /* If min = max, continue at the same level without recursion.
2137 They are not both allowed to be zero. */
2139 if (min == max) continue;
2141 /* If minimizing, keep trying and advancing the pointer */
2145 for (fi = min;; fi++)
2147 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2148 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2149 if (fi >= max) RRETURN(MATCH_NOMATCH);
2150 if (!match_ref(offset, eptr, length, md, ims))
2153 RRETURN(MATCH_NOMATCH);
2157 /* Control never gets here */
2160 /* If maximizing, find the longest string and work backwards */
2165 for (i = min; i < max; i++)
2167 if (!match_ref(offset, eptr, length, md, ims))
2176 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2177 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2180 RRETURN(MATCH_NOMATCH);
2183 /* Control never gets here */
2185 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2186 used when all the characters in the class have values in the range 0-255,
2187 and either the matching is caseful, or the characters are in the range
2188 0-127 when UTF-8 processing is enabled. The only difference between
2189 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2192 First, look past the end of the item to see if there is repeat information
2193 following. Then obey similar code to character type repeats - written out
2199 data = ecode + 1; /* Save for matching */
2200 ecode += 33; /* Advance past the item */
2210 c = *ecode++ - OP_CRSTAR;
2211 minimize = (c & 1) != 0;
2212 min = rep_min[c]; /* Pick up values from tables; */
2213 max = rep_max[c]; /* zero for max => infinity */
2214 if (max == 0) max = INT_MAX;
2219 minimize = (*ecode == OP_CRMINRANGE);
2220 min = GET2(ecode, 1);
2221 max = GET2(ecode, 3);
2222 if (max == 0) max = INT_MAX;
2226 default: /* No repeat follows */
2231 /* First, ensure the minimum number of matches are present. */
2237 for (i = 1; i <= min; i++)
2239 if (eptr >= md->end_subject)
2242 RRETURN(MATCH_NOMATCH);
2244 GETCHARINC(c, eptr);
2247 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2251 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2257 /* Not UTF-8 mode */
2259 for (i = 1; i <= min; i++)
2261 if (eptr >= md->end_subject)
2264 RRETURN(MATCH_NOMATCH);
2267 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2271 /* If max == min we can continue with the main loop without the
2274 if (min == max) continue;
2276 /* If minimizing, keep testing the rest of the expression and advancing
2277 the pointer while it matches the class. */
2285 for (fi = min;; fi++)
2287 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2288 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2289 if (fi >= max) RRETURN(MATCH_NOMATCH);
2290 if (eptr >= md->end_subject)
2293 RRETURN(MATCH_NOMATCH);
2295 GETCHARINC(c, eptr);
2298 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2302 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2308 /* Not UTF-8 mode */
2310 for (fi = min;; fi++)
2312 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2313 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2314 if (fi >= max) RRETURN(MATCH_NOMATCH);
2315 if (eptr >= md->end_subject)
2318 RRETURN(MATCH_NOMATCH);
2321 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2324 /* Control never gets here */
2327 /* If maximizing, find the longest possible run, then work backwards. */
2337 for (i = min; i < max; i++)
2340 if (eptr >= md->end_subject)
2345 GETCHARLEN(c, eptr, len);
2348 if (op == OP_CLASS) break;
2352 if ((data[c/8] & (1 << (c&7))) == 0) break;
2358 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2359 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2360 if (eptr-- == pp) break; /* Stop if tried at original pos */
2366 /* Not UTF-8 mode */
2368 for (i = min; i < max; i++)
2370 if (eptr >= md->end_subject)
2376 if ((data[c/8] & (1 << (c&7))) == 0) break;
2381 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2382 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2387 RRETURN(MATCH_NOMATCH);
2390 /* Control never gets here */
2393 /* Match an extended character class. This opcode is encountered only
2394 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2395 mode, because Unicode properties are supported in non-UTF-8 mode. */
2400 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2401 ecode += GET(ecode, 1); /* Advance past the item */
2411 c = *ecode++ - OP_CRSTAR;
2412 minimize = (c & 1) != 0;
2413 min = rep_min[c]; /* Pick up values from tables; */
2414 max = rep_max[c]; /* zero for max => infinity */
2415 if (max == 0) max = INT_MAX;
2420 minimize = (*ecode == OP_CRMINRANGE);
2421 min = GET2(ecode, 1);
2422 max = GET2(ecode, 3);
2423 if (max == 0) max = INT_MAX;
2427 default: /* No repeat follows */
2432 /* First, ensure the minimum number of matches are present. */
2434 for (i = 1; i <= min; i++)
2436 if (eptr >= md->end_subject)
2439 RRETURN(MATCH_NOMATCH);
2441 GETCHARINCTEST(c, eptr);
2442 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2445 /* If max == min we can continue with the main loop without the
2448 if (min == max) continue;
2450 /* If minimizing, keep testing the rest of the expression and advancing
2451 the pointer while it matches the class. */
2455 for (fi = min;; fi++)
2457 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2458 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2459 if (fi >= max) RRETURN(MATCH_NOMATCH);
2460 if (eptr >= md->end_subject)
2463 RRETURN(MATCH_NOMATCH);
2465 GETCHARINCTEST(c, eptr);
2466 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2468 /* Control never gets here */
2471 /* If maximizing, find the longest possible run, then work backwards. */
2476 for (i = min; i < max; i++)
2479 if (eptr >= md->end_subject)
2484 GETCHARLENTEST(c, eptr, len);
2485 if (!_pcre_xclass(c, data)) break;
2490 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2491 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2492 if (eptr-- == pp) break; /* Stop if tried at original pos */
2493 if (utf8) BACKCHAR(eptr);
2495 RRETURN(MATCH_NOMATCH);
2498 /* Control never gets here */
2500 #endif /* End of XCLASS */
2502 /* Match a single character, casefully */
2510 GETCHARLEN(fc, ecode, length);
2511 if (length > md->end_subject - eptr)
2513 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2514 RRETURN(MATCH_NOMATCH);
2516 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2521 /* Non-UTF-8 mode */
2523 if (md->end_subject - eptr < 1)
2525 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2526 RRETURN(MATCH_NOMATCH);
2528 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2533 /* Match a single character, caselessly */
2541 GETCHARLEN(fc, ecode, length);
2543 if (length > md->end_subject - eptr)
2545 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2546 RRETURN(MATCH_NOMATCH);
2549 /* If the pattern character's value is < 128, we have only one byte, and
2550 can use the fast lookup table. */
2554 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2557 /* Otherwise we must pick up the subject character */
2562 GETCHARINC(dc, eptr);
2565 /* If we have Unicode property support, we can use it to test the other
2566 case of the character, if there is one. */
2571 if (dc != UCD_OTHERCASE(fc))
2573 RRETURN(MATCH_NOMATCH);
2578 #endif /* SUPPORT_UTF8 */
2580 /* Non-UTF-8 mode */
2582 if (md->end_subject - eptr < 1)
2584 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2585 RRETURN(MATCH_NOMATCH);
2587 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2592 /* Match a single character repeatedly. */
2595 min = max = GET2(ecode, 1);
2606 max = GET2(ecode, 1);
2607 minimize = *ecode == OP_MINUPTO;
2638 c = *ecode++ - OP_STAR;
2639 minimize = (c & 1) != 0;
2641 min = rep_min[c]; /* Pick up values from tables; */
2642 max = rep_max[c]; /* zero for max => infinity */
2643 if (max == 0) max = INT_MAX;
2645 /* Common code for all repeated single-character matches. */
2653 GETCHARLEN(fc, ecode, length);
2656 /* Handle multibyte character matching specially here. There is
2657 support for caseless matching if UCP support is present. */
2662 unsigned int othercase;
2663 if ((ims & PCRE_CASELESS) != 0 &&
2664 (othercase = UCD_OTHERCASE(fc)) != fc)
2665 oclength = _pcre_ord2utf8(othercase, occhars);
2667 #endif /* SUPPORT_UCP */
2669 for (i = 1; i <= min; i++)
2671 if (eptr <= md->end_subject - length &&
2672 memcmp(eptr, charptr, length) == 0) eptr += length;
2674 else if (oclength > 0 &&
2675 eptr <= md->end_subject - oclength &&
2676 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2677 #endif /* SUPPORT_UCP */
2681 RRETURN(MATCH_NOMATCH);
2685 if (min == max) continue;
2689 for (fi = min;; fi++)
2691 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2692 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2693 if (fi >= max) RRETURN(MATCH_NOMATCH);
2694 if (eptr <= md->end_subject - length &&
2695 memcmp(eptr, charptr, length) == 0) eptr += length;
2697 else if (oclength > 0 &&
2698 eptr <= md->end_subject - oclength &&
2699 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2700 #endif /* SUPPORT_UCP */
2704 RRETURN(MATCH_NOMATCH);
2707 /* Control never gets here */
2713 for (i = min; i < max; i++)
2715 if (eptr <= md->end_subject - length &&
2716 memcmp(eptr, charptr, length) == 0) eptr += length;
2718 else if (oclength > 0 &&
2719 eptr <= md->end_subject - oclength &&
2720 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2721 #endif /* SUPPORT_UCP */
2729 if (possessive) continue;
2733 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2734 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2735 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
2739 #else /* without SUPPORT_UCP */
2741 #endif /* SUPPORT_UCP */
2744 /* Control never gets here */
2747 /* If the length of a UTF-8 character is 1, we fall through here, and
2748 obey the code as for non-UTF-8 characters below, though in this case the
2749 value of fc will always be < 128. */
2752 #endif /* SUPPORT_UTF8 */
2754 /* When not in UTF-8 mode, load a single-byte character. */
2758 /* The value of fc at this point is always less than 256, though we may or
2759 may not be in UTF-8 mode. The code is duplicated for the caseless and
2760 caseful cases, for speed, since matching characters is likely to be quite
2761 common. First, ensure the minimum number of matches are present. If min =
2762 max, continue at the same level without recursing. Otherwise, if
2763 minimizing, keep trying the rest of the expression and advancing one
2764 matching character if failing, up to the maximum. Alternatively, if
2765 maximizing, find the maximum number of characters and work backwards. */
2767 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2770 if ((ims & PCRE_CASELESS) != 0)
2773 for (i = 1; i <= min; i++)
2775 if (eptr >= md->end_subject)
2778 RRETURN(MATCH_NOMATCH);
2780 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2782 if (min == max) continue;
2785 for (fi = min;; fi++)
2787 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2788 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2789 if (fi >= max) RRETURN(MATCH_NOMATCH);
2790 if (eptr >= md->end_subject)
2793 RRETURN(MATCH_NOMATCH);
2795 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2797 /* Control never gets here */
2802 for (i = min; i < max; i++)
2804 if (eptr >= md->end_subject)
2809 if (fc != md->lcc[*eptr]) break;
2813 if (possessive) continue;
2817 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2819 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2821 RRETURN(MATCH_NOMATCH);
2823 /* Control never gets here */
2826 /* Caseful comparisons (includes all multi-byte characters) */
2830 for (i = 1; i <= min; i++)
2832 if (eptr >= md->end_subject)
2835 RRETURN(MATCH_NOMATCH);
2837 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2840 if (min == max) continue;
2844 for (fi = min;; fi++)
2846 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2847 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2848 if (fi >= max) RRETURN(MATCH_NOMATCH);
2849 if (eptr >= md->end_subject)
2852 RRETURN(MATCH_NOMATCH);
2854 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2856 /* Control never gets here */
2861 for (i = min; i < max; i++)
2863 if (eptr >= md->end_subject)
2868 if (fc != *eptr) break;
2871 if (possessive) continue;
2875 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2877 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2879 RRETURN(MATCH_NOMATCH);
2882 /* Control never gets here */
2884 /* Match a negated single one-byte character. The pattern character is a
2885 single byte, but the subject character we are checking can be multibyte. */
2888 if (eptr >= md->end_subject)
2891 RRETURN(MATCH_NOMATCH);
2894 GETCHARINCTEST(c, eptr);
2895 if ((ims & PCRE_CASELESS) != 0)
2901 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2905 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2909 /* Match a negated single one-byte character repeatedly. This is almost a
2910 repeat of the code for a repeated single character, but I haven't found a
2911 nice way of commoning these up that doesn't require a test of the
2912 positive/negative option for each character match. Maybe that wouldn't add
2913 very much to the time taken, but character matching *is* what this is all about. */
2917 min = max = GET2(ecode, 1);
2924 max = GET2(ecode, 1);
2925 minimize = *ecode == OP_NOTMINUPTO;
2943 case OP_NOTPOSQUERY:
2953 max = GET2(ecode, 1);
2962 case OP_NOTMINQUERY:
2963 c = *ecode++ - OP_NOTSTAR;
2964 minimize = (c & 1) != 0;
2965 min = rep_min[c]; /* Pick up values from tables; */
2966 max = rep_max[c]; /* zero for max => infinity */
2967 if (max == 0) max = INT_MAX;
2969 /* Common code for all repeated single-byte matches. */
2974 /* The code is duplicated for the caseless and caseful cases, for speed,
2975 since matching characters is likely to be quite common. First, ensure the
2976 minimum number of matches are present. If min = max, continue at the same
2977 level without recursing. Otherwise, if minimizing, keep trying the rest of
2978 the expression and advancing one matching character if failing, up to the
2979 maximum. Alternatively, if maximizing, find the maximum number of
2980 characters and work backwards. */
2982 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2985 if ((ims & PCRE_CASELESS) != 0)
2993 register unsigned int d;
2994 for (i = 1; i <= min; i++)
2996 if (eptr >= md->end_subject)
2999 RRETURN(MATCH_NOMATCH);
3001 GETCHARINC(d, eptr);
3002 if (d < 256) d = md->lcc[d];
3003 if (fc == d) RRETURN(MATCH_NOMATCH);
3009 /* Not UTF-8 mode */
3011 for (i = 1; i <= min; i++)
3013 if (eptr >= md->end_subject)
3016 RRETURN(MATCH_NOMATCH);
3018 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3022 if (min == max) continue;
3030 register unsigned int d;
3031 for (fi = min;; fi++)
3033 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3034 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3035 if (fi >= max) RRETURN(MATCH_NOMATCH);
3036 if (eptr >= md->end_subject)
3039 RRETURN(MATCH_NOMATCH);
3041 GETCHARINC(d, eptr);
3042 if (d < 256) d = md->lcc[d];
3043 if (fc == d) RRETURN(MATCH_NOMATCH);
3048 /* Not UTF-8 mode */
3050 for (fi = min;; fi++)
3052 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3053 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3054 if (fi >= max) RRETURN(MATCH_NOMATCH);
3055 if (eptr >= md->end_subject)
3058 RRETURN(MATCH_NOMATCH);
3060 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3063 /* Control never gets here */
3076 register unsigned int d;
3077 for (i = min; i < max; i++)
3080 if (eptr >= md->end_subject)
3085 GETCHARLEN(d, eptr, len);
3086 if (d < 256) d = md->lcc[d];
3090 if (possessive) continue;
3093 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3094 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3095 if (eptr-- == pp) break; /* Stop if tried at original pos */
3101 /* Not UTF-8 mode */
3103 for (i = min; i < max; i++)
3105 if (eptr >= md->end_subject)
3110 if (fc == md->lcc[*eptr]) break;
3113 if (possessive) continue;
3116 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3117 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3122 RRETURN(MATCH_NOMATCH);
3124 /* Control never gets here */
3127 /* Caseful comparisons */
3135 register unsigned int d;
3136 for (i = 1; i <= min; i++)
3138 if (eptr >= md->end_subject)
3141 RRETURN(MATCH_NOMATCH);
3143 GETCHARINC(d, eptr);
3144 if (fc == d) RRETURN(MATCH_NOMATCH);
3149 /* Not UTF-8 mode */
3151 for (i = 1; i <= min; i++)
3153 if (eptr >= md->end_subject)
3156 RRETURN(MATCH_NOMATCH);
3158 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3162 if (min == max) continue;
3170 register unsigned int d;
3171 for (fi = min;; fi++)
3173 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3174 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3175 if (fi >= max) RRETURN(MATCH_NOMATCH);
3176 if (eptr >= md->end_subject)
3179 RRETURN(MATCH_NOMATCH);
3181 GETCHARINC(d, eptr);
3182 if (fc == d) RRETURN(MATCH_NOMATCH);
3187 /* Not UTF-8 mode */
3189 for (fi = min;; fi++)
3191 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3192 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3193 if (fi >= max) RRETURN(MATCH_NOMATCH);
3194 if (eptr >= md->end_subject)
3197 RRETURN(MATCH_NOMATCH);
3199 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3202 /* Control never gets here */
3215 register unsigned int d;
3216 for (i = min; i < max; i++)
3219 if (eptr >= md->end_subject)
3224 GETCHARLEN(d, eptr, len);
3228 if (possessive) continue;
3231 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3232 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3233 if (eptr-- == pp) break; /* Stop if tried at original pos */
3239 /* Not UTF-8 mode */
3241 for (i = min; i < max; i++)
3243 if (eptr >= md->end_subject)
3248 if (fc == *eptr) break;
3251 if (possessive) continue;
3254 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3255 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3260 RRETURN(MATCH_NOMATCH);
3263 /* Control never gets here */
3265 /* Match a single character type repeatedly; several different opcodes
3266 share code. This is very similar to the code for single characters, but we
3267 repeat it in the interests of efficiency. */
3270 min = max = GET2(ecode, 1);
3276 case OP_TYPEMINUPTO:
3278 max = GET2(ecode, 1);
3279 minimize = *ecode == OP_TYPEMINUPTO;
3283 case OP_TYPEPOSSTAR:
3290 case OP_TYPEPOSPLUS:
3297 case OP_TYPEPOSQUERY:
3304 case OP_TYPEPOSUPTO:
3307 max = GET2(ecode, 1);
3312 case OP_TYPEMINSTAR:
3314 case OP_TYPEMINPLUS:
3316 case OP_TYPEMINQUERY:
3317 c = *ecode++ - OP_TYPESTAR;
3318 minimize = (c & 1) != 0;
3319 min = rep_min[c]; /* Pick up values from tables; */
3320 max = rep_max[c]; /* zero for max => infinity */
3321 if (max == 0) max = INT_MAX;
3323 /* Common code for all repeated single character type matches. Note that
3324 in UTF-8 mode, '.' matches a character of any length, but for the other
3325 character types, the valid characters are all one-byte long. */
3328 ctype = *ecode++; /* Code for the character type */
3331 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3333 prop_fail_result = ctype == OP_NOTPROP;
3334 prop_type = *ecode++;
3335 prop_value = *ecode++;
3337 else prop_type = -1;
3340 /* First, ensure the minimum number of matches are present. Use inline
3341 code for maximizing the speed, and do the type test once at the start
3342 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3343 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3344 and single-bytes. */
3354 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3355 for (i = 1; i <= min; i++)
3357 if (eptr >= md->end_subject)
3360 RRETURN(MATCH_NOMATCH);
3362 GETCHARINCTEST(c, eptr);
3367 for (i = 1; i <= min; i++)
3369 if (eptr >= md->end_subject)
3372 RRETURN(MATCH_NOMATCH);
3374 GETCHARINCTEST(c, eptr);
3375 prop_chartype = UCD_CHARTYPE(c);
3376 if ((prop_chartype == ucp_Lu ||
3377 prop_chartype == ucp_Ll ||
3378 prop_chartype == ucp_Lt) == prop_fail_result)
3379 RRETURN(MATCH_NOMATCH);
3384 for (i = 1; i <= min; i++)
3386 if (eptr >= md->end_subject)
3389 RRETURN(MATCH_NOMATCH);
3391 GETCHARINCTEST(c, eptr);
3392 prop_category = UCD_CATEGORY(c);
3393 if ((prop_category == prop_value) == prop_fail_result)
3394 RRETURN(MATCH_NOMATCH);
3399 for (i = 1; i <= min; i++)
3401 if (eptr >= md->end_subject)
3404 RRETURN(MATCH_NOMATCH);
3406 GETCHARINCTEST(c, eptr);
3407 prop_chartype = UCD_CHARTYPE(c);
3408 if ((prop_chartype == prop_value) == prop_fail_result)
3409 RRETURN(MATCH_NOMATCH);
3414 for (i = 1; i <= min; i++)
3416 if (eptr >= md->end_subject)
3419 RRETURN(MATCH_NOMATCH);
3421 GETCHARINCTEST(c, eptr);
3422 prop_script = UCD_SCRIPT(c);
3423 if ((prop_script == prop_value) == prop_fail_result)
3424 RRETURN(MATCH_NOMATCH);
3429 RRETURN(PCRE_ERROR_INTERNAL);
3433 /* Match extended Unicode sequences. We will get here only if the
3434 support is in the binary; otherwise a compile-time error occurs. */
3436 else if (ctype == OP_EXTUNI)
3438 for (i = 1; i <= min; i++)
3440 if (eptr >= md->end_subject)
3443 RRETURN(MATCH_NOMATCH);
3445 GETCHARINCTEST(c, eptr);
3446 prop_category = UCD_CATEGORY(c);
3447 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3448 while (eptr < md->end_subject)
3451 if (!utf8) c = *eptr;
3452 else { GETCHARLEN(c, eptr, len); }
3453 prop_category = UCD_CATEGORY(c);
3454 if (prop_category != ucp_M) break;
3461 #endif /* SUPPORT_UCP */
3463 /* Handle all other cases when the coding is UTF-8 */
3466 if (utf8) switch(ctype)
3469 for (i = 1; i <= min; i++)
3471 if (eptr >= md->end_subject)
3474 RRETURN(MATCH_NOMATCH);
3476 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3478 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3483 for (i = 1; i <= min; i++)
3485 if (eptr >= md->end_subject)
3488 RRETURN(MATCH_NOMATCH);
3491 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3496 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3501 for (i = 1; i <= min; i++)
3503 if (eptr >= md->end_subject)
3506 RRETURN(MATCH_NOMATCH);
3508 GETCHARINC(c, eptr);
3511 default: RRETURN(MATCH_NOMATCH);
3513 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3524 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3531 for (i = 1; i <= min; i++)
3533 if (eptr >= md->end_subject)
3536 RRETURN(MATCH_NOMATCH);
3538 GETCHARINC(c, eptr);
3543 case 0x20: /* SPACE */
3544 case 0xa0: /* NBSP */
3545 case 0x1680: /* OGHAM SPACE MARK */
3546 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3547 case 0x2000: /* EN QUAD */
3548 case 0x2001: /* EM QUAD */
3549 case 0x2002: /* EN SPACE */
3550 case 0x2003: /* EM SPACE */
3551 case 0x2004: /* THREE-PER-EM SPACE */
3552 case 0x2005: /* FOUR-PER-EM SPACE */
3553 case 0x2006: /* SIX-PER-EM SPACE */
3554 case 0x2007: /* FIGURE SPACE */
3555 case 0x2008: /* PUNCTUATION SPACE */
3556 case 0x2009: /* THIN SPACE */
3557 case 0x200A: /* HAIR SPACE */
3558 case 0x202f: /* NARROW NO-BREAK SPACE */
3559 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3560 case 0x3000: /* IDEOGRAPHIC SPACE */
3561 RRETURN(MATCH_NOMATCH);
3567 for (i = 1; i <= min; i++)
3569 if (eptr >= md->end_subject)
3572 RRETURN(MATCH_NOMATCH);
3574 GETCHARINC(c, eptr);
3577 default: RRETURN(MATCH_NOMATCH);
3579 case 0x20: /* SPACE */
3580 case 0xa0: /* NBSP */
3581 case 0x1680: /* OGHAM SPACE MARK */
3582 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3583 case 0x2000: /* EN QUAD */
3584 case 0x2001: /* EM QUAD */
3585 case 0x2002: /* EN SPACE */
3586 case 0x2003: /* EM SPACE */
3587 case 0x2004: /* THREE-PER-EM SPACE */
3588 case 0x2005: /* FOUR-PER-EM SPACE */
3589 case 0x2006: /* SIX-PER-EM SPACE */
3590 case 0x2007: /* FIGURE SPACE */
3591 case 0x2008: /* PUNCTUATION SPACE */
3592 case 0x2009: /* THIN SPACE */
3593 case 0x200A: /* HAIR SPACE */
3594 case 0x202f: /* NARROW NO-BREAK SPACE */
3595 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3596 case 0x3000: /* IDEOGRAPHIC SPACE */
3603 for (i = 1; i <= min; i++)
3605 if (eptr >= md->end_subject)
3608 RRETURN(MATCH_NOMATCH);
3610 GETCHARINC(c, eptr);
3618 case 0x85: /* NEL */
3619 case 0x2028: /* LINE SEPARATOR */
3620 case 0x2029: /* PARAGRAPH SEPARATOR */
3621 RRETURN(MATCH_NOMATCH);
3627 for (i = 1; i <= min; i++)
3629 if (eptr >= md->end_subject)
3632 RRETURN(MATCH_NOMATCH);
3634 GETCHARINC(c, eptr);
3637 default: RRETURN(MATCH_NOMATCH);
3642 case 0x85: /* NEL */
3643 case 0x2028: /* LINE SEPARATOR */
3644 case 0x2029: /* PARAGRAPH SEPARATOR */
3651 for (i = 1; i <= min; i++)
3653 if (eptr >= md->end_subject)
3656 RRETURN(MATCH_NOMATCH);
3658 GETCHARINC(c, eptr);
3659 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3660 RRETURN(MATCH_NOMATCH);
3665 for (i = 1; i <= min; i++)
3667 if (eptr >= md->end_subject)
3670 RRETURN(MATCH_NOMATCH);
3672 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3673 RRETURN(MATCH_NOMATCH);
3674 /* No need to skip more bytes - we know it's a 1-byte character */
3678 case OP_NOT_WHITESPACE:
3679 for (i = 1; i <= min; i++)
3681 if (eptr >= md->end_subject)
3684 RRETURN(MATCH_NOMATCH);
3686 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3687 RRETURN(MATCH_NOMATCH);
3688 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3693 for (i = 1; i <= min; i++)
3695 if (eptr >= md->end_subject)
3698 RRETURN(MATCH_NOMATCH);
3700 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3701 RRETURN(MATCH_NOMATCH);
3702 /* No need to skip more bytes - we know it's a 1-byte character */
3706 case OP_NOT_WORDCHAR:
3707 for (i = 1; i <= min; i++)
3709 if (eptr >= md->end_subject)
3712 RRETURN(MATCH_NOMATCH);
3714 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
3715 RRETURN(MATCH_NOMATCH);
3716 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3721 for (i = 1; i <= min; i++)
3723 if (eptr >= md->end_subject)
3726 RRETURN(MATCH_NOMATCH);
3728 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3729 RRETURN(MATCH_NOMATCH);
3730 /* No need to skip more bytes - we know it's a 1-byte character */
3735 RRETURN(PCRE_ERROR_INTERNAL);
3736 } /* End switch(ctype) */
3739 #endif /* SUPPORT_UTF8 */
3741 /* Code for the non-UTF-8 case for minimum matching of operators other
3742 than OP_PROP and OP_NOTPROP. */
3747 for (i = 1; i <= min; i++)
3749 if (eptr >= md->end_subject)
3752 RRETURN(MATCH_NOMATCH);
3754 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3760 if (eptr > md->end_subject - min)
3763 RRETURN(MATCH_NOMATCH);
3769 if (eptr > md->end_subject - min)
3772 RRETURN(MATCH_NOMATCH);
3778 for (i = 1; i <= min; i++)
3780 if (eptr >= md->end_subject)
3783 RRETURN(MATCH_NOMATCH);
3787 default: RRETURN(MATCH_NOMATCH);
3789 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3797 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3804 for (i = 1; i <= min; i++)
3806 if (eptr >= md->end_subject)
3809 RRETURN(MATCH_NOMATCH);
3815 case 0x20: /* SPACE */
3816 case 0xa0: /* NBSP */
3817 RRETURN(MATCH_NOMATCH);
3823 for (i = 1; i <= min; i++)
3825 if (eptr >= md->end_subject)
3828 RRETURN(MATCH_NOMATCH);
3832 default: RRETURN(MATCH_NOMATCH);
3834 case 0x20: /* SPACE */
3835 case 0xa0: /* NBSP */
3842 for (i = 1; i <= min; i++)
3844 if (eptr >= md->end_subject)
3847 RRETURN(MATCH_NOMATCH);
3856 case 0x85: /* NEL */
3857 RRETURN(MATCH_NOMATCH);
3863 for (i = 1; i <= min; i++)
3865 if (eptr >= md->end_subject)
3868 RRETURN(MATCH_NOMATCH);
3872 default: RRETURN(MATCH_NOMATCH);
3877 case 0x85: /* NEL */
3884 for (i = 1; i <= min; i++)
3886 if (eptr >= md->end_subject)
3889 RRETURN(MATCH_NOMATCH);
3891 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3896 for (i = 1; i <= min; i++)
3898 if (eptr >= md->end_subject)
3901 RRETURN(MATCH_NOMATCH);
3903 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3907 case OP_NOT_WHITESPACE:
3908 for (i = 1; i <= min; i++)
3910 if (eptr >= md->end_subject)
3913 RRETURN(MATCH_NOMATCH);
3915 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3920 for (i = 1; i <= min; i++)
3922 if (eptr >= md->end_subject)
3925 RRETURN(MATCH_NOMATCH);
3927 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3931 case OP_NOT_WORDCHAR:
3932 for (i = 1; i <= min; i++)
3934 if (eptr >= md->end_subject)
3937 RRETURN(MATCH_NOMATCH);
3939 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3940 RRETURN(MATCH_NOMATCH);
3945 for (i = 1; i <= min; i++)
3947 if (eptr >= md->end_subject)
3950 RRETURN(MATCH_NOMATCH);
3952 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3953 RRETURN(MATCH_NOMATCH);
3958 RRETURN(PCRE_ERROR_INTERNAL);
3962 /* If min = max, continue at the same level without recursing */
3964 if (min == max) continue;
3966 /* If minimizing, we have to test the rest of the pattern before each
3967 subsequent match. Again, separate the UTF-8 case for speed, and also
3968 separate the UCP cases. */
3978 for (fi = min;; fi++)
3980 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3981 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3982 if (fi >= max) RRETURN(MATCH_NOMATCH);
3983 if (eptr >= md->end_subject)
3986 RRETURN(MATCH_NOMATCH);
3988 GETCHARINC(c, eptr);
3989 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3991 /* Control never gets here */
3994 for (fi = min;; fi++)
3996 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3997 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3998 if (fi >= max) RRETURN(MATCH_NOMATCH);
3999 if (eptr >= md->end_subject)
4002 RRETURN(MATCH_NOMATCH);
4004 GETCHARINC(c, eptr);
4005 prop_chartype = UCD_CHARTYPE(c);
4006 if ((prop_chartype == ucp_Lu ||
4007 prop_chartype == ucp_Ll ||
4008 prop_chartype == ucp_Lt) == prop_fail_result)
4009 RRETURN(MATCH_NOMATCH);
4011 /* Control never gets here */
4014 for (fi = min;; fi++)
4016 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
4017 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4018 if (fi >= max) RRETURN(MATCH_NOMATCH);
4019 if (eptr >= md->end_subject)
4022 RRETURN(MATCH_NOMATCH);
4024 GETCHARINC(c, eptr);
4025 prop_category = UCD_CATEGORY(c);
4026 if ((prop_category == prop_value) == prop_fail_result)
4027 RRETURN(MATCH_NOMATCH);
4029 /* Control never gets here */
4032 for (fi = min;; fi++)
4034 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4035 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4036 if (fi >= max) RRETURN(MATCH_NOMATCH);
4037 if (eptr >= md->end_subject)
4040 RRETURN(MATCH_NOMATCH);
4042 GETCHARINC(c, eptr);
4043 prop_chartype = UCD_CHARTYPE(c);
4044 if ((prop_chartype == prop_value) == prop_fail_result)
4045 RRETURN(MATCH_NOMATCH);
4047 /* Control never gets here */
4050 for (fi = min;; fi++)
4052 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
4053 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4054 if (fi >= max) RRETURN(MATCH_NOMATCH);
4055 if (eptr >= md->end_subject)
4058 RRETURN(MATCH_NOMATCH);
4060 GETCHARINC(c, eptr);
4061 prop_script = UCD_SCRIPT(c);
4062 if ((prop_script == prop_value) == prop_fail_result)
4063 RRETURN(MATCH_NOMATCH);
4065 /* Control never gets here */
4068 RRETURN(PCRE_ERROR_INTERNAL);
4072 /* Match extended Unicode sequences. We will get here only if the
4073 support is in the binary; otherwise a compile-time error occurs. */
4075 else if (ctype == OP_EXTUNI)
4077 for (fi = min;; fi++)
4079 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
4080 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4081 if (fi >= max) RRETURN(MATCH_NOMATCH);
4082 if (eptr >= md->end_subject)
4085 RRETURN(MATCH_NOMATCH);
4087 GETCHARINCTEST(c, eptr);
4088 prop_category = UCD_CATEGORY(c);
4089 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
4090 while (eptr < md->end_subject)
4093 if (!utf8) c = *eptr;
4094 else { GETCHARLEN(c, eptr, len); }
4095 prop_category = UCD_CATEGORY(c);
4096 if (prop_category != ucp_M) break;
4103 #endif /* SUPPORT_UCP */
4109 for (fi = min;; fi++)
4111 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
4112 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4113 if (fi >= max) RRETURN(MATCH_NOMATCH);
4114 if (eptr >= md->end_subject)
4117 RRETURN(MATCH_NOMATCH);
4119 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4120 RRETURN(MATCH_NOMATCH);
4121 GETCHARINC(c, eptr);
4124 case OP_ANY: /* This is the non-NL case */
4132 default: RRETURN(MATCH_NOMATCH);
4134 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4144 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4154 case 0x20: /* SPACE */
4155 case 0xa0: /* NBSP */
4156 case 0x1680: /* OGHAM SPACE MARK */
4157 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4158 case 0x2000: /* EN QUAD */
4159 case 0x2001: /* EM QUAD */
4160 case 0x2002: /* EN SPACE */
4161 case 0x2003: /* EM SPACE */
4162 case 0x2004: /* THREE-PER-EM SPACE */
4163 case 0x2005: /* FOUR-PER-EM SPACE */
4164 case 0x2006: /* SIX-PER-EM SPACE */
4165 case 0x2007: /* FIGURE SPACE */
4166 case 0x2008: /* PUNCTUATION SPACE */
4167 case 0x2009: /* THIN SPACE */
4168 case 0x200A: /* HAIR SPACE */
4169 case 0x202f: /* NARROW NO-BREAK SPACE */
4170 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4171 case 0x3000: /* IDEOGRAPHIC SPACE */
4172 RRETURN(MATCH_NOMATCH);
4179 default: RRETURN(MATCH_NOMATCH);
4181 case 0x20: /* SPACE */
4182 case 0xa0: /* NBSP */
4183 case 0x1680: /* OGHAM SPACE MARK */
4184 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4185 case 0x2000: /* EN QUAD */
4186 case 0x2001: /* EM QUAD */
4187 case 0x2002: /* EN SPACE */
4188 case 0x2003: /* EM SPACE */
4189 case 0x2004: /* THREE-PER-EM SPACE */
4190 case 0x2005: /* FOUR-PER-EM SPACE */
4191 case 0x2006: /* SIX-PER-EM SPACE */
4192 case 0x2007: /* FIGURE SPACE */
4193 case 0x2008: /* PUNCTUATION SPACE */
4194 case 0x2009: /* THIN SPACE */
4195 case 0x200A: /* HAIR SPACE */
4196 case 0x202f: /* NARROW NO-BREAK SPACE */
4197 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4198 case 0x3000: /* IDEOGRAPHIC SPACE */
4211 case 0x85: /* NEL */
4212 case 0x2028: /* LINE SEPARATOR */
4213 case 0x2029: /* PARAGRAPH SEPARATOR */
4214 RRETURN(MATCH_NOMATCH);
4221 default: RRETURN(MATCH_NOMATCH);
4226 case 0x85: /* NEL */
4227 case 0x2028: /* LINE SEPARATOR */
4228 case 0x2029: /* PARAGRAPH SEPARATOR */
4234 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4235 RRETURN(MATCH_NOMATCH);
4239 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4240 RRETURN(MATCH_NOMATCH);
4243 case OP_NOT_WHITESPACE:
4244 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4245 RRETURN(MATCH_NOMATCH);
4249 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4250 RRETURN(MATCH_NOMATCH);
4253 case OP_NOT_WORDCHAR:
4254 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4255 RRETURN(MATCH_NOMATCH);
4259 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4260 RRETURN(MATCH_NOMATCH);
4264 RRETURN(PCRE_ERROR_INTERNAL);
4270 /* Not UTF-8 mode */
4272 for (fi = min;; fi++)
4274 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4275 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4276 if (fi >= max) RRETURN(MATCH_NOMATCH);
4277 if (eptr >= md->end_subject)
4280 RRETURN(MATCH_NOMATCH);
4282 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4283 RRETURN(MATCH_NOMATCH);
4287 case OP_ANY: /* This is the non-NL case */
4295 default: RRETURN(MATCH_NOMATCH);
4297 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4306 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4316 case 0x20: /* SPACE */
4317 case 0xa0: /* NBSP */
4318 RRETURN(MATCH_NOMATCH);
4325 default: RRETURN(MATCH_NOMATCH);
4327 case 0x20: /* SPACE */
4328 case 0xa0: /* NBSP */
4341 case 0x85: /* NEL */
4342 RRETURN(MATCH_NOMATCH);
4349 default: RRETURN(MATCH_NOMATCH);
4354 case 0x85: /* NEL */
4360 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
4364 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
4367 case OP_NOT_WHITESPACE:
4368 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
4372 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
4375 case OP_NOT_WORDCHAR:
4376 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
4380 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
4384 RRETURN(PCRE_ERROR_INTERNAL);
4388 /* Control never gets here */
4391 /* If maximizing, it is worth using inline code for speed, doing the type
4392 test once at the start (i.e. keep it out of the loop). Again, keep the
4393 UTF-8 and UCP stuff separate. */
4397 pp = eptr; /* Remember where we started */
4405 for (i = min; i < max; i++)
4408 if (eptr >= md->end_subject)
4413 GETCHARLEN(c, eptr, len);
4414 if (prop_fail_result) break;
4420 for (i = min; i < max; i++)
4423 if (eptr >= md->end_subject)
4428 GETCHARLEN(c, eptr, len);
4429 prop_chartype = UCD_CHARTYPE(c);
4430 if ((prop_chartype == ucp_Lu ||
4431 prop_chartype == ucp_Ll ||
4432 prop_chartype == ucp_Lt) == prop_fail_result)
4439 for (i = min; i < max; i++)
4442 if (eptr >= md->end_subject)
4447 GETCHARLEN(c, eptr, len);
4448 prop_category = UCD_CATEGORY(c);
4449 if ((prop_category == prop_value) == prop_fail_result)
4456 for (i = min; i < max; i++)
4459 if (eptr >= md->end_subject)
4464 GETCHARLEN(c, eptr, len);
4465 prop_chartype = UCD_CHARTYPE(c);
4466 if ((prop_chartype == prop_value) == prop_fail_result)
4473 for (i = min; i < max; i++)
4476 if (eptr >= md->end_subject)
4481 GETCHARLEN(c, eptr, len);
4482 prop_script = UCD_SCRIPT(c);
4483 if ((prop_script == prop_value) == prop_fail_result)
4490 /* eptr is now past the end of the maximum run */
4492 if (possessive) continue;
4495 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4496 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4497 if (eptr-- == pp) break; /* Stop if tried at original pos */
4498 if (utf8) BACKCHAR(eptr);
4502 /* Match extended Unicode sequences. We will get here only if the
4503 support is in the binary; otherwise a compile-time error occurs. */
4505 else if (ctype == OP_EXTUNI)
4507 for (i = min; i < max; i++)
4509 if (eptr >= md->end_subject)
4514 GETCHARINCTEST(c, eptr);
4515 prop_category = UCD_CATEGORY(c);
4516 if (prop_category == ucp_M) break;
4517 while (eptr < md->end_subject)
4520 if (!utf8) c = *eptr; else
4522 GETCHARLEN(c, eptr, len);
4524 prop_category = UCD_CATEGORY(c);
4525 if (prop_category != ucp_M) break;
4530 /* eptr is now past the end of the maximum run */
4532 if (possessive) continue;
4536 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4537 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4538 if (eptr-- == pp) break; /* Stop if tried at original pos */
4539 for (;;) /* Move back over one extended */
4542 if (!utf8) c = *eptr; else
4545 GETCHARLEN(c, eptr, len);
4547 prop_category = UCD_CATEGORY(c);
4548 if (prop_category != ucp_M) break;
4555 #endif /* SUPPORT_UCP */
4567 for (i = min; i < max; i++)
4569 if (eptr >= md->end_subject)
4574 if (IS_NEWLINE(eptr)) break;
4576 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4580 /* Handle unlimited UTF-8 repeat */
4584 for (i = min; i < max; i++)
4586 if (eptr >= md->end_subject)
4591 if (IS_NEWLINE(eptr)) break;
4593 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4601 for (i = min; i < max; i++)
4603 if (eptr >= md->end_subject)
4609 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4612 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
4615 /* The byte case is the same as non-UTF8 */
4619 if (c > (unsigned int)(md->end_subject - eptr))
4621 eptr = md->end_subject;
4628 for (i = min; i < max; i++)
4631 if (eptr >= md->end_subject)
4636 GETCHARLEN(c, eptr, len);
4639 if (++eptr >= md->end_subject) break;
4640 if (*eptr == 0x000a) eptr++;
4646 (c != 0x000b && c != 0x000c &&
4647 c != 0x0085 && c != 0x2028 && c != 0x2029)))
4656 for (i = min; i < max; i++)
4660 if (eptr >= md->end_subject)
4665 GETCHARLEN(c, eptr, len);
4668 default: gotspace = FALSE; break;
4670 case 0x20: /* SPACE */
4671 case 0xa0: /* NBSP */
4672 case 0x1680: /* OGHAM SPACE MARK */
4673 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4674 case 0x2000: /* EN QUAD */
4675 case 0x2001: /* EM QUAD */
4676 case 0x2002: /* EN SPACE */
4677 case 0x2003: /* EM SPACE */
4678 case 0x2004: /* THREE-PER-EM SPACE */
4679 case 0x2005: /* FOUR-PER-EM SPACE */
4680 case 0x2006: /* SIX-PER-EM SPACE */
4681 case 0x2007: /* FIGURE SPACE */
4682 case 0x2008: /* PUNCTUATION SPACE */
4683 case 0x2009: /* THIN SPACE */
4684 case 0x200A: /* HAIR SPACE */
4685 case 0x202f: /* NARROW NO-BREAK SPACE */
4686 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4687 case 0x3000: /* IDEOGRAPHIC SPACE */
4691 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4698 for (i = min; i < max; i++)
4702 if (eptr >= md->end_subject)
4707 GETCHARLEN(c, eptr, len);
4710 default: gotspace = FALSE; break;
4715 case 0x85: /* NEL */
4716 case 0x2028: /* LINE SEPARATOR */
4717 case 0x2029: /* PARAGRAPH SEPARATOR */
4721 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4727 for (i = min; i < max; i++)
4730 if (eptr >= md->end_subject)
4735 GETCHARLEN(c, eptr, len);
4736 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4742 for (i = min; i < max; i++)
4745 if (eptr >= md->end_subject)
4750 GETCHARLEN(c, eptr, len);
4751 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4756 case OP_NOT_WHITESPACE:
4757 for (i = min; i < max; i++)
4760 if (eptr >= md->end_subject)
4765 GETCHARLEN(c, eptr, len);
4766 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4772 for (i = min; i < max; i++)
4775 if (eptr >= md->end_subject)
4780 GETCHARLEN(c, eptr, len);
4781 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4786 case OP_NOT_WORDCHAR:
4787 for (i = min; i < max; i++)
4790 if (eptr >= md->end_subject)
4795 GETCHARLEN(c, eptr, len);
4796 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4802 for (i = min; i < max; i++)
4805 if (eptr >= md->end_subject)
4810 GETCHARLEN(c, eptr, len);
4811 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4817 RRETURN(PCRE_ERROR_INTERNAL);
4820 /* eptr is now past the end of the maximum run */
4822 if (possessive) continue;
4825 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4826 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4827 if (eptr-- == pp) break; /* Stop if tried at original pos */
4832 #endif /* SUPPORT_UTF8 */
4834 /* Not UTF-8 mode */
4839 for (i = min; i < max; i++)
4841 if (eptr >= md->end_subject)
4846 if (IS_NEWLINE(eptr)) break;
4854 if (c > (unsigned int)(md->end_subject - eptr))
4856 eptr = md->end_subject;
4863 for (i = min; i < max; i++)
4865 if (eptr >= md->end_subject)
4873 if (++eptr >= md->end_subject) break;
4874 if (*eptr == 0x000a) eptr++;
4880 (c != 0x000b && c != 0x000c && c != 0x0085)))
4888 for (i = min; i < max; i++)
4890 if (eptr >= md->end_subject)
4896 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4902 for (i = min; i < max; i++)
4904 if (eptr >= md->end_subject)
4910 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4916 for (i = min; i < max; i++)
4918 if (eptr >= md->end_subject)
4924 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4931 for (i = min; i < max; i++)
4933 if (eptr >= md->end_subject)
4939 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4946 for (i = min; i < max; i++)
4948 if (eptr >= md->end_subject)
4953 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
4959 for (i = min; i < max; i++)
4961 if (eptr >= md->end_subject)
4966 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
4971 case OP_NOT_WHITESPACE:
4972 for (i = min; i < max; i++)
4974 if (eptr >= md->end_subject)
4979 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
4985 for (i = min; i < max; i++)
4987 if (eptr >= md->end_subject)
4992 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
4997 case OP_NOT_WORDCHAR:
4998 for (i = min; i < max; i++)
5000 if (eptr >= md->end_subject)
5005 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5011 for (i = min; i < max; i++)
5013 if (eptr >= md->end_subject)
5018 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5024 RRETURN(PCRE_ERROR_INTERNAL);
5027 /* eptr is now past the end of the maximum run */
5029 if (possessive) continue;
5032 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
5034 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5038 /* Get here if we can't make it match with any permitted repetitions */
5040 RRETURN(MATCH_NOMATCH);
5042 /* Control never gets here */
5044 /* There's been some horrible disaster. Arrival here can only mean there is
5045 something seriously wrong in the code above or the OP_xxx definitions. */
5048 DPRINTF(("Unknown opcode %d\n", *ecode));
5049 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5052 /* Do not stick any code in here without much thought; it is assumed
5053 that "continue" in the code above comes out to here to repeat the main loop. */
5056 } /* End of main loop */
5057 /* Control never reaches here */
5060 /* When compiling to use the heap rather than the stack for recursive calls to
5061 match(), the RRETURN() macro jumps here. The number that is saved in
5062 frame->Xwhere indicates which label we actually want to return to. */
5065 #define LBL(val) case val: goto L_RM##val;
5067 switch (frame->Xwhere)
5069 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5070 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5071 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5072 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5075 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5076 LBL(32) LBL(34) LBL(42) LBL(46)
5078 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5079 #endif /* SUPPORT_UCP */
5080 #endif /* SUPPORT_UTF8 */
5082 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5083 return PCRE_ERROR_INTERNAL;
5086 #endif /* NO_RECURSE */
5090 /***************************************************************************
5091 ****************************************************************************
5092 RECURSION IN THE match() FUNCTION
5094 Undefine all the macros that were defined above to handle this. */
/* Each #undef below removes one of those helper macro names, so the name no
longer expands as a macro in the code that follows. */
5113 #undef new_recursive
5128 #undef save_capture_last
5138 /* These two are defined as macros in both cases */
5143 /***************************************************************************
5144 ***************************************************************************/
5148 /*************************************************
5149 * Execute a Regular Expression *
5150 *************************************************/
/* pcre_exec() is the public matching entry point. It checks its arguments,
fills in a match_data block from the compiled pattern, the optional extra_data
and the run-time options, and then calls match() at successive start positions
until match() returns something other than a plain failure or the subject is
exhausted.
NOTE(review): the line numbers embedded in this listing are not contiguous, so
some source lines (a few comment terminators and brace-only lines among them)
appear to be absent here. Confirm against the complete file before changing
any code in this function. */
5152 /* This function applies a compiled re to a subject string and picks out
5153 portions of the string if it matches. Two elements in the vector are set for
5154 each substring: the offsets to the start and end of the substring.
5157 argument_re points to the compiled expression
5158 extra_data points to extra data or is NULL
5159 subject points to the subject string
5160 length length of subject string (may contain binary zeros)
5161 start_offset where to start in the subject string
options run-time option bits (rejected below unless within PUBLIC_EXEC_OPTIONS)
5163 offsets points to a vector of ints to be filled in with offsets
5164 offsetcount the number of elements in the vector
5166 Returns: > 0 => success; value is the number of elements filled in
5167 = 0 => success, but offsets is not big enough
5168 -1 => failed to match
5169 < -1 => some kind of unexpected problem
5172 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5173 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5174 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5177 int rc, resetcount, ocount;
5178 int first_byte = -1;
5182 unsigned long int ims;
5183 BOOL using_temporary_offsets = FALSE;
5187 BOOL first_byte_caseless = FALSE;
5188 BOOL req_byte_caseless = FALSE;
5190 match_data match_block;
5191 match_data *md = &match_block;
5192 const uschar *tables;
5193 const uschar *start_bits = NULL;
5194 USPTR start_match = (USPTR)subject + start_offset;
5196 USPTR start_partial = NULL;
5197 USPTR req_byte_ptr = start_match - 1;
5199 pcre_study_data internal_study;
5200 const pcre_study_data *study;
5202 real_pcre internal_re;
5203 const real_pcre *external_re = (const real_pcre *)argument_re;
5204 const real_pcre *re = external_re;
5206 /* Plausibility checks */
5208 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5209 if (re == NULL || subject == NULL ||
5210 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5211 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5213 /* This information is for finding all the numbers associated with a given
5214 name, for condition testing. */
5216 md->name_table = (uschar *)re + re->name_table_offset;
5217 md->name_count = re->name_count;
5218 md->name_entry_size = re->name_entry_size;
5220 /* Fish out the optional data from the extra_data structure, first setting
5221 the default values. */
5224 md->match_limit = MATCH_LIMIT;
5225 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5226 md->callout_data = NULL;
5228 /* The table pointer is always in native byte order. */
5230 tables = external_re->tables;
5232 if (extra_data != NULL)
5234 register unsigned int flags = extra_data->flags;
5235 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5236 study = (const pcre_study_data *)extra_data->study_data;
5237 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5238 md->match_limit = extra_data->match_limit;
5239 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5240 md->match_limit_recursion = extra_data->match_limit_recursion;
5241 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5242 md->callout_data = extra_data->callout_data;
5243 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5246 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5247 is a feature that makes it possible to save compiled regex and re-use them
5248 in other programs later. */
5250 if (tables == NULL) tables = _pcre_default_tables;
5252 /* Check that the first field in the block is the magic number. If it is not,
5253 test for a regex that was compiled on a host of opposite endianness. If this is
5254 the case, flipped values are put in internal_re and internal_study if there was
5257 if (re->magic_number != MAGIC_NUMBER)
5259 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5260 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5261 if (study != NULL) study = &internal_study;
5264 /* Set up other data */
/* anchored honours PCRE_ANCHORED from either the compiled pattern or the
run-time options; startline and firstline are read from the compiled pattern
only. */
5266 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5267 startline = (re->flags & PCRE_STARTLINE) != 0;
5268 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5270 /* The code starts after the real_pcre block and the capture name table. */
5272 md->start_code = (const uschar *)external_re + re->name_table_offset +
5273 re->name_count * re->name_entry_size;
5275 md->start_subject = (USPTR)subject;
5276 md->start_offset = start_offset;
5277 md->end_subject = md->start_subject + length;
5278 end_subject = md->end_subject;
5280 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5281 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5282 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
/* The flags below come from the run-time options, not from the compiled
pattern. md->partial is 2 for PCRE_PARTIAL_HARD, 1 for PCRE_PARTIAL_SOFT and 0
when neither bit is set. */
5284 md->notbol = (options & PCRE_NOTBOL) != 0;
5285 md->noteol = (options & PCRE_NOTEOL) != 0;
5286 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5287 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5288 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5289 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5292 md->recursive = NULL; /* No recursion at top level */
5294 md->lcc = tables + lcc_offset;
5295 md->ctypes = tables + ctypes_offset;
5297 /* Handle different \R options. */
5299 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5302 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5303 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5306 md->bsr_anycrlf = TRUE;
5308 md->bsr_anycrlf = FALSE;
5312 case PCRE_BSR_ANYCRLF:
5313 md->bsr_anycrlf = TRUE;
5316 case PCRE_BSR_UNICODE:
5317 md->bsr_anycrlf = FALSE;
5320 default: return PCRE_ERROR_BADNEWLINE;
5323 /* Handle different types of newline. The three bits give eight cases. If
5324 nothing is set at run time, whatever was used at compile time applies. */
5326 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5327 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5329 case 0: newline = NEWLINE; break; /* Compile-time default */
5330 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5331 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5332 case PCRE_NEWLINE_CR+
5333 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5334 case PCRE_NEWLINE_ANY: newline = -1; break;
5335 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5336 default: return PCRE_ERROR_BADNEWLINE;
/* Translate the newline code chosen above into md->nltype. The negative codes
select NLTYPE_ANYCRLF or NLTYPE_ANY; otherwise the type is NLTYPE_FIXED and the
one or two newline bytes are stored in md->nl. */
5341 md->nltype = NLTYPE_ANYCRLF;
5343 else if (newline < 0)
5345 md->nltype = NLTYPE_ANY;
5349 md->nltype = NLTYPE_FIXED;
5353 md->nl[0] = (newline >> 8) & 255;
5354 md->nl[1] = newline & 255;
5359 md->nl[0] = newline;
5363 /* Partial matching was originally supported only for a restricted set of
5364 regexes; from release 8.00 there are no restrictions, but the bits are still
5365 defined (though never set). So there's no harm in leaving this code. */
5367 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5368 return PCRE_ERROR_BADPARTIAL;
5370 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5371 back the character offset. */
5374 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5376 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
5377 return PCRE_ERROR_BADUTF8;
5378 if (start_offset > 0 && start_offset < length)
5380 int tb = ((USPTR)subject)[start_offset];
5384 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5390 /* The ims options can vary during the matching as a result of the presence
5391 of (?ims) items in the pattern. They are kept in a local variable so that
5392 restoring at the exit of a group is easy. */
5394 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5396 /* If the expression has got more back references than the offsets supplied can
5397 hold, we get a temporary chunk of working store to use during the matching.
5398 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5401 ocount = offsetcount - (offsetcount % 3);
5403 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5405 ocount = re->top_backref * 3 + 3;
5406 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5407 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5408 using_temporary_offsets = TRUE;
5409 DPRINTF(("Got memory to hold back references\n"));
5411 else md->offset_vector = offsets;
5413 md->offset_end = ocount;
5414 md->offset_max = (2*ocount)/3;
5415 md->offset_overflow = FALSE;
5416 md->capture_last = -1;
5418 /* Compute the minimum number of offsets that we need to reset each time. Doing
5419 this makes a huge difference to execution time when there aren't many brackets
5422 resetcount = 2 + re->top_bracket * 2;
5423 if (resetcount > offsetcount) resetcount = ocount;
5425 /* Reset the working variable associated with each extraction. These should
5426 never be used unless previously set, but they get saved and restored, and so we
5427 initialize them to avoid reading uninitialized locations. */
5429 if (md->offset_vector != NULL)
/* Walk backwards from the end of the vector (offset_vector + ocount), writing
-1 into each slot until iend is passed. */
5431 register int *iptr = md->offset_vector + ocount;
5432 register int *iend = iptr - resetcount/2 + 1;
5433 while (--iptr >= iend) *iptr = -1;
5436 /* Set up the first character to match, if available. The first_byte value is
5437 never set for an anchored regular expression, but the anchoring may be forced
5438 at run time, so we have to test for anchoring. The first char may be unset for
5439 an unanchored pattern, of course. If there's no first char and the pattern was
5440 studied, there may be a bitmap of possible first characters. */
5444 if ((re->flags & PCRE_FIRSTSET) != 0)
5446 first_byte = re->first_byte & 255;
/* For a caseless first byte the value is mapped through md->lcc once here, so
the scan further down can compare it with md->lcc[*start_match]. */
5447 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5448 first_byte = md->lcc[first_byte];
5451 if (!startline && study != NULL &&
5452 (study->flags & PCRE_STUDY_MAPPED) != 0)
5453 start_bits = study->start_bits;
5456 /* For anchored or unanchored matches, there may be a "last known required
5459 if ((re->flags & PCRE_REQCHSET) != 0)
5461 req_byte = re->req_byte & 255;
5462 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5463 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5467 /* ==========================================================================*/
5469 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5470 the loop runs just once. */
5474 USPTR save_end_subject = end_subject;
5475 USPTR new_start_match;
5477 /* Reset the maximum number of extractions we might see. */
5479 if (md->offset_vector != NULL)
5481 register int *iptr = md->offset_vector;
5482 register int *iend = iptr + resetcount;
5483 while (iptr < iend) *iptr++ = -1;
5486 /* If firstline is TRUE, the start of the match is constrained to the first
5487 line of a multiline string. That is, the match must be before or at the first
5488 newline. Implement this by temporarily adjusting end_subject so that we stop
5489 scanning at a newline. If the match fails at the newline, later code breaks
5494 USPTR t = start_match;
5498 while (t < md->end_subject && !IS_NEWLINE(t))
5501 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5506 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5510 /* There are some optimizations that avoid running the match if a known
5511 starting point is not found, or if a known later character is not present.
5512 However, there is an option that disables these, for testing and for ensuring
5513 that all callouts do actually occur. */
5515 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5517 /* Advance to a unique first byte if there is one. */
5519 if (first_byte >= 0)
5521 if (first_byte_caseless)
5522 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5525 while (start_match < end_subject && *start_match != first_byte)
5529 /* Or to just after a linebreak for a multiline match */
5533 if (start_match > md->start_subject + start_offset)
5538 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5541 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5547 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5550 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5551 and we are now at a LF, advance the match position by one more character.
5554 if (start_match[-1] == CHAR_CR &&
5555 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5556 start_match < end_subject &&
5557 *start_match == CHAR_NL)
5562 /* Or to a non-unique first byte after study */
5564 else if (start_bits != NULL)
5566 while (start_match < end_subject)
/* start_bits is indexed as a bitmap by byte value: element c/8, bit c&7. A
clear bit means this byte is not a possible first character, so step past
it. */
5568 register unsigned int c = *start_match;
5569 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
5573 } /* Starting optimizations */
5575 /* Restore fudged end_subject */
5577 end_subject = save_end_subject;
5579 /* The following two optimizations are disabled for partial matching or if
5580 disabling is explicitly requested. */
5582 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
5584 /* If the pattern was studied, a minimum subject length may be set. This is
5585 a lower bound; no actual string of that length may actually match the
5586 pattern. Although the value is, strictly, in characters, we treat it as
5587 bytes to avoid spending too much time in this optimization. */
5589 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
5590 (pcre_uint32)(end_subject - start_match) < study->minlength)
5596 /* If req_byte is set, we know that that character must appear in the
5597 subject for the match to succeed. If the first character is set, req_byte
5598 must be later in the subject; otherwise the test starts at the match point.
5599 This optimization can save a huge amount of backtracking in patterns with
5600 nested unlimited repeats that aren't going to match. Writing separate code
5601 for cased/caseless versions makes it go faster, as does using an
5602 autoincrement and backing off on a match.
5604 HOWEVER: when the subject string is very, very long, searching to its end
5605 can take a long time, and give bad performance on quite ordinary patterns.
5606 This showed up when somebody was matching something like /^\d+C/ on a
5607 32-megabyte string... so we don't do this when the string is sufficiently
5610 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
5612 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
5614 /* We don't need to repeat the search if we haven't yet reached the
5615 place we found it at last time. */
5617 if (p > req_byte_ptr)
5619 if (req_byte_caseless)
5621 while (p < end_subject)
5623 register int pp = *p++;
5624 if (pp == req_byte || pp == req_byte2) { p--; break; }
5629 while (p < end_subject)
5631 if (*p++ == req_byte) { p--; break; }
5635 /* If we can't find the required character, break the matching loop,
5636 forcing a match failure. */
5638 if (p >= end_subject)
5644 /* If we have found the required character, save the point where we
5645 found it, so that we don't search again next time round the loop if
5646 the start hasn't passed this character yet. */
5653 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
5654 printf(">>>> Match against: ");
5655 pchars(start_match, end_subject - start_match, TRUE, md);
5659 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
5660 first starting point for which a partial match was found. */
5662 md->start_match_ptr = start_match;
5663 md->start_used_ptr = start_match;
5664 md->match_call_count = 0;
5665 rc = match(start_match, md->start_code, start_match, NULL, 2, md, ims, NULL,
5667 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
5671 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
5672 exactly like PRUNE. */
5677 new_start_match = start_match + 1;
/* Bytes of the form 10xxxxxx are skipped so that the next start position is
not in the middle of a multibyte character (presumably only in UTF-8 mode; the
guarding test is not visible in this listing). */
5680 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
5685 /* SKIP passes back the next starting point explicitly. */
5688 new_start_match = md->start_match_ptr;
5691 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
5697 /* Any other return is either a match, or some kind of error. */
5703 /* Control reaches here for the various types of "no match at this point"
5704 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
5708 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
5709 newline in the subject (though it may continue over the newline). Therefore,
5710 if we have just failed to match, starting at a newline, do not continue. */
5712 if (firstline && IS_NEWLINE(start_match)) break;
5714 /* Advance to new matching position */
5716 start_match = new_start_match;
5718 /* Break the loop if the pattern is anchored or if we have passed the end of
5721 if (anchored || start_match > end_subject) break;
5723 /* If we have just passed a CR and we are now at a LF, and the pattern does
5724 not contain any explicit matches for \r or \n, and the newline option is CRLF
5725 or ANY or ANYCRLF, advance the match position by one more character. */
5727 if (start_match[-1] == CHAR_CR &&
5728 start_match < end_subject &&
5729 *start_match == CHAR_NL &&
5730 (re->flags & PCRE_HASCRORLF) == 0 &&
5731 (md->nltype == NLTYPE_ANY ||
5732 md->nltype == NLTYPE_ANYCRLF ||
5736 } /* End of for(;;) "bumpalong" loop */
5738 /* ==========================================================================*/
5740 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
5743 (1) The pattern is anchored or the match was failed by (*COMMIT);
5745 (2) We are past the end of the subject;
5747 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
5748 this option requests that a match occur at or before the first newline in
5751 When we have a match and the offset vector is big enough to deal with any
5752 backreferences, captured substring offsets will already be set up. In the case
5753 where we had to get some local store to hold offsets for backreference
5754 processing, copy those that we can. In this case there need not be overflow if
5755 certain parts of the pattern were not used, even though there are more
5756 capturing parentheses than vector slots. */
5760 if (rc == MATCH_MATCH)
5762 if (using_temporary_offsets)
/* The copy starts at element 2 because offsets[0] and offsets[1] are written
separately further down from start_match_ptr and end_match_ptr. */
5764 if (offsetcount >= 4)
5766 memcpy(offsets + 2, md->offset_vector + 2,
5767 (offsetcount - 2) * sizeof(int));
5768 DPRINTF(("Copied offsets from temporary memory\n"));
5770 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
5771 DPRINTF(("Freeing temporary memory\n"));
5772 (pcre_free)(md->offset_vector);
5775 /* Set the return code to the number of captured strings, or 0 if there are
5776 too many to fit into the vector. */
5778 rc = md->offset_overflow? 0 : md->end_offset_top/2;
5780 /* If there is space, set up the whole thing as substring 0. The value of
5781 md->start_match_ptr might be modified if \K was encountered on the success
5784 if (offsetcount < 2) rc = 0; else
5786 offsets[0] = md->start_match_ptr - md->start_subject;
5787 offsets[1] = md->end_match_ptr - md->start_subject;
5790 DPRINTF((">>>> returning %d\n", rc));
5794 /* Control gets here if there has been an error, or if the overall match
5795 attempt has failed at all permitted starting positions. */
5797 if (using_temporary_offsets)
5799 DPRINTF(("Freeing temporary memory\n"));
5800 (pcre_free)(md->offset_vector);
5803 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
5805 DPRINTF((">>>> error: returning %d\n", rc));
/* No complete match was found, but start_partial was recorded when an earlier
attempt set md->hitend. Report a partial match running from start_partial to
end_subject, provided the caller supplied at least two offset slots. */
5808 else if (start_partial != NULL)
5810 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5811 if (offsetcount > 1)
5813 offsets[0] = start_partial - (USPTR)subject;
5814 offsets[1] = end_subject - (USPTR)subject;
5816 return PCRE_ERROR_PARTIAL;
5820 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5821 return PCRE_ERROR_NOMATCH;
5825 /* End of pcre_exec.c */