below for why this module is different).
Written by Philip Hazel
- Copyright (c) 1997-2010 University of Cambridge
+ Copyright (c) 1997-2012 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
-----------------------------------------------------------------------------
*/
-
/* This module contains the external function pcre_dfa_exec(), which is an
alternative matching function that uses a sort of DFA algorithm (not a true
-FSM). This is NOT Perl- compatible, but it has advantages in certain
+FSM). This is NOT Perl-compatible, but it has advantages in certain
applications. */
/* This table identifies those opcodes that are followed immediately by a
-character that is to be tested in some way. This makes is possible to
+character that is to be tested in some way. This makes it possible to
centralize the loading of these characters. In the case of Type * etc, the
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
small value. Non-zero values in the table are the offsets from the opcode where
the character is to be found. ***NOTE*** If the start of this table is
modified, the three tables that follow must also be modified. */
-static const uschar coptable[] = {
+static const pcre_uint8 coptable[] = {
0, /* End */
0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
0, 0, /* \P, \p */
0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
0, /* \X */
- 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
+ 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
1, /* Char */
- 1, /* Charnc */
+ 1, /* Chari */
1, /* not */
+ 1, /* noti */
/* Positive single-char repeats */
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
- 3, 3, 3, /* upto, minupto, exact */
- 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
+ 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
+ 1+IMM2_SIZE, /* exact */
+ 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
+ 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
+ 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
+ 1+IMM2_SIZE, /* exact I */
+ 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
/* Negative single-char repeats - only for chars < 256 */
1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
- 3, 3, 3, /* NOT upto, minupto, exact */
- 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
+ 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
+ 1+IMM2_SIZE, /* NOT exact */
+ 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
+ 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
+ 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
+ 1+IMM2_SIZE, /* NOT exact I */
+ 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
/* Positive type repeats */
1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
- 3, 3, 3, /* Type upto, minupto, exact */
- 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
+ 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
+ 1+IMM2_SIZE, /* Type exact */
+ 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
/* Character class & ref repeats */
0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
0, 0, /* CRRANGE, CRMINRANGE */
0, /* NCLASS */
0, /* XCLASS - variable length */
0, /* REF */
+ 0, /* REFI */
0, /* RECURSE */
0, /* CALLOUT */
0, /* Alt */
0, /* Ket */
0, /* KetRmax */
0, /* KetRmin */
+ 0, /* KetRpos */
+ 0, /* Reverse */
0, /* Assert */
0, /* Assert not */
0, /* Assert behind */
0, /* Assert behind not */
- 0, /* Reverse */
- 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
- 0, 0, 0, /* SBRA, SCBRA, SCOND */
+ 0, 0, /* ONCE, ONCE_NC */
+ 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
+ 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
0, 0, /* CREF, NCREF */
0, 0, /* RREF, NRREF */
0, /* DEF */
- 0, 0, /* BRAZERO, BRAMINZERO */
- 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
- 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
+ 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
+ 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
+ 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
+ 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
+ 0, 0 /* CLOSE, SKIPZERO */
};
/* This table identifies those opcodes that inspect a character. It is used to
remember the fact that a character could have been inspected when the end of
the subject is reached. ***NOTE*** If the start of this table is modified, the
two tables that follow must also be modified. */
-static const uschar poptable[] = {
+static const pcre_uint8 poptable[] = {
0, /* End */
0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
1, 1, /* \P, \p */
1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
1, /* \X */
- 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
+ 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
1, /* Char */
- 1, /* Charnc */
+ 1, /* Chari */
1, /* not */
+ 1, /* noti */
/* Positive single-char repeats */
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
1, 1, 1, /* upto, minupto, exact */
1, 1, 1, 1, /* *+, ++, ?+, upto+ */
+ 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
+ 1, 1, 1, /* upto I, minupto I, exact I */
+ 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
/* Negative single-char repeats - only for chars < 256 */
1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
1, 1, 1, /* NOT upto, minupto, exact */
1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
+ 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
+ 1, 1, 1, /* NOT upto I, minupto I, exact I */
+ 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
/* Positive type repeats */
1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
1, 1, 1, /* Type upto, minupto, exact */
1, /* NCLASS */
1, /* XCLASS - variable length */
0, /* REF */
+ 0, /* REFI */
0, /* RECURSE */
0, /* CALLOUT */
0, /* Alt */
0, /* Ket */
0, /* KetRmax */
0, /* KetRmin */
+ 0, /* KetRpos */
+ 0, /* Reverse */
0, /* Assert */
0, /* Assert not */
0, /* Assert behind */
0, /* Assert behind not */
- 0, /* Reverse */
- 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
- 0, 0, 0, /* SBRA, SCBRA, SCOND */
+ 0, 0, /* ONCE, ONCE_NC */
+ 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
+ 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
0, 0, /* CREF, NCREF */
0, 0, /* RREF, NRREF */
0, /* DEF */
- 0, 0, /* BRAZERO, BRAMINZERO */
- 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
- 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
+ 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
+ 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
+ 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
+ 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
+ 0, 0 /* CLOSE, SKIPZERO */
};
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
and \w */
-static const uschar toptable1[] = {
+static const pcre_uint8 toptable1[] = {
0, 0, 0, 0, 0, 0,
ctype_digit, ctype_digit,
ctype_space, ctype_space,
0, 0 /* OP_ANY, OP_ALLANY */
};
-static const uschar toptable2[] = {
+static const pcre_uint8 toptable2[] = {
0, 0, 0, 0, 0, 0,
ctype_digit, 0,
ctype_space, 0,
typedef struct stateblock {
int offset; /* Offset to opcode */
int count; /* Count for repeats */
- int ims; /* ims flag bits */
int data; /* Some use extra data */
} stateblock;
-#define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
+#define INTS_PER_STATEBLOCK (int)(sizeof(stateblock)/sizeof(int))
#ifdef PCRE_DEBUG
*/
static void
-pchars(unsigned char *p, int length, FILE *f)
+pchars(const pcre_uchar *p, int length, FILE *f)
{
int c;
while (length-- > 0)
offsetcount size of same
workspace vector of workspace
wscount size of same
- ims the current ims flags
rlevel function call recursion level
- recursing regex recursive call level
Returns: > 0 => number of match offset pairs placed in offsets
= 0 => offsets overflowed; longest matches are present
{ \
next_active_state->offset = (x); \
next_active_state->count = (y); \
- next_active_state->ims = ims; \
next_active_state++; \
DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
} \
{ \
next_active_state->offset = (x); \
next_active_state->count = (y); \
- next_active_state->ims = ims; \
next_active_state->data = (z); \
next_active_state++; \
DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
{ \
next_new_state->offset = (x); \
next_new_state->count = (y); \
- next_new_state->ims = ims; \
next_new_state++; \
DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
} \
{ \
next_new_state->offset = (x); \
next_new_state->count = (y); \
- next_new_state->ims = ims; \
next_new_state->data = (z); \
next_new_state++; \
- DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
+ DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
+ (x), (y), (z), __LINE__)); \
} \
else return PCRE_ERROR_DFA_WSSIZE
static int
internal_dfa_exec(
dfa_match_data *md,
- const uschar *this_start_code,
- const uschar *current_subject,
+ const pcre_uchar *this_start_code,
+ const pcre_uchar *current_subject,
int start_offset,
int *offsets,
int offsetcount,
int *workspace,
int wscount,
- int ims,
- int rlevel,
- int recursing)
+ int rlevel)
{
stateblock *active_states, *new_states, *temp_states;
stateblock *next_active_state, *next_new_state;
-const uschar *ctypes, *lcc, *fcc;
-const uschar *ptr;
-const uschar *end_code, *first_op;
+const pcre_uint8 *ctypes, *lcc, *fcc;
+const pcre_uchar *ptr;
+const pcre_uchar *end_code, *first_op;
+
+dfa_recursion_info new_recursive;
int active_count, new_count, match_count;
/* Some fields in the md block are frequently referenced, so we load them into
independent variables in the hope that this will perform better. */
-const uschar *start_subject = md->start_subject;
-const uschar *end_subject = md->end_subject;
-const uschar *start_code = md->start_code;
+const pcre_uchar *start_subject = md->start_subject;
+const pcre_uchar *end_subject = md->end_subject;
+const pcre_uchar *start_code = md->start_code;
-#ifdef SUPPORT_UTF8
-BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
+#ifdef SUPPORT_UTF
+BOOL utf = (md->poptions & PCRE_UTF8) != 0;
#else
-BOOL utf8 = FALSE;
+BOOL utf = FALSE;
#endif
+BOOL reset_could_continue = FALSE;
+
rlevel++;
offsetcount &= (-2);
(2 * INTS_PER_STATEBLOCK);
DPRINTF(("\n%.*s---------------------\n"
- "%.*sCall to internal_dfa_exec f=%d r=%d\n",
- rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
+ "%.*sCall to internal_dfa_exec f=%d\n",
+ rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
ctypes = md->tables + ctypes_offset;
lcc = md->tables + lcc_offset;
new_count = 0;
first_op = this_start_code + 1 + LINK_SIZE +
- ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
+ ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
+ *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
+ ? IMM2_SIZE:0);
/* The first thing in any (sub) pattern is a bracket of some sort. Push all
the alternative states onto the list, and find out where the end is. This
/* If we can't go back the amount required for the longest lookbehind
pattern, go back as far as we can; some alternatives may still be viable. */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
/* In character mode we have to step back character by character */
- if (utf8)
+ if (utf)
{
for (gone_back = 0; gone_back < max_back; gone_back++)
{
if (current_subject <= start_subject) break;
current_subject--;
- while (current_subject > start_subject &&
- (*current_subject & 0xc0) == 0x80)
- current_subject--;
+ ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
}
}
else
{
gone_back = (current_subject - max_back < start_subject)?
- current_subject - start_subject : max_back;
+ (int)(current_subject - start_subject) : max_back;
current_subject -= gone_back;
}
int back = GET(end_code, 2+LINK_SIZE);
if (back <= gone_back)
{
- int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
+ int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
ADD_NEW_DATA(-bstate, 0, gone_back - back);
}
end_code += GET(end_code, 1);
else
{
int length = 1 + LINK_SIZE +
- ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
+ ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
+ *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
+ ? IMM2_SIZE:0);
do
{
- ADD_NEW(end_code - start_code + length, 0);
+ ADD_NEW((int)(end_code - start_code + length), 0);
end_code += GET(end_code, 1);
length = 1 + LINK_SIZE;
}
workspace[0] = 0; /* Bit indicating which vector is current */
-DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
+DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
/* Loop for scanning the subject */
int clen, dlen;
unsigned int c, d;
int forced_fail = 0;
- BOOL could_continue = FALSE;
+ BOOL partial_newline = FALSE;
+ BOOL could_continue = reset_could_continue;
+ reset_could_continue = FALSE;
/* Make the new state list into the active state list and empty the
new state list. */
#ifdef PCRE_DEBUG
printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
- pchars((uschar *)ptr, strlen((char *)ptr), stdout);
+ pchars(ptr, STRLEN_UC(ptr), stdout);
printf("\"\n");
printf("%.*sActive states: ", rlevel*2-2, SP);
if (ptr < end_subject)
{
- clen = 1; /* Number of bytes in the character */
-#ifdef SUPPORT_UTF8
- if (utf8) { GETCHARLEN(c, ptr, clen); } else
-#endif /* SUPPORT_UTF8 */
+ clen = 1; /* Number of data items in the character */
+#ifdef SUPPORT_UTF
+ if (utf) { GETCHARLEN(c, ptr, clen); } else
+#endif /* SUPPORT_UTF */
c = *ptr;
}
else
for (i = 0; i < active_count; i++)
{
stateblock *current_state = active_states + i;
- const uschar *code;
+ BOOL caseless = FALSE;
+ const pcre_uchar *code;
int state_offset = current_state->offset;
int count, codevalue, rrc;
else printf("0x%02x\n", c);
#endif
- /* This variable is referred to implicity in the ADD_xxx macros. */
-
- ims = current_state->ims;
-
/* A negative offset is a special case meaning "hold off going to this
(negated) state until the number of characters in the data field have
- been skipped". */
+ been skipped". If the could_continue flag was passed over from a previous
+ state, arrange for it to be passed on. */
if (state_offset < 0)
{
DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
ADD_NEW_DATA(state_offset, current_state->count,
current_state->data - 1);
+ if (could_continue) reset_could_continue = TRUE;
continue;
}
else
permitted.
We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
- argument that is not a data character - but is always one byte long. We
- have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
- this case. To keep the other cases fast, convert these ones to new opcodes.
- */
+ argument that is not a data character - but is always one byte long because
+ the values are small. We have to take special action to deal with \P, \p,
+ \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
+ these ones to new opcodes. */
if (coptable[codevalue] > 0)
{
dlen = 1;
-#ifdef SUPPORT_UTF8
- if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
-#endif /* SUPPORT_UTF8 */
+#ifdef SUPPORT_UTF
+ if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
+#endif /* SUPPORT_UTF */
d = code[coptable[codevalue]];
if (codevalue >= OP_TYPESTAR)
{
/* ========================================================================== */
/* Reached a closing bracket. If not at the end of the pattern, carry
- on with the next opcode. Otherwise, unless we have an empty string and
+ on with the next opcode. For repeating opcodes, also add the repeat
+ state. Note that KETRPOS will always be encountered at the end of the
+ subpattern, because the possessive subpattern repeats are always handled
+ using recursive calls. Thus, it never adds any new states.
+
+ At the end of the (sub)pattern, unless we have an empty string and
PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
start of the subject, save the match data, shifting up all previous
matches so we always have the longest first. */
case OP_KET:
case OP_KETRMIN:
case OP_KETRMAX:
+ case OP_KETRPOS:
if (code != end_code)
{
ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
current_subject > start_subject + md->start_offset)))
{
if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
- else if (match_count > 0 && ++match_count * 2 >= offsetcount)
+ else if (match_count > 0 && ++match_count * 2 > offsetcount)
match_count = 0;
count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
if (offsetcount >= 2)
{
- offsets[0] = current_subject - start_subject;
- offsets[1] = ptr - start_subject;
+ offsets[0] = (int)(current_subject - start_subject);
+ offsets[1] = (int)(ptr - start_subject);
DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
- offsets[1] - offsets[0], current_subject));
+ offsets[1] - offsets[0], (char *)current_subject));
}
if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
{
/*-----------------------------------------------------------------*/
case OP_ALT:
do { code += GET(code, 1); } while (*code == OP_ALT);
- ADD_ACTIVE(code - start_code, 0);
+ ADD_ACTIVE((int)(code - start_code), 0);
break;
/*-----------------------------------------------------------------*/
case OP_SBRA:
do
{
- ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+ ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
code += GET(code, 1);
}
while (*code == OP_ALT);
/*-----------------------------------------------------------------*/
case OP_CBRA:
case OP_SCBRA:
- ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
+ ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
code += GET(code, 1);
while (*code == OP_ALT)
{
- ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+ ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
code += GET(code, 1);
}
break;
ADD_ACTIVE(state_offset + 1, 0);
code += 1 + GET(code, 2);
while (*code == OP_ALT) code += GET(code, 1);
- ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+ ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
break;
/*-----------------------------------------------------------------*/
case OP_SKIPZERO:
code += 1 + GET(code, 2);
while (*code == OP_ALT) code += GET(code, 1);
- ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+ ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
break;
/*-----------------------------------------------------------------*/
case OP_CIRC:
- if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
- ((ims & PCRE_MULTILINE) != 0 &&
- ptr != end_subject &&
- WAS_NEWLINE(ptr)))
+ if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
{ ADD_ACTIVE(state_offset + 1, 0); }
break;
/*-----------------------------------------------------------------*/
- case OP_EOD:
- if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
+ case OP_CIRCM:
+ if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
+ (ptr != end_subject && WAS_NEWLINE(ptr)))
+ { ADD_ACTIVE(state_offset + 1, 0); }
break;
/*-----------------------------------------------------------------*/
- case OP_OPT:
- ims = code[1];
- ADD_ACTIVE(state_offset + 2, 0);
+ case OP_EOD:
+ if (ptr >= end_subject)
+ {
+ if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
+ could_continue = TRUE;
+ else { ADD_ACTIVE(state_offset + 1, 0); }
+ }
break;
/*-----------------------------------------------------------------*/
/*-----------------------------------------------------------------*/
case OP_ANY:
if (clen > 0 && !IS_NEWLINE(ptr))
- { ADD_NEW(state_offset + 1, 0); }
+ {
+ if (ptr + 1 >= md->end_subject &&
+ (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
+ NLBLOCK->nltype == NLTYPE_FIXED &&
+ NLBLOCK->nllen == 2 &&
+ c == NLBLOCK->nl[0])
+ {
+ could_continue = partial_newline = TRUE;
+ }
+ else
+ {
+ ADD_NEW(state_offset + 1, 0);
+ }
+ }
break;
/*-----------------------------------------------------------------*/
/*-----------------------------------------------------------------*/
case OP_EODN:
- if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
+ if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
+ could_continue = TRUE;
+ else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
{ ADD_ACTIVE(state_offset + 1, 0); }
break;
case OP_DOLL:
if ((md->moptions & PCRE_NOTEOL) == 0)
{
- if (clen == 0 ||
+ if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
+ could_continue = TRUE;
+ else if (clen == 0 ||
((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
- ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
+ (ptr == end_subject - md->nllen)
))
{ ADD_ACTIVE(state_offset + 1, 0); }
+ else if (ptr + 1 >= md->end_subject &&
+ (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
+ NLBLOCK->nltype == NLTYPE_FIXED &&
+ NLBLOCK->nllen == 2 &&
+ c == NLBLOCK->nl[0])
+ {
+ if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
+ {
+ reset_could_continue = TRUE;
+ ADD_NEW_DATA(-(state_offset + 1), 0, 1);
+ }
+ else could_continue = partial_newline = TRUE;
+ }
}
- else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_DOLLM:
+ if ((md->moptions & PCRE_NOTEOL) == 0)
+ {
+ if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
+ could_continue = TRUE;
+ else if (clen == 0 ||
+ ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
+ { ADD_ACTIVE(state_offset + 1, 0); }
+ else if (ptr + 1 >= md->end_subject &&
+ (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
+ NLBLOCK->nltype == NLTYPE_FIXED &&
+ NLBLOCK->nllen == 2 &&
+ c == NLBLOCK->nl[0])
+ {
+ if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
+ {
+ reset_could_continue = TRUE;
+ ADD_NEW_DATA(-(state_offset + 1), 0, 1);
+ }
+ else could_continue = partial_newline = TRUE;
+ }
+ }
+ else if (IS_NEWLINE(ptr))
{ ADD_ACTIVE(state_offset + 1, 0); }
break;
if (ptr > start_subject)
{
- const uschar *temp = ptr - 1;
+ const pcre_uchar *temp = ptr - 1;
if (temp < md->start_used_ptr) md->start_used_ptr = temp;
-#ifdef SUPPORT_UTF8
- if (utf8) BACKCHAR(temp);
+#ifdef SUPPORT_UTF
+ if (utf) { BACKCHAR(temp); }
#endif
GETCHARTEST(d, temp);
+#ifdef SUPPORT_UCP
+ if ((md->poptions & PCRE_UCP) != 0)
+ {
+ if (d == '_') left_word = TRUE; else
+ {
+ int cat = UCD_CATEGORY(d);
+ left_word = (cat == ucp_L || cat == ucp_N);
+ }
+ }
+ else
+#endif
left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
}
- else left_word = 0;
+ else left_word = FALSE;
if (clen > 0)
+ {
+#ifdef SUPPORT_UCP
+ if ((md->poptions & PCRE_UCP) != 0)
+ {
+ if (c == '_') right_word = TRUE; else
+ {
+ int cat = UCD_CATEGORY(c);
+ right_word = (cat == ucp_L || cat == ucp_N);
+ }
+ }
+ else
+#endif
right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
- else right_word = 0;
+ }
+ else right_word = FALSE;
if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
{ ADD_ACTIVE(state_offset + 1, 0); }
if (clen > 0)
{
BOOL OK;
- int chartype = UCD_CHARTYPE(c);
+ const pcre_uint8 chartype = UCD_CHARTYPE(c);
switch(code[1])
{
case PT_ANY:
break;
case PT_LAMP:
- OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+ OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+ chartype == ucp_Lt;
break;
case PT_GC:
- OK = _pcre_ucp_gentype[chartype] == code[2];
+ OK = PRIV(ucp_gentype)[chartype] == code[2];
break;
case PT_PC:
OK = UCD_SCRIPT(c) == code[2];
break;
+ /* These are specials for combination cases. */
+
+ case PT_ALNUM:
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
+ PRIV(ucp_gentype)[chartype] == ucp_N;
+ break;
+
+ case PT_SPACE: /* Perl space */
+ OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_WORD:
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
+ PRIV(ucp_gentype)[chartype] == ucp_N ||
+ c == CHAR_UNDERSCORE;
+ break;
+
/* Should never occur, but keep compilers from grumbling. */
default:
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
if (clen > 0)
{
- if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
+ if (d == OP_ANY && ptr + 1 >= md->end_subject &&
+ (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
+ NLBLOCK->nltype == NLTYPE_FIXED &&
+ NLBLOCK->nllen == 2 &&
+ c == NLBLOCK->nl[0])
+ {
+ could_continue = partial_newline = TRUE;
+ }
+ else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
ADD_ACTIVE(state_offset + 2, 0);
if (clen > 0)
{
- if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
+ if (d == OP_ANY && ptr + 1 >= md->end_subject &&
+ (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
+ NLBLOCK->nltype == NLTYPE_FIXED &&
+ NLBLOCK->nllen == 2 &&
+ c == NLBLOCK->nl[0])
+ {
+ could_continue = partial_newline = TRUE;
+ }
+ else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
ADD_ACTIVE(state_offset + 2, 0);
if (clen > 0)
{
- if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
+ if (d == OP_ANY && ptr + 1 >= md->end_subject &&
+ (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
+ NLBLOCK->nltype == NLTYPE_FIXED &&
+ NLBLOCK->nllen == 2 &&
+ c == NLBLOCK->nl[0])
+ {
+ could_continue = partial_newline = TRUE;
+ }
+ else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
count = current_state->count; /* Number already matched */
if (clen > 0)
{
- if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
+ if (d == OP_ANY && ptr + 1 >= md->end_subject &&
+ (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
+ NLBLOCK->nltype == NLTYPE_FIXED &&
+ NLBLOCK->nllen == 2 &&
+ c == NLBLOCK->nl[0])
+ {
+ could_continue = partial_newline = TRUE;
+ }
+ else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{
if (++count >= GET2(code, 1))
- { ADD_NEW(state_offset + 4, 0); }
+ { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
else
{ ADD_NEW(state_offset, count); }
}
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
case OP_TYPEPOSUPTO:
- ADD_ACTIVE(state_offset + 4, 0);
+ ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
count = current_state->count; /* Number already matched */
if (clen > 0)
{
- if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
+ if (d == OP_ANY && ptr + 1 >= md->end_subject &&
+ (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
+ NLBLOCK->nltype == NLTYPE_FIXED &&
+ NLBLOCK->nllen == 2 &&
+ c == NLBLOCK->nl[0])
+ {
+ could_continue = partial_newline = TRUE;
+ }
+ else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
next_active_state--;
}
if (++count >= GET2(code, 1))
- { ADD_NEW(state_offset + 4, 0); }
+ { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
else
{ ADD_NEW(state_offset, count); }
}
if (clen > 0)
{
BOOL OK;
- int chartype = UCD_CHARTYPE(c);
+ const pcre_uint8 chartype = UCD_CHARTYPE(c);
switch(code[2])
{
case PT_ANY:
break;
case PT_LAMP:
- OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+ OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+ chartype == ucp_Lt;
break;
case PT_GC:
- OK = _pcre_ucp_gentype[chartype] == code[3];
+ OK = PRIV(ucp_gentype)[chartype] == code[3];
break;
case PT_PC:
OK = UCD_SCRIPT(c) == code[3];
break;
+ /* These are specials for combination cases. */
+
+ case PT_ALNUM:
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
+ PRIV(ucp_gentype)[chartype] == ucp_N;
+ break;
+
+ case PT_SPACE: /* Perl space */
+ OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_WORD:
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
+ PRIV(ucp_gentype)[chartype] == ucp_N ||
+ c == CHAR_UNDERSCORE;
+ break;
+
/* Should never occur, but keep compilers from grumbling. */
default:
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
{
- const uschar *nptr = ptr + clen;
+ const pcre_uchar *nptr = ptr + clen;
int ncount = 0;
if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
{
if (clen > 0)
{
BOOL OK;
- int chartype = UCD_CHARTYPE(c);
+ const pcre_uint8 chartype = UCD_CHARTYPE(c);
switch(code[2])
{
case PT_ANY:
break;
case PT_LAMP:
- OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+ OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+ chartype == ucp_Lt;
break;
case PT_GC:
- OK = _pcre_ucp_gentype[chartype] == code[3];
+ OK = PRIV(ucp_gentype)[chartype] == code[3];
break;
case PT_PC:
OK = UCD_SCRIPT(c) == code[3];
break;
+ /* These are specials for combination cases. */
+
+ case PT_ALNUM:
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
+ PRIV(ucp_gentype)[chartype] == ucp_N;
+ break;
+
+ case PT_SPACE: /* Perl space */
+ OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_WORD:
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
+ PRIV(ucp_gentype)[chartype] == ucp_N ||
+ c == CHAR_UNDERSCORE;
+ break;
+
/* Should never occur, but keep compilers from grumbling. */
default:
ADD_ACTIVE(state_offset + 2, 0);
if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
{
- const uschar *nptr = ptr + clen;
+ const pcre_uchar *nptr = ptr + clen;
int ncount = 0;
if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
case OP_PROP_EXTRA + OP_TYPEMINUPTO:
case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
- { ADD_ACTIVE(state_offset + 6, 0); }
+ { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
count = current_state->count; /* Number already matched */
if (clen > 0)
{
BOOL OK;
- int chartype = UCD_CHARTYPE(c);
- switch(code[4])
+ const pcre_uint8 chartype = UCD_CHARTYPE(c);
+ switch(code[1 + IMM2_SIZE + 1])
{
case PT_ANY:
OK = TRUE;
break;
case PT_LAMP:
- OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+ OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+ chartype == ucp_Lt;
break;
case PT_GC:
- OK = _pcre_ucp_gentype[chartype] == code[5];
+ OK = PRIV(ucp_gentype)[chartype] == code[1 + IMM2_SIZE + 2];
break;
case PT_PC:
- OK = chartype == code[5];
+ OK = chartype == code[1 + IMM2_SIZE + 2];
break;
case PT_SC:
- OK = UCD_SCRIPT(c) == code[5];
+ OK = UCD_SCRIPT(c) == code[1 + IMM2_SIZE + 2];
+ break;
+
+ /* These are specials for combination cases. */
+
+ case PT_ALNUM:
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
+ PRIV(ucp_gentype)[chartype] == ucp_N;
+ break;
+
+ case PT_SPACE: /* Perl space */
+ OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_WORD:
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
+ PRIV(ucp_gentype)[chartype] == ucp_N ||
+ c == CHAR_UNDERSCORE;
break;
/* Should never occur, but keep compilers from grumbling. */
next_active_state--;
}
if (++count >= GET2(code, 1))
- { ADD_NEW(state_offset + 6, 0); }
+ { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
else
{ ADD_NEW(state_offset, count); }
}
case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
- { ADD_ACTIVE(state_offset + 4, 0); }
+ { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
count = current_state->count; /* Number already matched */
if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
{
- const uschar *nptr = ptr + clen;
+ const pcre_uchar *nptr = ptr + clen;
int ncount = 0;
if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
{
ncount++;
nptr += ndlen;
}
+ if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
+ reset_could_continue = TRUE;
if (++count >= GET2(code, 1))
- { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
+ { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
else
{ ADD_NEW_DATA(-state_offset, count, ncount); }
}
case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
- { ADD_ACTIVE(state_offset + 4, 0); }
+ { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
count = current_state->count; /* Number already matched */
if (clen > 0)
{
next_active_state--;
}
if (++count >= GET2(code, 1))
- { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
+ { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
else
{ ADD_NEW_DATA(-state_offset, count, ncount); }
break;
case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
- { ADD_ACTIVE(state_offset + 4, 0); }
+ { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
count = current_state->count; /* Number already matched */
if (clen > 0)
{
next_active_state--;
}
if (++count >= GET2(code, 1))
- { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
+ { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
else
{ ADD_NEW_DATA(-state_offset, count, 0); }
}
case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
- { ADD_ACTIVE(state_offset + 4, 0); }
+ { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
count = current_state->count; /* Number already matched */
if (clen > 0)
{
next_active_state--;
}
if (++count >= GET2(code, 1))
- { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
+ { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
else
{ ADD_NEW_DATA(-state_offset, count, 0); }
}
break;
/*-----------------------------------------------------------------*/
- case OP_CHARNC:
+ case OP_CHARI:
if (clen == 0) break;
-#ifdef SUPPORT_UTF8
- if (utf8)
+#ifdef SUPPORT_UTF
+ if (utf)
{
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
{
unsigned int othercase;
- if (c < 128) othercase = fcc[c]; else
-
- /* If we have Unicode property support, we can use it to test the
- other case of the character. */
-
+ if (c < 128)
+ othercase = fcc[c];
+ else
+ /* If we have Unicode property support, we can use it to test the
+ other case of the character. */
#ifdef SUPPORT_UCP
- othercase = UCD_OTHERCASE(c);
+ othercase = UCD_OTHERCASE(c);
#else
- othercase = NOTACHAR;
+ othercase = NOTACHAR;
#endif
if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
}
}
else
-#endif /* SUPPORT_UTF8 */
-
- /* Non-UTF-8 mode */
+#endif /* SUPPORT_UTF */
+ /* Not UTF mode */
{
- if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
+ if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
+ { ADD_NEW(state_offset + 2, 0); }
}
break;
case OP_EXTUNI:
if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
{
- const uschar *nptr = ptr + clen;
+ const pcre_uchar *nptr = ptr + clen;
int ncount = 0;
while (nptr < end_subject)
{
ncount++;
nptr += nclen;
}
+ if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
+ reset_could_continue = TRUE;
ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
}
break;
break;
case 0x000d:
- if (ptr + 1 < end_subject && ptr[1] == 0x0a)
+ if (ptr + 1 >= end_subject)
+ {
+ ADD_NEW(state_offset + 1, 0);
+ if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
+ reset_could_continue = TRUE;
+ }
+ else if (ptr[1] == 0x0a)
{
ADD_NEW_DATA(-(state_offset + 1), 0, 1);
}
break;
/*-----------------------------------------------------------------*/
- /* Match a negated single character. This is only used for one-byte
- characters, that is, we know that d < 256. The character we are
- checking (c) can be multibyte. */
+ /* Match a negated single character casefully. */
case OP_NOT:
+ if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ /* Match a negated single character caselessly. */
+
+ case OP_NOTI:
if (clen > 0)
{
- unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
- if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
+ unsigned int otherd;
+#ifdef SUPPORT_UTF
+ if (utf && d >= 128)
+ {
+#ifdef SUPPORT_UCP
+ otherd = UCD_OTHERCASE(d);
+#endif /* SUPPORT_UCP */
+ }
+ else
+#endif /* SUPPORT_UTF */
+ otherd = TABLE_GET(d, fcc, d);
+ if (c != d && c != otherd)
+ { ADD_NEW(state_offset + dlen + 1, 0); }
}
break;
/*-----------------------------------------------------------------*/
+ case OP_PLUSI:
+ case OP_MINPLUSI:
+ case OP_POSPLUSI:
+ case OP_NOTPLUSI:
+ case OP_NOTMINPLUSI:
+ case OP_NOTPOSPLUSI:
+ caseless = TRUE;
+ codevalue -= OP_STARI - OP_STAR;
+
+ /* Fall through */
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
if (clen > 0)
{
unsigned int otherd = NOTACHAR;
- if ((ims & PCRE_CASELESS) != 0)
+ if (caseless)
{
-#ifdef SUPPORT_UTF8
- if (utf8 && d >= 128)
+#ifdef SUPPORT_UTF
+ if (utf && d >= 128)
{
#ifdef SUPPORT_UCP
otherd = UCD_OTHERCASE(d);
#endif /* SUPPORT_UCP */
}
else
-#endif /* SUPPORT_UTF8 */
- otherd = fcc[d];
+#endif /* SUPPORT_UTF */
+ otherd = TABLE_GET(d, fcc, d);
}
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
{
break;
/*-----------------------------------------------------------------*/
+ case OP_QUERYI:
+ case OP_MINQUERYI:
+ case OP_POSQUERYI:
+ case OP_NOTQUERYI:
+ case OP_NOTMINQUERYI:
+ case OP_NOTPOSQUERYI:
+ caseless = TRUE;
+ codevalue -= OP_STARI - OP_STAR;
+ /* Fall through */
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
if (clen > 0)
{
unsigned int otherd = NOTACHAR;
- if ((ims & PCRE_CASELESS) != 0)
+ if (caseless)
{
-#ifdef SUPPORT_UTF8
- if (utf8 && d >= 128)
+#ifdef SUPPORT_UTF
+ if (utf && d >= 128)
{
#ifdef SUPPORT_UCP
otherd = UCD_OTHERCASE(d);
#endif /* SUPPORT_UCP */
}
else
-#endif /* SUPPORT_UTF8 */
- otherd = fcc[d];
+#endif /* SUPPORT_UTF */
+ otherd = TABLE_GET(d, fcc, d);
}
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
{
break;
/*-----------------------------------------------------------------*/
+ case OP_STARI:
+ case OP_MINSTARI:
+ case OP_POSSTARI:
+ case OP_NOTSTARI:
+ case OP_NOTMINSTARI:
+ case OP_NOTPOSSTARI:
+ caseless = TRUE;
+ codevalue -= OP_STARI - OP_STAR;
+ /* Fall through */
case OP_STAR:
case OP_MINSTAR:
case OP_POSSTAR:
if (clen > 0)
{
unsigned int otherd = NOTACHAR;
- if ((ims & PCRE_CASELESS) != 0)
+ if (caseless)
{
-#ifdef SUPPORT_UTF8
- if (utf8 && d >= 128)
+#ifdef SUPPORT_UTF
+ if (utf && d >= 128)
{
#ifdef SUPPORT_UCP
otherd = UCD_OTHERCASE(d);
#endif /* SUPPORT_UCP */
}
else
-#endif /* SUPPORT_UTF8 */
- otherd = fcc[d];
+#endif /* SUPPORT_UTF */
+ otherd = TABLE_GET(d, fcc, d);
}
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
{
break;
/*-----------------------------------------------------------------*/
+ case OP_EXACTI:
+ case OP_NOTEXACTI:
+ caseless = TRUE;
+ codevalue -= OP_STARI - OP_STAR;
+ /* Fall through */
case OP_EXACT:
case OP_NOTEXACT:
count = current_state->count; /* Number already matched */
if (clen > 0)
{
unsigned int otherd = NOTACHAR;
- if ((ims & PCRE_CASELESS) != 0)
+ if (caseless)
{
-#ifdef SUPPORT_UTF8
- if (utf8 && d >= 128)
+#ifdef SUPPORT_UTF
+ if (utf && d >= 128)
{
#ifdef SUPPORT_UCP
otherd = UCD_OTHERCASE(d);
#endif /* SUPPORT_UCP */
}
else
-#endif /* SUPPORT_UTF8 */
- otherd = fcc[d];
+#endif /* SUPPORT_UTF */
+ otherd = TABLE_GET(d, fcc, d);
}
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
{
if (++count >= GET2(code, 1))
- { ADD_NEW(state_offset + dlen + 3, 0); }
+ { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
else
{ ADD_NEW(state_offset, count); }
}
break;
/*-----------------------------------------------------------------*/
+ case OP_UPTOI:
+ case OP_MINUPTOI:
+ case OP_POSUPTOI:
+ case OP_NOTUPTOI:
+ case OP_NOTMINUPTOI:
+ case OP_NOTPOSUPTOI:
+ caseless = TRUE;
+ codevalue -= OP_STARI - OP_STAR;
+ /* Fall through */
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
case OP_NOTUPTO:
case OP_NOTMINUPTO:
case OP_NOTPOSUPTO:
- ADD_ACTIVE(state_offset + dlen + 3, 0);
+ ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
count = current_state->count; /* Number already matched */
if (clen > 0)
{
unsigned int otherd = NOTACHAR;
- if ((ims & PCRE_CASELESS) != 0)
+ if (caseless)
{
-#ifdef SUPPORT_UTF8
- if (utf8 && d >= 128)
+#ifdef SUPPORT_UTF
+ if (utf && d >= 128)
{
#ifdef SUPPORT_UCP
otherd = UCD_OTHERCASE(d);
#endif /* SUPPORT_UCP */
}
else
-#endif /* SUPPORT_UTF8 */
- otherd = fcc[d];
+#endif /* SUPPORT_UTF */
+ otherd = TABLE_GET(d, fcc, d);
}
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
{
next_active_state--;
}
if (++count >= GET2(code, 1))
- { ADD_NEW(state_offset + dlen + 3, 0); }
+ { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
else
{ ADD_NEW(state_offset, count); }
}
{
BOOL isinclass = FALSE;
int next_state_offset;
- const uschar *ecode;
+ const pcre_uchar *ecode;
/* For a simple class, there is always just a 32-byte table, and we
can set isinclass from it. */
if (codevalue != OP_XCLASS)
{
- ecode = code + 33;
+ ecode = code + 1 + (32 / sizeof(pcre_uchar));
if (clen > 0)
{
isinclass = (c > 255)? (codevalue == OP_NCLASS) :
- ((code[1 + c/8] & (1 << (c&7))) != 0);
+ ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
}
}
else
{
ecode = code + GET(code, 1);
- if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
+ if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
}
/* At this point, isinclass is set for all kinds of class, and ecode
points to the byte after the end of the class. If there is a
quantifier, this is where it will be. */
- next_state_offset = ecode - start_code;
+ next_state_offset = (int)(ecode - start_code);
switch (*ecode)
{
case OP_CRMINRANGE:
count = current_state->count; /* Already matched */
if (count >= GET2(ecode, 1))
- { ADD_ACTIVE(next_state_offset + 5, 0); }
+ { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
if (isinclass)
{
- int max = GET2(ecode, 3);
+ int max = GET2(ecode, 1 + IMM2_SIZE);
if (++count >= max && max != 0) /* Max 0 => no limit */
- { ADD_NEW(next_state_offset + 5, 0); }
+ { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
else
{ ADD_NEW(state_offset, count); }
}
int rc;
int local_offsets[2];
int local_workspace[1000];
- const uschar *endasscode = code + GET(code, 1);
+ const pcre_uchar *endasscode = code + GET(code, 1);
while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
md, /* static match data */
code, /* this subexpression's code */
ptr, /* where we currently are */
- ptr - start_subject, /* start offset */
+ (int)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
sizeof(local_workspace)/sizeof(int), /* size of same */
- ims, /* the current ims flags */
- rlevel, /* function recursion level */
- recursing); /* pass on regex recursion */
+ rlevel); /* function recursion level */
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
- { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
+ { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
}
break;
if (code[LINK_SIZE+1] == OP_CALLOUT)
{
rrc = 0;
- if (pcre_callout != NULL)
+ if (PUBL(callout) != NULL)
{
- pcre_callout_block cb;
+ PUBL(callout_block) cb;
cb.version = 1; /* Version 1 of the callout block */
cb.callout_number = code[LINK_SIZE+2];
cb.offset_vector = offsets;
+#ifdef COMPILE_PCRE8
cb.subject = (PCRE_SPTR)start_subject;
- cb.subject_length = end_subject - start_subject;
- cb.start_match = current_subject - start_subject;
- cb.current_position = ptr - start_subject;
+#else
+ cb.subject = (PCRE_SPTR16)start_subject;
+#endif
+ cb.subject_length = (int)(end_subject - start_subject);
+ cb.start_match = (int)(current_subject - start_subject);
+ cb.current_position = (int)(ptr - start_subject);
cb.pattern_position = GET(code, LINK_SIZE + 3);
cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
cb.capture_top = 1;
cb.capture_last = -1;
cb.callout_data = md->callout_data;
- if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
+ cb.mark = NULL; /* No (*MARK) support */
+ if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
}
if (rrc > 0) break; /* Fail this thread */
- code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
+ code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
}
condcode = code[LINK_SIZE+1];
else if (condcode == OP_RREF || condcode == OP_NRREF)
{
- int value = GET2(code, LINK_SIZE+2);
+ int value = GET2(code, LINK_SIZE + 2);
if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
- if (recursing > 0)
- { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
+ if (md->recursive != NULL)
+ { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
}
else
{
int rc;
- const uschar *asscode = code + LINK_SIZE + 1;
- const uschar *endasscode = asscode + GET(asscode, 1);
+ const pcre_uchar *asscode = code + LINK_SIZE + 1;
+ const pcre_uchar *endasscode = asscode + GET(asscode, 1);
while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
md, /* fixed match data */
asscode, /* this subexpression's code */
ptr, /* where we currently are */
- ptr - start_subject, /* start offset */
+ (int)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
sizeof(local_workspace)/sizeof(int), /* size of same */
- ims, /* the current ims flags */
- rlevel, /* function recursion level */
- recursing); /* pass on regex recursion */
+ rlevel); /* function recursion level */
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
if ((rc >= 0) ==
(condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
- { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
+ { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
else
{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
}
/*-----------------------------------------------------------------*/
case OP_RECURSE:
{
+ dfa_recursion_info *ri;
int local_offsets[1000];
int local_workspace[1000];
+ const pcre_uchar *callpat = start_code + GET(code, 1);
+ int recno = (callpat == md->start_code)? 0 :
+ GET2(callpat, 1 + LINK_SIZE);
int rc;
- DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
- recursing + 1));
+ DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
+
+ /* Check for repeating a recursion without advancing the subject
+ pointer. This should catch convoluted mutual recursions. (Some simple
+ cases are caught at compile time.) */
+
+ for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
+ if (recno == ri->group_num && ptr == ri->subject_position)
+ return PCRE_ERROR_RECURSELOOP;
+
+ /* Remember this recursion and where we started it so as to
+ catch infinite loops. */
+
+ new_recursive.group_num = recno;
+ new_recursive.subject_position = ptr;
+ new_recursive.prevrec = md->recursive;
+ md->recursive = &new_recursive;
rc = internal_dfa_exec(
md, /* fixed match data */
- start_code + GET(code, 1), /* this subexpression's code */
+ callpat, /* this subexpression's code */
ptr, /* where we currently are */
- ptr - start_subject, /* start offset */
+ (int)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
sizeof(local_workspace)/sizeof(int), /* size of same */
- ims, /* the current ims flags */
- rlevel, /* function recursion level */
- recursing + 1); /* regex recurse level */
+ rlevel); /* function recursion level */
+
+ md->recursive = new_recursive.prevrec; /* Done this recursion */
- DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
- recursing + 1, rc));
+ DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
+ rc));
/* Ran out of internal offsets */
{
for (rc = rc*2 - 2; rc >= 0; rc -= 2)
{
- const uschar *p = start_subject + local_offsets[rc];
- const uschar *pp = start_subject + local_offsets[rc+1];
int charcount = local_offsets[rc+1] - local_offsets[rc];
- while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
+#ifdef SUPPORT_UTF
+ if (utf)
+ {
+ const pcre_uchar *p = start_subject + local_offsets[rc];
+ const pcre_uchar *pp = start_subject + local_offsets[rc+1];
+ while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+ }
+#endif
if (charcount > 0)
{
ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
break;
/*-----------------------------------------------------------------*/
+ case OP_BRAPOS:
+ case OP_SBRAPOS:
+ case OP_CBRAPOS:
+ case OP_SCBRAPOS:
+ case OP_BRAPOSZERO:
+ {
+ int charcount, matched_count;
+ const pcre_uchar *local_ptr = ptr;
+ BOOL allow_zero;
+
+ if (codevalue == OP_BRAPOSZERO)
+ {
+ allow_zero = TRUE;
+ codevalue = *(++code); /* Codevalue will be one of above BRAs */
+ }
+ else allow_zero = FALSE;
+
+ /* Loop to match the subpattern as many times as possible as if it were
+ a complete pattern. */
+
+ for (matched_count = 0;; matched_count++)
+ {
+ int local_offsets[2];
+ int local_workspace[1000];
+
+ int rc = internal_dfa_exec(
+ md, /* fixed match data */
+ code, /* this subexpression's code */
+ local_ptr, /* where we currently are */
+ (int)(ptr - start_subject), /* start offset */
+ local_offsets, /* offset vector */
+ sizeof(local_offsets)/sizeof(int), /* size of same */
+ local_workspace, /* workspace vector */
+ sizeof(local_workspace)/sizeof(int), /* size of same */
+ rlevel); /* function recursion level */
+
+ /* Failed to match */
+
+ if (rc < 0)
+ {
+ if (rc != PCRE_ERROR_NOMATCH) return rc;
+ break;
+ }
+
+ /* Matched: break the loop if zero characters matched. */
+
+ charcount = local_offsets[1] - local_offsets[0];
+ if (charcount == 0) break;
+ local_ptr += charcount; /* Advance temporary position ptr */
+ }
+
+ /* At this point we have matched the subpattern matched_count
+ times, and local_ptr is pointing to the character after the end of the
+ last match. */
+
+ if (matched_count > 0 || allow_zero)
+ {
+ const pcre_uchar *end_subpattern = code;
+ int next_state_offset;
+
+ do { end_subpattern += GET(end_subpattern, 1); }
+ while (*end_subpattern == OP_ALT);
+ next_state_offset =
+ (int)(end_subpattern - start_code + LINK_SIZE + 1);
+
+ /* Optimization: if there are no more active states, and there
+ are no new states yet set up, then skip over the subject string
+ right here, to save looping. Otherwise, set up the new state to swing
+ into action when the end of the matched substring is reached. */
+
+ if (i + 1 >= active_count && new_count == 0)
+ {
+ ptr = local_ptr;
+ clen = 0;
+ ADD_NEW(next_state_offset, 0);
+ }
+ else
+ {
+ const pcre_uchar *p = ptr;
+ const pcre_uchar *pp = local_ptr;
+ charcount = (int)(pp - p);
+#ifdef SUPPORT_UTF
+ if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+#endif
+ ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
+ }
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
case OP_ONCE:
+ case OP_ONCE_NC:
{
int local_offsets[2];
int local_workspace[1000];
md, /* fixed match data */
code, /* this subexpression's code */
ptr, /* where we currently are */
- ptr - start_subject, /* start offset */
+ (int)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
sizeof(local_workspace)/sizeof(int), /* size of same */
- ims, /* the current ims flags */
- rlevel, /* function recursion level */
- recursing); /* pass on regex recursion */
+ rlevel); /* function recursion level */
if (rc >= 0)
{
- const uschar *end_subpattern = code;
+ const pcre_uchar *end_subpattern = code;
int charcount = local_offsets[1] - local_offsets[0];
int next_state_offset, repeat_state_offset;
do { end_subpattern += GET(end_subpattern, 1); }
while (*end_subpattern == OP_ALT);
- next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
+ next_state_offset =
+ (int)(end_subpattern - start_code + LINK_SIZE + 1);
/* If the end of this subpattern is KETRMAX or KETRMIN, we must
arrange for the repeat state also to be added to the relevant list.
repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
*end_subpattern == OP_KETRMIN)?
- end_subpattern - start_code - GET(end_subpattern, 1) : -1;
+ (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
/* If we have matched an empty string, add the next state at the
current character pointer. This is important so that the duplicate
/* Optimization: if there are no more active states, and there
are no new states yet set up, then skip over the subject string
right here, to save looping. Otherwise, set up the new state to swing
- into action when the end of the substring is reached. */
+ into action when the end of the matched substring is reached. */
else if (i + 1 >= active_count && new_count == 0)
{
}
else
{
- const uschar *p = start_subject + local_offsets[0];
- const uschar *pp = start_subject + local_offsets[1];
- while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
+#ifdef SUPPORT_UTF
+ if (utf)
+ {
+ const pcre_uchar *p = start_subject + local_offsets[0];
+ const pcre_uchar *pp = start_subject + local_offsets[1];
+ while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+ }
+#endif
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
if (repeat_state_offset >= 0)
{ ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
}
-
}
else if (rc != PCRE_ERROR_NOMATCH) return rc;
}
case OP_CALLOUT:
rrc = 0;
- if (pcre_callout != NULL)
+ if (PUBL(callout) != NULL)
{
- pcre_callout_block cb;
+ PUBL(callout_block) cb;
cb.version = 1; /* Version 1 of the callout block */
cb.callout_number = code[1];
cb.offset_vector = offsets;
+#ifdef COMPILE_PCRE8
cb.subject = (PCRE_SPTR)start_subject;
- cb.subject_length = end_subject - start_subject;
- cb.start_match = current_subject - start_subject;
- cb.current_position = ptr - start_subject;
+#else
+ cb.subject = (PCRE_SPTR16)start_subject;
+#endif
+ cb.subject_length = (int)(end_subject - start_subject);
+ cb.start_match = (int)(current_subject - start_subject);
+ cb.current_position = (int)(ptr - start_subject);
cb.pattern_position = GET(code, 2);
cb.next_item_length = GET(code, 2 + LINK_SIZE);
cb.capture_top = 1;
cb.capture_last = -1;
cb.callout_data = md->callout_data;
- if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
+ cb.mark = NULL; /* No (*MARK) support */
+ if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
}
if (rrc == 0)
- { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
+ { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
break;
if (new_count <= 0)
{
if (rlevel == 1 && /* Top level, and */
- could_continue && /* Some could go on */
+ could_continue && /* Some could go on, and */
forced_fail != workspace[1] && /* Not all forced fail & */
( /* either... */
(md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
match_count < 0) /* no matches */
) && /* And... */
- ptr >= end_subject && /* Reached end of subject */
- ptr > current_subject) /* Matched non-empty string */
+ (
+ partial_newline || /* Either partial NL */
+ ( /* or ... */
+ ptr >= end_subject && /* End of subject and */
+ ptr > md->start_used_ptr) /* Inspected non-empty string */
+ )
+ )
{
if (offsetcount >= 2)
{
- offsets[0] = md->start_used_ptr - start_subject;
- offsets[1] = end_subject - start_subject;
+ offsets[0] = (int)(md->start_used_ptr - start_subject);
+ offsets[1] = (int)(end_subject - start_subject);
}
match_count = PCRE_ERROR_PARTIAL;
}
< -1 => some kind of unexpected problem
*/
+#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
const char *subject, int length, int start_offset, int options, int *offsets,
int offsetcount, int *workspace, int wscount)
+#else
+PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
+pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
+ PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
+ int offsetcount, int *workspace, int wscount)
+#endif
{
-real_pcre *re = (real_pcre *)argument_re;
+REAL_PCRE *re = (REAL_PCRE *)argument_re;
dfa_match_data match_block;
dfa_match_data *md = &match_block;
-BOOL utf8, anchored, startline, firstline;
-const uschar *current_subject, *end_subject, *lcc;
-
-pcre_study_data internal_study;
+BOOL utf, anchored, startline, firstline;
+const pcre_uchar *current_subject, *end_subject;
const pcre_study_data *study = NULL;
-real_pcre internal_re;
-
-const uschar *req_byte_ptr;
-const uschar *start_bits = NULL;
-BOOL first_byte_caseless = FALSE;
-BOOL req_byte_caseless = FALSE;
-int first_byte = -1;
-int req_byte = -1;
-int req_byte2 = -1;
+
+const pcre_uchar *req_char_ptr;
+const pcre_uint8 *start_bits = NULL;
+BOOL has_first_char = FALSE;
+BOOL has_req_char = FALSE;
+pcre_uchar first_char = 0;
+pcre_uchar first_char2 = 0;
+pcre_uchar req_char = 0;
+pcre_uchar req_char2 = 0;
int newline;
/* Plausibility checks */
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
+if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
-/* We need to find the pointer to any study data before we test for byte
-flipping, so we scan the extra_data block first. This may set two fields in the
-match block, so we must initialize them beforehand. However, the other fields
-in the match block must not be set until after the byte flipping. */
+/* Check that the first field in the block is the magic number. If it is not,
+return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
+REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
+means that the pattern is likely compiled with different endianness. */
+
+if (re->magic_number != MAGIC_NUMBER)
+ return re->magic_number == REVERSED_MAGIC_NUMBER?
+ PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
+if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
+
+/* If restarting after a partial match, do some sanity checks on the contents
+of the workspace. */
+
+if ((options & PCRE_DFA_RESTART) != 0)
+ {
+ if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
+ workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
+ return PCRE_ERROR_DFA_BADRESTART;
+ }
+
+/* Set up study, callout, and table data */
md->tables = re->tables;
md->callout_data = NULL;
md->tables = extra_data->tables;
}
-/* Check that the first field in the block is the magic number. If it is not,
-test for a regex that was compiled on a host of opposite endianness. If this is
-the case, flipped values are put in internal_re and internal_study if there was
-study data too. */
-
-if (re->magic_number != MAGIC_NUMBER)
- {
- re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
- if (re == NULL) return PCRE_ERROR_BADMAGIC;
- if (study != NULL) study = &internal_study;
- }
-
/* Set some local values */
-current_subject = (const unsigned char *)subject + start_offset;
-end_subject = (const unsigned char *)subject + length;
-req_byte_ptr = current_subject - 1;
+current_subject = (const pcre_uchar *)subject + start_offset;
+end_subject = (const pcre_uchar *)subject + length;
+req_char_ptr = current_subject - 1;
-#ifdef SUPPORT_UTF8
-utf8 = (re->options & PCRE_UTF8) != 0;
+#ifdef SUPPORT_UTF
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+utf = (re->options & PCRE_UTF8) != 0;
#else
-utf8 = FALSE;
+utf = FALSE;
#endif
anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
/* The remaining fixed data for passing around. */
-md->start_code = (const uschar *)argument_re +
+md->start_code = (const pcre_uchar *)argument_re +
re->name_table_offset + re->name_count * re->name_entry_size;
-md->start_subject = (const unsigned char *)subject;
+md->start_subject = (const pcre_uchar *)subject;
md->end_subject = end_subject;
md->start_offset = start_offset;
md->moptions = options;
/* Check a UTF string if required. If the check fails and offsetcount is at
least 2, the error offset and the error code are passed back in offsets[0] and
offsets[1] respectively. */
-#ifdef SUPPORT_UTF8
-if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
+#ifdef SUPPORT_UTF
+if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
{
- if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
- return PCRE_ERROR_BADUTF8;
- if (start_offset > 0 && start_offset < length)
+ int erroroffset;
+ int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
+ if (errorcode != 0)
{
- int tb = ((uschar *)subject)[start_offset];
- if (tb > 127)
+ if (offsetcount >= 2)
{
- tb &= 0xc0;
- if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
+ offsets[0] = erroroffset;
+ offsets[1] = errorcode;
}
+ return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
+ PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
}
+ if (start_offset > 0 && start_offset < length &&
+ NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
+ return PCRE_ERROR_BADUTF8_OFFSET;
}
#endif
is a feature that makes it possible to save compiled regex and re-use them
in other programs later. */
-if (md->tables == NULL) md->tables = _pcre_default_tables;
+if (md->tables == NULL) md->tables = PRIV(default_tables);
-/* The lower casing table and the "must be at the start of a line" flag are
-used in a loop when finding where to start. */
+/* The "must be at the start of a line" flags are used in a loop when finding
+where to start. */
-lcc = md->tables + lcc_offset;
startline = (re->flags & PCRE_STARTLINE) != 0;
firstline = (re->options & PCRE_FIRSTLINE) != 0;
{
if ((re->flags & PCRE_FIRSTSET) != 0)
{
- first_byte = re->first_byte & 255;
- if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
- first_byte = lcc[first_byte];
+ has_first_char = TRUE;
+ first_char = first_char2 = (pcre_uchar)(re->first_char);
+ if ((re->flags & PCRE_FCH_CASELESS) != 0)
+ {
+ first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+ if (utf && first_char > 127)
+ first_char2 = UCD_OTHERCASE(first_char);
+#endif
+ }
}
else
{
if ((re->flags & PCRE_REQCHSET) != 0)
{
- req_byte = re->req_byte & 255;
- req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
- req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
+ has_req_char = TRUE;
+ req_char = req_char2 = (pcre_uchar)(re->req_char);
+ if ((re->flags & PCRE_RCH_CASELESS) != 0)
+ {
+ req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+ if (utf && req_char > 127)
+ req_char2 = UCD_OTHERCASE(req_char);
+#endif
+ }
}
/* Call the main matching function, looping for a non-anchored regex after a
if ((options & PCRE_DFA_RESTART) == 0)
{
- const uschar *save_end_subject = end_subject;
+ const pcre_uchar *save_end_subject = end_subject;
/* If firstline is TRUE, the start of the match is constrained to the first
line of a multiline string. Implement this by temporarily adjusting
if (firstline)
{
- USPTR t = current_subject;
-#ifdef SUPPORT_UTF8
- if (utf8)
+ PCRE_PUCHAR t = current_subject;
+#ifdef SUPPORT_UTF
+ if (utf)
{
while (t < md->end_subject && !IS_NEWLINE(t))
{
t++;
- while (t < end_subject && (*t & 0xc0) == 0x80) t++;
+ ACROSSCHAR(t < end_subject, *t, t++);
}
}
else
/* There are some optimizations that avoid running the match if a known
starting point is not found. However, there is an option that disables
- these, for testing and for ensuring that all callouts do actually occur. */
+ these, for testing and for ensuring that all callouts do actually occur.
+ The option can be set in the regex by (*NO_START_OPT) or passed in
+ match-time options. */
- if ((options & PCRE_NO_START_OPTIMIZE) == 0)
+ if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
{
- /* Advance to a known first byte. */
+ /* Advance to a known first char. */
- if (first_byte >= 0)
+ if (has_first_char)
{
- if (first_byte_caseless)
+ if (first_char != first_char2)
while (current_subject < end_subject &&
- lcc[*current_subject] != first_byte)
+ *current_subject != first_char && *current_subject != first_char2)
current_subject++;
else
while (current_subject < end_subject &&
- *current_subject != first_byte)
+ *current_subject != first_char)
current_subject++;
}
{
if (current_subject > md->start_subject + start_offset)
{
-#ifdef SUPPORT_UTF8
- if (utf8)
+#ifdef SUPPORT_UTF
+ if (utf)
{
while (current_subject < end_subject &&
!WAS_NEWLINE(current_subject))
{
current_subject++;
- while(current_subject < end_subject &&
- (*current_subject & 0xc0) == 0x80)
- current_subject++;
+ ACROSSCHAR(current_subject < end_subject, *current_subject,
+ current_subject++);
}
}
else
{
while (current_subject < end_subject)
{
- register unsigned int c = *current_subject;
- if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
- else break;
+ unsigned int c = *current_subject;
+#ifndef COMPILE_PCRE8
+ if (c > 255) c = 255;
+#endif
+ if ((start_bits[c/8] & (1 << (c&7))) == 0)
+ {
+ current_subject++;
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
+ /* This skip is needed only in 8-bit mode. In non 8-bit mode every
+ code unit above 255 is tested as 255, so the scan either stops on
+ the first unit of such a character or does not stop on it at all. */
+ if (utf)
+ ACROSSCHAR(current_subject < end_subject, *current_subject,
+ current_subject++);
+#endif
+ }
+ else break;
}
}
}
disabling is explicitly requested (and of course, by the test above, this
code is not obeyed when restarting after a partial match). */
- if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
+ if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
(options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
{
/* If the pattern was studied, a minimum subject length may be set. This
(pcre_uint32)(end_subject - current_subject) < study->minlength)
return PCRE_ERROR_NOMATCH;
- /* If req_byte is set, we know that that character must appear in the
- subject for the match to succeed. If the first character is set, req_byte
+ /* If req_char is set, we know that that character must appear in the
+ subject for the match to succeed. If the first character is set, req_char
must be later in the subject; otherwise the test starts at the match
point. This optimization can save a huge amount of work in patterns with
nested unlimited repeats that aren't going to match. Writing separate
patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
string... so we don't do this when the string is sufficiently long. */
- if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
+ if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
{
- register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
+ PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
/* We don't need to repeat the search if we haven't yet reached the
place we found it at last time. */
- if (p > req_byte_ptr)
+ if (p > req_char_ptr)
{
- if (req_byte_caseless)
+ if (req_char != req_char2)
{
while (p < end_subject)
{
- register int pp = *p++;
- if (pp == req_byte || pp == req_byte2) { p--; break; }
+ int pp = *p++;
+ if (pp == req_char || pp == req_char2) { p--; break; }
}
}
else
{
while (p < end_subject)
{
- if (*p++ == req_byte) { p--; break; }
+ if (*p++ == req_char) { p--; break; }
}
}
found it, so that we don't search again next time round the loop if
the start hasn't passed this character yet. */
- req_byte_ptr = p;
+ req_char_ptr = p;
}
}
}
/* OK, now we can do the business */
md->start_used_ptr = current_subject;
+ md->recursive = NULL;
rc = internal_dfa_exec(
md, /* fixed match data */
offsetcount, /* size of same */
workspace, /* workspace vector */
wscount, /* size of same */
- re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
- 0, /* function recurse level */
- 0); /* regex recurse level */
+ 0); /* function recurse level */
/* Anything other than "no match" means we are done, always; otherwise, carry
on only if not anchored. */
if (firstline && IS_NEWLINE(current_subject)) break;
current_subject++;
- if (utf8)
+#ifdef SUPPORT_UTF
+ if (utf)
{
- while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
- current_subject++;
+ ACROSSCHAR(current_subject < end_subject, *current_subject,
+ current_subject++);
}
+#endif
if (current_subject > end_subject) break;
/* If we have just passed a CR and we are now at a LF, and the pattern does