/* This is the public header file for the PCRE library, to be #included by
applications that call the PCRE functions.
- Copyright (c) 1997-2009 University of Cambridge
+ Copyright (c) 1997-2010 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
/* The current PCRE version information. */
#define PCRE_MAJOR 8
-#define PCRE_MINOR 02
+#define PCRE_MINOR 12
#define PCRE_PRERELEASE
-#define PCRE_DATE 2010-03-19
+#define PCRE_DATE 2011-01-15
/* When an application links to a PCRE DLL in Windows, the symbols that are
imported have to be identified as such. When building PCRE, the appropriate
#endif
/* Options. Some are compile-time only, some are run-time only, and some are
-both, so we keep them all distinct. */
-
-#define PCRE_CASELESS 0x00000001
-#define PCRE_MULTILINE 0x00000002
-#define PCRE_DOTALL 0x00000004
-#define PCRE_EXTENDED 0x00000008
-#define PCRE_ANCHORED 0x00000010
-#define PCRE_DOLLAR_ENDONLY 0x00000020
-#define PCRE_EXTRA 0x00000040
-#define PCRE_NOTBOL 0x00000080
-#define PCRE_NOTEOL 0x00000100
-#define PCRE_UNGREEDY 0x00000200
-#define PCRE_NOTEMPTY 0x00000400
-#define PCRE_UTF8 0x00000800
-#define PCRE_NO_AUTO_CAPTURE 0x00001000
-#define PCRE_NO_UTF8_CHECK 0x00002000
-#define PCRE_AUTO_CALLOUT 0x00004000
-#define PCRE_PARTIAL_SOFT 0x00008000
+both, so we keep them all distinct. However, almost all the bits in the options
+word are now used. In the long run, we may have to re-use some of the
+compile-time only bits for runtime options, or vice versa. */
+
+#define PCRE_CASELESS 0x00000001 /* Compile */
+#define PCRE_MULTILINE 0x00000002 /* Compile */
+#define PCRE_DOTALL 0x00000004 /* Compile */
+#define PCRE_EXTENDED 0x00000008 /* Compile */
+#define PCRE_ANCHORED 0x00000010 /* Compile, exec, DFA exec */
+#define PCRE_DOLLAR_ENDONLY 0x00000020 /* Compile */
+#define PCRE_EXTRA 0x00000040 /* Compile */
+#define PCRE_NOTBOL 0x00000080 /* Exec, DFA exec */
+#define PCRE_NOTEOL 0x00000100 /* Exec, DFA exec */
+#define PCRE_UNGREEDY 0x00000200 /* Compile */
+#define PCRE_NOTEMPTY 0x00000400 /* Exec, DFA exec */
+#define PCRE_UTF8 0x00000800 /* Compile */
+#define PCRE_NO_AUTO_CAPTURE 0x00001000 /* Compile */
+#define PCRE_NO_UTF8_CHECK 0x00002000 /* Compile, exec, DFA exec */
+#define PCRE_AUTO_CALLOUT 0x00004000 /* Compile */
+#define PCRE_PARTIAL_SOFT 0x00008000 /* Exec, DFA exec */
#define PCRE_PARTIAL 0x00008000 /* Backwards compatible synonym */
-#define PCRE_DFA_SHORTEST 0x00010000
-#define PCRE_DFA_RESTART 0x00020000
-#define PCRE_FIRSTLINE 0x00040000
-#define PCRE_DUPNAMES 0x00080000
-#define PCRE_NEWLINE_CR 0x00100000
-#define PCRE_NEWLINE_LF 0x00200000
-#define PCRE_NEWLINE_CRLF 0x00300000
-#define PCRE_NEWLINE_ANY 0x00400000
-#define PCRE_NEWLINE_ANYCRLF 0x00500000
-#define PCRE_BSR_ANYCRLF 0x00800000
-#define PCRE_BSR_UNICODE 0x01000000
-#define PCRE_JAVASCRIPT_COMPAT 0x02000000
-#define PCRE_NO_START_OPTIMIZE 0x04000000
-#define PCRE_NO_START_OPTIMISE 0x04000000
-#define PCRE_PARTIAL_HARD 0x08000000
-#define PCRE_NOTEMPTY_ATSTART 0x10000000
+#define PCRE_DFA_SHORTEST 0x00010000 /* DFA exec */
+#define PCRE_DFA_RESTART 0x00020000 /* DFA exec */
+#define PCRE_FIRSTLINE 0x00040000 /* Compile */
+#define PCRE_DUPNAMES 0x00080000 /* Compile */
+#define PCRE_NEWLINE_CR 0x00100000 /* Compile, exec, DFA exec */
+#define PCRE_NEWLINE_LF 0x00200000 /* Compile, exec, DFA exec */
+#define PCRE_NEWLINE_CRLF 0x00300000 /* Compile, exec, DFA exec */
+#define PCRE_NEWLINE_ANY 0x00400000 /* Compile, exec, DFA exec */
+#define PCRE_NEWLINE_ANYCRLF 0x00500000 /* Compile, exec, DFA exec */
+#define PCRE_BSR_ANYCRLF 0x00800000 /* Compile, exec, DFA exec */
+#define PCRE_BSR_UNICODE 0x01000000 /* Compile, exec, DFA exec */
+#define PCRE_JAVASCRIPT_COMPAT 0x02000000 /* Compile */
+#define PCRE_NO_START_OPTIMIZE 0x04000000 /* Compile, exec, DFA exec */
+#define PCRE_NO_START_OPTIMISE 0x04000000 /* Synonym */
+#define PCRE_PARTIAL_HARD 0x08000000 /* Exec, DFA exec */
+#define PCRE_NOTEMPTY_ATSTART 0x10000000 /* Exec, DFA exec */
+#define PCRE_UCP 0x20000000 /* Compile */
/* Exec-time and get/set-time error codes */
#define PCRE_ERROR_RECURSIONLIMIT (-21)
#define PCRE_ERROR_NULLWSLIMIT (-22) /* No longer actually used */
#define PCRE_ERROR_BADNEWLINE (-23)
+#define PCRE_ERROR_BADOFFSET (-24)
+#define PCRE_ERROR_SHORTUTF8 (-25)
/* Request types for pcre_fullinfo() */
#define PCRE_EXTRA_CALLOUT_DATA 0x0004
#define PCRE_EXTRA_TABLES 0x0008
#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0x0010
+#define PCRE_EXTRA_MARK 0x0020
/* Types */
void *callout_data; /* Data passed back in callouts */
const unsigned char *tables; /* Pointer to character tables */
unsigned long int match_limit_recursion; /* Max recursive calls to match() */
+ unsigned char **mark; /* For passing back a mark pointer */
} pcre_extra;
/* The structure for passing out data via the pcre_callout_function. We use a
current locale. If PCRE is configured with --enable-rebuild-chartables, this
happens automatically.
-The following #includes are present because without the gcc 4.x may remove the
+The following #includes are present because without them gcc 4.x may remove the
array definition from the final binary if PCRE is built into a static library
and dead code stripping is activated. This leads to link errors. Pulling in the
header ensures that the array gets flagged as "someone outside this compilation
-ESC_H, 0,
0, -ESC_K,
0, 0,
- 0, 0,
+ -ESC_N, 0,
-ESC_P, -ESC_Q,
-ESC_R, -ESC_S,
0, 0,
/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
-/* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
+/* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
platforms. */
typedef struct verbitem {
- int len;
- int op;
+ int len; /* Length of verb name */
+ int op; /* Op when no arg, or -1 if arg mandatory */
+ int op_arg; /* Op when arg present, or -1 if not allowed */
} verbitem;
static const char verbnames[] =
+ "\0" /* Empty name is a shorthand for MARK */
+ STRING_MARK0
STRING_ACCEPT0
STRING_COMMIT0
STRING_F0
STRING_THEN;
static const verbitem verbs[] = {
- { 6, OP_ACCEPT },
- { 6, OP_COMMIT },
- { 1, OP_FAIL },
- { 4, OP_FAIL },
- { 5, OP_PRUNE },
- { 4, OP_SKIP },
- { 4, OP_THEN }
+ { 0, -1, OP_MARK }, /* Empty name (shorthand for MARK): argument mandatory */
+ { 4, -1, OP_MARK }, /* MARK: argument mandatory */
+ { 6, OP_ACCEPT, -1 }, /* ACCEPT: argument not allowed */
+ { 6, OP_COMMIT, -1 }, /* COMMIT: argument not allowed */
+ { 1, OP_FAIL, -1 }, /* F: argument not allowed */
+ { 4, OP_FAIL, -1 }, /* FAIL: argument not allowed */
+ { 5, OP_PRUNE, OP_PRUNE_ARG }, /* PRUNE: argument optional */
+ { 4, OP_SKIP, OP_SKIP_ARG }, /* SKIP: argument optional */
+ { 4, OP_THEN, OP_THEN_ARG } /* THEN: argument optional */
};
static const int verbcount = sizeof(verbs)/sizeof(verbitem);
cbit_xdigit,-1, 0 /* xdigit */
};
+/* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
+substitutes must be in the order of the names, defined above, and there are
+both positive and negative cases. NULL means no substitute.
+
+The substitutes[] table is indexed by (-escape - ESC_DU), so its entries have
+to follow the order of the ESC_DU ... ESC_wu values (presumably the order in
+which they are defined elsewhere - confirm before reordering).
+
+The posix_substitutes[] table holds the positive cases in its first half and
+the negated cases in its second half. A negated class is looked up at
+posix_class + POSIX_SUBSIZE/2, so the two halves must have the same length and
+the same ordering. */
+
+#ifdef SUPPORT_UCP
+static const uschar *substitutes[] = {
+ (uschar *)"\\P{Nd}", /* \D */
+ (uschar *)"\\p{Nd}", /* \d */
+ (uschar *)"\\P{Xsp}", /* \S */ /* NOTE: Xsp is Perl space */
+ (uschar *)"\\p{Xsp}", /* \s */
+ (uschar *)"\\P{Xwd}", /* \W */
+ (uschar *)"\\p{Xwd}" /* \w */
+};
+
+static const uschar *posix_substitutes[] = {
+ (uschar *)"\\p{L}", /* alpha */
+ (uschar *)"\\p{Ll}", /* lower */
+ (uschar *)"\\p{Lu}", /* upper */
+ (uschar *)"\\p{Xan}", /* alnum */
+ NULL, /* ascii */
+ (uschar *)"\\h", /* blank */
+ NULL, /* cntrl */
+ (uschar *)"\\p{Nd}", /* digit */
+ NULL, /* graph */
+ NULL, /* print */
+ NULL, /* punct */
+ (uschar *)"\\p{Xps}", /* space */ /* NOTE: Xps is POSIX space */
+ (uschar *)"\\p{Xwd}", /* word */
+ NULL, /* xdigit */
+ /* Negated cases */
+ (uschar *)"\\P{L}", /* ^alpha */
+ (uschar *)"\\P{Ll}", /* ^lower */
+ (uschar *)"\\P{Lu}", /* ^upper */
+ (uschar *)"\\P{Xan}", /* ^alnum */
+ NULL, /* ^ascii */
+ (uschar *)"\\H", /* ^blank */
+ NULL, /* ^cntrl */
+ (uschar *)"\\P{Nd}", /* ^digit */
+ NULL, /* ^graph */
+ NULL, /* ^print */
+ NULL, /* ^punct */
+ (uschar *)"\\P{Xps}", /* ^space */ /* NOTE: Xps is POSIX space */
+ (uschar *)"\\P{Xwd}", /* ^word */
+ NULL /* ^xdigit */
+};
+#define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
+#endif
#define STRING(a) # a
#define XSTRING(s) STRING(s)
/* 35 */
"invalid condition (?(0)\0"
"\\C not allowed in lookbehind assertion\0"
- "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
+ "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
"number after (?C is > 255\0"
"closing ) for (?C expected\0"
/* 40 */
"inconsistent NEWLINE options\0"
"\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
"a numbered reference must not be zero\0"
- "(*VERB) with an argument is not supported\0"
+ "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
/* 60 */
"(*VERB) not recognized\0"
"number is too big\0"
"digit expected after (?+\0"
"] is an invalid data character in JavaScript compatibility mode\0"
/* 65 */
- "different names for subpatterns of the same number are not allowed\0";
+ "different names for subpatterns of the same number are not allowed\0"
+ "(*MARK) must have an argument\0"
+ "this version of PCRE is not compiled with PCRE_UCP support\0"
+ "\\c must be followed by an ASCII character\0"
+ ;
/* Definition to allow mutual recursion */
case CHAR_l:
case CHAR_L:
- case CHAR_N:
case CHAR_u:
case CHAR_U:
*errorcodeptr = ERR37;
break;
/* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
- This coding is ASCII-specific, but then the whole concept of \cx is
+ An error is given if the byte following \c is not an ASCII character. This
+ coding is ASCII-specific, but then the whole concept of \cx is
ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
case CHAR_c:
*errorcodeptr = ERR2;
break;
}
-
-#ifndef EBCDIC /* ASCII/UTF-8 coding */
+#ifndef EBCDIC /* ASCII/UTF-8 coding */
+ if (c > 127) /* Excludes all non-ASCII in either mode */
+ {
+ *errorcodeptr = ERR68;
+ break;
+ }
if (c >= CHAR_a && c <= CHAR_z) c -= 32;
c ^= 0x40;
-#else /* EBCDIC coding */
+#else /* EBCDIC coding */
if (c >= CHAR_a && c <= CHAR_z) c += 64;
c ^= 0xC0;
#endif
}
}
+/* Perl supports \N{name} for character names, as well as plain \N for "not
+newline". PCRE does not support \N{name}. */
+
+if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
+ *errorcodeptr = ERR37;
+
+/* If PCRE_UCP is set, we change the values for \d etc. */
+
+if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
+ c -= (ESC_DU - ESC_D);
+
+/* Set the pointer to the final character before returning. */
+
*ptrptr = ptr;
return c;
}
start at a parenthesis. It scans along a pattern's text looking for capturing
subpatterns, and counting them. If it finds a named pattern that matches the
name it is given, it returns its number. Alternatively, if the name is NULL, it
-returns when it reaches a given numbered subpattern. We know that if (?P< is
-encountered, the name will be terminated by '>' because that is checked in the
-first pass. Recursion is used to keep track of subpatterns that reset the
-capturing group numbers - the (?| feature.
+returns when it reaches a given numbered subpattern. Recursion is used to keep
+track of subpatterns that reset the capturing group numbers - the (?| feature.
+
+This function was originally called only from the second pass, in which we know
+that if (?< or (?' or (?P< is encountered, the name will be correctly
+terminated because that is checked in the first pass. There is now one call to
+this function in the first pass, to check for a recursive back reference by
+name (so that we can make the whole group atomic). In this case, we need check
+only up to the current position in the pattern, and that is still OK because
+any previous occurrences will have been checked. To make this work, the test
+for "end of pattern" is a check against cd->end_pattern in the main loop,
+instead of looking for a binary zero. This means that the special first-pass
+call can adjust cd->end_pattern temporarily. (Checks for binary zero while
+processing items within the loop are OK, because afterwards the main loop will
+terminate.)
Arguments:
ptrptr address of the current character pointer (updated)
name name to seek, or NULL if seeking a numbered subpattern
lorn name length, or subpattern number if name is NULL
xmode TRUE if we are in /x mode
+ utf8 TRUE if we are in UTF-8 mode
count pointer to the current capturing subpattern number (updated)
Returns: the number of the named subpattern, or -1 if not found
static int
find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
- BOOL xmode, int *count)
+ BOOL xmode, BOOL utf8, int *count)
{
uschar *ptr = *ptrptr;
int start_count = *count;
if (ptr[0] == CHAR_LEFT_PARENTHESIS)
{
- if (ptr[1] == CHAR_QUESTION_MARK &&
- ptr[2] == CHAR_VERTICAL_LINE)
+ /* Handle specials such as (*SKIP) or (*UTF8) etc. */
+
+ if (ptr[1] == CHAR_ASTERISK) ptr += 2;
+
+ /* Handle a normal, unnamed capturing parenthesis. */
+
+ else if (ptr[1] != CHAR_QUESTION_MARK)
+ {
+ *count += 1;
+ if (name == NULL && *count == lorn) return *count;
+ ptr++;
+ }
+
+ /* All cases now have (? at the start. Remember when we are in a group
+ where the parenthesis numbers are duplicated. */
+
+ else if (ptr[2] == CHAR_VERTICAL_LINE)
{
ptr += 3;
dup_parens = TRUE;
}
- /* Handle a normal, unnamed capturing parenthesis */
+ /* Handle comments; all characters are allowed until a ket is reached. */
- else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
+ else if (ptr[2] == CHAR_NUMBER_SIGN)
{
- *count += 1;
- if (name == NULL && *count == lorn) return *count;
- ptr++;
+ for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
+ goto FAIL_EXIT;
}
/* Handle a condition. If it is an assertion, just carry on so that it
is processed as normal. If not, skip to the closing parenthesis of the
- condition (there can't be any nested parens. */
+ condition (there can't be any nested parens). */
else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
{
}
}
- /* We have either (? or (* and not a condition */
+ /* Start with (? but not a condition. */
else
{
}
/* Past any initial parenthesis handling, scan for parentheses or vertical
-bars. */
+bars. Stop if we get to cd->end_pattern. Note that this is important for the
+first-pass call when this value is temporarily adjusted to stop at the current
+position. So DO NOT change this to a test for binary zero. */
-for (; *ptr != 0; ptr++)
+for (; ptr < cd->end_pattern; ptr++)
{
/* Skip over backslashed characters and also entire \Q...\E */
if (xmode && *ptr == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
+ ptr++;
+ while (*ptr != 0)
+ {
+ if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+ }
if (*ptr == 0) goto FAIL_EXIT;
continue;
}
if (*ptr == CHAR_LEFT_PARENTHESIS)
{
- int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
+ int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
if (rc > 0) return rc;
if (*ptr == 0) goto FAIL_EXIT;
}
else if (*ptr == CHAR_RIGHT_PARENTHESIS)
{
if (dup_parens && *count < hwm_count) *count = hwm_count;
- *ptrptr = ptr;
- return -1;
+ goto FAIL_EXIT;
}
else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
name name to seek, or NULL if seeking a numbered subpattern
lorn name length, or subpattern number if name is NULL
xmode TRUE if we are in /x mode
+ utf8 TRUE if we are in UTF-8 mode
Returns: the number of the found subpattern, or -1 if not found
*/
static int
-find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
+find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
+ BOOL utf8)
{
uschar *ptr = (uschar *)cd->start_pattern;
int count = 0;
for (;;)
{
- rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
+ rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
if (rc > 0 || *ptr++ == 0) break;
}
/* Otherwise, we can get the item's length from the table, except that for
repeated character types, we have to test for \p and \P, which have an extra
- two bytes of parameters. */
+ two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
+ must add in its length. */
else
{
case OP_TYPEPOSUPTO:
if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
break;
+
+ case OP_MARK:
+ case OP_PRUNE_ARG:
+ case OP_SKIP_ARG:
+ code += code[1];
+ break;
+
+ case OP_THEN_ARG:
+ code += code[1+LINK_SIZE];
+ break;
}
/* Add in the fixed length from the table */
/* Otherwise, we can get the item's length from the table, except that for
repeated character types, we have to test for \p and \P, which have an extra
- two bytes of parameters. */
+ two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
+ must add in its length. */
else
{
case OP_TYPEEXACT:
if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
break;
+
+ case OP_MARK:
+ case OP_PRUNE_ARG:
+ case OP_SKIP_ARG:
+ code += code[1];
+ break;
+
+ case OP_THEN_ARG:
+ code += code[1+LINK_SIZE];
+ break;
}
/* Add in the fixed length from the table */
break;
#endif
+ /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
+ string. */
+
+ case OP_MARK:
+ case OP_PRUNE_ARG:
+ case OP_SKIP_ARG:
+ code += code[1];
+ break;
+
+ case OP_THEN_ARG:
+ code += code[1+LINK_SIZE];
+ break;
+
/* None of the remaining opcodes are required to match a character. */
default:
{
*code++ = OP_CALLOUT;
*code++ = 255;
-PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
-PUT(code, LINK_SIZE, 0); /* Default length */
+PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
+PUT(code, LINK_SIZE, 0); /* Default length */
return code + 2*LINK_SIZE;
}
static void
complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd) /* Fill in the length field of an earlier callout item */
{
-int length = ptr - cd->start_pattern - GET(previous_callout, 2);
+int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2)); /* Offset of ptr in the pattern, less the pattern offset held at byte 2 of the item */
PUT(previous_callout, 2 + LINK_SIZE, length); /* Overwrites the field at 2 + LINK_SIZE, which auto_callout() sets to a default of 0 */
}
return TRUE;
}
+
+
+
+/*************************************************
+* Check a character and a property *
+*************************************************/
+
+/* This function is called by check_auto_possessive() when a property item
+is adjacent to a fixed character. It works out whether the character has the
+given property and compares that result with "negated". For a positive
+property (negated is FALSE) the answer is TRUE only when the character does
+NOT have the property; for a negated property (negated is TRUE) the answer is
+TRUE only when the character does have it. Either way, TRUE means that the
+character cannot match the property item. An unrecognized property type
+yields FALSE.
+
+Arguments:
+ c the character
+ ptype the property type
+ pdata the data for the type
+ negated TRUE if it's a negated property (\P or \p{^)
+
+Returns: TRUE if auto-possessifying is OK
+*/
+
+static BOOL
+check_char_prop(int c, int ptype, int pdata, BOOL negated)
+{
+int chartype = UCD_CHARTYPE(c); /* Looked up once; used by every case except PT_SC */
+switch(ptype)
+ {
+ case PT_LAMP: /* Character type is one of Lu, Ll, Lt */
+ return (chartype == ucp_Lu ||
+ chartype == ucp_Ll ||
+ chartype == ucp_Lt) == negated;
+
+ case PT_GC: /* pdata is compared with the gentype entry for chartype */
+ return (pdata == _pcre_ucp_gentype[chartype]) == negated;
+
+ case PT_PC: /* pdata is compared with chartype itself */
+ return (pdata == chartype) == negated;
+
+ case PT_SC: /* pdata is compared with the script of c */
+ return (pdata == UCD_SCRIPT(c)) == negated;
+
+ /* These are specials */
+
+ case PT_ALNUM: /* Gentype L or N */
+ return (_pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N) == negated;
+
+ case PT_SPACE: /* Perl space: gentype Z, or HT, NL, FF, CR (VT is not included) */
+ return (_pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
+ == negated;
+
+ case PT_PXSPACE: /* POSIX space: the same as Perl space, plus VT */
+ return (_pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR)
+ == negated;
+
+ case PT_WORD: /* Gentype L or N, or an underscore */
+ return (_pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N ||
+ c == CHAR_UNDERSCORE) == negated;
+ }
+return FALSE; /* Unknown property type: do not auto-possessify */
+}
#endif /* SUPPORT_UCP */
sense to automatically possessify the repeated item.
Arguments:
- op_code the repeated op code
- this data for this item, depends on the opcode
+ previous pointer to the repeated opcode
utf8 TRUE in UTF-8 mode
- utf8_char used for utf8 character bytes, NULL if not relevant
ptr next character in pattern
options options bits
cd contains pointers to tables etc.
*/
static BOOL
-check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
- const uschar *ptr, int options, compile_data *cd)
+check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
+ int options, compile_data *cd)
{
-int next;
+int c, next;
+int op_code = *previous++;
/* Skip whitespace and comments in extended mode */
while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
if (*ptr == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0)
+ ptr++;
+ while (*ptr != 0)
+ {
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+ }
}
else break;
}
while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
if (*ptr == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0)
+ ptr++;
+ while (*ptr != 0)
+ {
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+ }
}
else break;
}
strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
return FALSE;
-/* Now compare the next item with the previous opcode. If the previous is a
-positive single character match, "item" either contains the character or, if
-"item" is greater than 127 in utf8 mode, the character's bytes are in
-utf8_char. */
-
-
-/* Handle cases when the next item is a character. */
+/* Now compare the next item with the previous opcode. First, handle cases when
+the next item is a character. */
if (next >= 0) switch(op_code)
{
case OP_CHAR:
#ifdef SUPPORT_UTF8
- if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
+ GETCHARTEST(c, previous);
#else
- (void)(utf8_char); /* Keep compiler happy by referencing function argument */
+ c = *previous;
#endif
- return item != next;
+ return c != next;
/* For CHARNC (caseless character) we must check the other case. If we have
Unicode property support, we can use it to test the other case of
case OP_CHARNC:
#ifdef SUPPORT_UTF8
- if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
+ GETCHARTEST(c, previous);
+#else
+ c = *previous;
#endif
- if (item == next) return FALSE;
+ if (c == next) return FALSE;
#ifdef SUPPORT_UTF8
if (utf8)
{
#else
othercase = NOTACHAR;
#endif
- return (unsigned int)item != othercase;
+ return (unsigned int)c != othercase;
}
else
#endif /* SUPPORT_UTF8 */
- return (item != cd->fcc[next]); /* Non-UTF-8 mode */
+ return (c != cd->fcc[next]); /* Non-UTF-8 mode */
- /* For OP_NOT, "item" must be a single-byte character. */
+ /* For OP_NOT, its data is always a single-byte character. */
case OP_NOT:
- if (item == next) return TRUE;
+ if ((c = *previous) == next) return TRUE;
if ((options & PCRE_CASELESS) == 0) return FALSE;
#ifdef SUPPORT_UTF8
if (utf8)
#else
othercase = NOTACHAR;
#endif
- return (unsigned int)item == othercase;
+ return (unsigned int)c == othercase;
}
else
#endif /* SUPPORT_UTF8 */
- return (item == cd->fcc[next]); /* Non-UTF-8 mode */
+ return (c == cd->fcc[next]); /* Non-UTF-8 mode */
+
+ /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
+ When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
case OP_DIGIT:
return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
case 0x202f:
case 0x205f:
case 0x3000:
- return op_code != OP_HSPACE;
+ return op_code == OP_NOT_HSPACE;
default:
- return op_code == OP_HSPACE;
+ return op_code != OP_NOT_HSPACE;
}
+ case OP_ANYNL:
case OP_VSPACE:
case OP_NOT_VSPACE:
switch(next)
case 0x85:
case 0x2028:
case 0x2029:
- return op_code != OP_VSPACE;
+ return op_code == OP_NOT_VSPACE;
default:
- return op_code == OP_VSPACE;
+ return op_code != OP_NOT_VSPACE;
}
+#ifdef SUPPORT_UCP
+ case OP_PROP:
+ return check_char_prop(next, previous[0], previous[1], FALSE);
+
+ case OP_NOTPROP:
+ return check_char_prop(next, previous[0], previous[1], TRUE);
+#endif
+
default:
return FALSE;
}
-/* Handle the case when the next item is \d, \s, etc. */
+/* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
+is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
+generated only when PCRE_UCP is *not* set, that is, when only ASCII
+characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
+replaced by OP_PROP codes when PCRE_UCP is set. */
switch(op_code)
{
case OP_CHAR:
case OP_CHARNC:
#ifdef SUPPORT_UTF8
- if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
+ GETCHARTEST(c, previous);
+#else
+ c = *previous;
#endif
switch(-next)
{
case ESC_d:
- return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
+ return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
case ESC_D:
- return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
+ return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
case ESC_s:
- return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
+ return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
case ESC_S:
- return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
+ return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
case ESC_w:
- return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
+ return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
case ESC_W:
- return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
+ return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
case ESC_h:
case ESC_H:
- switch(item)
+ switch(c)
{
case 0x09:
case 0x20:
case ESC_v:
case ESC_V:
- switch(item)
+ switch(c)
{
case 0x0a:
case 0x0b:
return -next == ESC_v;
}
+ /* When PCRE_UCP is set, these values get generated for \d etc. Find
+ their substitutions and process them. The result will always be either
+ -ESC_p or -ESC_P. Then fall through to process those values. */
+
+#ifdef SUPPORT_UCP
+ case ESC_du:
+ case ESC_DU:
+ case ESC_wu:
+ case ESC_WU:
+ case ESC_su:
+ case ESC_SU:
+ {
+ int temperrorcode = 0;
+ ptr = substitutes[-next - ESC_DU];
+ next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
+ if (temperrorcode != 0) return FALSE;
+ ptr++; /* For compatibility */
+ }
+ /* Fall through */
+
+ case ESC_p:
+ case ESC_P:
+ {
+ int ptype, pdata, errorcodeptr;
+ BOOL negated;
+
+ ptr--; /* Make ptr point at the p or P */
+ ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
+ if (ptype < 0) return FALSE;
+ ptr++; /* Point past the final curly ket */
+
+ /* If the property item is optional, we have to give up. (When generated
+ from \d etc by PCRE_UCP, this test will have been applied much earlier,
+ to the original \d etc. At this point, ptr will point to a zero byte.) */
+
+ if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
+ strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
+ return FALSE;
+
+ /* Do the property check. */
+
+ return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
+ }
+#endif
+
default:
return FALSE;
}
+ /* In principle, support for Unicode properties should be integrated here as
+ well. It means re-organizing the above code so as to get hold of the property
+ values before switching on the op-code. However, I wonder how many patterns
+ combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
+ these op-codes are never generated.) */
+
case OP_DIGIT:
return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
- next == -ESC_h || next == -ESC_v;
+ next == -ESC_h || next == -ESC_v || next == -ESC_R;
case OP_NOT_DIGIT:
return next == -ESC_d;
case OP_WHITESPACE:
- return next == -ESC_S || next == -ESC_d || next == -ESC_w;
+ return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
case OP_NOT_WHITESPACE:
return next == -ESC_s || next == -ESC_h || next == -ESC_v;
case OP_HSPACE:
- return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
+ return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
+ next == -ESC_w || next == -ESC_v || next == -ESC_R;
case OP_NOT_HSPACE:
return next == -ESC_h;
/* Can't have \S in here because VT matches \S (Perl anomaly) */
+ case OP_ANYNL:
case OP_VSPACE:
return next == -ESC_V || next == -ESC_d || next == -ESC_w;
case OP_NOT_VSPACE:
- return next == -ESC_v;
+ return next == -ESC_v || next == -ESC_R;
case OP_WORDCHAR:
- return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
+ return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
+ next == -ESC_v || next == -ESC_R;
case OP_NOT_WORDCHAR:
return next == -ESC_w || next == -ESC_d;
BOOL groupsetfirstbyte = FALSE;
const uschar *ptr = *ptrptr;
const uschar *tempptr;
+const uschar *nestptr = NULL;
uschar *previous = NULL;
uschar *previous_callout = NULL;
uschar *save_hwm = NULL;
c = *ptr;
+ /* If we are at the end of a nested substitution, revert to the outer level
+ string. Nesting only happens one level deep. */
+
+ if (c == 0 && nestptr != NULL)
+ {
+ ptr = nestptr;
+ nestptr = NULL;
+ c = *ptr;
+ }
+
/* If we are in the pre-compile phase, accumulate the length used for the
previous cycle of this loop. */
goto FAILED;
}
- *lengthptr += code - last_code;
+ *lengthptr += (int)(code - last_code);
DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
/* If "previous" is set and it is not at the start of the work space, move
if ((cd->ctypes[c] & ctype_space) != 0) continue;
if (c == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0)
+ ptr++;
+ while (*ptr != 0)
{
if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
}
if (*ptr != 0) continue;
*errorcodeptr = ERR20;
goto FAILED;
}
- *lengthptr += code - last_code; /* To include callout length */
+ *lengthptr += (int)(code - last_code); /* To include callout length */
DPRINTF((">> end branch\n"));
}
return TRUE;
ptr++;
}
- posix_class = check_posix_name(ptr, tempptr - ptr);
+ posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
if (posix_class < 0)
{
*errorcodeptr = ERR30;
if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
posix_class = 0;
- /* We build the bit map for the POSIX class in a chunk of local store
- because we may be adding and subtracting from it, and we don't want to
- subtract bits that may be in the main map already. At the end we or the
- result into the bit map that is being built. */
+ /* When PCRE_UCP is set, some of the POSIX classes are converted to
+ different escape sequences that use Unicode properties. */
+
+#ifdef SUPPORT_UCP
+ if ((options & PCRE_UCP) != 0)
+ {
+ int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
+ if (posix_substitutes[pc] != NULL)
+ {
+ nestptr = tempptr + 1;
+ ptr = posix_substitutes[pc] - 1;
+ continue;
+ }
+ }
+#endif
+ /* In the non-UCP case, we build the bit map for the POSIX class in a
+ chunk of local store because we may be adding and subtracting from it,
+ and we don't want to subtract bits that may be in the main map already.
+ At the end we or the result into the bit map that is being built. */
posix_class *= 3;
/* Backslash may introduce a single character, or it may introduce one
of the specials, which just set a flag. The sequence \b is a special
- case. Inside a class (and only there) it is treated as backspace.
- Elsewhere it marks a word boundary. Other escapes have preset maps ready
- to 'or' into the one we are building. We assume they have more than one
- character in them, so set class_charcount bigger than one. */
+ case. Inside a class (and only there) it is treated as backspace. We
+ assume that other escapes have more than one character in them, so set
+ class_charcount bigger than one. Unrecognized escapes fall through and
+ are either treated as literal characters (by default), or are faulted if
+ PCRE_EXTRA is set. */
if (c == CHAR_BACKSLASH)
{
c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
if (*errorcodeptr != 0) goto FAILED;
- if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
- else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
- else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
+ if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
else if (-c == ESC_Q) /* Handle start of quoted string */
{
if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
register const uschar *cbits = cd->cbits;
class_charcount += 2; /* Greater than 1 is what matters */
- /* Save time by not doing this in the pre-compile phase. */
-
- if (lengthptr == NULL) switch (-c)
+ switch (-c)
{
+#ifdef SUPPORT_UCP
+ case ESC_du: /* These are the values given for \d etc */
+ case ESC_DU: /* when PCRE_UCP is set. We replace the */
+ case ESC_wu: /* escape sequence with an appropriate \p */
+ case ESC_WU: /* or \P to test Unicode properties instead */
+ case ESC_su: /* of the default ASCII testing. */
+ case ESC_SU:
+ nestptr = ptr;
+ ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
+ class_charcount -= 2; /* Undo! */
+ continue;
+#endif
case ESC_d:
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
continue;
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
continue;
+ /* Perl 5.004 onwards omits VT from \s, but we must preserve it
+ if it was previously set by something earlier in the character
+ class. */
+
case ESC_s:
- for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
- classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
+ classbits[0] |= cbits[cbit_space];
+ classbits[1] |= cbits[cbit_space+1] & ~0x08;
+ for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
continue;
case ESC_S:
classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
continue;
- default: /* Not recognized; fall through */
- break; /* Need "default" setting to stop compiler warning. */
- }
-
- /* In the pre-compile phase, just do the recognition. */
-
- else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
- c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
-
- /* We need to deal with \H, \h, \V, and \v in both phases because
- they use extra memory. */
-
- if (-c == ESC_h)
- {
+ case ESC_h:
SETBIT(classbits, 0x09); /* HT */
SETBIT(classbits, 0x20); /* SPACE */
SETBIT(classbits, 0xa0); /* NBSP */
}
#endif
continue;
- }
- if (-c == ESC_H)
- {
+ case ESC_H:
for (c = 0; c < 32; c++)
{
int x = 0xff;
}
#endif
continue;
- }
- if (-c == ESC_v)
- {
+ case ESC_v:
SETBIT(classbits, 0x0a); /* LF */
SETBIT(classbits, 0x0b); /* VT */
SETBIT(classbits, 0x0c); /* FF */
}
#endif
continue;
- }
- if (-c == ESC_V)
- {
+ case ESC_V:
for (c = 0; c < 32; c++)
{
int x = 0xff;
}
#endif
continue;
- }
-
- /* We need to deal with \P and \p in both phases. */
#ifdef SUPPORT_UCP
- if (-c == ESC_p || -c == ESC_P)
- {
- BOOL negated;
- int pdata;
- int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
- if (ptype < 0) goto FAILED;
- class_utf8 = TRUE;
- *class_utf8data++ = ((-c == ESC_p) != negated)?
- XCL_PROP : XCL_NOTPROP;
- *class_utf8data++ = ptype;
- *class_utf8data++ = pdata;
- class_charcount -= 2; /* Not a < 256 character */
- continue;
- }
+ case ESC_p:
+ case ESC_P:
+ {
+ BOOL negated;
+ int pdata;
+ int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
+ if (ptype < 0) goto FAILED;
+ class_utf8 = TRUE;
+ *class_utf8data++ = ((-c == ESC_p) != negated)?
+ XCL_PROP : XCL_NOTPROP;
+ *class_utf8data++ = ptype;
+ *class_utf8data++ = pdata;
+ class_charcount -= 2; /* Not a < 256 character */
+ continue;
+ }
#endif
- /* Unrecognized escapes are faulted if PCRE is running in its
- strict mode. By default, for compatibility with Perl, they are
- treated as literals. */
+ /* Unrecognized escapes are faulted if PCRE is running in its
+ strict mode. By default, for compatibility with Perl, they are
+ treated as literals. */
- if ((options & PCRE_EXTRA) != 0)
- {
- *errorcodeptr = ERR7;
- goto FAILED;
+ default:
+ if ((options & PCRE_EXTRA) != 0)
+ {
+ *errorcodeptr = ERR7;
+ goto FAILED;
+ }
+ class_charcount -= 2; /* Undo the default count from above */
+ c = *ptr; /* Get the final character and fall through */
+ break;
}
-
- class_charcount -= 2; /* Undo the default count from above */
- c = *ptr; /* Get the final character and fall through */
}
/* Fall through if we have a single character (c >= 0). This may be
d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
if (*errorcodeptr != 0) goto FAILED;
- /* \b is backspace; \X is literal X; \R is literal R; any other
- special means the '-' was literal */
+ /* \b is backspace; any other special means the '-' was literal */
if (d < 0)
{
- if (d == -ESC_b) d = CHAR_BS;
- else if (d == -ESC_X) d = CHAR_X;
- else if (d == -ESC_R) d = CHAR_R; else
+ if (d == -ESC_b) d = CHAR_BS; else
{
ptr = oldptr;
goto LONE_SINGLE_CHARACTER; /* A few lines below */
}
}
- /* Loop until ']' reached. This "while" is the end of the "do" above. */
+ /* Loop until ']' reached. This "while" is the end of the "do" far above.
+ If we are at the end of an internal nested string, revert to the outer
+ string. */
- while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
+ while (((c = *(++ptr)) != 0 ||
+ (nestptr != NULL &&
+ (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
+ (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
- if (c == 0) /* Missing terminating ']' */
+ /* Check for missing terminating ']' */
+
+ if (c == 0)
{
*errorcodeptr = ERR6;
goto FAILED;
}
-
-/* This code has been disabled because it would mean that \s counts as
-an explicit \r or \n reference, and that's not really what is wanted. Now
-we set the flag only if there is a literal "\r" or "\n" in the class. */
-
-#if 0
- /* Remember whether \r or \n are in this class */
-
- if (negate_class)
- {
- if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
- }
- else
- {
- if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
- }
-#endif
-
-
/* If class_charcount is 1, we saw precisely one character whose value is
less than 256. As long as there were no characters >= 128 and there was no
use of \p or \P, in other words, no use of any XCLASS features, we can
/* If there are characters with values > 255, we have to compile an
extended class, with its own opcode, unless there was a negated special
- such as \S in the class, because in that case all characters > 255 are in
- the class, so any that were explicitly given as well can be ignored. If
- (when there are explicit characters > 255 that must be listed) there are no
- characters < 256, we can omit the bitmap in the actual compiled code. */
+ such as \S in the class, and PCRE_UCP is not set, because in that case all
+ characters > 255 are in the class, so any that were explicitly given as
+ well can be ignored. If (when there are explicit characters > 255 that must
+ be listed) there are no characters < 256, we can omit the bitmap in the
+ actual compiled code. */
#ifdef SUPPORT_UTF8
- if (class_utf8 && !should_flip_negation)
+ if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))
{
*class_utf8data++ = XCL_END; /* Marks the end of extra data */
*code++ = OP_XCLASS;
}
#endif
- /* If there are no characters > 255, set the opcode to OP_CLASS or
- OP_NCLASS, depending on whether the whole class was negated and whether
- there were negative specials such as \S in the class. Then copy the 32-byte
- map into the code vector, negating it if necessary. */
+ /* If there are no characters > 255, or they are all to be included or
+ excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
+ whole class was negated and whether there were negative specials such as \S
+ (non-UCP) in the class. Then copy the 32-byte map into the code vector,
+ negating it if necessary. */
*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
if (negate_class)
if (!possessive_quantifier &&
repeat_max < 0 &&
- check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
- options, cd))
+ check_auto_possessive(previous, utf8, ptr + 1, options, cd))
{
repeat_type = 0; /* Force greedy */
possessive_quantifier = TRUE;
c = previous[1];
if (!possessive_quantifier &&
repeat_max < 0 &&
- check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
+ check_auto_possessive(previous, utf8, ptr + 1, options, cd))
{
repeat_type = 0; /* Force greedy */
possessive_quantifier = TRUE;
if (!possessive_quantifier &&
repeat_max < 0 &&
- check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
+ check_auto_possessive(previous, utf8, ptr + 1, options, cd))
{
repeat_type = 0; /* Force greedy */
possessive_quantifier = TRUE;
{
register int i;
int ketoffset = 0;
- int len = code - previous;
+ int len = (int)(code - previous);
uschar *bralink = NULL;
/* Repeating a DEFINE group is pointless */
{
register uschar *ket = previous;
do ket += GET(ket, 1); while (*ket != OP_KET);
- ketoffset = code - ket;
+ ketoffset = (int)(code - ket);
}
/* The case of a zero minimum is special because of the need to stick
/* We chain together the bracket offset fields that have to be
filled in later when the ends of the brackets are reached. */
- offset = (bralink == NULL)? 0 : previous - bralink;
+ offset = (bralink == NULL)? 0 : (int)(previous - bralink);
bralink = previous;
PUTINC(previous, 0, offset);
}
{
int offset;
*code++ = OP_BRA;
- offset = (bralink == NULL)? 0 : code - bralink;
+ offset = (bralink == NULL)? 0 : (int)(code - bralink);
bralink = code;
PUTINC(code, 0, offset);
}
while (bralink != NULL)
{
int oldlinkoffset;
- int offset = code - bralink + 1;
+ int offset = (int)(code - bralink + 1);
uschar *bra = code - offset;
oldlinkoffset = GET(bra, 1);
bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
#endif
}
- len = code - tempcode;
+ len = (int)(code - tempcode);
if (len > 0) switch (*tempcode)
{
case OP_STAR: *tempcode = OP_POSSTAR; break;
/* First deal with various "verbs" that can be introduced by '*'. */
- if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
+ if (*(++ptr) == CHAR_ASTERISK &&
+ ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))
{
int i, namelen;
+ int arglen = 0;
const char *vn = verbnames;
- const uschar *name = ++ptr;
+ const uschar *name = ptr + 1;
+ const uschar *arg = NULL;
previous = NULL;
while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
+ namelen = (int)(ptr - name);
+
if (*ptr == CHAR_COLON)
{
- *errorcodeptr = ERR59; /* Not supported */
- goto FAILED;
+ arg = ++ptr;
+ while ((cd->ctypes[*ptr] & (ctype_letter|ctype_digit)) != 0
+ || *ptr == '_') ptr++;
+ arglen = (int)(ptr - arg);
}
+
if (*ptr != CHAR_RIGHT_PARENTHESIS)
{
*errorcodeptr = ERR60;
goto FAILED;
}
- namelen = ptr - name;
+
+ /* Scan the table of verb names */
+
for (i = 0; i < verbcount; i++)
{
if (namelen == verbs[i].len &&
PUT2INC(code, 0, oc->number);
}
}
- *code++ = verbs[i].op;
- break;
+
+ /* Handle the cases with/without an argument */
+
+ if (arglen == 0)
+ {
+ if (verbs[i].op < 0) /* Argument is mandatory */
+ {
+ *errorcodeptr = ERR66;
+ goto FAILED;
+ }
+ *code = verbs[i].op;
+ if (*code++ == OP_THEN)
+ {
+ PUT(code, 0, code - bcptr->current_branch - 1);
+ code += LINK_SIZE;
+ }
+ }
+
+ else
+ {
+ if (verbs[i].op_arg < 0) /* Argument is forbidden */
+ {
+ *errorcodeptr = ERR59;
+ goto FAILED;
+ }
+ *code = verbs[i].op_arg;
+ if (*code++ == OP_THEN_ARG)
+ {
+ PUT(code, 0, code - bcptr->current_branch - 1);
+ code += LINK_SIZE;
+ }
+ *code++ = arglen;
+ memcpy(code, arg, arglen);
+ code += arglen;
+ *code++ = 0;
+ }
+
+ break; /* Found verb, exit loop */
}
+
vn += verbs[i].len + 1;
}
- if (i < verbcount) continue;
- *errorcodeptr = ERR60;
+
+ if (i < verbcount) continue; /* Successfully handled a verb */
+ *errorcodeptr = ERR60; /* Verb not recognized */
goto FAILED;
}
recno * 10 + *ptr - CHAR_0 : -1;
ptr++;
}
- namelen = ptr - name;
+ namelen = (int)(ptr - name);
if ((terminator > 0 && *ptr++ != terminator) ||
*ptr++ != CHAR_RIGHT_PARENTHESIS)
/* Search the pattern for a forward reference */
else if ((i = find_parens(cd, name, namelen,
- (options & PCRE_EXTENDED) != 0)) > 0)
+ (options & PCRE_EXTENDED) != 0, utf8)) > 0)
{
PUT2(code, 2+LINK_SIZE, i);
code[1+LINK_SIZE]++;
goto FAILED;
}
*code++ = n;
- PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
- PUT(code, LINK_SIZE, 0); /* Default length */
+ PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
+ PUT(code, LINK_SIZE, 0); /* Default length */
code += 2 * LINK_SIZE;
}
previous = NULL;
name = ++ptr;
while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
- namelen = ptr - name;
+ namelen = (int)(ptr - name);
/* In the pre-compile phase, just do a syntax check. */
NAMED_REF_OR_RECURSE:
name = ++ptr;
while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
- namelen = ptr - name;
+ namelen = (int)(ptr - name);
- /* In the pre-compile phase, do a syntax check and set a dummy
- reference number. */
+ /* In the pre-compile phase, do a syntax check. We used to just set
+ a dummy reference number, because it was not used in the first pass.
+ However, with the change of recursive back references to be atomic,
+ we have to look for the number so that this state can be identified, as
+ otherwise the incorrect length is computed. If it's not a backwards
+ reference, the dummy number will do. */
if (lengthptr != NULL)
{
+ const uschar *temp;
+
if (namelen == 0)
{
*errorcodeptr = ERR62;
*errorcodeptr = ERR48;
goto FAILED;
}
- recno = 0;
+
+ /* The name table does not exist in the first pass, so we cannot
+ do a simple search as in the code below. Instead, we have to scan the
+ pattern to find the number. It is important that we scan it only as
+ far as we have got because the syntax of named subpatterns has not
+ been checked for the rest of the pattern, and find_parens() assumes
+ correct syntax. In any case, it's a waste of resources to scan
+ further. We stop the scan at the current point by temporarily
+ adjusting the value of cd->end_pattern. */
+
+ temp = cd->end_pattern;
+ cd->end_pattern = ptr;
+ recno = find_parens(cd, name, namelen,
+ (options & PCRE_EXTENDED) != 0, utf8);
+ cd->end_pattern = temp;
+ if (recno < 0) recno = 0; /* Forward ref; set dummy number */
}
/* In the real compile, seek the name in the table. We check the name
}
else if ((recno = /* Forward back reference */
find_parens(cd, name, namelen,
- (options & PCRE_EXTENDED) != 0)) <= 0)
+ (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
{
*errorcodeptr = ERR15;
goto FAILED;
if (called == NULL)
{
if (find_parens(cd, NULL, recno,
- (options & PCRE_EXTENDED) != 0) < 0)
+ (options & PCRE_EXTENDED) != 0, utf8) < 0)
{
*errorcodeptr = ERR15;
goto FAILED;
of the group. */
called = cd->start_code + recno;
- PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
+ PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code));
}
/* If not a forward reference, and the subpattern is still open,
code += 1 + LINK_SIZE;
*code = OP_RECURSE;
- PUT(code, 1, called - cd->start_code);
+ PUT(code, 1, (int)(called - cd->start_code));
code += 1 + LINK_SIZE;
*code = OP_KET;
} /* End of switch for character following (? */
} /* End of (? handling */
- /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
- all unadorned brackets become non-capturing and behave like (?:...)
+ /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
+ is set, all unadorned brackets become non-capturing and behave like (?:...)
brackets. */
else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
/* ===================================================================*/
/* Handle metasequences introduced by \. For ones like \d, the ESC_ values
- are arranged to be the negation of the corresponding OP_values. For the
- back references, the values are ESC_REF plus the reference number. Only
- back references and those types that consume a character may be repeated.
- We can test for values between ESC_b and ESC_Z for the latter; this may
- have to change if any new ones are ever created. */
+ are arranged to be the negation of the corresponding OP_values in the
+ default case when PCRE_UCP is not set. For the back references, the values
+ are ESC_REF plus the reference number. Only back references and those types
+ that consume a character may be repeated. We can test for values between
+ ESC_b and ESC_Z for the latter; this may have to change if any new ones are
+ ever created. */
case CHAR_BACKSLASH:
tempptr = ptr;
#endif
/* For the rest (including \X when Unicode properties are supported), we
- can obtain the OP value by negating the escape value. */
+ can obtain the OP value by negating the escape value in the default
+ situation when PCRE_UCP is not set. When it *is* set, we substitute
+ Unicode property tests. */
else
{
- previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
- *code++ = -c;
+#ifdef SUPPORT_UCP
+ if (-c >= ESC_DU && -c <= ESC_wu)
+ {
+ nestptr = ptr + 1; /* Where to resume */
+ ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
+ }
+ else
+#endif
+ {
+ previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
+ *code++ = -c;
+ }
}
continue;
}
{
if (lengthptr == NULL)
{
- int branch_length = code - last_branch;
+ int branch_length = (int)(code - last_branch);
do
{
int prev_length = GET(last_branch, 1);
/* Fill in the ket */
*code = OP_KET;
- PUT(code, 1, code - start_bracket);
+ PUT(code, 1, (int)(code - start_bracket));
code += 1 + LINK_SIZE;
/* If it was a capturing subpattern, check to see if it contained any
code - start_bracket);
*start_bracket = OP_ONCE;
code += 1 + LINK_SIZE;
- PUT(start_bracket, 1, code - start_bracket);
+ PUT(start_bracket, 1, (int)(code - start_bracket));
*code = OP_KET;
- PUT(code, 1, code - start_bracket);
+ PUT(code, 1, (int)(code - start_bracket));
code += 1 + LINK_SIZE;
length += 2 + 2*LINK_SIZE;
}
else
{
*code = OP_ALT;
- PUT(code, 1, code - last_branch);
+ PUT(code, 1, (int)(code - last_branch));
bc.current_branch = last_branch = code;
code += 1 + LINK_SIZE;
}
with errorptr and erroroffset set
*/
-#ifdef NOT_USED_IN_GLIB
-
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile(const char *pattern, int options, const char **errorptr,
int *erroroffset, const unsigned char *tables)
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}
-#endif
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile2(const char *pattern, int options, int *errorcodeptr,
int firstbyte, reqbyte, newline;
int errorcode = 0;
int skipatstart = 0;
-BOOL utf8 = (options & PCRE_UTF8) != 0;
+BOOL utf8;
size_t size;
uschar *code;
const uschar *codestart;
if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
{ skipatstart += 7; options |= PCRE_UTF8; continue; }
+ else if (strncmp((char *)(ptr+skipatstart+2), STRING_UCP_RIGHTPAR, 4) == 0)
+ { skipatstart += 6; options |= PCRE_UCP; continue; }
+ else if (strncmp((char *)(ptr+skipatstart+2), STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
+ { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
{ skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
else break;
}
+utf8 = (options & PCRE_UTF8) != 0;
+
/* Can't support UTF8 unless PCRE has been compiled to include the code. */
#ifdef SUPPORT_UTF8
}
#endif
+/* Can't support UCP unless PCRE has been compiled to include the code. */
+
+#ifndef SUPPORT_UCP
+if ((options & PCRE_UCP) != 0)
+ {
+ errorcode = ERR67;
+ goto PCRE_EARLY_ERROR_RETURN;
+ }
+#endif
+
/* Check validity of \R options. */
switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
pointers. */
re->magic_number = MAGIC_NUMBER;
-re->size = size;
+re->size = (int)size;
re->options = cd->external_options;
re->flags = cd->external_flags;
re->dummy1 = 0;
recno = GET(codestart, offset);
groupptr = _pcre_find_bracket(codestart, utf8, recno);
if (groupptr == NULL) errorcode = ERR53;
- else PUT(((uschar *)codestart), offset, groupptr - codestart);
+ else PUT(((uschar *)codestart), offset, (int)(groupptr - codestart));
}
/* Give an error if there's back reference to a non-existent capturing
{
(pcre_free)(re);
PCRE_EARLY_ERROR_RETURN:
- *erroroffset = ptr - (const uschar *)pattern;
+ *erroroffset = (int)(ptr - (const uschar *)pattern);
PCRE_EARLY_ERROR_RETURN2:
*errorptr = find_error_text(errorcode);
if (errorcodeptr != NULL) *errorcodeptr = errorcode;
/* This table identifies those opcodes that are followed immediately by a
-character that is to be tested in some way. This makes is possible to
+character that is to be tested in some way. This makes it possible to
centralize the loading of these characters. In the case of Type * etc, the
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
small value. Non-zero values in the table are the offsets from the opcode where
0, 0, /* RREF, NRREF */
0, /* DEF */
0, 0, /* BRAZERO, BRAMINZERO */
- 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
- 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
+ 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
+ 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
+ 0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
};
/* This table identifies those opcodes that inspect a character. It is used to
0, 0, /* RREF, NRREF */
0, /* DEF */
0, 0, /* BRAZERO, BRAMINZERO */
- 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
- 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
+ 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
+ 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
+ 0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
};
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
{
gone_back = (current_subject - max_back < start_subject)?
- current_subject - start_subject : max_back;
+ (int)(current_subject - start_subject) : max_back;
current_subject -= gone_back;
}
int back = GET(end_code, 2+LINK_SIZE);
if (back <= gone_back)
{
- int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
+ int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
ADD_NEW_DATA(-bstate, 0, gone_back - back);
}
end_code += GET(end_code, 1);
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
do
{
- ADD_NEW(end_code - start_code + length, 0);
+ ADD_NEW((int)(end_code - start_code + length), 0);
end_code += GET(end_code, 1);
length = 1 + LINK_SIZE;
}
if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
if (offsetcount >= 2)
{
- offsets[0] = current_subject - start_subject;
- offsets[1] = ptr - start_subject;
+ offsets[0] = (int)(current_subject - start_subject);
+ offsets[1] = (int)(ptr - start_subject);
DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
offsets[1] - offsets[0], current_subject));
}
/*-----------------------------------------------------------------*/
case OP_ALT:
do { code += GET(code, 1); } while (*code == OP_ALT);
- ADD_ACTIVE(code - start_code, 0);
+ ADD_ACTIVE((int)(code - start_code), 0);
break;
/*-----------------------------------------------------------------*/
case OP_SBRA:
do
{
- ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+ ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
code += GET(code, 1);
}
while (*code == OP_ALT);
/*-----------------------------------------------------------------*/
case OP_CBRA:
case OP_SCBRA:
- ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
+ ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE), 0);
code += GET(code, 1);
while (*code == OP_ALT)
{
- ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+ ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
code += GET(code, 1);
}
break;
ADD_ACTIVE(state_offset + 1, 0);
code += 1 + GET(code, 2);
while (*code == OP_ALT) code += GET(code, 1);
- ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+ ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
break;
/*-----------------------------------------------------------------*/
case OP_SKIPZERO:
code += 1 + GET(code, 2);
while (*code == OP_ALT) code += GET(code, 1);
- ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+ ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
break;
/*-----------------------------------------------------------------*/
/*-----------------------------------------------------------------*/
case OP_EOD:
- if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
+ if (ptr >= end_subject)
+ {
+ if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
+ could_continue = TRUE;
+ else { ADD_ACTIVE(state_offset + 1, 0); }
+ }
break;
/*-----------------------------------------------------------------*/
/*-----------------------------------------------------------------*/
case OP_EODN:
- if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
+ if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
+ could_continue = TRUE;
+ else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
{ ADD_ACTIVE(state_offset + 1, 0); }
break;
case OP_DOLL:
if ((md->moptions & PCRE_NOTEOL) == 0)
{
- if (clen == 0 ||
+ if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
+ could_continue = TRUE;
+ else if (clen == 0 ||
((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
))
if (utf8) BACKCHAR(temp);
#endif
GETCHARTEST(d, temp);
+#ifdef SUPPORT_UCP
+ if ((md->poptions & PCRE_UCP) != 0)
+ {
+ if (d == '_') left_word = TRUE; else
+ {
+ int cat = UCD_CATEGORY(d);
+ left_word = (cat == ucp_L || cat == ucp_N);
+ }
+ }
+ else
+#endif
left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
}
- else left_word = 0;
+ else left_word = FALSE;
if (clen > 0)
+ {
+#ifdef SUPPORT_UCP
+ if ((md->poptions & PCRE_UCP) != 0)
+ {
+ if (c == '_') right_word = TRUE; else
+ {
+ int cat = UCD_CATEGORY(c);
+ right_word = (cat == ucp_L || cat == ucp_N);
+ }
+ }
+ else
+#endif
right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
- else right_word = 0;
+ }
+ else right_word = FALSE;
if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
{ ADD_ACTIVE(state_offset + 1, 0); }
break;
case PT_LAMP:
- OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+ OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+ chartype == ucp_Lt;
break;
case PT_GC:
OK = UCD_SCRIPT(c) == code[2];
break;
+ /* These are specials for combination cases. */
+
+ case PT_ALNUM:
+ OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N;
+ break;
+
+ case PT_SPACE: /* Perl space */
+ OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_WORD:
+ OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N ||
+ c == CHAR_UNDERSCORE;
+ break;
+
/* Should never occur, but keep compilers from grumbling. */
default:
break;
case PT_LAMP:
- OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+ OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+ chartype == ucp_Lt;
break;
case PT_GC:
OK = UCD_SCRIPT(c) == code[3];
break;
+ /* These are specials for combination cases. */
+
+ case PT_ALNUM:
+ OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N;
+ break;
+
+ case PT_SPACE: /* Perl space */
+ OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_WORD:
+ OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N ||
+ c == CHAR_UNDERSCORE;
+ break;
+
/* Should never occur, but keep compilers from grumbling. */
default:
break;
case PT_LAMP:
- OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+ OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+ chartype == ucp_Lt;
break;
case PT_GC:
OK = UCD_SCRIPT(c) == code[3];
break;
+ /* These are specials for combination cases. */
+
+ case PT_ALNUM:
+ OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N;
+ break;
+
+ case PT_SPACE: /* Perl space */
+ OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_WORD:
+ OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N ||
+ c == CHAR_UNDERSCORE;
+ break;
+
/* Should never occur, but keep compilers from grumbling. */
default:
break;
case PT_LAMP:
- OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+ OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+ chartype == ucp_Lt;
break;
case PT_GC:
OK = UCD_SCRIPT(c) == code[5];
break;
+ /* These are specials for combination cases. */
+
+ case PT_ALNUM:
+ OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N;
+ break;
+
+ case PT_SPACE: /* Perl space */
+ OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_WORD:
+ OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N ||
+ c == CHAR_UNDERSCORE;
+ break;
+
/* Should never occur, but keep compilers from grumbling. */
default:
points to the byte after the end of the class. If there is a
quantifier, this is where it will be. */
- next_state_offset = ecode - start_code;
+ next_state_offset = (int)(ecode - start_code);
switch (*ecode)
{
md, /* static match data */
code, /* this subexpression's code */
ptr, /* where we currently are */
- ptr - start_subject, /* start offset */
+ (int)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
- { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
+ { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
}
break;
cb.callout_number = code[LINK_SIZE+2];
cb.offset_vector = offsets;
cb.subject = (PCRE_SPTR)start_subject;
- cb.subject_length = end_subject - start_subject;
- cb.start_match = current_subject - start_subject;
- cb.current_position = ptr - start_subject;
+ cb.subject_length = (int)(end_subject - start_subject);
+ cb.start_match = (int)(current_subject - start_subject);
+ cb.current_position = (int)(ptr - start_subject);
cb.pattern_position = GET(code, LINK_SIZE + 3);
cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
cb.capture_top = 1;
md, /* fixed match data */
asscode, /* this subexpression's code */
ptr, /* where we currently are */
- ptr - start_subject, /* start offset */
+ (int)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
if ((rc >= 0) ==
(condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
- { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
+ { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
else
{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
}
md, /* fixed match data */
start_code + GET(code, 1), /* this subexpression's code */
ptr, /* where we currently are */
- ptr - start_subject, /* start offset */
+ (int)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
md, /* fixed match data */
code, /* this subexpression's code */
ptr, /* where we currently are */
- ptr - start_subject, /* start offset */
+ (int)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
do { end_subpattern += GET(end_subpattern, 1); }
while (*end_subpattern == OP_ALT);
- next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
+ next_state_offset =
+ (int)(end_subpattern - start_code + LINK_SIZE + 1);
/* If the end of this subpattern is KETRMAX or KETRMIN, we must
arrange for the repeat state also to be added to the relevant list.
repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
*end_subpattern == OP_KETRMIN)?
- end_subpattern - start_code - GET(end_subpattern, 1) : -1;
+ (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
/* If we have matched an empty string, add the next state at the
current character pointer. This is important so that the duplicate
cb.callout_number = code[1];
cb.offset_vector = offsets;
cb.subject = (PCRE_SPTR)start_subject;
- cb.subject_length = end_subject - start_subject;
- cb.start_match = current_subject - start_subject;
- cb.current_position = ptr - start_subject;
+ cb.subject_length = (int)(end_subject - start_subject);
+ cb.start_match = (int)(current_subject - start_subject);
+ cb.current_position = (int)(ptr - start_subject);
cb.pattern_position = GET(code, 2);
cb.next_item_length = GET(code, 2 + LINK_SIZE);
cb.capture_top = 1;
((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
match_count < 0) /* no matches */
) && /* And... */
- ptr >= end_subject && /* Reached end of subject */
- ptr > current_subject) /* Matched non-empty string */
+ ptr >= end_subject && /* Reached end of subject */
+ ptr > md->start_used_ptr) /* Inspected non-empty string */
{
if (offsetcount >= 2)
{
- offsets[0] = md->start_used_ptr - start_subject;
- offsets[1] = end_subject - start_subject;
+ offsets[0] = (int)(md->start_used_ptr - start_subject);
+ offsets[1] = (int)(end_subject - start_subject);
}
match_count = PCRE_ERROR_PARTIAL;
}
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
+if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
/* We need to find the pointer to any study data before we test for byte
flipping, so we scan the extra_data block first. This may set two fields in the
#ifdef SUPPORT_UTF8
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
{
- if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
- return PCRE_ERROR_BADUTF8;
+ int tb;
+ if ((tb = _pcre_valid_utf8((uschar *)subject, length)) >= 0)
+ return (tb == length && (options & PCRE_PARTIAL_HARD) != 0)?
+ PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
if (start_offset > 0 && start_offset < length)
{
- int tb = ((uschar *)subject)[start_offset];
- if (tb > 127)
- {
- tb &= 0xc0;
- if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
- }
+ tb = ((USPTR)subject)[start_offset] & 0xc0;
+ if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
}
}
#endif
/* There are some optimizations that avoid running the match if a known
starting point is not found. However, there is an option that disables
- these, for testing and for ensuring that all callouts do actually occur. */
+ these, for testing and for ensuring that all callouts do actually occur.
+ The option can be set in the regex by (*NO_START_OPT) or passed in
+ match-time options. */
- if ((options & PCRE_NO_START_OPTIMIZE) == 0)
+ if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
{
/* Advance to a known first byte. */
while (current_subject < end_subject)
{
register unsigned int c = *current_subject;
- if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
- else break;
+ if ((start_bits[c/8] & (1 << (c&7))) == 0)
+ {
+ current_subject++;
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ while(current_subject < end_subject &&
+ (*current_subject & 0xc0) == 0x80) current_subject++;
+#endif
+ }
+ else break;
}
}
}
/* Special internal returns from the match() function. Make them sufficiently
negative to avoid the external error codes. */
-#define MATCH_COMMIT (-999)
-#define MATCH_PRUNE (-998)
-#define MATCH_SKIP (-997)
-#define MATCH_THEN (-996)
+#define MATCH_ACCEPT (-999) /* successful end reached at an opcode other than OP_END */
+#define MATCH_COMMIT (-998) /* from OP_COMMIT when the rest of the pattern did not match */
+#define MATCH_PRUNE (-997) /* from OP_PRUNE and OP_PRUNE_ARG */
+#define MATCH_SKIP (-996) /* from OP_SKIP (or a MARK whose name matched a SKIP_ARG); md->start_match_ptr holds the subject position */
+#define MATCH_SKIP_ARG (-995) /* from OP_SKIP_ARG; md->start_match_ptr is overloaded to hold the skip name */
+#define MATCH_THEN (-994) /* from OP_THEN and OP_THEN_ARG; md->start_match_ptr holds the start of the current branch */
+
+/* This is a convenience macro for code that occurs many times: it stores the
+current mark pointer (markptr) in md->mark and then returns the value "ra" by
+way of RRETURN. The expansion is a brace-enclosed compound statement, so both
+steps stay together when the macro is the body of an "if". */
+
+#define MRRETURN(ra) \
+ { \
+ md->mark = markptr; /* record the mark that is current at this return */ \
+ RRETURN(ra); \
+ }
/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
- RM51, RM52, RM53, RM54 };
+ RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
+ RM61, RM62 };
/* These versions of the macros use the stack, as normal. There are debugging
versions and production versions. Note that the "rw" argument of RMATCH isn't
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
{\
- heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
+ heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
+ if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
frame->Xwhere = rw; \
newframe->Xeptr = ra;\
newframe->Xecode = rb;\
#define RRETURN(ra)\
{\
- heapframe *newframe = frame;\
- frame = newframe->Xprevframe;\
- (pcre_stack_free)(newframe);\
+ heapframe *oldframe = frame;\
+ frame = oldframe->Xprevframe;\
+ (pcre_stack_free)(oldframe);\
if (frame != NULL)\
{\
rrc = ra;\
the subject. */
#define CHECK_PARTIAL()\
- if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
- {\
- md->hitend = TRUE;\
- if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
+ if (md->partial != 0 && eptr >= md->end_subject && \
+ eptr > md->start_used_ptr) \
+ { \
+ md->hitend = TRUE; \
+ if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
}
#define SCHECK_PARTIAL()\
- if (md->partial != 0 && eptr > mstart)\
- {\
- md->hitend = TRUE;\
- if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
+ if (md->partial != 0 && eptr > md->start_used_ptr) \
+ { \
+ md->hitend = TRUE; \
+ if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
}
Returns: MATCH_MATCH if matched ) these values are >= 0
MATCH_NOMATCH if failed to match )
+ a negative MATCH_xxx value for PRUNE, SKIP, etc
a negative PCRE_ERROR_xxx value if aborted by an error condition
(e.g. stopped by repeated call or recursion limit)
*/
static int
-match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart, USPTR
- markptr, int offset_top, match_data *md, unsigned long int ims,
+match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
+ const uschar *markptr, int offset_top, match_data *md, unsigned long int ims,
eptrblock *eptrb, int flags, unsigned int rdepth)
{
/* These variables do not need to be preserved over recursion in this function,
heap whenever RMATCH() does a "recursion". See the macro definitions above. */
#ifdef NO_RECURSE
-heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
+heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
+if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
frame->Xprevframe = NULL; /* Marks the top level */
/* Copy in the original argument variables */
switch(op)
{
+ case OP_MARK:
+ markptr = ecode + 2;
+ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
+ ims, eptrb, flags, RM55);
+
+ /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
+ argument, and we must check whether that argument matches this MARK's
+ argument. It is passed back in md->start_match_ptr (an overloading of that
+ variable). If it does match, we reset that variable to the current subject
+ position and return MATCH_SKIP. Otherwise, pass back the return code
+ unaltered. */
+
+ if (rrc == MATCH_SKIP_ARG &&
+ strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
+ {
+ md->start_match_ptr = eptr;
+ RRETURN(MATCH_SKIP);
+ }
+
+ if (md->mark == NULL) md->mark = markptr;
+ RRETURN(rrc);
+
case OP_FAIL:
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
+
+ /* COMMIT overrides PRUNE, SKIP, and THEN */
+
+ case OP_COMMIT:
+ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
+ ims, eptrb, flags, RM52);
+ if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
+ rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
+ rrc != MATCH_THEN)
+ RRETURN(rrc);
+ MRRETURN(MATCH_COMMIT);
+
+ /* PRUNE overrides THEN */
case OP_PRUNE:
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
ims, eptrb, flags, RM51);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ MRRETURN(MATCH_PRUNE);
+
+ case OP_PRUNE_ARG:
+ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
+ ims, eptrb, flags, RM56);
+ if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ md->mark = ecode + 2;
RRETURN(MATCH_PRUNE);
- case OP_COMMIT:
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
- ims, eptrb, flags, RM52);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- RRETURN(MATCH_COMMIT);
+ /* SKIP overrides PRUNE and THEN */
case OP_SKIP:
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
ims, eptrb, flags, RM53);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
+ RRETURN(rrc);
md->start_match_ptr = eptr; /* Pass back current position */
- RRETURN(MATCH_SKIP);
+ MRRETURN(MATCH_SKIP);
+
+ case OP_SKIP_ARG:
+ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
+ ims, eptrb, flags, RM57);
+ if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
+ RRETURN(rrc);
+
+ /* Pass back the current skip name by overloading md->start_match_ptr and
+ returning the special MATCH_SKIP_ARG return code. This will either be
+ caught by a matching MARK, or get to the top, where it is treated the same
+ as PRUNE. */
+
+ md->start_match_ptr = ecode + 2;
+ RRETURN(MATCH_SKIP_ARG);
+
+ /* For THEN (and THEN_ARG) we pass back the address of the bracket or
+ the alt that is at the start of the current branch. This makes it possible
+ to skip back past alternatives that precede the THEN within the current
+ branch. */
case OP_THEN:
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
ims, eptrb, flags, RM54);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ md->start_match_ptr = ecode - GET(ecode, 1);
+ MRRETURN(MATCH_THEN);
+
+ case OP_THEN_ARG:
+ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
+ offset_top, md, ims, eptrb, flags, RM58);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ md->start_match_ptr = ecode - GET(ecode, 1);
+ md->mark = ecode + LINK_SIZE + 2;
RRETURN(MATCH_THEN);
/* Handle a capturing bracket. If there is space in the offset vector, save
save_capture_last = md->capture_last;
DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
- md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
+ md->offset_vector[md->offset_end - number] =
+ (int)(eptr - md->start_subject);
flags = (op == OP_SCBRA)? match_cbegroup : 0;
do
{
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
ims, eptrb, flags, RM1);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH &&
+ (rrc != MATCH_THEN || md->start_match_ptr != ecode))
+ RRETURN(rrc);
md->capture_last = save_capture_last;
ecode += GET(ecode, 1);
}
md->offset_vector[offset+1] = save_offset2;
md->offset_vector[md->offset_end - number] = save_offset3;
+ if (rrc != MATCH_THEN) md->mark = markptr;
RRETURN(MATCH_NOMATCH);
}
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
eptrb, flags, RM48);
+ if (rrc == MATCH_NOMATCH) md->mark = markptr;
RRETURN(rrc);
}
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
eptrb, flags, RM2);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH &&
+ (rrc != MATCH_THEN || md->start_match_ptr != ecode))
+ RRETURN(rrc);
ecode += GET(ecode, 1);
}
/* Control never reaches here. */
cb.callout_number = ecode[LINK_SIZE+2];
cb.offset_vector = md->offset_vector;
cb.subject = (PCRE_SPTR)md->start_subject;
- cb.subject_length = md->end_subject - md->start_subject;
- cb.start_match = mstart - md->start_subject;
- cb.current_position = eptr - md->start_subject;
+ cb.subject_length = (int)(md->end_subject - md->start_subject);
+ cb.start_match = (int)(mstart - md->start_subject);
+ cb.current_position = (int)(eptr - md->start_subject);
cb.pattern_position = GET(ecode, LINK_SIZE + 3);
cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
cb.capture_top = offset_top/2;
cb.capture_last = md->capture_last;
cb.callout_data = md->callout_data;
- if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
+ if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
if (rrc < 0) RRETURN(rrc);
}
ecode += _pcre_OP_lengths[OP_CALLOUT];
ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
while (*ecode == OP_ALT) ecode += GET(ecode, 1);
}
- else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
+ else if (rrc != MATCH_NOMATCH &&
+ (rrc != MATCH_THEN || md->start_match_ptr != ecode))
{
RRETURN(rrc); /* Need braces because of following else */
}
{
md->offset_vector[offset] =
md->offset_vector[md->offset_end - number];
- md->offset_vector[offset+1] = eptr - md->start_subject;
+ md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
if (offset_top <= offset) offset_top = offset + 2;
}
ecode += 3;
(md->notempty ||
(md->notempty_atstart &&
mstart == md->start_subject + md->start_offset)))
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
/* Otherwise, we have a match. */
md->end_match_ptr = eptr; /* Record where we ended */
md->end_offset_top = offset_top; /* and how many extracts were taken */
md->start_match_ptr = mstart; /* and the start (\K can modify) */
- RRETURN(MATCH_MATCH);
+
+ /* For some reason, the macros don't work properly if an expression is
+ given as the argument to MRRETURN when the heap is in use. */
+
+ rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
+ MRRETURN(rrc);
/* Change option settings */
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
RM4);
- if (rrc == MATCH_MATCH)
+ if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
{
mstart = md->start_match_ptr; /* In case \K reset it */
break;
}
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH &&
+ (rrc != MATCH_THEN || md->start_match_ptr != ecode))
+ RRETURN(rrc);
ecode += GET(ecode, 1);
}
while (*ecode == OP_ALT);
- if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
+ if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
/* If checking an assertion for a condition, return MATCH_MATCH. */
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
RM5);
- if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
+ if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
{
do ecode += GET(ecode,1); while (*ecode == OP_ALT);
break;
}
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH &&
+ (rrc != MATCH_THEN || md->start_match_ptr != ecode))
+ RRETURN(rrc);
ecode += GET(ecode,1);
}
while (*ecode == OP_ALT);
while (i-- > 0)
{
eptr--;
- if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
+ if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
BACKCHAR(eptr);
}
}
{
eptr -= GET(ecode, 1);
- if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
+ if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
}
/* Save the earliest consulted character, then skip to next op code */
cb.callout_number = ecode[1];
cb.offset_vector = md->offset_vector;
cb.subject = (PCRE_SPTR)md->start_subject;
- cb.subject_length = md->end_subject - md->start_subject;
- cb.start_match = mstart - md->start_subject;
- cb.current_position = eptr - md->start_subject;
+ cb.subject_length = (int)(md->end_subject - md->start_subject);
+ cb.start_match = (int)(mstart - md->start_subject);
+ cb.current_position = (int)(eptr - md->start_subject);
cb.pattern_position = GET(ecode, 2);
cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
cb.capture_top = offset_top/2;
cb.capture_last = md->capture_last;
cb.callout_data = md->callout_data;
- if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
+ if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
if (rrc < 0) RRETURN(rrc);
}
ecode += 2 + 2*LINK_SIZE;
{
RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
md, ims, eptrb, flags, RM6);
- if (rrc == MATCH_MATCH)
+ if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
{
DPRINTF(("Recursion matched\n"));
md->recursive = new_recursive.prevrec;
if (new_recursive.offset_save != stacksave)
(pcre_free)(new_recursive.offset_save);
- RRETURN(MATCH_MATCH);
+ MRRETURN(MATCH_MATCH);
}
- else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
+ else if (rrc != MATCH_NOMATCH &&
+ (rrc != MATCH_THEN || md->start_match_ptr != ecode))
{
DPRINTF(("Recursion gave error %d\n", rrc));
if (new_recursive.offset_save != stacksave)
md->recursive = new_recursive.prevrec;
if (new_recursive.offset_save != stacksave)
(pcre_free)(new_recursive.offset_save);
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never reaches here */
do
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
- if (rrc == MATCH_MATCH)
+ if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
{
mstart = md->start_match_ptr;
break;
}
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH &&
+ (rrc != MATCH_THEN || md->start_match_ptr != ecode))
+ RRETURN(rrc);
ecode += GET(ecode,1);
}
while (*ecode == OP_ALT);
md->end_match_ptr = eptr; /* For ONCE */
md->end_offset_top = offset_top;
md->start_match_ptr = mstart;
- RRETURN(MATCH_MATCH);
+ MRRETURN(MATCH_MATCH);
}
/* For capturing groups we have to check the group number back at the start
{
md->offset_vector[offset] =
md->offset_vector[md->offset_end - number];
- md->offset_vector[offset+1] = eptr - md->start_subject;
+ md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
if (offset_top <= offset) offset_top = offset + 2;
}
/* Start of subject unless notbol, or after internal newline if multiline */
case OP_CIRC:
- if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
+ if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
if ((ims & PCRE_MULTILINE) != 0)
{
if (eptr != md->start_subject &&
(eptr == md->end_subject || !WAS_NEWLINE(eptr)))
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
ecode++;
break;
}
/* Start of subject assertion */
case OP_SOD:
- if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
+ if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
ecode++;
break;
/* Start of match assertion */
case OP_SOM:
- if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
+ if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
ecode++;
break;
if ((ims & PCRE_MULTILINE) != 0)
{
if (eptr < md->end_subject)
- { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
+ { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
else
- { if (md->noteol) RRETURN(MATCH_NOMATCH); }
+ {
+ if (md->noteol) MRRETURN(MATCH_NOMATCH);
+ SCHECK_PARTIAL();
+ }
ecode++;
break;
}
- else
+ else /* Not multiline */
{
- if (md->noteol) RRETURN(MATCH_NOMATCH);
- if (!md->endonly)
- {
- if (eptr != md->end_subject &&
- (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
- }
+ if (md->noteol) MRRETURN(MATCH_NOMATCH);
+ if (!md->endonly) goto ASSERT_NL_OR_EOS;
}
+
/* ... else fall through for endonly */
/* End of subject assertion (\z) */
case OP_EOD:
- if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
+ if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
+ SCHECK_PARTIAL();
ecode++;
break;
/* End of subject or ending \n assertion (\Z) */
case OP_EODN:
- if (eptr != md->end_subject &&
+ ASSERT_NL_OR_EOS:
+ if (eptr < md->end_subject &&
(!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
+
+ /* Either at end of string or \n before end. */
+
+ SCHECK_PARTIAL();
ecode++;
break;
#ifdef SUPPORT_UTF8
if (utf8)
{
+ /* Get status of previous character */
+
if (eptr == md->start_subject) prev_is_word = FALSE; else
{
USPTR lastptr = eptr - 1;
while((*lastptr & 0xc0) == 0x80) lastptr--;
if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
GETCHAR(c, lastptr);
+#ifdef SUPPORT_UCP
+ if (md->use_ucp)
+ {
+ if (c == '_') prev_is_word = TRUE; else
+ {
+ int cat = UCD_CATEGORY(c);
+ prev_is_word = (cat == ucp_L || cat == ucp_N);
+ }
+ }
+ else
+#endif
prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
}
+
+ /* Get status of next character */
+
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
else
{
GETCHAR(c, eptr);
+#ifdef SUPPORT_UCP
+ if (md->use_ucp)
+ {
+ if (c == '_') cur_is_word = TRUE; else
+ {
+ int cat = UCD_CATEGORY(c);
+ cur_is_word = (cat == ucp_L || cat == ucp_N);
+ }
+ }
+ else
+#endif
cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
}
}
else
#endif
- /* Not in UTF-8 mode */
+ /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
+ consistency with the behaviour of \w we do use it in this case. */
{
+ /* Get status of previous character */
+
if (eptr == md->start_subject) prev_is_word = FALSE; else
{
if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
+#ifdef SUPPORT_UCP
+ if (md->use_ucp)
+ {
+ c = eptr[-1];
+ if (c == '_') prev_is_word = TRUE; else
+ {
+ int cat = UCD_CATEGORY(c);
+ prev_is_word = (cat == ucp_L || cat == ucp_N);
+ }
+ }
+ else
+#endif
prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
}
+
+ /* Get status of next character */
+
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
cur_is_word = FALSE;
}
- else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
+ else
+#ifdef SUPPORT_UCP
+ if (md->use_ucp)
+ {
+ c = *eptr;
+ if (c == '_') cur_is_word = TRUE; else
+ {
+ int cat = UCD_CATEGORY(c);
+ cur_is_word = (cat == ucp_L || cat == ucp_N);
+ }
+ }
+ else
+#endif
+ cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
}
/* Now see if the situation is what we want */
if ((*ecode++ == OP_WORD_BOUNDARY)?
cur_is_word == prev_is_word : cur_is_word != prev_is_word)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
/* Match a single character type; inline for speed */
case OP_ANY:
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
+ if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
/* Fall through */
case OP_ALLANY:
if (eptr++ >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
ecode++;
if (eptr++ >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
ecode++;
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
if (
#endif
(md->ctypes[c] & ctype_digit) != 0
)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
ecode++;
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
if (
#endif
(md->ctypes[c] & ctype_digit) == 0
)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
ecode++;
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
if (
#endif
(md->ctypes[c] & ctype_space) != 0
)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
ecode++;
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
if (
#endif
(md->ctypes[c] & ctype_space) == 0
)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
ecode++;
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
if (
#endif
(md->ctypes[c] & ctype_word) != 0
)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
ecode++;
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
if (
#endif
(md->ctypes[c] & ctype_word) == 0
)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
ecode++;
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x000d:
if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
break;
case 0x0085:
case 0x2028:
case 0x2029:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+ if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
break;
}
ecode++;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
switch(c)
case 0x202f: /* NARROW NO-BREAK SPACE */
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
ecode++;
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
switch(c)
case 0x85: /* NEL */
case 0x2028: /* LINE SEPARATOR */
case 0x2029: /* PARAGRAPH SEPARATOR */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
ecode++;
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
{
int chartype = UCD_CHARTYPE(c);
+
switch(ecode[1])
{
case PT_ANY:
- if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
+ if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
break;
case PT_LAMP:
if ((chartype == ucp_Lu ||
chartype == ucp_Ll ||
chartype == ucp_Lt) == (op == OP_NOTPROP))
- RRETURN(MATCH_NOMATCH);
- break;
+ MRRETURN(MATCH_NOMATCH);
+ break;
case PT_GC:
if ((ecode[2] != _pcre_ucp_gentype[chartype]) == (op == OP_PROP))
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
break;
case PT_PC:
if ((ecode[2] != chartype) == (op == OP_PROP))
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
break;
case PT_SC:
if ((ecode[2] != UCD_SCRIPT(c)) == (op == OP_PROP))
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
+ break;
+
+ /* These are specials */
+
+ case PT_ALNUM:
+ if ((_pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N) == (op == OP_NOTPROP))
+ MRRETURN(MATCH_NOMATCH);
break;
+ case PT_SPACE: /* Perl space */
+ if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
+ == (op == OP_NOTPROP))
+ MRRETURN(MATCH_NOMATCH);
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR)
+ == (op == OP_NOTPROP))
+ MRRETURN(MATCH_NOMATCH);
+ break;
+
+ case PT_WORD:
+ if ((_pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N ||
+ c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
+ MRRETURN(MATCH_NOMATCH);
+ break;
+
+ /* This should never occur */
+
default:
RRETURN(PCRE_ERROR_INTERNAL);
}
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
{
int category = UCD_CATEGORY(c);
- if (category == ucp_M) RRETURN(MATCH_NOMATCH);
+ if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
while (eptr < md->end_subject)
{
int len = 1;
referenced subpattern. */
if (offset >= offset_top || md->offset_vector[offset] < 0)
- length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
+ length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1);
else
length = md->offset_vector[offset+1] - md->offset_vector[offset];
if (!match_ref(offset, eptr, length, md, ims))
{
CHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
eptr += length;
continue; /* With the main loop */
if (!match_ref(offset, eptr, length, md, ims))
{
CHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
eptr += length;
}
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (!match_ref(offset, eptr, length, md, ims))
{
CHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
eptr += length;
}
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr -= length;
}
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(c, eptr);
if (c > 255)
{
- if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
+ if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
}
else
{
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
}
}
}
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
c = *eptr++;
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
}
}
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(c, eptr);
if (c > 255)
{
- if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
+ if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
}
else
{
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
}
}
}
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
c = *eptr++;
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
}
}
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
- if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
+ if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
}
/* If max == min we can continue with the main loop without the
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
- if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
+ if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
}
if (eptr-- == pp) break; /* Stop if tried at original pos */
if (utf8) BACKCHAR(eptr);
}
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
if (length > md->end_subject - eptr)
{
CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
+ while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
}
else
#endif
if (md->end_subject - eptr < 1)
{
SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
+ if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
ecode += 2;
}
break;
if (length > md->end_subject - eptr)
{
CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* If the pattern character's value is < 128, we have only one byte, and
if (fc < 128)
{
- if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
}
/* Otherwise we must pick up the subject character */
#ifdef SUPPORT_UCP
if (dc != UCD_OTHERCASE(fc))
#endif
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
}
if (md->end_subject - eptr < 1)
{
SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
ecode += 2;
}
break;
else
{
CHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr <= md->end_subject - length &&
memcmp(eptr, charptr, length) == 0) eptr += length;
#ifdef SUPPORT_UCP
else
{
CHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
+ if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
#ifdef SUPPORT_UCP
eptr--;
BACKCHAR(eptr);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
}
if (min == max) continue;
if (minimize)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
}
eptr--;
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
}
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
}
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
+ if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
}
if (min == max) continue;
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
+ if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
}
eptr--;
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
}
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
ecode++;
GETCHARINCTEST(c, eptr);
if (c < 256)
#endif
c = md->lcc[c];
- if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
+ if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
}
else
{
- if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
+ if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
}
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(d, eptr);
if (d < 256) d = md->lcc[d];
- if (fc == d) RRETURN(MATCH_NOMATCH);
+ if (fc == d) MRRETURN(MATCH_NOMATCH);
}
}
else
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
}
}
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(d, eptr);
if (d < 256) d = md->lcc[d];
- if (fc == d) RRETURN(MATCH_NOMATCH);
+ if (fc == d) MRRETURN(MATCH_NOMATCH);
}
}
else
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
}
}
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
}
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(d, eptr);
- if (fc == d) RRETURN(MATCH_NOMATCH);
+ if (fc == d) MRRETURN(MATCH_NOMATCH);
}
}
else
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
+ if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
}
}
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(d, eptr);
- if (fc == d) RRETURN(MATCH_NOMATCH);
+ if (fc == d) MRRETURN(MATCH_NOMATCH);
}
}
else
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
+ if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
}
}
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
switch(prop_type)
{
case PT_ANY:
- if (prop_fail_result) RRETURN(MATCH_NOMATCH);
+ if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
}
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
prop_chartype = UCD_CHARTYPE(c);
if ((prop_chartype == ucp_Lu ||
prop_chartype == ucp_Ll ||
prop_chartype == ucp_Lt) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
prop_category = UCD_CATEGORY(c);
if ((prop_category == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
prop_chartype = UCD_CHARTYPE(c);
if ((prop_chartype == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
prop_script = UCD_SCRIPT(c);
if ((prop_script == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case PT_ALNUM:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_L || prop_category == ucp_N)
+ == prop_fail_result)
+ MRRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case PT_SPACE: /* Perl space */
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
+ c == CHAR_FF || c == CHAR_CR)
+ == prop_fail_result)
+ MRRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
+ c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
+ == prop_fail_result)
+ MRRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case PT_WORD:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_L || prop_category == ucp_N ||
+ c == CHAR_UNDERSCORE)
+ == prop_fail_result)
+ MRRETURN(MATCH_NOMATCH);
}
break;
+ /* This should not occur */
+
default:
RRETURN(PCRE_ERROR_INTERNAL);
}
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
prop_category = UCD_CATEGORY(c);
- if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
+ if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
while (eptr < md->end_subject)
{
int len = 1;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
+ if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
break;
case OP_ANYBYTE:
- if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
+ if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
eptr += min;
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(c, eptr);
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x000d:
if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
break;
case 0x0085:
case 0x2028:
case 0x2029:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+ if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
break;
}
}
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(c, eptr);
switch(c)
case 0x202f: /* NARROW NO-BREAK SPACE */
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(c, eptr);
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(c, eptr);
switch(c)
case 0x85: /* NEL */
case 0x2028: /* LINE SEPARATOR */
case 0x2029: /* PARAGRAPH SEPARATOR */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(c, eptr);
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(c, eptr);
if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
/* No need to skip more bytes - we know it's a 1-byte character */
}
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
}
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
/* No need to skip more bytes - we know it's a 1-byte character */
}
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
}
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
/* No need to skip more bytes - we know it's a 1-byte character */
}
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
+ if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
eptr++;
}
break;
if (eptr > md->end_subject - min)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
eptr += min;
break;
if (eptr > md->end_subject - min)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
eptr += min;
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
switch(*eptr++)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x000d:
if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
break;
case 0x000b:
case 0x000c:
case 0x0085:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+ if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
break;
}
}
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
switch(*eptr++)
{
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
switch(*eptr++)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
switch(*eptr++)
{
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
switch(*eptr++)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
}
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
}
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
}
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
}
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if ((md->ctypes[*eptr++] & ctype_word) != 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if ((md->ctypes[*eptr++] & ctype_word) == 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- GETCHARINC(c, eptr);
- if (prop_fail_result) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- GETCHARINC(c, eptr);
+ GETCHARINCTEST(c, eptr);
prop_chartype = UCD_CHARTYPE(c);
if ((prop_chartype == ucp_Lu ||
prop_chartype == ucp_Ll ||
prop_chartype == ucp_Lt) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- GETCHARINC(c, eptr);
+ GETCHARINCTEST(c, eptr);
prop_category = UCD_CATEGORY(c);
if ((prop_category == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- GETCHARINC(c, eptr);
+ GETCHARINCTEST(c, eptr);
prop_chartype = UCD_CHARTYPE(c);
if ((prop_chartype == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- GETCHARINC(c, eptr);
+ GETCHARINCTEST(c, eptr);
prop_script = UCD_SCRIPT(c);
if ((prop_script == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
+ case PT_ALNUM:
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM59);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_L || prop_category == ucp_N)
+ == prop_fail_result)
+ MRRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
+ case PT_SPACE: /* Perl space */
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM60);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
+ c == CHAR_FF || c == CHAR_CR)
+ == prop_fail_result)
+ MRRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
+ case PT_PXSPACE: /* POSIX space */
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM61);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
+ c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
+ == prop_fail_result)
+ MRRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
+ case PT_WORD:
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM62);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_L ||
+ prop_category == ucp_N ||
+ c == CHAR_UNDERSCORE)
+ == prop_fail_result)
+ MRRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
+ /* This should never occur */
+
default:
RRETURN(PCRE_ERROR_INTERNAL);
}
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
prop_category = UCD_CATEGORY(c);
- if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
+ if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
while (eptr < md->end_subject)
{
int len = 1;
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if (ctype == OP_ANY && IS_NEWLINE(eptr))
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
switch(ctype)
{
case OP_ANYNL:
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x000d:
if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
break;
case 0x0085:
case 0x2028:
case 0x2029:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+ if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
break;
}
break;
case 0x202f: /* NARROW NO-BREAK SPACE */
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
case OP_HSPACE:
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
case 0x85: /* NEL */
case 0x2028: /* LINE SEPARATOR */
case 0x2029: /* PARAGRAPH SEPARATOR */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
case OP_VSPACE:
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
case OP_NOT_DIGIT:
if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
break;
case OP_DIGIT:
if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
break;
case OP_NOT_WHITESPACE:
if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
break;
case OP_WHITESPACE:
if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
break;
case OP_NOT_WORDCHAR:
if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
break;
case OP_WORDCHAR:
if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
break;
default:
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if (ctype == OP_ANY && IS_NEWLINE(eptr))
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
c = *eptr++;
switch(ctype)
{
case OP_ANYNL:
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x000d:
if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
break;
case 0x000b:
case 0x000c:
case 0x0085:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+ if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
break;
}
break;
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
case OP_HSPACE:
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
case OP_VSPACE:
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
break;
case OP_NOT_DIGIT:
- if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
break;
case OP_DIGIT:
- if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
break;
case OP_NOT_WHITESPACE:
- if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
break;
case OP_WHITESPACE:
- if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
break;
case OP_NOT_WORDCHAR:
- if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
break;
case OP_WORDCHAR:
- if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
break;
default:
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
+ GETCHARLENTEST(c, eptr, len);
if (prop_fail_result) break;
eptr+= len;
}
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
+ GETCHARLENTEST(c, eptr, len);
prop_chartype = UCD_CHARTYPE(c);
if ((prop_chartype == ucp_Lu ||
prop_chartype == ucp_Ll ||
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
+ GETCHARLENTEST(c, eptr, len);
prop_category = UCD_CATEGORY(c);
if ((prop_category == prop_value) == prop_fail_result)
break;
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
+ GETCHARLENTEST(c, eptr, len);
prop_chartype = UCD_CHARTYPE(c);
if ((prop_chartype == prop_value) == prop_fail_result)
break;
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
+ GETCHARLENTEST(c, eptr, len);
prop_script = UCD_SCRIPT(c);
if ((prop_script == prop_value) == prop_fail_result)
break;
eptr+= len;
}
break;
+
+ case PT_ALNUM:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ break;
+ }
+ GETCHARLENTEST(c, eptr, len);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_L || prop_category == ucp_N)
+ == prop_fail_result)
+ break;
+ eptr+= len;
+ }
+ break;
+
+ case PT_SPACE: /* Perl space */
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ break;
+ }
+ GETCHARLENTEST(c, eptr, len);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
+ c == CHAR_FF || c == CHAR_CR)
+ == prop_fail_result)
+ break;
+ eptr+= len;
+ }
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ break;
+ }
+ GETCHARLENTEST(c, eptr, len);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
+ c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
+ == prop_fail_result)
+ break;
+ eptr+= len;
+ }
+ break;
+
+ case PT_WORD:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ break;
+ }
+ GETCHARLENTEST(c, eptr, len);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_L || prop_category == ucp_N ||
+ c == CHAR_UNDERSCORE) == prop_fail_result)
+ break;
+ eptr+= len;
+ }
+ break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
}
/* eptr is now past the end of the maximum run */
/* Get here if we can't make it match with any permitted repetitions */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
- LBL(53) LBL(54)
+ LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58)
#ifdef SUPPORT_UTF8
LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
LBL(32) LBL(34) LBL(42) LBL(46)
#ifdef SUPPORT_UCP
LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
+ LBL(59) LBL(60) LBL(61) LBL(62)
#endif /* SUPPORT_UCP */
#endif /* SUPPORT_UTF8 */
default:
const real_pcre *re = external_re;
/* Plausibility checks */
-
if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
if (re == NULL || subject == NULL ||
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
+if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
/* This information is for finding all the numbers associated with a given
name, for condition testing. */
md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
+md->use_ucp = (re->options & PCRE_UCP) != 0;
md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
md->notbol = (options & PCRE_NOTBOL) != 0;
md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
md->hitend = FALSE;
+md->mark = NULL; /* In case never set */
md->recursive = NULL; /* No recursion at top level */
#ifdef SUPPORT_UTF8
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
{
- if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
- return PCRE_ERROR_BADUTF8;
+ int tb;
+ if ((tb = _pcre_valid_utf8((USPTR)subject, length)) >= 0)
+ return (tb == length && md->partial > 1)?
+ PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
if (start_offset > 0 && start_offset < length)
{
- int tb = ((USPTR)subject)[start_offset];
- if (tb > 127)
- {
- tb &= 0xc0;
- if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
- }
+ tb = ((USPTR)subject)[start_offset] & 0xc0;
+ if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
}
}
#endif
/* There are some optimizations that avoid running the match if a known
starting point is not found, or if a known later character is not present.
However, there is an option that disables these, for testing and for ensuring
- that all callouts do actually occur. */
+ that all callouts do actually occur. The option can be set in the regex by
+ (*NO_START_OPT) or passed in match-time options. */
- if ((options & PCRE_NO_START_OPTIMIZE) == 0)
+ if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
{
/* Advance to a unique first byte if there is one. */
while (start_match < end_subject)
{
register unsigned int c = *start_match;
- if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
- else break;
+ if ((start_bits[c/8] & (1 << (c&7))) == 0)
+ {
+ start_match++;
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+ start_match++;
+#endif
+ }
+ else break;
}
}
} /* Starting optimizations */
switch(rc)
{
+ /* SKIP passes back the next starting point explicitly, but if it is the
+ same as the match we have just done, treat it as NOMATCH. */
+
+ case MATCH_SKIP:
+ if (md->start_match_ptr != start_match)
+ {
+ new_start_match = md->start_match_ptr;
+ break;
+ }
+ /* Fall through */
+
+ /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
+ the SKIP's arg was not found. We also treat this as NOMATCH. */
+
+ case MATCH_SKIP_ARG:
+ /* Fall through */
+
/* NOMATCH and PRUNE advance by one character. THEN at this level acts
exactly like PRUNE. */
#endif
break;
- /* SKIP passes back the next starting point explicitly. */
-
- case MATCH_SKIP:
- new_start_match = md->start_match_ptr;
- break;
-
/* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
case MATCH_COMMIT:
md->nllen == 2))
start_match++;
- } /* End of for(;;) "bumpalong" loop */
+ md->mark = NULL; /* Reset for start of next match attempt */
+ } /* End of for(;;) "bumpalong" loop */
/* ==========================================================================*/
ENDLOOP:
-if (rc == MATCH_MATCH)
+if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
{
if (using_temporary_offsets)
{
if (offsetcount < 2) rc = 0; else
{
- offsets[0] = md->start_match_ptr - md->start_subject;
- offsets[1] = md->end_match_ptr - md->start_subject;
+ offsets[0] = (int)(md->start_match_ptr - md->start_subject);
+ offsets[1] = (int)(md->end_match_ptr - md->start_subject);
}
DPRINTF((">>>> returning %d\n", rc));
- return rc;
+ goto RETURN_MARK;
}
/* Control gets here if there has been an error, or if the overall match
(pcre_free)(md->offset_vector);
}
+/* For anything other than nomatch or partial match, just return the code. */
+
if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
{
DPRINTF((">>>> error: returning %d\n", rc));
return rc;
}
-else if (start_partial != NULL)
+
+/* Handle partial matches - disable any mark data */
+
+if (start_partial != NULL)
{
DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
+ md->mark = NULL;
if (offsetcount > 1)
{
- offsets[0] = start_partial - (USPTR)subject;
- offsets[1] = end_subject - (USPTR)subject;
+ offsets[0] = (int)(start_partial - (USPTR)subject);
+ offsets[1] = (int)(end_subject - (USPTR)subject);
}
- return PCRE_ERROR_PARTIAL;
+ rc = PCRE_ERROR_PARTIAL;
}
+
+/* This is the classic nomatch case */
+
else
{
DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
- return PCRE_ERROR_NOMATCH;
+ rc = PCRE_ERROR_NOMATCH;
}
+
+/* Return the MARK data if it has been requested. */
+
+RETURN_MARK:
+
+if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
+ *(extra_data->mark) = (unsigned char *)(md->mark);
+return rc;
}
/* End of pcre_exec.c */
/* When UTF-8 encoding is being used, a character is no longer just a single
byte. The macros for character handling generate simple sequences when used in
-byte-mode, and more complicated ones for UTF-8 characters. BACKCHAR should
-never be called in byte mode. To make sure it can never even appear when UTF-8
-support is omitted, we don't even define it. */
+byte-mode, and more complicated ones for UTF-8 characters. GETCHARLENTEST is
+not used when UTF-8 is not supported, so it is not defined, and BACKCHAR should
+never be called in byte mode. To make sure they can never even appear when
+UTF-8 support is omitted, we don't even define them. */
#ifndef SUPPORT_UTF8
#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARINCTEST(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
+/* #define GETCHARLENTEST(c, eptr, len) */
/* #define BACKCHAR(eptr) */
#else /* SUPPORT_UTF8 */
+/* These macros were originally written in the form of loops that used data
+from the tables whose names start with _pcre_utf8_table. They were rewritten by
+a user so as not to use loops, because in some environments this gives a
+significant performance advantage, and it seems never to do any harm. */
+
+/* Base macro to pick up the remaining bytes of a UTF-8 character, not
+advancing the pointer. On entry c must already hold the lead byte *eptr; the
+GETCHAR and GETCHARTEST macros only invoke this when c >= 0xc0. The bits 0x20,
+0x10, 0x08 and 0x04 of the lead byte are tested in turn to choose between one
+and five following bytes, and the low six bits of each following byte are
+merged into c. No check is made that the following bytes exist or that they
+are continuation bytes. Both arguments are expanded several times and without
+parentheses. */
+
+#define GETUTF8(c, eptr) \
+ { \
+ if ((c & 0x20) == 0) /* one following byte */ \
+ c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \
+ else if ((c & 0x10) == 0) /* two following bytes */ \
+ c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
+ else if ((c & 0x08) == 0) /* three following bytes */ \
+ c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \
+ ((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \
+ else if ((c & 0x04) == 0) /* four following bytes */ \
+ c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \
+ ((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \
+ (eptr[4] & 0x3f); \
+ else /* five following bytes */ \
+ c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \
+ ((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \
+ ((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \
+ }
+
/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */
#define GETCHAR(c, eptr) \
c = *eptr; \
- if (c >= 0xc0) \
- { \
- int gcii; \
- int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int gcss = 6*gcaa; \
- c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
- for (gcii = 1; gcii <= gcaa; gcii++) \
- { \
- gcss -= 6; \
- c |= (eptr[gcii] & 0x3f) << gcss; \
- } \
- }
+ if (c >= 0xc0) GETUTF8(c, eptr);
/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
pointer. */
#define GETCHARTEST(c, eptr) \
c = *eptr; \
- if (utf8 && c >= 0xc0) \
+ if (utf8 && c >= 0xc0) GETUTF8(c, eptr);
+
+/* Base macro to pick up the remaining bytes of a UTF-8 character, advancing
+the pointer. On entry c must hold the lead byte and eptr must already point at
+the byte after it (GETCHARINC and GETCHARINCTEST fetch c with *eptr++). On
+exit eptr has been moved past the one to five following bytes that were read.
+The following bytes are not checked, and both arguments are expanded several
+times and without parentheses. */
+
+#define GETUTF8INC(c, eptr) \
{ \
- int gcii; \
- int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int gcss = 6*gcaa; \
- c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
- for (gcii = 1; gcii <= gcaa; gcii++) \
+ if ((c & 0x20) == 0) /* one following byte */ \
+ c = ((c & 0x1f) << 6) | (*eptr++ & 0x3f); \
+ else if ((c & 0x10) == 0) /* two following bytes */ \
{ \
- gcss -= 6; \
- c |= (eptr[gcii] & 0x3f) << gcss; \
+ c = ((c & 0x0f) << 12) | ((*eptr & 0x3f) << 6) | (eptr[1] & 0x3f); \
+ eptr += 2; \
+ } \
+ else if ((c & 0x08) == 0) /* three following bytes */ \
+ { \
+ c = ((c & 0x07) << 18) | ((*eptr & 0x3f) << 12) | \
+ ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
+ eptr += 3; \
+ } \
+ else if ((c & 0x04) == 0) /* four following bytes */ \
+ { \
+ c = ((c & 0x03) << 24) | ((*eptr & 0x3f) << 18) | \
+ ((eptr[1] & 0x3f) << 12) | ((eptr[2] & 0x3f) << 6) | \
+ (eptr[3] & 0x3f); \
+ eptr += 4; \
+ } \
+ else /* five following bytes */ \
+ { \
+ c = ((c & 0x01) << 30) | ((*eptr & 0x3f) << 24) | \
+ ((eptr[1] & 0x3f) << 18) | ((eptr[2] & 0x3f) << 12) | \
+ ((eptr[3] & 0x3f) << 6) | (eptr[4] & 0x3f); \
+ eptr += 5; \
} \
}
#define GETCHARINC(c, eptr) \
c = *eptr++; \
- if (c >= 0xc0) \
- { \
- int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int gcss = 6*gcaa; \
- c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
- while (gcaa-- > 0) \
- { \
- gcss -= 6; \
- c |= (*eptr++ & 0x3f) << gcss; \
- } \
- }
+ if (c >= 0xc0) GETUTF8INC(c, eptr);
-/* Get the next character, testing for UTF-8 mode, and advancing the pointer */
+/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
+This is called when we don't know if we are in UTF-8 mode. */
#define GETCHARINCTEST(c, eptr) \
c = *eptr++; \
- if (utf8 && c >= 0xc0) \
+ if (utf8 && c >= 0xc0) GETUTF8INC(c, eptr);
+
+/* Base macro to pick up the remaining bytes of a UTF-8 character, not
+advancing the pointer, incrementing the length. On entry c must hold the lead
+byte *eptr (GETCHARLEN and GETCHARLENTEST fetch it that way and only invoke
+this when c >= 0xc0). len is increased by the number of following bytes read
+(1 to 5); it is never assigned here, so the caller supplies its starting
+value. The following bytes are not checked, and the arguments are expanded
+several times and without parentheses. */
+
+#define GETUTF8LEN(c, eptr, len) \
{ \
- int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int gcss = 6*gcaa; \
- c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
- while (gcaa-- > 0) \
+ if ((c & 0x20) == 0) /* one following byte */ \
+ { \
+ c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \
+ len++; \
+ } \
+ else if ((c & 0x10) == 0) /* two following bytes */ \
{ \
- gcss -= 6; \
- c |= (*eptr++ & 0x3f) << gcss; \
+ c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
+ len += 2; \
+ } \
+ else if ((c & 0x08) == 0) /* three following bytes */ \
+ {\
+ c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \
+ ((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \
+ len += 3; \
+ } \
+ else if ((c & 0x04) == 0) /* four following bytes */ \
+ { \
+ c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \
+ ((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \
+ (eptr[4] & 0x3f); \
+ len += 4; \
+ } \
+ else /* five following bytes */ \
+ {\
+ c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \
+ ((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \
+ ((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \
+ len += 5; \
} \
}
#define GETCHARLEN(c, eptr, len) \
c = *eptr; \
- if (c >= 0xc0) \
- { \
- int gcii; \
- int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int gcss = 6*gcaa; \
- c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
- for (gcii = 1; gcii <= gcaa; gcii++) \
- { \
- gcss -= 6; \
- c |= (eptr[gcii] & 0x3f) << gcss; \
- } \
- len += gcaa; \
- }
+ if (c >= 0xc0) GETUTF8LEN(c, eptr, len);
/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
pointer, incrementing length if there are extra bytes. This is called when we
-know we are in UTF-8 mode. */
+do not know if we are in UTF-8 mode. */
#define GETCHARLENTEST(c, eptr, len) \
c = *eptr; \
- if (utf8 && c >= 0xc0) \
- { \
- int gcii; \
- int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int gcss = 6*gcaa; \
- c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
- for (gcii = 1; gcii <= gcaa; gcii++) \
- { \
- gcss -= 6; \
- c |= (eptr[gcii] & 0x3f) << gcss; \
- } \
- len += gcaa; \
- }
+ if (utf8 && c >= 0xc0) GETUTF8LEN(c, eptr, len);
/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-8 mode - we don't put a test within the macro
#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--
-#endif
+#endif /* SUPPORT_UTF8 */
/* In case there is no definition of offsetof() provided - though any proper
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
- PCRE_JAVASCRIPT_COMPAT)
+ PCRE_JAVASCRIPT_COMPAT|PCRE_UCP|PCRE_NO_START_OPTIMIZE)
#define PUBLIC_EXEC_OPTIONS \
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \
environments where these macros are defined elsewhere. Unfortunately, there
is no way to do the same for the typedef. */
-typedef gboolean BOOL;
+typedef gboolean BOOL;
/* If PCRE is to support UTF-8 on EBCDIC platforms, we cannot use normal
character constants like '*' because the compiler would emit their EBCDIC code,
#define STRING_COMMIT0 "COMMIT\0"
#define STRING_F0 "F\0"
#define STRING_FAIL0 "FAIL\0"
+#define STRING_MARK0 "MARK\0"
#define STRING_PRUNE0 "PRUNE\0"
#define STRING_SKIP0 "SKIP\0"
#define STRING_THEN "THEN"
#define STRING_DEFINE "DEFINE"
-#define STRING_CR_RIGHTPAR "CR)"
-#define STRING_LF_RIGHTPAR "LF)"
-#define STRING_CRLF_RIGHTPAR "CRLF)"
-#define STRING_ANY_RIGHTPAR "ANY)"
-#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)"
-#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
-#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
-#define STRING_UTF8_RIGHTPAR "UTF8)"
+#define STRING_CR_RIGHTPAR "CR)"
+#define STRING_LF_RIGHTPAR "LF)"
+#define STRING_CRLF_RIGHTPAR "CRLF)"
+#define STRING_ANY_RIGHTPAR "ANY)"
+#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)"
+#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
+#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
+#define STRING_UTF8_RIGHTPAR "UTF8)"
+#define STRING_UCP_RIGHTPAR "UCP)"
+#define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)"
#else /* SUPPORT_UTF8 */
#define STRING_COMMIT0 STR_C STR_O STR_M STR_M STR_I STR_T "\0"
#define STRING_F0 STR_F "\0"
#define STRING_FAIL0 STR_F STR_A STR_I STR_L "\0"
+#define STRING_MARK0 STR_M STR_A STR_R STR_K "\0"
#define STRING_PRUNE0 STR_P STR_R STR_U STR_N STR_E "\0"
#define STRING_SKIP0 STR_S STR_K STR_I STR_P "\0"
#define STRING_THEN STR_T STR_H STR_E STR_N
#define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E
-#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
-#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
-#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
-#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
+#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
+#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
+#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
+#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
+#define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS
+#define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS
#endif /* SUPPORT_UTF8 */
#define PT_ANY 0 /* Any property - matches all chars */
#define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */
-#define PT_GC 2 /* General characteristic (e.g. L) */
-#define PT_PC 3 /* Particular characteristic (e.g. Lu) */
+#define PT_GC 2 /* Specified general characteristic (e.g. L) */
+#define PT_PC 3 /* Specified particular characteristic (e.g. Lu) */
#define PT_SC 4 /* Script (e.g. Han) */
+#define PT_ALNUM 5 /* Alphanumeric - the union of L and N */
+#define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 */
+#define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 */
+#define PT_WORD 8 /* Word - L plus N plus underscore */
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
contain UTF-8 characters with values greater than 255. */
/* These are escaped items that aren't just an encoding of a particular data
value such as \n. They must have non-zero values, as check_escape() returns
their negation. Also, they must appear in the same order as in the opcode
-definitions below, up to ESC_z. There's a dummy for OP_ANY because it
-corresponds to "." rather than an escape sequence, and another for OP_ALLANY
-(which is used for [^] in JavaScript compatibility mode).
+definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
+corresponds to "." in DOTALL mode rather than an escape sequence. It is also
+used for [^] in JavaScript compatibility mode. In non-DOTALL mode, "." behaves
+like \N.
+
+The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
+when PCRE_UCP is set, when replacement of \d etc by \p sequences is required.
+They must be contiguous, and remain in order so that the replacements can be
+looked up from a table.
The final escape must be ESC_REF as subsequent values are used for
backreferences (\1, \2, \3, etc). There are two tests in the code for an escape
*/
enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
- ESC_W, ESC_w, ESC_dum1, ESC_dum2, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
- ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_g, ESC_k,
+ ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, /* ESC_N is 12 (OP_ANY), ESC_dum is 13 (OP_ALLANY) */
+ ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, /* order up to ESC_z must follow the opcode list */
+ ESC_E, ESC_Q, ESC_g, ESC_k,
+ ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu, /* PCRE_UCP variants: keep contiguous and in this order */
ESC_REF };
-
/* Opcode table: Starting from 1 (i.e. after OP_END), the values up to
OP_EOD must correspond in order to the list of escapes immediately above.
OP_WHITESPACE, /* 9 \s */
OP_NOT_WORDCHAR, /* 10 \W */
OP_WORDCHAR, /* 11 \w */
- OP_ANY, /* 12 Match any character (subject to DOTALL) */
- OP_ALLANY, /* 13 Match any character (not subject to DOTALL) */
+ OP_ANY, /* 12 Match any character except newline */
+ OP_ALLANY, /* 13 Match any character */
OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
OP_NOTPROP, /* 15 \P (not Unicode property) */
OP_PROP, /* 16 \p (Unicode property) */
/* These are backtracking control verbs */
- OP_PRUNE, /* 107 */
- OP_SKIP, /* 108 */
- OP_THEN, /* 109 */
- OP_COMMIT, /* 110 */
+ OP_MARK, /* 107 always has an argument */
+ OP_PRUNE, /* 108 */
+ OP_PRUNE_ARG, /* 109 same, but with argument */
+ OP_SKIP, /* 110 */
+ OP_SKIP_ARG, /* 111 same, but with argument */
+ OP_THEN, /* 112 */
+ OP_THEN_ARG, /* 113 same, but with argument */
+ OP_COMMIT, /* 114 */
/* These are forced failure and success verbs */
- OP_FAIL, /* 111 */
- OP_ACCEPT, /* 112 */
- OP_CLOSE, /* 113 Used before OP_ACCEPT to close open captures */
+ OP_FAIL, /* 115 */
+ OP_ACCEPT, /* 116 */
+ OP_CLOSE, /* 117 Used before OP_ACCEPT to close open captures */
/* This is used to skip a subpattern with a {0} quantifier */
- OP_SKIPZERO, /* 114 */
+ OP_SKIPZERO, /* 118 */
/* This is not an opcode, but is used to check that tables indexed by opcode
are the correct length, in order to catch updating errors - there have been
/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
definitions that follow must also be updated to match. There are also tables
-called "coptable" cna "poptable" in pcre_dfa_exec.c that must be updated. */
+called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
/* This macro defines textual names for all the opcodes. These are used only
"Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \
"Cond ref", "Cond nref", "Cond rec", "Cond nrec", "Cond def", \
"Brazero", "Braminzero", \
- "*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
+ "*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \
+ "*THEN", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
"Close", "Skip zero"
3, 3, /* RREF, NRREF */ \
1, /* DEF */ \
1, 1, /* BRAZERO, BRAMINZERO */ \
- 1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \
- 1, 1, 3, 1 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
+ 3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \
+ 1, 3, /* SKIP, SKIP_ARG */ \
+ 1+LINK_SIZE, 3+LINK_SIZE, /* THEN, THEN_ARG */ \
+ 1, 1, 1, 3, 1 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
/* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion"
ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
- ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERRCOUNT };
+ ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68,
+ ERRCOUNT };
/* The real format of the start of the pcre block; the index of names and the
code vector run on as long as necessary after the end. We store an explicit
BOOL noteol; /* NOTEOL flag */
BOOL utf8; /* UTF8 flag */
BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */
+ BOOL use_ucp; /* PCRE_UCP flag */
BOOL endonly; /* Dollar not before final \n */
BOOL notempty; /* Empty string match not wanted */
BOOL notempty_atstart; /* Empty string match at start not wanted */
int eptrn; /* Next free eptrblock */
recursion_info *recursive; /* Linked list of recursion data */
void *callout_data; /* To pass back to callouts */
+ const uschar *mark; /* Mark pointer to pass back */
} match_data;
/* A similar structure is used for the same purpose by the DFA matching
extern int _pcre_ord2utf8(int, uschar *);
extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
const pcre_study_data *, pcre_study_data *);
-#define _pcre_valid_utf8(u, i) TRUE
+#define _pcre_valid_utf8(USPTR, int) TRUE /* NOTE(review): pcre_exec() in this patch treats a result >= 0 as an invalid-UTF-8 offset, so this stub reads as "invalid"; presumably callers always skip the check - confirm */
extern BOOL _pcre_was_newline(USPTR, int, USPTR, int *, BOOL);
extern BOOL _pcre_xclass(int, const uschar *);
#include "pcre_internal.h"
+/* Set the bit for byte value c in the bit map named start_bits, which must be
+in scope wherever the macro is used. The argument is parenthesized so that an
+expression argument such as SET_BIT(x + 1) selects the intended byte and bit;
+it is still evaluated twice, so it must have no side effects. */
+#define SET_BIT(c) start_bits[(c)/8] |= (1 << ((c)&7))
/* Returns from set_start_bits() */
#endif
break;
+ /* Skip these, but we need to add in the name length. */
+
+ case OP_MARK:
+ case OP_PRUNE_ARG:
+ case OP_SKIP_ARG:
+ cc += _pcre_OP_lengths[op] + cc[1];
+ break;
+
+ case OP_THEN_ARG:
+ cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
+ break;
+
/* For the record, these are the opcodes that are matched by "default":
OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
OP_THEN. */
* Set a bit and maybe its alternate case *
*************************************************/
-/* Given a character, set its bit in the table, and also the bit for the other
-version of a letter if we are caseless.
+/* Given a character, set its first byte's bit in the table, and also the
+corresponding bit for the other version of a letter if we are caseless. In
+UTF-8 mode, for characters greater than 127, we can only do the caseless thing
+when Unicode property support is available.
Arguments:
start_bits points to the bit map
- c is the character
+ p points to the character
caseless the caseless flag
cd the block with char table pointers
+ utf8 TRUE for UTF-8 mode
-Returns: nothing
+Returns: pointer after the character
+*/
+
+static const uschar *
+set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
+ compile_data *cd, BOOL utf8)
+{
+unsigned int c = *p; /* first byte of the character at p */
+
+SET_BIT(c); /* the first byte is recorded in every case */
+
+#ifdef SUPPORT_UTF8
+if (utf8 && c > 127)
+ {
+ GETCHARINC(c, p); /* c becomes the whole character; p moves past it */
+#ifdef SUPPORT_UCP
+ if (caseless)
+ {
+ uschar buff[8];
+ c = UCD_OTHERCASE(c); /* presumably the other-case code point - defined elsewhere */
+ (void)_pcre_ord2utf8(c, buff);
+ SET_BIT(buff[0]); /* only the first byte of that encoding is recorded */
+ }
+#endif
+ return p;
+ }
+#endif
+
+/* Not UTF-8 mode, or character is less than 128. */
+
+if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
+return p + 1; /* a single byte was consumed */
+}
+
+
+
+/*************************************************
+* Set bits for a positive character type *
+*************************************************/
+
+/* This function sets starting bits for a character type. In UTF-8 mode, we can
+only do a direct setting for bytes less than 128, as otherwise there can be
+confusion with bytes in the middle of UTF-8 characters. In a "traditional"
+environment, the tables will only recognize ASCII characters anyway, but in at
+least one Windows environment, bits for some bytes above 127 were set in the tables.
+So we deal with that case by considering the UTF-8 encoding.
+
+Arguments:
+ start_bits the starting bitmap
+ cbit_type offset within cd->cbits of the bitmap for the type of character wanted
+ table_limit 32 for non-UTF-8; 16 for UTF-8
+ cd the block with char table pointers
+
+Returns: nothing
*/
static void
-set_table_bit(uschar *start_bits, unsigned int c, BOOL caseless,
+set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
compile_data *cd)
{
-start_bits[c/8] |= (1 << (c&7));
-if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
- start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
+register int c;
+
+/* OR in the first table_limit bytes of the bitmap for this type. With a limit
+of 16 this covers only byte values below 128. */
+
+for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
+if (table_limit == 32) return;
+
+/* With any other limit, characters 128-255 are not copied as raw bytes. For
+each one that the bitmap for THIS type marks (hence the cbit_type offset, the
+same one the loop above applies), set the bit for the first byte of its UTF-8
+encoding instead. */
+
+for (c = 128; c < 256; c++)
+ {
+ if ((cd->cbits[c/8 + cbit_type] & (1 << (c&7))) != 0)
+ {
+ uschar buff[8];
+ (void)_pcre_ord2utf8(c, buff);
+ SET_BIT(buff[0]);
+ }
+ }
+}
+
+
+/*************************************************
+* Set bits for a negative character type *
+*************************************************/
+
+/* This function sets starting bits for a negative character type such as \D.
+In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
+otherwise there can be confusion with bytes in the middle of UTF-8 characters.
+Unlike in the positive case, where we can set appropriate starting bits for
+specific high-valued UTF-8 characters, in this case we have to set the bits for
+all high-valued characters. The lowest is 0xc2, but we overkill by starting at
+0xc0 (192) for simplicity.
+
+Arguments:
+ start_bits the starting bitmap
+ cbit_type offset within cd->cbits of the bitmap for the type of character wanted
+ table_limit 32 for non-UTF-8; 16 for UTF-8
+ cd the block with char table pointers
+
+Returns: nothing
+*/
+
+static void
+set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
+ compile_data *cd)
+{
+register int c;
+for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type]; /* OR in the complement of the type's bitmap */
+if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff; /* map bytes 24-31 hold the bits for values 0xc0-0xff */
}
{
register int c;
int yield = SSB_DONE;
+int table_limit = utf8? 16:32;
#if 0
/* ========================================================================= */
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
- set_table_bit(start_bits, tcode[1], caseless, cd);
- tcode += 2;
-#ifdef SUPPORT_UTF8
- if (utf8 && tcode[-1] >= 0xc0)
- tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
-#endif
+ tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
break;
/* Single-char upto sets the bit and tries the next */
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
- set_table_bit(start_bits, tcode[3], caseless, cd);
- tcode += 4;
-#ifdef SUPPORT_UTF8
- if (utf8 && tcode[-1] >= 0xc0)
- tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
-#endif
+ tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
break;
/* At least one single char sets the bit and stops */
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
- set_table_bit(start_bits, tcode[1], caseless, cd);
+ (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
+ try_next = FALSE;
+ break;
+
+ /* Special spacing and line-terminating items. These recognize specific
+ lists of characters. The difference between VSPACE and ANYNL is that the
+ latter can match the two-character CRLF sequence, but that is not
+ relevant for finding the first character, so their code here is
+ identical. */
+
+ case OP_HSPACE:
+ SET_BIT(0x09);
+ SET_BIT(0x20);
+ if (utf8)
+ {
+ SET_BIT(0xC2); /* For U+00A0 */
+ SET_BIT(0xE1); /* For U+1680, U+180E */
+ SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
+ SET_BIT(0xE3); /* For U+3000 */
+ }
+ else SET_BIT(0xA0);
+ try_next = FALSE;
+ break;
+
+ case OP_ANYNL:
+ case OP_VSPACE:
+ SET_BIT(0x0A);
+ SET_BIT(0x0B);
+ SET_BIT(0x0C);
+ SET_BIT(0x0D);
+ if (utf8)
+ {
+ SET_BIT(0xC2); /* For U+0085 */
+ SET_BIT(0xE2); /* For U+2028, U+2029 */
+ }
+ else SET_BIT(0x85);
try_next = FALSE;
break;
- /* Single character type sets the bits and stops */
+ /* Single character types set the bits and stop. Note that if PCRE_UCP
+ is set, we do not see these op codes because \d etc are converted to
+ properties. Therefore, these apply in the case when only characters less
+ than 256 are recognized to match the types. */
case OP_NOT_DIGIT:
- for (c = 0; c < 32; c++)
- start_bits[c] |= ~cd->cbits[c+cbit_digit];
+ set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
try_next = FALSE;
break;
case OP_DIGIT:
- for (c = 0; c < 32; c++)
- start_bits[c] |= cd->cbits[c+cbit_digit];
+ set_type_bits(start_bits, cbit_digit, table_limit, cd);
try_next = FALSE;
break;
/* The cbit_space table has vertical tab as whitespace; we have to
- discard it. */
+ ensure it is set as not whitespace. */
case OP_NOT_WHITESPACE:
- for (c = 0; c < 32; c++)
- {
- int d = cd->cbits[c+cbit_space];
- if (c == 1) d &= ~0x08;
- start_bits[c] |= ~d;
- }
+ set_nottype_bits(start_bits, cbit_space, table_limit, cd);
+ start_bits[1] |= 0x08;
try_next = FALSE;
break;
/* The cbit_space table has vertical tab as whitespace; we have to
- discard it. */
+ not set it from the table. */
case OP_WHITESPACE:
- for (c = 0; c < 32; c++)
- {
- int d = cd->cbits[c+cbit_space];
- if (c == 1) d &= ~0x08;
- start_bits[c] |= d;
- }
+ c = start_bits[1]; /* Save in case it was already set */
+ set_type_bits(start_bits, cbit_space, table_limit, cd);
+ start_bits[1] = (start_bits[1] & ~0x08) | c;
try_next = FALSE;
break;
case OP_NOT_WORDCHAR:
- for (c = 0; c < 32; c++)
- start_bits[c] |= ~cd->cbits[c+cbit_word];
+ set_nottype_bits(start_bits, cbit_word, table_limit, cd);
try_next = FALSE;
break;
case OP_WORDCHAR:
- for (c = 0; c < 32; c++)
- start_bits[c] |= cd->cbits[c+cbit_word];
+ set_type_bits(start_bits, cbit_word, table_limit, cd);
try_next = FALSE;
break;
case OP_TYPEPLUS:
case OP_TYPEMINPLUS:
+ case OP_TYPEPOSPLUS:
tcode++;
break;
case OP_TYPEPOSQUERY:
switch(tcode[1])
{
+ default:
case OP_ANY:
case OP_ALLANY:
return SSB_FAIL;
+ case OP_HSPACE:
+ SET_BIT(0x09);
+ SET_BIT(0x20);
+ if (utf8)
+ {
+ SET_BIT(0xC2); /* For U+00A0 */
+ SET_BIT(0xE1); /* For U+1680, U+180E */
+ SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
+ SET_BIT(0xE3); /* For U+3000 */
+ }
+ else SET_BIT(0xA0);
+ break;
+
+ case OP_ANYNL:
+ case OP_VSPACE:
+ SET_BIT(0x0A);
+ SET_BIT(0x0B);
+ SET_BIT(0x0C);
+ SET_BIT(0x0D);
+ if (utf8)
+ {
+ SET_BIT(0xC2); /* For U+0085 */
+ SET_BIT(0xE2); /* For U+2028, U+2029 */
+ }
+ else SET_BIT(0x85);
+ break;
+
case OP_NOT_DIGIT:
- for (c = 0; c < 32; c++)
- start_bits[c] |= ~cd->cbits[c+cbit_digit];
+ set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
break;
case OP_DIGIT:
- for (c = 0; c < 32; c++)
- start_bits[c] |= cd->cbits[c+cbit_digit];
+ set_type_bits(start_bits, cbit_digit, table_limit, cd);
break;
/* The cbit_space table has vertical tab as whitespace; we have to
- discard it. */
+ ensure it gets set as not whitespace. */
case OP_NOT_WHITESPACE:
- for (c = 0; c < 32; c++)
- {
- int d = cd->cbits[c+cbit_space];
- if (c == 1) d &= ~0x08;
- start_bits[c] |= ~d;
- }
+ set_nottype_bits(start_bits, cbit_space, table_limit, cd);
+ start_bits[1] |= 0x08;
break;
/* The cbit_space table has vertical tab as whitespace; we have to
- discard it. */
+ avoid setting it. */
case OP_WHITESPACE:
- for (c = 0; c < 32; c++)
- {
- int d = cd->cbits[c+cbit_space];
- if (c == 1) d &= ~0x08;
- start_bits[c] |= d;
- }
+ c = start_bits[1]; /* Save in case it was already set */
+ set_type_bits(start_bits, cbit_space, table_limit, cd);
+ start_bits[1] = (start_bits[1] & ~0x08) | c;
break;
case OP_NOT_WORDCHAR:
- for (c = 0; c < 32; c++)
- start_bits[c] |= ~cd->cbits[c+cbit_word];
+ set_nottype_bits(start_bits, cbit_word, table_limit, cd);
break;
case OP_WORDCHAR:
- for (c = 0; c < 32; c++)
- start_bits[c] |= cd->cbits[c+cbit_word];
+ set_type_bits(start_bits, cbit_word, table_limit, cd);
break;
}
#define STRING_Avestan0 STR_A STR_v STR_e STR_s STR_t STR_a STR_n "\0"
#define STRING_Balinese0 STR_B STR_a STR_l STR_i STR_n STR_e STR_s STR_e "\0"
#define STRING_Bamum0 STR_B STR_a STR_m STR_u STR_m "\0"
+#define STRING_Batak0 STR_B STR_a STR_t STR_a STR_k "\0"
#define STRING_Bengali0 STR_B STR_e STR_n STR_g STR_a STR_l STR_i "\0"
#define STRING_Bopomofo0 STR_B STR_o STR_p STR_o STR_m STR_o STR_f STR_o "\0"
+#define STRING_Brahmi0 STR_B STR_r STR_a STR_h STR_m STR_i "\0"
#define STRING_Braille0 STR_B STR_r STR_a STR_i STR_l STR_l STR_e "\0"
#define STRING_Buginese0 STR_B STR_u STR_g STR_i STR_n STR_e STR_s STR_e "\0"
#define STRING_Buhid0 STR_B STR_u STR_h STR_i STR_d "\0"
#define STRING_Lu0 STR_L STR_u "\0"
#define STRING_Lycian0 STR_L STR_y STR_c STR_i STR_a STR_n "\0"
#define STRING_Lydian0 STR_L STR_y STR_d STR_i STR_a STR_n "\0"
+#define STRING_Mandaic0 STR_M STR_a STR_n STR_d STR_a STR_i STR_c "\0"
#define STRING_M0 STR_M "\0"
#define STRING_Malayalam0 STR_M STR_a STR_l STR_a STR_y STR_a STR_l STR_a STR_m "\0"
#define STRING_Mc0 STR_M STR_c "\0"
#define STRING_Tifinagh0 STR_T STR_i STR_f STR_i STR_n STR_a STR_g STR_h "\0"
#define STRING_Ugaritic0 STR_U STR_g STR_a STR_r STR_i STR_t STR_i STR_c "\0"
#define STRING_Vai0 STR_V STR_a STR_i "\0"
+#define STRING_Xan0 STR_X STR_a STR_n "\0"
+#define STRING_Xps0 STR_X STR_p STR_s "\0"
+#define STRING_Xsp0 STR_X STR_s STR_p "\0"
+#define STRING_Xwd0 STR_X STR_w STR_d "\0"
#define STRING_Yi0 STR_Y STR_i "\0"
#define STRING_Z0 STR_Z "\0"
#define STRING_Zl0 STR_Z STR_l "\0"
STRING_Avestan0
STRING_Balinese0
STRING_Bamum0
+ STRING_Batak0
STRING_Bengali0
STRING_Bopomofo0
+ STRING_Brahmi0
STRING_Braille0
STRING_Buginese0
STRING_Buhid0
STRING_Lydian0
STRING_M0
STRING_Malayalam0
+ STRING_Mandaic0
STRING_Mc0
STRING_Me0
STRING_Meetei_Mayek0
STRING_Tifinagh0
STRING_Ugaritic0
STRING_Vai0
+ STRING_Xan0
+ STRING_Xps0
+ STRING_Xsp0
+ STRING_Xwd0
STRING_Yi0
STRING_Z0
STRING_Zl0
{ 20, PT_SC, ucp_Avestan },
{ 28, PT_SC, ucp_Balinese },
{ 37, PT_SC, ucp_Bamum },
- { 43, PT_SC, ucp_Bengali },
- { 51, PT_SC, ucp_Bopomofo },
- { 60, PT_SC, ucp_Braille },
- { 68, PT_SC, ucp_Buginese },
- { 77, PT_SC, ucp_Buhid },
- { 83, PT_GC, ucp_C },
- { 85, PT_SC, ucp_Canadian_Aboriginal },
- { 105, PT_SC, ucp_Carian },
- { 112, PT_PC, ucp_Cc },
- { 115, PT_PC, ucp_Cf },
- { 118, PT_SC, ucp_Cham },
- { 123, PT_SC, ucp_Cherokee },
- { 132, PT_PC, ucp_Cn },
- { 135, PT_PC, ucp_Co },
- { 138, PT_SC, ucp_Common },
- { 145, PT_SC, ucp_Coptic },
- { 152, PT_PC, ucp_Cs },
- { 155, PT_SC, ucp_Cuneiform },
- { 165, PT_SC, ucp_Cypriot },
- { 173, PT_SC, ucp_Cyrillic },
- { 182, PT_SC, ucp_Deseret },
- { 190, PT_SC, ucp_Devanagari },
- { 201, PT_SC, ucp_Egyptian_Hieroglyphs },
- { 222, PT_SC, ucp_Ethiopic },
- { 231, PT_SC, ucp_Georgian },
- { 240, PT_SC, ucp_Glagolitic },
- { 251, PT_SC, ucp_Gothic },
- { 258, PT_SC, ucp_Greek },
- { 264, PT_SC, ucp_Gujarati },
- { 273, PT_SC, ucp_Gurmukhi },
- { 282, PT_SC, ucp_Han },
- { 286, PT_SC, ucp_Hangul },
- { 293, PT_SC, ucp_Hanunoo },
- { 301, PT_SC, ucp_Hebrew },
- { 308, PT_SC, ucp_Hiragana },
- { 317, PT_SC, ucp_Imperial_Aramaic },
- { 334, PT_SC, ucp_Inherited },
- { 344, PT_SC, ucp_Inscriptional_Pahlavi },
- { 366, PT_SC, ucp_Inscriptional_Parthian },
- { 389, PT_SC, ucp_Javanese },
- { 398, PT_SC, ucp_Kaithi },
- { 405, PT_SC, ucp_Kannada },
- { 413, PT_SC, ucp_Katakana },
- { 422, PT_SC, ucp_Kayah_Li },
- { 431, PT_SC, ucp_Kharoshthi },
- { 442, PT_SC, ucp_Khmer },
- { 448, PT_GC, ucp_L },
- { 450, PT_LAMP, 0 },
- { 453, PT_SC, ucp_Lao },
- { 457, PT_SC, ucp_Latin },
- { 463, PT_SC, ucp_Lepcha },
- { 470, PT_SC, ucp_Limbu },
- { 476, PT_SC, ucp_Linear_B },
- { 485, PT_SC, ucp_Lisu },
- { 490, PT_PC, ucp_Ll },
- { 493, PT_PC, ucp_Lm },
- { 496, PT_PC, ucp_Lo },
- { 499, PT_PC, ucp_Lt },
- { 502, PT_PC, ucp_Lu },
- { 505, PT_SC, ucp_Lycian },
- { 512, PT_SC, ucp_Lydian },
- { 519, PT_GC, ucp_M },
- { 521, PT_SC, ucp_Malayalam },
- { 531, PT_PC, ucp_Mc },
- { 534, PT_PC, ucp_Me },
- { 537, PT_SC, ucp_Meetei_Mayek },
- { 550, PT_PC, ucp_Mn },
- { 553, PT_SC, ucp_Mongolian },
- { 563, PT_SC, ucp_Myanmar },
- { 571, PT_GC, ucp_N },
- { 573, PT_PC, ucp_Nd },
- { 576, PT_SC, ucp_New_Tai_Lue },
- { 588, PT_SC, ucp_Nko },
- { 592, PT_PC, ucp_Nl },
- { 595, PT_PC, ucp_No },
- { 598, PT_SC, ucp_Ogham },
- { 604, PT_SC, ucp_Ol_Chiki },
- { 613, PT_SC, ucp_Old_Italic },
- { 624, PT_SC, ucp_Old_Persian },
- { 636, PT_SC, ucp_Old_South_Arabian },
- { 654, PT_SC, ucp_Old_Turkic },
- { 665, PT_SC, ucp_Oriya },
- { 671, PT_SC, ucp_Osmanya },
- { 679, PT_GC, ucp_P },
- { 681, PT_PC, ucp_Pc },
- { 684, PT_PC, ucp_Pd },
- { 687, PT_PC, ucp_Pe },
- { 690, PT_PC, ucp_Pf },
- { 693, PT_SC, ucp_Phags_Pa },
- { 702, PT_SC, ucp_Phoenician },
- { 713, PT_PC, ucp_Pi },
- { 716, PT_PC, ucp_Po },
- { 719, PT_PC, ucp_Ps },
- { 722, PT_SC, ucp_Rejang },
- { 729, PT_SC, ucp_Runic },
- { 735, PT_GC, ucp_S },
- { 737, PT_SC, ucp_Samaritan },
- { 747, PT_SC, ucp_Saurashtra },
- { 758, PT_PC, ucp_Sc },
- { 761, PT_SC, ucp_Shavian },
- { 769, PT_SC, ucp_Sinhala },
- { 777, PT_PC, ucp_Sk },
- { 780, PT_PC, ucp_Sm },
- { 783, PT_PC, ucp_So },
- { 786, PT_SC, ucp_Sundanese },
- { 796, PT_SC, ucp_Syloti_Nagri },
- { 809, PT_SC, ucp_Syriac },
- { 816, PT_SC, ucp_Tagalog },
- { 824, PT_SC, ucp_Tagbanwa },
- { 833, PT_SC, ucp_Tai_Le },
- { 840, PT_SC, ucp_Tai_Tham },
- { 849, PT_SC, ucp_Tai_Viet },
- { 858, PT_SC, ucp_Tamil },
- { 864, PT_SC, ucp_Telugu },
- { 871, PT_SC, ucp_Thaana },
- { 878, PT_SC, ucp_Thai },
- { 883, PT_SC, ucp_Tibetan },
- { 891, PT_SC, ucp_Tifinagh },
- { 900, PT_SC, ucp_Ugaritic },
- { 909, PT_SC, ucp_Vai },
- { 913, PT_SC, ucp_Yi },
- { 916, PT_GC, ucp_Z },
- { 918, PT_PC, ucp_Zl },
- { 921, PT_PC, ucp_Zp },
- { 924, PT_PC, ucp_Zs }
+ { 43, PT_SC, ucp_Batak },
+ { 49, PT_SC, ucp_Bengali },
+ { 57, PT_SC, ucp_Bopomofo },
+ { 66, PT_SC, ucp_Brahmi },
+ { 73, PT_SC, ucp_Braille },
+ { 81, PT_SC, ucp_Buginese },
+ { 90, PT_SC, ucp_Buhid },
+ { 96, PT_GC, ucp_C },
+ { 98, PT_SC, ucp_Canadian_Aboriginal },
+ { 118, PT_SC, ucp_Carian },
+ { 125, PT_PC, ucp_Cc },
+ { 128, PT_PC, ucp_Cf },
+ { 131, PT_SC, ucp_Cham },
+ { 136, PT_SC, ucp_Cherokee },
+ { 145, PT_PC, ucp_Cn },
+ { 148, PT_PC, ucp_Co },
+ { 151, PT_SC, ucp_Common },
+ { 158, PT_SC, ucp_Coptic },
+ { 165, PT_PC, ucp_Cs },
+ { 168, PT_SC, ucp_Cuneiform },
+ { 178, PT_SC, ucp_Cypriot },
+ { 186, PT_SC, ucp_Cyrillic },
+ { 195, PT_SC, ucp_Deseret },
+ { 203, PT_SC, ucp_Devanagari },
+ { 214, PT_SC, ucp_Egyptian_Hieroglyphs },
+ { 235, PT_SC, ucp_Ethiopic },
+ { 244, PT_SC, ucp_Georgian },
+ { 253, PT_SC, ucp_Glagolitic },
+ { 264, PT_SC, ucp_Gothic },
+ { 271, PT_SC, ucp_Greek },
+ { 277, PT_SC, ucp_Gujarati },
+ { 286, PT_SC, ucp_Gurmukhi },
+ { 295, PT_SC, ucp_Han },
+ { 299, PT_SC, ucp_Hangul },
+ { 306, PT_SC, ucp_Hanunoo },
+ { 314, PT_SC, ucp_Hebrew },
+ { 321, PT_SC, ucp_Hiragana },
+ { 330, PT_SC, ucp_Imperial_Aramaic },
+ { 347, PT_SC, ucp_Inherited },
+ { 357, PT_SC, ucp_Inscriptional_Pahlavi },
+ { 379, PT_SC, ucp_Inscriptional_Parthian },
+ { 402, PT_SC, ucp_Javanese },
+ { 411, PT_SC, ucp_Kaithi },
+ { 418, PT_SC, ucp_Kannada },
+ { 426, PT_SC, ucp_Katakana },
+ { 435, PT_SC, ucp_Kayah_Li },
+ { 444, PT_SC, ucp_Kharoshthi },
+ { 455, PT_SC, ucp_Khmer },
+ { 461, PT_GC, ucp_L },
+ { 463, PT_LAMP, 0 },
+ { 466, PT_SC, ucp_Lao },
+ { 470, PT_SC, ucp_Latin },
+ { 476, PT_SC, ucp_Lepcha },
+ { 483, PT_SC, ucp_Limbu },
+ { 489, PT_SC, ucp_Linear_B },
+ { 498, PT_SC, ucp_Lisu },
+ { 503, PT_PC, ucp_Ll },
+ { 506, PT_PC, ucp_Lm },
+ { 509, PT_PC, ucp_Lo },
+ { 512, PT_PC, ucp_Lt },
+ { 515, PT_PC, ucp_Lu },
+ { 518, PT_SC, ucp_Lycian },
+ { 525, PT_SC, ucp_Lydian },
+ { 532, PT_GC, ucp_M },
+ { 534, PT_SC, ucp_Malayalam },
+ { 544, PT_SC, ucp_Mandaic },
+ { 552, PT_PC, ucp_Mc },
+ { 555, PT_PC, ucp_Me },
+ { 558, PT_SC, ucp_Meetei_Mayek },
+ { 571, PT_PC, ucp_Mn },
+ { 574, PT_SC, ucp_Mongolian },
+ { 584, PT_SC, ucp_Myanmar },
+ { 592, PT_GC, ucp_N },
+ { 594, PT_PC, ucp_Nd },
+ { 597, PT_SC, ucp_New_Tai_Lue },
+ { 609, PT_SC, ucp_Nko },
+ { 613, PT_PC, ucp_Nl },
+ { 616, PT_PC, ucp_No },
+ { 619, PT_SC, ucp_Ogham },
+ { 625, PT_SC, ucp_Ol_Chiki },
+ { 634, PT_SC, ucp_Old_Italic },
+ { 645, PT_SC, ucp_Old_Persian },
+ { 657, PT_SC, ucp_Old_South_Arabian },
+ { 675, PT_SC, ucp_Old_Turkic },
+ { 686, PT_SC, ucp_Oriya },
+ { 692, PT_SC, ucp_Osmanya },
+ { 700, PT_GC, ucp_P },
+ { 702, PT_PC, ucp_Pc },
+ { 705, PT_PC, ucp_Pd },
+ { 708, PT_PC, ucp_Pe },
+ { 711, PT_PC, ucp_Pf },
+ { 714, PT_SC, ucp_Phags_Pa },
+ { 723, PT_SC, ucp_Phoenician },
+ { 734, PT_PC, ucp_Pi },
+ { 737, PT_PC, ucp_Po },
+ { 740, PT_PC, ucp_Ps },
+ { 743, PT_SC, ucp_Rejang },
+ { 750, PT_SC, ucp_Runic },
+ { 756, PT_GC, ucp_S },
+ { 758, PT_SC, ucp_Samaritan },
+ { 768, PT_SC, ucp_Saurashtra },
+ { 779, PT_PC, ucp_Sc },
+ { 782, PT_SC, ucp_Shavian },
+ { 790, PT_SC, ucp_Sinhala },
+ { 798, PT_PC, ucp_Sk },
+ { 801, PT_PC, ucp_Sm },
+ { 804, PT_PC, ucp_So },
+ { 807, PT_SC, ucp_Sundanese },
+ { 817, PT_SC, ucp_Syloti_Nagri },
+ { 830, PT_SC, ucp_Syriac },
+ { 837, PT_SC, ucp_Tagalog },
+ { 845, PT_SC, ucp_Tagbanwa },
+ { 854, PT_SC, ucp_Tai_Le },
+ { 861, PT_SC, ucp_Tai_Tham },
+ { 870, PT_SC, ucp_Tai_Viet },
+ { 879, PT_SC, ucp_Tamil },
+ { 885, PT_SC, ucp_Telugu },
+ { 892, PT_SC, ucp_Thaana },
+ { 899, PT_SC, ucp_Thai },
+ { 904, PT_SC, ucp_Tibetan },
+ { 912, PT_SC, ucp_Tifinagh },
+ { 921, PT_SC, ucp_Ugaritic },
+ { 930, PT_SC, ucp_Vai },
+ { 934, PT_ALNUM, 0 },
+ { 938, PT_PXSPACE, 0 },
+ { 942, PT_SPACE, 0 },
+ { 946, PT_WORD, 0 },
+ { 950, PT_SC, ucp_Yi },
+ { 953, PT_GC, ucp_Z },
+ { 955, PT_PC, ucp_Zl },
+ { 958, PT_PC, ucp_Zp },
+ { 961, PT_PC, ucp_Zs }
};
const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table);
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
- Copyright (c) 1997-2009 University of Cambridge
+ Copyright (c) 1997-2010 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
else /* XCL_PROP & XCL_NOTPROP */
{
int chartype = UCD_CHARTYPE(c);
+
switch(*data)
{
case PT_ANY:
break;
case PT_LAMP:
- if ((chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt) ==
- (t == XCL_PROP)) return !negated;
+ if ((chartype == ucp_Lu || chartype == ucp_Ll ||
+ chartype == ucp_Lt) == (t == XCL_PROP)) return !negated;
break;
case PT_GC:
- if ((data[1] == _pcre_ucp_gentype[chartype]) == (t == XCL_PROP)) return !negated;
+ if ((data[1] == _pcre_ucp_gentype[chartype]) == (t == XCL_PROP))
+ return !negated;
break;
case PT_PC:
if ((data[1] == UCD_SCRIPT(c)) == (t == XCL_PROP)) return !negated;
break;
+ case PT_ALNUM:
+ if ((_pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N) == (t == XCL_PROP))
+ return !negated;
+ break;
+
+ case PT_SPACE: /* Perl space */
+ if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
+ == (t == XCL_PROP))
+ return !negated;
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP))
+ return !negated;
+ break;
+
+ case PT_WORD:
+ if ((_pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N || c == CHAR_UNDERSCORE)
+ == (t == XCL_PROP))
+ return !negated;
+ break;
+
/* This should never occur, but compilers may mutter if there is no
default. */
ucp_Old_Turkic = G_UNICODE_SCRIPT_OLD_TURKIC,
ucp_Samaritan = G_UNICODE_SCRIPT_SAMARITAN,
ucp_Tai_Tham = G_UNICODE_SCRIPT_TAI_THAM,
- ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET
+ ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET,
+ ucp_Batak = G_UNICODE_SCRIPT_BATAK,
+ ucp_Brahmi = G_UNICODE_SCRIPT_BRAHMI,
+ ucp_Mandaic = G_UNICODE_SCRIPT_MANDAIC
};
#endif