Forgotten files

author Matthias Clasen <mclasen@redhat.com>

Sat, 22 Jan 2011 05:01:54 +0000 (00:01 -0500)

committer Matthias Clasen <mclasen@redhat.com>

Sat, 22 Jan 2011 05:01:54 +0000 (00:01 -0500)
author Matthias Clasen <mclasen@redhat.com>
Sat, 22 Jan 2011 05:01:54 +0000 (00:01 -0500)
committer Matthias Clasen <mclasen@redhat.com>
Sat, 22 Jan 2011 05:01:54 +0000 (00:01 -0500)
diff --git a/glib/pcre/pcre.h b/glib/pcre/pcre.h

index 4864bd099b0597716c67c69251419e0345b8221b..7c4c04011119adf215466044937768c40afec25d 100644 (file)
--- a/glib/pcre/pcre.h
+++ b/glib/pcre/pcre.h
@@ -5,7 +5,7 @@
  /* This is the public header file for the PCRE library, to be #included by
  applications that call the PCRE functions.
  
-           Copyright (c) 1997-2009 University of Cambridge
+           Copyright (c) 1997-2010 University of Cambridge
  
  -----------------------------------------------------------------------------
  Redistribution and use in source and binary forms, with or without
@@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
  /* The current PCRE version information. */
  
  #define PCRE_MAJOR          8
-#define PCRE_MINOR          02
+#define PCRE_MINOR          12
  #define PCRE_PRERELEASE     
-#define PCRE_DATE           2010-03-19
+#define PCRE_DATE           2011-01-15
  
  /* When an application links to a PCRE DLL in Windows, the symbols that are
  imported have to be identified as such. When building PCRE, the appropriate
@@ -96,41 +96,44 @@ extern "C" {
  #endif
  
  /* Options. Some are compile-time only, some are run-time only, and some are
-both, so we keep them all distinct. */
-
-#define PCRE_CASELESS           0x00000001
-#define PCRE_MULTILINE          0x00000002
-#define PCRE_DOTALL             0x00000004
-#define PCRE_EXTENDED           0x00000008
-#define PCRE_ANCHORED           0x00000010
-#define PCRE_DOLLAR_ENDONLY     0x00000020
-#define PCRE_EXTRA              0x00000040
-#define PCRE_NOTBOL             0x00000080
-#define PCRE_NOTEOL             0x00000100
-#define PCRE_UNGREEDY           0x00000200
-#define PCRE_NOTEMPTY           0x00000400
-#define PCRE_UTF8               0x00000800
-#define PCRE_NO_AUTO_CAPTURE    0x00001000
-#define PCRE_NO_UTF8_CHECK      0x00002000
-#define PCRE_AUTO_CALLOUT       0x00004000
-#define PCRE_PARTIAL_SOFT       0x00008000
+both, so we keep them all distinct. However, almost all the bits in the options
+word are now used. In the long run, we may have to re-use some of the
+compile-time only bits for runtime options, or vice versa. */
+
+#define PCRE_CASELESS           0x00000001  /* Compile */
+#define PCRE_MULTILINE          0x00000002  /* Compile */
+#define PCRE_DOTALL             0x00000004  /* Compile */
+#define PCRE_EXTENDED           0x00000008  /* Compile */
+#define PCRE_ANCHORED           0x00000010  /* Compile, exec, DFA exec */
+#define PCRE_DOLLAR_ENDONLY     0x00000020  /* Compile */
+#define PCRE_EXTRA              0x00000040  /* Compile */
+#define PCRE_NOTBOL             0x00000080  /* Exec, DFA exec */
+#define PCRE_NOTEOL             0x00000100  /* Exec, DFA exec */
+#define PCRE_UNGREEDY           0x00000200  /* Compile */
+#define PCRE_NOTEMPTY           0x00000400  /* Exec, DFA exec */
+#define PCRE_UTF8               0x00000800  /* Compile */
+#define PCRE_NO_AUTO_CAPTURE    0x00001000  /* Compile */
+#define PCRE_NO_UTF8_CHECK      0x00002000  /* Compile, exec, DFA exec */
+#define PCRE_AUTO_CALLOUT       0x00004000  /* Compile */
+#define PCRE_PARTIAL_SOFT       0x00008000  /* Exec, DFA exec */
  #define PCRE_PARTIAL            0x00008000  /* Backwards compatible synonym */
-#define PCRE_DFA_SHORTEST       0x00010000
-#define PCRE_DFA_RESTART        0x00020000
-#define PCRE_FIRSTLINE          0x00040000
-#define PCRE_DUPNAMES           0x00080000
-#define PCRE_NEWLINE_CR         0x00100000
-#define PCRE_NEWLINE_LF         0x00200000
-#define PCRE_NEWLINE_CRLF       0x00300000
-#define PCRE_NEWLINE_ANY        0x00400000
-#define PCRE_NEWLINE_ANYCRLF    0x00500000
-#define PCRE_BSR_ANYCRLF        0x00800000
-#define PCRE_BSR_UNICODE        0x01000000
-#define PCRE_JAVASCRIPT_COMPAT  0x02000000
-#define PCRE_NO_START_OPTIMIZE  0x04000000
-#define PCRE_NO_START_OPTIMISE  0x04000000
-#define PCRE_PARTIAL_HARD       0x08000000
-#define PCRE_NOTEMPTY_ATSTART   0x10000000
+#define PCRE_DFA_SHORTEST       0x00010000  /* DFA exec */
+#define PCRE_DFA_RESTART        0x00020000  /* DFA exec */
+#define PCRE_FIRSTLINE          0x00040000  /* Compile */
+#define PCRE_DUPNAMES           0x00080000  /* Compile */
+#define PCRE_NEWLINE_CR         0x00100000  /* Compile, exec, DFA exec */
+#define PCRE_NEWLINE_LF         0x00200000  /* Compile, exec, DFA exec */
+#define PCRE_NEWLINE_CRLF       0x00300000  /* Compile, exec, DFA exec */
+#define PCRE_NEWLINE_ANY        0x00400000  /* Compile, exec, DFA exec */
+#define PCRE_NEWLINE_ANYCRLF    0x00500000  /* Compile, exec, DFA exec */
+#define PCRE_BSR_ANYCRLF        0x00800000  /* Compile, exec, DFA exec */
+#define PCRE_BSR_UNICODE        0x01000000  /* Compile, exec, DFA exec */
+#define PCRE_JAVASCRIPT_COMPAT  0x02000000  /* Compile */
+#define PCRE_NO_START_OPTIMIZE  0x04000000  /* Compile, exec, DFA exec */
+#define PCRE_NO_START_OPTIMISE  0x04000000  /* Synonym */
+#define PCRE_PARTIAL_HARD       0x08000000  /* Exec, DFA exec */
+#define PCRE_NOTEMPTY_ATSTART   0x10000000  /* Exec, DFA exec */
+#define PCRE_UCP                0x20000000  /* Compile */
  
  /* Exec-time and get/set-time error codes */
  
@@ -158,6 +161,8 @@ both, so we keep them all distinct. */
  #define PCRE_ERROR_RECURSIONLIMIT (-21)
  #define PCRE_ERROR_NULLWSLIMIT    (-22)  /* No longer actually used */
  #define PCRE_ERROR_BADNEWLINE     (-23)
+#define PCRE_ERROR_BADOFFSET      (-24)
+#define PCRE_ERROR_SHORTUTF8      (-25)
  
  /* Request types for pcre_fullinfo() */
  
@@ -200,6 +205,7 @@ these bits, just add new ones on the end, in order to remain compatible. */
  #define PCRE_EXTRA_CALLOUT_DATA           0x0004
  #define PCRE_EXTRA_TABLES                 0x0008
  #define PCRE_EXTRA_MATCH_LIMIT_RECURSION  0x0010
+#define PCRE_EXTRA_MARK                   0x0020
  
  /* Types */
  
@@ -225,6 +231,7 @@ typedef struct pcre_extra {
    void *callout_data;             /* Data passed back in callouts */
    const unsigned char *tables;    /* Pointer to character tables */
    unsigned long int match_limit_recursion; /* Max recursive calls to match() */
+  unsigned char **mark;           /* For passing back a mark pointer */
  } pcre_extra;
  
  /* The structure for passing out data via the pcre_callout_function. We use a
diff --git a/glib/pcre/pcre_chartables.c b/glib/pcre/pcre_chartables.c

index ae45db0ca35b595a8e119b89b2880724813283fd..9117ae3c7faab884b1fd1fbdc3cc91dadbb1ba14 100644 (file)
--- a/glib/pcre/pcre_chartables.c
+++ b/glib/pcre/pcre_chartables.c
@@ -14,7 +14,7 @@ example ISO-8859-1. When dftables is run, it creates these tables in the
  current locale. If PCRE is configured with --enable-rebuild-chartables, this
  happens automatically.
  
-The following #includes are present because without the gcc 4.x may remove the
+The following #includes are present because without them gcc 4.x may remove the
  array definition from the final binary if PCRE is built into a static library
  and dead code stripping is activated. This leads to link errors. Pulling in the
  header ensures that the array gets flagged as "someone outside this compilation
diff --git a/glib/pcre/pcre_compile.c b/glib/pcre/pcre_compile.c

index a00a9901770253cfca7c156fbd7217159fe8b333..f0bae53eee7a7ea29bb80c904e1979e93fb7d4b6 100644 (file)
--- a/glib/pcre/pcre_compile.c
+++ b/glib/pcre/pcre_compile.c
@@ -124,7 +124,7 @@ static const short int escapes[] = {
       -ESC_H,                  0,
       0,                       -ESC_K,
       0,                       0,
-     0,                       0,
+     -ESC_N,                  0,
       -ESC_P,                  -ESC_Q,
       -ESC_R,                  -ESC_S,
       0,                       0,
@@ -171,7 +171,7 @@ static const short int escapes[] = {
  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
-/*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
+/*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
@@ -188,11 +188,14 @@ string is built from string macros so that it works in UTF-8 mode on EBCDIC
  platforms. */
  
  typedef struct verbitem {
-  int   len;
-  int   op;
+  int   len;                 /* Length of verb name */
+  int   op;                  /* Op when no arg, or -1 if arg mandatory */
+  int   op_arg;              /* Op when arg present, or -1 if not allowed */
  } verbitem;
  
  static const char verbnames[] =
+  "\0"                       /* Empty name is a shorthand for MARK */
+  STRING_MARK0
    STRING_ACCEPT0
    STRING_COMMIT0
    STRING_F0
@@ -202,13 +205,15 @@ static const char verbnames[] =
    STRING_THEN;
  
  static const verbitem verbs[] = {
-  { 6, OP_ACCEPT },
-  { 6, OP_COMMIT },
-  { 1, OP_FAIL },
-  { 4, OP_FAIL },
-  { 5, OP_PRUNE },
-  { 4, OP_SKIP  },
-  { 4, OP_THEN  }
+  { 0, -1,        OP_MARK },
+  { 4, -1,        OP_MARK },
+  { 6, OP_ACCEPT, -1 },
+  { 6, OP_COMMIT, -1 },
+  { 1, OP_FAIL,   -1 },
+  { 4, OP_FAIL,   -1 },
+  { 5, OP_PRUNE,  OP_PRUNE_ARG },
+  { 4, OP_SKIP,   OP_SKIP_ARG  },
+  { 4, OP_THEN,   OP_THEN_ARG  }
  };
  
  static const int verbcount = sizeof(verbs)/sizeof(verbitem);
@@ -256,6 +261,53 @@ static const int posix_class_maps[] = {
    cbit_xdigit,-1,          0              /* xdigit */
  };
  
+/* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
+substitutes must be in the order of the names, defined above, and there are
+both positive and negative cases. NULL means no substitute. */
+
+#ifdef SUPPORT_UCP
+static const uschar *substitutes[] = {
+  (uschar *)"\\P{Nd}",    /* \D */
+  (uschar *)"\\p{Nd}",    /* \d */
+  (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */
+  (uschar *)"\\p{Xsp}",   /* \s */
+  (uschar *)"\\P{Xwd}",   /* \W */
+  (uschar *)"\\p{Xwd}"    /* \w */
+};
+
+static const uschar *posix_substitutes[] = {
+  (uschar *)"\\p{L}",     /* alpha */
+  (uschar *)"\\p{Ll}",    /* lower */
+  (uschar *)"\\p{Lu}",    /* upper */
+  (uschar *)"\\p{Xan}",   /* alnum */
+  NULL,                   /* ascii */
+  (uschar *)"\\h",        /* blank */
+  NULL,                   /* cntrl */
+  (uschar *)"\\p{Nd}",    /* digit */
+  NULL,                   /* graph */
+  NULL,                   /* print */
+  NULL,                   /* punct */
+  (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */
+  (uschar *)"\\p{Xwd}",   /* word */
+  NULL,                   /* xdigit */
+  /* Negated cases */
+  (uschar *)"\\P{L}",     /* ^alpha */
+  (uschar *)"\\P{Ll}",    /* ^lower */
+  (uschar *)"\\P{Lu}",    /* ^upper */
+  (uschar *)"\\P{Xan}",   /* ^alnum */
+  NULL,                   /* ^ascii */
+  (uschar *)"\\H",        /* ^blank */
+  NULL,                   /* ^cntrl */
+  (uschar *)"\\P{Nd}",    /* ^digit */
+  NULL,                   /* ^graph */
+  NULL,                   /* ^print */
+  NULL,                   /* ^punct */
+  (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */
+  (uschar *)"\\P{Xwd}",   /* ^word */
+  NULL                    /* ^xdigit */
+};
+#define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
+#endif
  
  #define STRING(a)  # a
  #define XSTRING(s) STRING(s)
@@ -319,7 +371,7 @@ static const char error_texts[] =
    /* 35 */
    "invalid condition (?(0)\0"
    "\\C not allowed in lookbehind assertion\0"
-  "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
+  "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
    "number after (?C is > 255\0"
    "closing ) for (?C expected\0"
    /* 40 */
@@ -345,7 +397,7 @@ static const char error_texts[] =
    "inconsistent NEWLINE options\0"
    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
    "a numbered reference must not be zero\0"
-  "(*VERB) with an argument is not supported\0"
+  "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
    /* 60 */
    "(*VERB) not recognized\0"
    "number is too big\0"
@@ -353,7 +405,11 @@ static const char error_texts[] =
    "digit expected after (?+\0"
    "] is an invalid data character in JavaScript compatibility mode\0"
    /* 65 */
-  "different names for subpatterns of the same number are not allowed\0";
+  "different names for subpatterns of the same number are not allowed\0"
+  "(*MARK) must have an argument\0"
+  "this version of PCRE is not compiled with PCRE_UCP support\0"
+  "\\c must be followed by an ASCII character\0"
+  ;
  
  
  /* Definition to allow mutual recursion */
@@ -456,7 +512,6 @@ else
  
      case CHAR_l:
      case CHAR_L:
-    case CHAR_N:
      case CHAR_u:
      case CHAR_U:
      *errorcodeptr = ERR37;
@@ -657,7 +712,8 @@ else
      break;
  
      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
-    This coding is ASCII-specific, but then the whole concept of \cx is
+    An error is given if the byte following \c is not an ASCII character. This
+    coding is ASCII-specific, but then the whole concept of \cx is
      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
  
      case CHAR_c:
@@ -667,11 +723,15 @@ else
        *errorcodeptr = ERR2;
        break;
        }
-
-#ifndef EBCDIC  /* ASCII/UTF-8 coding */
+#ifndef EBCDIC    /* ASCII/UTF-8 coding */
+    if (c > 127)  /* Excludes all non-ASCII in either mode */
+      {
+      *errorcodeptr = ERR68;
+      break;
+      }
      if (c >= CHAR_a && c <= CHAR_z) c -= 32;
      c ^= 0x40;
-#else           /* EBCDIC coding */
+#else             /* EBCDIC coding */
      if (c >= CHAR_a && c <= CHAR_z) c += 64;
      c ^= 0xC0;
  #endif
@@ -694,6 +754,19 @@ else
      }
    }
  
+/* Perl supports \N{name} for character names, as well as plain \N for "not
+newline". PCRE does not support \N{name}. */
+
+if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
+  *errorcodeptr = ERR37;
+
+/* If PCRE_UCP is set, we change the values for \d etc. */
+
+if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
+  c -= (ESC_DU - ESC_D);
+
+/* Set the pointer to the final character before returning. */
+
  *ptrptr = ptr;
  return c;
  }
@@ -902,10 +975,21 @@ top-level call starts at the beginning of the pattern. All other calls must
  start at a parenthesis. It scans along a pattern's text looking for capturing
  subpatterns, and counting them. If it finds a named pattern that matches the
  name it is given, it returns its number. Alternatively, if the name is NULL, it
-returns when it reaches a given numbered subpattern. We know that if (?P< is
-encountered, the name will be terminated by '>' because that is checked in the
-first pass. Recursion is used to keep track of subpatterns that reset the
-capturing group numbers - the (?| feature.
+returns when it reaches a given numbered subpattern. Recursion is used to keep
+track of subpatterns that reset the capturing group numbers - the (?| feature.
+
+This function was originally called only from the second pass, in which we know
+that if (?< or (?' or (?P< is encountered, the name will be correctly
+terminated because that is checked in the first pass. There is now one call to
+this function in the first pass, to check for a recursive back reference by
+name (so that we can make the whole group atomic). In this case, we need check
+only up to the current position in the pattern, and that is still OK because
+any previous occurrences will have been checked. To make this work, the test
+for "end of pattern" is a check against cd->end_pattern in the main loop,
+instead of looking for a binary zero. This means that the special first-pass
+call can adjust cd->end_pattern temporarily. (Checks for binary zero while
+processing items within the loop are OK, because afterwards the main loop will
+terminate.)
  
  Arguments:
    ptrptr       address of the current character pointer (updated)
@@ -913,6 +997,7 @@ Arguments:
    name         name to seek, or NULL if seeking a numbered subpattern
    lorn         name length, or subpattern number if name is NULL
    xmode        TRUE if we are in /x mode
+  utf8         TRUE if we are in UTF-8 mode
    count        pointer to the current capturing subpattern number (updated)
  
  Returns:       the number of the named subpattern, or -1 if not found
@@ -920,7 +1005,7 @@ Returns:       the number of the named subpattern, or -1 if not found
  
  static int
  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
-  BOOL xmode, int *count)
+  BOOL xmode, BOOL utf8, int *count)
  {
  uschar *ptr = *ptrptr;
  int start_count = *count;
@@ -932,25 +1017,39 @@ dealing with. The very first call may not start with a parenthesis. */
  
  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
    {
-  if (ptr[1] == CHAR_QUESTION_MARK &&
-      ptr[2] == CHAR_VERTICAL_LINE)
+  /* Handle specials such as (*SKIP) or (*UTF8) etc. */
+
+  if (ptr[1] == CHAR_ASTERISK) ptr += 2;
+
+  /* Handle a normal, unnamed capturing parenthesis. */
+
+  else if (ptr[1] != CHAR_QUESTION_MARK)
+    {
+    *count += 1;
+    if (name == NULL && *count == lorn) return *count;
+    ptr++;
+    }
+
+  /* All cases now have (? at the start. Remember when we are in a group
+  where the parenthesis numbers are duplicated. */
+
+  else if (ptr[2] == CHAR_VERTICAL_LINE)
      {
      ptr += 3;
      dup_parens = TRUE;
      }
  
-  /* Handle a normal, unnamed capturing parenthesis */
+  /* Handle comments; all characters are allowed until a ket is reached. */
  
-  else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
+  else if (ptr[2] == CHAR_NUMBER_SIGN)
      {
-    *count += 1;
-    if (name == NULL && *count == lorn) return *count;
-    ptr++;
+    for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
+    goto FAIL_EXIT;
      }
  
    /* Handle a condition. If it is an assertion, just carry on so that it
    is processed as normal. If not, skip to the closing parenthesis of the
-  condition (there can't be any nested parens. */
+  condition (there can't be any nested parens). */
  
    else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
      {
@@ -962,7 +1061,7 @@ if (ptr[0] == CHAR_LEFT_PARENTHESIS)
        }
      }
  
-  /* We have either (? or (* and not a condition */
+  /* Start with (? but not a condition. */
  
    else
      {
@@ -991,9 +1090,11 @@ if (ptr[0] == CHAR_LEFT_PARENTHESIS)
    }
  
  /* Past any initial parenthesis handling, scan for parentheses or vertical
-bars. */
+bars. Stop if we get to cd->end_pattern. Note that this is important for the
+first-pass call when this value is temporarily adjusted to stop at the current
+position. So DO NOT change this to a test for binary zero. */
  
-for (; *ptr != 0; ptr++)
+for (; ptr < cd->end_pattern; ptr++)
    {
    /* Skip over backslashed characters and also entire \Q...\E */
  
@@ -1067,7 +1168,15 @@ for (; *ptr != 0; ptr++)
  
    if (xmode && *ptr == CHAR_NUMBER_SIGN)
      {
-    while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
+    ptr++;
+    while (*ptr != 0)
+      {
+      if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
+      ptr++;
+#ifdef SUPPORT_UTF8
+      if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+      }
      if (*ptr == 0) goto FAIL_EXIT;
      continue;
      }
@@ -1076,7 +1185,7 @@ for (; *ptr != 0; ptr++)
  
    if (*ptr == CHAR_LEFT_PARENTHESIS)
      {
-    int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
+    int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
      if (rc > 0) return rc;
      if (*ptr == 0) goto FAIL_EXIT;
      }
@@ -1084,8 +1193,7 @@ for (; *ptr != 0; ptr++)
    else if (*ptr == CHAR_RIGHT_PARENTHESIS)
      {
      if (dup_parens && *count < hwm_count) *count = hwm_count;
-    *ptrptr = ptr;
-    return -1;
+    goto FAIL_EXIT;
      }
  
    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
@@ -1123,12 +1231,14 @@ Arguments:
    name         name to seek, or NULL if seeking a numbered subpattern
    lorn         name length, or subpattern number if name is NULL
    xmode        TRUE if we are in /x mode
+  utf8         TRUE if we are in UTF-8 mode
  
  Returns:       the number of the found subpattern, or -1 if not found
  */
  
  static int
-find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
+find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
+  BOOL utf8)
  {
  uschar *ptr = (uschar *)cd->start_pattern;
  int count = 0;
@@ -1141,7 +1251,7 @@ matching closing parens. That is why we have to have a loop. */
  
  for (;;)
    {
-  rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
+  rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
    if (rc > 0 || *ptr++ == 0) break;
    }
  
@@ -1485,7 +1595,8 @@ for (;;)
  
    /* Otherwise, we can get the item's length from the table, except that for
    repeated character types, we have to test for \p and \P, which have an extra
-  two bytes of parameters. */
+  two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
+  must add in its length. */
  
    else
      {
@@ -1509,6 +1620,16 @@ for (;;)
        case OP_TYPEPOSUPTO:
        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
        break;
+
+      case OP_MARK:
+      case OP_PRUNE_ARG:
+      case OP_SKIP_ARG:
+      code += code[1];
+      break;
+
+      case OP_THEN_ARG:
+      code += code[1+LINK_SIZE];
+      break;
        }
  
      /* Add in the fixed length from the table */
@@ -1580,7 +1701,8 @@ for (;;)
  
    /* Otherwise, we can get the item's length from the table, except that for
    repeated character types, we have to test for \p and \P, which have an extra
-  two bytes of parameters. */
+  two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
+  must add in its length. */
  
    else
      {
@@ -1604,6 +1726,16 @@ for (;;)
        case OP_TYPEEXACT:
        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
        break;
+
+      case OP_MARK:
+      case OP_PRUNE_ARG:
+      case OP_SKIP_ARG:
+      code += code[1];
+      break;
+
+      case OP_THEN_ARG:
+      code += code[1+LINK_SIZE];
+      break;
        }
  
      /* Add in the fixed length from the table */
@@ -1873,6 +2005,19 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE
      break;
  #endif
  
+    /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
+    string. */
+
+    case OP_MARK:
+    case OP_PRUNE_ARG:
+    case OP_SKIP_ARG:
+    code += code[1];
+    break;
+
+    case OP_THEN_ARG:
+    code += code[1+LINK_SIZE];
+    break;
+
      /* None of the remaining opcodes are required to match a character. */
  
      default:
@@ -2093,8 +2238,8 @@ auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
  {
  *code++ = OP_CALLOUT;
  *code++ = 255;
-PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
-PUT(code, LINK_SIZE, 0);                /* Default length */
+PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
+PUT(code, LINK_SIZE, 0);                       /* Default length */
  return code + 2*LINK_SIZE;
  }
  
@@ -2119,7 +2264,7 @@ Returns:             nothing
  static void
  complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
  {
-int length = ptr - cd->start_pattern - GET(previous_callout, 2);
+int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
  PUT(previous_callout, 2 + LINK_SIZE, length);
  }
  
@@ -2169,6 +2314,69 @@ for (++c; c <= d; c++)
  
  return TRUE;
  }
+
+
+
+/*************************************************
+*        Check a character and a property        *
+*************************************************/
+
+/* This function is called by check_auto_possessive() when a property item
+is adjacent to a fixed character.
+
+Arguments:
+  c            the character
+  ptype        the property type
+  pdata        the data for the type
+  negated      TRUE if it's a negated property (\P or \p{^)
+
+Returns:       TRUE if auto-possessifying is OK
+*/
+
+static BOOL
+check_char_prop(int c, int ptype, int pdata, BOOL negated)
+{
+int chartype = UCD_CHARTYPE(c);
+switch(ptype)
+  {
+  case PT_LAMP:
+  return (chartype == ucp_Lu ||
+          chartype == ucp_Ll ||
+          chartype == ucp_Lt) == negated;
+
+  case PT_GC:
+  return (pdata == _pcre_ucp_gentype[chartype]) == negated;
+
+  case PT_PC:
+  return (pdata == chartype) == negated;
+
+  case PT_SC:
+  return (pdata == UCD_SCRIPT(c)) == negated;
+
+  /* These are specials */
+
+  case PT_ALNUM:
+  return (_pcre_ucp_gentype[chartype] == ucp_L ||
+          _pcre_ucp_gentype[chartype] == ucp_N) == negated;
+
+  case PT_SPACE:    /* Perl space */
+  return (_pcre_ucp_gentype[chartype] == ucp_Z ||
+          c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
+          == negated;
+
+  case PT_PXSPACE:  /* POSIX space */
+  return (_pcre_ucp_gentype[chartype] == ucp_Z ||
+          c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+          c == CHAR_FF || c == CHAR_CR)
+          == negated;
+
+  case PT_WORD:
+  return (_pcre_ucp_gentype[chartype] == ucp_L ||
+          _pcre_ucp_gentype[chartype] == ucp_N ||
+          c == CHAR_UNDERSCORE) == negated;
+  }
+return FALSE;
+}
  #endif  /* SUPPORT_UCP */
  
  
@@ -2182,10 +2390,8 @@ whether the next thing could possibly match the repeated item. If not, it makes
  sense to automatically possessify the repeated item.
  
  Arguments:
-  op_code       the repeated op code
-  this          data for this item, depends on the opcode
+  previous      pointer to the repeated opcode
    utf8          TRUE in UTF-8 mode
-  utf8_char     used for utf8 character bytes, NULL if not relevant
    ptr           next character in pattern
    options       options bits
    cd            contains pointers to tables etc.
@@ -2194,10 +2400,11 @@ Returns:        TRUE if possessifying is wanted
  */
  
  static BOOL
-check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
-  const uschar *ptr, int options, compile_data *cd)
+check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
+  int options, compile_data *cd)
  {
-int next;
+int c, next;
+int op_code = *previous++;
  
  /* Skip whitespace and comments in extended mode */
  
@@ -2208,8 +2415,15 @@ if ((options & PCRE_EXTENDED) != 0)
      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
      if (*ptr == CHAR_NUMBER_SIGN)
        {
-      while (*(++ptr) != 0)
+      ptr++;
+      while (*ptr != 0)
+        {
          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
+        ptr++;
+#ifdef SUPPORT_UTF8
+        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+        }
        }
      else break;
      }
@@ -2245,8 +2459,15 @@ if ((options & PCRE_EXTENDED) != 0)
      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
      if (*ptr == CHAR_NUMBER_SIGN)
        {
-      while (*(++ptr) != 0)
+      ptr++;
+      while (*ptr != 0)
+        {
          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
+        ptr++;
+#ifdef SUPPORT_UTF8
+        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+        }
        }
      else break;
      }
@@ -2258,23 +2479,18 @@ if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
      return FALSE;
  
-/* Now compare the next item with the previous opcode. If the previous is a
-positive single character match, "item" either contains the character or, if
-"item" is greater than 127 in utf8 mode, the character's bytes are in
-utf8_char. */
-
-
-/* Handle cases when the next item is a character. */
+/* Now compare the next item with the previous opcode. First, handle cases when
+the next item is a character. */
  
  if (next >= 0) switch(op_code)
    {
    case OP_CHAR:
  #ifdef SUPPORT_UTF8
-  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
+  GETCHARTEST(c, previous);
  #else
-  (void)(utf8_char);  /* Keep compiler happy by referencing function argument */
+  c = *previous;
  #endif
-  return item != next;
+  return c != next;
  
    /* For CHARNC (caseless character) we must check the other case. If we have
    Unicode property support, we can use it to test the other case of
@@ -2282,9 +2498,11 @@ if (next >= 0) switch(op_code)
  
    case OP_CHARNC:
  #ifdef SUPPORT_UTF8
-  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
+  GETCHARTEST(c, previous);
+#else
+  c = *previous;
  #endif
-  if (item == next) return FALSE;
+  if (c == next) return FALSE;
  #ifdef SUPPORT_UTF8
    if (utf8)
      {
@@ -2295,16 +2513,16 @@ if (next >= 0) switch(op_code)
  #else
      othercase = NOTACHAR;
  #endif
-    return (unsigned int)item != othercase;
+    return (unsigned int)c != othercase;
      }
    else
  #endif  /* SUPPORT_UTF8 */
-  return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
+  return (c != cd->fcc[next]);  /* Non-UTF-8 mode */
  
-  /* For OP_NOT, "item" must be a single-byte character. */
+  /* For OP_NOT, its data is always a single-byte character. */
  
    case OP_NOT:
-  if (item == next) return TRUE;
+  if ((c = *previous) == next) return TRUE;
    if ((options & PCRE_CASELESS) == 0) return FALSE;
  #ifdef SUPPORT_UTF8
    if (utf8)
@@ -2316,11 +2534,14 @@ if (next >= 0) switch(op_code)
  #else
      othercase = NOTACHAR;
  #endif
-    return (unsigned int)item == othercase;
+    return (unsigned int)c == othercase;
      }
    else
  #endif  /* SUPPORT_UTF8 */
-  return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
+  return (c == cd->fcc[next]);  /* Non-UTF-8 mode */
+
+  /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
+  When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
  
    case OP_DIGIT:
    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
@@ -2363,11 +2584,12 @@ if (next >= 0) switch(op_code)
      case 0x202f:
      case 0x205f:
      case 0x3000:
-    return op_code != OP_HSPACE;
+    return op_code == OP_NOT_HSPACE;
      default:
-    return op_code == OP_HSPACE;
+    return op_code != OP_NOT_HSPACE;
      }
  
+  case OP_ANYNL:
    case OP_VSPACE:
    case OP_NOT_VSPACE:
    switch(next)
@@ -2379,48 +2601,62 @@ if (next >= 0) switch(op_code)
      case 0x85:
      case 0x2028:
      case 0x2029:
-    return op_code != OP_VSPACE;
+    return op_code == OP_NOT_VSPACE;
      default:
-    return op_code == OP_VSPACE;
+    return op_code != OP_NOT_VSPACE;
      }
  
+#ifdef SUPPORT_UCP
+  case OP_PROP:
+  return check_char_prop(next, previous[0], previous[1], FALSE);
+
+  case OP_NOTPROP:
+  return check_char_prop(next, previous[0], previous[1], TRUE);
+#endif
+
    default:
    return FALSE;
    }
  
  
-/* Handle the case when the next item is \d, \s, etc. */
+/* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
+is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
+generated only when PCRE_UCP is *not* set, that is, when only ASCII
+characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
+replaced by OP_PROP codes when PCRE_UCP is set. */
  
  switch(op_code)
    {
    case OP_CHAR:
    case OP_CHARNC:
  #ifdef SUPPORT_UTF8
-  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
+  GETCHARTEST(c, previous);
+#else
+  c = *previous;
  #endif
    switch(-next)
      {
      case ESC_d:
-    return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
+    return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
  
      case ESC_D:
-    return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
+    return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
  
      case ESC_s:
-    return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
+    return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
  
      case ESC_S:
-    return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
+    return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
  
      case ESC_w:
-    return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
+    return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
  
      case ESC_W:
-    return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
+    return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
  
      case ESC_h:
      case ESC_H:
-    switch(item)
+    switch(c)
        {
        case 0x09:
        case 0x20:
@@ -2448,7 +2684,7 @@ switch(op_code)
  
      case ESC_v:
      case ESC_V:
-    switch(item)
+    switch(c)
        {
        case 0x0a:
        case 0x0b:
@@ -2462,38 +2698,92 @@ switch(op_code)
        return -next == ESC_v;
        }
  
+    /* When PCRE_UCP is set, these values get generated for \d etc. Find
+    their substitutions and process them. The result will always be either
+    -ESC_p or -ESC_P. Then fall through to process those values. */
+
+#ifdef SUPPORT_UCP
+    case ESC_du:
+    case ESC_DU:
+    case ESC_wu:
+    case ESC_WU:
+    case ESC_su:
+    case ESC_SU:
+      {
+      int temperrorcode = 0;
+      ptr = substitutes[-next - ESC_DU];
+      next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
+      if (temperrorcode != 0) return FALSE;
+      ptr++;    /* For compatibility */
+      }
+    /* Fall through */
+
+    case ESC_p:
+    case ESC_P:
+      {
+      int ptype, pdata, errorcodeptr;
+      BOOL negated;
+
+      ptr--;      /* Make ptr point at the p or P */
+      ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
+      if (ptype < 0) return FALSE;
+      ptr++;      /* Point past the final curly ket */
+
+      /* If the property item is optional, we have to give up. (When generated
+      from \d etc by PCRE_UCP, this test will have been applied much earlier,
+      to the original \d etc. At this point, ptr will point to a zero byte.) */
+
+      if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
+        strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
+          return FALSE;
+
+      /* Do the property check. */
+
+      return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
+      }
+#endif
+
      default:
      return FALSE;
      }
  
+  /* In principle, support for Unicode properties should be integrated here as
+  well. It means re-organizing the above code so as to get hold of the property
+  values before switching on the op-code. However, I wonder how many patterns
+  combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
+  these op-codes are never generated.) */
+
    case OP_DIGIT:
    return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
-         next == -ESC_h || next == -ESC_v;
+         next == -ESC_h || next == -ESC_v || next == -ESC_R;
  
    case OP_NOT_DIGIT:
    return next == -ESC_d;
  
    case OP_WHITESPACE:
-  return next == -ESC_S || next == -ESC_d || next == -ESC_w;
+  return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
  
    case OP_NOT_WHITESPACE:
    return next == -ESC_s || next == -ESC_h || next == -ESC_v;
  
    case OP_HSPACE:
-  return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
+  return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
+         next == -ESC_w || next == -ESC_v || next == -ESC_R;
  
    case OP_NOT_HSPACE:
    return next == -ESC_h;
  
    /* Can't have \S in here because VT matches \S (Perl anomaly) */
+  case OP_ANYNL:
    case OP_VSPACE:
    return next == -ESC_V || next == -ESC_d || next == -ESC_w;
  
    case OP_NOT_VSPACE:
-  return next == -ESC_v;
+  return next == -ESC_v || next == -ESC_R;
  
    case OP_WORDCHAR:
-  return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
+  return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
+         next == -ESC_v || next == -ESC_R;
  
    case OP_NOT_WORDCHAR:
    return next == -ESC_w || next == -ESC_d;
@@ -2557,6 +2847,7 @@ BOOL inescq = FALSE;
  BOOL groupsetfirstbyte = FALSE;
  const uschar *ptr = *ptrptr;
  const uschar *tempptr;
+const uschar *nestptr = NULL;
  uschar *previous = NULL;
  uschar *previous_callout = NULL;
  uschar *save_hwm = NULL;
@@ -2627,6 +2918,16 @@ for (;; ptr++)
  
    c = *ptr;
  
+  /* If we are at the end of a nested substitution, revert to the outer level
+  string. Nesting only happens one level deep. */
+
+  if (c == 0 && nestptr != NULL)
+    {
+    ptr = nestptr;
+    nestptr = NULL;
+    c = *ptr;
+    }
+
    /* If we are in the pre-compile phase, accumulate the length used for the
    previous cycle of this loop. */
  
@@ -2657,7 +2958,7 @@ for (;; ptr++)
        goto FAILED;
        }
  
-    *lengthptr += code - last_code;
+    *lengthptr += (int)(code - last_code);
      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
  
      /* If "previous" is set and it is not at the start of the work space, move
@@ -2739,9 +3040,14 @@ for (;; ptr++)
      if ((cd->ctypes[c] & ctype_space) != 0) continue;
      if (c == CHAR_NUMBER_SIGN)
        {
-      while (*(++ptr) != 0)
+      ptr++;
+      while (*ptr != 0)
          {
          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
+        ptr++;
+#ifdef SUPPORT_UTF8
+        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
          }
        if (*ptr != 0) continue;
  
@@ -2775,7 +3081,7 @@ for (;; ptr++)
          *errorcodeptr = ERR20;
          goto FAILED;
          }
-      *lengthptr += code - last_code;   /* To include callout length */
+      *lengthptr += (int)(code - last_code);   /* To include callout length */
        DPRINTF((">> end branch\n"));
        }
      return TRUE;
@@ -2980,7 +3286,7 @@ for (;; ptr++)
            ptr++;
            }
  
-        posix_class = check_posix_name(ptr, tempptr - ptr);
+        posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
          if (posix_class < 0)
            {
            *errorcodeptr = ERR30;
@@ -2994,10 +3300,25 @@ for (;; ptr++)
          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
            posix_class = 0;
  
-        /* We build the bit map for the POSIX class in a chunk of local store
-        because we may be adding and subtracting from it, and we don't want to
-        subtract bits that may be in the main map already. At the end we or the
-        result into the bit map that is being built. */
+        /* When PCRE_UCP is set, some of the POSIX classes are converted to
+        different escape sequences that use Unicode properties. */
+
+#ifdef SUPPORT_UCP
+        if ((options & PCRE_UCP) != 0)
+          {
+          int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
+          if (posix_substitutes[pc] != NULL)
+            {
+            nestptr = tempptr + 1;
+            ptr = posix_substitutes[pc] - 1;
+            continue;
+            }
+          }
+#endif
+        /* In the non-UCP case, we build the bit map for the POSIX class in a
+        chunk of local store because we may be adding and subtracting from it,
+        and we don't want to subtract bits that may be in the main map already.
+        At the end we or the result into the bit map that is being built. */
  
          posix_class *= 3;
  
@@ -3041,19 +3362,18 @@ for (;; ptr++)
  
        /* Backslash may introduce a single character, or it may introduce one
        of the specials, which just set a flag. The sequence \b is a special
-      case. Inside a class (and only there) it is treated as backspace.
-      Elsewhere it marks a word boundary. Other escapes have preset maps ready
-      to 'or' into the one we are building. We assume they have more than one
-      character in them, so set class_charcount bigger than one. */
+      case. Inside a class (and only there) it is treated as backspace. We
+      assume that other escapes have more than one character in them, so set
+      class_charcount bigger than one. Unrecognized escapes fall through and
+      are either treated as literal characters (by default), or are faulted if
+      PCRE_EXTRA is set. */
  
        if (c == CHAR_BACKSLASH)
          {
          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
          if (*errorcodeptr != 0) goto FAILED;
  
-        if (-c == ESC_b) c = CHAR_BS;       /* \b is backspace in a class */
-        else if (-c == ESC_X) c = CHAR_X;   /* \X is literal X in a class */
-        else if (-c == ESC_R) c = CHAR_R;   /* \R is literal R in a class */
+        if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
          else if (-c == ESC_Q)            /* Handle start of quoted string */
            {
            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
@@ -3070,10 +3390,20 @@ for (;; ptr++)
            register const uschar *cbits = cd->cbits;
            class_charcount += 2;     /* Greater than 1 is what matters */
  
-          /* Save time by not doing this in the pre-compile phase. */
-
-          if (lengthptr == NULL) switch (-c)
+          switch (-c)
              {
+#ifdef SUPPORT_UCP
+            case ESC_du:     /* These are the values given for \d etc */
+            case ESC_DU:     /* when PCRE_UCP is set. We replace the */
+            case ESC_wu:     /* escape sequence with an appropriate \p */
+            case ESC_WU:     /* or \P to test Unicode properties instead */
+            case ESC_su:     /* of the default ASCII testing. */
+            case ESC_SU:
+            nestptr = ptr;
+            ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
+            class_charcount -= 2;                /* Undo! */
+            continue;
+#endif
              case ESC_d:
              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
              continue;
@@ -3092,9 +3422,14 @@ for (;; ptr++)
              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
              continue;
  
+            /* Perl 5.004 onwards omits VT from \s, but we must preserve it
+            if it was previously set by something earlier in the character
+            class. */
+
              case ESC_s:
-            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
-            classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
+            classbits[0] |= cbits[cbit_space];
+            classbits[1] |= cbits[cbit_space+1] & ~0x08;
+            for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
              continue;
  
              case ESC_S:
@@ -3103,20 +3438,7 @@ for (;; ptr++)
              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
              continue;
  
-            default:    /* Not recognized; fall through */
-            break;      /* Need "default" setting to stop compiler warning. */
-            }
-
-          /* In the pre-compile phase, just do the recognition. */
-
-          else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
-                   c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
-
-          /* We need to deal with \H, \h, \V, and \v in both phases because
-          they use extra memory. */
-
-          if (-c == ESC_h)
-            {
+            case ESC_h:
              SETBIT(classbits, 0x09); /* VT */
              SETBIT(classbits, 0x20); /* SPACE */
              SETBIT(classbits, 0xa0); /* NSBP */
@@ -3140,10 +3462,8 @@ for (;; ptr++)
                }
  #endif
              continue;
-            }
  
-          if (-c == ESC_H)
-            {
+            case ESC_H:
              for (c = 0; c < 32; c++)
                {
                int x = 0xff;
@@ -3185,10 +3505,8 @@ for (;; ptr++)
                }
  #endif
              continue;
-            }
  
-          if (-c == ESC_v)
-            {
+            case ESC_v:
              SETBIT(classbits, 0x0a); /* LF */
              SETBIT(classbits, 0x0b); /* VT */
              SETBIT(classbits, 0x0c); /* FF */
@@ -3204,10 +3522,8 @@ for (;; ptr++)
                }
  #endif
              continue;
-            }
  
-          if (-c == ESC_V)
-            {
+            case ESC_V:
              for (c = 0; c < 32; c++)
                {
                int x = 0xff;
@@ -3237,38 +3553,38 @@ for (;; ptr++)
                }
  #endif
              continue;
-            }
-
-          /* We need to deal with \P and \p in both phases. */
  
  #ifdef SUPPORT_UCP
-          if (-c == ESC_p || -c == ESC_P)
-            {
-            BOOL negated;
-            int pdata;
-            int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
-            if (ptype < 0) goto FAILED;
-            class_utf8 = TRUE;
-            *class_utf8data++ = ((-c == ESC_p) != negated)?
-              XCL_PROP : XCL_NOTPROP;
-            *class_utf8data++ = ptype;
-            *class_utf8data++ = pdata;
-            class_charcount -= 2;   /* Not a < 256 character */
-            continue;
-            }
+            case ESC_p:
+            case ESC_P:
+              {
+              BOOL negated;
+              int pdata;
+              int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
+              if (ptype < 0) goto FAILED;
+              class_utf8 = TRUE;
+              *class_utf8data++ = ((-c == ESC_p) != negated)?
+                XCL_PROP : XCL_NOTPROP;
+              *class_utf8data++ = ptype;
+              *class_utf8data++ = pdata;
+              class_charcount -= 2;   /* Not a < 256 character */
+              continue;
+              }
  #endif
-          /* Unrecognized escapes are faulted if PCRE is running in its
-          strict mode. By default, for compatibility with Perl, they are
-          treated as literals. */
+            /* Unrecognized escapes are faulted if PCRE is running in its
+            strict mode. By default, for compatibility with Perl, they are
+            treated as literals. */
  
-          if ((options & PCRE_EXTRA) != 0)
-            {
-            *errorcodeptr = ERR7;
-            goto FAILED;
+            default:
+            if ((options & PCRE_EXTRA) != 0)
+              {
+              *errorcodeptr = ERR7;
+              goto FAILED;
+              }
+            class_charcount -= 2;  /* Undo the default count from above */
+            c = *ptr;              /* Get the final character and fall through */
+            break;
              }
-
-          class_charcount -= 2;  /* Undo the default count from above */
-          c = *ptr;              /* Get the final character and fall through */
            }
  
          /* Fall through if we have a single character (c >= 0). This may be
@@ -3338,14 +3654,11 @@ for (;; ptr++)
            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
            if (*errorcodeptr != 0) goto FAILED;
  
-          /* \b is backspace; \X is literal X; \R is literal R; any other
-          special means the '-' was literal */
+          /* \b is backspace; any other special means the '-' was literal */
  
            if (d < 0)
              {
-            if (d == -ESC_b) d = CHAR_BS;
-            else if (d == -ESC_X) d = CHAR_X;
-            else if (d == -ESC_R) d = CHAR_R; else
+            if (d == -ESC_b) d = CHAR_BS; else
                {
                ptr = oldptr;
                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
@@ -3511,35 +3824,23 @@ for (;; ptr++)
          }
        }
  
-    /* Loop until ']' reached. This "while" is the end of the "do" above. */
+    /* Loop until ']' reached. This "while" is the end of the "do" far above.
+    If we are at the end of an internal nested string, revert to the outer
+    string. */
  
-    while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
+    while (((c = *(++ptr)) != 0 ||
+           (nestptr != NULL &&
+             (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
+           (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
  
-    if (c == 0)                          /* Missing terminating ']' */
+    /* Check for missing terminating ']' */
+
+    if (c == 0)
        {
        *errorcodeptr = ERR6;
        goto FAILED;
        }
  
-
-/* This code has been disabled because it would mean that \s counts as
-an explicit \r or \n reference, and that's not really what is wanted. Now
-we set the flag only if there is a literal "\r" or "\n" in the class. */
-
-#if 0
-    /* Remember whether \r or \n are in this class */
-
-    if (negate_class)
-      {
-      if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
-      }
-    else
-      {
-      if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
-      }
-#endif
-
-
      /* If class_charcount is 1, we saw precisely one character whose value is
      less than 256. As long as there were no characters >= 128 and there was no
      use of \p or \P, in other words, no use of any XCLASS features, we can
@@ -3603,13 +3904,14 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
  
      /* If there are characters with values > 255, we have to compile an
      extended class, with its own opcode, unless there was a negated special
-    such as \S in the class, because in that case all characters > 255 are in
-    the class, so any that were explicitly given as well can be ignored. If
-    (when there are explicit characters > 255 that must be listed) there are no
-    characters < 256, we can omit the bitmap in the actual compiled code. */
+    such as \S in the class, and PCRE_UCP is not set, because in that case all
+    characters > 255 are in the class, so any that were explicitly given as
+    well can be ignored. If (when there are explicit characters > 255 that must
+    be listed) there are no characters < 256, we can omit the bitmap in the
+    actual compiled code. */
  
  #ifdef SUPPORT_UTF8
-    if (class_utf8 && !should_flip_negation)
+    if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))
        {
        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
        *code++ = OP_XCLASS;
@@ -3635,10 +3937,11 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
        }
  #endif
  
-    /* If there are no characters > 255, set the opcode to OP_CLASS or
-    OP_NCLASS, depending on whether the whole class was negated and whether
-    there were negative specials such as \S in the class. Then copy the 32-byte
-    map into the code vector, negating it if necessary. */
+    /* If there are no characters > 255, or they are all to be included or
+    excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
+    whole class was negated and whether there were negative specials such as \S
+    (non-UCP) in the class. Then copy the 32-byte map into the code vector,
+    negating it if necessary. */
  
      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
      if (negate_class)
@@ -3762,8 +4065,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
  
        if (!possessive_quantifier &&
            repeat_max < 0 &&
-          check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
-            options, cd))
+          check_auto_possessive(previous, utf8, ptr + 1, options, cd))
          {
          repeat_type = 0;    /* Force greedy */
          possessive_quantifier = TRUE;
@@ -3784,7 +4086,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
        c = previous[1];
        if (!possessive_quantifier &&
            repeat_max < 0 &&
-          check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
+          check_auto_possessive(previous, utf8, ptr + 1, options, cd))
          {
          repeat_type = 0;    /* Force greedy */
          possessive_quantifier = TRUE;
@@ -3808,7 +4110,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
  
        if (!possessive_quantifier &&
            repeat_max < 0 &&
-          check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
+          check_auto_possessive(previous, utf8, ptr + 1, options, cd))
          {
          repeat_type = 0;    /* Force greedy */
          possessive_quantifier = TRUE;
@@ -4018,7 +4320,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
        {
        register int i;
        int ketoffset = 0;
-      int len = code - previous;
+      int len = (int)(code - previous);
        uschar *bralink = NULL;
  
        /* Repeating a DEFINE group is pointless */
@@ -4039,7 +4341,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
          {
          register uschar *ket = previous;
          do ket += GET(ket, 1); while (*ket != OP_KET);
-        ketoffset = code - ket;
+        ketoffset = (int)(code - ket);
          }
  
        /* The case of a zero minimum is special because of the need to stick
@@ -4107,7 +4409,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
            /* We chain together the bracket offset fields that have to be
            filled in later when the ends of the brackets are reached. */
  
-          offset = (bralink == NULL)? 0 : previous - bralink;
+          offset = (bralink == NULL)? 0 : (int)(previous - bralink);
            bralink = previous;
            PUTINC(previous, 0, offset);
            }
@@ -4216,7 +4518,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
              {
              int offset;
              *code++ = OP_BRA;
-            offset = (bralink == NULL)? 0 : code - bralink;
+            offset = (bralink == NULL)? 0 : (int)(code - bralink);
              bralink = code;
              PUTINC(code, 0, offset);
              }
@@ -4237,7 +4539,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
          while (bralink != NULL)
            {
            int oldlinkoffset;
-          int offset = code - bralink + 1;
+          int offset = (int)(code - bralink + 1);
            uschar *bra = code - offset;
            oldlinkoffset = GET(bra, 1);
            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
@@ -4325,7 +4627,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
  #endif
          }
  
-      len = code - tempcode;
+      len = (int)(code - tempcode);
        if (len > 0) switch (*tempcode)
          {
          case OP_STAR:  *tempcode = OP_POSSTAR; break;
@@ -4384,24 +4686,34 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
  
      /* First deal with various "verbs" that can be introduced by '*'. */
  
-    if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
+    if (*(++ptr) == CHAR_ASTERISK &&
+         ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))
        {
        int i, namelen;
+      int arglen = 0;
        const char *vn = verbnames;
-      const uschar *name = ++ptr;
+      const uschar *name = ptr + 1;
+      const uschar *arg = NULL;
        previous = NULL;
        while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
+      namelen = (int)(ptr - name);
+
        if (*ptr == CHAR_COLON)
          {
-        *errorcodeptr = ERR59;   /* Not supported */
-        goto FAILED;
+        arg = ++ptr;
+        while ((cd->ctypes[*ptr] & (ctype_letter|ctype_digit)) != 0
+          || *ptr == '_') ptr++;
+        arglen = (int)(ptr - arg);
          }
+
        if (*ptr != CHAR_RIGHT_PARENTHESIS)
          {
          *errorcodeptr = ERR60;
          goto FAILED;
          }
-      namelen = ptr - name;
+
+      /* Scan the table of verb names */
+
        for (i = 0; i < verbcount; i++)
          {
          if (namelen == verbs[i].len &&
@@ -4419,13 +4731,51 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
                PUT2INC(code, 0, oc->number);
                }
              }
-          *code++ = verbs[i].op;
-          break;
+
+          /* Handle the cases with/without an argument */
+
+          if (arglen == 0)
+            {
+            if (verbs[i].op < 0)   /* Argument is mandatory */
+              {
+              *errorcodeptr = ERR66;
+              goto FAILED;
+              }
+            *code = verbs[i].op;
+            if (*code++ == OP_THEN)
+              {
+              PUT(code, 0, code - bcptr->current_branch - 1);
+              code += LINK_SIZE;
+              }
+            }
+
+          else
+            {
+            if (verbs[i].op_arg < 0)   /* Argument is forbidden */
+              {
+              *errorcodeptr = ERR59;
+              goto FAILED;
+              }
+            *code = verbs[i].op_arg;
+            if (*code++ == OP_THEN_ARG)
+              {
+              PUT(code, 0, code - bcptr->current_branch - 1);
+              code += LINK_SIZE;
+              }
+            *code++ = arglen;
+            memcpy(code, arg, arglen);
+            code += arglen;
+            *code++ = 0;
+            }
+
+          break;  /* Found verb, exit loop */
            }
+
          vn += verbs[i].len + 1;
          }
-      if (i < verbcount) continue;
-      *errorcodeptr = ERR60;
+
+      if (i < verbcount) continue;    /* Successfully handled a verb */
+      *errorcodeptr = ERR60;          /* Verb not recognized */
        goto FAILED;
        }
  
@@ -4544,7 +4894,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
                recno * 10 + *ptr - CHAR_0 : -1;
            ptr++;
            }
-        namelen = ptr - name;
+        namelen = (int)(ptr - name);
  
          if ((terminator > 0 && *ptr++ != terminator) ||
              *ptr++ != CHAR_RIGHT_PARENTHESIS)
@@ -4605,7 +4955,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
          /* Search the pattern for a forward reference */
  
          else if ((i = find_parens(cd, name, namelen,
-                        (options & PCRE_EXTENDED) != 0)) > 0)
+                        (options & PCRE_EXTENDED) != 0, utf8)) > 0)
            {
            PUT2(code, 2+LINK_SIZE, i);
            code[1+LINK_SIZE]++;
@@ -4740,8 +5090,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
              goto FAILED;
              }
            *code++ = n;
-          PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
-          PUT(code, LINK_SIZE, 0);                    /* Default length */
+          PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
+          PUT(code, LINK_SIZE, 0);                          /* Default length */
            code += 2 * LINK_SIZE;
            }
          previous = NULL;
@@ -4774,7 +5124,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
            name = ++ptr;
  
            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
-          namelen = ptr - name;
+          namelen = (int)(ptr - name);
  
            /* In the pre-compile phase, just do a syntax check. */
  
@@ -4904,13 +5254,19 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
          NAMED_REF_OR_RECURSE:
          name = ++ptr;
          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
-        namelen = ptr - name;
+        namelen = (int)(ptr - name);
  
-        /* In the pre-compile phase, do a syntax check and set a dummy
-        reference number. */
+        /* In the pre-compile phase, do a syntax check. We used to just set
+        a dummy reference number, because it was not used in the first pass.
+        However, with the change of recursive back references to be atomic,
+        we have to look for the number so that this state can be identified, as
+        otherwise the incorrect length is computed. If it's not a backwards
+        reference, the dummy number will do. */
  
          if (lengthptr != NULL)
            {
+          const uschar *temp;
+
            if (namelen == 0)
              {
              *errorcodeptr = ERR62;
@@ -4926,7 +5282,22 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
              *errorcodeptr = ERR48;
              goto FAILED;
              }
-          recno = 0;
+
+          /* The name table does not exist in the first pass, so we cannot
+          do a simple search as in the code below. Instead, we have to scan the
+          pattern to find the number. It is important that we scan it only as
+          far as we have got because the syntax of named subpatterns has not
+          been checked for the rest of the pattern, and find_parens() assumes
+          correct syntax. In any case, it's a waste of resources to scan
+          further. We stop the scan at the current point by temporarily
+          adjusting the value of cd->end_pattern. */
+
+          temp = cd->end_pattern;
+          cd->end_pattern = ptr;
+          recno = find_parens(cd, name, namelen,
+            (options & PCRE_EXTENDED) != 0, utf8);
+          cd->end_pattern = temp;
+          if (recno < 0) recno = 0;    /* Forward ref; set dummy number */
            }
  
          /* In the real compile, seek the name in the table. We check the name
@@ -4951,7 +5322,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
              }
            else if ((recno =                /* Forward back reference */
                      find_parens(cd, name, namelen,
-                      (options & PCRE_EXTENDED) != 0)) <= 0)
+                      (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
              {
              *errorcodeptr = ERR15;
              goto FAILED;
@@ -5062,7 +5433,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
              if (called == NULL)
                {
                if (find_parens(cd, NULL, recno,
-                    (options & PCRE_EXTENDED) != 0) < 0)
+                    (options & PCRE_EXTENDED) != 0, utf8) < 0)
                  {
                  *errorcodeptr = ERR15;
                  goto FAILED;
@@ -5073,7 +5444,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
                of the group. */
  
                called = cd->start_code + recno;
-              PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
+              PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code));
                }
  
              /* If not a forward reference, and the subpattern is still open,
@@ -5097,7 +5468,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
            code += 1 + LINK_SIZE;
  
            *code = OP_RECURSE;
-          PUT(code, 1, called - cd->start_code);
+          PUT(code, 1, (int)(called - cd->start_code));
            code += 1 + LINK_SIZE;
  
            *code = OP_KET;
@@ -5208,8 +5579,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
          }     /* End of switch for character following (? */
        }       /* End of (? handling */
  
-    /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
-    all unadorned brackets become non-capturing and behave like (?:...)
+    /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
+    is set, all unadorned brackets become non-capturing and behave like (?:...)
      brackets. */
  
      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
@@ -5401,11 +5772,12 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
  
      /* ===================================================================*/
      /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
-    are arranged to be the negation of the corresponding OP_values. For the
-    back references, the values are ESC_REF plus the reference number. Only
-    back references and those types that consume a character may be repeated.
-    We can test for values between ESC_b and ESC_Z for the latter; this may
-    have to change if any new ones are ever created. */
+    are arranged to be the negation of the corresponding OP_values in the
+    default case when PCRE_UCP is not set. For the back references, the values
+    are ESC_REF plus the reference number. Only back references and those types
+    that consume a character may be repeated. We can test for values between
+    ESC_b and ESC_Z for the latter; this may have to change if any new ones are
+    ever created. */
  
      case CHAR_BACKSLASH:
      tempptr = ptr;
@@ -5565,12 +5937,24 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
  #endif
  
        /* For the rest (including \X when Unicode properties are supported), we
-      can obtain the OP value by negating the escape value. */
+      can obtain the OP value by negating the escape value in the default
+      situation when PCRE_UCP is not set. When it *is* set, we substitute
+      Unicode property tests. */
  
        else
          {
-        previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
-        *code++ = -c;
+#ifdef SUPPORT_UCP
+        if (-c >= ESC_DU && -c <= ESC_wu)
+          {
+          nestptr = ptr + 1;                   /* Where to resume */
+          ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
+          }
+        else
+#endif
+          {
+          previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
+          *code++ = -c;
+          }
          }
        continue;
        }
@@ -5902,7 +6286,7 @@ for (;;)
      {
      if (lengthptr == NULL)
        {
-      int branch_length = code - last_branch;
+      int branch_length = (int)(code - last_branch);
        do
          {
          int prev_length = GET(last_branch, 1);
@@ -5916,7 +6300,7 @@ for (;;)
      /* Fill in the ket */
  
      *code = OP_KET;
-    PUT(code, 1, code - start_bracket);
+    PUT(code, 1, (int)(code - start_bracket));
      code += 1 + LINK_SIZE;
  
      /* If it was a capturing subpattern, check to see if it contained any
@@ -5931,9 +6315,9 @@ for (;;)
            code - start_bracket);
          *start_bracket = OP_ONCE;
          code += 1 + LINK_SIZE;
-        PUT(start_bracket, 1, code - start_bracket);
+        PUT(start_bracket, 1, (int)(code - start_bracket));
          *code = OP_KET;
-        PUT(code, 1, code - start_bracket);
+        PUT(code, 1, (int)(code - start_bracket));
          code += 1 + LINK_SIZE;
          length += 2 + 2*LINK_SIZE;
          }
@@ -5988,7 +6372,7 @@ for (;;)
    else
      {
      *code = OP_ALT;
-    PUT(code, 1, code - last_branch);
+    PUT(code, 1, (int)(code - last_branch));
      bc.current_branch = last_branch = code;
      code += 1 + LINK_SIZE;
      }
@@ -6290,8 +6674,6 @@ Returns:        pointer to compiled data block, or NULL on error,
                  with errorptr and erroroffset set
  */
  
-#ifdef NOT_USED_IN_GLIB
-
  PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
  pcre_compile(const char *pattern, int options, const char **errorptr,
    int *erroroffset, const unsigned char *tables)
@@ -6299,7 +6681,6 @@ pcre_compile(const char *pattern, int options, const char **errorptr,
  return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
  }
  
-#endif
  
  PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
  pcre_compile2(const char *pattern, int options, int *errorcodeptr,
@@ -6310,7 +6691,7 @@ int length = 1;  /* For final END opcode */
  int firstbyte, reqbyte, newline;
  int errorcode = 0;
  int skipatstart = 0;
-BOOL utf8 = (options & PCRE_UTF8) != 0;
+BOOL utf8;
  size_t size;
  uschar *code;
  const uschar *codestart;
@@ -6380,6 +6761,10 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
  
    if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
      { skipatstart += 7; options |= PCRE_UTF8; continue; }
+  else if (strncmp((char *)(ptr+skipatstart+2), STRING_UCP_RIGHTPAR, 4) == 0)
+    { skipatstart += 6; options |= PCRE_UCP; continue; }
+  else if (strncmp((char *)(ptr+skipatstart+2), STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
+    { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
  
    if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
      { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
@@ -6404,6 +6789,8 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
    else break;
    }
  
+utf8 = (options & PCRE_UTF8) != 0;
+
  /* Can't support UTF8 unless PCRE has been compiled to include the code. */
  
  #ifdef SUPPORT_UTF8
@@ -6421,6 +6808,16 @@ if (utf8)
    }
  #endif
  
+/* Can't support UCP unless PCRE has been compiled to include the code. */
+
+#ifndef SUPPORT_UCP
+if ((options & PCRE_UCP) != 0)
+  {
+  errorcode = ERR67;
+  goto PCRE_EARLY_ERROR_RETURN;
+  }
+#endif
+
  /* Check validity of \R options. */
  
  switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
@@ -6549,7 +6946,7 @@ regex compiled on a system with 4-byte pointers is run on another with 8-byte
  pointers. */
  
  re->magic_number = MAGIC_NUMBER;
-re->size = size;
+re->size = (int)size;
  re->options = cd->external_options;
  re->flags = cd->external_flags;
  re->dummy1 = 0;
@@ -6620,7 +7017,7 @@ while (errorcode == 0 && cd->hwm > cworkspace)
    recno = GET(codestart, offset);
    groupptr = _pcre_find_bracket(codestart, utf8, recno);
    if (groupptr == NULL) errorcode = ERR53;
-    else PUT(((uschar *)codestart), offset, groupptr - codestart);
+    else PUT(((uschar *)codestart), offset, (int)(groupptr - codestart));
    }
  
  /* Give an error if there's back reference to a non-existent capturing
@@ -6675,7 +7072,7 @@ if (errorcode != 0)
    {
    (pcre_free)(re);
    PCRE_EARLY_ERROR_RETURN:
-  *erroroffset = ptr - (const uschar *)pattern;
+  *erroroffset = (int)(ptr - (const uschar *)pattern);
    PCRE_EARLY_ERROR_RETURN2:
    *errorptr = find_error_text(errorcode);
    if (errorcodeptr != NULL) *errorcodeptr = errorcode;
diff --git a/glib/pcre/pcre_dfa_exec.c b/glib/pcre/pcre_dfa_exec.c

index c241f5b05822f1cd85a80f9ea17c40144f738cd2..4d61a325d2f3a6305c86821750c512079483e142 100644 (file)
--- a/glib/pcre/pcre_dfa_exec.c
+++ b/glib/pcre/pcre_dfa_exec.c
@@ -106,7 +106,7 @@ never stored, so we push them well clear of the normal opcodes. */
  
  
  /* This table identifies those opcodes that are followed immediately by a
-character that is to be tested in some way. This makes is possible to
+character that is to be tested in some way. This makes it possible to
  centralize the loading of these characters. In the case of Type * etc, the
  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
  small value. Non-zero values in the table are the offsets from the opcode where
@@ -161,8 +161,9 @@ static const uschar coptable[] = {
    0, 0,                          /* RREF, NRREF                            */
    0,                             /* DEF                                    */
    0, 0,                          /* BRAZERO, BRAMINZERO                    */
-  0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
-  0, 0, 0, 0                     /* FAIL, ACCEPT, CLOSE, SKIPZERO          */
+  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG,                */
+  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG,        */
+  0, 0, 0, 0, 0                  /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO  */
  };
  
  /* This table identifies those opcodes that inspect a character. It is used to
@@ -218,8 +219,9 @@ static const uschar poptable[] = {
    0, 0,                          /* RREF, NRREF                            */
    0,                             /* DEF                                    */
    0, 0,                          /* BRAZERO, BRAMINZERO                    */
-  0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
-  0, 0, 0, 0                     /* FAIL, ACCEPT, CLOSE, SKIPZERO          */
+  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG,                */
+  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG,        */
+  0, 0, 0, 0, 0                  /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO  */
  };
  
  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
@@ -473,7 +475,7 @@ if (*first_op == OP_REVERSE)
  
      {
      gone_back = (current_subject - max_back < start_subject)?
-      current_subject - start_subject : max_back;
+      (int)(current_subject - start_subject) : max_back;
      current_subject -= gone_back;
      }
  
@@ -490,7 +492,7 @@ if (*first_op == OP_REVERSE)
      int back = GET(end_code, 2+LINK_SIZE);
      if (back <= gone_back)
        {
-      int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
+      int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
        ADD_NEW_DATA(-bstate, 0, gone_back - back);
        }
      end_code += GET(end_code, 1);
@@ -526,7 +528,7 @@ else
        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
      do
        {
-      ADD_NEW(end_code - start_code + length, 0);
+      ADD_NEW((int)(end_code - start_code + length), 0);
        end_code += GET(end_code, 1);
        length = 1 + LINK_SIZE;
        }
@@ -753,8 +755,8 @@ for (;;)
            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
            if (offsetcount >= 2)
              {
-            offsets[0] = current_subject - start_subject;
-            offsets[1] = ptr - start_subject;
+            offsets[0] = (int)(current_subject - start_subject);
+            offsets[1] = (int)(ptr - start_subject);
              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
                offsets[1] - offsets[0], current_subject));
              }
@@ -776,7 +778,7 @@ for (;;)
        /*-----------------------------------------------------------------*/
        case OP_ALT:
        do { code += GET(code, 1); } while (*code == OP_ALT);
-      ADD_ACTIVE(code - start_code, 0);
+      ADD_ACTIVE((int)(code - start_code), 0);
        break;
  
        /*-----------------------------------------------------------------*/
@@ -784,7 +786,7 @@ for (;;)
        case OP_SBRA:
        do
          {
-        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
          code += GET(code, 1);
          }
        while (*code == OP_ALT);
@@ -793,11 +795,11 @@ for (;;)
        /*-----------------------------------------------------------------*/
        case OP_CBRA:
        case OP_SCBRA:
-      ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);
+      ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE),  0);
        code += GET(code, 1);
        while (*code == OP_ALT)
          {
-        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);
+        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
          code += GET(code, 1);
          }
        break;
@@ -808,14 +810,14 @@ for (;;)
        ADD_ACTIVE(state_offset + 1, 0);
        code += 1 + GET(code, 2);
        while (*code == OP_ALT) code += GET(code, 1);
-      ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
        break;
  
        /*-----------------------------------------------------------------*/
        case OP_SKIPZERO:
        code += 1 + GET(code, 2);
        while (*code == OP_ALT) code += GET(code, 1);
-      ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
        break;
  
        /*-----------------------------------------------------------------*/
@@ -829,7 +831,12 @@ for (;;)
  
        /*-----------------------------------------------------------------*/
        case OP_EOD:
-      if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
+      if (ptr >= end_subject)
+        {
+        if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
+          could_continue = TRUE;
+        else { ADD_ACTIVE(state_offset + 1, 0); }
+        }
        break;
  
        /*-----------------------------------------------------------------*/
@@ -869,7 +876,9 @@ for (;;)
  
        /*-----------------------------------------------------------------*/
        case OP_EODN:
-      if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
+      if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
+        could_continue = TRUE;
+      else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
          { ADD_ACTIVE(state_offset + 1, 0); }
        break;
  
@@ -877,7 +886,9 @@ for (;;)
        case OP_DOLL:
        if ((md->moptions & PCRE_NOTEOL) == 0)
          {
-        if (clen == 0 ||
+        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
+          could_continue = TRUE;
+        else if (clen == 0 ||
              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
              ))
@@ -920,13 +931,37 @@ for (;;)
            if (utf8) BACKCHAR(temp);
  #endif
            GETCHARTEST(d, temp);
+#ifdef SUPPORT_UCP
+          if ((md->poptions & PCRE_UCP) != 0)
+            {
+            if (d == '_') left_word = TRUE; else
+              {
+              int cat = UCD_CATEGORY(d);
+              left_word = (cat == ucp_L || cat == ucp_N);
+              }
+            }
+          else
+#endif
            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
            }
-        else left_word = 0;
+        else left_word = FALSE;
  
          if (clen > 0)
+          {
+#ifdef SUPPORT_UCP
+          if ((md->poptions & PCRE_UCP) != 0)
+            {
+            if (c == '_') right_word = TRUE; else
+              {
+              int cat = UCD_CATEGORY(c);
+              right_word = (cat == ucp_L || cat == ucp_N);
+              }
+            }
+          else
+#endif
            right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
-        else right_word = 0;
+          }
+        else right_word = FALSE;
  
          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
            { ADD_ACTIVE(state_offset + 1, 0); }
@@ -953,7 +988,8 @@ for (;;)
            break;
  
            case PT_LAMP:
-          OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+          OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+               chartype == ucp_Lt;
            break;
  
            case PT_GC:
@@ -968,6 +1004,30 @@ for (;;)
            OK = UCD_SCRIPT(c) == code[2];
            break;
  
+          /* These are specials for combination cases. */
+
+          case PT_ALNUM:
+          OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+               _pcre_ucp_gentype[chartype] == ucp_N;
+          break;
+
+          case PT_SPACE:    /* Perl space */
+          OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+               c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+          break;
+
+          case PT_PXSPACE:  /* POSIX space */
+          OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+               c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+               c == CHAR_FF || c == CHAR_CR;
+          break;
+
+          case PT_WORD:
+          OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+               _pcre_ucp_gentype[chartype] == ucp_N ||
+               c == CHAR_UNDERSCORE;
+          break;
+
            /* Should never occur, but keep compilers from grumbling. */
  
            default:
@@ -1122,7 +1182,8 @@ for (;;)
            break;
  
            case PT_LAMP:
-          OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+          OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+            chartype == ucp_Lt;
            break;
  
            case PT_GC:
@@ -1137,6 +1198,30 @@ for (;;)
            OK = UCD_SCRIPT(c) == code[3];
            break;
  
+          /* These are specials for combination cases. */
+
+          case PT_ALNUM:
+          OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+               _pcre_ucp_gentype[chartype] == ucp_N;
+          break;
+
+          case PT_SPACE:    /* Perl space */
+          OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+               c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+          break;
+
+          case PT_PXSPACE:  /* POSIX space */
+          OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+               c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+               c == CHAR_FF || c == CHAR_CR;
+          break;
+
+          case PT_WORD:
+          OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+               _pcre_ucp_gentype[chartype] == ucp_N ||
+               c == CHAR_UNDERSCORE;
+          break;
+
            /* Should never occur, but keep compilers from grumbling. */
  
            default:
@@ -1344,7 +1429,8 @@ for (;;)
            break;
  
            case PT_LAMP:
-          OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+          OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+            chartype == ucp_Lt;
            break;
  
            case PT_GC:
@@ -1359,6 +1445,30 @@ for (;;)
            OK = UCD_SCRIPT(c) == code[3];
            break;
  
+          /* These are specials for combination cases. */
+
+          case PT_ALNUM:
+          OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+               _pcre_ucp_gentype[chartype] == ucp_N;
+          break;
+
+          case PT_SPACE:    /* Perl space */
+          OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+               c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+          break;
+
+          case PT_PXSPACE:  /* POSIX space */
+          OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+               c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+               c == CHAR_FF || c == CHAR_CR;
+          break;
+
+          case PT_WORD:
+          OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+               _pcre_ucp_gentype[chartype] == ucp_N ||
+               c == CHAR_UNDERSCORE;
+          break;
+
            /* Should never occur, but keep compilers from grumbling. */
  
            default:
@@ -1591,7 +1701,8 @@ for (;;)
            break;
  
            case PT_LAMP:
-          OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+          OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+            chartype == ucp_Lt;
            break;
  
            case PT_GC:
@@ -1606,6 +1717,30 @@ for (;;)
            OK = UCD_SCRIPT(c) == code[5];
            break;
  
+          /* These are specials for combination cases. */
+
+          case PT_ALNUM:
+          OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+               _pcre_ucp_gentype[chartype] == ucp_N;
+          break;
+
+          case PT_SPACE:    /* Perl space */
+          OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+               c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+          break;
+
+          case PT_PXSPACE:  /* POSIX space */
+          OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+               c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+               c == CHAR_FF || c == CHAR_CR;
+          break;
+
+          case PT_WORD:
+          OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+               _pcre_ucp_gentype[chartype] == ucp_N ||
+               c == CHAR_UNDERSCORE;
+          break;
+
            /* Should never occur, but keep compilers from grumbling. */
  
            default:
@@ -2233,7 +2368,7 @@ for (;;)
          points to the byte after the end of the class. If there is a
          quantifier, this is where it will be. */
  
-        next_state_offset = ecode - start_code;
+        next_state_offset = (int)(ecode - start_code);
  
          switch (*ecode)
            {
@@ -2304,7 +2439,7 @@ for (;;)
            md,                                   /* static match data */
            code,                                 /* this subexpression's code */
            ptr,                                  /* where we currently are */
-          ptr - start_subject,                  /* start offset */
+          (int)(ptr - start_subject),           /* start offset */
            local_offsets,                        /* offset vector */
            sizeof(local_offsets)/sizeof(int),    /* size of same */
            local_workspace,                      /* workspace vector */
@@ -2315,7 +2450,7 @@ for (;;)
  
          if (rc == PCRE_ERROR_DFA_UITEM) return rc;
          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
-            { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
+            { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
          }
        break;
  
@@ -2342,9 +2477,9 @@ for (;;)
              cb.callout_number   = code[LINK_SIZE+2];
              cb.offset_vector    = offsets;
              cb.subject          = (PCRE_SPTR)start_subject;
-            cb.subject_length   = end_subject - start_subject;
-            cb.start_match      = current_subject - start_subject;
-            cb.current_position = ptr - start_subject;
+            cb.subject_length   = (int)(end_subject - start_subject);
+            cb.start_match      = (int)(current_subject - start_subject);
+            cb.current_position = (int)(ptr - start_subject);
              cb.pattern_position = GET(code, LINK_SIZE + 3);
              cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
              cb.capture_top      = 1;
@@ -2395,7 +2530,7 @@ for (;;)
              md,                                   /* fixed match data */
              asscode,                              /* this subexpression's code */
              ptr,                                  /* where we currently are */
-            ptr - start_subject,                  /* start offset */
+            (int)(ptr - start_subject),           /* start offset */
              local_offsets,                        /* offset vector */
              sizeof(local_offsets)/sizeof(int),    /* size of same */
              local_workspace,                      /* workspace vector */
@@ -2407,7 +2542,7 @@ for (;;)
            if (rc == PCRE_ERROR_DFA_UITEM) return rc;
            if ((rc >= 0) ==
                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
-            { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
+            { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
            else
              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
            }
@@ -2428,7 +2563,7 @@ for (;;)
            md,                                   /* fixed match data */
            start_code + GET(code, 1),            /* this subexpression's code */
            ptr,                                  /* where we currently are */
-          ptr - start_subject,                  /* start offset */
+          (int)(ptr - start_subject),           /* start offset */
            local_offsets,                        /* offset vector */
            sizeof(local_offsets)/sizeof(int),    /* size of same */
            local_workspace,                      /* workspace vector */
@@ -2480,7 +2615,7 @@ for (;;)
            md,                                   /* fixed match data */
            code,                                 /* this subexpression's code */
            ptr,                                  /* where we currently are */
-          ptr - start_subject,                  /* start offset */
+          (int)(ptr - start_subject),           /* start offset */
            local_offsets,                        /* offset vector */
            sizeof(local_offsets)/sizeof(int),    /* size of same */
            local_workspace,                      /* workspace vector */
@@ -2497,7 +2632,8 @@ for (;;)
  
            do { end_subpattern += GET(end_subpattern, 1); }
              while (*end_subpattern == OP_ALT);
-          next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
+          next_state_offset =
+            (int)(end_subpattern - start_code + LINK_SIZE + 1);
  
            /* If the end of this subpattern is KETRMAX or KETRMIN, we must
            arrange for the repeat state also to be added to the relevant list.
@@ -2505,7 +2641,7 @@ for (;;)
  
            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
                                   *end_subpattern == OP_KETRMIN)?
-            end_subpattern - start_code - GET(end_subpattern, 1) : -1;
+            (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
  
            /* If we have matched an empty string, add the next state at the
            current character pointer. This is important so that the duplicate
@@ -2569,9 +2705,9 @@ for (;;)
          cb.callout_number   = code[1];
          cb.offset_vector    = offsets;
          cb.subject          = (PCRE_SPTR)start_subject;
-        cb.subject_length   = end_subject - start_subject;
-        cb.start_match      = current_subject - start_subject;
-        cb.current_position = ptr - start_subject;
+        cb.subject_length   = (int)(end_subject - start_subject);
+        cb.start_match      = (int)(current_subject - start_subject);
+        cb.current_position = (int)(ptr - start_subject);
          cb.pattern_position = GET(code, 2);
          cb.next_item_length = GET(code, 2 + LINK_SIZE);
          cb.capture_top      = 1;
@@ -2617,13 +2753,13 @@ for (;;)
          ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
           match_count < 0)                            /* no matches */
          ) &&                                         /* And... */
-        ptr >= end_subject &&                     /* Reached end of subject */
-        ptr > current_subject)                    /* Matched non-empty string */
+        ptr >= end_subject &&                  /* Reached end of subject */
+        ptr > md->start_used_ptr)              /* Inspected non-empty string */
        {
        if (offsetcount >= 2)
          {
-        offsets[0] = md->start_used_ptr - start_subject;
-        offsets[1] = end_subject - start_subject;
+        offsets[0] = (int)(md->start_used_ptr - start_subject);
+        offsets[1] = (int)(end_subject - start_subject);
          }
        match_count = PCRE_ERROR_PARTIAL;
        }
@@ -2708,6 +2844,7 @@ if (re == NULL || subject == NULL || workspace == NULL ||
     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
+if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
  
  /* We need to find the pointer to any study data before we test for byte
  flipping, so we scan the extra_data block first. This may set two fields in the
@@ -2826,16 +2963,14 @@ back the character offset. */
  #ifdef SUPPORT_UTF8
  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
    {
-  if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
-    return PCRE_ERROR_BADUTF8;
+  int tb;
+  if ((tb = _pcre_valid_utf8((uschar *)subject, length)) >= 0)
+    return (tb == length && (options & PCRE_PARTIAL_HARD) != 0)?
+      PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
    if (start_offset > 0 && start_offset < length)
      {
-    int tb = ((uschar *)subject)[start_offset];
-    if (tb > 127)
-      {
-      tb &= 0xc0;
-      if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
-      }
+    tb = ((USPTR)subject)[start_offset] & 0xc0;
+    if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
      }
    }
  #endif
@@ -2922,9 +3057,11 @@ for (;;)
  
      /* There are some optimizations that avoid running the match if a known
      starting point is not found. However, there is an option that disables
-    these, for testing and for ensuring that all callouts do actually occur. */
+    these, for testing and for ensuring that all callouts do actually occur.
+    The option can be set in the regex by (*NO_START_OPT) or passed in
+    match-time options. */
  
-    if ((options & PCRE_NO_START_OPTIMIZE) == 0)
+    if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
        {
        /* Advance to a known first byte. */
  
@@ -2982,8 +3119,16 @@ for (;;)
          while (current_subject < end_subject)
            {
            register unsigned int c = *current_subject;
-          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
-            else break;
+          if ((start_bits[c/8] & (1 << (c&7))) == 0)
+            {
+            current_subject++;
+#ifdef SUPPORT_UTF8
+            if (utf8)
+              while(current_subject < end_subject &&
+                    (*current_subject & 0xc0) == 0x80) current_subject++;
+#endif
+            }
+          else break;
            }
          }
        }
diff --git a/glib/pcre/pcre_exec.c b/glib/pcre/pcre_exec.c

index 0a44fccedd6c0c9f81a0a56371cd86c9a7a91cda..569207cc34792e90d4fcd55b4d0f0b80dd70b65c 100644 (file)
--- a/glib/pcre/pcre_exec.c
+++ b/glib/pcre/pcre_exec.c
@@ -71,10 +71,20 @@ defined PCRE_ERROR_xxx codes, which are all negative. */
  /* Special internal returns from the match() function. Make them sufficiently
  negative to avoid the external error codes. */
  
-#define MATCH_COMMIT       (-999)
-#define MATCH_PRUNE        (-998)
-#define MATCH_SKIP         (-997)
-#define MATCH_THEN         (-996)
+#define MATCH_ACCEPT       (-999)
+#define MATCH_COMMIT       (-998)
+#define MATCH_PRUNE        (-997)
+#define MATCH_SKIP         (-996)
+#define MATCH_SKIP_ARG     (-995)
+#define MATCH_THEN         (-994)
+
+/* This is a convenience macro for code that occurs many times. */
+
+#define MRRETURN(ra) \
+  { \
+  md->mark = markptr; \
+  RRETURN(ra); \
+  }
  
  /* Maximum number of ints of offset to save on the stack for recursive calls.
  If the offset vector is bigger, malloc is used. This should be a multiple of 3,
@@ -245,7 +255,8 @@ enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
         RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
         RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
         RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
-       RM51,  RM52, RM53, RM54 };
+       RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
+       RM61,  RM62 };
  
  /* These versions of the macros use the stack, as normal. There are debugging
  versions and production versions. Note that the "rw" argument of RMATCH isn't
@@ -283,7 +294,8 @@ argument of match(), which never changes. */
  
  #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
    {\
-  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
+  heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
+  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
    frame->Xwhere = rw; \
    newframe->Xeptr = ra;\
    newframe->Xecode = rb;\
@@ -304,9 +316,9 @@ argument of match(), which never changes. */
  
  #define RRETURN(ra)\
    {\
-  heapframe *newframe = frame;\
-  frame = newframe->Xprevframe;\
-  (pcre_stack_free)(newframe);\
+  heapframe *oldframe = frame;\
+  frame = oldframe->Xprevframe;\
+  (pcre_stack_free)(oldframe);\
    if (frame != NULL)\
      {\
      rrc = ra;\
@@ -410,17 +422,18 @@ immediately. The second one is used when we already know we are past the end of
  the subject. */
  
  #define CHECK_PARTIAL()\
-  if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
-    {\
-    md->hitend = TRUE;\
-    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
+  if (md->partial != 0 && eptr >= md->end_subject && \
+      eptr > md->start_used_ptr) \
+    { \
+    md->hitend = TRUE; \
+    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
      }
  
  #define SCHECK_PARTIAL()\
-  if (md->partial != 0 && eptr > mstart)\
-    {\
-    md->hitend = TRUE;\
-    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
+  if (md->partial != 0 && eptr > md->start_used_ptr) \
+    { \
+    md->hitend = TRUE; \
+    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
      }
  
  
@@ -448,13 +461,14 @@ Arguments:
  
  Returns:       MATCH_MATCH if matched            )  these values are >= 0
                 MATCH_NOMATCH if failed to match  )
+               a negative MATCH_xxx value for PRUNE, SKIP, etc
                 a negative PCRE_ERROR_xxx value if aborted by an error condition
                   (e.g. stopped by repeated call or recursion limit)
  */
  
  static int
-match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart, USPTR
-  markptr, int offset_top, match_data *md, unsigned long int ims,
+match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
+  const uschar *markptr, int offset_top, match_data *md, unsigned long int ims,
    eptrblock *eptrb, int flags, unsigned int rdepth)
  {
  /* These variables do not need to be preserved over recursion in this function,
@@ -475,7 +489,8 @@ heap storage. Set up the top-level frame here; others are obtained from the
  heap whenever RMATCH() does a "recursion". See the macro definitions above. */
  
  #ifdef NO_RECURSE
-heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
+heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
+if (frame == NULL) return PCRE_ERROR_NOMEMORY;  /* RRETURN would read frame */
  frame->Xprevframe = NULL;            /* Marks the top level */
  
  /* Copy in the original argument variables */
@@ -671,32 +686,99 @@ for (;;)
  
    switch(op)
      {
+    case OP_MARK:
+    markptr = ecode + 2;
+    RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
+      ims, eptrb, flags, RM55);
+
+    /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
+    argument, and we must check whether that argument matches this MARK's
+    argument. It is passed back in md->start_match_ptr (an overloading of that
+    variable). If it does match, we reset that variable to the current subject
+    position and return MATCH_SKIP. Otherwise, pass back the return code
+    unaltered. */
+
+    if (rrc == MATCH_SKIP_ARG &&
+        strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
+      {
+      md->start_match_ptr = eptr;
+      RRETURN(MATCH_SKIP);
+      }
+
+    if (md->mark == NULL) md->mark = markptr;
+    RRETURN(rrc);
+
      case OP_FAIL:
-    RRETURN(MATCH_NOMATCH);
+    MRRETURN(MATCH_NOMATCH);
+
+    /* COMMIT overrides PRUNE, SKIP, and THEN */
+
+    case OP_COMMIT:
+    RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
+      ims, eptrb, flags, RM52);
+    if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
+        rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
+        rrc != MATCH_THEN)
+      RRETURN(rrc);
+    MRRETURN(MATCH_COMMIT);
+
+    /* PRUNE overrides THEN */
  
      case OP_PRUNE:
      RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
        ims, eptrb, flags, RM51);
-    if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+    if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+    MRRETURN(MATCH_PRUNE);
+
+    case OP_PRUNE_ARG:
+    RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
+      ims, eptrb, flags, RM56);
+    if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+    md->mark = ecode + 2;
      RRETURN(MATCH_PRUNE);
  
-    case OP_COMMIT:
-    RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
-      ims, eptrb, flags, RM52);
-    if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-    RRETURN(MATCH_COMMIT);
+    /* SKIP overrides PRUNE and THEN */
  
      case OP_SKIP:
      RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
        ims, eptrb, flags, RM53);
-    if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+    if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
+      RRETURN(rrc);
      md->start_match_ptr = eptr;   /* Pass back current position */
-    RRETURN(MATCH_SKIP);
+    MRRETURN(MATCH_SKIP);
+
+    case OP_SKIP_ARG:
+    RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
+      ims, eptrb, flags, RM57);
+    if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
+      RRETURN(rrc);
+
+    /* Pass back the current skip name by overloading md->start_match_ptr and
+    returning the special MATCH_SKIP_ARG return code. This will either be
+    caught by a matching MARK, or get to the top, where it is treated the same
+    as PRUNE. */
+
+    md->start_match_ptr = ecode + 2;
+    RRETURN(MATCH_SKIP_ARG);
+
+    /* For THEN (and THEN_ARG) we pass back the address of the bracket or
+    the alt that is at the start of the current branch. This makes it possible
+    to skip back past alternatives that precede the THEN within the current
+    branch. */
  
      case OP_THEN:
      RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
        ims, eptrb, flags, RM54);
      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+    md->start_match_ptr = ecode - GET(ecode, 1);
+    MRRETURN(MATCH_THEN);
+
+    case OP_THEN_ARG:
+    RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
+      offset_top, md, ims, eptrb, flags, RM58);
+    if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+    md->start_match_ptr = ecode - GET(ecode, 1);
+    md->mark = ecode + LINK_SIZE + 2;
      RRETURN(MATCH_THEN);
  
      /* Handle a capturing bracket. If there is space in the offset vector, save
@@ -733,14 +815,17 @@ for (;;)
        save_capture_last = md->capture_last;
  
        DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
-      md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
+      md->offset_vector[md->offset_end - number] =
+        (int)(eptr - md->start_subject);
  
        flags = (op == OP_SCBRA)? match_cbegroup : 0;
        do
          {
          RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
            ims, eptrb, flags, RM1);
-        if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+        if (rrc != MATCH_NOMATCH &&
+            (rrc != MATCH_THEN || md->start_match_ptr != ecode))
+          RRETURN(rrc);
          md->capture_last = save_capture_last;
          ecode += GET(ecode, 1);
          }
@@ -752,6 +837,7 @@ for (;;)
        md->offset_vector[offset+1] = save_offset2;
        md->offset_vector[md->offset_end - number] = save_offset3;
  
+      if (rrc != MATCH_THEN) md->mark = markptr;
        RRETURN(MATCH_NOMATCH);
        }
  
@@ -791,6 +877,7 @@ for (;;)
  
          RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
            eptrb, flags, RM48);
+        if (rrc == MATCH_NOMATCH) md->mark = markptr;
          RRETURN(rrc);
          }
  
@@ -799,7 +886,9 @@ for (;;)
  
        RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
          eptrb, flags, RM2);
-      if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+      if (rrc != MATCH_NOMATCH &&
+          (rrc != MATCH_THEN || md->start_match_ptr != ecode))
+        RRETURN(rrc);
        ecode += GET(ecode, 1);
        }
      /* Control never reaches here. */
@@ -826,15 +915,15 @@ for (;;)
          cb.callout_number   = ecode[LINK_SIZE+2];
          cb.offset_vector    = md->offset_vector;
          cb.subject          = (PCRE_SPTR)md->start_subject;
-        cb.subject_length   = md->end_subject - md->start_subject;
-        cb.start_match      = mstart - md->start_subject;
-        cb.current_position = eptr - md->start_subject;
+        cb.subject_length   = (int)(md->end_subject - md->start_subject);
+        cb.start_match      = (int)(mstart - md->start_subject);
+        cb.current_position = (int)(eptr - md->start_subject);
          cb.pattern_position = GET(ecode, LINK_SIZE + 3);
          cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
          cb.capture_top      = offset_top/2;
          cb.capture_last     = md->capture_last;
          cb.callout_data     = md->callout_data;
-        if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
+        if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
          if (rrc < 0) RRETURN(rrc);
          }
        ecode += _pcre_OP_lengths[OP_CALLOUT];
@@ -1000,7 +1089,8 @@ for (;;)
          ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
          while (*ecode == OP_ALT) ecode += GET(ecode, 1);
          }
-      else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
+      else if (rrc != MATCH_NOMATCH &&
+              (rrc != MATCH_THEN || md->start_match_ptr != ecode))
          {
          RRETURN(rrc);         /* Need braces because of following else */
          }
@@ -1054,7 +1144,7 @@ for (;;)
        {
        md->offset_vector[offset] =
          md->offset_vector[md->offset_end - number];
-      md->offset_vector[offset+1] = eptr - md->start_subject;
+      md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
        if (offset_top <= offset) offset_top = offset + 2;
        }
      ecode += 3;
@@ -1089,14 +1179,19 @@ for (;;)
          (md->notempty ||
            (md->notempty_atstart &&
              mstart == md->start_subject + md->start_offset)))
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
  
      /* Otherwise, we have a match. */
  
      md->end_match_ptr = eptr;           /* Record where we ended */
      md->end_offset_top = offset_top;    /* and how many extracts were taken */
      md->start_match_ptr = mstart;       /* and the start (\K can modify) */
-    RRETURN(MATCH_MATCH);
+
+    /* For some reason, the macros don't work properly if an expression is
+    given as the argument to MRRETURN when the heap is in use. */
+
+    rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
+    MRRETURN(rrc);
  
      /* Change option settings */
  
@@ -1118,16 +1213,18 @@ for (;;)
        {
        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
          RM4);
-      if (rrc == MATCH_MATCH)
+      if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
          {
          mstart = md->start_match_ptr;   /* In case \K reset it */
          break;
          }
-      if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+      if (rrc != MATCH_NOMATCH &&
+          (rrc != MATCH_THEN || md->start_match_ptr != ecode))
+        RRETURN(rrc);
        ecode += GET(ecode, 1);
        }
      while (*ecode == OP_ALT);
-    if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
+    if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
  
      /* If checking an assertion for a condition, return MATCH_MATCH. */
  
@@ -1151,13 +1248,15 @@ for (;;)
        {
        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
          RM5);
-      if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
+      if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
        if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
          {
          do ecode += GET(ecode,1); while (*ecode == OP_ALT);
          break;
          }
-      if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+      if (rrc != MATCH_NOMATCH &&
+          (rrc != MATCH_THEN || md->start_match_ptr != ecode))
+        RRETURN(rrc);
        ecode += GET(ecode,1);
        }
      while (*ecode == OP_ALT);
@@ -1180,7 +1279,7 @@ for (;;)
        while (i-- > 0)
          {
          eptr--;
-        if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
+        if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
          BACKCHAR(eptr);
          }
        }
@@ -1191,7 +1290,7 @@ for (;;)
  
        {
        eptr -= GET(ecode, 1);
-      if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
+      if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
        }
  
      /* Save the earliest consulted character, then skip to next op code */
@@ -1212,15 +1311,15 @@ for (;;)
        cb.callout_number   = ecode[1];
        cb.offset_vector    = md->offset_vector;
        cb.subject          = (PCRE_SPTR)md->start_subject;
-      cb.subject_length   = md->end_subject - md->start_subject;
-      cb.start_match      = mstart - md->start_subject;
-      cb.current_position = eptr - md->start_subject;
+      cb.subject_length   = (int)(md->end_subject - md->start_subject);
+      cb.start_match      = (int)(mstart - md->start_subject);
+      cb.current_position = (int)(eptr - md->start_subject);
        cb.pattern_position = GET(ecode, 2);
        cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
        cb.capture_top      = offset_top/2;
        cb.capture_last     = md->capture_last;
        cb.callout_data     = md->callout_data;
-      if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
+      if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
        if (rrc < 0) RRETURN(rrc);
        }
      ecode += 2 + 2*LINK_SIZE;
@@ -1286,15 +1385,16 @@ for (;;)
          {
          RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
            md, ims, eptrb, flags, RM6);
-        if (rrc == MATCH_MATCH)
+        if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
            {
            DPRINTF(("Recursion matched\n"));
            md->recursive = new_recursive.prevrec;
            if (new_recursive.offset_save != stacksave)
              (pcre_free)(new_recursive.offset_save);
-          RRETURN(MATCH_MATCH);
+          MRRETURN(MATCH_MATCH);
            }
-        else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
+        else if (rrc != MATCH_NOMATCH &&
+                (rrc != MATCH_THEN || md->start_match_ptr != ecode))
            {
            DPRINTF(("Recursion gave error %d\n", rrc));
            if (new_recursive.offset_save != stacksave)
@@ -1313,7 +1413,7 @@ for (;;)
        md->recursive = new_recursive.prevrec;
        if (new_recursive.offset_save != stacksave)
          (pcre_free)(new_recursive.offset_save);
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      /* Control never reaches here */
  
@@ -1332,12 +1432,14 @@ for (;;)
      do
        {
        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
-      if (rrc == MATCH_MATCH)
+      if (rrc == MATCH_MATCH)  /* Note: _not_ MATCH_ACCEPT */
          {
          mstart = md->start_match_ptr;
          break;
          }
-      if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+      if (rrc != MATCH_NOMATCH &&
+          (rrc != MATCH_THEN || md->start_match_ptr != ecode))
+        RRETURN(rrc);
        ecode += GET(ecode,1);
        }
      while (*ecode == OP_ALT);
@@ -1467,7 +1569,7 @@ for (;;)
        md->end_match_ptr = eptr;      /* For ONCE */
        md->end_offset_top = offset_top;
        md->start_match_ptr = mstart;
-      RRETURN(MATCH_MATCH);
+      MRRETURN(MATCH_MATCH);
        }
  
      /* For capturing groups we have to check the group number back at the start
@@ -1491,7 +1593,7 @@ for (;;)
          {
          md->offset_vector[offset] =
            md->offset_vector[md->offset_end - number];
-        md->offset_vector[offset+1] = eptr - md->start_subject;
+        md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
          if (offset_top <= offset) offset_top = offset + 2;
          }
  
@@ -1562,12 +1664,12 @@ for (;;)
      /* Start of subject unless notbol, or after internal newline if multiline */
  
      case OP_CIRC:
-    if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
+    if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
      if ((ims & PCRE_MULTILINE) != 0)
        {
        if (eptr != md->start_subject &&
            (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
-        RRETURN(MATCH_NOMATCH);
+        MRRETURN(MATCH_NOMATCH);
        ecode++;
        break;
        }
@@ -1576,14 +1678,14 @@ for (;;)
      /* Start of subject assertion */
  
      case OP_SOD:
-    if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
+    if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
      ecode++;
      break;
  
      /* Start of match assertion */
  
      case OP_SOM:
-    if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
+    if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
      ecode++;
      break;
  
@@ -1601,39 +1703,42 @@ for (;;)
      if ((ims & PCRE_MULTILINE) != 0)
        {
        if (eptr < md->end_subject)
-        { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
+        { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
        else
-        { if (md->noteol) RRETURN(MATCH_NOMATCH); }
+        {
+        if (md->noteol) MRRETURN(MATCH_NOMATCH);
+        SCHECK_PARTIAL();
+        }
        ecode++;
        break;
        }
-    else
+    else  /* Not multiline */
        {
-      if (md->noteol) RRETURN(MATCH_NOMATCH);
-      if (!md->endonly)
-        {
-        if (eptr != md->end_subject &&
-            (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
-          RRETURN(MATCH_NOMATCH);
-        ecode++;
-        break;
-        }
+      if (md->noteol) MRRETURN(MATCH_NOMATCH);
+      if (!md->endonly) goto ASSERT_NL_OR_EOS;
        }
+
      /* ... else fall through for endonly */
  
      /* End of subject assertion (\z) */
  
      case OP_EOD:
-    if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
+    if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
+    SCHECK_PARTIAL();
      ecode++;
      break;
  
      /* End of subject or ending \n assertion (\Z) */
  
      case OP_EODN:
-    if (eptr != md->end_subject &&
+    ASSERT_NL_OR_EOS:
+    if (eptr < md->end_subject &&
          (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
+
+    /* Either at end of string or \n before end. */
+
+    SCHECK_PARTIAL();
      ecode++;
      break;
  
@@ -1651,14 +1756,30 @@ for (;;)
  #ifdef SUPPORT_UTF8
        if (utf8)
          {
+        /* Get status of previous character */
+
          if (eptr == md->start_subject) prev_is_word = FALSE; else
            {
            USPTR lastptr = eptr - 1;
            while((*lastptr & 0xc0) == 0x80) lastptr--;
            if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
            GETCHAR(c, lastptr);
+#ifdef SUPPORT_UCP
+          if (md->use_ucp)
+            {
+            if (c == '_') prev_is_word = TRUE; else
+              {
+              int cat = UCD_CATEGORY(c);
+              prev_is_word = (cat == ucp_L || cat == ucp_N);
+              }
+            }
+          else
+#endif
            prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
            }
+
+        /* Get status of next character */
+
          if (eptr >= md->end_subject)
            {
            SCHECK_PARTIAL();
@@ -1667,47 +1788,89 @@ for (;;)
          else
            {
            GETCHAR(c, eptr);
+#ifdef SUPPORT_UCP
+          if (md->use_ucp)
+            {
+            if (c == '_') cur_is_word = TRUE; else
+              {
+              int cat = UCD_CATEGORY(c);
+              cur_is_word = (cat == ucp_L || cat == ucp_N);
+              }
+            }
+          else
+#endif
            cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
            }
          }
        else
  #endif
  
-      /* Not in UTF-8 mode */
+      /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
+      consistency with the behaviour of \w we do use it in this case. */
  
          {
+        /* Get status of previous character */
+
          if (eptr == md->start_subject) prev_is_word = FALSE; else
            {
            if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
+#ifdef SUPPORT_UCP
+          if (md->use_ucp)
+            {
+            c = eptr[-1];
+            if (c == '_') prev_is_word = TRUE; else
+              {
+              int cat = UCD_CATEGORY(c);
+              prev_is_word = (cat == ucp_L || cat == ucp_N);
+              }
+            }
+          else
+#endif
            prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
            }
+
+        /* Get status of next character */
+
          if (eptr >= md->end_subject)
            {
            SCHECK_PARTIAL();
            cur_is_word = FALSE;
            }
-        else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
+        else
+#ifdef SUPPORT_UCP
+        if (md->use_ucp)
+          {
+          c = *eptr;
+          if (c == '_') cur_is_word = TRUE; else
+            {
+            int cat = UCD_CATEGORY(c);
+            cur_is_word = (cat == ucp_L || cat == ucp_N);
+            }
+          }
+        else
+#endif
+        cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
          }
  
        /* Now see if the situation is what we want */
  
        if ((*ecode++ == OP_WORD_BOUNDARY)?
             cur_is_word == prev_is_word : cur_is_word != prev_is_word)
-        RRETURN(MATCH_NOMATCH);
+        MRRETURN(MATCH_NOMATCH);
        }
      break;
  
      /* Match a single character type; inline for speed */
  
      case OP_ANY:
-    if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
+    if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
      /* Fall through */
  
      case OP_ALLANY:
      if (eptr++ >= md->end_subject)
        {
        SCHECK_PARTIAL();
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
      ecode++;
@@ -1720,7 +1883,7 @@ for (;;)
      if (eptr++ >= md->end_subject)
        {
        SCHECK_PARTIAL();
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      ecode++;
      break;
@@ -1729,7 +1892,7 @@ for (;;)
      if (eptr >= md->end_subject)
        {
        SCHECK_PARTIAL();
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      GETCHARINCTEST(c, eptr);
      if (
@@ -1738,7 +1901,7 @@ for (;;)
  #endif
         (md->ctypes[c] & ctype_digit) != 0
         )
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
      ecode++;
      break;
  
@@ -1746,7 +1909,7 @@ for (;;)
      if (eptr >= md->end_subject)
        {
        SCHECK_PARTIAL();
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      GETCHARINCTEST(c, eptr);
      if (
@@ -1755,7 +1918,7 @@ for (;;)
  #endif
         (md->ctypes[c] & ctype_digit) == 0
         )
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
      ecode++;
      break;
  
@@ -1763,7 +1926,7 @@ for (;;)
      if (eptr >= md->end_subject)
        {
        SCHECK_PARTIAL();
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      GETCHARINCTEST(c, eptr);
      if (
@@ -1772,7 +1935,7 @@ for (;;)
  #endif
         (md->ctypes[c] & ctype_space) != 0
         )
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
      ecode++;
      break;
  
@@ -1780,7 +1943,7 @@ for (;;)
      if (eptr >= md->end_subject)
        {
        SCHECK_PARTIAL();
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      GETCHARINCTEST(c, eptr);
      if (
@@ -1789,7 +1952,7 @@ for (;;)
  #endif
         (md->ctypes[c] & ctype_space) == 0
         )
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
      ecode++;
      break;
  
@@ -1797,7 +1960,7 @@ for (;;)
      if (eptr >= md->end_subject)
        {
        SCHECK_PARTIAL();
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      GETCHARINCTEST(c, eptr);
      if (
@@ -1806,7 +1969,7 @@ for (;;)
  #endif
         (md->ctypes[c] & ctype_word) != 0
         )
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
      ecode++;
      break;
  
@@ -1814,7 +1977,7 @@ for (;;)
      if (eptr >= md->end_subject)
        {
        SCHECK_PARTIAL();
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      GETCHARINCTEST(c, eptr);
      if (
@@ -1823,7 +1986,7 @@ for (;;)
  #endif
         (md->ctypes[c] & ctype_word) == 0
         )
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
      ecode++;
      break;
  
@@ -1831,12 +1994,12 @@ for (;;)
      if (eptr >= md->end_subject)
        {
        SCHECK_PARTIAL();
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      GETCHARINCTEST(c, eptr);
      switch(c)
        {
-      default: RRETURN(MATCH_NOMATCH);
+      default: MRRETURN(MATCH_NOMATCH);
        case 0x000d:
        if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
        break;
@@ -1849,7 +2012,7 @@ for (;;)
        case 0x0085:
        case 0x2028:
        case 0x2029:
-      if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+      if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
        break;
        }
      ecode++;
@@ -1859,7 +2022,7 @@ for (;;)
      if (eptr >= md->end_subject)
        {
        SCHECK_PARTIAL();
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      GETCHARINCTEST(c, eptr);
      switch(c)
@@ -1884,7 +2047,7 @@ for (;;)
        case 0x202f:    /* NARROW NO-BREAK SPACE */
        case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
        case 0x3000:    /* IDEOGRAPHIC SPACE */
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      ecode++;
      break;
@@ -1893,12 +2056,12 @@ for (;;)
      if (eptr >= md->end_subject)
        {
        SCHECK_PARTIAL();
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      GETCHARINCTEST(c, eptr);
      switch(c)
        {
-      default: RRETURN(MATCH_NOMATCH);
+      default: MRRETURN(MATCH_NOMATCH);
        case 0x09:      /* HT */
        case 0x20:      /* SPACE */
        case 0xa0:      /* NBSP */
@@ -1927,7 +2090,7 @@ for (;;)
      if (eptr >= md->end_subject)
        {
        SCHECK_PARTIAL();
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      GETCHARINCTEST(c, eptr);
      switch(c)
@@ -1940,7 +2103,7 @@ for (;;)
        case 0x85:      /* NEL */
        case 0x2028:    /* LINE SEPARATOR */
        case 0x2029:    /* PARAGRAPH SEPARATOR */
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      ecode++;
      break;
@@ -1949,12 +2112,12 @@ for (;;)
      if (eptr >= md->end_subject)
        {
        SCHECK_PARTIAL();
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      GETCHARINCTEST(c, eptr);
      switch(c)
        {
-      default: RRETURN(MATCH_NOMATCH);
+      default: MRRETURN(MATCH_NOMATCH);
        case 0x0a:      /* LF */
        case 0x0b:      /* VT */
        case 0x0c:      /* FF */
@@ -1976,39 +2139,72 @@ for (;;)
      if (eptr >= md->end_subject)
        {
        SCHECK_PARTIAL();
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      GETCHARINCTEST(c, eptr);
        {
        int chartype = UCD_CHARTYPE(c);
+
        switch(ecode[1])
          {
          case PT_ANY:
-        if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
+        if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
          break;
  
          case PT_LAMP:
          if ((chartype == ucp_Lu ||
               chartype == ucp_Ll ||
               chartype == ucp_Lt) == (op == OP_NOTPROP))
-          RRETURN(MATCH_NOMATCH);
-         break;
+          MRRETURN(MATCH_NOMATCH);
+        break;
  
          case PT_GC:
          if ((ecode[2] != _pcre_ucp_gentype[chartype]) == (op == OP_PROP))
-          RRETURN(MATCH_NOMATCH);
+          MRRETURN(MATCH_NOMATCH);
          break;
  
          case PT_PC:
          if ((ecode[2] != chartype) == (op == OP_PROP))
-          RRETURN(MATCH_NOMATCH);
+          MRRETURN(MATCH_NOMATCH);
          break;
  
          case PT_SC:
          if ((ecode[2] != UCD_SCRIPT(c)) == (op == OP_PROP))
-          RRETURN(MATCH_NOMATCH);
+          MRRETURN(MATCH_NOMATCH);
+        break;
+
+        /* These are specials */
+
+        case PT_ALNUM:
+        if ((_pcre_ucp_gentype[chartype] == ucp_L ||
+             _pcre_ucp_gentype[chartype] == ucp_N) == (op == OP_NOTPROP))
+          MRRETURN(MATCH_NOMATCH);
          break;
  
+        case PT_SPACE:    /* Perl space */
+        if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
+             c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
+               == (op == OP_NOTPROP))
+          MRRETURN(MATCH_NOMATCH);
+        break;
+
+        case PT_PXSPACE:  /* POSIX space */
+        if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
+             c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+             c == CHAR_FF || c == CHAR_CR)
+               == (op == OP_NOTPROP))
+          MRRETURN(MATCH_NOMATCH);
+        break;
+
+        case PT_WORD:
+        if ((_pcre_ucp_gentype[chartype] == ucp_L ||
+             _pcre_ucp_gentype[chartype] == ucp_N ||
+             c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
+          MRRETURN(MATCH_NOMATCH);
+        break;
+
+        /* This should never occur */
+
          default:
          RRETURN(PCRE_ERROR_INTERNAL);
          }
@@ -2024,12 +2220,12 @@ for (;;)
      if (eptr >= md->end_subject)
        {
        SCHECK_PARTIAL();
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      GETCHARINCTEST(c, eptr);
        {
        int category = UCD_CATEGORY(c);
-      if (category == ucp_M) RRETURN(MATCH_NOMATCH);
+      if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
        while (eptr < md->end_subject)
          {
          int len = 1;
@@ -2074,7 +2270,7 @@ for (;;)
        referenced subpattern. */
  
        if (offset >= offset_top || md->offset_vector[offset] < 0)
-        length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
+        length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1);
        else
          length = md->offset_vector[offset+1] - md->offset_vector[offset];
  
@@ -2108,7 +2304,7 @@ for (;;)
          if (!match_ref(offset, eptr, length, md, ims))
            {
            CHECK_PARTIAL();
-          RRETURN(MATCH_NOMATCH);
+          MRRETURN(MATCH_NOMATCH);
            }
          eptr += length;
          continue;              /* With the main loop */
@@ -2128,7 +2324,7 @@ for (;;)
          if (!match_ref(offset, eptr, length, md, ims))
            {
            CHECK_PARTIAL();
-          RRETURN(MATCH_NOMATCH);
+          MRRETURN(MATCH_NOMATCH);
            }
          eptr += length;
          }
@@ -2146,11 +2342,11 @@ for (;;)
            {
            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-          if (fi >= max) RRETURN(MATCH_NOMATCH);
+          if (fi >= max) MRRETURN(MATCH_NOMATCH);
            if (!match_ref(offset, eptr, length, md, ims))
              {
              CHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            eptr += length;
            }
@@ -2177,7 +2373,7 @@ for (;;)
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
            eptr -= length;
            }
-        RRETURN(MATCH_NOMATCH);
+        MRRETURN(MATCH_NOMATCH);
          }
        }
      /* Control never gets here */
@@ -2239,16 +2435,16 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            GETCHARINC(c, eptr);
            if (c > 255)
              {
-            if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
+            if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
              }
            else
              {
-            if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+            if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
              }
            }
          }
@@ -2261,10 +2457,10 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            c = *eptr++;
-          if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+          if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
            }
          }
  
@@ -2286,20 +2482,20 @@ for (;;)
              {
              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-            if (fi >= max) RRETURN(MATCH_NOMATCH);
+            if (fi >= max) MRRETURN(MATCH_NOMATCH);
              if (eptr >= md->end_subject)
                {
                SCHECK_PARTIAL();
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
              GETCHARINC(c, eptr);
              if (c > 255)
                {
-              if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
+              if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
                }
              else
                {
-              if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+              if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
                }
              }
            }
@@ -2311,14 +2507,14 @@ for (;;)
              {
              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-            if (fi >= max) RRETURN(MATCH_NOMATCH);
+            if (fi >= max) MRRETURN(MATCH_NOMATCH);
              if (eptr >= md->end_subject)
                {
                SCHECK_PARTIAL();
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
              c = *eptr++;
-            if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+            if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
              }
            }
          /* Control never gets here */
@@ -2384,7 +2580,7 @@ for (;;)
              }
            }
  
-        RRETURN(MATCH_NOMATCH);
+        MRRETURN(MATCH_NOMATCH);
          }
        }
      /* Control never gets here */
@@ -2436,10 +2632,10 @@ for (;;)
          if (eptr >= md->end_subject)
            {
            SCHECK_PARTIAL();
-          RRETURN(MATCH_NOMATCH);
+          MRRETURN(MATCH_NOMATCH);
            }
          GETCHARINCTEST(c, eptr);
-        if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
+        if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
          }
  
        /* If max == min we can continue with the main loop without the
@@ -2456,14 +2652,14 @@ for (;;)
            {
            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-          if (fi >= max) RRETURN(MATCH_NOMATCH);
+          if (fi >= max) MRRETURN(MATCH_NOMATCH);
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            GETCHARINCTEST(c, eptr);
-          if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
+          if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
            }
          /* Control never gets here */
          }
@@ -2492,7 +2688,7 @@ for (;;)
            if (eptr-- == pp) break;        /* Stop if tried at original pos */
            if (utf8) BACKCHAR(eptr);
            }
-        RRETURN(MATCH_NOMATCH);
+        MRRETURN(MATCH_NOMATCH);
          }
  
        /* Control never gets here */
@@ -2511,9 +2707,9 @@ for (;;)
        if (length > md->end_subject - eptr)
          {
          CHECK_PARTIAL();             /* Not SCHECK_PARTIAL() */
-        RRETURN(MATCH_NOMATCH);
+        MRRETURN(MATCH_NOMATCH);
          }
-      while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
+      while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
        }
      else
  #endif
@@ -2523,9 +2719,9 @@ for (;;)
        if (md->end_subject - eptr < 1)
          {
          SCHECK_PARTIAL();            /* This one can use SCHECK_PARTIAL() */
-        RRETURN(MATCH_NOMATCH);
+        MRRETURN(MATCH_NOMATCH);
          }
-      if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
+      if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
        ecode += 2;
        }
      break;
@@ -2543,7 +2739,7 @@ for (;;)
        if (length > md->end_subject - eptr)
          {
          CHECK_PARTIAL();             /* Not SCHECK_PARTIAL() */
-        RRETURN(MATCH_NOMATCH);
+        MRRETURN(MATCH_NOMATCH);
          }
  
        /* If the pattern character's value is < 128, we have only one byte, and
@@ -2551,7 +2747,7 @@ for (;;)
  
        if (fc < 128)
          {
-        if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+        if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
          }
  
        /* Otherwise we must pick up the subject character */
@@ -2570,7 +2766,7 @@ for (;;)
  #ifdef SUPPORT_UCP
            if (dc != UCD_OTHERCASE(fc))
  #endif
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
            }
          }
        }
@@ -2582,9 +2778,9 @@ for (;;)
        if (md->end_subject - eptr < 1)
          {
          SCHECK_PARTIAL();            /* This one can use SCHECK_PARTIAL() */
-        RRETURN(MATCH_NOMATCH);
+        MRRETURN(MATCH_NOMATCH);
          }
-      if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+      if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
        ecode += 2;
        }
      break;
@@ -2678,7 +2874,7 @@ for (;;)
            else
              {
              CHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            }
  
@@ -2690,7 +2886,7 @@ for (;;)
              {
              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-            if (fi >= max) RRETURN(MATCH_NOMATCH);
+            if (fi >= max) MRRETURN(MATCH_NOMATCH);
              if (eptr <= md->end_subject - length &&
                memcmp(eptr, charptr, length) == 0) eptr += length;
  #ifdef SUPPORT_UCP
@@ -2701,7 +2897,7 @@ for (;;)
              else
                {
                CHECK_PARTIAL();
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
              }
            /* Control never gets here */
@@ -2732,7 +2928,7 @@ for (;;)
              {
              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-            if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
+            if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
  #ifdef SUPPORT_UCP
              eptr--;
              BACKCHAR(eptr);
@@ -2775,9 +2971,9 @@ for (;;)
          if (eptr >= md->end_subject)
            {
            SCHECK_PARTIAL();
-          RRETURN(MATCH_NOMATCH);
+          MRRETURN(MATCH_NOMATCH);
            }
-        if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+        if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
          }
        if (min == max) continue;
        if (minimize)
@@ -2786,13 +2982,13 @@ for (;;)
            {
            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-          if (fi >= max) RRETURN(MATCH_NOMATCH);
+          if (fi >= max) MRRETURN(MATCH_NOMATCH);
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
-          if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+          if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
            }
          /* Control never gets here */
          }
@@ -2818,7 +3014,7 @@ for (;;)
            eptr--;
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
            }
-        RRETURN(MATCH_NOMATCH);
+        MRRETURN(MATCH_NOMATCH);
          }
        /* Control never gets here */
        }
@@ -2832,9 +3028,9 @@ for (;;)
          if (eptr >= md->end_subject)
            {
            SCHECK_PARTIAL();
-          RRETURN(MATCH_NOMATCH);
+          MRRETURN(MATCH_NOMATCH);
            }
-        if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
+        if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
          }
  
        if (min == max) continue;
@@ -2845,13 +3041,13 @@ for (;;)
            {
            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-          if (fi >= max) RRETURN(MATCH_NOMATCH);
+          if (fi >= max) MRRETURN(MATCH_NOMATCH);
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
-          if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
+          if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
            }
          /* Control never gets here */
          }
@@ -2876,7 +3072,7 @@ for (;;)
            eptr--;
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
            }
-        RRETURN(MATCH_NOMATCH);
+        MRRETURN(MATCH_NOMATCH);
          }
        }
      /* Control never gets here */
@@ -2888,7 +3084,7 @@ for (;;)
      if (eptr >= md->end_subject)
        {
        SCHECK_PARTIAL();
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      ecode++;
      GETCHARINCTEST(c, eptr);
@@ -2898,11 +3094,11 @@ for (;;)
        if (c < 256)
  #endif
        c = md->lcc[c];
-      if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
+      if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
        }
      else
        {
-      if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
+      if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
        }
      break;
  
@@ -2996,11 +3192,11 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            GETCHARINC(d, eptr);
            if (d < 256) d = md->lcc[d];
-          if (fc == d) RRETURN(MATCH_NOMATCH);
+          if (fc == d) MRRETURN(MATCH_NOMATCH);
            }
          }
        else
@@ -3013,9 +3209,9 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
-          if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+          if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
            }
          }
  
@@ -3032,15 +3228,15 @@ for (;;)
              {
              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-            if (fi >= max) RRETURN(MATCH_NOMATCH);
+            if (fi >= max) MRRETURN(MATCH_NOMATCH);
              if (eptr >= md->end_subject)
                {
                SCHECK_PARTIAL();
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
              GETCHARINC(d, eptr);
              if (d < 256) d = md->lcc[d];
-            if (fc == d) RRETURN(MATCH_NOMATCH);
+            if (fc == d) MRRETURN(MATCH_NOMATCH);
              }
            }
          else
@@ -3051,13 +3247,13 @@ for (;;)
              {
              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-            if (fi >= max) RRETURN(MATCH_NOMATCH);
+            if (fi >= max) MRRETURN(MATCH_NOMATCH);
              if (eptr >= md->end_subject)
                {
                SCHECK_PARTIAL();
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
-            if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+            if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
              }
            }
          /* Control never gets here */
@@ -3119,7 +3315,7 @@ for (;;)
              }
            }
  
-        RRETURN(MATCH_NOMATCH);
+        MRRETURN(MATCH_NOMATCH);
          }
        /* Control never gets here */
        }
@@ -3138,10 +3334,10 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            GETCHARINC(d, eptr);
-          if (fc == d) RRETURN(MATCH_NOMATCH);
+          if (fc == d) MRRETURN(MATCH_NOMATCH);
            }
          }
        else
@@ -3153,9 +3349,9 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
-          if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
+          if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
            }
          }
  
@@ -3172,14 +3368,14 @@ for (;;)
              {
              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-            if (fi >= max) RRETURN(MATCH_NOMATCH);
+            if (fi >= max) MRRETURN(MATCH_NOMATCH);
              if (eptr >= md->end_subject)
                {
                SCHECK_PARTIAL();
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
              GETCHARINC(d, eptr);
-            if (fc == d) RRETURN(MATCH_NOMATCH);
+            if (fc == d) MRRETURN(MATCH_NOMATCH);
              }
            }
          else
@@ -3190,13 +3386,13 @@ for (;;)
              {
              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-            if (fi >= max) RRETURN(MATCH_NOMATCH);
+            if (fi >= max) MRRETURN(MATCH_NOMATCH);
              if (eptr >= md->end_subject)
                {
                SCHECK_PARTIAL();
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
-            if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
+            if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
              }
            }
          /* Control never gets here */
@@ -3257,7 +3453,7 @@ for (;;)
              }
            }
  
-        RRETURN(MATCH_NOMATCH);
+        MRRETURN(MATCH_NOMATCH);
          }
        }
      /* Control never gets here */
@@ -3351,13 +3547,13 @@ for (;;)
          switch(prop_type)
            {
            case PT_ANY:
-          if (prop_fail_result) RRETURN(MATCH_NOMATCH);
+          if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
            for (i = 1; i <= min; i++)
              {
              if (eptr >= md->end_subject)
                {
                SCHECK_PARTIAL();
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
              GETCHARINCTEST(c, eptr);
              }
@@ -3369,14 +3565,14 @@ for (;;)
              if (eptr >= md->end_subject)
                {
                SCHECK_PARTIAL();
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
              GETCHARINCTEST(c, eptr);
              prop_chartype = UCD_CHARTYPE(c);
              if ((prop_chartype == ucp_Lu ||
                   prop_chartype == ucp_Ll ||
                   prop_chartype == ucp_Lt) == prop_fail_result)
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
              }
            break;
  
@@ -3386,12 +3582,12 @@ for (;;)
              if (eptr >= md->end_subject)
                {
                SCHECK_PARTIAL();
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
              GETCHARINCTEST(c, eptr);
              prop_category = UCD_CATEGORY(c);
              if ((prop_category == prop_value) == prop_fail_result)
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
              }
            break;
  
@@ -3401,12 +3597,12 @@ for (;;)
              if (eptr >= md->end_subject)
                {
                SCHECK_PARTIAL();
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
              GETCHARINCTEST(c, eptr);
              prop_chartype = UCD_CHARTYPE(c);
              if ((prop_chartype == prop_value) == prop_fail_result)
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
              }
            break;
  
@@ -3416,15 +3612,84 @@ for (;;)
              if (eptr >= md->end_subject)
                {
                SCHECK_PARTIAL();
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
              GETCHARINCTEST(c, eptr);
              prop_script = UCD_SCRIPT(c);
              if ((prop_script == prop_value) == prop_fail_result)
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
+            }
+          break;
+
+          case PT_ALNUM:
+          for (i = 1; i <= min; i++)
+            {
+            if (eptr >= md->end_subject)
+              {
+              SCHECK_PARTIAL();
+              MRRETURN(MATCH_NOMATCH);
+              }
+            GETCHARINCTEST(c, eptr);
+            prop_category = UCD_CATEGORY(c);
+            if ((prop_category == ucp_L || prop_category == ucp_N)
+                   == prop_fail_result)
+              MRRETURN(MATCH_NOMATCH);
+            }
+          break;
+
+          case PT_SPACE:    /* Perl space */
+          for (i = 1; i <= min; i++)
+            {
+            if (eptr >= md->end_subject)
+              {
+              SCHECK_PARTIAL();
+              MRRETURN(MATCH_NOMATCH);
+              }
+            GETCHARINCTEST(c, eptr);
+            prop_category = UCD_CATEGORY(c);
+            if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
+                 c == CHAR_FF || c == CHAR_CR)
+                   == prop_fail_result)
+              MRRETURN(MATCH_NOMATCH);
+            }
+          break;
+
+          case PT_PXSPACE:  /* POSIX space */
+          for (i = 1; i <= min; i++)
+            {
+            if (eptr >= md->end_subject)
+              {
+              SCHECK_PARTIAL();
+              MRRETURN(MATCH_NOMATCH);
+              }
+            GETCHARINCTEST(c, eptr);
+            prop_category = UCD_CATEGORY(c);
+            if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
+                 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
+                   == prop_fail_result)
+              MRRETURN(MATCH_NOMATCH);
+            }
+          break;
+
+          case PT_WORD:
+          for (i = 1; i <= min; i++)
+            {
+            if (eptr >= md->end_subject)
+              {
+              SCHECK_PARTIAL();
+              MRRETURN(MATCH_NOMATCH);
+              }
+            GETCHARINCTEST(c, eptr);
+            prop_category = UCD_CATEGORY(c);
+            if ((prop_category == ucp_L || prop_category == ucp_N ||
+                 c == CHAR_UNDERSCORE)
+                   == prop_fail_result)
+              MRRETURN(MATCH_NOMATCH);
              }
            break;
  
+          /* This should not occur */
+
            default:
            RRETURN(PCRE_ERROR_INTERNAL);
            }
@@ -3440,11 +3705,11 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            GETCHARINCTEST(c, eptr);
            prop_category = UCD_CATEGORY(c);
-          if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
+          if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
            while (eptr < md->end_subject)
              {
              int len = 1;
@@ -3471,9 +3736,9 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
-          if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
+          if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
            eptr++;
            while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
            }
@@ -3485,7 +3750,7 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            eptr++;
            while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
@@ -3493,7 +3758,7 @@ for (;;)
          break;
  
          case OP_ANYBYTE:
-        if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
+        if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
          eptr += min;
          break;
  
@@ -3503,12 +3768,12 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            GETCHARINC(c, eptr);
            switch(c)
              {
-            default: RRETURN(MATCH_NOMATCH);
+            default: MRRETURN(MATCH_NOMATCH);
              case 0x000d:
              if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
              break;
@@ -3521,7 +3786,7 @@ for (;;)
              case 0x0085:
              case 0x2028:
              case 0x2029:
-            if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+            if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
              break;
              }
            }
@@ -3533,7 +3798,7 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            GETCHARINC(c, eptr);
            switch(c)
@@ -3558,7 +3823,7 @@ for (;;)
              case 0x202f:    /* NARROW NO-BREAK SPACE */
              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
              case 0x3000:    /* IDEOGRAPHIC SPACE */
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            }
          break;
@@ -3569,12 +3834,12 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            GETCHARINC(c, eptr);
            switch(c)
              {
-            default: RRETURN(MATCH_NOMATCH);
+            default: MRRETURN(MATCH_NOMATCH);
              case 0x09:      /* HT */
              case 0x20:      /* SPACE */
              case 0xa0:      /* NBSP */
@@ -3605,7 +3870,7 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            GETCHARINC(c, eptr);
            switch(c)
@@ -3618,7 +3883,7 @@ for (;;)
              case 0x85:      /* NEL */
              case 0x2028:    /* LINE SEPARATOR */
              case 0x2029:    /* PARAGRAPH SEPARATOR */
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            }
          break;
@@ -3629,12 +3894,12 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            GETCHARINC(c, eptr);
            switch(c)
              {
-            default: RRETURN(MATCH_NOMATCH);
+            default: MRRETURN(MATCH_NOMATCH);
              case 0x0a:      /* LF */
              case 0x0b:      /* VT */
              case 0x0c:      /* FF */
@@ -3653,11 +3918,11 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            GETCHARINC(c, eptr);
            if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
            }
          break;
  
@@ -3667,10 +3932,10 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
            /* No need to skip more bytes - we know it's a 1-byte character */
            }
          break;
@@ -3681,10 +3946,10 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
            while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
            }
          break;
@@ -3695,10 +3960,10 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
            /* No need to skip more bytes - we know it's a 1-byte character */
            }
          break;
@@ -3709,10 +3974,10 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
            while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
            }
          break;
@@ -3723,10 +3988,10 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
            /* No need to skip more bytes - we know it's a 1-byte character */
            }
          break;
@@ -3749,9 +4014,9 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
-          if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
+          if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
            eptr++;
            }
          break;
@@ -3760,7 +4025,7 @@ for (;;)
          if (eptr > md->end_subject - min)
            {
            SCHECK_PARTIAL();
-          RRETURN(MATCH_NOMATCH);
+          MRRETURN(MATCH_NOMATCH);
            }
          eptr += min;
          break;
@@ -3769,7 +4034,7 @@ for (;;)
          if (eptr > md->end_subject - min)
            {
            SCHECK_PARTIAL();
-          RRETURN(MATCH_NOMATCH);
+          MRRETURN(MATCH_NOMATCH);
            }
          eptr += min;
          break;
@@ -3780,11 +4045,11 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            switch(*eptr++)
              {
-            default: RRETURN(MATCH_NOMATCH);
+            default: MRRETURN(MATCH_NOMATCH);
              case 0x000d:
              if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
              break;
@@ -3794,7 +4059,7 @@ for (;;)
              case 0x000b:
              case 0x000c:
              case 0x0085:
-            if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+            if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
              break;
              }
            }
@@ -3806,7 +4071,7 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            switch(*eptr++)
              {
@@ -3814,7 +4079,7 @@ for (;;)
              case 0x09:      /* HT */
              case 0x20:      /* SPACE */
              case 0xa0:      /* NBSP */
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            }
          break;
@@ -3825,11 +4090,11 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            switch(*eptr++)
              {
-            default: RRETURN(MATCH_NOMATCH);
+            default: MRRETURN(MATCH_NOMATCH);
              case 0x09:      /* HT */
              case 0x20:      /* SPACE */
              case 0xa0:      /* NBSP */
@@ -3844,7 +4109,7 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            switch(*eptr++)
              {
@@ -3854,7 +4119,7 @@ for (;;)
              case 0x0c:      /* FF */
              case 0x0d:      /* CR */
              case 0x85:      /* NEL */
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            }
          break;
@@ -3865,11 +4130,11 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            switch(*eptr++)
              {
-            default: RRETURN(MATCH_NOMATCH);
+            default: MRRETURN(MATCH_NOMATCH);
              case 0x0a:      /* LF */
              case 0x0b:      /* VT */
              case 0x0c:      /* FF */
@@ -3886,9 +4151,9 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
-          if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
+          if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
            }
          break;
  
@@ -3898,9 +4163,9 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
-          if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
+          if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
            }
          break;
  
@@ -3910,9 +4175,9 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
-          if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
+          if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
            }
          break;
  
@@ -3922,9 +4187,9 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
-          if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
+          if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
            }
          break;
  
@@ -3934,10 +4199,10 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            if ((md->ctypes[*eptr++] & ctype_word) != 0)
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
            }
          break;
  
@@ -3947,10 +4212,10 @@ for (;;)
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            if ((md->ctypes[*eptr++] & ctype_word) == 0)
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
            }
          break;
  
@@ -3979,14 +4244,14 @@ for (;;)
              {
              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-            if (fi >= max) RRETURN(MATCH_NOMATCH);
+            if (fi >= max) MRRETURN(MATCH_NOMATCH);
              if (eptr >= md->end_subject)
                {
                SCHECK_PARTIAL();
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
-            GETCHARINC(c, eptr);
-            if (prop_fail_result) RRETURN(MATCH_NOMATCH);
+            GETCHARINCTEST(c, eptr);
+            if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
              }
            /* Control never gets here */
  
@@ -3995,18 +4260,18 @@ for (;;)
              {
              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-            if (fi >= max) RRETURN(MATCH_NOMATCH);
+            if (fi >= max) MRRETURN(MATCH_NOMATCH);
              if (eptr >= md->end_subject)
                {
                SCHECK_PARTIAL();
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
-            GETCHARINC(c, eptr);
+            GETCHARINCTEST(c, eptr);
              prop_chartype = UCD_CHARTYPE(c);
              if ((prop_chartype == ucp_Lu ||
                   prop_chartype == ucp_Ll ||
                   prop_chartype == ucp_Lt) == prop_fail_result)
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
              }
            /* Control never gets here */
  
@@ -4015,16 +4280,16 @@ for (;;)
              {
              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-            if (fi >= max) RRETURN(MATCH_NOMATCH);
+            if (fi >= max) MRRETURN(MATCH_NOMATCH);
              if (eptr >= md->end_subject)
                {
                SCHECK_PARTIAL();
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
-            GETCHARINC(c, eptr);
+            GETCHARINCTEST(c, eptr);
              prop_category = UCD_CATEGORY(c);
              if ((prop_category == prop_value) == prop_fail_result)
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
              }
            /* Control never gets here */
  
@@ -4033,16 +4298,16 @@ for (;;)
              {
              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-            if (fi >= max) RRETURN(MATCH_NOMATCH);
+            if (fi >= max) MRRETURN(MATCH_NOMATCH);
              if (eptr >= md->end_subject)
                {
                SCHECK_PARTIAL();
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
-            GETCHARINC(c, eptr);
+            GETCHARINCTEST(c, eptr);
              prop_chartype = UCD_CHARTYPE(c);
              if ((prop_chartype == prop_value) == prop_fail_result)
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
              }
            /* Control never gets here */
  
@@ -4051,19 +4316,101 @@ for (;;)
              {
              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-            if (fi >= max) RRETURN(MATCH_NOMATCH);
+            if (fi >= max) MRRETURN(MATCH_NOMATCH);
              if (eptr >= md->end_subject)
                {
                SCHECK_PARTIAL();
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
-            GETCHARINC(c, eptr);
+            GETCHARINCTEST(c, eptr);
              prop_script = UCD_SCRIPT(c);
              if ((prop_script == prop_value) == prop_fail_result)
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
              }
            /* Control never gets here */
  
+          case PT_ALNUM:
+          for (fi = min;; fi++)
+            {
+            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM59);
+            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+            if (fi >= max) MRRETURN(MATCH_NOMATCH);
+            if (eptr >= md->end_subject)
+              {
+              SCHECK_PARTIAL();
+              MRRETURN(MATCH_NOMATCH);
+              }
+            GETCHARINCTEST(c, eptr);
+            prop_category = UCD_CATEGORY(c);
+            if ((prop_category == ucp_L || prop_category == ucp_N)
+                   == prop_fail_result)
+              MRRETURN(MATCH_NOMATCH);
+            }
+          /* Control never gets here */
+
+          case PT_SPACE:    /* Perl space */
+          for (fi = min;; fi++)
+            {
+            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM60);
+            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+            if (fi >= max) MRRETURN(MATCH_NOMATCH);
+            if (eptr >= md->end_subject)
+              {
+              SCHECK_PARTIAL();
+              MRRETURN(MATCH_NOMATCH);
+              }
+            GETCHARINCTEST(c, eptr);
+            prop_category = UCD_CATEGORY(c);
+            if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
+                 c == CHAR_FF || c == CHAR_CR)
+                   == prop_fail_result)
+              MRRETURN(MATCH_NOMATCH);
+            }
+          /* Control never gets here */
+
+          case PT_PXSPACE:  /* POSIX space */
+          for (fi = min;; fi++)
+            {
+            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM61);
+            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+            if (fi >= max) MRRETURN(MATCH_NOMATCH);
+            if (eptr >= md->end_subject)
+              {
+              SCHECK_PARTIAL();
+              MRRETURN(MATCH_NOMATCH);
+              }
+            GETCHARINCTEST(c, eptr);
+            prop_category = UCD_CATEGORY(c);
+            if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
+                 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
+                   == prop_fail_result)
+              MRRETURN(MATCH_NOMATCH);
+            }
+          /* Control never gets here */
+
+          case PT_WORD:
+          for (fi = min;; fi++)
+            {
+            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM62);
+            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+            if (fi >= max) MRRETURN(MATCH_NOMATCH);
+            if (eptr >= md->end_subject)
+              {
+              SCHECK_PARTIAL();
+              MRRETURN(MATCH_NOMATCH);
+              }
+            GETCHARINCTEST(c, eptr);
+            prop_category = UCD_CATEGORY(c);
+            if ((prop_category == ucp_L ||
+                 prop_category == ucp_N ||
+                 c == CHAR_UNDERSCORE)
+                   == prop_fail_result)
+              MRRETURN(MATCH_NOMATCH);
+            }
+          /* Control never gets here */
+
+          /* This should never occur */
+
            default:
            RRETURN(PCRE_ERROR_INTERNAL);
            }
@@ -4078,15 +4425,15 @@ for (;;)
            {
            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-          if (fi >= max) RRETURN(MATCH_NOMATCH);
+          if (fi >= max) MRRETURN(MATCH_NOMATCH);
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            GETCHARINCTEST(c, eptr);
            prop_category = UCD_CATEGORY(c);
-          if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
+          if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
            while (eptr < md->end_subject)
              {
              int len = 1;
@@ -4110,14 +4457,14 @@ for (;;)
            {
            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-          if (fi >= max) RRETURN(MATCH_NOMATCH);
+          if (fi >= max) MRRETURN(MATCH_NOMATCH);
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            if (ctype == OP_ANY && IS_NEWLINE(eptr))
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
            GETCHARINC(c, eptr);
            switch(ctype)
              {
@@ -4129,7 +4476,7 @@ for (;;)
              case OP_ANYNL:
              switch(c)
                {
-              default: RRETURN(MATCH_NOMATCH);
+              default: MRRETURN(MATCH_NOMATCH);
                case 0x000d:
                if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
                break;
@@ -4141,7 +4488,7 @@ for (;;)
                case 0x0085:
                case 0x2028:
                case 0x2029:
-              if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+              if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
                break;
                }
              break;
@@ -4169,14 +4516,14 @@ for (;;)
                case 0x202f:    /* NARROW NO-BREAK SPACE */
                case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
                case 0x3000:    /* IDEOGRAPHIC SPACE */
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
              break;
  
              case OP_HSPACE:
              switch(c)
                {
-              default: RRETURN(MATCH_NOMATCH);
+              default: MRRETURN(MATCH_NOMATCH);
                case 0x09:      /* HT */
                case 0x20:      /* SPACE */
                case 0xa0:      /* NBSP */
@@ -4211,14 +4558,14 @@ for (;;)
                case 0x85:      /* NEL */
                case 0x2028:    /* LINE SEPARATOR */
                case 0x2029:    /* PARAGRAPH SEPARATOR */
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
              break;
  
              case OP_VSPACE:
              switch(c)
                {
-              default: RRETURN(MATCH_NOMATCH);
+              default: MRRETURN(MATCH_NOMATCH);
                case 0x0a:      /* LF */
                case 0x0b:      /* VT */
                case 0x0c:      /* FF */
@@ -4232,32 +4579,32 @@ for (;;)
  
              case OP_NOT_DIGIT:
              if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
              break;
  
              case OP_DIGIT:
              if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
              break;
  
              case OP_NOT_WHITESPACE:
              if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
              break;
  
              case OP_WHITESPACE:
              if  (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
              break;
  
              case OP_NOT_WORDCHAR:
              if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
              break;
  
              case OP_WORDCHAR:
              if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
              break;
  
              default:
@@ -4273,14 +4620,14 @@ for (;;)
            {
            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-          if (fi >= max) RRETURN(MATCH_NOMATCH);
+          if (fi >= max) MRRETURN(MATCH_NOMATCH);
            if (eptr >= md->end_subject)
              {
              SCHECK_PARTIAL();
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
              }
            if (ctype == OP_ANY && IS_NEWLINE(eptr))
-            RRETURN(MATCH_NOMATCH);
+            MRRETURN(MATCH_NOMATCH);
            c = *eptr++;
            switch(ctype)
              {
@@ -4292,7 +4639,7 @@ for (;;)
              case OP_ANYNL:
              switch(c)
                {
-              default: RRETURN(MATCH_NOMATCH);
+              default: MRRETURN(MATCH_NOMATCH);
                case 0x000d:
                if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
                break;
@@ -4303,7 +4650,7 @@ for (;;)
                case 0x000b:
                case 0x000c:
                case 0x0085:
-              if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+              if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
                break;
                }
              break;
@@ -4315,14 +4662,14 @@ for (;;)
                case 0x09:      /* HT */
                case 0x20:      /* SPACE */
                case 0xa0:      /* NBSP */
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
              break;
  
              case OP_HSPACE:
              switch(c)
                {
-              default: RRETURN(MATCH_NOMATCH);
+              default: MRRETURN(MATCH_NOMATCH);
                case 0x09:      /* HT */
                case 0x20:      /* SPACE */
                case 0xa0:      /* NBSP */
@@ -4339,14 +4686,14 @@ for (;;)
                case 0x0c:      /* FF */
                case 0x0d:      /* CR */
                case 0x85:      /* NEL */
-              RRETURN(MATCH_NOMATCH);
+              MRRETURN(MATCH_NOMATCH);
                }
              break;
  
              case OP_VSPACE:
              switch(c)
                {
-              default: RRETURN(MATCH_NOMATCH);
+              default: MRRETURN(MATCH_NOMATCH);
                case 0x0a:      /* LF */
                case 0x0b:      /* VT */
                case 0x0c:      /* FF */
@@ -4357,27 +4704,27 @@ for (;;)
              break;
  
              case OP_NOT_DIGIT:
-            if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
+            if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
              break;
  
              case OP_DIGIT:
-            if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
+            if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
              break;
  
              case OP_NOT_WHITESPACE:
-            if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
+            if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
              break;
  
              case OP_WHITESPACE:
-            if  ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
+            if  ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
              break;
  
              case OP_NOT_WORDCHAR:
-            if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
+            if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
              break;
  
              case OP_WORDCHAR:
-            if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
+            if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
              break;
  
              default:
@@ -4410,7 +4757,7 @@ for (;;)
                SCHECK_PARTIAL();
                break;
                }
-            GETCHARLEN(c, eptr, len);
+            GETCHARLENTEST(c, eptr, len);
              if (prop_fail_result) break;
              eptr+= len;
              }
@@ -4425,7 +4772,7 @@ for (;;)
                SCHECK_PARTIAL();
                break;
                }
-            GETCHARLEN(c, eptr, len);
+            GETCHARLENTEST(c, eptr, len);
              prop_chartype = UCD_CHARTYPE(c);
              if ((prop_chartype == ucp_Lu ||
                   prop_chartype == ucp_Ll ||
@@ -4444,7 +4791,7 @@ for (;;)
                SCHECK_PARTIAL();
                break;
                }
-            GETCHARLEN(c, eptr, len);
+            GETCHARLENTEST(c, eptr, len);
              prop_category = UCD_CATEGORY(c);
              if ((prop_category == prop_value) == prop_fail_result)
                break;
@@ -4461,7 +4808,7 @@ for (;;)
                SCHECK_PARTIAL();
                break;
                }
-            GETCHARLEN(c, eptr, len);
+            GETCHARLENTEST(c, eptr, len);
              prop_chartype = UCD_CHARTYPE(c);
              if ((prop_chartype == prop_value) == prop_fail_result)
                break;
@@ -4478,13 +4825,90 @@ for (;;)
                SCHECK_PARTIAL();
                break;
                }
-            GETCHARLEN(c, eptr, len);
+            GETCHARLENTEST(c, eptr, len);
              prop_script = UCD_SCRIPT(c);
              if ((prop_script == prop_value) == prop_fail_result)
                break;
              eptr+= len;
              }
            break;
+
+          case PT_ALNUM:
+          for (i = min; i < max; i++)
+            {
+            int len = 1;
+            if (eptr >= md->end_subject)
+              {
+              SCHECK_PARTIAL();
+              break;
+              }
+            GETCHARLENTEST(c, eptr, len);
+            prop_category = UCD_CATEGORY(c);
+            if ((prop_category == ucp_L || prop_category == ucp_N)
+                 == prop_fail_result)
+              break;
+            eptr+= len;
+            }
+          break;
+
+          case PT_SPACE:    /* Perl space */
+          for (i = min; i < max; i++)
+            {
+            int len = 1;
+            if (eptr >= md->end_subject)
+              {
+              SCHECK_PARTIAL();
+              break;
+              }
+            GETCHARLENTEST(c, eptr, len);
+            prop_category = UCD_CATEGORY(c);
+            if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
+                 c == CHAR_FF || c == CHAR_CR)
+                 == prop_fail_result)
+              break;
+            eptr+= len;
+            }
+          break;
+
+          case PT_PXSPACE:  /* POSIX space */
+          for (i = min; i < max; i++)
+            {
+            int len = 1;
+            if (eptr >= md->end_subject)
+              {
+              SCHECK_PARTIAL();
+              break;
+              }
+            GETCHARLENTEST(c, eptr, len);
+            prop_category = UCD_CATEGORY(c);
+            if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
+                 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
+                 == prop_fail_result)
+              break;
+            eptr+= len;
+            }
+          break;
+
+          case PT_WORD:
+          for (i = min; i < max; i++)
+            {
+            int len = 1;
+            if (eptr >= md->end_subject)
+              {
+              SCHECK_PARTIAL();
+              break;
+              }
+            GETCHARLENTEST(c, eptr, len);
+            prop_category = UCD_CATEGORY(c);
+            if ((prop_category == ucp_L || prop_category == ucp_N ||
+                 c == CHAR_UNDERSCORE) == prop_fail_result)
+              break;
+            eptr+= len;
+            }
+          break;
+
+          default:
+          RRETURN(PCRE_ERROR_INTERNAL);
            }
  
          /* eptr is now past the end of the maximum run */
@@ -5037,7 +5461,7 @@ for (;;)
  
        /* Get here if we can't make it match with any permitted repetitions */
  
-      RRETURN(MATCH_NOMATCH);
+      MRRETURN(MATCH_NOMATCH);
        }
      /* Control never gets here */
  
@@ -5070,12 +5494,13 @@ switch (frame->Xwhere)
    LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
    LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
    LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
-  LBL(53) LBL(54)
+  LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58)
  #ifdef SUPPORT_UTF8
    LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
    LBL(32) LBL(34) LBL(42) LBL(46)
  #ifdef SUPPORT_UCP
    LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
+  LBL(59) LBL(60) LBL(61) LBL(62)
  #endif  /* SUPPORT_UCP */
  #endif  /* SUPPORT_UTF8 */
    default:
@@ -5204,11 +5629,11 @@ const real_pcre *external_re = (const real_pcre *)argument_re;
  const real_pcre *re = external_re;
  
  /* Plausibility checks */
-
  if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
  if (re == NULL || subject == NULL ||
     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
+if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
  
  /* This information is for finding all the numbers associated with a given
  name, for condition testing. */
@@ -5279,6 +5704,7 @@ end_subject = md->end_subject;
  
  md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
  utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
+md->use_ucp = (re->options & PCRE_UCP) != 0;
  md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
  
  md->notbol = (options & PCRE_NOTBOL) != 0;
@@ -5288,6 +5714,7 @@ md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
  md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
                ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
  md->hitend = FALSE;
+md->mark = NULL;                        /* In case never set */
  
  md->recursive = NULL;                   /* No recursion at top level */
  
@@ -5373,16 +5800,14 @@ back the character offset. */
  #ifdef SUPPORT_UTF8
  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
    {
-  if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
-    return PCRE_ERROR_BADUTF8;
+  int tb;
+  if ((tb = _pcre_valid_utf8((USPTR)subject, length)) >= 0)
+    return (tb == length && md->partial > 1)?
+      PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
    if (start_offset > 0 && start_offset < length)
      {
-    int tb = ((USPTR)subject)[start_offset];
-    if (tb > 127)
-      {
-      tb &= 0xc0;
-      if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
-      }
+    tb = ((USPTR)subject)[start_offset] & 0xc0;
+    if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
      }
    }
  #endif
@@ -5510,9 +5935,10 @@ for(;;)
    /* There are some optimizations that avoid running the match if a known
    starting point is not found, or if a known later character is not present.
    However, there is an option that disables these, for testing and for ensuring
-  that all callouts do actually occur. */
+  that all callouts do actually occur. The option can be set in the regex by
+  (*NO_START_OPT) or passed in match-time options. */
  
-  if ((options & PCRE_NO_START_OPTIMIZE) == 0)
+  if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
      {
      /* Advance to a unique first byte if there is one. */
  
@@ -5566,8 +5992,16 @@ for(;;)
        while (start_match < end_subject)
          {
          register unsigned int c = *start_match;
-        if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
-          else break;
+        if ((start_bits[c/8] & (1 << (c&7))) == 0)
+          {
+          start_match++;
+#ifdef SUPPORT_UTF8
+          if (utf8)
+            while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+              start_match++;
+#endif
+          }
+        else break;
          }
        }
      }   /* Starting optimizations */
@@ -5668,6 +6102,23 @@ for(;;)
  
    switch(rc)
      {
+    /* SKIP passes back the next starting point explicitly, but if it is the
+    same as the match we have just done, treat it as NOMATCH. */
+
+    case MATCH_SKIP:
+    if (md->start_match_ptr != start_match)
+      {
+      new_start_match = md->start_match_ptr;
+      break;
+      }
+    /* Fall through */
+
+    /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
+    the SKIP's arg was not found. We also treat this as NOMATCH. */
+
+    case MATCH_SKIP_ARG:
+    /* Fall through */
+
      /* NOMATCH and PRUNE advance by one character. THEN at this level acts
      exactly like PRUNE. */
  
@@ -5682,12 +6133,6 @@ for(;;)
  #endif
      break;
  
-    /* SKIP passes back the next starting point explicitly. */
-
-    case MATCH_SKIP:
-    new_start_match = md->start_match_ptr;
-    break;
-
      /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
  
      case MATCH_COMMIT:
@@ -5733,7 +6178,8 @@ for(;;)
           md->nllen == 2))
      start_match++;
  
-  }   /* End of for(;;) "bumpalong" loop */
+  md->mark = NULL;   /* Reset for start of next match attempt */
+  }                  /* End of for(;;) "bumpalong" loop */
  
  /* ==========================================================================*/
  
@@ -5757,7 +6203,7 @@ capturing parentheses than vector slots. */
  
  ENDLOOP:
  
-if (rc == MATCH_MATCH)
+if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
    {
    if (using_temporary_offsets)
      {
@@ -5783,12 +6229,12 @@ if (rc == MATCH_MATCH)
  
    if (offsetcount < 2) rc = 0; else
      {
-    offsets[0] = md->start_match_ptr - md->start_subject;
-    offsets[1] = md->end_match_ptr - md->start_subject;
+    offsets[0] = (int)(md->start_match_ptr - md->start_subject);
+    offsets[1] = (int)(md->end_match_ptr - md->start_subject);
      }
  
    DPRINTF((">>>> returning %d\n", rc));
-  return rc;
+  goto RETURN_MARK;
    }
  
  /* Control gets here if there has been an error, or if the overall match
@@ -5800,26 +6246,43 @@ if (using_temporary_offsets)
    (pcre_free)(md->offset_vector);
    }
  
+/* For anything other than nomatch or partial match, just return the code. */
+
  if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
    {
    DPRINTF((">>>> error: returning %d\n", rc));
    return rc;
    }
-else if (start_partial != NULL)
+
+/* Handle partial matches - disable any mark data */
+
+if (start_partial != NULL)
    {
    DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
+  md->mark = NULL;
    if (offsetcount > 1)
      {
-    offsets[0] = start_partial - (USPTR)subject;
-    offsets[1] = end_subject - (USPTR)subject;
+    offsets[0] = (int)(start_partial - (USPTR)subject);
+    offsets[1] = (int)(end_subject - (USPTR)subject);
      }
-  return PCRE_ERROR_PARTIAL;
+  rc = PCRE_ERROR_PARTIAL;
    }
+
+/* This is the classic nomatch case */
+
  else
    {
    DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
-  return PCRE_ERROR_NOMATCH;
+  rc = PCRE_ERROR_NOMATCH;
    }
+
+/* Return the MARK data if it has been requested. */
+
+RETURN_MARK:
+
+if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
+  *(extra_data->mark) = (unsigned char *)(md->mark);
+return rc;
  }
  
  /* End of pcre_exec.c */
diff --git a/glib/pcre/pcre_internal.h b/glib/pcre/pcre_internal.h

index 7c7412f32d23d14d1d28b7256eb38bb6759f482f..9cff71d2700aea82950952006c5cdc5960fc1af3 100644 (file)
--- a/glib/pcre/pcre_internal.h
+++ b/glib/pcre/pcre_internal.h
@@ -408,9 +408,10 @@ capturing parenthesis numbers in back references. */
  
  /* When UTF-8 encoding is being used, a character is no longer just a single
  byte. The macros for character handling generate simple sequences when used in
-byte-mode, and more complicated ones for UTF-8 characters. BACKCHAR should
-never be called in byte mode. To make sure it can never even appear when UTF-8
-support is omitted, we don't even define it. */
+byte-mode, and more complicated ones for UTF-8 characters. GETCHARLENTEST is
+not used when UTF-8 is not supported, so it is not defined, and BACKCHAR should
+never be called in byte mode. To make sure they can never even appear when
+UTF-8 support is omitted, we don't even define them. */
  
  #ifndef SUPPORT_UTF8
  #define GETCHAR(c, eptr) c = *eptr;
@@ -418,43 +419,83 @@ support is omitted, we don't even define it. */
  #define GETCHARINC(c, eptr) c = *eptr++;
  #define GETCHARINCTEST(c, eptr) c = *eptr++;
  #define GETCHARLEN(c, eptr, len) c = *eptr;
+/* #define GETCHARLENTEST(c, eptr, len) */
  /* #define BACKCHAR(eptr) */
  
  #else   /* SUPPORT_UTF8 */
  
+/* These macros were originally written in the form of loops that used data
+from the tables whose names start with _pcre_utf8_table. They were rewritten by
+a user so as not to use loops, because in some environments this gives a
+significant performance advantage, and it seems never to do any harm. */
+
+/* Base macro to decode the UTF-8 character at eptr into c, not advancing the
+pointer. c must already hold the lead byte; no bounds checks are made. */
+
+#define GETUTF8(c, eptr) \
+    { \
+    if ((c & 0x20) == 0) /* lead 110xxxxx: 1 trailing byte */ \
+      c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \
+    else if ((c & 0x10) == 0) /* lead 1110xxxx: 2 trailing bytes */ \
+      c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
+    else if ((c & 0x08) == 0) /* lead 11110xxx: 3 trailing bytes */ \
+      c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \
+      ((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \
+    else if ((c & 0x04) == 0) /* lead 111110xx: 4 trailing bytes */ \
+      c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \
+          ((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \
+          (eptr[4] & 0x3f); \
+    else /* anything else is decoded as 5 trailing bytes */ \
+      c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \
+          ((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \
+          ((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \
+    }
+
  /* Get the next UTF-8 character, not advancing the pointer. This is called when
  we know we are in UTF-8 mode. */
  
  #define GETCHAR(c, eptr) \
    c = *eptr; \
-  if (c >= 0xc0) \
-    { \
-    int gcii; \
-    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
-    int gcss = 6*gcaa; \
-    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
-    for (gcii = 1; gcii <= gcaa; gcii++) \
-      { \
-      gcss -= 6; \
-      c |= (eptr[gcii] & 0x3f) << gcss; \
-      } \
-    }
+  if (c >= 0xc0) GETUTF8(c, eptr);
  
  /* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
  pointer. */
  
  #define GETCHARTEST(c, eptr) \
    c = *eptr; \
-  if (utf8 && c >= 0xc0) \
+  if (utf8 && c >= 0xc0) GETUTF8(c, eptr);
+
+/* Base macro to pick up the remaining bytes of a UTF-8 character whose lead
+byte is already in c; eptr must point just past it and is advanced. */
+
+#define GETUTF8INC(c, eptr) \
      { \
-    int gcii; \
-    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
-    int gcss = 6*gcaa; \
-    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
-    for (gcii = 1; gcii <= gcaa; gcii++) \
+    if ((c & 0x20) == 0) \
+      c = ((c & 0x1f) << 6) | (*eptr++ & 0x3f); \
+    else if ((c & 0x10) == 0) \
        { \
-      gcss -= 6; \
-      c |= (eptr[gcii] & 0x3f) << gcss; \
+      c = ((c & 0x0f) << 12) | ((*eptr & 0x3f) << 6) | (eptr[1] & 0x3f); \
+      eptr += 2; \
+      } \
+    else if ((c & 0x08) == 0) \
+      { \
+      c = ((c & 0x07) << 18) | ((*eptr & 0x3f) << 12) | \
+          ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
+      eptr += 3; \
+      } \
+    else if ((c & 0x04) == 0) \
+      { \
+      c = ((c & 0x03) << 24) | ((*eptr & 0x3f) << 18) | \
+          ((eptr[1] & 0x3f) << 12) | ((eptr[2] & 0x3f) << 6) | \
+          (eptr[3] & 0x3f); \
+      eptr += 4; \
+      } \
+    else \
+      { \
+      c = ((c & 0x01) << 30) | ((*eptr & 0x3f) << 24) | \
+          ((eptr[1] & 0x3f) << 18) | ((eptr[2] & 0x3f) << 12) | \
+          ((eptr[3] & 0x3f) << 6) | (eptr[4] & 0x3f); \
+      eptr += 5; \
        } \
      }
  
@@ -463,31 +504,49 @@ know we are in UTF-8 mode. */
  
  #define GETCHARINC(c, eptr) \
    c = *eptr++; \
-  if (c >= 0xc0) \
-    { \
-    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
-    int gcss = 6*gcaa; \
-    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
-    while (gcaa-- > 0) \
-      { \
-      gcss -= 6; \
-      c |= (*eptr++ & 0x3f) << gcss; \
-      } \
-    }
+  if (c >= 0xc0) GETUTF8INC(c, eptr);
  
-/* Get the next character, testing for UTF-8 mode, and advancing the pointer */
+/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
+This is called when we don't know if we are in UTF-8 mode. */
  
  #define GETCHARINCTEST(c, eptr) \
    c = *eptr++; \
-  if (utf8 && c >= 0xc0) \
+  if (utf8 && c >= 0xc0) GETUTF8INC(c, eptr);
+
+/* Base macro to pick up the remaining bytes of a UTF-8 character, not
+advancing the pointer, incrementing the length. */
+
+#define GETUTF8LEN(c, eptr, len) \
      { \
-    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
-    int gcss = 6*gcaa; \
-    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
-    while (gcaa-- > 0) \
+    if ((c & 0x20) == 0) \
+      { \
+      c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \
+      len++; \
+      } \
+    else if ((c & 0x10)  == 0) \
        { \
-      gcss -= 6; \
-      c |= (*eptr++ & 0x3f) << gcss; \
+      c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
+      len += 2; \
+      } \
+    else if ((c & 0x08)  == 0) \
+      {\
+      c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \
+          ((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \
+      len += 3; \
+      } \
+    else if ((c & 0x04)  == 0) \
+      { \
+      c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \
+          ((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \
+          (eptr[4] & 0x3f); \
+      len += 4; \
+      } \
+    else \
+      {\
+      c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \
+          ((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \
+          ((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \
+      len += 5; \
        } \
      }
  
@@ -496,39 +555,15 @@ if there are extra bytes. This is called when we know we are in UTF-8 mode. */
  
  #define GETCHARLEN(c, eptr, len) \
    c = *eptr; \
-  if (c >= 0xc0) \
-    { \
-    int gcii; \
-    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
-    int gcss = 6*gcaa; \
-    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
-    for (gcii = 1; gcii <= gcaa; gcii++) \
-      { \
-      gcss -= 6; \
-      c |= (eptr[gcii] & 0x3f) << gcss; \
-      } \
-    len += gcaa; \
-    }
+  if (c >= 0xc0) GETUTF8LEN(c, eptr, len);
  
  /* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
  pointer, incrementing length if there are extra bytes. This is called when we
-know we are in UTF-8 mode. */
+do not know if we are in UTF-8 mode. */
  
  #define GETCHARLENTEST(c, eptr, len) \
    c = *eptr; \
-  if (utf8 && c >= 0xc0) \
-    { \
-    int gcii; \
-    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
-    int gcss = 6*gcaa; \
-    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
-    for (gcii = 1; gcii <= gcaa; gcii++) \
-      { \
-      gcss -= 6; \
-      c |= (eptr[gcii] & 0x3f) << gcss; \
-      } \
-    len += gcaa; \
-    }
+  if (utf8 && c >= 0xc0) GETUTF8LEN(c, eptr, len);
  
  /* If the pointer is not at the start of a character, move it back until
  it is. This is called only in UTF-8 mode - we don't put a test within the macro
@@ -536,7 +571,7 @@ because almost all calls are already within a block of UTF-8 only code. */
  
  #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--
  
-#endif
+#endif  /* SUPPORT_UTF8 */
  
  
  /* In case there is no definition of offsetof() provided - though any proper
@@ -580,7 +615,7 @@ time, run time, or study time, respectively. */
     PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
     PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
     PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
-   PCRE_JAVASCRIPT_COMPAT)
+   PCRE_JAVASCRIPT_COMPAT|PCRE_UCP|PCRE_NO_START_OPTIMIZE)
  
  #define PUBLIC_EXEC_OPTIONS \
    (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \
@@ -620,7 +655,7 @@ variable-length repeat, or a anything other than literal characters. */
  environments where these macros are defined elsewhere. Unfortunately, there
  is no way to do the same for the typedef. */
  
-typedef gboolean  BOOL;
+typedef gboolean BOOL;
  
  /* If PCRE is to support UTF-8 on EBCDIC platforms, we cannot use normal
  character constants like '*' because the compiler would emit their EBCDIC code,
@@ -870,6 +905,7 @@ so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */
  #define STRING_COMMIT0              "COMMIT\0"
  #define STRING_F0                   "F\0"
  #define STRING_FAIL0                "FAIL\0"
+#define STRING_MARK0                "MARK\0"
  #define STRING_PRUNE0               "PRUNE\0"
  #define STRING_SKIP0                "SKIP\0"
  #define STRING_THEN                 "THEN"
@@ -891,14 +927,16 @@ so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */
  
  #define STRING_DEFINE               "DEFINE"
  
-#define STRING_CR_RIGHTPAR          "CR)"
-#define STRING_LF_RIGHTPAR          "LF)"
-#define STRING_CRLF_RIGHTPAR        "CRLF)"
-#define STRING_ANY_RIGHTPAR         "ANY)"
-#define STRING_ANYCRLF_RIGHTPAR     "ANYCRLF)"
-#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
-#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
-#define STRING_UTF8_RIGHTPAR        "UTF8)"
+#define STRING_CR_RIGHTPAR             "CR)"
+#define STRING_LF_RIGHTPAR             "LF)"
+#define STRING_CRLF_RIGHTPAR           "CRLF)"
+#define STRING_ANY_RIGHTPAR            "ANY)"
+#define STRING_ANYCRLF_RIGHTPAR        "ANYCRLF)"
+#define STRING_BSR_ANYCRLF_RIGHTPAR    "BSR_ANYCRLF)"
+#define STRING_BSR_UNICODE_RIGHTPAR    "BSR_UNICODE)"
+#define STRING_UTF8_RIGHTPAR           "UTF8)"
+#define STRING_UCP_RIGHTPAR            "UCP)"
+#define STRING_NO_START_OPT_RIGHTPAR   "NO_START_OPT)"
  
  #else  /* SUPPORT_UTF8 */
  
@@ -1122,6 +1160,7 @@ only. */
  #define STRING_COMMIT0              STR_C STR_O STR_M STR_M STR_I STR_T "\0"
  #define STRING_F0                   STR_F "\0"
  #define STRING_FAIL0                STR_F STR_A STR_I STR_L "\0"
+#define STRING_MARK0                STR_M STR_A STR_R STR_K "\0"
  #define STRING_PRUNE0               STR_P STR_R STR_U STR_N STR_E "\0"
  #define STRING_SKIP0                STR_S STR_K STR_I STR_P "\0"
  #define STRING_THEN                 STR_T STR_H STR_E STR_N
@@ -1143,14 +1182,16 @@ only. */
  
  #define STRING_DEFINE               STR_D STR_E STR_F STR_I STR_N STR_E
  
-#define STRING_CR_RIGHTPAR          STR_C STR_R STR_RIGHT_PARENTHESIS
-#define STRING_LF_RIGHTPAR          STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_CRLF_RIGHTPAR        STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_ANY_RIGHTPAR         STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
-#define STRING_ANYCRLF_RIGHTPAR     STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
-#define STRING_UTF8_RIGHTPAR        STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
+#define STRING_CR_RIGHTPAR             STR_C STR_R STR_RIGHT_PARENTHESIS
+#define STRING_LF_RIGHTPAR             STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_CRLF_RIGHTPAR           STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_ANY_RIGHTPAR            STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
+#define STRING_ANYCRLF_RIGHTPAR        STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_BSR_ANYCRLF_RIGHTPAR    STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_BSR_UNICODE_RIGHTPAR    STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
+#define STRING_UTF8_RIGHTPAR           STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
+#define STRING_UCP_RIGHTPAR            STR_U STR_C STR_P STR_RIGHT_PARENTHESIS
+#define STRING_NO_START_OPT_RIGHTPAR   STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS
  
  #endif  /* SUPPORT_UTF8 */
  
@@ -1183,9 +1224,13 @@ only. */
  
  #define PT_ANY        0    /* Any property - matches all chars */
  #define PT_LAMP       1    /* L& - the union of Lu, Ll, Lt */
-#define PT_GC         2    /* General characteristic (e.g. L) */
-#define PT_PC         3    /* Particular characteristic (e.g. Lu) */
+#define PT_GC         2    /* Specified general characteristic (e.g. L) */
+#define PT_PC         3    /* Specified particular characteristic (e.g. Lu) */
  #define PT_SC         4    /* Script (e.g. Han) */
+#define PT_ALNUM      5    /* Alphanumeric - the union of L and N */
+#define PT_SPACE      6    /* Perl space - Z plus 9,10,12,13 */
+#define PT_PXSPACE    7    /* POSIX space - Z plus 9,10,11,12,13 */
+#define PT_WORD       8    /* Word - L plus N plus underscore */
  
  /* Flag bits and data types for the extended class (OP_XCLASS) for classes that
  contain UTF-8 characters with values greater than 255. */
@@ -1202,9 +1247,15 @@ contain UTF-8 characters with values greater than 255. */
  /* These are escaped items that aren't just an encoding of a particular data
  value such as \n. They must have non-zero values, as check_escape() returns
  their negation. Also, they must appear in the same order as in the opcode
-definitions below, up to ESC_z. There's a dummy for OP_ANY because it
-corresponds to "." rather than an escape sequence, and another for OP_ALLANY
-(which is used for [^] in JavaScript compatibility mode).
+definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
+corresponds to "." in DOTALL mode rather than an escape sequence. It is also
+used for [^] in JavaScript compatibility mode. In non-DOTALL mode, "." behaves
+like \N.
+
+The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
+when PCRE_UCP is set, when replacement of \d etc by \p sequences is required.
+They must be contiguous, and remain in order so that the replacements can be
+looked up from a table.
  
  The final escape must be ESC_REF as subsequent values are used for
  backreferences (\1, \2, \3, etc). There are two tests in the code for an escape
@@ -1214,11 +1265,12 @@ put in between that don't consume a character, that code will have to change.
  */
  
  enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
-       ESC_W, ESC_w, ESC_dum1, ESC_dum2, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
-       ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_g, ESC_k,
+       ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
+       ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z,
+       ESC_E, ESC_Q, ESC_g, ESC_k,
+       ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu,
         ESC_REF };
  
-
  /* Opcode table: Starting from 1 (i.e. after OP_END), the values up to
  OP_EOD must correspond in order to the list of escapes immediately above.
  
@@ -1242,8 +1294,8 @@ enum {
    OP_WHITESPACE,         /*  9 \s */
    OP_NOT_WORDCHAR,       /* 10 \W */
    OP_WORDCHAR,           /* 11 \w */
-  OP_ANY,            /* 12 Match any character (subject to DOTALL) */
-  OP_ALLANY,         /* 13 Match any character (not subject to DOTALL) */
+  OP_ANY,            /* 12 Match any character except newline */
+  OP_ALLANY,         /* 13 Match any character */
    OP_ANYBYTE,        /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
    OP_NOTPROP,        /* 15 \P (not Unicode property) */
    OP_PROP,           /* 16 \p (Unicode property) */
@@ -1373,20 +1425,24 @@ enum {
  
    /* These are backtracking control verbs */
  
-  OP_PRUNE,          /* 107 */
-  OP_SKIP,           /* 108 */
-  OP_THEN,           /* 109 */
-  OP_COMMIT,         /* 110 */
+  OP_MARK,           /* 107 always has an argument */
+  OP_PRUNE,          /* 108 */
+  OP_PRUNE_ARG,      /* 109 same, but with argument */
+  OP_SKIP,           /* 110 */
+  OP_SKIP_ARG,       /* 111 same, but with argument */
+  OP_THEN,           /* 112 */
+  OP_THEN_ARG,       /* 113 same, but with argument */
+  OP_COMMIT,         /* 114 */
  
    /* These are forced failure and success verbs */
  
-  OP_FAIL,           /* 111 */
-  OP_ACCEPT,         /* 112 */
-  OP_CLOSE,          /* 113 Used before OP_ACCEPT to close open captures */
+  OP_FAIL,           /* 115 */
+  OP_ACCEPT,         /* 116 */
+  OP_CLOSE,          /* 117 Used before OP_ACCEPT to close open captures */
  
    /* This is used to skip a subpattern with a {0} quantifier */
  
-  OP_SKIPZERO,       /* 114 */
+  OP_SKIPZERO,       /* 118 */
  
    /* This is not an opcode, but is used to check that tables indexed by opcode
    are the correct length, in order to catch updating errors - there have been
@@ -1397,7 +1453,7 @@ enum {
  
  /* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
  definitions that follow must also be updated to match. There are also tables
-called "coptable" cna "poptable" in pcre_dfa_exec.c that must be updated. */
+called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
  
  
  /* This macro defines textual names for all the opcodes. These are used only
@@ -1422,7 +1478,8 @@ for debugging. The macro is referenced only in pcre_printint.c. */
    "Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond",        \
    "Cond ref", "Cond nref", "Cond rec", "Cond nrec", "Cond def",   \
    "Brazero", "Braminzero",                                        \
-  "*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT",      \
+  "*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP",                  \
+  "*THEN", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT",                \
    "Close", "Skip zero"
  
  
@@ -1488,8 +1545,10 @@ in UTF-8 mode. The code that uses this table must know about such things. */
    3, 3,                          /* RREF, NRREF                            */ \
    1,                             /* DEF                                    */ \
    1, 1,                          /* BRAZERO, BRAMINZERO                    */ \
-  1, 1, 1, 1,                    /* PRUNE, SKIP, THEN, COMMIT,             */ \
-  1, 1, 3, 1                     /* FAIL, ACCEPT, CLOSE, SKIPZERO          */
+  3, 1, 3,                       /* MARK, PRUNE, PRUNE_ARG                 */ \
+  1, 3,                          /* SKIP, SKIP_ARG                         */ \
+  1+LINK_SIZE, 3+LINK_SIZE,      /* THEN, THEN_ARG                         */ \
+  1, 1, 1, 3, 1                  /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO  */
  
  
  /* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion"
@@ -1507,7 +1566,8 @@ enum { ERR0,  ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,
         ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
         ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
         ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
-       ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERRCOUNT };
+       ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68,
+       ERRCOUNT };
  
  /* The real format of the start of the pcre block; the index of names and the
  code vector run on as long as necessary after the end. We store an explicit
@@ -1650,6 +1710,7 @@ typedef struct match_data {
    BOOL   noteol;                /* NOTEOL flag */
    BOOL   utf8;                  /* UTF8 flag */
    BOOL   jscript_compat;        /* JAVASCRIPT_COMPAT flag */
+  BOOL   use_ucp;               /* PCRE_UCP flag */
    BOOL   endonly;               /* Dollar not before final \n */
    BOOL   notempty;              /* Empty string match not wanted */
    BOOL   notempty_atstart;      /* Empty string match at start not wanted */
@@ -1669,6 +1730,7 @@ typedef struct match_data {
    int    eptrn;                 /* Next free eptrblock */
    recursion_info *recursive;    /* Linked list of recursion data */
    void  *callout_data;          /* To pass back to callouts */
+  const uschar *mark;           /* Mark pointer to pass back */
  } match_data;
  
  /* A similar structure is used for the same purpose by the DFA matching
@@ -1764,7 +1826,7 @@ extern BOOL          _pcre_is_newline(USPTR, int, USPTR, int *, BOOL);
  extern int           _pcre_ord2utf8(int, uschar *);
  extern real_pcre    *_pcre_try_flipped(const real_pcre *, real_pcre *,
                         const pcre_study_data *, pcre_study_data *);
-#define              _pcre_valid_utf8(u, i) TRUE
+#define              _pcre_valid_utf8(USPTR, int) TRUE
  extern BOOL          _pcre_was_newline(USPTR, int, USPTR, int *, BOOL);
  extern BOOL          _pcre_xclass(int, const uschar *);
  
diff --git a/glib/pcre/pcre_study.c b/glib/pcre/pcre_study.c

index bd00a53a639fec41aab618f6b81f628deab815cf..be321fa25c185063637208a9e00229e1e678c8fa 100644 (file)
--- a/glib/pcre/pcre_study.c
+++ b/glib/pcre/pcre_study.c
@@ -48,6 +48,7 @@ supporting functions. */
  
  #include "pcre_internal.h"
  
+#define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
  
  /* Returns from set_start_bits() */
  
@@ -413,6 +414,18 @@ for (;;)
  #endif
      break;
  
+    /* Skip these, but we need to add in the name length. */
+
+    case OP_MARK:
+    case OP_PRUNE_ARG:
+    case OP_SKIP_ARG:
+    cc += _pcre_OP_lengths[op] + cc[1];
+    break;
+
+    case OP_THEN_ARG:
+    cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
+    break;
+
      /* For the record, these are the opcodes that are matched by "default":
      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
      OP_THEN. */
@@ -431,25 +444,121 @@ for (;;)
  *      Set a bit and maybe its alternate case    *
  *************************************************/
  
-/* Given a character, set its bit in the table, and also the bit for the other
-version of a letter if we are caseless.
+/* Given a character, set its first byte's bit in the table, and also the
+corresponding bit for the other version of a letter if we are caseless. In
+UTF-8 mode, for characters greater than 127, we can only do the caseless thing
+when Unicode property support is available.
  
  Arguments:
    start_bits    points to the bit map
-  c             is the character
+  p             points to the character
    caseless      the caseless flag
    cd            the block with char table pointers
+  utf8          TRUE for UTF-8 mode
  
-Returns:        nothing
+Returns:        pointer after the character
+*/
+
+static const uschar *
+set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
+  compile_data *cd, BOOL utf8)
+{
+unsigned int c = *p;
+
+SET_BIT(c);
+
+#ifdef SUPPORT_UTF8
+if (utf8 && c > 127)
+  {
+  GETCHARINC(c, p);
+#ifdef SUPPORT_UCP
+  if (caseless)
+    {
+    uschar buff[8];
+    c = UCD_OTHERCASE(c);
+    (void)_pcre_ord2utf8(c, buff);
+    SET_BIT(buff[0]);
+    }
+#endif
+  return p;
+  }
+#endif
+
+/* Not UTF-8 mode, or character is less than 128. */
+
+if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
+return p + 1;
+}
+
+
+
+/*************************************************
+*     Set bits for a positive character type     *
+*************************************************/
+
+/* This function sets starting bits for a character type. In UTF-8 mode, we can
+only do a direct setting for bytes less than 128, as otherwise there can be
+confusion with bytes in the middle of UTF-8 characters. In a "traditional"
+environment, the tables will only recognize ASCII characters anyway, but in at
+least one Windows environment, some higher-byte bits were set in the tables.
+So we deal with that case by considering the UTF-8 encoding.
+
+Arguments:
+  start_bits     the starting bitmap
+  cbit_type      the type of character wanted (offset into cd->cbits)
+  table_limit    32 for non-UTF-8; 16 for UTF-8
+  cd             the block with char table pointers
+
+Returns:         nothing
  */
  
  static void
-set_table_bit(uschar *start_bits, unsigned int c, BOOL caseless,
+set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
    compile_data *cd)
  {
-start_bits[c/8] |= (1 << (c&7));
-if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
-  start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
+register int c;
+for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
+if (table_limit == 32) return;
+for (c = 128; c < 256; c++)
+  {
+  if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
+    {
+    uschar buff[8];
+    (void)_pcre_ord2utf8(c, buff);
+    SET_BIT(buff[0]);
+    }
+  }
+}
+
+
+/*************************************************
+*     Set bits for a negative character type     *
+*************************************************/
+
+/* This function sets starting bits for a negative character type such as \D.
+In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
+otherwise there can be confusion with bytes in the middle of UTF-8 characters.
+Unlike in the positive case, where we can set appropriate starting bits for
+specific high-valued UTF-8 characters, in this case we have to set the bits for
+all high-valued characters. The lowest is 0xc2, but we overkill by starting at
+0xc0 (192) for simplicity.
+
+Arguments:
+  start_bits     the starting bitmap
+  cbit_type      the type of character wanted (offset into cd->cbits)
+  table_limit    32 for non-UTF-8; 16 for UTF-8
+  cd             the block with char table pointers
+
+Returns:         nothing
+*/
+
+static void
+set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
+  compile_data *cd)
+{
+register int c;
+for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
+if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
  }
  
  
@@ -484,6 +593,7 @@ set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
  {
  register int c;
  int yield = SSB_DONE;
+int table_limit = utf8? 16:32;
  
  #if 0
  /* ========================================================================= */
@@ -607,12 +717,7 @@ do
        case OP_QUERY:
        case OP_MINQUERY:
        case OP_POSQUERY:
-      set_table_bit(start_bits, tcode[1], caseless, cd);
-      tcode += 2;
-#ifdef SUPPORT_UTF8
-      if (utf8 && tcode[-1] >= 0xc0)
-        tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
-#endif
+      tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
        break;
  
        /* Single-char upto sets the bit and tries the next */
@@ -620,12 +725,7 @@ do
        case OP_UPTO:
        case OP_MINUPTO:
        case OP_POSUPTO:
-      set_table_bit(start_bits, tcode[3], caseless, cd);
-      tcode += 4;
-#ifdef SUPPORT_UTF8
-      if (utf8 && tcode[-1] >= 0xc0)
-        tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
-#endif
+      tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
        break;
  
        /* At least one single char sets the bit and stops */
@@ -638,59 +738,86 @@ do
        case OP_PLUS:
        case OP_MINPLUS:
        case OP_POSPLUS:
-      set_table_bit(start_bits, tcode[1], caseless, cd);
+      (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
+      try_next = FALSE;
+      break;
+
+      /* Special spacing and line-terminating items. These recognize specific
+      lists of characters. The difference between VSPACE and ANYNL is that the
+      latter can match the two-character CRLF sequence, but that is not
+      relevant for finding the first character, so their code here is
+      identical. */
+
+      case OP_HSPACE:
+      SET_BIT(0x09);
+      SET_BIT(0x20);
+      if (utf8)
+        {
+        SET_BIT(0xC2);  /* For U+00A0 */
+        SET_BIT(0xE1);  /* For U+1680, U+180E */
+        SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
+        SET_BIT(0xE3);  /* For U+3000 */
+        }
+      else SET_BIT(0xA0);
+      try_next = FALSE;
+      break;
+
+      case OP_ANYNL:
+      case OP_VSPACE:
+      SET_BIT(0x0A);
+      SET_BIT(0x0B);
+      SET_BIT(0x0C);
+      SET_BIT(0x0D);
+      if (utf8)
+        {
+        SET_BIT(0xC2);  /* For U+0085 */
+        SET_BIT(0xE2);  /* For U+2028, U+2029 */
+        }
+      else SET_BIT(0x85);
        try_next = FALSE;
        break;
  
-      /* Single character type sets the bits and stops */
+      /* Single character types set the bits and stop. Note that if PCRE_UCP
+      is set, we do not see these op codes because \d etc are converted to
+      properties. Therefore, these apply in the case when only characters less
+      than 256 are recognized to match the types. */
  
        case OP_NOT_DIGIT:
-      for (c = 0; c < 32; c++)
-        start_bits[c] |= ~cd->cbits[c+cbit_digit];
+      set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
        try_next = FALSE;
        break;
  
        case OP_DIGIT:
-      for (c = 0; c < 32; c++)
-        start_bits[c] |= cd->cbits[c+cbit_digit];
+      set_type_bits(start_bits, cbit_digit, table_limit, cd);
        try_next = FALSE;
        break;
  
        /* The cbit_space table has vertical tab as whitespace; we have to
-      discard it. */
+      ensure it is set as not whitespace. */
  
        case OP_NOT_WHITESPACE:
-      for (c = 0; c < 32; c++)
-        {
-        int d = cd->cbits[c+cbit_space];
-        if (c == 1) d &= ~0x08;
-        start_bits[c] |= ~d;
-        }
+      set_nottype_bits(start_bits, cbit_space, table_limit, cd);
+      start_bits[1] |= 0x08;
        try_next = FALSE;
        break;
  
        /* The cbit_space table has vertical tab as whitespace; we have to
-      discard it. */
+      avoid setting it from the table. */
  
        case OP_WHITESPACE:
-      for (c = 0; c < 32; c++)
-        {
-        int d = cd->cbits[c+cbit_space];
-        if (c == 1) d &= ~0x08;
-        start_bits[c] |= d;
-        }
+      c = start_bits[1];    /* Save in case it was already set */
+      set_type_bits(start_bits, cbit_space, table_limit, cd);
+      start_bits[1] = (start_bits[1] & ~0x08) | c;
        try_next = FALSE;
        break;
  
        case OP_NOT_WORDCHAR:
-      for (c = 0; c < 32; c++)
-        start_bits[c] |= ~cd->cbits[c+cbit_word];
+      set_nottype_bits(start_bits, cbit_word, table_limit, cd);
        try_next = FALSE;
        break;
  
        case OP_WORDCHAR:
-      for (c = 0; c < 32; c++)
-        start_bits[c] |= cd->cbits[c+cbit_word];
+      set_type_bits(start_bits, cbit_word, table_limit, cd);
        try_next = FALSE;
        break;
  
@@ -699,6 +826,7 @@ do
  
        case OP_TYPEPLUS:
        case OP_TYPEMINPLUS:
+      case OP_TYPEPOSPLUS:
        tcode++;
        break;
  
@@ -722,52 +850,69 @@ do
        case OP_TYPEPOSQUERY:
        switch(tcode[1])
          {
+        default:
          case OP_ANY:
          case OP_ALLANY:
          return SSB_FAIL;
  
+        case OP_HSPACE:
+        SET_BIT(0x09);
+        SET_BIT(0x20);
+        if (utf8)
+          {
+          SET_BIT(0xC2);  /* For U+00A0 */
+          SET_BIT(0xE1);  /* For U+1680, U+180E */
+          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
+          SET_BIT(0xE3);  /* For U+3000 */
+          }
+        else SET_BIT(0xA0);
+        break;
+
+        case OP_ANYNL:
+        case OP_VSPACE:
+        SET_BIT(0x0A);
+        SET_BIT(0x0B);
+        SET_BIT(0x0C);
+        SET_BIT(0x0D);
+        if (utf8)
+          {
+          SET_BIT(0xC2);  /* For U+0085 */
+          SET_BIT(0xE2);  /* For U+2028, U+2029 */
+          }
+        else SET_BIT(0x85);
+        break;
+
          case OP_NOT_DIGIT:
-        for (c = 0; c < 32; c++)
-          start_bits[c] |= ~cd->cbits[c+cbit_digit];
+        set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
          break;
  
          case OP_DIGIT:
-        for (c = 0; c < 32; c++)
-          start_bits[c] |= cd->cbits[c+cbit_digit];
+        set_type_bits(start_bits, cbit_digit, table_limit, cd);
          break;
  
          /* The cbit_space table has vertical tab as whitespace; we have to
-        discard it. */
+        ensure it gets set as not whitespace. */
  
          case OP_NOT_WHITESPACE:
-        for (c = 0; c < 32; c++)
-          {
-          int d = cd->cbits[c+cbit_space];
-          if (c == 1) d &= ~0x08;
-          start_bits[c] |= ~d;
-          }
+        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
+        start_bits[1] |= 0x08;
          break;
  
          /* The cbit_space table has vertical tab as whitespace; we have to
-        discard it. */
+        avoid setting it. */
  
          case OP_WHITESPACE:
-        for (c = 0; c < 32; c++)
-          {
-          int d = cd->cbits[c+cbit_space];
-          if (c == 1) d &= ~0x08;
-          start_bits[c] |= d;
-          }
+        c = start_bits[1];    /* Save in case it was already set */
+        set_type_bits(start_bits, cbit_space, table_limit, cd);
+        start_bits[1] = (start_bits[1] & ~0x08) | c;
          break;
  
          case OP_NOT_WORDCHAR:
-        for (c = 0; c < 32; c++)
-          start_bits[c] |= ~cd->cbits[c+cbit_word];
+        set_nottype_bits(start_bits, cbit_word, table_limit, cd);
          break;
  
          case OP_WORDCHAR:
-        for (c = 0; c < 32; c++)
-          start_bits[c] |= cd->cbits[c+cbit_word];
+        set_type_bits(start_bits, cbit_word, table_limit, cd);
          break;
          }
  
diff --git a/glib/pcre/pcre_tables.c b/glib/pcre/pcre_tables.c

index b7f7ba5d15908a06782c36da6ce05fe643f731eb..8cc4eb3093bafa07674e215dfabe5bf6d1e33872 100644 (file)
--- a/glib/pcre/pcre_tables.c
+++ b/glib/pcre/pcre_tables.c
@@ -123,8 +123,10 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
  #define STRING_Avestan0 STR_A STR_v STR_e STR_s STR_t STR_a STR_n "\0"
  #define STRING_Balinese0 STR_B STR_a STR_l STR_i STR_n STR_e STR_s STR_e "\0"
  #define STRING_Bamum0 STR_B STR_a STR_m STR_u STR_m "\0"
+#define STRING_Batak0 STR_B STR_a STR_t STR_a STR_k "\0"
  #define STRING_Bengali0 STR_B STR_e STR_n STR_g STR_a STR_l STR_i "\0"
  #define STRING_Bopomofo0 STR_B STR_o STR_p STR_o STR_m STR_o STR_f STR_o "\0"
+#define STRING_Brahmi0 STR_B STR_r STR_a STR_h STR_m STR_i "\0"
  #define STRING_Braille0 STR_B STR_r STR_a STR_i STR_l STR_l STR_e "\0"
  #define STRING_Buginese0 STR_B STR_u STR_g STR_i STR_n STR_e STR_s STR_e "\0"
  #define STRING_Buhid0 STR_B STR_u STR_h STR_i STR_d "\0"
@@ -184,6 +186,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
  #define STRING_Lu0 STR_L STR_u "\0"
  #define STRING_Lycian0 STR_L STR_y STR_c STR_i STR_a STR_n "\0"
  #define STRING_Lydian0 STR_L STR_y STR_d STR_i STR_a STR_n "\0"
+#define STRING_Mandaic0 STR_M STR_a STR_n STR_d STR_a STR_i STR_c "\0"
  #define STRING_M0 STR_M "\0"
  #define STRING_Malayalam0 STR_M STR_a STR_l STR_a STR_y STR_a STR_l STR_a STR_m "\0"
  #define STRING_Mc0 STR_M STR_c "\0"
@@ -243,6 +246,10 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
  #define STRING_Tifinagh0 STR_T STR_i STR_f STR_i STR_n STR_a STR_g STR_h "\0"
  #define STRING_Ugaritic0 STR_U STR_g STR_a STR_r STR_i STR_t STR_i STR_c "\0"
  #define STRING_Vai0 STR_V STR_a STR_i "\0"
+#define STRING_Xan0 STR_X STR_a STR_n "\0"
+#define STRING_Xps0 STR_X STR_p STR_s "\0"
+#define STRING_Xsp0 STR_X STR_s STR_p "\0"
+#define STRING_Xwd0 STR_X STR_w STR_d "\0"
  #define STRING_Yi0 STR_Y STR_i "\0"
  #define STRING_Z0 STR_Z "\0"
  #define STRING_Zl0 STR_Z STR_l "\0"
@@ -256,8 +263,10 @@ const char _pcre_utt_names[] =
    STRING_Avestan0
    STRING_Balinese0
    STRING_Bamum0
+  STRING_Batak0
    STRING_Bengali0
    STRING_Bopomofo0
+  STRING_Brahmi0
    STRING_Braille0
    STRING_Buginese0
    STRING_Buhid0
@@ -319,6 +328,7 @@ const char _pcre_utt_names[] =
    STRING_Lydian0
    STRING_M0
    STRING_Malayalam0
+  STRING_Mandaic0
    STRING_Mc0
    STRING_Me0
    STRING_Meetei_Mayek0
@@ -376,6 +386,10 @@ const char _pcre_utt_names[] =
    STRING_Tifinagh0
    STRING_Ugaritic0
    STRING_Vai0
+  STRING_Xan0
+  STRING_Xps0
+  STRING_Xsp0
+  STRING_Xwd0
    STRING_Yi0
    STRING_Z0
    STRING_Zl0
@@ -389,131 +403,138 @@ const ucp_type_table _pcre_utt[] = {
    {  20, PT_SC, ucp_Avestan },
    {  28, PT_SC, ucp_Balinese },
    {  37, PT_SC, ucp_Bamum },
-  {  43, PT_SC, ucp_Bengali },
-  {  51, PT_SC, ucp_Bopomofo },
-  {  60, PT_SC, ucp_Braille },
-  {  68, PT_SC, ucp_Buginese },
-  {  77, PT_SC, ucp_Buhid },
-  {  83, PT_GC, ucp_C },
-  {  85, PT_SC, ucp_Canadian_Aboriginal },
-  { 105, PT_SC, ucp_Carian },
-  { 112, PT_PC, ucp_Cc },
-  { 115, PT_PC, ucp_Cf },
-  { 118, PT_SC, ucp_Cham },
-  { 123, PT_SC, ucp_Cherokee },
-  { 132, PT_PC, ucp_Cn },
-  { 135, PT_PC, ucp_Co },
-  { 138, PT_SC, ucp_Common },
-  { 145, PT_SC, ucp_Coptic },
-  { 152, PT_PC, ucp_Cs },
-  { 155, PT_SC, ucp_Cuneiform },
-  { 165, PT_SC, ucp_Cypriot },
-  { 173, PT_SC, ucp_Cyrillic },
-  { 182, PT_SC, ucp_Deseret },
-  { 190, PT_SC, ucp_Devanagari },
-  { 201, PT_SC, ucp_Egyptian_Hieroglyphs },
-  { 222, PT_SC, ucp_Ethiopic },
-  { 231, PT_SC, ucp_Georgian },
-  { 240, PT_SC, ucp_Glagolitic },
-  { 251, PT_SC, ucp_Gothic },
-  { 258, PT_SC, ucp_Greek },
-  { 264, PT_SC, ucp_Gujarati },
-  { 273, PT_SC, ucp_Gurmukhi },
-  { 282, PT_SC, ucp_Han },
-  { 286, PT_SC, ucp_Hangul },
-  { 293, PT_SC, ucp_Hanunoo },
-  { 301, PT_SC, ucp_Hebrew },
-  { 308, PT_SC, ucp_Hiragana },
-  { 317, PT_SC, ucp_Imperial_Aramaic },
-  { 334, PT_SC, ucp_Inherited },
-  { 344, PT_SC, ucp_Inscriptional_Pahlavi },
-  { 366, PT_SC, ucp_Inscriptional_Parthian },
-  { 389, PT_SC, ucp_Javanese },
-  { 398, PT_SC, ucp_Kaithi },
-  { 405, PT_SC, ucp_Kannada },
-  { 413, PT_SC, ucp_Katakana },
-  { 422, PT_SC, ucp_Kayah_Li },
-  { 431, PT_SC, ucp_Kharoshthi },
-  { 442, PT_SC, ucp_Khmer },
-  { 448, PT_GC, ucp_L },
-  { 450, PT_LAMP, 0 },
-  { 453, PT_SC, ucp_Lao },
-  { 457, PT_SC, ucp_Latin },
-  { 463, PT_SC, ucp_Lepcha },
-  { 470, PT_SC, ucp_Limbu },
-  { 476, PT_SC, ucp_Linear_B },
-  { 485, PT_SC, ucp_Lisu },
-  { 490, PT_PC, ucp_Ll },
-  { 493, PT_PC, ucp_Lm },
-  { 496, PT_PC, ucp_Lo },
-  { 499, PT_PC, ucp_Lt },
-  { 502, PT_PC, ucp_Lu },
-  { 505, PT_SC, ucp_Lycian },
-  { 512, PT_SC, ucp_Lydian },
-  { 519, PT_GC, ucp_M },
-  { 521, PT_SC, ucp_Malayalam },
-  { 531, PT_PC, ucp_Mc },
-  { 534, PT_PC, ucp_Me },
-  { 537, PT_SC, ucp_Meetei_Mayek },
-  { 550, PT_PC, ucp_Mn },
-  { 553, PT_SC, ucp_Mongolian },
-  { 563, PT_SC, ucp_Myanmar },
-  { 571, PT_GC, ucp_N },
-  { 573, PT_PC, ucp_Nd },
-  { 576, PT_SC, ucp_New_Tai_Lue },
-  { 588, PT_SC, ucp_Nko },
-  { 592, PT_PC, ucp_Nl },
-  { 595, PT_PC, ucp_No },
-  { 598, PT_SC, ucp_Ogham },
-  { 604, PT_SC, ucp_Ol_Chiki },
-  { 613, PT_SC, ucp_Old_Italic },
-  { 624, PT_SC, ucp_Old_Persian },
-  { 636, PT_SC, ucp_Old_South_Arabian },
-  { 654, PT_SC, ucp_Old_Turkic },
-  { 665, PT_SC, ucp_Oriya },
-  { 671, PT_SC, ucp_Osmanya },
-  { 679, PT_GC, ucp_P },
-  { 681, PT_PC, ucp_Pc },
-  { 684, PT_PC, ucp_Pd },
-  { 687, PT_PC, ucp_Pe },
-  { 690, PT_PC, ucp_Pf },
-  { 693, PT_SC, ucp_Phags_Pa },
-  { 702, PT_SC, ucp_Phoenician },
-  { 713, PT_PC, ucp_Pi },
-  { 716, PT_PC, ucp_Po },
-  { 719, PT_PC, ucp_Ps },
-  { 722, PT_SC, ucp_Rejang },
-  { 729, PT_SC, ucp_Runic },
-  { 735, PT_GC, ucp_S },
-  { 737, PT_SC, ucp_Samaritan },
-  { 747, PT_SC, ucp_Saurashtra },
-  { 758, PT_PC, ucp_Sc },
-  { 761, PT_SC, ucp_Shavian },
-  { 769, PT_SC, ucp_Sinhala },
-  { 777, PT_PC, ucp_Sk },
-  { 780, PT_PC, ucp_Sm },
-  { 783, PT_PC, ucp_So },
-  { 786, PT_SC, ucp_Sundanese },
-  { 796, PT_SC, ucp_Syloti_Nagri },
-  { 809, PT_SC, ucp_Syriac },
-  { 816, PT_SC, ucp_Tagalog },
-  { 824, PT_SC, ucp_Tagbanwa },
-  { 833, PT_SC, ucp_Tai_Le },
-  { 840, PT_SC, ucp_Tai_Tham },
-  { 849, PT_SC, ucp_Tai_Viet },
-  { 858, PT_SC, ucp_Tamil },
-  { 864, PT_SC, ucp_Telugu },
-  { 871, PT_SC, ucp_Thaana },
-  { 878, PT_SC, ucp_Thai },
-  { 883, PT_SC, ucp_Tibetan },
-  { 891, PT_SC, ucp_Tifinagh },
-  { 900, PT_SC, ucp_Ugaritic },
-  { 909, PT_SC, ucp_Vai },
-  { 913, PT_SC, ucp_Yi },
-  { 916, PT_GC, ucp_Z },
-  { 918, PT_PC, ucp_Zl },
-  { 921, PT_PC, ucp_Zp },
-  { 924, PT_PC, ucp_Zs }
+  {  43, PT_SC, ucp_Batak },
+  {  49, PT_SC, ucp_Bengali },
+  {  57, PT_SC, ucp_Bopomofo },
+  {  66, PT_SC, ucp_Brahmi },
+  {  73, PT_SC, ucp_Braille },
+  {  81, PT_SC, ucp_Buginese },
+  {  90, PT_SC, ucp_Buhid },
+  {  96, PT_GC, ucp_C },
+  {  98, PT_SC, ucp_Canadian_Aboriginal },
+  { 118, PT_SC, ucp_Carian },
+  { 125, PT_PC, ucp_Cc },
+  { 128, PT_PC, ucp_Cf },
+  { 131, PT_SC, ucp_Cham },
+  { 136, PT_SC, ucp_Cherokee },
+  { 145, PT_PC, ucp_Cn },
+  { 148, PT_PC, ucp_Co },
+  { 151, PT_SC, ucp_Common },
+  { 158, PT_SC, ucp_Coptic },
+  { 165, PT_PC, ucp_Cs },
+  { 168, PT_SC, ucp_Cuneiform },
+  { 178, PT_SC, ucp_Cypriot },
+  { 186, PT_SC, ucp_Cyrillic },
+  { 195, PT_SC, ucp_Deseret },
+  { 203, PT_SC, ucp_Devanagari },
+  { 214, PT_SC, ucp_Egyptian_Hieroglyphs },
+  { 235, PT_SC, ucp_Ethiopic },
+  { 244, PT_SC, ucp_Georgian },
+  { 253, PT_SC, ucp_Glagolitic },
+  { 264, PT_SC, ucp_Gothic },
+  { 271, PT_SC, ucp_Greek },
+  { 277, PT_SC, ucp_Gujarati },
+  { 286, PT_SC, ucp_Gurmukhi },
+  { 295, PT_SC, ucp_Han },
+  { 299, PT_SC, ucp_Hangul },
+  { 306, PT_SC, ucp_Hanunoo },
+  { 314, PT_SC, ucp_Hebrew },
+  { 321, PT_SC, ucp_Hiragana },
+  { 330, PT_SC, ucp_Imperial_Aramaic },
+  { 347, PT_SC, ucp_Inherited },
+  { 357, PT_SC, ucp_Inscriptional_Pahlavi },
+  { 379, PT_SC, ucp_Inscriptional_Parthian },
+  { 402, PT_SC, ucp_Javanese },
+  { 411, PT_SC, ucp_Kaithi },
+  { 418, PT_SC, ucp_Kannada },
+  { 426, PT_SC, ucp_Katakana },
+  { 435, PT_SC, ucp_Kayah_Li },
+  { 444, PT_SC, ucp_Kharoshthi },
+  { 455, PT_SC, ucp_Khmer },
+  { 461, PT_GC, ucp_L },
+  { 463, PT_LAMP, 0 },
+  { 466, PT_SC, ucp_Lao },
+  { 470, PT_SC, ucp_Latin },
+  { 476, PT_SC, ucp_Lepcha },
+  { 483, PT_SC, ucp_Limbu },
+  { 489, PT_SC, ucp_Linear_B },
+  { 498, PT_SC, ucp_Lisu },
+  { 503, PT_PC, ucp_Ll },
+  { 506, PT_PC, ucp_Lm },
+  { 509, PT_PC, ucp_Lo },
+  { 512, PT_PC, ucp_Lt },
+  { 515, PT_PC, ucp_Lu },
+  { 518, PT_SC, ucp_Lycian },
+  { 525, PT_SC, ucp_Lydian },
+  { 532, PT_GC, ucp_M },
+  { 534, PT_SC, ucp_Malayalam },
+  { 544, PT_SC, ucp_Mandaic },
+  { 552, PT_PC, ucp_Mc },
+  { 555, PT_PC, ucp_Me },
+  { 558, PT_SC, ucp_Meetei_Mayek },
+  { 571, PT_PC, ucp_Mn },
+  { 574, PT_SC, ucp_Mongolian },
+  { 584, PT_SC, ucp_Myanmar },
+  { 592, PT_GC, ucp_N },
+  { 594, PT_PC, ucp_Nd },
+  { 597, PT_SC, ucp_New_Tai_Lue },
+  { 609, PT_SC, ucp_Nko },
+  { 613, PT_PC, ucp_Nl },
+  { 616, PT_PC, ucp_No },
+  { 619, PT_SC, ucp_Ogham },
+  { 625, PT_SC, ucp_Ol_Chiki },
+  { 634, PT_SC, ucp_Old_Italic },
+  { 645, PT_SC, ucp_Old_Persian },
+  { 657, PT_SC, ucp_Old_South_Arabian },
+  { 675, PT_SC, ucp_Old_Turkic },
+  { 686, PT_SC, ucp_Oriya },
+  { 692, PT_SC, ucp_Osmanya },
+  { 700, PT_GC, ucp_P },
+  { 702, PT_PC, ucp_Pc },
+  { 705, PT_PC, ucp_Pd },
+  { 708, PT_PC, ucp_Pe },
+  { 711, PT_PC, ucp_Pf },
+  { 714, PT_SC, ucp_Phags_Pa },
+  { 723, PT_SC, ucp_Phoenician },
+  { 734, PT_PC, ucp_Pi },
+  { 737, PT_PC, ucp_Po },
+  { 740, PT_PC, ucp_Ps },
+  { 743, PT_SC, ucp_Rejang },
+  { 750, PT_SC, ucp_Runic },
+  { 756, PT_GC, ucp_S },
+  { 758, PT_SC, ucp_Samaritan },
+  { 768, PT_SC, ucp_Saurashtra },
+  { 779, PT_PC, ucp_Sc },
+  { 782, PT_SC, ucp_Shavian },
+  { 790, PT_SC, ucp_Sinhala },
+  { 798, PT_PC, ucp_Sk },
+  { 801, PT_PC, ucp_Sm },
+  { 804, PT_PC, ucp_So },
+  { 807, PT_SC, ucp_Sundanese },
+  { 817, PT_SC, ucp_Syloti_Nagri },
+  { 830, PT_SC, ucp_Syriac },
+  { 837, PT_SC, ucp_Tagalog },
+  { 845, PT_SC, ucp_Tagbanwa },
+  { 854, PT_SC, ucp_Tai_Le },
+  { 861, PT_SC, ucp_Tai_Tham },
+  { 870, PT_SC, ucp_Tai_Viet },
+  { 879, PT_SC, ucp_Tamil },
+  { 885, PT_SC, ucp_Telugu },
+  { 892, PT_SC, ucp_Thaana },
+  { 899, PT_SC, ucp_Thai },
+  { 904, PT_SC, ucp_Tibetan },
+  { 912, PT_SC, ucp_Tifinagh },
+  { 921, PT_SC, ucp_Ugaritic },
+  { 930, PT_SC, ucp_Vai },
+  { 934, PT_ALNUM, 0 },
+  { 938, PT_PXSPACE, 0 },
+  { 942, PT_SPACE, 0 },
+  { 946, PT_WORD, 0 },
+  { 950, PT_SC, ucp_Yi },
+  { 953, PT_GC, ucp_Z },
+  { 955, PT_PC, ucp_Zl },
+  { 958, PT_PC, ucp_Zp },
+  { 961, PT_PC, ucp_Zs }
  };
  
  const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table);
diff --git a/glib/pcre/pcre_xclass.c b/glib/pcre/pcre_xclass.c

index c25ecdc75b60b50950eb36739685cc1ce1fb2cb0..5b1b6f4ff6d1ea37dbf545fe81cb08132c32c9c1 100644 (file)
--- a/glib/pcre/pcre_xclass.c
+++ b/glib/pcre/pcre_xclass.c
@@ -6,7 +6,7 @@
  and semantics are as close as possible to those of the Perl 5 language.
  
                         Written by Philip Hazel
-           Copyright (c) 1997-2009 University of Cambridge
+           Copyright (c) 1997-2010 University of Cambridge
  
  -----------------------------------------------------------------------------
  Redistribution and use in source and binary forms, with or without
@@ -104,6 +104,7 @@ while ((t = *data++) != XCL_END)
    else  /* XCL_PROP & XCL_NOTPROP */
      {
      int chartype = UCD_CHARTYPE(c);
+
      switch(*data)
        {
        case PT_ANY:
@@ -111,12 +112,13 @@ while ((t = *data++) != XCL_END)
        break;
  
        case PT_LAMP:
-      if ((chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt) ==
-          (t == XCL_PROP)) return !negated;
+      if ((chartype == ucp_Lu || chartype == ucp_Ll ||
+           chartype == ucp_Lt) == (t == XCL_PROP)) return !negated;
        break;
  
        case PT_GC:
-      if ((data[1] == _pcre_ucp_gentype[chartype]) == (t == XCL_PROP)) return !negated;
+      if ((data[1] == _pcre_ucp_gentype[chartype]) == (t == XCL_PROP))
+        return !negated;
        break;
  
        case PT_PC:
@@ -127,6 +129,33 @@ while ((t = *data++) != XCL_END)
        if ((data[1] == UCD_SCRIPT(c)) == (t == XCL_PROP)) return !negated;
        break;
  
+      case PT_ALNUM:
+      if ((_pcre_ucp_gentype[chartype] == ucp_L ||
+           _pcre_ucp_gentype[chartype] == ucp_N) == (t == XCL_PROP))
+        return !negated;
+      break;
+
+      case PT_SPACE:    /* Perl space */
+      if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
+           c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
+             == (t == XCL_PROP))
+        return !negated;
+      break;
+
+      case PT_PXSPACE:  /* POSIX space */
+      if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
+           c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+           c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP))
+        return !negated;
+      break;
+
+      case PT_WORD:
+      if ((_pcre_ucp_gentype[chartype] == ucp_L ||
+           _pcre_ucp_gentype[chartype] == ucp_N || c == CHAR_UNDERSCORE)
+             == (t == XCL_PROP))
+        return !negated;
+      break;
+
        /* This should never occur, but compilers may mutter if there is no
        default. */
  
diff --git a/glib/pcre/ucp.h b/glib/pcre/ucp.h

index f1b68b0c22c356d4ee6b88a91d0defe110efdc40..dcaa827efa2d2f61ecd59e46be003f044b119ce2 100644 (file)
--- a/glib/pcre/ucp.h
+++ b/glib/pcre/ucp.h
@@ -150,7 +150,10 @@ enum {
    ucp_Old_Turkic = G_UNICODE_SCRIPT_OLD_TURKIC,
    ucp_Samaritan = G_UNICODE_SCRIPT_SAMARITAN,
    ucp_Tai_Tham = G_UNICODE_SCRIPT_TAI_THAM,
-  ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET
+  ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET,
+  ucp_Batak = G_UNICODE_SCRIPT_BATAK,
+  ucp_Brahmi = G_UNICODE_SCRIPT_BRAHMI,
+  ucp_Mandaic = G_UNICODE_SCRIPT_MANDAIC
  };
  
  #endif
author	Matthias Clasen <mclasen@redhat.com>
	Sat, 22 Jan 2011 05:01:54 +0000 (00:01 -0500)
committer	Matthias Clasen <mclasen@redhat.com>
	Sat, 22 Jan 2011 05:01:54 +0000 (00:01 -0500)
glib/pcre/pcre.h		patch \| blob \| history
glib/pcre/pcre_chartables.c		patch \| blob \| history
glib/pcre/pcre_compile.c		patch \| blob \| history
glib/pcre/pcre_dfa_exec.c		patch \| blob \| history
glib/pcre/pcre_exec.c		patch \| blob \| history
glib/pcre/pcre_internal.h		patch \| blob \| history
glib/pcre/pcre_study.c		patch \| blob \| history
glib/pcre/pcre_tables.c		patch \| blob \| history
glib/pcre/pcre_xclass.c		patch \| blob \| history
glib/pcre/ucp.h		patch \| blob \| history