regex: Add JavaScript compat mode

author Christian Persch <chpe@gnome.org>

Thu, 7 Jun 2012 18:12:11 +0000 (20:12 +0200)

committer Christian Persch <chpe@gnome.org>

Mon, 2 Jul 2012 13:59:39 +0000 (15:59 +0200)
author Christian Persch <chpe@gnome.org>
Thu, 7 Jun 2012 18:12:11 +0000 (20:12 +0200)
committer Christian Persch <chpe@gnome.org>
Mon, 2 Jul 2012 13:59:39 +0000 (15:59 +0200)
diff --git a/glib/gregex.c b/glib/gregex.c

index d6ccc88..cadcc50 100644 (file)
--- a/glib/gregex.c
+++ b/glib/gregex.c
@@ -88,6 +88,18 @@
   * unescaped "#" outside a character class is encountered. This indicates
   * a comment that lasts until after the next newline.
   *
+ * When setting the %G_REGEX_JAVASCRIPT_COMPAT flag, pattern syntax and pattern
+ * matching are changed to be compatible with the way that regular expressions
+ * work in JavaScript. More precisely, a lonely ']' character in the pattern
+ * is a syntax error; the '\x' escape only allows 0 to 2 hexadecimal digits, and
+ * you must use the '\u' escape sequence with 4 hex digits to specify a unicode
+ * codepoint instead of '\x' or '\x{....}'. If '\x' or '\u' are not followed by
+ * the specified number of hex digits, they match 'x' and 'u' literally; also
+ * '\U' always matches 'U' instead of being an error in the pattern. Finally,
+ * pattern matching is modified so that back references to an unset subpattern
+ * group produce a match with the empty string instead of an error. See
+ * <ulink url="man:pcreapi(3)">pcreapi(3)</ulink> for more information.
+ *
   * Creating and manipulating the same #GRegex structure from different
   * threads is not a problem as #GRegex does not modify its internal
   * state between creation and destruction, on the other hand #GMatchInfo
@@ -114,7 +126,8 @@
                                G_REGEX_NEWLINE_LF        | \
                                G_REGEX_NEWLINE_CRLF      | \
                                G_REGEX_NEWLINE_ANYCRLF   | \
-                              G_REGEX_BSR_ANYCRLF)
+                              G_REGEX_BSR_ANYCRLF       | \
+                              G_REGEX_JAVASCRIPT_COMPAT)
  
  /* Mask of all GRegexCompileFlags values that are (not) passed trough to PCRE */
  #define G_REGEX_COMPILE_PCRE_MASK (G_REGEX_COMPILE_MASK & ~G_REGEX_COMPILE_NONPCRE_MASK)
@@ -136,20 +149,21 @@
                              G_REGEX_MATCH_BSR_ANY)
  
  /* we rely on these flags having the same values */
-G_STATIC_ASSERT (G_REGEX_CASELESS        == PCRE_CASELESS);
-G_STATIC_ASSERT (G_REGEX_MULTILINE       == PCRE_MULTILINE);
-G_STATIC_ASSERT (G_REGEX_DOTALL          == PCRE_DOTALL);
-G_STATIC_ASSERT (G_REGEX_EXTENDED        == PCRE_EXTENDED);
-G_STATIC_ASSERT (G_REGEX_ANCHORED        == PCRE_ANCHORED);
-G_STATIC_ASSERT (G_REGEX_DOLLAR_ENDONLY  == PCRE_DOLLAR_ENDONLY);
-G_STATIC_ASSERT (G_REGEX_UNGREEDY        == PCRE_UNGREEDY);
-G_STATIC_ASSERT (G_REGEX_NO_AUTO_CAPTURE == PCRE_NO_AUTO_CAPTURE);
-G_STATIC_ASSERT (G_REGEX_DUPNAMES        == PCRE_DUPNAMES);
-G_STATIC_ASSERT (G_REGEX_NEWLINE_CR      == PCRE_NEWLINE_CR);
-G_STATIC_ASSERT (G_REGEX_NEWLINE_LF      == PCRE_NEWLINE_LF);
-G_STATIC_ASSERT (G_REGEX_NEWLINE_CRLF    == PCRE_NEWLINE_CRLF);
-G_STATIC_ASSERT (G_REGEX_NEWLINE_ANYCRLF == PCRE_NEWLINE_ANYCRLF);
-G_STATIC_ASSERT (G_REGEX_BSR_ANYCRLF     == PCRE_BSR_ANYCRLF);
+G_STATIC_ASSERT (G_REGEX_CASELESS          == PCRE_CASELESS);
+G_STATIC_ASSERT (G_REGEX_MULTILINE         == PCRE_MULTILINE);
+G_STATIC_ASSERT (G_REGEX_DOTALL            == PCRE_DOTALL);
+G_STATIC_ASSERT (G_REGEX_EXTENDED          == PCRE_EXTENDED);
+G_STATIC_ASSERT (G_REGEX_ANCHORED          == PCRE_ANCHORED);
+G_STATIC_ASSERT (G_REGEX_DOLLAR_ENDONLY    == PCRE_DOLLAR_ENDONLY);
+G_STATIC_ASSERT (G_REGEX_UNGREEDY          == PCRE_UNGREEDY);
+G_STATIC_ASSERT (G_REGEX_NO_AUTO_CAPTURE   == PCRE_NO_AUTO_CAPTURE);
+G_STATIC_ASSERT (G_REGEX_DUPNAMES          == PCRE_DUPNAMES);
+G_STATIC_ASSERT (G_REGEX_NEWLINE_CR        == PCRE_NEWLINE_CR);
+G_STATIC_ASSERT (G_REGEX_NEWLINE_LF        == PCRE_NEWLINE_LF);
+G_STATIC_ASSERT (G_REGEX_NEWLINE_CRLF      == PCRE_NEWLINE_CRLF);
+G_STATIC_ASSERT (G_REGEX_NEWLINE_ANYCRLF   == PCRE_NEWLINE_ANYCRLF);
+G_STATIC_ASSERT (G_REGEX_BSR_ANYCRLF       == PCRE_BSR_ANYCRLF);
+G_STATIC_ASSERT (G_REGEX_JAVASCRIPT_COMPAT == PCRE_JAVASCRIPT_COMPAT);
  
  G_STATIC_ASSERT (G_REGEX_MATCH_ANCHORED        == PCRE_ANCHORED);
  G_STATIC_ASSERT (G_REGEX_MATCH_NOTBOL          == PCRE_NOTBOL);
@@ -472,6 +486,9 @@ translate_compile_error (gint *errcode, const gchar **errmsg)
      case G_REGEX_ERROR_MISSING_DIGIT:
        *errmsg = _("digit expected after (?+");
        break;
+    case G_REGEX_ERROR_INVALID_DATA_CHARACTER:
+      *errmsg = _("] is an invalid data character in JavaScript compatibility mode");
+      break;
      case G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME:
        *errmsg = _("different names for subpatterns of the same number are not allowed");
        break;
@@ -513,11 +530,6 @@ translate_compile_error (gint *errcode, const gchar **errmsg)
      case 174: /* invalid UTF-16 string */
        /* These errors should not happen as we are using an UTF-8 and UCP-enabled PCRE
         * and we do not check if strings are valid */
-    case 164: /* ] is an invalid data character in JavaScript compatibility mode */
-      /* This should not happen as we don't use PCRE_JAVASCRIPT_COMPAT */
-      g_warning ("%s", *errmsg);
-      *errcode = G_REGEX_ERROR_COMPILE;
-      break;
      case 170: /* internal error: unknown opcode in find_fixedlength() */
        *errcode = G_REGEX_ERROR_INTERNAL;
        break;
diff --git a/glib/gregex.h b/glib/gregex.h

index 8705230..91852bf 100644 (file)
--- a/glib/gregex.h
+++ b/glib/gregex.h
@@ -116,6 +116,8 @@ G_BEGIN_DECLS
   *     control verb. Since: 2.34
   * @G_REGEX_ERROR_NUMBER_TOO_BIG: number is too big in escape sequence. Since: 2.34
   * @G_REGEX_ERROR_MISSING_SUBPATTERN_NAME: Missing subpattern name. Since: 2.34
+ * @G_REGEX_ERROR_INVALID_DATA_CHARACTER: In JavaScript compatibility mode,
+ *     "]" is an invalid data character. Since: 2.34
   * @G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME: different names for subpatterns of the 
   *     same number are not allowed. Since: 2.34
   * @G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED: the backtracing control
@@ -185,6 +187,7 @@ typedef enum
    G_REGEX_ERROR_NUMBER_TOO_BIG = 161,
    G_REGEX_ERROR_MISSING_SUBPATTERN_NAME = 162,
    G_REGEX_ERROR_MISSING_DIGIT = 163,
+  G_REGEX_ERROR_INVALID_DATA_CHARACTER = 164,
    G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME = 165,
    G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED = 166,
    G_REGEX_ERROR_INVALID_CONTROL_CHAR = 168,
@@ -299,7 +302,8 @@ typedef enum
    G_REGEX_NEWLINE_LF        = 1 << 21,
    G_REGEX_NEWLINE_CRLF      = G_REGEX_NEWLINE_CR | G_REGEX_NEWLINE_LF,
    G_REGEX_NEWLINE_ANYCRLF   = G_REGEX_NEWLINE_CR | 1 << 22,
-  G_REGEX_BSR_ANYCRLF       = 1 << 23
+  G_REGEX_BSR_ANYCRLF       = 1 << 23,
+  G_REGEX_JAVASCRIPT_COMPAT = 1 << 25
  } GRegexCompileFlags;
  
  /**
@@ -355,6 +359,8 @@ typedef enum
   *     single characters U+000B LINE TABULATION, U+000C FORM FEED (FF),
   *     U+0085 NEXT LINE (NEL), U+2028 LINE SEPARATOR and
   *     U+2029 PARAGRAPH SEPARATOR. Since: 2.34
+ * @G_REGEX_JAVASCRIPT_COMPAT: Changes behaviour so that it is compatible with
+ *     JavaScript rather than PCRE. Since: 2.34
   *
   * Flags specifying match-time options.
   *
diff --git a/glib/tests/regex.c b/glib/tests/regex.c

index 6e97fda..f08db88 100644 (file)
--- a/glib/tests/regex.c
+++ b/glib/tests/regex.c
@@ -2149,6 +2149,7 @@ main (int argc, char *argv[])
    TEST_NEW_FAIL ("(?i:A{1,}\\6666666666)", 0, G_REGEX_ERROR_NUMBER_TOO_BIG);
    TEST_NEW_FAIL ("(?<a>)(?&)", 0, G_REGEX_ERROR_MISSING_SUBPATTERN_NAME);
    TEST_NEW_FAIL ("(?+-a)", 0, G_REGEX_ERROR_MISSING_DIGIT);
+  TEST_NEW_FAIL ("TA]", G_REGEX_JAVASCRIPT_COMPAT, G_REGEX_ERROR_INVALID_DATA_CHARACTER);
    TEST_NEW_FAIL ("(?|(?<a>A)|(?<b>B))", 0, G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME);
    TEST_NEW_FAIL ("a(*MARK)b", 0, G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED);
    TEST_NEW_FAIL ("^\\c€", 0, G_REGEX_ERROR_INVALID_CONTROL_CHAR);
author	Christian Persch <chpe@gnome.org>
	Thu, 7 Jun 2012 18:12:11 +0000 (20:12 +0200)
committer	Christian Persch <chpe@gnome.org>
	Mon, 2 Jul 2012 13:59:39 +0000 (15:59 +0200)
glib/gregex.c		patch \| blob \| history
glib/gregex.h		patch \| blob \| history
glib/tests/regex.c		patch \| blob \| history