From: Christian Persch Date: Thu, 7 Jun 2012 18:12:11 +0000 (+0200) Subject: regex: Add JavaScript compat mode X-Git-Tag: 2.33.4~58 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0a2967030af2a5cce9fd6ae916a535f26239bcd3;p=platform%2Fupstream%2Fglib.git regex: Add JavaScript compat mode Since PCRE 7.7, there's a flag that changes the behaviour to be more JavaScript compatible. Since it's no effort to expose it, just do so. --- diff --git a/glib/gregex.c b/glib/gregex.c index d6ccc88..cadcc50 100644 --- a/glib/gregex.c +++ b/glib/gregex.c @@ -88,6 +88,18 @@ * unescaped "#" outside a character class is encountered. This indicates * a comment that lasts until after the next newline. * + * When setting the %G_REGEX_JAVASCRIPT_COMPAT flag, pattern syntax and pattern + * matching are changed to be compatible with the way that regular expressions + * work in JavaScript. More precisely, a lonely ']' character in the pattern + * is a syntax error; the '\x' escape only allows 0 to 2 hexadecimal digits, and + * you must use the '\u' escape sequence with 4 hex digits to specify a unicode + * codepoint instead of '\x' or '\x{....}'. If '\x' or '\u' are not followed by + * the specified number of hex digits, they match 'x' and 'u' literally; also + * '\U' always matches 'U' instead of being an error in the pattern. Finally, + * pattern matching is modified so that back references to an unset subpattern + * group produce a match with the empty string instead of an error. See + * man:pcreapi(3) for more information. 
+ * * Creating and manipulating the same #GRegex structure from different * threads is not a problem as #GRegex does not modify its internal * state between creation and destruction, on the other hand #GMatchInfo @@ -114,7 +126,8 @@ G_REGEX_NEWLINE_LF | \ G_REGEX_NEWLINE_CRLF | \ G_REGEX_NEWLINE_ANYCRLF | \ - G_REGEX_BSR_ANYCRLF) + G_REGEX_BSR_ANYCRLF | \ + G_REGEX_JAVASCRIPT_COMPAT) /* Mask of all GRegexCompileFlags values that are (not) passed trough to PCRE */ #define G_REGEX_COMPILE_PCRE_MASK (G_REGEX_COMPILE_MASK & ~G_REGEX_COMPILE_NONPCRE_MASK) @@ -136,20 +149,21 @@ G_REGEX_MATCH_BSR_ANY) /* we rely on these flags having the same values */ -G_STATIC_ASSERT (G_REGEX_CASELESS == PCRE_CASELESS); -G_STATIC_ASSERT (G_REGEX_MULTILINE == PCRE_MULTILINE); -G_STATIC_ASSERT (G_REGEX_DOTALL == PCRE_DOTALL); -G_STATIC_ASSERT (G_REGEX_EXTENDED == PCRE_EXTENDED); -G_STATIC_ASSERT (G_REGEX_ANCHORED == PCRE_ANCHORED); -G_STATIC_ASSERT (G_REGEX_DOLLAR_ENDONLY == PCRE_DOLLAR_ENDONLY); -G_STATIC_ASSERT (G_REGEX_UNGREEDY == PCRE_UNGREEDY); -G_STATIC_ASSERT (G_REGEX_NO_AUTO_CAPTURE == PCRE_NO_AUTO_CAPTURE); -G_STATIC_ASSERT (G_REGEX_DUPNAMES == PCRE_DUPNAMES); -G_STATIC_ASSERT (G_REGEX_NEWLINE_CR == PCRE_NEWLINE_CR); -G_STATIC_ASSERT (G_REGEX_NEWLINE_LF == PCRE_NEWLINE_LF); -G_STATIC_ASSERT (G_REGEX_NEWLINE_CRLF == PCRE_NEWLINE_CRLF); -G_STATIC_ASSERT (G_REGEX_NEWLINE_ANYCRLF == PCRE_NEWLINE_ANYCRLF); -G_STATIC_ASSERT (G_REGEX_BSR_ANYCRLF == PCRE_BSR_ANYCRLF); +G_STATIC_ASSERT (G_REGEX_CASELESS == PCRE_CASELESS); +G_STATIC_ASSERT (G_REGEX_MULTILINE == PCRE_MULTILINE); +G_STATIC_ASSERT (G_REGEX_DOTALL == PCRE_DOTALL); +G_STATIC_ASSERT (G_REGEX_EXTENDED == PCRE_EXTENDED); +G_STATIC_ASSERT (G_REGEX_ANCHORED == PCRE_ANCHORED); +G_STATIC_ASSERT (G_REGEX_DOLLAR_ENDONLY == PCRE_DOLLAR_ENDONLY); +G_STATIC_ASSERT (G_REGEX_UNGREEDY == PCRE_UNGREEDY); +G_STATIC_ASSERT (G_REGEX_NO_AUTO_CAPTURE == PCRE_NO_AUTO_CAPTURE); +G_STATIC_ASSERT (G_REGEX_DUPNAMES == PCRE_DUPNAMES); +G_STATIC_ASSERT 
(G_REGEX_NEWLINE_CR == PCRE_NEWLINE_CR); +G_STATIC_ASSERT (G_REGEX_NEWLINE_LF == PCRE_NEWLINE_LF); +G_STATIC_ASSERT (G_REGEX_NEWLINE_CRLF == PCRE_NEWLINE_CRLF); +G_STATIC_ASSERT (G_REGEX_NEWLINE_ANYCRLF == PCRE_NEWLINE_ANYCRLF); +G_STATIC_ASSERT (G_REGEX_BSR_ANYCRLF == PCRE_BSR_ANYCRLF); +G_STATIC_ASSERT (G_REGEX_JAVASCRIPT_COMPAT == PCRE_JAVASCRIPT_COMPAT); G_STATIC_ASSERT (G_REGEX_MATCH_ANCHORED == PCRE_ANCHORED); G_STATIC_ASSERT (G_REGEX_MATCH_NOTBOL == PCRE_NOTBOL); @@ -472,6 +486,9 @@ translate_compile_error (gint *errcode, const gchar **errmsg) case G_REGEX_ERROR_MISSING_DIGIT: *errmsg = _("digit expected after (?+"); break; + case G_REGEX_ERROR_INVALID_DATA_CHARACTER: + *errmsg = _("] is an invalid data character in JavaScript compatibility mode"); + break; case G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME: *errmsg = _("different names for subpatterns of the same number are not allowed"); break; @@ -513,11 +530,6 @@ translate_compile_error (gint *errcode, const gchar **errmsg) case 174: /* invalid UTF-16 string */ /* These errors should not happen as we are using an UTF-8 and UCP-enabled PCRE * and we do not check if strings are valid */ - case 164: /* ] is an invalid data character in JavaScript compatibility mode */ - /* This should not happen as we don't use PCRE_JAVASCRIPT_COMPAT */ - g_warning ("%s", *errmsg); - *errcode = G_REGEX_ERROR_COMPILE; - break; case 170: /* internal error: unknown opcode in find_fixedlength() */ *errcode = G_REGEX_ERROR_INTERNAL; break; diff --git a/glib/gregex.h b/glib/gregex.h index 8705230..91852bf 100644 --- a/glib/gregex.h +++ b/glib/gregex.h @@ -116,6 +116,8 @@ G_BEGIN_DECLS * control verb. Since: 2.34 * @G_REGEX_ERROR_NUMBER_TOO_BIG: number is too big in escape sequence. Since: 2.34 * @G_REGEX_ERROR_MISSING_SUBPATTERN_NAME: Missing subpattern name. Since: 2.34 + * @G_REGEX_ERROR_INVALID_DATA_CHARACTER: In JavaScript compatibility mode, + * "]" is an invalid data character. 
Since: 2.34 * @G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME: different names for subpatterns of the * same number are not allowed. Since: 2.34 * @G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED: the backtracing control @@ -185,6 +187,7 @@ typedef enum G_REGEX_ERROR_NUMBER_TOO_BIG = 161, G_REGEX_ERROR_MISSING_SUBPATTERN_NAME = 162, G_REGEX_ERROR_MISSING_DIGIT = 163, + G_REGEX_ERROR_INVALID_DATA_CHARACTER = 164, G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME = 165, G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED = 166, G_REGEX_ERROR_INVALID_CONTROL_CHAR = 168, @@ -299,7 +302,8 @@ typedef enum G_REGEX_NEWLINE_LF = 1 << 21, G_REGEX_NEWLINE_CRLF = G_REGEX_NEWLINE_CR | G_REGEX_NEWLINE_LF, G_REGEX_NEWLINE_ANYCRLF = G_REGEX_NEWLINE_CR | 1 << 22, - G_REGEX_BSR_ANYCRLF = 1 << 23 + G_REGEX_BSR_ANYCRLF = 1 << 23, + G_REGEX_JAVASCRIPT_COMPAT = 1 << 25 } GRegexCompileFlags; /** @@ -355,6 +359,8 @@ typedef enum * single characters U+000B LINE TABULATION, U+000C FORM FEED (FF), * U+0085 NEXT LINE (NEL), U+2028 LINE SEPARATOR and * U+2029 PARAGRAPH SEPARATOR. Since: 2.34 + * @G_REGEX_JAVASCRIPT_COMPAT: Changes behaviour so that it is compatible with + * JavaScript rather than PCRE. Since: 2.34 * * Flags specifying match-time options. * diff --git a/glib/tests/regex.c b/glib/tests/regex.c index 6e97fda..f08db88 100644 --- a/glib/tests/regex.c +++ b/glib/tests/regex.c @@ -2149,6 +2149,7 @@ main (int argc, char *argv[]) TEST_NEW_FAIL ("(?i:A{1,}\\6666666666)", 0, G_REGEX_ERROR_NUMBER_TOO_BIG); TEST_NEW_FAIL ("(?)(?&)", 0, G_REGEX_ERROR_MISSING_SUBPATTERN_NAME); TEST_NEW_FAIL ("(?+-a)", 0, G_REGEX_ERROR_MISSING_DIGIT); + TEST_NEW_FAIL ("TA]", G_REGEX_JAVASCRIPT_COMPAT, G_REGEX_ERROR_INVALID_DATA_CHARACTER); TEST_NEW_FAIL ("(?|(?A)|(?B))", 0, G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME); TEST_NEW_FAIL ("a(*MARK)b", 0, G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED); TEST_NEW_FAIL ("^\\c€", 0, G_REGEX_ERROR_INVALID_CONTROL_CHAR);