#include "gmessages.h"
#include "gstrfuncs.h"
#include "gatomic.h"
+#include "gthread.h"
/**
* SECTION:gregex
* @title: Perl-compatible regular expressions
* @short_description: matches strings against regular expressions
- * @see_also: <xref linkend="glib-regex-syntax"/>
+ * @see_also: [Regular expression syntax][glib-regex-syntax]
*
- * The <function>g_regex_*()</function> functions implement regular
+ * The g_regex_*() functions implement regular
* expression pattern matching using syntax and semantics similar to
* Perl regular expression.
*
* unescaped "#" outside a character class is encountered. This indicates
* a comment that lasts until after the next newline.
*
+ * When setting the %G_REGEX_JAVASCRIPT_COMPAT flag, pattern syntax and pattern
+ * matching is changed to be compatible with the way that regular expressions
+ * work in JavaScript. More precisely, a lonely ']' character in the pattern
+ * is a syntax error; the '\x' escape only allows 0 to 2 hexadecimal digits, and
+ * you must use the '\u' escape sequence with 4 hex digits to specify a unicode
+ * codepoint instead of '\x' or 'x{....}'. If '\x' or '\u' are not followed by
+ * the specified number of hex digits, they match 'x' and 'u' literally; also
+ * '\U' always matches 'U' instead of being an error in the pattern. Finally,
+ * pattern matching is modified so that back references to an unset subpattern
+ * group produces a match with the empty string instead of an error. See
+ * pcreapi(3) for more information.
+ *
* Creating and manipulating the same #GRegex structure from different
* threads is not a problem as #GRegex does not modify its internal
* state between creation and destruction, on the other hand #GMatchInfo
* is not threadsafe.
*
* The regular expressions low-level functionalities are obtained through
- * the excellent <ulink url="http://www.pcre.org/">PCRE</ulink> library
- * written by Philip Hazel.
+ * the excellent
+ * [PCRE](http://www.pcre.org/)
+ * library written by Philip Hazel.
*/
/* Mask of all the possible values for GRegexCompileFlags. */
G_REGEX_RAW | \
G_REGEX_NO_AUTO_CAPTURE | \
G_REGEX_OPTIMIZE | \
+ G_REGEX_FIRSTLINE | \
G_REGEX_DUPNAMES | \
G_REGEX_NEWLINE_CR | \
G_REGEX_NEWLINE_LF | \
- G_REGEX_NEWLINE_CRLF)
+ G_REGEX_NEWLINE_CRLF | \
+ G_REGEX_NEWLINE_ANYCRLF | \
+ G_REGEX_BSR_ANYCRLF | \
+ G_REGEX_JAVASCRIPT_COMPAT)
+
+/* Mask of all GRegexCompileFlags values that are (not) passed trough to PCRE */
+#define G_REGEX_COMPILE_PCRE_MASK (G_REGEX_COMPILE_MASK & ~G_REGEX_COMPILE_NONPCRE_MASK)
+#define G_REGEX_COMPILE_NONPCRE_MASK (G_REGEX_RAW | \
+ G_REGEX_OPTIMIZE)
/* Mask of all the possible values for GRegexMatchFlags. */
-#define G_REGEX_MATCH_MASK (G_REGEX_MATCH_ANCHORED | \
- G_REGEX_MATCH_NOTBOL | \
- G_REGEX_MATCH_NOTEOL | \
- G_REGEX_MATCH_NOTEMPTY | \
- G_REGEX_MATCH_PARTIAL | \
- G_REGEX_MATCH_NEWLINE_CR | \
- G_REGEX_MATCH_NEWLINE_LF | \
- G_REGEX_MATCH_NEWLINE_CRLF | \
- G_REGEX_MATCH_NEWLINE_ANY)
+#define G_REGEX_MATCH_MASK (G_REGEX_MATCH_ANCHORED | \
+ G_REGEX_MATCH_NOTBOL | \
+ G_REGEX_MATCH_NOTEOL | \
+ G_REGEX_MATCH_NOTEMPTY | \
+ G_REGEX_MATCH_PARTIAL | \
+ G_REGEX_MATCH_NEWLINE_CR | \
+ G_REGEX_MATCH_NEWLINE_LF | \
+ G_REGEX_MATCH_NEWLINE_CRLF | \
+ G_REGEX_MATCH_NEWLINE_ANY | \
+ G_REGEX_MATCH_NEWLINE_ANYCRLF | \
+ G_REGEX_MATCH_BSR_ANYCRLF | \
+ G_REGEX_MATCH_BSR_ANY | \
+ G_REGEX_MATCH_PARTIAL_SOFT | \
+ G_REGEX_MATCH_PARTIAL_HARD | \
+ G_REGEX_MATCH_NOTEMPTY_ATSTART)
+
+/* we rely on these flags having the same values */
+G_STATIC_ASSERT (G_REGEX_CASELESS == PCRE_CASELESS);
+G_STATIC_ASSERT (G_REGEX_MULTILINE == PCRE_MULTILINE);
+G_STATIC_ASSERT (G_REGEX_DOTALL == PCRE_DOTALL);
+G_STATIC_ASSERT (G_REGEX_EXTENDED == PCRE_EXTENDED);
+G_STATIC_ASSERT (G_REGEX_ANCHORED == PCRE_ANCHORED);
+G_STATIC_ASSERT (G_REGEX_DOLLAR_ENDONLY == PCRE_DOLLAR_ENDONLY);
+G_STATIC_ASSERT (G_REGEX_UNGREEDY == PCRE_UNGREEDY);
+G_STATIC_ASSERT (G_REGEX_NO_AUTO_CAPTURE == PCRE_NO_AUTO_CAPTURE);
+G_STATIC_ASSERT (G_REGEX_FIRSTLINE == PCRE_FIRSTLINE);
+G_STATIC_ASSERT (G_REGEX_DUPNAMES == PCRE_DUPNAMES);
+G_STATIC_ASSERT (G_REGEX_NEWLINE_CR == PCRE_NEWLINE_CR);
+G_STATIC_ASSERT (G_REGEX_NEWLINE_LF == PCRE_NEWLINE_LF);
+G_STATIC_ASSERT (G_REGEX_NEWLINE_CRLF == PCRE_NEWLINE_CRLF);
+G_STATIC_ASSERT (G_REGEX_NEWLINE_ANYCRLF == PCRE_NEWLINE_ANYCRLF);
+G_STATIC_ASSERT (G_REGEX_BSR_ANYCRLF == PCRE_BSR_ANYCRLF);
+G_STATIC_ASSERT (G_REGEX_JAVASCRIPT_COMPAT == PCRE_JAVASCRIPT_COMPAT);
+
+G_STATIC_ASSERT (G_REGEX_MATCH_ANCHORED == PCRE_ANCHORED);
+G_STATIC_ASSERT (G_REGEX_MATCH_NOTBOL == PCRE_NOTBOL);
+G_STATIC_ASSERT (G_REGEX_MATCH_NOTEOL == PCRE_NOTEOL);
+G_STATIC_ASSERT (G_REGEX_MATCH_NOTEMPTY == PCRE_NOTEMPTY);
+G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL == PCRE_PARTIAL);
+G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_CR == PCRE_NEWLINE_CR);
+G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_LF == PCRE_NEWLINE_LF);
+G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_CRLF == PCRE_NEWLINE_CRLF);
+G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_ANY == PCRE_NEWLINE_ANY);
+G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_ANYCRLF == PCRE_NEWLINE_ANYCRLF);
+G_STATIC_ASSERT (G_REGEX_MATCH_BSR_ANYCRLF == PCRE_BSR_ANYCRLF);
+G_STATIC_ASSERT (G_REGEX_MATCH_BSR_ANY == PCRE_BSR_UNICODE);
+G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL_SOFT == PCRE_PARTIAL_SOFT);
+G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL_HARD == PCRE_PARTIAL_HARD);
+G_STATIC_ASSERT (G_REGEX_MATCH_NOTEMPTY_ATSTART == PCRE_NOTEMPTY_ATSTART);
+
+/* These PCRE flags are unused or not exposed publically in GRegexFlags, so
+ * it should be ok to reuse them for different things.
+ */
+G_STATIC_ASSERT (G_REGEX_OPTIMIZE == PCRE_NO_UTF8_CHECK);
+G_STATIC_ASSERT (G_REGEX_RAW == PCRE_UTF8);
/* if the string is in UTF-8 use g_utf8_ functions, else use
* use just +/- 1. */
-#define NEXT_CHAR(re, s) (((re)->compile_opts & PCRE_UTF8) ? \
- g_utf8_next_char (s) : \
- ((s) + 1))
-#define PREV_CHAR(re, s) (((re)->compile_opts & PCRE_UTF8) ? \
- g_utf8_prev_char (s) : \
- ((s) - 1))
+#define NEXT_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? \
+ ((s) + 1) : \
+ g_utf8_next_char (s))
+#define PREV_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? \
+ ((s) - 1) : \
+ g_utf8_prev_char (s))
struct _GMatchInfo
{
+ volatile gint ref_count; /* the ref count */
GRegex *regex; /* the regex */
GRegexMatchFlags match_opts; /* options used at match time on the regex */
gint matches; /* number of matching sub patterns */
gint pos; /* position in the string where last match left off */
+ gint n_offsets; /* number of offsets */
gint *offsets; /* array of offsets paired 0,1 ; 2,3 ; 3,4 etc */
- gint n_offsets; /* number of offsets */
gint *workspace; /* workspace for pcre_dfa_exec() */
gint n_workspace; /* number of workspace elements */
const gchar *string; /* string passed to the match function */
case PCRE_ERROR_DFA_RECURSE:
case PCRE_ERROR_RECURSIONLIMIT:
return _("recursion limit reached");
- case PCRE_ERROR_NULLWSLIMIT:
- return _("workspace limit for empty substrings reached");
case PCRE_ERROR_BADNEWLINE:
return _("invalid combination of newline flags");
+ case PCRE_ERROR_BADOFFSET:
+ return _("bad offset");
+ case PCRE_ERROR_SHORTUTF8:
+ return _("short utf8");
+ case PCRE_ERROR_RECURSELOOP:
+ return _("recursion loop");
default:
break;
}
*errmsg = _("\\c at end of pattern");
break;
case G_REGEX_ERROR_UNRECOGNIZED_ESCAPE:
- *errmsg = _("unrecognized character follows \\");
- break;
- case 137:
- /* A number of Perl escapes are not handled by PCRE.
- * Therefore it explicitly raises ERR37.
- */
- *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE;
- *errmsg = _("case-changing escapes (\\l, \\L, \\u, \\U) are not allowed here");
+ *errmsg = _("unrecognized character following \\");
break;
case G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER:
*errmsg = _("numbers out of order in {} quantifier");
case G_REGEX_ERROR_NOTHING_TO_REPEAT:
*errmsg = _("nothing to repeat");
break;
- case G_REGEX_ERROR_UNRECOGNIZED_CHARACTER:
- *errmsg = _("unrecognized character after (?");
- break;
- case 124:
- *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
- *errmsg = _("unrecognized character after (?<");
+ case 111: /* internal error: unexpected repeat */
+ *errcode = G_REGEX_ERROR_INTERNAL;
+ *errmsg = _("unexpected repeat");
break;
- case 141:
- *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
- *errmsg = _("unrecognized character after (?P");
+ case G_REGEX_ERROR_UNRECOGNIZED_CHARACTER:
+ *errmsg = _("unrecognized character after (? or (?-");
break;
case G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS:
*errmsg = _("POSIX named classes are supported only within a class");
case G_REGEX_ERROR_UNMATCHED_PARENTHESIS:
*errmsg = _("missing terminating )");
break;
- case 122:
- *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
- *errmsg = _(") without opening (");
- break;
- case 129:
- *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
- /* translators: '(?R' and '(?[+-]digits' are both meant as (groups of)
- * sequences here, '(?-54' would be an example for the second group.
- */
- *errmsg = _("(?R or (?[+-]digits must be followed by )");
- break;
case G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE:
*errmsg = _("reference to non-existent subpattern");
break;
*errmsg = _("missing ) after comment");
break;
case G_REGEX_ERROR_EXPRESSION_TOO_LARGE:
- *errmsg = _("regular expression too large");
+ *errmsg = _("regular expression is too large");
break;
case G_REGEX_ERROR_MEMORY_ERROR:
*errmsg = _("failed to get memory");
break;
+ case 122: /* unmatched parentheses */
+ *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
+ *errmsg = _(") without opening (");
+ break;
+ case 123: /* internal error: code overflow */
+ *errcode = G_REGEX_ERROR_INTERNAL;
+ *errmsg = _("code overflow");
+ break;
+ case 124: /* "unrecognized character after (?<\0 */
+ *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
+ *errmsg = _("unrecognized character after (?<");
+ break;
case G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND:
*errmsg = _("lookbehind assertion is not fixed length");
break;
case G_REGEX_ERROR_ASSERTION_EXPECTED:
*errmsg = _("assertion expected after (?(");
break;
+ case 129:
+ *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
+ /* translators: '(?R' and '(?[+-]digits' are both meant as (groups of)
+ * sequences here, '(?-54' would be an example for the second group.
+ */
+ *errmsg = _("(?R or (?[+-]digits must be followed by )");
+ break;
case G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME:
*errmsg = _("unknown POSIX class name");
break;
case G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND:
*errmsg = _("\\C not allowed in lookbehind assertion");
break;
+ case 137: /* PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0 */
+ /* A number of Perl escapes are not handled by PCRE.
+ * Therefore it explicitly raises ERR37.
+ */
+ *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE;
+ *errmsg = _("escapes \\L, \\l, \\N{name}, \\U, and \\u are not supported");
+ break;
case G_REGEX_ERROR_INFINITE_LOOP:
*errmsg = _("recursive call could loop indefinitely");
break;
+ case 141: /* unrecognized character after (?P\0 */
+ *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
+ *errmsg = _("unrecognized character after (?P");
+ break;
case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR:
*errmsg = _("missing terminator in subpattern name");
break;
case G_REGEX_ERROR_INVALID_OCTAL_VALUE:
*errmsg = _("octal value is greater than \\377");
break;
+ case 152: /* internal error: overran compiling workspace */
+ *errcode = G_REGEX_ERROR_INTERNAL;
+ *errmsg = _("overran compiling workspace");
+ break;
+ case 153: /* internal error: previously-checked referenced subpattern not found */
+ *errcode = G_REGEX_ERROR_INTERNAL;
+ *errmsg = _("previously-checked referenced subpattern not found");
+ break;
case G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE:
*errmsg = _("DEFINE group contains more than one branch");
break;
- case G_REGEX_ERROR_DEFINE_REPETION:
- *errmsg = _("repeating a DEFINE group is not allowed");
- break;
case G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS:
*errmsg = _("inconsistent NEWLINE options");
break;
case G_REGEX_ERROR_MISSING_BACK_REFERENCE:
- *errmsg = _("\\g is not followed by a braced name or an optionally "
- "braced non-zero number");
+ *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or "
+ "number, or by a plain number");
break;
- case 11:
- *errcode = G_REGEX_ERROR_INTERNAL;
- *errmsg = _("unexpected repeat");
+ case G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE:
+ *errmsg = _("a numbered reference must not be zero");
break;
- case 23:
- *errcode = G_REGEX_ERROR_INTERNAL;
- *errmsg = _("code overflow");
+ case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN:
+ *errmsg = _("an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)");
break;
- case 52:
- *errcode = G_REGEX_ERROR_INTERNAL;
- *errmsg = _("overran compiling workspace");
+ case G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB:
+ *errmsg = _("(*VERB) not recognized");
break;
- case 53:
- *errcode = G_REGEX_ERROR_INTERNAL;
- *errmsg = _("previously-checked referenced subpattern not found");
+ case G_REGEX_ERROR_NUMBER_TOO_BIG:
+ *errmsg = _("number is too big");
break;
- case 16:
+ case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME:
+ *errmsg = _("missing subpattern name after (?&");
+ break;
+ case G_REGEX_ERROR_MISSING_DIGIT:
+ *errmsg = _("digit expected after (?+");
+ break;
+ case G_REGEX_ERROR_INVALID_DATA_CHARACTER:
+ *errmsg = _("] is an invalid data character in JavaScript compatibility mode");
+ break;
+ case G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME:
+ *errmsg = _("different names for subpatterns of the same number are not allowed");
+ break;
+ case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED:
+ *errmsg = _("(*MARK) must have an argument");
+ break;
+ case G_REGEX_ERROR_INVALID_CONTROL_CHAR:
+ *errmsg = _( "\\c must be followed by an ASCII character");
+ break;
+ case G_REGEX_ERROR_MISSING_NAME:
+ *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name");
+ break;
+ case G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS:
+ *errmsg = _("\\N is not supported in a class");
+ break;
+ case G_REGEX_ERROR_TOO_MANY_FORWARD_REFERENCES:
+ *errmsg = _("too many forward references");
+ break;
+ case G_REGEX_ERROR_NAME_TOO_LONG:
+ *errmsg = _("name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)");
+ break;
+ case G_REGEX_ERROR_CHARACTER_VALUE_TOO_LARGE:
+ *errmsg = _("character value in \\u.... sequence is too large");
+ break;
+
+ case 116: /* erroffset passed as NULL */
/* This should not happen as we never pass a NULL erroffset */
g_warning ("erroffset passed as NULL");
*errcode = G_REGEX_ERROR_COMPILE;
break;
- case 17:
+ case 117: /* unknown option bit(s) set */
/* This should not happen as we check options before passing them
* to pcre_compile2() */
g_warning ("unknown option bit(s) set");
*errcode = G_REGEX_ERROR_COMPILE;
break;
- case 32:
- case 44:
- case 45:
- /* These errors should not happen as we are using an UTF8-enabled PCRE
+ case 132: /* this version of PCRE is compiled without UTF support */
+ case 144: /* invalid UTF-8 string */
+ case 145: /* support for \\P, \\p, and \\X has not been compiled */
+ case 167: /* this version of PCRE is not compiled with Unicode property support */
+ case 173: /* disallowed Unicode code point (>= 0xd800 && <= 0xdfff) */
+ case 174: /* invalid UTF-16 string */
+ /* These errors should not happen as we are using an UTF-8 and UCP-enabled PCRE
* and we do not check if strings are valid */
- g_warning ("%s", *errmsg);
- *errcode = G_REGEX_ERROR_COMPILE;
+ case 170: /* internal error: unknown opcode in find_fixedlength() */
+ *errcode = G_REGEX_ERROR_INTERNAL;
break;
+
default:
*errcode = G_REGEX_ERROR_COMPILE;
}
string_len = strlen (string);
match_info = g_new0 (GMatchInfo, 1);
+ match_info->ref_count = 1;
match_info->regex = g_regex_ref ((GRegex *)regex);
match_info->string = string;
match_info->string_len = string_len;
}
/**
- * g_match_info_free:
+ * g_match_info_ref:
* @match_info: a #GMatchInfo
*
- * Frees all the memory associated with the #GMatchInfo structure.
+ * Increases reference count of @match_info by 1.
*
- * Since: 2.14
+ * Returns: @match_info
+ *
+ * Since: 2.30
+ */
+GMatchInfo *
+g_match_info_ref (GMatchInfo *match_info)
+{
+ g_return_val_if_fail (match_info != NULL, NULL);
+ g_atomic_int_inc (&match_info->ref_count);
+ return match_info;
+}
+
+/**
+ * g_match_info_unref:
+ * @match_info: a #GMatchInfo
+ *
+ * Decreases reference count of @match_info by 1. When reference count drops
+ * to zero, it frees all the memory associated with the match_info structure.
+ *
+ * Since: 2.30
*/
void
-g_match_info_free (GMatchInfo *match_info)
+g_match_info_unref (GMatchInfo *match_info)
{
- if (match_info)
+ if (g_atomic_int_dec_and_test (&match_info->ref_count))
{
g_regex_unref (match_info->regex);
g_free (match_info->offsets);
}
/**
+ * g_match_info_free:
+ * @match_info: (allow-none): a #GMatchInfo, or %NULL
+ *
+ * If @match_info is not %NULL, calls g_match_info_unref(); otherwise does
+ * nothing.
+ *
+ * Since: 2.14
+ */
+void
+g_match_info_free (GMatchInfo *match_info)
+{
+ if (match_info == NULL)
+ return;
+
+ g_match_info_unref (match_info);
+}
+
+/**
* g_match_info_next:
* @match_info: a #GMatchInfo structure
- * @error: location to store the error occuring, or %NULL to ignore errors
+ * @error: location to store the error occurring, or %NULL to ignore errors
*
* Scans for the next match using the same parameters of the previous
* call to g_regex_match_full() or g_regex_match() that returned
prev_match_start = match_info->offsets[0];
prev_match_end = match_info->offsets[1];
+ if (match_info->pos > match_info->string_len)
+ {
+ /* we have reached the end of the string */
+ match_info->pos = -1;
+ match_info->matches = PCRE_ERROR_NOMATCH;
+ return FALSE;
+ }
+
match_info->matches = pcre_exec (match_info->regex->pcre_re,
match_info->regex->extra,
match_info->string,
* able to raise an error as soon as a mistake is made.
*
* GRegex supports the concept of partial matching by means of the
- * #G_REGEX_MATCH_PARTIAL flag. When this is set the return code for
+ * #G_REGEX_MATCH_PARTIAL_SOFT and #G_REGEX_MATCH_PARTIAL_HARD flags.
+ * When they are used, the return code for
* g_regex_match() or g_regex_match_full() is, as usual, %TRUE
* for a complete match, %FALSE otherwise. But, when these functions
* return %FALSE, you can check if the match was partial calling
* g_match_info_is_partial_match().
*
- * When using partial matching you cannot use g_match_info_fetch*().
+ * The difference between #G_REGEX_MATCH_PARTIAL_SOFT and
+ * #G_REGEX_MATCH_PARTIAL_HARD is that when a partial match is encountered
+ * with #G_REGEX_MATCH_PARTIAL_SOFT, matching continues to search for a
+ * possible complete match, while with #G_REGEX_MATCH_PARTIAL_HARD matching
+ * stops at the partial match.
+ * When both #G_REGEX_MATCH_PARTIAL_SOFT and #G_REGEX_MATCH_PARTIAL_HARD
+ * are set, the latter takes precedence.
+ *
+ * There were formerly some restrictions on the pattern for partial matching.
+ * The restrictions no longer apply.
*
- * Because of the way certain internal optimizations are implemented
- * the partial matching algorithm cannot be used with all patterns.
- * So repeated single characters such as "a{2,4}" and repeated single
- * meta-sequences such as "\d+" are not permitted if the maximum number
- * of occurrences is greater than one. Optional items such as "\d?"
- * (where the maximum is one) are permitted. Quantifiers with any values
- * are permitted after parentheses, so the invalid examples above can be
- * coded thus "(a){2,4}" and "(\d)+". If #G_REGEX_MATCH_PARTIAL is set
- * for a pattern that does not conform to the restrictions, matching
- * functions return an error.
+ * See pcrepartial(3) for more information on partial matching.
*
* Returns: %TRUE if the match was partial, %FALSE otherwise
*
/**
* g_match_info_expand_references:
- * @match_info: a #GMatchInfo or %NULL
+ * @match_info: (allow-none): a #GMatchInfo or %NULL
* @string_to_expand: the string to expand
- * @error: location to store the error occuring, or %NULL to ignore errors
+ * @error: location to store the error occurring, or %NULL to ignore errors
*
* Returns a new string containing the text in @string_to_expand with
* references and escape sequences expanded. References refer to the last
result = g_string_sized_new (strlen (string_to_expand));
interpolate_replacement (match_info, result, list);
- g_list_foreach (list, (GFunc)free_interpolation_data, NULL);
- g_list_free (list);
+ g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
return g_string_free (result, FALSE);
}
* @match_info: #GMatchInfo structure
* @match_num: number of the sub expression
*
- * Retrieves the text matching the @match_num<!-- -->'th capturing
+ * Retrieves the text matching the @match_num'th capturing
* parentheses. 0 is the full text of the match, 1 is the first paren
* set, 2 the second, and so on.
*
* @end_pos: (out) (allow-none): pointer to location where to store
* the end position, or %NULL
*
- * Retrieves the position in bytes of the @match_num<!-- -->'th capturing
+ * Retrieves the position in bytes of the @match_num'th capturing
* parentheses. 0 is the full text of the match, 1 is the first
* paren set, 2 the second, and so on.
*
* Retrieves the text matching the capturing parentheses named @name.
*
* If @name is a valid sub pattern name but it didn't match anything
- * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b")
+ * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b")
* then an empty string is returned.
*
* The string is fetched from the string passed to the match function,
* Retrieves the position in bytes of the capturing parentheses named @name.
*
* If @name is a valid sub pattern name but it didn't match anything
- * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b")
+ * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b")
* then @start_pos and @end_pos are set to -1 and %TRUE is returned.
*
* Returns: %TRUE if the position was fetched, %FALSE otherwise.
* The strings are fetched from the string passed to the match function,
* so you cannot call this function after freeing the string.
*
- * Returns: (allow-none): a %NULL-terminated array of gchar * pointers.
- * It must be freed using g_strfreev(). If the previous match failed
- * %NULL is returned
+ * Returns: (transfer full): a %NULL-terminated array of gchar *
+ * pointers. It must be freed using g_strfreev(). If the previous
+ * match failed %NULL is returned
*
* Since: 2.14
*/
/* GRegex */
-GQuark
-g_regex_error_quark (void)
-{
- static GQuark error_quark = 0;
-
- if (error_quark == 0)
- error_quark = g_quark_from_static_string ("g-regex-error-quark");
-
- return error_quark;
-}
+G_DEFINE_QUARK (g-regex-error-quark, g_regex_error)
/**
* g_regex_ref:
{
g_return_if_fail (regex != NULL);
- if (g_atomic_int_exchange_and_add (®ex->ref_count, -1) - 1 == 0)
+ if (g_atomic_int_dec_and_test (®ex->ref_count))
{
g_free (regex->pattern);
if (regex->pcre_re != NULL)
gint erroffset;
gint errcode;
gboolean optimize = FALSE;
- static gboolean initialized = FALSE;
+ static volatile gsize initialised = 0;
unsigned long int pcre_compile_options;
+ GRegexCompileFlags nonpcre_compile_options;
g_return_val_if_fail (pattern != NULL, NULL);
g_return_val_if_fail (error == NULL || *error == NULL, NULL);
g_return_val_if_fail ((compile_options & ~G_REGEX_COMPILE_MASK) == 0, NULL);
g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
- if (!initialized)
+ if (g_once_init_enter (&initialised))
{
- gint support;
- const gchar *msg;
+ int supports_utf8, supports_ucp;
- pcre_config (PCRE_CONFIG_UTF8, &support);
- if (!support)
- {
- msg = N_("PCRE library is compiled without UTF8 support");
- g_critical ("%s", msg);
- g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg));
- return NULL;
- }
+ pcre_config (PCRE_CONFIG_UTF8, &supports_utf8);
+ if (!supports_utf8)
+ g_critical (_("PCRE library is compiled without UTF8 support"));
- pcre_config (PCRE_CONFIG_UNICODE_PROPERTIES, &support);
- if (!support)
- {
- msg = N_("PCRE library is compiled without UTF8 properties support");
- g_critical ("%s", msg);
- g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg));
- return NULL;
- }
+ pcre_config (PCRE_CONFIG_UNICODE_PROPERTIES, &supports_ucp);
+ if (!supports_ucp)
+ g_critical (_("PCRE library is compiled without UTF8 properties support"));
- initialized = TRUE;
+ g_once_init_leave (&initialised, supports_utf8 && supports_ucp ? 1 : 2);
}
+ if (G_UNLIKELY (initialised != 1))
+ {
+ g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE,
+ _("PCRE library is compiled with incompatible options"));
+ return NULL;
+ }
+
+ nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK;
+
/* G_REGEX_OPTIMIZE has the same numeric value of PCRE_NO_UTF8_CHECK,
* as we do not need to wrap PCRE_NO_UTF8_CHECK. */
if (compile_options & G_REGEX_OPTIMIZE)
compile_options |= PCRE_NEWLINE_ANY;
}
+ compile_options |= PCRE_UCP;
+
+ /* PCRE_BSR_UNICODE is the default for the internal PCRE but
+ * possibly not for the system one.
+ */
+ if (~compile_options & G_REGEX_BSR_ANYCRLF)
+ compile_options |= PCRE_BSR_UNICODE;
+
/* compile the pattern */
re = pcre_compile2 (pattern, compile_options, &errcode,
&errmsg, &erroffset, NULL);
* compile options, e.g. "(?i)foo" will make the pcre structure store
* PCRE_CASELESS even though it wasn't explicitly given for compilation. */
pcre_fullinfo (re, NULL, PCRE_INFO_OPTIONS, &pcre_compile_options);
- compile_options = pcre_compile_options;
+ compile_options = pcre_compile_options & G_REGEX_COMPILE_PCRE_MASK;
+
+ /* Don't leak PCRE_NEWLINE_ANY, which is part of PCRE_NEWLINE_ANYCRLF */
+ if ((pcre_compile_options & PCRE_NEWLINE_ANYCRLF) != PCRE_NEWLINE_ANYCRLF)
+ compile_options &= ~PCRE_NEWLINE_ANY;
+
+ compile_options |= nonpcre_compile_options;
if (!(compile_options & G_REGEX_DUPNAMES))
{
}
/**
+ * g_regex_get_has_cr_or_lf:
+ * @regex: a #GRegex structure
+ *
+ * Checks whether the pattern contains explicit CR or LF references.
+ *
+ * Returns: %TRUE if the pattern contains explicit CR or LF references
+ *
+ * Since: 2.34
+ */
+gboolean
+g_regex_get_has_cr_or_lf (const GRegex *regex)
+{
+ gint value;
+
+ pcre_fullinfo (regex->pcre_re, regex->extra,
+ PCRE_INFO_HASCRORLF, &value);
+
+ return !!value;
+}
+
+/**
+ * g_regex_get_max_lookbehind:
+ * @regex: a #GRegex structure
+ *
+ * Gets the number of characters in the longest lookbehind assertion in the
+ * pattern. This information is useful when doing multi-segment matching using
+ * the partial matching facilities.
+ *
+ * Returns: the number of characters in the longest lookbehind assertion.
+ *
+ * Since: 2.38
+ */
+gint
+g_regex_get_max_lookbehind (const GRegex *regex)
+{
+ gint max_lookbehind;
+
+ pcre_fullinfo (regex->pcre_re, regex->extra,
+ PCRE_INFO_MAXLOOKBEHIND, &max_lookbehind);
+
+ return max_lookbehind;
+}
+
+/**
* g_regex_get_compile_flags:
* @regex: a #GRegex
*
{
g_return_val_if_fail (regex != NULL, 0);
- return regex->match_opts;
+ return regex->match_opts & G_REGEX_MATCH_MASK;
}
/**
* To retrieve all the non-overlapping matches of the pattern in
* string you can use g_match_info_next().
*
- * |[
+ * |[<!-- language="C" -->
* static void
* print_uppercase_words (const gchar *string)
* {
- * /* Print all uppercase-only words. */
+ * // Print all uppercase-only words.
* GRegex *regex;
* GMatchInfo *match_info;
- *
+ *
* regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
- * g_regex_match (regex, string, 0, &match_info);
+ * g_regex_match (regex, string, 0, &match_info);
* while (g_match_info_matches (match_info))
* {
* gchar *word = g_match_info_fetch (match_info, 0);
* @match_options: match options
* @match_info: (out) (allow-none): pointer to location where to store
* the #GMatchInfo, or %NULL if you do not need it
- * @error: location to store the error occuring, or %NULL to ignore errors
+ * @error: location to store the error occurring, or %NULL to ignore errors
*
* Scans for a match in string for the pattern in @regex.
* The @match_options are combined with the match options specified
* To retrieve all the non-overlapping matches of the pattern in
* string you can use g_match_info_next().
*
- * |[
+ * |[<!-- language="C" -->
* static void
* print_uppercase_words (const gchar *string)
* {
- * /* Print all uppercase-only words. */
+ * // Print all uppercase-only words.
* GRegex *regex;
* GMatchInfo *match_info;
* GError *error = NULL;
- *
+ *
* regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
- * g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error);
+ * g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error);
* while (g_match_info_matches (match_info))
* {
* gchar *word = g_match_info_fetch (match_info, 0);
* g_print ("Found: %s\n", word);
* g_free (word);
- * g_match_info_next (match_info, &error);
+ * g_match_info_next (match_info, &error);
* }
* g_match_info_free (match_info);
* g_regex_unref (regex);
* @match_options: match options
* @match_info: (out) (allow-none): pointer to location where to store
* the #GMatchInfo, or %NULL if you do not need it
- * @error: location to store the error occuring, or %NULL to ignore errors
+ * @error: location to store the error occurring, or %NULL to ignore errors
*
* Using the standard algorithm for regular expression matching only
- * the longest match in the string is retrieved, it is not possibile
+ * the longest match in the string is retrieved, it is not possible
* to obtain all the available matches. For instance matching
- * "<a> <b> <c>" against the pattern "<.*>"
- * you get "<a> <b> <c>".
+ * "<a> <b> <c>" against the pattern "<.*>"
+ * you get "<a> <b> <c>".
*
* This function uses a different algorithm (called DFA, i.e. deterministic
* finite automaton), so it can retrieve all the possible matches, all
* starting at the same point in the string. For instance matching
- * "<a> <b> <c>" against the pattern "<.*>"
- * you would obtain three matches: "<a> <b> <c>",
- * "<a> <b>" and "<a>".
+ * "<a> <b> <c>" against the pattern "<.*>;"
+ * you would obtain three matches: "<a> <b> <c>",
+ * "<a> <b>" and "<a>".
*
* The number of matched strings is retrieved using
* g_match_info_get_match_count(). To obtain the matched strings and
* characters. For example splitting "ab c" using as a separator
* "\s*", you will get "a", "b" and "c".
*
- * Returns: a %NULL-terminated array of strings. Free it using g_strfreev()
+ * Returns: (transfer full): a %NULL-terminated array of strings. Free
+ * it using g_strfreev()
*
* Since: 2.14
**/
regex = g_regex_new (pattern, compile_options, 0, NULL);
if (!regex)
return NULL;
+
result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL);
g_regex_unref (regex);
return result;
* For example splitting "ab c" using as a separator "\s*", you will get
* "a", "b" and "c".
*
- * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
+ * Returns: (transfer full): a %NULL-terminated gchar ** array. Free
+ * it using g_strfreev()
*
* Since: 2.14
**/
* string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
* that begins with any kind of lookbehind assertion, such as "\b".
*
- * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
+ * Returns: (transfer full): a %NULL-terminated gchar ** array. Free
+ * it using g_strfreev()
*
* Since: 2.14
**/
match_ok = g_regex_match_full (regex, string, string_len, start_position,
match_options, &match_info, &tmp_error);
+
while (tmp_error == NULL)
{
if (match_ok)
if (tmp_error != NULL)
{
g_propagate_error (error, tmp_error);
- g_list_foreach (list, (GFunc)g_free, NULL);
- g_list_free (list);
+ g_list_free_full (list, g_free);
match_info->pos = -1;
return NULL;
}
start = p = expand_escape (replacement, p, data, error);
if (p == NULL)
{
- g_list_foreach (list, (GFunc)free_interpolation_data, NULL);
- g_list_free (list);
+ g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
free_interpolation_data (data);
return NULL;
* @start_position: starting index of the string to match
* @replacement: text to replace each match with
* @match_options: options for the match
- * @error: location to store the error occuring, or %NULL to ignore errors
+ * @error: location to store the error occurring, or %NULL to ignore errors
*
* Replaces all occurrences of the pattern in @regex with the
* replacement text. Backreferences of the form '\number' or
- * '\g<number>' in the replacement text are interpolated by the
- * number-th captured subexpression of the match, '\g<name>' refers
- * to the captured subexpression with the given name. '\0' refers to the
- * complete match, but '\0' followed by a number is the octal representation
- * of a character. To include a literal '\' in the replacement, write '\\'.
+ * '\g<number>' in the replacement text are interpolated by the
+ * number-th captured subexpression of the match, '\g<name>' refers
+ * to the captured subexpression with the given name. '\0' refers
+ * to the complete match, but '\0' followed by a number is the octal
+ * representation of a character. To include a literal '\' in the
+ * replacement, write '\\'.
+ *
* There are also escapes that changes the case of the following text:
*
- * <variablelist>
- * <varlistentry><term>\l</term>
- * <listitem>
- * <para>Convert to lower case the next character</para>
- * </listitem>
- * </varlistentry>
- * <varlistentry><term>\u</term>
- * <listitem>
- * <para>Convert to upper case the next character</para>
- * </listitem>
- * </varlistentry>
- * <varlistentry><term>\L</term>
- * <listitem>
- * <para>Convert to lower case till \E</para>
- * </listitem>
- * </varlistentry>
- * <varlistentry><term>\U</term>
- * <listitem>
- * <para>Convert to upper case till \E</para>
- * </listitem>
- * </varlistentry>
- * <varlistentry><term>\E</term>
- * <listitem>
- * <para>End case modification</para>
- * </listitem>
- * </varlistentry>
- * </variablelist>
+ * - \l: Convert to lower case the next character
+ * - \u: Convert to upper case the next character
+ * - \L: Convert to lower case till \E
+ * - \U: Convert to upper case till \E
+ * - \E: End case modification
*
* If you do not need to use backreferences use g_regex_replace_literal().
*
if (tmp_error != NULL)
g_propagate_error (error, tmp_error);
- g_list_foreach (list, (GFunc)free_interpolation_data, NULL);
- g_list_free (list);
+ g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
return result;
}
* @start_position: starting index of the string to match
* @replacement: text to replace each match with
* @match_options: options for the match
- * @error: location to store the error occuring, or %NULL to ignore errors
+ * @error: location to store the error occurring, or %NULL to ignore errors
*
* Replaces all occurrences of the pattern in @regex with the
* replacement text. @replacement is replaced literally, to
* @match_options: options for the match
* @eval: a function to call for each match
* @user_data: user data to pass to the function
- * @error: location to store the error occuring, or %NULL to ignore errors
+ * @error: location to store the error occurring, or %NULL to ignore errors
*
* Replaces occurrences of the pattern in regex with the output of
* @eval for that occurrence.
*
* The following example uses g_regex_replace_eval() to replace multiple
* strings at once:
- * |[
+ * |[<!-- language="C" -->
* static gboolean
* eval_cb (const GMatchInfo *info,
* GString *res,
* return FALSE;
* }
*
- * /* ... */
+ * ...
*
* GRegex *reg;
* GHashTable *h;
* res = g_regex_replace_eval (reg, text, -1, 0, 0, eval_cb, h, NULL);
* g_hash_table_destroy (h);
*
- * /* ... */
+ * ...
* ]|
*
* Returns: a newly allocated string containing the replacements
if (has_references)
*has_references = interpolation_list_needs_match (list);
- g_list_foreach (list, (GFunc) free_interpolation_data, NULL);
- g_list_free (list);
+ g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
return TRUE;
}
/**
+ * g_regex_escape_nul:
+ * @string: the string to escape
+ * @length: the length of @string
+ *
+ * Escapes the nul characters in @string to "\x00". It can be used
+ * to compile a regex with embedded nul characters.
+ *
+ * For completeness, @length can be -1 for a nul-terminated string.
+ * In this case the output string will be of course equal to @string.
+ *
+ * Returns: a newly-allocated escaped string
+ *
+ * Since: 2.30
+ */
+gchar *
+g_regex_escape_nul (const gchar *string,
+ gint length)
+{
+ GString *escaped;
+ const gchar *p, *piece_start, *end;
+ gint backslashes;
+
+ g_return_val_if_fail (string != NULL, NULL);
+
+ if (length < 0)
+ return g_strdup (string);
+
+ end = string + length;
+ p = piece_start = string;
+ escaped = g_string_sized_new (length + 1);
+
+ backslashes = 0;
+ while (p < end)
+ {
+ switch (*p)
+ {
+ case '\0':
+ if (p != piece_start)
+ {
+ /* copy the previous piece. */
+ g_string_append_len (escaped, piece_start, p - piece_start);
+ }
+ if ((backslashes & 1) == 0)
+ g_string_append_c (escaped, '\\');
+ g_string_append_c (escaped, 'x');
+ g_string_append_c (escaped, '0');
+ g_string_append_c (escaped, '0');
+ piece_start = ++p;
+ backslashes = 0;
+ break;
+ case '\\':
+ backslashes++;
+ ++p;
+ break;
+ default:
+ backslashes = 0;
+ p = g_utf8_next_char (p);
+ break;
+ }
+ }
+
+ if (piece_start < end)
+ g_string_append_len (escaped, piece_start, end - piece_start);
+
+ return g_string_free (escaped, FALSE);
+}
+
+/**
* g_regex_escape_string:
* @string: (array length=length): the string to escape
* @length: the length of @string, or -1 if @string is nul-terminated