X-Git-Url: http://review.tizen.org/git/?a=blobdiff_plain;f=glib%2Fgregex.c;h=41bf67e3f531a1865f1bee47aba32c81a54706c0;hb=bc6ee788b4ff6590513da6ab657448885e92b20b;hp=f21fbe8743517de1c89ad62ddcf611fd8ba701b6;hpb=96d87da379879f20e2c162b468525aaa1a0a82dd;p=platform%2Fupstream%2Fglib.git diff --git a/glib/gregex.c b/glib/gregex.c index f21fbe8..41bf67e 100644 --- a/glib/gregex.c +++ b/glib/gregex.c @@ -32,14 +32,19 @@ #include "gtypes.h" #include "gregex.h" #include "glibintl.h" +#include "glist.h" +#include "gmessages.h" +#include "gstrfuncs.h" +#include "gatomic.h" +#include "gthread.h" /** * SECTION:gregex * @title: Perl-compatible regular expressions * @short_description: matches strings against regular expressions - * @see_also: + * @see_also: [Regular expression syntax][glib-regex-syntax] * - * The g_regex_*() functions implement regular + * The g_regex_*() functions implement regular * expression pattern matching using syntax and semantics similar to * Perl regular expression. * @@ -83,14 +88,27 @@ * unescaped "#" outside a character class is encountered. This indicates * a comment that lasts until after the next newline. * + * When setting the %G_REGEX_JAVASCRIPT_COMPAT flag, pattern syntax and pattern + * matching is changed to be compatible with the way that regular expressions + * work in JavaScript. More precisely, a lonely ']' character in the pattern + * is a syntax error; the '\x' escape only allows 0 to 2 hexadecimal digits, and + * you must use the '\u' escape sequence with 4 hex digits to specify a unicode + * codepoint instead of '\x' or 'x{....}'. If '\x' or '\u' are not followed by + * the specified number of hex digits, they match 'x' and 'u' literally; also + * '\U' always matches 'U' instead of being an error in the pattern. Finally, + * pattern matching is modified so that back references to an unset subpattern + * group produces a match with the empty string instead of an error. See + * pcreapi(3) for more information. + * * Creating and manipulating the same #GRegex structure from different * threads is not a problem as #GRegex does not modify its internal * state between creation and destruction, on the other hand #GMatchInfo * is not threadsafe. * * The regular expressions low-level functionalities are obtained through - * the excellent PCRE library - * written by Philip Hazel. + * the excellent + * [PCRE](http://www.pcre.org/) + * library written by Philip Hazel. */ /* Mask of all the possible values for GRegexCompileFlags. */ @@ -104,39 +122,95 @@ G_REGEX_RAW | \ G_REGEX_NO_AUTO_CAPTURE | \ G_REGEX_OPTIMIZE | \ + G_REGEX_FIRSTLINE | \ G_REGEX_DUPNAMES | \ G_REGEX_NEWLINE_CR | \ G_REGEX_NEWLINE_LF | \ - G_REGEX_NEWLINE_CRLF) + G_REGEX_NEWLINE_CRLF | \ + G_REGEX_NEWLINE_ANYCRLF | \ + G_REGEX_BSR_ANYCRLF | \ + G_REGEX_JAVASCRIPT_COMPAT) + +/* Mask of all GRegexCompileFlags values that are (not) passed trough to PCRE */ +#define G_REGEX_COMPILE_PCRE_MASK (G_REGEX_COMPILE_MASK & ~G_REGEX_COMPILE_NONPCRE_MASK) +#define G_REGEX_COMPILE_NONPCRE_MASK (G_REGEX_RAW | \ + G_REGEX_OPTIMIZE) /* Mask of all the possible values for GRegexMatchFlags. */ -#define G_REGEX_MATCH_MASK (G_REGEX_MATCH_ANCHORED | \ - G_REGEX_MATCH_NOTBOL | \ - G_REGEX_MATCH_NOTEOL | \ - G_REGEX_MATCH_NOTEMPTY | \ - G_REGEX_MATCH_PARTIAL | \ - G_REGEX_MATCH_NEWLINE_CR | \ - G_REGEX_MATCH_NEWLINE_LF | \ - G_REGEX_MATCH_NEWLINE_CRLF | \ - G_REGEX_MATCH_NEWLINE_ANY) +#define G_REGEX_MATCH_MASK (G_REGEX_MATCH_ANCHORED | \ + G_REGEX_MATCH_NOTBOL | \ + G_REGEX_MATCH_NOTEOL | \ + G_REGEX_MATCH_NOTEMPTY | \ + G_REGEX_MATCH_PARTIAL | \ + G_REGEX_MATCH_NEWLINE_CR | \ + G_REGEX_MATCH_NEWLINE_LF | \ + G_REGEX_MATCH_NEWLINE_CRLF | \ + G_REGEX_MATCH_NEWLINE_ANY | \ + G_REGEX_MATCH_NEWLINE_ANYCRLF | \ + G_REGEX_MATCH_BSR_ANYCRLF | \ + G_REGEX_MATCH_BSR_ANY | \ + G_REGEX_MATCH_PARTIAL_SOFT | \ + G_REGEX_MATCH_PARTIAL_HARD | \ + G_REGEX_MATCH_NOTEMPTY_ATSTART) + +/* we rely on these flags having the same values */ +G_STATIC_ASSERT (G_REGEX_CASELESS == PCRE_CASELESS); +G_STATIC_ASSERT (G_REGEX_MULTILINE == PCRE_MULTILINE); +G_STATIC_ASSERT (G_REGEX_DOTALL == PCRE_DOTALL); +G_STATIC_ASSERT (G_REGEX_EXTENDED == PCRE_EXTENDED); +G_STATIC_ASSERT (G_REGEX_ANCHORED == PCRE_ANCHORED); +G_STATIC_ASSERT (G_REGEX_DOLLAR_ENDONLY == PCRE_DOLLAR_ENDONLY); +G_STATIC_ASSERT (G_REGEX_UNGREEDY == PCRE_UNGREEDY); +G_STATIC_ASSERT (G_REGEX_NO_AUTO_CAPTURE == PCRE_NO_AUTO_CAPTURE); +G_STATIC_ASSERT (G_REGEX_FIRSTLINE == PCRE_FIRSTLINE); +G_STATIC_ASSERT (G_REGEX_DUPNAMES == PCRE_DUPNAMES); +G_STATIC_ASSERT (G_REGEX_NEWLINE_CR == PCRE_NEWLINE_CR); +G_STATIC_ASSERT (G_REGEX_NEWLINE_LF == PCRE_NEWLINE_LF); +G_STATIC_ASSERT (G_REGEX_NEWLINE_CRLF == PCRE_NEWLINE_CRLF); +G_STATIC_ASSERT (G_REGEX_NEWLINE_ANYCRLF == PCRE_NEWLINE_ANYCRLF); +G_STATIC_ASSERT (G_REGEX_BSR_ANYCRLF == PCRE_BSR_ANYCRLF); +G_STATIC_ASSERT (G_REGEX_JAVASCRIPT_COMPAT == PCRE_JAVASCRIPT_COMPAT); + +G_STATIC_ASSERT (G_REGEX_MATCH_ANCHORED == PCRE_ANCHORED); +G_STATIC_ASSERT (G_REGEX_MATCH_NOTBOL == PCRE_NOTBOL); +G_STATIC_ASSERT (G_REGEX_MATCH_NOTEOL == PCRE_NOTEOL); +G_STATIC_ASSERT (G_REGEX_MATCH_NOTEMPTY == PCRE_NOTEMPTY); +G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL == PCRE_PARTIAL); +G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_CR == PCRE_NEWLINE_CR); +G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_LF == PCRE_NEWLINE_LF); +G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_CRLF == PCRE_NEWLINE_CRLF); +G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_ANY == PCRE_NEWLINE_ANY); +G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_ANYCRLF == PCRE_NEWLINE_ANYCRLF); +G_STATIC_ASSERT (G_REGEX_MATCH_BSR_ANYCRLF == PCRE_BSR_ANYCRLF); +G_STATIC_ASSERT (G_REGEX_MATCH_BSR_ANY == PCRE_BSR_UNICODE); +G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL_SOFT == PCRE_PARTIAL_SOFT); +G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL_HARD == PCRE_PARTIAL_HARD); +G_STATIC_ASSERT (G_REGEX_MATCH_NOTEMPTY_ATSTART == PCRE_NOTEMPTY_ATSTART); + +/* These PCRE flags are unused or not exposed publically in GRegexFlags, so + * it should be ok to reuse them for different things. + */ +G_STATIC_ASSERT (G_REGEX_OPTIMIZE == PCRE_NO_UTF8_CHECK); +G_STATIC_ASSERT (G_REGEX_RAW == PCRE_UTF8); /* if the string is in UTF-8 use g_utf8_ functions, else use * use just +/- 1. */ -#define NEXT_CHAR(re, s) (((re)->compile_opts & PCRE_UTF8) ? \ - g_utf8_next_char (s) : \ - ((s) + 1)) -#define PREV_CHAR(re, s) (((re)->compile_opts & PCRE_UTF8) ? \ - g_utf8_prev_char (s) : \ - ((s) - 1)) +#define NEXT_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? \ + ((s) + 1) : \ + g_utf8_next_char (s)) +#define PREV_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? \ + ((s) - 1) : \ + g_utf8_prev_char (s)) struct _GMatchInfo { + volatile gint ref_count; /* the ref count */ GRegex *regex; /* the regex */ GRegexMatchFlags match_opts; /* options used at match time on the regex */ gint matches; /* number of matching sub patterns */ gint pos; /* position in the string where last match left off */ + gint n_offsets; /* number of offsets */ gint *offsets; /* array of offsets paired 0,1 ; 2,3 ; 3,4 etc */ - gint n_offsets; /* number of offsets */ gint *workspace; /* workspace for pcre_dfa_exec() */ gint n_workspace; /* number of workspace elements */ const gchar *string; /* string passed to the match function */ @@ -222,10 +296,14 @@ match_error (gint errcode) case PCRE_ERROR_DFA_RECURSE: case PCRE_ERROR_RECURSIONLIMIT: return _("recursion limit reached"); - case PCRE_ERROR_NULLWSLIMIT: - return _("workspace limit for empty substrings reached"); case PCRE_ERROR_BADNEWLINE: return _("invalid combination of newline flags"); + case PCRE_ERROR_BADOFFSET: + return _("bad offset"); + case PCRE_ERROR_SHORTUTF8: + return _("short utf8"); + case PCRE_ERROR_RECURSELOOP: + return _("recursion loop"); default: break; } @@ -255,14 +333,7 @@ translate_compile_error (gint *errcode, const gchar **errmsg) *errmsg = _("\\c at end of pattern"); break; case G_REGEX_ERROR_UNRECOGNIZED_ESCAPE: - *errmsg = _("unrecognized character follows \\"); - break; - case 137: - /* A number of Perl escapes are not handled by PCRE. - * Therefore it explicitly raises ERR37. - */ - *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE; - *errmsg = _("case-changing escapes (\\l, \\L, \\u, \\U) are not allowed here"); + *errmsg = _("unrecognized character following \\"); break; case G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER: *errmsg = _("numbers out of order in {} quantifier"); @@ -282,16 +353,12 @@ translate_compile_error (gint *errcode, const gchar **errmsg) case G_REGEX_ERROR_NOTHING_TO_REPEAT: *errmsg = _("nothing to repeat"); break; - case G_REGEX_ERROR_UNRECOGNIZED_CHARACTER: - *errmsg = _("unrecognized character after (?"); - break; - case 124: - *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER; - *errmsg = _("unrecognized character after (?<"); + case 111: /* internal error: unexpected repeat */ + *errcode = G_REGEX_ERROR_INTERNAL; + *errmsg = _("unexpected repeat"); break; - case 141: - *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER; - *errmsg = _("unrecognized character after (?P"); + case G_REGEX_ERROR_UNRECOGNIZED_CHARACTER: + *errmsg = _("unrecognized character after (? or (?-"); break; case G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS: *errmsg = _("POSIX named classes are supported only within a class"); @@ -299,17 +366,6 @@ translate_compile_error (gint *errcode, const gchar **errmsg) case G_REGEX_ERROR_UNMATCHED_PARENTHESIS: *errmsg = _("missing terminating )"); break; - case 122: - *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS; - *errmsg = _(") without opening ("); - break; - case 129: - *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS; - /* translators: '(?R' and '(?[+-]digits' are both meant as (groups of) - * sequences here, '(?-54' would be an example for the second group. - */ - *errmsg = _("(?R or (?[+-]digits must be followed by )"); - break; case G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE: *errmsg = _("reference to non-existent subpattern"); break; @@ -317,11 +373,23 @@ translate_compile_error (gint *errcode, const gchar **errmsg) *errmsg = _("missing ) after comment"); break; case G_REGEX_ERROR_EXPRESSION_TOO_LARGE: - *errmsg = _("regular expression too large"); + *errmsg = _("regular expression is too large"); break; case G_REGEX_ERROR_MEMORY_ERROR: *errmsg = _("failed to get memory"); break; + case 122: /* unmatched parentheses */ + *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS; + *errmsg = _(") without opening ("); + break; + case 123: /* internal error: code overflow */ + *errcode = G_REGEX_ERROR_INTERNAL; + *errmsg = _("code overflow"); + break; + case 124: /* "unrecognized character after (?<\0 */ + *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER; + *errmsg = _("unrecognized character after (?<"); + break; case G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND: *errmsg = _("lookbehind assertion is not fixed length"); break; @@ -334,6 +402,13 @@ translate_compile_error (gint *errcode, const gchar **errmsg) case G_REGEX_ERROR_ASSERTION_EXPECTED: *errmsg = _("assertion expected after (?("); break; + case 129: + *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS; + /* translators: '(?R' and '(?[+-]digits' are both meant as (groups of) + * sequences here, '(?-54' would be an example for the second group. + */ + *errmsg = _("(?R or (?[+-]digits must be followed by )"); + break; case G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME: *errmsg = _("unknown POSIX class name"); break; @@ -349,9 +424,20 @@ translate_compile_error (gint *errcode, const gchar **errmsg) case G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND: *errmsg = _("\\C not allowed in lookbehind assertion"); break; + case 137: /* PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0 */ + /* A number of Perl escapes are not handled by PCRE. + * Therefore it explicitly raises ERR37. + */ + *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE; + *errmsg = _("escapes \\L, \\l, \\N{name}, \\U, and \\u are not supported"); + break; case G_REGEX_ERROR_INFINITE_LOOP: *errmsg = _("recursive call could loop indefinitely"); break; + case 141: /* unrecognized character after (?P\0 */ + *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER; + *errmsg = _("unrecognized character after (?P"); + break; case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR: *errmsg = _("missing terminator in subpattern name"); break; @@ -373,54 +459,93 @@ translate_compile_error (gint *errcode, const gchar **errmsg) case G_REGEX_ERROR_INVALID_OCTAL_VALUE: *errmsg = _("octal value is greater than \\377"); break; + case 152: /* internal error: overran compiling workspace */ + *errcode = G_REGEX_ERROR_INTERNAL; + *errmsg = _("overran compiling workspace"); + break; + case 153: /* internal error: previously-checked referenced subpattern not found */ + *errcode = G_REGEX_ERROR_INTERNAL; + *errmsg = _("previously-checked referenced subpattern not found"); + break; case G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE: *errmsg = _("DEFINE group contains more than one branch"); break; - case G_REGEX_ERROR_DEFINE_REPETION: - *errmsg = _("repeating a DEFINE group is not allowed"); - break; case G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS: *errmsg = _("inconsistent NEWLINE options"); break; case G_REGEX_ERROR_MISSING_BACK_REFERENCE: - *errmsg = _("\\g is not followed by a braced name or an optionally " - "braced non-zero number"); + *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or " + "number, or by a plain number"); break; - case 11: - *errcode = G_REGEX_ERROR_INTERNAL; - *errmsg = _("unexpected repeat"); + case G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE: + *errmsg = _("a numbered reference must not be zero"); break; - case 23: - *errcode = G_REGEX_ERROR_INTERNAL; - *errmsg = _("code overflow"); + case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN: + *errmsg = _("an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)"); break; - case 52: - *errcode = G_REGEX_ERROR_INTERNAL; - *errmsg = _("overran compiling workspace"); + case G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB: + *errmsg = _("(*VERB) not recognized"); break; - case 53: - *errcode = G_REGEX_ERROR_INTERNAL; - *errmsg = _("previously-checked referenced subpattern not found"); + case G_REGEX_ERROR_NUMBER_TOO_BIG: + *errmsg = _("number is too big"); break; - case 16: + case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME: + *errmsg = _("missing subpattern name after (?&"); + break; + case G_REGEX_ERROR_MISSING_DIGIT: + *errmsg = _("digit expected after (?+"); + break; + case G_REGEX_ERROR_INVALID_DATA_CHARACTER: + *errmsg = _("] is an invalid data character in JavaScript compatibility mode"); + break; + case G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME: + *errmsg = _("different names for subpatterns of the same number are not allowed"); + break; + case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED: + *errmsg = _("(*MARK) must have an argument"); + break; + case G_REGEX_ERROR_INVALID_CONTROL_CHAR: + *errmsg = _( "\\c must be followed by an ASCII character"); + break; + case G_REGEX_ERROR_MISSING_NAME: + *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name"); + break; + case G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS: + *errmsg = _("\\N is not supported in a class"); + break; + case G_REGEX_ERROR_TOO_MANY_FORWARD_REFERENCES: + *errmsg = _("too many forward references"); + break; + case G_REGEX_ERROR_NAME_TOO_LONG: + *errmsg = _("name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)"); + break; + case G_REGEX_ERROR_CHARACTER_VALUE_TOO_LARGE: + *errmsg = _("character value in \\u.... sequence is too large"); + break; + + case 116: /* erroffset passed as NULL */ /* This should not happen as we never pass a NULL erroffset */ g_warning ("erroffset passed as NULL"); *errcode = G_REGEX_ERROR_COMPILE; break; - case 17: + case 117: /* unknown option bit(s) set */ /* This should not happen as we check options before passing them * to pcre_compile2() */ g_warning ("unknown option bit(s) set"); *errcode = G_REGEX_ERROR_COMPILE; break; - case 32: - case 44: - case 45: - /* These errors should not happen as we are using an UTF8-enabled PCRE + case 132: /* this version of PCRE is compiled without UTF support */ + case 144: /* invalid UTF-8 string */ + case 145: /* support for \\P, \\p, and \\X has not been compiled */ + case 167: /* this version of PCRE is not compiled with Unicode property support */ + case 173: /* disallowed Unicode code point (>= 0xd800 && <= 0xdfff) */ + case 174: /* invalid UTF-16 string */ + /* These errors should not happen as we are using an UTF-8 and UCP-enabled PCRE * and we do not check if strings are valid */ - g_warning ("%s", *errmsg); - *errcode = G_REGEX_ERROR_COMPILE; + case 170: /* internal error: unknown opcode in find_fixedlength() */ + *errcode = G_REGEX_ERROR_INTERNAL; break; + default: *errcode = G_REGEX_ERROR_COMPILE; } @@ -442,6 +567,7 @@ match_info_new (const GRegex *regex, string_len = strlen (string); match_info = g_new0 (GMatchInfo, 1); + match_info->ref_count = 1; match_info->regex = g_regex_ref ((GRegex *)regex); match_info->string = string; match_info->string_len = string_len; @@ -512,17 +638,36 @@ g_match_info_get_string (const GMatchInfo *match_info) } /** - * g_match_info_free: + * g_match_info_ref: * @match_info: a #GMatchInfo * - * Frees all the memory associated with the #GMatchInfo structure. + * Increases reference count of @match_info by 1. * - * Since: 2.14 + * Returns: @match_info + * + * Since: 2.30 + */ +GMatchInfo * +g_match_info_ref (GMatchInfo *match_info) +{ + g_return_val_if_fail (match_info != NULL, NULL); + g_atomic_int_inc (&match_info->ref_count); + return match_info; +} + +/** + * g_match_info_unref: + * @match_info: a #GMatchInfo + * + * Decreases reference count of @match_info by 1. When reference count drops + * to zero, it frees all the memory associated with the match_info structure. + * + * Since: 2.30 */ void -g_match_info_free (GMatchInfo *match_info) +g_match_info_unref (GMatchInfo *match_info) { - if (match_info) + if (g_atomic_int_dec_and_test (&match_info->ref_count)) { g_regex_unref (match_info->regex); g_free (match_info->offsets); @@ -532,9 +677,27 @@ g_match_info_free (GMatchInfo *match_info) } /** + * g_match_info_free: + * @match_info: (allow-none): a #GMatchInfo, or %NULL + * + * If @match_info is not %NULL, calls g_match_info_unref(); otherwise does + * nothing. + * + * Since: 2.14 + */ +void +g_match_info_free (GMatchInfo *match_info) +{ + if (match_info == NULL) + return; + + g_match_info_unref (match_info); +} + +/** * g_match_info_next: * @match_info: a #GMatchInfo structure - * @error: location to store the error occuring, or %NULL to ignore errors + * @error: location to store the error occurring, or %NULL to ignore errors * * Scans for the next match using the same parameters of the previous * call to g_regex_match_full() or g_regex_match() that returned @@ -561,6 +724,14 @@ g_match_info_next (GMatchInfo *match_info, prev_match_start = match_info->offsets[0]; prev_match_end = match_info->offsets[1]; + if (match_info->pos > match_info->string_len) + { + /* we have reached the end of the string */ + match_info->pos = -1; + match_info->matches = PCRE_ERROR_NOMATCH; + return FALSE; + } + match_info->matches = pcre_exec (match_info->regex->pcre_re, match_info->regex->extra, match_info->string, @@ -690,24 +861,25 @@ g_match_info_get_match_count (const GMatchInfo *match_info) * able to raise an error as soon as a mistake is made. * * GRegex supports the concept of partial matching by means of the - * #G_REGEX_MATCH_PARTIAL flag. When this is set the return code for + * #G_REGEX_MATCH_PARTIAL_SOFT and #G_REGEX_MATCH_PARTIAL_HARD flags. + * When they are used, the return code for * g_regex_match() or g_regex_match_full() is, as usual, %TRUE * for a complete match, %FALSE otherwise. But, when these functions * return %FALSE, you can check if the match was partial calling * g_match_info_is_partial_match(). * - * When using partial matching you cannot use g_match_info_fetch*(). + * The difference between #G_REGEX_MATCH_PARTIAL_SOFT and + * #G_REGEX_MATCH_PARTIAL_HARD is that when a partial match is encountered + * with #G_REGEX_MATCH_PARTIAL_SOFT, matching continues to search for a + * possible complete match, while with #G_REGEX_MATCH_PARTIAL_HARD matching + * stops at the partial match. + * When both #G_REGEX_MATCH_PARTIAL_SOFT and #G_REGEX_MATCH_PARTIAL_HARD + * are set, the latter takes precedence. + * + * There were formerly some restrictions on the pattern for partial matching. + * The restrictions no longer apply. * - * Because of the way certain internal optimizations are implemented - * the partial matching algorithm cannot be used with all patterns. - * So repeated single characters such as "a{2,4}" and repeated single - * meta-sequences such as "\d+" are not permitted if the maximum number - * of occurrences is greater than one. Optional items such as "\d?" - * (where the maximum is one) are permitted. Quantifiers with any values - * are permitted after parentheses, so the invalid examples above can be - * coded thus "(a){2,4}" and "(\d)+". If #G_REGEX_MATCH_PARTIAL is set - * for a pattern that does not conform to the restrictions, matching - * functions return an error. + * See pcrepartial(3) for more information on partial matching. * * Returns: %TRUE if the match was partial, %FALSE otherwise * @@ -723,9 +895,9 @@ g_match_info_is_partial_match (const GMatchInfo *match_info) /** * g_match_info_expand_references: - * @match_info: a #GMatchInfo or %NULL + * @match_info: (allow-none): a #GMatchInfo or %NULL * @string_to_expand: the string to expand - * @error: location to store the error occuring, or %NULL to ignore errors + * @error: location to store the error occurring, or %NULL to ignore errors * * Returns a new string containing the text in @string_to_expand with * references and escape sequences expanded. References refer to the last @@ -779,8 +951,7 @@ g_match_info_expand_references (const GMatchInfo *match_info, result = g_string_sized_new (strlen (string_to_expand)); interpolate_replacement (match_info, result, list); - g_list_foreach (list, (GFunc)free_interpolation_data, NULL); - g_list_free (list); + g_list_free_full (list, (GDestroyNotify) free_interpolation_data); return g_string_free (result, FALSE); } @@ -790,7 +961,7 @@ g_match_info_expand_references (const GMatchInfo *match_info, * @match_info: #GMatchInfo structure * @match_num: number of the sub expression * - * Retrieves the text matching the @match_num'th capturing + * Retrieves the text matching the @match_num'th capturing * parentheses. 0 is the full text of the match, 1 is the first paren * set, 2 the second, and so on. * @@ -845,7 +1016,7 @@ g_match_info_fetch (const GMatchInfo *match_info, * @end_pos: (out) (allow-none): pointer to location where to store * the end position, or %NULL * - * Retrieves the position in bytes of the @match_num'th capturing + * Retrieves the position in bytes of the @match_num'th capturing * parentheses. 0 is the full text of the match, 1 is the first * paren set, 2 the second, and so on. * @@ -932,7 +1103,7 @@ get_matched_substring_number (const GMatchInfo *match_info, * Retrieves the text matching the capturing parentheses named @name. * * If @name is a valid sub pattern name but it didn't match anything - * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b") + * (e.g. sub pattern "X", matching "b" against "(?Pa)?b") * then an empty string is returned. * * The string is fetched from the string passed to the match function, @@ -973,7 +1144,7 @@ g_match_info_fetch_named (const GMatchInfo *match_info, * Retrieves the position in bytes of the capturing parentheses named @name. * * If @name is a valid sub pattern name but it didn't match anything - * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b") + * (e.g. sub pattern "X", matching "b" against "(?Pa)?b") * then @start_pos and @end_pos are set to -1 and %TRUE is returned. * * Returns: %TRUE if the position was fetched, %FALSE otherwise. @@ -1021,9 +1192,9 @@ g_match_info_fetch_named_pos (const GMatchInfo *match_info, * The strings are fetched from the string passed to the match function, * so you cannot call this function after freeing the string. * - * Returns: (allow-none): a %NULL-terminated array of gchar * pointers. - * It must be freed using g_strfreev(). If the previous match failed - * %NULL is returned + * Returns: (transfer full): a %NULL-terminated array of gchar * + * pointers. It must be freed using g_strfreev(). If the previous + * match failed %NULL is returned * * Since: 2.14 */ @@ -1051,16 +1222,7 @@ g_match_info_fetch_all (const GMatchInfo *match_info) /* GRegex */ -GQuark -g_regex_error_quark (void) -{ - static GQuark error_quark = 0; - - if (error_quark == 0) - error_quark = g_quark_from_static_string ("g-regex-error-quark"); - - return error_quark; -} +G_DEFINE_QUARK (g-regex-error-quark, g_regex_error) /** * g_regex_ref: @@ -1094,7 +1256,7 @@ g_regex_unref (GRegex *regex) { g_return_if_fail (regex != NULL); - if (g_atomic_int_exchange_and_add (®ex->ref_count, -1) - 1 == 0) + if (g_atomic_int_dec_and_test (®ex->ref_count)) { g_free (regex->pattern); if (regex->pcre_re != NULL) @@ -1132,40 +1294,39 @@ g_regex_new (const gchar *pattern, gint erroffset; gint errcode; gboolean optimize = FALSE; - static gboolean initialized = FALSE; + static volatile gsize initialised = 0; unsigned long int pcre_compile_options; + GRegexCompileFlags nonpcre_compile_options; g_return_val_if_fail (pattern != NULL, NULL); g_return_val_if_fail (error == NULL || *error == NULL, NULL); g_return_val_if_fail ((compile_options & ~G_REGEX_COMPILE_MASK) == 0, NULL); g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); - if (!initialized) + if (g_once_init_enter (&initialised)) { - gint support; - const gchar *msg; + int supports_utf8, supports_ucp; - pcre_config (PCRE_CONFIG_UTF8, &support); - if (!support) - { - msg = N_("PCRE library is compiled without UTF8 support"); - g_critical ("%s", msg); - g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg)); - return NULL; - } + pcre_config (PCRE_CONFIG_UTF8, &supports_utf8); + if (!supports_utf8) + g_critical (_("PCRE library is compiled without UTF8 support")); - pcre_config (PCRE_CONFIG_UNICODE_PROPERTIES, &support); - if (!support) - { - msg = N_("PCRE library is compiled without UTF8 properties support"); - g_critical ("%s", msg); - g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg)); - return NULL; - } + pcre_config (PCRE_CONFIG_UNICODE_PROPERTIES, &supports_ucp); + if (!supports_ucp) + g_critical (_("PCRE library is compiled without UTF8 properties support")); - initialized = TRUE; + g_once_init_leave (&initialised, supports_utf8 && supports_ucp ? 1 : 2); } + if (G_UNLIKELY (initialised != 1)) + { + g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, + _("PCRE library is compiled with incompatible options")); + return NULL; + } + + nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK; + /* G_REGEX_OPTIMIZE has the same numeric value of PCRE_NO_UTF8_CHECK, * as we do not need to wrap PCRE_NO_UTF8_CHECK. */ if (compile_options & G_REGEX_OPTIMIZE) @@ -1193,6 +1354,14 @@ g_regex_new (const gchar *pattern, compile_options |= PCRE_NEWLINE_ANY; } + compile_options |= PCRE_UCP; + + /* PCRE_BSR_UNICODE is the default for the internal PCRE but + * possibly not for the system one. + */ + if (~compile_options & G_REGEX_BSR_ANYCRLF) + compile_options |= PCRE_BSR_UNICODE; + /* compile the pattern */ re = pcre_compile2 (pattern, compile_options, &errcode, &errmsg, &erroffset, NULL); @@ -1223,7 +1392,13 @@ g_regex_new (const gchar *pattern, * compile options, e.g. "(?i)foo" will make the pcre structure store * PCRE_CASELESS even though it wasn't explicitly given for compilation. */ pcre_fullinfo (re, NULL, PCRE_INFO_OPTIONS, &pcre_compile_options); - compile_options = pcre_compile_options; + compile_options = pcre_compile_options & G_REGEX_COMPILE_PCRE_MASK; + + /* Don't leak PCRE_NEWLINE_ANY, which is part of PCRE_NEWLINE_ANYCRLF */ + if ((pcre_compile_options & PCRE_NEWLINE_ANYCRLF) != PCRE_NEWLINE_ANYCRLF) + compile_options &= ~PCRE_NEWLINE_ANY; + + compile_options |= nonpcre_compile_options; if (!(compile_options & G_REGEX_DUPNAMES)) { @@ -1325,6 +1500,50 @@ g_regex_get_capture_count (const GRegex *regex) } /** + * g_regex_get_has_cr_or_lf: + * @regex: a #GRegex structure + * + * Checks whether the pattern contains explicit CR or LF references. + * + * Returns: %TRUE if the pattern contains explicit CR or LF references + * + * Since: 2.34 + */ +gboolean +g_regex_get_has_cr_or_lf (const GRegex *regex) +{ + gint value; + + pcre_fullinfo (regex->pcre_re, regex->extra, + PCRE_INFO_HASCRORLF, &value); + + return !!value; +} + +/** + * g_regex_get_max_lookbehind: + * @regex: a #GRegex structure + * + * Gets the number of characters in the longest lookbehind assertion in the + * pattern. This information is useful when doing multi-segment matching using + * the partial matching facilities. + * + * Returns: the number of characters in the longest lookbehind assertion. + * + * Since: 2.38 + */ +gint +g_regex_get_max_lookbehind (const GRegex *regex) +{ + gint max_lookbehind; + + pcre_fullinfo (regex->pcre_re, regex->extra, + PCRE_INFO_MAXLOOKBEHIND, &max_lookbehind); + + return max_lookbehind; +} + +/** * g_regex_get_compile_flags: * @regex: a #GRegex * @@ -1357,7 +1576,7 @@ g_regex_get_match_flags (const GRegex *regex) { g_return_val_if_fail (regex != NULL, 0); - return regex->match_opts; + return regex->match_opts & G_REGEX_MATCH_MASK; } /** @@ -1420,16 +1639,16 @@ g_regex_match_simple (const gchar *pattern, * To retrieve all the non-overlapping matches of the pattern in * string you can use g_match_info_next(). * - * |[ + * |[ * static void * print_uppercase_words (const gchar *string) * { - * /* Print all uppercase-only words. */ + * // Print all uppercase-only words. * GRegex *regex; * GMatchInfo *match_info; - *   + * * regex = g_regex_new ("[A-Z]+", 0, 0, NULL); - * g_regex_match (regex, string, 0, &match_info); + * g_regex_match (regex, string, 0, &match_info); * while (g_match_info_matches (match_info)) * { * gchar *word = g_match_info_fetch (match_info, 0); @@ -1469,7 +1688,7 @@ g_regex_match (const GRegex *regex, * @match_options: match options * @match_info: (out) (allow-none): pointer to location where to store * the #GMatchInfo, or %NULL if you do not need it - * @error: location to store the error occuring, or %NULL to ignore errors + * @error: location to store the error occurring, or %NULL to ignore errors * * Scans for a match in string for the pattern in @regex. * The @match_options are combined with the match options specified @@ -1493,23 +1712,23 @@ g_regex_match (const GRegex *regex, * To retrieve all the non-overlapping matches of the pattern in * string you can use g_match_info_next(). * - * |[ + * |[ * static void * print_uppercase_words (const gchar *string) * { - * /* Print all uppercase-only words. */ + * // Print all uppercase-only words. * GRegex *regex; * GMatchInfo *match_info; * GError *error = NULL; - *   + * * regex = g_regex_new ("[A-Z]+", 0, 0, NULL); - * g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error); + * g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error); * while (g_match_info_matches (match_info)) * { * gchar *word = g_match_info_fetch (match_info, 0); * g_print ("Found: %s\n", word); * g_free (word); - * g_match_info_next (match_info, &error); + * g_match_info_next (match_info, &error); * } * g_match_info_free (match_info); * g_regex_unref (regex); @@ -1600,20 +1819,20 @@ g_regex_match_all (const GRegex *regex, * @match_options: match options * @match_info: (out) (allow-none): pointer to location where to store * the #GMatchInfo, or %NULL if you do not need it - * @error: location to store the error occuring, or %NULL to ignore errors + * @error: location to store the error occurring, or %NULL to ignore errors * * Using the standard algorithm for regular expression matching only - * the longest match in the string is retrieved, it is not possibile + * the longest match in the string is retrieved, it is not possible * to obtain all the available matches. For instance matching - * "<a> <b> <c>" against the pattern "<.*>" - * you get "<a> <b> <c>". + * " " against the pattern "<.*>" + * you get " ". * * This function uses a different algorithm (called DFA, i.e. deterministic * finite automaton), so it can retrieve all the possible matches, all * starting at the same point in the string. For instance matching - * "<a> <b> <c>" against the pattern "<.*>" - * you would obtain three matches: "<a> <b> <c>", - * "<a> <b>" and "<a>". + * " " against the pattern "<.*>;" + * you would obtain three matches: " ", + * " " and "". * * The number of matched strings is retrieved using * g_match_info_get_match_count(). To obtain the matched strings and @@ -1772,7 +1991,8 @@ g_regex_get_string_number (const GRegex *regex, * characters. For example splitting "ab c" using as a separator * "\s*", you will get "a", "b" and "c". * - * Returns: a %NULL-terminated array of strings. Free it using g_strfreev() + * Returns: (transfer full): a %NULL-terminated array of strings. Free + * it using g_strfreev() * * Since: 2.14 **/ @@ -1788,6 +2008,7 @@ g_regex_split_simple (const gchar *pattern, regex = g_regex_new (pattern, compile_options, 0, NULL); if (!regex) return NULL; + result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL); g_regex_unref (regex); return result; @@ -1817,7 +2038,8 @@ g_regex_split_simple (const gchar *pattern, * For example splitting "ab c" using as a separator "\s*", you will get * "a", "b" and "c". * - * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev() + * Returns: (transfer full): a %NULL-terminated gchar ** array. Free + * it using g_strfreev() * * Since: 2.14 **/ @@ -1863,7 +2085,8 @@ g_regex_split (const GRegex *regex, * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern * that begins with any kind of lookbehind assertion, such as "\b". * - * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev() + * Returns: (transfer full): a %NULL-terminated gchar ** array. Free + * it using g_strfreev() * * Since: 2.14 **/ @@ -1920,6 +2143,7 @@ g_regex_split_full (const GRegex *regex, match_ok = g_regex_match_full (regex, string, string_len, start_position, match_options, &match_info, &tmp_error); + while (tmp_error == NULL) { if (match_ok) @@ -2002,8 +2226,7 @@ g_regex_split_full (const GRegex *regex, if (tmp_error != NULL) { g_propagate_error (error, tmp_error); - g_list_foreach (list, (GFunc)g_free, NULL); - g_list_free (list); + g_list_free_full (list, g_free); match_info->pos = -1; return NULL; } @@ -2325,8 +2548,7 @@ split_replacement (const gchar *replacement, start = p = expand_escape (replacement, p, data, error); if (p == NULL) { - g_list_foreach (list, (GFunc)free_interpolation_data, NULL); - g_list_free (list); + g_list_free_full (list, (GDestroyNotify) free_interpolation_data); free_interpolation_data (data); return NULL; @@ -2468,44 +2690,24 @@ interpolation_list_needs_match (GList *list) * @start_position: starting index of the string to match * @replacement: text to replace each match with * @match_options: options for the match - * @error: location to store the error occuring, or %NULL to ignore errors + * @error: location to store the error occurring, or %NULL to ignore errors * * Replaces all occurrences of the pattern in @regex with the * replacement text. Backreferences of the form '\number' or - * '\g<number>' in the replacement text are interpolated by the - * number-th captured subexpression of the match, '\g<name>' refers - * to the captured subexpression with the given name. '\0' refers to the - * complete match, but '\0' followed by a number is the octal representation - * of a character. To include a literal '\' in the replacement, write '\\'. + * '\g' in the replacement text are interpolated by the + * number-th captured subexpression of the match, '\g' refers + * to the captured subexpression with the given name. '\0' refers + * to the complete match, but '\0' followed by a number is the octal + * representation of a character. To include a literal '\' in the + * replacement, write '\\'. + * * There are also escapes that changes the case of the following text: * - * - * \l - * - * Convert to lower case the next character - * - * - * \u - * - * Convert to upper case the next character - * - * - * \L - * - * Convert to lower case till \E - * - * - * \U - * - * Convert to upper case till \E - * - * - * \E - * - * End case modification - * - * - * + * - \l: Convert to lower case the next character + * - \u: Convert to upper case the next character + * - \L: Convert to lower case till \E + * - \U: Convert to upper case till \E + * - \E: End case modification * * If you do not need to use backreferences use g_regex_replace_literal(). * @@ -2557,8 +2759,7 @@ g_regex_replace (const GRegex *regex, if (tmp_error != NULL) g_propagate_error (error, tmp_error); - g_list_foreach (list, (GFunc)free_interpolation_data, NULL); - g_list_free (list); + g_list_free_full (list, (GDestroyNotify) free_interpolation_data); return result; } @@ -2580,7 +2781,7 @@ literal_replacement (const GMatchInfo *match_info, * @start_position: starting index of the string to match * @replacement: text to replace each match with * @match_options: options for the match - * @error: location to store the error occuring, or %NULL to ignore errors + * @error: location to store the error occurring, or %NULL to ignore errors * * Replaces all occurrences of the pattern in @regex with the * replacement text. @replacement is replaced literally, to @@ -2624,7 +2825,7 @@ g_regex_replace_literal (const GRegex *regex, * @match_options: options for the match * @eval: a function to call for each match * @user_data: user data to pass to the function - * @error: location to store the error occuring, or %NULL to ignore errors + * @error: location to store the error occurring, or %NULL to ignore errors * * Replaces occurrences of the pattern in regex with the output of * @eval for that occurrence. @@ -2635,7 +2836,7 @@ g_regex_replace_literal (const GRegex *regex, * * The following example uses g_regex_replace_eval() to replace multiple * strings at once: - * |[ + * |[ * static gboolean * eval_cb (const GMatchInfo *info, * GString *res, @@ -2652,7 +2853,7 @@ g_regex_replace_literal (const GRegex *regex, * return FALSE; * } * - * /* ... */ + * ... * * GRegex *reg; * GHashTable *h; @@ -2669,7 +2870,7 @@ g_regex_replace_literal (const GRegex *regex, * res = g_regex_replace_eval (reg, text, -1, 0, 0, eval_cb, h, NULL); * g_hash_table_destroy (h); * - * /* ... */ + * ... * ]| * * Returns: a newly allocated string containing the replacements @@ -2767,13 +2968,80 @@ g_regex_check_replacement (const gchar *replacement, if (has_references) *has_references = interpolation_list_needs_match (list); - g_list_foreach (list, (GFunc) free_interpolation_data, NULL); - g_list_free (list); + g_list_free_full (list, (GDestroyNotify) free_interpolation_data); return TRUE; } /** + * g_regex_escape_nul: + * @string: the string to escape + * @length: the length of @string + * + * Escapes the nul characters in @string to "\x00". It can be used + * to compile a regex with embedded nul characters. + * + * For completeness, @length can be -1 for a nul-terminated string. + * In this case the output string will be of course equal to @string. + * + * Returns: a newly-allocated escaped string + * + * Since: 2.30 + */ +gchar * +g_regex_escape_nul (const gchar *string, + gint length) +{ + GString *escaped; + const gchar *p, *piece_start, *end; + gint backslashes; + + g_return_val_if_fail (string != NULL, NULL); + + if (length < 0) + return g_strdup (string); + + end = string + length; + p = piece_start = string; + escaped = g_string_sized_new (length + 1); + + backslashes = 0; + while (p < end) + { + switch (*p) + { + case '\0': + if (p != piece_start) + { + /* copy the previous piece. */ + g_string_append_len (escaped, piece_start, p - piece_start); + } + if ((backslashes & 1) == 0) + g_string_append_c (escaped, '\\'); + g_string_append_c (escaped, 'x'); + g_string_append_c (escaped, '0'); + g_string_append_c (escaped, '0'); + piece_start = ++p; + backslashes = 0; + break; + case '\\': + backslashes++; + ++p; + break; + default: + backslashes = 0; + p = g_utf8_next_char (p); + break; + } + } + + if (piece_start < end) + g_string_append_len (escaped, piece_start, end - piece_start); + + return g_string_free (escaped, FALSE); +} + +/** * g_regex_escape_string: * @string: (array length=length): the string to escape * @length: the length of @string, or -1 if @string is nul-terminated