1 /* GRegex -- regular expression API wrapper around PCRE.
3 * Copyright (C) 1999, 2000 Scott Wimer
4 * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com>
5 * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org>
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
30 #ifdef USE_SYSTEM_PCRE
33 #include "pcre/pcre.h"
36 /* PCRE 7.3 does not contain the definition of PCRE_ERROR_NULLWSLIMIT */
37 #ifndef PCRE_ERROR_NULLWSLIMIT
38 #define PCRE_ERROR_NULLWSLIMIT (-22)
43 /* Mask of all the possible values for GRegexCompileFlags. */
44 #define G_REGEX_COMPILE_MASK (G_REGEX_CASELESS | \
49 G_REGEX_DOLLAR_ENDONLY | \
52 G_REGEX_NO_AUTO_CAPTURE | \
55 G_REGEX_NEWLINE_CR | \
56 G_REGEX_NEWLINE_LF | \
59 /* Mask of all the possible values for GRegexMatchFlags. */
60 #define G_REGEX_MATCH_MASK (G_REGEX_MATCH_ANCHORED | \
61 G_REGEX_MATCH_NOTBOL | \
62 G_REGEX_MATCH_NOTEOL | \
63 G_REGEX_MATCH_NOTEMPTY | \
64 G_REGEX_MATCH_PARTIAL | \
65 G_REGEX_MATCH_NEWLINE_CR | \
66 G_REGEX_MATCH_NEWLINE_LF | \
67 G_REGEX_MATCH_NEWLINE_CRLF | \
68 G_REGEX_MATCH_NEWLINE_ANY)
70 /* if the string is in UTF-8 use g_utf8_ functions, else use
72 #define NEXT_CHAR(re, s) (((re)->compile_opts & PCRE_UTF8) ? \
73 g_utf8_next_char (s) : \
75 #define PREV_CHAR(re, s) (((re)->compile_opts & PCRE_UTF8) ? \
76 g_utf8_prev_char (s) : \
81 GRegex *regex; /* the regex; a reference is taken in match_info_new() */
82 GRegexMatchFlags match_opts; /* options used at match time on the regex */
83 gint matches; /* number of matching sub patterns, or a negative PCRE_ERROR_* code */
84 gint pos; /* position in the string where last match left off, -1 once the end was reached */
85 gint *offsets; /* array of offsets paired (start, end) as 0,1 ; 2,3 ; 4,5 etc */
86 gint n_offsets; /* number of elements in offsets */
87 gint *workspace; /* workspace for pcre_dfa_exec() */
88 gint n_workspace; /* number of workspace elements */
89 const gchar *string; /* string passed to the match function; not copied */
90 gssize string_len; /* length of string */
95 volatile gint ref_count; /* the ref count for the immutable part */
96 gchar *pattern; /* the pattern */
97 pcre *pcre_re; /* compiled form of the pattern */
98 GRegexCompileFlags compile_opts; /* options used at compile time on the pattern */
99 GRegexMatchFlags match_opts; /* options used at match time on the regex */
100 pcre_extra *extra; /* data stored when G_REGEX_OPTIMIZE is used */
103 /* TRUE if ret is an error code, FALSE otherwise. */
104 #define IS_PCRE_ERROR(ret) ((ret) < PCRE_ERROR_NOMATCH && (ret) != PCRE_ERROR_PARTIAL)
106 typedef struct _InterpolationData InterpolationData;
107 static gboolean interpolation_list_needs_match (GList *list);
108 static gboolean interpolate_replacement (const GMatchInfo *match_info,
111 static GList *split_replacement (const gchar *replacement,
113 static void free_interpolation_data (InterpolationData *data);
/* Maps a PCRE match-time error code to a message describing the failure.
 * g_match_info_next() embeds the returned text in the GError it sets. */
117 match_error (gint errcode)
121 case PCRE_ERROR_NOMATCH:
124 case PCRE_ERROR_NULL:
125 /* NULL argument, this should not happen in GRegex */
126 g_warning ("A NULL argument was passed to PCRE");
128 case PCRE_ERROR_BADOPTION:
/* NOTE(review): unlike the other messages this one is not wrapped in
 * _() -- confirm whether it should be translatable. */
129 return "bad options";
130 case PCRE_ERROR_BADMAGIC:
131 return _("corrupted object");
132 case PCRE_ERROR_UNKNOWN_OPCODE:
/* NOTE(review): N_() is used here where every sibling uses _();
 * presumably this only marks the string and returns it untranslated --
 * confirm. */
133 return N_("internal error or corrupted object");
134 case PCRE_ERROR_NOMEMORY:
135 return _("out of memory");
136 case PCRE_ERROR_NOSUBSTRING:
137 /* not used by pcre_exec() */
139 case PCRE_ERROR_MATCHLIMIT:
140 return _("backtracking limit reached");
141 case PCRE_ERROR_CALLOUT:
142 /* callouts are not implemented */
144 case PCRE_ERROR_BADUTF8:
145 case PCRE_ERROR_BADUTF8_OFFSET:
146 /* we do not check if strings are valid */
148 case PCRE_ERROR_PARTIAL:
151 case PCRE_ERROR_BADPARTIAL:
152 return _("the pattern contains items not supported for partial matching");
153 case PCRE_ERROR_INTERNAL:
154 return _("internal error");
155 case PCRE_ERROR_BADCOUNT:
156 /* negative ovecsize, this should not happen in GRegex */
157 g_warning ("A negative ovecsize was passed to PCRE");
159 case PCRE_ERROR_DFA_UITEM:
160 return _("the pattern contains items not supported for partial matching");
161 case PCRE_ERROR_DFA_UCOND:
162 return _("back references as conditions are not supported for partial matching");
163 case PCRE_ERROR_DFA_UMLIMIT:
164 /* the match_limit field is not used in GRegex */
166 case PCRE_ERROR_DFA_WSSIZE:
167 /* handled by expanding the workspace */
169 case PCRE_ERROR_DFA_RECURSE:
170 case PCRE_ERROR_RECURSIONLIMIT:
171 return _("recursion limit reached");
172 case PCRE_ERROR_NULLWSLIMIT:
173 return _("workspace limit for empty substrings reached");
174 case PCRE_ERROR_BADNEWLINE:
175 return _("invalid combination of newline flags");
/* generic message for any code without a dedicated message above */
179 return _("unknown error");
183 translate_compile_error (gint *errcode, gchar **errmsg)
185 /* Compile errors are created adding 100 to the error code returned
187 * If errcode is known we put the translatable error message in
188 * errmsg. If errcode is unknown we put the generic
189 * G_REGEX_ERROR_COMPILE error code in errcode and keep the
190 * untranslated error message returned by PCRE.
191 * Note that there can be more PCRE errors with the same GRegexError
192 * and that some PCRE errors are useless for us. */
197 case G_REGEX_ERROR_STRAY_BACKSLASH:
198 *errmsg = _("\\ at end of pattern");
200 case G_REGEX_ERROR_MISSING_CONTROL_CHAR:
201 *errmsg = _("\\c at end of pattern");
203 case G_REGEX_ERROR_UNRECOGNIZED_ESCAPE:
204 *errmsg = _("unrecognized character follows \\");
207 *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE;
208 *errmsg = _("case changing escapes are not allowed here");
210 case G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER:
211 *errmsg = _("numbers out of order in {} quantifier");
213 case G_REGEX_ERROR_QUANTIFIER_TOO_BIG:
214 *errmsg = _("number too big in {} quantifier");
216 case G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS:
217 *errmsg = _("missing terminating ] for character class");
219 case G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS:
220 *errmsg = _("invalid escape sequence in character class");
222 case G_REGEX_ERROR_RANGE_OUT_OF_ORDER:
223 *errmsg = _("range out of order in character class");
225 case G_REGEX_ERROR_NOTHING_TO_REPEAT:
226 *errmsg = _("nothing to repeat");
228 case G_REGEX_ERROR_UNRECOGNIZED_CHARACTER:
229 *errmsg = _("unrecognized character after (?");
232 *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
233 *errmsg = _("unrecognized character after (?<");
236 *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
237 *errmsg = _("unrecognized character after (?P");
239 case G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS:
240 *errmsg = _("POSIX named classes are supported only within a class");
242 case G_REGEX_ERROR_UNMATCHED_PARENTHESIS:
243 *errmsg = _("missing terminating )");
246 *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
247 *errmsg = _(") without opening (");
250 *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
251 *errmsg = _("(?R or (?[+-]digits must be followed by )");
253 case G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE:
254 *errmsg = _("reference to non-existent subpattern");
256 case G_REGEX_ERROR_UNTERMINATED_COMMENT:
257 *errmsg = _("missing ) after comment");
259 case G_REGEX_ERROR_EXPRESSION_TOO_LARGE:
260 *errmsg = _("regular expression too large");
262 case G_REGEX_ERROR_MEMORY_ERROR:
263 *errmsg = _("failed to get memory");
265 case G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND:
266 *errmsg = _("lookbehind assertion is not fixed length");
268 case G_REGEX_ERROR_MALFORMED_CONDITION:
269 *errmsg = _("malformed number or name after (?(");
271 case G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES:
272 *errmsg = _("conditional group contains more than two branches");
274 case G_REGEX_ERROR_ASSERTION_EXPECTED:
275 *errmsg = _("assertion expected after (?(");
277 case G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME:
278 *errmsg = _("unknown POSIX class name");
280 case G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED:
281 *errmsg = _("POSIX collating elements are not supported");
283 case G_REGEX_ERROR_HEX_CODE_TOO_LARGE:
284 *errmsg = _("character value in \\x{...} sequence is too large");
286 case G_REGEX_ERROR_INVALID_CONDITION:
287 *errmsg = _("invalid condition (?(0)");
289 case G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND:
290 *errmsg = _("\\C not allowed in lookbehind assertion");
292 case G_REGEX_ERROR_INFINITE_LOOP:
293 *errmsg = _("recursive call could loop indefinitely");
295 case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR:
296 *errmsg = _("missing terminator in subpattern name");
298 case G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME:
299 *errmsg = _("two named subpatterns have the same name");
301 case G_REGEX_ERROR_MALFORMED_PROPERTY:
302 *errmsg = _("malformed \\P or \\p sequence");
304 case G_REGEX_ERROR_UNKNOWN_PROPERTY:
305 *errmsg = _("unknown property name after \\P or \\p");
307 case G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG:
308 *errmsg = _("subpattern name is too long (maximum 32 characters)");
310 case G_REGEX_ERROR_TOO_MANY_SUBPATTERNS:
311 *errmsg = _("too many named subpatterns (maximum 10,000)");
313 case G_REGEX_ERROR_INVALID_OCTAL_VALUE:
314 *errmsg = _("octal value is greater than \\377");
316 case G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE:
317 *errmsg = _("DEFINE group contains more than one branch");
319 case G_REGEX_ERROR_DEFINE_REPETION:
320 *errmsg = _("repeating a DEFINE group is not allowed");
322 case G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS:
323 *errmsg = _("inconsistent NEWLINE options");
325 case G_REGEX_ERROR_MISSING_BACK_REFERENCE:
326 *errmsg = _("\\g is not followed by a braced name or an optionally "
327 "braced non-zero number");
330 *errcode = G_REGEX_ERROR_INTERNAL;
331 *errmsg = _("unexpected repeat");
334 *errcode = G_REGEX_ERROR_INTERNAL;
335 *errmsg = _("code overflow");
338 *errcode = G_REGEX_ERROR_INTERNAL;
339 *errmsg = _("overran compiling workspace");
342 *errcode = G_REGEX_ERROR_INTERNAL;
343 *errmsg = _("previously-checked referenced subpattern not found");
346 /* This should not happen as we never pass a NULL erroffset */
347 g_warning ("erroffset passed as NULL");
348 *errcode = G_REGEX_ERROR_COMPILE;
351 /* This should not happen as we check options before passing them
352 * to pcre_compile2() */
353 g_warning ("unknown option bit(s) set");
354 *errcode = G_REGEX_ERROR_COMPILE;
359 /* These errors should not happen as we are using an UTF8-enabled PCRE
360 * and we do not check if strings are valid */
362 *errcode = G_REGEX_ERROR_COMPILE;
365 *errcode = G_REGEX_ERROR_COMPILE;
372 match_info_new (const GRegex *regex,
379 GMatchInfo *match_info;
382 string_len = strlen (string);
384 match_info = g_new0 (GMatchInfo, 1);
385 match_info->regex = g_regex_ref ((GRegex *)regex);
386 match_info->string = string;
387 match_info->string_len = string_len;
388 match_info->matches = PCRE_ERROR_NOMATCH;
389 match_info->pos = start_position;
390 match_info->match_opts = match_options;
394 /* These values should be enough for most cases, if they are not
395 * enough g_regex_match_all_full() will expand them. */
396 match_info->n_offsets = 24;
397 match_info->n_workspace = 100;
398 match_info->workspace = g_new (gint, match_info->n_workspace);
403 pcre_fullinfo (regex->pcre_re, regex->extra,
404 PCRE_INFO_CAPTURECOUNT, &capture_count);
405 match_info->n_offsets = (capture_count + 1) * 3;
407 match_info->offsets = g_new0 (gint, match_info->n_offsets);
413 * g_match_info_get_regex:
414 * @match_info: a #GMatchInfo
416 * Returns the #GRegex object used in @match_info. It belongs to GLib
417 * and must not be freed. Use g_regex_ref() if you need to keep it
418 * after you free the @match_info object.
420 * Returns: #GRegex object used in @match_info
425 g_match_info_get_regex (const GMatchInfo *match_info)
427 g_return_val_if_fail (match_info != NULL, NULL);
428 return match_info->regex;
432 * g_match_info_get_string:
433 * @match_info: a #GMatchInfo
435 * Returns the string searched with @match_info. This is the
436 * string passed to g_regex_match() or g_regex_replace() so
437 * you may not free it before calling this function.
439 * Returns: the string searched with @match_info
444 g_match_info_get_string (const GMatchInfo *match_info)
446 g_return_val_if_fail (match_info != NULL, NULL);
447 return match_info->string;
452 * @match_info: a #GMatchInfo
454 * Frees all the memory associated with the #GMatchInfo structure.
459 g_match_info_free (GMatchInfo *match_info)
463 g_regex_unref (match_info->regex);
464 g_free (match_info->offsets);
465 g_free (match_info->workspace);
472 * @match_info: a #GMatchInfo structure
473 * @error: location to store the error occurring, or %NULL to ignore errors
475 * Scans for the next match using the same parameters of the previous
476 * call to g_regex_match_full() or g_regex_match() that returned
479 * The match is done on the string passed to the match function, so you
480 * cannot free it before calling this function.
482 * Returns: %TRUE if the string matched, %FALSE otherwise
487 g_match_info_next (GMatchInfo *match_info,
492 g_return_val_if_fail (match_info != NULL, FALSE);
493 g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
494 g_return_val_if_fail (match_info->pos >= 0, FALSE);
496 opts = match_info->regex->match_opts | match_info->match_opts;
498 match_info->matches = pcre_exec (match_info->regex->pcre_re,
499 match_info->regex->extra,
501 match_info->string_len,
503 match_info->regex->match_opts |
504 match_info->match_opts,
506 match_info->n_offsets);
507 if (IS_PCRE_ERROR (match_info->matches))
509 g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
510 _("Error while matching regular expression %s: %s"),
511 match_info->regex->pattern, match_error (match_info->matches));
515 /* avoid infinite loops if the pattern is an empty string or something
517 if (match_info->pos == match_info->offsets[1])
519 if (match_info->pos > match_info->string_len)
521 /* we have reached the end of the string */
522 match_info->pos = -1;
523 match_info->matches = PCRE_ERROR_NOMATCH;
527 match_info->pos = NEXT_CHAR (match_info->regex,
528 &match_info->string[match_info->pos]) -
533 match_info->pos = match_info->offsets[1];
536 return match_info->matches >= 0;
540 * g_match_info_matches:
541 * @match_info: a #GMatchInfo structure
543 * Returns whether the previous match operation succeeded.
545 * Returns: %TRUE if the previous match operation succeeded,
551 g_match_info_matches (const GMatchInfo *match_info)
553 g_return_val_if_fail (match_info != NULL, FALSE);
555 return match_info->matches >= 0;
559 * g_match_info_get_match_count:
560 * @match_info: a #GMatchInfo structure
562 * Retrieves the number of matched substrings (including substring 0,
563 * that is the whole matched text), so 1 is returned if the pattern
564 * has no substrings in it and 0 is returned if the match failed.
566 * If the last match was obtained using the DFA algorithm, that is
567 * using g_regex_match_all() or g_regex_match_all_full(), the retrieved
568 * count is not that of the number of capturing parentheses but that of
569 * the number of matched substrings.
571 * Returns: Number of matched substrings, or -1 if an error occurred
576 g_match_info_get_match_count (const GMatchInfo *match_info)
578 g_return_val_if_fail (match_info, -1);
580 if (match_info->matches == PCRE_ERROR_NOMATCH)
583 else if (match_info->matches < PCRE_ERROR_NOMATCH)
588 return match_info->matches;
592 * g_match_info_is_partial_match:
593 * @match_info: a #GMatchInfo structure
595 * Usually if the string passed to g_regex_match*() matches as far as
596 * it goes, but is too short to match the entire pattern, %FALSE is
597 * returned. There are circumstances where it might be helpful to
598 * distinguish this case from other cases in which there is no match.
600 * Consider, for example, an application where a human is required to
601 * type in data for a field with specific formatting requirements. An
602 * example might be a date in the form ddmmmyy, defined by the pattern
603 * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$".
604 * If the application sees the user’s keystrokes one by one, and can
605 * check that what has been typed so far is potentially valid, it is
606 * able to raise an error as soon as a mistake is made.
608 * GRegex supports the concept of partial matching by means of the
609 * #G_REGEX_MATCH_PARTIAL flag. When this is set the return code for
610 * g_regex_match() or g_regex_match_full() is, as usual, %TRUE
611 * for a complete match, %FALSE otherwise. But, when these functions
612 * return %FALSE, you can check if the match was partial calling
613 * g_match_info_is_partial_match().
615 * When using partial matching you cannot use g_match_info_fetch*().
617 * Because of the way certain internal optimizations are implemented
618 * the partial matching algorithm cannot be used with all patterns.
619 * So repeated single characters such as "a{2,4}" and repeated single
620 * meta-sequences such as "\d+" are not permitted if the maximum number
621 * of occurrences is greater than one. Optional items such as "\d?"
622 * (where the maximum is one) are permitted. Quantifiers with any values
623 * are permitted after parentheses, so the invalid examples above can be
624 * coded thus "(a){2,4}" and "(\d)+". If #G_REGEX_MATCH_PARTIAL is set
625 * for a pattern that does not conform to the restrictions, matching
626 * functions return an error.
628 * Returns: %TRUE if the match was partial, %FALSE otherwise
633 g_match_info_is_partial_match (const GMatchInfo *match_info)
635 g_return_val_if_fail (match_info != NULL, FALSE);
637 return match_info->matches == PCRE_ERROR_PARTIAL;
641 * g_match_info_expand_references:
642 * @match_info: a #GMatchInfo or %NULL
643 * @string_to_expand: the string to expand
644 * @error: location to store the error occurring, or %NULL to ignore errors
646 * Returns a new string containing the text in @string_to_expand with
647 * references and escape sequences expanded. References refer to the last
648 * match done with @string against @regex and have the same syntax used by
651 * The @string_to_expand must be UTF-8 encoded even if #G_REGEX_RAW was
652 * passed to g_regex_new().
654 * The backreferences are extracted from the string passed to the match
655 * function, so you cannot call this function after freeing the string.
657 * @match_info may be %NULL in which case @string_to_expand must not
658 * contain references. For instance "foo\n" does not refer to an actual
659 * pattern and '\n' will merely be replaced with a \n character,
660 * while to expand "\0" (whole match) one needs the result of a match.
661 * Use g_regex_check_replacement() to find out whether @string_to_expand
662 * contains references.
664 * Returns: the expanded string, or %NULL if an error occurred
669 g_match_info_expand_references (const GMatchInfo *match_info,
670 const gchar *string_to_expand,
675 GError *tmp_error = NULL;
677 g_return_val_if_fail (string_to_expand != NULL, NULL);
678 g_return_val_if_fail (error == NULL || *error == NULL, NULL);
680 list = split_replacement (string_to_expand, &tmp_error);
681 if (tmp_error != NULL)
683 g_propagate_error (error, tmp_error);
687 if (!match_info && interpolation_list_needs_match (list))
689 g_critical ("String '%s' contains references to the match, can't "
690 "expand references without GMatchInfo object",
695 result = g_string_sized_new (strlen (string_to_expand));
696 interpolate_replacement (match_info, result, list);
698 g_list_foreach (list, (GFunc)free_interpolation_data, NULL);
701 return g_string_free (result, FALSE);
705 * g_match_info_fetch:
706 * @match_info: #GMatchInfo structure
707 * @match_num: number of the sub expression
709 * Retrieves the text matching the @match_num<!-- -->'th capturing
710 * parentheses. 0 is the full text of the match, 1 is the first paren
711 * set, 2 the second, and so on.
713 * If @match_num is a valid sub pattern but it didn't match anything
714 * (e.g. sub pattern 1, matching "b" against "(a)?b") then an empty
715 * string is returned.
717 * If the match was obtained using the DFA algorithm, that is using
718 * g_regex_match_all() or g_regex_match_all_full(), the retrieved
719 * string is not that of a set of parentheses but that of a matched
720 * substring. Substrings are matched in reverse order of length, so
721 * 0 is the longest match.
723 * The string is fetched from the string passed to the match function,
724 * so you cannot call this function after freeing the string.
726 * Returns: The matched substring, or %NULL if an error occurred.
727 * You have to free the string yourself
732 g_match_info_fetch (const GMatchInfo *match_info,
735 /* we cannot use pcre_get_substring() because it allocates the
736 * string using pcre_malloc(). */
740 g_return_val_if_fail (match_info != NULL, NULL);
741 g_return_val_if_fail (match_num >= 0, NULL);
743 /* match_num does not exist or it didn't match, e.g. matching "b"
744 * against "(a)?b" leaves group 1 empty. */
745 if (!g_match_info_fetch_pos (match_info, match_num, &start, &end))
747 else if (start == -1)
748 match = g_strdup ("");
750 match = g_strndup (&match_info->string[start], end - start);
756 * g_match_info_fetch_pos:
757 * @match_info: #GMatchInfo structure
758 * @match_num: number of the sub expression
759 * @start_pos: pointer to location where to store the start position
760 * @end_pos: pointer to location where to store the end position
762 * Retrieves the position of the @match_num<!-- -->'th capturing
763 * parentheses. 0 is the full text of the match, 1 is the first
764 * paren set, 2 the second, and so on.
766 * If @match_num is a valid sub pattern but it didn't match anything
767 * (e.g. sub pattern 1, matching "b" against "(a)?b") then @start_pos
768 * and @end_pos are set to -1 and %TRUE is returned.
770 * If the match was obtained using the DFA algorithm, that is using
771 * g_regex_match_all() or g_regex_match_all_full(), the retrieved
772 * position is not that of a set of parentheses but that of a matched
773 * substring. Substrings are matched in reverse order of length, so
774 * 0 is the longest match.
776 * Returns: %TRUE if the position was fetched, %FALSE otherwise. If
777 * the position cannot be fetched, @start_pos and @end_pos are left
783 g_match_info_fetch_pos (const GMatchInfo *match_info,
788 g_return_val_if_fail (match_info != NULL, FALSE);
789 g_return_val_if_fail (match_num >= 0, FALSE);
791 /* make sure the sub expression number they're requesting is less than
792 * the total number of sub expressions that were matched. */
793 if (match_num >= match_info->matches)
796 if (start_pos != NULL)
797 *start_pos = match_info->offsets[2 * match_num];
800 *end_pos = match_info->offsets[2 * match_num + 1];
806 * Returns number of first matched subpattern with name @name.
807 * There may be more than one in case when DUPNAMES is used,
808 * and not all subpatterns with that name match;
809 * pcre_get_stringnumber() does not work in that case.
812 get_matched_substring_number (const GMatchInfo *match_info,
819 if (!(match_info->regex->compile_opts & G_REGEX_DUPNAMES))
820 return pcre_get_stringnumber (match_info->regex->pcre_re, name);
822 /* This code is copied from pcre_get.c: get_first_set() */
823 entrysize = pcre_get_stringtable_entries (match_info->regex->pcre_re,
831 for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize)
833 gint n = (entry[0] << 8) + entry[1];
834 if (match_info->offsets[n*2] >= 0)
838 return (first[0] << 8) + first[1];
842 * g_match_info_fetch_named:
843 * @match_info: #GMatchInfo structure
844 * @name: name of the subexpression
846 * Retrieves the text matching the capturing parentheses named @name.
848 * If @name is a valid sub pattern name but it didn't match anything
849 * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b")
850 * then an empty string is returned.
852 * The string is fetched from the string passed to the match function,
853 * so you cannot call this function after freeing the string.
855 * Returns: The matched substring, or %NULL if an error occurred.
856 * You have to free the string yourself
861 g_match_info_fetch_named (const GMatchInfo *match_info,
864 /* we cannot use pcre_get_named_substring() because it allocates the
865 * string using pcre_malloc(). */
868 g_return_val_if_fail (match_info != NULL, NULL);
869 g_return_val_if_fail (name != NULL, NULL);
871 num = get_matched_substring_number (match_info, name);
875 return g_match_info_fetch (match_info, num);
879 * g_match_info_fetch_named_pos:
880 * @match_info: #GMatchInfo structure
881 * @name: name of the subexpression
882 * @start_pos: pointer to location where to store the start position
883 * @end_pos: pointer to location where to store the end position
885 * Retrieves the position of the capturing parentheses named @name.
887 * If @name is a valid sub pattern name but it didn't match anything
888 * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b")
889 * then @start_pos and @end_pos are set to -1 and %TRUE is returned.
891 * Returns: %TRUE if the position was fetched, %FALSE otherwise. If
892 * the position cannot be fetched, @start_pos and @end_pos are left
898 g_match_info_fetch_named_pos (const GMatchInfo *match_info,
905 g_return_val_if_fail (match_info != NULL, FALSE);
906 g_return_val_if_fail (name != NULL, FALSE);
908 num = get_matched_substring_number (match_info, name);
912 return g_match_info_fetch_pos (match_info, num, start_pos, end_pos);
916 * g_match_info_fetch_all:
917 * @match_info: a #GMatchInfo structure
919 * Bundles up pointers to each of the matching substrings from a match
920 * and stores them in an array of gchar pointers. The first element in
921 * the returned array is the match number 0, i.e. the entire matched
924 * If a sub pattern didn't match anything (e.g. sub pattern 1, matching
925 * "b" against "(a)?b") then an empty string is inserted.
927 * If the last match was obtained using the DFA algorithm, that is using
928 * g_regex_match_all() or g_regex_match_all_full(), the retrieved
929 * strings are not that matched by sets of parentheses but that of the
930 * matched substring. Substrings are matched in reverse order of length,
931 * so the first one is the longest match.
933 * The strings are fetched from the string passed to the match function,
934 * so you cannot call this function after freeing the string.
936 * Returns: a %NULL-terminated array of gchar * pointers. It must be
937 * freed using g_strfreev(). If the previous match failed %NULL is
943 g_match_info_fetch_all (const GMatchInfo *match_info)
945 /* we cannot use pcre_get_substring_list() because the returned value
946 * isn't suitable for g_strfreev(). */
950 g_return_val_if_fail (match_info != NULL, NULL);
952 if (match_info->matches < 0)
955 result = g_new (gchar *, match_info->matches + 1);
956 for (i = 0; i < match_info->matches; i++)
957 result[i] = g_match_info_fetch (match_info, i);
967 g_regex_error_quark (void)
969 static GQuark error_quark = 0;
971 if (error_quark == 0)
972 error_quark = g_quark_from_static_string ("g-regex-error-quark");
981 * Increases reference count of @regex by 1.
988 g_regex_ref (GRegex *regex)
990 g_return_val_if_fail (regex != NULL, NULL);
/* take the new reference with an atomic increment of ref_count */
991 g_atomic_int_inc (&regex->ref_count);
999 * Decreases reference count of @regex by 1. When reference count drops
1000 * to zero, it frees all the memory associated with the regex structure.
1005 g_regex_unref (GRegex *regex)
1007 g_return_if_fail (regex != NULL);
/* g_atomic_int_exchange_and_add() yields the count as it was before the
 * add, so subtracting 1 gives the new count; tear down only at zero. */
1009 if (g_atomic_int_exchange_and_add (&regex->ref_count, -1) - 1 == 0)
1011 g_free (regex->pattern);
1012 if (regex->pcre_re != NULL)
1013 pcre_free (regex->pcre_re);
1014 if (regex->extra != NULL)
1015 pcre_free (regex->extra);
1022 * @pattern: the regular expression
1023 * @compile_options: compile options for the regular expression
1024 * @match_options: match options for the regular expression
1025 * @error: return location for a #GError
1027 * Compiles the regular expression to an internal form, and does
1028 * the initial setup of the #GRegex structure.
1030 * Returns: a #GRegex structure. Call g_regex_unref() when you
1036 g_regex_new (const gchar *pattern,
1037 GRegexCompileFlags compile_options,
1038 GRegexMatchFlags match_options,
1046 gboolean optimize = FALSE;
1047 static gboolean initialized = FALSE;
1048 unsigned long int pcre_compile_options;
1050 g_return_val_if_fail (pattern != NULL, NULL);
1051 g_return_val_if_fail (error == NULL || *error == NULL, NULL);
1052 g_return_val_if_fail ((compile_options & ~G_REGEX_COMPILE_MASK) == 0, NULL);
1053 g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
1060 pcre_config (PCRE_CONFIG_UTF8, &support);
1063 msg = N_("PCRE library is compiled without UTF8 support");
1065 g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg));
1069 pcre_config (PCRE_CONFIG_UNICODE_PROPERTIES, &support);
1072 msg = N_("PCRE library is compiled without UTF8 properties support");
1074 g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg));
1081 /* G_REGEX_OPTIMIZE has the same numeric value as PCRE_NO_UTF8_CHECK,
1082 * as we do not need to wrap PCRE_NO_UTF8_CHECK. */
1083 if (compile_options & G_REGEX_OPTIMIZE)
1086 /* In GRegex strings are, by default, UTF-8 encoded. PCRE
1087 * instead uses UTF-8 only if required with PCRE_UTF8. */
1088 if (compile_options & G_REGEX_RAW)
1091 compile_options &= ~G_REGEX_RAW;
1096 compile_options |= PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
1097 match_options |= PCRE_NO_UTF8_CHECK;
1100 /* PCRE_NEWLINE_ANY is the default for the internal PCRE but
1101 * not for the system one. */
1102 if (!(compile_options & G_REGEX_NEWLINE_CR) &&
1103 !(compile_options & G_REGEX_NEWLINE_LF))
1105 compile_options |= PCRE_NEWLINE_ANY;
1108 /* compile the pattern */
1109 re = pcre_compile2 (pattern, compile_options, &errcode,
1110 (const gchar **)&errmsg, &erroffset, NULL);
1112 /* if the compilation failed, set the error member and return
1118 /* Translate the PCRE error code to GRegexError and use a translated
1119 * error message if possible */
1120 translate_compile_error (&errcode, &errmsg);
1122 /* PCRE uses byte offsets but we want to show character offsets */
1123 erroffset = g_utf8_pointer_to_offset (pattern, &pattern[erroffset]);
1125 tmp_error = g_error_new (G_REGEX_ERROR, errcode,
1126 _("Error while compiling regular "
1127 "expression %s at char %d: %s"),
1128 pattern, erroffset, errmsg);
1129 g_propagate_error (error, tmp_error);
1134 /* For options set at the beginning of the pattern, pcre puts them into
1135 * compile options, e.g. "(?i)foo" will make the pcre structure store
1136 * PCRE_CASELESS even though it wasn't explicitly given for compilation. */
1137 pcre_fullinfo (re, NULL, PCRE_INFO_OPTIONS, &pcre_compile_options);
1138 compile_options = pcre_compile_options;
1140 if (!(compile_options & G_REGEX_DUPNAMES))
1142 gboolean jchanged = FALSE;
1143 pcre_fullinfo (re, NULL, PCRE_INFO_JCHANGED, &jchanged);
1145 compile_options |= G_REGEX_DUPNAMES;
1148 regex = g_new0 (GRegex, 1);
1149 regex->ref_count = 1;
1150 regex->pattern = g_strdup (pattern);
1151 regex->pcre_re = re;
1152 regex->compile_opts = compile_options;
1153 regex->match_opts = match_options;
1157 regex->extra = pcre_study (regex->pcre_re, 0, (const gchar **)&errmsg);
1160 GError *tmp_error = g_error_new (G_REGEX_ERROR,
1161 G_REGEX_ERROR_OPTIMIZE,
1162 _("Error while optimizing "
1163 "regular expression %s: %s"),
1166 g_propagate_error (error, tmp_error);
1175 * g_regex_get_pattern:
1176 * @regex: a #GRegex structure
1178 * Gets the pattern string associated with @regex, i.e. a copy of
1179 * the string passed to g_regex_new().
1181 * Returns: the pattern of @regex
1186 g_regex_get_pattern (const GRegex *regex)
1188 g_return_val_if_fail (regex != NULL, NULL);
1190 return regex->pattern;
1194 * g_regex_get_max_backref:
1197 * Returns the number of the highest back reference
1198 * in the pattern, or 0 if the pattern does not contain
1201 * Returns: the number of the highest back reference
1206 g_regex_get_max_backref (const GRegex *regex)
1210 pcre_fullinfo (regex->pcre_re, regex->extra,
1211 PCRE_INFO_BACKREFMAX, &value);
1217 * g_regex_get_capture_count:
1220 * Returns the number of capturing subpatterns in the pattern.
1222 * Returns: the number of capturing subpatterns
1227 g_regex_get_capture_count (const GRegex *regex)
1231 pcre_fullinfo (regex->pcre_re, regex->extra,
1232 PCRE_INFO_CAPTURECOUNT, &value);
1238 * g_regex_match_simple:
1239 * @pattern: the regular expression
1240 * @string: the string to scan for matches
1241 * @compile_options: compile options for the regular expression
1242 * @match_options: match options
1244 * Scans for a match in @string for @pattern.
1246 * This function is equivalent to g_regex_match() but it does not
1247 * require to compile the pattern with g_regex_new(), avoiding some
1248 * lines of code when you need just to do a match without extracting
1249 * substrings, capture counts, and so on.
1251 * If this function is to be called on the same @pattern more than
1252 * once, it's more efficient to compile the pattern once with
1253 * g_regex_new() and then use g_regex_match().
1255 * Returns: %TRUE if the string matched, %FALSE otherwise
1260 g_regex_match_simple (const gchar *pattern,
1261 const gchar *string,
1262 GRegexCompileFlags compile_options,
1263 GRegexMatchFlags match_options)
1268 regex = g_regex_new (pattern, compile_options, 0, NULL);
1271 result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL);
1272 g_regex_unref (regex);
1278 * @regex: a #GRegex structure from g_regex_new()
1279 * @string: the string to scan for matches
1280 * @match_options: match options
1281 * @match_info: pointer to location where to store the #GMatchInfo,
1282 * or %NULL if you do not need it
1284 * Scans for a match in string for the pattern in @regex.
1285 * The @match_options are combined with the match options specified
1286 * when the @regex structure was created, letting you have more
1287 * flexibility in reusing #GRegex structures.
1289 * A #GMatchInfo structure, used to get information on the match,
1290 * is stored in @match_info if not %NULL. Note that if @match_info
1291 * is not %NULL then it is created even if the function returns %FALSE,
1292 * i.e. you must free it regardless if regular expression actually matched.
1294 * To retrieve all the non-overlapping matches of the pattern in
1295 * string you can use g_match_info_next().
1299 * print_uppercase_words (const gchar *string)
1301 * /* Print all uppercase-only words. */
1303 * GMatchInfo *match_info;
1305 * regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
1306 * g_regex_match (regex, string, 0, &match_info);
1307 * while (g_match_info_matches (match_info))
1309 * gchar *word = g_match_info_fetch (match_info, 0);
1310 * g_print ("Found: %s\n", word);
1312 * g_match_info_next (match_info, NULL);
1314 * g_match_info_free (match_info);
1315 * g_regex_unref (regex);
1319 * Returns: %TRUE if the string matched, %FALSE otherwise
1324 g_regex_match (const GRegex *regex,
1325 const gchar *string,
1326 GRegexMatchFlags match_options,
1327 GMatchInfo **match_info)
1329 return g_regex_match_full (regex, string, -1, 0, match_options,
1334 * g_regex_match_full:
1335 * @regex: a #GRegex structure from g_regex_new()
1336 * @string: the string to scan for matches
1337 * @string_len: the length of @string, or -1 if @string is nul-terminated
1338 * @start_position: starting index of the string to match
1339 * @match_options: match options
1340 * @match_info: pointer to location where to store the #GMatchInfo,
1341 * or %NULL if you do not need it
1342 * @error: location to store the error occurring, or %NULL to ignore errors
1344 * Scans for a match in string for the pattern in @regex.
1345 * The @match_options are combined with the match options specified
1346 * when the @regex structure was created, letting you have more
1347 * flexibility in reusing #GRegex structures.
1349 * Setting @start_position differs from just passing over a shortened
1350 * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
1351 * that begins with any kind of lookbehind assertion, such as "\b".
1353 * A #GMatchInfo structure, used to get information on the match, is
1354 * stored in @match_info if not %NULL. Note that if @match_info is
1355 * not %NULL then it is created even if the function returns %FALSE,
1356 * i.e. you must free it regardless if regular expression actually
1359 * @string is not copied and is used in #GMatchInfo internally. If
1360 * you use any #GMatchInfo method (except g_match_info_free()) after
1361 * freeing or modifying @string then the behaviour is undefined.
1363 * To retrieve all the non-overlapping matches of the pattern in
1364 * string you can use g_match_info_next().
1368 * print_uppercase_words (const gchar *string)
1370 * /* Print all uppercase-only words. */
1372 * GMatchInfo *match_info;
1373 * GError *error = NULL;
1375 * regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
1376 * g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error);
1377 * while (g_match_info_matches (match_info))
1379 * gchar *word = g_match_info_fetch (match_info, 0);
1380 * g_print ("Found: %s\n", word);
1382 * g_match_info_next (match_info, &error);
1384 * g_match_info_free (match_info);
1385 * g_regex_unref (regex);
1386 * if (error != NULL)
1388 * g_printerr ("Error while matching: %s\n", error->message);
1389 * g_error_free (error);
1394 * Returns: %TRUE if the string matched, %FALSE otherwise
1399 g_regex_match_full (const GRegex *regex,
1400 const gchar *string,
1402 gint start_position,
1403 GRegexMatchFlags match_options,
1404 GMatchInfo **match_info,
1410 g_return_val_if_fail (regex != NULL, FALSE);
1411 g_return_val_if_fail (string != NULL, FALSE);
1412 g_return_val_if_fail (start_position >= 0, FALSE);
1413 g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
1414 g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
1416 info = match_info_new (regex, string, string_len, start_position,
1417 match_options, FALSE);
1418 match_ok = g_match_info_next (info, error);
1419 if (match_info != NULL)
1422 g_match_info_free (info);
1428 * g_regex_match_all:
1429 * @regex: a #GRegex structure from g_regex_new()
1430 * @string: the string to scan for matches
1431 * @match_options: match options
1432 * @match_info: pointer to location where to store the #GMatchInfo,
1433 * or %NULL if you do not need it
1435 * Using the standard algorithm for regular expression matching only
1436 * the longest match in the string is retrieved. This function uses
1437 * a different algorithm so it can retrieve all the possible matches.
1438 * For more documentation see g_regex_match_all_full().
1440 * A #GMatchInfo structure, used to get information on the match, is
1441 * stored in @match_info if not %NULL. Note that if @match_info is
1442 * not %NULL then it is created even if the function returns %FALSE,
1443 * i.e. you must free it regardless if regular expression actually
1446 * Returns: %TRUE if the string matched, %FALSE otherwise
1451 g_regex_match_all (const GRegex *regex,
1452 const gchar *string,
1453 GRegexMatchFlags match_options,
1454 GMatchInfo **match_info)
1456 return g_regex_match_all_full (regex, string, -1, 0, match_options,
1461 * g_regex_match_all_full:
1462 * @regex: a #GRegex structure from g_regex_new()
1463 * @string: the string to scan for matches
1464 * @string_len: the length of @string, or -1 if @string is nul-terminated
1465 * @start_position: starting index of the string to match
1466 * @match_options: match options
1467 * @match_info: pointer to location where to store the #GMatchInfo,
1468 * or %NULL if you do not need it
1469 * @error: location to store the error occurring, or %NULL to ignore errors
1471 * Using the standard algorithm for regular expression matching only
1472 * the longest match in the string is retrieved, it is not possible
1473 * to obtain all the available matches. For instance matching
1474 * "<a> <b> <c>" against the pattern "<.*>"
1475 * you get "<a> <b> <c>".
1477 * This function uses a different algorithm (called DFA, i.e. deterministic
1478 * finite automaton), so it can retrieve all the possible matches, all
1479 * starting at the same point in the string. For instance matching
1480 * "<a> <b> <c>" against the pattern "<.*>"
1481 * you would obtain three matches: "<a> <b> <c>",
1482 * "<a> <b>" and "<a>".
1484 * The number of matched strings is retrieved using
1485 * g_match_info_get_match_count(). To obtain the matched strings and
1486 * their position you can use, respectively, g_match_info_fetch() and
1487 * g_match_info_fetch_pos(). Note that the strings are returned in
1488 * reverse order of length; that is, the longest matching string is
1491 * Note that the DFA algorithm is slower than the standard one and it
1492 * is not able to capture substrings, so backreferences do not work.
1494 * Setting @start_position differs from just passing over a shortened
1495 * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
1496 * that begins with any kind of lookbehind assertion, such as "\b".
1498 * A #GMatchInfo structure, used to get information on the match, is
1499 * stored in @match_info if not %NULL. Note that if @match_info is
1500 * not %NULL then it is created even if the function returns %FALSE,
1501 * i.e. you must free it regardless if regular expression actually
1504 * Returns: %TRUE if the string matched, %FALSE otherwise
1509 g_regex_match_all_full (const GRegex *regex,
1510 const gchar *string,
1512 gint start_position,
1513 GRegexMatchFlags match_options,
1514 GMatchInfo **match_info,
1520 g_return_val_if_fail (regex != NULL, FALSE);
1521 g_return_val_if_fail (string != NULL, FALSE);
1522 g_return_val_if_fail (start_position >= 0, FALSE);
1523 g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
1524 g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
1526 info = match_info_new (regex, string, string_len, start_position,
1527 match_options, TRUE);
1533 info->matches = pcre_dfa_exec (regex->pcre_re, regex->extra,
1534 info->string, info->string_len,
1536 regex->match_opts | match_options,
1537 info->offsets, info->n_offsets,
1538 info->workspace, info->n_workspace);
1539 if (info->matches == PCRE_ERROR_DFA_WSSIZE)
1541 /* info->workspace is too small. */
1542 info->n_workspace *= 2;
1543 info->workspace = g_realloc (info->workspace,
1544 info->n_workspace * sizeof (gint));
1547 else if (info->matches == 0)
1549 /* info->offsets is too small. */
1550 info->n_offsets *= 2;
1551 info->offsets = g_realloc (info->offsets,
1552 info->n_offsets * sizeof (gint));
1555 else if (IS_PCRE_ERROR (info->matches))
1557 g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
1558 _("Error while matching regular expression %s: %s"),
1559 regex->pattern, match_error (info->matches));
1563 /* set info->pos to -1 so that a call to g_match_info_next() fails. */
1566 if (match_info != NULL)
1569 g_match_info_free (info);
1571 return info->matches >= 0;
1575 * g_regex_get_string_number:
1576 * @regex: #GRegex structure
1577 * @name: name of the subexpression
1579 * Retrieves the number of the subexpression named @name.
1581 * Returns: The number of the subexpression or -1 if @name
1587 g_regex_get_string_number (const GRegex *regex,
1592 g_return_val_if_fail (regex != NULL, -1);
1593 g_return_val_if_fail (name != NULL, -1);
1595 num = pcre_get_stringnumber (regex->pcre_re, name);
1596 if (num == PCRE_ERROR_NOSUBSTRING)
1603 * g_regex_split_simple:
1604 * @pattern: the regular expression
1605 * @string: the string to scan for matches
1606 * @compile_options: compile options for the regular expression
1607 * @match_options: match options
1609 * Breaks the string on the pattern, and returns an array of
1610 * the tokens. If the pattern contains capturing parentheses,
1611 * then the text for each of the substrings will also be returned.
1612 * If the pattern does not match anywhere in the string, then the
1613 * whole string is returned as the first token.
1615 * This function is equivalent to g_regex_split() but it does
1616 * not require to compile the pattern with g_regex_new(), avoiding
1617 * some lines of code when you need just to do a split without
1618 * extracting substrings, capture counts, and so on.
1620 * If this function is to be called on the same @pattern more than
1621 * once, it's more efficient to compile the pattern once with
1622 * g_regex_new() and then use g_regex_split().
1624 * As a special case, the result of splitting the empty string ""
1625 * is an empty vector, not a vector containing a single string.
1626 * The reason for this special case is that being able to represent
1627 * an empty vector is typically more useful than consistent handling
1628 * of empty elements. If you do need to represent empty elements,
1629 * you'll need to check for the empty string before calling this
1632 * A pattern that can match empty strings splits @string into
1633 * separate characters wherever it matches the empty string between
1634 * characters. For example splitting "ab c" using as a separator
1635 * "\s*", you will get "a", "b" and "c".
1637 * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
1642 g_regex_split_simple (const gchar *pattern,
1643 const gchar *string,
1644 GRegexCompileFlags compile_options,
1645 GRegexMatchFlags match_options)
1650 regex = g_regex_new (pattern, compile_options, 0, NULL);
1653 result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL);
1654 g_regex_unref (regex);
1660 * @regex: a #GRegex structure
1661 * @string: the string to split with the pattern
1662 * @match_options: match time option flags
1664 * Breaks the string on the pattern, and returns an array of the tokens.
1665 * If the pattern contains capturing parentheses, then the text for each
1666 * of the substrings will also be returned. If the pattern does not match
1667 * anywhere in the string, then the whole string is returned as the first
1670 * As a special case, the result of splitting the empty string "" is an
1671 * empty vector, not a vector containing a single string. The reason for
1672 * this special case is that being able to represent an empty vector is
1673 * typically more useful than consistent handling of empty elements. If
1674 * you do need to represent empty elements, you'll need to check for the
1675 * empty string before calling this function.
1677 * A pattern that can match empty strings splits @string into separate
1678 * characters wherever it matches the empty string between characters.
1679 * For example splitting "ab c" using as a separator "\s*", you will get
1682 * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
1687 g_regex_split (const GRegex *regex,
1688 const gchar *string,
1689 GRegexMatchFlags match_options)
1691 return g_regex_split_full (regex, string, -1, 0,
1692 match_options, 0, NULL);
1696 * g_regex_split_full:
1697 * @regex: a #GRegex structure
1698 * @string: the string to split with the pattern
1699 * @string_len: the length of @string, or -1 if @string is nul-terminated
1700 * @start_position: starting index of the string to match
1701 * @match_options: match time option flags
1702 * @max_tokens: the maximum number of tokens to split @string into.
1703 * If this is less than 1, the string is split completely
1704 * @error: return location for a #GError
1706 * Breaks the string on the pattern, and returns an array of the tokens.
1707 * If the pattern contains capturing parentheses, then the text for each
1708 * of the substrings will also be returned. If the pattern does not match
1709 * anywhere in the string, then the whole string is returned as the first
1712 * As a special case, the result of splitting the empty string "" is an
1713 * empty vector, not a vector containing a single string. The reason for
1714 * this special case is that being able to represent an empty vector is
1715 * typically more useful than consistent handling of empty elements. If
1716 * you do need to represent empty elements, you'll need to check for the
1717 * empty string before calling this function.
1719 * A pattern that can match empty strings splits @string into separate
1720 * characters wherever it matches the empty string between characters.
1721 * For example splitting "ab c" using as a separator "\s*", you will get
1724 * Setting @start_position differs from just passing over a shortened
1725 * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
1726 * that begins with any kind of lookbehind assertion, such as "\b".
1728 * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
1733 g_regex_split_full (const GRegex *regex,
1734 const gchar *string,
1736 gint start_position,
1737 GRegexMatchFlags match_options,
1741 GError *tmp_error = NULL;
1742 GMatchInfo *match_info;
1747 /* position of the last separator. */
1748 gint last_separator_end;
1749 /* was the last match 0 bytes long? */
1750 gboolean last_match_is_empty;
1751 /* the returned array of char **s */
1752 gchar **string_list;
1754 g_return_val_if_fail (regex != NULL, NULL);
1755 g_return_val_if_fail (string != NULL, NULL);
1756 g_return_val_if_fail (start_position >= 0, NULL);
1757 g_return_val_if_fail (error == NULL || *error == NULL, NULL);
1758 g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
1760 if (max_tokens <= 0)
1761 max_tokens = G_MAXINT;
1764 string_len = strlen (string);
1766 /* zero-length string */
1767 if (string_len - start_position == 0)
1768 return g_new0 (gchar *, 1);
1770 if (max_tokens == 1)
1772 string_list = g_new0 (gchar *, 2);
1773 string_list[0] = g_strndup (&string[start_position],
1774 string_len - start_position);
1780 last_separator_end = start_position;
1781 last_match_is_empty = FALSE;
1783 match_ok = g_regex_match_full (regex, string, string_len, start_position,
1784 match_options, &match_info, &tmp_error);
1785 while (tmp_error == NULL)
1789 last_match_is_empty =
1790 (match_info->offsets[0] == match_info->offsets[1]);
1792 /* we need to skip empty separators at the same position of the end
1793 * of another separator. e.g. the string is "a b" and the separator
1794 * is " *", so from 1 to 2 we have a match and at position 2 we have
1795 * an empty match. */
1796 if (last_separator_end != match_info->offsets[1])
1801 token = g_strndup (string + last_separator_end,
1802 match_info->offsets[0] - last_separator_end);
1803 list = g_list_prepend (list, token);
1806 /* if there were substrings, these need to be added to
1808 match_count = g_match_info_get_match_count (match_info);
1809 if (match_count > 1)
1811 for (i = 1; i < match_count; i++)
1812 list = g_list_prepend (list, g_match_info_fetch (match_info, i));
1818 /* if there was no match, copy to end of string. */
1819 if (!last_match_is_empty)
1821 gchar *token = g_strndup (string + last_separator_end,
1822 match_info->string_len - last_separator_end);
1823 list = g_list_prepend (list, token);
1825 /* no more tokens, end the loop. */
1829 /* -1 to leave room for the last part. */
1830 if (token_count >= max_tokens - 1)
1832 /* we have reached the maximum number of tokens, so we copy
1833 * the remaining part of the string. */
1834 if (last_match_is_empty)
1836 /* the last match was empty, so we have moved one char
1837 * after the real position to avoid empty matches at the
1839 match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string;
1841 /* the if is needed in the case we have terminated the available
1842 * tokens, but we are at the end of the string, so there are no
1843 * characters left to copy. */
1844 if (string_len > match_info->pos)
1846 gchar *token = g_strndup (string + match_info->pos,
1847 string_len - match_info->pos);
1848 list = g_list_prepend (list, token);
1854 last_separator_end = match_info->pos;
1855 if (last_match_is_empty)
1856 /* if the last match was empty, g_match_info_next() has moved
1857 * forward to avoid infinite loops, but we still need to copy that
1859 last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string;
1861 match_ok = g_match_info_next (match_info, &tmp_error);
1863 g_match_info_free (match_info);
1864 if (tmp_error != NULL)
1866 g_propagate_error (error, tmp_error);
1867 g_list_foreach (list, (GFunc)g_free, NULL);
1869 match_info->pos = -1;
1873 string_list = g_new (gchar *, g_list_length (list) + 1);
1875 for (last = g_list_last (list); last; last = g_list_previous (last))
1876 string_list[i++] = last->data;
1877 string_list[i] = NULL;
/* Kind of one piece of a parsed replacement string; it tells which
 * member of InterpolationData the piece uses. */
typedef enum
{
  REPL_TYPE_STRING,             /* literal text, kept in ->text */
  REPL_TYPE_CHARACTER,          /* single character, kept in ->c */
  REPL_TYPE_SYMBOLIC_REFERENCE, /* named reference, name kept in ->text */
  REPL_TYPE_NUMERIC_REFERENCE,  /* numbered reference, kept in ->num */
  REPL_TYPE_CHANGE_CASE         /* case switch, kept in ->change_case */
} ReplType;
/* Case conversion state used while building a replacement.  Every state
 * is its own bit so that the *_MASK values can test for a group of
 * states with one bitwise AND. */
typedef enum
{
  /* leave the text as it is */
  CHANGE_CASE_NONE         = 1 << 0,
  /* convert everything that follows */
  CHANGE_CASE_UPPER        = 1 << 1,
  CHANGE_CASE_LOWER        = 1 << 2,
  /* convert only the next character, then fall back to NONE */
  CHANGE_CASE_UPPER_SINGLE = 1 << 3,
  CHANGE_CASE_LOWER_SINGLE = 1 << 4,
  /* groups of the states above */
  CHANGE_CASE_SINGLE_MASK  = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_LOWER_SINGLE,
  CHANGE_CASE_LOWER_MASK   = CHANGE_CASE_LOWER | CHANGE_CASE_LOWER_SINGLE,
  CHANGE_CASE_UPPER_MASK   = CHANGE_CASE_UPPER | CHANGE_CASE_UPPER_SINGLE
} ChangeCase;
1904 struct _InterpolationData
1910 ChangeCase change_case;
1914 free_interpolation_data (InterpolationData *data)
1916 g_free (data->text);
1920 static const gchar *
1921 expand_escape (const gchar *replacement,
1923 InterpolationData *data,
1928 const gchar *error_detail;
1930 GError *tmp_error = NULL;
1938 data->type = REPL_TYPE_CHARACTER;
1943 data->type = REPL_TYPE_CHARACTER;
1948 data->type = REPL_TYPE_CHARACTER;
1953 data->type = REPL_TYPE_CHARACTER;
1958 data->type = REPL_TYPE_CHARACTER;
1963 data->type = REPL_TYPE_CHARACTER;
1968 data->type = REPL_TYPE_CHARACTER;
1973 data->type = REPL_TYPE_CHARACTER;
1983 h = g_ascii_xdigit_value (*p);
1986 error_detail = _("hexadecimal digit or '}' expected");
1997 for (i = 0; i < 2; i++)
1999 h = g_ascii_xdigit_value (*p);
2002 error_detail = _("hexadecimal digit expected");
2009 data->type = REPL_TYPE_STRING;
2010 data->text = g_new0 (gchar, 8);
2011 g_unichar_to_utf8 (x, data->text);
2015 data->type = REPL_TYPE_CHANGE_CASE;
2016 data->change_case = CHANGE_CASE_LOWER_SINGLE;
2020 data->type = REPL_TYPE_CHANGE_CASE;
2021 data->change_case = CHANGE_CASE_UPPER_SINGLE;
2025 data->type = REPL_TYPE_CHANGE_CASE;
2026 data->change_case = CHANGE_CASE_LOWER;
2030 data->type = REPL_TYPE_CHANGE_CASE;
2031 data->change_case = CHANGE_CASE_UPPER;
2035 data->type = REPL_TYPE_CHANGE_CASE;
2036 data->change_case = CHANGE_CASE_NONE;
2042 error_detail = _("missing '<' in symbolic reference");
2051 error_detail = _("unfinished symbolic reference");
2058 error_detail = _("zero-length symbolic reference");
2061 if (g_ascii_isdigit (*q))
2066 h = g_ascii_digit_value (*q);
2069 error_detail = _("digit expected");
2078 data->type = REPL_TYPE_NUMERIC_REFERENCE;
2085 if (!g_ascii_isalnum (*r))
2087 error_detail = _("illegal symbolic reference");
2094 data->text = g_strndup (q, p - q);
2095 data->type = REPL_TYPE_SYMBOLIC_REFERENCE;
2100 /* if \0 is followed by a number it is an octal number representing a
2101 * character, else it is a numeric reference. */
2102 if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0)
2105 p = g_utf8_next_char (p);
2118 for (i = 0; i < 3; i++)
2120 h = g_ascii_digit_value (*p);
2130 if (i == 2 && base == 10)
2136 if (base == 8 || i == 3)
2138 data->type = REPL_TYPE_STRING;
2139 data->text = g_new0 (gchar, 8);
2140 g_unichar_to_utf8 (x, data->text);
2144 data->type = REPL_TYPE_NUMERIC_REFERENCE;
2149 error_detail = _("stray final '\\'");
2153 error_detail = _("unknown escape sequence");
2160 /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */
2161 tmp_error = g_error_new (G_REGEX_ERROR,
2162 G_REGEX_ERROR_REPLACE,
2163 _("Error while parsing replacement "
2164 "text \"%s\" at char %lu: %s"),
2166 (gulong)(p - replacement),
2168 g_propagate_error (error, tmp_error);
2174 split_replacement (const gchar *replacement,
2178 InterpolationData *data;
2179 const gchar *p, *start;
2181 start = p = replacement;
2186 data = g_new0 (InterpolationData, 1);
2187 start = p = expand_escape (replacement, p, data, error);
2190 g_list_foreach (list, (GFunc)free_interpolation_data, NULL);
2192 free_interpolation_data (data);
2196 list = g_list_prepend (list, data);
2201 if (*p == '\\' || *p == '\0')
2205 data = g_new0 (InterpolationData, 1);
2206 data->text = g_strndup (start, p - start);
2207 data->type = REPL_TYPE_STRING;
2208 list = g_list_prepend (list, data);
2214 return g_list_reverse (list);
/* Change the case of c based on change_case: lower-case it for the
 * CHANGE_CASE_LOWER_MASK states, upper-case it for the
 * CHANGE_CASE_UPPER_MASK states and leave it untouched otherwise
 * (CHANGE_CASE_NONE), so that callers need not special-case "no
 * conversion" before using the macro. */
#define CHANGE_CASE(c, change_case) \
        (((change_case) & CHANGE_CASE_LOWER_MASK) ? \
                g_unichar_tolower (c) : \
         ((change_case) & CHANGE_CASE_UPPER_MASK) ? \
                g_unichar_toupper (c) : \
                (c))
2224 string_append (GString *string,
2226 ChangeCase *change_case)
2230 if (text[0] == '\0')
2233 if (*change_case == CHANGE_CASE_NONE)
2235 g_string_append (string, text);
2237 else if (*change_case & CHANGE_CASE_SINGLE_MASK)
2239 c = g_utf8_get_char (text);
2240 g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
2241 g_string_append (string, g_utf8_next_char (text));
2242 *change_case = CHANGE_CASE_NONE;
2246 while (*text != '\0')
2248 c = g_utf8_get_char (text);
2249 g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
2250 text = g_utf8_next_char (text);
2256 interpolate_replacement (const GMatchInfo *match_info,
2261 InterpolationData *idata;
2263 ChangeCase change_case = CHANGE_CASE_NONE;
2265 for (list = data; list; list = list->next)
2268 switch (idata->type)
2270 case REPL_TYPE_STRING:
2271 string_append (result, idata->text, &change_case);
2273 case REPL_TYPE_CHARACTER:
2274 g_string_append_c (result, CHANGE_CASE (idata->c, change_case));
2275 if (change_case & CHANGE_CASE_SINGLE_MASK)
2276 change_case = CHANGE_CASE_NONE;
2278 case REPL_TYPE_NUMERIC_REFERENCE:
2279 match = g_match_info_fetch (match_info, idata->num);
2282 string_append (result, match, &change_case);
2286 case REPL_TYPE_SYMBOLIC_REFERENCE:
2287 match = g_match_info_fetch_named (match_info, idata->text);
2290 string_append (result, match, &change_case);
2294 case REPL_TYPE_CHANGE_CASE:
2295 change_case = idata->change_case;
2303 /* whether actual match_info is needed for replacement, i.e.
2304 * whether there are references
2307 interpolation_list_needs_match (GList *list)
2309 while (list != NULL)
2311 InterpolationData *data = list->data;
2313 if (data->type == REPL_TYPE_SYMBOLIC_REFERENCE ||
2314 data->type == REPL_TYPE_NUMERIC_REFERENCE)
2327 * @regex: a #GRegex structure
2328 * @string: the string to perform matches against
2329 * @string_len: the length of @string, or -1 if @string is nul-terminated
2330 * @start_position: starting index of the string to match
2331 * @replacement: text to replace each match with
2332 * @match_options: options for the match
2333 * @error: location to store the error occurring, or %NULL to ignore errors
2335 * Replaces all occurrences of the pattern in @regex with the
2336 * replacement text. Backreferences of the form '\number' or
2337 * '\g<number>' in the replacement text are interpolated by the
2338 * number-th captured subexpression of the match, '\g<name>' refers
2339 * to the captured subexpression with the given name. '\0' refers to the
2340 * complete match, but '\0' followed by a number is the octal representation
2341 * of a character. To include a literal '\' in the replacement, write '\\'.
2342 * There are also escapes that change the case of the following text:
2345 * <varlistentry><term>\l</term>
2347 * <para>Convert to lower case the next character</para>
2350 * <varlistentry><term>\u</term>
2352 * <para>Convert to upper case the next character</para>
2355 * <varlistentry><term>\L</term>
2357 * <para>Convert to lower case till \E</para>
2360 * <varlistentry><term>\U</term>
2362 * <para>Convert to upper case till \E</para>
2365 * <varlistentry><term>\E</term>
2367 * <para>End case modification</para>
2372 * If you do not need to use backreferences use g_regex_replace_literal().
2374 * The @replacement string must be UTF-8 encoded even if #G_REGEX_RAW was
2375 * passed to g_regex_new(). If you want to use non-UTF-8 encoded strings
2376 * you can use g_regex_replace_literal().
2378 * Setting @start_position differs from just passing over a shortened
2379 * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that
2380 * begins with any kind of lookbehind assertion, such as "\b".
2382 * Returns: a newly allocated string containing the replacements
2387 g_regex_replace (const GRegex *regex,
2388 const gchar *string,
2390 gint start_position,
2391 const gchar *replacement,
2392 GRegexMatchFlags match_options,
2397 GError *tmp_error = NULL;
2399 g_return_val_if_fail (regex != NULL, NULL);
2400 g_return_val_if_fail (string != NULL, NULL);
2401 g_return_val_if_fail (start_position >= 0, NULL);
2402 g_return_val_if_fail (replacement != NULL, NULL);
2403 g_return_val_if_fail (error == NULL || *error == NULL, NULL);
2404 g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2406 list = split_replacement (replacement, &tmp_error);
2407 if (tmp_error != NULL)
2409 g_propagate_error (error, tmp_error);
2413 result = g_regex_replace_eval (regex,
2414 string, string_len, start_position,
2416 interpolate_replacement,
2419 if (tmp_error != NULL)
2420 g_propagate_error (error, tmp_error);
2422 g_list_foreach (list, (GFunc)free_interpolation_data, NULL);
2429 literal_replacement (const GMatchInfo *match_info,
2433 g_string_append (result, data);
2438 * g_regex_replace_literal:
2439 * @regex: a #GRegex structure
2440 * @string: the string to perform matches against
2441 * @string_len: the length of @string, or -1 if @string is nul-terminated
2442 * @start_position: starting index of the string to match
2443 * @replacement: text to replace each match with
2444 * @match_options: options for the match
2445 * @error: location to store the error occurring, or %NULL to ignore errors
2447 * Replaces all occurrences of the pattern in @regex with the
2448 * replacement text. @replacement is replaced literally, to
2449 * include backreferences use g_regex_replace().
2451 * Setting @start_position differs from just passing over a
2452 * shortened string and setting #G_REGEX_MATCH_NOTBOL in the
2453 * case of a pattern that begins with any kind of lookbehind
2454 * assertion, such as "\b".
2456 * Returns: a newly allocated string containing the replacements
2461 g_regex_replace_literal (const GRegex *regex,
2462 const gchar *string,
2464 gint start_position,
2465 const gchar *replacement,
2466 GRegexMatchFlags match_options,
2469 g_return_val_if_fail (replacement != NULL, NULL);
2470 g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2472 return g_regex_replace_eval (regex,
2473 string, string_len, start_position,
2475 literal_replacement,
2476 (gpointer)replacement,
2481 * g_regex_replace_eval:
2482 * @regex: a #GRegex structure from g_regex_new()
2483 * @string: string to perform matches against
2484 * @string_len: the length of @string, or -1 if @string is nul-terminated
2485 * @start_position: starting index of the string to match
2486 * @match_options: options for the match
2487 * @eval: a function to call for each match
2488 * @user_data: user data to pass to the function
2489 * @error: location to store the error occurring, or %NULL to ignore errors
2491 * Replaces occurrences of the pattern in regex with the output of
2492 * @eval for that occurrence.
2494 * Setting @start_position differs from just passing over a shortened
2495 * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
2496 * that begins with any kind of lookbehind assertion, such as "\b".
2498 * Returns: a newly allocated string containing the replacements
/* g_regex_replace_eval: walks over the matches of @regex in @string and
 * builds a new string in which each match is replaced by whatever @eval
 * appends to the result buffer.  Text between matches and after the last
 * match is copied through from @string.
 *
 * NOTE(review): the embedded source numbers skip inside this block (for
 * example 2504 -> 2506, 2508 -> 2512, 2512 -> 2515, 2522 -> 2525,
 * 2534 -> 2536, 2545 -> 2549), so several declarations, the guard in
 * front of the strlen() call, one g_string_append_len() argument and the
 * error-path return are not visible here. */
2503 g_regex_replace_eval (const GRegex *regex,
2504 const gchar *string,
2506 gint start_position,
2507 GRegexMatchFlags match_options,
2508 GRegexEvalCallback eval,
2512 GMatchInfo *match_info;
/* done becomes the return value of @eval and ends the loop when TRUE. */
2515 gboolean done = FALSE;
/* Errors are collected locally and only propagated after cleanup. */
2516 GError *tmp_error = NULL;
/* Argument validation: non-NULL regex/string/eval, non-negative start
 * position, and no flag bits outside G_REGEX_MATCH_MASK. */
2518 g_return_val_if_fail (regex != NULL, NULL);
2519 g_return_val_if_fail (string != NULL, NULL);
2520 g_return_val_if_fail (start_position >= 0, NULL);
2521 g_return_val_if_fail (eval != NULL, NULL);
2522 g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
/* Presumably executed only when the caller passed a negative length (the
 * parameter is documented as "-1 if @string is nul-terminated"); the
 * guarding condition is not shown -- confirm. */
2525 string_len = strlen (string);
/* Pre-size the output buffer to the input length. */
2527 result = g_string_sized_new (string_len);
2529 /* run down the string making matches. */
2530 g_regex_match_full (regex, string, string_len, start_position,
2531 match_options, &match_info, &tmp_error);
2532 while (!done && g_match_info_matches (match_info))
/* Copy the text that precedes this match; the length is measured from
 * str_pos up to offsets[0] (presumably the start offset of the whole
 * match -- confirm). */
2534 g_string_append_len (result,
2536 match_info->offsets[0] - str_pos);
/* Let the callback append the replacement for this match. */
2537 done = (*eval) (match_info, result, user_data);
/* Continue copying from offsets[1] (presumably the end offset of the
 * whole match -- confirm). */
2538 str_pos = match_info->offsets[1];
2539 g_match_info_next (match_info, &tmp_error);
2541 g_match_info_free (match_info);
/* On a matching error, hand the error to the caller and discard the
 * partially built result (TRUE also frees the character data). */
2542 if (tmp_error != NULL)
2544 g_propagate_error (error, tmp_error);
2545 g_string_free (result, TRUE);
/* Copy the remainder of the input after the last processed match. */
2549 g_string_append_len (result, string + str_pos, string_len - str_pos);
/* FALSE keeps the character data, which becomes the return value. */
2550 return g_string_free (result, FALSE);
2554 * g_regex_check_replacement:
2555 * @replacement: the replacement string
2556 * @has_references: location to store information about
2557 * references in @replacement or %NULL
2558 * @error: location to store error
2560 * Checks whether @replacement is a valid replacement string
2561 * (see g_regex_replace()), i.e. that all escape sequences in
2564 * If @has_references is not %NULL then @replacement is checked
2565 * for pattern references. For instance, replacement text 'foo\n'
2566 * does not contain references and may be evaluated without information
2567 * about actual match, but '\0\1' (whole match followed by first
2568 * subpattern) requires valid #GMatchInfo object.
2570 * Returns: whether @replacement is a valid replacement string
/* g_regex_check_replacement: parses @replacement with split_replacement()
 * to find out whether it is a valid replacement string and, optionally,
 * whether it contains references to the match.
 *
 * NOTE(review): the embedded source numbers skip inside this block
 * (2576 -> 2582, 2582 -> 2586, 2586 -> 2591, 2593 -> 2600), so the local
 * declarations, the conditions around the statements below and the return
 * statements are not visible here. */
2575 g_regex_check_replacement (const gchar *replacement,
2576 gboolean *has_references,
/* Parse the replacement; a parse failure is reported through tmp. */
2582 list = split_replacement (replacement, &tmp);
/* Forward the parse error to the caller (presumably only when tmp was
 * set -- the condition is not shown). */
2586 g_propagate_error (error, tmp);
/* Report whether any parsed piece needs information from a match
 * (presumably guarded by a NULL check on has_references, which the doc
 * comment allows to be %NULL -- the condition is not shown). */
2591 *has_references = interpolation_list_needs_match (list);
/* Release every element of the parsed list. */
2593 g_list_foreach (list, (GFunc) free_interpolation_data, NULL);
2600 * g_regex_escape_string:
2601 * @string: the string to escape
2602 * @length: the length of @string, or -1 if @string is nul-terminated
2604 * Escapes the special characters used for regular expressions
2605 * in @string, for instance "a.b*c" becomes "a\.b\*c". This
2606 * function is useful to dynamically generate regular expressions.
2608 * @string can contain nul characters that are replaced with "\0",
2609 * in this case remember to specify the correct length of @string
2612 * Returns: a newly-allocated escaped string
/* g_regex_escape_string: builds a copy of @string in which characters that
 * need escaping are prefixed with a backslash.  Runs of characters that need
 * no escaping are copied in one piece rather than character by character.
 *
 * NOTE(review): the embedded source numbers skip inside this block
 * (2617 -> 2621, 2623 -> 2626, 2630 -> 2651, 2654 -> 2656, 2658 -> 2662),
 * so the second parameter, the loop header, the list of characters that get
 * escaped and the branch conditions are not visible here. */
2617 g_regex_escape_string (const gchar *string,
/* p is the scan position, piece_start the beginning of the not-yet-copied
 * run, end the first byte past the input. */
2621 const char *p, *piece_start, *end;
2623 g_return_val_if_fail (string != NULL, NULL);
/* Presumably executed only for a negative length (documented as "-1 if
 * @string is nul-terminated"); the guarding condition is not shown. */
2626 length = strlen (string);
2628 end = string + length;
2629 p = piece_start = string;
/* Start with room for the unescaped input plus one byte. */
2630 escaped = g_string_sized_new (length + 1);
2651 if (p != piece_start)
2652 /* copy the previous piece. */
2653 g_string_append_len (escaped, piece_start, p - piece_start);
/* Emit the escaping backslash, followed either by '0' or by the character
 * itself.  Per the doc comment nul characters become "\0", so the '0'
 * branch is presumably the nul case -- the condition is not shown. */
2654 g_string_append_c (escaped, '\\');
2656 g_string_append_c (escaped, '0');
2658 g_string_append_c (escaped, *p);
/* Characters that need no escaping are skipped one UTF-8 character at a
 * time. */
2662 p = g_utf8_next_char (p);
/* Copy whatever unescaped run is left at the end of the input. */
2667 if (piece_start < end)
2668 g_string_append_len (escaped, piece_start, end - piece_start);
/* FALSE keeps the character data, which becomes the return value. */
2670 return g_string_free (escaped, FALSE);
/* Defines __G_REGEX_C__ immediately before including galiasdef.c;
 * presumably the macro selects this file's section of the alias
 * definitions -- confirm against galiasdef.c. */
2673 #define __G_REGEX_C__
2674 #include "galiasdef.c"