glib/gregex.c

   1 /* GRegex -- regular expression API wrapper around PCRE.
   2  *
   3  * Copyright (C) 1999, 2000 Scott Wimer
   4  * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com>
   5  * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org>
   6  *
   7  * This library is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * This library is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with this library; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  20  */
  21
  22 #include "config.h"
  23
  24 #include <string.h>
  25
  26 #ifdef USE_SYSTEM_PCRE
  27 #include <pcre.h>
  28 #else
  29 #include "pcre/pcre.h"
  30 #endif
  31
  32 #include "gtypes.h"
  33 #include "gregex.h"
  34 #include "glibintl.h"
  35 #include "glist.h"
  36 #include "gmessages.h"
  37 #include "gstrfuncs.h"
  38 #include "gatomic.h"
  39 #include "gthread.h"
  40
  41 /**
  42  * SECTION:gregex
  43  * @title: Perl-compatible regular expressions
  44  * @short_description: matches strings against regular expressions
  45  * @see_also: <xref linkend="glib-regex-syntax"/>
  46  *
  47  * The <function>g_regex_*()</function> functions implement regular
  48  * expression pattern matching using syntax and semantics similar to
  49  * Perl regular expressions.
  50  *
  51  * Some functions accept a @start_position argument; setting it differs
  52  * from just passing over a shortened string and setting #G_REGEX_MATCH_NOTBOL
  53  * in the case of a pattern that begins with any kind of lookbehind assertion.
  54  * For example, consider the pattern "\Biss\B" which finds occurrences of "iss"
  55  * in the middle of words. ("\B" matches only if the current position in the
  56  * subject is not a word boundary.) When applied to the string "Mississipi"
  57  * from the fourth byte, namely "issipi", it does not match, because "\B" is
  58  * always false at the start of the subject, which is deemed to be a word
  59  * boundary. However, if the entire string is passed, but with
  60  * @start_position set to 4, it finds the second occurrence of "iss" because
  61  * it is able to look behind the starting point to discover that it is
  62  * preceded by a letter.
  63  *
  64  * Note that, unless you set the #G_REGEX_RAW flag, all the strings passed
  65  * to these functions must be encoded in UTF-8. The lengths and the positions
  66  * inside the strings are in bytes and not in characters, so, for instance,
  67  * "\xc3\xa0" (i.e. "&agrave;") is two bytes long but it is treated as a
  68  * single character. If you set #G_REGEX_RAW the strings can be non-valid
  69  * UTF-8 strings and a byte is treated as a character, so "\xc3\xa0" is two
  70  * bytes and two characters long.
  71  *
  72  * When matching a pattern, "\n" matches only against a "\n" character in
  73  * the string, and "\r" matches only a "\r" character. To match any newline
  74  * sequence use "\R". This particular group matches either the two-character
  75  * sequence CR + LF ("\r\n"), or one of the single characters LF (linefeed,
  76  * U+000A, "\n"), VT (vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"),
  77  * CR (carriage return, U+000D, "\r"), NEL (next line, U+0085), LS (line
  78  * separator, U+2028), or PS (paragraph separator, U+2029).
  79  *
  80  * The behaviour of the dot, circumflex, and dollar metacharacters is
  81  * affected by newline characters; the default is to recognize any newline
  82  * character (the same characters recognized by "\R"). This can be changed
  83  * with #G_REGEX_NEWLINE_CR, #G_REGEX_NEWLINE_LF and #G_REGEX_NEWLINE_CRLF
  84  * compile options, and with #G_REGEX_MATCH_NEWLINE_ANY,
  85  * #G_REGEX_MATCH_NEWLINE_CR, #G_REGEX_MATCH_NEWLINE_LF and
  86  * #G_REGEX_MATCH_NEWLINE_CRLF match options. These settings are also
  87  * relevant when compiling a pattern if #G_REGEX_EXTENDED is set, and an
  88  * unescaped "#" outside a character class is encountered. This indicates
  89  * a comment that lasts until after the next newline.
  90  *
  91  * Creating and manipulating the same #GRegex structure from different
  92  * threads is not a problem as #GRegex does not modify its internal
  93  * state between creation and destruction; on the other hand, #GMatchInfo
  94  * is not threadsafe.
  95  *
  96  * The low-level regular expression functionality is provided by
  97  * the excellent <ulink url="http://www.pcre.org/">PCRE</ulink> library
  98  * written by Philip Hazel.
  99  */
 100
 101 /* Bitwise OR of every GRegexCompileFlags value listed below.
      * NOTE(review): presumably used to reject unknown compile-flag bits
      * before they reach PCRE -- confirm against the callers, which are
      * not part of this section of the file. */
 102 #define G_REGEX_COMPILE_MASK (G_REGEX_CASELESS          | \
 103                               G_REGEX_MULTILINE         | \
 104                               G_REGEX_DOTALL            | \
 105                               G_REGEX_EXTENDED          | \
 106                               G_REGEX_ANCHORED          | \
 107                               G_REGEX_DOLLAR_ENDONLY    | \
 108                               G_REGEX_UNGREEDY          | \
 109                               G_REGEX_RAW               | \
 110                               G_REGEX_NO_AUTO_CAPTURE   | \
 111                               G_REGEX_OPTIMIZE          | \
 112                               G_REGEX_DUPNAMES          | \
 113                               G_REGEX_NEWLINE_CR        | \
 114                               G_REGEX_NEWLINE_LF        | \
 115                               G_REGEX_NEWLINE_CRLF)
 116
 117 /* Bitwise OR of every GRegexMatchFlags value listed below.
      * NOTE(review): presumably used to reject unknown match-flag bits
      * before they reach PCRE -- confirm against the callers, which are
      * not part of this section of the file. */
 118 #define G_REGEX_MATCH_MASK (G_REGEX_MATCH_ANCHORED      | \
 119                             G_REGEX_MATCH_NOTBOL        | \
 120                             G_REGEX_MATCH_NOTEOL        | \
 121                             G_REGEX_MATCH_NOTEMPTY      | \
 122                             G_REGEX_MATCH_PARTIAL       | \
 123                             G_REGEX_MATCH_NEWLINE_CR    | \
 124                             G_REGEX_MATCH_NEWLINE_LF    | \
 125                             G_REGEX_MATCH_NEWLINE_CRLF  | \
 126                             G_REGEX_MATCH_NEWLINE_ANY)
 127
 128 /* Step a string pointer s forward (NEXT_CHAR) or backward (PREV_CHAR)
 129  * by one character: if the PCRE_UTF8 bit is set in re->compile_opts the
      * g_utf8_ functions are used, otherwise the pointer moves by one byte.
      * Both macros evaluate s once but are function-like macros, so avoid
      * arguments with side effects. */
 130 #define NEXT_CHAR(re, s) (((re)->compile_opts & PCRE_UTF8) ? \
 131                                 g_utf8_next_char (s) : \
 132                                 ((s) + 1))
 133 #define PREV_CHAR(re, s) (((re)->compile_opts & PCRE_UTF8) ? \
 134                                 g_utf8_prev_char (s) : \
 135                                 ((s) - 1))
 136
 137 struct _GMatchInfo
 138 {
       /* State of one matching operation: the regex and subject string it
        * runs on, where the next scan starts, and the buffers PCRE fills. */
 139   volatile gint ref_count;      /* the ref count */
 140   GRegex *regex;                /* the regex */
 141   GRegexMatchFlags match_opts;  /* options used at match time on the regex */
 142   gint matches;                 /* number of matching sub patterns, or a negative PCRE code */
 143   gint pos;                     /* position in the string where last match left off */
 144   gint  n_offsets;              /* number of offsets */
 145   gint *offsets;                /* array of offsets paired 0,1 ; 2,3 ; 4,5 etc */
 146   gint *workspace;              /* workspace for pcre_dfa_exec() */
 147   gint n_workspace;             /* number of workspace elements */
 148   const gchar *string;          /* string passed to the match function (not copied) */
 149   gssize string_len;            /* length of string */
 150 };
 151
 152 struct _GRegex
 153 {
       /* A compiled pattern together with the options it was built with. */
 154   volatile gint ref_count;      /* the ref count for the immutable part */
 155   gchar *pattern;               /* the pattern */
 156   pcre *pcre_re;                /* compiled form of the pattern */
 157   GRegexCompileFlags compile_opts;      /* options used at compile time on the pattern */
 158   GRegexMatchFlags match_opts;  /* options used at match time on the regex */
 159   pcre_extra *extra;            /* data stored when G_REGEX_OPTIMIZE is used */
 160 };
 161
 162 /* TRUE if ret is an error code, FALSE otherwise: every value below
      * PCRE_ERROR_NOMATCH counts as an error except PCRE_ERROR_PARTIAL,
      * which reports a partial match. ret is evaluated twice. */
 163 #define IS_PCRE_ERROR(ret) ((ret) < PCRE_ERROR_NOMATCH && (ret) != PCRE_ERROR_PARTIAL)
 164
 165 typedef struct _InterpolationData InterpolationData;
 166 static gboolean  interpolation_list_needs_match (GList *list);
 167 static gboolean  interpolate_replacement        (const GMatchInfo *match_info,
 168                                                  GString *result,
 169                                                  gpointer data);
 170 static GList    *split_replacement              (const gchar *replacement,
 171                                                  GError **error);
 172 static void      free_interpolation_data        (InterpolationData *data);
 173
 174
 175 static const gchar *
 176 match_error (gint errcode)
 177 {
 178   switch (errcode)
 179     {
 180     case PCRE_ERROR_NOMATCH:
 181       /* not an error */
 182       break;
 183     case PCRE_ERROR_NULL:
 184       /* NULL argument, this should not happen in GRegex */
 185       g_warning ("A NULL argument was passed to PCRE");
 186       break;
 187     case PCRE_ERROR_BADOPTION:
 188       return "bad options";
 189     case PCRE_ERROR_BADMAGIC:
 190       return _("corrupted object");
 191     case PCRE_ERROR_UNKNOWN_OPCODE:
 192       return N_("internal error or corrupted object");
 193     case PCRE_ERROR_NOMEMORY:
 194       return _("out of memory");
 195     case PCRE_ERROR_NOSUBSTRING:
 196       /* not used by pcre_exec() */
 197       break;
 198     case PCRE_ERROR_MATCHLIMIT:
 199       return _("backtracking limit reached");
 200     case PCRE_ERROR_CALLOUT:
 201       /* callouts are not implemented */
 202       break;
 203     case PCRE_ERROR_BADUTF8:
 204     case PCRE_ERROR_BADUTF8_OFFSET:
 205       /* we do not check if strings are valid */
 206       break;
 207     case PCRE_ERROR_PARTIAL:
 208       /* not an error */
 209       break;
 210     case PCRE_ERROR_BADPARTIAL:
 211       return _("the pattern contains items not supported for partial matching");
 212     case PCRE_ERROR_INTERNAL:
 213       return _("internal error");
 214     case PCRE_ERROR_BADCOUNT:
 215       /* negative ovecsize, this should not happen in GRegex */
 216       g_warning ("A negative ovecsize was passed to PCRE");
 217       break;
 218     case PCRE_ERROR_DFA_UITEM:
 219       return _("the pattern contains items not supported for partial matching");
 220     case PCRE_ERROR_DFA_UCOND:
 221       return _("back references as conditions are not supported for partial matching");
 222     case PCRE_ERROR_DFA_UMLIMIT:
 223       /* the match_field field is not used in GRegex */
 224       break;
 225     case PCRE_ERROR_DFA_WSSIZE:
 226       /* handled expanding the workspace */
 227       break;
 228     case PCRE_ERROR_DFA_RECURSE:
 229     case PCRE_ERROR_RECURSIONLIMIT:
 230       return _("recursion limit reached");
 231     case PCRE_ERROR_NULLWSLIMIT:
 232       return _("workspace limit for empty substrings reached");
 233     case PCRE_ERROR_BADNEWLINE:
 234       return _("invalid combination of newline flags");
 235     case PCRE_ERROR_BADOFFSET:
 236       return _("bad offset");
 237     case PCRE_ERROR_SHORTUTF8:
 238       return _("short utf8");
 239     case PCRE_ERROR_RECURSELOOP:
 240       return _("recursion loop");
 241     default:
 242       break;
 243     }
 244   return _("unknown error");
 245 }
 246
 247 static void
 248 translate_compile_error (gint *errcode, const gchar **errmsg)
 249 {
 250   /* Compile errors are created adding 100 to the error code returned
 251    * by PCRE.
 252    * If errcode is known we put the translatable error message in
 253    * erromsg. If errcode is unknown we put the generic
 254    * G_REGEX_ERROR_COMPILE error code in errcode and keep the
 255    * untranslated error message returned by PCRE.
 256    * Note that there can be more PCRE errors with the same GRegexError
 257    * and that some PCRE errors are useless for us.
 258    */
 259   *errcode += 100;
 260
 261   switch (*errcode)
 262     {
 263     case G_REGEX_ERROR_STRAY_BACKSLASH:
 264       *errmsg = _("\\ at end of pattern");
 265       break;
 266     case G_REGEX_ERROR_MISSING_CONTROL_CHAR:
 267       *errmsg = _("\\c at end of pattern");
 268       break;
 269     case G_REGEX_ERROR_UNRECOGNIZED_ESCAPE:
 270       *errmsg = _("unrecognized character follows \\");
 271       break;
 272     case G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER:
 273       *errmsg = _("numbers out of order in {} quantifier");
 274       break;
 275     case G_REGEX_ERROR_QUANTIFIER_TOO_BIG:
 276       *errmsg = _("number too big in {} quantifier");
 277       break;
 278     case G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS:
 279       *errmsg = _("missing terminating ] for character class");
 280       break;
 281     case G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS:
 282       *errmsg = _("invalid escape sequence in character class");
 283       break;
 284     case G_REGEX_ERROR_RANGE_OUT_OF_ORDER:
 285       *errmsg = _("range out of order in character class");
 286       break;
 287     case G_REGEX_ERROR_NOTHING_TO_REPEAT:
 288       *errmsg = _("nothing to repeat");
 289       break;
 290     case 111: /* internal error: unexpected repeat */
 291       *errcode = G_REGEX_ERROR_INTERNAL;
 292       *errmsg = _("unexpected repeat");
 293       break;
 294     case G_REGEX_ERROR_UNRECOGNIZED_CHARACTER:
 295       *errmsg = _("unrecognized character after (? or (?-");
 296       break;
 297     case G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS:
 298       *errmsg = _("POSIX named classes are supported only within a class");
 299       break;
 300     case G_REGEX_ERROR_UNMATCHED_PARENTHESIS:
 301       *errmsg = _("missing terminating )");
 302       break;
 303     case G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE:
 304       *errmsg = _("reference to non-existent subpattern");
 305       break;
 306     case G_REGEX_ERROR_UNTERMINATED_COMMENT:
 307       *errmsg = _("missing ) after comment");
 308       break;
 309     case G_REGEX_ERROR_EXPRESSION_TOO_LARGE:
 310       *errmsg = _("regular expression is too large");
 311       break;
 312     case G_REGEX_ERROR_MEMORY_ERROR:
 313       *errmsg = _("failed to get memory");
 314       break;
 315     case 122: /* unmatched parentheses */
 316       *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
 317       *errmsg = _(") without opening (");
 318       break;
 319     case 123: /* internal error: code overflow */
 320       *errcode = G_REGEX_ERROR_INTERNAL;
 321       *errmsg = _("code overflow");
 322       break;
 323     case 124: /* "unrecognized character after (?<\0 */
 324       *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
 325       *errmsg = _("unrecognized character after (?<");
 326       break;
 327     case G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND:
 328       *errmsg = _("lookbehind assertion is not fixed length");
 329       break;
 330     case G_REGEX_ERROR_MALFORMED_CONDITION:
 331       *errmsg = _("malformed number or name after (?(");
 332       break;
 333     case G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES:
 334       *errmsg = _("conditional group contains more than two branches");
 335       break;
 336     case G_REGEX_ERROR_ASSERTION_EXPECTED:
 337       *errmsg = _("assertion expected after (?(");
 338       break;
 339     case 129:
 340       *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
 341       /* translators: '(?R' and '(?[+-]digits' are both meant as (groups of)
 342        * sequences here, '(?-54' would be an example for the second group.
 343        */
 344       *errmsg = _("(?R or (?[+-]digits must be followed by )");
 345       break;
 346     case G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME:
 347       *errmsg = _("unknown POSIX class name");
 348       break;
 349     case G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED:
 350       *errmsg = _("POSIX collating elements are not supported");
 351       break;
 352     case G_REGEX_ERROR_HEX_CODE_TOO_LARGE:
 353       *errmsg = _("character value in \\x{...} sequence is too large");
 354       break;
 355     case G_REGEX_ERROR_INVALID_CONDITION:
 356       *errmsg = _("invalid condition (?(0)");
 357       break;
 358     case G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND:
 359       *errmsg = _("\\C not allowed in lookbehind assertion");
 360       break;
 361     case 137: /* PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0 */
 362       /* A number of Perl escapes are not handled by PCRE.
 363        * Therefore it explicitly raises ERR37.
 364        */
 365       *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE;
 366       *errmsg = _("escapes \\L, \\l, \\N{name}, \\U, and \\u are not supported");
 367       break;
 368     case G_REGEX_ERROR_INFINITE_LOOP:
 369       *errmsg = _("recursive call could loop indefinitely");
 370       break;
 371     case 141: /* unrecognized character after (?P\0 */
 372       *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
 373       *errmsg = _("unrecognized character after (?P");
 374       break;
 375     case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR:
 376       *errmsg = _("missing terminator in subpattern name");
 377       break;
 378     case G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME:
 379       *errmsg = _("two named subpatterns have the same name");
 380       break;
 381     case G_REGEX_ERROR_MALFORMED_PROPERTY:
 382       *errmsg = _("malformed \\P or \\p sequence");
 383       break;
 384     case G_REGEX_ERROR_UNKNOWN_PROPERTY:
 385       *errmsg = _("unknown property name after \\P or \\p");
 386       break;
 387     case G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG:
 388       *errmsg = _("subpattern name is too long (maximum 32 characters)");
 389       break;
 390     case G_REGEX_ERROR_TOO_MANY_SUBPATTERNS:
 391       *errmsg = _("too many named subpatterns (maximum 10,000)");
 392       break;
 393     case G_REGEX_ERROR_INVALID_OCTAL_VALUE:
 394       *errmsg = _("octal value is greater than \\377");
 395       break;
 396     case 152: /* internal error: overran compiling workspace */
 397       *errcode = G_REGEX_ERROR_INTERNAL;
 398       *errmsg = _("overran compiling workspace");
 399       break;
 400     case 153: /* internal error: previously-checked referenced subpattern not found */
 401       *errcode = G_REGEX_ERROR_INTERNAL;
 402       *errmsg = _("previously-checked referenced subpattern not found");
 403       break;
 404     case G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE:
 405       *errmsg = _("DEFINE group contains more than one branch");
 406       break;
 407     case G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS:
 408       *errmsg = _("inconsistent NEWLINE options");
 409       break;
 410     case G_REGEX_ERROR_MISSING_BACK_REFERENCE:
 411       *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or "
 412                   "number, or by a plain number");
 413       break;
 414     case G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE:
 415       *errmsg = _("a numbered reference must not be zero");
 416       break;
 417     case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN:
 418       *errmsg = _("an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)");
 419       break;
 420     case G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB:
 421       *errmsg = _("(*VERB) not recognized");
 422       break;
 423     case G_REGEX_ERROR_NUMBER_TOO_BIG:
 424       *errmsg = _("number is too bug");
 425       break;
 426     case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME:
 427       *errmsg = _("missing subpattern name after (?&");
 428       break;
 429     case G_REGEX_ERROR_MISSING_DIGIT:
 430       *errmsg = _("digit expected after (?+");
 431       break;
 432     case G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME:
 433       *errmsg = _("different names for subpatterns of the same number are not allowed");
 434       break;
 435     case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED:
 436       *errmsg = _("(*MARK) must have an argument");
 437       break;
 438     case G_REGEX_ERROR_INVALID_CONTROL_CHAR:
 439       *errmsg = _( "\\c must be followed by an ASCII character");
 440       break;
 441     case G_REGEX_ERROR_MISSING_NAME:
 442       *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name");
 443       break;
 444     case G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS:
 445       *errmsg = _("\\N is not supported in a class");
 446       break;
 447     case G_REGEX_ERROR_TOO_MANY_FORWARD_REFERENCES:
 448       *errmsg = _("too many forward references");
 449       break;
 450     case G_REGEX_ERROR_NAME_TOO_LONG:
 451       *errmsg = _("name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)");
 452       break;
 453
 454     case 116: /* erroffset passed as NULL */
 455       /* This should not happen as we never pass a NULL erroffset */
 456       g_warning ("erroffset passed as NULL");
 457       *errcode = G_REGEX_ERROR_COMPILE;
 458       break;
 459     case 117: /* unknown option bit(s) set */
 460       /* This should not happen as we check options before passing them
 461        * to pcre_compile2() */
 462       g_warning ("unknown option bit(s) set");
 463       *errcode = G_REGEX_ERROR_COMPILE;
 464       break;
 465     case 132: /* this version of PCRE is compiled without UTF support */
 466     case 144: /* invalid UTF-8 string */
 467     case 145: /* support for \\P, \\p, and \\X has not been compiled */
 468     case 167: /* this version of PCRE is not compiled with Unicode property support */
 469     case 173: /* disallowed Unicode code point (>= 0xd800 && <= 0xdfff) */
 470     case 174: /* invalid UTF-16 string */
 471       /* These errors should not happen as we are using an UTF-8 and UCP-enabled PCRE
 472        * and we do not check if strings are valid */
 473     case 164: /* ] is an invalid data character in JavaScript compatibility mode */
 474       /* This should not happen as we don't use PCRE_JAVASCRIPT_COMPAT */
 475       g_warning ("%s", *errmsg);
 476       *errcode = G_REGEX_ERROR_COMPILE;
 477       break;
 478     case 170: /* internal error: unknown opcode in find_fixedlength() */
 479       *errcode = G_REGEX_ERROR_INTERNAL;
 480       break;
 481
 482     default:
 483       *errcode = G_REGEX_ERROR_COMPILE;
 484     }
 485 }
 486
 487 /* GMatchInfo */
 488
 489 static GMatchInfo *
 490 match_info_new (const GRegex *regex,
 491                 const gchar  *string,
 492                 gint          string_len,
 493                 gint          start_position,
 494                 gint          match_options,
 495                 gboolean      is_dfa)
 496 {
 497   GMatchInfo *match_info;
 498
 499   if (string_len < 0)
 500     string_len = strlen (string);
 501
 502   match_info = g_new0 (GMatchInfo, 1);
 503   match_info->ref_count = 1;
 504   match_info->regex = g_regex_ref ((GRegex *)regex);
 505   match_info->string = string;
 506   match_info->string_len = string_len;
 507   match_info->matches = PCRE_ERROR_NOMATCH;
 508   match_info->pos = start_position;
 509   match_info->match_opts = match_options;
 510
 511   if (is_dfa)
 512     {
 513       /* These values should be enough for most cases, if they are not
 514        * enough g_regex_match_all_full() will expand them. */
 515       match_info->n_offsets = 24;
 516       match_info->n_workspace = 100;
 517       match_info->workspace = g_new (gint, match_info->n_workspace);
 518     }
 519   else
 520     {
 521       gint capture_count;
 522       pcre_fullinfo (regex->pcre_re, regex->extra,
 523                      PCRE_INFO_CAPTURECOUNT, &capture_count);
 524       match_info->n_offsets = (capture_count + 1) * 3;
 525     }
 526
 527   match_info->offsets = g_new0 (gint, match_info->n_offsets);
 528   /* Set an invalid position for the previous match. */
 529   match_info->offsets[0] = -1;
 530   match_info->offsets[1] = -1;
 531
 532   return match_info;
 533 }
 534
 535 /**
 536  * g_match_info_get_regex:
 537  * @match_info: a #GMatchInfo
 538  *
 539  * Returns #GRegex object used in @match_info. It belongs to Glib
 540  * and must not be freed. Use g_regex_ref() if you need to keep it
 541  * after you free @match_info object.
 542  *
 543  * Returns: #GRegex object used in @match_info
 544  *
 545  * Since: 2.14
 546  */
 547 GRegex *
 548 g_match_info_get_regex (const GMatchInfo *match_info)
 549 {
 550   g_return_val_if_fail (match_info != NULL, NULL);
 551   return match_info->regex;
 552 }
 553
 554 /**
 555  * g_match_info_get_string:
 556  * @match_info: a #GMatchInfo
 557  *
 558  * Returns the string searched with @match_info. This is the
 559  * string passed to g_regex_match() or g_regex_replace() so
 560  * you may not free it before calling this function.
 561  *
 562  * Returns: the string searched with @match_info
 563  *
 564  * Since: 2.14
 565  */
 566 const gchar *
 567 g_match_info_get_string (const GMatchInfo *match_info)
 568 {
 569   g_return_val_if_fail (match_info != NULL, NULL);
 570   return match_info->string;
 571 }
 572
 573 /**
 574  * g_match_info_ref:
 575  * @match_info: a #GMatchInfo
 576  *
 577  * Increases reference count of @match_info by 1.
 578  *
 579  * Returns: @match_info
 580  *
 581  * Since: 2.30
 582  */
 583 GMatchInfo       *
 584 g_match_info_ref (GMatchInfo *match_info)
 585 {
 586   g_return_val_if_fail (match_info != NULL, NULL);
 587   g_atomic_int_inc (&match_info->ref_count);
 588   return match_info;
 589 }
 590
 591 /**
 592  * g_match_info_unref:
 593  * @match_info: a #GMatchInfo
 594  *
 595  * Decreases reference count of @match_info by 1. When reference count drops
 596  * to zero, it frees all the memory associated with the match_info structure.
 597  *
 598  * Since: 2.30
 599  */
 600 void
 601 g_match_info_unref (GMatchInfo *match_info)
 602 {
 603   if (g_atomic_int_dec_and_test (&match_info->ref_count))
 604     {
 605       g_regex_unref (match_info->regex);
 606       g_free (match_info->offsets);
 607       g_free (match_info->workspace);
 608       g_free (match_info);
 609     }
 610 }
 611
 612 /**
 613  * g_match_info_free:
 614  * @match_info: (allow-none): a #GMatchInfo, or %NULL
 615  *
 616  * If @match_info is not %NULL, calls g_match_info_unref(); otherwise does
 617  * nothing.
 618  *
 619  * Since: 2.14
 620  */
 621 void
 622 g_match_info_free (GMatchInfo *match_info)
 623 {
 624   if (match_info == NULL)
 625     return;
 626
 627   g_match_info_unref (match_info);
 628 }
 629
 630 /**
 631  * g_match_info_next:
 632  * @match_info: a #GMatchInfo structure
 633  * @error: location to store the error occurring, or %NULL to ignore errors
 634  *
 635  * Scans for the next match using the same parameters of the previous
 636  * call to g_regex_match_full() or g_regex_match() that returned
 637  * @match_info.
 638  *
 639  * The match is done on the string passed to the match function, so you
 640  * cannot free it before calling this function.
 641  *
 642  * Returns: %TRUE is the string matched, %FALSE otherwise
 643  *
 644  * Since: 2.14
 645  */
 646 gboolean
 647 g_match_info_next (GMatchInfo  *match_info,
 648                    GError     **error)
 649 {
 650   gint prev_match_start;
 651   gint prev_match_end;
 652
 653   g_return_val_if_fail (match_info != NULL, FALSE);
 654   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
 655   g_return_val_if_fail (match_info->pos >= 0, FALSE);
 656
 657   prev_match_start = match_info->offsets[0];
 658   prev_match_end = match_info->offsets[1];
 659
 660   if (match_info->pos > match_info->string_len)
 661     {
 662       /* we have reached the end of the string */
 663       match_info->pos = -1;
 664       match_info->matches = PCRE_ERROR_NOMATCH;
 665       return FALSE;
 666     }
 667
 668   match_info->matches = pcre_exec (match_info->regex->pcre_re,
 669                                    match_info->regex->extra,
 670                                    match_info->string,
 671                                    match_info->string_len,
 672                                    match_info->pos,
 673                                    match_info->regex->match_opts | match_info->match_opts,
 674                                    match_info->offsets,
 675                                    match_info->n_offsets);
 676   if (IS_PCRE_ERROR (match_info->matches))
 677     {
 678       g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
 679                    _("Error while matching regular expression %s: %s"),
 680                    match_info->regex->pattern, match_error (match_info->matches));
 681       return FALSE;
 682     }
 683
 684   /* avoid infinite loops if the pattern is an empty string or something
 685    * equivalent */
 686   if (match_info->pos == match_info->offsets[1])
 687     {
 688       if (match_info->pos > match_info->string_len)
 689         {
 690           /* we have reached the end of the string */
 691           match_info->pos = -1;
 692           match_info->matches = PCRE_ERROR_NOMATCH;
 693           return FALSE;
 694         }
 695
 696       match_info->pos = NEXT_CHAR (match_info->regex,
 697                                    &match_info->string[match_info->pos]) -
 698                                    match_info->string;
 699     }
 700   else
 701     {
 702       match_info->pos = match_info->offsets[1];
 703     }
 704
 705   /* it's possible to get two identical matches when we are matching
 706    * empty strings, for instance if the pattern is "(?=[A-Z0-9])" and
 707    * the string is "RegExTest" we have:
 708    *  - search at position 0: match from 0 to 0
 709    *  - search at position 1: match from 3 to 3
 710    *  - search at position 3: match from 3 to 3 (duplicate)
 711    *  - search at position 4: match from 5 to 5
 712    *  - search at position 5: match from 5 to 5 (duplicate)
 713    *  - search at position 6: no match -> stop
 714    * so we have to ignore the duplicates.
 715    * see bug #515944: http://bugzilla.gnome.org/show_bug.cgi?id=515944 */
 716   if (match_info->matches >= 0 &&
 717       prev_match_start == match_info->offsets[0] &&
 718       prev_match_end == match_info->offsets[1])
 719     {
 720       /* ignore this match and search the next one */
 721       return g_match_info_next (match_info, error);
 722     }
 723
 724   return match_info->matches >= 0;
 725 }
 726
 727 /**
 728  * g_match_info_matches:
 729  * @match_info: a #GMatchInfo structure
 730  *
 731  * Returns whether the previous match operation succeeded.
 732  *
 733  * Returns: %TRUE if the previous match operation succeeded,
 734  *   %FALSE otherwise
 735  *
 736  * Since: 2.14
 737  */
 738 gboolean
 739 g_match_info_matches (const GMatchInfo *match_info)
 740 {
 741   g_return_val_if_fail (match_info != NULL, FALSE);
 742
 743   return match_info->matches >= 0;
 744 }
 745
 746 /**
 747  * g_match_info_get_match_count:
 748  * @match_info: a #GMatchInfo structure
 749  *
 750  * Retrieves the number of matched substrings (including substring 0,
 751  * that is the whole matched text), so 1 is returned if the pattern
 752  * has no substrings in it and 0 is returned if the match failed.
 753  *
 754  * If the last match was obtained using the DFA algorithm, that is
 755  * using g_regex_match_all() or g_regex_match_all_full(), the retrieved
 756  * count is not that of the number of capturing parentheses but that of
 757  * the number of matched substrings.
 758  *
 759  * Returns: Number of matched substrings, or -1 if an error occurred
 760  *
 761  * Since: 2.14
 762  */
 763 gint
 764 g_match_info_get_match_count (const GMatchInfo *match_info)
 765 {
 766   g_return_val_if_fail (match_info, -1);
 767
 768   if (match_info->matches == PCRE_ERROR_NOMATCH)
 769     /* no match */
 770     return 0;
 771   else if (match_info->matches < PCRE_ERROR_NOMATCH)
 772     /* error */
 773     return -1;
 774   else
 775     /* match */
 776     return match_info->matches;
 777 }
 778
 779 /**
 780  * g_match_info_is_partial_match:
 781  * @match_info: a #GMatchInfo structure
 782  *
 783  * Usually if the string passed to g_regex_match*() matches as far as
 784  * it goes, but is too short to match the entire pattern, %FALSE is
 785  * returned. There are circumstances where it might be helpful to
 786  * distinguish this case from other cases in which there is no match.
 787  *
 788  * Consider, for example, an application where a human is required to
 789  * type in data for a field with specific formatting requirements. An
 790  * example might be a date in the form ddmmmyy, defined by the pattern
 791  * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$".
 792  * If the application sees the user's keystrokes one by one, and can
 793  * check that what has been typed so far is potentially valid, it is
 794  * able to raise an error as soon as a mistake is made.
 795  *
 796  * GRegex supports the concept of partial matching by means of the
 797  * #G_REGEX_MATCH_PARTIAL flag. When this is set the return code for
 798  * g_regex_match() or g_regex_match_full() is, as usual, %TRUE
 799  * for a complete match, %FALSE otherwise. But, when these functions
 800  * return %FALSE, you can check if the match was partial calling
 801  * g_match_info_is_partial_match().
 802  *
 803  * When using partial matching you cannot use g_match_info_fetch*().
 804  *
 805  * Because of the way certain internal optimizations are implemented
 806  * the partial matching algorithm cannot be used with all patterns.
 807  * So repeated single characters such as "a{2,4}" and repeated single
 808  * meta-sequences such as "\d+" are not permitted if the maximum number
 809  * of occurrences is greater than one. Optional items such as "\d?"
 810  * (where the maximum is one) are permitted. Quantifiers with any values
 811  * are permitted after parentheses, so the invalid examples above can be
 812  * coded thus "(a){2,4}" and "(\d)+". If #G_REGEX_MATCH_PARTIAL is set
 813  * for a pattern that does not conform to the restrictions, matching
 814  * functions return an error.
 815  *
 816  * Returns: %TRUE if the match was partial, %FALSE otherwise
 817  *
 818  * Since: 2.14
 819  */
 820 gboolean
 821 g_match_info_is_partial_match (const GMatchInfo *match_info)
 822 {
 823   g_return_val_if_fail (match_info != NULL, FALSE);
 824
 825   return match_info->matches == PCRE_ERROR_PARTIAL;
 826 }
 827
 828 /**
 829  * g_match_info_expand_references:
 830  * @match_info: (allow-none): a #GMatchInfo or %NULL
 831  * @string_to_expand: the string to expand
 832  * @error: location to store the error occurring, or %NULL to ignore errors
 833  *
 834  * Returns a new string containing the text in @string_to_expand with
 835  * references and escape sequences expanded. References refer to the last
 836  * match done with @string against @regex and have the same syntax used by
 837  * g_regex_replace().
 838  *
 839  * The @string_to_expand must be UTF-8 encoded even if #G_REGEX_RAW was
 840  * passed to g_regex_new().
 841  *
 842  * The backreferences are extracted from the string passed to the match
 843  * function, so you cannot call this function after freeing the string.
 844  *
 845  * @match_info may be %NULL in which case @string_to_expand must not
 846  * contain references. For instance "foo\n" does not refer to an actual
 847  * pattern and '\n' merely will be replaced with \n character,
 848  * while to expand "\0" (whole match) one needs the result of a match.
 849  * Use g_regex_check_replacement() to find out whether @string_to_expand
 850  * contains references.
 851  *
 852  * Returns: (allow-none): the expanded string, or %NULL if an error occurred
 853  *
 854  * Since: 2.14
 855  */
 856 gchar *
 857 g_match_info_expand_references (const GMatchInfo  *match_info,
 858                                 const gchar       *string_to_expand,
 859                                 GError           **error)
 860 {
 861   GString *result;
 862   GList *list;
 863   GError *tmp_error = NULL;
 864
 865   g_return_val_if_fail (string_to_expand != NULL, NULL);
 866   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
 867
 868   list = split_replacement (string_to_expand, &tmp_error);
 869   if (tmp_error != NULL)
 870     {
 871       g_propagate_error (error, tmp_error);
 872       return NULL;
 873     }
 874
 875   if (!match_info && interpolation_list_needs_match (list))
 876     {
 877       g_critical ("String '%s' contains references to the match, can't "
 878                   "expand references without GMatchInfo object",
 879                   string_to_expand);
 880       return NULL;
 881     }
 882
 883   result = g_string_sized_new (strlen (string_to_expand));
 884   interpolate_replacement (match_info, result, list);
 885
 886   g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
 887
 888   return g_string_free (result, FALSE);
 889 }
 890
 891 /**
 892  * g_match_info_fetch:
 893  * @match_info: #GMatchInfo structure
 894  * @match_num: number of the sub expression
 895  *
 896  * Retrieves the text matching the @match_num<!-- -->'th capturing
 897  * parentheses. 0 is the full text of the match, 1 is the first paren
 898  * set, 2 the second, and so on.
 899  *
 900  * If @match_num is a valid sub pattern but it didn't match anything
 901  * (e.g. sub pattern 1, matching "b" against "(a)?b") then an empty
 902  * string is returned.
 903  *
 904  * If the match was obtained using the DFA algorithm, that is using
 905  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
 906  * string is not that of a set of parentheses but that of a matched
 907  * substring. Substrings are matched in reverse order of length, so
 908  * 0 is the longest match.
 909  *
 910  * The string is fetched from the string passed to the match function,
 911  * so you cannot call this function after freeing the string.
 912  *
 913  * Returns: (allow-none): The matched substring, or %NULL if an error
 914  *     occurred. You have to free the string yourself
 915  *
 916  * Since: 2.14
 917  */
 918 gchar *
 919 g_match_info_fetch (const GMatchInfo *match_info,
 920                     gint              match_num)
 921 {
 922   /* we cannot use pcre_get_substring() because it allocates the
 923    * string using pcre_malloc(). */
 924   gchar *match = NULL;
 925   gint start, end;
 926
 927   g_return_val_if_fail (match_info != NULL, NULL);
 928   g_return_val_if_fail (match_num >= 0, NULL);
 929
 930   /* match_num does not exist or it didn't matched, i.e. matching "b"
 931    * against "(a)?b" then group 0 is empty. */
 932   if (!g_match_info_fetch_pos (match_info, match_num, &start, &end))
 933     match = NULL;
 934   else if (start == -1)
 935     match = g_strdup ("");
 936   else
 937     match = g_strndup (&match_info->string[start], end - start);
 938
 939   return match;
 940 }
 941
 942 /**
 943  * g_match_info_fetch_pos:
 944  * @match_info: #GMatchInfo structure
 945  * @match_num: number of the sub expression
 946  * @start_pos: (out) (allow-none): pointer to location where to store
 947  *     the start position, or %NULL
 948  * @end_pos: (out) (allow-none): pointer to location where to store
 949  *     the end position, or %NULL
 950  *
 951  * Retrieves the position in bytes of the @match_num<!-- -->'th capturing
 952  * parentheses. 0 is the full text of the match, 1 is the first
 953  * paren set, 2 the second, and so on.
 954  *
 955  * If @match_num is a valid sub pattern but it didn't match anything
 956  * (e.g. sub pattern 1, matching "b" against "(a)?b") then @start_pos
 957  * and @end_pos are set to -1 and %TRUE is returned.
 958  *
 959  * If the match was obtained using the DFA algorithm, that is using
 960  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
 961  * position is not that of a set of parentheses but that of a matched
 962  * substring. Substrings are matched in reverse order of length, so
 963  * 0 is the longest match.
 964  *
 965  * Returns: %TRUE if the position was fetched, %FALSE otherwise. If
 966  *   the position cannot be fetched, @start_pos and @end_pos are left
 967  *   unchanged
 968  *
 969  * Since: 2.14
 970  */
 971 gboolean
 972 g_match_info_fetch_pos (const GMatchInfo *match_info,
 973                         gint              match_num,
 974                         gint             *start_pos,
 975                         gint             *end_pos)
 976 {
 977   g_return_val_if_fail (match_info != NULL, FALSE);
 978   g_return_val_if_fail (match_num >= 0, FALSE);
 979
 980   /* make sure the sub expression number they're requesting is less than
 981    * the total number of sub expressions that were matched. */
 982   if (match_num >= match_info->matches)
 983     return FALSE;
 984
 985   if (start_pos != NULL)
 986     *start_pos = match_info->offsets[2 * match_num];
 987
 988   if (end_pos != NULL)
 989     *end_pos = match_info->offsets[2 * match_num + 1];
 990
 991   return TRUE;
 992 }
 993
 994 /*
 995  * Returns number of first matched subpattern with name @name.
 996  * There may be more than one in case when DUPNAMES is used,
 997  * and not all subpatterns with that name match;
 998  * pcre_get_stringnumber() does not work in that case.
 999  */
1000 static gint
1001 get_matched_substring_number (const GMatchInfo *match_info,
1002                               const gchar      *name)
1003 {
1004   gint entrysize;
1005   gchar *first, *last;
1006   guchar *entry;
1007
1008   if (!(match_info->regex->compile_opts & G_REGEX_DUPNAMES))
1009     return pcre_get_stringnumber (match_info->regex->pcre_re, name);
1010
1011   /* This code is copied from pcre_get.c: get_first_set() */
1012   entrysize = pcre_get_stringtable_entries (match_info->regex->pcre_re,
1013                                             name,
1014                                             &first,
1015                                             &last);
1016
1017   if (entrysize <= 0)
1018     return entrysize;
1019
1020   for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize)
1021     {
1022       gint n = (entry[0] << 8) + entry[1];
1023       if (match_info->offsets[n*2] >= 0)
1024         return n;
1025     }
1026
1027   return (first[0] << 8) + first[1];
1028 }
1029
1030 /**
1031  * g_match_info_fetch_named:
1032  * @match_info: #GMatchInfo structure
1033  * @name: name of the subexpression
1034  *
1035  * Retrieves the text matching the capturing parentheses named @name.
1036  *
1037  * If @name is a valid sub pattern name but it didn't match anything
1038  * (e.g. sub pattern "X", matching "b" against "(?P&lt;X&gt;a)?b")
1039  * then an empty string is returned.
1040  *
1041  * The string is fetched from the string passed to the match function,
1042  * so you cannot call this function after freeing the string.
1043  *
1044  * Returns: (allow-none): The matched substring, or %NULL if an error
1045  *     occurred. You have to free the string yourself
1046  *
1047  * Since: 2.14
1048  */
1049 gchar *
1050 g_match_info_fetch_named (const GMatchInfo *match_info,
1051                           const gchar      *name)
1052 {
1053   /* we cannot use pcre_get_named_substring() because it allocates the
1054    * string using pcre_malloc(). */
1055   gint num;
1056
1057   g_return_val_if_fail (match_info != NULL, NULL);
1058   g_return_val_if_fail (name != NULL, NULL);
1059
1060   num = get_matched_substring_number (match_info, name);
1061   if (num < 0)
1062     return NULL;
1063   else
1064     return g_match_info_fetch (match_info, num);
1065 }
1066
1067 /**
1068  * g_match_info_fetch_named_pos:
1069  * @match_info: #GMatchInfo structure
1070  * @name: name of the subexpression
1071  * @start_pos: (out) (allow-none): pointer to location where to store
1072  *     the start position, or %NULL
1073  * @end_pos: (out) (allow-none): pointer to location where to store
1074  *     the end position, or %NULL
1075  *
1076  * Retrieves the position in bytes of the capturing parentheses named @name.
1077  *
1078  * If @name is a valid sub pattern name but it didn't match anything
1079  * (e.g. sub pattern "X", matching "b" against "(?P&lt;X&gt;a)?b")
1080  * then @start_pos and @end_pos are set to -1 and %TRUE is returned.
1081  *
1082  * Returns: %TRUE if the position was fetched, %FALSE otherwise.
1083  *     If the position cannot be fetched, @start_pos and @end_pos
1084  *     are left unchanged.
1085  *
1086  * Since: 2.14
1087  */
1088 gboolean
1089 g_match_info_fetch_named_pos (const GMatchInfo *match_info,
1090                               const gchar      *name,
1091                               gint             *start_pos,
1092                               gint             *end_pos)
1093 {
1094   gint num;
1095
1096   g_return_val_if_fail (match_info != NULL, FALSE);
1097   g_return_val_if_fail (name != NULL, FALSE);
1098
1099   num = get_matched_substring_number (match_info, name);
1100   if (num < 0)
1101     return FALSE;
1102
1103   return g_match_info_fetch_pos (match_info, num, start_pos, end_pos);
1104 }
1105
1106 /**
1107  * g_match_info_fetch_all:
1108  * @match_info: a #GMatchInfo structure
1109  *
1110  * Bundles up pointers to each of the matching substrings from a match
1111  * and stores them in an array of gchar pointers. The first element in
1112  * the returned array is the match number 0, i.e. the entire matched
1113  * text.
1114  *
1115  * If a sub pattern didn't match anything (e.g. sub pattern 1, matching
1116  * "b" against "(a)?b") then an empty string is inserted.
1117  *
1118  * If the last match was obtained using the DFA algorithm, that is using
1119  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1120  * strings are not that matched by sets of parentheses but that of the
1121  * matched substring. Substrings are matched in reverse order of length,
1122  * so the first one is the longest match.
1123  *
1124  * The strings are fetched from the string passed to the match function,
1125  * so you cannot call this function after freeing the string.
1126  *
1127  * Returns: (allow-none): a %NULL-terminated array of gchar * pointers.
1128  *     It must be freed using g_strfreev(). If the previous match failed
1129  *     %NULL is returned
1130  *
1131  * Since: 2.14
1132  */
1133 gchar **
1134 g_match_info_fetch_all (const GMatchInfo *match_info)
1135 {
1136   /* we cannot use pcre_get_substring_list() because the returned value
1137    * isn't suitable for g_strfreev(). */
1138   gchar **result;
1139   gint i;
1140
1141   g_return_val_if_fail (match_info != NULL, NULL);
1142
1143   if (match_info->matches < 0)
1144     return NULL;
1145
1146   result = g_new (gchar *, match_info->matches + 1);
1147   for (i = 0; i < match_info->matches; i++)
1148     result[i] = g_match_info_fetch (match_info, i);
1149   result[i] = NULL;
1150
1151   return result;
1152 }
1153
1154
1155 /* GRegex */
1156
1157 GQuark
1158 g_regex_error_quark (void)
1159 {
1160   static GQuark error_quark = 0;
1161
1162   if (error_quark == 0)
1163     error_quark = g_quark_from_static_string ("g-regex-error-quark");
1164
1165   return error_quark;
1166 }
1167
1168 /**
1169  * g_regex_ref:
1170  * @regex: a #GRegex
1171  *
1172  * Increases reference count of @regex by 1.
1173  *
1174  * Returns: @regex
1175  *
1176  * Since: 2.14
1177  */
1178 GRegex *
1179 g_regex_ref (GRegex *regex)
1180 {
1181   g_return_val_if_fail (regex != NULL, NULL);
1182   g_atomic_int_inc (&regex->ref_count);
1183   return regex;
1184 }
1185
1186 /**
1187  * g_regex_unref:
1188  * @regex: a #GRegex
1189  *
1190  * Decreases reference count of @regex by 1. When reference count drops
1191  * to zero, it frees all the memory associated with the regex structure.
1192  *
1193  * Since: 2.14
1194  */
1195 void
1196 g_regex_unref (GRegex *regex)
1197 {
1198   g_return_if_fail (regex != NULL);
1199
1200   if (g_atomic_int_dec_and_test (&regex->ref_count))
1201     {
1202       g_free (regex->pattern);
1203       if (regex->pcre_re != NULL)
1204         pcre_free (regex->pcre_re);
1205       if (regex->extra != NULL)
1206         pcre_free (regex->extra);
1207       g_free (regex);
1208     }
1209 }
1210
1211 /**
1212  * g_regex_new:
1213  * @pattern: the regular expression
1214  * @compile_options: compile options for the regular expression, or 0
1215  * @match_options: match options for the regular expression, or 0
1216  * @error: return location for a #GError
1217  *
1218  * Compiles the regular expression to an internal form, and does
1219  * the initial setup of the #GRegex structure.
1220  *
1221  * Returns: a #GRegex structure or %NULL if an error occurred. Call
1222  *   g_regex_unref() when you are done with it
1223  *
1224  * Since: 2.14
1225  */
1226 GRegex *
1227 g_regex_new (const gchar         *pattern,
1228              GRegexCompileFlags   compile_options,
1229              GRegexMatchFlags     match_options,
1230              GError             **error)
1231 {
1232   GRegex *regex;
1233   pcre *re;
1234   const gchar *errmsg;
1235   gint erroffset;
1236   gint errcode;
1237   gboolean optimize = FALSE;
1238   static gsize initialised;
1239   unsigned long int pcre_compile_options;
1240
1241   g_return_val_if_fail (pattern != NULL, NULL);
1242   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
1243   g_return_val_if_fail ((compile_options & ~G_REGEX_COMPILE_MASK) == 0, NULL);
1244   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
1245
1246   if (g_once_init_enter (&initialised))
1247     {
1248       gint support;
1249       const gchar *msg;
1250
1251       pcre_config (PCRE_CONFIG_UTF8, &support);
1252       if (!support)
1253         {
1254           msg = N_("PCRE library is compiled without UTF8 support");
1255           g_critical ("%s", msg);
1256           g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg));
1257           return NULL;
1258         }
1259
1260       pcre_config (PCRE_CONFIG_UNICODE_PROPERTIES, &support);
1261       if (!support)
1262         {
1263           msg = N_("PCRE library is compiled without UTF8 properties support");
1264           g_critical ("%s", msg);
1265           g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg));
1266           return NULL;
1267         }
1268
1269       g_once_init_leave (&initialised, TRUE);
1270     }
1271
1272   /* G_REGEX_OPTIMIZE has the same numeric value of PCRE_NO_UTF8_CHECK,
1273    * as we do not need to wrap PCRE_NO_UTF8_CHECK. */
1274   if (compile_options & G_REGEX_OPTIMIZE)
1275     optimize = TRUE;
1276
1277   /* In GRegex the string are, by default, UTF-8 encoded. PCRE
1278    * instead uses UTF-8 only if required with PCRE_UTF8. */
1279   if (compile_options & G_REGEX_RAW)
1280     {
1281       /* disable utf-8 */
1282       compile_options &= ~G_REGEX_RAW;
1283     }
1284   else
1285     {
1286       /* enable utf-8 */
1287       compile_options |= PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
1288       match_options |= PCRE_NO_UTF8_CHECK;
1289     }
1290
1291   /* PCRE_NEWLINE_ANY is the default for the internal PCRE but
1292    * not for the system one. */
1293   if (!(compile_options & G_REGEX_NEWLINE_CR) &&
1294       !(compile_options & G_REGEX_NEWLINE_LF))
1295     {
1296       compile_options |= PCRE_NEWLINE_ANY;
1297     }
1298
1299   compile_options |= PCRE_UCP;
1300
1301   /* compile the pattern */
1302   re = pcre_compile2 (pattern, compile_options, &errcode,
1303                       &errmsg, &erroffset, NULL);
1304
1305   /* if the compilation failed, set the error member and return
1306    * immediately */
1307   if (re == NULL)
1308     {
1309       GError *tmp_error;
1310
1311       /* Translate the PCRE error code to GRegexError and use a translated
1312        * error message if possible */
1313       translate_compile_error (&errcode, &errmsg);
1314
1315       /* PCRE uses byte offsets but we want to show character offsets */
1316       erroffset = g_utf8_pointer_to_offset (pattern, &pattern[erroffset]);
1317
1318       tmp_error = g_error_new (G_REGEX_ERROR, errcode,
1319                                _("Error while compiling regular "
1320                                  "expression %s at char %d: %s"),
1321                                pattern, erroffset, errmsg);
1322       g_propagate_error (error, tmp_error);
1323
1324       return NULL;
1325     }
1326
1327   /* For options set at the beginning of the pattern, pcre puts them into
1328    * compile options, e.g. "(?i)foo" will make the pcre structure store
1329    * PCRE_CASELESS even though it wasn't explicitly given for compilation. */
1330   pcre_fullinfo (re, NULL, PCRE_INFO_OPTIONS, &pcre_compile_options);
1331   compile_options = pcre_compile_options;
1332
1333   if (!(compile_options & G_REGEX_DUPNAMES))
1334     {
1335       gboolean jchanged = FALSE;
1336       pcre_fullinfo (re, NULL, PCRE_INFO_JCHANGED, &jchanged);
1337       if (jchanged)
1338         compile_options |= G_REGEX_DUPNAMES;
1339     }
1340
1341   regex = g_new0 (GRegex, 1);
1342   regex->ref_count = 1;
1343   regex->pattern = g_strdup (pattern);
1344   regex->pcre_re = re;
1345   regex->compile_opts = compile_options;
1346   regex->match_opts = match_options;
1347
1348   if (optimize)
1349     {
1350       regex->extra = pcre_study (regex->pcre_re, 0, &errmsg);
1351       if (errmsg != NULL)
1352         {
1353           GError *tmp_error = g_error_new (G_REGEX_ERROR,
1354                                            G_REGEX_ERROR_OPTIMIZE,
1355                                            _("Error while optimizing "
1356                                              "regular expression %s: %s"),
1357                                            regex->pattern,
1358                                            errmsg);
1359           g_propagate_error (error, tmp_error);
1360
1361           g_regex_unref (regex);
1362           return NULL;
1363         }
1364     }
1365
1366   return regex;
1367 }
1368
1369 /**
1370  * g_regex_get_pattern:
1371  * @regex: a #GRegex structure
1372  *
1373  * Gets the pattern string associated with @regex, i.e. a copy of
1374  * the string passed to g_regex_new().
1375  *
1376  * Returns: the pattern of @regex
1377  *
1378  * Since: 2.14
1379  */
1380 const gchar *
1381 g_regex_get_pattern (const GRegex *regex)
1382 {
1383   g_return_val_if_fail (regex != NULL, NULL);
1384
1385   return regex->pattern;
1386 }
1387
1388 /**
1389  * g_regex_get_max_backref:
1390  * @regex: a #GRegex
1391  *
1392  * Returns the number of the highest back reference
1393  * in the pattern, or 0 if the pattern does not contain
1394  * back references.
1395  *
1396  * Returns: the number of the highest back reference
1397  *
1398  * Since: 2.14
1399  */
1400 gint
1401 g_regex_get_max_backref (const GRegex *regex)
1402 {
1403   gint value;
1404
1405   pcre_fullinfo (regex->pcre_re, regex->extra,
1406                  PCRE_INFO_BACKREFMAX, &value);
1407
1408   return value;
1409 }
1410
1411 /**
1412  * g_regex_get_capture_count:
1413  * @regex: a #GRegex
1414  *
1415  * Returns the number of capturing subpatterns in the pattern.
1416  *
1417  * Returns: the number of capturing subpatterns
1418  *
1419  * Since: 2.14
1420  */
1421 gint
1422 g_regex_get_capture_count (const GRegex *regex)
1423 {
1424   gint value;
1425
1426   pcre_fullinfo (regex->pcre_re, regex->extra,
1427                  PCRE_INFO_CAPTURECOUNT, &value);
1428
1429   return value;
1430 }
1431
1432 /**
1433  * g_regex_get_compile_flags:
1434  * @regex: a #GRegex
1435  *
1436  * Returns the compile options that @regex was created with.
1437  *
1438  * Returns: flags from #GRegexCompileFlags
1439  *
1440  * Since: 2.26
1441  */
1442 GRegexCompileFlags
1443 g_regex_get_compile_flags (const GRegex *regex)
1444 {
1445   g_return_val_if_fail (regex != NULL, 0);
1446
1447   return regex->compile_opts;
1448 }
1449
1450 /**
1451  * g_regex_get_match_flags:
1452  * @regex: a #GRegex
1453  *
1454  * Returns the match options that @regex was created with.
1455  *
1456  * Returns: flags from #GRegexMatchFlags
1457  *
1458  * Since: 2.26
1459  */
1460 GRegexMatchFlags
1461 g_regex_get_match_flags (const GRegex *regex)
1462 {
1463   g_return_val_if_fail (regex != NULL, 0);
1464
1465   return regex->match_opts;
1466 }
1467
1468 /**
1469  * g_regex_match_simple:
1470  * @pattern: the regular expression
1471  * @string: the string to scan for matches
1472  * @compile_options: compile options for the regular expression, or 0
1473  * @match_options: match options, or 0
1474  *
1475  * Scans for a match in @string for @pattern.
1476  *
1477  * This function is equivalent to g_regex_match() but it does not
1478  * require to compile the pattern with g_regex_new(), avoiding some
1479  * lines of code when you need just to do a match without extracting
1480  * substrings, capture counts, and so on.
1481  *
1482  * If this function is to be called on the same @pattern more than
1483  * once, it's more efficient to compile the pattern once with
1484  * g_regex_new() and then use g_regex_match().
1485  *
1486  * Returns: %TRUE if the string matched, %FALSE otherwise
1487  *
1488  * Since: 2.14
1489  */
1490 gboolean
1491 g_regex_match_simple (const gchar        *pattern,
1492                       const gchar        *string,
1493                       GRegexCompileFlags  compile_options,
1494                       GRegexMatchFlags    match_options)
1495 {
1496   GRegex *regex;
1497   gboolean result;
1498
1499   regex = g_regex_new (pattern, compile_options, 0, NULL);
1500   if (!regex)
1501     return FALSE;
1502   result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL);
1503   g_regex_unref (regex);
1504   return result;
1505 }
1506
1507 /**
1508  * g_regex_match:
1509  * @regex: a #GRegex structure from g_regex_new()
1510  * @string: the string to scan for matches
1511  * @match_options: match options
1512  * @match_info: (out) (allow-none): pointer to location where to store
1513  *     the #GMatchInfo, or %NULL if you do not need it
1514  *
1515  * Scans for a match in string for the pattern in @regex.
1516  * The @match_options are combined with the match options specified
1517  * when the @regex structure was created, letting you have more
1518  * flexibility in reusing #GRegex structures.
1519  *
1520  * A #GMatchInfo structure, used to get information on the match,
1521  * is stored in @match_info if not %NULL. Note that if @match_info
1522  * is not %NULL then it is created even if the function returns %FALSE,
1523  * i.e. you must free it regardless if regular expression actually matched.
1524  *
1525  * To retrieve all the non-overlapping matches of the pattern in
1526  * string you can use g_match_info_next().
1527  *
1528  * |[
1529  * static void
1530  * print_uppercase_words (const gchar *string)
1531  * {
1532  *   /&ast; Print all uppercase-only words. &ast;/
1533  *   GRegex *regex;
1534  *   GMatchInfo *match_info;
1535  *   &nbsp;
1536  *   regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
1537  *   g_regex_match (regex, string, 0, &amp;match_info);
1538  *   while (g_match_info_matches (match_info))
1539  *     {
1540  *       gchar *word = g_match_info_fetch (match_info, 0);
1541  *       g_print ("Found: %s\n", word);
1542  *       g_free (word);
1543  *       g_match_info_next (match_info, NULL);
1544  *     }
1545  *   g_match_info_free (match_info);
1546  *   g_regex_unref (regex);
1547  * }
1548  * ]|
1549  *
1550  * @string is not copied and is used in #GMatchInfo internally. If
1551  * you use any #GMatchInfo method (except g_match_info_free()) after
1552  * freeing or modifying @string then the behaviour is undefined.
1553  *
1554  * Returns: %TRUE if the string matched, %FALSE otherwise
1555  *
1556  * Since: 2.14
1557  */
1558 gboolean
1559 g_regex_match (const GRegex      *regex,
1560                const gchar       *string,
1561                GRegexMatchFlags   match_options,
1562                GMatchInfo       **match_info)
1563 {
1564   return g_regex_match_full (regex, string, -1, 0, match_options,
1565                              match_info, NULL);
1566 }
1567
1568 /**
1569  * g_regex_match_full:
1570  * @regex: a #GRegex structure from g_regex_new()
1571  * @string: (array length=string_len): the string to scan for matches
1572  * @string_len: the length of @string, or -1 if @string is nul-terminated
1573  * @start_position: starting index of the string to match
1574  * @match_options: match options
1575  * @match_info: (out) (allow-none): pointer to location where to store
1576  *     the #GMatchInfo, or %NULL if you do not need it
1577  * @error: location to store the error occurring, or %NULL to ignore errors
1578  *
1579  * Scans for a match in string for the pattern in @regex.
1580  * The @match_options are combined with the match options specified
1581  * when the @regex structure was created, letting you have more
1582  * flexibility in reusing #GRegex structures.
1583  *
1584  * Setting @start_position differs from just passing over a shortened
1585  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
1586  * that begins with any kind of lookbehind assertion, such as "\b".
1587  *
1588  * A #GMatchInfo structure, used to get information on the match, is
1589  * stored in @match_info if not %NULL. Note that if @match_info is
1590  * not %NULL then it is created even if the function returns %FALSE,
1591  * i.e. you must free it regardless if regular expression actually
1592  * matched.
1593  *
1594  * @string is not copied and is used in #GMatchInfo internally. If
1595  * you use any #GMatchInfo method (except g_match_info_free()) after
1596  * freeing or modifying @string then the behaviour is undefined.
1597  *
1598  * To retrieve all the non-overlapping matches of the pattern in
1599  * string you can use g_match_info_next().
1600  *
1601  * |[
1602  * static void
1603  * print_uppercase_words (const gchar *string)
1604  * {
1605  *   /&ast; Print all uppercase-only words. &ast;/
1606  *   GRegex *regex;
1607  *   GMatchInfo *match_info;
1608  *   GError *error = NULL;
1609  *   &nbsp;
1610  *   regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
1611  *   g_regex_match_full (regex, string, -1, 0, 0, &amp;match_info, &amp;error);
1612  *   while (g_match_info_matches (match_info))
1613  *     {
1614  *       gchar *word = g_match_info_fetch (match_info, 0);
1615  *       g_print ("Found: %s\n", word);
1616  *       g_free (word);
1617  *       g_match_info_next (match_info, &amp;error);
1618  *     }
1619  *   g_match_info_free (match_info);
1620  *   g_regex_unref (regex);
1621  *   if (error != NULL)
1622  *     {
1623  *       g_printerr ("Error while matching: %s\n", error->message);
1624  *       g_error_free (error);
1625  *     }
1626  * }
1627  * ]|
1628  *
1629  * Returns: %TRUE if the string matched, %FALSE otherwise
1630  *
1631  * Since: 2.14
1632  */
1633 gboolean
1634 g_regex_match_full (const GRegex      *regex,
1635                     const gchar       *string,
1636                     gssize             string_len,
1637                     gint               start_position,
1638                     GRegexMatchFlags   match_options,
1639                     GMatchInfo       **match_info,
1640                     GError           **error)
1641 {
1642   GMatchInfo *info;
1643   gboolean match_ok;
1644
1645   g_return_val_if_fail (regex != NULL, FALSE);
1646   g_return_val_if_fail (string != NULL, FALSE);
1647   g_return_val_if_fail (start_position >= 0, FALSE);
1648   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
1649   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
1650
1651   info = match_info_new (regex, string, string_len, start_position,
1652                          match_options, FALSE);
1653   match_ok = g_match_info_next (info, error);
1654   if (match_info != NULL)
1655     *match_info = info;
1656   else
1657     g_match_info_free (info);
1658
1659   return match_ok;
1660 }
1661
1662 /**
1663  * g_regex_match_all:
1664  * @regex: a #GRegex structure from g_regex_new()
1665  * @string: the string to scan for matches
1666  * @match_options: match options
1667  * @match_info: (out) (allow-none): pointer to location where to store
1668  *     the #GMatchInfo, or %NULL if you do not need it
1669  *
1670  * Using the standard algorithm for regular expression matching only
1671  * the longest match in the string is retrieved. This function uses
1672  * a different algorithm so it can retrieve all the possible matches.
1673  * For more documentation see g_regex_match_all_full().
1674  *
1675  * A #GMatchInfo structure, used to get information on the match, is
1676  * stored in @match_info if not %NULL. Note that if @match_info is
1677  * not %NULL then it is created even if the function returns %FALSE,
1678  * i.e. you must free it regardless if regular expression actually
1679  * matched.
1680  *
1681  * @string is not copied and is used in #GMatchInfo internally. If
1682  * you use any #GMatchInfo method (except g_match_info_free()) after
1683  * freeing or modifying @string then the behaviour is undefined.
1684  *
1685  * Returns: %TRUE if the string matched, %FALSE otherwise
1686  *
1687  * Since: 2.14
1688  */
1689 gboolean
1690 g_regex_match_all (const GRegex      *regex,
1691                    const gchar       *string,
1692                    GRegexMatchFlags   match_options,
1693                    GMatchInfo       **match_info)
1694 {
1695   return g_regex_match_all_full (regex, string, -1, 0, match_options,
1696                                  match_info, NULL);
1697 }
1698
1699 /**
1700  * g_regex_match_all_full:
1701  * @regex: a #GRegex structure from g_regex_new()
1702  * @string: (array length=string_len): the string to scan for matches
1703  * @string_len: the length of @string, or -1 if @string is nul-terminated
1704  * @start_position: starting index of the string to match
1705  * @match_options: match options
1706  * @match_info: (out) (allow-none): pointer to location where to store
1707  *     the #GMatchInfo, or %NULL if you do not need it
1708  * @error: location to store the error occurring, or %NULL to ignore errors
1709  *
1710  * Using the standard algorithm for regular expression matching only
1711  * the longest match in the string is retrieved, it is not possible
1712  * to obtain all the available matches. For instance matching
1713  * "&lt;a&gt; &lt;b&gt; &lt;c&gt;" against the pattern "&lt;.*&gt;"
1714  * you get "&lt;a&gt; &lt;b&gt; &lt;c&gt;".
1715  *
1716  * This function uses a different algorithm (called DFA, i.e. deterministic
1717  * finite automaton), so it can retrieve all the possible matches, all
1718  * starting at the same point in the string. For instance matching
1719  * "&lt;a&gt; &lt;b&gt; &lt;c&gt;" against the pattern "&lt;.*&gt;"
1720  * you would obtain three matches: "&lt;a&gt; &lt;b&gt; &lt;c&gt;",
1721  * "&lt;a&gt; &lt;b&gt;" and "&lt;a&gt;".
1722  *
1723  * The number of matched strings is retrieved using
1724  * g_match_info_get_match_count(). To obtain the matched strings and
1725  * their position you can use, respectively, g_match_info_fetch() and
1726  * g_match_info_fetch_pos(). Note that the strings are returned in
1727  * reverse order of length; that is, the longest matching string is
1728  * given first.
1729  *
1730  * Note that the DFA algorithm is slower than the standard one and it
1731  * is not able to capture substrings, so backreferences do not work.
1732  *
1733  * Setting @start_position differs from just passing over a shortened
1734  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
1735  * that begins with any kind of lookbehind assertion, such as "\b".
1736  *
1737  * A #GMatchInfo structure, used to get information on the match, is
1738  * stored in @match_info if not %NULL. Note that if @match_info is
1739  * not %NULL then it is created even if the function returns %FALSE,
1740  * i.e. you must free it regardless if regular expression actually
1741  * matched.
1742  *
1743  * @string is not copied and is used in #GMatchInfo internally. If
1744  * you use any #GMatchInfo method (except g_match_info_free()) after
1745  * freeing or modifying @string then the behaviour is undefined.
1746  *
1747  * Returns: %TRUE if the string matched, %FALSE otherwise
1748  *
1749  * Since: 2.14
1750  */
1751 gboolean
1752 g_regex_match_all_full (const GRegex      *regex,
1753                         const gchar       *string,
1754                         gssize             string_len,
1755                         gint               start_position,
1756                         GRegexMatchFlags   match_options,
1757                         GMatchInfo       **match_info,
1758                         GError           **error)
1759 {
1760   GMatchInfo *info;
1761   gboolean done;
1762
1763   g_return_val_if_fail (regex != NULL, FALSE);
1764   g_return_val_if_fail (string != NULL, FALSE);
1765   g_return_val_if_fail (start_position >= 0, FALSE);
1766   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
1767   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
1768
1769   info = match_info_new (regex, string, string_len, start_position,
1770                          match_options, TRUE);
1771
1772   done = FALSE;
1773   while (!done)
1774     {
1775       done = TRUE;
1776       info->matches = pcre_dfa_exec (regex->pcre_re, regex->extra,
1777                                      info->string, info->string_len,
1778                                      info->pos,
1779                                      regex->match_opts | match_options,
1780                                      info->offsets, info->n_offsets,
1781                                      info->workspace, info->n_workspace);
1782       if (info->matches == PCRE_ERROR_DFA_WSSIZE)
1783         {
1784           /* info->workspace is too small. */
1785           info->n_workspace *= 2;
1786           info->workspace = g_realloc (info->workspace,
1787                                        info->n_workspace * sizeof (gint));
1788           done = FALSE;
1789         }
1790       else if (info->matches == 0)
1791         {
1792           /* info->offsets is too small. */
1793           info->n_offsets *= 2;
1794           info->offsets = g_realloc (info->offsets,
1795                                      info->n_offsets * sizeof (gint));
1796           done = FALSE;
1797         }
1798       else if (IS_PCRE_ERROR (info->matches))
1799         {
1800           g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
1801                        _("Error while matching regular expression %s: %s"),
1802                        regex->pattern, match_error (info->matches));
1803         }
1804     }
1805
1806   /* set info->pos to -1 so that a call to g_match_info_next() fails. */
1807   info->pos = -1;
1808
1809   if (match_info != NULL)
1810     *match_info = info;
1811   else
1812     g_match_info_free (info);
1813
1814   return info->matches >= 0;
1815 }
1816
1817 /**
1818  * g_regex_get_string_number:
1819  * @regex: #GRegex structure
1820  * @name: name of the subexpression
1821  *
1822  * Retrieves the number of the subexpression named @name.
1823  *
1824  * Returns: The number of the subexpression or -1 if @name
1825  *   does not exist
1826  *
1827  * Since: 2.14
1828  */
1829 gint
1830 g_regex_get_string_number (const GRegex *regex,
1831                            const gchar  *name)
1832 {
1833   gint num;
1834
1835   g_return_val_if_fail (regex != NULL, -1);
1836   g_return_val_if_fail (name != NULL, -1);
1837
1838   num = pcre_get_stringnumber (regex->pcre_re, name);
1839   if (num == PCRE_ERROR_NOSUBSTRING)
1840     num = -1;
1841
1842   return num;
1843 }
1844
1845 /**
1846  * g_regex_split_simple:
1847  * @pattern: the regular expression
1848  * @string: the string to scan for matches
1849  * @compile_options: compile options for the regular expression, or 0
1850  * @match_options: match options, or 0
1851  *
1852  * Breaks the string on the pattern, and returns an array of
1853  * the tokens. If the pattern contains capturing parentheses,
1854  * then the text for each of the substrings will also be returned.
1855  * If the pattern does not match anywhere in the string, then the
1856  * whole string is returned as the first token.
1857  *
1858  * This function is equivalent to g_regex_split() but it does
1859  * not require to compile the pattern with g_regex_new(), avoiding
1860  * some lines of code when you need just to do a split without
1861  * extracting substrings, capture counts, and so on.
1862  *
1863  * If this function is to be called on the same @pattern more than
1864  * once, it's more efficient to compile the pattern once with
1865  * g_regex_new() and then use g_regex_split().
1866  *
1867  * As a special case, the result of splitting the empty string ""
1868  * is an empty vector, not a vector containing a single string.
1869  * The reason for this special case is that being able to represent
1870  * an empty vector is typically more useful than consistent handling
1871  * of empty elements. If you do need to represent empty elements,
1872  * you'll need to check for the empty string before calling this
1873  * function.
1874  *
1875  * A pattern that can match empty strings splits @string into
1876  * separate characters wherever it matches the empty string between
1877  * characters. For example splitting "ab c" using as a separator
1878  * "\s*", you will get "a", "b" and "c".
1879  *
1880  * Returns: a %NULL-terminated array of strings. Free it using g_strfreev()
1881  *
1882  * Since: 2.14
1883  **/
1884 gchar **
1885 g_regex_split_simple (const gchar        *pattern,
1886                       const gchar        *string,
1887                       GRegexCompileFlags  compile_options,
1888                       GRegexMatchFlags    match_options)
1889 {
1890   GRegex *regex;
1891   gchar **result;
1892
1893   regex = g_regex_new (pattern, compile_options, 0, NULL);
1894   if (!regex)
1895     return NULL;
1896
1897   result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL);
1898   g_regex_unref (regex);
1899   return result;
1900 }
1901
1902 /**
1903  * g_regex_split:
1904  * @regex: a #GRegex structure
1905  * @string: the string to split with the pattern
1906  * @match_options: match time option flags
1907  *
1908  * Breaks the string on the pattern, and returns an array of the tokens.
1909  * If the pattern contains capturing parentheses, then the text for each
1910  * of the substrings will also be returned. If the pattern does not match
1911  * anywhere in the string, then the whole string is returned as the first
1912  * token.
1913  *
1914  * As a special case, the result of splitting the empty string "" is an
1915  * empty vector, not a vector containing a single string. The reason for
1916  * this special case is that being able to represent an empty vector is
1917  * typically more useful than consistent handling of empty elements. If
1918  * you do need to represent empty elements, you'll need to check for the
1919  * empty string before calling this function.
1920  *
1921  * A pattern that can match empty strings splits @string into separate
1922  * characters wherever it matches the empty string between characters.
1923  * For example splitting "ab c" using as a separator "\s*", you will get
1924  * "a", "b" and "c".
1925  *
1926  * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
1927  *
1928  * Since: 2.14
1929  **/
1930 gchar **
1931 g_regex_split (const GRegex     *regex,
1932                const gchar      *string,
1933                GRegexMatchFlags  match_options)
1934 {
1935   return g_regex_split_full (regex, string, -1, 0,
1936                              match_options, 0, NULL);
1937 }
1938
1939 /**
1940  * g_regex_split_full:
1941  * @regex: a #GRegex structure
1942  * @string: (array length=string_len): the string to split with the pattern
1943  * @string_len: the length of @string, or -1 if @string is nul-terminated
1944  * @start_position: starting index of the string to match
1945  * @match_options: match time option flags
1946  * @max_tokens: the maximum number of tokens to split @string into.
1947  *   If this is less than 1, the string is split completely
1948  * @error: return location for a #GError
1949  *
1950  * Breaks the string on the pattern, and returns an array of the tokens.
1951  * If the pattern contains capturing parentheses, then the text for each
1952  * of the substrings will also be returned. If the pattern does not match
1953  * anywhere in the string, then the whole string is returned as the first
1954  * token.
1955  *
1956  * As a special case, the result of splitting the empty string "" is an
1957  * empty vector, not a vector containing a single string. The reason for
1958  * this special case is that being able to represent an empty vector is
1959  * typically more useful than consistent handling of empty elements. If
1960  * you do need to represent empty elements, you'll need to check for the
1961  * empty string before calling this function.
1962  *
1963  * A pattern that can match empty strings splits @string into separate
1964  * characters wherever it matches the empty string between characters.
1965  * For example splitting "ab c" using as a separator "\s*", you will get
1966  * "a", "b" and "c".
1967  *
1968  * Setting @start_position differs from just passing over a shortened
1969  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
1970  * that begins with any kind of lookbehind assertion, such as "\b".
1971  *
1972  * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
1973  *
1974  * Since: 2.14
1975  **/
1976 gchar **
1977 g_regex_split_full (const GRegex      *regex,
1978                     const gchar       *string,
1979                     gssize             string_len,
1980                     gint               start_position,
1981                     GRegexMatchFlags   match_options,
1982                     gint               max_tokens,
1983                     GError           **error)
1984 {
1985   GError *tmp_error = NULL;
1986   GMatchInfo *match_info;
1987   GList *list, *last;
1988   gint i;
1989   gint token_count;
1990   gboolean match_ok;
1991   /* position of the last separator. */
1992   gint last_separator_end;
1993   /* was the last match 0 bytes long? */
1994   gboolean last_match_is_empty;
1995   /* the returned array of char **s */
1996   gchar **string_list;
1997
1998   g_return_val_if_fail (regex != NULL, NULL);
1999   g_return_val_if_fail (string != NULL, NULL);
2000   g_return_val_if_fail (start_position >= 0, NULL);
2001   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
2002   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2003
2004   if (max_tokens <= 0)
2005     max_tokens = G_MAXINT;
2006
2007   if (string_len < 0)
2008     string_len = strlen (string);
2009
2010   /* zero-length string */
2011   if (string_len - start_position == 0)
2012     return g_new0 (gchar *, 1);
2013
2014   if (max_tokens == 1)
2015     {
2016       string_list = g_new0 (gchar *, 2);
2017       string_list[0] = g_strndup (&string[start_position],
2018                                   string_len - start_position);
2019       return string_list;
2020     }
2021
2022   list = NULL;
2023   token_count = 0;
2024   last_separator_end = start_position;
2025   last_match_is_empty = FALSE;
2026
2027   match_ok = g_regex_match_full (regex, string, string_len, start_position,
2028                                  match_options, &match_info, &tmp_error);
2029
2030   while (tmp_error == NULL)
2031     {
2032       if (match_ok)
2033         {
2034           last_match_is_empty =
2035                     (match_info->offsets[0] == match_info->offsets[1]);
2036
2037           /* we need to skip empty separators at the same position of the end
2038            * of another separator. e.g. the string is "a b" and the separator
2039            * is " *", so from 1 to 2 we have a match and at position 2 we have
2040            * an empty match. */
2041           if (last_separator_end != match_info->offsets[1])
2042             {
2043               gchar *token;
2044               gint match_count;
2045
2046               token = g_strndup (string + last_separator_end,
2047                                  match_info->offsets[0] - last_separator_end);
2048               list = g_list_prepend (list, token);
2049               token_count++;
2050
2051               /* if there were substrings, these need to be added to
2052                * the list. */
2053               match_count = g_match_info_get_match_count (match_info);
2054               if (match_count > 1)
2055                 {
2056                   for (i = 1; i < match_count; i++)
2057                     list = g_list_prepend (list, g_match_info_fetch (match_info, i));
2058                 }
2059             }
2060         }
2061       else
2062         {
2063           /* if there was no match, copy to end of string. */
2064           if (!last_match_is_empty)
2065             {
2066               gchar *token = g_strndup (string + last_separator_end,
2067                                         match_info->string_len - last_separator_end);
2068               list = g_list_prepend (list, token);
2069             }
2070           /* no more tokens, end the loop. */
2071           break;
2072         }
2073
2074       /* -1 to leave room for the last part. */
2075       if (token_count >= max_tokens - 1)
2076         {
2077           /* we have reached the maximum number of tokens, so we copy
2078            * the remaining part of the string. */
2079           if (last_match_is_empty)
2080             {
2081               /* the last match was empty, so we have moved one char
2082                * after the real position to avoid empty matches at the
2083                * same position. */
2084               match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string;
2085             }
2086           /* the if is needed in the case we have terminated the available
2087            * tokens, but we are at the end of the string, so there are no
2088            * characters left to copy. */
2089           if (string_len > match_info->pos)
2090             {
2091               gchar *token = g_strndup (string + match_info->pos,
2092                                         string_len - match_info->pos);
2093               list = g_list_prepend (list, token);
2094             }
2095           /* end the loop. */
2096           break;
2097         }
2098
2099       last_separator_end = match_info->pos;
2100       if (last_match_is_empty)
2101         /* if the last match was empty, g_match_info_next() has moved
2102          * forward to avoid infinite loops, but we still need to copy that
2103          * character. */
2104         last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string;
2105
2106       match_ok = g_match_info_next (match_info, &tmp_error);
2107     }
2108   g_match_info_free (match_info);
2109   if (tmp_error != NULL)
2110     {
2111       g_propagate_error (error, tmp_error);
2112       g_list_free_full (list, g_free);
2113       match_info->pos = -1;
2114       return NULL;
2115     }
2116
2117   string_list = g_new (gchar *, g_list_length (list) + 1);
2118   i = 0;
2119   for (last = g_list_last (list); last; last = g_list_previous (last))
2120     string_list[i++] = last->data;
2121   string_list[i] = NULL;
2122   g_list_free (list);
2123
2124   return string_list;
2125 }
2126
/* Kinds of pieces a parsed replacement string is made of; the value is
 * stored in the type member of InterpolationData. */
enum
{
  REPL_TYPE_STRING             = 0, /* literal text, kept in text */
  REPL_TYPE_CHARACTER          = 1, /* one escaped character, kept in c */
  REPL_TYPE_SYMBOLIC_REFERENCE = 2, /* \g<name>, name kept in text */
  REPL_TYPE_NUMERIC_REFERENCE  = 3, /* \number or \g<number>, kept in num */
  REPL_TYPE_CHANGE_CASE        = 4  /* \l \u \L \U \E, kept in change_case */
};
2135
/* Case conversion state used while building the replacement text.
 * The first five values are individual flag bits; the *_MASK values
 * group them for testing with a bitwise AND. */
typedef enum
{
  CHANGE_CASE_NONE         = 0x01, /* \E: no conversion */
  CHANGE_CASE_UPPER        = 0x02, /* \U */
  CHANGE_CASE_LOWER        = 0x04, /* \L */
  CHANGE_CASE_UPPER_SINGLE = 0x08, /* \u */
  CHANGE_CASE_LOWER_SINGLE = 0x10, /* \l */

  /* either of the *_SINGLE conversions */
  CHANGE_CASE_SINGLE_MASK  = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_LOWER_SINGLE,
  /* any conversion to lower case */
  CHANGE_CASE_LOWER_MASK   = CHANGE_CASE_LOWER | CHANGE_CASE_LOWER_SINGLE,
  /* any conversion to upper case */
  CHANGE_CASE_UPPER_MASK   = CHANGE_CASE_UPPER | CHANGE_CASE_UPPER_SINGLE
} ChangeCase;
2147
/* One parsed piece of a replacement string. Which members carry
 * meaning depends on type (one of the REPL_TYPE_* values). */
struct _InterpolationData
{
  gchar     *text;        /* literal text (REPL_TYPE_STRING) or group name
                           * (REPL_TYPE_SYMBOLIC_REFERENCE); released by
                           * free_interpolation_data() */
  gint       type;        /* REPL_TYPE_* discriminator */
  gint       num;         /* group number for REPL_TYPE_NUMERIC_REFERENCE */
  gchar      c;           /* the character for REPL_TYPE_CHARACTER */
  ChangeCase change_case; /* new mode for REPL_TYPE_CHANGE_CASE */
};
2156
2157 static void
2158 free_interpolation_data (InterpolationData *data)
2159 {
2160   g_free (data->text);
2161   g_free (data);
2162 }
2163
2164 static const gchar *
2165 expand_escape (const gchar        *replacement,
2166                const gchar        *p,
2167                InterpolationData  *data,
2168                GError            **error)
2169 {
2170   const gchar *q, *r;
2171   gint x, d, h, i;
2172   const gchar *error_detail;
2173   gint base = 0;
2174   GError *tmp_error = NULL;
2175
2176   p++;
2177   switch (*p)
2178     {
2179     case 't':
2180       p++;
2181       data->c = '\t';
2182       data->type = REPL_TYPE_CHARACTER;
2183       break;
2184     case 'n':
2185       p++;
2186       data->c = '\n';
2187       data->type = REPL_TYPE_CHARACTER;
2188       break;
2189     case 'v':
2190       p++;
2191       data->c = '\v';
2192       data->type = REPL_TYPE_CHARACTER;
2193       break;
2194     case 'r':
2195       p++;
2196       data->c = '\r';
2197       data->type = REPL_TYPE_CHARACTER;
2198       break;
2199     case 'f':
2200       p++;
2201       data->c = '\f';
2202       data->type = REPL_TYPE_CHARACTER;
2203       break;
2204     case 'a':
2205       p++;
2206       data->c = '\a';
2207       data->type = REPL_TYPE_CHARACTER;
2208       break;
2209     case 'b':
2210       p++;
2211       data->c = '\b';
2212       data->type = REPL_TYPE_CHARACTER;
2213       break;
2214     case '\\':
2215       p++;
2216       data->c = '\\';
2217       data->type = REPL_TYPE_CHARACTER;
2218       break;
2219     case 'x':
2220       p++;
2221       x = 0;
2222       if (*p == '{')
2223         {
2224           p++;
2225           do
2226             {
2227               h = g_ascii_xdigit_value (*p);
2228               if (h < 0)
2229                 {
2230                   error_detail = _("hexadecimal digit or '}' expected");
2231                   goto error;
2232                 }
2233               x = x * 16 + h;
2234               p++;
2235             }
2236           while (*p != '}');
2237           p++;
2238         }
2239       else
2240         {
2241           for (i = 0; i < 2; i++)
2242             {
2243               h = g_ascii_xdigit_value (*p);
2244               if (h < 0)
2245                 {
2246                   error_detail = _("hexadecimal digit expected");
2247                   goto error;
2248                 }
2249               x = x * 16 + h;
2250               p++;
2251             }
2252         }
2253       data->type = REPL_TYPE_STRING;
2254       data->text = g_new0 (gchar, 8);
2255       g_unichar_to_utf8 (x, data->text);
2256       break;
2257     case 'l':
2258       p++;
2259       data->type = REPL_TYPE_CHANGE_CASE;
2260       data->change_case = CHANGE_CASE_LOWER_SINGLE;
2261       break;
2262     case 'u':
2263       p++;
2264       data->type = REPL_TYPE_CHANGE_CASE;
2265       data->change_case = CHANGE_CASE_UPPER_SINGLE;
2266       break;
2267     case 'L':
2268       p++;
2269       data->type = REPL_TYPE_CHANGE_CASE;
2270       data->change_case = CHANGE_CASE_LOWER;
2271       break;
2272     case 'U':
2273       p++;
2274       data->type = REPL_TYPE_CHANGE_CASE;
2275       data->change_case = CHANGE_CASE_UPPER;
2276       break;
2277     case 'E':
2278       p++;
2279       data->type = REPL_TYPE_CHANGE_CASE;
2280       data->change_case = CHANGE_CASE_NONE;
2281       break;
2282     case 'g':
2283       p++;
2284       if (*p != '<')
2285         {
2286           error_detail = _("missing '<' in symbolic reference");
2287           goto error;
2288         }
2289       q = p + 1;
2290       do
2291         {
2292           p++;
2293           if (!*p)
2294             {
2295               error_detail = _("unfinished symbolic reference");
2296               goto error;
2297             }
2298         }
2299       while (*p != '>');
2300       if (p - q == 0)
2301         {
2302           error_detail = _("zero-length symbolic reference");
2303           goto error;
2304         }
2305       if (g_ascii_isdigit (*q))
2306         {
2307           x = 0;
2308           do
2309             {
2310               h = g_ascii_digit_value (*q);
2311               if (h < 0)
2312                 {
2313                   error_detail = _("digit expected");
2314                   p = q;
2315                   goto error;
2316                 }
2317               x = x * 10 + h;
2318               q++;
2319             }
2320           while (q != p);
2321           data->num = x;
2322           data->type = REPL_TYPE_NUMERIC_REFERENCE;
2323         }
2324       else
2325         {
2326           r = q;
2327           do
2328             {
2329               if (!g_ascii_isalnum (*r))
2330                 {
2331                   error_detail = _("illegal symbolic reference");
2332                   p = r;
2333                   goto error;
2334                 }
2335               r++;
2336             }
2337           while (r != p);
2338           data->text = g_strndup (q, p - q);
2339           data->type = REPL_TYPE_SYMBOLIC_REFERENCE;
2340         }
2341       p++;
2342       break;
2343     case '0':
2344       /* if \0 is followed by a number is an octal number representing a
2345        * character, else it is a numeric reference. */
2346       if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0)
2347         {
2348           base = 8;
2349           p = g_utf8_next_char (p);
2350         }
2351     case '1':
2352     case '2':
2353     case '3':
2354     case '4':
2355     case '5':
2356     case '6':
2357     case '7':
2358     case '8':
2359     case '9':
2360       x = 0;
2361       d = 0;
2362       for (i = 0; i < 3; i++)
2363         {
2364           h = g_ascii_digit_value (*p);
2365           if (h < 0)
2366             break;
2367           if (h > 7)
2368             {
2369               if (base == 8)
2370                 break;
2371               else
2372                 base = 10;
2373             }
2374           if (i == 2 && base == 10)
2375             break;
2376           x = x * 8 + h;
2377           d = d * 10 + h;
2378           p++;
2379         }
2380       if (base == 8 || i == 3)
2381         {
2382           data->type = REPL_TYPE_STRING;
2383           data->text = g_new0 (gchar, 8);
2384           g_unichar_to_utf8 (x, data->text);
2385         }
2386       else
2387         {
2388           data->type = REPL_TYPE_NUMERIC_REFERENCE;
2389           data->num = d;
2390         }
2391       break;
2392     case 0:
2393       error_detail = _("stray final '\\'");
2394       goto error;
2395       break;
2396     default:
2397       error_detail = _("unknown escape sequence");
2398       goto error;
2399     }
2400
2401   return p;
2402
2403  error:
2404   /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */
2405   tmp_error = g_error_new (G_REGEX_ERROR,
2406                            G_REGEX_ERROR_REPLACE,
2407                            _("Error while parsing replacement "
2408                              "text \"%s\" at char %lu: %s"),
2409                            replacement,
2410                            (gulong)(p - replacement),
2411                            error_detail);
2412   g_propagate_error (error, tmp_error);
2413
2414   return NULL;
2415 }
2416
2417 static GList *
2418 split_replacement (const gchar  *replacement,
2419                    GError      **error)
2420 {
2421   GList *list = NULL;
2422   InterpolationData *data;
2423   const gchar *p, *start;
2424
2425   start = p = replacement;
2426   while (*p)
2427     {
2428       if (*p == '\\')
2429         {
2430           data = g_new0 (InterpolationData, 1);
2431           start = p = expand_escape (replacement, p, data, error);
2432           if (p == NULL)
2433             {
2434               g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
2435               free_interpolation_data (data);
2436
2437               return NULL;
2438             }
2439           list = g_list_prepend (list, data);
2440         }
2441       else
2442         {
2443           p++;
2444           if (*p == '\\' || *p == '\0')
2445             {
2446               if (p - start > 0)
2447                 {
2448                   data = g_new0 (InterpolationData, 1);
2449                   data->text = g_strndup (start, p - start);
2450                   data->type = REPL_TYPE_STRING;
2451                   list = g_list_prepend (list, data);
2452                 }
2453             }
2454         }
2455     }
2456
2457   return g_list_reverse (list);
2458 }
2459
/* Evaluates to c converted according to change_case: lower-cased when
 * any CHANGE_CASE_LOWER_MASK bit is set, upper-cased for every other
 * value (CHANGE_CASE_NONE included). */
#define CHANGE_CASE(c, change_case) \
        ((((change_case) & CHANGE_CASE_LOWER_MASK) == 0) ? \
                g_unichar_toupper (c) : \
                g_unichar_tolower (c))
2465
2466 static void
2467 string_append (GString     *string,
2468                const gchar *text,
2469                ChangeCase  *change_case)
2470 {
2471   gunichar c;
2472
2473   if (text[0] == '\0')
2474     return;
2475
2476   if (*change_case == CHANGE_CASE_NONE)
2477     {
2478       g_string_append (string, text);
2479     }
2480   else if (*change_case & CHANGE_CASE_SINGLE_MASK)
2481     {
2482       c = g_utf8_get_char (text);
2483       g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
2484       g_string_append (string, g_utf8_next_char (text));
2485       *change_case = CHANGE_CASE_NONE;
2486     }
2487   else
2488     {
2489       while (*text != '\0')
2490         {
2491           c = g_utf8_get_char (text);
2492           g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
2493           text = g_utf8_next_char (text);
2494         }
2495     }
2496 }
2497
2498 static gboolean
2499 interpolate_replacement (const GMatchInfo *match_info,
2500                          GString          *result,
2501                          gpointer          data)
2502 {
2503   GList *list;
2504   InterpolationData *idata;
2505   gchar *match;
2506   ChangeCase change_case = CHANGE_CASE_NONE;
2507
2508   for (list = data; list; list = list->next)
2509     {
2510       idata = list->data;
2511       switch (idata->type)
2512         {
2513         case REPL_TYPE_STRING:
2514           string_append (result, idata->text, &change_case);
2515           break;
2516         case REPL_TYPE_CHARACTER:
2517           g_string_append_c (result, CHANGE_CASE (idata->c, change_case));
2518           if (change_case & CHANGE_CASE_SINGLE_MASK)
2519             change_case = CHANGE_CASE_NONE;
2520           break;
2521         case REPL_TYPE_NUMERIC_REFERENCE:
2522           match = g_match_info_fetch (match_info, idata->num);
2523           if (match)
2524             {
2525               string_append (result, match, &change_case);
2526               g_free (match);
2527             }
2528           break;
2529         case REPL_TYPE_SYMBOLIC_REFERENCE:
2530           match = g_match_info_fetch_named (match_info, idata->text);
2531           if (match)
2532             {
2533               string_append (result, match, &change_case);
2534               g_free (match);
2535             }
2536           break;
2537         case REPL_TYPE_CHANGE_CASE:
2538           change_case = idata->change_case;
2539           break;
2540         }
2541     }
2542
2543   return FALSE;
2544 }
2545
2546 /* whether actual match_info is needed for replacement, i.e.
2547  * whether there are references
2548  */
2549 static gboolean
2550 interpolation_list_needs_match (GList *list)
2551 {
2552   while (list != NULL)
2553     {
2554       InterpolationData *data = list->data;
2555
2556       if (data->type == REPL_TYPE_SYMBOLIC_REFERENCE ||
2557           data->type == REPL_TYPE_NUMERIC_REFERENCE)
2558         {
2559           return TRUE;
2560         }
2561
2562       list = list->next;
2563     }
2564
2565   return FALSE;
2566 }
2567
2568 /**
2569  * g_regex_replace:
2570  * @regex: a #GRegex structure
2571  * @string: (array length=string_len): the string to perform matches against
2572  * @string_len: the length of @string, or -1 if @string is nul-terminated
2573  * @start_position: starting index of the string to match
2574  * @replacement: text to replace each match with
2575  * @match_options: options for the match
2576  * @error: location to store the error occurring, or %NULL to ignore errors
2577  *
2578  * Replaces all occurrences of the pattern in @regex with the
2579  * replacement text. Backreferences of the form '\number' or
2580  * '\g&lt;number&gt;' in the replacement text are interpolated by the
2581  * number-th captured subexpression of the match, '\g&lt;name&gt;' refers
2582  * to the captured subexpression with the given name. '\0' refers to the
2583  * complete match, but '\0' followed by a number is the octal representation
2584  * of a character. To include a literal '\' in the replacement, write '\\'.
 * There are also escapes that change the case of the following text:
2586  *
2587  * <variablelist>
2588  * <varlistentry><term>\l</term>
2589  * <listitem>
2590  * <para>Convert to lower case the next character</para>
2591  * </listitem>
2592  * </varlistentry>
2593  * <varlistentry><term>\u</term>
2594  * <listitem>
2595  * <para>Convert to upper case the next character</para>
2596  * </listitem>
2597  * </varlistentry>
2598  * <varlistentry><term>\L</term>
2599  * <listitem>
2600  * <para>Convert to lower case till \E</para>
2601  * </listitem>
2602  * </varlistentry>
2603  * <varlistentry><term>\U</term>
2604  * <listitem>
2605  * <para>Convert to upper case till \E</para>
2606  * </listitem>
2607  * </varlistentry>
2608  * <varlistentry><term>\E</term>
2609  * <listitem>
2610  * <para>End case modification</para>
2611  * </listitem>
2612  * </varlistentry>
2613  * </variablelist>
2614  *
2615  * If you do not need to use backreferences use g_regex_replace_literal().
2616  *
2617  * The @replacement string must be UTF-8 encoded even if #G_REGEX_RAW was
 * passed to g_regex_new(). If you want to use non-UTF-8-encoded strings
2619  * you can use g_regex_replace_literal().
2620  *
2621  * Setting @start_position differs from just passing over a shortened
2622  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that
2623  * begins with any kind of lookbehind assertion, such as "\b".
2624  *
2625  * Returns: a newly allocated string containing the replacements
2626  *
2627  * Since: 2.14
2628  */
2629 gchar *
2630 g_regex_replace (const GRegex      *regex,
2631                  const gchar       *string,
2632                  gssize             string_len,
2633                  gint               start_position,
2634                  const gchar       *replacement,
2635                  GRegexMatchFlags   match_options,
2636                  GError           **error)
2637 {
2638   gchar *result;
2639   GList *list;
2640   GError *tmp_error = NULL;
2641
2642   g_return_val_if_fail (regex != NULL, NULL);
2643   g_return_val_if_fail (string != NULL, NULL);
2644   g_return_val_if_fail (start_position >= 0, NULL);
2645   g_return_val_if_fail (replacement != NULL, NULL);
2646   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
2647   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2648
2649   list = split_replacement (replacement, &tmp_error);
2650   if (tmp_error != NULL)
2651     {
2652       g_propagate_error (error, tmp_error);
2653       return NULL;
2654     }
2655
2656   result = g_regex_replace_eval (regex,
2657                                  string, string_len, start_position,
2658                                  match_options,
2659                                  interpolate_replacement,
2660                                  (gpointer)list,
2661                                  &tmp_error);
2662   if (tmp_error != NULL)
2663     g_propagate_error (error, tmp_error);
2664
2665   g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
2666
2667   return result;
2668 }
2669
2670 static gboolean
2671 literal_replacement (const GMatchInfo *match_info,
2672                      GString          *result,
2673                      gpointer          data)
2674 {
2675   g_string_append (result, data);
2676   return FALSE;
2677 }
2678
2679 /**
2680  * g_regex_replace_literal:
2681  * @regex: a #GRegex structure
2682  * @string: (array length=string_len): the string to perform matches against
2683  * @string_len: the length of @string, or -1 if @string is nul-terminated
2684  * @start_position: starting index of the string to match
2685  * @replacement: text to replace each match with
2686  * @match_options: options for the match
2687  * @error: location to store the error occurring, or %NULL to ignore errors
2688  *
2689  * Replaces all occurrences of the pattern in @regex with the
2690  * replacement text. @replacement is replaced literally, to
2691  * include backreferences use g_regex_replace().
2692  *
2693  * Setting @start_position differs from just passing over a
2694  * shortened string and setting #G_REGEX_MATCH_NOTBOL in the
2695  * case of a pattern that begins with any kind of lookbehind
2696  * assertion, such as "\b".
2697  *
2698  * Returns: a newly allocated string containing the replacements
2699  *
2700  * Since: 2.14
2701  */
2702 gchar *
2703 g_regex_replace_literal (const GRegex      *regex,
2704                          const gchar       *string,
2705                          gssize             string_len,
2706                          gint               start_position,
2707                          const gchar       *replacement,
2708                          GRegexMatchFlags   match_options,
2709                          GError           **error)
2710 {
2711   g_return_val_if_fail (replacement != NULL, NULL);
2712   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2713
2714   return g_regex_replace_eval (regex,
2715                                string, string_len, start_position,
2716                                match_options,
2717                                literal_replacement,
2718                                (gpointer)replacement,
2719                                error);
2720 }
2721
2722 /**
2723  * g_regex_replace_eval:
2724  * @regex: a #GRegex structure from g_regex_new()
2725  * @string: (array length=string_len): string to perform matches against
2726  * @string_len: the length of @string, or -1 if @string is nul-terminated
2727  * @start_position: starting index of the string to match
2728  * @match_options: options for the match
2729  * @eval: a function to call for each match
2730  * @user_data: user data to pass to the function
2731  * @error: location to store the error occurring, or %NULL to ignore errors
2732  *
2733  * Replaces occurrences of the pattern in regex with the output of
2734  * @eval for that occurrence.
2735  *
2736  * Setting @start_position differs from just passing over a shortened
2737  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
2738  * that begins with any kind of lookbehind assertion, such as "\b".
2739  *
2740  * The following example uses g_regex_replace_eval() to replace multiple
2741  * strings at once:
2742  * |[
2743  * static gboolean
2744  * eval_cb (const GMatchInfo *info,
2745  *          GString          *res,
2746  *          gpointer          data)
2747  * {
2748  *   gchar *match;
2749  *   gchar *r;
2750  *
2751  *    match = g_match_info_fetch (info, 0);
2752  *    r = g_hash_table_lookup ((GHashTable *)data, match);
2753  *    g_string_append (res, r);
2754  *    g_free (match);
2755  *
2756  *    return FALSE;
2757  * }
2758  *
2759  * /&ast; ... &ast;/
2760  *
2761  * GRegex *reg;
2762  * GHashTable *h;
2763  * gchar *res;
2764  *
2765  * h = g_hash_table_new (g_str_hash, g_str_equal);
2766  *
2767  * g_hash_table_insert (h, "1", "ONE");
2768  * g_hash_table_insert (h, "2", "TWO");
2769  * g_hash_table_insert (h, "3", "THREE");
2770  * g_hash_table_insert (h, "4", "FOUR");
2771  *
2772  * reg = g_regex_new ("1|2|3|4", 0, 0, NULL);
2773  * res = g_regex_replace_eval (reg, text, -1, 0, 0, eval_cb, h, NULL);
2774  * g_hash_table_destroy (h);
2775  *
2776  * /&ast; ... &ast;/
2777  * ]|
2778  *
2779  * Returns: a newly allocated string containing the replacements
2780  *
2781  * Since: 2.14
2782  */
2783 gchar *
2784 g_regex_replace_eval (const GRegex        *regex,
2785                       const gchar         *string,
2786                       gssize               string_len,
2787                       gint                 start_position,
2788                       GRegexMatchFlags     match_options,
2789                       GRegexEvalCallback   eval,
2790                       gpointer             user_data,
2791                       GError             **error)
2792 {
2793   GMatchInfo *match_info;
2794   GString *result;
2795   gint str_pos = 0;
2796   gboolean done = FALSE;
2797   GError *tmp_error = NULL;
2798
2799   g_return_val_if_fail (regex != NULL, NULL);
2800   g_return_val_if_fail (string != NULL, NULL);
2801   g_return_val_if_fail (start_position >= 0, NULL);
2802   g_return_val_if_fail (eval != NULL, NULL);
2803   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2804
2805   if (string_len < 0)
2806     string_len = strlen (string);
2807
2808   result = g_string_sized_new (string_len);
2809
2810   /* run down the string making matches. */
2811   g_regex_match_full (regex, string, string_len, start_position,
2812                       match_options, &match_info, &tmp_error);
2813   while (!done && g_match_info_matches (match_info))
2814     {
2815       g_string_append_len (result,
2816                            string + str_pos,
2817                            match_info->offsets[0] - str_pos);
2818       done = (*eval) (match_info, result, user_data);
2819       str_pos = match_info->offsets[1];
2820       g_match_info_next (match_info, &tmp_error);
2821     }
2822   g_match_info_free (match_info);
2823   if (tmp_error != NULL)
2824     {
2825       g_propagate_error (error, tmp_error);
2826       g_string_free (result, TRUE);
2827       return NULL;
2828     }
2829
2830   g_string_append_len (result, string + str_pos, string_len - str_pos);
2831   return g_string_free (result, FALSE);
2832 }
2833
2834 /**
2835  * g_regex_check_replacement:
2836  * @replacement: the replacement string
2837  * @has_references: (out) (allow-none): location to store information about
2838  *   references in @replacement or %NULL
2839  * @error: location to store error
2840  *
2841  * Checks whether @replacement is a valid replacement string
2842  * (see g_regex_replace()), i.e. that all escape sequences in
2843  * it are valid.
2844  *
2845  * If @has_references is not %NULL then @replacement is checked
2846  * for pattern references. For instance, replacement text 'foo\n'
 * does not contain references and may be evaluated without information
 * about the actual match, but '\0\1' (whole match followed by the first
 * subpattern) requires a valid #GMatchInfo object.
2850  *
2851  * Returns: whether @replacement is a valid replacement string
2852  *
2853  * Since: 2.14
2854  */
2855 gboolean
2856 g_regex_check_replacement (const gchar  *replacement,
2857                            gboolean     *has_references,
2858                            GError      **error)
2859 {
2860   GList *list;
2861   GError *tmp = NULL;
2862
2863   list = split_replacement (replacement, &tmp);
2864
2865   if (tmp)
2866   {
2867     g_propagate_error (error, tmp);
2868     return FALSE;
2869   }
2870
2871   if (has_references)
2872     *has_references = interpolation_list_needs_match (list);
2873
2874   g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
2875
2876   return TRUE;
2877 }
2878
2879 /**
2880  * g_regex_escape_nul:
2881  * @string: the string to escape
2882  * @length: the length of @string
2883  *
2884  * Escapes the nul characters in @string to "\x00".  It can be used
2885  * to compile a regex with embedded nul characters.
2886  *
2887  * For completeness, @length can be -1 for a nul-terminated string.
2888  * In this case the output string will be of course equal to @string.
2889  *
2890  * Returns: a newly-allocated escaped string
2891  *
2892  * Since: 2.30
2893  */
2894 gchar *
2895 g_regex_escape_nul (const gchar *string,
2896                     gint         length)
2897 {
2898   GString *escaped;
2899   const gchar *p, *piece_start, *end;
2900   gint backslashes;
2901
2902   g_return_val_if_fail (string != NULL, NULL);
2903
2904   if (length < 0)
2905     return g_strdup (string);
2906
2907   end = string + length;
2908   p = piece_start = string;
2909   escaped = g_string_sized_new (length + 1);
2910
2911   backslashes = 0;
2912   while (p < end)
2913     {
2914       switch (*p)
2915         {
2916         case '\0':
2917           if (p != piece_start)
2918             {
2919               /* copy the previous piece. */
2920               g_string_append_len (escaped, piece_start, p - piece_start);
2921             }
2922           if ((backslashes & 1) == 0)
2923             g_string_append_c (escaped, '\\');
2924           g_string_append_c (escaped, 'x');
2925           g_string_append_c (escaped, '0');
2926           g_string_append_c (escaped, '0');
2927           piece_start = ++p;
2928           backslashes = 0;
2929           break;
2930         case '\\':
2931           backslashes++;
2932           ++p;
2933           break;
2934         default:
2935           backslashes = 0;
2936           p = g_utf8_next_char (p);
2937           break;
2938         }
2939     }
2940
2941   if (piece_start < end)
2942     g_string_append_len (escaped, piece_start, end - piece_start);
2943
2944   return g_string_free (escaped, FALSE);
2945 }
2946
2947 /**
2948  * g_regex_escape_string:
2949  * @string: (array length=length): the string to escape
2950  * @length: the length of @string, or -1 if @string is nul-terminated
2951  *
2952  * Escapes the special characters used for regular expressions
2953  * in @string, for instance "a.b*c" becomes "a\.b\*c". This
2954  * function is useful to dynamically generate regular expressions.
2955  *
2956  * @string can contain nul characters that are replaced with "\0",
2957  * in this case remember to specify the correct length of @string
2958  * in @length.
2959  *
2960  * Returns: a newly-allocated escaped string
2961  *
2962  * Since: 2.14
2963  */
2964 gchar *
2965 g_regex_escape_string (const gchar *string,
2966                        gint         length)
2967 {
2968   GString *escaped;
2969   const char *p, *piece_start, *end;
2970
2971   g_return_val_if_fail (string != NULL, NULL);
2972
2973   if (length < 0)
2974     length = strlen (string);
2975
2976   end = string + length;
2977   p = piece_start = string;
2978   escaped = g_string_sized_new (length + 1);
2979
2980   while (p < end)
2981     {
2982       switch (*p)
2983         {
2984         case '\0':
2985         case '\\':
2986         case '|':
2987         case '(':
2988         case ')':
2989         case '[':
2990         case ']':
2991         case '{':
2992         case '}':
2993         case '^':
2994         case '$':
2995         case '*':
2996         case '+':
2997         case '?':
2998         case '.':
2999           if (p != piece_start)
3000             /* copy the previous piece. */
3001             g_string_append_len (escaped, piece_start, p - piece_start);
3002           g_string_append_c (escaped, '\\');
3003           if (*p == '\0')
3004             g_string_append_c (escaped, '0');
3005           else
3006             g_string_append_c (escaped, *p);
3007           piece_start = ++p;
3008           break;
3009         default:
3010           p = g_utf8_next_char (p);
3011           break;
3012         }
3013   }
3014
3015   if (piece_start < end)
3016     g_string_append_len (escaped, piece_start, end - piece_start);
3017
3018   return g_string_free (escaped, FALSE);
3019 }