glib/gregex.c

   1 /* GRegex -- regular expression API wrapper around PCRE.
   2  *
   3  * Copyright (C) 1999, 2000 Scott Wimer
   4  * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com>
   5  * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org>
   6  *
   7  * This library is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * This library is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with this library; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  20  */
  21
  22 #include "config.h"
  23
  24 #include <string.h>
  25
  26 #ifdef USE_SYSTEM_PCRE
  27 #include <pcre.h>
  28 #else
  29 #include "pcre/pcre.h"
  30 #endif
  31
  32 #include "gtypes.h"
  33 #include "gregex.h"
  34 #include "glibintl.h"
  35 #include "glist.h"
  36 #include "gmessages.h"
  37 #include "gstrfuncs.h"
  38 #include "gatomic.h"
  39 #include "gthread.h"
  40
  41 /**
  42  * SECTION:gregex
  43  * @title: Perl-compatible regular expressions
  44  * @short_description: matches strings against regular expressions
  45  * @see_also: <xref linkend="glib-regex-syntax"/>
  46  *
 * The <function>g_regex_*()</function> functions implement regular
 * expression pattern matching using syntax and semantics similar to
 * those of Perl regular expressions.
  50  *
 * Some functions accept a @start_position argument. Setting it differs
 * from just passing over a shortened string and setting #G_REGEX_MATCH_NOTBOL
 * in the case of a pattern that begins with any kind of lookbehind assertion.
 * For example, consider the pattern "\Biss\B" which finds occurrences of "iss"
 * in the middle of words. ("\B" matches only if the current position in the
 * subject is not a word boundary.) When applied to the string "Mississippi"
 * from byte offset 4, namely "issippi", it does not match, because "\B" is
 * always false at the start of the subject, which is deemed to be a word
 * boundary. However, if the entire string is passed, but with
 * @start_position set to 4, it finds the second occurrence of "iss" because
 * it is able to look behind the starting point to discover that it is
 * preceded by a letter.
  63  *
  64  * Note that, unless you set the #G_REGEX_RAW flag, all the strings passed
  65  * to these functions must be encoded in UTF-8. The lengths and the positions
  66  * inside the strings are in bytes and not in characters, so, for instance,
  67  * "\xc3\xa0" (i.e. "&agrave;") is two bytes long but it is treated as a
  68  * single character. If you set #G_REGEX_RAW the strings can be non-valid
  69  * UTF-8 strings and a byte is treated as a character, so "\xc3\xa0" is two
  70  * bytes and two characters long.
  71  *
 * When matching a pattern, "\n" matches only against a "\n" character in
 * the string, and "\r" matches only a "\r" character. To match any newline
 * sequence use "\R". This particular group matches either the two-character
 * sequence CR + LF ("\r\n"), or one of the single characters LF (linefeed,
 * U+000A, "\n"), VT (vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"),
 * CR (carriage return, U+000D, "\r"), NEL (next line, U+0085), LS (line
 * separator, U+2028), or PS (paragraph separator, U+2029).
  79  *
 * The behaviour of the dot, circumflex, and dollar metacharacters is
 * affected by newline characters; the default is to recognize any newline
 * character (the same characters recognized by "\R"). This can be changed
 * with the #G_REGEX_NEWLINE_CR, #G_REGEX_NEWLINE_LF and #G_REGEX_NEWLINE_CRLF
 * compile options, and with the #G_REGEX_MATCH_NEWLINE_ANY,
 * #G_REGEX_MATCH_NEWLINE_CR, #G_REGEX_MATCH_NEWLINE_LF and
 * #G_REGEX_MATCH_NEWLINE_CRLF match options. These settings are also
 * relevant when compiling a pattern if #G_REGEX_EXTENDED is set, and an
 * unescaped "#" outside a character class is encountered. This indicates
 * a comment that lasts until after the next newline.
  90  *
 * Creating and manipulating the same #GRegex structure from different
 * threads is not a problem, as #GRegex does not modify its internal
 * state between creation and destruction. #GMatchInfo, on the other
 * hand, is not threadsafe.
  95  *
 * The low-level regular expression functionality is provided by
 * the excellent <ulink url="http://www.pcre.org/">PCRE</ulink> library
 * written by Philip Hazel.
  99  */
 100
 101 /* Mask of all the possible values for GRegexCompileFlags. */
 102 #define G_REGEX_COMPILE_MASK (G_REGEX_CASELESS          | \
 103                               G_REGEX_MULTILINE         | \
 104                               G_REGEX_DOTALL            | \
 105                               G_REGEX_EXTENDED          | \
 106                               G_REGEX_ANCHORED          | \
 107                               G_REGEX_DOLLAR_ENDONLY    | \
 108                               G_REGEX_UNGREEDY          | \
 109                               G_REGEX_RAW               | \
 110                               G_REGEX_NO_AUTO_CAPTURE   | \
 111                               G_REGEX_OPTIMIZE          | \
 112                               G_REGEX_DUPNAMES          | \
 113                               G_REGEX_NEWLINE_CR        | \
 114                               G_REGEX_NEWLINE_LF        | \
 115                               G_REGEX_NEWLINE_CRLF)
 116
 117 /* Mask of all the possible values for GRegexMatchFlags. */
 118 #define G_REGEX_MATCH_MASK (G_REGEX_MATCH_ANCHORED      | \
 119                             G_REGEX_MATCH_NOTBOL        | \
 120                             G_REGEX_MATCH_NOTEOL        | \
 121                             G_REGEX_MATCH_NOTEMPTY      | \
 122                             G_REGEX_MATCH_PARTIAL       | \
 123                             G_REGEX_MATCH_NEWLINE_CR    | \
 124                             G_REGEX_MATCH_NEWLINE_LF    | \
 125                             G_REGEX_MATCH_NEWLINE_CRLF  | \
 126                             G_REGEX_MATCH_NEWLINE_ANY)
 127
 128 /* if the string is in UTF-8 use g_utf8_ functions, else use
 129  * use just +/- 1. */
 130 #define NEXT_CHAR(re, s) (((re)->compile_opts & PCRE_UTF8) ? \
 131                                 g_utf8_next_char (s) : \
 132                                 ((s) + 1))
 133 #define PREV_CHAR(re, s) (((re)->compile_opts & PCRE_UTF8) ? \
 134                                 g_utf8_prev_char (s) : \
 135                                 ((s) - 1))
 136
/* State of one matching operation: the regex and subject string it was
 * started with, plus the outcome of the most recent match attempt. */
struct _GMatchInfo
{
  volatile gint ref_count;      /* the ref count */
  GRegex *regex;                /* the regex; a reference is held on it */
  GRegexMatchFlags match_opts;  /* options used at match time on the regex */
  gint matches;                 /* number of matching sub patterns, or a negative PCRE_ERROR_* code */
  gint pos;                     /* position in the string where last match left off; -1 once the end was reached */
  gint  n_offsets;              /* number of offsets */
  gint *offsets;                /* array of offsets paired 0,1 ; 2,3 ; 4,5 etc */
  gint *workspace;              /* workspace for pcre_dfa_exec(); only allocated for DFA matching */
  gint n_workspace;             /* number of workspace elements */
  const gchar *string;          /* string passed to the match function (the pointer is stored, not a copy) */
  gssize string_len;            /* length of string */
};
 151
/* A compiled regular expression together with the options it was
 * created with. */
struct _GRegex
{
  volatile gint ref_count;      /* the ref count for the immutable part */
  gchar *pattern;               /* the pattern */
  pcre *pcre_re;                /* compiled form of the pattern */
  GRegexCompileFlags compile_opts;      /* options used at compile time on the pattern */
  GRegexMatchFlags match_opts;  /* options used at match time on the regex */
  pcre_extra *extra;            /* data stored when G_REGEX_OPTIMIZE is used */
};
 161
 162 /* TRUE if ret is an error code, FALSE otherwise. */
 163 #define IS_PCRE_ERROR(ret) ((ret) < PCRE_ERROR_NOMATCH && (ret) != PCRE_ERROR_PARTIAL)
 164
 165 typedef struct _InterpolationData InterpolationData;
 166 static gboolean  interpolation_list_needs_match (GList *list);
 167 static gboolean  interpolate_replacement        (const GMatchInfo *match_info,
 168                                                  GString *result,
 169                                                  gpointer data);
 170 static GList    *split_replacement              (const gchar *replacement,
 171                                                  GError **error);
 172 static void      free_interpolation_data        (InterpolationData *data);
 173
 174
 175 static const gchar *
 176 match_error (gint errcode)
 177 {
 178   switch (errcode)
 179     {
 180     case PCRE_ERROR_NOMATCH:
 181       /* not an error */
 182       break;
 183     case PCRE_ERROR_NULL:
 184       /* NULL argument, this should not happen in GRegex */
 185       g_warning ("A NULL argument was passed to PCRE");
 186       break;
 187     case PCRE_ERROR_BADOPTION:
 188       return "bad options";
 189     case PCRE_ERROR_BADMAGIC:
 190       return _("corrupted object");
 191     case PCRE_ERROR_UNKNOWN_OPCODE:
 192       return N_("internal error or corrupted object");
 193     case PCRE_ERROR_NOMEMORY:
 194       return _("out of memory");
 195     case PCRE_ERROR_NOSUBSTRING:
 196       /* not used by pcre_exec() */
 197       break;
 198     case PCRE_ERROR_MATCHLIMIT:
 199       return _("backtracking limit reached");
 200     case PCRE_ERROR_CALLOUT:
 201       /* callouts are not implemented */
 202       break;
 203     case PCRE_ERROR_BADUTF8:
 204     case PCRE_ERROR_BADUTF8_OFFSET:
 205       /* we do not check if strings are valid */
 206       break;
 207     case PCRE_ERROR_PARTIAL:
 208       /* not an error */
 209       break;
 210     case PCRE_ERROR_BADPARTIAL:
 211       return _("the pattern contains items not supported for partial matching");
 212     case PCRE_ERROR_INTERNAL:
 213       return _("internal error");
 214     case PCRE_ERROR_BADCOUNT:
 215       /* negative ovecsize, this should not happen in GRegex */
 216       g_warning ("A negative ovecsize was passed to PCRE");
 217       break;
 218     case PCRE_ERROR_DFA_UITEM:
 219       return _("the pattern contains items not supported for partial matching");
 220     case PCRE_ERROR_DFA_UCOND:
 221       return _("back references as conditions are not supported for partial matching");
 222     case PCRE_ERROR_DFA_UMLIMIT:
 223       /* the match_field field is not used in GRegex */
 224       break;
 225     case PCRE_ERROR_DFA_WSSIZE:
 226       /* handled expanding the workspace */
 227       break;
 228     case PCRE_ERROR_DFA_RECURSE:
 229     case PCRE_ERROR_RECURSIONLIMIT:
 230       return _("recursion limit reached");
 231     case PCRE_ERROR_NULLWSLIMIT:
 232       return _("workspace limit for empty substrings reached");
 233     case PCRE_ERROR_BADNEWLINE:
 234       return _("invalid combination of newline flags");
 235     case PCRE_ERROR_BADOFFSET:
 236       return _("bad offset");
 237     case PCRE_ERROR_SHORTUTF8:
 238       return _("short utf8");
 239     default:
 240       break;
 241     }
 242   return _("unknown error");
 243 }
 244
 245 static void
 246 translate_compile_error (gint *errcode, const gchar **errmsg)
 247 {
 248   /* Compile errors are created adding 100 to the error code returned
 249    * by PCRE.
 250    * If errcode is known we put the translatable error message in
 251    * erromsg. If errcode is unknown we put the generic
 252    * G_REGEX_ERROR_COMPILE error code in errcode and keep the
 253    * untranslated error message returned by PCRE.
 254    * Note that there can be more PCRE errors with the same GRegexError
 255    * and that some PCRE errors are useless for us.
 256    */
 257   *errcode += 100;
 258
 259   switch (*errcode)
 260     {
 261     case G_REGEX_ERROR_STRAY_BACKSLASH:
 262       *errmsg = _("\\ at end of pattern");
 263       break;
 264     case G_REGEX_ERROR_MISSING_CONTROL_CHAR:
 265       *errmsg = _("\\c at end of pattern");
 266       break;
 267     case G_REGEX_ERROR_UNRECOGNIZED_ESCAPE:
 268       *errmsg = _("unrecognized character follows \\");
 269       break;
 270     case G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER:
 271       *errmsg = _("numbers out of order in {} quantifier");
 272       break;
 273     case G_REGEX_ERROR_QUANTIFIER_TOO_BIG:
 274       *errmsg = _("number too big in {} quantifier");
 275       break;
 276     case G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS:
 277       *errmsg = _("missing terminating ] for character class");
 278       break;
 279     case G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS:
 280       *errmsg = _("invalid escape sequence in character class");
 281       break;
 282     case G_REGEX_ERROR_RANGE_OUT_OF_ORDER:
 283       *errmsg = _("range out of order in character class");
 284       break;
 285     case G_REGEX_ERROR_NOTHING_TO_REPEAT:
 286       *errmsg = _("nothing to repeat");
 287       break;
 288     case 111: /* internal error: unexpected repeat */
 289       *errcode = G_REGEX_ERROR_INTERNAL;
 290       *errmsg = _("unexpected repeat");
 291       break;
 292     case G_REGEX_ERROR_UNRECOGNIZED_CHARACTER:
 293       *errmsg = _("unrecognized character after (? or (?-");
 294       break;
 295     case G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS:
 296       *errmsg = _("POSIX named classes are supported only within a class");
 297       break;
 298     case G_REGEX_ERROR_UNMATCHED_PARENTHESIS:
 299       *errmsg = _("missing terminating )");
 300       break;
 301     case G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE:
 302       *errmsg = _("reference to non-existent subpattern");
 303       break;
 304     case G_REGEX_ERROR_UNTERMINATED_COMMENT:
 305       *errmsg = _("missing ) after comment");
 306       break;
 307     case G_REGEX_ERROR_EXPRESSION_TOO_LARGE:
 308       *errmsg = _("regular expression is too large");
 309       break;
 310     case G_REGEX_ERROR_MEMORY_ERROR:
 311       *errmsg = _("failed to get memory");
 312       break;
 313     case 122: /* unmatched parentheses */
 314       *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
 315       *errmsg = _(") without opening (");
 316       break;
 317     case 123: /* internal error: code overflow */
 318       *errcode = G_REGEX_ERROR_INTERNAL;
 319       *errmsg = _("code overflow");
 320       break;
 321     case 124: /* "unrecognized character after (?<\0 */
 322       *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
 323       *errmsg = _("unrecognized character after (?<");
 324       break;
 325     case G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND:
 326       *errmsg = _("lookbehind assertion is not fixed length");
 327       break;
 328     case G_REGEX_ERROR_MALFORMED_CONDITION:
 329       *errmsg = _("malformed number or name after (?(");
 330       break;
 331     case G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES:
 332       *errmsg = _("conditional group contains more than two branches");
 333       break;
 334     case G_REGEX_ERROR_ASSERTION_EXPECTED:
 335       *errmsg = _("assertion expected after (?(");
 336       break;
 337     case 129:
 338       *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
 339       /* translators: '(?R' and '(?[+-]digits' are both meant as (groups of)
 340        * sequences here, '(?-54' would be an example for the second group.
 341        */
 342       *errmsg = _("(?R or (?[+-]digits must be followed by )");
 343       break;
 344     case G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME:
 345       *errmsg = _("unknown POSIX class name");
 346       break;
 347     case G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED:
 348       *errmsg = _("POSIX collating elements are not supported");
 349       break;
 350     case G_REGEX_ERROR_HEX_CODE_TOO_LARGE:
 351       *errmsg = _("character value in \\x{...} sequence is too large");
 352       break;
 353     case G_REGEX_ERROR_INVALID_CONDITION:
 354       *errmsg = _("invalid condition (?(0)");
 355       break;
 356     case G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND:
 357       *errmsg = _("\\C not allowed in lookbehind assertion");
 358       break;
 359     case 137: /* PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0 */
 360       /* A number of Perl escapes are not handled by PCRE.
 361        * Therefore it explicitly raises ERR37.
 362        */
 363       *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE;
 364       *errmsg = _("escapes \\L, \\l, \\N{name}, \\U, and \\u are not supported");
 365       break;
 366     case G_REGEX_ERROR_INFINITE_LOOP:
 367       *errmsg = _("recursive call could loop indefinitely");
 368       break;
 369     case 141: /* unrecognized character after (?P\0 */
 370       *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
 371       *errmsg = _("unrecognized character after (?P");
 372       break;
 373     case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR:
 374       *errmsg = _("missing terminator in subpattern name");
 375       break;
 376     case G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME:
 377       *errmsg = _("two named subpatterns have the same name");
 378       break;
 379     case G_REGEX_ERROR_MALFORMED_PROPERTY:
 380       *errmsg = _("malformed \\P or \\p sequence");
 381       break;
 382     case G_REGEX_ERROR_UNKNOWN_PROPERTY:
 383       *errmsg = _("unknown property name after \\P or \\p");
 384       break;
 385     case G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG:
 386       *errmsg = _("subpattern name is too long (maximum 32 characters)");
 387       break;
 388     case G_REGEX_ERROR_TOO_MANY_SUBPATTERNS:
 389       *errmsg = _("too many named subpatterns (maximum 10,000)");
 390       break;
 391     case G_REGEX_ERROR_INVALID_OCTAL_VALUE:
 392       *errmsg = _("octal value is greater than \\377");
 393       break;
 394     case 152: /* internal error: overran compiling workspace */
 395       *errcode = G_REGEX_ERROR_INTERNAL;
 396       *errmsg = _("overran compiling workspace");
 397       break;
 398     case 153: /* internal error: previously-checked referenced subpattern not found */
 399       *errcode = G_REGEX_ERROR_INTERNAL;
 400       *errmsg = _("previously-checked referenced subpattern not found");
 401       break;
 402     case G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE:
 403       *errmsg = _("DEFINE group contains more than one branch");
 404       break;
 405     case G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS:
 406       *errmsg = _("inconsistent NEWLINE options");
 407       break;
 408     case G_REGEX_ERROR_MISSING_BACK_REFERENCE:
 409       *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or "
 410                   "number, or by a plain number");
 411       break;
 412     case G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE:
 413       *errmsg = _("a numbered reference must not be zero");
 414       break;
 415     case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN:
 416       *errmsg = _("an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)");
 417       break;
 418     case G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB:
 419       *errmsg = _("(*VERB) not recognized");
 420       break;
 421     case G_REGEX_ERROR_NUMBER_TOO_BIG:
 422       *errmsg = _("number is too bug");
 423       break;
 424     case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME:
 425       *errmsg = _("missing subpattern name after (?&");
 426       break;
 427     case G_REGEX_ERROR_MISSING_DIGIT:
 428       *errmsg = _("digit expected after (?+");
 429       break;
 430     case G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME:
 431       *errmsg = _("different names for subpatterns of the same number are not allowed");
 432       break;
 433     case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED:
 434       *errmsg = _("(*MARK) must have an argument");
 435       break;
 436     case G_REGEX_ERROR_INVALID_CONTROL_CHAR:
 437       *errmsg = _( "\\c must be followed by an ASCII character");
 438       break;
 439     case G_REGEX_ERROR_MISSING_NAME:
 440       *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name");
 441       break;
 442     case G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS:
 443       *errmsg = _("\\N is not supported in a class");
 444       break;
 445     case G_REGEX_ERROR_TOO_MANY_FORWARD_REFERENCES:
 446       *errmsg = _("too many forward references");
 447       break;
 448     case G_REGEX_ERROR_NAME_TOO_LONG:
 449       *errmsg = _("name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)");
 450       break;
 451
 452     case 116: /* erroffset passed as NULL */
 453       /* This should not happen as we never pass a NULL erroffset */
 454       g_warning ("erroffset passed as NULL");
 455       *errcode = G_REGEX_ERROR_COMPILE;
 456       break;
 457     case 117: /* unknown option bit(s) set */
 458       /* This should not happen as we check options before passing them
 459        * to pcre_compile2() */
 460       g_warning ("unknown option bit(s) set");
 461       *errcode = G_REGEX_ERROR_COMPILE;
 462       break;
 463     case 132: /* this version of PCRE is compiled without UTF support */
 464     case 144: /* invalid UTF-8 string */
 465     case 145: /* support for \\P, \\p, and \\X has not been compiled */
 466     case 167: /* this version of PCRE is not compiled with Unicode property support */
 467     case 173: /* disallowed Unicode code point (>= 0xd800 && <= 0xdfff) */
 468     case 174: /* invalid UTF-16 string */
 469       /* These errors should not happen as we are using an UTF-8 and UCP-enabled PCRE
 470        * and we do not check if strings are valid */
 471     case 164: /* ] is an invalid data character in JavaScript compatibility mode */
 472       /* This should not happen as we don't use PCRE_JAVASCRIPT_COMPAT */
 473       g_warning ("%s", *errmsg);
 474       *errcode = G_REGEX_ERROR_COMPILE;
 475       break;
 476     case 170: /* internal error: unknown opcode in find_fixedlength() */
 477       *errcode = G_REGEX_ERROR_INTERNAL;
 478       break;
 479
 480     default:
 481       *errcode = G_REGEX_ERROR_COMPILE;
 482     }
 483 }
 484
 485 /* GMatchInfo */
 486
 487 static GMatchInfo *
 488 match_info_new (const GRegex *regex,
 489                 const gchar  *string,
 490                 gint          string_len,
 491                 gint          start_position,
 492                 gint          match_options,
 493                 gboolean      is_dfa)
 494 {
 495   GMatchInfo *match_info;
 496
 497   if (string_len < 0)
 498     string_len = strlen (string);
 499
 500   match_info = g_new0 (GMatchInfo, 1);
 501   match_info->ref_count = 1;
 502   match_info->regex = g_regex_ref ((GRegex *)regex);
 503   match_info->string = string;
 504   match_info->string_len = string_len;
 505   match_info->matches = PCRE_ERROR_NOMATCH;
 506   match_info->pos = start_position;
 507   match_info->match_opts = match_options;
 508
 509   if (is_dfa)
 510     {
 511       /* These values should be enough for most cases, if they are not
 512        * enough g_regex_match_all_full() will expand them. */
 513       match_info->n_offsets = 24;
 514       match_info->n_workspace = 100;
 515       match_info->workspace = g_new (gint, match_info->n_workspace);
 516     }
 517   else
 518     {
 519       gint capture_count;
 520       pcre_fullinfo (regex->pcre_re, regex->extra,
 521                      PCRE_INFO_CAPTURECOUNT, &capture_count);
 522       match_info->n_offsets = (capture_count + 1) * 3;
 523     }
 524
 525   match_info->offsets = g_new0 (gint, match_info->n_offsets);
 526   /* Set an invalid position for the previous match. */
 527   match_info->offsets[0] = -1;
 528   match_info->offsets[1] = -1;
 529
 530   return match_info;
 531 }
 532
 533 /**
 534  * g_match_info_get_regex:
 535  * @match_info: a #GMatchInfo
 536  *
 537  * Returns #GRegex object used in @match_info. It belongs to Glib
 538  * and must not be freed. Use g_regex_ref() if you need to keep it
 539  * after you free @match_info object.
 540  *
 541  * Returns: #GRegex object used in @match_info
 542  *
 543  * Since: 2.14
 544  */
 545 GRegex *
 546 g_match_info_get_regex (const GMatchInfo *match_info)
 547 {
 548   g_return_val_if_fail (match_info != NULL, NULL);
 549   return match_info->regex;
 550 }
 551
 552 /**
 553  * g_match_info_get_string:
 554  * @match_info: a #GMatchInfo
 555  *
 556  * Returns the string searched with @match_info. This is the
 557  * string passed to g_regex_match() or g_regex_replace() so
 558  * you may not free it before calling this function.
 559  *
 560  * Returns: the string searched with @match_info
 561  *
 562  * Since: 2.14
 563  */
 564 const gchar *
 565 g_match_info_get_string (const GMatchInfo *match_info)
 566 {
 567   g_return_val_if_fail (match_info != NULL, NULL);
 568   return match_info->string;
 569 }
 570
 571 /**
 572  * g_match_info_ref:
 573  * @match_info: a #GMatchInfo
 574  *
 575  * Increases reference count of @match_info by 1.
 576  *
 577  * Returns: @match_info
 578  *
 579  * Since: 2.30
 580  */
 581 GMatchInfo       *
 582 g_match_info_ref (GMatchInfo *match_info)
 583 {
 584   g_return_val_if_fail (match_info != NULL, NULL);
 585   g_atomic_int_inc (&match_info->ref_count);
 586   return match_info;
 587 }
 588
 589 /**
 590  * g_match_info_unref:
 591  * @match_info: a #GMatchInfo
 592  *
 593  * Decreases reference count of @match_info by 1. When reference count drops
 594  * to zero, it frees all the memory associated with the match_info structure.
 595  *
 596  * Since: 2.30
 597  */
 598 void
 599 g_match_info_unref (GMatchInfo *match_info)
 600 {
 601   if (g_atomic_int_dec_and_test (&match_info->ref_count))
 602     {
 603       g_regex_unref (match_info->regex);
 604       g_free (match_info->offsets);
 605       g_free (match_info->workspace);
 606       g_free (match_info);
 607     }
 608 }
 609
 610 /**
 611  * g_match_info_free:
 612  * @match_info: (allow-none): a #GMatchInfo, or %NULL
 613  *
 614  * If @match_info is not %NULL, calls g_match_info_unref(); otherwise does
 615  * nothing.
 616  *
 617  * Since: 2.14
 618  */
 619 void
 620 g_match_info_free (GMatchInfo *match_info)
 621 {
 622   if (match_info == NULL)
 623     return;
 624
 625   g_match_info_unref (match_info);
 626 }
 627
 628 /**
 629  * g_match_info_next:
 630  * @match_info: a #GMatchInfo structure
 631  * @error: location to store the error occurring, or %NULL to ignore errors
 632  *
 633  * Scans for the next match using the same parameters of the previous
 634  * call to g_regex_match_full() or g_regex_match() that returned
 635  * @match_info.
 636  *
 637  * The match is done on the string passed to the match function, so you
 638  * cannot free it before calling this function.
 639  *
 640  * Returns: %TRUE is the string matched, %FALSE otherwise
 641  *
 642  * Since: 2.14
 643  */
 644 gboolean
 645 g_match_info_next (GMatchInfo  *match_info,
 646                    GError     **error)
 647 {
 648   gint prev_match_start;
 649   gint prev_match_end;
 650
 651   g_return_val_if_fail (match_info != NULL, FALSE);
 652   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
 653   g_return_val_if_fail (match_info->pos >= 0, FALSE);
 654
 655   prev_match_start = match_info->offsets[0];
 656   prev_match_end = match_info->offsets[1];
 657
 658   if (match_info->pos > match_info->string_len)
 659     {
 660       /* we have reached the end of the string */
 661       match_info->pos = -1;
 662       match_info->matches = PCRE_ERROR_NOMATCH;
 663       return FALSE;
 664     }
 665
 666   match_info->matches = pcre_exec (match_info->regex->pcre_re,
 667                                    match_info->regex->extra,
 668                                    match_info->string,
 669                                    match_info->string_len,
 670                                    match_info->pos,
 671                                    match_info->regex->match_opts | match_info->match_opts,
 672                                    match_info->offsets,
 673                                    match_info->n_offsets);
 674   if (IS_PCRE_ERROR (match_info->matches))
 675     {
 676       g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
 677                    _("Error while matching regular expression %s: %s"),
 678                    match_info->regex->pattern, match_error (match_info->matches));
 679       return FALSE;
 680     }
 681
 682   /* avoid infinite loops if the pattern is an empty string or something
 683    * equivalent */
 684   if (match_info->pos == match_info->offsets[1])
 685     {
 686       if (match_info->pos > match_info->string_len)
 687         {
 688           /* we have reached the end of the string */
 689           match_info->pos = -1;
 690           match_info->matches = PCRE_ERROR_NOMATCH;
 691           return FALSE;
 692         }
 693
 694       match_info->pos = NEXT_CHAR (match_info->regex,
 695                                    &match_info->string[match_info->pos]) -
 696                                    match_info->string;
 697     }
 698   else
 699     {
 700       match_info->pos = match_info->offsets[1];
 701     }
 702
 703   /* it's possible to get two identical matches when we are matching
 704    * empty strings, for instance if the pattern is "(?=[A-Z0-9])" and
 705    * the string is "RegExTest" we have:
 706    *  - search at position 0: match from 0 to 0
 707    *  - search at position 1: match from 3 to 3
 708    *  - search at position 3: match from 3 to 3 (duplicate)
 709    *  - search at position 4: match from 5 to 5
 710    *  - search at position 5: match from 5 to 5 (duplicate)
 711    *  - search at position 6: no match -> stop
 712    * so we have to ignore the duplicates.
 713    * see bug #515944: http://bugzilla.gnome.org/show_bug.cgi?id=515944 */
 714   if (match_info->matches >= 0 &&
 715       prev_match_start == match_info->offsets[0] &&
 716       prev_match_end == match_info->offsets[1])
 717     {
 718       /* ignore this match and search the next one */
 719       return g_match_info_next (match_info, error);
 720     }
 721
 722   return match_info->matches >= 0;
 723 }
 724
 725 /**
 726  * g_match_info_matches:
 727  * @match_info: a #GMatchInfo structure
 728  *
 729  * Returns whether the previous match operation succeeded.
 730  *
 731  * Returns: %TRUE if the previous match operation succeeded,
 732  *   %FALSE otherwise
 733  *
 734  * Since: 2.14
 735  */
 736 gboolean
 737 g_match_info_matches (const GMatchInfo *match_info)
 738 {
 739   g_return_val_if_fail (match_info != NULL, FALSE);
 740
 741   return match_info->matches >= 0;
 742 }
 743
 744 /**
 745  * g_match_info_get_match_count:
 746  * @match_info: a #GMatchInfo structure
 747  *
 748  * Retrieves the number of matched substrings (including substring 0,
 749  * that is the whole matched text), so 1 is returned if the pattern
 750  * has no substrings in it and 0 is returned if the match failed.
 751  *
 752  * If the last match was obtained using the DFA algorithm, that is
 753  * using g_regex_match_all() or g_regex_match_all_full(), the retrieved
 754  * count is not that of the number of capturing parentheses but that of
 755  * the number of matched substrings.
 756  *
 757  * Returns: Number of matched substrings, or -1 if an error occurred
 758  *
 759  * Since: 2.14
 760  */
 761 gint
 762 g_match_info_get_match_count (const GMatchInfo *match_info)
 763 {
 764   g_return_val_if_fail (match_info, -1);
 765
 766   if (match_info->matches == PCRE_ERROR_NOMATCH)
 767     /* no match */
 768     return 0;
 769   else if (match_info->matches < PCRE_ERROR_NOMATCH)
 770     /* error */
 771     return -1;
 772   else
 773     /* match */
 774     return match_info->matches;
 775 }
 776
 777 /**
 778  * g_match_info_is_partial_match:
 779  * @match_info: a #GMatchInfo structure
 780  *
 781  * Usually if the string passed to g_regex_match*() matches as far as
 782  * it goes, but is too short to match the entire pattern, %FALSE is
 783  * returned. There are circumstances where it might be helpful to
 784  * distinguish this case from other cases in which there is no match.
 785  *
 786  * Consider, for example, an application where a human is required to
 787  * type in data for a field with specific formatting requirements. An
 788  * example might be a date in the form ddmmmyy, defined by the pattern
 789  * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$".
 790  * If the application sees the user’s keystrokes one by one, and can
 791  * check that what has been typed so far is potentially valid, it is
 792  * able to raise an error as soon as a mistake is made.
 793  *
 794  * GRegex supports the concept of partial matching by means of the
 795  * #G_REGEX_MATCH_PARTIAL flag. When this is set the return code for
 796  * g_regex_match() or g_regex_match_full() is, as usual, %TRUE
 797  * for a complete match, %FALSE otherwise. But, when these functions
 798  * return %FALSE, you can check if the match was partial calling
 799  * g_match_info_is_partial_match().
 800  *
 801  * When using partial matching you cannot use g_match_info_fetch*().
 802  *
 803  * Because of the way certain internal optimizations are implemented
 804  * the partial matching algorithm cannot be used with all patterns.
 805  * So repeated single characters such as "a{2,4}" and repeated single
 806  * meta-sequences such as "\d+" are not permitted if the maximum number
 807  * of occurrences is greater than one. Optional items such as "\d?"
 808  * (where the maximum is one) are permitted. Quantifiers with any values
 809  * are permitted after parentheses, so the invalid examples above can be
 810  * coded thus "(a){2,4}" and "(\d)+". If #G_REGEX_MATCH_PARTIAL is set
 811  * for a pattern that does not conform to the restrictions, matching
 812  * functions return an error.
 813  *
 814  * Returns: %TRUE if the match was partial, %FALSE otherwise
 815  *
 816  * Since: 2.14
 817  */
 818 gboolean
 819 g_match_info_is_partial_match (const GMatchInfo *match_info)
 820 {
 821   g_return_val_if_fail (match_info != NULL, FALSE);
 822
 823   return match_info->matches == PCRE_ERROR_PARTIAL;
 824 }
 825
 826 /**
 827  * g_match_info_expand_references:
 828  * @match_info: (allow-none): a #GMatchInfo or %NULL
 829  * @string_to_expand: the string to expand
 830  * @error: location to store the error occurring, or %NULL to ignore errors
 831  *
 832  * Returns a new string containing the text in @string_to_expand with
 833  * references and escape sequences expanded. References refer to the last
 834  * match done with @string against @regex and have the same syntax used by
 835  * g_regex_replace().
 836  *
 837  * The @string_to_expand must be UTF-8 encoded even if #G_REGEX_RAW was
 838  * passed to g_regex_new().
 839  *
 840  * The backreferences are extracted from the string passed to the match
 841  * function, so you cannot call this function after freeing the string.
 842  *
 843  * @match_info may be %NULL in which case @string_to_expand must not
 844  * contain references. For instance "foo\n" does not refer to an actual
 845  * pattern and '\n' merely will be replaced with \n character,
 846  * while to expand "\0" (whole match) one needs the result of a match.
 847  * Use g_regex_check_replacement() to find out whether @string_to_expand
 848  * contains references.
 849  *
 850  * Returns: (allow-none): the expanded string, or %NULL if an error occurred
 851  *
 852  * Since: 2.14
 853  */
 854 gchar *
 855 g_match_info_expand_references (const GMatchInfo  *match_info,
 856                                 const gchar       *string_to_expand,
 857                                 GError           **error)
 858 {
 859   GString *result;
 860   GList *list;
 861   GError *tmp_error = NULL;
 862
 863   g_return_val_if_fail (string_to_expand != NULL, NULL);
 864   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
 865
 866   list = split_replacement (string_to_expand, &tmp_error);
 867   if (tmp_error != NULL)
 868     {
 869       g_propagate_error (error, tmp_error);
 870       return NULL;
 871     }
 872
 873   if (!match_info && interpolation_list_needs_match (list))
 874     {
 875       g_critical ("String '%s' contains references to the match, can't "
 876                   "expand references without GMatchInfo object",
 877                   string_to_expand);
 878       return NULL;
 879     }
 880
 881   result = g_string_sized_new (strlen (string_to_expand));
 882   interpolate_replacement (match_info, result, list);
 883
 884   g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
 885
 886   return g_string_free (result, FALSE);
 887 }
 888
 889 /**
 890  * g_match_info_fetch:
 891  * @match_info: #GMatchInfo structure
 892  * @match_num: number of the sub expression
 893  *
 894  * Retrieves the text matching the @match_num<!-- -->'th capturing
 895  * parentheses. 0 is the full text of the match, 1 is the first paren
 896  * set, 2 the second, and so on.
 897  *
 898  * If @match_num is a valid sub pattern but it didn't match anything
 899  * (e.g. sub pattern 1, matching "b" against "(a)?b") then an empty
 900  * string is returned.
 901  *
 902  * If the match was obtained using the DFA algorithm, that is using
 903  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
 904  * string is not that of a set of parentheses but that of a matched
 905  * substring. Substrings are matched in reverse order of length, so
 906  * 0 is the longest match.
 907  *
 908  * The string is fetched from the string passed to the match function,
 909  * so you cannot call this function after freeing the string.
 910  *
 911  * Returns: (allow-none): The matched substring, or %NULL if an error
 912  *     occurred. You have to free the string yourself
 913  *
 914  * Since: 2.14
 915  */
 916 gchar *
 917 g_match_info_fetch (const GMatchInfo *match_info,
 918                     gint              match_num)
 919 {
 920   /* we cannot use pcre_get_substring() because it allocates the
 921    * string using pcre_malloc(). */
 922   gchar *match = NULL;
 923   gint start, end;
 924
 925   g_return_val_if_fail (match_info != NULL, NULL);
 926   g_return_val_if_fail (match_num >= 0, NULL);
 927
 928   /* match_num does not exist or it didn't matched, i.e. matching "b"
 929    * against "(a)?b" then group 0 is empty. */
 930   if (!g_match_info_fetch_pos (match_info, match_num, &start, &end))
 931     match = NULL;
 932   else if (start == -1)
 933     match = g_strdup ("");
 934   else
 935     match = g_strndup (&match_info->string[start], end - start);
 936
 937   return match;
 938 }
 939
 940 /**
 941  * g_match_info_fetch_pos:
 942  * @match_info: #GMatchInfo structure
 943  * @match_num: number of the sub expression
 944  * @start_pos: (out) (allow-none): pointer to location where to store
 945  *     the start position, or %NULL
 946  * @end_pos: (out) (allow-none): pointer to location where to store
 947  *     the end position, or %NULL
 948  *
 949  * Retrieves the position in bytes of the @match_num<!-- -->'th capturing
 950  * parentheses. 0 is the full text of the match, 1 is the first
 951  * paren set, 2 the second, and so on.
 952  *
 953  * If @match_num is a valid sub pattern but it didn't match anything
 954  * (e.g. sub pattern 1, matching "b" against "(a)?b") then @start_pos
 955  * and @end_pos are set to -1 and %TRUE is returned.
 956  *
 957  * If the match was obtained using the DFA algorithm, that is using
 958  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
 959  * position is not that of a set of parentheses but that of a matched
 960  * substring. Substrings are matched in reverse order of length, so
 961  * 0 is the longest match.
 962  *
 963  * Returns: %TRUE if the position was fetched, %FALSE otherwise. If
 964  *   the position cannot be fetched, @start_pos and @end_pos are left
 965  *   unchanged
 966  *
 967  * Since: 2.14
 968  */
 969 gboolean
 970 g_match_info_fetch_pos (const GMatchInfo *match_info,
 971                         gint              match_num,
 972                         gint             *start_pos,
 973                         gint             *end_pos)
 974 {
 975   g_return_val_if_fail (match_info != NULL, FALSE);
 976   g_return_val_if_fail (match_num >= 0, FALSE);
 977
 978   /* make sure the sub expression number they're requesting is less than
 979    * the total number of sub expressions that were matched. */
 980   if (match_num >= match_info->matches)
 981     return FALSE;
 982
 983   if (start_pos != NULL)
 984     *start_pos = match_info->offsets[2 * match_num];
 985
 986   if (end_pos != NULL)
 987     *end_pos = match_info->offsets[2 * match_num + 1];
 988
 989   return TRUE;
 990 }
 991
 992 /*
 993  * Returns number of first matched subpattern with name @name.
 994  * There may be more than one in case when DUPNAMES is used,
 995  * and not all subpatterns with that name match;
 996  * pcre_get_stringnumber() does not work in that case.
 997  */
 998 static gint
 999 get_matched_substring_number (const GMatchInfo *match_info,
1000                               const gchar      *name)
1001 {
1002   gint entrysize;
1003   gchar *first, *last;
1004   guchar *entry;
1005
1006   if (!(match_info->regex->compile_opts & G_REGEX_DUPNAMES))
1007     return pcre_get_stringnumber (match_info->regex->pcre_re, name);
1008
1009   /* This code is copied from pcre_get.c: get_first_set() */
1010   entrysize = pcre_get_stringtable_entries (match_info->regex->pcre_re,
1011                                             name,
1012                                             &first,
1013                                             &last);
1014
1015   if (entrysize <= 0)
1016     return entrysize;
1017
1018   for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize)
1019     {
1020       gint n = (entry[0] << 8) + entry[1];
1021       if (match_info->offsets[n*2] >= 0)
1022         return n;
1023     }
1024
1025   return (first[0] << 8) + first[1];
1026 }
1027
1028 /**
1029  * g_match_info_fetch_named:
1030  * @match_info: #GMatchInfo structure
1031  * @name: name of the subexpression
1032  *
1033  * Retrieves the text matching the capturing parentheses named @name.
1034  *
1035  * If @name is a valid sub pattern name but it didn't match anything
1036  * (e.g. sub pattern "X", matching "b" against "(?P&lt;X&gt;a)?b")
1037  * then an empty string is returned.
1038  *
1039  * The string is fetched from the string passed to the match function,
1040  * so you cannot call this function after freeing the string.
1041  *
1042  * Returns: (allow-none): The matched substring, or %NULL if an error
1043  *     occurred. You have to free the string yourself
1044  *
1045  * Since: 2.14
1046  */
1047 gchar *
1048 g_match_info_fetch_named (const GMatchInfo *match_info,
1049                           const gchar      *name)
1050 {
1051   /* we cannot use pcre_get_named_substring() because it allocates the
1052    * string using pcre_malloc(). */
1053   gint num;
1054
1055   g_return_val_if_fail (match_info != NULL, NULL);
1056   g_return_val_if_fail (name != NULL, NULL);
1057
1058   num = get_matched_substring_number (match_info, name);
1059   if (num < 0)
1060     return NULL;
1061   else
1062     return g_match_info_fetch (match_info, num);
1063 }
1064
1065 /**
1066  * g_match_info_fetch_named_pos:
1067  * @match_info: #GMatchInfo structure
1068  * @name: name of the subexpression
1069  * @start_pos: (out) (allow-none): pointer to location where to store
1070  *     the start position, or %NULL
1071  * @end_pos: (out) (allow-none): pointer to location where to store
1072  *     the end position, or %NULL
1073  *
1074  * Retrieves the position in bytes of the capturing parentheses named @name.
1075  *
1076  * If @name is a valid sub pattern name but it didn't match anything
1077  * (e.g. sub pattern "X", matching "b" against "(?P&lt;X&gt;a)?b")
1078  * then @start_pos and @end_pos are set to -1 and %TRUE is returned.
1079  *
1080  * Returns: %TRUE if the position was fetched, %FALSE otherwise.
1081  *     If the position cannot be fetched, @start_pos and @end_pos
1082  *     are left unchanged.
1083  *
1084  * Since: 2.14
1085  */
1086 gboolean
1087 g_match_info_fetch_named_pos (const GMatchInfo *match_info,
1088                               const gchar      *name,
1089                               gint             *start_pos,
1090                               gint             *end_pos)
1091 {
1092   gint num;
1093
1094   g_return_val_if_fail (match_info != NULL, FALSE);
1095   g_return_val_if_fail (name != NULL, FALSE);
1096
1097   num = get_matched_substring_number (match_info, name);
1098   if (num < 0)
1099     return FALSE;
1100
1101   return g_match_info_fetch_pos (match_info, num, start_pos, end_pos);
1102 }
1103
1104 /**
1105  * g_match_info_fetch_all:
1106  * @match_info: a #GMatchInfo structure
1107  *
1108  * Bundles up pointers to each of the matching substrings from a match
1109  * and stores them in an array of gchar pointers. The first element in
1110  * the returned array is the match number 0, i.e. the entire matched
1111  * text.
1112  *
1113  * If a sub pattern didn't match anything (e.g. sub pattern 1, matching
1114  * "b" against "(a)?b") then an empty string is inserted.
1115  *
1116  * If the last match was obtained using the DFA algorithm, that is using
1117  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1118  * strings are not that matched by sets of parentheses but that of the
1119  * matched substring. Substrings are matched in reverse order of length,
1120  * so the first one is the longest match.
1121  *
1122  * The strings are fetched from the string passed to the match function,
1123  * so you cannot call this function after freeing the string.
1124  *
1125  * Returns: (allow-none): a %NULL-terminated array of gchar * pointers.
1126  *     It must be freed using g_strfreev(). If the previous match failed
1127  *     %NULL is returned
1128  *
1129  * Since: 2.14
1130  */
1131 gchar **
1132 g_match_info_fetch_all (const GMatchInfo *match_info)
1133 {
1134   /* we cannot use pcre_get_substring_list() because the returned value
1135    * isn't suitable for g_strfreev(). */
1136   gchar **result;
1137   gint i;
1138
1139   g_return_val_if_fail (match_info != NULL, NULL);
1140
1141   if (match_info->matches < 0)
1142     return NULL;
1143
1144   result = g_new (gchar *, match_info->matches + 1);
1145   for (i = 0; i < match_info->matches; i++)
1146     result[i] = g_match_info_fetch (match_info, i);
1147   result[i] = NULL;
1148
1149   return result;
1150 }
1151
1152
1153 /* GRegex */
1154
1155 GQuark
1156 g_regex_error_quark (void)
1157 {
1158   static GQuark error_quark = 0;
1159
1160   if (error_quark == 0)
1161     error_quark = g_quark_from_static_string ("g-regex-error-quark");
1162
1163   return error_quark;
1164 }
1165
1166 /**
1167  * g_regex_ref:
1168  * @regex: a #GRegex
1169  *
1170  * Increases reference count of @regex by 1.
1171  *
1172  * Returns: @regex
1173  *
1174  * Since: 2.14
1175  */
1176 GRegex *
1177 g_regex_ref (GRegex *regex)
1178 {
1179   g_return_val_if_fail (regex != NULL, NULL);
1180   g_atomic_int_inc (&regex->ref_count);
1181   return regex;
1182 }
1183
1184 /**
1185  * g_regex_unref:
1186  * @regex: a #GRegex
1187  *
1188  * Decreases reference count of @regex by 1. When reference count drops
1189  * to zero, it frees all the memory associated with the regex structure.
1190  *
1191  * Since: 2.14
1192  */
1193 void
1194 g_regex_unref (GRegex *regex)
1195 {
1196   g_return_if_fail (regex != NULL);
1197
1198   if (g_atomic_int_dec_and_test (&regex->ref_count))
1199     {
1200       g_free (regex->pattern);
1201       if (regex->pcre_re != NULL)
1202         pcre_free (regex->pcre_re);
1203       if (regex->extra != NULL)
1204         pcre_free (regex->extra);
1205       g_free (regex);
1206     }
1207 }
1208
1209 /**
1210  * g_regex_new:
1211  * @pattern: the regular expression
1212  * @compile_options: compile options for the regular expression, or 0
1213  * @match_options: match options for the regular expression, or 0
1214  * @error: return location for a #GError
1215  *
1216  * Compiles the regular expression to an internal form, and does
1217  * the initial setup of the #GRegex structure.
1218  *
1219  * Returns: a #GRegex structure. Call g_regex_unref() when you
1220  *   are done with it
1221  *
1222  * Since: 2.14
1223  */
1224 GRegex *
1225 g_regex_new (const gchar         *pattern,
1226              GRegexCompileFlags   compile_options,
1227              GRegexMatchFlags     match_options,
1228              GError             **error)
1229 {
1230   GRegex *regex;
1231   pcre *re;
1232   const gchar *errmsg;
1233   gint erroffset;
1234   gint errcode;
1235   gboolean optimize = FALSE;
1236   static gsize initialised;
1237   unsigned long int pcre_compile_options;
1238
1239   g_return_val_if_fail (pattern != NULL, NULL);
1240   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
1241   g_return_val_if_fail ((compile_options & ~G_REGEX_COMPILE_MASK) == 0, NULL);
1242   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
1243
1244   if (g_once_init_enter (&initialised))
1245     {
1246       gint support;
1247       const gchar *msg;
1248
1249       pcre_config (PCRE_CONFIG_UTF8, &support);
1250       if (!support)
1251         {
1252           msg = N_("PCRE library is compiled without UTF8 support");
1253           g_critical ("%s", msg);
1254           g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg));
1255           return NULL;
1256         }
1257
1258       pcre_config (PCRE_CONFIG_UNICODE_PROPERTIES, &support);
1259       if (!support)
1260         {
1261           msg = N_("PCRE library is compiled without UTF8 properties support");
1262           g_critical ("%s", msg);
1263           g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg));
1264           return NULL;
1265         }
1266
1267       g_once_init_leave (&initialised, TRUE);
1268     }
1269
1270   /* G_REGEX_OPTIMIZE has the same numeric value of PCRE_NO_UTF8_CHECK,
1271    * as we do not need to wrap PCRE_NO_UTF8_CHECK. */
1272   if (compile_options & G_REGEX_OPTIMIZE)
1273     optimize = TRUE;
1274
1275   /* In GRegex the string are, by default, UTF-8 encoded. PCRE
1276    * instead uses UTF-8 only if required with PCRE_UTF8. */
1277   if (compile_options & G_REGEX_RAW)
1278     {
1279       /* disable utf-8 */
1280       compile_options &= ~G_REGEX_RAW;
1281     }
1282   else
1283     {
1284       /* enable utf-8 */
1285       compile_options |= PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
1286       match_options |= PCRE_NO_UTF8_CHECK;
1287     }
1288
1289   /* PCRE_NEWLINE_ANY is the default for the internal PCRE but
1290    * not for the system one. */
1291   if (!(compile_options & G_REGEX_NEWLINE_CR) &&
1292       !(compile_options & G_REGEX_NEWLINE_LF))
1293     {
1294       compile_options |= PCRE_NEWLINE_ANY;
1295     }
1296
1297   compile_options |= PCRE_UCP;
1298
1299   /* compile the pattern */
1300   re = pcre_compile2 (pattern, compile_options, &errcode,
1301                       &errmsg, &erroffset, NULL);
1302
1303   /* if the compilation failed, set the error member and return
1304    * immediately */
1305   if (re == NULL)
1306     {
1307       GError *tmp_error;
1308
1309       /* Translate the PCRE error code to GRegexError and use a translated
1310        * error message if possible */
1311       translate_compile_error (&errcode, &errmsg);
1312
1313       /* PCRE uses byte offsets but we want to show character offsets */
1314       erroffset = g_utf8_pointer_to_offset (pattern, &pattern[erroffset]);
1315
1316       tmp_error = g_error_new (G_REGEX_ERROR, errcode,
1317                                _("Error while compiling regular "
1318                                  "expression %s at char %d: %s"),
1319                                pattern, erroffset, errmsg);
1320       g_propagate_error (error, tmp_error);
1321
1322       return NULL;
1323     }
1324
1325   /* For options set at the beginning of the pattern, pcre puts them into
1326    * compile options, e.g. "(?i)foo" will make the pcre structure store
1327    * PCRE_CASELESS even though it wasn't explicitly given for compilation. */
1328   pcre_fullinfo (re, NULL, PCRE_INFO_OPTIONS, &pcre_compile_options);
1329   compile_options = pcre_compile_options;
1330
1331   if (!(compile_options & G_REGEX_DUPNAMES))
1332     {
1333       gboolean jchanged = FALSE;
1334       pcre_fullinfo (re, NULL, PCRE_INFO_JCHANGED, &jchanged);
1335       if (jchanged)
1336         compile_options |= G_REGEX_DUPNAMES;
1337     }
1338
1339   regex = g_new0 (GRegex, 1);
1340   regex->ref_count = 1;
1341   regex->pattern = g_strdup (pattern);
1342   regex->pcre_re = re;
1343   regex->compile_opts = compile_options;
1344   regex->match_opts = match_options;
1345
1346   if (optimize)
1347     {
1348       regex->extra = pcre_study (regex->pcre_re, 0, &errmsg);
1349       if (errmsg != NULL)
1350         {
1351           GError *tmp_error = g_error_new (G_REGEX_ERROR,
1352                                            G_REGEX_ERROR_OPTIMIZE,
1353                                            _("Error while optimizing "
1354                                              "regular expression %s: %s"),
1355                                            regex->pattern,
1356                                            errmsg);
1357           g_propagate_error (error, tmp_error);
1358
1359           g_regex_unref (regex);
1360           return NULL;
1361         }
1362     }
1363
1364   return regex;
1365 }
1366
1367 /**
1368  * g_regex_get_pattern:
1369  * @regex: a #GRegex structure
1370  *
1371  * Gets the pattern string associated with @regex, i.e. a copy of
1372  * the string passed to g_regex_new().
1373  *
1374  * Returns: the pattern of @regex
1375  *
1376  * Since: 2.14
1377  */
1378 const gchar *
1379 g_regex_get_pattern (const GRegex *regex)
1380 {
1381   g_return_val_if_fail (regex != NULL, NULL);
1382
1383   return regex->pattern;
1384 }
1385
1386 /**
1387  * g_regex_get_max_backref:
1388  * @regex: a #GRegex
1389  *
1390  * Returns the number of the highest back reference
1391  * in the pattern, or 0 if the pattern does not contain
1392  * back references.
1393  *
1394  * Returns: the number of the highest back reference
1395  *
1396  * Since: 2.14
1397  */
1398 gint
1399 g_regex_get_max_backref (const GRegex *regex)
1400 {
1401   gint value;
1402
1403   pcre_fullinfo (regex->pcre_re, regex->extra,
1404                  PCRE_INFO_BACKREFMAX, &value);
1405
1406   return value;
1407 }
1408
1409 /**
1410  * g_regex_get_capture_count:
1411  * @regex: a #GRegex
1412  *
1413  * Returns the number of capturing subpatterns in the pattern.
1414  *
1415  * Returns: the number of capturing subpatterns
1416  *
1417  * Since: 2.14
1418  */
1419 gint
1420 g_regex_get_capture_count (const GRegex *regex)
1421 {
1422   gint value;
1423
1424   pcre_fullinfo (regex->pcre_re, regex->extra,
1425                  PCRE_INFO_CAPTURECOUNT, &value);
1426
1427   return value;
1428 }
1429
1430 /**
1431  * g_regex_get_compile_flags:
1432  * @regex: a #GRegex
1433  *
1434  * Returns the compile options that @regex was created with.
1435  *
1436  * Returns: flags from #GRegexCompileFlags
1437  *
1438  * Since: 2.26
1439  */
1440 GRegexCompileFlags
1441 g_regex_get_compile_flags (const GRegex *regex)
1442 {
1443   g_return_val_if_fail (regex != NULL, 0);
1444
1445   return regex->compile_opts;
1446 }
1447
1448 /**
1449  * g_regex_get_match_flags:
1450  * @regex: a #GRegex
1451  *
1452  * Returns the match options that @regex was created with.
1453  *
1454  * Returns: flags from #GRegexMatchFlags
1455  *
1456  * Since: 2.26
1457  */
1458 GRegexMatchFlags
1459 g_regex_get_match_flags (const GRegex *regex)
1460 {
1461   g_return_val_if_fail (regex != NULL, 0);
1462
1463   return regex->match_opts;
1464 }
1465
1466 /**
1467  * g_regex_match_simple:
1468  * @pattern: the regular expression
1469  * @string: the string to scan for matches
1470  * @compile_options: compile options for the regular expression, or 0
1471  * @match_options: match options, or 0
1472  *
1473  * Scans for a match in @string for @pattern.
1474  *
 * This function is equivalent to g_regex_match() but it does not
 * require you to compile the pattern with g_regex_new(), saving some
 * lines of code when you just need to do a match without extracting
 * substrings, capture counts, and so on.
1479  *
1480  * If this function is to be called on the same @pattern more than
1481  * once, it's more efficient to compile the pattern once with
1482  * g_regex_new() and then use g_regex_match().
1483  *
1484  * Returns: %TRUE if the string matched, %FALSE otherwise
1485  *
1486  * Since: 2.14
1487  */
1488 gboolean
1489 g_regex_match_simple (const gchar        *pattern,
1490                       const gchar        *string,
1491                       GRegexCompileFlags  compile_options,
1492                       GRegexMatchFlags    match_options)
1493 {
1494   GRegex *regex;
1495   gboolean result;
1496
1497   regex = g_regex_new (pattern, compile_options, 0, NULL);
1498   if (!regex)
1499     return FALSE;
1500   result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL);
1501   g_regex_unref (regex);
1502   return result;
1503 }
1504
1505 /**
1506  * g_regex_match:
1507  * @regex: a #GRegex structure from g_regex_new()
1508  * @string: the string to scan for matches
1509  * @match_options: match options
1510  * @match_info: (out) (allow-none): pointer to location where to store
1511  *     the #GMatchInfo, or %NULL if you do not need it
1512  *
1513  * Scans for a match in string for the pattern in @regex.
1514  * The @match_options are combined with the match options specified
1515  * when the @regex structure was created, letting you have more
1516  * flexibility in reusing #GRegex structures.
1517  *
1518  * A #GMatchInfo structure, used to get information on the match,
1519  * is stored in @match_info if not %NULL. Note that if @match_info
1520  * is not %NULL then it is created even if the function returns %FALSE,
1521  * i.e. you must free it regardless if regular expression actually matched.
1522  *
1523  * To retrieve all the non-overlapping matches of the pattern in
1524  * string you can use g_match_info_next().
1525  *
1526  * |[
1527  * static void
1528  * print_uppercase_words (const gchar *string)
1529  * {
1530  *   /&ast; Print all uppercase-only words. &ast;/
1531  *   GRegex *regex;
1532  *   GMatchInfo *match_info;
1533  *   &nbsp;
1534  *   regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
1535  *   g_regex_match (regex, string, 0, &amp;match_info);
1536  *   while (g_match_info_matches (match_info))
1537  *     {
1538  *       gchar *word = g_match_info_fetch (match_info, 0);
1539  *       g_print ("Found: %s\n", word);
1540  *       g_free (word);
1541  *       g_match_info_next (match_info, NULL);
1542  *     }
1543  *   g_match_info_free (match_info);
1544  *   g_regex_unref (regex);
1545  * }
1546  * ]|
1547  *
1548  * @string is not copied and is used in #GMatchInfo internally. If
1549  * you use any #GMatchInfo method (except g_match_info_free()) after
1550  * freeing or modifying @string then the behaviour is undefined.
1551  *
 * Returns: %TRUE if the string matched, %FALSE otherwise
1553  *
1554  * Since: 2.14
1555  */
1556 gboolean
1557 g_regex_match (const GRegex      *regex,
1558                const gchar       *string,
1559                GRegexMatchFlags   match_options,
1560                GMatchInfo       **match_info)
1561 {
1562   return g_regex_match_full (regex, string, -1, 0, match_options,
1563                              match_info, NULL);
1564 }
1565
1566 /**
1567  * g_regex_match_full:
1568  * @regex: a #GRegex structure from g_regex_new()
1569  * @string: (array length=string_len): the string to scan for matches
1570  * @string_len: the length of @string, or -1 if @string is nul-terminated
1571  * @start_position: starting index of the string to match
1572  * @match_options: match options
1573  * @match_info: (out) (allow-none): pointer to location where to store
1574  *     the #GMatchInfo, or %NULL if you do not need it
1575  * @error: location to store the error occurring, or %NULL to ignore errors
1576  *
1577  * Scans for a match in string for the pattern in @regex.
1578  * The @match_options are combined with the match options specified
1579  * when the @regex structure was created, letting you have more
1580  * flexibility in reusing #GRegex structures.
1581  *
1582  * Setting @start_position differs from just passing over a shortened
1583  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
1584  * that begins with any kind of lookbehind assertion, such as "\b".
1585  *
1586  * A #GMatchInfo structure, used to get information on the match, is
1587  * stored in @match_info if not %NULL. Note that if @match_info is
1588  * not %NULL then it is created even if the function returns %FALSE,
1589  * i.e. you must free it regardless if regular expression actually
1590  * matched.
1591  *
1592  * @string is not copied and is used in #GMatchInfo internally. If
1593  * you use any #GMatchInfo method (except g_match_info_free()) after
1594  * freeing or modifying @string then the behaviour is undefined.
1595  *
1596  * To retrieve all the non-overlapping matches of the pattern in
1597  * string you can use g_match_info_next().
1598  *
1599  * |[
1600  * static void
1601  * print_uppercase_words (const gchar *string)
1602  * {
1603  *   /&ast; Print all uppercase-only words. &ast;/
1604  *   GRegex *regex;
1605  *   GMatchInfo *match_info;
1606  *   GError *error = NULL;
1607  *   &nbsp;
1608  *   regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
1609  *   g_regex_match_full (regex, string, -1, 0, 0, &amp;match_info, &amp;error);
1610  *   while (g_match_info_matches (match_info))
1611  *     {
1612  *       gchar *word = g_match_info_fetch (match_info, 0);
1613  *       g_print ("Found: %s\n", word);
1614  *       g_free (word);
1615  *       g_match_info_next (match_info, &amp;error);
1616  *     }
1617  *   g_match_info_free (match_info);
1618  *   g_regex_unref (regex);
1619  *   if (error != NULL)
1620  *     {
1621  *       g_printerr ("Error while matching: %s\n", error->message);
1622  *       g_error_free (error);
1623  *     }
1624  * }
1625  * ]|
1626  *
 * Returns: %TRUE if the string matched, %FALSE otherwise
1628  *
1629  * Since: 2.14
1630  */
1631 gboolean
1632 g_regex_match_full (const GRegex      *regex,
1633                     const gchar       *string,
1634                     gssize             string_len,
1635                     gint               start_position,
1636                     GRegexMatchFlags   match_options,
1637                     GMatchInfo       **match_info,
1638                     GError           **error)
1639 {
1640   GMatchInfo *info;
1641   gboolean match_ok;
1642
1643   g_return_val_if_fail (regex != NULL, FALSE);
1644   g_return_val_if_fail (string != NULL, FALSE);
1645   g_return_val_if_fail (start_position >= 0, FALSE);
1646   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
1647   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
1648
1649   info = match_info_new (regex, string, string_len, start_position,
1650                          match_options, FALSE);
1651   match_ok = g_match_info_next (info, error);
1652   if (match_info != NULL)
1653     *match_info = info;
1654   else
1655     g_match_info_free (info);
1656
1657   return match_ok;
1658 }
1659
1660 /**
1661  * g_regex_match_all:
1662  * @regex: a #GRegex structure from g_regex_new()
1663  * @string: the string to scan for matches
1664  * @match_options: match options
1665  * @match_info: (out) (allow-none): pointer to location where to store
1666  *     the #GMatchInfo, or %NULL if you do not need it
1667  *
1668  * Using the standard algorithm for regular expression matching only
1669  * the longest match in the string is retrieved. This function uses
1670  * a different algorithm so it can retrieve all the possible matches.
1671  * For more documentation see g_regex_match_all_full().
1672  *
1673  * A #GMatchInfo structure, used to get information on the match, is
1674  * stored in @match_info if not %NULL. Note that if @match_info is
1675  * not %NULL then it is created even if the function returns %FALSE,
1676  * i.e. you must free it regardless if regular expression actually
1677  * matched.
1678  *
1679  * @string is not copied and is used in #GMatchInfo internally. If
1680  * you use any #GMatchInfo method (except g_match_info_free()) after
1681  * freeing or modifying @string then the behaviour is undefined.
1682  *
1683  * Returns: %TRUE if the string matched, %FALSE otherwise
1684  *
1685  * Since: 2.14
1686  */
1687 gboolean
1688 g_regex_match_all (const GRegex      *regex,
1689                    const gchar       *string,
1690                    GRegexMatchFlags   match_options,
1691                    GMatchInfo       **match_info)
1692 {
1693   return g_regex_match_all_full (regex, string, -1, 0, match_options,
1694                                  match_info, NULL);
1695 }
1696
1697 /**
1698  * g_regex_match_all_full:
1699  * @regex: a #GRegex structure from g_regex_new()
1700  * @string: (array length=string_len): the string to scan for matches
1701  * @string_len: the length of @string, or -1 if @string is nul-terminated
1702  * @start_position: starting index of the string to match
1703  * @match_options: match options
1704  * @match_info: (out) (allow-none): pointer to location where to store
1705  *     the #GMatchInfo, or %NULL if you do not need it
1706  * @error: location to store the error occurring, or %NULL to ignore errors
1707  *
1708  * Using the standard algorithm for regular expression matching only
1709  * the longest match in the string is retrieved, it is not possible
1710  * to obtain all the available matches. For instance matching
1711  * "&lt;a&gt; &lt;b&gt; &lt;c&gt;" against the pattern "&lt;.*&gt;"
1712  * you get "&lt;a&gt; &lt;b&gt; &lt;c&gt;".
1713  *
1714  * This function uses a different algorithm (called DFA, i.e. deterministic
1715  * finite automaton), so it can retrieve all the possible matches, all
1716  * starting at the same point in the string. For instance matching
1717  * "&lt;a&gt; &lt;b&gt; &lt;c&gt;" against the pattern "&lt;.*&gt;"
1718  * you would obtain three matches: "&lt;a&gt; &lt;b&gt; &lt;c&gt;",
1719  * "&lt;a&gt; &lt;b&gt;" and "&lt;a&gt;".
1720  *
1721  * The number of matched strings is retrieved using
1722  * g_match_info_get_match_count(). To obtain the matched strings and
1723  * their position you can use, respectively, g_match_info_fetch() and
1724  * g_match_info_fetch_pos(). Note that the strings are returned in
1725  * reverse order of length; that is, the longest matching string is
1726  * given first.
1727  *
1728  * Note that the DFA algorithm is slower than the standard one and it
1729  * is not able to capture substrings, so backreferences do not work.
1730  *
1731  * Setting @start_position differs from just passing over a shortened
1732  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
1733  * that begins with any kind of lookbehind assertion, such as "\b".
1734  *
1735  * A #GMatchInfo structure, used to get information on the match, is
1736  * stored in @match_info if not %NULL. Note that if @match_info is
1737  * not %NULL then it is created even if the function returns %FALSE,
1738  * i.e. you must free it regardless if regular expression actually
1739  * matched.
1740  *
1741  * @string is not copied and is used in #GMatchInfo internally. If
1742  * you use any #GMatchInfo method (except g_match_info_free()) after
1743  * freeing or modifying @string then the behaviour is undefined.
1744  *
1745  * Returns: %TRUE if the string matched, %FALSE otherwise
1746  *
1747  * Since: 2.14
1748  */
1749 gboolean
1750 g_regex_match_all_full (const GRegex      *regex,
1751                         const gchar       *string,
1752                         gssize             string_len,
1753                         gint               start_position,
1754                         GRegexMatchFlags   match_options,
1755                         GMatchInfo       **match_info,
1756                         GError           **error)
1757 {
1758   GMatchInfo *info;
1759   gboolean done;
1760
1761   g_return_val_if_fail (regex != NULL, FALSE);
1762   g_return_val_if_fail (string != NULL, FALSE);
1763   g_return_val_if_fail (start_position >= 0, FALSE);
1764   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
1765   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
1766
1767   info = match_info_new (regex, string, string_len, start_position,
1768                          match_options, TRUE);
1769
1770   done = FALSE;
1771   while (!done)
1772     {
1773       done = TRUE;
1774       info->matches = pcre_dfa_exec (regex->pcre_re, regex->extra,
1775                                      info->string, info->string_len,
1776                                      info->pos,
1777                                      regex->match_opts | match_options,
1778                                      info->offsets, info->n_offsets,
1779                                      info->workspace, info->n_workspace);
1780       if (info->matches == PCRE_ERROR_DFA_WSSIZE)
1781         {
1782           /* info->workspace is too small. */
1783           info->n_workspace *= 2;
1784           info->workspace = g_realloc (info->workspace,
1785                                        info->n_workspace * sizeof (gint));
1786           done = FALSE;
1787         }
1788       else if (info->matches == 0)
1789         {
1790           /* info->offsets is too small. */
1791           info->n_offsets *= 2;
1792           info->offsets = g_realloc (info->offsets,
1793                                      info->n_offsets * sizeof (gint));
1794           done = FALSE;
1795         }
1796       else if (IS_PCRE_ERROR (info->matches))
1797         {
1798           g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
1799                        _("Error while matching regular expression %s: %s"),
1800                        regex->pattern, match_error (info->matches));
1801         }
1802     }
1803
1804   /* set info->pos to -1 so that a call to g_match_info_next() fails. */
1805   info->pos = -1;
1806
1807   if (match_info != NULL)
1808     *match_info = info;
1809   else
1810     g_match_info_free (info);
1811
1812   return info->matches >= 0;
1813 }
1814
1815 /**
1816  * g_regex_get_string_number:
1817  * @regex: #GRegex structure
1818  * @name: name of the subexpression
1819  *
1820  * Retrieves the number of the subexpression named @name.
1821  *
1822  * Returns: The number of the subexpression or -1 if @name
1823  *   does not exist
1824  *
1825  * Since: 2.14
1826  */
1827 gint
1828 g_regex_get_string_number (const GRegex *regex,
1829                            const gchar  *name)
1830 {
1831   gint num;
1832
1833   g_return_val_if_fail (regex != NULL, -1);
1834   g_return_val_if_fail (name != NULL, -1);
1835
1836   num = pcre_get_stringnumber (regex->pcre_re, name);
1837   if (num == PCRE_ERROR_NOSUBSTRING)
1838     num = -1;
1839
1840   return num;
1841 }
1842
1843 /**
1844  * g_regex_split_simple:
1845  * @pattern: the regular expression
1846  * @string: the string to scan for matches
1847  * @compile_options: compile options for the regular expression, or 0
1848  * @match_options: match options, or 0
1849  *
1850  * Breaks the string on the pattern, and returns an array of
1851  * the tokens. If the pattern contains capturing parentheses,
1852  * then the text for each of the substrings will also be returned.
1853  * If the pattern does not match anywhere in the string, then the
1854  * whole string is returned as the first token.
1855  *
1856  * This function is equivalent to g_regex_split() but it does
1857  * not require to compile the pattern with g_regex_new(), avoiding
1858  * some lines of code when you need just to do a split without
1859  * extracting substrings, capture counts, and so on.
1860  *
1861  * If this function is to be called on the same @pattern more than
1862  * once, it's more efficient to compile the pattern once with
1863  * g_regex_new() and then use g_regex_split().
1864  *
1865  * As a special case, the result of splitting the empty string ""
1866  * is an empty vector, not a vector containing a single string.
1867  * The reason for this special case is that being able to represent
1868  * an empty vector is typically more useful than consistent handling
1869  * of empty elements. If you do need to represent empty elements,
1870  * you'll need to check for the empty string before calling this
1871  * function.
1872  *
1873  * A pattern that can match empty strings splits @string into
1874  * separate characters wherever it matches the empty string between
1875  * characters. For example splitting "ab c" using as a separator
1876  * "\s*", you will get "a", "b" and "c".
1877  *
1878  * Returns: a %NULL-terminated array of strings. Free it using g_strfreev()
1879  *
1880  * Since: 2.14
1881  **/
1882 gchar **
1883 g_regex_split_simple (const gchar        *pattern,
1884                       const gchar        *string,
1885                       GRegexCompileFlags  compile_options,
1886                       GRegexMatchFlags    match_options)
1887 {
1888   GRegex *regex;
1889   gchar **result;
1890
1891   regex = g_regex_new (pattern, compile_options, 0, NULL);
1892   if (!regex)
1893     return NULL;
1894
1895   result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL);
1896   g_regex_unref (regex);
1897   return result;
1898 }
1899
1900 /**
1901  * g_regex_split:
1902  * @regex: a #GRegex structure
1903  * @string: the string to split with the pattern
1904  * @match_options: match time option flags
1905  *
1906  * Breaks the string on the pattern, and returns an array of the tokens.
1907  * If the pattern contains capturing parentheses, then the text for each
1908  * of the substrings will also be returned. If the pattern does not match
1909  * anywhere in the string, then the whole string is returned as the first
1910  * token.
1911  *
1912  * As a special case, the result of splitting the empty string "" is an
1913  * empty vector, not a vector containing a single string. The reason for
1914  * this special case is that being able to represent an empty vector is
1915  * typically more useful than consistent handling of empty elements. If
1916  * you do need to represent empty elements, you'll need to check for the
1917  * empty string before calling this function.
1918  *
1919  * A pattern that can match empty strings splits @string into separate
1920  * characters wherever it matches the empty string between characters.
1921  * For example splitting "ab c" using as a separator "\s*", you will get
1922  * "a", "b" and "c".
1923  *
1924  * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
1925  *
1926  * Since: 2.14
1927  **/
1928 gchar **
1929 g_regex_split (const GRegex     *regex,
1930                const gchar      *string,
1931                GRegexMatchFlags  match_options)
1932 {
1933   return g_regex_split_full (regex, string, -1, 0,
1934                              match_options, 0, NULL);
1935 }
1936
1937 /**
1938  * g_regex_split_full:
1939  * @regex: a #GRegex structure
1940  * @string: (array length=string_len): the string to split with the pattern
1941  * @string_len: the length of @string, or -1 if @string is nul-terminated
1942  * @start_position: starting index of the string to match
1943  * @match_options: match time option flags
1944  * @max_tokens: the maximum number of tokens to split @string into.
1945  *   If this is less than 1, the string is split completely
1946  * @error: return location for a #GError
1947  *
1948  * Breaks the string on the pattern, and returns an array of the tokens.
1949  * If the pattern contains capturing parentheses, then the text for each
1950  * of the substrings will also be returned. If the pattern does not match
1951  * anywhere in the string, then the whole string is returned as the first
1952  * token.
1953  *
1954  * As a special case, the result of splitting the empty string "" is an
1955  * empty vector, not a vector containing a single string. The reason for
1956  * this special case is that being able to represent an empty vector is
1957  * typically more useful than consistent handling of empty elements. If
1958  * you do need to represent empty elements, you'll need to check for the
1959  * empty string before calling this function.
1960  *
1961  * A pattern that can match empty strings splits @string into separate
1962  * characters wherever it matches the empty string between characters.
1963  * For example splitting "ab c" using as a separator "\s*", you will get
1964  * "a", "b" and "c".
1965  *
1966  * Setting @start_position differs from just passing over a shortened
1967  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
1968  * that begins with any kind of lookbehind assertion, such as "\b".
1969  *
1970  * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
1971  *
1972  * Since: 2.14
1973  **/
1974 gchar **
1975 g_regex_split_full (const GRegex      *regex,
1976                     const gchar       *string,
1977                     gssize             string_len,
1978                     gint               start_position,
1979                     GRegexMatchFlags   match_options,
1980                     gint               max_tokens,
1981                     GError           **error)
1982 {
1983   GError *tmp_error = NULL;
1984   GMatchInfo *match_info;
1985   GList *list, *last;
1986   gint i;
1987   gint token_count;
1988   gboolean match_ok;
1989   /* position of the last separator. */
1990   gint last_separator_end;
1991   /* was the last match 0 bytes long? */
1992   gboolean last_match_is_empty;
1993   /* the returned array of char **s */
1994   gchar **string_list;
1995
1996   g_return_val_if_fail (regex != NULL, NULL);
1997   g_return_val_if_fail (string != NULL, NULL);
1998   g_return_val_if_fail (start_position >= 0, NULL);
1999   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
2000   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2001
2002   if (max_tokens <= 0)
2003     max_tokens = G_MAXINT;
2004
2005   if (string_len < 0)
2006     string_len = strlen (string);
2007
2008   /* zero-length string */
2009   if (string_len - start_position == 0)
2010     return g_new0 (gchar *, 1);
2011
2012   if (max_tokens == 1)
2013     {
2014       string_list = g_new0 (gchar *, 2);
2015       string_list[0] = g_strndup (&string[start_position],
2016                                   string_len - start_position);
2017       return string_list;
2018     }
2019
2020   list = NULL;
2021   token_count = 0;
2022   last_separator_end = start_position;
2023   last_match_is_empty = FALSE;
2024
2025   match_ok = g_regex_match_full (regex, string, string_len, start_position,
2026                                  match_options, &match_info, &tmp_error);
2027
2028   while (tmp_error == NULL)
2029     {
2030       if (match_ok)
2031         {
2032           last_match_is_empty =
2033                     (match_info->offsets[0] == match_info->offsets[1]);
2034
2035           /* we need to skip empty separators at the same position of the end
2036            * of another separator. e.g. the string is "a b" and the separator
2037            * is " *", so from 1 to 2 we have a match and at position 2 we have
2038            * an empty match. */
2039           if (last_separator_end != match_info->offsets[1])
2040             {
2041               gchar *token;
2042               gint match_count;
2043
2044               token = g_strndup (string + last_separator_end,
2045                                  match_info->offsets[0] - last_separator_end);
2046               list = g_list_prepend (list, token);
2047               token_count++;
2048
2049               /* if there were substrings, these need to be added to
2050                * the list. */
2051               match_count = g_match_info_get_match_count (match_info);
2052               if (match_count > 1)
2053                 {
2054                   for (i = 1; i < match_count; i++)
2055                     list = g_list_prepend (list, g_match_info_fetch (match_info, i));
2056                 }
2057             }
2058         }
2059       else
2060         {
2061           /* if there was no match, copy to end of string. */
2062           if (!last_match_is_empty)
2063             {
2064               gchar *token = g_strndup (string + last_separator_end,
2065                                         match_info->string_len - last_separator_end);
2066               list = g_list_prepend (list, token);
2067             }
2068           /* no more tokens, end the loop. */
2069           break;
2070         }
2071
2072       /* -1 to leave room for the last part. */
2073       if (token_count >= max_tokens - 1)
2074         {
2075           /* we have reached the maximum number of tokens, so we copy
2076            * the remaining part of the string. */
2077           if (last_match_is_empty)
2078             {
2079               /* the last match was empty, so we have moved one char
2080                * after the real position to avoid empty matches at the
2081                * same position. */
2082               match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string;
2083             }
2084           /* the if is needed in the case we have terminated the available
2085            * tokens, but we are at the end of the string, so there are no
2086            * characters left to copy. */
2087           if (string_len > match_info->pos)
2088             {
2089               gchar *token = g_strndup (string + match_info->pos,
2090                                         string_len - match_info->pos);
2091               list = g_list_prepend (list, token);
2092             }
2093           /* end the loop. */
2094           break;
2095         }
2096
2097       last_separator_end = match_info->pos;
2098       if (last_match_is_empty)
2099         /* if the last match was empty, g_match_info_next() has moved
2100          * forward to avoid infinite loops, but we still need to copy that
2101          * character. */
2102         last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string;
2103
2104       match_ok = g_match_info_next (match_info, &tmp_error);
2105     }
2106   g_match_info_free (match_info);
2107   if (tmp_error != NULL)
2108     {
2109       g_propagate_error (error, tmp_error);
2110       g_list_free_full (list, g_free);
2111       match_info->pos = -1;
2112       return NULL;
2113     }
2114
2115   string_list = g_new (gchar *, g_list_length (list) + 1);
2116   i = 0;
2117   for (last = g_list_last (list); last; last = g_list_previous (last))
2118     string_list[i++] = last->data;
2119   string_list[i] = NULL;
2120   g_list_free (list);
2121
2122   return string_list;
2123 }
2124
/* Kind of a parsed replacement element; stored in the "type" field of
 * InterpolationData. */
enum
{
  REPL_TYPE_STRING             = 0, /* literal text */
  REPL_TYPE_CHARACTER          = 1, /* single character */
  REPL_TYPE_SYMBOLIC_REFERENCE = 2, /* reference to a named subexpression */
  REPL_TYPE_NUMERIC_REFERENCE  = 3, /* reference to a numbered subexpression */
  REPL_TYPE_CHANGE_CASE        = 4  /* case-conversion switch */
};
2133
/* Case-conversion state used while interpolating a replacement.
 * Each mode is a distinct bit so that the *_MASK values can test for a
 * group of modes with a single bitwise AND. */
typedef enum
{
  CHANGE_CASE_NONE         = 0x01,
  CHANGE_CASE_UPPER        = 0x02,
  CHANGE_CASE_LOWER        = 0x04,
  CHANGE_CASE_UPPER_SINGLE = 0x08,
  CHANGE_CASE_LOWER_SINGLE = 0x10,

  /* either of the one-character modes */
  CHANGE_CASE_SINGLE_MASK  = (CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_LOWER_SINGLE),
  /* any mode that converts to lower case */
  CHANGE_CASE_LOWER_MASK   = (CHANGE_CASE_LOWER | CHANGE_CASE_LOWER_SINGLE),
  /* any mode that converts to upper case */
  CHANGE_CASE_UPPER_MASK   = (CHANGE_CASE_UPPER | CHANGE_CASE_UPPER_SINGLE)
} ChangeCase;
2145
/* One parsed element of a replacement string. "type" is one of the
 * REPL_TYPE_* values and determines which of the other fields is
 * meaningful. */
struct _InterpolationData
{
  gchar     *text;         /* heap-allocated literal text (REPL_TYPE_STRING) or
                            * subexpression name (REPL_TYPE_SYMBOLIC_REFERENCE);
                            * released by free_interpolation_data() */
  gint       type;         /* REPL_TYPE_* discriminator */
  gint       num;          /* subexpression number (REPL_TYPE_NUMERIC_REFERENCE) */
  gchar      c;            /* the character (REPL_TYPE_CHARACTER) */
  ChangeCase change_case;  /* requested mode (REPL_TYPE_CHANGE_CASE) */
};
2154
2155 static void
2156 free_interpolation_data (InterpolationData *data)
2157 {
2158   g_free (data->text);
2159   g_free (data);
2160 }
2161
/* Parses a single backslash escape sequence of a replacement string.
 *
 * @replacement: the complete replacement text; used only to build the
 *   error message and to compute the offset reported in it
 * @p: position of the backslash introducing the escape sequence
 * @data: element to fill in; its type and the field that goes with that
 *   type are set according to the escape found
 * @error: return location for a G_REGEX_ERROR_REPLACE error
 *
 * Returns: the position just after the escape sequence, or NULL (with
 *   @error set) if the sequence is malformed or unknown.
 */
static const gchar *
expand_escape (const gchar        *replacement,
               const gchar        *p,
               InterpolationData  *data,
               GError            **error)
{
  const gchar *q, *r;
  gint x, d, h, i;
  const gchar *error_detail;
  gint base = 0;
  GError *tmp_error = NULL;

  /* skip the backslash; *p is now the character selecting the escape. */
  p++;
  switch (*p)
    {
    /* single-character escapes, stored as REPL_TYPE_CHARACTER. */
    case 't':
      p++;
      data->c = '\t';
      data->type = REPL_TYPE_CHARACTER;
      break;
    case 'n':
      p++;
      data->c = '\n';
      data->type = REPL_TYPE_CHARACTER;
      break;
    case 'v':
      p++;
      data->c = '\v';
      data->type = REPL_TYPE_CHARACTER;
      break;
    case 'r':
      p++;
      data->c = '\r';
      data->type = REPL_TYPE_CHARACTER;
      break;
    case 'f':
      p++;
      data->c = '\f';
      data->type = REPL_TYPE_CHARACTER;
      break;
    case 'a':
      p++;
      data->c = '\a';
      data->type = REPL_TYPE_CHARACTER;
      break;
    case 'b':
      p++;
      data->c = '\b';
      data->type = REPL_TYPE_CHARACTER;
      break;
    case '\\':
      p++;
      data->c = '\\';
      data->type = REPL_TYPE_CHARACTER;
      break;
    /* hexadecimal character code: either "\x{h...}" with at least one
     * digit, or "\xhh" with exactly two digits. The resulting code point
     * is stored UTF-8 encoded as a REPL_TYPE_STRING. */
    case 'x':
      p++;
      x = 0;
      if (*p == '{')
        {
          p++;
          /* do/while: an empty "{}" is rejected because '}' is not a
           * hexadecimal digit; the end of the string is rejected the
           * same way. */
          do
            {
              h = g_ascii_xdigit_value (*p);
              if (h < 0)
                {
                  error_detail = _("hexadecimal digit or '}' expected");
                  goto error;
                }
              x = x * 16 + h;
              p++;
            }
          while (*p != '}');
          /* skip the closing '}'. */
          p++;
        }
      else
        {
          for (i = 0; i < 2; i++)
            {
              h = g_ascii_xdigit_value (*p);
              if (h < 0)
                {
                  error_detail = _("hexadecimal digit expected");
                  goto error;
                }
              x = x * 16 + h;
              p++;
            }
        }
      data->type = REPL_TYPE_STRING;
      data->text = g_new0 (gchar, 8);
      g_unichar_to_utf8 (x, data->text);
      break;
    /* case-conversion switches, stored as REPL_TYPE_CHANGE_CASE. */
    case 'l':
      p++;
      data->type = REPL_TYPE_CHANGE_CASE;
      data->change_case = CHANGE_CASE_LOWER_SINGLE;
      break;
    case 'u':
      p++;
      data->type = REPL_TYPE_CHANGE_CASE;
      data->change_case = CHANGE_CASE_UPPER_SINGLE;
      break;
    case 'L':
      p++;
      data->type = REPL_TYPE_CHANGE_CASE;
      data->change_case = CHANGE_CASE_LOWER;
      break;
    case 'U':
      p++;
      data->type = REPL_TYPE_CHANGE_CASE;
      data->change_case = CHANGE_CASE_UPPER;
      break;
    case 'E':
      p++;
      data->type = REPL_TYPE_CHANGE_CASE;
      data->change_case = CHANGE_CASE_NONE;
      break;
    /* "\g<...>": a reference to a subexpression, by number when the text
     * between the angle brackets starts with a digit, by name otherwise. */
    case 'g':
      p++;
      if (*p != '<')
        {
          error_detail = _("missing '<' in symbolic reference");
          goto error;
        }
      /* q marks the first character after '<'; p is advanced to the
       * closing '>'. */
      q = p + 1;
      do
        {
          p++;
          if (!*p)
            {
              error_detail = _("unfinished symbolic reference");
              goto error;
            }
        }
      while (*p != '>');
      if (p - q == 0)
        {
          error_detail = _("zero-length symbolic reference");
          goto error;
        }
      if (g_ascii_isdigit (*q))
        {
          /* numeric reference: every character up to '>' must be a
           * decimal digit. */
          x = 0;
          do
            {
              h = g_ascii_digit_value (*q);
              if (h < 0)
                {
                  error_detail = _("digit expected");
                  /* report the offset of the offending character. */
                  p = q;
                  goto error;
                }
              x = x * 10 + h;
              q++;
            }
          while (q != p);
          data->num = x;
          data->type = REPL_TYPE_NUMERIC_REFERENCE;
        }
      else
        {
          /* named reference: only ASCII alphanumeric characters are
           * accepted in the name. */
          r = q;
          do
            {
              if (!g_ascii_isalnum (*r))
                {
                  error_detail = _("illegal symbolic reference");
                  /* report the offset of the offending character. */
                  p = r;
                  goto error;
                }
              r++;
            }
          while (r != p);
          data->text = g_strndup (q, p - q);
          data->type = REPL_TYPE_SYMBOLIC_REFERENCE;
        }
      /* skip the closing '>'. */
      p++;
      break;
    case '0':
      /* if \0 is followed by a number is an octal number representing a
       * character, else it is a numeric reference. */
      if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0)
        {
          base = 8;
          p = g_utf8_next_char (p);
        }
      /* fall through: the digits are parsed by the shared code below. */
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
      /* Up to three digits are read; x accumulates them as an octal
       * value and d as a decimal value, since which one is wanted is
       * only known once the loop has finished. */
      x = 0;
      d = 0;
      for (i = 0; i < 3; i++)
        {
          h = g_ascii_digit_value (*p);
          if (h < 0)
            break;
          if (h > 7)
            {
              /* 8 or 9 ends an octal number, otherwise it forces the
               * decimal interpretation. */
              if (base == 8)
                break;
              else
                base = 10;
            }
          /* a decimal reference takes at most two digits. */
          if (i == 2 && base == 10)
            break;
          x = x * 8 + h;
          d = d * 10 + h;
          p++;
        }
      /* an explicit "\0" prefix or three consumed digits denote an octal
       * character code; anything else is a numeric reference. */
      if (base == 8 || i == 3)
        {
          data->type = REPL_TYPE_STRING;
          data->text = g_new0 (gchar, 8);
          g_unichar_to_utf8 (x, data->text);
        }
      else
        {
          data->type = REPL_TYPE_NUMERIC_REFERENCE;
          data->num = d;
        }
      break;
    case 0:
      /* the backslash was the last character of the replacement. */
      error_detail = _("stray final '\\'");
      goto error;
      break;
    default:
      error_detail = _("unknown escape sequence");
      goto error;
    }

  return p;

 error:
  /* p points at the offending character here, so p - replacement is the
   * offset shown in the message. */
  /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */
  tmp_error = g_error_new (G_REGEX_ERROR,
                           G_REGEX_ERROR_REPLACE,
                           _("Error while parsing replacement "
                             "text \"%s\" at char %lu: %s"),
                           replacement,
                           (gulong)(p - replacement),
                           error_detail);
  g_propagate_error (error, tmp_error);

  return NULL;
}
2414
2415 static GList *
2416 split_replacement (const gchar  *replacement,
2417                    GError      **error)
2418 {
2419   GList *list = NULL;
2420   InterpolationData *data;
2421   const gchar *p, *start;
2422
2423   start = p = replacement;
2424   while (*p)
2425     {
2426       if (*p == '\\')
2427         {
2428           data = g_new0 (InterpolationData, 1);
2429           start = p = expand_escape (replacement, p, data, error);
2430           if (p == NULL)
2431             {
2432               g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
2433               free_interpolation_data (data);
2434
2435               return NULL;
2436             }
2437           list = g_list_prepend (list, data);
2438         }
2439       else
2440         {
2441           p++;
2442           if (*p == '\\' || *p == '\0')
2443             {
2444               if (p - start > 0)
2445                 {
2446                   data = g_new0 (InterpolationData, 1);
2447                   data->text = g_strndup (start, p - start);
2448                   data->type = REPL_TYPE_STRING;
2449                   list = g_list_prepend (list, data);
2450                 }
2451             }
2452         }
2453     }
2454
2455   return g_list_reverse (list);
2456 }
2457
/* Change the case of c based on change_case.
 *
 * Yields g_unichar_tolower (c) when any bit of CHANGE_CASE_LOWER_MASK is
 * set in change_case and g_unichar_toupper (c) for every other value --
 * including CHANGE_CASE_NONE, so code that must leave text untouched has
 * to test for CHANGE_CASE_NONE itself. Each argument is evaluated exactly
 * once. */
#define CHANGE_CASE(c, change_case) \
        (((change_case) & CHANGE_CASE_LOWER_MASK) ? \
                g_unichar_tolower (c) : \
                g_unichar_toupper (c))
2463
2464 static void
2465 string_append (GString     *string,
2466                const gchar *text,
2467                ChangeCase  *change_case)
2468 {
2469   gunichar c;
2470
2471   if (text[0] == '\0')
2472     return;
2473
2474   if (*change_case == CHANGE_CASE_NONE)
2475     {
2476       g_string_append (string, text);
2477     }
2478   else if (*change_case & CHANGE_CASE_SINGLE_MASK)
2479     {
2480       c = g_utf8_get_char (text);
2481       g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
2482       g_string_append (string, g_utf8_next_char (text));
2483       *change_case = CHANGE_CASE_NONE;
2484     }
2485   else
2486     {
2487       while (*text != '\0')
2488         {
2489           c = g_utf8_get_char (text);
2490           g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
2491           text = g_utf8_next_char (text);
2492         }
2493     }
2494 }
2495
2496 static gboolean
2497 interpolate_replacement (const GMatchInfo *match_info,
2498                          GString          *result,
2499                          gpointer          data)
2500 {
2501   GList *list;
2502   InterpolationData *idata;
2503   gchar *match;
2504   ChangeCase change_case = CHANGE_CASE_NONE;
2505
2506   for (list = data; list; list = list->next)
2507     {
2508       idata = list->data;
2509       switch (idata->type)
2510         {
2511         case REPL_TYPE_STRING:
2512           string_append (result, idata->text, &change_case);
2513           break;
2514         case REPL_TYPE_CHARACTER:
2515           g_string_append_c (result, CHANGE_CASE (idata->c, change_case));
2516           if (change_case & CHANGE_CASE_SINGLE_MASK)
2517             change_case = CHANGE_CASE_NONE;
2518           break;
2519         case REPL_TYPE_NUMERIC_REFERENCE:
2520           match = g_match_info_fetch (match_info, idata->num);
2521           if (match)
2522             {
2523               string_append (result, match, &change_case);
2524               g_free (match);
2525             }
2526           break;
2527         case REPL_TYPE_SYMBOLIC_REFERENCE:
2528           match = g_match_info_fetch_named (match_info, idata->text);
2529           if (match)
2530             {
2531               string_append (result, match, &change_case);
2532               g_free (match);
2533             }
2534           break;
2535         case REPL_TYPE_CHANGE_CASE:
2536           change_case = idata->change_case;
2537           break;
2538         }
2539     }
2540
2541   return FALSE;
2542 }
2543
2544 /* whether actual match_info is needed for replacement, i.e.
2545  * whether there are references
2546  */
2547 static gboolean
2548 interpolation_list_needs_match (GList *list)
2549 {
2550   while (list != NULL)
2551     {
2552       InterpolationData *data = list->data;
2553
2554       if (data->type == REPL_TYPE_SYMBOLIC_REFERENCE ||
2555           data->type == REPL_TYPE_NUMERIC_REFERENCE)
2556         {
2557           return TRUE;
2558         }
2559
2560       list = list->next;
2561     }
2562
2563   return FALSE;
2564 }
2565
2566 /**
2567  * g_regex_replace:
2568  * @regex: a #GRegex structure
2569  * @string: (array length=string_len): the string to perform matches against
2570  * @string_len: the length of @string, or -1 if @string is nul-terminated
2571  * @start_position: starting index of the string to match
2572  * @replacement: text to replace each match with
2573  * @match_options: options for the match
2574  * @error: location to store the error occurring, or %NULL to ignore errors
2575  *
2576  * Replaces all occurrences of the pattern in @regex with the
2577  * replacement text. Backreferences of the form '\number' or
2578  * '\g&lt;number&gt;' in the replacement text are interpolated by the
2579  * number-th captured subexpression of the match, '\g&lt;name&gt;' refers
2580  * to the captured subexpression with the given name. '\0' refers to the
2581  * complete match, but '\0' followed by a number is the octal representation
2582  * of a character. To include a literal '\' in the replacement, write '\\'.
2583  * There are also escapes that changes the case of the following text:
2584  *
2585  * <variablelist>
2586  * <varlistentry><term>\l</term>
2587  * <listitem>
2588  * <para>Convert to lower case the next character</para>
2589  * </listitem>
2590  * </varlistentry>
2591  * <varlistentry><term>\u</term>
2592  * <listitem>
2593  * <para>Convert to upper case the next character</para>
2594  * </listitem>
2595  * </varlistentry>
2596  * <varlistentry><term>\L</term>
2597  * <listitem>
2598  * <para>Convert to lower case till \E</para>
2599  * </listitem>
2600  * </varlistentry>
2601  * <varlistentry><term>\U</term>
2602  * <listitem>
2603  * <para>Convert to upper case till \E</para>
2604  * </listitem>
2605  * </varlistentry>
2606  * <varlistentry><term>\E</term>
2607  * <listitem>
2608  * <para>End case modification</para>
2609  * </listitem>
2610  * </varlistentry>
2611  * </variablelist>
2612  *
2613  * If you do not need to use backreferences use g_regex_replace_literal().
2614  *
2615  * The @replacement string must be UTF-8 encoded even if #G_REGEX_RAW was
2616  * passed to g_regex_new(). If you want to use not UTF-8 encoded stings
2617  * you can use g_regex_replace_literal().
2618  *
2619  * Setting @start_position differs from just passing over a shortened
2620  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that
2621  * begins with any kind of lookbehind assertion, such as "\b".
2622  *
2623  * Returns: a newly allocated string containing the replacements
2624  *
2625  * Since: 2.14
2626  */
2627 gchar *
2628 g_regex_replace (const GRegex      *regex,
2629                  const gchar       *string,
2630                  gssize             string_len,
2631                  gint               start_position,
2632                  const gchar       *replacement,
2633                  GRegexMatchFlags   match_options,
2634                  GError           **error)
2635 {
2636   gchar *result;
2637   GList *list;
2638   GError *tmp_error = NULL;
2639
2640   g_return_val_if_fail (regex != NULL, NULL);
2641   g_return_val_if_fail (string != NULL, NULL);
2642   g_return_val_if_fail (start_position >= 0, NULL);
2643   g_return_val_if_fail (replacement != NULL, NULL);
2644   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
2645   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2646
2647   list = split_replacement (replacement, &tmp_error);
2648   if (tmp_error != NULL)
2649     {
2650       g_propagate_error (error, tmp_error);
2651       return NULL;
2652     }
2653
2654   result = g_regex_replace_eval (regex,
2655                                  string, string_len, start_position,
2656                                  match_options,
2657                                  interpolate_replacement,
2658                                  (gpointer)list,
2659                                  &tmp_error);
2660   if (tmp_error != NULL)
2661     g_propagate_error (error, tmp_error);
2662
2663   g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
2664
2665   return result;
2666 }
2667
2668 static gboolean
2669 literal_replacement (const GMatchInfo *match_info,
2670                      GString          *result,
2671                      gpointer          data)
2672 {
2673   g_string_append (result, data);
2674   return FALSE;
2675 }
2676
2677 /**
2678  * g_regex_replace_literal:
2679  * @regex: a #GRegex structure
2680  * @string: (array length=string_len): the string to perform matches against
2681  * @string_len: the length of @string, or -1 if @string is nul-terminated
2682  * @start_position: starting index of the string to match
2683  * @replacement: text to replace each match with
2684  * @match_options: options for the match
2685  * @error: location to store the error occurring, or %NULL to ignore errors
2686  *
2687  * Replaces all occurrences of the pattern in @regex with the
2688  * replacement text. @replacement is replaced literally, to
2689  * include backreferences use g_regex_replace().
2690  *
2691  * Setting @start_position differs from just passing over a
2692  * shortened string and setting #G_REGEX_MATCH_NOTBOL in the
2693  * case of a pattern that begins with any kind of lookbehind
2694  * assertion, such as "\b".
2695  *
2696  * Returns: a newly allocated string containing the replacements
2697  *
2698  * Since: 2.14
2699  */
2700 gchar *
2701 g_regex_replace_literal (const GRegex      *regex,
2702                          const gchar       *string,
2703                          gssize             string_len,
2704                          gint               start_position,
2705                          const gchar       *replacement,
2706                          GRegexMatchFlags   match_options,
2707                          GError           **error)
2708 {
2709   g_return_val_if_fail (replacement != NULL, NULL);
2710   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2711
2712   return g_regex_replace_eval (regex,
2713                                string, string_len, start_position,
2714                                match_options,
2715                                literal_replacement,
2716                                (gpointer)replacement,
2717                                error);
2718 }
2719
2720 /**
2721  * g_regex_replace_eval:
2722  * @regex: a #GRegex structure from g_regex_new()
2723  * @string: (array length=string_len): string to perform matches against
2724  * @string_len: the length of @string, or -1 if @string is nul-terminated
2725  * @start_position: starting index of the string to match
2726  * @match_options: options for the match
2727  * @eval: a function to call for each match
2728  * @user_data: user data to pass to the function
2729  * @error: location to store the error occurring, or %NULL to ignore errors
2730  *
2731  * Replaces occurrences of the pattern in regex with the output of
2732  * @eval for that occurrence.
2733  *
2734  * Setting @start_position differs from just passing over a shortened
2735  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
2736  * that begins with any kind of lookbehind assertion, such as "\b".
2737  *
2738  * The following example uses g_regex_replace_eval() to replace multiple
2739  * strings at once:
2740  * |[
2741  * static gboolean
2742  * eval_cb (const GMatchInfo *info,
2743  *          GString          *res,
2744  *          gpointer          data)
2745  * {
2746  *   gchar *match;
2747  *   gchar *r;
2748  *
2749  *    match = g_match_info_fetch (info, 0);
2750  *    r = g_hash_table_lookup ((GHashTable *)data, match);
2751  *    g_string_append (res, r);
2752  *    g_free (match);
2753  *
2754  *    return FALSE;
2755  * }
2756  *
2757  * /&ast; ... &ast;/
2758  *
2759  * GRegex *reg;
2760  * GHashTable *h;
2761  * gchar *res;
2762  *
2763  * h = g_hash_table_new (g_str_hash, g_str_equal);
2764  *
2765  * g_hash_table_insert (h, "1", "ONE");
2766  * g_hash_table_insert (h, "2", "TWO");
2767  * g_hash_table_insert (h, "3", "THREE");
2768  * g_hash_table_insert (h, "4", "FOUR");
2769  *
2770  * reg = g_regex_new ("1|2|3|4", 0, 0, NULL);
2771  * res = g_regex_replace_eval (reg, text, -1, 0, 0, eval_cb, h, NULL);
2772  * g_hash_table_destroy (h);
2773  *
2774  * /&ast; ... &ast;/
2775  * ]|
2776  *
2777  * Returns: a newly allocated string containing the replacements
2778  *
2779  * Since: 2.14
2780  */
2781 gchar *
2782 g_regex_replace_eval (const GRegex        *regex,
2783                       const gchar         *string,
2784                       gssize               string_len,
2785                       gint                 start_position,
2786                       GRegexMatchFlags     match_options,
2787                       GRegexEvalCallback   eval,
2788                       gpointer             user_data,
2789                       GError             **error)
2790 {
2791   GMatchInfo *match_info;
2792   GString *result;
2793   gint str_pos = 0;
2794   gboolean done = FALSE;
2795   GError *tmp_error = NULL;
2796
2797   g_return_val_if_fail (regex != NULL, NULL);
2798   g_return_val_if_fail (string != NULL, NULL);
2799   g_return_val_if_fail (start_position >= 0, NULL);
2800   g_return_val_if_fail (eval != NULL, NULL);
2801   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2802
2803   if (string_len < 0)
2804     string_len = strlen (string);
2805
2806   result = g_string_sized_new (string_len);
2807
2808   /* run down the string making matches. */
2809   g_regex_match_full (regex, string, string_len, start_position,
2810                       match_options, &match_info, &tmp_error);
2811   while (!done && g_match_info_matches (match_info))
2812     {
2813       g_string_append_len (result,
2814                            string + str_pos,
2815                            match_info->offsets[0] - str_pos);
2816       done = (*eval) (match_info, result, user_data);
2817       str_pos = match_info->offsets[1];
2818       g_match_info_next (match_info, &tmp_error);
2819     }
2820   g_match_info_free (match_info);
2821   if (tmp_error != NULL)
2822     {
2823       g_propagate_error (error, tmp_error);
2824       g_string_free (result, TRUE);
2825       return NULL;
2826     }
2827
2828   g_string_append_len (result, string + str_pos, string_len - str_pos);
2829   return g_string_free (result, FALSE);
2830 }
2831
2832 /**
2833  * g_regex_check_replacement:
2834  * @replacement: the replacement string
2835  * @has_references: (out) (allow-none): location to store information about
2836  *   references in @replacement or %NULL
2837  * @error: location to store error
2838  *
2839  * Checks whether @replacement is a valid replacement string
2840  * (see g_regex_replace()), i.e. that all escape sequences in
2841  * it are valid.
2842  *
2843  * If @has_references is not %NULL then @replacement is checked
2844  * for pattern references. For instance, replacement text 'foo\n'
2845  * does not contain references and may be evaluated without information
2846  * about actual match, but '\0\1' (whole match followed by first
2847  * subpattern) requires valid #GMatchInfo object.
2848  *
2849  * Returns: whether @replacement is a valid replacement string
2850  *
2851  * Since: 2.14
2852  */
2853 gboolean
2854 g_regex_check_replacement (const gchar  *replacement,
2855                            gboolean     *has_references,
2856                            GError      **error)
2857 {
2858   GList *list;
2859   GError *tmp = NULL;
2860
2861   list = split_replacement (replacement, &tmp);
2862
2863   if (tmp)
2864   {
2865     g_propagate_error (error, tmp);
2866     return FALSE;
2867   }
2868
2869   if (has_references)
2870     *has_references = interpolation_list_needs_match (list);
2871
2872   g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
2873
2874   return TRUE;
2875 }
2876
2877 /**
2878  * g_regex_escape_nul:
2879  * @string: the string to escape
2880  * @length: the length of @string
2881  *
2882  * Escapes the nul characters in @string to "\x00".  It can be used
2883  * to compile a regex with embedded nul characters.
2884  *
2885  * For completeness, @length can be -1 for a nul-terminated string.
2886  * In this case the output string will be of course equal to @string.
2887  *
2888  * Returns: a newly-allocated escaped string
2889  *
2890  * Since: 2.30
2891  */
2892 gchar *
2893 g_regex_escape_nul (const gchar *string,
2894                     gint         length)
2895 {
2896   GString *escaped;
2897   const gchar *p, *piece_start, *end;
2898   gint backslashes;
2899
2900   g_return_val_if_fail (string != NULL, NULL);
2901
2902   if (length < 0)
2903     return g_strdup (string);
2904
2905   end = string + length;
2906   p = piece_start = string;
2907   escaped = g_string_sized_new (length + 1);
2908
2909   backslashes = 0;
2910   while (p < end)
2911     {
2912       switch (*p)
2913         {
2914         case '\0':
2915           if (p != piece_start)
2916             {
2917               /* copy the previous piece. */
2918               g_string_append_len (escaped, piece_start, p - piece_start);
2919             }
2920           if ((backslashes & 1) == 0)
2921             g_string_append_c (escaped, '\\');
2922           g_string_append_c (escaped, 'x');
2923           g_string_append_c (escaped, '0');
2924           g_string_append_c (escaped, '0');
2925           piece_start = ++p;
2926           backslashes = 0;
2927           break;
2928         case '\\':
2929           backslashes++;
2930           ++p;
2931           break;
2932         default:
2933           backslashes = 0;
2934           p = g_utf8_next_char (p);
2935           break;
2936         }
2937     }
2938
2939   if (piece_start < end)
2940     g_string_append_len (escaped, piece_start, end - piece_start);
2941
2942   return g_string_free (escaped, FALSE);
2943 }
2944
2945 /**
2946  * g_regex_escape_string:
2947  * @string: (array length=length): the string to escape
2948  * @length: the length of @string, or -1 if @string is nul-terminated
2949  *
2950  * Escapes the special characters used for regular expressions
2951  * in @string, for instance "a.b*c" becomes "a\.b\*c". This
2952  * function is useful to dynamically generate regular expressions.
2953  *
2954  * @string can contain nul characters that are replaced with "\0",
2955  * in this case remember to specify the correct length of @string
2956  * in @length.
2957  *
2958  * Returns: a newly-allocated escaped string
2959  *
2960  * Since: 2.14
2961  */
2962 gchar *
2963 g_regex_escape_string (const gchar *string,
2964                        gint         length)
2965 {
2966   GString *escaped;
2967   const char *p, *piece_start, *end;
2968
2969   g_return_val_if_fail (string != NULL, NULL);
2970
2971   if (length < 0)
2972     length = strlen (string);
2973
2974   end = string + length;
2975   p = piece_start = string;
2976   escaped = g_string_sized_new (length + 1);
2977
2978   while (p < end)
2979     {
2980       switch (*p)
2981         {
2982         case '\0':
2983         case '\\':
2984         case '|':
2985         case '(':
2986         case ')':
2987         case '[':
2988         case ']':
2989         case '{':
2990         case '}':
2991         case '^':
2992         case '$':
2993         case '*':
2994         case '+':
2995         case '?':
2996         case '.':
2997           if (p != piece_start)
2998             /* copy the previous piece. */
2999             g_string_append_len (escaped, piece_start, p - piece_start);
3000           g_string_append_c (escaped, '\\');
3001           if (*p == '\0')
3002             g_string_append_c (escaped, '0');
3003           else
3004             g_string_append_c (escaped, *p);
3005           piece_start = ++p;
3006           break;
3007         default:
3008           p = g_utf8_next_char (p);
3009           break;
3010         }
3011   }
3012
3013   if (piece_start < end)
3014     g_string_append_len (escaped, piece_start, end - piece_start);
3015
3016   return g_string_free (escaped, FALSE);
3017 }