glib/gregex.c

   1 /* GRegex -- regular expression API wrapper around PCRE.
   2  *
   3  * Copyright (C) 1999, 2000 Scott Wimer
   4  * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com>
   5  * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org>
   6  *
   7  * This library is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * This library is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with this library; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  20  */
  21
  22 #include "config.h"
  23
  24 #include <string.h>
  25
  26 #ifdef USE_SYSTEM_PCRE
  27 #include <pcre.h>
  28 #else
  29 #include "pcre/pcre.h"
  30 #endif
  31
  32 #include "gtypes.h"
  33 #include "gregex.h"
  34 #include "glibintl.h"
  35 #include "glist.h"
  36 #include "gmessages.h"
  37 #include "gstrfuncs.h"
  38 #include "gatomic.h"
  39 #include "gthread.h"
  40
  41 /**
  42  * SECTION:gregex
  43  * @title: Perl-compatible regular expressions
  44  * @short_description: matches strings against regular expressions
  45  * @see_also: <xref linkend="glib-regex-syntax"/>
  46  *
  47  * The <function>g_regex_*()</function> functions implement regular
  48  * expression pattern matching using syntax and semantics similar to
   49  * Perl regular expressions.
  50  *
   51  * Some functions accept a @start_position argument; setting it differs
   52  * from just passing over a shortened string and setting #G_REGEX_MATCH_NOTBOL
   53  * in the case of a pattern that begins with any kind of lookbehind assertion.
   54  * For example, consider the pattern "\Biss\B" which finds occurrences of "iss"
   55  * in the middle of words. ("\B" matches only if the current position in the
   56  * subject is not a word boundary.) When applied to the string "Mississippi"
   57  * starting at byte offset 4, namely "issippi", it does not match, because "\B"
   58  * is always false at the start of the subject, which is deemed to be a word
   59  * boundary. However, if the entire string is passed, but with
  60  * @start_position set to 4, it finds the second occurrence of "iss" because
  61  * it is able to look behind the starting point to discover that it is
  62  * preceded by a letter.
  63  *
  64  * Note that, unless you set the #G_REGEX_RAW flag, all the strings passed
  65  * to these functions must be encoded in UTF-8. The lengths and the positions
  66  * inside the strings are in bytes and not in characters, so, for instance,
  67  * "\xc3\xa0" (i.e. "&agrave;") is two bytes long but it is treated as a
  68  * single character. If you set #G_REGEX_RAW the strings can be non-valid
  69  * UTF-8 strings and a byte is treated as a character, so "\xc3\xa0" is two
  70  * bytes and two characters long.
  71  *
  72  * When matching a pattern, "\n" matches only against a "\n" character in
  73  * the string, and "\r" matches only a "\r" character. To match any newline
  74  * sequence use "\R". This particular group matches either the two-character
  75  * sequence CR + LF ("\r\n"), or one of the single characters LF (linefeed,
   76  * U+000A, "\n"), VT (vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"),
  77  * CR (carriage return, U+000D, "\r"), NEL (next line, U+0085), LS (line
  78  * separator, U+2028), or PS (paragraph separator, U+2029).
  79  *
   80  * The behaviour of the dot, circumflex, and dollar metacharacters is
   81  * affected by newline characters; the default is to recognize any newline
   82  * character (the same characters recognized by "\R"). This can be changed
  83  * with #G_REGEX_NEWLINE_CR, #G_REGEX_NEWLINE_LF and #G_REGEX_NEWLINE_CRLF
  84  * compile options, and with #G_REGEX_MATCH_NEWLINE_ANY,
  85  * #G_REGEX_MATCH_NEWLINE_CR, #G_REGEX_MATCH_NEWLINE_LF and
  86  * #G_REGEX_MATCH_NEWLINE_CRLF match options. These settings are also
  87  * relevant when compiling a pattern if #G_REGEX_EXTENDED is set, and an
  88  * unescaped "#" outside a character class is encountered. This indicates
  89  * a comment that lasts until after the next newline.
  90  *
   91  * Creating and manipulating the same #GRegex structure from different
   92  * threads is not a problem, as #GRegex does not modify its internal
   93  * state between creation and destruction. #GMatchInfo, on the other
   94  * hand, is not threadsafe.
  95  *
   96  * The low-level regular expression functionality is provided by
   97  * the excellent <ulink url="http://www.pcre.org/">PCRE</ulink> library
   98  * written by Philip Hazel.
  99  */
 100
 101 /* Mask of all the possible values for GRegexCompileFlags. */
 102 #define G_REGEX_COMPILE_MASK (G_REGEX_CASELESS          | \
 103                               G_REGEX_MULTILINE         | \
 104                               G_REGEX_DOTALL            | \
 105                               G_REGEX_EXTENDED          | \
 106                               G_REGEX_ANCHORED          | \
 107                               G_REGEX_DOLLAR_ENDONLY    | \
 108                               G_REGEX_UNGREEDY          | \
 109                               G_REGEX_RAW               | \
 110                               G_REGEX_NO_AUTO_CAPTURE   | \
 111                               G_REGEX_OPTIMIZE          | \
 112                               G_REGEX_DUPNAMES          | \
 113                               G_REGEX_NEWLINE_CR        | \
 114                               G_REGEX_NEWLINE_LF        | \
 115                               G_REGEX_NEWLINE_CRLF      | \
 116                               G_REGEX_NEWLINE_ANYCRLF   | \
 117                               G_REGEX_BSR_ANYCRLF)
 118
 119 /* Mask of all GRegexCompileFlags values that are (not) passed trough to PCRE */
 120 #define G_REGEX_COMPILE_PCRE_MASK (G_REGEX_COMPILE_MASK & ~G_REGEX_COMPILE_NONPCRE_MASK)
 121 #define G_REGEX_COMPILE_NONPCRE_MASK (G_REGEX_RAW              | \
 122                                       G_REGEX_OPTIMIZE)
 123
 124 /* Mask of all the possible values for GRegexMatchFlags. */
 125 #define G_REGEX_MATCH_MASK (G_REGEX_MATCH_ANCHORED         | \
 126                             G_REGEX_MATCH_NOTBOL           | \
 127                             G_REGEX_MATCH_NOTEOL           | \
 128                             G_REGEX_MATCH_NOTEMPTY         | \
 129                             G_REGEX_MATCH_PARTIAL          | \
 130                             G_REGEX_MATCH_NEWLINE_CR       | \
 131                             G_REGEX_MATCH_NEWLINE_LF       | \
 132                             G_REGEX_MATCH_NEWLINE_CRLF     | \
 133                             G_REGEX_MATCH_NEWLINE_ANY      | \
 134                             G_REGEX_MATCH_NEWLINE_ANYCRLF  | \
 135                             G_REGEX_MATCH_BSR_ANYCRLF      | \
 136                             G_REGEX_MATCH_BSR_ANY)
 137
 138 /* we rely on these flags having the same values */
 139 G_STATIC_ASSERT (G_REGEX_CASELESS        == PCRE_CASELESS);
 140 G_STATIC_ASSERT (G_REGEX_MULTILINE       == PCRE_MULTILINE);
 141 G_STATIC_ASSERT (G_REGEX_DOTALL          == PCRE_DOTALL);
 142 G_STATIC_ASSERT (G_REGEX_EXTENDED        == PCRE_EXTENDED);
 143 G_STATIC_ASSERT (G_REGEX_ANCHORED        == PCRE_ANCHORED);
 144 G_STATIC_ASSERT (G_REGEX_DOLLAR_ENDONLY  == PCRE_DOLLAR_ENDONLY);
 145 G_STATIC_ASSERT (G_REGEX_UNGREEDY        == PCRE_UNGREEDY);
 146 G_STATIC_ASSERT (G_REGEX_NO_AUTO_CAPTURE == PCRE_NO_AUTO_CAPTURE);
 147 G_STATIC_ASSERT (G_REGEX_DUPNAMES        == PCRE_DUPNAMES);
 148 G_STATIC_ASSERT (G_REGEX_NEWLINE_CR      == PCRE_NEWLINE_CR);
 149 G_STATIC_ASSERT (G_REGEX_NEWLINE_LF      == PCRE_NEWLINE_LF);
 150 G_STATIC_ASSERT (G_REGEX_NEWLINE_CRLF    == PCRE_NEWLINE_CRLF);
 151 G_STATIC_ASSERT (G_REGEX_NEWLINE_ANYCRLF == PCRE_NEWLINE_ANYCRLF);
 152 G_STATIC_ASSERT (G_REGEX_BSR_ANYCRLF     == PCRE_BSR_ANYCRLF);
 153
 154 G_STATIC_ASSERT (G_REGEX_MATCH_ANCHORED        == PCRE_ANCHORED);
 155 G_STATIC_ASSERT (G_REGEX_MATCH_NOTBOL          == PCRE_NOTBOL);
 156 G_STATIC_ASSERT (G_REGEX_MATCH_NOTEOL          == PCRE_NOTEOL);
 157 G_STATIC_ASSERT (G_REGEX_MATCH_NOTEMPTY        == PCRE_NOTEMPTY);
 158 G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL         == PCRE_PARTIAL);
 159 G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_CR      == PCRE_NEWLINE_CR);
 160 G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_LF      == PCRE_NEWLINE_LF);
 161 G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_CRLF    == PCRE_NEWLINE_CRLF);
 162 G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_ANY     == PCRE_NEWLINE_ANY);
 163 G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_ANYCRLF == PCRE_NEWLINE_ANYCRLF);
 164 G_STATIC_ASSERT (G_REGEX_MATCH_BSR_ANYCRLF     == PCRE_BSR_ANYCRLF);
 165 G_STATIC_ASSERT (G_REGEX_MATCH_BSR_ANY         == PCRE_BSR_UNICODE);
 166
 167 /* These PCRE flags are unused or not exposed publically in GRegexFlags, so
 168  * it should be ok to reuse them for different things.
 169  */
 170 G_STATIC_ASSERT (G_REGEX_OPTIMIZE          == PCRE_NO_UTF8_CHECK);
 171 G_STATIC_ASSERT (G_REGEX_RAW               == PCRE_UTF8);
 172
 173 /* if the string is in UTF-8 use g_utf8_ functions, else use
 174  * use just +/- 1. */
 175 #define NEXT_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? \
 176                                 ((s) + 1) : \
 177                                 g_utf8_next_char (s))
 178 #define PREV_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? \
 179                                 ((s) - 1) : \
 180                                 g_utf8_prev_char (s))
 181
 182 struct _GMatchInfo
 183 {
 184   volatile gint ref_count;      /* the ref count */
 185   GRegex *regex;                /* the regex */
 186   GRegexMatchFlags match_opts;  /* options used at match time on the regex */
 187   gint matches;                 /* number of matching sub patterns */
 188   gint pos;                     /* position in the string where last match left off */
 189   gint  n_offsets;              /* number of offsets */
 190   gint *offsets;                /* array of offsets paired 0,1 ; 2,3 ; 3,4 etc */
 191   gint *workspace;              /* workspace for pcre_dfa_exec() */
 192   gint n_workspace;             /* number of workspace elements */
 193   const gchar *string;          /* string passed to the match function */
 194   gssize string_len;            /* length of string */
 195 };
 196
 197 struct _GRegex
 198 {
 199   volatile gint ref_count;      /* the ref count for the immutable part */
 200   gchar *pattern;               /* the pattern */
 201   pcre *pcre_re;                /* compiled form of the pattern */
 202   GRegexCompileFlags compile_opts;      /* options used at compile time on the pattern */
 203   GRegexMatchFlags match_opts;  /* options used at match time on the regex */
 204   pcre_extra *extra;            /* data stored when G_REGEX_OPTIMIZE is used */
 205 };
 206
 207 /* TRUE if ret is an error code, FALSE otherwise. */
 208 #define IS_PCRE_ERROR(ret) ((ret) < PCRE_ERROR_NOMATCH && (ret) != PCRE_ERROR_PARTIAL)
 209
 210 typedef struct _InterpolationData InterpolationData;
 211 static gboolean  interpolation_list_needs_match (GList *list);
 212 static gboolean  interpolate_replacement        (const GMatchInfo *match_info,
 213                                                  GString *result,
 214                                                  gpointer data);
 215 static GList    *split_replacement              (const gchar *replacement,
 216                                                  GError **error);
 217 static void      free_interpolation_data        (InterpolationData *data);
 218
 219
 220 static const gchar *
 221 match_error (gint errcode)
 222 {
 223   switch (errcode)
 224     {
 225     case PCRE_ERROR_NOMATCH:
 226       /* not an error */
 227       break;
 228     case PCRE_ERROR_NULL:
 229       /* NULL argument, this should not happen in GRegex */
 230       g_warning ("A NULL argument was passed to PCRE");
 231       break;
 232     case PCRE_ERROR_BADOPTION:
 233       return "bad options";
 234     case PCRE_ERROR_BADMAGIC:
 235       return _("corrupted object");
 236     case PCRE_ERROR_UNKNOWN_OPCODE:
 237       return N_("internal error or corrupted object");
 238     case PCRE_ERROR_NOMEMORY:
 239       return _("out of memory");
 240     case PCRE_ERROR_NOSUBSTRING:
 241       /* not used by pcre_exec() */
 242       break;
 243     case PCRE_ERROR_MATCHLIMIT:
 244       return _("backtracking limit reached");
 245     case PCRE_ERROR_CALLOUT:
 246       /* callouts are not implemented */
 247       break;
 248     case PCRE_ERROR_BADUTF8:
 249     case PCRE_ERROR_BADUTF8_OFFSET:
 250       /* we do not check if strings are valid */
 251       break;
 252     case PCRE_ERROR_PARTIAL:
 253       /* not an error */
 254       break;
 255     case PCRE_ERROR_BADPARTIAL:
 256       return _("the pattern contains items not supported for partial matching");
 257     case PCRE_ERROR_INTERNAL:
 258       return _("internal error");
 259     case PCRE_ERROR_BADCOUNT:
 260       /* negative ovecsize, this should not happen in GRegex */
 261       g_warning ("A negative ovecsize was passed to PCRE");
 262       break;
 263     case PCRE_ERROR_DFA_UITEM:
 264       return _("the pattern contains items not supported for partial matching");
 265     case PCRE_ERROR_DFA_UCOND:
 266       return _("back references as conditions are not supported for partial matching");
 267     case PCRE_ERROR_DFA_UMLIMIT:
 268       /* the match_field field is not used in GRegex */
 269       break;
 270     case PCRE_ERROR_DFA_WSSIZE:
 271       /* handled expanding the workspace */
 272       break;
 273     case PCRE_ERROR_DFA_RECURSE:
 274     case PCRE_ERROR_RECURSIONLIMIT:
 275       return _("recursion limit reached");
 276     case PCRE_ERROR_BADNEWLINE:
 277       return _("invalid combination of newline flags");
 278     case PCRE_ERROR_BADOFFSET:
 279       return _("bad offset");
 280     case PCRE_ERROR_SHORTUTF8:
 281       return _("short utf8");
 282     case PCRE_ERROR_RECURSELOOP:
 283       return _("recursion loop");
 284     default:
 285       break;
 286     }
 287   return _("unknown error");
 288 }
 289
 290 static void
 291 translate_compile_error (gint *errcode, const gchar **errmsg)
 292 {
 293   /* Compile errors are created adding 100 to the error code returned
 294    * by PCRE.
 295    * If errcode is known we put the translatable error message in
 296    * erromsg. If errcode is unknown we put the generic
 297    * G_REGEX_ERROR_COMPILE error code in errcode and keep the
 298    * untranslated error message returned by PCRE.
 299    * Note that there can be more PCRE errors with the same GRegexError
 300    * and that some PCRE errors are useless for us.
 301    */
 302   *errcode += 100;
 303
 304   switch (*errcode)
 305     {
 306     case G_REGEX_ERROR_STRAY_BACKSLASH:
 307       *errmsg = _("\\ at end of pattern");
 308       break;
 309     case G_REGEX_ERROR_MISSING_CONTROL_CHAR:
 310       *errmsg = _("\\c at end of pattern");
 311       break;
 312     case G_REGEX_ERROR_UNRECOGNIZED_ESCAPE:
 313       *errmsg = _("unrecognized character follows \\");
 314       break;
 315     case G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER:
 316       *errmsg = _("numbers out of order in {} quantifier");
 317       break;
 318     case G_REGEX_ERROR_QUANTIFIER_TOO_BIG:
 319       *errmsg = _("number too big in {} quantifier");
 320       break;
 321     case G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS:
 322       *errmsg = _("missing terminating ] for character class");
 323       break;
 324     case G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS:
 325       *errmsg = _("invalid escape sequence in character class");
 326       break;
 327     case G_REGEX_ERROR_RANGE_OUT_OF_ORDER:
 328       *errmsg = _("range out of order in character class");
 329       break;
 330     case G_REGEX_ERROR_NOTHING_TO_REPEAT:
 331       *errmsg = _("nothing to repeat");
 332       break;
 333     case 111: /* internal error: unexpected repeat */
 334       *errcode = G_REGEX_ERROR_INTERNAL;
 335       *errmsg = _("unexpected repeat");
 336       break;
 337     case G_REGEX_ERROR_UNRECOGNIZED_CHARACTER:
 338       *errmsg = _("unrecognized character after (? or (?-");
 339       break;
 340     case G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS:
 341       *errmsg = _("POSIX named classes are supported only within a class");
 342       break;
 343     case G_REGEX_ERROR_UNMATCHED_PARENTHESIS:
 344       *errmsg = _("missing terminating )");
 345       break;
 346     case G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE:
 347       *errmsg = _("reference to non-existent subpattern");
 348       break;
 349     case G_REGEX_ERROR_UNTERMINATED_COMMENT:
 350       *errmsg = _("missing ) after comment");
 351       break;
 352     case G_REGEX_ERROR_EXPRESSION_TOO_LARGE:
 353       *errmsg = _("regular expression is too large");
 354       break;
 355     case G_REGEX_ERROR_MEMORY_ERROR:
 356       *errmsg = _("failed to get memory");
 357       break;
 358     case 122: /* unmatched parentheses */
 359       *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
 360       *errmsg = _(") without opening (");
 361       break;
 362     case 123: /* internal error: code overflow */
 363       *errcode = G_REGEX_ERROR_INTERNAL;
 364       *errmsg = _("code overflow");
 365       break;
 366     case 124: /* "unrecognized character after (?<\0 */
 367       *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
 368       *errmsg = _("unrecognized character after (?<");
 369       break;
 370     case G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND:
 371       *errmsg = _("lookbehind assertion is not fixed length");
 372       break;
 373     case G_REGEX_ERROR_MALFORMED_CONDITION:
 374       *errmsg = _("malformed number or name after (?(");
 375       break;
 376     case G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES:
 377       *errmsg = _("conditional group contains more than two branches");
 378       break;
 379     case G_REGEX_ERROR_ASSERTION_EXPECTED:
 380       *errmsg = _("assertion expected after (?(");
 381       break;
 382     case 129:
 383       *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
 384       /* translators: '(?R' and '(?[+-]digits' are both meant as (groups of)
 385        * sequences here, '(?-54' would be an example for the second group.
 386        */
 387       *errmsg = _("(?R or (?[+-]digits must be followed by )");
 388       break;
 389     case G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME:
 390       *errmsg = _("unknown POSIX class name");
 391       break;
 392     case G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED:
 393       *errmsg = _("POSIX collating elements are not supported");
 394       break;
 395     case G_REGEX_ERROR_HEX_CODE_TOO_LARGE:
 396       *errmsg = _("character value in \\x{...} sequence is too large");
 397       break;
 398     case G_REGEX_ERROR_INVALID_CONDITION:
 399       *errmsg = _("invalid condition (?(0)");
 400       break;
 401     case G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND:
 402       *errmsg = _("\\C not allowed in lookbehind assertion");
 403       break;
 404     case 137: /* PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0 */
 405       /* A number of Perl escapes are not handled by PCRE.
 406        * Therefore it explicitly raises ERR37.
 407        */
 408       *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE;
 409       *errmsg = _("escapes \\L, \\l, \\N{name}, \\U, and \\u are not supported");
 410       break;
 411     case G_REGEX_ERROR_INFINITE_LOOP:
 412       *errmsg = _("recursive call could loop indefinitely");
 413       break;
 414     case 141: /* unrecognized character after (?P\0 */
 415       *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
 416       *errmsg = _("unrecognized character after (?P");
 417       break;
 418     case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR:
 419       *errmsg = _("missing terminator in subpattern name");
 420       break;
 421     case G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME:
 422       *errmsg = _("two named subpatterns have the same name");
 423       break;
 424     case G_REGEX_ERROR_MALFORMED_PROPERTY:
 425       *errmsg = _("malformed \\P or \\p sequence");
 426       break;
 427     case G_REGEX_ERROR_UNKNOWN_PROPERTY:
 428       *errmsg = _("unknown property name after \\P or \\p");
 429       break;
 430     case G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG:
 431       *errmsg = _("subpattern name is too long (maximum 32 characters)");
 432       break;
 433     case G_REGEX_ERROR_TOO_MANY_SUBPATTERNS:
 434       *errmsg = _("too many named subpatterns (maximum 10,000)");
 435       break;
 436     case G_REGEX_ERROR_INVALID_OCTAL_VALUE:
 437       *errmsg = _("octal value is greater than \\377");
 438       break;
 439     case 152: /* internal error: overran compiling workspace */
 440       *errcode = G_REGEX_ERROR_INTERNAL;
 441       *errmsg = _("overran compiling workspace");
 442       break;
 443     case 153: /* internal error: previously-checked referenced subpattern not found */
 444       *errcode = G_REGEX_ERROR_INTERNAL;
 445       *errmsg = _("previously-checked referenced subpattern not found");
 446       break;
 447     case G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE:
 448       *errmsg = _("DEFINE group contains more than one branch");
 449       break;
 450     case G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS:
 451       *errmsg = _("inconsistent NEWLINE options");
 452       break;
 453     case G_REGEX_ERROR_MISSING_BACK_REFERENCE:
 454       *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or "
 455                   "number, or by a plain number");
 456       break;
 457     case G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE:
 458       *errmsg = _("a numbered reference must not be zero");
 459       break;
 460     case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN:
 461       *errmsg = _("an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)");
 462       break;
 463     case G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB:
 464       *errmsg = _("(*VERB) not recognized");
 465       break;
 466     case G_REGEX_ERROR_NUMBER_TOO_BIG:
 467       *errmsg = _("number is too bug");
 468       break;
 469     case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME:
 470       *errmsg = _("missing subpattern name after (?&");
 471       break;
 472     case G_REGEX_ERROR_MISSING_DIGIT:
 473       *errmsg = _("digit expected after (?+");
 474       break;
 475     case G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME:
 476       *errmsg = _("different names for subpatterns of the same number are not allowed");
 477       break;
 478     case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED:
 479       *errmsg = _("(*MARK) must have an argument");
 480       break;
 481     case G_REGEX_ERROR_INVALID_CONTROL_CHAR:
 482       *errmsg = _( "\\c must be followed by an ASCII character");
 483       break;
 484     case G_REGEX_ERROR_MISSING_NAME:
 485       *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name");
 486       break;
 487     case G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS:
 488       *errmsg = _("\\N is not supported in a class");
 489       break;
 490     case G_REGEX_ERROR_TOO_MANY_FORWARD_REFERENCES:
 491       *errmsg = _("too many forward references");
 492       break;
 493     case G_REGEX_ERROR_NAME_TOO_LONG:
 494       *errmsg = _("name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)");
 495       break;
 496
 497     case 116: /* erroffset passed as NULL */
 498       /* This should not happen as we never pass a NULL erroffset */
 499       g_warning ("erroffset passed as NULL");
 500       *errcode = G_REGEX_ERROR_COMPILE;
 501       break;
 502     case 117: /* unknown option bit(s) set */
 503       /* This should not happen as we check options before passing them
 504        * to pcre_compile2() */
 505       g_warning ("unknown option bit(s) set");
 506       *errcode = G_REGEX_ERROR_COMPILE;
 507       break;
 508     case 132: /* this version of PCRE is compiled without UTF support */
 509     case 144: /* invalid UTF-8 string */
 510     case 145: /* support for \\P, \\p, and \\X has not been compiled */
 511     case 167: /* this version of PCRE is not compiled with Unicode property support */
 512     case 173: /* disallowed Unicode code point (>= 0xd800 && <= 0xdfff) */
 513     case 174: /* invalid UTF-16 string */
 514       /* These errors should not happen as we are using an UTF-8 and UCP-enabled PCRE
 515        * and we do not check if strings are valid */
 516     case 164: /* ] is an invalid data character in JavaScript compatibility mode */
 517       /* This should not happen as we don't use PCRE_JAVASCRIPT_COMPAT */
 518       g_warning ("%s", *errmsg);
 519       *errcode = G_REGEX_ERROR_COMPILE;
 520       break;
 521     case 170: /* internal error: unknown opcode in find_fixedlength() */
 522       *errcode = G_REGEX_ERROR_INTERNAL;
 523       break;
 524
 525     default:
 526       *errcode = G_REGEX_ERROR_COMPILE;
 527     }
 528 }
 529
 530 /* GMatchInfo */
 531
 532 static GMatchInfo *
 533 match_info_new (const GRegex *regex,
 534                 const gchar  *string,
 535                 gint          string_len,
 536                 gint          start_position,
 537                 gint          match_options,
 538                 gboolean      is_dfa)
 539 {
 540   GMatchInfo *match_info;
 541
 542   if (string_len < 0)
 543     string_len = strlen (string);
 544
 545   match_info = g_new0 (GMatchInfo, 1);
 546   match_info->ref_count = 1;
 547   match_info->regex = g_regex_ref ((GRegex *)regex);
 548   match_info->string = string;
 549   match_info->string_len = string_len;
 550   match_info->matches = PCRE_ERROR_NOMATCH;
 551   match_info->pos = start_position;
 552   match_info->match_opts = match_options;
 553
 554   if (is_dfa)
 555     {
 556       /* These values should be enough for most cases, if they are not
 557        * enough g_regex_match_all_full() will expand them. */
 558       match_info->n_offsets = 24;
 559       match_info->n_workspace = 100;
 560       match_info->workspace = g_new (gint, match_info->n_workspace);
 561     }
 562   else
 563     {
 564       gint capture_count;
 565       pcre_fullinfo (regex->pcre_re, regex->extra,
 566                      PCRE_INFO_CAPTURECOUNT, &capture_count);
 567       match_info->n_offsets = (capture_count + 1) * 3;
 568     }
 569
 570   match_info->offsets = g_new0 (gint, match_info->n_offsets);
 571   /* Set an invalid position for the previous match. */
 572   match_info->offsets[0] = -1;
 573   match_info->offsets[1] = -1;
 574
 575   return match_info;
 576 }
 577
 578 /**
 579  * g_match_info_get_regex:
 580  * @match_info: a #GMatchInfo
 581  *
 582  * Returns #GRegex object used in @match_info. It belongs to Glib
 583  * and must not be freed. Use g_regex_ref() if you need to keep it
 584  * after you free @match_info object.
 585  *
 586  * Returns: #GRegex object used in @match_info
 587  *
 588  * Since: 2.14
 589  */
 590 GRegex *
 591 g_match_info_get_regex (const GMatchInfo *match_info)
 592 {
 593   g_return_val_if_fail (match_info != NULL, NULL);
 594   return match_info->regex;
 595 }
 596
 597 /**
 598  * g_match_info_get_string:
 599  * @match_info: a #GMatchInfo
 600  *
 601  * Returns the string searched with @match_info. This is the
 602  * string passed to g_regex_match() or g_regex_replace() so
 603  * you may not free it before calling this function.
 604  *
 605  * Returns: the string searched with @match_info
 606  *
 607  * Since: 2.14
 608  */
 609 const gchar *
 610 g_match_info_get_string (const GMatchInfo *match_info)
 611 {
 612   g_return_val_if_fail (match_info != NULL, NULL);
 613   return match_info->string;
 614 }
 615
 616 /**
 617  * g_match_info_ref:
 618  * @match_info: a #GMatchInfo
 619  *
 620  * Increases reference count of @match_info by 1.
 621  *
 622  * Returns: @match_info
 623  *
 624  * Since: 2.30
 625  */
 626 GMatchInfo       *
 627 g_match_info_ref (GMatchInfo *match_info)
 628 {
 629   g_return_val_if_fail (match_info != NULL, NULL);
 630   g_atomic_int_inc (&match_info->ref_count);
 631   return match_info;
 632 }
 633
 634 /**
 635  * g_match_info_unref:
 636  * @match_info: a #GMatchInfo
 637  *
 638  * Decreases reference count of @match_info by 1. When reference count drops
 639  * to zero, it frees all the memory associated with the match_info structure.
 640  *
 641  * Since: 2.30
 642  */
 643 void
 644 g_match_info_unref (GMatchInfo *match_info)
 645 {
 646   if (g_atomic_int_dec_and_test (&match_info->ref_count))
 647     {
 648       g_regex_unref (match_info->regex);
 649       g_free (match_info->offsets);
 650       g_free (match_info->workspace);
 651       g_free (match_info);
 652     }
 653 }
 654
 655 /**
 656  * g_match_info_free:
 657  * @match_info: (allow-none): a #GMatchInfo, or %NULL
 658  *
 659  * If @match_info is not %NULL, calls g_match_info_unref(); otherwise does
 660  * nothing.
 661  *
 662  * Since: 2.14
 663  */
 664 void
 665 g_match_info_free (GMatchInfo *match_info)
 666 {
 667   if (match_info == NULL)
 668     return;
 669
 670   g_match_info_unref (match_info);
 671 }
 672
 673 /**
 674  * g_match_info_next:
 675  * @match_info: a #GMatchInfo structure
 676  * @error: location to store the error occurring, or %NULL to ignore errors
 677  *
 678  * Scans for the next match using the same parameters of the previous
 679  * call to g_regex_match_full() or g_regex_match() that returned
 680  * @match_info.
 681  *
 682  * The match is done on the string passed to the match function, so you
 683  * cannot free it before calling this function.
 684  *
 685  * Returns: %TRUE is the string matched, %FALSE otherwise
 686  *
 687  * Since: 2.14
 688  */
 689 gboolean
 690 g_match_info_next (GMatchInfo  *match_info,
 691                    GError     **error)
 692 {
 693   gint prev_match_start;
 694   gint prev_match_end;
 695
 696   g_return_val_if_fail (match_info != NULL, FALSE);
 697   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
 698   g_return_val_if_fail (match_info->pos >= 0, FALSE);
 699
 700   prev_match_start = match_info->offsets[0];
 701   prev_match_end = match_info->offsets[1];
 702
 703   if (match_info->pos > match_info->string_len)
 704     {
 705       /* we have reached the end of the string */
 706       match_info->pos = -1;
 707       match_info->matches = PCRE_ERROR_NOMATCH;
 708       return FALSE;
 709     }
 710
 711   match_info->matches = pcre_exec (match_info->regex->pcre_re,
 712                                    match_info->regex->extra,
 713                                    match_info->string,
 714                                    match_info->string_len,
 715                                    match_info->pos,
 716                                    match_info->regex->match_opts | match_info->match_opts,
 717                                    match_info->offsets,
 718                                    match_info->n_offsets);
 719   if (IS_PCRE_ERROR (match_info->matches))
 720     {
 721       g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
 722                    _("Error while matching regular expression %s: %s"),
 723                    match_info->regex->pattern, match_error (match_info->matches));
 724       return FALSE;
 725     }
 726
 727   /* avoid infinite loops if the pattern is an empty string or something
 728    * equivalent */
 729   if (match_info->pos == match_info->offsets[1])
 730     {
 731       if (match_info->pos > match_info->string_len)
 732         {
 733           /* we have reached the end of the string */
 734           match_info->pos = -1;
 735           match_info->matches = PCRE_ERROR_NOMATCH;
 736           return FALSE;
 737         }
 738
 739       match_info->pos = NEXT_CHAR (match_info->regex,
 740                                    &match_info->string[match_info->pos]) -
 741                                    match_info->string;
 742     }
 743   else
 744     {
 745       match_info->pos = match_info->offsets[1];
 746     }
 747
 748   /* it's possible to get two identical matches when we are matching
 749    * empty strings, for instance if the pattern is "(?=[A-Z0-9])" and
 750    * the string is "RegExTest" we have:
 751    *  - search at position 0: match from 0 to 0
 752    *  - search at position 1: match from 3 to 3
 753    *  - search at position 3: match from 3 to 3 (duplicate)
 754    *  - search at position 4: match from 5 to 5
 755    *  - search at position 5: match from 5 to 5 (duplicate)
 756    *  - search at position 6: no match -> stop
 757    * so we have to ignore the duplicates.
 758    * see bug #515944: http://bugzilla.gnome.org/show_bug.cgi?id=515944 */
 759   if (match_info->matches >= 0 &&
 760       prev_match_start == match_info->offsets[0] &&
 761       prev_match_end == match_info->offsets[1])
 762     {
 763       /* ignore this match and search the next one */
 764       return g_match_info_next (match_info, error);
 765     }
 766
 767   return match_info->matches >= 0;
 768 }
 769
 770 /**
 771  * g_match_info_matches:
 772  * @match_info: a #GMatchInfo structure
 773  *
 774  * Returns whether the previous match operation succeeded.
 775  *
 776  * Returns: %TRUE if the previous match operation succeeded,
 777  *   %FALSE otherwise
 778  *
 779  * Since: 2.14
 780  */
 781 gboolean
 782 g_match_info_matches (const GMatchInfo *match_info)
 783 {
 784   g_return_val_if_fail (match_info != NULL, FALSE);
 785
 786   return match_info->matches >= 0;
 787 }
 788
 789 /**
 790  * g_match_info_get_match_count:
 791  * @match_info: a #GMatchInfo structure
 792  *
 793  * Retrieves the number of matched substrings (including substring 0,
 794  * that is the whole matched text), so 1 is returned if the pattern
 795  * has no substrings in it and 0 is returned if the match failed.
 796  *
 797  * If the last match was obtained using the DFA algorithm, that is
 798  * using g_regex_match_all() or g_regex_match_all_full(), the retrieved
 799  * count is not that of the number of capturing parentheses but that of
 800  * the number of matched substrings.
 801  *
 802  * Returns: Number of matched substrings, or -1 if an error occurred
 803  *
 804  * Since: 2.14
 805  */
 806 gint
 807 g_match_info_get_match_count (const GMatchInfo *match_info)
 808 {
 809   g_return_val_if_fail (match_info, -1);
 810
 811   if (match_info->matches == PCRE_ERROR_NOMATCH)
 812     /* no match */
 813     return 0;
 814   else if (match_info->matches < PCRE_ERROR_NOMATCH)
 815     /* error */
 816     return -1;
 817   else
 818     /* match */
 819     return match_info->matches;
 820 }
 821
 822 /**
 823  * g_match_info_is_partial_match:
 824  * @match_info: a #GMatchInfo structure
 825  *
 826  * Usually if the string passed to g_regex_match*() matches as far as
 827  * it goes, but is too short to match the entire pattern, %FALSE is
 828  * returned. There are circumstances where it might be helpful to
 829  * distinguish this case from other cases in which there is no match.
 830  *
 831  * Consider, for example, an application where a human is required to
 832  * type in data for a field with specific formatting requirements. An
 833  * example might be a date in the form ddmmmyy, defined by the pattern
 834  * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$".
 835  * If the application sees the user’s keystrokes one by one, and can
 836  * check that what has been typed so far is potentially valid, it is
 837  * able to raise an error as soon as a mistake is made.
 838  *
 839  * GRegex supports the concept of partial matching by means of the
 840  * #G_REGEX_MATCH_PARTIAL flag. When this is set the return code for
 841  * g_regex_match() or g_regex_match_full() is, as usual, %TRUE
 842  * for a complete match, %FALSE otherwise. But, when these functions
 843  * return %FALSE, you can check if the match was partial calling
 844  * g_match_info_is_partial_match().
 845  *
 846  * When using partial matching you cannot use g_match_info_fetch*().
 847  *
 848  * Because of the way certain internal optimizations are implemented
 849  * the partial matching algorithm cannot be used with all patterns.
 850  * So repeated single characters such as "a{2,4}" and repeated single
 851  * meta-sequences such as "\d+" are not permitted if the maximum number
 852  * of occurrences is greater than one. Optional items such as "\d?"
 853  * (where the maximum is one) are permitted. Quantifiers with any values
 854  * are permitted after parentheses, so the invalid examples above can be
 855  * coded thus "(a){2,4}" and "(\d)+". If #G_REGEX_MATCH_PARTIAL is set
 856  * for a pattern that does not conform to the restrictions, matching
 857  * functions return an error.
 858  *
 859  * Returns: %TRUE if the match was partial, %FALSE otherwise
 860  *
 861  * Since: 2.14
 862  */
 863 gboolean
 864 g_match_info_is_partial_match (const GMatchInfo *match_info)
 865 {
 866   g_return_val_if_fail (match_info != NULL, FALSE);
 867
 868   return match_info->matches == PCRE_ERROR_PARTIAL;
 869 }
 870
 871 /**
 872  * g_match_info_expand_references:
 873  * @match_info: (allow-none): a #GMatchInfo or %NULL
 874  * @string_to_expand: the string to expand
 875  * @error: location to store the error occurring, or %NULL to ignore errors
 876  *
 877  * Returns a new string containing the text in @string_to_expand with
 878  * references and escape sequences expanded. References refer to the last
 879  * match done with @string against @regex and have the same syntax used by
 880  * g_regex_replace().
 881  *
 882  * The @string_to_expand must be UTF-8 encoded even if #G_REGEX_RAW was
 883  * passed to g_regex_new().
 884  *
 885  * The backreferences are extracted from the string passed to the match
 886  * function, so you cannot call this function after freeing the string.
 887  *
 888  * @match_info may be %NULL in which case @string_to_expand must not
 889  * contain references. For instance "foo\n" does not refer to an actual
 890  * pattern and '\n' merely will be replaced with \n character,
 891  * while to expand "\0" (whole match) one needs the result of a match.
 892  * Use g_regex_check_replacement() to find out whether @string_to_expand
 893  * contains references.
 894  *
 895  * Returns: (allow-none): the expanded string, or %NULL if an error occurred
 896  *
 897  * Since: 2.14
 898  */
 899 gchar *
 900 g_match_info_expand_references (const GMatchInfo  *match_info,
 901                                 const gchar       *string_to_expand,
 902                                 GError           **error)
 903 {
 904   GString *result;
 905   GList *list;
 906   GError *tmp_error = NULL;
 907
 908   g_return_val_if_fail (string_to_expand != NULL, NULL);
 909   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
 910
 911   list = split_replacement (string_to_expand, &tmp_error);
 912   if (tmp_error != NULL)
 913     {
 914       g_propagate_error (error, tmp_error);
 915       return NULL;
 916     }
 917
 918   if (!match_info && interpolation_list_needs_match (list))
 919     {
 920       g_critical ("String '%s' contains references to the match, can't "
 921                   "expand references without GMatchInfo object",
 922                   string_to_expand);
 923       return NULL;
 924     }
 925
 926   result = g_string_sized_new (strlen (string_to_expand));
 927   interpolate_replacement (match_info, result, list);
 928
 929   g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
 930
 931   return g_string_free (result, FALSE);
 932 }
 933
 934 /**
 935  * g_match_info_fetch:
 936  * @match_info: #GMatchInfo structure
 937  * @match_num: number of the sub expression
 938  *
 939  * Retrieves the text matching the @match_num<!-- -->'th capturing
 940  * parentheses. 0 is the full text of the match, 1 is the first paren
 941  * set, 2 the second, and so on.
 942  *
 943  * If @match_num is a valid sub pattern but it didn't match anything
 944  * (e.g. sub pattern 1, matching "b" against "(a)?b") then an empty
 945  * string is returned.
 946  *
 947  * If the match was obtained using the DFA algorithm, that is using
 948  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
 949  * string is not that of a set of parentheses but that of a matched
 950  * substring. Substrings are matched in reverse order of length, so
 951  * 0 is the longest match.
 952  *
 953  * The string is fetched from the string passed to the match function,
 954  * so you cannot call this function after freeing the string.
 955  *
 956  * Returns: (allow-none): The matched substring, or %NULL if an error
 957  *     occurred. You have to free the string yourself
 958  *
 959  * Since: 2.14
 960  */
 961 gchar *
 962 g_match_info_fetch (const GMatchInfo *match_info,
 963                     gint              match_num)
 964 {
 965   /* we cannot use pcre_get_substring() because it allocates the
 966    * string using pcre_malloc(). */
 967   gchar *match = NULL;
 968   gint start, end;
 969
 970   g_return_val_if_fail (match_info != NULL, NULL);
 971   g_return_val_if_fail (match_num >= 0, NULL);
 972
 973   /* match_num does not exist or it didn't matched, i.e. matching "b"
 974    * against "(a)?b" then group 0 is empty. */
 975   if (!g_match_info_fetch_pos (match_info, match_num, &start, &end))
 976     match = NULL;
 977   else if (start == -1)
 978     match = g_strdup ("");
 979   else
 980     match = g_strndup (&match_info->string[start], end - start);
 981
 982   return match;
 983 }
 984
 985 /**
 986  * g_match_info_fetch_pos:
 987  * @match_info: #GMatchInfo structure
 988  * @match_num: number of the sub expression
 989  * @start_pos: (out) (allow-none): pointer to location where to store
 990  *     the start position, or %NULL
 991  * @end_pos: (out) (allow-none): pointer to location where to store
 992  *     the end position, or %NULL
 993  *
 994  * Retrieves the position in bytes of the @match_num<!-- -->'th capturing
 995  * parentheses. 0 is the full text of the match, 1 is the first
 996  * paren set, 2 the second, and so on.
 997  *
 998  * If @match_num is a valid sub pattern but it didn't match anything
 999  * (e.g. sub pattern 1, matching "b" against "(a)?b") then @start_pos
1000  * and @end_pos are set to -1 and %TRUE is returned.
1001  *
1002  * If the match was obtained using the DFA algorithm, that is using
1003  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1004  * position is not that of a set of parentheses but that of a matched
1005  * substring. Substrings are matched in reverse order of length, so
1006  * 0 is the longest match.
1007  *
1008  * Returns: %TRUE if the position was fetched, %FALSE otherwise. If
1009  *   the position cannot be fetched, @start_pos and @end_pos are left
1010  *   unchanged
1011  *
1012  * Since: 2.14
1013  */
1014 gboolean
1015 g_match_info_fetch_pos (const GMatchInfo *match_info,
1016                         gint              match_num,
1017                         gint             *start_pos,
1018                         gint             *end_pos)
1019 {
1020   g_return_val_if_fail (match_info != NULL, FALSE);
1021   g_return_val_if_fail (match_num >= 0, FALSE);
1022
1023   /* make sure the sub expression number they're requesting is less than
1024    * the total number of sub expressions that were matched. */
1025   if (match_num >= match_info->matches)
1026     return FALSE;
1027
1028   if (start_pos != NULL)
1029     *start_pos = match_info->offsets[2 * match_num];
1030
1031   if (end_pos != NULL)
1032     *end_pos = match_info->offsets[2 * match_num + 1];
1033
1034   return TRUE;
1035 }
1036
1037 /*
1038  * Returns number of first matched subpattern with name @name.
1039  * There may be more than one in case when DUPNAMES is used,
1040  * and not all subpatterns with that name match;
1041  * pcre_get_stringnumber() does not work in that case.
1042  */
1043 static gint
1044 get_matched_substring_number (const GMatchInfo *match_info,
1045                               const gchar      *name)
1046 {
1047   gint entrysize;
1048   gchar *first, *last;
1049   guchar *entry;
1050
1051   if (!(match_info->regex->compile_opts & G_REGEX_DUPNAMES))
1052     return pcre_get_stringnumber (match_info->regex->pcre_re, name);
1053
1054   /* This code is copied from pcre_get.c: get_first_set() */
1055   entrysize = pcre_get_stringtable_entries (match_info->regex->pcre_re,
1056                                             name,
1057                                             &first,
1058                                             &last);
1059
1060   if (entrysize <= 0)
1061     return entrysize;
1062
1063   for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize)
1064     {
1065       gint n = (entry[0] << 8) + entry[1];
1066       if (match_info->offsets[n*2] >= 0)
1067         return n;
1068     }
1069
1070   return (first[0] << 8) + first[1];
1071 }
1072
1073 /**
1074  * g_match_info_fetch_named:
1075  * @match_info: #GMatchInfo structure
1076  * @name: name of the subexpression
1077  *
1078  * Retrieves the text matching the capturing parentheses named @name.
1079  *
1080  * If @name is a valid sub pattern name but it didn't match anything
1081  * (e.g. sub pattern "X", matching "b" against "(?P&lt;X&gt;a)?b")
1082  * then an empty string is returned.
1083  *
1084  * The string is fetched from the string passed to the match function,
1085  * so you cannot call this function after freeing the string.
1086  *
1087  * Returns: (allow-none): The matched substring, or %NULL if an error
1088  *     occurred. You have to free the string yourself
1089  *
1090  * Since: 2.14
1091  */
1092 gchar *
1093 g_match_info_fetch_named (const GMatchInfo *match_info,
1094                           const gchar      *name)
1095 {
1096   /* we cannot use pcre_get_named_substring() because it allocates the
1097    * string using pcre_malloc(). */
1098   gint num;
1099
1100   g_return_val_if_fail (match_info != NULL, NULL);
1101   g_return_val_if_fail (name != NULL, NULL);
1102
1103   num = get_matched_substring_number (match_info, name);
1104   if (num < 0)
1105     return NULL;
1106   else
1107     return g_match_info_fetch (match_info, num);
1108 }
1109
1110 /**
1111  * g_match_info_fetch_named_pos:
1112  * @match_info: #GMatchInfo structure
1113  * @name: name of the subexpression
1114  * @start_pos: (out) (allow-none): pointer to location where to store
1115  *     the start position, or %NULL
1116  * @end_pos: (out) (allow-none): pointer to location where to store
1117  *     the end position, or %NULL
1118  *
1119  * Retrieves the position in bytes of the capturing parentheses named @name.
1120  *
1121  * If @name is a valid sub pattern name but it didn't match anything
1122  * (e.g. sub pattern "X", matching "b" against "(?P&lt;X&gt;a)?b")
1123  * then @start_pos and @end_pos are set to -1 and %TRUE is returned.
1124  *
1125  * Returns: %TRUE if the position was fetched, %FALSE otherwise.
1126  *     If the position cannot be fetched, @start_pos and @end_pos
1127  *     are left unchanged.
1128  *
1129  * Since: 2.14
1130  */
1131 gboolean
1132 g_match_info_fetch_named_pos (const GMatchInfo *match_info,
1133                               const gchar      *name,
1134                               gint             *start_pos,
1135                               gint             *end_pos)
1136 {
1137   gint num;
1138
1139   g_return_val_if_fail (match_info != NULL, FALSE);
1140   g_return_val_if_fail (name != NULL, FALSE);
1141
1142   num = get_matched_substring_number (match_info, name);
1143   if (num < 0)
1144     return FALSE;
1145
1146   return g_match_info_fetch_pos (match_info, num, start_pos, end_pos);
1147 }
1148
1149 /**
1150  * g_match_info_fetch_all:
1151  * @match_info: a #GMatchInfo structure
1152  *
1153  * Bundles up pointers to each of the matching substrings from a match
1154  * and stores them in an array of gchar pointers. The first element in
1155  * the returned array is the match number 0, i.e. the entire matched
1156  * text.
1157  *
1158  * If a sub pattern didn't match anything (e.g. sub pattern 1, matching
1159  * "b" against "(a)?b") then an empty string is inserted.
1160  *
1161  * If the last match was obtained using the DFA algorithm, that is using
1162  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1163  * strings are not that matched by sets of parentheses but that of the
1164  * matched substring. Substrings are matched in reverse order of length,
1165  * so the first one is the longest match.
1166  *
1167  * The strings are fetched from the string passed to the match function,
1168  * so you cannot call this function after freeing the string.
1169  *
1170  * Returns: (allow-none): a %NULL-terminated array of gchar * pointers.
1171  *     It must be freed using g_strfreev(). If the previous match failed
1172  *     %NULL is returned
1173  *
1174  * Since: 2.14
1175  */
1176 gchar **
1177 g_match_info_fetch_all (const GMatchInfo *match_info)
1178 {
1179   /* we cannot use pcre_get_substring_list() because the returned value
1180    * isn't suitable for g_strfreev(). */
1181   gchar **result;
1182   gint i;
1183
1184   g_return_val_if_fail (match_info != NULL, NULL);
1185
1186   if (match_info->matches < 0)
1187     return NULL;
1188
1189   result = g_new (gchar *, match_info->matches + 1);
1190   for (i = 0; i < match_info->matches; i++)
1191     result[i] = g_match_info_fetch (match_info, i);
1192   result[i] = NULL;
1193
1194   return result;
1195 }
1196
1197
1198 /* GRegex */
1199
1200 GQuark
1201 g_regex_error_quark (void)
1202 {
1203   static GQuark error_quark = 0;
1204
1205   if (error_quark == 0)
1206     error_quark = g_quark_from_static_string ("g-regex-error-quark");
1207
1208   return error_quark;
1209 }
1210
1211 /**
1212  * g_regex_ref:
1213  * @regex: a #GRegex
1214  *
1215  * Increases reference count of @regex by 1.
1216  *
1217  * Returns: @regex
1218  *
1219  * Since: 2.14
1220  */
1221 GRegex *
1222 g_regex_ref (GRegex *regex)
1223 {
1224   g_return_val_if_fail (regex != NULL, NULL);
1225   g_atomic_int_inc (&regex->ref_count);
1226   return regex;
1227 }
1228
1229 /**
1230  * g_regex_unref:
1231  * @regex: a #GRegex
1232  *
1233  * Decreases reference count of @regex by 1. When reference count drops
1234  * to zero, it frees all the memory associated with the regex structure.
1235  *
1236  * Since: 2.14
1237  */
1238 void
1239 g_regex_unref (GRegex *regex)
1240 {
1241   g_return_if_fail (regex != NULL);
1242
1243   if (g_atomic_int_dec_and_test (&regex->ref_count))
1244     {
1245       g_free (regex->pattern);
1246       if (regex->pcre_re != NULL)
1247         pcre_free (regex->pcre_re);
1248       if (regex->extra != NULL)
1249         pcre_free (regex->extra);
1250       g_free (regex);
1251     }
1252 }
1253
1254 /**
1255  * g_regex_new:
1256  * @pattern: the regular expression
1257  * @compile_options: compile options for the regular expression, or 0
1258  * @match_options: match options for the regular expression, or 0
1259  * @error: return location for a #GError
1260  *
1261  * Compiles the regular expression to an internal form, and does
1262  * the initial setup of the #GRegex structure.
1263  *
1264  * Returns: a #GRegex structure. Call g_regex_unref() when you
1265  *   are done with it
1266  *
1267  * Since: 2.14
1268  */
1269 GRegex *
1270 g_regex_new (const gchar         *pattern,
1271              GRegexCompileFlags   compile_options,
1272              GRegexMatchFlags     match_options,
1273              GError             **error)
1274 {
1275   GRegex *regex;
1276   pcre *re;
1277   const gchar *errmsg;
1278   gint erroffset;
1279   gint errcode;
1280   gboolean optimize = FALSE;
1281   static volatile gsize initialised = 0;
1282   unsigned long int pcre_compile_options;
1283   GRegexCompileFlags nonpcre_compile_options;
1284
1285   g_return_val_if_fail (pattern != NULL, NULL);
1286   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
1287   g_return_val_if_fail ((compile_options & ~G_REGEX_COMPILE_MASK) == 0, NULL);
1288   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
1289
1290   if (g_once_init_enter (&initialised))
1291     {
1292       int supports_utf8, supports_ucp;
1293
1294       pcre_config (PCRE_CONFIG_UTF8, &supports_utf8);
1295       if (!supports_utf8)
1296         g_critical (_("PCRE library is compiled without UTF8 support"));
1297
1298       pcre_config (PCRE_CONFIG_UNICODE_PROPERTIES, &supports_ucp);
1299       if (!supports_ucp)
1300         g_critical (_("PCRE library is compiled without UTF8 properties support"));
1301
1302       g_once_init_leave (&initialised, supports_utf8 && supports_ucp ? 1 : 2);
1303     }
1304
1305   if (G_UNLIKELY (initialised != 1))
1306     {
1307       g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE,
1308                            _("PCRE library is compiled with incompatible options"));
1309       return NULL;
1310     }
1311
1312   nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK;
1313
1314   /* G_REGEX_OPTIMIZE has the same numeric value of PCRE_NO_UTF8_CHECK,
1315    * as we do not need to wrap PCRE_NO_UTF8_CHECK. */
1316   if (compile_options & G_REGEX_OPTIMIZE)
1317     optimize = TRUE;
1318
1319   /* In GRegex the string are, by default, UTF-8 encoded. PCRE
1320    * instead uses UTF-8 only if required with PCRE_UTF8. */
1321   if (compile_options & G_REGEX_RAW)
1322     {
1323       /* disable utf-8 */
1324       compile_options &= ~G_REGEX_RAW;
1325     }
1326   else
1327     {
1328       /* enable utf-8 */
1329       compile_options |= PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
1330       match_options |= PCRE_NO_UTF8_CHECK;
1331     }
1332
1333   /* PCRE_NEWLINE_ANY is the default for the internal PCRE but
1334    * not for the system one. */
1335   if (!(compile_options & G_REGEX_NEWLINE_CR) &&
1336       !(compile_options & G_REGEX_NEWLINE_LF))
1337     {
1338       compile_options |= PCRE_NEWLINE_ANY;
1339     }
1340
1341   compile_options |= PCRE_UCP;
1342
1343   /* PCRE_BSR_UNICODE is the default for the internal PCRE but
1344    * possibly not for the system one.
1345    */
1346   if (~compile_options & G_REGEX_BSR_ANYCRLF)
1347     compile_options |= PCRE_BSR_UNICODE;
1348
1349   /* compile the pattern */
1350   re = pcre_compile2 (pattern, compile_options, &errcode,
1351                       &errmsg, &erroffset, NULL);
1352
1353   /* if the compilation failed, set the error member and return
1354    * immediately */
1355   if (re == NULL)
1356     {
1357       GError *tmp_error;
1358
1359       /* Translate the PCRE error code to GRegexError and use a translated
1360        * error message if possible */
1361       translate_compile_error (&errcode, &errmsg);
1362
1363       /* PCRE uses byte offsets but we want to show character offsets */
1364       erroffset = g_utf8_pointer_to_offset (pattern, &pattern[erroffset]);
1365
1366       tmp_error = g_error_new (G_REGEX_ERROR, errcode,
1367                                _("Error while compiling regular "
1368                                  "expression %s at char %d: %s"),
1369                                pattern, erroffset, errmsg);
1370       g_propagate_error (error, tmp_error);
1371
1372       return NULL;
1373     }
1374
1375   /* For options set at the beginning of the pattern, pcre puts them into
1376    * compile options, e.g. "(?i)foo" will make the pcre structure store
1377    * PCRE_CASELESS even though it wasn't explicitly given for compilation. */
1378   pcre_fullinfo (re, NULL, PCRE_INFO_OPTIONS, &pcre_compile_options);
1379   compile_options = pcre_compile_options & G_REGEX_COMPILE_PCRE_MASK;
1380
1381   /* Don't leak PCRE_NEWLINE_ANY, which is part of PCRE_NEWLINE_ANYCRLF */
1382   if ((pcre_compile_options & PCRE_NEWLINE_ANYCRLF) != PCRE_NEWLINE_ANYCRLF)
1383     compile_options &= ~PCRE_NEWLINE_ANY;
1384
1385   compile_options |= nonpcre_compile_options;
1386
1387   if (!(compile_options & G_REGEX_DUPNAMES))
1388     {
1389       gboolean jchanged = FALSE;
1390       pcre_fullinfo (re, NULL, PCRE_INFO_JCHANGED, &jchanged);
1391       if (jchanged)
1392         compile_options |= G_REGEX_DUPNAMES;
1393     }
1394
1395   regex = g_new0 (GRegex, 1);
1396   regex->ref_count = 1;
1397   regex->pattern = g_strdup (pattern);
1398   regex->pcre_re = re;
1399   regex->compile_opts = compile_options;
1400   regex->match_opts = match_options;
1401
1402   if (optimize)
1403     {
1404       regex->extra = pcre_study (regex->pcre_re, 0, &errmsg);
1405       if (errmsg != NULL)
1406         {
1407           GError *tmp_error = g_error_new (G_REGEX_ERROR,
1408                                            G_REGEX_ERROR_OPTIMIZE,
1409                                            _("Error while optimizing "
1410                                              "regular expression %s: %s"),
1411                                            regex->pattern,
1412                                            errmsg);
1413           g_propagate_error (error, tmp_error);
1414
1415           g_regex_unref (regex);
1416           return NULL;
1417         }
1418     }
1419
1420   return regex;
1421 }
1422
1423 /**
1424  * g_regex_get_pattern:
1425  * @regex: a #GRegex structure
1426  *
1427  * Gets the pattern string associated with @regex, i.e. a copy of
1428  * the string passed to g_regex_new().
1429  *
1430  * Returns: the pattern of @regex
1431  *
1432  * Since: 2.14
1433  */
1434 const gchar *
1435 g_regex_get_pattern (const GRegex *regex)
1436 {
1437   g_return_val_if_fail (regex != NULL, NULL);
1438
1439   return regex->pattern;
1440 }
1441
1442 /**
1443  * g_regex_get_max_backref:
1444  * @regex: a #GRegex
1445  *
1446  * Returns the number of the highest back reference
1447  * in the pattern, or 0 if the pattern does not contain
1448  * back references.
1449  *
1450  * Returns: the number of the highest back reference
1451  *
1452  * Since: 2.14
1453  */
1454 gint
1455 g_regex_get_max_backref (const GRegex *regex)
1456 {
1457   gint value;
1458
1459   pcre_fullinfo (regex->pcre_re, regex->extra,
1460                  PCRE_INFO_BACKREFMAX, &value);
1461
1462   return value;
1463 }
1464
1465 /**
1466  * g_regex_get_capture_count:
1467  * @regex: a #GRegex
1468  *
1469  * Returns the number of capturing subpatterns in the pattern.
1470  *
1471  * Returns: the number of capturing subpatterns
1472  *
1473  * Since: 2.14
1474  */
1475 gint
1476 g_regex_get_capture_count (const GRegex *regex)
1477 {
1478   gint value;
1479
1480   pcre_fullinfo (regex->pcre_re, regex->extra,
1481                  PCRE_INFO_CAPTURECOUNT, &value);
1482
1483   return value;
1484 }
1485
1486 /**
1487  * g_regex_get_has_cr_or_lf:
1488  * @regex: a #GRegex structure
1489  *
1490  * Checks whether the pattern contains explicit CR or LF references.
1491  *
1492  * Returns: %TRUE if the pattern contains explicit CR or LF references
1493  *
1494  * Since: 2.34
1495  */
1496 gboolean
1497 g_regex_get_has_cr_or_lf (const GRegex *regex)
1498 {
1499   gint value;
1500
1501   pcre_fullinfo (regex->pcre_re, regex->extra,
1502                  PCRE_INFO_HASCRORLF, &value);
1503
1504   return !!value;
1505 }
1506
1507 /**
1508  * g_regex_get_compile_flags:
1509  * @regex: a #GRegex
1510  *
1511  * Returns the compile options that @regex was created with.
1512  *
1513  * Returns: flags from #GRegexCompileFlags
1514  *
1515  * Since: 2.26
1516  */
1517 GRegexCompileFlags
1518 g_regex_get_compile_flags (const GRegex *regex)
1519 {
1520   g_return_val_if_fail (regex != NULL, 0);
1521
1522   return regex->compile_opts;
1523 }
1524
1525 /**
1526  * g_regex_get_match_flags:
1527  * @regex: a #GRegex
1528  *
1529  * Returns the match options that @regex was created with.
1530  *
1531  * Returns: flags from #GRegexMatchFlags
1532  *
1533  * Since: 2.26
1534  */
1535 GRegexMatchFlags
1536 g_regex_get_match_flags (const GRegex *regex)
1537 {
1538   g_return_val_if_fail (regex != NULL, 0);
1539
1540   return regex->match_opts & G_REGEX_MATCH_MASK;
1541 }
1542
1543 /**
1544  * g_regex_match_simple:
1545  * @pattern: the regular expression
1546  * @string: the string to scan for matches
1547  * @compile_options: compile options for the regular expression, or 0
1548  * @match_options: match options, or 0
1549  *
1550  * Scans for a match in @string for @pattern.
1551  *
1552  * This function is equivalent to g_regex_match() but it does not
1553  * require to compile the pattern with g_regex_new(), avoiding some
1554  * lines of code when you need just to do a match without extracting
1555  * substrings, capture counts, and so on.
1556  *
1557  * If this function is to be called on the same @pattern more than
1558  * once, it's more efficient to compile the pattern once with
1559  * g_regex_new() and then use g_regex_match().
1560  *
1561  * Returns: %TRUE if the string matched, %FALSE otherwise
1562  *
1563  * Since: 2.14
1564  */
1565 gboolean
1566 g_regex_match_simple (const gchar        *pattern,
1567                       const gchar        *string,
1568                       GRegexCompileFlags  compile_options,
1569                       GRegexMatchFlags    match_options)
1570 {
1571   GRegex *regex;
1572   gboolean result;
1573
1574   regex = g_regex_new (pattern, compile_options, 0, NULL);
1575   if (!regex)
1576     return FALSE;
1577   result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL);
1578   g_regex_unref (regex);
1579   return result;
1580 }
1581
1582 /**
1583  * g_regex_match:
1584  * @regex: a #GRegex structure from g_regex_new()
1585  * @string: the string to scan for matches
1586  * @match_options: match options
1587  * @match_info: (out) (allow-none): pointer to location where to store
1588  *     the #GMatchInfo, or %NULL if you do not need it
1589  *
1590  * Scans for a match in string for the pattern in @regex.
1591  * The @match_options are combined with the match options specified
1592  * when the @regex structure was created, letting you have more
1593  * flexibility in reusing #GRegex structures.
1594  *
1595  * A #GMatchInfo structure, used to get information on the match,
1596  * is stored in @match_info if not %NULL. Note that if @match_info
1597  * is not %NULL then it is created even if the function returns %FALSE,
1598  * i.e. you must free it regardless if regular expression actually matched.
1599  *
1600  * To retrieve all the non-overlapping matches of the pattern in
1601  * string you can use g_match_info_next().
1602  *
1603  * |[
1604  * static void
1605  * print_uppercase_words (const gchar *string)
1606  * {
1607  *   /&ast; Print all uppercase-only words. &ast;/
1608  *   GRegex *regex;
1609  *   GMatchInfo *match_info;
1610  *   &nbsp;
1611  *   regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
1612  *   g_regex_match (regex, string, 0, &amp;match_info);
1613  *   while (g_match_info_matches (match_info))
1614  *     {
1615  *       gchar *word = g_match_info_fetch (match_info, 0);
1616  *       g_print ("Found: %s\n", word);
1617  *       g_free (word);
1618  *       g_match_info_next (match_info, NULL);
1619  *     }
1620  *   g_match_info_free (match_info);
1621  *   g_regex_unref (regex);
1622  * }
1623  * ]|
1624  *
1625  * @string is not copied and is used in #GMatchInfo internally. If
1626  * you use any #GMatchInfo method (except g_match_info_free()) after
1627  * freeing or modifying @string then the behaviour is undefined.
1628  *
1629  * Returns: %TRUE if the string matched, %FALSE otherwise
1630  *
1631  * Since: 2.14
1632  */
1633 gboolean
1634 g_regex_match (const GRegex      *regex,
1635                const gchar       *string,
1636                GRegexMatchFlags   match_options,
1637                GMatchInfo       **match_info)
1638 {
1639   return g_regex_match_full (regex, string, -1, 0, match_options,
1640                              match_info, NULL);
1641 }
1642
1643 /**
1644  * g_regex_match_full:
1645  * @regex: a #GRegex structure from g_regex_new()
1646  * @string: (array length=string_len): the string to scan for matches
1647  * @string_len: the length of @string, or -1 if @string is nul-terminated
1648  * @start_position: starting index of the string to match
1649  * @match_options: match options
1650  * @match_info: (out) (allow-none): pointer to location where to store
1651  *     the #GMatchInfo, or %NULL if you do not need it
1652  * @error: location to store the error occurring, or %NULL to ignore errors
1653  *
1654  * Scans for a match in string for the pattern in @regex.
1655  * The @match_options are combined with the match options specified
1656  * when the @regex structure was created, letting you have more
1657  * flexibility in reusing #GRegex structures.
1658  *
1659  * Setting @start_position differs from just passing over a shortened
1660  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
1661  * that begins with any kind of lookbehind assertion, such as "\b".
1662  *
1663  * A #GMatchInfo structure, used to get information on the match, is
1664  * stored in @match_info if not %NULL. Note that if @match_info is
1665  * not %NULL then it is created even if the function returns %FALSE,
1666  * i.e. you must free it regardless if regular expression actually
1667  * matched.
1668  *
1669  * @string is not copied and is used in #GMatchInfo internally. If
1670  * you use any #GMatchInfo method (except g_match_info_free()) after
1671  * freeing or modifying @string then the behaviour is undefined.
1672  *
1673  * To retrieve all the non-overlapping matches of the pattern in
1674  * string you can use g_match_info_next().
1675  *
1676  * |[
1677  * static void
1678  * print_uppercase_words (const gchar *string)
1679  * {
1680  *   /&ast; Print all uppercase-only words. &ast;/
1681  *   GRegex *regex;
1682  *   GMatchInfo *match_info;
1683  *   GError *error = NULL;
1684  *   &nbsp;
1685  *   regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
1686  *   g_regex_match_full (regex, string, -1, 0, 0, &amp;match_info, &amp;error);
1687  *   while (g_match_info_matches (match_info))
1688  *     {
1689  *       gchar *word = g_match_info_fetch (match_info, 0);
1690  *       g_print ("Found: %s\n", word);
1691  *       g_free (word);
1692  *       g_match_info_next (match_info, &amp;error);
1693  *     }
1694  *   g_match_info_free (match_info);
1695  *   g_regex_unref (regex);
1696  *   if (error != NULL)
1697  *     {
1698  *       g_printerr ("Error while matching: %s\n", error->message);
1699  *       g_error_free (error);
1700  *     }
1701  * }
1702  * ]|
1703  *
1704  * Returns: %TRUE if the string matched, %FALSE otherwise
1705  *
1706  * Since: 2.14
1707  */
1708 gboolean
1709 g_regex_match_full (const GRegex      *regex,
1710                     const gchar       *string,
1711                     gssize             string_len,
1712                     gint               start_position,
1713                     GRegexMatchFlags   match_options,
1714                     GMatchInfo       **match_info,
1715                     GError           **error)
1716 {
1717   GMatchInfo *info;
1718   gboolean match_ok;
1719
1720   g_return_val_if_fail (regex != NULL, FALSE);
1721   g_return_val_if_fail (string != NULL, FALSE);
1722   g_return_val_if_fail (start_position >= 0, FALSE);
1723   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
1724   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
1725
1726   info = match_info_new (regex, string, string_len, start_position,
1727                          match_options, FALSE);
1728   match_ok = g_match_info_next (info, error);
1729   if (match_info != NULL)
1730     *match_info = info;
1731   else
1732     g_match_info_free (info);
1733
1734   return match_ok;
1735 }
1736
1737 /**
1738  * g_regex_match_all:
1739  * @regex: a #GRegex structure from g_regex_new()
1740  * @string: the string to scan for matches
1741  * @match_options: match options
1742  * @match_info: (out) (allow-none): pointer to location where to store
1743  *     the #GMatchInfo, or %NULL if you do not need it
1744  *
1745  * Using the standard algorithm for regular expression matching only
1746  * the longest match in the string is retrieved. This function uses
1747  * a different algorithm so it can retrieve all the possible matches.
1748  * For more documentation see g_regex_match_all_full().
1749  *
1750  * A #GMatchInfo structure, used to get information on the match, is
1751  * stored in @match_info if not %NULL. Note that if @match_info is
1752  * not %NULL then it is created even if the function returns %FALSE,
1753  * i.e. you must free it regardless if regular expression actually
1754  * matched.
1755  *
1756  * @string is not copied and is used in #GMatchInfo internally. If
1757  * you use any #GMatchInfo method (except g_match_info_free()) after
1758  * freeing or modifying @string then the behaviour is undefined.
1759  *
1760  * Returns: %TRUE if the string matched, %FALSE otherwise
1761  *
1762  * Since: 2.14
1763  */
1764 gboolean
1765 g_regex_match_all (const GRegex      *regex,
1766                    const gchar       *string,
1767                    GRegexMatchFlags   match_options,
1768                    GMatchInfo       **match_info)
1769 {
1770   return g_regex_match_all_full (regex, string, -1, 0, match_options,
1771                                  match_info, NULL);
1772 }
1773
1774 /**
1775  * g_regex_match_all_full:
1776  * @regex: a #GRegex structure from g_regex_new()
1777  * @string: (array length=string_len): the string to scan for matches
1778  * @string_len: the length of @string, or -1 if @string is nul-terminated
1779  * @start_position: starting index of the string to match
1780  * @match_options: match options
1781  * @match_info: (out) (allow-none): pointer to location where to store
1782  *     the #GMatchInfo, or %NULL if you do not need it
1783  * @error: location to store the error occurring, or %NULL to ignore errors
1784  *
1785  * Using the standard algorithm for regular expression matching only
1786  * the longest match in the string is retrieved, it is not possible
1787  * to obtain all the available matches. For instance matching
1788  * "&lt;a&gt; &lt;b&gt; &lt;c&gt;" against the pattern "&lt;.*&gt;"
1789  * you get "&lt;a&gt; &lt;b&gt; &lt;c&gt;".
1790  *
1791  * This function uses a different algorithm (called DFA, i.e. deterministic
1792  * finite automaton), so it can retrieve all the possible matches, all
1793  * starting at the same point in the string. For instance matching
1794  * "&lt;a&gt; &lt;b&gt; &lt;c&gt;" against the pattern "&lt;.*&gt;"
1795  * you would obtain three matches: "&lt;a&gt; &lt;b&gt; &lt;c&gt;",
1796  * "&lt;a&gt; &lt;b&gt;" and "&lt;a&gt;".
1797  *
1798  * The number of matched strings is retrieved using
1799  * g_match_info_get_match_count(). To obtain the matched strings and
1800  * their position you can use, respectively, g_match_info_fetch() and
1801  * g_match_info_fetch_pos(). Note that the strings are returned in
1802  * reverse order of length; that is, the longest matching string is
1803  * given first.
1804  *
1805  * Note that the DFA algorithm is slower than the standard one and it
1806  * is not able to capture substrings, so backreferences do not work.
1807  *
1808  * Setting @start_position differs from just passing over a shortened
1809  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
1810  * that begins with any kind of lookbehind assertion, such as "\b".
1811  *
1812  * A #GMatchInfo structure, used to get information on the match, is
1813  * stored in @match_info if not %NULL. Note that if @match_info is
1814  * not %NULL then it is created even if the function returns %FALSE,
1815  * i.e. you must free it regardless if regular expression actually
1816  * matched.
1817  *
1818  * @string is not copied and is used in #GMatchInfo internally. If
1819  * you use any #GMatchInfo method (except g_match_info_free()) after
1820  * freeing or modifying @string then the behaviour is undefined.
1821  *
1822  * Returns: %TRUE if the string matched, %FALSE otherwise
1823  *
1824  * Since: 2.14
1825  */
1826 gboolean
1827 g_regex_match_all_full (const GRegex      *regex,
1828                         const gchar       *string,
1829                         gssize             string_len,
1830                         gint               start_position,
1831                         GRegexMatchFlags   match_options,
1832                         GMatchInfo       **match_info,
1833                         GError           **error)
1834 {
1835   GMatchInfo *info;
1836   gboolean done;
1837
1838   g_return_val_if_fail (regex != NULL, FALSE);
1839   g_return_val_if_fail (string != NULL, FALSE);
1840   g_return_val_if_fail (start_position >= 0, FALSE);
1841   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
1842   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
1843
1844   info = match_info_new (regex, string, string_len, start_position,
1845                          match_options, TRUE);
1846
1847   done = FALSE;
1848   while (!done)
1849     {
1850       done = TRUE;
1851       info->matches = pcre_dfa_exec (regex->pcre_re, regex->extra,
1852                                      info->string, info->string_len,
1853                                      info->pos,
1854                                      regex->match_opts | match_options,
1855                                      info->offsets, info->n_offsets,
1856                                      info->workspace, info->n_workspace);
1857       if (info->matches == PCRE_ERROR_DFA_WSSIZE)
1858         {
1859           /* info->workspace is too small. */
1860           info->n_workspace *= 2;
1861           info->workspace = g_realloc (info->workspace,
1862                                        info->n_workspace * sizeof (gint));
1863           done = FALSE;
1864         }
1865       else if (info->matches == 0)
1866         {
1867           /* info->offsets is too small. */
1868           info->n_offsets *= 2;
1869           info->offsets = g_realloc (info->offsets,
1870                                      info->n_offsets * sizeof (gint));
1871           done = FALSE;
1872         }
1873       else if (IS_PCRE_ERROR (info->matches))
1874         {
1875           g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
1876                        _("Error while matching regular expression %s: %s"),
1877                        regex->pattern, match_error (info->matches));
1878         }
1879     }
1880
1881   /* set info->pos to -1 so that a call to g_match_info_next() fails. */
1882   info->pos = -1;
1883
1884   if (match_info != NULL)
1885     *match_info = info;
1886   else
1887     g_match_info_free (info);
1888
1889   return info->matches >= 0;
1890 }
1891
1892 /**
1893  * g_regex_get_string_number:
1894  * @regex: #GRegex structure
1895  * @name: name of the subexpression
1896  *
1897  * Retrieves the number of the subexpression named @name.
1898  *
1899  * Returns: The number of the subexpression or -1 if @name
1900  *   does not exist
1901  *
1902  * Since: 2.14
1903  */
1904 gint
1905 g_regex_get_string_number (const GRegex *regex,
1906                            const gchar  *name)
1907 {
1908   gint num;
1909
1910   g_return_val_if_fail (regex != NULL, -1);
1911   g_return_val_if_fail (name != NULL, -1);
1912
1913   num = pcre_get_stringnumber (regex->pcre_re, name);
1914   if (num == PCRE_ERROR_NOSUBSTRING)
1915     num = -1;
1916
1917   return num;
1918 }
1919
1920 /**
1921  * g_regex_split_simple:
1922  * @pattern: the regular expression
1923  * @string: the string to scan for matches
1924  * @compile_options: compile options for the regular expression, or 0
1925  * @match_options: match options, or 0
1926  *
1927  * Breaks the string on the pattern, and returns an array of
1928  * the tokens. If the pattern contains capturing parentheses,
1929  * then the text for each of the substrings will also be returned.
1930  * If the pattern does not match anywhere in the string, then the
1931  * whole string is returned as the first token.
1932  *
1933  * This function is equivalent to g_regex_split() but it does
1934  * not require to compile the pattern with g_regex_new(), avoiding
1935  * some lines of code when you need just to do a split without
1936  * extracting substrings, capture counts, and so on.
1937  *
1938  * If this function is to be called on the same @pattern more than
1939  * once, it's more efficient to compile the pattern once with
1940  * g_regex_new() and then use g_regex_split().
1941  *
1942  * As a special case, the result of splitting the empty string ""
1943  * is an empty vector, not a vector containing a single string.
1944  * The reason for this special case is that being able to represent
1945  * an empty vector is typically more useful than consistent handling
1946  * of empty elements. If you do need to represent empty elements,
1947  * you'll need to check for the empty string before calling this
1948  * function.
1949  *
1950  * A pattern that can match empty strings splits @string into
1951  * separate characters wherever it matches the empty string between
1952  * characters. For example splitting "ab c" using as a separator
1953  * "\s*", you will get "a", "b" and "c".
1954  *
1955  * Returns: a %NULL-terminated array of strings. Free it using g_strfreev()
1956  *
1957  * Since: 2.14
1958  **/
1959 gchar **
1960 g_regex_split_simple (const gchar        *pattern,
1961                       const gchar        *string,
1962                       GRegexCompileFlags  compile_options,
1963                       GRegexMatchFlags    match_options)
1964 {
1965   GRegex *regex;
1966   gchar **result;
1967
1968   regex = g_regex_new (pattern, compile_options, 0, NULL);
1969   if (!regex)
1970     return NULL;
1971
1972   result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL);
1973   g_regex_unref (regex);
1974   return result;
1975 }
1976
1977 /**
1978  * g_regex_split:
1979  * @regex: a #GRegex structure
1980  * @string: the string to split with the pattern
1981  * @match_options: match time option flags
1982  *
1983  * Breaks the string on the pattern, and returns an array of the tokens.
1984  * If the pattern contains capturing parentheses, then the text for each
1985  * of the substrings will also be returned. If the pattern does not match
1986  * anywhere in the string, then the whole string is returned as the first
1987  * token.
1988  *
1989  * As a special case, the result of splitting the empty string "" is an
1990  * empty vector, not a vector containing a single string. The reason for
1991  * this special case is that being able to represent an empty vector is
1992  * typically more useful than consistent handling of empty elements. If
1993  * you do need to represent empty elements, you'll need to check for the
1994  * empty string before calling this function.
1995  *
1996  * A pattern that can match empty strings splits @string into separate
1997  * characters wherever it matches the empty string between characters.
1998  * For example splitting "ab c" using as a separator "\s*", you will get
1999  * "a", "b" and "c".
2000  *
2001  * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
2002  *
2003  * Since: 2.14
2004  **/
2005 gchar **
2006 g_regex_split (const GRegex     *regex,
2007                const gchar      *string,
2008                GRegexMatchFlags  match_options)
2009 {
2010   return g_regex_split_full (regex, string, -1, 0,
2011                              match_options, 0, NULL);
2012 }
2013
2014 /**
2015  * g_regex_split_full:
2016  * @regex: a #GRegex structure
2017  * @string: (array length=string_len): the string to split with the pattern
2018  * @string_len: the length of @string, or -1 if @string is nul-terminated
2019  * @start_position: starting index of the string to match
2020  * @match_options: match time option flags
2021  * @max_tokens: the maximum number of tokens to split @string into.
2022  *   If this is less than 1, the string is split completely
2023  * @error: return location for a #GError
2024  *
2025  * Breaks the string on the pattern, and returns an array of the tokens.
2026  * If the pattern contains capturing parentheses, then the text for each
2027  * of the substrings will also be returned. If the pattern does not match
2028  * anywhere in the string, then the whole string is returned as the first
2029  * token.
2030  *
2031  * As a special case, the result of splitting the empty string "" is an
2032  * empty vector, not a vector containing a single string. The reason for
2033  * this special case is that being able to represent an empty vector is
2034  * typically more useful than consistent handling of empty elements. If
2035  * you do need to represent empty elements, you'll need to check for the
2036  * empty string before calling this function.
2037  *
2038  * A pattern that can match empty strings splits @string into separate
2039  * characters wherever it matches the empty string between characters.
2040  * For example splitting "ab c" using as a separator "\s*", you will get
2041  * "a", "b" and "c".
2042  *
2043  * Setting @start_position differs from just passing over a shortened
2044  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
2045  * that begins with any kind of lookbehind assertion, such as "\b".
2046  *
2047  * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
2048  *
2049  * Since: 2.14
2050  **/
2051 gchar **
2052 g_regex_split_full (const GRegex      *regex,
2053                     const gchar       *string,
2054                     gssize             string_len,
2055                     gint               start_position,
2056                     GRegexMatchFlags   match_options,
2057                     gint               max_tokens,
2058                     GError           **error)
2059 {
2060   GError *tmp_error = NULL;
2061   GMatchInfo *match_info;
2062   GList *list, *last;
2063   gint i;
2064   gint token_count;
2065   gboolean match_ok;
2066   /* position of the last separator. */
2067   gint last_separator_end;
2068   /* was the last match 0 bytes long? */
2069   gboolean last_match_is_empty;
2070   /* the returned array of char **s */
2071   gchar **string_list;
2072
2073   g_return_val_if_fail (regex != NULL, NULL);
2074   g_return_val_if_fail (string != NULL, NULL);
2075   g_return_val_if_fail (start_position >= 0, NULL);
2076   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
2077   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2078
2079   if (max_tokens <= 0)
2080     max_tokens = G_MAXINT;
2081
2082   if (string_len < 0)
2083     string_len = strlen (string);
2084
2085   /* zero-length string */
2086   if (string_len - start_position == 0)
2087     return g_new0 (gchar *, 1);
2088
2089   if (max_tokens == 1)
2090     {
2091       string_list = g_new0 (gchar *, 2);
2092       string_list[0] = g_strndup (&string[start_position],
2093                                   string_len - start_position);
2094       return string_list;
2095     }
2096
2097   list = NULL;
2098   token_count = 0;
2099   last_separator_end = start_position;
2100   last_match_is_empty = FALSE;
2101
2102   match_ok = g_regex_match_full (regex, string, string_len, start_position,
2103                                  match_options, &match_info, &tmp_error);
2104
2105   while (tmp_error == NULL)
2106     {
2107       if (match_ok)
2108         {
2109           last_match_is_empty =
2110                     (match_info->offsets[0] == match_info->offsets[1]);
2111
2112           /* we need to skip empty separators at the same position of the end
2113            * of another separator. e.g. the string is "a b" and the separator
2114            * is " *", so from 1 to 2 we have a match and at position 2 we have
2115            * an empty match. */
2116           if (last_separator_end != match_info->offsets[1])
2117             {
2118               gchar *token;
2119               gint match_count;
2120
2121               token = g_strndup (string + last_separator_end,
2122                                  match_info->offsets[0] - last_separator_end);
2123               list = g_list_prepend (list, token);
2124               token_count++;
2125
2126               /* if there were substrings, these need to be added to
2127                * the list. */
2128               match_count = g_match_info_get_match_count (match_info);
2129               if (match_count > 1)
2130                 {
2131                   for (i = 1; i < match_count; i++)
2132                     list = g_list_prepend (list, g_match_info_fetch (match_info, i));
2133                 }
2134             }
2135         }
2136       else
2137         {
2138           /* if there was no match, copy to end of string. */
2139           if (!last_match_is_empty)
2140             {
2141               gchar *token = g_strndup (string + last_separator_end,
2142                                         match_info->string_len - last_separator_end);
2143               list = g_list_prepend (list, token);
2144             }
2145           /* no more tokens, end the loop. */
2146           break;
2147         }
2148
2149       /* -1 to leave room for the last part. */
2150       if (token_count >= max_tokens - 1)
2151         {
2152           /* we have reached the maximum number of tokens, so we copy
2153            * the remaining part of the string. */
2154           if (last_match_is_empty)
2155             {
2156               /* the last match was empty, so we have moved one char
2157                * after the real position to avoid empty matches at the
2158                * same position. */
2159               match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string;
2160             }
2161           /* the if is needed in the case we have terminated the available
2162            * tokens, but we are at the end of the string, so there are no
2163            * characters left to copy. */
2164           if (string_len > match_info->pos)
2165             {
2166               gchar *token = g_strndup (string + match_info->pos,
2167                                         string_len - match_info->pos);
2168               list = g_list_prepend (list, token);
2169             }
2170           /* end the loop. */
2171           break;
2172         }
2173
2174       last_separator_end = match_info->pos;
2175       if (last_match_is_empty)
2176         /* if the last match was empty, g_match_info_next() has moved
2177          * forward to avoid infinite loops, but we still need to copy that
2178          * character. */
2179         last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string;
2180
2181       match_ok = g_match_info_next (match_info, &tmp_error);
2182     }
2183   g_match_info_free (match_info);
2184   if (tmp_error != NULL)
2185     {
2186       g_propagate_error (error, tmp_error);
2187       g_list_free_full (list, g_free);
2188       match_info->pos = -1;
2189       return NULL;
2190     }
2191
2192   string_list = g_new (gchar *, g_list_length (list) + 1);
2193   i = 0;
2194   for (last = g_list_last (list); last; last = g_list_previous (last))
2195     string_list[i++] = last->data;
2196   string_list[i] = NULL;
2197   g_list_free (list);
2198
2199   return string_list;
2200 }
2201
2202 enum
2203 {
2204   REPL_TYPE_STRING,
2205   REPL_TYPE_CHARACTER,
2206   REPL_TYPE_SYMBOLIC_REFERENCE,
2207   REPL_TYPE_NUMERIC_REFERENCE,
2208   REPL_TYPE_CHANGE_CASE
2209 };
2210
2211 typedef enum
2212 {
2213   CHANGE_CASE_NONE         = 1 << 0,
2214   CHANGE_CASE_UPPER        = 1 << 1,
2215   CHANGE_CASE_LOWER        = 1 << 2,
2216   CHANGE_CASE_UPPER_SINGLE = 1 << 3,
2217   CHANGE_CASE_LOWER_SINGLE = 1 << 4,
2218   CHANGE_CASE_SINGLE_MASK  = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_LOWER_SINGLE,
2219   CHANGE_CASE_LOWER_MASK   = CHANGE_CASE_LOWER | CHANGE_CASE_LOWER_SINGLE,
2220   CHANGE_CASE_UPPER_MASK   = CHANGE_CASE_UPPER | CHANGE_CASE_UPPER_SINGLE
2221 } ChangeCase;
2222
2223 struct _InterpolationData
2224 {
2225   gchar     *text;
2226   gint       type;
2227   gint       num;
2228   gchar      c;
2229   ChangeCase change_case;
2230 };
2231
2232 static void
2233 free_interpolation_data (InterpolationData *data)
2234 {
2235   g_free (data->text);
2236   g_free (data);
2237 }
2238
2239 static const gchar *
2240 expand_escape (const gchar        *replacement,
2241                const gchar        *p,
2242                InterpolationData  *data,
2243                GError            **error)
2244 {
2245   const gchar *q, *r;
2246   gint x, d, h, i;
2247   const gchar *error_detail;
2248   gint base = 0;
2249   GError *tmp_error = NULL;
2250
2251   p++;
2252   switch (*p)
2253     {
2254     case 't':
2255       p++;
2256       data->c = '\t';
2257       data->type = REPL_TYPE_CHARACTER;
2258       break;
2259     case 'n':
2260       p++;
2261       data->c = '\n';
2262       data->type = REPL_TYPE_CHARACTER;
2263       break;
2264     case 'v':
2265       p++;
2266       data->c = '\v';
2267       data->type = REPL_TYPE_CHARACTER;
2268       break;
2269     case 'r':
2270       p++;
2271       data->c = '\r';
2272       data->type = REPL_TYPE_CHARACTER;
2273       break;
2274     case 'f':
2275       p++;
2276       data->c = '\f';
2277       data->type = REPL_TYPE_CHARACTER;
2278       break;
2279     case 'a':
2280       p++;
2281       data->c = '\a';
2282       data->type = REPL_TYPE_CHARACTER;
2283       break;
2284     case 'b':
2285       p++;
2286       data->c = '\b';
2287       data->type = REPL_TYPE_CHARACTER;
2288       break;
2289     case '\\':
2290       p++;
2291       data->c = '\\';
2292       data->type = REPL_TYPE_CHARACTER;
2293       break;
2294     case 'x':
2295       p++;
2296       x = 0;
2297       if (*p == '{')
2298         {
2299           p++;
2300           do
2301             {
2302               h = g_ascii_xdigit_value (*p);
2303               if (h < 0)
2304                 {
2305                   error_detail = _("hexadecimal digit or '}' expected");
2306                   goto error;
2307                 }
2308               x = x * 16 + h;
2309               p++;
2310             }
2311           while (*p != '}');
2312           p++;
2313         }
2314       else
2315         {
2316           for (i = 0; i < 2; i++)
2317             {
2318               h = g_ascii_xdigit_value (*p);
2319               if (h < 0)
2320                 {
2321                   error_detail = _("hexadecimal digit expected");
2322                   goto error;
2323                 }
2324               x = x * 16 + h;
2325               p++;
2326             }
2327         }
2328       data->type = REPL_TYPE_STRING;
2329       data->text = g_new0 (gchar, 8);
2330       g_unichar_to_utf8 (x, data->text);
2331       break;
2332     case 'l':
2333       p++;
2334       data->type = REPL_TYPE_CHANGE_CASE;
2335       data->change_case = CHANGE_CASE_LOWER_SINGLE;
2336       break;
2337     case 'u':
2338       p++;
2339       data->type = REPL_TYPE_CHANGE_CASE;
2340       data->change_case = CHANGE_CASE_UPPER_SINGLE;
2341       break;
2342     case 'L':
2343       p++;
2344       data->type = REPL_TYPE_CHANGE_CASE;
2345       data->change_case = CHANGE_CASE_LOWER;
2346       break;
2347     case 'U':
2348       p++;
2349       data->type = REPL_TYPE_CHANGE_CASE;
2350       data->change_case = CHANGE_CASE_UPPER;
2351       break;
2352     case 'E':
2353       p++;
2354       data->type = REPL_TYPE_CHANGE_CASE;
2355       data->change_case = CHANGE_CASE_NONE;
2356       break;
2357     case 'g':
2358       p++;
2359       if (*p != '<')
2360         {
2361           error_detail = _("missing '<' in symbolic reference");
2362           goto error;
2363         }
2364       q = p + 1;
2365       do
2366         {
2367           p++;
2368           if (!*p)
2369             {
2370               error_detail = _("unfinished symbolic reference");
2371               goto error;
2372             }
2373         }
2374       while (*p != '>');
2375       if (p - q == 0)
2376         {
2377           error_detail = _("zero-length symbolic reference");
2378           goto error;
2379         }
2380       if (g_ascii_isdigit (*q))
2381         {
2382           x = 0;
2383           do
2384             {
2385               h = g_ascii_digit_value (*q);
2386               if (h < 0)
2387                 {
2388                   error_detail = _("digit expected");
2389                   p = q;
2390                   goto error;
2391                 }
2392               x = x * 10 + h;
2393               q++;
2394             }
2395           while (q != p);
2396           data->num = x;
2397           data->type = REPL_TYPE_NUMERIC_REFERENCE;
2398         }
2399       else
2400         {
2401           r = q;
2402           do
2403             {
2404               if (!g_ascii_isalnum (*r))
2405                 {
2406                   error_detail = _("illegal symbolic reference");
2407                   p = r;
2408                   goto error;
2409                 }
2410               r++;
2411             }
2412           while (r != p);
2413           data->text = g_strndup (q, p - q);
2414           data->type = REPL_TYPE_SYMBOLIC_REFERENCE;
2415         }
2416       p++;
2417       break;
2418     case '0':
2419       /* if \0 is followed by a number is an octal number representing a
2420        * character, else it is a numeric reference. */
2421       if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0)
2422         {
2423           base = 8;
2424           p = g_utf8_next_char (p);
2425         }
2426     case '1':
2427     case '2':
2428     case '3':
2429     case '4':
2430     case '5':
2431     case '6':
2432     case '7':
2433     case '8':
2434     case '9':
2435       x = 0;
2436       d = 0;
2437       for (i = 0; i < 3; i++)
2438         {
2439           h = g_ascii_digit_value (*p);
2440           if (h < 0)
2441             break;
2442           if (h > 7)
2443             {
2444               if (base == 8)
2445                 break;
2446               else
2447                 base = 10;
2448             }
2449           if (i == 2 && base == 10)
2450             break;
2451           x = x * 8 + h;
2452           d = d * 10 + h;
2453           p++;
2454         }
2455       if (base == 8 || i == 3)
2456         {
2457           data->type = REPL_TYPE_STRING;
2458           data->text = g_new0 (gchar, 8);
2459           g_unichar_to_utf8 (x, data->text);
2460         }
2461       else
2462         {
2463           data->type = REPL_TYPE_NUMERIC_REFERENCE;
2464           data->num = d;
2465         }
2466       break;
2467     case 0:
2468       error_detail = _("stray final '\\'");
2469       goto error;
2470       break;
2471     default:
2472       error_detail = _("unknown escape sequence");
2473       goto error;
2474     }
2475
2476   return p;
2477
2478  error:
2479   /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */
2480   tmp_error = g_error_new (G_REGEX_ERROR,
2481                            G_REGEX_ERROR_REPLACE,
2482                            _("Error while parsing replacement "
2483                              "text \"%s\" at char %lu: %s"),
2484                            replacement,
2485                            (gulong)(p - replacement),
2486                            error_detail);
2487   g_propagate_error (error, tmp_error);
2488
2489   return NULL;
2490 }
2491
2492 static GList *
2493 split_replacement (const gchar  *replacement,
2494                    GError      **error)
2495 {
2496   GList *list = NULL;
2497   InterpolationData *data;
2498   const gchar *p, *start;
2499
2500   start = p = replacement;
2501   while (*p)
2502     {
2503       if (*p == '\\')
2504         {
2505           data = g_new0 (InterpolationData, 1);
2506           start = p = expand_escape (replacement, p, data, error);
2507           if (p == NULL)
2508             {
2509               g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
2510               free_interpolation_data (data);
2511
2512               return NULL;
2513             }
2514           list = g_list_prepend (list, data);
2515         }
2516       else
2517         {
2518           p++;
2519           if (*p == '\\' || *p == '\0')
2520             {
2521               if (p - start > 0)
2522                 {
2523                   data = g_new0 (InterpolationData, 1);
2524                   data->text = g_strndup (start, p - start);
2525                   data->type = REPL_TYPE_STRING;
2526                   list = g_list_prepend (list, data);
2527                 }
2528             }
2529         }
2530     }
2531
2532   return g_list_reverse (list);
2533 }
2534
/* Converts the character c according to change_case: lower case when any
 * CHANGE_CASE_LOWER_MASK bit is set in change_case, upper case whenever
 * none of those bits is set. */
#define CHANGE_CASE(c, change_case) \
        ((((change_case) & CHANGE_CASE_LOWER_MASK) != 0) \
                ? g_unichar_tolower (c) \
                : g_unichar_toupper (c))
2540
2541 static void
2542 string_append (GString     *string,
2543                const gchar *text,
2544                ChangeCase  *change_case)
2545 {
2546   gunichar c;
2547
2548   if (text[0] == '\0')
2549     return;
2550
2551   if (*change_case == CHANGE_CASE_NONE)
2552     {
2553       g_string_append (string, text);
2554     }
2555   else if (*change_case & CHANGE_CASE_SINGLE_MASK)
2556     {
2557       c = g_utf8_get_char (text);
2558       g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
2559       g_string_append (string, g_utf8_next_char (text));
2560       *change_case = CHANGE_CASE_NONE;
2561     }
2562   else
2563     {
2564       while (*text != '\0')
2565         {
2566           c = g_utf8_get_char (text);
2567           g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
2568           text = g_utf8_next_char (text);
2569         }
2570     }
2571 }
2572
2573 static gboolean
2574 interpolate_replacement (const GMatchInfo *match_info,
2575                          GString          *result,
2576                          gpointer          data)
2577 {
2578   GList *list;
2579   InterpolationData *idata;
2580   gchar *match;
2581   ChangeCase change_case = CHANGE_CASE_NONE;
2582
2583   for (list = data; list; list = list->next)
2584     {
2585       idata = list->data;
2586       switch (idata->type)
2587         {
2588         case REPL_TYPE_STRING:
2589           string_append (result, idata->text, &change_case);
2590           break;
2591         case REPL_TYPE_CHARACTER:
2592           g_string_append_c (result, CHANGE_CASE (idata->c, change_case));
2593           if (change_case & CHANGE_CASE_SINGLE_MASK)
2594             change_case = CHANGE_CASE_NONE;
2595           break;
2596         case REPL_TYPE_NUMERIC_REFERENCE:
2597           match = g_match_info_fetch (match_info, idata->num);
2598           if (match)
2599             {
2600               string_append (result, match, &change_case);
2601               g_free (match);
2602             }
2603           break;
2604         case REPL_TYPE_SYMBOLIC_REFERENCE:
2605           match = g_match_info_fetch_named (match_info, idata->text);
2606           if (match)
2607             {
2608               string_append (result, match, &change_case);
2609               g_free (match);
2610             }
2611           break;
2612         case REPL_TYPE_CHANGE_CASE:
2613           change_case = idata->change_case;
2614           break;
2615         }
2616     }
2617
2618   return FALSE;
2619 }
2620
2621 /* whether actual match_info is needed for replacement, i.e.
2622  * whether there are references
2623  */
2624 static gboolean
2625 interpolation_list_needs_match (GList *list)
2626 {
2627   while (list != NULL)
2628     {
2629       InterpolationData *data = list->data;
2630
2631       if (data->type == REPL_TYPE_SYMBOLIC_REFERENCE ||
2632           data->type == REPL_TYPE_NUMERIC_REFERENCE)
2633         {
2634           return TRUE;
2635         }
2636
2637       list = list->next;
2638     }
2639
2640   return FALSE;
2641 }
2642
2643 /**
2644  * g_regex_replace:
2645  * @regex: a #GRegex structure
2646  * @string: (array length=string_len): the string to perform matches against
2647  * @string_len: the length of @string, or -1 if @string is nul-terminated
2648  * @start_position: starting index of the string to match
2649  * @replacement: text to replace each match with
2650  * @match_options: options for the match
2651  * @error: location to store the error occurring, or %NULL to ignore errors
2652  *
2653  * Replaces all occurrences of the pattern in @regex with the
2654  * replacement text. Backreferences of the form '\number' or
2655  * '\g&lt;number&gt;' in the replacement text are interpolated by the
2656  * number-th captured subexpression of the match, '\g&lt;name&gt;' refers
2657  * to the captured subexpression with the given name. '\0' refers to the
2658  * complete match, but '\0' followed by a number is the octal representation
2659  * of a character. To include a literal '\' in the replacement, write '\\'.
 * There are also escapes that change the case of the following text:
2661  *
2662  * <variablelist>
2663  * <varlistentry><term>\l</term>
2664  * <listitem>
2665  * <para>Convert to lower case the next character</para>
2666  * </listitem>
2667  * </varlistentry>
2668  * <varlistentry><term>\u</term>
2669  * <listitem>
2670  * <para>Convert to upper case the next character</para>
2671  * </listitem>
2672  * </varlistentry>
2673  * <varlistentry><term>\L</term>
2674  * <listitem>
2675  * <para>Convert to lower case till \E</para>
2676  * </listitem>
2677  * </varlistentry>
2678  * <varlistentry><term>\U</term>
2679  * <listitem>
2680  * <para>Convert to upper case till \E</para>
2681  * </listitem>
2682  * </varlistentry>
2683  * <varlistentry><term>\E</term>
2684  * <listitem>
2685  * <para>End case modification</para>
2686  * </listitem>
2687  * </varlistentry>
2688  * </variablelist>
2689  *
2690  * If you do not need to use backreferences use g_regex_replace_literal().
2691  *
2692  * The @replacement string must be UTF-8 encoded even if #G_REGEX_RAW was
 * passed to g_regex_new(). If you want to use non-UTF-8 encoded strings
 * you can use g_regex_replace_literal().
2695  *
2696  * Setting @start_position differs from just passing over a shortened
2697  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that
2698  * begins with any kind of lookbehind assertion, such as "\b".
2699  *
2700  * Returns: a newly allocated string containing the replacements
2701  *
2702  * Since: 2.14
2703  */
2704 gchar *
2705 g_regex_replace (const GRegex      *regex,
2706                  const gchar       *string,
2707                  gssize             string_len,
2708                  gint               start_position,
2709                  const gchar       *replacement,
2710                  GRegexMatchFlags   match_options,
2711                  GError           **error)
2712 {
2713   gchar *result;
2714   GList *list;
2715   GError *tmp_error = NULL;
2716
2717   g_return_val_if_fail (regex != NULL, NULL);
2718   g_return_val_if_fail (string != NULL, NULL);
2719   g_return_val_if_fail (start_position >= 0, NULL);
2720   g_return_val_if_fail (replacement != NULL, NULL);
2721   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
2722   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2723
2724   list = split_replacement (replacement, &tmp_error);
2725   if (tmp_error != NULL)
2726     {
2727       g_propagate_error (error, tmp_error);
2728       return NULL;
2729     }
2730
2731   result = g_regex_replace_eval (regex,
2732                                  string, string_len, start_position,
2733                                  match_options,
2734                                  interpolate_replacement,
2735                                  (gpointer)list,
2736                                  &tmp_error);
2737   if (tmp_error != NULL)
2738     g_propagate_error (error, tmp_error);
2739
2740   g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
2741
2742   return result;
2743 }
2744
2745 static gboolean
2746 literal_replacement (const GMatchInfo *match_info,
2747                      GString          *result,
2748                      gpointer          data)
2749 {
2750   g_string_append (result, data);
2751   return FALSE;
2752 }
2753
2754 /**
2755  * g_regex_replace_literal:
2756  * @regex: a #GRegex structure
2757  * @string: (array length=string_len): the string to perform matches against
2758  * @string_len: the length of @string, or -1 if @string is nul-terminated
2759  * @start_position: starting index of the string to match
2760  * @replacement: text to replace each match with
2761  * @match_options: options for the match
2762  * @error: location to store the error occurring, or %NULL to ignore errors
2763  *
2764  * Replaces all occurrences of the pattern in @regex with the
2765  * replacement text. @replacement is replaced literally, to
2766  * include backreferences use g_regex_replace().
2767  *
2768  * Setting @start_position differs from just passing over a
2769  * shortened string and setting #G_REGEX_MATCH_NOTBOL in the
2770  * case of a pattern that begins with any kind of lookbehind
2771  * assertion, such as "\b".
2772  *
2773  * Returns: a newly allocated string containing the replacements
2774  *
2775  * Since: 2.14
2776  */
2777 gchar *
2778 g_regex_replace_literal (const GRegex      *regex,
2779                          const gchar       *string,
2780                          gssize             string_len,
2781                          gint               start_position,
2782                          const gchar       *replacement,
2783                          GRegexMatchFlags   match_options,
2784                          GError           **error)
2785 {
2786   g_return_val_if_fail (replacement != NULL, NULL);
2787   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2788
2789   return g_regex_replace_eval (regex,
2790                                string, string_len, start_position,
2791                                match_options,
2792                                literal_replacement,
2793                                (gpointer)replacement,
2794                                error);
2795 }
2796
2797 /**
2798  * g_regex_replace_eval:
2799  * @regex: a #GRegex structure from g_regex_new()
2800  * @string: (array length=string_len): string to perform matches against
2801  * @string_len: the length of @string, or -1 if @string is nul-terminated
2802  * @start_position: starting index of the string to match
2803  * @match_options: options for the match
2804  * @eval: a function to call for each match
2805  * @user_data: user data to pass to the function
2806  * @error: location to store the error occurring, or %NULL to ignore errors
2807  *
2808  * Replaces occurrences of the pattern in regex with the output of
2809  * @eval for that occurrence.
2810  *
2811  * Setting @start_position differs from just passing over a shortened
2812  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
2813  * that begins with any kind of lookbehind assertion, such as "\b".
2814  *
2815  * The following example uses g_regex_replace_eval() to replace multiple
2816  * strings at once:
2817  * |[
2818  * static gboolean
2819  * eval_cb (const GMatchInfo *info,
2820  *          GString          *res,
2821  *          gpointer          data)
2822  * {
2823  *   gchar *match;
2824  *   gchar *r;
2825  *
2826  *    match = g_match_info_fetch (info, 0);
2827  *    r = g_hash_table_lookup ((GHashTable *)data, match);
2828  *    g_string_append (res, r);
2829  *    g_free (match);
2830  *
2831  *    return FALSE;
2832  * }
2833  *
2834  * /&ast; ... &ast;/
2835  *
2836  * GRegex *reg;
2837  * GHashTable *h;
2838  * gchar *res;
2839  *
2840  * h = g_hash_table_new (g_str_hash, g_str_equal);
2841  *
2842  * g_hash_table_insert (h, "1", "ONE");
2843  * g_hash_table_insert (h, "2", "TWO");
2844  * g_hash_table_insert (h, "3", "THREE");
2845  * g_hash_table_insert (h, "4", "FOUR");
2846  *
2847  * reg = g_regex_new ("1|2|3|4", 0, 0, NULL);
2848  * res = g_regex_replace_eval (reg, text, -1, 0, 0, eval_cb, h, NULL);
2849  * g_hash_table_destroy (h);
2850  *
2851  * /&ast; ... &ast;/
2852  * ]|
2853  *
2854  * Returns: a newly allocated string containing the replacements
2855  *
2856  * Since: 2.14
2857  */
2858 gchar *
2859 g_regex_replace_eval (const GRegex        *regex,
2860                       const gchar         *string,
2861                       gssize               string_len,
2862                       gint                 start_position,
2863                       GRegexMatchFlags     match_options,
2864                       GRegexEvalCallback   eval,
2865                       gpointer             user_data,
2866                       GError             **error)
2867 {
2868   GMatchInfo *match_info;
2869   GString *result;
2870   gint str_pos = 0;
2871   gboolean done = FALSE;
2872   GError *tmp_error = NULL;
2873
2874   g_return_val_if_fail (regex != NULL, NULL);
2875   g_return_val_if_fail (string != NULL, NULL);
2876   g_return_val_if_fail (start_position >= 0, NULL);
2877   g_return_val_if_fail (eval != NULL, NULL);
2878   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2879
2880   if (string_len < 0)
2881     string_len = strlen (string);
2882
2883   result = g_string_sized_new (string_len);
2884
2885   /* run down the string making matches. */
2886   g_regex_match_full (regex, string, string_len, start_position,
2887                       match_options, &match_info, &tmp_error);
2888   while (!done && g_match_info_matches (match_info))
2889     {
2890       g_string_append_len (result,
2891                            string + str_pos,
2892                            match_info->offsets[0] - str_pos);
2893       done = (*eval) (match_info, result, user_data);
2894       str_pos = match_info->offsets[1];
2895       g_match_info_next (match_info, &tmp_error);
2896     }
2897   g_match_info_free (match_info);
2898   if (tmp_error != NULL)
2899     {
2900       g_propagate_error (error, tmp_error);
2901       g_string_free (result, TRUE);
2902       return NULL;
2903     }
2904
2905   g_string_append_len (result, string + str_pos, string_len - str_pos);
2906   return g_string_free (result, FALSE);
2907 }
2908
2909 /**
2910  * g_regex_check_replacement:
2911  * @replacement: the replacement string
2912  * @has_references: (out) (allow-none): location to store information about
2913  *   references in @replacement or %NULL
2914  * @error: location to store error
2915  *
2916  * Checks whether @replacement is a valid replacement string
2917  * (see g_regex_replace()), i.e. that all escape sequences in
2918  * it are valid.
2919  *
2920  * If @has_references is not %NULL then @replacement is checked
2921  * for pattern references. For instance, replacement text 'foo\n'
2922  * does not contain references and may be evaluated without information
2923  * about actual match, but '\0\1' (whole match followed by first
2924  * subpattern) requires valid #GMatchInfo object.
2925  *
2926  * Returns: whether @replacement is a valid replacement string
2927  *
2928  * Since: 2.14
2929  */
2930 gboolean
2931 g_regex_check_replacement (const gchar  *replacement,
2932                            gboolean     *has_references,
2933                            GError      **error)
2934 {
2935   GList *list;
2936   GError *tmp = NULL;
2937
2938   list = split_replacement (replacement, &tmp);
2939
2940   if (tmp)
2941   {
2942     g_propagate_error (error, tmp);
2943     return FALSE;
2944   }
2945
2946   if (has_references)
2947     *has_references = interpolation_list_needs_match (list);
2948
2949   g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
2950
2951   return TRUE;
2952 }
2953
2954 /**
2955  * g_regex_escape_nul:
2956  * @string: the string to escape
2957  * @length: the length of @string
2958  *
2959  * Escapes the nul characters in @string to "\x00".  It can be used
2960  * to compile a regex with embedded nul characters.
2961  *
2962  * For completeness, @length can be -1 for a nul-terminated string.
2963  * In this case the output string will be of course equal to @string.
2964  *
2965  * Returns: a newly-allocated escaped string
2966  *
2967  * Since: 2.30
2968  */
2969 gchar *
2970 g_regex_escape_nul (const gchar *string,
2971                     gint         length)
2972 {
2973   GString *escaped;
2974   const gchar *p, *piece_start, *end;
2975   gint backslashes;
2976
2977   g_return_val_if_fail (string != NULL, NULL);
2978
2979   if (length < 0)
2980     return g_strdup (string);
2981
2982   end = string + length;
2983   p = piece_start = string;
2984   escaped = g_string_sized_new (length + 1);
2985
2986   backslashes = 0;
2987   while (p < end)
2988     {
2989       switch (*p)
2990         {
2991         case '\0':
2992           if (p != piece_start)
2993             {
2994               /* copy the previous piece. */
2995               g_string_append_len (escaped, piece_start, p - piece_start);
2996             }
2997           if ((backslashes & 1) == 0)
2998             g_string_append_c (escaped, '\\');
2999           g_string_append_c (escaped, 'x');
3000           g_string_append_c (escaped, '0');
3001           g_string_append_c (escaped, '0');
3002           piece_start = ++p;
3003           backslashes = 0;
3004           break;
3005         case '\\':
3006           backslashes++;
3007           ++p;
3008           break;
3009         default:
3010           backslashes = 0;
3011           p = g_utf8_next_char (p);
3012           break;
3013         }
3014     }
3015
3016   if (piece_start < end)
3017     g_string_append_len (escaped, piece_start, end - piece_start);
3018
3019   return g_string_free (escaped, FALSE);
3020 }
3021
3022 /**
3023  * g_regex_escape_string:
3024  * @string: (array length=length): the string to escape
3025  * @length: the length of @string, or -1 if @string is nul-terminated
3026  *
3027  * Escapes the special characters used for regular expressions
3028  * in @string, for instance "a.b*c" becomes "a\.b\*c". This
3029  * function is useful to dynamically generate regular expressions.
3030  *
3031  * @string can contain nul characters that are replaced with "\0",
3032  * in this case remember to specify the correct length of @string
3033  * in @length.
3034  *
3035  * Returns: a newly-allocated escaped string
3036  *
3037  * Since: 2.14
3038  */
3039 gchar *
3040 g_regex_escape_string (const gchar *string,
3041                        gint         length)
3042 {
3043   GString *escaped;
3044   const char *p, *piece_start, *end;
3045
3046   g_return_val_if_fail (string != NULL, NULL);
3047
3048   if (length < 0)
3049     length = strlen (string);
3050
3051   end = string + length;
3052   p = piece_start = string;
3053   escaped = g_string_sized_new (length + 1);
3054
3055   while (p < end)
3056     {
3057       switch (*p)
3058         {
3059         case '\0':
3060         case '\\':
3061         case '|':
3062         case '(':
3063         case ')':
3064         case '[':
3065         case ']':
3066         case '{':
3067         case '}':
3068         case '^':
3069         case '$':
3070         case '*':
3071         case '+':
3072         case '?':
3073         case '.':
3074           if (p != piece_start)
3075             /* copy the previous piece. */
3076             g_string_append_len (escaped, piece_start, p - piece_start);
3077           g_string_append_c (escaped, '\\');
3078           if (*p == '\0')
3079             g_string_append_c (escaped, '0');
3080           else
3081             g_string_append_c (escaped, *p);
3082           piece_start = ++p;
3083           break;
3084         default:
3085           p = g_utf8_next_char (p);
3086           break;
3087         }
3088   }
3089
3090   if (piece_start < end)
3091     g_string_append_len (escaped, piece_start, end - piece_start);
3092
3093   return g_string_free (escaped, FALSE);
3094 }