glib/gregex.c

   1 /* GRegex -- regular expression API wrapper around PCRE.
   2  *
   3  * Copyright (C) 1999, 2000 Scott Wimer
   4  * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com>
   5  * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org>
   6  *
   7  * This library is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * This library is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with this library; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  20  */
  21
  22 #include <config.h>
  23
  24 #include "gregex.h"
  25
  26 #include <glib.h>
  27 #include <glib/gi18n.h>
  28 #include <string.h>
  29
  30 #ifdef USE_SYSTEM_PCRE
  31 #include <pcre.h>
  32 #else
  33 #include "pcre/pcre.h"
  34 #endif
  35
  36 #include "galias.h"
  37
  38 /* Mask of all the possible values for GRegexCompileFlags. */
  39 #define G_REGEX_COMPILE_MASK (G_REGEX_CASELESS          | \
  40                               G_REGEX_MULTILINE         | \
  41                               G_REGEX_DOTALL            | \
  42                               G_REGEX_EXTENDED          | \
  43                               G_REGEX_ANCHORED          | \
  44                               G_REGEX_DOLLAR_ENDONLY    | \
  45                               G_REGEX_UNGREEDY          | \
  46                               G_REGEX_RAW               | \
  47                               G_REGEX_NO_AUTO_CAPTURE   | \
  48                               G_REGEX_OPTIMIZE          | \
  49                               G_REGEX_DUPNAMES          | \
  50                               G_REGEX_NEWLINE_CR        | \
  51                               G_REGEX_NEWLINE_LF        | \
  52                               G_REGEX_NEWLINE_CRLF)
  53
  54 /* Mask of all the possible values for GRegexMatchFlags. */
  55 #define G_REGEX_MATCH_MASK (G_REGEX_MATCH_ANCHORED      | \
  56                             G_REGEX_MATCH_NOTBOL        | \
  57                             G_REGEX_MATCH_NOTEOL        | \
  58                             G_REGEX_MATCH_NOTEMPTY      | \
  59                             G_REGEX_MATCH_PARTIAL       | \
  60                             G_REGEX_MATCH_NEWLINE_CR    | \
  61                             G_REGEX_MATCH_NEWLINE_LF    | \
  62                             G_REGEX_MATCH_NEWLINE_CRLF  | \
  63                             G_REGEX_MATCH_NEWLINE_ANY)
  64
  65 /* if the string is in UTF-8 use g_utf8_ functions, else use
  66  * use just +/- 1. */
  67 #define NEXT_CHAR(re, s) (((re)->compile_opts & PCRE_UTF8) ? \
  68                                 g_utf8_next_char (s) : \
  69                                 ((s) + 1))
  70 #define PREV_CHAR(re, s) (((re)->compile_opts & PCRE_UTF8) ? \
  71                                 g_utf8_prev_char (s) : \
  72                                 ((s) - 1))
  73
  74 struct _GMatchInfo
  75 {
  76   GRegex *regex;                /* the regex */
  77   GRegexMatchFlags match_opts;  /* options used at match time on the regex */
  78   gint matches;                 /* number of matching sub patterns */
  79   gint pos;                     /* position in the string where last match left off */
  80   gint *offsets;                /* array of offsets paired 0,1 ; 2,3 ; 3,4 etc */
  81   gint n_offsets;               /* number of offsets */
  82   gint *workspace;              /* workspace for pcre_dfa_exec() */
  83   gint n_workspace;             /* number of workspace elements */
  84   const gchar *string;          /* string passed to the match function */
  85   gssize string_len;            /* length of string */
  86 };
  87
  88 struct _GRegex
  89 {
  90   volatile guint ref_count;     /* the ref count for the immutable part */
  91   gchar *pattern;               /* the pattern */
  92   pcre *pcre_re;                /* compiled form of the pattern */
  93   GRegexCompileFlags compile_opts;      /* options used at compile time on the pattern */
  94   GRegexMatchFlags match_opts;  /* options used at match time on the regex */
  95   pcre_extra *extra;            /* data stored when G_REGEX_OPTIMIZE is used */
  96 };
  97
  98 /* TRUE if ret is an error code, FALSE otherwise. */
  99 #define IS_PCRE_ERROR(ret) ((ret) < PCRE_ERROR_NOMATCH && (ret) != PCRE_ERROR_PARTIAL)
 100
 101 static GRegex   *regex_ref      (GRegex *regex);
 102 static void      regex_unref    (GRegex *regex);
 103
 104 typedef struct _InterpolationData InterpolationData;
 105 static gboolean  interpolate_replacement        (const GRegex *regex,
 106                                                  const GMatchInfo *match_info,
 107                                                  const gchar *string,
 108                                                  GString *result,
 109                                                  gpointer data);
 110 static GList    *split_replacement              (const gchar *replacement,
 111                                                  GError **error);
 112 static void      free_interpolation_data        (InterpolationData *data);
 113
 114
 115 static const gchar *
 116 match_error (gint errcode)
 117 {
 118   switch (errcode)
 119     {
 120     case PCRE_ERROR_NOMATCH:
 121       /* not an error */
 122       break;
 123     case PCRE_ERROR_NULL:
 124       /* NULL argument, this should not happen in GRegex */
 125       g_warning ("A NULL argument was passed to PCRE");
 126       break;
 127     case PCRE_ERROR_BADOPTION:
 128       return "bad options";
 129     case PCRE_ERROR_BADMAGIC:
 130       return _("corrupted object");
 131     case PCRE_ERROR_UNKNOWN_OPCODE:
 132       return N_("internal error or corrupted object");
 133     case PCRE_ERROR_NOMEMORY:
 134       return _("out of memory");
 135     case PCRE_ERROR_NOSUBSTRING:
 136       /* not used by pcre_exec() */
 137       break;
 138     case PCRE_ERROR_MATCHLIMIT:
 139       return _("backtracking limit reached");
 140     case PCRE_ERROR_CALLOUT:
 141       /* callouts are not implemented */
 142       break;
 143     case PCRE_ERROR_BADUTF8:
 144     case PCRE_ERROR_BADUTF8_OFFSET:
 145       /* we do not check if strings are valid */
 146       break;
 147     case PCRE_ERROR_PARTIAL:
 148       /* not an error */
 149       break;
 150     case PCRE_ERROR_BADPARTIAL:
 151       return _("the pattern contains items not supported for partial matching");
 152     case PCRE_ERROR_INTERNAL:
 153       return _("internal error");
 154     case PCRE_ERROR_BADCOUNT:
 155       /* negative ovecsize, this should not happen in GRegex */
 156       g_warning ("A negative ovecsize was passed to PCRE");
 157       break;
 158     case PCRE_ERROR_DFA_UITEM:
 159       return _("the pattern contains items not supported for partial matching");
 160     case PCRE_ERROR_DFA_UCOND:
 161       return _("back references as conditions are not supported for partial matching");
 162     case PCRE_ERROR_DFA_UMLIMIT:
 163       /* the match_field field is not used in GRegex */
 164       break;
 165     case PCRE_ERROR_DFA_WSSIZE:
 166       /* handled expanding the workspace */
 167       break;
 168     case PCRE_ERROR_DFA_RECURSE:
 169     case PCRE_ERROR_RECURSIONLIMIT:
 170       return _("recursion limit reached");
 171     case PCRE_ERROR_NULLWSLIMIT:
 172       return _("workspace limit for empty substrings reached");
 173     case PCRE_ERROR_BADNEWLINE:
 174       return _("invalid combination of newline flags");
 175     default:
 176       break;
 177     }
 178   return _("unknown error");
 179 }
 180
 181
 182 /* GMatchInfo */
 183
 184 static GMatchInfo *
 185 match_info_new (const GRegex *regex,
 186                 const gchar  *string,
 187                 gint          string_len,
 188                 gint          start_position,
 189                 gint          match_options,
 190                 gboolean      is_dfa)
 191 {
 192   GMatchInfo *match_info;
 193
 194   if (string_len < 0)
 195     string_len = strlen (string);
 196
 197   match_info = g_new0 (GMatchInfo, 1);
 198   match_info->regex = regex_ref ((GRegex *)regex);
 199   match_info->string = string;
 200   match_info->string_len = string_len;
 201   match_info->matches = PCRE_ERROR_NOMATCH;
 202   match_info->pos = start_position;
 203   match_info->match_opts = match_options;
 204
 205   if (is_dfa)
 206     {
 207       /* These values should be enough for most cases, if they are not
 208        * enough g_regex_match_all_full() will expand them. */
 209       match_info->n_offsets = 24;
 210       match_info->n_workspace = 100;
 211       match_info->workspace = g_new (gint, match_info->n_workspace);
 212     }
 213   else
 214     {
 215       gint capture_count;
 216       pcre_fullinfo (regex->pcre_re, regex->extra,
 217                      PCRE_INFO_CAPTURECOUNT, &capture_count);
 218       match_info->n_offsets = (capture_count + 1) * 3;
 219     }
 220   match_info->offsets = g_new0 (gint, match_info->n_offsets);
 221
 222   return match_info;
 223 }
 224
 225 /**
 226  * g_match_info_free:
 227  * @match_info: a #GMatchInfo
 228  *
 229  * Frees all the memory associated with the #GMatchInfo structure.
 230  *
 231  * Since: 2.14
 232  */
 233 void
 234 g_match_info_free (GMatchInfo *match_info)
 235 {
 236   regex_unref (match_info->regex);
 237   g_free (match_info->offsets);
 238   g_free (match_info->workspace);
 239   g_free (match_info);
 240 }
 241
 242 /**
 243  * g_match_info_next:
 244  * @match_info: a #GMatchInfo structure
 245  * @error: location to store the error occuring, or NULL to ignore errors
 246  *
 247  * Scans for the next match using the same parameters of the previous
 248  * call to g_regex_match_full() or g_regex_match() that returned
 249  * @match_info.
 250  *
 251  * The match is done on the string passed to the match function, so you
 252  * cannot free it before calling this function.
 253  *
 254  * Returns: %TRUE is the string matched, %FALSE otherwise
 255  *
 256  * Since: 2.14
 257  */
 258 gboolean
 259 g_match_info_next (GMatchInfo  *match_info,
 260                    GError     **error)
 261 {
 262   g_return_val_if_fail (match_info != NULL, FALSE);
 263   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
 264   g_return_val_if_fail (match_info->pos >= 0, FALSE);
 265
 266   match_info->matches = pcre_exec (match_info->regex->pcre_re,
 267                                    match_info->regex->extra,
 268                                    match_info->string,
 269                                    match_info->string_len,
 270                                    match_info->pos,
 271                                    match_info->regex->match_opts |
 272                                    match_info->match_opts,
 273                                    match_info->offsets,
 274                                    match_info->n_offsets);
 275   if (IS_PCRE_ERROR (match_info->matches))
 276     {
 277       g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
 278                    _("Error while matching regular expression %s: %s"),
 279                    match_info->regex->pattern, match_error (match_info->matches));
 280       return FALSE;
 281     }
 282
 283   /* avoid infinite loops if the pattern is an empty string or something
 284    * equivalent */
 285   if (match_info->pos == match_info->offsets[1])
 286     {
 287       if (match_info->pos > match_info->string_len)
 288         {
 289           /* we have reached the end of the string */
 290           match_info->pos = -1;
 291           match_info->matches = PCRE_ERROR_NOMATCH;
 292           return FALSE;
 293         }
 294       match_info->pos = NEXT_CHAR (match_info->regex,
 295                                    &match_info->string[match_info->pos]) -
 296                                    match_info->string;
 297     }
 298   else
 299     {
 300       match_info->pos = match_info->offsets[1];
 301     }
 302
 303   return match_info->matches >= 0;
 304 }
 305
 306 /**
 307  * g_match_info_matches:
 308  * @match_info: a #GMatchInfo structure
 309  *
 310  * Returns: %TRUE if the previous match operation succeeded, %FALSE
 311  * otherwise
 312  *
 313  * Since: 2.14
 314  */
 315 gboolean
 316 g_match_info_matches (const GMatchInfo *match_info)
 317 {
 318   g_return_val_if_fail (match_info != NULL, FALSE);
 319
 320   return match_info->matches >= 0;
 321 }
 322
 323 /**
 324  * g_match_info_get_match_count:
 325  * @match_info: a #GMatchInfo structure
 326  *
 327  * Retrieves the number of matched substrings (including substring 0, that
 328  * is the whole matched text), so 1 is returned if the pattern has no
 329  * substrings in it and 0 is returned if the match failed.
 330  *
 331  * If the last match was obtained using the DFA algorithm, that is using
 332  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
 333  * count is not that of the number of capturing parentheses but that of
 334  * the number of matched substrings.
 335  *
 336  * Returns:  Number of matched substrings, or -1 if an error occurred
 337  *
 338  * Since: 2.14
 339  */
 340 gint
 341 g_match_info_get_match_count (const GMatchInfo *match_info)
 342 {
 343   g_return_val_if_fail (match_info, -1);
 344
 345   if (match_info->matches == PCRE_ERROR_NOMATCH)
 346     /* no match */
 347     return 0;
 348   else if (match_info->matches < PCRE_ERROR_NOMATCH)
 349     /* error */
 350     return -1;
 351   else
 352     /* match */
 353     return match_info->matches;
 354 }
 355
 356 /**
 357  * g_match_info_is_partial_match:
 358  * @match_info: a #GMatchInfo structure
 359  *
 360  * Usually if the string passed to g_regex_match*() matches as far as
 361  * it goes, but is too short to match the entire pattern, %FALSE is
 362  * returned. There are circumstances where it might be helpful to
 363  * distinguish this case from other cases in which there is no match.
 364  *
 365  * Consider, for example, an application where a human is required to
 366  * type in data for a field with specific formatting requirements. An
 367  * example might be a date in the form ddmmmyy, defined by the pattern
 368  * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$".
 369  * If the application sees the user’s keystrokes one by one, and can
 370  * check that what has been typed so far is potentially valid, it is
 371  * able to raise an error as soon as a mistake is made.
 372  *
 373  * GRegex supports the concept of partial matching by means of the
 374  * #G_REGEX_MATCH_PARTIAL flag. When this is set the return code for
 375  * g_regex_match() or g_regex_match_full() is, as usual, %TRUE
 376  * for a complete match, %FALSE otherwise. But, when this functions
 377  * returns %FALSE, you can check if the match was partial calling
 378  * g_match_info_is_partial_match().
 379  *
 380  * When using partial matching you cannot use g_match_info_fetch*().
 381  *
 382  * Because of the way certain internal optimizations are implemented the
 383  * partial matching algorithm cannot be used with all patterns. So repeated
 384  * single characters such as "a{2,4}" and repeated single metasequences such
 385  * as "\d+" are not permitted if the maximum number of occurrences is
 386  * greater than one. Optional items such as "\d?" (where the maximum is one)
 387  * are permitted. Quantifiers with any values are permitted after
 388  * parentheses, so the invalid examples above can be coded thus "(a){2,4}"
 389  * and "(\d)+". If #G_REGEX_MATCH_PARTIAL is set for a pattern that does
 390  * not conform to the restrictions, matching functions return an error.
 391  *
 392  * Returns: %TRUE if the match was partial, %FALSE otherwise
 393  *
 394  * Since: 2.14
 395  */
 396 gboolean
 397 g_match_info_is_partial_match (const GMatchInfo *match_info)
 398 {
 399   g_return_val_if_fail (match_info != NULL, FALSE);
 400
 401   return match_info->matches == PCRE_ERROR_PARTIAL;
 402 }
 403
 404 /**
 405  * g_match_info_expand_references:
 406  * @match_info: a #GMatchInfo
 407  * @string_to_expand: the string to expand
 408  * @error: location to store the error occuring, or %NULL to ignore errors
 409  *
 410  * Returns a new string containing the text in @string_to_expand with
 411  * references expanded. References refer to the last match done with
 412  * @string against @regex and have the same syntax used by g_regex_replace().
 413  *
 414  * The @string_to_expand must be UTF-8 encoded even if #G_REGEX_RAW was
 415  * passed to g_regex_new().
 416  *
 417  * The backreferences are extracted from the string passed to the match
 418  * function, so you cannot call this function after freeing the string.
 419  *
 420  * Returns: the expanded string, or %NULL if an error occurred
 421  *
 422  * Since: 2.14
 423  */
 424 gchar *
 425 g_match_info_expand_references (const GMatchInfo *match_info,
 426                                 const gchar      *string_to_expand,
 427                                 GError          **error)
 428 {
 429   GString *result;
 430   GList *list;
 431   GError *tmp_error = NULL;
 432
 433   g_return_val_if_fail (match_info != NULL, NULL);
 434   g_return_val_if_fail (string_to_expand != NULL, NULL);
 435   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
 436
 437   list = split_replacement (string_to_expand, &tmp_error);
 438   if (tmp_error != NULL)
 439     {
 440       g_propagate_error (error, tmp_error);
 441       return NULL;
 442     }
 443
 444   result = g_string_sized_new (strlen (string_to_expand));
 445   interpolate_replacement (match_info->regex, match_info,
 446                            match_info->string, result, list);
 447
 448   g_list_foreach (list, (GFunc)free_interpolation_data, NULL);
 449   g_list_free (list);
 450
 451   return g_string_free (result, FALSE);
 452 }
 453
 454 /**
 455  * g_match_info_fetch:
 456  * @match_info: #GMatchInfo structure
 457  * @match_num: number of the sub expression
 458  *
 459  * Retrieves the text matching the @match_num<!-- -->'th capturing parentheses.
 460  * 0 is the full text of the match, 1 is the first paren set, 2 the second,
 461  * and so on.
 462  *
 463  * If @match_num is a valid sub pattern but it didn't match anything (e.g.
 464  * sub pattern 1, matching "b" against "(a)?b") then an empty string is
 465  * returned.
 466  *
 467  * If the match was obtained using the DFA algorithm, that is using
 468  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
 469  * string is not that of a set of parentheses but that of a matched
 470  * substring. Substrings are matched in reverse order of length, so 0 is
 471  * the longest match.
 472  *
 473  * The string is fetched from the string passed to the match function,
 474  * so you cannot call this function after freeing the string.
 475  *
 476  * Returns: The matched substring, or %NULL if an error occurred.
 477  *          You have to free the string yourself
 478  *
 479  * Since: 2.14
 480  */
 481 gchar *
 482 g_match_info_fetch (const GMatchInfo *match_info,
 483                     gint              match_num)
 484 {
 485   /* we cannot use pcre_get_substring() because it allocates the
 486    * string using pcre_malloc(). */
 487   gchar *match = NULL;
 488   gint start, end;
 489
 490   g_return_val_if_fail (match_info != NULL, NULL);
 491   g_return_val_if_fail (match_num >= 0, NULL);
 492
 493   /* match_num does not exist or it didn't matched, i.e. matching "b"
 494    * against "(a)?b" then group 0 is empty. */
 495   if (!g_match_info_fetch_pos (match_info, match_num, &start, &end))
 496     match = NULL;
 497   else if (start == -1)
 498     match = g_strdup ("");
 499   else
 500     match = g_strndup (&match_info->string[start], end - start);
 501
 502   return match;
 503 }
 504
 505 /**
 506  * g_match_info_fetch_pos:
 507  * @match_info: #GMatchInfo structure
 508  * @match_num: number of the sub expression
 509  * @start_pos: pointer to location where to store the start position
 510  * @end_pos: pointer to location where to store the end position
 511  *
 512  * Retrieves the position of the @match_num<!-- -->'th capturing parentheses.
 513  * 0 is the full text of the match, 1 is the first paren set, 2 the second,
 514  * and so on.
 515  *
 516  * If @match_num is a valid sub pattern but it didn't match anything (e.g.
 517  * sub pattern 1, matching "b" against "(a)?b") then @start_pos and @end_pos
 518  * are set to -1 and %TRUE is returned.
 519  *
 520  * If the match was obtained using the DFA algorithm, that is using
 521  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
 522  * position is not that of a set of parentheses but that of a matched
 523  * substring. Substrings are matched in reverse order of length, so 0 is
 524  * the longest match.
 525  *
 526  * Returns: %TRUE if the position was fetched, %FALSE otherwise. If the
 527  *          position cannot be fetched, @start_pos and @end_pos are left
 528  *          unchanged.
 529  *
 530  * Since: 2.14
 531  */
 532 gboolean
 533 g_match_info_fetch_pos (const GMatchInfo *match_info,
 534                         gint              match_num,
 535                         gint             *start_pos,
 536                         gint             *end_pos)
 537 {
 538   g_return_val_if_fail (match_info != NULL, FALSE);
 539   g_return_val_if_fail (match_num >= 0, FALSE);
 540
 541   /* make sure the sub expression number they're requesting is less than
 542    * the total number of sub expressions that were matched. */
 543   if (match_num >= match_info->matches)
 544     return FALSE;
 545
 546   if (start_pos != NULL)
 547     *start_pos = match_info->offsets[2 * match_num];
 548
 549   if (end_pos != NULL)
 550     *end_pos = match_info->offsets[2 * match_num + 1];
 551
 552   return TRUE;
 553 }
 554
 555 /*
 556  * Returns number of first matched subpattern with name @name.
 557  * There may be more than one in case when DUPNAMES is used,
 558  * and not all subpatterns with that name match;
 559  * pcre_get_stringnumber() does not work in that case.
 560  */
 561 static gint
 562 get_matched_substring_number (const GMatchInfo *match_info,
 563                               const gchar      *name)
 564 {
 565   gint entrysize;
 566   gchar *first, *last;
 567   guchar *entry;
 568
 569   if ((match_info->regex->compile_opts & G_REGEX_DUPNAMES) == 0)
 570     return pcre_get_stringnumber (match_info->regex->pcre_re, name);
 571
 572   /* This code is copied from pcre_get.c: get_first_set() */
 573   entrysize = pcre_get_stringtable_entries (match_info->regex->pcre_re,
 574                                             name,
 575                                             &first,
 576                                             &last);
 577
 578   if (entrysize <= 0)
 579     return entrysize;
 580
 581   for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize)
 582     {
 583       gint n = (entry[0] << 8) + entry[1];
 584       if (match_info->offsets[n*2] >= 0)
 585         return n;
 586     }
 587
 588   return (first[0] << 8) + first[1];
 589 }
 590
 591 /**
 592  * g_match_info_fetch_named:
 593  * @match_info: #GMatchInfo structure
 594  * @name: name of the subexpression
 595  *
 596  * Retrieves the text matching the capturing parentheses named @name.
 597  *
 598  * If @name is a valid sub pattern name but it didn't match anything (e.g.
 599  * sub pattern "X", matching "b" against "(?P&lt;X&gt;a)?b") then an empty
 600  * string is returned.
 601  *
 602  * The string is fetched from the string passed to the match function,
 603  * so you cannot call this function after freeing the string.
 604  *
 605  * Returns: The matched substring, or %NULL if an error occurred.
 606  *          You have to free the string yourself
 607  *
 608  * Since: 2.14
 609  */
 610 gchar *
 611 g_match_info_fetch_named (const GMatchInfo *match_info,
 612                           const gchar      *name)
 613 {
 614   /* we cannot use pcre_get_named_substring() because it allocates the
 615    * string using pcre_malloc(). */
 616   gint num;
 617
 618   g_return_val_if_fail (match_info != NULL, NULL);
 619   g_return_val_if_fail (name != NULL, NULL);
 620
 621   num = get_matched_substring_number (match_info, name);
 622   if (num < 0)
 623     return NULL;
 624   else
 625     return g_match_info_fetch (match_info, num);
 626 }
 627
 628 /**
 629  * g_match_info_fetch_named_pos:
 630  * @match_info: #GMatchInfo structure
 631  * @name: name of the subexpression
 632  * @start_pos: pointer to location where to store the start position
 633  * @end_pos: pointer to location where to store the end position
 634  *
 635  * Retrieves the position of the capturing parentheses named @name.
 636  *
 637  * If @name is a valid sub pattern name but it didn't match anything (e.g.
 638  * sub pattern "X", matching "b" against "(?P&lt;X&gt;a)?b") then @start_pos and
 639  * @end_pos are set to -1 and %TRUE is returned.
 640  *
 641  * Returns: %TRUE if the position was fetched, %FALSE otherwise. If the
 642  *          position cannot be fetched, @start_pos and @end_pos are left
 643  *          unchanged.
 644  *
 645  * Since: 2.14
 646  */
 647 gboolean
 648 g_match_info_fetch_named_pos (const GMatchInfo *match_info,
 649                               const gchar      *name,
 650                               gint             *start_pos,
 651                               gint             *end_pos)
 652 {
 653   gint num;
 654
 655   g_return_val_if_fail (match_info != NULL, FALSE);
 656   g_return_val_if_fail (name != NULL, FALSE);
 657
 658   num = get_matched_substring_number (match_info, name);
 659   if (num < 0)
 660     return FALSE;
 661
 662   return g_match_info_fetch_pos (match_info, num, start_pos, end_pos);
 663 }
 664
 665 /**
 666  * g_match_info_fetch_all:
 667  * @match_info: a #GMatchInfo structure
 668  *
 669  * Bundles up pointers to each of the matching substrings from a match
 670  * and stores them in an array of gchar pointers. The first element in
 671  * the returned array is the match number 0, i.e. the entire matched
 672  * text.
 673  *
 674  * If a sub pattern didn't match anything (e.g. sub pattern 1, matching
 675  * "b" against "(a)?b") then an empty string is inserted.
 676  *
 677  * If the last match was obtained using the DFA algorithm, that is using
 678  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
 679  * strings are not that matched by sets of parentheses but that of the
 680  * matched substring. Substrings are matched in reverse order of length,
 681  * so the first one is the longest match.
 682  *
 683  * The strings are fetched from the string passed to the match function,
 684  * so you cannot call this function after freeing the string.
 685  *
 686  * Returns: a %NULL-terminated array of gchar * pointers. It must be freed
 687  *          using g_strfreev(). If the previous match failed %NULL is
 688  *          returned.
 689  *
 690  * Since: 2.14
 691  */
 692 gchar **
 693 g_match_info_fetch_all (const GMatchInfo *match_info)
 694 {
 695   /* we cannot use pcre_get_substring_list() because the returned value
 696    * isn't suitable for g_strfreev(). */
 697   gchar **result;
 698   gint i;
 699
 700   g_return_val_if_fail (match_info != NULL, FALSE);
 701
 702   if (match_info->matches < 0)
 703     return NULL;
 704
 705   result = g_new (gchar *, match_info->matches + 1);
 706   for (i = 0; i < match_info->matches; i++)
 707     result[i] = g_match_info_fetch (match_info, i);
 708   result[i] = NULL;
 709
 710   return result;
 711 }
 712
 713
 714 /* GRegex */
 715
 716 GQuark
 717 g_regex_error_quark (void)
 718 {
 719   static GQuark error_quark = 0;
 720
 721   if (error_quark == 0)
 722     error_quark = g_quark_from_static_string ("g-regex-error-quark");
 723
 724   return error_quark;
 725 }
 726
 727 static GRegex *
 728 regex_ref (GRegex *regex)
 729 {
 730   g_atomic_int_inc ((gint*) &regex->ref_count);
 731   return regex;
 732 }
 733
 734 static void
 735 regex_unref (GRegex *regex)
 736 {
 737   if (g_atomic_int_exchange_and_add ((gint *) &regex->ref_count, -1) - 1 == 0)
 738     {
 739       g_free (regex->pattern);
 740       if (regex->pcre_re != NULL)
 741         pcre_free (regex->pcre_re);
 742       if (regex->extra != NULL)
 743         pcre_free (regex->extra);
 744       g_free (regex);
 745     }
 746 }
 747
 748 /**
 749  * g_regex_new:
 750  * @pattern: the regular expression
 751  * @compile_options: compile options for the regular expression
 752  * @match_options: match options for the regular expression
 753  * @error: return location for a #GError
 754  *
 755  * Compiles the regular expression to an internal form, and does the initial
 756  * setup of the #GRegex structure.
 757  *
 758  * Returns: a #GRegex structure
 759  *
 760  * Since: 2.14
 761  */
 762 GRegex *
 763 g_regex_new (const gchar         *pattern,
 764              GRegexCompileFlags   compile_options,
 765              GRegexMatchFlags     match_options,
 766              GError             **error)
 767 {
 768   GRegex *regex;
 769   pcre *re;
 770   const gchar *errmsg;
 771   gint erroffset;
 772   gboolean optimize = FALSE;
 773   static gboolean initialized = FALSE;
 774
 775   g_return_val_if_fail (pattern != NULL, NULL);
 776   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
 777   g_return_val_if_fail ((compile_options & ~G_REGEX_COMPILE_MASK) == 0, NULL);
 778   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
 779
 780   if (!initialized)
 781     {
 782       gint support;
 783       const gchar *msg;
 784
 785       pcre_config (PCRE_CONFIG_UTF8, &support);
 786       if (!support)
 787         {
 788           msg = N_("PCRE library is compiled without UTF8 support");
 789           g_critical (msg);
 790           g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg));
 791           return NULL;
 792         }
 793
 794       pcre_config (PCRE_CONFIG_UNICODE_PROPERTIES, &support);
 795       if (!support)
 796         {
 797           msg = N_("PCRE library is compiled without UTF8 properties support");
 798           g_critical (msg);
 799           g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg));
 800           return NULL;
 801         }
 802
 803       initialized = TRUE;
 804     }
 805
 806   /* G_REGEX_OPTIMIZE has the same numeric value of PCRE_NO_UTF8_CHECK,
 807    * as we do not need to wrap PCRE_NO_UTF8_CHECK. */
 808   if (compile_options & G_REGEX_OPTIMIZE)
 809     optimize = TRUE;
 810
 811   /* In GRegex the string are, by default, UTF-8 encoded. PCRE
 812    * instead uses UTF-8 only if required with PCRE_UTF8. */
 813   if (compile_options & G_REGEX_RAW)
 814     {
 815       /* disable utf-8 */
 816       compile_options &= ~G_REGEX_RAW;
 817     }
 818   else
 819     {
 820       /* enable utf-8 */
 821       compile_options |= PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
 822       match_options |= PCRE_NO_UTF8_CHECK;
 823     }
 824
 825   /* PCRE_NEWLINE_ANY is the default for the internal PCRE but
 826    * not for the system one. */
 827   if (!(compile_options & G_REGEX_NEWLINE_CR) &&
 828       !(compile_options & G_REGEX_NEWLINE_LF))
 829     {
 830       compile_options |= PCRE_NEWLINE_ANY;
 831     }
 832
 833   /* compile the pattern */
 834   re = pcre_compile (pattern, compile_options, &errmsg, &erroffset, NULL);
 835
 836   /* if the compilation failed, set the error member and return
 837    * immediately */
 838   if (re == NULL)
 839     {
 840       GError *tmp_error = g_error_new (G_REGEX_ERROR,
 841                                        G_REGEX_ERROR_COMPILE,
 842                                        _("Error while compiling regular "
 843                                          "expression %s at char %d: %s"),
 844                                        pattern, erroffset, errmsg);
 845       g_propagate_error (error, tmp_error);
 846
 847       return NULL;
 848     }
 849
 850   regex = g_new0 (GRegex, 1);
 851   regex->ref_count = 1;
 852   regex->pattern = g_strdup (pattern);
 853   regex->pcre_re = re;
 854   regex->compile_opts = compile_options;
 855   regex->match_opts = match_options;
 856
 857   if (optimize)
 858     {
 859       regex->extra = pcre_study (regex->pcre_re, 0, &errmsg);
 860       if (errmsg != NULL)
 861         {
 862           GError *tmp_error = g_error_new (G_REGEX_ERROR,
 863                                            G_REGEX_ERROR_OPTIMIZE,
 864                                            _("Error while optimizing "
 865                                              "regular expression %s: %s"),
 866                                            regex->pattern,
 867                                            errmsg);
 868           g_propagate_error (error, tmp_error);
 869           return NULL;
 870         }
 871     }
 872
 873   return regex;
 874 }
 875
 876 /**
 877  * g_regex_free:
 878  * @regex: a #GRegex
 879  *
 880  * Frees all the memory associated with the regex structure.
 881  *
 882  * Since: 2.14
 883  */
 884 void
 885 g_regex_free (GRegex *regex)
 886 {
 887   if (regex == NULL)
 888     return;
 889
 890   regex_unref (regex);
 891 }
 892
 893 /**
 894  * g_regex_get_pattern:
 895  * @regex: a #GRegex structure
 896  *
 897  * Gets the pattern string associated with @regex, i.e. a copy of
 898  * the string passed to g_regex_new().
 899  *
 900  * Returns: the pattern of @regex
 901  *
 902  * Since: 2.14
 903  */
 904 const gchar *
 905 g_regex_get_pattern (const GRegex *regex)
 906 {
 907   g_return_val_if_fail (regex != NULL, NULL);
 908
 909   return regex->pattern;
 910 }
 911
 912 /**
 913  * g_regex_get_max_backref:
 914  * @regex: a #GRegex
 915  *
 916  * Returns the number of the highest back reference
 917  * in the pattern, or 0 if the pattern does not contain
 918  * back references.
 919  *
 920  * Returns: the number of the highest back reference.
 921  *
 922  * Since: 2.14
 923  */
 924 gint
 925 g_regex_get_max_backref (const GRegex *regex)
 926 {
 927   gint value;
 928
 929   pcre_fullinfo (regex->pcre_re, regex->extra,
 930                  PCRE_INFO_BACKREFMAX, &value);
 931
 932   return value;
 933 }
 934
 935 /**
 936  * g_regex_get_capture_count:
 937  * @regex: a #GRegex
 938  *
 939  * Returns the number of capturing subpatterns in the pattern.
 940  *
 941  * Returns: the number of capturing subpatterns.
 942  *
 943  * Since: 2.14
 944  */
 945 gint
 946 g_regex_get_capture_count (const GRegex *regex)
 947 {
 948   gint value;
 949
 950   pcre_fullinfo (regex->pcre_re, regex->extra,
 951                  PCRE_INFO_CAPTURECOUNT, &value);
 952
 953   return value;
 954 }
 955
 956 /**
 957  * g_regex_match_simple:
 958  * @pattern: the regular expression
 959  * @string: the string to scan for matches
 960  * @compile_options: compile options for the regular expression
 961  * @match_options: match options
 962  *
 963  * Scans for a match in @string for @pattern.
 964  *
 965  * This function is equivalent to g_regex_match() but it does not
 966  * require to compile the pattern with g_regex_new(), avoiding some
 967  * lines of code when you need just to do a match without extracting
 968  * substrings, capture counts, and so on.
 969  *
 970  * If this function is to be called on the same @pattern more than
 971  * once, it's more efficient to compile the pattern once with
 972  * g_regex_new() and then use g_regex_match().
 973  *
 974  * Returns: %TRUE is the string matched, %FALSE otherwise
 975  *
 976  * Since: 2.14
 977  */
 978 gboolean
 979 g_regex_match_simple (const gchar        *pattern,
 980                       const gchar        *string,
 981                       GRegexCompileFlags  compile_options,
 982                       GRegexMatchFlags    match_options)
 983 {
 984   GRegex *regex;
 985   gboolean result;
 986
 987   regex = g_regex_new (pattern, compile_options, 0, NULL);
 988   if (!regex)
 989     return FALSE;
 990   result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL);
 991   g_regex_free (regex);
 992   return result;
 993 }
 994
 995 /**
 996  * g_regex_match:
 997  * @regex: a #GRegex structure from g_regex_new()
 998  * @string: the string to scan for matches
 999  * @match_options: match options
1000  * @match_info: pointer to location where to store the #GMatchInfo, or
1001  * %NULL if you do not need it
1002  *
1003  * Scans for a match in string for the pattern in @regex. The @match_options
1004  * are combined with the match options specified when the @regex structure
1005  * was created, letting you have more flexibility in reusing #GRegex
1006  * structures.
1007  *
1008  * A #GMatchInfo structure, used to get information on the match, is stored
1009  * in @match_info if not %NULL.
1010  *
1011  * To retrieve all the non-overlapping matches of the pattern in string you
1012  * can use g_match_info_next().
1013  *
1014  * <informalexample><programlisting>
1015  * static void
1016  * print_uppercase_words (const gchar *string)
1017  * {
1018  *   /&ast; Print all uppercase-only words. &ast;/
1019  *   GRegex *regex;
1020  *   GMatchInfo *match_info;
1021  *   &nbsp;
1022  *   regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
1023  *   g_regex_match (regex, string, 0, &match_info);
1024  *   while (g_match_info_matches (match_info))
1025  *     {
1026  *       gchar *word = g_match_info_fetch (match_info, 0);
1027  *       g_print ("Found: %s\n", word);
1028  *       g_free (word);
1029  *       g_match_info_next (match_info, NULL);
1030  *     }
1031  *   g_match_info_free (match_info);
1032  *   g_regex_free (regex);
1033  * }
1034  * </programlisting></informalexample>
1035  *
1036  * Returns: %TRUE is the string matched, %FALSE otherwise
1037  *
1038  * Since: 2.14
1039  */
1040 gboolean
1041 g_regex_match (const GRegex    *regex,
1042                const gchar     *string,
1043                GRegexMatchFlags match_options,
1044                GMatchInfo     **match_info)
1045 {
1046   return g_regex_match_full (regex, string, -1, 0, match_options,
1047                              match_info, NULL);
1048 }
1049
1050 /**
1051  * g_regex_match_full:
1052  * @regex: a #GRegex structure from g_regex_new()
1053  * @string: the string to scan for matches
1054  * @string_len: the length of @string, or -1 if @string is nul-terminated
1055  * @start_position: starting index of the string to match
1056  * @match_options: match options
1057  * @match_info: pointer to location where to store the #GMatchInfo, or
1058  * %NULL if you do not need it
1059  * @error: location to store the error occuring, or %NULL to ignore errors
1060  *
1061  * Scans for a match in string for the pattern in @regex. The @match_options
1062  * are combined with the match options specified when the @regex structure
1063  * was created, letting you have more flexibility in reusing #GRegex
1064  * structures.
1065  *
1066  * Setting @start_position differs from just passing over a shortened string
1067  * and  setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins
1068  * with any kind of lookbehind assertion, such as "\b".
1069  *
1070  * A #GMatchInfo structure, used to get information on the match, is stored
1071  * in @match_info if not %NULL.
1072  *
1073  * To retrieve all the non-overlapping matches of the pattern in string you
1074  * can use g_match_info_next().
1075  *
1076  * <informalexample><programlisting>
1077  * static void
1078  * print_uppercase_words (const gchar *string)
1079  * {
1080  *   /&ast; Print all uppercase-only words. &ast;/
1081  *   GRegex *regex;
1082  *   GMatchInfo *match_info;
1083  *   GError *error = NULL;
1084  *   &nbsp;
1085  *   regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
1086  *   g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error);
1087  *   while (g_match_info_matches (match_info))
1088  *     {
1089  *       gchar *word = g_match_info_fetch (match_info, 0);
1090  *       g_print ("Found: %s\n", word);
1091  *       g_free (word);
1092  *       g_match_info_next (match_info, &error);
1093  *     }
1094  *   g_match_info_free (match_info);
1095  *   g_regex_free (regex);
1096  *   if (error != NULL)
1097  *     {
1098  *       g_printerr ("Error while matching: %s\n", error->message);
1099  *       g_error_free (error);
1100  *     }
1101  * }
1102  * </programlisting></informalexample>
1103  *
1104  * Returns: %TRUE is the string matched, %FALSE otherwise
1105  *
1106  * Since: 2.14
1107  */
1108 gboolean
1109 g_regex_match_full (const GRegex    *regex,
1110                     const gchar     *string,
1111                     gssize           string_len,
1112                     gint             start_position,
1113                     GRegexMatchFlags match_options,
1114                     GMatchInfo     **match_info,
1115                     GError         **error)
1116 {
1117   GMatchInfo *info;
1118   gboolean match_ok;
1119
1120   g_return_val_if_fail (regex != NULL, FALSE);
1121   g_return_val_if_fail (string != NULL, FALSE);
1122   g_return_val_if_fail (start_position >= 0, FALSE);
1123   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
1124   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
1125
1126   info = match_info_new (regex, string, string_len, start_position,
1127                          match_options, FALSE);
1128   match_ok = g_match_info_next (info, error);
1129   if (match_info != NULL)
1130     *match_info = info;
1131   else
1132     g_match_info_free (info);
1133
1134   return match_ok;
1135 }
1136
1137 /**
1138  * g_regex_match_all:
1139  * @regex: a #GRegex structure from g_regex_new()
1140  * @string: the string to scan for matches
1141  * @match_options: match options
1142  * @match_info: pointer to location where to store the #GMatchInfo, or
1143  * %NULL if you do not need it
1144  *
1145  * Using the standard algorithm for regular expression matching only the
1146  * longest match in the string is retrieved. This function uses a
1147  * different algorithm so it can retrieve all the possible matches.
1148  * For more documentation see g_regex_match_all_full().
1149  *
1150  * A #GMatchInfo structure, used to get information on the match, is stored
1151  * in @match_info if not %NULL.
1152  *
1153  * Returns: %TRUE is the string matched, %FALSE otherwise
1154  *
1155  * Since: 2.14
1156  */
1157 gboolean
1158 g_regex_match_all (const GRegex    *regex,
1159                    const gchar     *string,
1160                    GRegexMatchFlags match_options,
1161                    GMatchInfo     **match_info)
1162 {
1163   return g_regex_match_all_full (regex, string, -1, 0, match_options,
1164                                  match_info, NULL);
1165 }
1166
1167 /**
1168  * g_regex_match_all_full:
1169  * @regex: a #GRegex structure from g_regex_new()
1170  * @string: the string to scan for matches
1171  * @string_len: the length of @string, or -1 if @string is nul-terminated
1172  * @start_position: starting index of the string to match
1173  * @match_options: match options
1174  * @match_info: pointer to location where to store the #GMatchInfo, or
1175  * %NULL if you do not need it
1176  * @error: location to store the error occuring, or %NULL to ignore errors
1177  *
1178  * Using the standard algorithm for regular expression matching only the
1179  * longest match in the string is retrieved, it is not possibile to obtain
1180  * all the available matches. For instance matching
1181  * "&lt;a&gt; &lt;b&gt; &lt;c&gt;" against the pattern "&lt;.*&gt;" you get
1182  * "&lt;a&gt; &lt;b&gt; &lt;c&gt;".
1183  *
1184  * This function uses a different algorithm (called DFA, i.e. deterministic
1185  * finite automaton), so it can retrieve all the possible matches, all
1186  * starting at the same point in the string. For instance matching
1187  * "&lt;a&gt; &lt;b&gt; &lt;c&gt;" against the pattern "&lt;.*&gt;" you
1188  * would obtain three matches: "&lt;a&gt; &lt;b&gt; &lt;c&gt;",
1189  * "&lt;a&gt; &lt;b&gt;" and "&lt;a&gt;".
1190  *
1191  * The number of matched strings is retrieved using
1192  * g_match_info_get_match_count().
1193  * To obtain the matched strings and their position you can use,
1194  * respectively, g_match_info_fetch() and g_match_info_fetch_pos(). Note that
1195  * the strings are returned in reverse order of length; that is, the longest
1196  * matching string is given first.
1197  *
1198  * Note that the DFA algorithm is slower than the standard one and it is not
1199  * able to capture substrings, so backreferences do not work.
1200  *
1201  * Setting @start_position differs from just passing over a shortened string
1202  * and  setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins
1203  * with any kind of lookbehind assertion, such as "\b".
1204  *
1205  * A #GMatchInfo structure, used to get information on the match, is stored
1206  * in @match_info if not %NULL.
1207  *
1208  * Returns: %TRUE is the string matched, %FALSE otherwise
1209  *
1210  * Since: 2.14
1211  */
1212 gboolean
1213 g_regex_match_all_full (const GRegex    *regex,
1214                         const gchar     *string,
1215                         gssize           string_len,
1216                         gint             start_position,
1217                         GRegexMatchFlags match_options,
1218                         GMatchInfo     **match_info,
1219                         GError         **error)
1220 {
1221   GMatchInfo *info;
1222   gboolean done;
1223
1224   g_return_val_if_fail (regex != NULL, FALSE);
1225   g_return_val_if_fail (string != NULL, FALSE);
1226   g_return_val_if_fail (start_position >= 0, FALSE);
1227   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
1228   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
1229
1230   info = match_info_new (regex, string, string_len, start_position,
1231                          match_options, TRUE);
1232
1233   done = FALSE;
1234   while (!done)
1235     {
1236       done = TRUE;
1237       info->matches = pcre_dfa_exec (regex->pcre_re, regex->extra,
1238                                      info->string, info->string_len,
1239                                      info->pos,
1240                                      regex->match_opts | match_options,
1241                                      info->offsets, info->n_offsets,
1242                                      info->workspace, info->n_workspace);
1243       if (info->matches == PCRE_ERROR_DFA_WSSIZE)
1244         {
1245           /* info->workspace is too small. */
1246           info->n_workspace *= 2;
1247           info->workspace = g_realloc (info->workspace,
1248                                        info->n_workspace * sizeof (gint));
1249           done = FALSE;
1250         }
1251       else if (info->matches == 0)
1252         {
1253           /* info->offsets is too small. */
1254           info->n_offsets *= 2;
1255           info->offsets = g_realloc (info->offsets,
1256                                      info->n_offsets * sizeof (gint));
1257           done = FALSE;
1258         }
1259       else if (IS_PCRE_ERROR (info->matches))
1260         {
1261           g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
1262                        _("Error while matching regular expression %s: %s"),
1263                        regex->pattern, match_error (info->matches));
1264         }
1265     }
1266
1267   /* set info->pos to -1 so that a call to g_match_info_next() fails. */
1268   info->pos = -1;
1269
1270   if (match_info != NULL)
1271     *match_info = info;
1272   else
1273     g_match_info_free (info);
1274
1275   return info->matches >= 0;
1276 }
1277
1278 /**
1279  * g_regex_get_string_number:
1280  * @regex: #GRegex structure
1281  * @name: name of the subexpression
1282  *
1283  * Retrieves the number of the subexpression named @name.
1284  *
1285  * Returns: The number of the subexpression or -1 if @name does not exists
1286  *
1287  * Since: 2.14
1288  */
1289 gint
1290 g_regex_get_string_number (const GRegex *regex,
1291                            const gchar  *name)
1292 {
1293   gint num;
1294
1295   g_return_val_if_fail (regex != NULL, -1);
1296   g_return_val_if_fail (name != NULL, -1);
1297
1298   num = pcre_get_stringnumber (regex->pcre_re, name);
1299   if (num == PCRE_ERROR_NOSUBSTRING)
1300     num = -1;
1301
1302   return num;
1303 }
1304
1305 /**
1306  * g_regex_split_simple:
1307  * @pattern: the regular expression
1308  * @string: the string to scan for matches
1309  * @compile_options: compile options for the regular expression
1310  * @match_options: match options
1311  *
1312  * Breaks the string on the pattern, and returns an array of the tokens.
1313  * If the pattern contains capturing parentheses, then the text for each
1314  * of the substrings will also be returned. If the pattern does not match
1315  * anywhere in the string, then the whole string is returned as the first
1316  * token.
1317  *
1318  * This function is equivalent to g_regex_split() but it does not
1319  * require to compile the pattern with g_regex_new(), avoiding some
1320  * lines of code when you need just to do a split without extracting
1321  * substrings, capture counts, and so on.
1322  *
1323  * If this function is to be called on the same @pattern more than
1324  * once, it's more efficient to compile the pattern once with
1325  * g_regex_new() and then use g_regex_split().
1326  *
1327  * As a special case, the result of splitting the empty string "" is an
1328  * empty vector, not a vector containing a single string. The reason for
1329  * this special case is that being able to represent a empty vector is
1330  * typically more useful than consistent handling of empty elements. If
1331  * you do need to represent empty elements, you'll need to check for the
1332  * empty string before calling this function.
1333  *
1334  * A pattern that can match empty strings splits @string into separate
1335  * characters wherever it matches the empty string between characters.
1336  * For example splitting "ab c" using as a separator "\s*", you will get
1337  * "a", "b" and "c".
1338  *
1339  * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
1340  *
1341  * Since: 2.14
1342  **/
1343 gchar **
1344 g_regex_split_simple (const gchar        *pattern,
1345                       const gchar        *string,
1346                       GRegexCompileFlags  compile_options,
1347                       GRegexMatchFlags    match_options)
1348 {
1349   GRegex *regex;
1350   gchar **result;
1351
1352   regex = g_regex_new (pattern, compile_options, 0, NULL);
1353   if (!regex)
1354     return NULL;
1355   result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL);
1356   g_regex_free (regex);
1357   return result;
1358 }
1359
1360 /**
1361  * g_regex_split:
1362  * @regex: a #GRegex structure
1363  * @string: the string to split with the pattern
1364  * @match_options: match time option flags
1365  *
1366  * Breaks the string on the pattern, and returns an array of the tokens.
1367  * If the pattern contains capturing parentheses, then the text for each
1368  * of the substrings will also be returned. If the pattern does not match
1369  * anywhere in the string, then the whole string is returned as the first
1370  * token.
1371  *
1372  * As a special case, the result of splitting the empty string "" is an
1373  * empty vector, not a vector containing a single string. The reason for
1374  * this special case is that being able to represent a empty vector is
1375  * typically more useful than consistent handling of empty elements. If
1376  * you do need to represent empty elements, you'll need to check for the
1377  * empty string before calling this function.
1378  *
1379  * A pattern that can match empty strings splits @string into separate
1380  * characters wherever it matches the empty string between characters.
1381  * For example splitting "ab c" using as a separator "\s*", you will get
1382  * "a", "b" and "c".
1383  *
1384  * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
1385  *
1386  * Since: 2.14
1387  **/
1388 gchar **
1389 g_regex_split (const GRegex     *regex,
1390                const gchar      *string,
1391                GRegexMatchFlags  match_options)
1392 {
1393   return g_regex_split_full (regex, string, -1, 0,
1394                              match_options, 0, NULL);
1395 }
1396
1397 /**
1398  * g_regex_split_full:
1399  * @regex: a #GRegex structure
1400  * @string: the string to split with the pattern
1401  * @string_len: the length of @string, or -1 if @string is nul-terminated
1402  * @start_position: starting index of the string to match
1403  * @match_options: match time option flags
1404  * @max_tokens: the maximum number of tokens to split @string into. If this
1405  *    is less than 1, the string is split completely
1406  * @error: return location for a #GError
1407  *
1408  * Breaks the string on the pattern, and returns an array of the tokens.
1409  * If the pattern contains capturing parentheses, then the text for each
1410  * of the substrings will also be returned. If the pattern does not match
1411  * anywhere in the string, then the whole string is returned as the first
1412  * token.
1413  *
1414  * As a special case, the result of splitting the empty string "" is an
1415  * empty vector, not a vector containing a single string. The reason for
1416  * this special case is that being able to represent a empty vector is
1417  * typically more useful than consistent handling of empty elements. If
1418  * you do need to represent empty elements, you'll need to check for the
1419  * empty string before calling this function.
1420  *
1421  * A pattern that can match empty strings splits @string into separate
1422  * characters wherever it matches the empty string between characters.
1423  * For example splitting "ab c" using as a separator "\s*", you will get
1424  * "a", "b" and "c".
1425  *
1426  * Setting @start_position differs from just passing over a shortened string
1427  * and  setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins
1428  * with any kind of lookbehind assertion, such as "\b".
1429  *
1430  * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
1431  *
1432  * Since: 2.14
1433  **/
1434 gchar **
1435 g_regex_split_full (const GRegex     *regex,
1436                     const gchar      *string,
1437                     gssize            string_len,
1438                     gint              start_position,
1439                     GRegexMatchFlags  match_options,
1440                     gint              max_tokens,
1441                     GError          **error)
1442 {
1443   GError *tmp_error = NULL;
1444   GMatchInfo *match_info;
1445   GList *list, *last;
1446   gint i;
1447   gint token_count;
1448   gboolean match_ok;
1449   /* position of the last separator. */
1450   gint last_separator_end;
1451   /* was the last match 0 bytes long? */
1452   gboolean last_match_is_empty;
1453   /* the returned array of char **s */
1454   gchar **string_list;
1455
1456   g_return_val_if_fail (regex != NULL, NULL);
1457   g_return_val_if_fail (string != NULL, NULL);
1458   g_return_val_if_fail (start_position >= 0, NULL);
1459   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
1460   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
1461
1462   if (max_tokens <= 0)
1463     max_tokens = G_MAXINT;
1464
1465   if (string_len < 0)
1466     string_len = strlen (string);
1467
1468   /* zero-length string */
1469   if (string_len - start_position == 0)
1470     return g_new0 (gchar *, 1);
1471
1472   if (max_tokens == 1)
1473     {
1474       string_list = g_new0 (gchar *, 2);
1475       string_list[0] = g_strndup (&string[start_position],
1476                                   string_len - start_position);
1477       return string_list;
1478     }
1479
1480   list = NULL;
1481   token_count = 0;
1482   last_separator_end = start_position;
1483   last_match_is_empty = FALSE;
1484
1485   match_ok = g_regex_match_full (regex, string, string_len, start_position,
1486                                  match_options, &match_info, &tmp_error);
1487   while (tmp_error == NULL)
1488     {
1489       if (match_ok)
1490         {
1491           last_match_is_empty =
1492                     (match_info->offsets[0] == match_info->offsets[1]);
1493
1494           /* we need to skip empty separators at the same position of the end
1495            * of another separator. e.g. the string is "a b" and the separator
1496            * is " *", so from 1 to 2 we have a match and at position 2 we have
1497            * an empty match. */
1498           if (last_separator_end != match_info->offsets[1])
1499             {
1500               gchar *token;
1501               gint match_count;
1502
1503               token = g_strndup (string + last_separator_end,
1504                                  match_info->offsets[0] - last_separator_end);
1505               list = g_list_prepend (list, token);
1506               token_count++;
1507
1508               /* if there were substrings, these need to be added to
1509                * the list. */
1510               match_count = g_match_info_get_match_count (match_info);
1511               if (match_count > 1)
1512                 {
1513                   for (i = 1; i < match_count; i++)
1514                     list = g_list_prepend (list, g_match_info_fetch (match_info, i));
1515                 }
1516             }
1517         }
1518       else
1519         {
1520           /* if there was no match, copy to end of string. */
1521           if (!last_match_is_empty)
1522             {
1523               gchar *token = g_strndup (string + last_separator_end,
1524                                         match_info->string_len - last_separator_end);
1525               list = g_list_prepend (list, token);
1526             }
1527           /* no more tokens, end the loop. */
1528           break;
1529         }
1530
1531       /* -1 to leave room for the last part. */
1532       if (token_count >= max_tokens - 1)
1533         {
1534           /* we have reached the maximum number of tokens, so we copy
1535            * the remaining part of the string. */
1536           if (last_match_is_empty)
1537             {
1538               /* the last match was empty, so we have moved one char
1539                * after the real position to avoid empty matches at the
1540                * same position. */
1541               match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string;
1542             }
1543           /* the if is needed in the case we have terminated the available
1544            * tokens, but we are at the end of the string, so there are no
1545            * characters left to copy. */
1546           if (string_len > match_info->pos)
1547             {
1548               gchar *token = g_strndup (string + match_info->pos,
1549                                         string_len - match_info->pos);
1550               list = g_list_prepend (list, token);
1551             }
1552           /* end the loop. */
1553           break;
1554         }
1555
1556       last_separator_end = match_info->pos;
1557       if (last_match_is_empty)
1558         /* if the last match was empty, g_match_info_next() has moved
1559          * forward to avoid infinite loops, but we still need to copy that
1560          * character. */
1561         last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string;
1562
1563       match_ok = g_match_info_next (match_info, &tmp_error);
1564     }
1565   g_match_info_free (match_info);
1566   if (tmp_error != NULL)
1567     {
1568       g_propagate_error (error, tmp_error);
1569       g_list_foreach (list, (GFunc)g_free, NULL);
1570       g_list_free (list);
1571       match_info->pos = -1;
1572       return NULL;
1573     }
1574
1575   string_list = g_new (gchar *, g_list_length (list) + 1);
1576   i = 0;
1577   for (last = g_list_last (list); last; last = g_list_previous (last))
1578     string_list[i++] = last->data;
1579   string_list[i] = 0;
1580   g_list_free (list);
1581
1582   return string_list;
1583 }
1584
/* Kinds of items a replacement string is broken into; stored in the
 * "type" field of InterpolationData. */
enum
{
  /* literal text, kept in the "text" field */
  REPL_TYPE_STRING             = 0,
  /* a single character, kept in the "c" field */
  REPL_TYPE_CHARACTER          = 1,
  /* reference to a subpattern by name */
  REPL_TYPE_SYMBOLIC_REFERENCE = 2,
  /* reference to a subpattern by number */
  REPL_TYPE_NUMERIC_REFERENCE  = 3,
  /* case-conversion switch, kept in the "change_case" field */
  REPL_TYPE_CHANGE_CASE        = 4
};
1593
/* Case-conversion modes selected by the \E, \U, \L, \u and \l escapes
 * in a replacement string (see expand_escape()). Each mode is a
 * distinct bit so that groups of modes can be tested with the masks. */
typedef enum
{
  CHANGE_CASE_NONE         = 0x01,  /* \E */
  CHANGE_CASE_UPPER        = 0x02,  /* \U */
  CHANGE_CASE_LOWER        = 0x04,  /* \L */
  CHANGE_CASE_UPPER_SINGLE = 0x08,  /* \u */
  CHANGE_CASE_LOWER_SINGLE = 0x10,  /* \l */

  /* either of the "_SINGLE" modes */
  CHANGE_CASE_SINGLE_MASK  = CHANGE_CASE_LOWER_SINGLE | CHANGE_CASE_UPPER_SINGLE,
  /* either of the lower-casing modes */
  CHANGE_CASE_LOWER_MASK   = CHANGE_CASE_LOWER_SINGLE | CHANGE_CASE_LOWER,
  /* either of the upper-casing modes */
  CHANGE_CASE_UPPER_MASK   = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_UPPER
} ChangeCase;
1605
/* One parsed item of a replacement string. Which of the payload
 * fields is meaningful depends on "type". */
struct _InterpolationData
{
  /* heap-allocated text, released by free_interpolation_data(); set by
   * expand_escape() for REPL_TYPE_STRING items */
  gchar     *text;
  /* one of the REPL_TYPE_* values */
  gint       type;
  /* NOTE(review): presumably the subpattern number for numeric
   * references -- not visible in this part of the file, confirm */
  gint       num;
  /* the character for REPL_TYPE_CHARACTER items */
  gchar      c;
  /* the requested mode for REPL_TYPE_CHANGE_CASE items */
  ChangeCase change_case;
};
1614
1615 static void
1616 free_interpolation_data (InterpolationData *data)
1617 {
1618   g_free (data->text);
1619   g_free (data);
1620 }
1621
1622 static const gchar *
1623 expand_escape (const gchar        *replacement,
1624                const gchar        *p,
1625                InterpolationData  *data,
1626                GError            **error)
1627 {
1628   const gchar *q, *r;
1629   gint x, d, h, i;
1630   const gchar *error_detail;
1631   gint base = 0;
1632   GError *tmp_error = NULL;
1633
1634   p++;
1635   switch (*p)
1636     {
1637     case 't':
1638       p++;
1639       data->c = '\t';
1640       data->type = REPL_TYPE_CHARACTER;
1641       break;
1642     case 'n':
1643       p++;
1644       data->c = '\n';
1645       data->type = REPL_TYPE_CHARACTER;
1646       break;
1647     case 'v':
1648       p++;
1649       data->c = '\v';
1650       data->type = REPL_TYPE_CHARACTER;
1651       break;
1652     case 'r':
1653       p++;
1654       data->c = '\r';
1655       data->type = REPL_TYPE_CHARACTER;
1656       break;
1657     case 'f':
1658       p++;
1659       data->c = '\f';
1660       data->type = REPL_TYPE_CHARACTER;
1661       break;
1662     case 'a':
1663       p++;
1664       data->c = '\a';
1665       data->type = REPL_TYPE_CHARACTER;
1666       break;
1667     case 'b':
1668       p++;
1669       data->c = '\b';
1670       data->type = REPL_TYPE_CHARACTER;
1671       break;
1672     case '\\':
1673       p++;
1674       data->c = '\\';
1675       data->type = REPL_TYPE_CHARACTER;
1676       break;
1677     case 'x':
1678       p++;
1679       x = 0;
1680       if (*p == '{')
1681         {
1682           p++;
1683           do
1684             {
1685               h = g_ascii_xdigit_value (*p);
1686               if (h < 0)
1687                 {
1688                   error_detail = _("hexadecimal digit or '}' expected");
1689                   goto error;
1690                 }
1691               x = x * 16 + h;
1692               p++;
1693             }
1694           while (*p != '}');
1695           p++;
1696         }
1697       else
1698         {
1699           for (i = 0; i < 2; i++)
1700             {
1701               h = g_ascii_xdigit_value (*p);
1702               if (h < 0)
1703                 {
1704                   error_detail = _("hexadecimal digit expected");
1705                   goto error;
1706                 }
1707               x = x * 16 + h;
1708               p++;
1709             }
1710         }
1711       data->type = REPL_TYPE_STRING;
1712       data->text = g_new0 (gchar, 8);
1713       g_unichar_to_utf8 (x, data->text);
1714       break;
1715     case 'l':
1716       p++;
1717       data->type = REPL_TYPE_CHANGE_CASE;
1718       data->change_case = CHANGE_CASE_LOWER_SINGLE;
1719       break;
1720     case 'u':
1721       p++;
1722       data->type = REPL_TYPE_CHANGE_CASE;
1723       data->change_case = CHANGE_CASE_UPPER_SINGLE;
1724       break;
1725     case 'L':
1726       p++;
1727       data->type = REPL_TYPE_CHANGE_CASE;
1728       data->change_case = CHANGE_CASE_LOWER;
1729       break;
1730     case 'U':
1731       p++;
1732       data->type = REPL_TYPE_CHANGE_CASE;
1733       data->change_case = CHANGE_CASE_UPPER;
1734       break;
1735     case 'E':
1736       p++;
1737       data->type = REPL_TYPE_CHANGE_CASE;
1738       data->change_case = CHANGE_CASE_NONE;
1739       break;
1740     case 'g':
1741       p++;
1742       if (*p != '<')
1743         {
1744           error_detail = _("missing '<' in symbolic reference");
1745           goto error;
1746         }
1747       q = p + 1;
1748       do
1749         {
1750           p++;
1751           if (!*p)
1752             {
1753               error_detail = _("unfinished symbolic reference");
1754               goto error;
1755             }
1756         }
1757       while (*p != '>');
1758       if (p - q == 0)
1759         {
1760           error_detail = _("zero-length symbolic reference");
1761           goto error;
1762         }
1763       if (g_ascii_isdigit (*q))
1764         {
1765           x = 0;
1766           do
1767             {
1768               h = g_ascii_digit_value (*q);
1769               if (h < 0)
1770                 {
1771                   error_detail = _("digit expected");
1772                   p = q;
1773                   goto error;
1774                 }
1775               x = x * 10 + h;
1776               q++;
1777             }
1778           while (q != p);
1779           data->num = x;
1780           data->type = REPL_TYPE_NUMERIC_REFERENCE;
1781         }
1782       else
1783         {
1784           r = q;
1785           do
1786             {
1787               if (!g_ascii_isalnum (*r))
1788                 {
1789                   error_detail = _("illegal symbolic reference");
1790                   p = r;
1791                   goto error;
1792                 }
1793               r++;
1794             }
1795           while (r != p);
1796           data->text = g_strndup (q, p - q);
1797           data->type = REPL_TYPE_SYMBOLIC_REFERENCE;
1798         }
1799       p++;
1800       break;
1801     case '0':
1802       /* if \0 is followed by a number is an octal number representing a
1803        * character, else it is a numeric reference. */
1804       if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0)
1805         {
1806           base = 8;
1807           p = g_utf8_next_char (p);
1808         }
1809     case '1':
1810     case '2':
1811     case '3':
1812     case '4':
1813     case '5':
1814     case '6':
1815     case '7':
1816     case '8':
1817     case '9':
1818       x = 0;
1819       d = 0;
1820       for (i = 0; i < 3; i++)
1821         {
1822           h = g_ascii_digit_value (*p);
1823           if (h < 0)
1824             break;
1825           if (h > 7)
1826             {
1827               if (base == 8)
1828                 break;
1829               else
1830                 base = 10;
1831             }
1832           if (i == 2 && base == 10)
1833             break;
1834           x = x * 8 + h;
1835           d = d * 10 + h;
1836           p++;
1837         }
1838       if (base == 8 || i == 3)
1839         {
1840           data->type = REPL_TYPE_STRING;
1841           data->text = g_new0 (gchar, 8);
1842           g_unichar_to_utf8 (x, data->text);
1843         }
1844       else
1845         {
1846           data->type = REPL_TYPE_NUMERIC_REFERENCE;
1847           data->num = d;
1848         }
1849       break;
1850     case 0:
1851       error_detail = _("stray final '\\'");
1852       goto error;
1853       break;
1854     default:
1855       error_detail = _("unknown escape sequence");
1856       goto error;
1857     }
1858
1859   return p;
1860
1861  error:
1862   /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */
1863   tmp_error = g_error_new (G_REGEX_ERROR,
1864                            G_REGEX_ERROR_REPLACE,
1865                            _("Error while parsing replacement "
1866                              "text \"%s\" at char %lu: %s"),
1867                            replacement,
1868                            (gulong)(p - replacement),
1869                            error_detail);
1870   g_propagate_error (error, tmp_error);
1871
1872   return NULL;
1873 }
1874
1875 static GList *
1876 split_replacement (const gchar  *replacement,
1877                    GError      **error)
1878 {
1879   GList *list = NULL;
1880   InterpolationData *data;
1881   const gchar *p, *start;
1882
1883   start = p = replacement;
1884   while (*p)
1885     {
1886       if (*p == '\\')
1887         {
1888           data = g_new0 (InterpolationData, 1);
1889           start = p = expand_escape (replacement, p, data, error);
1890           if (p == NULL)
1891             {
1892               g_list_foreach (list, (GFunc)free_interpolation_data, NULL);
1893               g_list_free (list);
1894               free_interpolation_data (data);
1895
1896               return NULL;
1897             }
1898           list = g_list_prepend (list, data);
1899         }
1900       else
1901         {
1902           p++;
1903           if (*p == '\\' || *p == '\0')
1904             {
1905               if (p - start > 0)
1906                 {
1907                   data = g_new0 (InterpolationData, 1);
1908                   data->text = g_strndup (start, p - start);
1909                   data->type = REPL_TYPE_STRING;
1910                   list = g_list_prepend (list, data);
1911                 }
1912             }
1913         }
1914     }
1915
1916   return g_list_reverse (list);
1917 }
1918
/* Change the case of c based on change_case: c is passed to
 * g_unichar_tolower() when a CHANGE_CASE_LOWER_MASK bit is set in
 * change_case and to g_unichar_toupper() otherwise. There is no
 * pass-through branch, so code that must leave the text untouched for
 * CHANGE_CASE_NONE has to test for that value before using the macro. */
#define CHANGE_CASE(c, change_case) \
        (((change_case) & CHANGE_CASE_LOWER_MASK) ? \
                g_unichar_tolower (c) : \
                g_unichar_toupper (c))
1924
1925 static void
1926 string_append (GString     *string,
1927                const gchar *text,
1928                ChangeCase  *change_case)
1929 {
1930   gunichar c;
1931
1932   if (text[0] == '\0')
1933     return;
1934
1935   if (*change_case == CHANGE_CASE_NONE)
1936     {
1937       g_string_append (string, text);
1938     }
1939   else if (*change_case & CHANGE_CASE_SINGLE_MASK)
1940     {
1941       c = g_utf8_get_char (text);
1942       g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
1943       g_string_append (string, g_utf8_next_char (text));
1944       *change_case = CHANGE_CASE_NONE;
1945     }
1946   else
1947     {
1948       while (*text != '\0')
1949         {
1950           c = g_utf8_get_char (text);
1951           g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
1952           text = g_utf8_next_char (text);
1953         }
1954     }
1955 }
1956
1957 static gboolean
1958 interpolate_replacement (const GRegex     *regex,
1959                          const GMatchInfo *match_info,
1960                          const gchar      *string,
1961                          GString          *result,
1962                          gpointer          data)
1963 {
1964   GList *list;
1965   InterpolationData *idata;
1966   gchar *match;
1967   ChangeCase change_case = CHANGE_CASE_NONE;
1968
1969   for (list = data; list; list = list->next)
1970     {
1971       idata = list->data;
1972       switch (idata->type)
1973         {
1974         case REPL_TYPE_STRING:
1975           string_append (result, idata->text, &change_case);
1976           break;
1977         case REPL_TYPE_CHARACTER:
1978           g_string_append_c (result, CHANGE_CASE (idata->c, change_case));
1979           if (change_case & CHANGE_CASE_SINGLE_MASK)
1980             change_case = CHANGE_CASE_NONE;
1981           break;
1982         case REPL_TYPE_NUMERIC_REFERENCE:
1983           match = g_match_info_fetch (match_info, idata->num);
1984           if (match)
1985             {
1986               string_append (result, match, &change_case);
1987               g_free (match);
1988             }
1989           break;
1990         case REPL_TYPE_SYMBOLIC_REFERENCE:
1991           match = g_match_info_fetch_named (match_info, idata->text);
1992           if (match)
1993             {
1994               string_append (result, match, &change_case);
1995               g_free (match);
1996             }
1997           break;
1998         case REPL_TYPE_CHANGE_CASE:
1999           change_case = idata->change_case;
2000           break;
2001         }
2002     }
2003
2004   return FALSE;
2005 }
2006
2007 /**
2008  * g_regex_replace:
2009  * @regex: a #GRegex structure
2010  * @string: the string to perform matches against
2011  * @string_len: the length of @string, or -1 if @string is nul-terminated
2012  * @start_position: starting index of the string to match
2013  * @replacement: text to replace each match with
2014  * @match_options: options for the match
2015  * @error: location to store the error occuring, or %NULL to ignore errors
2016  *
2017  * Replaces all occurances of the pattern in @regex with the
2018  * replacement text. Backreferences of the form '\number' or '\g&lt;number&gt;'
2019  * in the replacement text are interpolated by the number-th captured
2020  * subexpression of the match, '\g&lt;name&gt;' refers to the captured subexpression
2021  * with the given name. '\0' refers to the complete match, but '\0' followed
2022  * by a number is the octal representation of a character. To include a
2023  * literal '\' in the replacement, write '\\'.
2024  * There are also escapes that changes the case of the following text:
2025  *
2026  * <variablelist>
2027  * <varlistentry><term>\l</term>
2028  * <listitem>
2029  * <para>Convert to lower case the next character</para>
2030  * </listitem>
2031  * </varlistentry>
2032  * <varlistentry><term>\u</term>
2033  * <listitem>
2034  * <para>Convert to upper case the next character</para>
2035  * </listitem>
2036  * </varlistentry>
2037  * <varlistentry><term>\L</term>
2038  * <listitem>
2039  * <para>Convert to lower case till \E</para>
2040  * </listitem>
2041  * </varlistentry>
2042  * <varlistentry><term>\U</term>
2043  * <listitem>
2044  * <para>Convert to upper case till \E</para>
2045  * </listitem>
2046  * </varlistentry>
2047  * <varlistentry><term>\E</term>
2048  * <listitem>
2049  * <para>End case modification</para>
2050  * </listitem>
2051  * </varlistentry>
2052  * </variablelist>
2053  *
2054  * If you do not need to use backreferences use g_regex_replace_literal().
2055  *
2056  * The @replacement string must be UTF-8 encoded even if #G_REGEX_RAW was
2057  * passed to g_regex_new(). If you want to use not UTF-8 encoded stings
2058  * you can use g_regex_replace_literal().
2059  *
2060  * Setting @start_position differs from just passing over a shortened string
2061  * and  setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins
2062  * with any kind of lookbehind assertion, such as "\b".
2063  *
2064  * Returns: a newly allocated string containing the replacements
2065  *
2066  * Since: 2.14
2067  */
2068 gchar *
2069 g_regex_replace (const GRegex      *regex,
2070                  const gchar       *string,
2071                  gssize             string_len,
2072                  gint               start_position,
2073                  const gchar       *replacement,
2074                  GRegexMatchFlags   match_options,
2075                  GError           **error)
2076 {
2077   gchar *result;
2078   GList *list;
2079   GError *tmp_error = NULL;
2080
2081   g_return_val_if_fail (regex != NULL, NULL);
2082   g_return_val_if_fail (string != NULL, NULL);
2083   g_return_val_if_fail (start_position >= 0, NULL);
2084   g_return_val_if_fail (replacement != NULL, NULL);
2085   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
2086   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2087
2088   list = split_replacement (replacement, &tmp_error);
2089   if (tmp_error != NULL)
2090     {
2091       g_propagate_error (error, tmp_error);
2092       return NULL;
2093     }
2094
2095   result = g_regex_replace_eval (regex,
2096                                  string, string_len, start_position,
2097                                  match_options,
2098                                  interpolate_replacement,
2099                                  (gpointer)list,
2100                                  &tmp_error);
2101   if (tmp_error != NULL)
2102     g_propagate_error (error, tmp_error);
2103
2104   g_list_foreach (list, (GFunc)free_interpolation_data, NULL);
2105   g_list_free (list);
2106
2107   return result;
2108 }
2109
2110 static gboolean
2111 literal_replacement (const GRegex     *regex,
2112                      const GMatchInfo *match_info,
2113                      const gchar      *string,
2114                      GString          *result,
2115                      gpointer          data)
2116 {
2117   g_string_append (result, data);
2118   return FALSE;
2119 }
2120
2121 /**
2122  * g_regex_replace_literal:
2123  * @regex: a #GRegex structure
2124  * @string: the string to perform matches against
2125  * @string_len: the length of @string, or -1 if @string is nul-terminated
2126  * @start_position: starting index of the string to match
2127  * @replacement: text to replace each match with
2128  * @match_options: options for the match
2129  * @error: location to store the error occuring, or %NULL to ignore errors
2130  *
2131  * Replaces all occurances of the pattern in @regex with the
2132  * replacement text. @replacement is replaced literally, to
2133  * include backreferences use g_regex_replace().
2134  *
2135  * Setting @start_position differs from just passing over a shortened string
2136  * and  setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins
2137  * with any kind of lookbehind assertion, such as "\b".
2138  *
2139  * Returns: a newly allocated string containing the replacements
2140  *
2141  * Since: 2.14
2142  */
2143 gchar *
2144 g_regex_replace_literal (const GRegex    *regex,
2145                          const gchar     *string,
2146                          gssize           string_len,
2147                          gint             start_position,
2148                          const gchar     *replacement,
2149                          GRegexMatchFlags match_options,
2150                          GError         **error)
2151 {
2152   g_return_val_if_fail (replacement != NULL, NULL);
2153   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2154
2155   return g_regex_replace_eval (regex,
2156                                string, string_len, start_position,
2157                                match_options,
2158                                literal_replacement,
2159                                (gpointer)replacement,
2160                                error);
2161 }
2162
2163 /**
2164  * g_regex_replace_eval:
2165  * @regex: a #GRegex structure from g_regex_new()
2166  * @string: string to perform matches against
2167  * @string_len: the length of @string, or -1 if @string is nul-terminated
2168  * @start_position: starting index of the string to match
2169  * @match_options: options for the match
2170  * @eval: a function to call for each match
2171  * @user_data: user data to pass to the function
2172  * @error: location to store the error occuring, or %NULL to ignore errors
2173  *
2174  * Replaces occurances of the pattern in regex with the output of @eval
2175  * for that occurance.
2176  *
2177  * Setting @start_position differs from just passing over a shortened string
2178  * and  setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins
2179  * with any kind of lookbehind assertion, such as "\b".
2180  *
2181  * Returns: a newly allocated string containing the replacements
2182  *
2183  * Since: 2.14
2184  */
2185 gchar *
2186 g_regex_replace_eval (const GRegex      *regex,
2187                       const gchar       *string,
2188                       gssize             string_len,
2189                       gint               start_position,
2190                       GRegexMatchFlags   match_options,
2191                       GRegexEvalCallback eval,
2192                       gpointer           user_data,
2193                       GError           **error)
2194 {
2195   GMatchInfo *match_info;
2196   GString *result;
2197   gint str_pos = 0;
2198   gboolean done = FALSE;
2199   GError *tmp_error = NULL;
2200
2201   g_return_val_if_fail (regex != NULL, NULL);
2202   g_return_val_if_fail (string != NULL, NULL);
2203   g_return_val_if_fail (start_position >= 0, NULL);
2204   g_return_val_if_fail (eval != NULL, NULL);
2205   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2206
2207   if (string_len < 0)
2208     string_len = strlen (string);
2209
2210   result = g_string_sized_new (string_len);
2211
2212   /* run down the string making matches. */
2213   g_regex_match_full (regex, string, string_len, start_position,
2214                       match_options, &match_info, &tmp_error);
2215   while (!done && g_match_info_matches (match_info))
2216     {
2217       g_string_append_len (result,
2218                            string + str_pos,
2219                            match_info->offsets[0] - str_pos);
2220       done = (*eval) (regex, match_info, string, result, user_data);
2221       str_pos = match_info->offsets[1];
2222       g_match_info_next (match_info, &tmp_error);
2223     }
2224   g_match_info_free (match_info);
2225   if (tmp_error != NULL)
2226     {
2227       g_propagate_error (error, tmp_error);
2228       g_string_free (result, TRUE);
2229       return NULL;
2230     }
2231
2232   g_string_append_len (result, string + str_pos, string_len - str_pos);
2233   return g_string_free (result, FALSE);
2234 }
2235
2236 /**
2237  * g_regex_escape_string:
2238  * @string: the string to escape
2239  * @length: the length of @string, or -1 if @string is nul-terminated
2240  *
2241  * Escapes the special characters used for regular expressions in @string,
2242  * for instance "a.b*c" becomes "a\.b\*c". This function is useful to
2243  * dynamically generate regular expressions.
2244  *
2245  * @string can contain nul characters that are replaced with "\0", in this
2246  * case remember to specify the correct length of @string in @length.
2247  *
2248  * Returns: a newly-allocated escaped string
2249  *
2250  * Since: 2.14
2251  */
2252 gchar *
2253 g_regex_escape_string (const gchar *string,
2254                        gint         length)
2255 {
2256   GString *escaped;
2257   const char *p, *piece_start, *end;
2258
2259   g_return_val_if_fail (string != NULL, NULL);
2260
2261   if (length < 0)
2262     length = strlen (string);
2263
2264   end = string + length;
2265   p = piece_start = string;
2266   escaped = g_string_sized_new (length + 1);
2267
2268   while (p < end)
2269     {
2270       switch (*p)
2271         {
2272         case '\0':
2273         case '\\':
2274         case '|':
2275         case '(':
2276         case ')':
2277         case '[':
2278         case ']':
2279         case '{':
2280         case '}':
2281         case '^':
2282         case '$':
2283         case '*':
2284         case '+':
2285         case '?':
2286         case '.':
2287           if (p != piece_start)
2288             /* copy the previous piece. */
2289             g_string_append_len (escaped, piece_start, p - piece_start);
2290           g_string_append_c (escaped, '\\');
2291           if (*p == '\0')
2292             g_string_append_c (escaped, '0');
2293           else
2294             g_string_append_c (escaped, *p);
2295           piece_start = ++p;
2296           break;
2297         default:
2298           p = g_utf8_next_char (p);
2299           break;
2300         }
2301   }
2302
2303   if (piece_start < end)
2304     g_string_append_len (escaped, piece_start, end - piece_start);
2305
2306   return g_string_free (escaped, FALSE);
2307 }
2308
2309 #define __G_REGEX_C__
2310 #include "galiasdef.c"