glib/gregex.c

   1 /* GRegex -- regular expression API wrapper around PCRE.
   2  *
   3  * Copyright (C) 1999, 2000 Scott Wimer
   4  * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com>
   5  * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org>
   6  *
   7  * This library is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * This library is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with this library; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  20  */
  21
  22 #include "config.h"
  23
  24 #include <string.h>
  25
  26 #include "glib.h"
  27 #include "glibintl.h"
  28 #include "gregex.h"
  29
  30 #ifdef USE_SYSTEM_PCRE
  31 #include <pcre.h>
  32 #else
  33 #include "pcre/pcre.h"
  34 #endif
  35
  36 /* PCRE 7.3 does not contain the definition of PCRE_ERROR_NULLWSLIMIT */
  37 #ifndef PCRE_ERROR_NULLWSLIMIT
  38 #define PCRE_ERROR_NULLWSLIMIT (-22)
  39 #endif
  40
  41 #include "galias.h"
  42
  43 /* Mask of all the valid GRegexCompileFlags bits; g_regex_new() rejects compile_options with any bit outside it. */
  44 #define G_REGEX_COMPILE_MASK (G_REGEX_CASELESS          | \
  45                               G_REGEX_MULTILINE         | \
  46                               G_REGEX_DOTALL            | \
  47                               G_REGEX_EXTENDED          | \
  48                               G_REGEX_ANCHORED          | \
  49                               G_REGEX_DOLLAR_ENDONLY    | \
  50                               G_REGEX_UNGREEDY          | \
  51                               G_REGEX_RAW               | \
  52                               G_REGEX_NO_AUTO_CAPTURE   | \
  53                               G_REGEX_OPTIMIZE          | \
  54                               G_REGEX_DUPNAMES          | \
  55                               G_REGEX_NEWLINE_CR        | \
  56                               G_REGEX_NEWLINE_LF        | \
  57                               G_REGEX_NEWLINE_CRLF)
  58
  59 /* Mask of all the valid GRegexMatchFlags bits; g_regex_new() rejects match_options with any bit outside it. */
  60 #define G_REGEX_MATCH_MASK (G_REGEX_MATCH_ANCHORED      | \
  61                             G_REGEX_MATCH_NOTBOL        | \
  62                             G_REGEX_MATCH_NOTEOL        | \
  63                             G_REGEX_MATCH_NOTEMPTY      | \
  64                             G_REGEX_MATCH_PARTIAL       | \
  65                             G_REGEX_MATCH_NEWLINE_CR    | \
  66                             G_REGEX_MATCH_NEWLINE_LF    | \
  67                             G_REGEX_MATCH_NEWLINE_CRLF  | \
  68                             G_REGEX_MATCH_NEWLINE_ANY)
  69
  70 /* Step a string pointer forward/backward by one character: if the regex
  71  * was compiled with PCRE_UTF8 use the g_utf8_ functions, else just +/- 1. */
  72 #define NEXT_CHAR(re, s) (((re)->compile_opts & PCRE_UTF8) ? \
  73                                 g_utf8_next_char (s) : \
  74                                 ((s) + 1))
  75 #define PREV_CHAR(re, s) (((re)->compile_opts & PCRE_UTF8) ? \
  76                                 g_utf8_prev_char (s) : \
  77                                 ((s) - 1))
  78
  79 struct _GMatchInfo
  80 {
  81   GRegex *regex;                /* the regex; a reference is held until g_match_info_free() */
  82   GRegexMatchFlags match_opts;  /* options used at match time on the regex */
  83   gint matches;                 /* result of the last match: number of matching sub patterns, or a negative PCRE code */
  84   gint pos;                     /* position in the string where last match left off; -1 once the end was reached */
  85   gint *offsets;                /* array of offsets paired 0,1 ; 2,3 ; 4,5 etc */
  86   gint n_offsets;               /* number of elements in offsets */
  87   gint *workspace;              /* workspace for pcre_dfa_exec(); NULL unless created for DFA matching */
  88   gint n_workspace;             /* number of workspace elements */
  89   const gchar *string;          /* string passed to the match function (not copied) */
  90   gssize string_len;            /* length of string */
  91 };
  92
  93 struct _GRegex
  94 {
  95   volatile gint ref_count;      /* the ref count for the immutable part; changed atomically by g_regex_ref()/g_regex_unref() */
  96   gchar *pattern;               /* the pattern; freed in g_regex_unref() */
  97   pcre *pcre_re;                /* compiled form of the pattern; released with pcre_free() */
  98   GRegexCompileFlags compile_opts;      /* options used at compile time on the pattern */
  99   GRegexMatchFlags match_opts;  /* options used at match time on the regex */
 100   pcre_extra *extra;            /* data stored when G_REGEX_OPTIMIZE is used; may be NULL */
 101 };
 102
 103 /* TRUE if ret is an error code, FALSE otherwise. PCRE_ERROR_NOMATCH and PCRE_ERROR_PARTIAL are not counted as errors. */
 104 #define IS_PCRE_ERROR(ret) ((ret) < PCRE_ERROR_NOMATCH && (ret) != PCRE_ERROR_PARTIAL)
 105
 106 typedef struct _InterpolationData InterpolationData; /* forward declarations: replacement helpers used by g_match_info_expand_references() before their definitions */
 107 static gboolean  interpolation_list_needs_match (GList *list);
 108 static gboolean  interpolate_replacement        (const GMatchInfo *match_info,
 109                                                  GString *result,
 110                                                  gpointer data);
 111 static GList    *split_replacement              (const gchar *replacement,
 112                                                  GError **error);
 113 static void      free_interpolation_data        (InterpolationData *data);
 114
 115
 116 static const gchar *
 117 match_error (gint errcode)
 118 {
 119   switch (errcode)
 120     {
 121     case PCRE_ERROR_NOMATCH:
 122       /* not an error */
 123       break;
 124     case PCRE_ERROR_NULL:
 125       /* NULL argument, this should not happen in GRegex */
 126       g_warning ("A NULL argument was passed to PCRE");
 127       break;
 128     case PCRE_ERROR_BADOPTION:
 129       return "bad options";
 130     case PCRE_ERROR_BADMAGIC:
 131       return _("corrupted object");
 132     case PCRE_ERROR_UNKNOWN_OPCODE:
 133       return N_("internal error or corrupted object");
 134     case PCRE_ERROR_NOMEMORY:
 135       return _("out of memory");
 136     case PCRE_ERROR_NOSUBSTRING:
 137       /* not used by pcre_exec() */
 138       break;
 139     case PCRE_ERROR_MATCHLIMIT:
 140       return _("backtracking limit reached");
 141     case PCRE_ERROR_CALLOUT:
 142       /* callouts are not implemented */
 143       break;
 144     case PCRE_ERROR_BADUTF8:
 145     case PCRE_ERROR_BADUTF8_OFFSET:
 146       /* we do not check if strings are valid */
 147       break;
 148     case PCRE_ERROR_PARTIAL:
 149       /* not an error */
 150       break;
 151     case PCRE_ERROR_BADPARTIAL:
 152       return _("the pattern contains items not supported for partial matching");
 153     case PCRE_ERROR_INTERNAL:
 154       return _("internal error");
 155     case PCRE_ERROR_BADCOUNT:
 156       /* negative ovecsize, this should not happen in GRegex */
 157       g_warning ("A negative ovecsize was passed to PCRE");
 158       break;
 159     case PCRE_ERROR_DFA_UITEM:
 160       return _("the pattern contains items not supported for partial matching");
 161     case PCRE_ERROR_DFA_UCOND:
 162       return _("back references as conditions are not supported for partial matching");
 163     case PCRE_ERROR_DFA_UMLIMIT:
 164       /* the match_field field is not used in GRegex */
 165       break;
 166     case PCRE_ERROR_DFA_WSSIZE:
 167       /* handled expanding the workspace */
 168       break;
 169     case PCRE_ERROR_DFA_RECURSE:
 170     case PCRE_ERROR_RECURSIONLIMIT:
 171       return _("recursion limit reached");
 172     case PCRE_ERROR_NULLWSLIMIT:
 173       return _("workspace limit for empty substrings reached");
 174     case PCRE_ERROR_BADNEWLINE:
 175       return _("invalid combination of newline flags");
 176     default:
 177       break;
 178     }
 179   return _("unknown error");
 180 }
 181
 182
 183 /* GMatchInfo */
 184
 185 static GMatchInfo *
 186 match_info_new (const GRegex *regex,
 187                 const gchar  *string,
 188                 gint          string_len,
 189                 gint          start_position,
 190                 gint          match_options,
 191                 gboolean      is_dfa)
 192 {
 193   GMatchInfo *match_info;
 194
 195   if (string_len < 0)
 196     string_len = strlen (string);
 197
 198   match_info = g_new0 (GMatchInfo, 1);
 199   match_info->regex = g_regex_ref ((GRegex *)regex);
 200   match_info->string = string;
 201   match_info->string_len = string_len;
 202   match_info->matches = PCRE_ERROR_NOMATCH;
 203   match_info->pos = start_position;
 204   match_info->match_opts = match_options;
 205
 206   if (is_dfa)
 207     {
 208       /* These values should be enough for most cases, if they are not
 209        * enough g_regex_match_all_full() will expand them. */
 210       match_info->n_offsets = 24;
 211       match_info->n_workspace = 100;
 212       match_info->workspace = g_new (gint, match_info->n_workspace);
 213     }
 214   else
 215     {
 216       gint capture_count;
 217       pcre_fullinfo (regex->pcre_re, regex->extra,
 218                      PCRE_INFO_CAPTURECOUNT, &capture_count);
 219       match_info->n_offsets = (capture_count + 1) * 3;
 220     }
 221   match_info->offsets = g_new0 (gint, match_info->n_offsets);
 222
 223   return match_info;
 224 }
 225
 226 /**
 227  * g_match_info_get_regex:
 228  * @match_info: a #GMatchInfo
 229  *
 230  * Returns #GRegex object used in @match_info. It belongs to Glib
 231  * and must not be freed. Use g_regex_ref() if you need to keep it
 232  * after you free @match_info object.
 233  *
 234  * Returns: #GRegex object used in @match_info
 235  *
 236  * Since: 2.14
 237  */
 238 GRegex *
 239 g_match_info_get_regex (const GMatchInfo *match_info)
 240 {
 241   g_return_val_if_fail (match_info != NULL, NULL);
 242   return match_info->regex;
 243 }
 244
 245 /**
 246  * g_match_info_get_string:
 247  * @match_info: a #GMatchInfo
 248  *
 249  * Returns the string searched with @match_info. This is the
 250  * string passed to g_regex_match() or g_regex_replace() so
 251  * you may not free it before calling this function.
 252  *
 253  * Returns: the string searched with @match_info
 254  *
 255  * Since: 2.14
 256  */
 257 const gchar *
 258 g_match_info_get_string (const GMatchInfo *match_info)
 259 {
 260   g_return_val_if_fail (match_info != NULL, NULL);
 261   return match_info->string;
 262 }
 263
 264 /**
 265  * g_match_info_free:
 266  * @match_info: a #GMatchInfo
 267  *
 268  * Frees all the memory associated with the #GMatchInfo structure.
 269  *
 270  * Since: 2.14
 271  */
 272 void
 273 g_match_info_free (GMatchInfo *match_info)
 274 {
 275   if (match_info)
 276     {
 277       g_regex_unref (match_info->regex);
 278       g_free (match_info->offsets);
 279       g_free (match_info->workspace);
 280       g_free (match_info);
 281     }
 282 }
 283
 284 /**
 285  * g_match_info_next:
 286  * @match_info: a #GMatchInfo structure
 287  * @error: location to store the error occuring, or %NULL to ignore errors
 288  *
 289  * Scans for the next match using the same parameters of the previous
 290  * call to g_regex_match_full() or g_regex_match() that returned
 291  * @match_info.
 292  *
 293  * The match is done on the string passed to the match function, so you
 294  * cannot free it before calling this function.
 295  *
 296  * Returns: %TRUE is the string matched, %FALSE otherwise
 297  *
 298  * Since: 2.14
 299  */
 300 gboolean
 301 g_match_info_next (GMatchInfo  *match_info,
 302                    GError     **error)
 303 {
 304   gint opts;
 305
 306   g_return_val_if_fail (match_info != NULL, FALSE);
 307   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
 308   g_return_val_if_fail (match_info->pos >= 0, FALSE);
 309
 310   opts = match_info->regex->match_opts | match_info->match_opts;
 311
 312   match_info->matches = pcre_exec (match_info->regex->pcre_re,
 313                                    match_info->regex->extra,
 314                                    match_info->string,
 315                                    match_info->string_len,
 316                                    match_info->pos,
 317                                    match_info->regex->match_opts |
 318                                    match_info->match_opts,
 319                                    match_info->offsets,
 320                                    match_info->n_offsets);
 321   if (IS_PCRE_ERROR (match_info->matches))
 322     {
 323       g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
 324                    _("Error while matching regular expression %s: %s"),
 325                    match_info->regex->pattern, match_error (match_info->matches));
 326       return FALSE;
 327     }
 328
 329   /* avoid infinite loops if the pattern is an empty string or something
 330    * equivalent */
 331   if (match_info->pos == match_info->offsets[1])
 332     {
 333       if (match_info->pos > match_info->string_len)
 334         {
 335           /* we have reached the end of the string */
 336           match_info->pos = -1;
 337           match_info->matches = PCRE_ERROR_NOMATCH;
 338           return FALSE;
 339         }
 340
 341       match_info->pos = NEXT_CHAR (match_info->regex,
 342                                    &match_info->string[match_info->pos]) -
 343                                    match_info->string;
 344     }
 345   else
 346     {
 347       match_info->pos = match_info->offsets[1];
 348     }
 349
 350   return match_info->matches >= 0;
 351 }
 352
 353 /**
 354  * g_match_info_matches:
 355  * @match_info: a #GMatchInfo structure
 356  *
 357  * Returns whether the previous match operation succeeded.
 358  *
 359  * Returns: %TRUE if the previous match operation succeeded,
 360  *   %FALSE otherwise
 361  *
 362  * Since: 2.14
 363  */
 364 gboolean
 365 g_match_info_matches (const GMatchInfo *match_info)
 366 {
 367   g_return_val_if_fail (match_info != NULL, FALSE);
 368
 369   return match_info->matches >= 0;
 370 }
 371
 372 /**
 373  * g_match_info_get_match_count:
 374  * @match_info: a #GMatchInfo structure
 375  *
 376  * Retrieves the number of matched substrings (including substring 0,
 377  * that is the whole matched text), so 1 is returned if the pattern
 378  * has no substrings in it and 0 is returned if the match failed.
 379  *
 380  * If the last match was obtained using the DFA algorithm, that is
 381  * using g_regex_match_all() or g_regex_match_all_full(), the retrieved
 382  * count is not that of the number of capturing parentheses but that of
 383  * the number of matched substrings.
 384  *
 385  * Returns: Number of matched substrings, or -1 if an error occurred
 386  *
 387  * Since: 2.14
 388  */
 389 gint
 390 g_match_info_get_match_count (const GMatchInfo *match_info)
 391 {
 392   g_return_val_if_fail (match_info, -1);
 393
 394   if (match_info->matches == PCRE_ERROR_NOMATCH)
 395     /* no match */
 396     return 0;
 397   else if (match_info->matches < PCRE_ERROR_NOMATCH)
 398     /* error */
 399     return -1;
 400   else
 401     /* match */
 402     return match_info->matches;
 403 }
 404
 405 /**
 406  * g_match_info_is_partial_match:
 407  * @match_info: a #GMatchInfo structure
 408  *
 409  * Usually if the string passed to g_regex_match*() matches as far as
 410  * it goes, but is too short to match the entire pattern, %FALSE is
 411  * returned. There are circumstances where it might be helpful to
 412  * distinguish this case from other cases in which there is no match.
 413  *
 414  * Consider, for example, an application where a human is required to
 415  * type in data for a field with specific formatting requirements. An
 416  * example might be a date in the form ddmmmyy, defined by the pattern
 417  * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$".
 418  * If the application sees the user’s keystrokes one by one, and can
 419  * check that what has been typed so far is potentially valid, it is
 420  * able to raise an error as soon as a mistake is made.
 421  *
 422  * GRegex supports the concept of partial matching by means of the
 423  * #G_REGEX_MATCH_PARTIAL flag. When this is set the return code for
 424  * g_regex_match() or g_regex_match_full() is, as usual, %TRUE
 425  * for a complete match, %FALSE otherwise. But, when these functions
 426  * return %FALSE, you can check if the match was partial calling
 427  * g_match_info_is_partial_match().
 428  *
 429  * When using partial matching you cannot use g_match_info_fetch*().
 430  *
 431  * Because of the way certain internal optimizations are implemented
 432  * the partial matching algorithm cannot be used with all patterns.
 433  * So repeated single characters such as "a{2,4}" and repeated single
 434  * meta-sequences such as "\d+" are not permitted if the maximum number
 435  * of occurrences is greater than one. Optional items such as "\d?"
 436  * (where the maximum is one) are permitted. Quantifiers with any values
 437  * are permitted after parentheses, so the invalid examples above can be
 438  * coded thus "(a){2,4}" and "(\d)+". If #G_REGEX_MATCH_PARTIAL is set
 439  * for a pattern that does not conform to the restrictions, matching
 440  * functions return an error.
 441  *
 442  * Returns: %TRUE if the match was partial, %FALSE otherwise
 443  *
 444  * Since: 2.14
 445  */
 446 gboolean
 447 g_match_info_is_partial_match (const GMatchInfo *match_info)
 448 {
 449   g_return_val_if_fail (match_info != NULL, FALSE);
 450
 451   return match_info->matches == PCRE_ERROR_PARTIAL;
 452 }
 453
 454 /**
 455  * g_match_info_expand_references:
 456  * @match_info: a #GMatchInfo or %NULL
 457  * @string_to_expand: the string to expand
 458  * @error: location to store the error occuring, or %NULL to ignore errors
 459  *
 460  * Returns a new string containing the text in @string_to_expand with
 461  * references and escape sequences expanded. References refer to the last
 462  * match done with @string against @regex and have the same syntax used by
 463  * g_regex_replace().
 464  *
 465  * The @string_to_expand must be UTF-8 encoded even if #G_REGEX_RAW was
 466  * passed to g_regex_new().
 467  *
 468  * The backreferences are extracted from the string passed to the match
 469  * function, so you cannot call this function after freeing the string.
 470  *
 471  * @match_info may be %NULL in which case @string_to_expand must not
 472  * contain references. For instance "foo\n" does not refer to an actual
 473  * pattern and '\n' merely will be replaced with \n character,
 474  * while to expand "\0" (whole match) one needs the result of a match.
 475  * Use g_regex_check_replacement() to find out whether @string_to_expand
 476  * contains references.
 477  *
 478  * Returns: the expanded string, or %NULL if an error occurred
 479  *
 480  * Since: 2.14
 481  */
 482 gchar *
 483 g_match_info_expand_references (const GMatchInfo  *match_info,
 484                                 const gchar       *string_to_expand,
 485                                 GError           **error)
 486 {
 487   GString *result;
 488   GList *list;
 489   GError *tmp_error = NULL;
 490
 491   g_return_val_if_fail (string_to_expand != NULL, NULL);
 492   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
 493
 494   list = split_replacement (string_to_expand, &tmp_error);
 495   if (tmp_error != NULL)
 496     {
 497       g_propagate_error (error, tmp_error);
 498       return NULL;
 499     }
 500
 501   if (!match_info && interpolation_list_needs_match (list))
 502     {
 503       g_critical ("String '%s' contains references to the match, can't "
 504                   "expand references without GMatchInfo object",
 505                   string_to_expand);
 506       return NULL;
 507     }
 508
 509   result = g_string_sized_new (strlen (string_to_expand));
 510   interpolate_replacement (match_info, result, list);
 511
 512   g_list_foreach (list, (GFunc)free_interpolation_data, NULL);
 513   g_list_free (list);
 514
 515   return g_string_free (result, FALSE);
 516 }
 517
 518 /**
 519  * g_match_info_fetch:
 520  * @match_info: #GMatchInfo structure
 521  * @match_num: number of the sub expression
 522  *
 523  * Retrieves the text matching the @match_num<!-- -->'th capturing
 524  * parentheses. 0 is the full text of the match, 1 is the first paren
 525  * set, 2 the second, and so on.
 526  *
 527  * If @match_num is a valid sub pattern but it didn't match anything
 528  * (e.g. sub pattern 1, matching "b" against "(a)?b") then an empty
 529  * string is returned.
 530  *
 531  * If the match was obtained using the DFA algorithm, that is using
 532  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
 533  * string is not that of a set of parentheses but that of a matched
 534  * substring. Substrings are matched in reverse order of length, so
 535  * 0 is the longest match.
 536  *
 537  * The string is fetched from the string passed to the match function,
 538  * so you cannot call this function after freeing the string.
 539  *
 540  * Returns: The matched substring, or %NULL if an error occurred.
 541  *          You have to free the string yourself
 542  *
 543  * Since: 2.14
 544  */
 545 gchar *
 546 g_match_info_fetch (const GMatchInfo *match_info,
 547                     gint              match_num)
 548 {
 549   /* we cannot use pcre_get_substring() because it allocates the
 550    * string using pcre_malloc(). */
 551   gchar *match = NULL;
 552   gint start, end;
 553
 554   g_return_val_if_fail (match_info != NULL, NULL);
 555   g_return_val_if_fail (match_num >= 0, NULL);
 556
 557   /* match_num does not exist or it didn't matched, i.e. matching "b"
 558    * against "(a)?b" then group 0 is empty. */
 559   if (!g_match_info_fetch_pos (match_info, match_num, &start, &end))
 560     match = NULL;
 561   else if (start == -1)
 562     match = g_strdup ("");
 563   else
 564     match = g_strndup (&match_info->string[start], end - start);
 565
 566   return match;
 567 }
 568
 569 /**
 570  * g_match_info_fetch_pos:
 571  * @match_info: #GMatchInfo structure
 572  * @match_num: number of the sub expression
 573  * @start_pos: pointer to location where to store the start position
 574  * @end_pos: pointer to location where to store the end position
 575  *
 576  * Retrieves the position of the @match_num<!-- -->'th capturing
 577  * parentheses. 0 is the full text of the match, 1 is the first
 578  * paren set, 2 the second, and so on.
 579  *
 580  * If @match_num is a valid sub pattern but it didn't match anything
 581  * (e.g. sub pattern 1, matching "b" against "(a)?b") then @start_pos
 582  * and @end_pos are set to -1 and %TRUE is returned.
 583  *
 584  * If the match was obtained using the DFA algorithm, that is using
 585  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
 586  * position is not that of a set of parentheses but that of a matched
 587  * substring. Substrings are matched in reverse order of length, so
 588  * 0 is the longest match.
 589  *
 590  * Returns: %TRUE if the position was fetched, %FALSE otherwise. If
 591  *   the position cannot be fetched, @start_pos and @end_pos are left
 592  *   unchanged
 593  *
 594  * Since: 2.14
 595  */
 596 gboolean
 597 g_match_info_fetch_pos (const GMatchInfo *match_info,
 598                         gint              match_num,
 599                         gint             *start_pos,
 600                         gint             *end_pos)
 601 {
 602   g_return_val_if_fail (match_info != NULL, FALSE);
 603   g_return_val_if_fail (match_num >= 0, FALSE);
 604
 605   /* make sure the sub expression number they're requesting is less than
 606    * the total number of sub expressions that were matched. */
 607   if (match_num >= match_info->matches)
 608     return FALSE;
 609
 610   if (start_pos != NULL)
 611     *start_pos = match_info->offsets[2 * match_num];
 612
 613   if (end_pos != NULL)
 614     *end_pos = match_info->offsets[2 * match_num + 1];
 615
 616   return TRUE;
 617 }
 618
 619 /*
 620  * Returns number of first matched subpattern with name @name.
 621  * There may be more than one in case when DUPNAMES is used,
 622  * and not all subpatterns with that name match;
 623  * pcre_get_stringnumber() does not work in that case.
 624  */
 625 static gint
 626 get_matched_substring_number (const GMatchInfo *match_info,
 627                               const gchar      *name)
 628 {
 629   gint entrysize;
 630   gchar *first, *last;
 631   guchar *entry;
 632
 633   if (!(match_info->regex->compile_opts & G_REGEX_DUPNAMES))
 634     return pcre_get_stringnumber (match_info->regex->pcre_re, name);
 635
 636   /* This code is copied from pcre_get.c: get_first_set() */
 637   entrysize = pcre_get_stringtable_entries (match_info->regex->pcre_re,
 638                                             name,
 639                                             &first,
 640                                             &last);
 641
 642   if (entrysize <= 0)
 643     return entrysize;
 644
 645   for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize)
 646     {
 647       gint n = (entry[0] << 8) + entry[1];
 648       if (match_info->offsets[n*2] >= 0)
 649         return n;
 650     }
 651
 652   return (first[0] << 8) + first[1];
 653 }
 654
 655 /**
 656  * g_match_info_fetch_named:
 657  * @match_info: #GMatchInfo structure
 658  * @name: name of the subexpression
 659  *
 660  * Retrieves the text matching the capturing parentheses named @name.
 661  *
 662  * If @name is a valid sub pattern name but it didn't match anything
 663  * (e.g. sub pattern "X", matching "b" against "(?P&lt;X&gt;a)?b")
 664  * then an empty string is returned.
 665  *
 666  * The string is fetched from the string passed to the match function,
 667  * so you cannot call this function after freeing the string.
 668  *
 669  * Returns: The matched substring, or %NULL if an error occurred.
 670  *          You have to free the string yourself
 671  *
 672  * Since: 2.14
 673  */
 674 gchar *
 675 g_match_info_fetch_named (const GMatchInfo *match_info,
 676                           const gchar      *name)
 677 {
 678   /* we cannot use pcre_get_named_substring() because it allocates the
 679    * string using pcre_malloc(). */
 680   gint num;
 681
 682   g_return_val_if_fail (match_info != NULL, NULL);
 683   g_return_val_if_fail (name != NULL, NULL);
 684
 685   num = get_matched_substring_number (match_info, name);
 686   if (num < 0)
 687     return NULL;
 688   else
 689     return g_match_info_fetch (match_info, num);
 690 }
 691
 692 /**
 693  * g_match_info_fetch_named_pos:
 694  * @match_info: #GMatchInfo structure
 695  * @name: name of the subexpression
 696  * @start_pos: pointer to location where to store the start position
 697  * @end_pos: pointer to location where to store the end position
 698  *
 699  * Retrieves the position of the capturing parentheses named @name.
 700  *
 701  * If @name is a valid sub pattern name but it didn't match anything
 702  * (e.g. sub pattern "X", matching "b" against "(?P&lt;X&gt;a)?b")
 703  * then @start_pos and @end_pos are set to -1 and %TRUE is returned.
 704  *
 705  * Returns: %TRUE if the position was fetched, %FALSE otherwise. If
 706  *   the position cannot be fetched, @start_pos and @end_pos are left
 707  *   unchanged
 708  *
 709  * Since: 2.14
 710  */
 711 gboolean
 712 g_match_info_fetch_named_pos (const GMatchInfo *match_info,
 713                               const gchar      *name,
 714                               gint             *start_pos,
 715                               gint             *end_pos)
 716 {
 717   gint num;
 718
 719   g_return_val_if_fail (match_info != NULL, FALSE);
 720   g_return_val_if_fail (name != NULL, FALSE);
 721
 722   num = get_matched_substring_number (match_info, name);
 723   if (num < 0)
 724     return FALSE;
 725
 726   return g_match_info_fetch_pos (match_info, num, start_pos, end_pos);
 727 }
 728
 729 /**
 730  * g_match_info_fetch_all:
 731  * @match_info: a #GMatchInfo structure
 732  *
 733  * Bundles up pointers to each of the matching substrings from a match
 734  * and stores them in an array of gchar pointers. The first element in
 735  * the returned array is the match number 0, i.e. the entire matched
 736  * text.
 737  *
 738  * If a sub pattern didn't match anything (e.g. sub pattern 1, matching
 739  * "b" against "(a)?b") then an empty string is inserted.
 740  *
 741  * If the last match was obtained using the DFA algorithm, that is using
 742  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
 743  * strings are not that matched by sets of parentheses but that of the
 744  * matched substring. Substrings are matched in reverse order of length,
 745  * so the first one is the longest match.
 746  *
 747  * The strings are fetched from the string passed to the match function,
 748  * so you cannot call this function after freeing the string.
 749  *
 750  * Returns: a %NULL-terminated array of gchar * pointers. It must be
 751  *   freed using g_strfreev(). If the previous match failed %NULL is
 752  *   returned
 753  *
 754  * Since: 2.14
 755  */
 756 gchar **
 757 g_match_info_fetch_all (const GMatchInfo *match_info)
 758 {
 759   /* we cannot use pcre_get_substring_list() because the returned value
 760    * isn't suitable for g_strfreev(). */
 761   gchar **result;
 762   gint i;
 763
 764   g_return_val_if_fail (match_info != NULL, NULL);
 765
 766   if (match_info->matches < 0)
 767     return NULL;
 768
 769   result = g_new (gchar *, match_info->matches + 1);
 770   for (i = 0; i < match_info->matches; i++)
 771     result[i] = g_match_info_fetch (match_info, i);
 772   result[i] = NULL;
 773
 774   return result;
 775 }
 776
 777
 778 /* GRegex */
 779
 780 GQuark
 781 g_regex_error_quark (void)
 782 {
 783   static GQuark error_quark = 0;
 784
 785   if (error_quark == 0)
 786     error_quark = g_quark_from_static_string ("g-regex-error-quark");
 787
 788   return error_quark;
 789 }
 790
 791 /**
 792  * g_regex_ref:
 793  * @regex: a #GRegex
 794  *
 795  * Increases reference count of @regex by 1.
 796  *
 797  * Returns: @regex
 798  *
 799  * Since: 2.14
 800  */
 801 GRegex *
 802 g_regex_ref (GRegex *regex)
 803 {
 804   g_return_val_if_fail (regex != NULL, NULL);
 805   g_atomic_int_inc (&regex->ref_count);
 806   return regex;
 807 }
 808
 809 /**
 810  * g_regex_unref:
 811  * @regex: a #GRegex
 812  *
 813  * Decreases reference count of @regex by 1. When reference count drops
 814  * to zero, it frees all the memory associated with the regex structure.
 815  *
 816  * Since: 2.14
 817  */
 818 void
 819 g_regex_unref (GRegex *regex)
 820 {
 821   g_return_if_fail (regex != NULL);
 822
 823   if (g_atomic_int_exchange_and_add (&regex->ref_count, -1) - 1 == 0)
 824     {
 825       g_free (regex->pattern);
 826       if (regex->pcre_re != NULL)
 827         pcre_free (regex->pcre_re);
 828       if (regex->extra != NULL)
 829         pcre_free (regex->extra);
 830       g_free (regex);
 831     }
 832 }
 833
 834 /**
 835  * g_regex_new:
 836  * @pattern: the regular expression
 837  * @compile_options: compile options for the regular expression
 838  * @match_options: match options for the regular expression
 839  * @error: return location for a #GError
 840  *
 841  * Compiles the regular expression to an internal form, and does
 842  * the initial setup of the #GRegex structure.
 843  *
 844  * Returns: a #GRegex structure. Call g_regex_unref() when you
 845  *   are done with it
 846  *
 847  * Since: 2.14
 848  */
 849 GRegex *
 850 g_regex_new (const gchar         *pattern,
 851              GRegexCompileFlags   compile_options,
 852              GRegexMatchFlags     match_options,
 853              GError             **error)
 854 {
 855   GRegex *regex;
 856   pcre *re;
 857   const gchar *errmsg;
 858   gint erroffset;
 859   gboolean optimize = FALSE;
 860   static gboolean initialized = FALSE;
 861   unsigned long int pcre_compile_options;
 862
 863   g_return_val_if_fail (pattern != NULL, NULL);
 864   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
 865   g_return_val_if_fail ((compile_options & ~G_REGEX_COMPILE_MASK) == 0, NULL);
 866   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
 867
 868   if (!initialized)
 869     {
 870       gint support;
 871       const gchar *msg;
 872
 873       pcre_config (PCRE_CONFIG_UTF8, &support);
 874       if (!support)
 875         {
 876           msg = N_("PCRE library is compiled without UTF8 support");
 877           g_critical (msg);
 878           g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg));
 879           return NULL;
 880         }
 881
 882       pcre_config (PCRE_CONFIG_UNICODE_PROPERTIES, &support);
 883       if (!support)
 884         {
 885           msg = N_("PCRE library is compiled without UTF8 properties support");
 886           g_critical (msg);
 887           g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg));
 888           return NULL;
 889         }
 890
 891       initialized = TRUE;
 892     }
 893
 894   /* G_REGEX_OPTIMIZE has the same numeric value of PCRE_NO_UTF8_CHECK,
 895    * as we do not need to wrap PCRE_NO_UTF8_CHECK. */
 896   if (compile_options & G_REGEX_OPTIMIZE)
 897     optimize = TRUE;
 898
 899   /* In GRegex the string are, by default, UTF-8 encoded. PCRE
 900    * instead uses UTF-8 only if required with PCRE_UTF8. */
 901   if (compile_options & G_REGEX_RAW)
 902     {
 903       /* disable utf-8 */
 904       compile_options &= ~G_REGEX_RAW;
 905     }
 906   else
 907     {
 908       /* enable utf-8 */
 909       compile_options |= PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
 910       match_options |= PCRE_NO_UTF8_CHECK;
 911     }
 912
 913   /* PCRE_NEWLINE_ANY is the default for the internal PCRE but
 914    * not for the system one. */
 915   if (!(compile_options & G_REGEX_NEWLINE_CR) &&
 916       !(compile_options & G_REGEX_NEWLINE_LF))
 917     {
 918       compile_options |= PCRE_NEWLINE_ANY;
 919     }
 920
 921   /* compile the pattern */
 922   re = pcre_compile (pattern, compile_options, &errmsg, &erroffset, NULL);
 923
 924   /* if the compilation failed, set the error member and return
 925    * immediately */
 926   if (re == NULL)
 927     {
 928       GError *tmp_error;
 929
 930       /* PCRE uses byte offsets but we want to show character offsets */
 931       erroffset = g_utf8_pointer_to_offset (pattern, &pattern[erroffset]);
 932
 933       tmp_error = g_error_new (G_REGEX_ERROR,
 934                                G_REGEX_ERROR_COMPILE,
 935                                _("Error while compiling regular "
 936                                  "expression %s at char %d: %s"),
 937                                pattern, erroffset, errmsg);
 938       g_propagate_error (error, tmp_error);
 939
 940       return NULL;
 941     }
 942
 943   /* For options set at the beginning of the pattern, pcre puts them into
 944    * compile options, e.g. "(?i)foo" will make the pcre structure store
 945    * PCRE_CASELESS even though it wasn't explicitly given for compilation. */
 946   pcre_fullinfo (re, NULL, PCRE_INFO_OPTIONS, &pcre_compile_options);
 947   compile_options = pcre_compile_options;
 948
 949   if (!(compile_options & G_REGEX_DUPNAMES))
 950     {
 951       gboolean jchanged = FALSE;
 952       pcre_fullinfo (re, NULL, PCRE_INFO_JCHANGED, &jchanged);
 953       if (jchanged)
 954         compile_options |= G_REGEX_DUPNAMES;
 955     }
 956
 957   regex = g_new0 (GRegex, 1);
 958   regex->ref_count = 1;
 959   regex->pattern = g_strdup (pattern);
 960   regex->pcre_re = re;
 961   regex->compile_opts = compile_options;
 962   regex->match_opts = match_options;
 963
 964   if (optimize)
 965     {
 966       regex->extra = pcre_study (regex->pcre_re, 0, &errmsg);
 967       if (errmsg != NULL)
 968         {
 969           GError *tmp_error = g_error_new (G_REGEX_ERROR,
 970                                            G_REGEX_ERROR_OPTIMIZE,
 971                                            _("Error while optimizing "
 972                                              "regular expression %s: %s"),
 973                                            regex->pattern,
 974                                            errmsg);
 975           g_propagate_error (error, tmp_error);
 976           return NULL;
 977         }
 978     }
 979
 980   return regex;
 981 }
 982
 983 /**
 984  * g_regex_get_pattern:
 985  * @regex: a #GRegex structure
 986  *
 987  * Gets the pattern string associated with @regex, i.e. a copy of
 988  * the string passed to g_regex_new().
 989  *
 990  * Returns: the pattern of @regex
 991  *
 992  * Since: 2.14
 993  */
 994 const gchar *
 995 g_regex_get_pattern (const GRegex *regex)
 996 {
 997   g_return_val_if_fail (regex != NULL, NULL);
 998
 999   return regex->pattern;
1000 }
1001
1002 /**
1003  * g_regex_get_max_backref:
1004  * @regex: a #GRegex
1005  *
1006  * Returns the number of the highest back reference
1007  * in the pattern, or 0 if the pattern does not contain
1008  * back references.
1009  *
1010  * Returns: the number of the highest back reference
1011  *
1012  * Since: 2.14
1013  */
1014 gint
1015 g_regex_get_max_backref (const GRegex *regex)
1016 {
1017   gint value;
1018
1019   pcre_fullinfo (regex->pcre_re, regex->extra,
1020                  PCRE_INFO_BACKREFMAX, &value);
1021
1022   return value;
1023 }
1024
1025 /**
1026  * g_regex_get_capture_count:
1027  * @regex: a #GRegex
1028  *
1029  * Returns the number of capturing subpatterns in the pattern.
1030  *
1031  * Returns: the number of capturing subpatterns
1032  *
1033  * Since: 2.14
1034  */
1035 gint
1036 g_regex_get_capture_count (const GRegex *regex)
1037 {
1038   gint value;
1039
1040   pcre_fullinfo (regex->pcre_re, regex->extra,
1041                  PCRE_INFO_CAPTURECOUNT, &value);
1042
1043   return value;
1044 }
1045
1046 /**
1047  * g_regex_match_simple:
1048  * @pattern: the regular expression
1049  * @string: the string to scan for matches
1050  * @compile_options: compile options for the regular expression
1051  * @match_options: match options
1052  *
1053  * Scans for a match in @string for @pattern.
1054  *
1055  * This function is equivalent to g_regex_match() but it does not
1056  * require to compile the pattern with g_regex_new(), avoiding some
1057  * lines of code when you need just to do a match without extracting
1058  * substrings, capture counts, and so on.
1059  *
1060  * If this function is to be called on the same @pattern more than
1061  * once, it's more efficient to compile the pattern once with
1062  * g_regex_new() and then use g_regex_match().
1063  *
1064  * Returns: %TRUE if the string matched, %FALSE otherwise
1065  *
1066  * Since: 2.14
1067  */
1068 gboolean
1069 g_regex_match_simple (const gchar        *pattern,
1070                       const gchar        *string,
1071                       GRegexCompileFlags  compile_options,
1072                       GRegexMatchFlags    match_options)
1073 {
1074   GRegex *regex;
1075   gboolean result;
1076
1077   regex = g_regex_new (pattern, compile_options, 0, NULL);
1078   if (!regex)
1079     return FALSE;
1080   result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL);
1081   g_regex_unref (regex);
1082   return result;
1083 }
1084
1085 /**
1086  * g_regex_match:
1087  * @regex: a #GRegex structure from g_regex_new()
1088  * @string: the string to scan for matches
1089  * @match_options: match options
1090  * @match_info: pointer to location where to store the #GMatchInfo,
1091  *   or %NULL if you do not need it
1092  *
1093  * Scans for a match in string for the pattern in @regex.
1094  * The @match_options are combined with the match options specified
1095  * when the @regex structure was created, letting you have more
1096  * flexibility in reusing #GRegex structures.
1097  *
1098  * A #GMatchInfo structure, used to get information on the match,
1099  * is stored in @match_info if not %NULL. Note that if @match_info
1100  * is not %NULL then it is created even if the function returns %FALSE,
1101  * i.e. you must free it regardless if regular expression actually matched.
1102  *
1103  * To retrieve all the non-overlapping matches of the pattern in
1104  * string you can use g_match_info_next().
1105  *
1106  * <informalexample><programlisting>
1107  * static void
1108  * print_uppercase_words (const gchar *string)
1109  * {
1110  *   /&ast; Print all uppercase-only words. &ast;/
1111  *   GRegex *regex;
1112  *   GMatchInfo *match_info;
1113  *   &nbsp;
1114  *   regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
1115  *   g_regex_match (regex, string, 0, &amp;match_info);
1116  *   while (g_match_info_matches (match_info))
1117  *     {
1118  *       gchar *word = g_match_info_fetch (match_info, 0);
1119  *       g_print ("Found: %s\n", word);
1120  *       g_free (word);
1121  *       g_match_info_next (match_info, NULL);
1122  *     }
1123  *   g_match_info_free (match_info);
1124  *   g_regex_unref (regex);
1125  * }
1126  * </programlisting></informalexample>
1127  *
1128  * Returns: %TRUE if the string matched, %FALSE otherwise
1129  *
1130  * Since: 2.14
1131  */
1132 gboolean
1133 g_regex_match (const GRegex      *regex,
1134                const gchar       *string,
1135                GRegexMatchFlags   match_options,
1136                GMatchInfo       **match_info)
1137 {
1138   return g_regex_match_full (regex, string, -1, 0, match_options,
1139                              match_info, NULL);
1140 }
1141
1142 /**
1143  * g_regex_match_full:
1144  * @regex: a #GRegex structure from g_regex_new()
1145  * @string: the string to scan for matches
1146  * @string_len: the length of @string, or -1 if @string is nul-terminated
1147  * @start_position: starting index of the string to match
1148  * @match_options: match options
1149  * @match_info: pointer to location where to store the #GMatchInfo,
1150  *   or %NULL if you do not need it
1151  * @error: location to store the error occurring, or %NULL to ignore errors
1152  *
1153  * Scans for a match in string for the pattern in @regex.
1154  * The @match_options are combined with the match options specified
1155  * when the @regex structure was created, letting you have more
1156  * flexibility in reusing #GRegex structures.
1157  *
1158  * Setting @start_position differs from just passing over a shortened
1159  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
1160  * that begins with any kind of lookbehind assertion, such as "\b".
1161  *
1162  * A #GMatchInfo structure, used to get information on the match, is
1163  * stored in @match_info if not %NULL. Note that if @match_info is
1164  * not %NULL then it is created even if the function returns %FALSE,
1165  * i.e. you must free it regardless if regular expression actually
1166  * matched.
1167  *
1168  * @string is not copied and is used in #GMatchInfo internally. If
1169  * you use any #GMatchInfo method (except g_match_info_free()) after
1170  * freeing or modifying @string then the behaviour is undefined.
1171  *
1172  * To retrieve all the non-overlapping matches of the pattern in
1173  * string you can use g_match_info_next().
1174  *
1175  * <informalexample><programlisting>
1176  * static void
1177  * print_uppercase_words (const gchar *string)
1178  * {
1179  *   /&ast; Print all uppercase-only words. &ast;/
1180  *   GRegex *regex;
1181  *   GMatchInfo *match_info;
1182  *   GError *error = NULL;
1183  *   &nbsp;
1184  *   regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
1185  *   g_regex_match_full (regex, string, -1, 0, 0, &amp;match_info, &amp;error);
1186  *   while (g_match_info_matches (match_info))
1187  *     {
1188  *       gchar *word = g_match_info_fetch (match_info, 0);
1189  *       g_print ("Found: %s\n", word);
1190  *       g_free (word);
1191  *       g_match_info_next (match_info, &amp;error);
1192  *     }
1193  *   g_match_info_free (match_info);
1194  *   g_regex_unref (regex);
1195  *   if (error != NULL)
1196  *     {
1197  *       g_printerr ("Error while matching: %s\n", error->message);
1198  *       g_error_free (error);
1199  *     }
1200  * }
1201  * </programlisting></informalexample>
1202  *
1203  * Returns: %TRUE if the string matched, %FALSE otherwise
1204  *
1205  * Since: 2.14
1206  */
1207 gboolean
1208 g_regex_match_full (const GRegex      *regex,
1209                     const gchar       *string,
1210                     gssize             string_len,
1211                     gint               start_position,
1212                     GRegexMatchFlags   match_options,
1213                     GMatchInfo       **match_info,
1214                     GError           **error)
1215 {
1216   GMatchInfo *info;
1217   gboolean match_ok;
1218
1219   g_return_val_if_fail (regex != NULL, FALSE);
1220   g_return_val_if_fail (string != NULL, FALSE);
1221   g_return_val_if_fail (start_position >= 0, FALSE);
1222   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
1223   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
1224
1225   info = match_info_new (regex, string, string_len, start_position,
1226                          match_options, FALSE);
1227   match_ok = g_match_info_next (info, error);
1228   if (match_info != NULL)
1229     *match_info = info;
1230   else
1231     g_match_info_free (info);
1232
1233   return match_ok;
1234 }
1235
1236 /**
1237  * g_regex_match_all:
1238  * @regex: a #GRegex structure from g_regex_new()
1239  * @string: the string to scan for matches
1240  * @match_options: match options
1241  * @match_info: pointer to location where to store the #GMatchInfo,
1242  *   or %NULL if you do not need it
1243  *
1244  * Using the standard algorithm for regular expression matching only
1245  * the longest match in the string is retrieved. This function uses
1246  * a different algorithm so it can retrieve all the possible matches.
1247  * For more documentation see g_regex_match_all_full().
1248  *
1249  * A #GMatchInfo structure, used to get information on the match, is
1250  * stored in @match_info if not %NULL. Note that if @match_info is
1251  * not %NULL then it is created even if the function returns %FALSE,
1252  * i.e. you must free it regardless if regular expression actually
1253  * matched.
1254  *
1255  * Returns: %TRUE if the string matched, %FALSE otherwise
1256  *
1257  * Since: 2.14
1258  */
1259 gboolean
1260 g_regex_match_all (const GRegex      *regex,
1261                    const gchar       *string,
1262                    GRegexMatchFlags   match_options,
1263                    GMatchInfo       **match_info)
1264 {
1265   return g_regex_match_all_full (regex, string, -1, 0, match_options,
1266                                  match_info, NULL);
1267 }
1268
1269 /**
1270  * g_regex_match_all_full:
1271  * @regex: a #GRegex structure from g_regex_new()
1272  * @string: the string to scan for matches
1273  * @string_len: the length of @string, or -1 if @string is nul-terminated
1274  * @start_position: starting index of the string to match
1275  * @match_options: match options
1276  * @match_info: pointer to location where to store the #GMatchInfo,
1277  *   or %NULL if you do not need it
1278  * @error: location to store the error occurring, or %NULL to ignore errors
1279  *
1280  * Using the standard algorithm for regular expression matching only
1281  * the longest match in the string is retrieved, it is not possible
1282  * to obtain all the available matches. For instance matching
1283  * "&lt;a&gt; &lt;b&gt; &lt;c&gt;" against the pattern "&lt;.*&gt;"
1284  * you get "&lt;a&gt; &lt;b&gt; &lt;c&gt;".
1285  *
1286  * This function uses a different algorithm (called DFA, i.e. deterministic
1287  * finite automaton), so it can retrieve all the possible matches, all
1288  * starting at the same point in the string. For instance matching
1289  * "&lt;a&gt; &lt;b&gt; &lt;c&gt;" against the pattern "&lt;.*&gt;"
1290  * you would obtain three matches: "&lt;a&gt; &lt;b&gt; &lt;c&gt;",
1291  * "&lt;a&gt; &lt;b&gt;" and "&lt;a&gt;".
1292  *
1293  * The number of matched strings is retrieved using
1294  * g_match_info_get_match_count(). To obtain the matched strings and
1295  * their position you can use, respectively, g_match_info_fetch() and
1296  * g_match_info_fetch_pos(). Note that the strings are returned in
1297  * reverse order of length; that is, the longest matching string is
1298  * given first.
1299  *
1300  * Note that the DFA algorithm is slower than the standard one and it
1301  * is not able to capture substrings, so backreferences do not work.
1302  *
1303  * Setting @start_position differs from just passing over a shortened
1304  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
1305  * that begins with any kind of lookbehind assertion, such as "\b".
1306  *
1307  * A #GMatchInfo structure, used to get information on the match, is
1308  * stored in @match_info if not %NULL. Note that if @match_info is
1309  * not %NULL then it is created even if the function returns %FALSE,
1310  * i.e. you must free it regardless if regular expression actually
1311  * matched.
1312  *
1313  * Returns: %TRUE if the string matched, %FALSE otherwise
1314  *
1315  * Since: 2.14
1316  */
1317 gboolean
1318 g_regex_match_all_full (const GRegex      *regex,
1319                         const gchar       *string,
1320                         gssize             string_len,
1321                         gint               start_position,
1322                         GRegexMatchFlags   match_options,
1323                         GMatchInfo       **match_info,
1324                         GError           **error)
1325 {
1326   GMatchInfo *info;
1327   gboolean done;
1328
1329   g_return_val_if_fail (regex != NULL, FALSE);
1330   g_return_val_if_fail (string != NULL, FALSE);
1331   g_return_val_if_fail (start_position >= 0, FALSE);
1332   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
1333   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
1334
1335   info = match_info_new (regex, string, string_len, start_position,
1336                          match_options, TRUE);
1337
1338   done = FALSE;
1339   while (!done)
1340     {
1341       done = TRUE;
1342       info->matches = pcre_dfa_exec (regex->pcre_re, regex->extra,
1343                                      info->string, info->string_len,
1344                                      info->pos,
1345                                      regex->match_opts | match_options,
1346                                      info->offsets, info->n_offsets,
1347                                      info->workspace, info->n_workspace);
1348       if (info->matches == PCRE_ERROR_DFA_WSSIZE)
1349         {
1350           /* info->workspace is too small. */
1351           info->n_workspace *= 2;
1352           info->workspace = g_realloc (info->workspace,
1353                                        info->n_workspace * sizeof (gint));
1354           done = FALSE;
1355         }
1356       else if (info->matches == 0)
1357         {
1358           /* info->offsets is too small. */
1359           info->n_offsets *= 2;
1360           info->offsets = g_realloc (info->offsets,
1361                                      info->n_offsets * sizeof (gint));
1362           done = FALSE;
1363         }
1364       else if (IS_PCRE_ERROR (info->matches))
1365         {
1366           g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
1367                        _("Error while matching regular expression %s: %s"),
1368                        regex->pattern, match_error (info->matches));
1369         }
1370     }
1371
1372   /* set info->pos to -1 so that a call to g_match_info_next() fails. */
1373   info->pos = -1;
1374
1375   if (match_info != NULL)
1376     *match_info = info;
1377   else
1378     g_match_info_free (info);
1379
1380   return info->matches >= 0;
1381 }
1382
1383 /**
1384  * g_regex_get_string_number:
1385  * @regex: #GRegex structure
1386  * @name: name of the subexpression
1387  *
1388  * Retrieves the number of the subexpression named @name.
1389  *
1390  * Returns: The number of the subexpression or -1 if @name
1391  *   does not exist
1392  *
1393  * Since: 2.14
1394  */
1395 gint
1396 g_regex_get_string_number (const GRegex *regex,
1397                            const gchar  *name)
1398 {
1399   gint num;
1400
1401   g_return_val_if_fail (regex != NULL, -1);
1402   g_return_val_if_fail (name != NULL, -1);
1403
1404   num = pcre_get_stringnumber (regex->pcre_re, name);
1405   if (num == PCRE_ERROR_NOSUBSTRING)
1406     num = -1;
1407
1408   return num;
1409 }
1410
1411 /**
1412  * g_regex_split_simple:
1413  * @pattern: the regular expression
1414  * @string: the string to scan for matches
1415  * @compile_options: compile options for the regular expression
1416  * @match_options: match options
1417  *
1418  * Breaks the string on the pattern, and returns an array of
1419  * the tokens. If the pattern contains capturing parentheses,
1420  * then the text for each of the substrings will also be returned.
1421  * If the pattern does not match anywhere in the string, then the
1422  * whole string is returned as the first token.
1423  *
1424  * This function is equivalent to g_regex_split() but it does
1425  * not require to compile the pattern with g_regex_new(), avoiding
1426  * some lines of code when you need just to do a split without
1427  * extracting substrings, capture counts, and so on.
1428  *
1429  * If this function is to be called on the same @pattern more than
1430  * once, it's more efficient to compile the pattern once with
1431  * g_regex_new() and then use g_regex_split().
1432  *
1433  * As a special case, the result of splitting the empty string ""
1434  * is an empty vector, not a vector containing a single string.
1435  * The reason for this special case is that being able to represent
1436  * a empty vector is typically more useful than consistent handling
1437  * of empty elements. If you do need to represent empty elements,
1438  * you'll need to check for the empty string before calling this
1439  * function.
1440  *
1441  * A pattern that can match empty strings splits @string into
1442  * separate characters wherever it matches the empty string between
1443  * characters. For example splitting "ab c" using as a separator
1444  * "\s*", you will get "a", "b" and "c".
1445  *
1446  * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
1447  *
1448  * Since: 2.14
1449  **/
1450 gchar **
1451 g_regex_split_simple (const gchar        *pattern,
1452                       const gchar        *string,
1453                       GRegexCompileFlags  compile_options,
1454                       GRegexMatchFlags    match_options)
1455 {
1456   GRegex *regex;
1457   gchar **result;
1458
1459   regex = g_regex_new (pattern, compile_options, 0, NULL);
1460   if (!regex)
1461     return NULL;
1462   result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL);
1463   g_regex_unref (regex);
1464   return result;
1465 }
1466
1467 /**
1468  * g_regex_split:
1469  * @regex: a #GRegex structure
1470  * @string: the string to split with the pattern
1471  * @match_options: match time option flags
1472  *
1473  * Breaks the string on the pattern, and returns an array of the tokens.
1474  * If the pattern contains capturing parentheses, then the text for each
1475  * of the substrings will also be returned. If the pattern does not match
1476  * anywhere in the string, then the whole string is returned as the first
1477  * token.
1478  *
1479  * As a special case, the result of splitting the empty string "" is an
1480  * empty vector, not a vector containing a single string. The reason for
1481  * this special case is that being able to represent a empty vector is
1482  * typically more useful than consistent handling of empty elements. If
1483  * you do need to represent empty elements, you'll need to check for the
1484  * empty string before calling this function.
1485  *
1486  * A pattern that can match empty strings splits @string into separate
1487  * characters wherever it matches the empty string between characters.
1488  * For example splitting "ab c" using as a separator "\s*", you will get
1489  * "a", "b" and "c".
1490  *
1491  * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
1492  *
1493  * Since: 2.14
1494  **/
1495 gchar **
1496 g_regex_split (const GRegex     *regex,
1497                const gchar      *string,
1498                GRegexMatchFlags  match_options)
1499 {
1500   return g_regex_split_full (regex, string, -1, 0,
1501                              match_options, 0, NULL);
1502 }
1503
1504 /**
1505  * g_regex_split_full:
1506  * @regex: a #GRegex structure
1507  * @string: the string to split with the pattern
1508  * @string_len: the length of @string, or -1 if @string is nul-terminated
1509  * @start_position: starting index of the string to match
1510  * @match_options: match time option flags
1511  * @max_tokens: the maximum number of tokens to split @string into.
1512  *   If this is less than 1, the string is split completely
1513  * @error: return location for a #GError
1514  *
1515  * Breaks the string on the pattern, and returns an array of the tokens.
1516  * If the pattern contains capturing parentheses, then the text for each
1517  * of the substrings will also be returned. If the pattern does not match
1518  * anywhere in the string, then the whole string is returned as the first
1519  * token.
1520  *
1521  * As a special case, the result of splitting the empty string "" is an
1522  * empty vector, not a vector containing a single string. The reason for
1523  * this special case is that being able to represent a empty vector is
1524  * typically more useful than consistent handling of empty elements. If
1525  * you do need to represent empty elements, you'll need to check for the
1526  * empty string before calling this function.
1527  *
1528  * A pattern that can match empty strings splits @string into separate
1529  * characters wherever it matches the empty string between characters.
1530  * For example splitting "ab c" using as a separator "\s*", you will get
1531  * "a", "b" and "c".
1532  *
1533  * Setting @start_position differs from just passing over a shortened
1534  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
1535  * that begins with any kind of lookbehind assertion, such as "\b".
1536  *
1537  * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
1538  *
1539  * Since: 2.14
1540  **/
1541 gchar **
1542 g_regex_split_full (const GRegex      *regex,
1543                     const gchar       *string,
1544                     gssize             string_len,
1545                     gint               start_position,
1546                     GRegexMatchFlags   match_options,
1547                     gint               max_tokens,
1548                     GError           **error)
1549 {
1550   GError *tmp_error = NULL;
1551   GMatchInfo *match_info;
1552   GList *list, *last;
1553   gint i;
1554   gint token_count;
1555   gboolean match_ok;
1556   /* position of the last separator. */
1557   gint last_separator_end;
1558   /* was the last match 0 bytes long? */
1559   gboolean last_match_is_empty;
1560   /* the returned array of char **s */
1561   gchar **string_list;
1562
1563   g_return_val_if_fail (regex != NULL, NULL);
1564   g_return_val_if_fail (string != NULL, NULL);
1565   g_return_val_if_fail (start_position >= 0, NULL);
1566   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
1567   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
1568
1569   if (max_tokens <= 0)
1570     max_tokens = G_MAXINT;
1571
1572   if (string_len < 0)
1573     string_len = strlen (string);
1574
1575   /* zero-length string */
1576   if (string_len - start_position == 0)
1577     return g_new0 (gchar *, 1);
1578
1579   if (max_tokens == 1)
1580     {
1581       string_list = g_new0 (gchar *, 2);
1582       string_list[0] = g_strndup (&string[start_position],
1583                                   string_len - start_position);
1584       return string_list;
1585     }
1586
1587   list = NULL;
1588   token_count = 0;
1589   last_separator_end = start_position;
1590   last_match_is_empty = FALSE;
1591
1592   match_ok = g_regex_match_full (regex, string, string_len, start_position,
1593                                  match_options, &match_info, &tmp_error);
1594   while (tmp_error == NULL)
1595     {
1596       if (match_ok)
1597         {
1598           last_match_is_empty =
1599                     (match_info->offsets[0] == match_info->offsets[1]);
1600
1601           /* we need to skip empty separators at the same position of the end
1602            * of another separator. e.g. the string is "a b" and the separator
1603            * is " *", so from 1 to 2 we have a match and at position 2 we have
1604            * an empty match. */
1605           if (last_separator_end != match_info->offsets[1])
1606             {
1607               gchar *token;
1608               gint match_count;
1609
1610               token = g_strndup (string + last_separator_end,
1611                                  match_info->offsets[0] - last_separator_end);
1612               list = g_list_prepend (list, token);
1613               token_count++;
1614
1615               /* if there were substrings, these need to be added to
1616                * the list. */
1617               match_count = g_match_info_get_match_count (match_info);
1618               if (match_count > 1)
1619                 {
1620                   for (i = 1; i < match_count; i++)
1621                     list = g_list_prepend (list, g_match_info_fetch (match_info, i));
1622                 }
1623             }
1624         }
1625       else
1626         {
1627           /* if there was no match, copy to end of string. */
1628           if (!last_match_is_empty)
1629             {
1630               gchar *token = g_strndup (string + last_separator_end,
1631                                         match_info->string_len - last_separator_end);
1632               list = g_list_prepend (list, token);
1633             }
1634           /* no more tokens, end the loop. */
1635           break;
1636         }
1637
1638       /* -1 to leave room for the last part. */
1639       if (token_count >= max_tokens - 1)
1640         {
1641           /* we have reached the maximum number of tokens, so we copy
1642            * the remaining part of the string. */
1643           if (last_match_is_empty)
1644             {
1645               /* the last match was empty, so we have moved one char
1646                * after the real position to avoid empty matches at the
1647                * same position. */
1648               match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string;
1649             }
1650           /* the if is needed in the case we have terminated the available
1651            * tokens, but we are at the end of the string, so there are no
1652            * characters left to copy. */
1653           if (string_len > match_info->pos)
1654             {
1655               gchar *token = g_strndup (string + match_info->pos,
1656                                         string_len - match_info->pos);
1657               list = g_list_prepend (list, token);
1658             }
1659           /* end the loop. */
1660           break;
1661         }
1662
1663       last_separator_end = match_info->pos;
1664       if (last_match_is_empty)
1665         /* if the last match was empty, g_match_info_next() has moved
1666          * forward to avoid infinite loops, but we still need to copy that
1667          * character. */
1668         last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string;
1669
1670       match_ok = g_match_info_next (match_info, &tmp_error);
1671     }
1672   g_match_info_free (match_info);
1673   if (tmp_error != NULL)
1674     {
1675       g_propagate_error (error, tmp_error);
1676       g_list_foreach (list, (GFunc)g_free, NULL);
1677       g_list_free (list);
1678       match_info->pos = -1;
1679       return NULL;
1680     }
1681
1682   string_list = g_new (gchar *, g_list_length (list) + 1);
1683   i = 0;
1684   for (last = g_list_last (list); last; last = g_list_previous (last))
1685     string_list[i++] = last->data;
1686   string_list[i] = NULL;
1687   g_list_free (list);
1688
1689   return string_list;
1690 }
1691
/* Kinds of pieces a parsed replacement string is split into; the value
 * is stored in InterpolationData.type. */
enum
{
  REPL_TYPE_STRING             = 0, /* literal text, in InterpolationData.text */
  REPL_TYPE_CHARACTER          = 1, /* one character, in InterpolationData.c */
  REPL_TYPE_SYMBOLIC_REFERENCE = 2, /* \g<name>, name in InterpolationData.text */
  REPL_TYPE_NUMERIC_REFERENCE  = 3, /* \N or \g<N>, number in InterpolationData.num */
  REPL_TYPE_CHANGE_CASE        = 4  /* \l \u \L \U \E, mode in InterpolationData.change_case */
};
1700
/* Case conversion modes selected by the \l, \u, \L, \U and \E escapes.
 * Every mode is a distinct bit so that the *_MASK values below can test
 * for a family of modes with a single bitwise AND. */
typedef enum
{
  CHANGE_CASE_NONE         = 0x01, /* \E: leave the text as it is */
  CHANGE_CASE_UPPER        = 0x02, /* \U */
  CHANGE_CASE_LOWER        = 0x04, /* \L */
  CHANGE_CASE_UPPER_SINGLE = 0x08, /* \u */
  CHANGE_CASE_LOWER_SINGLE = 0x10, /* \l */

  /* modes that affect only one character */
  CHANGE_CASE_SINGLE_MASK  = CHANGE_CASE_LOWER_SINGLE | CHANGE_CASE_UPPER_SINGLE,
  /* modes that convert to lower case */
  CHANGE_CASE_LOWER_MASK   = CHANGE_CASE_LOWER_SINGLE | CHANGE_CASE_LOWER,
  /* modes that convert to upper case */
  CHANGE_CASE_UPPER_MASK   = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_UPPER
} ChangeCase;
1712
1713 struct _InterpolationData
1714 {
1715   gchar     *text;
1716   gint       type;
1717   gint       num;
1718   gchar      c;
1719   ChangeCase change_case;
1720 };
1721
1722 static void
1723 free_interpolation_data (InterpolationData *data)
1724 {
1725   g_free (data->text);
1726   g_free (data);
1727 }
1728
1729 static const gchar *
1730 expand_escape (const gchar        *replacement,
1731                const gchar        *p,
1732                InterpolationData  *data,
1733                GError            **error)
1734 {
1735   const gchar *q, *r;
1736   gint x, d, h, i;
1737   const gchar *error_detail;
1738   gint base = 0;
1739   GError *tmp_error = NULL;
1740
1741   p++;
1742   switch (*p)
1743     {
1744     case 't':
1745       p++;
1746       data->c = '\t';
1747       data->type = REPL_TYPE_CHARACTER;
1748       break;
1749     case 'n':
1750       p++;
1751       data->c = '\n';
1752       data->type = REPL_TYPE_CHARACTER;
1753       break;
1754     case 'v':
1755       p++;
1756       data->c = '\v';
1757       data->type = REPL_TYPE_CHARACTER;
1758       break;
1759     case 'r':
1760       p++;
1761       data->c = '\r';
1762       data->type = REPL_TYPE_CHARACTER;
1763       break;
1764     case 'f':
1765       p++;
1766       data->c = '\f';
1767       data->type = REPL_TYPE_CHARACTER;
1768       break;
1769     case 'a':
1770       p++;
1771       data->c = '\a';
1772       data->type = REPL_TYPE_CHARACTER;
1773       break;
1774     case 'b':
1775       p++;
1776       data->c = '\b';
1777       data->type = REPL_TYPE_CHARACTER;
1778       break;
1779     case '\\':
1780       p++;
1781       data->c = '\\';
1782       data->type = REPL_TYPE_CHARACTER;
1783       break;
1784     case 'x':
1785       p++;
1786       x = 0;
1787       if (*p == '{')
1788         {
1789           p++;
1790           do
1791             {
1792               h = g_ascii_xdigit_value (*p);
1793               if (h < 0)
1794                 {
1795                   error_detail = _("hexadecimal digit or '}' expected");
1796                   goto error;
1797                 }
1798               x = x * 16 + h;
1799               p++;
1800             }
1801           while (*p != '}');
1802           p++;
1803         }
1804       else
1805         {
1806           for (i = 0; i < 2; i++)
1807             {
1808               h = g_ascii_xdigit_value (*p);
1809               if (h < 0)
1810                 {
1811                   error_detail = _("hexadecimal digit expected");
1812                   goto error;
1813                 }
1814               x = x * 16 + h;
1815               p++;
1816             }
1817         }
1818       data->type = REPL_TYPE_STRING;
1819       data->text = g_new0 (gchar, 8);
1820       g_unichar_to_utf8 (x, data->text);
1821       break;
1822     case 'l':
1823       p++;
1824       data->type = REPL_TYPE_CHANGE_CASE;
1825       data->change_case = CHANGE_CASE_LOWER_SINGLE;
1826       break;
1827     case 'u':
1828       p++;
1829       data->type = REPL_TYPE_CHANGE_CASE;
1830       data->change_case = CHANGE_CASE_UPPER_SINGLE;
1831       break;
1832     case 'L':
1833       p++;
1834       data->type = REPL_TYPE_CHANGE_CASE;
1835       data->change_case = CHANGE_CASE_LOWER;
1836       break;
1837     case 'U':
1838       p++;
1839       data->type = REPL_TYPE_CHANGE_CASE;
1840       data->change_case = CHANGE_CASE_UPPER;
1841       break;
1842     case 'E':
1843       p++;
1844       data->type = REPL_TYPE_CHANGE_CASE;
1845       data->change_case = CHANGE_CASE_NONE;
1846       break;
1847     case 'g':
1848       p++;
1849       if (*p != '<')
1850         {
1851           error_detail = _("missing '<' in symbolic reference");
1852           goto error;
1853         }
1854       q = p + 1;
1855       do
1856         {
1857           p++;
1858           if (!*p)
1859             {
1860               error_detail = _("unfinished symbolic reference");
1861               goto error;
1862             }
1863         }
1864       while (*p != '>');
1865       if (p - q == 0)
1866         {
1867           error_detail = _("zero-length symbolic reference");
1868           goto error;
1869         }
1870       if (g_ascii_isdigit (*q))
1871         {
1872           x = 0;
1873           do
1874             {
1875               h = g_ascii_digit_value (*q);
1876               if (h < 0)
1877                 {
1878                   error_detail = _("digit expected");
1879                   p = q;
1880                   goto error;
1881                 }
1882               x = x * 10 + h;
1883               q++;
1884             }
1885           while (q != p);
1886           data->num = x;
1887           data->type = REPL_TYPE_NUMERIC_REFERENCE;
1888         }
1889       else
1890         {
1891           r = q;
1892           do
1893             {
1894               if (!g_ascii_isalnum (*r))
1895                 {
1896                   error_detail = _("illegal symbolic reference");
1897                   p = r;
1898                   goto error;
1899                 }
1900               r++;
1901             }
1902           while (r != p);
1903           data->text = g_strndup (q, p - q);
1904           data->type = REPL_TYPE_SYMBOLIC_REFERENCE;
1905         }
1906       p++;
1907       break;
1908     case '0':
1909       /* if \0 is followed by a number is an octal number representing a
1910        * character, else it is a numeric reference. */
1911       if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0)
1912         {
1913           base = 8;
1914           p = g_utf8_next_char (p);
1915         }
1916     case '1':
1917     case '2':
1918     case '3':
1919     case '4':
1920     case '5':
1921     case '6':
1922     case '7':
1923     case '8':
1924     case '9':
1925       x = 0;
1926       d = 0;
1927       for (i = 0; i < 3; i++)
1928         {
1929           h = g_ascii_digit_value (*p);
1930           if (h < 0)
1931             break;
1932           if (h > 7)
1933             {
1934               if (base == 8)
1935                 break;
1936               else
1937                 base = 10;
1938             }
1939           if (i == 2 && base == 10)
1940             break;
1941           x = x * 8 + h;
1942           d = d * 10 + h;
1943           p++;
1944         }
1945       if (base == 8 || i == 3)
1946         {
1947           data->type = REPL_TYPE_STRING;
1948           data->text = g_new0 (gchar, 8);
1949           g_unichar_to_utf8 (x, data->text);
1950         }
1951       else
1952         {
1953           data->type = REPL_TYPE_NUMERIC_REFERENCE;
1954           data->num = d;
1955         }
1956       break;
1957     case 0:
1958       error_detail = _("stray final '\\'");
1959       goto error;
1960       break;
1961     default:
1962       error_detail = _("unknown escape sequence");
1963       goto error;
1964     }
1965
1966   return p;
1967
1968  error:
1969   /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */
1970   tmp_error = g_error_new (G_REGEX_ERROR,
1971                            G_REGEX_ERROR_REPLACE,
1972                            _("Error while parsing replacement "
1973                              "text \"%s\" at char %lu: %s"),
1974                            replacement,
1975                            (gulong)(p - replacement),
1976                            error_detail);
1977   g_propagate_error (error, tmp_error);
1978
1979   return NULL;
1980 }
1981
1982 static GList *
1983 split_replacement (const gchar  *replacement,
1984                    GError      **error)
1985 {
1986   GList *list = NULL;
1987   InterpolationData *data;
1988   const gchar *p, *start;
1989
1990   start = p = replacement;
1991   while (*p)
1992     {
1993       if (*p == '\\')
1994         {
1995           data = g_new0 (InterpolationData, 1);
1996           start = p = expand_escape (replacement, p, data, error);
1997           if (p == NULL)
1998             {
1999               g_list_foreach (list, (GFunc)free_interpolation_data, NULL);
2000               g_list_free (list);
2001               free_interpolation_data (data);
2002
2003               return NULL;
2004             }
2005           list = g_list_prepend (list, data);
2006         }
2007       else
2008         {
2009           p++;
2010           if (*p == '\\' || *p == '\0')
2011             {
2012               if (p - start > 0)
2013                 {
2014                   data = g_new0 (InterpolationData, 1);
2015                   data->text = g_strndup (start, p - start);
2016                   data->type = REPL_TYPE_STRING;
2017                   list = g_list_prepend (list, data);
2018                 }
2019             }
2020         }
2021     }
2022
2023   return g_list_reverse (list);
2024 }
2025
/* Change the case of c based on change_case: lower case for the modes in
 * CHANGE_CASE_LOWER_MASK, upper case for the modes in
 * CHANGE_CASE_UPPER_MASK and c itself for anything else (for instance
 * CHANGE_CASE_NONE). The result is always a gunichar. */
#define CHANGE_CASE(c, change_case) \
        (((change_case) & CHANGE_CASE_LOWER_MASK) ? \
                g_unichar_tolower (c) : \
         ((change_case) & CHANGE_CASE_UPPER_MASK) ? \
                g_unichar_toupper (c) : \
                (gunichar) (c))
2031
2032 static void
2033 string_append (GString     *string,
2034                const gchar *text,
2035                ChangeCase  *change_case)
2036 {
2037   gunichar c;
2038
2039   if (text[0] == '\0')
2040     return;
2041
2042   if (*change_case == CHANGE_CASE_NONE)
2043     {
2044       g_string_append (string, text);
2045     }
2046   else if (*change_case & CHANGE_CASE_SINGLE_MASK)
2047     {
2048       c = g_utf8_get_char (text);
2049       g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
2050       g_string_append (string, g_utf8_next_char (text));
2051       *change_case = CHANGE_CASE_NONE;
2052     }
2053   else
2054     {
2055       while (*text != '\0')
2056         {
2057           c = g_utf8_get_char (text);
2058           g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
2059           text = g_utf8_next_char (text);
2060         }
2061     }
2062 }
2063
2064 static gboolean
2065 interpolate_replacement (const GMatchInfo *match_info,
2066                          GString          *result,
2067                          gpointer          data)
2068 {
2069   GList *list;
2070   InterpolationData *idata;
2071   gchar *match;
2072   ChangeCase change_case = CHANGE_CASE_NONE;
2073
2074   for (list = data; list; list = list->next)
2075     {
2076       idata = list->data;
2077       switch (idata->type)
2078         {
2079         case REPL_TYPE_STRING:
2080           string_append (result, idata->text, &change_case);
2081           break;
2082         case REPL_TYPE_CHARACTER:
2083           g_string_append_c (result, CHANGE_CASE (idata->c, change_case));
2084           if (change_case & CHANGE_CASE_SINGLE_MASK)
2085             change_case = CHANGE_CASE_NONE;
2086           break;
2087         case REPL_TYPE_NUMERIC_REFERENCE:
2088           match = g_match_info_fetch (match_info, idata->num);
2089           if (match)
2090             {
2091               string_append (result, match, &change_case);
2092               g_free (match);
2093             }
2094           break;
2095         case REPL_TYPE_SYMBOLIC_REFERENCE:
2096           match = g_match_info_fetch_named (match_info, idata->text);
2097           if (match)
2098             {
2099               string_append (result, match, &change_case);
2100               g_free (match);
2101             }
2102           break;
2103         case REPL_TYPE_CHANGE_CASE:
2104           change_case = idata->change_case;
2105           break;
2106         }
2107     }
2108
2109   return FALSE;
2110 }
2111
2112 /* whether actual match_info is needed for replacement, i.e.
2113  * whether there are references
2114  */
2115 static gboolean
2116 interpolation_list_needs_match (GList *list)
2117 {
2118   while (list != NULL)
2119     {
2120       InterpolationData *data = list->data;
2121
2122       if (data->type == REPL_TYPE_SYMBOLIC_REFERENCE ||
2123           data->type == REPL_TYPE_NUMERIC_REFERENCE)
2124         {
2125           return TRUE;
2126         }
2127
2128       list = list->next;
2129     }
2130
2131   return FALSE;
2132 }
2133
2134 /**
2135  * g_regex_replace:
2136  * @regex: a #GRegex structure
2137  * @string: the string to perform matches against
2138  * @string_len: the length of @string, or -1 if @string is nul-terminated
2139  * @start_position: starting index of the string to match
2140  * @replacement: text to replace each match with
2141  * @match_options: options for the match
2142  * @error: location to store the error occuring, or %NULL to ignore errors
2143  *
2144  * Replaces all occurances of the pattern in @regex with the
2145  * replacement text. Backreferences of the form '\number' or
2146  * '\g&lt;number&gt;' in the replacement text are interpolated by the
2147  * number-th captured subexpression of the match, '\g&lt;name&gt;' refers
2148  * to the captured subexpression with the given name. '\0' refers to the
2149  * complete match, but '\0' followed by a number is the octal representation
2150  * of a character. To include a literal '\' in the replacement, write '\\'.
2151  * There are also escapes that changes the case of the following text:
2152  *
2153  * <variablelist>
2154  * <varlistentry><term>\l</term>
2155  * <listitem>
2156  * <para>Convert to lower case the next character</para>
2157  * </listitem>
2158  * </varlistentry>
2159  * <varlistentry><term>\u</term>
2160  * <listitem>
2161  * <para>Convert to upper case the next character</para>
2162  * </listitem>
2163  * </varlistentry>
2164  * <varlistentry><term>\L</term>
2165  * <listitem>
2166  * <para>Convert to lower case till \E</para>
2167  * </listitem>
2168  * </varlistentry>
2169  * <varlistentry><term>\U</term>
2170  * <listitem>
2171  * <para>Convert to upper case till \E</para>
2172  * </listitem>
2173  * </varlistentry>
2174  * <varlistentry><term>\E</term>
2175  * <listitem>
2176  * <para>End case modification</para>
2177  * </listitem>
2178  * </varlistentry>
2179  * </variablelist>
2180  *
2181  * If you do not need to use backreferences use g_regex_replace_literal().
2182  *
2183  * The @replacement string must be UTF-8 encoded even if #G_REGEX_RAW was
2184  * passed to g_regex_new(). If you want to use not UTF-8 encoded stings
2185  * you can use g_regex_replace_literal().
2186  *
2187  * Setting @start_position differs from just passing over a shortened
2188  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that
2189  * begins with any kind of lookbehind assertion, such as "\b".
2190  *
2191  * Returns: a newly allocated string containing the replacements
2192  *
2193  * Since: 2.14
2194  */
2195 gchar *
2196 g_regex_replace (const GRegex      *regex,
2197                  const gchar       *string,
2198                  gssize             string_len,
2199                  gint               start_position,
2200                  const gchar       *replacement,
2201                  GRegexMatchFlags   match_options,
2202                  GError           **error)
2203 {
2204   gchar *result;
2205   GList *list;
2206   GError *tmp_error = NULL;
2207
2208   g_return_val_if_fail (regex != NULL, NULL);
2209   g_return_val_if_fail (string != NULL, NULL);
2210   g_return_val_if_fail (start_position >= 0, NULL);
2211   g_return_val_if_fail (replacement != NULL, NULL);
2212   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
2213   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2214
2215   list = split_replacement (replacement, &tmp_error);
2216   if (tmp_error != NULL)
2217     {
2218       g_propagate_error (error, tmp_error);
2219       return NULL;
2220     }
2221
2222   result = g_regex_replace_eval (regex,
2223                                  string, string_len, start_position,
2224                                  match_options,
2225                                  interpolate_replacement,
2226                                  (gpointer)list,
2227                                  &tmp_error);
2228   if (tmp_error != NULL)
2229     g_propagate_error (error, tmp_error);
2230
2231   g_list_foreach (list, (GFunc)free_interpolation_data, NULL);
2232   g_list_free (list);
2233
2234   return result;
2235 }
2236
2237 static gboolean
2238 literal_replacement (const GMatchInfo *match_info,
2239                      GString          *result,
2240                      gpointer          data)
2241 {
2242   g_string_append (result, data);
2243   return FALSE;
2244 }
2245
2246 /**
2247  * g_regex_replace_literal:
2248  * @regex: a #GRegex structure
2249  * @string: the string to perform matches against
2250  * @string_len: the length of @string, or -1 if @string is nul-terminated
2251  * @start_position: starting index of the string to match
2252  * @replacement: text to replace each match with
2253  * @match_options: options for the match
2254  * @error: location to store the error occuring, or %NULL to ignore errors
2255  *
2256  * Replaces all occurances of the pattern in @regex with the
2257  * replacement text. @replacement is replaced literally, to
2258  * include backreferences use g_regex_replace().
2259  *
2260  * Setting @start_position differs from just passing over a
2261  * shortened string and setting #G_REGEX_MATCH_NOTBOL in the
2262  * case of a pattern that begins with any kind of lookbehind
2263  * assertion, such as "\b".
2264  *
2265  * Returns: a newly allocated string containing the replacements
2266  *
2267  * Since: 2.14
2268  */
2269 gchar *
2270 g_regex_replace_literal (const GRegex      *regex,
2271                          const gchar       *string,
2272                          gssize             string_len,
2273                          gint               start_position,
2274                          const gchar       *replacement,
2275                          GRegexMatchFlags   match_options,
2276                          GError           **error)
2277 {
2278   g_return_val_if_fail (replacement != NULL, NULL);
2279   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2280
2281   return g_regex_replace_eval (regex,
2282                                string, string_len, start_position,
2283                                match_options,
2284                                literal_replacement,
2285                                (gpointer)replacement,
2286                                error);
2287 }
2288
2289 /**
2290  * g_regex_replace_eval:
2291  * @regex: a #GRegex structure from g_regex_new()
2292  * @string: string to perform matches against
2293  * @string_len: the length of @string, or -1 if @string is nul-terminated
2294  * @start_position: starting index of the string to match
2295  * @match_options: options for the match
2296  * @eval: a function to call for each match
2297  * @user_data: user data to pass to the function
2298  * @error: location to store the error occuring, or %NULL to ignore errors
2299  *
2300  * Replaces occurances of the pattern in regex with the output of
2301  * @eval for that occurance.
2302  *
2303  * Setting @start_position differs from just passing over a shortened
2304  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
2305  * that begins with any kind of lookbehind assertion, such as "\b".
2306  *
2307  * Returns: a newly allocated string containing the replacements
2308  *
2309  * Since: 2.14
2310  */
2311 gchar *
2312 g_regex_replace_eval (const GRegex        *regex,
2313                       const gchar         *string,
2314                       gssize               string_len,
2315                       gint                 start_position,
2316                       GRegexMatchFlags     match_options,
2317                       GRegexEvalCallback   eval,
2318                       gpointer             user_data,
2319                       GError             **error)
2320 {
2321   GMatchInfo *match_info;
2322   GString *result;
2323   gint str_pos = 0;
2324   gboolean done = FALSE;
2325   GError *tmp_error = NULL;
2326
2327   g_return_val_if_fail (regex != NULL, NULL);
2328   g_return_val_if_fail (string != NULL, NULL);
2329   g_return_val_if_fail (start_position >= 0, NULL);
2330   g_return_val_if_fail (eval != NULL, NULL);
2331   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2332
2333   if (string_len < 0)
2334     string_len = strlen (string);
2335
2336   result = g_string_sized_new (string_len);
2337
2338   /* run down the string making matches. */
2339   g_regex_match_full (regex, string, string_len, start_position,
2340                       match_options, &match_info, &tmp_error);
2341   while (!done && g_match_info_matches (match_info))
2342     {
2343       g_string_append_len (result,
2344                            string + str_pos,
2345                            match_info->offsets[0] - str_pos);
2346       done = (*eval) (match_info, result, user_data);
2347       str_pos = match_info->offsets[1];
2348       g_match_info_next (match_info, &tmp_error);
2349     }
2350   g_match_info_free (match_info);
2351   if (tmp_error != NULL)
2352     {
2353       g_propagate_error (error, tmp_error);
2354       g_string_free (result, TRUE);
2355       return NULL;
2356     }
2357
2358   g_string_append_len (result, string + str_pos, string_len - str_pos);
2359   return g_string_free (result, FALSE);
2360 }
2361
2362 /**
2363  * g_regex_check_replacement:
2364  * @replacement: the replacement string
2365  * @has_references: location to store information about
2366  *   references in @replacement or %NULL
2367  * @error: location to store error
2368  *
2369  * Checks whether @replacement is a valid replacement string
2370  * (see g_regex_replace()), i.e. that all escape sequences in
2371  * it are valid.
2372  *
2373  * If @has_references is not %NULL then @replacement is checked
2374  * for pattern references. For instance, replacement text 'foo\n'
2375  * does not contain references and may be evaluated without information
2376  * about actual match, but '\0\1' (whole match followed by first
2377  * subpattern) requires valid #GMatchInfo object.
2378  *
2379  * Returns: whether @replacement is a valid replacement string
2380  *
2381  * Since: 2.14
2382  */
2383 gboolean
2384 g_regex_check_replacement (const gchar  *replacement,
2385                            gboolean     *has_references,
2386                            GError      **error)
2387 {
2388   GList *list;
2389   GError *tmp = NULL;
2390
2391   list = split_replacement (replacement, &tmp);
2392
2393   if (tmp)
2394   {
2395     g_propagate_error (error, tmp);
2396     return FALSE;
2397   }
2398
2399   if (has_references)
2400     *has_references = interpolation_list_needs_match (list);
2401
2402   g_list_foreach (list, (GFunc) free_interpolation_data, NULL);
2403   g_list_free (list);
2404
2405   return TRUE;
2406 }
2407
2408 /**
2409  * g_regex_escape_string:
2410  * @string: the string to escape
2411  * @length: the length of @string, or -1 if @string is nul-terminated
2412  *
2413  * Escapes the special characters used for regular expressions
2414  * in @string, for instance "a.b*c" becomes "a\.b\*c". This
2415  * function is useful to dynamically generate regular expressions.
2416  *
2417  * @string can contain nul characters that are replaced with "\0",
2418  * in this case remember to specify the correct length of @string
2419  * in @length.
2420  *
2421  * Returns: a newly-allocated escaped string
2422  *
2423  * Since: 2.14
2424  */
2425 gchar *
2426 g_regex_escape_string (const gchar *string,
2427                        gint         length)
2428 {
2429   GString *escaped;
2430   const char *p, *piece_start, *end;
2431
2432   g_return_val_if_fail (string != NULL, NULL);
2433
2434   if (length < 0)
2435     length = strlen (string);
2436
2437   end = string + length;
2438   p = piece_start = string;
2439   escaped = g_string_sized_new (length + 1);
2440
2441   while (p < end)
2442     {
2443       switch (*p)
2444         {
2445         case '\0':
2446         case '\\':
2447         case '|':
2448         case '(':
2449         case ')':
2450         case '[':
2451         case ']':
2452         case '{':
2453         case '}':
2454         case '^':
2455         case '$':
2456         case '*':
2457         case '+':
2458         case '?':
2459         case '.':
2460           if (p != piece_start)
2461             /* copy the previous piece. */
2462             g_string_append_len (escaped, piece_start, p - piece_start);
2463           g_string_append_c (escaped, '\\');
2464           if (*p == '\0')
2465             g_string_append_c (escaped, '0');
2466           else
2467             g_string_append_c (escaped, *p);
2468           piece_start = ++p;
2469           break;
2470         default:
2471           p = g_utf8_next_char (p);
2472           break;
2473         }
2474   }
2475
2476   if (piece_start < end)
2477     g_string_append_len (escaped, piece_start, end - piece_start);
2478
2479   return g_string_free (escaped, FALSE);
2480 }
2481
2482 #define __G_REGEX_C__
2483 #include "galiasdef.c"