glib/gregex.c

   1 /* GRegex -- regular expression API wrapper around PCRE.
   2  *
   3  * Copyright (C) 1999, 2000 Scott Wimer
   4  * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com>
   5  * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org>
   6  *
   7  * This library is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * This library is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with this library; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  20  */
  21
  22 #include "config.h"
  23
  24 #include <string.h>
  25
  26 #include "glib.h"
  27 #include "glibintl.h"
  28 #include "gregex.h"
  29
  30 #ifdef USE_SYSTEM_PCRE
  31 #include <pcre.h>
  32 #else
  33 #include "pcre/pcre.h"
  34 #endif
  35
  36 #include "galias.h"
  37
  38 /* Mask of all the possible values for GRegexCompileFlags. */
  39 #define G_REGEX_COMPILE_MASK (G_REGEX_CASELESS          | \
  40                               G_REGEX_MULTILINE         | \
  41                               G_REGEX_DOTALL            | \
  42                               G_REGEX_EXTENDED          | \
  43                               G_REGEX_ANCHORED          | \
  44                               G_REGEX_DOLLAR_ENDONLY    | \
  45                               G_REGEX_UNGREEDY          | \
  46                               G_REGEX_RAW               | \
  47                               G_REGEX_NO_AUTO_CAPTURE   | \
  48                               G_REGEX_OPTIMIZE          | \
  49                               G_REGEX_DUPNAMES          | \
  50                               G_REGEX_NEWLINE_CR        | \
  51                               G_REGEX_NEWLINE_LF        | \
  52                               G_REGEX_NEWLINE_CRLF)
  53
  54 /* Mask of all the possible values for GRegexMatchFlags. */
  55 #define G_REGEX_MATCH_MASK (G_REGEX_MATCH_ANCHORED      | \
  56                             G_REGEX_MATCH_NOTBOL        | \
  57                             G_REGEX_MATCH_NOTEOL        | \
  58                             G_REGEX_MATCH_NOTEMPTY      | \
  59                             G_REGEX_MATCH_PARTIAL       | \
  60                             G_REGEX_MATCH_NEWLINE_CR    | \
  61                             G_REGEX_MATCH_NEWLINE_LF    | \
  62                             G_REGEX_MATCH_NEWLINE_CRLF  | \
  63                             G_REGEX_MATCH_NEWLINE_ANY)
  64
  65 /* if the string is in UTF-8 use g_utf8_ functions, else use
  66  * use just +/- 1. */
  67 #define NEXT_CHAR(re, s) (((re)->compile_opts & PCRE_UTF8) ? \
  68                                 g_utf8_next_char (s) : \
  69                                 ((s) + 1))
  70 #define PREV_CHAR(re, s) (((re)->compile_opts & PCRE_UTF8) ? \
  71                                 g_utf8_prev_char (s) : \
  72                                 ((s) - 1))
  73
  74 struct _GMatchInfo
  75 {
  76   GRegex *regex;                /* the regex */
  77   GRegexMatchFlags match_opts;  /* options used at match time on the regex */
  78   gint matches;                 /* number of matching sub patterns */
  79   gint pos;                     /* position in the string where last match left off */
  80   gint *offsets;                /* array of offsets paired 0,1 ; 2,3 ; 3,4 etc */
  81   gint n_offsets;               /* number of offsets */
  82   gint *workspace;              /* workspace for pcre_dfa_exec() */
  83   gint n_workspace;             /* number of workspace elements */
  84   const gchar *string;          /* string passed to the match function */
  85   gssize string_len;            /* length of string */
  86 };
  87
  88 struct _GRegex
  89 {
  90   volatile guint ref_count;     /* the ref count for the immutable part */
  91   gchar *pattern;               /* the pattern */
  92   pcre *pcre_re;                /* compiled form of the pattern */
  93   GRegexCompileFlags compile_opts;      /* options used at compile time on the pattern */
  94   GRegexMatchFlags match_opts;  /* options used at match time on the regex */
  95   pcre_extra *extra;            /* data stored when G_REGEX_OPTIMIZE is used */
  96 };
  97
  98 /* TRUE if ret is an error code, FALSE otherwise. */
  99 #define IS_PCRE_ERROR(ret) ((ret) < PCRE_ERROR_NOMATCH && (ret) != PCRE_ERROR_PARTIAL)
 100
 101 typedef struct _InterpolationData InterpolationData;
 102 static gboolean  interpolation_list_needs_match (GList *list);
 103 static gboolean  interpolate_replacement        (const GMatchInfo *match_info,
 104                                                  GString *result,
 105                                                  gpointer data);
 106 static GList    *split_replacement              (const gchar *replacement,
 107                                                  GError **error);
 108 static void      free_interpolation_data        (InterpolationData *data);
 109
 110
 111 static const gchar *
 112 match_error (gint errcode)
 113 {
 114   switch (errcode)
 115     {
 116     case PCRE_ERROR_NOMATCH:
 117       /* not an error */
 118       break;
 119     case PCRE_ERROR_NULL:
 120       /* NULL argument, this should not happen in GRegex */
 121       g_warning ("A NULL argument was passed to PCRE");
 122       break;
 123     case PCRE_ERROR_BADOPTION:
 124       return "bad options";
 125     case PCRE_ERROR_BADMAGIC:
 126       return _("corrupted object");
 127     case PCRE_ERROR_UNKNOWN_OPCODE:
 128       return N_("internal error or corrupted object");
 129     case PCRE_ERROR_NOMEMORY:
 130       return _("out of memory");
 131     case PCRE_ERROR_NOSUBSTRING:
 132       /* not used by pcre_exec() */
 133       break;
 134     case PCRE_ERROR_MATCHLIMIT:
 135       return _("backtracking limit reached");
 136     case PCRE_ERROR_CALLOUT:
 137       /* callouts are not implemented */
 138       break;
 139     case PCRE_ERROR_BADUTF8:
 140     case PCRE_ERROR_BADUTF8_OFFSET:
 141       /* we do not check if strings are valid */
 142       break;
 143     case PCRE_ERROR_PARTIAL:
 144       /* not an error */
 145       break;
 146     case PCRE_ERROR_BADPARTIAL:
 147       return _("the pattern contains items not supported for partial matching");
 148     case PCRE_ERROR_INTERNAL:
 149       return _("internal error");
 150     case PCRE_ERROR_BADCOUNT:
 151       /* negative ovecsize, this should not happen in GRegex */
 152       g_warning ("A negative ovecsize was passed to PCRE");
 153       break;
 154     case PCRE_ERROR_DFA_UITEM:
 155       return _("the pattern contains items not supported for partial matching");
 156     case PCRE_ERROR_DFA_UCOND:
 157       return _("back references as conditions are not supported for partial matching");
 158     case PCRE_ERROR_DFA_UMLIMIT:
 159       /* the match_field field is not used in GRegex */
 160       break;
 161     case PCRE_ERROR_DFA_WSSIZE:
 162       /* handled expanding the workspace */
 163       break;
 164     case PCRE_ERROR_DFA_RECURSE:
 165     case PCRE_ERROR_RECURSIONLIMIT:
 166       return _("recursion limit reached");
 167     case PCRE_ERROR_NULLWSLIMIT:
 168       return _("workspace limit for empty substrings reached");
 169     case PCRE_ERROR_BADNEWLINE:
 170       return _("invalid combination of newline flags");
 171     default:
 172       break;
 173     }
 174   return _("unknown error");
 175 }
 176
 177
 178 /* GMatchInfo */
 179
 180 static GMatchInfo *
 181 match_info_new (const GRegex *regex,
 182                 const gchar  *string,
 183                 gint          string_len,
 184                 gint          start_position,
 185                 gint          match_options,
 186                 gboolean      is_dfa)
 187 {
 188   GMatchInfo *match_info;
 189
 190   if (string_len < 0)
 191     string_len = strlen (string);
 192
 193   match_info = g_new0 (GMatchInfo, 1);
 194   match_info->regex = g_regex_ref ((GRegex *)regex);
 195   match_info->string = string;
 196   match_info->string_len = string_len;
 197   match_info->matches = PCRE_ERROR_NOMATCH;
 198   match_info->pos = start_position;
 199   match_info->match_opts = match_options;
 200
 201   if (is_dfa)
 202     {
 203       /* These values should be enough for most cases, if they are not
 204        * enough g_regex_match_all_full() will expand them. */
 205       match_info->n_offsets = 24;
 206       match_info->n_workspace = 100;
 207       match_info->workspace = g_new (gint, match_info->n_workspace);
 208     }
 209   else
 210     {
 211       gint capture_count;
 212       pcre_fullinfo (regex->pcre_re, regex->extra,
 213                      PCRE_INFO_CAPTURECOUNT, &capture_count);
 214       match_info->n_offsets = (capture_count + 1) * 3;
 215     }
 216   match_info->offsets = g_new0 (gint, match_info->n_offsets);
 217
 218   return match_info;
 219 }
 220
 221 /**
 222  * g_match_info_get_regex:
 223  * @match_info: a #GMatchInfo
 224  *
 225  * Returns #GRegex object used in @match_info. It belongs to Glib
 226  * and must not be freed. Use g_regex_ref() if you need to keep it
 227  * after you free @match_info object.
 228  *
 229  * Returns: #GRegex object used in @match_info
 230  *
 231  * Since: 2.14
 232  */
 233 GRegex *
 234 g_match_info_get_regex (const GMatchInfo *match_info)
 235 {
 236   g_return_val_if_fail (match_info != NULL, NULL);
 237   return match_info->regex;
 238 }
 239
 240 /**
 241  * g_match_info_get_string:
 242  * @match_info: a #GMatchInfo
 243  *
 244  * Returns the string searched with @match_info. This is the
 245  * string passed to g_regex_match() or g_regex_replace() so
 246  * you may not free it before calling this function.
 247  *
 248  * Returns: the string searched with @match_info
 249  *
 250  * Since: 2.14
 251  */
 252 const gchar *
 253 g_match_info_get_string (const GMatchInfo *match_info)
 254 {
 255   g_return_val_if_fail (match_info != NULL, NULL);
 256   return match_info->string;
 257 }
 258
 259 /**
 260  * g_match_info_free:
 261  * @match_info: a #GMatchInfo
 262  *
 263  * Frees all the memory associated with the #GMatchInfo structure.
 264  *
 265  * Since: 2.14
 266  */
 267 void
 268 g_match_info_free (GMatchInfo *match_info)
 269 {
 270   if (match_info)
 271     {
 272       g_regex_unref (match_info->regex);
 273       g_free (match_info->offsets);
 274       g_free (match_info->workspace);
 275       g_free (match_info);
 276     }
 277 }
 278
 279 /**
 280  * g_match_info_next:
 281  * @match_info: a #GMatchInfo structure
 282  * @error: location to store the error occuring, or %NULL to ignore errors
 283  *
 284  * Scans for the next match using the same parameters of the previous
 285  * call to g_regex_match_full() or g_regex_match() that returned
 286  * @match_info.
 287  *
 288  * The match is done on the string passed to the match function, so you
 289  * cannot free it before calling this function.
 290  *
 291  * Returns: %TRUE is the string matched, %FALSE otherwise
 292  *
 293  * Since: 2.14
 294  */
 295 gboolean
 296 g_match_info_next (GMatchInfo  *match_info,
 297                    GError     **error)
 298 {
 299   gint opts;
 300
 301   g_return_val_if_fail (match_info != NULL, FALSE);
 302   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
 303   g_return_val_if_fail (match_info->pos >= 0, FALSE);
 304
 305   opts = match_info->regex->match_opts | match_info->match_opts;
 306
 307   match_info->matches = pcre_exec (match_info->regex->pcre_re,
 308                                    match_info->regex->extra,
 309                                    match_info->string,
 310                                    match_info->string_len,
 311                                    match_info->pos,
 312                                    match_info->regex->match_opts |
 313                                    match_info->match_opts,
 314                                    match_info->offsets,
 315                                    match_info->n_offsets);
 316   if (IS_PCRE_ERROR (match_info->matches))
 317     {
 318       g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
 319                    _("Error while matching regular expression %s: %s"),
 320                    match_info->regex->pattern, match_error (match_info->matches));
 321       return FALSE;
 322     }
 323
 324   /* avoid infinite loops if the pattern is an empty string or something
 325    * equivalent */
 326   if (match_info->pos == match_info->offsets[1])
 327     {
 328       if (match_info->pos > match_info->string_len)
 329         {
 330           /* we have reached the end of the string */
 331           match_info->pos = -1;
 332           match_info->matches = PCRE_ERROR_NOMATCH;
 333           return FALSE;
 334         }
 335
 336       match_info->pos = NEXT_CHAR (match_info->regex,
 337                                    &match_info->string[match_info->pos]) -
 338                                    match_info->string;
 339     }
 340   else
 341     {
 342       match_info->pos = match_info->offsets[1];
 343     }
 344
 345   return match_info->matches >= 0;
 346 }
 347
 348 /**
 349  * g_match_info_matches:
 350  * @match_info: a #GMatchInfo structure
 351  *
 352  * Returns whether the previous match operation succeeded.
 353  *
 354  * Returns: %TRUE if the previous match operation succeeded,
 355  *   %FALSE otherwise
 356  *
 357  * Since: 2.14
 358  */
 359 gboolean
 360 g_match_info_matches (const GMatchInfo *match_info)
 361 {
 362   g_return_val_if_fail (match_info != NULL, FALSE);
 363
 364   return match_info->matches >= 0;
 365 }
 366
 367 /**
 368  * g_match_info_get_match_count:
 369  * @match_info: a #GMatchInfo structure
 370  *
 371  * Retrieves the number of matched substrings (including substring 0,
 372  * that is the whole matched text), so 1 is returned if the pattern
 373  * has no substrings in it and 0 is returned if the match failed.
 374  *
 375  * If the last match was obtained using the DFA algorithm, that is
 376  * using g_regex_match_all() or g_regex_match_all_full(), the retrieved
 377  * count is not that of the number of capturing parentheses but that of
 378  * the number of matched substrings.
 379  *
 380  * Returns: Number of matched substrings, or -1 if an error occurred
 381  *
 382  * Since: 2.14
 383  */
 384 gint
 385 g_match_info_get_match_count (const GMatchInfo *match_info)
 386 {
 387   g_return_val_if_fail (match_info, -1);
 388
 389   if (match_info->matches == PCRE_ERROR_NOMATCH)
 390     /* no match */
 391     return 0;
 392   else if (match_info->matches < PCRE_ERROR_NOMATCH)
 393     /* error */
 394     return -1;
 395   else
 396     /* match */
 397     return match_info->matches;
 398 }
 399
 400 /**
 401  * g_match_info_is_partial_match:
 402  * @match_info: a #GMatchInfo structure
 403  *
 404  * Usually if the string passed to g_regex_match*() matches as far as
 405  * it goes, but is too short to match the entire pattern, %FALSE is
 406  * returned. There are circumstances where it might be helpful to
 407  * distinguish this case from other cases in which there is no match.
 408  *
 409  * Consider, for example, an application where a human is required to
 410  * type in data for a field with specific formatting requirements. An
 411  * example might be a date in the form ddmmmyy, defined by the pattern
 412  * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$".
 413  * If the application sees the user’s keystrokes one by one, and can
 414  * check that what has been typed so far is potentially valid, it is
 415  * able to raise an error as soon as a mistake is made.
 416  *
 417  * GRegex supports the concept of partial matching by means of the
 418  * #G_REGEX_MATCH_PARTIAL flag. When this is set the return code for
 419  * g_regex_match() or g_regex_match_full() is, as usual, %TRUE
 420  * for a complete match, %FALSE otherwise. But, when these functions
 421  * return %FALSE, you can check if the match was partial calling
 422  * g_match_info_is_partial_match().
 423  *
 424  * When using partial matching you cannot use g_match_info_fetch*().
 425  *
 426  * Because of the way certain internal optimizations are implemented
 427  * the partial matching algorithm cannot be used with all patterns.
 428  * So repeated single characters such as "a{2,4}" and repeated single
 429  * meta-sequences such as "\d+" are not permitted if the maximum number
 430  * of occurrences is greater than one. Optional items such as "\d?"
 431  * (where the maximum is one) are permitted. Quantifiers with any values
 432  * are permitted after parentheses, so the invalid examples above can be
 433  * coded thus "(a){2,4}" and "(\d)+". If #G_REGEX_MATCH_PARTIAL is set
 434  * for a pattern that does not conform to the restrictions, matching
 435  * functions return an error.
 436  *
 437  * Returns: %TRUE if the match was partial, %FALSE otherwise
 438  *
 439  * Since: 2.14
 440  */
 441 gboolean
 442 g_match_info_is_partial_match (const GMatchInfo *match_info)
 443 {
 444   g_return_val_if_fail (match_info != NULL, FALSE);
 445
 446   return match_info->matches == PCRE_ERROR_PARTIAL;
 447 }
 448
 449 /**
 450  * g_match_info_expand_references:
 451  * @match_info: a #GMatchInfo or %NULL
 452  * @string_to_expand: the string to expand
 453  * @error: location to store the error occuring, or %NULL to ignore errors
 454  *
 455  * Returns a new string containing the text in @string_to_expand with
 456  * references and escape sequences expanded. References refer to the last
 457  * match done with @string against @regex and have the same syntax used by
 458  * g_regex_replace().
 459  *
 460  * The @string_to_expand must be UTF-8 encoded even if #G_REGEX_RAW was
 461  * passed to g_regex_new().
 462  *
 463  * The backreferences are extracted from the string passed to the match
 464  * function, so you cannot call this function after freeing the string.
 465  *
 466  * @match_info may be %NULL in which case @string_to_expand must not
 467  * contain references. For instance "foo\n" does not refer to an actual
 468  * pattern and '\n' merely will be replaced with \n character,
 469  * while to expand "\0" (whole match) one needs the result of a match.
 470  * Use g_regex_check_replacement() to find out whether @string_to_expand
 471  * contains references.
 472  *
 473  * Returns: the expanded string, or %NULL if an error occurred
 474  *
 475  * Since: 2.14
 476  */
 477 gchar *
 478 g_match_info_expand_references (const GMatchInfo  *match_info,
 479                                 const gchar       *string_to_expand,
 480                                 GError           **error)
 481 {
 482   GString *result;
 483   GList *list;
 484   GError *tmp_error = NULL;
 485
 486   g_return_val_if_fail (string_to_expand != NULL, NULL);
 487   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
 488
 489   list = split_replacement (string_to_expand, &tmp_error);
 490   if (tmp_error != NULL)
 491     {
 492       g_propagate_error (error, tmp_error);
 493       return NULL;
 494     }
 495
 496   if (!match_info && interpolation_list_needs_match (list))
 497     {
 498       g_critical ("String '%s' contains references to the match, can't "
 499                   "expand references without GMatchInfo object",
 500                   string_to_expand);
 501       return NULL;
 502     }
 503
 504   result = g_string_sized_new (strlen (string_to_expand));
 505   interpolate_replacement (match_info, result, list);
 506
 507   g_list_foreach (list, (GFunc)free_interpolation_data, NULL);
 508   g_list_free (list);
 509
 510   return g_string_free (result, FALSE);
 511 }
 512
 513 /**
 514  * g_match_info_fetch:
 515  * @match_info: #GMatchInfo structure
 516  * @match_num: number of the sub expression
 517  *
 518  * Retrieves the text matching the @match_num<!-- -->'th capturing
 519  * parentheses. 0 is the full text of the match, 1 is the first paren
 520  * set, 2 the second, and so on.
 521  *
 522  * If @match_num is a valid sub pattern but it didn't match anything
 523  * (e.g. sub pattern 1, matching "b" against "(a)?b") then an empty
 524  * string is returned.
 525  *
 526  * If the match was obtained using the DFA algorithm, that is using
 527  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
 528  * string is not that of a set of parentheses but that of a matched
 529  * substring. Substrings are matched in reverse order of length, so
 530  * 0 is the longest match.
 531  *
 532  * The string is fetched from the string passed to the match function,
 533  * so you cannot call this function after freeing the string.
 534  *
 535  * Returns: The matched substring, or %NULL if an error occurred.
 536  *          You have to free the string yourself
 537  *
 538  * Since: 2.14
 539  */
 540 gchar *
 541 g_match_info_fetch (const GMatchInfo *match_info,
 542                     gint              match_num)
 543 {
 544   /* we cannot use pcre_get_substring() because it allocates the
 545    * string using pcre_malloc(). */
 546   gchar *match = NULL;
 547   gint start, end;
 548
 549   g_return_val_if_fail (match_info != NULL, NULL);
 550   g_return_val_if_fail (match_num >= 0, NULL);
 551
 552   /* match_num does not exist or it didn't matched, i.e. matching "b"
 553    * against "(a)?b" then group 0 is empty. */
 554   if (!g_match_info_fetch_pos (match_info, match_num, &start, &end))
 555     match = NULL;
 556   else if (start == -1)
 557     match = g_strdup ("");
 558   else
 559     match = g_strndup (&match_info->string[start], end - start);
 560
 561   return match;
 562 }
 563
 564 /**
 565  * g_match_info_fetch_pos:
 566  * @match_info: #GMatchInfo structure
 567  * @match_num: number of the sub expression
 568  * @start_pos: pointer to location where to store the start position
 569  * @end_pos: pointer to location where to store the end position
 570  *
 571  * Retrieves the position of the @match_num<!-- -->'th capturing
 572  * parentheses. 0 is the full text of the match, 1 is the first
 573  * paren set, 2 the second, and so on.
 574  *
 575  * If @match_num is a valid sub pattern but it didn't match anything
 576  * (e.g. sub pattern 1, matching "b" against "(a)?b") then @start_pos
 577  * and @end_pos are set to -1 and %TRUE is returned.
 578  *
 579  * If the match was obtained using the DFA algorithm, that is using
 580  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
 581  * position is not that of a set of parentheses but that of a matched
 582  * substring. Substrings are matched in reverse order of length, so
 583  * 0 is the longest match.
 584  *
 585  * Returns: %TRUE if the position was fetched, %FALSE otherwise. If
 586  *   the position cannot be fetched, @start_pos and @end_pos are left
 587  *   unchanged
 588  *
 589  * Since: 2.14
 590  */
 591 gboolean
 592 g_match_info_fetch_pos (const GMatchInfo *match_info,
 593                         gint              match_num,
 594                         gint             *start_pos,
 595                         gint             *end_pos)
 596 {
 597   g_return_val_if_fail (match_info != NULL, FALSE);
 598   g_return_val_if_fail (match_num >= 0, FALSE);
 599
 600   /* make sure the sub expression number they're requesting is less than
 601    * the total number of sub expressions that were matched. */
 602   if (match_num >= match_info->matches)
 603     return FALSE;
 604
 605   if (start_pos != NULL)
 606     *start_pos = match_info->offsets[2 * match_num];
 607
 608   if (end_pos != NULL)
 609     *end_pos = match_info->offsets[2 * match_num + 1];
 610
 611   return TRUE;
 612 }
 613
 614 /*
 615  * Returns number of first matched subpattern with name @name.
 616  * There may be more than one in case when DUPNAMES is used,
 617  * and not all subpatterns with that name match;
 618  * pcre_get_stringnumber() does not work in that case.
 619  */
 620 static gint
 621 get_matched_substring_number (const GMatchInfo *match_info,
 622                               const gchar      *name)
 623 {
 624   gint entrysize;
 625   gchar *first, *last;
 626   guchar *entry;
 627
 628   /*
 629    * FIXME: (?J) may be used inside the pattern as the equivalent of
 630    * DUPNAMES compile option. In this case we can't know about it,
 631    * and pcre doesn't tell us about it either, it uses private flag
 632    * PCRE_JCHANGED for this. So we have to always search string
 633    * table, unlike pcre which uses pcre_get_stringnumber() shortcut
 634    * when possible. It shouldn't be actually bad since
 635    * pcre_get_stringtable_entries() uses binary search; still would
 636    * be better to fix it, to be not worse than pcre.
 637    */
 638 #if 0
 639   if ((match_info->regex->compile_opts & G_REGEX_DUPNAMES) == 0)
 640     return pcre_get_stringnumber (match_info->regex->pcre_re, name);
 641 #endif
 642
 643   /* This code is copied from pcre_get.c: get_first_set() */
 644   entrysize = pcre_get_stringtable_entries (match_info->regex->pcre_re,
 645                                             name,
 646                                             &first,
 647                                             &last);
 648
 649   if (entrysize <= 0)
 650     return entrysize;
 651
 652   for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize)
 653     {
 654       gint n = (entry[0] << 8) + entry[1];
 655       if (match_info->offsets[n*2] >= 0)
 656         return n;
 657     }
 658
 659   return (first[0] << 8) + first[1];
 660 }
 661
 662 /**
 663  * g_match_info_fetch_named:
 664  * @match_info: #GMatchInfo structure
 665  * @name: name of the subexpression
 666  *
 667  * Retrieves the text matching the capturing parentheses named @name.
 668  *
 669  * If @name is a valid sub pattern name but it didn't match anything
 670  * (e.g. sub pattern "X", matching "b" against "(?P&lt;X&gt;a)?b")
 671  * then an empty string is returned.
 672  *
 673  * The string is fetched from the string passed to the match function,
 674  * so you cannot call this function after freeing the string.
 675  *
 676  * Returns: The matched substring, or %NULL if an error occurred.
 677  *          You have to free the string yourself
 678  *
 679  * Since: 2.14
 680  */
 681 gchar *
 682 g_match_info_fetch_named (const GMatchInfo *match_info,
 683                           const gchar      *name)
 684 {
 685   /* we cannot use pcre_get_named_substring() because it allocates the
 686    * string using pcre_malloc(). */
 687   gint num;
 688
 689   g_return_val_if_fail (match_info != NULL, NULL);
 690   g_return_val_if_fail (name != NULL, NULL);
 691
 692   num = get_matched_substring_number (match_info, name);
 693   if (num < 0)
 694     return NULL;
 695   else
 696     return g_match_info_fetch (match_info, num);
 697 }
 698
 699 /**
 700  * g_match_info_fetch_named_pos:
 701  * @match_info: #GMatchInfo structure
 702  * @name: name of the subexpression
 703  * @start_pos: pointer to location where to store the start position
 704  * @end_pos: pointer to location where to store the end position
 705  *
 706  * Retrieves the position of the capturing parentheses named @name.
 707  *
 708  * If @name is a valid sub pattern name but it didn't match anything
 709  * (e.g. sub pattern "X", matching "b" against "(?P&lt;X&gt;a)?b")
 710  * then @start_pos and @end_pos are set to -1 and %TRUE is returned.
 711  *
 712  * Returns: %TRUE if the position was fetched, %FALSE otherwise. If
 713  *   the position cannot be fetched, @start_pos and @end_pos are left
 714  *   unchanged
 715  *
 716  * Since: 2.14
 717  */
 718 gboolean
 719 g_match_info_fetch_named_pos (const GMatchInfo *match_info,
 720                               const gchar      *name,
 721                               gint             *start_pos,
 722                               gint             *end_pos)
 723 {
 724   gint num;
 725
 726   g_return_val_if_fail (match_info != NULL, FALSE);
 727   g_return_val_if_fail (name != NULL, FALSE);
 728
 729   num = get_matched_substring_number (match_info, name);
 730   if (num < 0)
 731     return FALSE;
 732
 733   return g_match_info_fetch_pos (match_info, num, start_pos, end_pos);
 734 }
 735
 736 /**
 737  * g_match_info_fetch_all:
 738  * @match_info: a #GMatchInfo structure
 739  *
 740  * Bundles up pointers to each of the matching substrings from a match
 741  * and stores them in an array of gchar pointers. The first element in
 742  * the returned array is the match number 0, i.e. the entire matched
 743  * text.
 744  *
 745  * If a sub pattern didn't match anything (e.g. sub pattern 1, matching
 746  * "b" against "(a)?b") then an empty string is inserted.
 747  *
 748  * If the last match was obtained using the DFA algorithm, that is using
 749  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
 750  * strings are not that matched by sets of parentheses but that of the
 751  * matched substring. Substrings are matched in reverse order of length,
 752  * so the first one is the longest match.
 753  *
 754  * The strings are fetched from the string passed to the match function,
 755  * so you cannot call this function after freeing the string.
 756  *
 757  * Returns: a %NULL-terminated array of gchar * pointers. It must be
 758  *   freed using g_strfreev(). If the previous match failed %NULL is
 759  *   returned
 760  *
 761  * Since: 2.14
 762  */
 763 gchar **
 764 g_match_info_fetch_all (const GMatchInfo *match_info)
 765 {
 766   /* we cannot use pcre_get_substring_list() because the returned value
 767    * isn't suitable for g_strfreev(). */
 768   gchar **result;
 769   gint i;
 770
 771   g_return_val_if_fail (match_info != NULL, FALSE);
 772
 773   if (match_info->matches < 0)
 774     return NULL;
 775
 776   result = g_new (gchar *, match_info->matches + 1);
 777   for (i = 0; i < match_info->matches; i++)
 778     result[i] = g_match_info_fetch (match_info, i);
 779   result[i] = NULL;
 780
 781   return result;
 782 }
 783
 784
 785 /* GRegex */
 786
 787 GQuark
 788 g_regex_error_quark (void)
 789 {
 790   static GQuark error_quark = 0;
 791
 792   if (error_quark == 0)
 793     error_quark = g_quark_from_static_string ("g-regex-error-quark");
 794
 795   return error_quark;
 796 }
 797
 798 /**
 799  * g_regex_ref:
 800  * @regex: a #GRegex
 801  *
 802  * Increases reference count of @regex by 1.
 803  *
 804  * Returns: @regex
 805  *
 806  * Since: 2.14
 807  */
 808 GRegex *
 809 g_regex_ref (GRegex *regex)
 810 {
 811   g_return_val_if_fail (regex != NULL, NULL);
 812   g_atomic_int_inc ((gint*) &regex->ref_count);
 813   return regex;
 814 }
 815
 816 /**
 817  * g_regex_unref:
 818  * @regex: a #GRegex
 819  *
 820  * Decreases reference count of @regex by 1. When reference count drops
 821  * to zero, it frees all the memory associated with the regex structure.
 822  *
 823  * Since: 2.14
 824  */
 825 void
 826 g_regex_unref (GRegex *regex)
 827 {
 828   g_return_if_fail (regex != NULL);
 829
 830   if (g_atomic_int_exchange_and_add ((gint *) &regex->ref_count, -1) - 1 == 0)
 831     {
 832       g_free (regex->pattern);
 833       if (regex->pcre_re != NULL)
 834         pcre_free (regex->pcre_re);
 835       if (regex->extra != NULL)
 836         pcre_free (regex->extra);
 837       g_free (regex);
 838     }
 839 }
 840
 841 /**
 842  * g_regex_new:
 843  * @pattern: the regular expression
 844  * @compile_options: compile options for the regular expression
 845  * @match_options: match options for the regular expression
 846  * @error: return location for a #GError
 847  *
 848  * Compiles the regular expression to an internal form, and does
 849  * the initial setup of the #GRegex structure.
 850  *
 851  * Returns: a #GRegex structure. Call g_regex_unref() when you
 852  *   are done with it
 853  *
 854  * Since: 2.14
 855  */
 856 GRegex *
 857 g_regex_new (const gchar         *pattern,
 858              GRegexCompileFlags   compile_options,
 859              GRegexMatchFlags     match_options,
 860              GError             **error)
 861 {
 862   GRegex *regex;
 863   pcre *re;
 864   const gchar *errmsg;
 865   gint erroffset;
 866   gboolean optimize = FALSE;
 867   static gboolean initialized = FALSE;
 868
 869   g_return_val_if_fail (pattern != NULL, NULL);
 870   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
 871   g_return_val_if_fail ((compile_options & ~G_REGEX_COMPILE_MASK) == 0, NULL);
 872   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
 873
 874   if (!initialized)
 875     {
 876       gint support;
 877       const gchar *msg;
 878
 879       pcre_config (PCRE_CONFIG_UTF8, &support);
 880       if (!support)
 881         {
 882           msg = N_("PCRE library is compiled without UTF8 support");
 883           g_critical (msg);
 884           g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg));
 885           return NULL;
 886         }
 887
 888       pcre_config (PCRE_CONFIG_UNICODE_PROPERTIES, &support);
 889       if (!support)
 890         {
 891           msg = N_("PCRE library is compiled without UTF8 properties support");
 892           g_critical (msg);
 893           g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg));
 894           return NULL;
 895         }
 896
 897       initialized = TRUE;
 898     }
 899
 900   /* G_REGEX_OPTIMIZE has the same numeric value of PCRE_NO_UTF8_CHECK,
 901    * as we do not need to wrap PCRE_NO_UTF8_CHECK. */
 902   if (compile_options & G_REGEX_OPTIMIZE)
 903     optimize = TRUE;
 904
 905   /* In GRegex the string are, by default, UTF-8 encoded. PCRE
 906    * instead uses UTF-8 only if required with PCRE_UTF8. */
 907   if (compile_options & G_REGEX_RAW)
 908     {
 909       /* disable utf-8 */
 910       compile_options &= ~G_REGEX_RAW;
 911     }
 912   else
 913     {
 914       /* enable utf-8 */
 915       compile_options |= PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
 916       match_options |= PCRE_NO_UTF8_CHECK;
 917     }
 918
 919   /* PCRE_NEWLINE_ANY is the default for the internal PCRE but
 920    * not for the system one. */
 921   if (!(compile_options & G_REGEX_NEWLINE_CR) &&
 922       !(compile_options & G_REGEX_NEWLINE_LF))
 923     {
 924       compile_options |= PCRE_NEWLINE_ANY;
 925     }
 926
 927   /* compile the pattern */
 928   re = pcre_compile (pattern, compile_options, &errmsg, &erroffset, NULL);
 929
 930   /* if the compilation failed, set the error member and return
 931    * immediately */
 932   if (re == NULL)
 933     {
 934       GError *tmp_error = g_error_new (G_REGEX_ERROR,
 935                                        G_REGEX_ERROR_COMPILE,
 936                                        _("Error while compiling regular "
 937                                          "expression %s at char %d: %s"),
 938                                        pattern, erroffset, errmsg);
 939       g_propagate_error (error, tmp_error);
 940
 941       return NULL;
 942     }
 943
 944   regex = g_new0 (GRegex, 1);
 945   regex->ref_count = 1;
 946   regex->pattern = g_strdup (pattern);
 947   regex->pcre_re = re;
 948   regex->compile_opts = compile_options;
 949   regex->match_opts = match_options;
 950
 951   if (optimize)
 952     {
 953       regex->extra = pcre_study (regex->pcre_re, 0, &errmsg);
 954       if (errmsg != NULL)
 955         {
 956           GError *tmp_error = g_error_new (G_REGEX_ERROR,
 957                                            G_REGEX_ERROR_OPTIMIZE,
 958                                            _("Error while optimizing "
 959                                              "regular expression %s: %s"),
 960                                            regex->pattern,
 961                                            errmsg);
 962           g_propagate_error (error, tmp_error);
 963           return NULL;
 964         }
 965     }
 966
 967   return regex;
 968 }
 969
 970 /**
 971  * g_regex_get_pattern:
 972  * @regex: a #GRegex structure
 973  *
 974  * Gets the pattern string associated with @regex, i.e. a copy of
 975  * the string passed to g_regex_new().
 976  *
 977  * Returns: the pattern of @regex
 978  *
 979  * Since: 2.14
 980  */
 981 const gchar *
 982 g_regex_get_pattern (const GRegex *regex)
 983 {
 984   g_return_val_if_fail (regex != NULL, NULL);
 985
 986   return regex->pattern;
 987 }
 988
 989 /**
 990  * g_regex_get_max_backref:
 991  * @regex: a #GRegex
 992  *
 993  * Returns the number of the highest back reference
 994  * in the pattern, or 0 if the pattern does not contain
 995  * back references.
 996  *
 997  * Returns: the number of the highest back reference
 998  *
 999  * Since: 2.14
1000  */
1001 gint
1002 g_regex_get_max_backref (const GRegex *regex)
1003 {
1004   gint value;
1005
1006   pcre_fullinfo (regex->pcre_re, regex->extra,
1007                  PCRE_INFO_BACKREFMAX, &value);
1008
1009   return value;
1010 }
1011
1012 /**
1013  * g_regex_get_capture_count:
1014  * @regex: a #GRegex
1015  *
1016  * Returns the number of capturing subpatterns in the pattern.
1017  *
1018  * Returns: the number of capturing subpatterns
1019  *
1020  * Since: 2.14
1021  */
1022 gint
1023 g_regex_get_capture_count (const GRegex *regex)
1024 {
1025   gint value;
1026
1027   pcre_fullinfo (regex->pcre_re, regex->extra,
1028                  PCRE_INFO_CAPTURECOUNT, &value);
1029
1030   return value;
1031 }
1032
1033 /**
1034  * g_regex_match_simple:
1035  * @pattern: the regular expression
1036  * @string: the string to scan for matches
1037  * @compile_options: compile options for the regular expression
1038  * @match_options: match options
1039  *
1040  * Scans for a match in @string for @pattern.
1041  *
1042  * This function is equivalent to g_regex_match() but it does not
1043  * require to compile the pattern with g_regex_new(), avoiding some
1044  * lines of code when you need just to do a match without extracting
1045  * substrings, capture counts, and so on.
1046  *
1047  * If this function is to be called on the same @pattern more than
1048  * once, it's more efficient to compile the pattern once with
1049  * g_regex_new() and then use g_regex_match().
1050  *
1051  * Returns: %TRUE is the string matched, %FALSE otherwise
1052  *
1053  * Since: 2.14
1054  */
1055 gboolean
1056 g_regex_match_simple (const gchar        *pattern,
1057                       const gchar        *string,
1058                       GRegexCompileFlags  compile_options,
1059                       GRegexMatchFlags    match_options)
1060 {
1061   GRegex *regex;
1062   gboolean result;
1063
1064   regex = g_regex_new (pattern, compile_options, 0, NULL);
1065   if (!regex)
1066     return FALSE;
1067   result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL);
1068   g_regex_unref (regex);
1069   return result;
1070 }
1071
1072 /**
1073  * g_regex_match:
1074  * @regex: a #GRegex structure from g_regex_new()
1075  * @string: the string to scan for matches
1076  * @match_options: match options
1077  * @match_info: pointer to location where to store the #GMatchInfo,
1078  *   or %NULL if you do not need it
1079  *
1080  * Scans for a match in string for the pattern in @regex.
1081  * The @match_options are combined with the match options specified
1082  * when the @regex structure was created, letting you have more
1083  * flexibility in reusing #GRegex structures.
1084  *
1085  * A #GMatchInfo structure, used to get information on the match,
1086  * is stored in @match_info if not %NULL. Note that if @match_info
1087  * is not %NULL then it is created even if the function returns %FALSE,
1088  * i.e. you must free it regardless if regular expression actually matched.
1089  *
1090  * To retrieve all the non-overlapping matches of the pattern in
1091  * string you can use g_match_info_next().
1092  *
1093  * <informalexample><programlisting>
1094  * static void
1095  * print_uppercase_words (const gchar *string)
1096  * {
1097  *   /&ast; Print all uppercase-only words. &ast;/
1098  *   GRegex *regex;
1099  *   GMatchInfo *match_info;
1100  *   &nbsp;
1101  *   regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
1102  *   g_regex_match (regex, string, 0, &amp;match_info);
1103  *   while (g_match_info_matches (match_info))
1104  *     {
1105  *       gchar *word = g_match_info_fetch (match_info, 0);
1106  *       g_print ("Found: %s\n", word);
1107  *       g_free (word);
1108  *       g_match_info_next (match_info, NULL);
1109  *     }
1110  *   g_match_info_free (match_info);
1111  *   g_regex_unref (regex);
1112  * }
1113  * </programlisting></informalexample>
1114  *
1115  * Returns: %TRUE if the string matched, %FALSE otherwise
1116  *
1117  * Since: 2.14
1118  */
1119 gboolean
1120 g_regex_match (const GRegex      *regex,
1121                const gchar       *string,
1122                GRegexMatchFlags   match_options,
1123                GMatchInfo       **match_info)
1124 {
1125   return g_regex_match_full (regex, string, -1, 0, match_options,
1126                              match_info, NULL);
1127 }
1128
1129 /**
1130  * g_regex_match_full:
1131  * @regex: a #GRegex structure from g_regex_new()
1132  * @string: the string to scan for matches
1133  * @string_len: the length of @string, or -1 if @string is nul-terminated
1134  * @start_position: starting index of the string to match
1135  * @match_options: match options
1136  * @match_info: pointer to location where to store the #GMatchInfo,
1137  *   or %NULL if you do not need it
1138  * @error: location to store the error occurring, or %NULL to ignore errors
1139  *
1140  * Scans for a match in string for the pattern in @regex.
1141  * The @match_options are combined with the match options specified
1142  * when the @regex structure was created, letting you have more
1143  * flexibility in reusing #GRegex structures.
1144  *
1145  * Setting @start_position differs from just passing over a shortened
1146  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
1147  * that begins with any kind of lookbehind assertion, such as "\b".
1148  *
1149  * A #GMatchInfo structure, used to get information on the match, is
1150  * stored in @match_info if not %NULL. Note that if @match_info is
1151  * not %NULL then it is created even if the function returns %FALSE,
1152  * i.e. you must free it regardless if regular expression actually
1153  * matched.
1154  *
1155  * @string is not copied and is used in #GMatchInfo internally. If
1156  * you use any #GMatchInfo method (except g_match_info_free()) after
1157  * freeing or modifying @string then the behaviour is undefined.
1158  *
1159  * To retrieve all the non-overlapping matches of the pattern in
1160  * string you can use g_match_info_next().
1161  *
1162  * <informalexample><programlisting>
1163  * static void
1164  * print_uppercase_words (const gchar *string)
1165  * {
1166  *   /&ast; Print all uppercase-only words. &ast;/
1167  *   GRegex *regex;
1168  *   GMatchInfo *match_info;
1169  *   GError *error = NULL;
1170  *   &nbsp;
1171  *   regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
1172  *   g_regex_match_full (regex, string, -1, 0, 0, &amp;match_info, &amp;error);
1173  *   while (g_match_info_matches (match_info))
1174  *     {
1175  *       gchar *word = g_match_info_fetch (match_info, 0);
1176  *       g_print ("Found: %s\n", word);
1177  *       g_free (word);
1178  *       g_match_info_next (match_info, &amp;error);
1179  *     }
1180  *   g_match_info_free (match_info);
1181  *   g_regex_unref (regex);
1182  *   if (error != NULL)
1183  *     {
1184  *       g_printerr ("Error while matching: %s\n", error->message);
1185  *       g_error_free (error);
1186  *     }
1187  * }
1188  * </programlisting></informalexample>
1189  *
1190  * Returns: %TRUE if the string matched, %FALSE otherwise
1191  *
1192  * Since: 2.14
1193  */
1194 gboolean
1195 g_regex_match_full (const GRegex      *regex,
1196                     const gchar       *string,
1197                     gssize             string_len,
1198                     gint               start_position,
1199                     GRegexMatchFlags   match_options,
1200                     GMatchInfo       **match_info,
1201                     GError           **error)
1202 {
1203   GMatchInfo *info;
1204   gboolean match_ok;
1205
1206   g_return_val_if_fail (regex != NULL, FALSE);
1207   g_return_val_if_fail (string != NULL, FALSE);
1208   g_return_val_if_fail (start_position >= 0, FALSE);
1209   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
1210   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
1211
1212   info = match_info_new (regex, string, string_len, start_position,
1213                          match_options, FALSE);
1214   match_ok = g_match_info_next (info, error);
1215   if (match_info != NULL)
1216     *match_info = info;
1217   else
1218     g_match_info_free (info);
1219
1220   return match_ok;
1221 }
1222
1223 /**
1224  * g_regex_match_all:
1225  * @regex: a #GRegex structure from g_regex_new()
1226  * @string: the string to scan for matches
1227  * @match_options: match options
1228  * @match_info: pointer to location where to store the #GMatchInfo,
1229  *   or %NULL if you do not need it
1230  *
1231  * Using the standard algorithm for regular expression matching only
1232  * the longest match in the string is retrieved. This function uses
1233  * a different algorithm so it can retrieve all the possible matches.
1234  * For more documentation see g_regex_match_all_full().
1235  *
1236  * A #GMatchInfo structure, used to get information on the match, is
1237  * stored in @match_info if not %NULL. Note that if @match_info is
1238  * not %NULL then it is created even if the function returns %FALSE,
1239  * i.e. you must free it regardless if regular expression actually
1240  * matched.
1241  *
1242  * Returns: %TRUE if the string matched, %FALSE otherwise
1243  *
1244  * Since: 2.14
1245  */
1246 gboolean
1247 g_regex_match_all (const GRegex      *regex,
1248                    const gchar       *string,
1249                    GRegexMatchFlags   match_options,
1250                    GMatchInfo       **match_info)
1251 {
1252   return g_regex_match_all_full (regex, string, -1, 0, match_options,
1253                                  match_info, NULL);
1254 }
1255
1256 /**
1257  * g_regex_match_all_full:
1258  * @regex: a #GRegex structure from g_regex_new()
1259  * @string: the string to scan for matches
1260  * @string_len: the length of @string, or -1 if @string is nul-terminated
1261  * @start_position: starting index of the string to match
1262  * @match_options: match options
1263  * @match_info: pointer to location where to store the #GMatchInfo,
1264  *   or %NULL if you do not need it
1265  * @error: location to store the error occurring, or %NULL to ignore errors
1266  *
1267  * Using the standard algorithm for regular expression matching only
1268  * the longest match in the string is retrieved, it is not possible
1269  * to obtain all the available matches. For instance matching
1270  * "&lt;a&gt; &lt;b&gt; &lt;c&gt;" against the pattern "&lt;.*&gt;"
1271  * you get "&lt;a&gt; &lt;b&gt; &lt;c&gt;".
1272  *
1273  * This function uses a different algorithm (called DFA, i.e. deterministic
1274  * finite automaton), so it can retrieve all the possible matches, all
1275  * starting at the same point in the string. For instance matching
1276  * "&lt;a&gt; &lt;b&gt; &lt;c&gt;" against the pattern "&lt;.*&gt;"
1277  * you would obtain three matches: "&lt;a&gt; &lt;b&gt; &lt;c&gt;",
1278  * "&lt;a&gt; &lt;b&gt;" and "&lt;a&gt;".
1279  *
1280  * The number of matched strings is retrieved using
1281  * g_match_info_get_match_count(). To obtain the matched strings and
1282  * their position you can use, respectively, g_match_info_fetch() and
1283  * g_match_info_fetch_pos(). Note that the strings are returned in
1284  * reverse order of length; that is, the longest matching string is
1285  * given first.
1286  *
1287  * Note that the DFA algorithm is slower than the standard one and it
1288  * is not able to capture substrings, so backreferences do not work.
1289  *
1290  * Setting @start_position differs from just passing over a shortened
1291  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
1292  * that begins with any kind of lookbehind assertion, such as "\b".
1293  *
1294  * A #GMatchInfo structure, used to get information on the match, is
1295  * stored in @match_info if not %NULL. Note that if @match_info is
1296  * not %NULL then it is created even if the function returns %FALSE,
1297  * i.e. you must free it regardless if regular expression actually
1298  * matched.
1299  *
1300  * Returns: %TRUE if the string matched, %FALSE otherwise
1301  *
1302  * Since: 2.14
1303  */
1304 gboolean
1305 g_regex_match_all_full (const GRegex      *regex,
1306                         const gchar       *string,
1307                         gssize             string_len,
1308                         gint               start_position,
1309                         GRegexMatchFlags   match_options,
1310                         GMatchInfo       **match_info,
1311                         GError           **error)
1312 {
1313   GMatchInfo *info;
1314   gboolean done;
1315
1316   g_return_val_if_fail (regex != NULL, FALSE);
1317   g_return_val_if_fail (string != NULL, FALSE);
1318   g_return_val_if_fail (start_position >= 0, FALSE);
1319   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
1320   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
1321
1322   info = match_info_new (regex, string, string_len, start_position,
1323                          match_options, TRUE);
1324
1325   done = FALSE;
1326   while (!done)
1327     {
1328       done = TRUE;
1329       info->matches = pcre_dfa_exec (regex->pcre_re, regex->extra,
1330                                      info->string, info->string_len,
1331                                      info->pos,
1332                                      regex->match_opts | match_options,
1333                                      info->offsets, info->n_offsets,
1334                                      info->workspace, info->n_workspace);
1335       if (info->matches == PCRE_ERROR_DFA_WSSIZE)
1336         {
1337           /* info->workspace is too small. */
1338           info->n_workspace *= 2;
1339           info->workspace = g_realloc (info->workspace,
1340                                        info->n_workspace * sizeof (gint));
1341           done = FALSE;
1342         }
1343       else if (info->matches == 0)
1344         {
1345           /* info->offsets is too small. */
1346           info->n_offsets *= 2;
1347           info->offsets = g_realloc (info->offsets,
1348                                      info->n_offsets * sizeof (gint));
1349           done = FALSE;
1350         }
1351       else if (IS_PCRE_ERROR (info->matches))
1352         {
1353           g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
1354                        _("Error while matching regular expression %s: %s"),
1355                        regex->pattern, match_error (info->matches));
1356         }
1357     }
1358
1359   /* set info->pos to -1 so that a call to g_match_info_next() fails. */
1360   info->pos = -1;
1361
1362   if (match_info != NULL)
1363     *match_info = info;
1364   else
1365     g_match_info_free (info);
1366
1367   return info->matches >= 0;
1368 }
1369
1370 /**
1371  * g_regex_get_string_number:
1372  * @regex: #GRegex structure
1373  * @name: name of the subexpression
1374  *
1375  * Retrieves the number of the subexpression named @name.
1376  *
1377  * Returns: The number of the subexpression or -1 if @name
1378  *   does not exist
1379  *
1380  * Since: 2.14
1381  */
1382 gint
1383 g_regex_get_string_number (const GRegex *regex,
1384                            const gchar  *name)
1385 {
1386   gint num;
1387
1388   g_return_val_if_fail (regex != NULL, -1);
1389   g_return_val_if_fail (name != NULL, -1);
1390
1391   num = pcre_get_stringnumber (regex->pcre_re, name);
1392   if (num == PCRE_ERROR_NOSUBSTRING)
1393     num = -1;
1394
1395   return num;
1396 }
1397
1398 /**
1399  * g_regex_split_simple:
1400  * @pattern: the regular expression
1401  * @string: the string to scan for matches
1402  * @compile_options: compile options for the regular expression
1403  * @match_options: match options
1404  *
1405  * Breaks the string on the pattern, and returns an array of
1406  * the tokens. If the pattern contains capturing parentheses,
1407  * then the text for each of the substrings will also be returned.
1408  * If the pattern does not match anywhere in the string, then the
1409  * whole string is returned as the first token.
1410  *
1411  * This function is equivalent to g_regex_split() but it does
1412  * not require to compile the pattern with g_regex_new(), avoiding
1413  * some lines of code when you need just to do a split without
1414  * extracting substrings, capture counts, and so on.
1415  *
1416  * If this function is to be called on the same @pattern more than
1417  * once, it's more efficient to compile the pattern once with
1418  * g_regex_new() and then use g_regex_split().
1419  *
1420  * As a special case, the result of splitting the empty string ""
1421  * is an empty vector, not a vector containing a single string.
1422  * The reason for this special case is that being able to represent
1423  * an empty vector is typically more useful than consistent handling
1424  * of empty elements. If you do need to represent empty elements,
1425  * you'll need to check for the empty string before calling this
1426  * function.
1427  *
1428  * A pattern that can match empty strings splits @string into
1429  * separate characters wherever it matches the empty string between
1430  * characters. For example splitting "ab c" using as a separator
1431  * "\s*", you will get "a", "b" and "c".
1432  *
1433  * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
1434  *
1435  * Since: 2.14
1436  **/
1437 gchar **
1438 g_regex_split_simple (const gchar        *pattern,
1439                       const gchar        *string,
1440                       GRegexCompileFlags  compile_options,
1441                       GRegexMatchFlags    match_options)
1442 {
1443   GRegex *regex;
1444   gchar **result;
1445
1446   regex = g_regex_new (pattern, compile_options, 0, NULL);
1447   if (!regex)
1448     return NULL;
1449   result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL);
1450   g_regex_unref (regex);
1451   return result;
1452 }
1453
1454 /**
1455  * g_regex_split:
1456  * @regex: a #GRegex structure
1457  * @string: the string to split with the pattern
1458  * @match_options: match time option flags
1459  *
1460  * Breaks the string on the pattern, and returns an array of the tokens.
1461  * If the pattern contains capturing parentheses, then the text for each
1462  * of the substrings will also be returned. If the pattern does not match
1463  * anywhere in the string, then the whole string is returned as the first
1464  * token.
1465  *
1466  * As a special case, the result of splitting the empty string "" is an
1467  * empty vector, not a vector containing a single string. The reason for
1468  * this special case is that being able to represent an empty vector is
1469  * typically more useful than consistent handling of empty elements. If
1470  * you do need to represent empty elements, you'll need to check for the
1471  * empty string before calling this function.
1472  *
1473  * A pattern that can match empty strings splits @string into separate
1474  * characters wherever it matches the empty string between characters.
1475  * For example splitting "ab c" using as a separator "\s*", you will get
1476  * "a", "b" and "c".
1477  *
1478  * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
1479  *
1480  * Since: 2.14
1481  **/
1482 gchar **
1483 g_regex_split (const GRegex     *regex,
1484                const gchar      *string,
1485                GRegexMatchFlags  match_options)
1486 {
1487   return g_regex_split_full (regex, string, -1, 0,
1488                              match_options, 0, NULL);
1489 }
1490
1491 /**
1492  * g_regex_split_full:
1493  * @regex: a #GRegex structure
1494  * @string: the string to split with the pattern
1495  * @string_len: the length of @string, or -1 if @string is nul-terminated
1496  * @start_position: starting index of the string to match
1497  * @match_options: match time option flags
1498  * @max_tokens: the maximum number of tokens to split @string into.
1499  *   If this is less than 1, the string is split completely
1500  * @error: return location for a #GError
1501  *
1502  * Breaks the string on the pattern, and returns an array of the tokens.
1503  * If the pattern contains capturing parentheses, then the text for each
1504  * of the substrings will also be returned. If the pattern does not match
1505  * anywhere in the string, then the whole string is returned as the first
1506  * token.
1507  *
1508  * As a special case, the result of splitting the empty string "" is an
1509  * empty vector, not a vector containing a single string. The reason for
1510  * this special case is that being able to represent an empty vector is
1511  * typically more useful than consistent handling of empty elements. If
1512  * you do need to represent empty elements, you'll need to check for the
1513  * empty string before calling this function.
1514  *
1515  * A pattern that can match empty strings splits @string into separate
1516  * characters wherever it matches the empty string between characters.
1517  * For example splitting "ab c" using as a separator "\s*", you will get
1518  * "a", "b" and "c".
1519  *
1520  * Setting @start_position differs from just passing over a shortened
1521  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
1522  * that begins with any kind of lookbehind assertion, such as "\b".
1523  *
1524  * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev()
1525  *
1526  * Since: 2.14
1527  **/
1528 gchar **
1529 g_regex_split_full (const GRegex      *regex,
1530                     const gchar       *string,
1531                     gssize             string_len,
1532                     gint               start_position,
1533                     GRegexMatchFlags   match_options,
1534                     gint               max_tokens,
1535                     GError           **error)
1536 {
1537   GError *tmp_error = NULL;
1538   GMatchInfo *match_info;
1539   GList *list, *last;
1540   gint i;
1541   gint token_count;
1542   gboolean match_ok;
1543   /* position of the last separator. */
1544   gint last_separator_end;
1545   /* was the last match 0 bytes long? */
1546   gboolean last_match_is_empty;
1547   /* the returned array of char **s */
1548   gchar **string_list;
1549
1550   g_return_val_if_fail (regex != NULL, NULL);
1551   g_return_val_if_fail (string != NULL, NULL);
1552   g_return_val_if_fail (start_position >= 0, NULL);
1553   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
1554   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
1555
1556   if (max_tokens <= 0)
1557     max_tokens = G_MAXINT;
1558
1559   if (string_len < 0)
1560     string_len = strlen (string);
1561
1562   /* zero-length string */
1563   if (string_len - start_position == 0)
1564     return g_new0 (gchar *, 1);
1565
1566   if (max_tokens == 1)
1567     {
1568       string_list = g_new0 (gchar *, 2);
1569       string_list[0] = g_strndup (&string[start_position],
1570                                   string_len - start_position);
1571       return string_list;
1572     }
1573
1574   list = NULL;
1575   token_count = 0;
1576   last_separator_end = start_position;
1577   last_match_is_empty = FALSE;
1578
1579   match_ok = g_regex_match_full (regex, string, string_len, start_position,
1580                                  match_options, &match_info, &tmp_error);
1581   while (tmp_error == NULL)
1582     {
1583       if (match_ok)
1584         {
1585           last_match_is_empty =
1586                     (match_info->offsets[0] == match_info->offsets[1]);
1587
1588           /* we need to skip empty separators at the same position of the end
1589            * of another separator. e.g. the string is "a b" and the separator
1590            * is " *", so from 1 to 2 we have a match and at position 2 we have
1591            * an empty match. */
1592           if (last_separator_end != match_info->offsets[1])
1593             {
1594               gchar *token;
1595               gint match_count;
1596
1597               token = g_strndup (string + last_separator_end,
1598                                  match_info->offsets[0] - last_separator_end);
1599               list = g_list_prepend (list, token);
1600               token_count++;
1601
1602               /* if there were substrings, these need to be added to
1603                * the list. */
1604               match_count = g_match_info_get_match_count (match_info);
1605               if (match_count > 1)
1606                 {
1607                   for (i = 1; i < match_count; i++)
1608                     list = g_list_prepend (list, g_match_info_fetch (match_info, i));
1609                 }
1610             }
1611         }
1612       else
1613         {
1614           /* if there was no match, copy to end of string. */
1615           if (!last_match_is_empty)
1616             {
1617               gchar *token = g_strndup (string + last_separator_end,
1618                                         match_info->string_len - last_separator_end);
1619               list = g_list_prepend (list, token);
1620             }
1621           /* no more tokens, end the loop. */
1622           break;
1623         }
1624
1625       /* -1 to leave room for the last part. */
1626       if (token_count >= max_tokens - 1)
1627         {
1628           /* we have reached the maximum number of tokens, so we copy
1629            * the remaining part of the string. */
1630           if (last_match_is_empty)
1631             {
1632               /* the last match was empty, so we have moved one char
1633                * after the real position to avoid empty matches at the
1634                * same position. */
1635               match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string;
1636             }
1637           /* the if is needed in the case we have terminated the available
1638            * tokens, but we are at the end of the string, so there are no
1639            * characters left to copy. */
1640           if (string_len > match_info->pos)
1641             {
1642               gchar *token = g_strndup (string + match_info->pos,
1643                                         string_len - match_info->pos);
1644               list = g_list_prepend (list, token);
1645             }
1646           /* end the loop. */
1647           break;
1648         }
1649
1650       last_separator_end = match_info->pos;
1651       if (last_match_is_empty)
1652         /* if the last match was empty, g_match_info_next() has moved
1653          * forward to avoid infinite loops, but we still need to copy that
1654          * character. */
1655         last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string;
1656
1657       match_ok = g_match_info_next (match_info, &tmp_error);
1658     }
1659   g_match_info_free (match_info);
1660   if (tmp_error != NULL)
1661     {
1662       g_propagate_error (error, tmp_error);
1663       g_list_foreach (list, (GFunc)g_free, NULL);
1664       g_list_free (list);
1665       match_info->pos = -1;
1666       return NULL;
1667     }
1668
1669   string_list = g_new (gchar *, g_list_length (list) + 1);
1670   i = 0;
1671   for (last = g_list_last (list); last; last = g_list_previous (last))
1672     string_list[i++] = last->data;
1673   string_list[i] = 0;
1674   g_list_free (list);
1675
1676   return string_list;
1677 }
1678
/* Kinds of pieces a parsed replacement string is split into; the value
 * is stored in the type field of InterpolationData. */
enum
{
  REPL_TYPE_STRING             = 0, /* literal text to copy */
  REPL_TYPE_CHARACTER          = 1, /* a single character from an escape */
  REPL_TYPE_SYMBOLIC_REFERENCE = 2, /* reference to a named group */
  REPL_TYPE_NUMERIC_REFERENCE  = 3, /* reference to a numbered group */
  REPL_TYPE_CHANGE_CASE        = 4  /* switches the case conversion mode */
};
1687
/* Case conversion modes used while building the replacement text.
 * Every mode is a distinct bit so that the *_MASK values can test for
 * a whole family of modes at once. */
typedef enum
{
  CHANGE_CASE_NONE         = 0x01, /* copy text unchanged */
  CHANGE_CASE_UPPER        = 0x02, /* upper-case all following text */
  CHANGE_CASE_LOWER        = 0x04, /* lower-case all following text */
  CHANGE_CASE_UPPER_SINGLE = 0x08, /* upper-case the next character only */
  CHANGE_CASE_LOWER_SINGLE = 0x10, /* lower-case the next character only */
  /* modes that apply to a single character */
  CHANGE_CASE_SINGLE_MASK  = CHANGE_CASE_LOWER_SINGLE | CHANGE_CASE_UPPER_SINGLE,
  /* modes that convert to lower case */
  CHANGE_CASE_LOWER_MASK   = CHANGE_CASE_LOWER_SINGLE | CHANGE_CASE_LOWER,
  /* modes that convert to upper case */
  CHANGE_CASE_UPPER_MASK   = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_UPPER
} ChangeCase;
1699
1700 struct _InterpolationData
1701 {
1702   gchar     *text;
1703   gint       type;
1704   gint       num;
1705   gchar      c;
1706   ChangeCase change_case;
1707 };
1708
1709 static void
1710 free_interpolation_data (InterpolationData *data)
1711 {
1712   g_free (data->text);
1713   g_free (data);
1714 }
1715
1716 static const gchar *
1717 expand_escape (const gchar        *replacement,
1718                const gchar        *p,
1719                InterpolationData  *data,
1720                GError            **error)
1721 {
1722   const gchar *q, *r;
1723   gint x, d, h, i;
1724   const gchar *error_detail;
1725   gint base = 0;
1726   GError *tmp_error = NULL;
1727
1728   p++;
1729   switch (*p)
1730     {
1731     case 't':
1732       p++;
1733       data->c = '\t';
1734       data->type = REPL_TYPE_CHARACTER;
1735       break;
1736     case 'n':
1737       p++;
1738       data->c = '\n';
1739       data->type = REPL_TYPE_CHARACTER;
1740       break;
1741     case 'v':
1742       p++;
1743       data->c = '\v';
1744       data->type = REPL_TYPE_CHARACTER;
1745       break;
1746     case 'r':
1747       p++;
1748       data->c = '\r';
1749       data->type = REPL_TYPE_CHARACTER;
1750       break;
1751     case 'f':
1752       p++;
1753       data->c = '\f';
1754       data->type = REPL_TYPE_CHARACTER;
1755       break;
1756     case 'a':
1757       p++;
1758       data->c = '\a';
1759       data->type = REPL_TYPE_CHARACTER;
1760       break;
1761     case 'b':
1762       p++;
1763       data->c = '\b';
1764       data->type = REPL_TYPE_CHARACTER;
1765       break;
1766     case '\\':
1767       p++;
1768       data->c = '\\';
1769       data->type = REPL_TYPE_CHARACTER;
1770       break;
1771     case 'x':
1772       p++;
1773       x = 0;
1774       if (*p == '{')
1775         {
1776           p++;
1777           do
1778             {
1779               h = g_ascii_xdigit_value (*p);
1780               if (h < 0)
1781                 {
1782                   error_detail = _("hexadecimal digit or '}' expected");
1783                   goto error;
1784                 }
1785               x = x * 16 + h;
1786               p++;
1787             }
1788           while (*p != '}');
1789           p++;
1790         }
1791       else
1792         {
1793           for (i = 0; i < 2; i++)
1794             {
1795               h = g_ascii_xdigit_value (*p);
1796               if (h < 0)
1797                 {
1798                   error_detail = _("hexadecimal digit expected");
1799                   goto error;
1800                 }
1801               x = x * 16 + h;
1802               p++;
1803             }
1804         }
1805       data->type = REPL_TYPE_STRING;
1806       data->text = g_new0 (gchar, 8);
1807       g_unichar_to_utf8 (x, data->text);
1808       break;
1809     case 'l':
1810       p++;
1811       data->type = REPL_TYPE_CHANGE_CASE;
1812       data->change_case = CHANGE_CASE_LOWER_SINGLE;
1813       break;
1814     case 'u':
1815       p++;
1816       data->type = REPL_TYPE_CHANGE_CASE;
1817       data->change_case = CHANGE_CASE_UPPER_SINGLE;
1818       break;
1819     case 'L':
1820       p++;
1821       data->type = REPL_TYPE_CHANGE_CASE;
1822       data->change_case = CHANGE_CASE_LOWER;
1823       break;
1824     case 'U':
1825       p++;
1826       data->type = REPL_TYPE_CHANGE_CASE;
1827       data->change_case = CHANGE_CASE_UPPER;
1828       break;
1829     case 'E':
1830       p++;
1831       data->type = REPL_TYPE_CHANGE_CASE;
1832       data->change_case = CHANGE_CASE_NONE;
1833       break;
1834     case 'g':
1835       p++;
1836       if (*p != '<')
1837         {
1838           error_detail = _("missing '<' in symbolic reference");
1839           goto error;
1840         }
1841       q = p + 1;
1842       do
1843         {
1844           p++;
1845           if (!*p)
1846             {
1847               error_detail = _("unfinished symbolic reference");
1848               goto error;
1849             }
1850         }
1851       while (*p != '>');
1852       if (p - q == 0)
1853         {
1854           error_detail = _("zero-length symbolic reference");
1855           goto error;
1856         }
1857       if (g_ascii_isdigit (*q))
1858         {
1859           x = 0;
1860           do
1861             {
1862               h = g_ascii_digit_value (*q);
1863               if (h < 0)
1864                 {
1865                   error_detail = _("digit expected");
1866                   p = q;
1867                   goto error;
1868                 }
1869               x = x * 10 + h;
1870               q++;
1871             }
1872           while (q != p);
1873           data->num = x;
1874           data->type = REPL_TYPE_NUMERIC_REFERENCE;
1875         }
1876       else
1877         {
1878           r = q;
1879           do
1880             {
1881               if (!g_ascii_isalnum (*r))
1882                 {
1883                   error_detail = _("illegal symbolic reference");
1884                   p = r;
1885                   goto error;
1886                 }
1887               r++;
1888             }
1889           while (r != p);
1890           data->text = g_strndup (q, p - q);
1891           data->type = REPL_TYPE_SYMBOLIC_REFERENCE;
1892         }
1893       p++;
1894       break;
1895     case '0':
1896       /* if \0 is followed by a number is an octal number representing a
1897        * character, else it is a numeric reference. */
1898       if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0)
1899         {
1900           base = 8;
1901           p = g_utf8_next_char (p);
1902         }
1903     case '1':
1904     case '2':
1905     case '3':
1906     case '4':
1907     case '5':
1908     case '6':
1909     case '7':
1910     case '8':
1911     case '9':
1912       x = 0;
1913       d = 0;
1914       for (i = 0; i < 3; i++)
1915         {
1916           h = g_ascii_digit_value (*p);
1917           if (h < 0)
1918             break;
1919           if (h > 7)
1920             {
1921               if (base == 8)
1922                 break;
1923               else
1924                 base = 10;
1925             }
1926           if (i == 2 && base == 10)
1927             break;
1928           x = x * 8 + h;
1929           d = d * 10 + h;
1930           p++;
1931         }
1932       if (base == 8 || i == 3)
1933         {
1934           data->type = REPL_TYPE_STRING;
1935           data->text = g_new0 (gchar, 8);
1936           g_unichar_to_utf8 (x, data->text);
1937         }
1938       else
1939         {
1940           data->type = REPL_TYPE_NUMERIC_REFERENCE;
1941           data->num = d;
1942         }
1943       break;
1944     case 0:
1945       error_detail = _("stray final '\\'");
1946       goto error;
1947       break;
1948     default:
1949       error_detail = _("unknown escape sequence");
1950       goto error;
1951     }
1952
1953   return p;
1954
1955  error:
1956   /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */
1957   tmp_error = g_error_new (G_REGEX_ERROR,
1958                            G_REGEX_ERROR_REPLACE,
1959                            _("Error while parsing replacement "
1960                              "text \"%s\" at char %lu: %s"),
1961                            replacement,
1962                            (gulong)(p - replacement),
1963                            error_detail);
1964   g_propagate_error (error, tmp_error);
1965
1966   return NULL;
1967 }
1968
1969 static GList *
1970 split_replacement (const gchar  *replacement,
1971                    GError      **error)
1972 {
1973   GList *list = NULL;
1974   InterpolationData *data;
1975   const gchar *p, *start;
1976
1977   start = p = replacement;
1978   while (*p)
1979     {
1980       if (*p == '\\')
1981         {
1982           data = g_new0 (InterpolationData, 1);
1983           start = p = expand_escape (replacement, p, data, error);
1984           if (p == NULL)
1985             {
1986               g_list_foreach (list, (GFunc)free_interpolation_data, NULL);
1987               g_list_free (list);
1988               free_interpolation_data (data);
1989
1990               return NULL;
1991             }
1992           list = g_list_prepend (list, data);
1993         }
1994       else
1995         {
1996           p++;
1997           if (*p == '\\' || *p == '\0')
1998             {
1999               if (p - start > 0)
2000                 {
2001                   data = g_new0 (InterpolationData, 1);
2002                   data->text = g_strndup (start, p - start);
2003                   data->type = REPL_TYPE_STRING;
2004                   list = g_list_prepend (list, data);
2005                 }
2006             }
2007         }
2008     }
2009
2010   return g_list_reverse (list);
2011 }
2012
/* Converts c to lower case if change_case has one of the "lower" bits
 * set, and to upper case in every other case. */
#define CHANGE_CASE(c, change_case) \
        ((((change_case) & CHANGE_CASE_LOWER_MASK) != 0) \
                ? g_unichar_tolower (c) \
                : g_unichar_toupper (c))
2018
2019 static void
2020 string_append (GString     *string,
2021                const gchar *text,
2022                ChangeCase  *change_case)
2023 {
2024   gunichar c;
2025
2026   if (text[0] == '\0')
2027     return;
2028
2029   if (*change_case == CHANGE_CASE_NONE)
2030     {
2031       g_string_append (string, text);
2032     }
2033   else if (*change_case & CHANGE_CASE_SINGLE_MASK)
2034     {
2035       c = g_utf8_get_char (text);
2036       g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
2037       g_string_append (string, g_utf8_next_char (text));
2038       *change_case = CHANGE_CASE_NONE;
2039     }
2040   else
2041     {
2042       while (*text != '\0')
2043         {
2044           c = g_utf8_get_char (text);
2045           g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
2046           text = g_utf8_next_char (text);
2047         }
2048     }
2049 }
2050
2051 static gboolean
2052 interpolate_replacement (const GMatchInfo *match_info,
2053                          GString          *result,
2054                          gpointer          data)
2055 {
2056   GList *list;
2057   InterpolationData *idata;
2058   gchar *match;
2059   ChangeCase change_case = CHANGE_CASE_NONE;
2060
2061   for (list = data; list; list = list->next)
2062     {
2063       idata = list->data;
2064       switch (idata->type)
2065         {
2066         case REPL_TYPE_STRING:
2067           string_append (result, idata->text, &change_case);
2068           break;
2069         case REPL_TYPE_CHARACTER:
2070           g_string_append_c (result, CHANGE_CASE (idata->c, change_case));
2071           if (change_case & CHANGE_CASE_SINGLE_MASK)
2072             change_case = CHANGE_CASE_NONE;
2073           break;
2074         case REPL_TYPE_NUMERIC_REFERENCE:
2075           match = g_match_info_fetch (match_info, idata->num);
2076           if (match)
2077             {
2078               string_append (result, match, &change_case);
2079               g_free (match);
2080             }
2081           break;
2082         case REPL_TYPE_SYMBOLIC_REFERENCE:
2083           match = g_match_info_fetch_named (match_info, idata->text);
2084           if (match)
2085             {
2086               string_append (result, match, &change_case);
2087               g_free (match);
2088             }
2089           break;
2090         case REPL_TYPE_CHANGE_CASE:
2091           change_case = idata->change_case;
2092           break;
2093         }
2094     }
2095
2096   return FALSE;
2097 }
2098
2099 /* whether actual match_info is needed for replacement, i.e.
2100  * whether there are references
2101  */
2102 static gboolean
2103 interpolation_list_needs_match (GList *list)
2104 {
2105   while (list != NULL)
2106     {
2107       InterpolationData *data = list->data;
2108
2109       if (data->type == REPL_TYPE_SYMBOLIC_REFERENCE ||
2110           data->type == REPL_TYPE_NUMERIC_REFERENCE)
2111         {
2112           return TRUE;
2113         }
2114
2115       list = list->next;
2116     }
2117
2118   return FALSE;
2119 }
2120
2121 /**
2122  * g_regex_replace:
2123  * @regex: a #GRegex structure
2124  * @string: the string to perform matches against
2125  * @string_len: the length of @string, or -1 if @string is nul-terminated
2126  * @start_position: starting index of the string to match
2127  * @replacement: text to replace each match with
2128  * @match_options: options for the match
2129  * @error: location to store the error occuring, or %NULL to ignore errors
2130  *
2131  * Replaces all occurances of the pattern in @regex with the
2132  * replacement text. Backreferences of the form '\number' or
2133  * '\g&lt;number&gt;' in the replacement text are interpolated by the
2134  * number-th captured subexpression of the match, '\g&lt;name&gt;' refers
2135  * to the captured subexpression with the given name. '\0' refers to the
2136  * complete match, but '\0' followed by a number is the octal representation
2137  * of a character. To include a literal '\' in the replacement, write '\\'.
2138  * There are also escapes that changes the case of the following text:
2139  *
2140  * <variablelist>
2141  * <varlistentry><term>\l</term>
2142  * <listitem>
2143  * <para>Convert to lower case the next character</para>
2144  * </listitem>
2145  * </varlistentry>
2146  * <varlistentry><term>\u</term>
2147  * <listitem>
2148  * <para>Convert to upper case the next character</para>
2149  * </listitem>
2150  * </varlistentry>
2151  * <varlistentry><term>\L</term>
2152  * <listitem>
2153  * <para>Convert to lower case till \E</para>
2154  * </listitem>
2155  * </varlistentry>
2156  * <varlistentry><term>\U</term>
2157  * <listitem>
2158  * <para>Convert to upper case till \E</para>
2159  * </listitem>
2160  * </varlistentry>
2161  * <varlistentry><term>\E</term>
2162  * <listitem>
2163  * <para>End case modification</para>
2164  * </listitem>
2165  * </varlistentry>
2166  * </variablelist>
2167  *
2168  * If you do not need to use backreferences use g_regex_replace_literal().
2169  *
2170  * The @replacement string must be UTF-8 encoded even if #G_REGEX_RAW was
2171  * passed to g_regex_new(). If you want to use not UTF-8 encoded stings
2172  * you can use g_regex_replace_literal().
2173  *
2174  * Setting @start_position differs from just passing over a shortened
2175  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that
2176  * begins with any kind of lookbehind assertion, such as "\b".
2177  *
2178  * Returns: a newly allocated string containing the replacements
2179  *
2180  * Since: 2.14
2181  */
2182 gchar *
2183 g_regex_replace (const GRegex      *regex,
2184                  const gchar       *string,
2185                  gssize             string_len,
2186                  gint               start_position,
2187                  const gchar       *replacement,
2188                  GRegexMatchFlags   match_options,
2189                  GError           **error)
2190 {
2191   gchar *result;
2192   GList *list;
2193   GError *tmp_error = NULL;
2194
2195   g_return_val_if_fail (regex != NULL, NULL);
2196   g_return_val_if_fail (string != NULL, NULL);
2197   g_return_val_if_fail (start_position >= 0, NULL);
2198   g_return_val_if_fail (replacement != NULL, NULL);
2199   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
2200   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2201
2202   list = split_replacement (replacement, &tmp_error);
2203   if (tmp_error != NULL)
2204     {
2205       g_propagate_error (error, tmp_error);
2206       return NULL;
2207     }
2208
2209   result = g_regex_replace_eval (regex,
2210                                  string, string_len, start_position,
2211                                  match_options,
2212                                  interpolate_replacement,
2213                                  (gpointer)list,
2214                                  &tmp_error);
2215   if (tmp_error != NULL)
2216     g_propagate_error (error, tmp_error);
2217
2218   g_list_foreach (list, (GFunc)free_interpolation_data, NULL);
2219   g_list_free (list);
2220
2221   return result;
2222 }
2223
2224 static gboolean
2225 literal_replacement (const GMatchInfo *match_info,
2226                      GString          *result,
2227                      gpointer          data)
2228 {
2229   g_string_append (result, data);
2230   return FALSE;
2231 }
2232
2233 /**
2234  * g_regex_replace_literal:
2235  * @regex: a #GRegex structure
2236  * @string: the string to perform matches against
2237  * @string_len: the length of @string, or -1 if @string is nul-terminated
2238  * @start_position: starting index of the string to match
2239  * @replacement: text to replace each match with
2240  * @match_options: options for the match
2241  * @error: location to store the error occuring, or %NULL to ignore errors
2242  *
2243  * Replaces all occurances of the pattern in @regex with the
2244  * replacement text. @replacement is replaced literally, to
2245  * include backreferences use g_regex_replace().
2246  *
2247  * Setting @start_position differs from just passing over a
2248  * shortened string and setting #G_REGEX_MATCH_NOTBOL in the
2249  * case of a pattern that begins with any kind of lookbehind
2250  * assertion, such as "\b".
2251  *
2252  * Returns: a newly allocated string containing the replacements
2253  *
2254  * Since: 2.14
2255  */
2256 gchar *
2257 g_regex_replace_literal (const GRegex      *regex,
2258                          const gchar       *string,
2259                          gssize             string_len,
2260                          gint               start_position,
2261                          const gchar       *replacement,
2262                          GRegexMatchFlags   match_options,
2263                          GError           **error)
2264 {
2265   g_return_val_if_fail (replacement != NULL, NULL);
2266   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2267
2268   return g_regex_replace_eval (regex,
2269                                string, string_len, start_position,
2270                                match_options,
2271                                literal_replacement,
2272                                (gpointer)replacement,
2273                                error);
2274 }
2275
2276 /**
2277  * g_regex_replace_eval:
2278  * @regex: a #GRegex structure from g_regex_new()
2279  * @string: string to perform matches against
2280  * @string_len: the length of @string, or -1 if @string is nul-terminated
2281  * @start_position: starting index of the string to match
2282  * @match_options: options for the match
2283  * @eval: a function to call for each match
2284  * @user_data: user data to pass to the function
2285  * @error: location to store the error occuring, or %NULL to ignore errors
2286  *
2287  * Replaces occurances of the pattern in regex with the output of
2288  * @eval for that occurance.
2289  *
2290  * Setting @start_position differs from just passing over a shortened
2291  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
2292  * that begins with any kind of lookbehind assertion, such as "\b".
2293  *
2294  * Returns: a newly allocated string containing the replacements
2295  *
2296  * Since: 2.14
2297  */
2298 gchar *
2299 g_regex_replace_eval (const GRegex        *regex,
2300                       const gchar         *string,
2301                       gssize               string_len,
2302                       gint                 start_position,
2303                       GRegexMatchFlags     match_options,
2304                       GRegexEvalCallback   eval,
2305                       gpointer             user_data,
2306                       GError             **error)
2307 {
2308   GMatchInfo *match_info;
2309   GString *result;
2310   gint str_pos = 0;
2311   gboolean done = FALSE;
2312   GError *tmp_error = NULL;
2313
2314   g_return_val_if_fail (regex != NULL, NULL);
2315   g_return_val_if_fail (string != NULL, NULL);
2316   g_return_val_if_fail (start_position >= 0, NULL);
2317   g_return_val_if_fail (eval != NULL, NULL);
2318   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2319
2320   if (string_len < 0)
2321     string_len = strlen (string);
2322
2323   result = g_string_sized_new (string_len);
2324
2325   /* run down the string making matches. */
2326   g_regex_match_full (regex, string, string_len, start_position,
2327                       match_options, &match_info, &tmp_error);
2328   while (!done && g_match_info_matches (match_info))
2329     {
2330       g_string_append_len (result,
2331                            string + str_pos,
2332                            match_info->offsets[0] - str_pos);
2333       done = (*eval) (match_info, result, user_data);
2334       str_pos = match_info->offsets[1];
2335       g_match_info_next (match_info, &tmp_error);
2336     }
2337   g_match_info_free (match_info);
2338   if (tmp_error != NULL)
2339     {
2340       g_propagate_error (error, tmp_error);
2341       g_string_free (result, TRUE);
2342       return NULL;
2343     }
2344
2345   g_string_append_len (result, string + str_pos, string_len - str_pos);
2346   return g_string_free (result, FALSE);
2347 }
2348
2349 /**
2350  * g_regex_check_replacement:
2351  * @replacement: the replacement string
2352  * @has_references: location to store information about
2353  *   references in @replacement or %NULL
2354  * @error: location to store error
2355  *
2356  * Checks whether @replacement is a valid replacement string
2357  * (see g_regex_replace()), i.e. that all escape sequences in
2358  * it are valid.
2359  *
2360  * If @has_references is not %NULL then @replacement is checked
2361  * for pattern references. For instance, replacement text 'foo\n'
2362  * does not contain references and may be evaluated without information
2363  * about actual match, but '\0\1' (whole match followed by first
2364  * subpattern) requires valid #GMatchInfo object.
2365  *
2366  * Returns: whether @replacement is a valid replacement string
2367  *
2368  * Since: 2.14
2369  */
2370 gboolean
2371 g_regex_check_replacement (const gchar  *replacement,
2372                            gboolean     *has_references,
2373                            GError      **error)
2374 {
2375   GList *list;
2376   GError *tmp = NULL;
2377
2378   list = split_replacement (replacement, &tmp);
2379
2380   if (tmp)
2381   {
2382     g_propagate_error (error, tmp);
2383     return FALSE;
2384   }
2385
2386   if (has_references)
2387     *has_references = interpolation_list_needs_match (list);
2388
2389   g_list_foreach (list, (GFunc) free_interpolation_data, NULL);
2390   g_list_free (list);
2391
2392   return TRUE;
2393 }
2394
2395 /**
2396  * g_regex_escape_string:
2397  * @string: the string to escape
2398  * @length: the length of @string, or -1 if @string is nul-terminated
2399  *
2400  * Escapes the special characters used for regular expressions
2401  * in @string, for instance "a.b*c" becomes "a\.b\*c". This
2402  * function is useful to dynamically generate regular expressions.
2403  *
2404  * @string can contain nul characters that are replaced with "\0",
2405  * in this case remember to specify the correct length of @string
2406  * in @length.
2407  *
2408  * Returns: a newly-allocated escaped string
2409  *
2410  * Since: 2.14
2411  */
2412 gchar *
2413 g_regex_escape_string (const gchar *string,
2414                        gint         length)
2415 {
2416   GString *escaped;
2417   const char *p, *piece_start, *end;
2418
2419   g_return_val_if_fail (string != NULL, NULL);
2420
2421   if (length < 0)
2422     length = strlen (string);
2423
2424   end = string + length;
2425   p = piece_start = string;
2426   escaped = g_string_sized_new (length + 1);
2427
2428   while (p < end)
2429     {
2430       switch (*p)
2431         {
2432         case '\0':
2433         case '\\':
2434         case '|':
2435         case '(':
2436         case ')':
2437         case '[':
2438         case ']':
2439         case '{':
2440         case '}':
2441         case '^':
2442         case '$':
2443         case '*':
2444         case '+':
2445         case '?':
2446         case '.':
2447           if (p != piece_start)
2448             /* copy the previous piece. */
2449             g_string_append_len (escaped, piece_start, p - piece_start);
2450           g_string_append_c (escaped, '\\');
2451           if (*p == '\0')
2452             g_string_append_c (escaped, '0');
2453           else
2454             g_string_append_c (escaped, *p);
2455           piece_start = ++p;
2456           break;
2457         default:
2458           p = g_utf8_next_char (p);
2459           break;
2460         }
2461   }
2462
2463   if (piece_start < end)
2464     g_string_append_len (escaped, piece_start, end - piece_start);
2465
2466   return g_string_free (escaped, FALSE);
2467 }
2468
2469 #define __G_REGEX_C__
2470 #include "galiasdef.c"