glib/gregex.c

   1 /* GRegex -- regular expression API wrapper around PCRE.
   2  *
   3  * Copyright (C) 1999, 2000 Scott Wimer
   4  * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com>
   5  * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org>
   6  *
   7  * This library is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * This library is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with this library; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  20  */
  21
  22 #include <config.h>
  23
  24 #include "gregex.h"
  25
  26 #include <glib.h>
  27 #include <glib/gi18n.h>
  28 #include <string.h>
  29
  30 #ifdef USE_SYSTEM_PCRE
  31 #include <pcre.h>
  32 #else
  33 #include "pcre/pcre.h"
  34 #endif
  35
  36 #include "galias.h"
  37
  38 /* Mask of all the possible values for GRegexCompileFlags. */
  39 #define G_REGEX_COMPILE_MASK (G_REGEX_CASELESS          | \
  40                               G_REGEX_MULTILINE         | \
  41                               G_REGEX_DOTALL            | \
  42                               G_REGEX_EXTENDED          | \
  43                               G_REGEX_ANCHORED          | \
  44                               G_REGEX_DOLLAR_ENDONLY    | \
  45                               G_REGEX_UNGREEDY          | \
  46                               G_REGEX_RAW               | \
  47                               G_REGEX_NO_AUTO_CAPTURE   | \
  48                               G_REGEX_DUPNAMES          | \
  49                               G_REGEX_NEWLINE_CR        | \
  50                               G_REGEX_NEWLINE_LF        | \
  51                               G_REGEX_NEWLINE_CRLF)
  52
  53 /* Mask of all the possible values for GRegexMatchFlags. */
  54 #define G_REGEX_MATCH_MASK (G_REGEX_MATCH_ANCHORED      | \
  55                             G_REGEX_MATCH_NOTBOL        | \
  56                             G_REGEX_MATCH_NOTEOL        | \
  57                             G_REGEX_MATCH_NOTEMPTY      | \
  58                             G_REGEX_MATCH_PARTIAL       | \
  59                             G_REGEX_MATCH_NEWLINE_CR    | \
  60                             G_REGEX_MATCH_NEWLINE_LF    | \
  61                             G_REGEX_MATCH_NEWLINE_CRLF  | \
  62                             G_REGEX_MATCH_NEWLINE_ANY)
  63
  64 /* if the string is in UTF-8 use g_utf8_ functions, else use
  65  * use just +/- 1. */
  66 #define NEXT_CHAR(re, s) (((re)->pattern->compile_opts & PCRE_UTF8) ? \
  67                                 g_utf8_next_char (s) : \
  68                                 ((s) + 1))
  69 #define PREV_CHAR(re, s) (((re)->pattern->compile_opts & PCRE_UTF8) ? \
  70                                 g_utf8_prev_char (s) : \
  71                                 ((s) - 1))
  72
  73 #define WORKSPACE_INITIAL 1000
  74 #define OFFSETS_DFA_MIN_SIZE 21
  75
  76 /* atomically returns the pcre_extra struct in the regex. */
  77 #define REGEX_GET_EXTRA(re) ((pcre_extra *)g_atomic_pointer_get (&(re)->pattern->extra))
  78
  79 /* this struct can be shared by more regexes */
  80 typedef struct
  81 {
  82   volatile guint ref_count;     /* the ref count for the immutable part */
  83   gchar *pattern;               /* the pattern */
  84   pcre *pcre_re;                /* compiled form of the pattern */
  85   GRegexCompileFlags compile_opts;      /* options used at compile time on the pattern */
  86   GRegexMatchFlags match_opts;  /* options used at match time on the regex */
  87   pcre_extra *extra;            /* data stored when g_regex_optimize() is used */
  88 } GRegexPattern;
  89
  90 /* this struct is used only by a single regex */
  91 typedef struct
  92 {
  93   gint matches;                 /* number of matching sub patterns */
  94   gint pos;                     /* position in the string where last match left off */
  95   gint *offsets;                /* array of offsets paired 0,1 ; 2,3 ; 3,4 etc */
  96   gint n_offsets;               /* number of offsets */
  97   gint *workspace;              /* workspace for pcre_dfa_exec() */
  98   gint n_workspace;             /* number of workspace elements */
  99   gssize string_len;            /* length of the string last used against */
 100   GSList *delims;               /* delimiter sub strings from split next */
 101   gint last_separator_end;      /* position of the last separator for split_next_full() */
 102   gboolean last_match_is_empty; /* was the last match in split_next_full() 0 bytes long? */
 103 } GRegexMatch;
 104
 105 struct _GRegex
 106 {
 107   GRegexPattern *pattern;       /* immutable part, shared */
 108   GRegexMatch *match;           /* mutable part, not shared */
 109 };
 110
 111 /* TRUE if ret is an error code, FALSE otherwise. */
 112 #define IS_PCRE_ERROR(ret) ((ret) < PCRE_ERROR_NOMATCH && (ret) != PCRE_ERROR_PARTIAL)
 113
 114 static const gchar *
 115 match_error (gint errcode)
 116 {
 117   switch (errcode)
 118     {
 119     case PCRE_ERROR_NOMATCH:
 120       /* not an error */
 121       break;
 122     case PCRE_ERROR_NULL:
 123       /* NULL argument, this should not happen in GRegex */
 124       g_warning ("A NULL argument was passed to PCRE");
 125       break;
 126     case PCRE_ERROR_BADOPTION:
 127       return "bad options";
 128     case PCRE_ERROR_BADMAGIC:
 129       return _("corrupted object");
 130     case PCRE_ERROR_UNKNOWN_OPCODE:
 131       return N_("internal error or corrupted object");
 132     case PCRE_ERROR_NOMEMORY:
 133       return _("out of memory");
 134     case PCRE_ERROR_NOSUBSTRING:
 135       /* not used by pcre_exec() */
 136       break;
 137     case PCRE_ERROR_MATCHLIMIT:
 138       return _("backtracking limit reached");
 139     case PCRE_ERROR_CALLOUT:
 140       /* callouts are not implemented */
 141       break;
 142     case PCRE_ERROR_BADUTF8:
 143     case PCRE_ERROR_BADUTF8_OFFSET:
 144       /* we do not check if strings are valid */
 145       break;
 146     case PCRE_ERROR_PARTIAL:
 147       /* not an error */
 148       break;
 149     case PCRE_ERROR_BADPARTIAL:
 150       return _("the pattern contains items not supported for partial matching");
 151     case PCRE_ERROR_INTERNAL:
 152       return _("internal error");
 153     case PCRE_ERROR_BADCOUNT:
 154       /* negative ovecsize, this should not happen in GRegex */
 155       g_warning ("A negative ovecsize was passed to PCRE");
 156       break;
 157     case PCRE_ERROR_DFA_UITEM:
 158       return _("the pattern contains items not supported for partial matching");
 159     case PCRE_ERROR_DFA_UCOND:
 160       return _("back references as conditions are not supported for partial matching");
 161     case PCRE_ERROR_DFA_UMLIMIT:
 162       /* the match_field field is not udes in GRegex */
 163       break;
 164     case PCRE_ERROR_DFA_WSSIZE:
 165       /* handled expanding the workspace */
 166       break;
 167     case PCRE_ERROR_DFA_RECURSE:
 168     case PCRE_ERROR_RECURSIONLIMIT:
 169       return _("recursion limit reached");
 170     case PCRE_ERROR_NULLWSLIMIT:
 171       return _("workspace limit for empty substrings reached");
 172     case PCRE_ERROR_BADNEWLINE:
 173       return _("invalid combination of newline flags");
 174     default:
 175       break;
 176     }
 177   return _("unknown error");
 178 }
 179
 180 GQuark
 181 g_regex_error_quark (void)
 182 {
 183   static GQuark error_quark = 0;
 184
 185   if (error_quark == 0)
 186     error_quark = g_quark_from_static_string ("g-regex-error-quark");
 187
 188   return error_quark;
 189 }
 190
 191 static GRegexPattern *
 192 regex_pattern_new (pcre              *re,
 193                    const gchar       *pattern,
 194                    GRegexCompileFlags compile_options,
 195                    GRegexMatchFlags   match_options)
 196 {
 197   GRegexPattern *rp = g_new0 (GRegexPattern, 1);
 198   rp->ref_count = 1;
 199   rp->pcre_re = re;
 200   rp->pattern = g_strdup (pattern);
 201   rp->compile_opts = compile_options;
 202   rp->match_opts = match_options;
 203   return rp;
 204 }
 205
 206 static GRegexPattern *
 207 regex_pattern_ref (GRegexPattern *rp)
 208 {
 209   /* increases the ref count of the immutable part of the GRegex */
 210   g_atomic_int_inc ((gint*) &rp->ref_count);
 211   return rp;
 212 }
 213
 214 static void
 215 regex_pattern_unref (GRegexPattern *rp)
 216 {
 217   /* decreases the ref count of the immutable part of the GRegex
 218    * and deletes it if the ref count went to 0 */
 219   if (g_atomic_int_exchange_and_add ((gint *) &rp->ref_count, -1) - 1 == 0)
 220     {
 221       g_free (rp->pattern);
 222       if (rp->pcre_re != NULL)
 223         pcre_free (rp->pcre_re);
 224       if (rp->extra != NULL)
 225         pcre_free (rp->extra);
 226       g_free (rp);
 227     }
 228 }
 229
 230 static void
 231 regex_match_free (GRegexMatch *rm)
 232 {
 233   if (rm == NULL)
 234     return;
 235
 236   g_slist_foreach (rm->delims, (GFunc) g_free, NULL);
 237   g_slist_free (rm->delims);
 238   g_free (rm->offsets);
 239   g_free (rm->workspace);
 240   g_free (rm);
 241 }
 242
 243 static void
 244 regex_lazy_init_match (GRegex *regex,
 245                        gint    min_offsets)
 246 {
 247   gint n_offsets;
 248
 249   if (regex->match != NULL)
 250     return;
 251
 252   pcre_fullinfo (regex->pattern->pcre_re,
 253                  REGEX_GET_EXTRA (regex),
 254                  PCRE_INFO_CAPTURECOUNT, &n_offsets);
 255   n_offsets = (MAX (n_offsets, min_offsets) + 1) * 3;
 256
 257   regex->match = g_new0 (GRegexMatch, 1);
 258   regex->match->string_len = -1;
 259   regex->match->matches = -1000;
 260   regex->match->n_offsets = n_offsets;
 261   regex->match->offsets = g_new0 (gint, n_offsets);
 262 }
 263
 264 /**
 265  * g_regex_new:
 266  * @pattern: the regular expression
 267  * @compile_options: compile options for the regular expression
 268  * @match_options: match options for the regular expression
 269  * @error: return location for a #GError
 270  *
 271  * Compiles the regular expression to an internal form, and does the initial
 272  * setup of the #GRegex structure.
 273  *
 274  * Returns: a #GRegex structure
 275  *
 276  * Since: 2.14
 277  */
 278 GRegex *
 279 g_regex_new (const gchar         *pattern,
 280              GRegexCompileFlags   compile_options,
 281              GRegexMatchFlags     match_options,
 282              GError             **error)
 283 {
 284   pcre *re;
 285   const gchar *errmsg;
 286   gint erroffset;
 287   static gboolean initialized = FALSE;
 288
 289   g_return_val_if_fail (pattern != NULL, NULL);
 290   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
 291   g_return_val_if_fail ((compile_options & ~G_REGEX_COMPILE_MASK) == 0, NULL);
 292   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
 293
 294   if (!initialized)
 295     {
 296       gint support;
 297       const gchar *msg;
 298
 299       pcre_config (PCRE_CONFIG_UTF8, &support);
 300       if (!support)
 301         {
 302           msg = N_("PCRE library is compiled without UTF8 support");
 303           g_critical (msg);
 304           g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg));
 305           return NULL;
 306         }
 307
 308       pcre_config (PCRE_CONFIG_UNICODE_PROPERTIES, &support);
 309       if (!support)
 310         {
 311           msg = N_("PCRE library is compiled without UTF8 properties support");
 312           g_critical (msg);
 313           g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg));
 314           return NULL;
 315         }
 316
 317       initialized = TRUE;
 318     }
 319
 320   /* In GRegex the string are, by default, UTF-8 encoded. PCRE
 321    * instead uses UTF-8 only if required with PCRE_UTF8. */
 322   if (compile_options & G_REGEX_RAW)
 323     {
 324       /* disable utf-8 */
 325       compile_options &= ~G_REGEX_RAW;
 326     }
 327   else
 328     {
 329       /* enable utf-8 */
 330       compile_options |= PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
 331       match_options |= PCRE_NO_UTF8_CHECK;
 332     }
 333
 334   /* compile the pattern */
 335   re = pcre_compile (pattern, compile_options, &errmsg, &erroffset, NULL);
 336
 337   /* if the compilation failed, set the error member and return
 338    * immediately */
 339   if (re == NULL)
 340     {
 341       GError *tmp_error = g_error_new (G_REGEX_ERROR,
 342                                        G_REGEX_ERROR_COMPILE,
 343                                        _("Error while compiling regular "
 344                                          "expression %s at char %d: %s"),
 345                                        pattern, erroffset, errmsg);
 346       g_propagate_error (error, tmp_error);
 347
 348       return NULL;
 349     }
 350   else
 351     {
 352       GRegex *regex = g_new0 (GRegex, 1);
 353       regex->pattern = regex_pattern_new (re, pattern,
 354                                           compile_options, match_options);
 355       return regex;
 356     }
 357 }
 358
 359 /**
 360  * g_regex_free:
 361  * @regex: a #GRegex
 362  *
 363  * Frees all the memory associated with the regex structure.
 364  *
 365  * Since: 2.14
 366  */
 367 void
 368 g_regex_free (GRegex *regex)
 369 {
 370   if (regex == NULL)
 371     return;
 372
 373   regex_pattern_unref (regex->pattern);
 374   regex_match_free (regex->match);
 375   g_free (regex);
 376 }
 377
 378 /**
 379  * g_regex_copy:
 380  * @regex: a #GRegex structure from g_regex_new()
 381  *
 382  * Copies a #GRegex. The returned #Gregex is in the same state as after
 383  * a call to g_regex_clear(), so it does not contain information on the
 384  * last match. If @regex is %NULL it returns %NULL.
 385  *
 386  * The returned copy shares some of its internal state with the original
 387  * @regex, and the other internal variables are created only when needed,
 388  * so the copy is a lightweight operation.
 389  *
 390  * Returns: a newly-allocated copy of @regex, or %NULL if an error
 391  *          occurred
 392  *
 393  * Since: 2.14
 394  */
 395 GRegex *
 396 g_regex_copy (const GRegex *regex)
 397 {
 398   GRegex *copy;
 399
 400   if (regex == NULL)
 401     return NULL;
 402
 403   copy = g_new0 (GRegex, 1);
 404   copy->pattern = regex_pattern_ref (regex->pattern);
 405
 406   return copy;
 407 }
 408
 409 /**
 410  * g_regex_get_pattern:
 411  * @regex: a #GRegex structure
 412  *
 413  * Gets the pattern string associated with @regex, i.e. a copy of
 414  * the string passed to g_regex_new().
 415  *
 416  * Returns: the pattern of @regex
 417  *
 418  * Since: 2.14
 419  */
 420 const gchar *
 421 g_regex_get_pattern (const GRegex *regex)
 422 {
 423   g_return_val_if_fail (regex != NULL, NULL);
 424
 425   return regex->pattern->pattern;
 426 }
 427
 428 /**
 429  * g_regex_clear:
 430  * @regex: a #GRegex structure
 431  *
 432  * Clears out the members of @regex that are holding information about the
 433  * last set of matches for this pattern.  g_regex_clear() needs to be
 434  * called between uses of g_regex_match_next() or g_regex_match_next_full()
 435  * against new target strings.
 436  *
 437  * Since: 2.14
 438  */
 439 void
 440 g_regex_clear (GRegex *regex)
 441 {
 442   g_return_if_fail (regex != NULL);
 443
 444   if (regex->match == NULL)
 445     return;
 446
 447   regex->match->matches = -1000; /* an error code not used by PCRE */
 448   regex->match->string_len = -1;
 449   regex->match->pos = 0;
 450
 451   /* if the pattern was used with g_regex_split_next(), it may have
 452    * delimiter offsets stored.  Free up those guys as well. */
 453   if (regex->match->delims != NULL)
 454     {
 455       g_slist_foreach (regex->match->delims, (GFunc) g_free, NULL);
 456       g_slist_free (regex->match->delims);
 457       regex->match->delims = NULL;
 458     }
 459 }
 460
 461 /**
 462  * g_regex_optimize:
 463  * @regex: a #GRegex structure
 464  * @error: return location for a #GError
 465  *
 466  * If the pattern will be used many times, then it may be worth the
 467  * effort to optimize it to improve the speed of matches.
 468  *
 469  * Returns: %TRUE if @regex has been optimized or was already optimized,
 470  *          %FALSE otherwise
 471  *
 472  * Since: 2.14
 473  */
 474 gboolean
 475 g_regex_optimize (GRegex  *regex,
 476                   GError **error)
 477 {
 478   const gchar *errmsg;
 479   pcre_extra *extra;
 480   pcre_extra G_GNUC_MAY_ALIAS **extra_p;
 481
 482   g_return_val_if_fail (regex != NULL, FALSE);
 483   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
 484
 485   if (REGEX_GET_EXTRA (regex) != NULL)
 486     /* already optimized. */
 487     return TRUE;
 488
 489   extra = pcre_study (regex->pattern->pcre_re, 0, &errmsg);
 490
 491   if (errmsg != NULL)
 492     {
 493       GError *tmp_error = g_error_new (G_REGEX_ERROR,
 494                                        G_REGEX_ERROR_OPTIMIZE,
 495                                        _("Error while optimizing "
 496                                          "regular expression %s: %s"),
 497                                        regex->pattern->pattern,
 498                                        errmsg);
 499       g_propagate_error (error, tmp_error);
 500       return FALSE;
 501     }
 502
 503   if (extra == NULL)
 504     return TRUE;
 505
 506   extra_p = &regex->pattern->extra;
 507   if (!g_atomic_pointer_compare_and_exchange ((gpointer *)extra_p, NULL, extra))
 508     /* someone else has optimized the regex while this function was running */
 509     pcre_free (extra);
 510
 511   return TRUE;
 512 }
 513
 514 /**
 515  * g_regex_match_simple:
 516  * @pattern: the regular expression
 517  * @string: the string to scan for matches
 518  * @compile_options: compile options for the regular expression
 519  * @match_options: match options
 520  *
 521  * Scans for a match in @string for @pattern.
 522  *
 523  * This function is equivalent to g_regex_match() but it does not
 524  * require to compile the pattern with g_regex_new(), avoiding some
 525  * lines of code when you need just to do a match without extracting
 526  * substrings, capture counts, and so on.
 527  *
 528  * If this function is to be called on the same @pattern more than
 529  * once, it's more efficient to compile the pattern once with
 530  * g_regex_new() and then use g_regex_match().
 531  *
 532  * Returns: %TRUE is the string matched, %FALSE otherwise
 533  *
 534  * Since: 2.14
 535  */
 536 gboolean
 537 g_regex_match_simple (const gchar        *pattern,
 538                       const gchar        *string,
 539                       GRegexCompileFlags  compile_options,
 540                       GRegexMatchFlags    match_options)
 541 {
 542   GRegex *regex;
 543   gboolean result;
 544
 545   regex = g_regex_new (pattern, compile_options, 0, NULL);
 546   if (!regex)
 547     return FALSE;
 548   result = g_regex_match_full (regex, string, -1, 0, match_options, NULL);
 549   g_regex_free (regex);
 550   return result;
 551 }
 552
 553 /**
 554  * g_regex_match:
 555  * @regex: a #GRegex structure from g_regex_new()
 556  * @string: the string to scan for matches
 557  * @match_options: match options
 558  *
 559  * Scans for a match in string for the pattern in @regex. The @match_options
 560  * are combined with the match options specified when the @regex structure
 561  * was created, letting you have more flexibility in reusing #GRegex
 562  * structures.
 563  *
 564  * Returns: %TRUE is the string matched, %FALSE otherwise
 565  *
 566  * Since: 2.14
 567  */
 568 gboolean
 569 g_regex_match (GRegex          *regex,
 570                const gchar     *string,
 571                GRegexMatchFlags match_options)
 572 {
 573   return g_regex_match_full (regex, string, -1, 0,
 574                              match_options, NULL);
 575 }
 576
 577 /**
 578  * g_regex_match_full:
 579  * @regex: a #GRegex structure from g_regex_new()
 580  * @string: the string to scan for matches
 581  * @string_len: the length of @string, or -1 if @string is nul-terminated
 582  * @start_position: starting index of the string to match
 583  * @match_options: match options
 584  * @error: location to store the error occuring, or %NULL to ignore errors
 585  *
 586  * Scans for a match in string for the pattern in @regex. The @match_options
 587  * are combined with the match options specified when the @regex structure
 588  * was created, letting you have more flexibility in reusing #GRegex
 589  * structures.
 590  *
 591  * Setting @start_position differs from just passing over a shortened string
 592  * and  setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins
 593  * with any kind of lookbehind assertion, such as "\b".
 594  *
 595  * Returns: %TRUE is the string matched, %FALSE otherwise
 596  *
 597  * Since: 2.14
 598  */
 599 gboolean
 600 g_regex_match_full (GRegex          *regex,
 601                     const gchar       *string,
 602                     gssize             string_len,
 603                     gint               start_position,
 604                     GRegexMatchFlags match_options,
 605                     GError           **error)
 606 {
 607   g_return_val_if_fail (regex != NULL, FALSE);
 608   g_return_val_if_fail (string != NULL, FALSE);
 609   g_return_val_if_fail (start_position >= 0, FALSE);
 610   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
 611   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
 612
 613   regex_lazy_init_match (regex, 0);
 614
 615   if (string_len < 0)
 616     string_len = strlen (string);
 617
 618   regex->match->string_len = string_len;
 619
 620   /* create regex->match->offsets if it does not exist */
 621   regex_lazy_init_match (regex, 0);
 622
 623   /* perform the match */
 624   regex->match->matches = pcre_exec (regex->pattern->pcre_re,
 625                                      REGEX_GET_EXTRA (regex),
 626                                      string, regex->match->string_len,
 627                                      start_position,
 628                                      regex->pattern->match_opts | match_options,
 629                                      regex->match->offsets, regex->match->n_offsets);
 630   if (IS_PCRE_ERROR (regex->match->matches))
 631     {
 632       g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
 633                    _("Error while matching regular expression %s: %s"),
 634                    regex->pattern->pattern, match_error (regex->match->matches));
 635       return FALSE;
 636     }
 637
 638   /* set regex->match->pos to -1 so that a call to g_regex_match_next()
 639    * fails without a previous call to g_regex_clear(). */
 640   regex->match->pos = -1;
 641
 642   return regex->match->matches >= 0;
 643 }
 644
 645 /**
 646  * g_regex_match_next:
 647  * @regex: a #GRegex structure
 648  * @string: the string to scan for matches
 649  * @match_options: the match options
 650  *
 651  * Scans for the next match in @string of the pattern in @regex.
 652  * array.  The match options are combined with the match options set when
 653  * the @regex was created.
 654  *
 655  * You have to call g_regex_clear() to reuse the same pattern on a new
 656  * string.
 657  *
 658  * Returns: %TRUE is the string matched, %FALSE otherwise
 659  *
 660  * Since: 2.14
 661  */
 662 gboolean
 663 g_regex_match_next (GRegex          *regex,
 664                     const gchar     *string,
 665                     GRegexMatchFlags match_options)
 666 {
 667   return g_regex_match_next_full (regex, string, -1, 0,
 668                                   match_options, NULL);
 669 }
 670
 671 /**
 672  * g_regex_match_next_full:
 673  * @regex: a #GRegex structure
 674  * @string: the string to scan for matches
 675  * @string_len: the length of @string, or -1 if @string is nul-terminated
 676  * @start_position: starting index of the string to match
 677  * @match_options: the match options
 678  * @error: location to store the error occuring, or %NULL to ignore errors
 679  *
 680  * Scans for the next match in @string of the pattern in @regex. Calling
 681  * g_regex_match_next_full() until it returns %FALSE, you can retrieve
 682  * all the non-overlapping matches of the pattern in @string. Empty matches
 683  * are included, so matching the string "ab" with the pattern "b*" will
 684  * find three matches: "" at position 0, "b" from position 1 to 2 and
 685  * "" at position 2.
 686  *
 687  * The match options are combined with the match options set when the
 688  * @regex was created.
 689  *
 690  * You have to call g_regex_clear() to reuse the same pattern on a new
 691  * string.
 692  *
 693  * Setting @start_position differs from just passing over a shortened string
 694  * and  setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins
 695  * with any kind of lookbehind assertion, such as "\b".
 696  *
 697  * Returns: %TRUE is the string matched, %FALSE otherwise
 698  *
 699  * Since: 2.14
 700  */
 701 gboolean
 702 g_regex_match_next_full (GRegex          *regex,
 703                          const gchar     *string,
 704                          gssize           string_len,
 705                          gint             start_position,
 706                          GRegexMatchFlags match_options,
 707                          GError         **error)
 708 {
 709   g_return_val_if_fail (regex != NULL, FALSE);
 710   g_return_val_if_fail (string != NULL, FALSE);
 711   g_return_val_if_fail (start_position >= 0, FALSE);
 712   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
 713   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
 714
 715   regex_lazy_init_match (regex, 0);
 716
 717   if (G_UNLIKELY (regex->match->pos < 0))
 718     {
 719       const gchar *msg = _("g_regex_match_next_full: called without a "
 720                            "previous call to g_regex_clear()");
 721       g_log (G_LOG_DOMAIN, G_LOG_LEVEL_CRITICAL, msg);
 722       g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, msg);
 723       return FALSE;
 724     }
 725
 726   /* if this regex hasn't been used on this string before, then we
 727    * need to calculate the length of the string, and set pos to the
 728    * start of it.
 729    * Knowing if this regex has been used on this string is a bit of
 730    * a challenge.  For now, we require the user to call g_regex_clear()
 731    * in between usages on a new string.  Not perfect, but not such a
 732    * bad solution either.
 733    */
 734   if (regex->match->string_len == -1)
 735     {
 736       if (string_len < 0)
 737         string_len = strlen (string);
 738       regex->match->string_len = string_len;
 739
 740       regex->match->pos = start_position;
 741     }
 742
 743   /* create regex->match->offsets if it does not exist */
 744   regex_lazy_init_match (regex, 0);
 745
 746   /* perform the match */
 747   regex->match->matches = pcre_exec (regex->pattern->pcre_re,
 748                                      REGEX_GET_EXTRA (regex),
 749                                      string, regex->match->string_len,
 750                                      regex->match->pos,
 751                                      regex->pattern->match_opts | match_options,
 752                                      regex->match->offsets, regex->match->n_offsets);
 753   if (IS_PCRE_ERROR (regex->match->matches))
 754     {
 755       g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
 756                    _("Error while matching regular expression %s: %s"),
 757                    regex->pattern->pattern, match_error (regex->match->matches));
 758       return FALSE;
 759     }
 760
 761   /* avoid infinite loops if regex is an empty string or something
 762    * equivalent */
 763   if (regex->match->pos == regex->match->offsets[1])
 764     {
 765       if (regex->match->pos > regex->match->string_len)
 766         {
 767           /* we have reached the end of the string */
 768           regex->match->pos = -1;
 769           return FALSE;
 770         }
 771       regex->match->pos = NEXT_CHAR (regex, &string[regex->match->pos]) - string;
 772     }
 773   else
 774     {
 775       regex->match->pos = regex->match->offsets[1];
 776     }
 777
 778   return regex->match->matches >= 0;
 779 }
 780
 781 /**
 782  * g_regex_match_all:
 783  * @regex: a #GRegex structure from g_regex_new()
 784  * @string: the string to scan for matches
 785  * @match_options: match options
 786  *
 787  * Using the standard algorithm for regular expression matching only the
 788  * longest match in the string is retrieved. This function uses a
 789  * different algorithm so it can retrieve all the possible matches.
 790  * For more documentation see g_regex_match_all_full().
 791  *
 792  * Returns: %TRUE is the string matched, %FALSE otherwise
 793  *
 794  * Since: 2.14
 795  */
 796 gboolean
 797 g_regex_match_all (GRegex          *regex,
 798                    const gchar     *string,
 799                    GRegexMatchFlags match_options)
 800 {
 801   return g_regex_match_all_full (regex, string, -1, 0,
 802                                  match_options, NULL);
 803 }
 804
 805 /**
 806  * g_regex_match_all_full:
 807  * @regex: a #GRegex structure from g_regex_new()
 808  * @string: the string to scan for matches
 809  * @string_len: the length of @string, or -1 if @string is nul-terminated
 810  * @start_position: starting index of the string to match
 811  * @match_options: match options
 812  * @error: location to store the error occuring, or %NULL to ignore errors
 813  *
 814  * Using the standard algorithm for regular expression matching only the
 815  * longest match in the string is retrieved, it is not possibile to obtain
 816  * all the available matches. For instance matching
 817  * "&lt;a&gt; &lt;b&gt; &lt;c&gt;" against the pattern "&lt;.*&gt;" you get
 818  * "&lt;a&gt; &lt;b&gt; &lt;c&gt;".
 819  *
 820  * This function uses a different algorithm (called DFA, i.e. deterministic
 821  * finite automaton), so it can retrieve all the possible matches, all
 822  * starting at the same point in the string. For instance matching
 823  * "&lt;a&gt; &lt;b&gt; &lt;c&gt;" against the pattern "&lt;.*&gt;" you
 824  * would obtain three matches: "&lt;a&gt; &lt;b&gt; &lt;c&gt;",
 825  * "&lt;a&gt; &lt;b&gt;" and "&lt;a&gt;".
 826  *
 827  * The number of matched strings is retrieved using
 828  * g_regex_get_match_count().
 829  * To obtain the matched strings and their position you can use,
 830  * respectively, g_regex_fetch() and g_regex_fetch_pos(). Note that the
 831  * strings are returned in reverse order of length; that is, the longest
 832  * matching string is given first.
 833  *
 834  * Note that the DFA algorithm is slower than the standard one and it is not
 835  * able to capture substrings, so backreferences do not work.
 836  *
 837  * Setting @start_position differs from just passing over a shortened string
 838  * and  setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins
 839  * with any kind of lookbehind assertion, such as "\b".
 840  *
 841  * Returns: %TRUE is the string matched, %FALSE otherwise
 842  *
 843  * Since: 2.14
 844  */
 845 gboolean
 846 g_regex_match_all_full (GRegex          *regex,
 847                         const gchar     *string,
 848                         gssize           string_len,
 849                         gint             start_position,
 850                         GRegexMatchFlags match_options,
 851                         GError         **error)
 852 {
 853   g_return_val_if_fail (regex != NULL, FALSE);
 854   g_return_val_if_fail (string != NULL, FALSE);
 855   g_return_val_if_fail (start_position >= 0, FALSE);
 856   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
 857   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
 858
 859   regex_lazy_init_match (regex, 0);
 860
 861   if (string_len < 0)
 862     string_len = strlen (string);
 863
 864   regex->match->string_len = string_len;
 865
 866   if (regex->match->workspace == NULL)
 867     {
 868       regex->match->n_workspace = WORKSPACE_INITIAL;
 869       regex->match->workspace = g_new (gint, regex->match->n_workspace);
 870     }
 871
 872   if (regex->match->n_offsets < OFFSETS_DFA_MIN_SIZE)
 873     {
 874       regex->match->n_offsets = OFFSETS_DFA_MIN_SIZE;
 875       regex->match->offsets = g_realloc (regex->match->offsets,
 876                                          regex->match->n_offsets * sizeof(gint));
 877     }
 878
 879   /* perform the match */
 880   regex->match->matches = pcre_dfa_exec (regex->pattern->pcre_re,
 881                                          REGEX_GET_EXTRA (regex),
 882                                          string, regex->match->string_len,
 883                                          start_position,
 884                                          regex->pattern->match_opts | match_options,
 885                                          regex->match->offsets, regex->match->n_offsets,
 886                                          regex->match->workspace,
 887                                          regex->match->n_workspace);
 888   if (regex->match->matches == PCRE_ERROR_DFA_WSSIZE)
 889     {
 890       /* regex->match->workspace is too small. */
 891       regex->match->n_workspace *= 2;
 892       regex->match->workspace = g_realloc (regex->match->workspace,
 893                                            regex->match->n_workspace * sizeof (gint));
 894       return g_regex_match_all_full (regex, string, string_len,
 895                                      start_position, match_options, error);
 896     }
 897   else if (regex->match->matches == 0)
 898     {
 899       /* regex->match->offsets is too small. */
 900       regex->match->n_offsets *= 2;
 901       regex->match->offsets = g_realloc (regex->match->offsets,
 902                                          regex->match->n_offsets * sizeof (gint));
 903       return g_regex_match_all_full (regex, string, string_len,
 904                                      start_position, match_options, error);
 905     }
 906   else if (IS_PCRE_ERROR (regex->match->matches))
 907     {
 908       g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
 909                    _("Error while matching regular expression %s: %s"),
 910                    regex->pattern->pattern, match_error (regex->match->matches));
 911       return FALSE;
 912     }
 913
 914   /* set regex->match->pos to -1 so that a call to g_regex_match_next()
 915    * fails without a previous call to g_regex_clear(). */
 916   regex->match->pos = -1;
 917
 918   return regex->match->matches >= 0;
 919 }
 920
 921 /**
 922  * g_regex_get_match_count:
 923  * @regex: a #GRegex structure
 924  *
 925  * Retrieves the number of matched substrings (including substring 0, that
 926  * is the whole matched text) in the last call to g_regex_match*(), so 1
 927  * is returned if the pattern has no substrings in it and 0 is returned if
 928  * the match failed.
 929  *
 930  * If the last match was obtained using the DFA algorithm, that is using
 931  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
 932  * count is not that of the number of capturing parentheses but that of
 933  * the number of matched substrings.
 934  *
 935  * Returns:  Number of matched substrings, or -1 if an error occurred
 936  *
 937  * Since: 2.14
 938  */
 939 gint
 940 g_regex_get_match_count (const GRegex *regex)
 941 {
 942   g_return_val_if_fail (regex != NULL, -1);
 943
 944   if (regex->match == NULL)
 945     return -1;
 946
 947   if (regex->match->matches == PCRE_ERROR_NOMATCH)
 948     /* no match */
 949     return 0;
 950   else if (regex->match->matches < PCRE_ERROR_NOMATCH)
 951     /* error */
 952     return -1;
 953   else
 954     /* match */
 955     return regex->match->matches;
 956 }
 957
 958 /**
 959  * g_regex_is_partial_match:
 960  * @regex: a #GRegex structure
 961  *
 962  * Usually if the string passed to g_regex_match*() matches as far as
 963  * it goes, but is too short to match the entire pattern, %FALSE is
 964  * returned. There are circumstances where it might be helpful to
 965  * distinguish this case from other cases in which there is no match.
 966  *
 967  * Consider, for example, an application where a human is required to
 968  * type in data for a field with specific formatting requirements. An
 969  * example might be a date in the form ddmmmyy, defined by the pattern
 970  * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$".
 971  * If the application sees the user’s keystrokes one by one, and can
 972  * check that what has been typed so far is potentially valid, it is
 973  * able to raise an error as soon as a mistake is made.
 974  *
 975  * GRegex supports the concept of partial matching by means of the
 976  * #G_REGEX_MATCH_PARTIAL flag. When this is set the return code for
 977  * g_regex_match() or g_regex_match_full() is, as usual, %TRUE
 978  * for a complete match, %FALSE otherwise. But, when these functions
 979  * return %FALSE, you can check if the match was partial by calling
 980  * g_regex_is_partial_match().
 981  *
 982  * When using partial matching you cannot use g_regex_fetch*().
 983  *
 984  * Because of the way certain internal optimizations are implemented the
 985  * partial matching algorithm cannot be used with all patterns. So repeated
 986  * single characters such as "a{2,4}" and repeated single metasequences such
 987  * as "\d+" are not permitted if the maximum number of occurrences is
 988  * greater than one. Optional items such as "\d?" (where the maximum is one)
 989  * are permitted. Quantifiers with any values are permitted after
 990  * parentheses, so the invalid examples above can be coded thus "(a){2,4}"
 991  * and "(\d)+". If #G_REGEX_MATCH_PARTIAL is set for a pattern that does
 992  * not conform to the restrictions, matching functions return an error.
 993  *
 994  * Returns: %TRUE if the match was partial, %FALSE otherwise
 995  *
 996  * Since: 2.14
 997  */
 998 gboolean
 999 g_regex_is_partial_match (const GRegex *regex)
1000 {
1001   g_return_val_if_fail (regex != NULL, FALSE);
1002
1003   if (regex->match == NULL)
1004     return FALSE;
1005
1006   return regex->match->matches == PCRE_ERROR_PARTIAL;
1007 }
1008
1009 /**
1010  * g_regex_fetch:
1011  * @regex: #GRegex structure used in last match
1012  * @match_num: number of the sub expression
1013  * @string: the string on which the last match was made
1014  *
1015  * Retrieves the text matching the @match_num<!-- -->'th capturing parentheses.
1016  * 0 is the full text of the match, 1 is the first paren set, 2 the second,
1017  * and so on.
1018  *
1019  * If @match_num is a valid sub pattern but it didn't match anything (e.g.
1020  * sub pattern 1, matching "b" against "(a)?b") then an empty string is
1021  * returned.
1022  *
1023  * If the last match was obtained using the DFA algorithm, that is using
1024  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1025  * string is not that of a set of parentheses but that of a matched
1026  * substring. Substrings are matched in reverse order of length, so 0 is
1027  * the longest match.
1028  *
1029  * Returns: The matched substring, or %NULL if an error occurred.
1030  *          You have to free the string yourself.
1031  *
1032  * Since: 2.14
1033  */
1034 gchar *
1035 g_regex_fetch (const GRegex *regex,
1036                gint          match_num,
1037                const gchar  *string)
1038 {
1039   /* we cannot use pcre_get_substring() because it allocates the
1040    * string using pcre_malloc(). */
1041   gchar *match = NULL;
1042   gint start, end;
1043
1044   g_return_val_if_fail (regex != NULL, NULL);
1045   g_return_val_if_fail (match_num >= 0, NULL);
1046
1047   if (regex->match == NULL)
1048     return NULL;
1049
1050   if (regex->match->string_len < 0)
1051     return NULL;
1052
1053   /* match_num does not exist or it didn't matched, i.e. matching "b"
1054    * against "(a)?b" then group 0 is empty. */
1055   if (!g_regex_fetch_pos (regex, match_num, &start, &end))
1056     match = NULL;
1057   else if (start == -1)
1058     match = g_strdup ("");
1059   else
1060     match = g_strndup (&string[start], end - start);
1061
1062   return match;
1063 }
1064
1065 /**
1066  * g_regex_fetch_pos:
1067  * @regex: #GRegex structure used in last match
1068  * @match_num: number of the sub expression
1069  * @start_pos: pointer to location where to store the start position
1070  * @end_pos: pointer to location where to store the end position
1071  *
1072  * Retrieves the position of the @match_num<!-- -->'th capturing parentheses.
1073  * 0 is the full text of the match, 1 is the first paren set, 2 the second,
1074  * and so on.
1075  *
1076  * If @match_num is a valid sub pattern but it didn't match anything (e.g.
1077  * sub pattern 1, matching "b" against "(a)?b") then @start_pos and @end_pos
1078  * are set to -1 and %TRUE is returned.
1079  *
1080  * If the last match was obtained using the DFA algorithm, that is using
1081  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1082  * position is not that of a set of parentheses but that of a matched
1083  * substring. Substrings are matched in reverse order of length, so 0 is
1084  * the longest match.
1085  *
1086  * Returns: %TRUE if the position was fetched, %FALSE otherwise. If the
1087  *          position cannot be fetched, @start_pos and @end_pos are left
1088  *          unchanged.
1089  *
1090  * Since: 2.14
1091  */
1092 gboolean
1093 g_regex_fetch_pos (const GRegex *regex,
1094                    gint          match_num,
1095                    gint         *start_pos,
1096                    gint         *end_pos)
1097 {
1098   g_return_val_if_fail (regex != NULL, FALSE);
1099   g_return_val_if_fail (match_num >= 0, FALSE);
1100
1101   if (regex->match == NULL)
1102     return FALSE;
1103
1104   /* make sure the sub expression number they're requesting is less than
1105    * the total number of sub expressions that were matched. */
1106   if (match_num >= regex->match->matches)
1107     return FALSE;
1108
1109   if (start_pos != NULL)
1110     *start_pos = regex->match->offsets[2 * match_num];
1111
1112   if (end_pos != NULL)
1113     *end_pos = regex->match->offsets[2 * match_num + 1];
1114
1115   return TRUE;
1116 }
1117
1118 /**
1119  * g_regex_fetch_named:
1120  * @regex: #GRegex structure used in last match
1121  * @name: name of the subexpression
1122  * @string: the string on which the last match was made
1123  *
1124  * Retrieves the text matching the capturing parentheses named @name.
1125  *
1126  * If @name is a valid sub pattern name but it didn't match anything (e.g.
1127  * sub pattern "X", matching "b" against "(?P&lt;X&gt;a)?b") then an empty
1128  * string is returned.
1129  *
1130  * Returns: The matched substring, or %NULL if an error occurred.
1131  *          You have to free the string yourself.
1132  *
1133  * Since: 2.14
1134  */
1135 gchar *
1136 g_regex_fetch_named (const GRegex *regex,
1137                      const gchar  *name,
1138                      const gchar  *string)
1139 {
1140   /* we cannot use pcre_get_named_substring() because it allocates the
1141    * string using pcre_malloc(). */
1142   gint num;
1143
1144   g_return_val_if_fail (regex != NULL, NULL);
1145   g_return_val_if_fail (string != NULL, NULL);
1146   g_return_val_if_fail (name != NULL, NULL);
1147
1148   num = g_regex_get_string_number (regex, name);
1149   if (num == -1)
1150     return NULL;
1151   else
1152     return g_regex_fetch (regex, num, string);
1153 }
1154
1155 /**
1156  * g_regex_fetch_named_pos:
1157  * @regex: #GRegex structure used in last match
1158  * @name: name of the subexpression
1159  * @start_pos: pointer to location where to store the start position
1160  * @end_pos: pointer to location where to store the end position
1161  *
1162  * Retrieves the position of the capturing parentheses named @name.
1163  *
1164  * If @name is a valid sub pattern name but it didn't match anything (e.g.
1165  * sub pattern "X", matching "b" against "(?P&lt;X&gt;a)?b") then @start_pos and
1166  * @end_pos are set to -1 and %TRUE is returned.
1167  *
1168  * Returns: %TRUE if the position was fetched, %FALSE otherwise. If the
1169  *          position cannot be fetched, @start_pos and @end_pos are left
1170  *          unchanged.
1171  *
1172  * Since: 2.14
1173  */
1174 gboolean
1175 g_regex_fetch_named_pos (const GRegex *regex,
1176                          const gchar  *name,
1177                          gint         *start_pos,
1178                          gint         *end_pos)
1179 {
1180   gint num;
1181
1182   num = g_regex_get_string_number (regex, name);
1183   if (num == -1)
1184     return FALSE;
1185
1186   return g_regex_fetch_pos (regex, num, start_pos, end_pos);
1187 }
1188
1189 /**
1190  * g_regex_fetch_all:
1191  * @regex: a #GRegex structure
1192  * @string: the string on which the last match was made
1193  *
1194  * Bundles up pointers to each of the matching substrings from a match
1195  * and stores them in an array of gchar pointers. The first element in
1196  * the returned array is the match number 0, i.e. the entire matched
1197  * text.
1198  *
1199  * If a sub pattern didn't match anything (e.g. sub pattern 1, matching
1200  * "b" against "(a)?b") then an empty string is inserted.
1201  *
1202  * If the last match was obtained using the DFA algorithm, that is using
1203  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1204  * strings are not that matched by sets of parentheses but that of the
1205  * matched substring. Substrings are matched in reverse order of length,
1206  * so the first one is the longest match.
1207  *
1208  * Returns: a %NULL-terminated array of gchar * pointers. It must be freed
1209  *          using g_strfreev(). If the memory can't be allocated, returns
1210  *          %NULL.
1211  *
1212  * Since: 2.14
1213  */
1214 gchar **
1215 g_regex_fetch_all (const GRegex *regex,
1216                    const gchar  *string)
1217 {
1218   /* we cannot use pcre_get_substring_list() because the returned value
1219    * isn't suitable for g_strfreev(). */
1220   gchar **result;
1221   gint i;
1222
1223   g_return_val_if_fail (regex != NULL, FALSE);
1224   g_return_val_if_fail (string != NULL, FALSE);
1225
1226   if (regex->match == NULL)
1227     return NULL;
1228
1229   if (regex->match->matches < 0)
1230     return NULL;
1231
1232   result = g_new (gchar *, regex->match->matches + 1);
1233   for (i = 0; i < regex->match->matches; i++)
1234     result[i] = g_regex_fetch (regex, i, string);
1235   result[i] = NULL;
1236
1237   return result;
1238 }
1239
1240 /**
1241  * g_regex_get_string_number:
1242  * @regex: #GRegex structure
1243  * @name: name of the subexpression
1244  *
1245  * Retrieves the number of the subexpression named @name.
1246  *
1247  * Returns: The number of the subexpression or -1 if @name does not exist.
1248  *
1249  * Since: 2.14
1250  */
1251 gint
1252 g_regex_get_string_number (const GRegex *regex,
1253                            const gchar  *name)
1254 {
1255   gint num;
1256
1257   g_return_val_if_fail (regex != NULL, -1);
1258   g_return_val_if_fail (name != NULL, -1);
1259
1260   num = pcre_get_stringnumber (regex->pattern->pcre_re, name);
1261   if (num == PCRE_ERROR_NOSUBSTRING)
1262     num = -1;
1263
1264   return num;
1265 }
1266
1267 /**
1268  * g_regex_split_simple:
1269  * @pattern: the regular expression
1270  * @string: the string to scan for matches
1271  * @compile_options: compile options for the regular expression
1272  * @match_options: match options
1273  *
1274  * Breaks the string on the pattern, and returns an array of the tokens.
1275  * If the pattern contains capturing parentheses, then the text for each
1276  * of the substrings will also be returned. If the pattern does not match
1277  * anywhere in the string, then the whole string is returned as the first
1278  * token.
1279  *
1280  * This function is equivalent to g_regex_split() but it does not
1281  * require compiling the pattern with g_regex_new(), avoiding some
1282  * lines of code when you need just to do a split without extracting
1283  * substrings, capture counts, and so on.
1284  *
1285  * If this function is to be called on the same @pattern more than
1286  * once, it's more efficient to compile the pattern once with
1287  * g_regex_new() and then use g_regex_split().
1288  *
1289  * As a special case, the result of splitting the empty string "" is an
1290  * empty vector, not a vector containing a single string. The reason for
1291  * this special case is that being able to represent an empty vector is
1292  * typically more useful than consistent handling of empty elements. If
1293  * you do need to represent empty elements, you'll need to check for the
1294  * empty string before calling this function.
1295  *
1296  * A pattern that can match empty strings splits @string into separate
1297  * characters wherever it matches the empty string between characters.
1298  * For example splitting "ab c" using as a separator "\s*", you will get
1299  * "a", "b" and "c".
1300  *
1301  * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev().
1302  *
1303  * Since: 2.14
1304  **/
1305 gchar **
1306 g_regex_split_simple (const gchar        *pattern,
1307                       const gchar        *string,
1308                       GRegexCompileFlags  compile_options,
1309                       GRegexMatchFlags    match_options)
1310 {
1311   GRegex *regex;
1312   gchar **result;
1313
1314   regex = g_regex_new (pattern, compile_options, 0, NULL);
1315   if (!regex)
1316     return NULL;
1317   result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL);
1318   g_regex_free (regex);
1319   return result;
1320 }
1321
1322 /**
1323  * g_regex_split:
1324  * @regex: a #GRegex structure
1325  * @string: the string to split with the pattern
1326  * @match_options: match time option flags
1327  *
1328  * Breaks the string on the pattern, and returns an array of the tokens.
1329  * If the pattern contains capturing parentheses, then the text for each
1330  * of the substrings will also be returned. If the pattern does not match
1331  * anywhere in the string, then the whole string is returned as the first
1332  * token.
1333  *
1334  * As a special case, the result of splitting the empty string "" is an
1335  * empty vector, not a vector containing a single string. The reason for
1336  * this special case is that being able to represent an empty vector is
1337  * typically more useful than consistent handling of empty elements. If
1338  * you do need to represent empty elements, you'll need to check for the
1339  * empty string before calling this function.
1340  *
1341  * A pattern that can match empty strings splits @string into separate
1342  * characters wherever it matches the empty string between characters.
1343  * For example splitting "ab c" using as a separator "\s*", you will get
1344  * "a", "b" and "c".
1345  *
1346  * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev().
1347  *
1348  * Since: 2.14
1349  **/
1350 gchar **
1351 g_regex_split (GRegex           *regex,
1352                const gchar      *string,
1353                GRegexMatchFlags  match_options)
1354 {
1355   return g_regex_split_full (regex, string, -1, 0,
1356                              match_options, 0, NULL);
1357 }
1358
1359 /**
1360  * g_regex_split_full:
1361  * @regex: a #GRegex structure
1362  * @string: the string to split with the pattern
1363  * @string_len: the length of @string, or -1 if @string is nul-terminated
1364  * @start_position: starting index of the string to match
1365  * @match_options: match time option flags
1366  * @max_tokens: the maximum number of tokens to split @string into. If this
1367  *    is less than 1, the string is split completely.
1368  * @error: return location for a #GError
1369  *
1370  * Breaks the string on the pattern, and returns an array of the tokens.
1371  * If the pattern contains capturing parentheses, then the text for each
1372  * of the substrings will also be returned. If the pattern does not match
1373  * anywhere in the string, then the whole string is returned as the first
1374  * token.
1375  *
1376  * As a special case, the result of splitting the empty string "" is an
1377  * empty vector, not a vector containing a single string. The reason for
1378  * this special case is that being able to represent an empty vector is
1379  * typically more useful than consistent handling of empty elements. If
1380  * you do need to represent empty elements, you'll need to check for the
1381  * empty string before calling this function.
1382  *
1383  * A pattern that can match empty strings splits @string into separate
1384  * characters wherever it matches the empty string between characters.
1385  * For example splitting "ab c" using as a separator "\s*", you will get
1386  * "a", "b" and "c".
1387  *
1388  * Setting @start_position differs from just passing over a shortened string
1389  * and  setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins
1390  * with any kind of lookbehind assertion, such as "\b".
1391  *
1392  * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev().
1393  *
1394  * Since: 2.14
1395  **/
1396 gchar **
1397 g_regex_split_full (GRegex           *regex,
1398                     const gchar      *string,
1399                     gssize            string_len,
1400                     gint              start_position,
1401                     GRegexMatchFlags  match_options,
1402                     gint              max_tokens,
1403                     GError          **error)
1404 {
1405   gchar **string_list;          /* The array of char **s worked on */
1406   gint pos;
1407   gint tokens;
1408   GList *list, *last;
1409   GError *tmp_error = NULL;
1410
1411   g_return_val_if_fail (regex != NULL, NULL);
1412   g_return_val_if_fail (string != NULL, NULL);
1413   g_return_val_if_fail (start_position >= 0, NULL);
1414   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
1415   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
1416
1417   regex_lazy_init_match (regex, 0);
1418
1419   if (max_tokens <= 0)
1420     max_tokens = G_MAXINT;
1421
1422   if (string_len < 0)
1423     string_len = strlen (string);
1424
1425   if (string_len - start_position == 0)
1426     return g_new0 (gchar *, 1);
1427
1428   /* clear out the regex for reuse, just in case */
1429   g_regex_clear (regex);
1430
1431   list = NULL;
1432   tokens = 0;
1433   while (TRUE)
1434     {
1435       gchar *token;
1436
1437       /* -1 to leave room for the last part. */
1438       if (tokens >= max_tokens - 1)
1439         {
1440           /* we have reached the maximum number of tokens, so we copy
1441            * the remaining part of the string. */
1442           if (regex->match->last_match_is_empty)
1443             {
1444               /* the last match was empty, so we have moved one char
1445                * after the real position to avoid empty matches at the
1446                * same position. */
1447               regex->match->pos = PREV_CHAR (regex, &string[regex->match->pos]) - string;
1448             }
1449           /* the if is needed in the case we have terminated the available
1450            * tokens, but we are at the end of the string, so there are no
1451            * characters left to copy. */
1452           if (string_len > regex->match->pos)
1453             {
1454               token = g_strndup (string + regex->match->pos,
1455                                  string_len - regex->match->pos);
1456               list = g_list_prepend (list, token);
1457             }
1458           /* end the loop. */
1459           break;
1460         }
1461
1462       token = g_regex_split_next_full (regex, string, string_len, start_position,
1463                                        match_options, &tmp_error);
1464       if (tmp_error != NULL)
1465         {
1466           g_propagate_error (error, tmp_error);
1467           g_list_foreach (list, (GFunc)g_free, NULL);
1468           g_list_free (list);
1469           regex->match->pos = -1;
1470           return NULL;
1471         }
1472
1473       if (token == NULL)
1474         /* no more tokens. */
1475         break;
1476
1477       tokens++;
1478       list = g_list_prepend (list, token);
1479     }
1480
1481   string_list = g_new (gchar *, g_list_length (list) + 1);
1482   pos = 0;
1483   for (last = g_list_last (list); last; last = g_list_previous (last))
1484     string_list[pos++] = last->data;
1485   string_list[pos] = 0;
1486
1487   regex->match->pos = -1;
1488   g_list_free (list);
1489
1490   return string_list;
1491 }
1492
1493 /**
1494  * g_regex_split_next:
1495  * @regex: a #GRegex structure from g_regex_new()
1496  * @string: the string to split on pattern
1497  * @match_options: match time options for the regex
1498  *
1499  * g_regex_split_next() breaks the string on pattern, and returns the
1500  * tokens, one per call.  If the pattern contains capturing parentheses,
1501  * then the text for each of the substrings will also be returned.
1502  * If the pattern does not match anywhere in the string, then the whole
1503  * string is returned as the first token.
1504  *
1505  * A pattern that can match empty strings splits @string into separate
1506  * characters wherever it matches the empty string between characters.
1507  * For example splitting "ab c" using as a separator "\s*", you will get
1508  * "a", "b" and "c".
1509  *
1510  * You have to call g_regex_clear() to reuse the same pattern on a new
1511  * string.
1512  *
1513  * Returns:  a gchar * to the next token of the string
1514  *
1515  * Since: 2.14
1516  */
1517 gchar *
1518 g_regex_split_next (GRegex          *regex,
1519                     const gchar     *string,
1520                     GRegexMatchFlags match_options)
1521 {
1522   return g_regex_split_next_full (regex, string, -1, 0, match_options,
1523                                   NULL);
1524 }
1525
1526 /**
1527  * g_regex_split_next_full:
1528  * @regex: a #GRegex structure from g_regex_new()
1529  * @string: the string to split on pattern
1530  * @string_len: the length of @string, or -1 if @string is nul-terminated
1531  * @start_position: starting index of the string to match
1532  * @match_options: match time options for the regex
1533  * @error: return location for a #GError
1534  *
1535  * g_regex_split_next_full() breaks the string on pattern, and returns
1536  * the tokens, one per call.  If the pattern contains capturing parentheses,
1537  * then the text for each of the substrings will also be returned.
1538  * If the pattern does not match anywhere in the string, then the whole
1539  * string is returned as the first token.
1540  *
1541  * A pattern that can match empty strings splits @string into separate
1542  * characters wherever it matches the empty string between characters.
1543  * For example splitting "ab c" using as a separator "\s*", you will get
1544  * "a", "b" and "c".
1545  *
1546  * You have to call g_regex_clear() to reuse the same pattern on a new
1547  * string.
1548  *
1549  * Setting @start_position differs from just passing over a shortened string
1550  * and  setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins
1551  * with any kind of lookbehind assertion, such as "\b".
1552  *
1553  * Returns:  a gchar * to the next token of the string
1554  *
1555  * Since: 2.14
1556  */
1557 gchar *
1558 g_regex_split_next_full (GRegex          *regex,
1559                          const gchar     *string,
1560                          gssize           string_len,
1561                          gint             start_position,
1562                          GRegexMatchFlags match_options,
1563                          GError         **error)
1564 {
1565   gint new_pos;
1566   gchar *token = NULL;
1567   gboolean match_ok;
1568   gint match_count;
1569   GError *tmp_error = NULL;
1570
1571   g_return_val_if_fail (regex != NULL, NULL);
1572   g_return_val_if_fail (string != NULL, NULL);
1573   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
1574   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
1575
1576   regex_lazy_init_match (regex, 0);
1577
1578   new_pos = MAX (regex->match->pos, start_position);
1579   if (regex->match->last_match_is_empty)
1580     /* if the last match was empty, g_regex_match_next_full() has moved
1581      * forward to avoid infinite loops, but we still need to copy that
1582      * character. */
1583     new_pos = PREV_CHAR (regex, &string[new_pos]) - string;
1584
1585   /* if there are delimiter substrings stored, return those one at a
1586    * time.
1587    */
1588   if (regex->match->delims != NULL)
1589     {
1590       token = regex->match->delims->data;
1591       regex->match->delims = g_slist_remove (regex->match->delims, token);
1592       return token;
1593     }
1594
1595   if (regex->match->pos == -1)
1596     /* the last call to g_regex_match_next_full() returned NULL. */
1597     return NULL;
1598
1599   if (regex->match->string_len < 0)
1600     {
1601       regex->match->last_match_is_empty = FALSE;
1602       /* initialize last_separator_end to start_position to skip the
1603        * empty token at the beginning of the string. */
1604       regex->match->last_separator_end = start_position;
1605     }
1606
1607   /* use g_regex_match_next() to find the next occurance of the pattern
1608    * in the string. We use new_pos to keep track of where the stuff
1609    * up to the current match starts. Copy that token of the string off
1610    * and append it to the buffer using g_strndup. */
1611   match_ok = g_regex_match_next_full (regex, string, string_len,
1612                                       start_position, match_options,
1613                                       &tmp_error);
1614   if (tmp_error != NULL)
1615     {
1616       g_propagate_error (error, tmp_error);
1617       return NULL;
1618     }
1619
1620   if (match_ok)
1621     {
1622       regex->match->last_match_is_empty =
1623                 (regex->match->offsets[0] == regex->match->offsets[1]);
1624
1625       /* we need to skip empty separators at the same position of the end
1626        * of another separator. e.g. the string is "a b" and the separator
1627        * is "*", so from 1 to 2 we have a match and at position 2 we have
1628        * an empty match. */
1629       if (regex->match->last_separator_end != regex->match->offsets[1])
1630         {
1631           token = g_strndup (string + new_pos, regex->match->offsets[0] - new_pos);
1632
1633           /* if there were substrings, these need to get added to the
1634            * list of delims */
1635           match_count = g_regex_get_match_count (regex);
1636           if (match_count > 1)
1637             {
1638               gint i;
1639               for (i = 1; i < match_count; i++)
1640                 regex->match->delims = g_slist_append (regex->match->delims,
1641                                                        g_regex_fetch (regex, i, string));
1642             }
1643
1644           regex->match->last_separator_end = regex->match->offsets[1];
1645         }
1646       else
1647         {
1648           /* we have skipped an empty separator so we need to find the
1649            * next match. */
1650           return g_regex_split_next_full (regex, string, string_len,
1651                                           start_position, match_options,
1652                                           error);
1653         }
1654     }
1655   else
1656     {
1657       /* if there was no match, copy to end of string. */
1658       if (!regex->match->last_match_is_empty)
1659         token = g_strndup (string + new_pos, regex->match->string_len - new_pos);
1660       else
1661         token = NULL;
1662     }
1663
1664   return token;
1665 }
1666
/* Kinds of pieces a replacement string is split into.  The value is
 * stored in the type field of InterpolationData and selects which of
 * its other fields is meaningful. */
enum
{
  REPL_TYPE_STRING             = 0, /* text to append, kept in the text field */
  REPL_TYPE_CHARACTER          = 1, /* single character, kept in the c field */
  REPL_TYPE_SYMBOLIC_REFERENCE = 2, /* named group, name kept in the text field */
  REPL_TYPE_NUMERIC_REFERENCE  = 3, /* numbered group, number kept in the num field */
  REPL_TYPE_CHANGE_CASE        = 4  /* case switch, kept in the change_case field */
};
1675
/* Case conversion state used while a replacement is interpolated.
 * Every state is a distinct bit so that the *_MASK members can test for
 * a group of states with a single AND.  Note that CHANGE_CASE_NONE is
 * the lowest bit, not zero. */
typedef enum
{
  CHANGE_CASE_NONE         = 0x01, /* no conversion */
  CHANGE_CASE_UPPER        = 0x02, /* upper case, stays in effect */
  CHANGE_CASE_LOWER        = 0x04, /* lower case, stays in effect */
  CHANGE_CASE_UPPER_SINGLE = 0x08, /* upper case, next character only */
  CHANGE_CASE_LOWER_SINGLE = 0x10, /* lower case, next character only */
  /* states that apply to one character only */
  CHANGE_CASE_SINGLE_MASK  = CHANGE_CASE_LOWER_SINGLE | CHANGE_CASE_UPPER_SINGLE,
  /* states that convert to lower case */
  CHANGE_CASE_LOWER_MASK   = CHANGE_CASE_LOWER_SINGLE | CHANGE_CASE_LOWER,
  /* states that convert to upper case */
  CHANGE_CASE_UPPER_MASK   = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_UPPER
} ChangeCase;
1687
1688 typedef struct
1689 {
1690   gchar     *text;
1691   gint       type;
1692   gint       num;
1693   gchar      c;
1694   ChangeCase change_case;
1695 } InterpolationData;
1696
1697 static void
1698 free_interpolation_data (InterpolationData *data)
1699 {
1700   g_free (data->text);
1701   g_free (data);
1702 }
1703
1704 static const gchar *
1705 expand_escape (const gchar        *replacement,
1706                const gchar        *p,
1707                InterpolationData  *data,
1708                GError            **error)
1709 {
1710   const gchar *q, *r;
1711   gint x, d, h, i;
1712   const gchar *error_detail;
1713   gint base = 0;
1714   GError *tmp_error = NULL;
1715
1716   p++;
1717   switch (*p)
1718     {
1719     case 't':
1720       p++;
1721       data->c = '\t';
1722       data->type = REPL_TYPE_CHARACTER;
1723       break;
1724     case 'n':
1725       p++;
1726       data->c = '\n';
1727       data->type = REPL_TYPE_CHARACTER;
1728       break;
1729     case 'v':
1730       p++;
1731       data->c = '\v';
1732       data->type = REPL_TYPE_CHARACTER;
1733       break;
1734     case 'r':
1735       p++;
1736       data->c = '\r';
1737       data->type = REPL_TYPE_CHARACTER;
1738       break;
1739     case 'f':
1740       p++;
1741       data->c = '\f';
1742       data->type = REPL_TYPE_CHARACTER;
1743       break;
1744     case 'a':
1745       p++;
1746       data->c = '\a';
1747       data->type = REPL_TYPE_CHARACTER;
1748       break;
1749     case 'b':
1750       p++;
1751       data->c = '\b';
1752       data->type = REPL_TYPE_CHARACTER;
1753       break;
1754     case '\\':
1755       p++;
1756       data->c = '\\';
1757       data->type = REPL_TYPE_CHARACTER;
1758       break;
1759     case 'x':
1760       p++;
1761       x = 0;
1762       if (*p == '{')
1763         {
1764           p++;
1765           do
1766             {
1767               h = g_ascii_xdigit_value (*p);
1768               if (h < 0)
1769                 {
1770                   error_detail = _("hexadecimal digit or '}' expected");
1771                   goto error;
1772                 }
1773               x = x * 16 + h;
1774               p++;
1775             }
1776           while (*p != '}');
1777           p++;
1778         }
1779       else
1780         {
1781           for (i = 0; i < 2; i++)
1782             {
1783               h = g_ascii_xdigit_value (*p);
1784               if (h < 0)
1785                 {
1786                   error_detail = _("hexadecimal digit expected");
1787                   goto error;
1788                 }
1789               x = x * 16 + h;
1790               p++;
1791             }
1792         }
1793       data->type = REPL_TYPE_STRING;
1794       data->text = g_new0 (gchar, 8);
1795       g_unichar_to_utf8 (x, data->text);
1796       break;
1797     case 'l':
1798       p++;
1799       data->type = REPL_TYPE_CHANGE_CASE;
1800       data->change_case = CHANGE_CASE_LOWER_SINGLE;
1801       break;
1802     case 'u':
1803       p++;
1804       data->type = REPL_TYPE_CHANGE_CASE;
1805       data->change_case = CHANGE_CASE_UPPER_SINGLE;
1806       break;
1807     case 'L':
1808       p++;
1809       data->type = REPL_TYPE_CHANGE_CASE;
1810       data->change_case = CHANGE_CASE_LOWER;
1811       break;
1812     case 'U':
1813       p++;
1814       data->type = REPL_TYPE_CHANGE_CASE;
1815       data->change_case = CHANGE_CASE_UPPER;
1816       break;
1817     case 'E':
1818       p++;
1819       data->type = REPL_TYPE_CHANGE_CASE;
1820       data->change_case = CHANGE_CASE_NONE;
1821       break;
1822     case 'g':
1823       p++;
1824       if (*p != '<')
1825         {
1826           error_detail = _("missing '<' in symbolic reference");
1827           goto error;
1828         }
1829       q = p + 1;
1830       do
1831         {
1832           p++;
1833           if (!*p)
1834             {
1835               error_detail = _("unfinished symbolic reference");
1836               goto error;
1837             }
1838         }
1839       while (*p != '>');
1840       if (p - q == 0)
1841         {
1842           error_detail = _("zero-length symbolic reference");
1843           goto error;
1844         }
1845       if (g_ascii_isdigit (*q))
1846         {
1847           x = 0;
1848           do
1849             {
1850               h = g_ascii_digit_value (*q);
1851               if (h < 0)
1852                 {
1853                   error_detail = _("digit expected");
1854                   p = q;
1855                   goto error;
1856                 }
1857               x = x * 10 + h;
1858               q++;
1859             }
1860           while (q != p);
1861           data->num = x;
1862           data->type = REPL_TYPE_NUMERIC_REFERENCE;
1863         }
1864       else
1865         {
1866           r = q;
1867           do
1868             {
1869               if (!g_ascii_isalnum (*r))
1870                 {
1871                   error_detail = _("illegal symbolic reference");
1872                   p = r;
1873                   goto error;
1874                 }
1875               r++;
1876             }
1877           while (r != p);
1878           data->text = g_strndup (q, p - q);
1879           data->type = REPL_TYPE_SYMBOLIC_REFERENCE;
1880         }
1881       p++;
1882       break;
1883     case '0':
1884       /* if \0 is followed by a number is an octal number representing a
1885        * character, else it is a numeric reference. */
1886       if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0)
1887         {
1888           base = 8;
1889           p = g_utf8_next_char (p);
1890         }
1891     case '1':
1892     case '2':
1893     case '3':
1894     case '4':
1895     case '5':
1896     case '6':
1897     case '7':
1898     case '8':
1899     case '9':
1900       x = 0;
1901       d = 0;
1902       for (i = 0; i < 3; i++)
1903         {
1904           h = g_ascii_digit_value (*p);
1905           if (h < 0)
1906             break;
1907           if (h > 7)
1908             {
1909               if (base == 8)
1910                 break;
1911               else
1912                 base = 10;
1913             }
1914           if (i == 2 && base == 10)
1915             break;
1916           x = x * 8 + h;
1917           d = d * 10 + h;
1918           p++;
1919         }
1920       if (base == 8 || i == 3)
1921         {
1922           data->type = REPL_TYPE_STRING;
1923           data->text = g_new0 (gchar, 8);
1924           g_unichar_to_utf8 (x, data->text);
1925         }
1926       else
1927         {
1928           data->type = REPL_TYPE_NUMERIC_REFERENCE;
1929           data->num = d;
1930         }
1931       break;
1932     case 0:
1933       error_detail = _("stray final '\\'");
1934       goto error;
1935       break;
1936     default:
1937       error_detail = _("unknown escape sequence");
1938       goto error;
1939     }
1940
1941   return p;
1942
1943  error:
1944   /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */
1945   tmp_error = g_error_new (G_REGEX_ERROR,
1946                            G_REGEX_ERROR_REPLACE,
1947                            _("Error while parsing replacement "
1948                              "text \"%s\" at char %lu: %s"),
1949                            replacement,
1950                            (gulong)(p - replacement),
1951                            error_detail);
1952   g_propagate_error (error, tmp_error);
1953
1954   return NULL;
1955 }
1956
1957 static GList *
1958 split_replacement (const gchar  *replacement,
1959                    GError      **error)
1960 {
1961   GList *list = NULL;
1962   InterpolationData *data;
1963   const gchar *p, *start;
1964
1965   start = p = replacement;
1966   while (*p)
1967     {
1968       if (*p == '\\')
1969         {
1970           data = g_new0 (InterpolationData, 1);
1971           start = p = expand_escape (replacement, p, data, error);
1972           if (p == NULL)
1973             {
1974               g_list_foreach (list, (GFunc)free_interpolation_data, NULL);
1975               g_list_free (list);
1976               free_interpolation_data (data);
1977
1978               return NULL;
1979             }
1980           list = g_list_prepend (list, data);
1981         }
1982       else
1983         {
1984           p++;
1985           if (*p == '\\' || *p == '\0')
1986             {
1987               if (p - start > 0)
1988                 {
1989                   data = g_new0 (InterpolationData, 1);
1990                   data->text = g_strndup (start, p - start);
1991                   data->type = REPL_TYPE_STRING;
1992                   list = g_list_prepend (list, data);
1993                 }
1994             }
1995         }
1996     }
1997
1998   return g_list_reverse (list);
1999 }
2000
/* Converts the character c according to change_case: to upper case
 * when none of the CHANGE_CASE_LOWER_MASK bits is set (this includes
 * CHANGE_CASE_NONE), to lower case otherwise. */
#define CHANGE_CASE(c, change_case) \
        ((((change_case) & CHANGE_CASE_LOWER_MASK) == 0) ? \
                g_unichar_toupper (c) : \
                g_unichar_tolower (c))
2006
2007 static void
2008 string_append (GString     *string,
2009                const gchar *text,
2010                ChangeCase  *change_case)
2011 {
2012   gunichar c;
2013
2014   if (text[0] == '\0')
2015     return;
2016
2017   if (*change_case == CHANGE_CASE_NONE)
2018     {
2019       g_string_append (string, text);
2020     }
2021   else if (*change_case & CHANGE_CASE_SINGLE_MASK)
2022     {
2023       c = g_utf8_get_char (text);
2024       g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
2025       g_string_append (string, g_utf8_next_char (text));
2026       *change_case = CHANGE_CASE_NONE;
2027     }
2028   else
2029     {
2030       while (*text != '\0')
2031         {
2032           c = g_utf8_get_char (text);
2033           g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
2034           text = g_utf8_next_char (text);
2035         }
2036     }
2037 }
2038
2039 static gboolean
2040 interpolate_replacement (const GRegex *regex,
2041                          const gchar  *string,
2042                          GString      *result,
2043                          gpointer      data)
2044 {
2045   GList *list;
2046   InterpolationData *idata;
2047   gchar *match;
2048   ChangeCase change_case = CHANGE_CASE_NONE;
2049
2050   for (list = data; list; list = list->next)
2051     {
2052       idata = list->data;
2053       switch (idata->type)
2054         {
2055         case REPL_TYPE_STRING:
2056           string_append (result, idata->text, &change_case);
2057           break;
2058         case REPL_TYPE_CHARACTER:
2059           g_string_append_c (result, CHANGE_CASE (idata->c, change_case));
2060           if (change_case & CHANGE_CASE_SINGLE_MASK)
2061             change_case = CHANGE_CASE_NONE;
2062           break;
2063         case REPL_TYPE_NUMERIC_REFERENCE:
2064           match = g_regex_fetch (regex, idata->num, string);
2065           if (match)
2066             {
2067               string_append (result, match, &change_case);
2068               g_free (match);
2069             }
2070           break;
2071         case REPL_TYPE_SYMBOLIC_REFERENCE:
2072           match = g_regex_fetch_named (regex, idata->text, string);
2073           if (match)
2074             {
2075               string_append (result, match, &change_case);
2076               g_free (match);
2077             }
2078           break;
2079         case REPL_TYPE_CHANGE_CASE:
2080           change_case = idata->change_case;
2081           break;
2082         }
2083     }
2084
2085   return FALSE;
2086 }
2087
2088 /**
2089  * g_regex_expand_references:
2090  * @regex: #GRegex structure used in last match
2091  * @string: the string on which the last match was made
2092  * @string_to_expand: the string to expand
2093  * @error: location to store the error occuring, or %NULL to ignore errors
2094  *
2095  * Returns a new string containing the text in @string_to_expand with
2096  * references expanded. References refer to the last match done with
2097  * @string against @regex and have the same syntax used by g_regex_replace().
2098  *
2099  * The @string_to_expand must be UTF-8 encoded even if #G_REGEX_RAW was
2100  * passed to g_regex_new().
2101  *
2102  * Returns: the expanded string, or %NULL if an error occurred
2103  *
2104  * Since: 2.14
2105  */
2106 gchar *
2107 g_regex_expand_references (GRegex            *regex,
2108                            const gchar       *string,
2109                            const gchar       *string_to_expand,
2110                            GError           **error)
2111 {
2112   GString *result;
2113   GList *list;
2114   GError *tmp_error = NULL;
2115
2116   g_return_val_if_fail (regex != NULL, NULL);
2117   g_return_val_if_fail (string != NULL, NULL);
2118   g_return_val_if_fail (string_to_expand != NULL, NULL);
2119   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
2120
2121   list = split_replacement (string_to_expand, &tmp_error);
2122   if (tmp_error != NULL)
2123     {
2124       g_propagate_error (error, tmp_error);
2125       return NULL;
2126     }
2127
2128   result = g_string_sized_new (strlen (string_to_expand));
2129   interpolate_replacement (regex, string, result, list);
2130
2131   g_list_foreach (list, (GFunc)free_interpolation_data, NULL);
2132   g_list_free (list);
2133
2134   return g_string_free (result, FALSE);
2135 }
2136
2137 /**
2138  * g_regex_replace:
2139  * @regex: a #GRegex structure
2140  * @string: the string to perform matches against
2141  * @string_len: the length of @string, or -1 if @string is nul-terminated
2142  * @start_position: starting index of the string to match
2143  * @replacement: text to replace each match with
2144  * @match_options: options for the match
2145  * @error: location to store the error occuring, or %NULL to ignore errors
2146  *
2147  * Replaces all occurances of the pattern in @regex with the
2148  * replacement text. Backreferences of the form '\number' or '\g&lt;number&gt;'
2149  * in the replacement text are interpolated by the number-th captured
2150  * subexpression of the match, '\g&lt;name&gt;' refers to the captured subexpression
2151  * with the given name. '\0' refers to the complete match, but '\0' followed
2152  * by a number is the octal representation of a character. To include a
2153  * literal '\' in the replacement, write '\\'.
2154  * There are also escapes that changes the case of the following text:
2155  *
2156  * <variablelist>
2157  * <varlistentry><term>\l</term>
2158  * <listitem>
2159  * <para>Convert to lower case the next character</para>
2160  * </listitem>
2161  * </varlistentry>
2162  * <varlistentry><term>\u</term>
2163  * <listitem>
2164  * <para>Convert to upper case the next character</para>
2165  * </listitem>
2166  * </varlistentry>
2167  * <varlistentry><term>\L</term>
2168  * <listitem>
2169  * <para>Convert to lower case till \E</para>
2170  * </listitem>
2171  * </varlistentry>
2172  * <varlistentry><term>\U</term>
2173  * <listitem>
2174  * <para>Convert to upper case till \E</para>
2175  * </listitem>
2176  * </varlistentry>
2177  * <varlistentry><term>\E</term>
2178  * <listitem>
2179  * <para>End case modification</para>
2180  * </listitem>
2181  * </varlistentry>
2182  * </variablelist>
2183  *
2184  * If you do not need to use backreferences use g_regex_replace_literal().
2185  *
2186  * The @replacement string must be UTF-8 encoded even if #G_REGEX_RAW was
2187  * passed to g_regex_new(). If you want to use not UTF-8 encoded stings
2188  * you can use g_regex_replace_literal().
2189  *
2190  * Setting @start_position differs from just passing over a shortened string
2191  * and  setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins
2192  * with any kind of lookbehind assertion, such as "\b".
2193  *
2194  * Returns: a newly allocated string containing the replacements
2195  *
2196  * Since: 2.14
2197  */
2198 gchar *
2199 g_regex_replace (GRegex            *regex,
2200                  const gchar       *string,
2201                  gssize             string_len,
2202                  gint               start_position,
2203                  const gchar       *replacement,
2204                  GRegexMatchFlags   match_options,
2205                  GError           **error)
2206 {
2207   gchar *result;
2208   GList *list;
2209   GError *tmp_error = NULL;
2210
2211   g_return_val_if_fail (regex != NULL, NULL);
2212   g_return_val_if_fail (string != NULL, NULL);
2213   g_return_val_if_fail (start_position >= 0, NULL);
2214   g_return_val_if_fail (replacement != NULL, NULL);
2215   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
2216   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2217
2218   list = split_replacement (replacement, &tmp_error);
2219   if (tmp_error != NULL)
2220     {
2221       g_propagate_error (error, tmp_error);
2222       return NULL;
2223     }
2224
2225   result = g_regex_replace_eval (regex,
2226                                  string, string_len, start_position,
2227                                  match_options,
2228                                  interpolate_replacement,
2229                                  (gpointer)list,
2230                                  &tmp_error);
2231   if (tmp_error != NULL)
2232     g_propagate_error (error, tmp_error);
2233
2234   g_list_foreach (list, (GFunc)free_interpolation_data, NULL);
2235   g_list_free (list);
2236
2237   return result;
2238 }
2239
2240 static gboolean
2241 literal_replacement (const GRegex *regex,
2242                      const gchar  *string,
2243                      GString      *result,
2244                      gpointer      data)
2245 {
2246   g_string_append (result, data);
2247   return FALSE;
2248 }
2249
2250 /**
2251  * g_regex_replace_literal:
2252  * @regex: a #GRegex structure
2253  * @string: the string to perform matches against
2254  * @string_len: the length of @string, or -1 if @string is nul-terminated
2255  * @start_position: starting index of the string to match
2256  * @replacement: text to replace each match with
2257  * @match_options: options for the match
2258  * @error: location to store the error occuring, or %NULL to ignore errors
2259  *
2260  * Replaces all occurances of the pattern in @regex with the
2261  * replacement text. @replacement is replaced literally, to
2262  * include backreferences use g_regex_replace().
2263  *
2264  * Setting @start_position differs from just passing over a shortened string
2265  * and  setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins
2266  * with any kind of lookbehind assertion, such as "\b".
2267  *
2268  * Returns: a newly allocated string containing the replacements
2269  *
2270  * Since: 2.14
2271  */
2272 gchar *
2273 g_regex_replace_literal (GRegex          *regex,
2274                          const gchar     *string,
2275                          gssize           string_len,
2276                          gint             start_position,
2277                          const gchar     *replacement,
2278                          GRegexMatchFlags match_options,
2279                          GError         **error)
2280 {
2281   g_return_val_if_fail (replacement != NULL, NULL);
2282   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2283
2284   return g_regex_replace_eval (regex,
2285                                string, string_len, start_position,
2286                                match_options,
2287                                literal_replacement,
2288                                (gpointer)replacement,
2289                                error);
2290 }
2291
2292 /**
2293  * g_regex_replace_eval:
2294  * @regex: a #GRegex structure from g_regex_new()
2295  * @string: string to perform matches against
2296  * @string_len: the length of @string, or -1 if @string is nul-terminated
2297  * @start_position: starting index of the string to match
2298  * @match_options: options for the match
2299  * @eval: a function to call for each match
2300  * @user_data: user data to pass to the function
2301  * @error: location to store the error occuring, or %NULL to ignore errors
2302  *
2303  * Replaces occurances of the pattern in regex with the output of @eval
2304  * for that occurance.
2305  *
2306  * Setting @start_position differs from just passing over a shortened string
2307  * and  setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins
2308  * with any kind of lookbehind assertion, such as "\b".
2309  *
2310  * Returns: a newly allocated string containing the replacements
2311  *
2312  * Since: 2.14
2313  */
2314 gchar *
2315 g_regex_replace_eval (GRegex            *regex,
2316                       const gchar       *string,
2317                       gssize             string_len,
2318                       gint               start_position,
2319                       GRegexMatchFlags   match_options,
2320                       GRegexEvalCallback eval,
2321                       gpointer           user_data,
2322                       GError           **error)
2323 {
2324   GString *result;
2325   gint str_pos = 0;
2326   gboolean done = FALSE;
2327   GError *tmp_error = NULL;
2328
2329   g_return_val_if_fail (regex != NULL, NULL);
2330   g_return_val_if_fail (string != NULL, NULL);
2331   g_return_val_if_fail (start_position >= 0, NULL);
2332   g_return_val_if_fail (eval != NULL, NULL);
2333   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2334
2335   regex_lazy_init_match (regex, 0);
2336
2337   if (string_len < 0)
2338     string_len = strlen (string);
2339
2340   /* clear out the regex for reuse, just in case */
2341   g_regex_clear (regex);
2342
2343   result = g_string_sized_new (string_len);
2344
2345   /* run down the string making matches. */
2346   while (!done &&
2347          g_regex_match_next_full (regex, string, string_len,
2348                                   start_position, match_options, &tmp_error))
2349     {
2350       g_string_append_len (result,
2351                            string + str_pos,
2352                            regex->match->offsets[0] - str_pos);
2353       done = (*eval) (regex, string, result, user_data);
2354       str_pos = regex->match->offsets[1];
2355     }
2356
2357   if (tmp_error != NULL)
2358     {
2359       g_propagate_error (error, tmp_error);
2360       g_string_free (result, TRUE);
2361       return NULL;
2362     }
2363
2364   g_string_append_len (result, string + str_pos, string_len - str_pos);
2365
2366   return g_string_free (result, FALSE);
2367 }
2368
2369 /**
2370  * g_regex_escape_string:
2371  * @string: the string to escape
2372  * @length: the length of @string, or -1 if @string is nul-terminated
2373  *
2374  * Escapes the special characters used for regular expressions in @string,
2375  * for instance "a.b*c" becomes "a\.b\*c". This function is useful to
2376  * dynamically generate regular expressions.
2377  *
2378  * @string can contain nul characters that are replaced with "\0", in this
2379  * case remember to specify the correct length of @string in @length.
2380  *
2381  * Returns: a newly-allocated escaped string
2382  *
2383  * Since: 2.14
2384  */
2385 gchar *
2386 g_regex_escape_string (const gchar *string,
2387                        gint         length)
2388 {
2389   GString *escaped;
2390   const char *p, *piece_start, *end;
2391
2392   g_return_val_if_fail (string != NULL, NULL);
2393
2394   if (length < 0)
2395     length = strlen (string);
2396
2397   end = string + length;
2398   p = piece_start = string;
2399   escaped = g_string_sized_new (length + 1);
2400
2401   while (p < end)
2402     {
2403       switch (*p)
2404         {
2405         case '\0':
2406         case '\\':
2407         case '|':
2408         case '(':
2409         case ')':
2410         case '[':
2411         case ']':
2412         case '{':
2413         case '}':
2414         case '^':
2415         case '$':
2416         case '*':
2417         case '+':
2418         case '?':
2419         case '.':
2420           if (p != piece_start)
2421             /* copy the previous piece. */
2422             g_string_append_len (escaped, piece_start, p - piece_start);
2423           g_string_append_c (escaped, '\\');
2424           if (*p == '\0')
2425             g_string_append_c (escaped, '0');
2426           else
2427             g_string_append_c (escaped, *p);
2428           piece_start = ++p;
2429           break;
2430         default:
2431           p = g_utf8_next_char (p);
2432           break;
2433         }
2434   }
2435
2436   if (piece_start < end)
2437     g_string_append_len (escaped, piece_start, end - piece_start);
2438
2439   return g_string_free (escaped, FALSE);
2440 }
2441
2442 #define __G_REGEX_C__
2443 #include "galiasdef.c"