glib/gregex.c

   1 /* GRegex -- regular expression API wrapper around PCRE.
   2  *
   3  * Copyright (C) 1999, 2000 Scott Wimer
   4  * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com>
   5  * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org>
   6  * Copyright (C) 2022, Marco Trevisan <marco.trevisan@canonical.com>
   7  *
   8  * SPDX-License-Identifier: LGPL-2.1-or-later
   9  *
  10  * This library is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * This library is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public License
  21  * along with this library; if not, see <http://www.gnu.org/licenses/>.
  22  */
  23
  24 #include "config.h"
  25
  26 #include <stdint.h>
  27 #include <string.h>
  28
  29 #define PCRE2_CODE_UNIT_WIDTH 8
  30 #include <pcre2.h>
  31
  32 #include "gtypes.h"
  33 #include "gregex.h"
  34 #include "glibintl.h"
  35 #include "glist.h"
  36 #include "gmessages.h"
  37 #include "gstrfuncs.h"
  38 #include "gatomic.h"
  39 #include "gtestutils.h"
  40 #include "gthread.h"
  41
  42 /**
  43  * GRegex:
  44  *
  45  * A `GRegex` is the "compiled" form of a regular expression pattern.
  46  *
  47  * `GRegex` implements regular expression pattern matching using syntax and
  48  * semantics similar to Perl regular expression. See the
  49  * [PCRE documentation](man:pcrepattern(3)) for the syntax definition.
  50  *
  51  * Some functions accept a @start_position argument, setting it differs
  52  * from just passing over a shortened string and setting %G_REGEX_MATCH_NOTBOL
  53  * in the case of a pattern that begins with any kind of lookbehind assertion.
  54  * For example, consider the pattern "\Biss\B" which finds occurrences of "iss"
  55  * in the middle of words. ("\B" matches only if the current position in the
  56  * subject is not a word boundary.) When applied to the string "Mississipi"
  57  * from the fourth byte, namely "issipi", it does not match, because "\B" is
  58  * always false at the start of the subject, which is deemed to be a word
 * boundary. However, if the entire string is passed, but with
  60  * @start_position set to 4, it finds the second occurrence of "iss" because
  61  * it is able to look behind the starting point to discover that it is
  62  * preceded by a letter.
  63  *
  64  * Note that, unless you set the %G_REGEX_RAW flag, all the strings passed
  65  * to these functions must be encoded in UTF-8. The lengths and the positions
  66  * inside the strings are in bytes and not in characters, so, for instance,
  67  * "\xc3\xa0" (i.e. "à") is two bytes long but it is treated as a
  68  * single character. If you set %G_REGEX_RAW the strings can be non-valid
  69  * UTF-8 strings and a byte is treated as a character, so "\xc3\xa0" is two
  70  * bytes and two characters long.
  71  *
  72  * When matching a pattern, "\n" matches only against a "\n" character in
  73  * the string, and "\r" matches only a "\r" character. To match any newline
  74  * sequence use "\R". This particular group matches either the two-character
  75  * sequence CR + LF ("\r\n"), or one of the single characters LF (linefeed,
 * U+000A, "\n"), VT (vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"),
  77  * CR (carriage return, U+000D, "\r"), NEL (next line, U+0085), LS (line
  78  * separator, U+2028), or PS (paragraph separator, U+2029).
  79  *
 * The behaviour of the dot, circumflex, and dollar metacharacters is
 * affected by newline characters; the default is to recognize any newline
 * character (the same characters recognized by "\R"). This can be changed
  83  * with `G_REGEX_NEWLINE_CR`, `G_REGEX_NEWLINE_LF` and `G_REGEX_NEWLINE_CRLF`
  84  * compile options, and with `G_REGEX_MATCH_NEWLINE_ANY`,
  85  * `G_REGEX_MATCH_NEWLINE_CR`, `G_REGEX_MATCH_NEWLINE_LF` and
  86  * `G_REGEX_MATCH_NEWLINE_CRLF` match options. These settings are also
  87  * relevant when compiling a pattern if `G_REGEX_EXTENDED` is set, and an
  88  * unescaped "#" outside a character class is encountered. This indicates
  89  * a comment that lasts until after the next newline.
  90  *
  91  * Creating and manipulating the same `GRegex` structure from different
  92  * threads is not a problem as `GRegex` does not modify its internal
  93  * state between creation and destruction, on the other hand `GMatchInfo`
  94  * is not threadsafe.
  95  *
  96  * The regular expressions low-level functionalities are obtained through
  97  * the excellent [PCRE](http://www.pcre.org/) library written by Philip Hazel.
  98  *
  99  * Since: 2.14
 100  */
 101
/* PCRE2 option bits shared by compilation and matching: this mask is OR-ed
 * into both G_REGEX_PCRE2_COMPILE_MASK and G_REGEX_PCRE2_MATCH_MASK. */
#define G_REGEX_PCRE_GENERIC_MASK (PCRE2_ANCHORED       | \
                                   PCRE2_NO_UTF_CHECK   | \
                                   PCRE2_ENDANCHORED)
 105
/* Mask of all the possible values for GRegexCompileFlags.
 *
 * g_regex_compile_flags_from_pcre2() filters its result through this mask. */
#define G_REGEX_COMPILE_MASK (G_REGEX_DEFAULT          | \
                              G_REGEX_CASELESS         | \
                              G_REGEX_MULTILINE        | \
                              G_REGEX_DOTALL           | \
                              G_REGEX_EXTENDED         | \
                              G_REGEX_ANCHORED         | \
                              G_REGEX_DOLLAR_ENDONLY   | \
                              G_REGEX_UNGREEDY         | \
                              G_REGEX_RAW              | \
                              G_REGEX_NO_AUTO_CAPTURE  | \
                              G_REGEX_OPTIMIZE         | \
                              G_REGEX_FIRSTLINE        | \
                              G_REGEX_DUPNAMES         | \
                              G_REGEX_NEWLINE_CR       | \
                              G_REGEX_NEWLINE_LF       | \
                              G_REGEX_NEWLINE_CRLF     | \
                              G_REGEX_NEWLINE_ANYCRLF  | \
                              G_REGEX_BSR_ANYCRLF)
 125
/* PCRE2 compile-time option bits, plus the generic bits from
 * G_REGEX_PCRE_GENERIC_MASK.
 *
 * get_pcre2_compile_options() filters its result through this mask. */
#define G_REGEX_PCRE2_COMPILE_MASK (PCRE2_ALLOW_EMPTY_CLASS    | \
                                    PCRE2_ALT_BSUX             | \
                                    PCRE2_AUTO_CALLOUT         | \
                                    PCRE2_CASELESS             | \
                                    PCRE2_DOLLAR_ENDONLY       | \
                                    PCRE2_DOTALL               | \
                                    PCRE2_DUPNAMES             | \
                                    PCRE2_EXTENDED             | \
                                    PCRE2_FIRSTLINE            | \
                                    PCRE2_MATCH_UNSET_BACKREF  | \
                                    PCRE2_MULTILINE            | \
                                    PCRE2_NEVER_UCP            | \
                                    PCRE2_NEVER_UTF            | \
                                    PCRE2_NO_AUTO_CAPTURE      | \
                                    PCRE2_NO_AUTO_POSSESS      | \
                                    PCRE2_NO_DOTSTAR_ANCHOR    | \
                                    PCRE2_NO_START_OPTIMIZE    | \
                                    PCRE2_UCP                  | \
                                    PCRE2_UNGREEDY             | \
                                    PCRE2_UTF                  | \
                                    PCRE2_NEVER_BACKSLASH_C    | \
                                    PCRE2_ALT_CIRCUMFLEX       | \
                                    PCRE2_ALT_VERBNAMES        | \
                                    PCRE2_USE_OFFSET_LIMIT     | \
                                    PCRE2_EXTENDED_MORE        | \
                                    PCRE2_LITERAL              | \
                                    PCRE2_MATCH_INVALID_UTF    | \
                                    G_REGEX_PCRE_GENERIC_MASK)

/* NOTE(review): judging by the name, this marks compile option bits that need
 * separate treatment from the rest of the PCRE2 compile options; its users
 * are further down in the file, so confirm the exact meaning there. */
#define G_REGEX_COMPILE_NONPCRE_MASK (PCRE2_UTF)
 156
/* Mask of all the possible values for GRegexMatchFlags.
 *
 * g_regex_match_flags_from_pcre2() filters its result through this mask. */
#define G_REGEX_MATCH_MASK (G_REGEX_MATCH_DEFAULT          | \
                            G_REGEX_MATCH_ANCHORED         | \
                            G_REGEX_MATCH_NOTBOL           | \
                            G_REGEX_MATCH_NOTEOL           | \
                            G_REGEX_MATCH_NOTEMPTY         | \
                            G_REGEX_MATCH_PARTIAL          | \
                            G_REGEX_MATCH_NEWLINE_CR       | \
                            G_REGEX_MATCH_NEWLINE_LF       | \
                            G_REGEX_MATCH_NEWLINE_CRLF     | \
                            G_REGEX_MATCH_NEWLINE_ANY      | \
                            G_REGEX_MATCH_NEWLINE_ANYCRLF  | \
                            G_REGEX_MATCH_BSR_ANYCRLF      | \
                            G_REGEX_MATCH_BSR_ANY          | \
                            G_REGEX_MATCH_PARTIAL_SOFT     | \
                            G_REGEX_MATCH_PARTIAL_HARD     | \
                            G_REGEX_MATCH_NOTEMPTY_ATSTART)
 174
/* PCRE2 match-time option bits, plus the generic bits from
 * G_REGEX_PCRE_GENERIC_MASK.
 *
 * get_pcre2_match_options() filters its result through this mask. */
#define G_REGEX_PCRE2_MATCH_MASK (PCRE2_NOTBOL                      |\
                                  PCRE2_NOTEOL                      |\
                                  PCRE2_NOTEMPTY                    |\
                                  PCRE2_NOTEMPTY_ATSTART            |\
                                  PCRE2_PARTIAL_SOFT                |\
                                  PCRE2_PARTIAL_HARD                |\
                                  PCRE2_NO_JIT                      |\
                                  PCRE2_COPY_MATCHED_SUBJECT        |\
                                  G_REGEX_PCRE_GENERIC_MASK)
 184
/* OR of the PCRE2 newline values CR, LF, CRLF and ANYCRLF; PCRE2_NEWLINE_ANY
 * is not part of it.
 *
 * TODO: Support PCRE2_NEWLINE_NUL */
#define G_REGEX_NEWLINE_MASK (PCRE2_NEWLINE_CR |     \
                              PCRE2_NEWLINE_LF |     \
                              PCRE2_NEWLINE_CRLF |   \
                              PCRE2_NEWLINE_ANYCRLF)
 190
/* Some match options are not supported when using JIT as stated in the
 * pcre2jit man page under the «UNSUPPORTED OPTIONS AND PATTERN ITEMS» section:
 *   https://www.pcre.org/current/doc/html/pcre2jit.html#SEC5
 *
 * Both bits listed here are also part of G_REGEX_PCRE_GENERIC_MASK.
 */
#define G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS (PCRE2_ANCHORED | \
                                               PCRE2_ENDANCHORED)
 197
/* The GRegexCompileFlags bits that select a newline convention;
 * get_pcre2_newline_compile_options() isolates them with this mask. */
#define G_REGEX_COMPILE_NEWLINE_MASK (G_REGEX_NEWLINE_CR      | \
                                      G_REGEX_NEWLINE_LF      | \
                                      G_REGEX_NEWLINE_CRLF    | \
                                      G_REGEX_NEWLINE_ANYCRLF)

/* The GRegexMatchFlags bits that select a newline convention;
 * get_pcre2_newline_match_options() isolates them with this mask. */
#define G_REGEX_MATCH_NEWLINE_MASK (G_REGEX_MATCH_NEWLINE_CR      | \
                                    G_REGEX_MATCH_NEWLINE_LF      | \
                                    G_REGEX_MATCH_NEWLINE_CRLF    | \
                                    G_REGEX_MATCH_NEWLINE_ANY    | \
                                    G_REGEX_MATCH_NEWLINE_ANYCRLF)
 208
 209 /* if the string is in UTF-8 use g_utf8_ functions, else use
 210  * use just +/- 1. */
 211 #define NEXT_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? \
 212                                 ((s) + 1) : \
 213                                 g_utf8_next_char (s))
 214 #define PREV_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? \
 215                                 ((s) - 1) : \
 216                                 g_utf8_prev_char (s))
 217
/* Per-match state: the regex being matched, the subject string, the options
 * used and the offsets/workspace buffers, plus the PCRE2 objects used while
 * matching. */
struct _GMatchInfo
{
  gint ref_count;               /* the ref count (atomic) */
  GRegex *regex;                /* the regex */
  uint32_t match_opts;          /* pcre match options used at match time on the regex */
  gint matches;                 /* number of matching sub patterns, guaranteed to be <= (n_subpatterns + 1) if doing a single match (rather than matching all) */
  uint32_t n_subpatterns;       /* total number of sub patterns in the regex */
  gint pos;                     /* position in the string where last match left off */
  uint32_t n_offsets;           /* number of offsets */
  gint *offsets;                /* array of offsets paired 0,1 ; 2,3 ; 4,5 etc */
  gint *workspace;              /* workspace for pcre2_dfa_match() */
  PCRE2_SIZE n_workspace;       /* number of workspace elements */
  const gchar *string;          /* string passed to the match function */
  gssize string_len;            /* length of string, in bytes */
  pcre2_match_context *match_context; /* PCRE2 match context */
  pcre2_match_data *match_data;       /* PCRE2 match data block */
  pcre2_jit_stack *jit_stack;         /* PCRE2 JIT stack */
};
 236
/* Outcome of trying to enable the JIT compiler for a regex; stored in
 * GRegex.jit_status. */
typedef enum
{
  JIT_STATUS_DEFAULT,           /* enablement was never tried */
  JIT_STATUS_ENABLED,           /* enablement was tried and succeeded */
  JIT_STATUS_DISABLED           /* enablement was tried and failed, do not retry */
} JITStatus;
 243
/* A compiled pattern together with the options it was created with. Options
 * are kept twice: as the PCRE2 values and as the original GRegex values. */
struct _GRegex
{
  gint ref_count;               /* the ref count for the immutable part (atomic) */
  gchar *pattern;               /* the pattern */
  pcre2_code *pcre_re;          /* compiled form of the pattern */
  uint32_t compile_opts;        /* options used at compile time on the pattern, pcre2 values */
  GRegexCompileFlags orig_compile_opts; /* options used at compile time on the pattern, gregex values */
  uint32_t match_opts;          /* pcre2 options used at match time on the regex */
  GRegexMatchFlags orig_match_opts; /* options used as default match options, gregex values */
  uint32_t jit_options;         /* options which were enabled for jit compiler */
  JITStatus jit_status;         /* indicates the status of jit compiler for this compiled regex */
  /* The jit_status here does _not_ correspond to whether we used the JIT in the last invocation,
   * which may be affected by match_options or a JIT_STACK_LIMIT error, but whether it was ever
   * enabled for the current regex AND current set of jit_options.
   * JIT_STATUS_DEFAULT means enablement was never tried,
   * JIT_STATUS_ENABLED means it was tried and successful (even if we're not currently using it),
   * and JIT_STATUS_DISABLED means it was tried and failed (so we shouldn't try again).
   */
};
 263
/* TRUE if ret is an error code, FALSE otherwise.
 *
 * A value is an error when it is below PCRE2_ERROR_NOMATCH and is not
 * PCRE2_ERROR_PARTIAL; those two codes are ordinary match outcomes, not
 * failures. Note that ret is evaluated twice. */
#define IS_PCRE2_ERROR(ret) ((ret) < PCRE2_ERROR_NOMATCH && (ret) != PCRE2_ERROR_PARTIAL)
 266
/* Forward declarations for the replacement-string interpolation helpers.
 * They are file-local (static) and defined further down in this file. */
typedef struct _InterpolationData InterpolationData;
static gboolean  interpolation_list_needs_match (GList *list);
static gboolean  interpolate_replacement        (const GMatchInfo *match_info,
                                                 GString *result,
                                                 gpointer data);
static GList    *split_replacement              (const gchar *replacement,
                                                 GError **error);
static void      free_interpolation_data        (InterpolationData *data);
 275
 276 static uint32_t
 277 get_pcre2_compile_options (GRegexCompileFlags compile_flags)
 278 {
 279   /* Maps compile flags to pcre2 values */
 280   uint32_t pcre2_flags = 0;
 281
 282   if (compile_flags & G_REGEX_CASELESS)
 283     pcre2_flags |= PCRE2_CASELESS;
 284   if (compile_flags & G_REGEX_MULTILINE)
 285     pcre2_flags |= PCRE2_MULTILINE;
 286   if (compile_flags & G_REGEX_DOTALL)
 287     pcre2_flags |= PCRE2_DOTALL;
 288   if (compile_flags & G_REGEX_EXTENDED)
 289     pcre2_flags |= PCRE2_EXTENDED;
 290   if (compile_flags & G_REGEX_ANCHORED)
 291     pcre2_flags |= PCRE2_ANCHORED;
 292   if (compile_flags & G_REGEX_DOLLAR_ENDONLY)
 293     pcre2_flags |= PCRE2_DOLLAR_ENDONLY;
 294   if (compile_flags & G_REGEX_UNGREEDY)
 295     pcre2_flags |= PCRE2_UNGREEDY;
 296   if (!(compile_flags & G_REGEX_RAW))
 297     pcre2_flags |= PCRE2_UTF;
 298   if (compile_flags & G_REGEX_NO_AUTO_CAPTURE)
 299     pcre2_flags |= PCRE2_NO_AUTO_CAPTURE;
 300   if (compile_flags & G_REGEX_FIRSTLINE)
 301     pcre2_flags |= PCRE2_FIRSTLINE;
 302   if (compile_flags & G_REGEX_DUPNAMES)
 303     pcre2_flags |= PCRE2_DUPNAMES;
 304
 305   return pcre2_flags & G_REGEX_PCRE2_COMPILE_MASK;
 306 }
 307
 308 static uint32_t
 309 get_pcre2_match_options (GRegexMatchFlags   match_flags,
 310                          GRegexCompileFlags compile_flags)
 311 {
 312   /* Maps match flags to pcre2 values */
 313   uint32_t pcre2_flags = 0;
 314
 315   if (match_flags & G_REGEX_MATCH_ANCHORED)
 316     pcre2_flags |= PCRE2_ANCHORED;
 317   if (match_flags & G_REGEX_MATCH_NOTBOL)
 318     pcre2_flags |= PCRE2_NOTBOL;
 319   if (match_flags & G_REGEX_MATCH_NOTEOL)
 320     pcre2_flags |= PCRE2_NOTEOL;
 321   if (match_flags & G_REGEX_MATCH_NOTEMPTY)
 322     pcre2_flags |= PCRE2_NOTEMPTY;
 323   if (match_flags & G_REGEX_MATCH_PARTIAL_SOFT)
 324     pcre2_flags |= PCRE2_PARTIAL_SOFT;
 325   if (match_flags & G_REGEX_MATCH_PARTIAL_HARD)
 326     pcre2_flags |= PCRE2_PARTIAL_HARD;
 327   if (match_flags & G_REGEX_MATCH_NOTEMPTY_ATSTART)
 328     pcre2_flags |= PCRE2_NOTEMPTY_ATSTART;
 329
 330   if (compile_flags & G_REGEX_RAW)
 331     pcre2_flags |= PCRE2_NO_UTF_CHECK;
 332
 333   return pcre2_flags & G_REGEX_PCRE2_MATCH_MASK;
 334 }
 335
 336 static GRegexCompileFlags
 337 g_regex_compile_flags_from_pcre2 (uint32_t pcre2_flags)
 338 {
 339   GRegexCompileFlags compile_flags = G_REGEX_DEFAULT;
 340
 341   if (pcre2_flags & PCRE2_CASELESS)
 342     compile_flags |= G_REGEX_CASELESS;
 343   if (pcre2_flags & PCRE2_MULTILINE)
 344     compile_flags |= G_REGEX_MULTILINE;
 345   if (pcre2_flags & PCRE2_DOTALL)
 346     compile_flags |= G_REGEX_DOTALL;
 347   if (pcre2_flags & PCRE2_EXTENDED)
 348     compile_flags |= G_REGEX_EXTENDED;
 349   if (pcre2_flags & PCRE2_ANCHORED)
 350     compile_flags |= G_REGEX_ANCHORED;
 351   if (pcre2_flags & PCRE2_DOLLAR_ENDONLY)
 352     compile_flags |= G_REGEX_DOLLAR_ENDONLY;
 353   if (pcre2_flags & PCRE2_UNGREEDY)
 354     compile_flags |= G_REGEX_UNGREEDY;
 355   if (!(pcre2_flags & PCRE2_UTF))
 356     compile_flags |= G_REGEX_RAW;
 357   if (pcre2_flags & PCRE2_NO_AUTO_CAPTURE)
 358     compile_flags |= G_REGEX_NO_AUTO_CAPTURE;
 359   if (pcre2_flags & PCRE2_FIRSTLINE)
 360     compile_flags |= G_REGEX_FIRSTLINE;
 361   if (pcre2_flags & PCRE2_DUPNAMES)
 362     compile_flags |= G_REGEX_DUPNAMES;
 363
 364   return compile_flags & G_REGEX_COMPILE_MASK;
 365 }
 366
 367 static GRegexMatchFlags
 368 g_regex_match_flags_from_pcre2 (uint32_t pcre2_flags)
 369 {
 370   GRegexMatchFlags match_flags = G_REGEX_MATCH_DEFAULT;
 371
 372   if (pcre2_flags & PCRE2_ANCHORED)
 373     match_flags |= G_REGEX_MATCH_ANCHORED;
 374   if (pcre2_flags & PCRE2_NOTBOL)
 375     match_flags |= G_REGEX_MATCH_NOTBOL;
 376   if (pcre2_flags & PCRE2_NOTEOL)
 377     match_flags |= G_REGEX_MATCH_NOTEOL;
 378   if (pcre2_flags & PCRE2_NOTEMPTY)
 379     match_flags |= G_REGEX_MATCH_NOTEMPTY;
 380   if (pcre2_flags & PCRE2_PARTIAL_SOFT)
 381     match_flags |= G_REGEX_MATCH_PARTIAL_SOFT;
 382   if (pcre2_flags & PCRE2_PARTIAL_HARD)
 383     match_flags |= G_REGEX_MATCH_PARTIAL_HARD;
 384   if (pcre2_flags & PCRE2_NOTEMPTY_ATSTART)
 385     match_flags |= G_REGEX_MATCH_NOTEMPTY_ATSTART;
 386
 387   return (match_flags & G_REGEX_MATCH_MASK);
 388 }
 389
 390 static uint32_t
 391 get_pcre2_newline_compile_options (GRegexCompileFlags compile_flags)
 392 {
 393   compile_flags &= G_REGEX_COMPILE_NEWLINE_MASK;
 394
 395   switch (compile_flags)
 396     {
 397     case G_REGEX_NEWLINE_CR:
 398       return PCRE2_NEWLINE_CR;
 399     case G_REGEX_NEWLINE_LF:
 400       return PCRE2_NEWLINE_LF;
 401     case G_REGEX_NEWLINE_CRLF:
 402       return PCRE2_NEWLINE_CRLF;
 403     case G_REGEX_NEWLINE_ANYCRLF:
 404       return PCRE2_NEWLINE_ANYCRLF;
 405     default:
 406       if (compile_flags != 0)
 407         return 0;
 408
 409       return PCRE2_NEWLINE_ANY;
 410     }
 411 }
 412
 413 static uint32_t
 414 get_pcre2_newline_match_options (GRegexMatchFlags match_flags)
 415 {
 416   switch (match_flags & G_REGEX_MATCH_NEWLINE_MASK)
 417     {
 418     case G_REGEX_MATCH_NEWLINE_CR:
 419       return PCRE2_NEWLINE_CR;
 420     case G_REGEX_MATCH_NEWLINE_LF:
 421       return PCRE2_NEWLINE_LF;
 422     case G_REGEX_MATCH_NEWLINE_CRLF:
 423       return PCRE2_NEWLINE_CRLF;
 424     case G_REGEX_MATCH_NEWLINE_ANY:
 425       return PCRE2_NEWLINE_ANY;
 426     case G_REGEX_MATCH_NEWLINE_ANYCRLF:
 427       return PCRE2_NEWLINE_ANYCRLF;
 428     default:
 429       return 0;
 430     }
 431 }
 432
 433 static uint32_t
 434 get_pcre2_bsr_compile_options (GRegexCompileFlags compile_flags)
 435 {
 436   if (compile_flags & G_REGEX_BSR_ANYCRLF)
 437     return PCRE2_BSR_ANYCRLF;
 438
 439   return PCRE2_BSR_UNICODE;
 440 }
 441
 442 static uint32_t
 443 get_pcre2_bsr_match_options (GRegexMatchFlags match_flags)
 444 {
 445   if (match_flags & G_REGEX_MATCH_BSR_ANYCRLF)
 446     return PCRE2_BSR_ANYCRLF;
 447
 448   if (match_flags & G_REGEX_MATCH_BSR_ANY)
 449     return PCRE2_BSR_UNICODE;
 450
 451   return 0;
 452 }
 453
 454 static char *
 455 get_pcre2_error_string (int errcode)
 456 {
 457   PCRE2_UCHAR8 error_msg[2048];
 458   int err_length;
 459
 460   err_length = pcre2_get_error_message (errcode, error_msg,
 461                                         G_N_ELEMENTS (error_msg));
 462
 463   if (err_length <= 0)
 464     return NULL;
 465
 466   /* The array is always filled with a trailing zero */
 467   g_assert ((size_t) err_length < G_N_ELEMENTS (error_msg));
 468   return g_memdup2 (error_msg, err_length + 1);
 469 }
 470
 471 static const gchar *
 472 translate_match_error (gint errcode)
 473 {
 474   switch (errcode)
 475     {
 476     case PCRE2_ERROR_NOMATCH:
 477       /* not an error */
 478       break;
 479     case PCRE2_ERROR_NULL:
 480       /* NULL argument, this should not happen in GRegex */
 481       g_critical ("A NULL argument was passed to PCRE");
 482       break;
 483     case PCRE2_ERROR_BADOPTION:
 484       return "bad options";
 485     case PCRE2_ERROR_BADMAGIC:
 486       return _("corrupted object");
 487     case PCRE2_ERROR_NOMEMORY:
 488       return _("out of memory");
 489     case PCRE2_ERROR_NOSUBSTRING:
 490       /* not used by pcre2_match() */
 491       break;
 492     case PCRE2_ERROR_MATCHLIMIT:
 493     case PCRE2_ERROR_CALLOUT:
 494       /* callouts are not implemented */
 495       break;
 496     case PCRE2_ERROR_BADUTFOFFSET:
 497       /* we do not check if strings are valid */
 498       break;
 499     case PCRE2_ERROR_PARTIAL:
 500       /* not an error */
 501       break;
 502     case PCRE2_ERROR_INTERNAL:
 503       return _("internal error");
 504     case PCRE2_ERROR_DFA_UITEM:
 505       return _("the pattern contains items not supported for partial matching");
 506     case PCRE2_ERROR_DFA_UCOND:
 507       return _("back references as conditions are not supported for partial matching");
 508     case PCRE2_ERROR_DFA_WSSIZE:
 509       /* handled expanding the workspace */
 510       break;
 511     case PCRE2_ERROR_DFA_RECURSE:
 512     case PCRE2_ERROR_RECURSIONLIMIT:
 513       return _("recursion limit reached");
 514     case PCRE2_ERROR_BADOFFSET:
 515       return _("bad offset");
 516     case PCRE2_ERROR_RECURSELOOP:
 517       return _("recursion loop");
 518     case PCRE2_ERROR_JIT_BADOPTION:
 519       /* should not happen in GRegex since we check modes before each match */
 520       return _("matching mode is requested that was not compiled for JIT");
 521     default:
 522       break;
 523     }
 524   return NULL;
 525 }
 526
 527 static char *
 528 get_match_error_message (int errcode)
 529 {
 530   const char *msg = translate_match_error (errcode);
 531   char *error_string;
 532
 533   if (msg)
 534     return g_strdup (msg);
 535
 536   error_string = get_pcre2_error_string (errcode);
 537
 538   if (error_string)
 539     return error_string;
 540
 541   return g_strdup (_("unknown error"));
 542 }
 543
 544 static void
 545 translate_compile_error (gint *errcode, const gchar **errmsg)
 546 {
 547   /* If errcode is known we put the translatable error message in
 548    * errmsg. If errcode is unknown we put the generic
 549    * G_REGEX_ERROR_COMPILE error code in errcode.
 550    * Note that there can be more PCRE errors with the same GRegexError
 551    * and that some PCRE errors are useless for us.
 552    */
 553   gint original_errcode = *errcode;
 554
 555   *errcode = -1;
 556   *errmsg = NULL;
 557
 558   switch (original_errcode)
 559     {
 560     case PCRE2_ERROR_END_BACKSLASH:
 561       *errcode = G_REGEX_ERROR_STRAY_BACKSLASH;
 562       *errmsg = _("\\ at end of pattern");
 563       break;
 564     case PCRE2_ERROR_END_BACKSLASH_C:
 565       *errcode = G_REGEX_ERROR_MISSING_CONTROL_CHAR;
 566       *errmsg = _("\\c at end of pattern");
 567       break;
 568     case PCRE2_ERROR_UNKNOWN_ESCAPE:
 569     case PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE:
 570       *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE;
 571       *errmsg = _("unrecognized character following \\");
 572       break;
 573     case PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER:
 574       *errcode = G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER;
 575       *errmsg = _("numbers out of order in {} quantifier");
 576       break;
 577     case PCRE2_ERROR_QUANTIFIER_TOO_BIG:
 578       *errcode = G_REGEX_ERROR_QUANTIFIER_TOO_BIG;
 579       *errmsg = _("number too big in {} quantifier");
 580       break;
 581     case PCRE2_ERROR_MISSING_SQUARE_BRACKET:
 582       *errcode = G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS;
 583       *errmsg = _("missing terminating ] for character class");
 584       break;
 585     case PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS:
 586       *errcode = G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS;
 587       *errmsg = _("invalid escape sequence in character class");
 588       break;
 589     case PCRE2_ERROR_CLASS_RANGE_ORDER:
 590       *errcode = G_REGEX_ERROR_RANGE_OUT_OF_ORDER;
 591       *errmsg = _("range out of order in character class");
 592       break;
 593     case PCRE2_ERROR_QUANTIFIER_INVALID:
 594     case PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT:
 595       *errcode = G_REGEX_ERROR_NOTHING_TO_REPEAT;
 596       *errmsg = _("nothing to repeat");
 597       break;
 598     case PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY:
 599       *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
 600       *errmsg = _("unrecognized character after (? or (?-");
 601       break;
 602     case PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS:
 603       *errcode = G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS;
 604       *errmsg = _("POSIX named classes are supported only within a class");
 605       break;
 606     case PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING:
 607       *errcode = G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED;
 608       *errmsg = _("POSIX collating elements are not supported");
 609       break;
 610     case PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS:
 611     case PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS:
 612     case PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING:
 613       *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
 614       *errmsg = _("missing terminating )");
 615       break;
 616     case PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE:
 617       *errcode = G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE;
 618       *errmsg = _("reference to non-existent subpattern");
 619       break;
 620     case PCRE2_ERROR_MISSING_COMMENT_CLOSING:
 621       *errcode = G_REGEX_ERROR_UNTERMINATED_COMMENT;
 622       *errmsg = _("missing ) after comment");
 623       break;
 624     case PCRE2_ERROR_PATTERN_TOO_LARGE:
 625       *errcode = G_REGEX_ERROR_EXPRESSION_TOO_LARGE;
 626       *errmsg = _("regular expression is too large");
 627       break;
 628     case PCRE2_ERROR_MISSING_CONDITION_CLOSING:
 629       *errcode = G_REGEX_ERROR_MALFORMED_CONDITION;
 630       *errmsg = _("malformed number or name after (?(");
 631       break;
 632     case PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH:
 633       *errcode = G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND;
 634       *errmsg = _("lookbehind assertion is not fixed length");
 635       break;
 636     case PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES:
 637       *errcode = G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES;
 638       *errmsg = _("conditional group contains more than two branches");
 639       break;
 640     case PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED:
 641       *errcode = G_REGEX_ERROR_ASSERTION_EXPECTED;
 642       *errmsg = _("assertion expected after (?(");
 643       break;
 644     case PCRE2_ERROR_BAD_RELATIVE_REFERENCE:
 645       *errcode = G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE;
 646       *errmsg = _("a numbered reference must not be zero");
 647       break;
 648     case PCRE2_ERROR_UNKNOWN_POSIX_CLASS:
 649       *errcode = G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME;
 650       *errmsg = _("unknown POSIX class name");
 651       break;
 652     case PCRE2_ERROR_CODE_POINT_TOO_BIG:
 653     case PCRE2_ERROR_INVALID_HEXADECIMAL:
 654       *errcode = G_REGEX_ERROR_HEX_CODE_TOO_LARGE;
 655       *errmsg = _("character value in \\x{...} sequence is too large");
 656       break;
 657     case PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C:
 658       *errcode = G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND;
 659       *errmsg = _("\\C not allowed in lookbehind assertion");
 660       break;
 661     case PCRE2_ERROR_MISSING_NAME_TERMINATOR:
 662       *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR;
 663       *errmsg = _("missing terminator in subpattern name");
 664       break;
 665     case PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME:
 666       *errcode = G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME;
 667       *errmsg = _("two named subpatterns have the same name");
 668       break;
 669     case PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY:
 670       *errcode = G_REGEX_ERROR_MALFORMED_PROPERTY;
 671       *errmsg = _("malformed \\P or \\p sequence");
 672       break;
 673     case PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY:
 674       *errcode = G_REGEX_ERROR_UNKNOWN_PROPERTY;
 675       *errmsg = _("unknown property name after \\P or \\p");
 676       break;
 677     case PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG:
 678       *errcode = G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG;
 679       *errmsg = _("subpattern name is too long (maximum 32 characters)");
 680       break;
 681     case PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS:
 682       *errcode = G_REGEX_ERROR_TOO_MANY_SUBPATTERNS;
 683       *errmsg = _("too many named subpatterns (maximum 10,000)");
 684       break;
 685     case PCRE2_ERROR_OCTAL_BYTE_TOO_BIG:
 686       *errcode = G_REGEX_ERROR_INVALID_OCTAL_VALUE;
 687       *errmsg = _("octal value is greater than \\377");
 688       break;
 689     case PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES:
 690       *errcode = G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE;
 691       *errmsg = _("DEFINE group contains more than one branch");
 692       break;
 693     case PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE:
 694       *errcode = G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS;
 695       *errmsg = _("inconsistent NEWLINE options");
 696       break;
 697     case PCRE2_ERROR_BACKSLASH_G_SYNTAX:
 698       *errcode = G_REGEX_ERROR_MISSING_BACK_REFERENCE;
 699       *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or "
 700                   "number, or by a plain number");
 701       break;
 702     case PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED:
 703       *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN;
 704       *errmsg = _("an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)");
 705       break;
 706     case PCRE2_ERROR_VERB_UNKNOWN:
 707       *errcode = G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB;
 708       *errmsg = _("(*VERB) not recognized");
 709       break;
 710     case PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG:
 711       *errcode = G_REGEX_ERROR_NUMBER_TOO_BIG;
 712       *errmsg = _("number is too big");
 713       break;
 714     case PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED:
 715       *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME;
 716       *errmsg = _("missing subpattern name after (?&");
 717       break;
 718     case PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH:
 719       *errcode = G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME;
 720       *errmsg = _("different names for subpatterns of the same number are not allowed");
 721       break;
 722     case PCRE2_ERROR_MARK_MISSING_ARGUMENT:
 723       *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED;
 724       *errmsg = _("(*MARK) must have an argument");
 725       break;
 726     case PCRE2_ERROR_BACKSLASH_C_SYNTAX:
 727       *errcode = G_REGEX_ERROR_INVALID_CONTROL_CHAR;
 728       *errmsg = _( "\\c must be followed by an ASCII character");
 729       break;
 730     case PCRE2_ERROR_BACKSLASH_K_SYNTAX:
 731       *errcode = G_REGEX_ERROR_MISSING_NAME;
 732       *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name");
 733       break;
 734     case PCRE2_ERROR_BACKSLASH_N_IN_CLASS:
 735       *errcode = G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS;
 736       *errmsg = _("\\N is not supported in a class");
 737       break;
 738     case PCRE2_ERROR_VERB_NAME_TOO_LONG:
 739       *errcode = G_REGEX_ERROR_NAME_TOO_LONG;
 740       *errmsg = _("name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)");
 741       break;
 742     case PCRE2_ERROR_INTERNAL_CODE_OVERFLOW:
 743       *errcode = G_REGEX_ERROR_INTERNAL;
 744       *errmsg = _("code overflow");
 745       break;
 746     case PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P:
 747       *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
 748       *errmsg = _("unrecognized character after (?P");
 749       break;
 750     case PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE:
 751       *errcode = G_REGEX_ERROR_INTERNAL;
 752       *errmsg = _("overran compiling workspace");
 753       break;
 754     case PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN:
 755       *errcode = G_REGEX_ERROR_INTERNAL;
 756       *errmsg = _("previously-checked referenced subpattern not found");
 757       break;
 758     case PCRE2_ERROR_HEAP_FAILED:
 759     case PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW:
 760     case PCRE2_ERROR_UNICODE_NOT_SUPPORTED:
 761     case PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT:
 762     case PCRE2_ERROR_NO_SURROGATES_IN_UTF16:
 763     case PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS:
 764     case PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE:
 765     case PCRE2_ERROR_INTERNAL_STUDY_ERROR:
 766     case PCRE2_ERROR_UTF_IS_DISABLED:
 767     case PCRE2_ERROR_UCP_IS_DISABLED:
 768     case PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS:
 769     case PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED:
 770     case PCRE2_ERROR_INTERNAL_BAD_CODE:
 771     case PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP:
 772       *errcode = G_REGEX_ERROR_INTERNAL;
 773       break;
 774     case PCRE2_ERROR_INVALID_SUBPATTERN_NAME:
 775     case PCRE2_ERROR_CLASS_INVALID_RANGE:
 776     case PCRE2_ERROR_ZERO_RELATIVE_REFERENCE:
 777     case PCRE2_ERROR_PARENTHESES_STACK_CHECK:
 778     case PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED:
 779     case PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG:
 780     case PCRE2_ERROR_MISSING_CALLOUT_CLOSING:
 781     case PCRE2_ERROR_ESCAPE_INVALID_IN_VERB:
 782     case PCRE2_ERROR_NULL_PATTERN:
 783     case PCRE2_ERROR_BAD_OPTIONS:
 784     case PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP:
 785     case PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE:
 786     case PCRE2_ERROR_INVALID_OCTAL:
 787     case PCRE2_ERROR_CALLOUT_STRING_TOO_LONG:
 788     case PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG:
 789     case PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS:
 790     case PCRE2_ERROR_VERSION_CONDITION_SYNTAX:
 791     case PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER:
 792     case PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER:
 793     case PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED:
 794     case PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP:
 795     case PCRE2_ERROR_PATTERN_TOO_COMPLICATED:
 796     case PCRE2_ERROR_LOOKBEHIND_TOO_LONG:
 797     case PCRE2_ERROR_PATTERN_STRING_TOO_LONG:
 798     case PCRE2_ERROR_BAD_LITERAL_OPTIONS:
 799     default:
 800       *errcode = G_REGEX_ERROR_COMPILE;
 801       break;
 802     }
 803
 804   g_assert (*errcode != -1);
 805 }
 806
 807 /* GMatchInfo */
 808
 809 static GMatchInfo *
 810 match_info_new (const GRegex     *regex,
 811                 const gchar      *string,
 812                 gint              string_len,
 813                 gint              start_position,
 814                 GRegexMatchFlags  match_options,
 815                 gboolean          is_dfa)
 816 {
 817   GMatchInfo *match_info;
 818
 819   if (string_len < 0)
 820     string_len = strlen (string);
 821
 822   match_info = g_new0 (GMatchInfo, 1);
 823   match_info->ref_count = 1;
 824   match_info->regex = g_regex_ref ((GRegex *)regex);
 825   match_info->string = string;
 826   match_info->string_len = string_len;
 827   match_info->matches = PCRE2_ERROR_NOMATCH;
 828   match_info->pos = start_position;
 829   match_info->match_opts =
 830     get_pcre2_match_options (match_options, regex->orig_compile_opts);
 831
 832   pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT,
 833                       &match_info->n_subpatterns);
 834
 835   match_info->match_context = pcre2_match_context_create (NULL);
 836
 837   if (is_dfa)
 838     {
 839       /* These values should be enough for most cases, if they are not
 840        * enough g_regex_match_all_full() will expand them. */
 841       match_info->n_workspace = 100;
 842       match_info->workspace = g_new (gint, match_info->n_workspace);
 843     }
 844
 845   match_info->n_offsets = 2;
 846   match_info->offsets = g_new0 (gint, match_info->n_offsets);
 847   /* Set an invalid position for the previous match. */
 848   match_info->offsets[0] = -1;
 849   match_info->offsets[1] = -1;
 850
 851   match_info->match_data = pcre2_match_data_create_from_pattern (
 852       match_info->regex->pcre_re,
 853       NULL);
 854
 855   return match_info;
 856 }
 857
 858 static gboolean
 859 recalc_match_offsets (GMatchInfo *match_info,
 860                       GError     **error)
 861 {
 862   PCRE2_SIZE *ovector;
 863   uint32_t ovector_size = 0;
 864   uint32_t pre_n_offset;
 865   uint32_t i;
 866
 867   g_assert (!IS_PCRE2_ERROR (match_info->matches));
 868
 869   if (match_info->matches == PCRE2_ERROR_PARTIAL)
 870     ovector_size = 1;
 871   else if (match_info->matches > 0)
 872     ovector_size = match_info->matches;
 873
 874   g_assert (ovector_size != 0);
 875
 876   if (pcre2_get_ovector_count (match_info->match_data) < ovector_size)
 877     {
 878       g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
 879                    _("Error while matching regular expression %s: %s"),
 880                    match_info->regex->pattern, _("code overflow"));
 881       return FALSE;
 882     }
 883
 884   pre_n_offset = match_info->n_offsets;
 885   match_info->n_offsets = ovector_size * 2;
 886   ovector = pcre2_get_ovector_pointer (match_info->match_data);
 887
 888   if (match_info->n_offsets != pre_n_offset)
 889     {
 890       match_info->offsets = g_realloc_n (match_info->offsets,
 891                                          match_info->n_offsets,
 892                                          sizeof (gint));
 893     }
 894
 895   for (i = 0; i < match_info->n_offsets; i++)
 896     {
 897       match_info->offsets[i] = (int) ovector[i];
 898     }
 899
 900   return TRUE;
 901 }
 902
 903 static JITStatus
 904 enable_jit_with_match_options (GMatchInfo  *match_info,
 905                                uint32_t  match_options)
 906 {
 907   gint retval;
 908   uint32_t old_jit_options, new_jit_options;
 909
 910   if (!(match_info->regex->orig_compile_opts & G_REGEX_OPTIMIZE))
 911     return JIT_STATUS_DISABLED;
 912
 913   if (match_info->regex->jit_status == JIT_STATUS_DISABLED)
 914     return JIT_STATUS_DISABLED;
 915
 916   if (match_options & G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS)
 917     return JIT_STATUS_DISABLED;
 918
 919   old_jit_options = match_info->regex->jit_options;
 920   new_jit_options = old_jit_options | PCRE2_JIT_COMPLETE;
 921   if (match_options & PCRE2_PARTIAL_HARD)
 922     new_jit_options |= PCRE2_JIT_PARTIAL_HARD;
 923   if (match_options & PCRE2_PARTIAL_SOFT)
 924     new_jit_options |= PCRE2_JIT_PARTIAL_SOFT;
 925
 926   /* no new options enabled */
 927   if (new_jit_options == old_jit_options)
 928     {
 929       g_assert (match_info->regex->jit_status != JIT_STATUS_DEFAULT);
 930       return match_info->regex->jit_status;
 931     }
 932
 933   retval = pcre2_jit_compile (match_info->regex->pcre_re, new_jit_options);
 934   if (retval == 0)
 935     {
 936       match_info->regex->jit_status = JIT_STATUS_ENABLED;
 937
 938       match_info->regex->jit_options = new_jit_options;
 939       /* Set min stack size for JIT to 32KiB and max to 512KiB */
 940       match_info->jit_stack = pcre2_jit_stack_create (1 << 15, 1 << 19, NULL);
 941       pcre2_jit_stack_assign (match_info->match_context, NULL, match_info->jit_stack);
 942     }
 943   else
 944     {
 945       match_info->regex->jit_status = JIT_STATUS_DISABLED;
 946
 947       switch (retval)
 948         {
 949         case PCRE2_ERROR_NOMEMORY:
 950           g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, "
 951                    "but JIT was unable to allocate executable memory for the "
 952                    "compiler. Falling back to interpretive code.");
 953           break;
 954         case PCRE2_ERROR_JIT_BADOPTION:
 955           g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, "
 956                    "but JIT support is not available. Falling back to "
 957                    "interpretive code.");
 958           break;
 959         default:
 960           g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, "
 961                    "but request for JIT support had unexpectedly failed (error %d). "
 962                    "Falling back to interpretive code.",
 963                    retval);
 964           break;
 965         }
 966     }
 967
 968   return match_info->regex->jit_status;
 969
 970   g_assert_not_reached ();
 971 }
 972
 973 /**
 974  * g_match_info_get_regex:
 975  * @match_info: a #GMatchInfo
 976  *
 977  * Returns #GRegex object used in @match_info. It belongs to Glib
 978  * and must not be freed. Use g_regex_ref() if you need to keep it
 979  * after you free @match_info object.
 980  *
 981  * Returns: (transfer none): #GRegex object used in @match_info
 982  *
 983  * Since: 2.14
 984  */
 985 GRegex *
 986 g_match_info_get_regex (const GMatchInfo *match_info)
 987 {
 988   g_return_val_if_fail (match_info != NULL, NULL);
 989   return match_info->regex;
 990 }
 991
 992 /**
 993  * g_match_info_get_string:
 994  * @match_info: a #GMatchInfo
 995  *
 996  * Returns the string searched with @match_info. This is the
 997  * string passed to g_regex_match() or g_regex_replace() so
 998  * you may not free it before calling this function.
 999  *
1000  * Returns: the string searched with @match_info
1001  *
1002  * Since: 2.14
1003  */
1004 const gchar *
1005 g_match_info_get_string (const GMatchInfo *match_info)
1006 {
1007   g_return_val_if_fail (match_info != NULL, NULL);
1008   return match_info->string;
1009 }
1010
1011 /**
1012  * g_match_info_ref:
1013  * @match_info: a #GMatchInfo
1014  *
1015  * Increases reference count of @match_info by 1.
1016  *
1017  * Returns: @match_info
1018  *
1019  * Since: 2.30
1020  */
1021 GMatchInfo       *
1022 g_match_info_ref (GMatchInfo *match_info)
1023 {
1024   g_return_val_if_fail (match_info != NULL, NULL);
1025   g_atomic_int_inc (&match_info->ref_count);
1026   return match_info;
1027 }
1028
1029 /**
1030  * g_match_info_unref:
1031  * @match_info: a #GMatchInfo
1032  *
1033  * Decreases reference count of @match_info by 1. When reference count drops
1034  * to zero, it frees all the memory associated with the match_info structure.
1035  *
1036  * Since: 2.30
1037  */
1038 void
1039 g_match_info_unref (GMatchInfo *match_info)
1040 {
1041   if (g_atomic_int_dec_and_test (&match_info->ref_count))
1042     {
1043       g_regex_unref (match_info->regex);
1044       if (match_info->match_context)
1045         pcre2_match_context_free (match_info->match_context);
1046       if (match_info->jit_stack)
1047         pcre2_jit_stack_free (match_info->jit_stack);
1048       if (match_info->match_data)
1049         pcre2_match_data_free (match_info->match_data);
1050       g_free (match_info->offsets);
1051       g_free (match_info->workspace);
1052       g_free (match_info);
1053     }
1054 }
1055
1056 /**
1057  * g_match_info_free:
1058  * @match_info: (nullable): a #GMatchInfo, or %NULL
1059  *
1060  * If @match_info is not %NULL, calls g_match_info_unref(); otherwise does
1061  * nothing.
1062  *
1063  * Since: 2.14
1064  */
1065 void
1066 g_match_info_free (GMatchInfo *match_info)
1067 {
1068   if (match_info == NULL)
1069     return;
1070
1071   g_match_info_unref (match_info);
1072 }
1073
1074 /**
1075  * g_match_info_next:
1076  * @match_info: a #GMatchInfo structure
1077  * @error: location to store the error occurring, or %NULL to ignore errors
1078  *
1079  * Scans for the next match using the same parameters of the previous
1080  * call to g_regex_match_full() or g_regex_match() that returned
1081  * @match_info.
1082  *
1083  * The match is done on the string passed to the match function, so you
1084  * cannot free it before calling this function.
1085  *
1086  * Returns: %TRUE is the string matched, %FALSE otherwise
1087  *
1088  * Since: 2.14
1089  */
1090 gboolean
1091 g_match_info_next (GMatchInfo  *match_info,
1092                    GError     **error)
1093 {
1094   JITStatus jit_status;
1095   gint prev_match_start;
1096   gint prev_match_end;
1097   uint32_t opts;
1098
1099   g_return_val_if_fail (match_info != NULL, FALSE);
1100   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
1101   g_return_val_if_fail (match_info->pos >= 0, FALSE);
1102
1103   prev_match_start = match_info->offsets[0];
1104   prev_match_end = match_info->offsets[1];
1105
1106   if (match_info->pos > match_info->string_len)
1107     {
1108       /* we have reached the end of the string */
1109       match_info->pos = -1;
1110       match_info->matches = PCRE2_ERROR_NOMATCH;
1111       return FALSE;
1112     }
1113
1114   opts = match_info->regex->match_opts | match_info->match_opts;
1115
1116   jit_status = enable_jit_with_match_options (match_info, opts);
1117   if (jit_status == JIT_STATUS_ENABLED)
1118     {
1119       match_info->matches = pcre2_jit_match (match_info->regex->pcre_re,
1120                                              (PCRE2_SPTR8) match_info->string,
1121                                              match_info->string_len,
1122                                              match_info->pos,
1123                                              opts,
1124                                              match_info->match_data,
1125                                              match_info->match_context);
1126       /* if the JIT stack limit was reached, fall back to non-JIT matching in
1127        * the next conditional statement */
1128       if (match_info->matches == PCRE2_ERROR_JIT_STACKLIMIT)
1129         {
1130           g_debug ("PCRE2 JIT stack limit reached, falling back to "
1131                    "non-optimized matching.");
1132           opts |= PCRE2_NO_JIT;
1133           jit_status = JIT_STATUS_DISABLED;
1134         }
1135     }
1136
1137   if (jit_status != JIT_STATUS_ENABLED)
1138     {
1139       match_info->matches = pcre2_match (match_info->regex->pcre_re,
1140                                          (PCRE2_SPTR8) match_info->string,
1141                                          match_info->string_len,
1142                                          match_info->pos,
1143                                          opts,
1144                                          match_info->match_data,
1145                                          match_info->match_context);
1146     }
1147
1148   if (IS_PCRE2_ERROR (match_info->matches))
1149     {
1150       gchar *error_msg = get_match_error_message (match_info->matches);
1151
1152       g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
1153                    _("Error while matching regular expression %s: %s"),
1154                    match_info->regex->pattern, error_msg);
1155       g_clear_pointer (&error_msg, g_free);
1156       return FALSE;
1157     }
1158   else if (match_info->matches == 0)
1159     {
1160       /* info->offsets is too small. */
1161       match_info->n_offsets *= 2;
1162       match_info->offsets = g_realloc_n (match_info->offsets,
1163                                          match_info->n_offsets,
1164                                          sizeof (gint));
1165
1166       pcre2_match_data_free (match_info->match_data);
1167       match_info->match_data = pcre2_match_data_create (match_info->n_offsets, NULL);
1168
1169       return g_match_info_next (match_info, error);
1170     }
1171   else if (match_info->matches == PCRE2_ERROR_NOMATCH)
1172     {
1173       /* We're done with this match info */
1174       match_info->pos = -1;
1175       return FALSE;
1176     }
1177   else
1178     if (!recalc_match_offsets (match_info, error))
1179       return FALSE;
1180
1181   /* avoid infinite loops if the pattern is an empty string or something
1182    * equivalent */
1183   if (match_info->pos == match_info->offsets[1])
1184     {
1185       if (match_info->pos > match_info->string_len)
1186         {
1187           /* we have reached the end of the string */
1188           match_info->pos = -1;
1189           match_info->matches = PCRE2_ERROR_NOMATCH;
1190           return FALSE;
1191         }
1192
1193       match_info->pos = NEXT_CHAR (match_info->regex,
1194                                    &match_info->string[match_info->pos]) -
1195                                    match_info->string;
1196     }
1197   else
1198     {
1199       match_info->pos = match_info->offsets[1];
1200     }
1201
1202   g_assert (match_info->matches < 0 ||
1203             (uint32_t) match_info->matches <= match_info->n_subpatterns + 1);
1204
1205   /* it's possible to get two identical matches when we are matching
1206    * empty strings, for instance if the pattern is "(?=[A-Z0-9])" and
1207    * the string is "RegExTest" we have:
1208    *  - search at position 0: match from 0 to 0
1209    *  - search at position 1: match from 3 to 3
1210    *  - search at position 3: match from 3 to 3 (duplicate)
1211    *  - search at position 4: match from 5 to 5
1212    *  - search at position 5: match from 5 to 5 (duplicate)
1213    *  - search at position 6: no match -> stop
1214    * so we have to ignore the duplicates.
1215    * see bug #515944: http://bugzilla.gnome.org/show_bug.cgi?id=515944 */
1216   if (match_info->matches >= 0 &&
1217       prev_match_start == match_info->offsets[0] &&
1218       prev_match_end == match_info->offsets[1])
1219     {
1220       /* ignore this match and search the next one */
1221       return g_match_info_next (match_info, error);
1222     }
1223
1224   return match_info->matches >= 0;
1225 }
1226
1227 /**
1228  * g_match_info_matches:
1229  * @match_info: a #GMatchInfo structure
1230  *
1231  * Returns whether the previous match operation succeeded.
1232  *
1233  * Returns: %TRUE if the previous match operation succeeded,
1234  *   %FALSE otherwise
1235  *
1236  * Since: 2.14
1237  */
1238 gboolean
1239 g_match_info_matches (const GMatchInfo *match_info)
1240 {
1241   g_return_val_if_fail (match_info != NULL, FALSE);
1242
1243   return match_info->matches >= 0;
1244 }
1245
1246 /**
1247  * g_match_info_get_match_count:
1248  * @match_info: a #GMatchInfo structure
1249  *
1250  * Retrieves the number of matched substrings (including substring 0,
1251  * that is the whole matched text), so 1 is returned if the pattern
1252  * has no substrings in it and 0 is returned if the match failed.
1253  *
1254  * If the last match was obtained using the DFA algorithm, that is
1255  * using g_regex_match_all() or g_regex_match_all_full(), the retrieved
1256  * count is not that of the number of capturing parentheses but that of
1257  * the number of matched substrings.
1258  *
1259  * Returns: Number of matched substrings, or -1 if an error occurred
1260  *
1261  * Since: 2.14
1262  */
1263 gint
1264 g_match_info_get_match_count (const GMatchInfo *match_info)
1265 {
1266   g_return_val_if_fail (match_info, -1);
1267
1268   if (match_info->matches == PCRE2_ERROR_NOMATCH)
1269     /* no match */
1270     return 0;
1271   else if (match_info->matches < PCRE2_ERROR_NOMATCH)
1272     /* error */
1273     return -1;
1274   else
1275     /* match */
1276     return match_info->matches;
1277 }
1278
1279 /**
1280  * g_match_info_is_partial_match:
1281  * @match_info: a #GMatchInfo structure
1282  *
1283  * Usually if the string passed to g_regex_match*() matches as far as
1284  * it goes, but is too short to match the entire pattern, %FALSE is
1285  * returned. There are circumstances where it might be helpful to
1286  * distinguish this case from other cases in which there is no match.
1287  *
1288  * Consider, for example, an application where a human is required to
1289  * type in data for a field with specific formatting requirements. An
1290  * example might be a date in the form ddmmmyy, defined by the pattern
1291  * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$".
1292  * If the application sees the user’s keystrokes one by one, and can
1293  * check that what has been typed so far is potentially valid, it is
1294  * able to raise an error as soon as a mistake is made.
1295  *
1296  * GRegex supports the concept of partial matching by means of the
1297  * %G_REGEX_MATCH_PARTIAL_SOFT and %G_REGEX_MATCH_PARTIAL_HARD flags.
1298  * When they are used, the return code for
1299  * g_regex_match() or g_regex_match_full() is, as usual, %TRUE
1300  * for a complete match, %FALSE otherwise. But, when these functions
1301  * return %FALSE, you can check if the match was partial calling
1302  * g_match_info_is_partial_match().
1303  *
1304  * The difference between %G_REGEX_MATCH_PARTIAL_SOFT and
1305  * %G_REGEX_MATCH_PARTIAL_HARD is that when a partial match is encountered
1306  * with %G_REGEX_MATCH_PARTIAL_SOFT, matching continues to search for a
1307  * possible complete match, while with %G_REGEX_MATCH_PARTIAL_HARD matching
1308  * stops at the partial match.
1309  * When both %G_REGEX_MATCH_PARTIAL_SOFT and %G_REGEX_MATCH_PARTIAL_HARD
1310  * are set, the latter takes precedence.
1311  *
1312  * There were formerly some restrictions on the pattern for partial matching.
1313  * The restrictions no longer apply.
1314  *
1315  * See pcrepartial(3) for more information on partial matching.
1316  *
1317  * Returns: %TRUE if the match was partial, %FALSE otherwise
1318  *
1319  * Since: 2.14
1320  */
1321 gboolean
1322 g_match_info_is_partial_match (const GMatchInfo *match_info)
1323 {
1324   g_return_val_if_fail (match_info != NULL, FALSE);
1325
1326   return match_info->matches == PCRE2_ERROR_PARTIAL;
1327 }
1328
1329 /**
1330  * g_match_info_expand_references:
1331  * @match_info: (nullable): a #GMatchInfo or %NULL
1332  * @string_to_expand: the string to expand
1333  * @error: location to store the error occurring, or %NULL to ignore errors
1334  *
1335  * Returns a new string containing the text in @string_to_expand with
1336  * references and escape sequences expanded. References refer to the last
1337  * match done with @string against @regex and have the same syntax used by
1338  * g_regex_replace().
1339  *
1340  * The @string_to_expand must be UTF-8 encoded even if %G_REGEX_RAW was
1341  * passed to g_regex_new().
1342  *
1343  * The backreferences are extracted from the string passed to the match
1344  * function, so you cannot call this function after freeing the string.
1345  *
1346  * @match_info may be %NULL in which case @string_to_expand must not
1347  * contain references. For instance "foo\n" does not refer to an actual
1348  * pattern and '\n' merely will be replaced with \n character,
1349  * while to expand "\0" (whole match) one needs the result of a match.
1350  * Use g_regex_check_replacement() to find out whether @string_to_expand
1351  * contains references.
1352  *
1353  * Returns: (nullable): the expanded string, or %NULL if an error occurred
1354  *
1355  * Since: 2.14
1356  */
1357 gchar *
1358 g_match_info_expand_references (const GMatchInfo  *match_info,
1359                                 const gchar       *string_to_expand,
1360                                 GError           **error)
1361 {
1362   GString *result;
1363   GList *list;
1364   GError *tmp_error = NULL;
1365
1366   g_return_val_if_fail (string_to_expand != NULL, NULL);
1367   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
1368
1369   list = split_replacement (string_to_expand, &tmp_error);
1370   if (tmp_error != NULL)
1371     {
1372       g_propagate_error (error, tmp_error);
1373       return NULL;
1374     }
1375
1376   if (!match_info && interpolation_list_needs_match (list))
1377     {
1378       g_critical ("String '%s' contains references to the match, can't "
1379                   "expand references without GMatchInfo object",
1380                   string_to_expand);
1381       return NULL;
1382     }
1383
1384   result = g_string_sized_new (strlen (string_to_expand));
1385   interpolate_replacement (match_info, result, list);
1386
1387   g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
1388
1389   return g_string_free (result, FALSE);
1390 }
1391
1392 /**
1393  * g_match_info_fetch:
1394  * @match_info: #GMatchInfo structure
1395  * @match_num: number of the sub expression
1396  *
1397  * Retrieves the text matching the @match_num'th capturing
1398  * parentheses. 0 is the full text of the match, 1 is the first paren
1399  * set, 2 the second, and so on.
1400  *
1401  * If @match_num is a valid sub pattern but it didn't match anything
1402  * (e.g. sub pattern 1, matching "b" against "(a)?b") then an empty
1403  * string is returned.
1404  *
1405  * If the match was obtained using the DFA algorithm, that is using
1406  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1407  * string is not that of a set of parentheses but that of a matched
1408  * substring. Substrings are matched in reverse order of length, so
1409  * 0 is the longest match.
1410  *
1411  * The string is fetched from the string passed to the match function,
1412  * so you cannot call this function after freeing the string.
1413  *
1414  * Returns: (nullable): The matched substring, or %NULL if an error
1415  *     occurred. You have to free the string yourself
1416  *
1417  * Since: 2.14
1418  */
1419 gchar *
1420 g_match_info_fetch (const GMatchInfo *match_info,
1421                     gint              match_num)
1422 {
1423   gchar *match = NULL;
1424   gint start, end;
1425
1426   g_return_val_if_fail (match_info != NULL, NULL);
1427   g_return_val_if_fail (match_num >= 0, NULL);
1428
1429   /* match_num does not exist or it didn't matched, i.e. matching "b"
1430    * against "(a)?b" then group 0 is empty. */
1431   if (!g_match_info_fetch_pos (match_info, match_num, &start, &end))
1432     match = NULL;
1433   else if (start == -1)
1434     match = g_strdup ("");
1435   else
1436     match = g_strndup (&match_info->string[start], end - start);
1437
1438   return match;
1439 }
1440
1441 /**
1442  * g_match_info_fetch_pos:
1443  * @match_info: #GMatchInfo structure
1444  * @match_num: number of the sub expression
1445  * @start_pos: (out) (optional): pointer to location where to store
1446  *     the start position, or %NULL
1447  * @end_pos: (out) (optional): pointer to location where to store
1448  *     the end position, or %NULL
1449  *
1450  * Retrieves the position in bytes of the @match_num'th capturing
1451  * parentheses. 0 is the full text of the match, 1 is the first
1452  * paren set, 2 the second, and so on.
1453  *
1454  * If @match_num is a valid sub pattern but it didn't match anything
1455  * (e.g. sub pattern 1, matching "b" against "(a)?b") then @start_pos
1456  * and @end_pos are set to -1 and %TRUE is returned.
1457  *
1458  * If the match was obtained using the DFA algorithm, that is using
1459  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1460  * position is not that of a set of parentheses but that of a matched
1461  * substring. Substrings are matched in reverse order of length, so
1462  * 0 is the longest match.
1463  *
1464  * Returns: %TRUE if the position was fetched, %FALSE otherwise. If
1465  *   the position cannot be fetched, @start_pos and @end_pos are left
1466  *   unchanged
1467  *
1468  * Since: 2.14
1469  */
1470 gboolean
1471 g_match_info_fetch_pos (const GMatchInfo *match_info,
1472                         gint              match_num,
1473                         gint             *start_pos,
1474                         gint             *end_pos)
1475 {
1476   g_return_val_if_fail (match_info != NULL, FALSE);
1477   g_return_val_if_fail (match_num >= 0, FALSE);
1478
1479   /* check whether there was an error */
1480   if (match_info->matches < 0)
1481     return FALSE;
1482
1483   /* make sure the sub expression number they're requesting is less than
1484    * the total number of sub expressions in the regex. When matching all
1485    * (g_regex_match_all()), also compare against the number of matches */
1486   if ((uint32_t) match_num >= MAX (match_info->n_subpatterns + 1, (uint32_t) match_info->matches))
1487     return FALSE;
1488
1489   if (start_pos != NULL)
1490     *start_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num] : -1;
1491
1492   if (end_pos != NULL)
1493     *end_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num + 1] : -1;
1494
1495   return TRUE;
1496 }
1497
1498 /*
1499  * Returns number of first matched subpattern with name @name.
1500  * There may be more than one in case when DUPNAMES is used,
1501  * and not all subpatterns with that name match;
1502  * pcre2_substring_number_from_name() does not work in that case.
1503  */
1504 static gint
1505 get_matched_substring_number (const GMatchInfo *match_info,
1506                               const gchar      *name)
1507 {
1508   gint entrysize;
1509   PCRE2_SPTR first, last;
1510   guchar *entry;
1511
1512   if (!(match_info->regex->compile_opts & PCRE2_DUPNAMES))
1513     return pcre2_substring_number_from_name (match_info->regex->pcre_re, (PCRE2_SPTR8) name);
1514
1515   /* This code is analogous to code from pcre2_substring.c:
1516    * pcre2_substring_get_byname() */
1517   entrysize = pcre2_substring_nametable_scan (match_info->regex->pcre_re,
1518                                               (PCRE2_SPTR8) name,
1519                                               &first,
1520                                               &last);
1521
1522   if (entrysize <= 0)
1523     return entrysize;
1524
1525   for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize)
1526     {
1527       guint n = (entry[0] << 8) + entry[1];
1528       if (n * 2 < match_info->n_offsets && match_info->offsets[n * 2] >= 0)
1529         return n;
1530     }
1531
1532   return (first[0] << 8) + first[1];
1533 }
1534
1535 /**
1536  * g_match_info_fetch_named:
1537  * @match_info: #GMatchInfo structure
1538  * @name: name of the subexpression
1539  *
1540  * Retrieves the text matching the capturing parentheses named @name.
1541  *
1542  * If @name is a valid sub pattern name but it didn't match anything
1543  * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b")
1544  * then an empty string is returned.
1545  *
1546  * The string is fetched from the string passed to the match function,
1547  * so you cannot call this function after freeing the string.
1548  *
1549  * Returns: (nullable): The matched substring, or %NULL if an error
1550  *     occurred. You have to free the string yourself
1551  *
1552  * Since: 2.14
1553  */
1554 gchar *
1555 g_match_info_fetch_named (const GMatchInfo *match_info,
1556                           const gchar      *name)
1557 {
1558   gint num;
1559
1560   g_return_val_if_fail (match_info != NULL, NULL);
1561   g_return_val_if_fail (name != NULL, NULL);
1562
1563   num = get_matched_substring_number (match_info, name);
1564   if (num < 0)
1565     return NULL;
1566   else
1567     return g_match_info_fetch (match_info, num);
1568 }
1569
1570 /**
1571  * g_match_info_fetch_named_pos:
1572  * @match_info: #GMatchInfo structure
1573  * @name: name of the subexpression
1574  * @start_pos: (out) (optional): pointer to location where to store
1575  *     the start position, or %NULL
1576  * @end_pos: (out) (optional): pointer to location where to store
1577  *     the end position, or %NULL
1578  *
1579  * Retrieves the position in bytes of the capturing parentheses named @name.
1580  *
1581  * If @name is a valid sub pattern name but it didn't match anything
1582  * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b")
1583  * then @start_pos and @end_pos are set to -1 and %TRUE is returned.
1584  *
1585  * Returns: %TRUE if the position was fetched, %FALSE otherwise.
1586  *     If the position cannot be fetched, @start_pos and @end_pos
1587  *     are left unchanged.
1588  *
1589  * Since: 2.14
1590  */
1591 gboolean
1592 g_match_info_fetch_named_pos (const GMatchInfo *match_info,
1593                               const gchar      *name,
1594                               gint             *start_pos,
1595                               gint             *end_pos)
1596 {
1597   gint num;
1598
1599   g_return_val_if_fail (match_info != NULL, FALSE);
1600   g_return_val_if_fail (name != NULL, FALSE);
1601
1602   num = get_matched_substring_number (match_info, name);
1603   if (num < 0)
1604     return FALSE;
1605
1606   return g_match_info_fetch_pos (match_info, num, start_pos, end_pos);
1607 }
1608
1609 /**
1610  * g_match_info_fetch_all:
1611  * @match_info: a #GMatchInfo structure
1612  *
1613  * Bundles up pointers to each of the matching substrings from a match
1614  * and stores them in an array of gchar pointers. The first element in
1615  * the returned array is the match number 0, i.e. the entire matched
1616  * text.
1617  *
1618  * If a sub pattern didn't match anything (e.g. sub pattern 1, matching
1619  * "b" against "(a)?b") then an empty string is inserted.
1620  *
1621  * If the last match was obtained using the DFA algorithm, that is using
1622  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1623  * strings are not that matched by sets of parentheses but that of the
1624  * matched substring. Substrings are matched in reverse order of length,
1625  * so the first one is the longest match.
1626  *
1627  * The strings are fetched from the string passed to the match function,
1628  * so you cannot call this function after freeing the string.
1629  *
1630  * Returns: (transfer full): a %NULL-terminated array of gchar *
1631  *     pointers.  It must be freed using g_strfreev(). If the previous
1632  *     match failed %NULL is returned
1633  *
1634  * Since: 2.14
1635  */
1636 gchar **
1637 g_match_info_fetch_all (const GMatchInfo *match_info)
1638 {
1639   gchar **result;
1640   gint i;
1641
1642   g_return_val_if_fail (match_info != NULL, NULL);
1643
1644   if (match_info->matches < 0)
1645     return NULL;
1646
1647   result = g_new (gchar *, match_info->matches + 1);
1648   for (i = 0; i < match_info->matches; i++)
1649     result[i] = g_match_info_fetch (match_info, i);
1650   result[i] = NULL;
1651
1652   return result;
1653 }
1654
1655
1656 /* GRegex */
1657
/* Defines the g_regex_error_quark() function, which returns the quark
 * for the string "g-regex-error-quark" (the GRegex error domain). */
G_DEFINE_QUARK (g-regex-error-quark, g_regex_error)
1659
1660 /**
1661  * g_regex_ref:
1662  * @regex: a #GRegex
1663  *
1664  * Increases reference count of @regex by 1.
1665  *
1666  * Returns: @regex
1667  *
1668  * Since: 2.14
1669  */
1670 GRegex *
1671 g_regex_ref (GRegex *regex)
1672 {
1673   g_return_val_if_fail (regex != NULL, NULL);
1674   g_atomic_int_inc (&regex->ref_count);
1675   return regex;
1676 }
1677
1678 /**
1679  * g_regex_unref:
1680  * @regex: a #GRegex
1681  *
1682  * Decreases reference count of @regex by 1. When reference count drops
1683  * to zero, it frees all the memory associated with the regex structure.
1684  *
1685  * Since: 2.14
1686  */
1687 void
1688 g_regex_unref (GRegex *regex)
1689 {
1690   g_return_if_fail (regex != NULL);
1691
1692   if (g_atomic_int_dec_and_test (&regex->ref_count))
1693     {
1694       g_free (regex->pattern);
1695       if (regex->pcre_re != NULL)
1696         pcre2_code_free (regex->pcre_re);
1697       g_free (regex);
1698     }
1699 }
1700
/* Forward declarations: both helpers are defined after g_regex_new(),
 * which calls them. */
static pcre2_code * regex_compile (const gchar  *pattern,
                                   uint32_t      compile_options,
                                   uint32_t      newline_options,
                                   uint32_t      bsr_options,
                                   GError      **error);

static uint32_t get_pcre2_inline_compile_options (pcre2_code *re,
                                                  uint32_t    compile_options);
1709
1710 /**
1711  * g_regex_new:
1712  * @pattern: the regular expression
1713  * @compile_options: compile options for the regular expression, or 0
1714  * @match_options: match options for the regular expression, or 0
1715  * @error: return location for a #GError
1716  *
1717  * Compiles the regular expression to an internal form, and does
1718  * the initial setup of the #GRegex structure.
1719  *
1720  * Returns: (nullable): a #GRegex structure or %NULL if an error occurred. Call
1721  *   g_regex_unref() when you are done with it
1722  *
1723  * Since: 2.14
1724  */
1725 GRegex *
1726 g_regex_new (const gchar         *pattern,
1727              GRegexCompileFlags   compile_options,
1728              GRegexMatchFlags     match_options,
1729              GError             **error)
1730 {
1731   GRegex *regex;
1732   pcre2_code *re;
1733   static gsize initialised = 0;
1734   uint32_t pcre_compile_options;
1735   uint32_t pcre_match_options;
1736   uint32_t newline_options;
1737   uint32_t bsr_options;
1738
1739   g_return_val_if_fail (pattern != NULL, NULL);
1740   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
1741 G_GNUC_BEGIN_IGNORE_DEPRECATIONS
1742   g_return_val_if_fail ((compile_options & ~(G_REGEX_COMPILE_MASK |
1743                                              G_REGEX_JAVASCRIPT_COMPAT)) == 0, NULL);
1744 G_GNUC_END_IGNORE_DEPRECATIONS
1745   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
1746
1747   if (g_once_init_enter (&initialised))
1748     {
1749       int supports_utf8;
1750
1751       pcre2_config (PCRE2_CONFIG_UNICODE, &supports_utf8);
1752       if (!supports_utf8)
1753         g_critical (_("PCRE library is compiled without UTF8 support"));
1754
1755       g_once_init_leave (&initialised, supports_utf8 ? 1 : 2);
1756     }
1757
1758   if (G_UNLIKELY (initialised != 1))
1759     {
1760       g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE,
1761                            _("PCRE library is compiled with incompatible options"));
1762       return NULL;
1763     }
1764
1765   pcre_compile_options = get_pcre2_compile_options (compile_options);
1766   pcre_match_options = get_pcre2_match_options (match_options, compile_options);
1767
1768   newline_options = get_pcre2_newline_match_options (match_options);
1769   if (newline_options == 0)
1770     newline_options = get_pcre2_newline_compile_options (compile_options);
1771
1772   if (newline_options == 0)
1773     {
1774       g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS,
1775                    "Invalid newline flags");
1776       return NULL;
1777     }
1778
1779   bsr_options = get_pcre2_bsr_match_options (match_options);
1780   if (!bsr_options)
1781     bsr_options = get_pcre2_bsr_compile_options (compile_options);
1782
1783   re = regex_compile (pattern, pcre_compile_options,
1784                       newline_options, bsr_options, error);
1785   if (re == NULL)
1786     return NULL;
1787
1788   pcre_compile_options |=
1789     get_pcre2_inline_compile_options (re, pcre_compile_options);
1790
1791   regex = g_new0 (GRegex, 1);
1792   regex->ref_count = 1;
1793   regex->pattern = g_strdup (pattern);
1794   regex->pcre_re = re;
1795   regex->compile_opts = pcre_compile_options;
1796   regex->orig_compile_opts = compile_options;
1797   regex->match_opts = pcre_match_options;
1798   regex->orig_match_opts = match_options;
1799
1800   return regex;
1801 }
1802
1803 static pcre2_code *
1804 regex_compile (const gchar  *pattern,
1805                uint32_t      compile_options,
1806                uint32_t      newline_options,
1807                uint32_t      bsr_options,
1808                GError      **error)
1809 {
1810   pcre2_code *re;
1811   pcre2_compile_context *context;
1812   const gchar *errmsg;
1813   PCRE2_SIZE erroffset;
1814   gint errcode;
1815
1816   context = pcre2_compile_context_create (NULL);
1817
1818   /* set newline options */
1819   if (pcre2_set_newline (context, newline_options) != 0)
1820     {
1821       g_set_error (error, G_REGEX_ERROR,
1822                    G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS,
1823                    "Invalid newline flags");
1824       pcre2_compile_context_free (context);
1825       return NULL;
1826     }
1827
1828   /* set bsr options */
1829   if (pcre2_set_bsr (context, bsr_options) != 0)
1830     {
1831       g_set_error (error, G_REGEX_ERROR,
1832                    G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS,
1833                    "Invalid BSR flags");
1834       pcre2_compile_context_free (context);
1835       return NULL;
1836     }
1837
1838   /* In case UTF-8 mode is used, also set PCRE2_NO_UTF_CHECK */
1839   if (compile_options & PCRE2_UTF)
1840     compile_options |= PCRE2_NO_UTF_CHECK;
1841
1842   compile_options |= PCRE2_UCP;
1843
1844   /* compile the pattern */
1845   re = pcre2_compile ((PCRE2_SPTR8) pattern,
1846                       PCRE2_ZERO_TERMINATED,
1847                       compile_options,
1848                       &errcode,
1849                       &erroffset,
1850                       context);
1851   pcre2_compile_context_free (context);
1852
1853   /* if the compilation failed, set the error member and return
1854    * immediately */
1855   if (re == NULL)
1856     {
1857       GError *tmp_error;
1858       gchar *offset_str;
1859       gchar *pcre2_errmsg = NULL;
1860       int original_errcode;
1861
1862       /* Translate the PCRE error code to GRegexError and use a translated
1863        * error message if possible */
1864       original_errcode = errcode;
1865       translate_compile_error (&errcode, &errmsg);
1866
1867       if (!errmsg)
1868         {
1869           errmsg = _("unknown error");
1870           pcre2_errmsg = get_pcre2_error_string (original_errcode);
1871         }
1872
1873       /* PCRE uses byte offsets but we want to show character offsets */
1874       erroffset = g_utf8_pointer_to_offset (pattern, &pattern[erroffset]);
1875
1876       offset_str = g_strdup_printf ("%" G_GSIZE_FORMAT, erroffset);
1877       tmp_error = g_error_new (G_REGEX_ERROR, errcode,
1878                                _("Error while compiling regular expression ‘%s’ "
1879                                  "at char %s: %s"),
1880                                pattern, offset_str,
1881                                pcre2_errmsg ? pcre2_errmsg : errmsg);
1882       g_propagate_error (error, tmp_error);
1883       g_free (offset_str);
1884       g_clear_pointer (&pcre2_errmsg, g_free);
1885
1886       return NULL;
1887     }
1888
1889   return re;
1890 }
1891
1892 static uint32_t
1893 get_pcre2_inline_compile_options (pcre2_code *re,
1894                                   uint32_t    compile_options)
1895 {
1896   uint32_t pcre_compile_options;
1897   uint32_t nonpcre_compile_options;
1898
1899   /* For options set at the beginning of the pattern, pcre puts them into
1900    * compile options, e.g. "(?i)foo" will make the pcre structure store
1901    * PCRE2_CASELESS even though it wasn't explicitly given for compilation. */
1902   nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK;
1903   pcre2_pattern_info (re, PCRE2_INFO_ALLOPTIONS, &pcre_compile_options);
1904   compile_options = pcre_compile_options & G_REGEX_PCRE2_COMPILE_MASK;
1905   compile_options |= nonpcre_compile_options;
1906
1907   if (!(compile_options & PCRE2_DUPNAMES))
1908     {
1909       uint32_t jchanged = 0;
1910       pcre2_pattern_info (re, PCRE2_INFO_JCHANGED, &jchanged);
1911       if (jchanged)
1912         compile_options |= PCRE2_DUPNAMES;
1913     }
1914
1915   return compile_options;
1916 }
1917
1918 /**
1919  * g_regex_get_pattern:
1920  * @regex: a #GRegex structure
1921  *
1922  * Gets the pattern string associated with @regex, i.e. a copy of
1923  * the string passed to g_regex_new().
1924  *
1925  * Returns: the pattern of @regex
1926  *
1927  * Since: 2.14
1928  */
1929 const gchar *
1930 g_regex_get_pattern (const GRegex *regex)
1931 {
1932   g_return_val_if_fail (regex != NULL, NULL);
1933
1934   return regex->pattern;
1935 }
1936
1937 /**
1938  * g_regex_get_max_backref:
1939  * @regex: a #GRegex
1940  *
1941  * Returns the number of the highest back reference
1942  * in the pattern, or 0 if the pattern does not contain
1943  * back references.
1944  *
1945  * Returns: the number of the highest back reference
1946  *
1947  * Since: 2.14
1948  */
1949 gint
1950 g_regex_get_max_backref (const GRegex *regex)
1951 {
1952   uint32_t value;
1953
1954   pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BACKREFMAX, &value);
1955
1956   return value;
1957 }
1958
1959 /**
1960  * g_regex_get_capture_count:
1961  * @regex: a #GRegex
1962  *
1963  * Returns the number of capturing subpatterns in the pattern.
1964  *
1965  * Returns: the number of capturing subpatterns
1966  *
1967  * Since: 2.14
1968  */
1969 gint
1970 g_regex_get_capture_count (const GRegex *regex)
1971 {
1972   uint32_t value;
1973
1974   pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, &value);
1975
1976   return value;
1977 }
1978
1979 /**
1980  * g_regex_get_has_cr_or_lf:
1981  * @regex: a #GRegex structure
1982  *
1983  * Checks whether the pattern contains explicit CR or LF references.
1984  *
1985  * Returns: %TRUE if the pattern contains explicit CR or LF references
1986  *
1987  * Since: 2.34
1988  */
1989 gboolean
1990 g_regex_get_has_cr_or_lf (const GRegex *regex)
1991 {
1992   uint32_t value;
1993
1994   pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_HASCRORLF, &value);
1995
1996   return !!value;
1997 }
1998
1999 /**
2000  * g_regex_get_max_lookbehind:
2001  * @regex: a #GRegex structure
2002  *
2003  * Gets the number of characters in the longest lookbehind assertion in the
2004  * pattern. This information is useful when doing multi-segment matching using
2005  * the partial matching facilities.
2006  *
2007  * Returns: the number of characters in the longest lookbehind assertion.
2008  *
2009  * Since: 2.38
2010  */
2011 gint
2012 g_regex_get_max_lookbehind (const GRegex *regex)
2013 {
2014   uint32_t max_lookbehind;
2015
2016   pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_MAXLOOKBEHIND,
2017                       &max_lookbehind);
2018
2019   return max_lookbehind;
2020 }
2021
2022 /**
2023  * g_regex_get_compile_flags:
2024  * @regex: a #GRegex
2025  *
2026  * Returns the compile options that @regex was created with.
2027  *
2028  * Depending on the version of PCRE that is used, this may or may not
2029  * include flags set by option expressions such as `(?i)` found at the
2030  * top-level within the compiled pattern.
2031  *
2032  * Returns: flags from #GRegexCompileFlags
2033  *
2034  * Since: 2.26
2035  */
2036 GRegexCompileFlags
2037 g_regex_get_compile_flags (const GRegex *regex)
2038 {
2039   GRegexCompileFlags extra_flags;
2040   uint32_t info_value;
2041
2042   g_return_val_if_fail (regex != NULL, 0);
2043
2044   /* Preserve original G_REGEX_OPTIMIZE */
2045   extra_flags = (regex->orig_compile_opts & G_REGEX_OPTIMIZE);
2046
2047   /* Also include the newline options */
2048   pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_NEWLINE, &info_value);
2049   switch (info_value)
2050     {
2051     case PCRE2_NEWLINE_ANYCRLF:
2052       extra_flags |= G_REGEX_NEWLINE_ANYCRLF;
2053       break;
2054     case PCRE2_NEWLINE_CRLF:
2055       extra_flags |= G_REGEX_NEWLINE_CRLF;
2056       break;
2057     case PCRE2_NEWLINE_LF:
2058       extra_flags |= G_REGEX_NEWLINE_LF;
2059       break;
2060     case PCRE2_NEWLINE_CR:
2061       extra_flags |= G_REGEX_NEWLINE_CR;
2062       break;
2063     default:
2064       break;
2065     }
2066
2067   /* Also include the bsr options */
2068   pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BSR, &info_value);
2069   switch (info_value)
2070     {
2071     case PCRE2_BSR_ANYCRLF:
2072       extra_flags |= G_REGEX_BSR_ANYCRLF;
2073       break;
2074     default:
2075       break;
2076     }
2077
2078   return g_regex_compile_flags_from_pcre2 (regex->compile_opts) | extra_flags;
2079 }
2080
2081 /**
2082  * g_regex_get_match_flags:
2083  * @regex: a #GRegex
2084  *
2085  * Returns the match options that @regex was created with.
2086  *
2087  * Returns: flags from #GRegexMatchFlags
2088  *
2089  * Since: 2.26
2090  */
2091 GRegexMatchFlags
2092 g_regex_get_match_flags (const GRegex *regex)
2093 {
2094   uint32_t flags;
2095
2096   g_return_val_if_fail (regex != NULL, 0);
2097
2098   flags = g_regex_match_flags_from_pcre2 (regex->match_opts);
2099   flags |= (regex->orig_match_opts & G_REGEX_MATCH_NEWLINE_MASK);
2100   flags |= (regex->orig_match_opts & (G_REGEX_MATCH_BSR_ANY | G_REGEX_MATCH_BSR_ANYCRLF));
2101
2102   return flags;
2103 }
2104
2105 /**
2106  * g_regex_match_simple:
2107  * @pattern: the regular expression
2108  * @string: the string to scan for matches
2109  * @compile_options: compile options for the regular expression, or 0
2110  * @match_options: match options, or 0
2111  *
2112  * Scans for a match in @string for @pattern.
2113  *
2114  * This function is equivalent to g_regex_match() but it does not
2115  * require to compile the pattern with g_regex_new(), avoiding some
2116  * lines of code when you need just to do a match without extracting
2117  * substrings, capture counts, and so on.
2118  *
2119  * If this function is to be called on the same @pattern more than
2120  * once, it's more efficient to compile the pattern once with
2121  * g_regex_new() and then use g_regex_match().
2122  *
2123  * Returns: %TRUE if the string matched, %FALSE otherwise
2124  *
2125  * Since: 2.14
2126  */
2127 gboolean
2128 g_regex_match_simple (const gchar        *pattern,
2129                       const gchar        *string,
2130                       GRegexCompileFlags  compile_options,
2131                       GRegexMatchFlags    match_options)
2132 {
2133   GRegex *regex;
2134   gboolean result;
2135
2136   regex = g_regex_new (pattern, compile_options, G_REGEX_MATCH_DEFAULT, NULL);
2137   if (!regex)
2138     return FALSE;
2139   result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL);
2140   g_regex_unref (regex);
2141   return result;
2142 }
2143
2144 /**
2145  * g_regex_match:
2146  * @regex: a #GRegex structure from g_regex_new()
2147  * @string: the string to scan for matches
2148  * @match_options: match options
2149  * @match_info: (out) (optional): pointer to location where to store
2150  *     the #GMatchInfo, or %NULL if you do not need it
2151  *
2152  * Scans for a match in @string for the pattern in @regex.
2153  * The @match_options are combined with the match options specified
2154  * when the @regex structure was created, letting you have more
2155  * flexibility in reusing #GRegex structures.
2156  *
2157  * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8.
2158  *
2159  * A #GMatchInfo structure, used to get information on the match,
2160  * is stored in @match_info if not %NULL. Note that if @match_info
2161  * is not %NULL then it is created even if the function returns %FALSE,
2162  * i.e. you must free it regardless if regular expression actually matched.
2163  *
2164  * To retrieve all the non-overlapping matches of the pattern in
2165  * string you can use g_match_info_next().
2166  *
2167  * |[<!-- language="C" -->
2168  * static void
2169  * print_uppercase_words (const gchar *string)
2170  * {
2171  *   // Print all uppercase-only words.
2172  *   GRegex *regex;
2173  *   GMatchInfo *match_info;
2174  *
2175  *   regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL);
2176  *   g_regex_match (regex, string, 0, &match_info);
2177  *   while (g_match_info_matches (match_info))
2178  *     {
2179  *       gchar *word = g_match_info_fetch (match_info, 0);
2180  *       g_print ("Found: %s\n", word);
2181  *       g_free (word);
2182  *       g_match_info_next (match_info, NULL);
2183  *     }
2184  *   g_match_info_free (match_info);
2185  *   g_regex_unref (regex);
2186  * }
2187  * ]|
2188  *
2189  * @string is not copied and is used in #GMatchInfo internally. If
2190  * you use any #GMatchInfo method (except g_match_info_free()) after
2191  * freeing or modifying @string then the behaviour is undefined.
2192  *
2193  * Returns: %TRUE is the string matched, %FALSE otherwise
2194  *
2195  * Since: 2.14
2196  */
2197 gboolean
2198 g_regex_match (const GRegex      *regex,
2199                const gchar       *string,
2200                GRegexMatchFlags   match_options,
2201                GMatchInfo       **match_info)
2202 {
2203   return g_regex_match_full (regex, string, -1, 0, match_options,
2204                              match_info, NULL);
2205 }
2206
2207 /**
2208  * g_regex_match_full:
2209  * @regex: a #GRegex structure from g_regex_new()
2210  * @string: (array length=string_len): the string to scan for matches
2211  * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
2212  * @start_position: starting index of the string to match, in bytes
2213  * @match_options: match options
2214  * @match_info: (out) (optional): pointer to location where to store
2215  *     the #GMatchInfo, or %NULL if you do not need it
2216  * @error: location to store the error occurring, or %NULL to ignore errors
2217  *
2218  * Scans for a match in @string for the pattern in @regex.
2219  * The @match_options are combined with the match options specified
2220  * when the @regex structure was created, letting you have more
2221  * flexibility in reusing #GRegex structures.
2222  *
2223  * Setting @start_position differs from just passing over a shortened
2224  * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern
2225  * that begins with any kind of lookbehind assertion, such as "\b".
2226  *
2227  * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8.
2228  *
2229  * A #GMatchInfo structure, used to get information on the match, is
2230  * stored in @match_info if not %NULL. Note that if @match_info is
2231  * not %NULL then it is created even if the function returns %FALSE,
2232  * i.e. you must free it regardless if regular expression actually
2233  * matched.
2234  *
2235  * @string is not copied and is used in #GMatchInfo internally. If
2236  * you use any #GMatchInfo method (except g_match_info_free()) after
2237  * freeing or modifying @string then the behaviour is undefined.
2238  *
2239  * To retrieve all the non-overlapping matches of the pattern in
2240  * string you can use g_match_info_next().
2241  *
2242  * |[<!-- language="C" -->
2243  * static void
2244  * print_uppercase_words (const gchar *string)
2245  * {
2246  *   // Print all uppercase-only words.
2247  *   GRegex *regex;
2248  *   GMatchInfo *match_info;
2249  *   GError *error = NULL;
2250  *
2251  *   regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL);
2252  *   g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error);
2253  *   while (g_match_info_matches (match_info))
2254  *     {
2255  *       gchar *word = g_match_info_fetch (match_info, 0);
2256  *       g_print ("Found: %s\n", word);
2257  *       g_free (word);
2258  *       g_match_info_next (match_info, &error);
2259  *     }
2260  *   g_match_info_free (match_info);
2261  *   g_regex_unref (regex);
2262  *   if (error != NULL)
2263  *     {
2264  *       g_printerr ("Error while matching: %s\n", error->message);
2265  *       g_error_free (error);
2266  *     }
2267  * }
2268  * ]|
2269  *
2270  * Returns: %TRUE is the string matched, %FALSE otherwise
2271  *
2272  * Since: 2.14
2273  */
2274 gboolean
2275 g_regex_match_full (const GRegex      *regex,
2276                     const gchar       *string,
2277                     gssize             string_len,
2278                     gint               start_position,
2279                     GRegexMatchFlags   match_options,
2280                     GMatchInfo       **match_info,
2281                     GError           **error)
2282 {
2283   GMatchInfo *info;
2284   gboolean match_ok;
2285
2286   g_return_val_if_fail (regex != NULL, FALSE);
2287   g_return_val_if_fail (string != NULL, FALSE);
2288   g_return_val_if_fail (start_position >= 0, FALSE);
2289   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
2290   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
2291
2292   info = match_info_new (regex, string, string_len, start_position,
2293                          match_options, FALSE);
2294   match_ok = g_match_info_next (info, error);
2295   if (match_info != NULL)
2296     *match_info = info;
2297   else
2298     g_match_info_free (info);
2299
2300   return match_ok;
2301 }
2302
2303 /**
2304  * g_regex_match_all:
2305  * @regex: a #GRegex structure from g_regex_new()
2306  * @string: the string to scan for matches
2307  * @match_options: match options
2308  * @match_info: (out) (optional): pointer to location where to store
2309  *     the #GMatchInfo, or %NULL if you do not need it
2310  *
2311  * Using the standard algorithm for regular expression matching only
2312  * the longest match in the string is retrieved. This function uses
2313  * a different algorithm so it can retrieve all the possible matches.
2314  * For more documentation see g_regex_match_all_full().
2315  *
2316  * A #GMatchInfo structure, used to get information on the match, is
2317  * stored in @match_info if not %NULL. Note that if @match_info is
2318  * not %NULL then it is created even if the function returns %FALSE,
2319  * i.e. you must free it regardless if regular expression actually
2320  * matched.
2321  *
2322  * @string is not copied and is used in #GMatchInfo internally. If
2323  * you use any #GMatchInfo method (except g_match_info_free()) after
2324  * freeing or modifying @string then the behaviour is undefined.
2325  *
2326  * Returns: %TRUE is the string matched, %FALSE otherwise
2327  *
2328  * Since: 2.14
2329  */
2330 gboolean
2331 g_regex_match_all (const GRegex      *regex,
2332                    const gchar       *string,
2333                    GRegexMatchFlags   match_options,
2334                    GMatchInfo       **match_info)
2335 {
2336   return g_regex_match_all_full (regex, string, -1, 0, match_options,
2337                                  match_info, NULL);
2338 }
2339
2340 /**
2341  * g_regex_match_all_full:
2342  * @regex: a #GRegex structure from g_regex_new()
2343  * @string: (array length=string_len): the string to scan for matches
2344  * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
2345  * @start_position: starting index of the string to match, in bytes
2346  * @match_options: match options
2347  * @match_info: (out) (optional): pointer to location where to store
2348  *     the #GMatchInfo, or %NULL if you do not need it
2349  * @error: location to store the error occurring, or %NULL to ignore errors
2350  *
2351  * Using the standard algorithm for regular expression matching only
2352  * the longest match in the @string is retrieved, it is not possible
2353  * to obtain all the available matches. For instance matching
2354  * "<a> <b> <c>" against the pattern "<.*>"
2355  * you get "<a> <b> <c>".
2356  *
2357  * This function uses a different algorithm (called DFA, i.e. deterministic
2358  * finite automaton), so it can retrieve all the possible matches, all
2359  * starting at the same point in the string. For instance matching
 * "<a> <b> <c>" against the pattern "<.*>"
2361  * you would obtain three matches: "<a> <b> <c>",
2362  * "<a> <b>" and "<a>".
2363  *
2364  * The number of matched strings is retrieved using
2365  * g_match_info_get_match_count(). To obtain the matched strings and
2366  * their position you can use, respectively, g_match_info_fetch() and
2367  * g_match_info_fetch_pos(). Note that the strings are returned in
2368  * reverse order of length; that is, the longest matching string is
2369  * given first.
2370  *
2371  * Note that the DFA algorithm is slower than the standard one and it
2372  * is not able to capture substrings, so backreferences do not work.
2373  *
2374  * Setting @start_position differs from just passing over a shortened
2375  * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern
2376  * that begins with any kind of lookbehind assertion, such as "\b".
2377  *
2378  * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8.
2379  *
2380  * A #GMatchInfo structure, used to get information on the match, is
2381  * stored in @match_info if not %NULL. Note that if @match_info is
2382  * not %NULL then it is created even if the function returns %FALSE,
2383  * i.e. you must free it regardless if regular expression actually
2384  * matched.
2385  *
2386  * @string is not copied and is used in #GMatchInfo internally. If
2387  * you use any #GMatchInfo method (except g_match_info_free()) after
2388  * freeing or modifying @string then the behaviour is undefined.
2389  *
 * Returns: %TRUE if the string matched, %FALSE otherwise
2391  *
2392  * Since: 2.14
2393  */
2394 gboolean
2395 g_regex_match_all_full (const GRegex      *regex,
2396                         const gchar       *string,
2397                         gssize             string_len,
2398                         gint               start_position,
2399                         GRegexMatchFlags   match_options,
2400                         GMatchInfo       **match_info,
2401                         GError           **error)
2402 {
2403   GMatchInfo *info;
2404   gboolean done;
2405   pcre2_code *pcre_re;
2406   gboolean retval;
2407   uint32_t newline_options;
2408   uint32_t bsr_options;
2409
2410   g_return_val_if_fail (regex != NULL, FALSE);
2411   g_return_val_if_fail (string != NULL, FALSE);
2412   g_return_val_if_fail (start_position >= 0, FALSE);
2413   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
2414   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
2415
2416   newline_options = get_pcre2_newline_match_options (match_options);
2417   if (!newline_options)
2418     newline_options = get_pcre2_newline_compile_options (regex->orig_compile_opts);
2419
2420   bsr_options = get_pcre2_bsr_match_options (match_options);
2421   if (!bsr_options)
2422     bsr_options = get_pcre2_bsr_compile_options (regex->orig_compile_opts);
2423
2424   /* For PCRE2 we need to turn off PCRE2_NO_AUTO_POSSESS, which is an
2425    * optimization for normal regex matching, but results in omitting some
2426    * shorter matches here, and an observable behaviour change.
2427    *
2428    * DFA matching is rather niche, and very rarely used according to
2429    * codesearch.debian.net, so don't bother caching the recompiled RE. */
2430   pcre_re = regex_compile (regex->pattern,
2431                            regex->compile_opts | PCRE2_NO_AUTO_POSSESS,
2432                            newline_options, bsr_options, error);
2433   if (pcre_re == NULL)
2434     return FALSE;
2435
2436   info = match_info_new (regex, string, string_len, start_position,
2437                          match_options, TRUE);
2438
2439   done = FALSE;
2440   while (!done)
2441     {
2442       done = TRUE;
2443       info->matches = pcre2_dfa_match (pcre_re,
2444                                        (PCRE2_SPTR8) info->string, info->string_len,
2445                                        info->pos,
2446                                        (regex->match_opts | info->match_opts),
2447                                        info->match_data,
2448                                        info->match_context,
2449                                        info->workspace, info->n_workspace);
2450       if (info->matches == PCRE2_ERROR_DFA_WSSIZE)
2451         {
2452           /* info->workspace is too small. */
2453           info->n_workspace *= 2;
2454           info->workspace = g_realloc_n (info->workspace,
2455                                          info->n_workspace,
2456                                          sizeof (gint));
2457           done = FALSE;
2458         }
2459       else if (info->matches == 0)
2460         {
2461           /* info->offsets is too small. */
2462           info->n_offsets *= 2;
2463           info->offsets = g_realloc_n (info->offsets,
2464                                        info->n_offsets,
2465                                        sizeof (gint));
2466           pcre2_match_data_free (info->match_data);
2467           info->match_data = pcre2_match_data_create (info->n_offsets, NULL);
2468           done = FALSE;
2469         }
2470       else if (IS_PCRE2_ERROR (info->matches))
2471         {
2472           gchar *error_msg = get_match_error_message (info->matches);
2473
2474           g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
2475                        _("Error while matching regular expression %s: %s"),
2476                        regex->pattern, error_msg);
2477           g_clear_pointer (&error_msg, g_free);
2478         }
2479       else if (info->matches != PCRE2_ERROR_NOMATCH)
2480         {
2481           if (!recalc_match_offsets (info, error))
2482             info->matches = PCRE2_ERROR_NOMATCH;
2483         }
2484     }
2485
2486   pcre2_code_free (pcre_re);
2487
2488   /* don’t assert that (info->matches <= info->n_subpatterns + 1) as that only
2489    * holds true for a single match, rather than matching all */
2490
2491   /* set info->pos to -1 so that a call to g_match_info_next() fails. */
2492   info->pos = -1;
2493   retval = info->matches >= 0;
2494
2495   if (match_info != NULL)
2496     *match_info = info;
2497   else
2498     g_match_info_free (info);
2499
2500   return retval;
2501 }
2502
2503 /**
2504  * g_regex_get_string_number:
2505  * @regex: #GRegex structure
2506  * @name: name of the subexpression
2507  *
2508  * Retrieves the number of the subexpression named @name.
2509  *
2510  * Returns: The number of the subexpression or -1 if @name
2511  *   does not exists
2512  *
2513  * Since: 2.14
2514  */
2515 gint
2516 g_regex_get_string_number (const GRegex *regex,
2517                            const gchar  *name)
2518 {
2519   gint num;
2520
2521   g_return_val_if_fail (regex != NULL, -1);
2522   g_return_val_if_fail (name != NULL, -1);
2523
2524   num = pcre2_substring_number_from_name (regex->pcre_re, (PCRE2_SPTR8) name);
2525   if (num == PCRE2_ERROR_NOSUBSTRING)
2526     num = -1;
2527
2528   return num;
2529 }
2530
2531 /**
2532  * g_regex_split_simple:
2533  * @pattern: the regular expression
2534  * @string: the string to scan for matches
2535  * @compile_options: compile options for the regular expression, or 0
2536  * @match_options: match options, or 0
2537  *
2538  * Breaks the string on the pattern, and returns an array of
2539  * the tokens. If the pattern contains capturing parentheses,
2540  * then the text for each of the substrings will also be returned.
2541  * If the pattern does not match anywhere in the string, then the
2542  * whole string is returned as the first token.
2543  *
2544  * This function is equivalent to g_regex_split() but it does
2545  * not require to compile the pattern with g_regex_new(), avoiding
2546  * some lines of code when you need just to do a split without
2547  * extracting substrings, capture counts, and so on.
2548  *
2549  * If this function is to be called on the same @pattern more than
2550  * once, it's more efficient to compile the pattern once with
2551  * g_regex_new() and then use g_regex_split().
2552  *
2553  * As a special case, the result of splitting the empty string ""
2554  * is an empty vector, not a vector containing a single string.
2555  * The reason for this special case is that being able to represent
2556  * an empty vector is typically more useful than consistent handling
2557  * of empty elements. If you do need to represent empty elements,
2558  * you'll need to check for the empty string before calling this
2559  * function.
2560  *
2561  * A pattern that can match empty strings splits @string into
2562  * separate characters wherever it matches the empty string between
2563  * characters. For example splitting "ab c" using as a separator
2564  * "\s*", you will get "a", "b" and "c".
2565  *
2566  * Returns: (transfer full): a %NULL-terminated array of strings. Free
2567  * it using g_strfreev()
2568  *
2569  * Since: 2.14
2570  **/
2571 gchar **
2572 g_regex_split_simple (const gchar        *pattern,
2573                       const gchar        *string,
2574                       GRegexCompileFlags  compile_options,
2575                       GRegexMatchFlags    match_options)
2576 {
2577   GRegex *regex;
2578   gchar **result;
2579
2580   regex = g_regex_new (pattern, compile_options, 0, NULL);
2581   if (!regex)
2582     return NULL;
2583
2584   result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL);
2585   g_regex_unref (regex);
2586   return result;
2587 }
2588
2589 /**
2590  * g_regex_split:
2591  * @regex: a #GRegex structure
2592  * @string: the string to split with the pattern
2593  * @match_options: match time option flags
2594  *
2595  * Breaks the string on the pattern, and returns an array of the tokens.
2596  * If the pattern contains capturing parentheses, then the text for each
2597  * of the substrings will also be returned. If the pattern does not match
2598  * anywhere in the string, then the whole string is returned as the first
2599  * token.
2600  *
2601  * As a special case, the result of splitting the empty string "" is an
2602  * empty vector, not a vector containing a single string. The reason for
2603  * this special case is that being able to represent an empty vector is
2604  * typically more useful than consistent handling of empty elements. If
2605  * you do need to represent empty elements, you'll need to check for the
2606  * empty string before calling this function.
2607  *
2608  * A pattern that can match empty strings splits @string into separate
2609  * characters wherever it matches the empty string between characters.
2610  * For example splitting "ab c" using as a separator "\s*", you will get
2611  * "a", "b" and "c".
2612  *
2613  * Returns: (transfer full): a %NULL-terminated gchar ** array. Free
2614  * it using g_strfreev()
2615  *
2616  * Since: 2.14
2617  **/
2618 gchar **
2619 g_regex_split (const GRegex     *regex,
2620                const gchar      *string,
2621                GRegexMatchFlags  match_options)
2622 {
2623   return g_regex_split_full (regex, string, -1, 0,
2624                              match_options, 0, NULL);
2625 }
2626
2627 /**
2628  * g_regex_split_full:
2629  * @regex: a #GRegex structure
2630  * @string: (array length=string_len): the string to split with the pattern
2631  * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
2632  * @start_position: starting index of the string to match, in bytes
2633  * @match_options: match time option flags
2634  * @max_tokens: the maximum number of tokens to split @string into.
2635  *   If this is less than 1, the string is split completely
2636  * @error: return location for a #GError
2637  *
2638  * Breaks the string on the pattern, and returns an array of the tokens.
2639  * If the pattern contains capturing parentheses, then the text for each
2640  * of the substrings will also be returned. If the pattern does not match
2641  * anywhere in the string, then the whole string is returned as the first
2642  * token.
2643  *
2644  * As a special case, the result of splitting the empty string "" is an
2645  * empty vector, not a vector containing a single string. The reason for
2646  * this special case is that being able to represent an empty vector is
2647  * typically more useful than consistent handling of empty elements. If
2648  * you do need to represent empty elements, you'll need to check for the
2649  * empty string before calling this function.
2650  *
2651  * A pattern that can match empty strings splits @string into separate
2652  * characters wherever it matches the empty string between characters.
2653  * For example splitting "ab c" using as a separator "\s*", you will get
2654  * "a", "b" and "c".
2655  *
2656  * Setting @start_position differs from just passing over a shortened
2657  * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern
2658  * that begins with any kind of lookbehind assertion, such as "\b".
2659  *
2660  * Returns: (transfer full): a %NULL-terminated gchar ** array. Free
2661  * it using g_strfreev()
2662  *
2663  * Since: 2.14
2664  **/
2665 gchar **
2666 g_regex_split_full (const GRegex      *regex,
2667                     const gchar       *string,
2668                     gssize             string_len,
2669                     gint               start_position,
2670                     GRegexMatchFlags   match_options,
2671                     gint               max_tokens,
2672                     GError           **error)
2673 {
2674   GError *tmp_error = NULL;
2675   GMatchInfo *match_info;
2676   GList *list, *last;
2677   gint i;
2678   gint token_count;
2679   gboolean match_ok;
2680   /* position of the last separator. */
2681   gint last_separator_end;
2682   /* was the last match 0 bytes long? */
2683   gboolean last_match_is_empty;
2684   /* the returned array of char **s */
2685   gchar **string_list;
2686
2687   g_return_val_if_fail (regex != NULL, NULL);
2688   g_return_val_if_fail (string != NULL, NULL);
2689   g_return_val_if_fail (start_position >= 0, NULL);
2690   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
2691   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2692
2693   if (max_tokens <= 0)
2694     max_tokens = G_MAXINT;
2695
2696   if (string_len < 0)
2697     string_len = strlen (string);
2698
2699   /* zero-length string */
2700   if (string_len - start_position == 0)
2701     return g_new0 (gchar *, 1);
2702
2703   if (max_tokens == 1)
2704     {
2705       string_list = g_new0 (gchar *, 2);
2706       string_list[0] = g_strndup (&string[start_position],
2707                                   string_len - start_position);
2708       return string_list;
2709     }
2710
2711   list = NULL;
2712   token_count = 0;
2713   last_separator_end = start_position;
2714   last_match_is_empty = FALSE;
2715
2716   match_ok = g_regex_match_full (regex, string, string_len, start_position,
2717                                  match_options, &match_info, &tmp_error);
2718
2719   while (tmp_error == NULL)
2720     {
2721       if (match_ok)
2722         {
2723           last_match_is_empty =
2724                     (match_info->offsets[0] == match_info->offsets[1]);
2725
2726           /* we need to skip empty separators at the same position of the end
2727            * of another separator. e.g. the string is "a b" and the separator
2728            * is " *", so from 1 to 2 we have a match and at position 2 we have
2729            * an empty match. */
2730           if (last_separator_end != match_info->offsets[1])
2731             {
2732               gchar *token;
2733               gint match_count;
2734
2735               token = g_strndup (string + last_separator_end,
2736                                  match_info->offsets[0] - last_separator_end);
2737               list = g_list_prepend (list, token);
2738               token_count++;
2739
2740               /* if there were substrings, these need to be added to
2741                * the list. */
2742               match_count = g_match_info_get_match_count (match_info);
2743               if (match_count > 1)
2744                 {
2745                   for (i = 1; i < match_count; i++)
2746                     list = g_list_prepend (list, g_match_info_fetch (match_info, i));
2747                 }
2748             }
2749         }
2750       else
2751         {
2752           /* if there was no match, copy to end of string. */
2753           if (!last_match_is_empty)
2754             {
2755               gchar *token = g_strndup (string + last_separator_end,
2756                                         match_info->string_len - last_separator_end);
2757               list = g_list_prepend (list, token);
2758             }
2759           /* no more tokens, end the loop. */
2760           break;
2761         }
2762
2763       /* -1 to leave room for the last part. */
2764       if (token_count >= max_tokens - 1)
2765         {
2766           /* we have reached the maximum number of tokens, so we copy
2767            * the remaining part of the string. */
2768           if (last_match_is_empty)
2769             {
2770               /* the last match was empty, so we have moved one char
2771                * after the real position to avoid empty matches at the
2772                * same position. */
2773               match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string;
2774             }
2775           /* the if is needed in the case we have terminated the available
2776            * tokens, but we are at the end of the string, so there are no
2777            * characters left to copy. */
2778           if (string_len > match_info->pos)
2779             {
2780               gchar *token = g_strndup (string + match_info->pos,
2781                                         string_len - match_info->pos);
2782               list = g_list_prepend (list, token);
2783             }
2784           /* end the loop. */
2785           break;
2786         }
2787
2788       last_separator_end = match_info->pos;
2789       if (last_match_is_empty)
2790         /* if the last match was empty, g_match_info_next() has moved
2791          * forward to avoid infinite loops, but we still need to copy that
2792          * character. */
2793         last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string;
2794
2795       match_ok = g_match_info_next (match_info, &tmp_error);
2796     }
2797   g_match_info_free (match_info);
2798   if (tmp_error != NULL)
2799     {
2800       g_propagate_error (error, tmp_error);
2801       g_list_free_full (list, g_free);
2802       return NULL;
2803     }
2804
2805   string_list = g_new (gchar *, g_list_length (list) + 1);
2806   i = 0;
2807   for (last = g_list_last (list); last; last = g_list_previous (last))
2808     string_list[i++] = last->data;
2809   string_list[i] = NULL;
2810   g_list_free (list);
2811
2812   return string_list;
2813 }
2814
/* Kinds of pieces a parsed replacement string is broken into; the value
 * is stored in the type field of InterpolationData. */
enum
{
  REPL_TYPE_STRING             = 0, /* text to append */
  REPL_TYPE_CHARACTER          = 1, /* single character from a simple escape */
  REPL_TYPE_SYMBOLIC_REFERENCE = 2, /* reference to a named subexpression */
  REPL_TYPE_NUMERIC_REFERENCE  = 3, /* reference to a numbered subexpression */
  REPL_TYPE_CHANGE_CASE        = 4  /* switch of the case conversion mode */
};
2823
/* Case conversion modes for replacement text. Each mode is a distinct bit
 * so that the *_MASK values can test for a group of modes at once. */
typedef enum
{
  CHANGE_CASE_NONE         = 0x01, /* no conversion */
  CHANGE_CASE_UPPER        = 0x02, /* upper case, stays in effect */
  CHANGE_CASE_LOWER        = 0x04, /* lower case, stays in effect */
  CHANGE_CASE_UPPER_SINGLE = 0x08, /* upper case, next character only */
  CHANGE_CASE_LOWER_SINGLE = 0x10, /* lower case, next character only */

  /* modes that apply to a single character */
  CHANGE_CASE_SINGLE_MASK  = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_LOWER_SINGLE,
  /* modes that convert to lower case */
  CHANGE_CASE_LOWER_MASK   = CHANGE_CASE_LOWER | CHANGE_CASE_LOWER_SINGLE,
  /* modes that convert to upper case */
  CHANGE_CASE_UPPER_MASK   = CHANGE_CASE_UPPER | CHANGE_CASE_UPPER_SINGLE
} ChangeCase;
2835
2836 struct _InterpolationData
2837 {
2838   gchar     *text;
2839   gint       type;
2840   gint       num;
2841   gchar      c;
2842   ChangeCase change_case;
2843 };
2844
2845 static void
2846 free_interpolation_data (InterpolationData *data)
2847 {
2848   g_free (data->text);
2849   g_free (data);
2850 }
2851
2852 static const gchar *
2853 expand_escape (const gchar        *replacement,
2854                const gchar        *p,
2855                InterpolationData  *data,
2856                GError            **error)
2857 {
2858   const gchar *q, *r;
2859   gint x, d, h, i;
2860   const gchar *error_detail;
2861   gint base = 0;
2862   GError *tmp_error = NULL;
2863
2864   p++;
2865   switch (*p)
2866     {
2867     case 't':
2868       p++;
2869       data->c = '\t';
2870       data->type = REPL_TYPE_CHARACTER;
2871       break;
2872     case 'n':
2873       p++;
2874       data->c = '\n';
2875       data->type = REPL_TYPE_CHARACTER;
2876       break;
2877     case 'v':
2878       p++;
2879       data->c = '\v';
2880       data->type = REPL_TYPE_CHARACTER;
2881       break;
2882     case 'r':
2883       p++;
2884       data->c = '\r';
2885       data->type = REPL_TYPE_CHARACTER;
2886       break;
2887     case 'f':
2888       p++;
2889       data->c = '\f';
2890       data->type = REPL_TYPE_CHARACTER;
2891       break;
2892     case 'a':
2893       p++;
2894       data->c = '\a';
2895       data->type = REPL_TYPE_CHARACTER;
2896       break;
2897     case 'b':
2898       p++;
2899       data->c = '\b';
2900       data->type = REPL_TYPE_CHARACTER;
2901       break;
2902     case '\\':
2903       p++;
2904       data->c = '\\';
2905       data->type = REPL_TYPE_CHARACTER;
2906       break;
2907     case 'x':
2908       p++;
2909       x = 0;
2910       if (*p == '{')
2911         {
2912           p++;
2913           do
2914             {
2915               h = g_ascii_xdigit_value (*p);
2916               if (h < 0)
2917                 {
2918                   error_detail = _("hexadecimal digit or “}” expected");
2919                   goto error;
2920                 }
2921               x = x * 16 + h;
2922               p++;
2923             }
2924           while (*p != '}');
2925           p++;
2926         }
2927       else
2928         {
2929           for (i = 0; i < 2; i++)
2930             {
2931               h = g_ascii_xdigit_value (*p);
2932               if (h < 0)
2933                 {
2934                   error_detail = _("hexadecimal digit expected");
2935                   goto error;
2936                 }
2937               x = x * 16 + h;
2938               p++;
2939             }
2940         }
2941       data->type = REPL_TYPE_STRING;
2942       data->text = g_new0 (gchar, 8);
2943       g_unichar_to_utf8 (x, data->text);
2944       break;
2945     case 'l':
2946       p++;
2947       data->type = REPL_TYPE_CHANGE_CASE;
2948       data->change_case = CHANGE_CASE_LOWER_SINGLE;
2949       break;
2950     case 'u':
2951       p++;
2952       data->type = REPL_TYPE_CHANGE_CASE;
2953       data->change_case = CHANGE_CASE_UPPER_SINGLE;
2954       break;
2955     case 'L':
2956       p++;
2957       data->type = REPL_TYPE_CHANGE_CASE;
2958       data->change_case = CHANGE_CASE_LOWER;
2959       break;
2960     case 'U':
2961       p++;
2962       data->type = REPL_TYPE_CHANGE_CASE;
2963       data->change_case = CHANGE_CASE_UPPER;
2964       break;
2965     case 'E':
2966       p++;
2967       data->type = REPL_TYPE_CHANGE_CASE;
2968       data->change_case = CHANGE_CASE_NONE;
2969       break;
2970     case 'g':
2971       p++;
2972       if (*p != '<')
2973         {
2974           error_detail = _("missing “<” in symbolic reference");
2975           goto error;
2976         }
2977       q = p + 1;
2978       do
2979         {
2980           p++;
2981           if (!*p)
2982             {
2983               error_detail = _("unfinished symbolic reference");
2984               goto error;
2985             }
2986         }
2987       while (*p != '>');
2988       if (p - q == 0)
2989         {
2990           error_detail = _("zero-length symbolic reference");
2991           goto error;
2992         }
2993       if (g_ascii_isdigit (*q))
2994         {
2995           x = 0;
2996           do
2997             {
2998               h = g_ascii_digit_value (*q);
2999               if (h < 0)
3000                 {
3001                   error_detail = _("digit expected");
3002                   p = q;
3003                   goto error;
3004                 }
3005               x = x * 10 + h;
3006               q++;
3007             }
3008           while (q != p);
3009           data->num = x;
3010           data->type = REPL_TYPE_NUMERIC_REFERENCE;
3011         }
3012       else
3013         {
3014           r = q;
3015           do
3016             {
3017               if (!g_ascii_isalnum (*r))
3018                 {
3019                   error_detail = _("illegal symbolic reference");
3020                   p = r;
3021                   goto error;
3022                 }
3023               r++;
3024             }
3025           while (r != p);
3026           data->text = g_strndup (q, p - q);
3027           data->type = REPL_TYPE_SYMBOLIC_REFERENCE;
3028         }
3029       p++;
3030       break;
3031     case '0':
3032       /* if \0 is followed by a number is an octal number representing a
3033        * character, else it is a numeric reference. */
3034       if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0)
3035         {
3036           base = 8;
3037           p = g_utf8_next_char (p);
3038         }
3039       G_GNUC_FALLTHROUGH;
3040     case '1':
3041     case '2':
3042     case '3':
3043     case '4':
3044     case '5':
3045     case '6':
3046     case '7':
3047     case '8':
3048     case '9':
3049       x = 0;
3050       d = 0;
3051       for (i = 0; i < 3; i++)
3052         {
3053           h = g_ascii_digit_value (*p);
3054           if (h < 0)
3055             break;
3056           if (h > 7)
3057             {
3058               if (base == 8)
3059                 break;
3060               else
3061                 base = 10;
3062             }
3063           if (i == 2 && base == 10)
3064             break;
3065           x = x * 8 + h;
3066           d = d * 10 + h;
3067           p++;
3068         }
3069       if (base == 8 || i == 3)
3070         {
3071           data->type = REPL_TYPE_STRING;
3072           data->text = g_new0 (gchar, 8);
3073           g_unichar_to_utf8 (x, data->text);
3074         }
3075       else
3076         {
3077           data->type = REPL_TYPE_NUMERIC_REFERENCE;
3078           data->num = d;
3079         }
3080       break;
3081     case 0:
3082       error_detail = _("stray final “\\”");
3083       goto error;
3084       break;
3085     default:
3086       error_detail = _("unknown escape sequence");
3087       goto error;
3088     }
3089
3090   return p;
3091
3092  error:
3093   /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */
3094   tmp_error = g_error_new (G_REGEX_ERROR,
3095                            G_REGEX_ERROR_REPLACE,
3096                            _("Error while parsing replacement "
3097                              "text “%s” at char %lu: %s"),
3098                            replacement,
3099                            (gulong)(p - replacement),
3100                            error_detail);
3101   g_propagate_error (error, tmp_error);
3102
3103   return NULL;
3104 }
3105
3106 static GList *
3107 split_replacement (const gchar  *replacement,
3108                    GError      **error)
3109 {
3110   GList *list = NULL;
3111   InterpolationData *data;
3112   const gchar *p, *start;
3113
3114   start = p = replacement;
3115   while (*p)
3116     {
3117       if (*p == '\\')
3118         {
3119           data = g_new0 (InterpolationData, 1);
3120           start = p = expand_escape (replacement, p, data, error);
3121           if (p == NULL)
3122             {
3123               g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
3124               free_interpolation_data (data);
3125
3126               return NULL;
3127             }
3128           list = g_list_prepend (list, data);
3129         }
3130       else
3131         {
3132           p++;
3133           if (*p == '\\' || *p == '\0')
3134             {
3135               if (p - start > 0)
3136                 {
3137                   data = g_new0 (InterpolationData, 1);
3138                   data->text = g_strndup (start, p - start);
3139                   data->type = REPL_TYPE_STRING;
3140                   list = g_list_prepend (list, data);
3141                 }
3142             }
3143         }
3144     }
3145
3146   return g_list_reverse (list);
3147 }
3148
/* Converts the case of c as requested by change_case: to lower case when
 * any of the CHANGE_CASE_LOWER_MASK bits is set, to upper case for every
 * other value. */
#define CHANGE_CASE(c, change_case) \
        ((((change_case) & CHANGE_CASE_LOWER_MASK) != 0) \
                ? g_unichar_tolower (c) \
                : g_unichar_toupper (c))
3154
3155 static void
3156 string_append (GString     *string,
3157                const gchar *text,
3158                ChangeCase  *change_case)
3159 {
3160   gunichar c;
3161
3162   if (text[0] == '\0')
3163     return;
3164
3165   if (*change_case == CHANGE_CASE_NONE)
3166     {
3167       g_string_append (string, text);
3168     }
3169   else if (*change_case & CHANGE_CASE_SINGLE_MASK)
3170     {
3171       c = g_utf8_get_char (text);
3172       g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
3173       g_string_append (string, g_utf8_next_char (text));
3174       *change_case = CHANGE_CASE_NONE;
3175     }
3176   else
3177     {
3178       while (*text != '\0')
3179         {
3180           c = g_utf8_get_char (text);
3181           g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
3182           text = g_utf8_next_char (text);
3183         }
3184     }
3185 }
3186
3187 static gboolean
3188 interpolate_replacement (const GMatchInfo *match_info,
3189                          GString          *result,
3190                          gpointer          data)
3191 {
3192   GList *list;
3193   InterpolationData *idata;
3194   gchar *match;
3195   ChangeCase change_case = CHANGE_CASE_NONE;
3196
3197   for (list = data; list; list = list->next)
3198     {
3199       idata = list->data;
3200       switch (idata->type)
3201         {
3202         case REPL_TYPE_STRING:
3203           string_append (result, idata->text, &change_case);
3204           break;
3205         case REPL_TYPE_CHARACTER:
3206           g_string_append_c (result, CHANGE_CASE (idata->c, change_case));
3207           if (change_case & CHANGE_CASE_SINGLE_MASK)
3208             change_case = CHANGE_CASE_NONE;
3209           break;
3210         case REPL_TYPE_NUMERIC_REFERENCE:
3211           match = g_match_info_fetch (match_info, idata->num);
3212           if (match)
3213             {
3214               string_append (result, match, &change_case);
3215               g_free (match);
3216             }
3217           break;
3218         case REPL_TYPE_SYMBOLIC_REFERENCE:
3219           match = g_match_info_fetch_named (match_info, idata->text);
3220           if (match)
3221             {
3222               string_append (result, match, &change_case);
3223               g_free (match);
3224             }
3225           break;
3226         case REPL_TYPE_CHANGE_CASE:
3227           change_case = idata->change_case;
3228           break;
3229         }
3230     }
3231
3232   return FALSE;
3233 }
3234
3235 /* whether actual match_info is needed for replacement, i.e.
3236  * whether there are references
3237  */
3238 static gboolean
3239 interpolation_list_needs_match (GList *list)
3240 {
3241   while (list != NULL)
3242     {
3243       InterpolationData *data = list->data;
3244
3245       if (data->type == REPL_TYPE_SYMBOLIC_REFERENCE ||
3246           data->type == REPL_TYPE_NUMERIC_REFERENCE)
3247         {
3248           return TRUE;
3249         }
3250
3251       list = list->next;
3252     }
3253
3254   return FALSE;
3255 }
3256
3257 /**
3258  * g_regex_replace:
3259  * @regex: a #GRegex structure
3260  * @string: (array length=string_len): the string to perform matches against
3261  * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
3262  * @start_position: starting index of the string to match, in bytes
3263  * @replacement: text to replace each match with
3264  * @match_options: options for the match
3265  * @error: location to store the error occurring, or %NULL to ignore errors
3266  *
3267  * Replaces all occurrences of the pattern in @regex with the
3268  * replacement text. Backreferences of the form '\number' or
3269  * '\g<number>' in the replacement text are interpolated by the
3270  * number-th captured subexpression of the match, '\g<name>' refers
3271  * to the captured subexpression with the given name. '\0' refers
3272  * to the complete match, but '\0' followed by a number is the octal
3273  * representation of a character. To include a literal '\' in the
3274  * replacement, write '\\\\'.
3275  *
3276  * There are also escapes that changes the case of the following text:
3277  *
3278  * - \l: Convert to lower case the next character
3279  * - \u: Convert to upper case the next character
3280  * - \L: Convert to lower case till \E
3281  * - \U: Convert to upper case till \E
3282  * - \E: End case modification
3283  *
3284  * If you do not need to use backreferences use g_regex_replace_literal().
3285  *
3286  * The @replacement string must be UTF-8 encoded even if %G_REGEX_RAW was
3287  * passed to g_regex_new(). If you want to use not UTF-8 encoded strings
3288  * you can use g_regex_replace_literal().
3289  *
3290  * Setting @start_position differs from just passing over a shortened
3291  * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern that
3292  * begins with any kind of lookbehind assertion, such as "\b".
3293  *
3294  * Returns: a newly allocated string containing the replacements
3295  *
3296  * Since: 2.14
3297  */
3298 gchar *
3299 g_regex_replace (const GRegex      *regex,
3300                  const gchar       *string,
3301                  gssize             string_len,
3302                  gint               start_position,
3303                  const gchar       *replacement,
3304                  GRegexMatchFlags   match_options,
3305                  GError           **error)
3306 {
3307   gchar *result;
3308   GList *list;
3309   GError *tmp_error = NULL;
3310
3311   g_return_val_if_fail (regex != NULL, NULL);
3312   g_return_val_if_fail (string != NULL, NULL);
3313   g_return_val_if_fail (start_position >= 0, NULL);
3314   g_return_val_if_fail (replacement != NULL, NULL);
3315   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
3316   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
3317
3318   list = split_replacement (replacement, &tmp_error);
3319   if (tmp_error != NULL)
3320     {
3321       g_propagate_error (error, tmp_error);
3322       return NULL;
3323     }
3324
3325   result = g_regex_replace_eval (regex,
3326                                  string, string_len, start_position,
3327                                  match_options,
3328                                  interpolate_replacement,
3329                                  (gpointer)list,
3330                                  &tmp_error);
3331   if (tmp_error != NULL)
3332     g_propagate_error (error, tmp_error);
3333
3334   g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
3335
3336   return result;
3337 }
3338
3339 static gboolean
3340 literal_replacement (const GMatchInfo *match_info,
3341                      GString          *result,
3342                      gpointer          data)
3343 {
3344   g_string_append (result, data);
3345   return FALSE;
3346 }
3347
3348 /**
3349  * g_regex_replace_literal:
3350  * @regex: a #GRegex structure
3351  * @string: (array length=string_len): the string to perform matches against
3352  * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
3353  * @start_position: starting index of the string to match, in bytes
3354  * @replacement: text to replace each match with
3355  * @match_options: options for the match
3356  * @error: location to store the error occurring, or %NULL to ignore errors
3357  *
3358  * Replaces all occurrences of the pattern in @regex with the
3359  * replacement text. @replacement is replaced literally, to
3360  * include backreferences use g_regex_replace().
3361  *
3362  * Setting @start_position differs from just passing over a
3363  * shortened string and setting %G_REGEX_MATCH_NOTBOL in the
3364  * case of a pattern that begins with any kind of lookbehind
3365  * assertion, such as "\b".
3366  *
3367  * Returns: a newly allocated string containing the replacements
3368  *
3369  * Since: 2.14
3370  */
3371 gchar *
3372 g_regex_replace_literal (const GRegex      *regex,
3373                          const gchar       *string,
3374                          gssize             string_len,
3375                          gint               start_position,
3376                          const gchar       *replacement,
3377                          GRegexMatchFlags   match_options,
3378                          GError           **error)
3379 {
3380   g_return_val_if_fail (replacement != NULL, NULL);
3381   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
3382
3383   return g_regex_replace_eval (regex,
3384                                string, string_len, start_position,
3385                                match_options,
3386                                literal_replacement,
3387                                (gpointer)replacement,
3388                                error);
3389 }
3390
3391 /**
3392  * g_regex_replace_eval:
3393  * @regex: a #GRegex structure from g_regex_new()
3394  * @string: (array length=string_len): string to perform matches against
3395  * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
3396  * @start_position: starting index of the string to match, in bytes
3397  * @match_options: options for the match
3398  * @eval: (scope call): a function to call for each match
3399  * @user_data: user data to pass to the function
3400  * @error: location to store the error occurring, or %NULL to ignore errors
3401  *
3402  * Replaces occurrences of the pattern in regex with the output of
3403  * @eval for that occurrence.
3404  *
3405  * Setting @start_position differs from just passing over a shortened
3406  * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern
3407  * that begins with any kind of lookbehind assertion, such as "\b".
3408  *
3409  * The following example uses g_regex_replace_eval() to replace multiple
3410  * strings at once:
3411  * |[<!-- language="C" -->
3412  * static gboolean
3413  * eval_cb (const GMatchInfo *info,
3414  *          GString          *res,
3415  *          gpointer          data)
3416  * {
3417  *   gchar *match;
3418  *   gchar *r;
3419  *
3420  *    match = g_match_info_fetch (info, 0);
3421  *    r = g_hash_table_lookup ((GHashTable *)data, match);
3422  *    g_string_append (res, r);
3423  *    g_free (match);
3424  *
3425  *    return FALSE;
3426  * }
3427  *
3428  * ...
3429  *
3430  * GRegex *reg;
3431  * GHashTable *h;
3432  * gchar *res;
3433  *
3434  * h = g_hash_table_new (g_str_hash, g_str_equal);
3435  *
3436  * g_hash_table_insert (h, "1", "ONE");
3437  * g_hash_table_insert (h, "2", "TWO");
3438  * g_hash_table_insert (h, "3", "THREE");
3439  * g_hash_table_insert (h, "4", "FOUR");
3440  *
3441  * reg = g_regex_new ("1|2|3|4", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL);
3442  * res = g_regex_replace_eval (reg, text, -1, 0, 0, eval_cb, h, NULL);
3443  * g_hash_table_destroy (h);
3444  *
3445  * ...
3446  * ]|
3447  *
3448  * Returns: a newly allocated string containing the replacements
3449  *
3450  * Since: 2.14
3451  */
3452 gchar *
3453 g_regex_replace_eval (const GRegex        *regex,
3454                       const gchar         *string,
3455                       gssize               string_len,
3456                       gint                 start_position,
3457                       GRegexMatchFlags     match_options,
3458                       GRegexEvalCallback   eval,
3459                       gpointer             user_data,
3460                       GError             **error)
3461 {
3462   GMatchInfo *match_info;
3463   GString *result;
3464   gint str_pos = 0;
3465   gboolean done = FALSE;
3466   GError *tmp_error = NULL;
3467
3468   g_return_val_if_fail (regex != NULL, NULL);
3469   g_return_val_if_fail (string != NULL, NULL);
3470   g_return_val_if_fail (start_position >= 0, NULL);
3471   g_return_val_if_fail (eval != NULL, NULL);
3472   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
3473
3474   if (string_len < 0)
3475     string_len = strlen (string);
3476
3477   result = g_string_sized_new (string_len);
3478
3479   /* run down the string making matches. */
3480   g_regex_match_full (regex, string, string_len, start_position,
3481                       match_options, &match_info, &tmp_error);
3482   while (!done && g_match_info_matches (match_info))
3483     {
3484       g_string_append_len (result,
3485                            string + str_pos,
3486                            match_info->offsets[0] - str_pos);
3487       done = (*eval) (match_info, result, user_data);
3488       str_pos = match_info->offsets[1];
3489       g_match_info_next (match_info, &tmp_error);
3490     }
3491   g_match_info_free (match_info);
3492   if (tmp_error != NULL)
3493     {
3494       g_propagate_error (error, tmp_error);
3495       g_string_free (result, TRUE);
3496       return NULL;
3497     }
3498
3499   g_string_append_len (result, string + str_pos, string_len - str_pos);
3500   return g_string_free (result, FALSE);
3501 }
3502
3503 /**
3504  * g_regex_check_replacement:
3505  * @replacement: the replacement string
3506  * @has_references: (out) (optional): location to store information about
3507  *   references in @replacement or %NULL
3508  * @error: location to store error
3509  *
3510  * Checks whether @replacement is a valid replacement string
3511  * (see g_regex_replace()), i.e. that all escape sequences in
3512  * it are valid.
3513  *
3514  * If @has_references is not %NULL then @replacement is checked
3515  * for pattern references. For instance, replacement text 'foo\n'
3516  * does not contain references and may be evaluated without information
3517  * about actual match, but '\0\1' (whole match followed by first
3518  * subpattern) requires valid #GMatchInfo object.
3519  *
3520  * Returns: whether @replacement is a valid replacement string
3521  *
3522  * Since: 2.14
3523  */
3524 gboolean
3525 g_regex_check_replacement (const gchar  *replacement,
3526                            gboolean     *has_references,
3527                            GError      **error)
3528 {
3529   GList *list;
3530   GError *tmp = NULL;
3531
3532   list = split_replacement (replacement, &tmp);
3533
3534   if (tmp)
3535   {
3536     g_propagate_error (error, tmp);
3537     return FALSE;
3538   }
3539
3540   if (has_references)
3541     *has_references = interpolation_list_needs_match (list);
3542
3543   g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
3544
3545   return TRUE;
3546 }
3547
3548 /**
3549  * g_regex_escape_nul:
3550  * @string: the string to escape
3551  * @length: the length of @string
3552  *
3553  * Escapes the nul characters in @string to "\x00".  It can be used
3554  * to compile a regex with embedded nul characters.
3555  *
3556  * For completeness, @length can be -1 for a nul-terminated string.
3557  * In this case the output string will be of course equal to @string.
3558  *
3559  * Returns: a newly-allocated escaped string
3560  *
3561  * Since: 2.30
3562  */
3563 gchar *
3564 g_regex_escape_nul (const gchar *string,
3565                     gint         length)
3566 {
3567   GString *escaped;
3568   const gchar *p, *piece_start, *end;
3569   gint backslashes;
3570
3571   g_return_val_if_fail (string != NULL, NULL);
3572
3573   if (length < 0)
3574     return g_strdup (string);
3575
3576   end = string + length;
3577   p = piece_start = string;
3578   escaped = g_string_sized_new (length + 1);
3579
3580   backslashes = 0;
3581   while (p < end)
3582     {
3583       switch (*p)
3584         {
3585         case '\0':
3586           if (p != piece_start)
3587             {
3588               /* copy the previous piece. */
3589               g_string_append_len (escaped, piece_start, p - piece_start);
3590             }
3591           if ((backslashes & 1) == 0)
3592             g_string_append_c (escaped, '\\');
3593           g_string_append_c (escaped, 'x');
3594           g_string_append_c (escaped, '0');
3595           g_string_append_c (escaped, '0');
3596           piece_start = ++p;
3597           backslashes = 0;
3598           break;
3599         case '\\':
3600           backslashes++;
3601           ++p;
3602           break;
3603         default:
3604           backslashes = 0;
3605           p = g_utf8_next_char (p);
3606           break;
3607         }
3608     }
3609
3610   if (piece_start < end)
3611     g_string_append_len (escaped, piece_start, end - piece_start);
3612
3613   return g_string_free (escaped, FALSE);
3614 }
3615
3616 /**
3617  * g_regex_escape_string:
3618  * @string: the string to escape
3619  * @length: the length of @string, in bytes, or -1 if @string is nul-terminated
3620  *
3621  * Escapes the special characters used for regular expressions
3622  * in @string, for instance "a.b*c" becomes "a\.b\*c". This
3623  * function is useful to dynamically generate regular expressions.
3624  *
3625  * @string can contain nul characters that are replaced with "\0",
3626  * in this case remember to specify the correct length of @string
3627  * in @length.
3628  *
3629  * Returns: a newly-allocated escaped string
3630  *
3631  * Since: 2.14
3632  */
3633 gchar *
3634 g_regex_escape_string (const gchar *string,
3635                        gint         length)
3636 {
3637   GString *escaped;
3638   const char *p, *piece_start, *end;
3639
3640   g_return_val_if_fail (string != NULL, NULL);
3641
3642   if (length < 0)
3643     length = strlen (string);
3644
3645   end = string + length;
3646   p = piece_start = string;
3647   escaped = g_string_sized_new (length + 1);
3648
3649   while (p < end)
3650     {
3651       switch (*p)
3652         {
3653         case '\0':
3654         case '\\':
3655         case '|':
3656         case '(':
3657         case ')':
3658         case '[':
3659         case ']':
3660         case '{':
3661         case '}':
3662         case '^':
3663         case '$':
3664         case '*':
3665         case '+':
3666         case '?':
3667         case '.':
3668           if (p != piece_start)
3669             /* copy the previous piece. */
3670             g_string_append_len (escaped, piece_start, p - piece_start);
3671           g_string_append_c (escaped, '\\');
3672           if (*p == '\0')
3673             g_string_append_c (escaped, '0');
3674           else
3675             g_string_append_c (escaped, *p);
3676           piece_start = ++p;
3677           break;
3678         default:
3679           p = g_utf8_next_char (p);
3680           break;
3681         }
3682   }
3683
3684   if (piece_start < end)
3685     g_string_append_len (escaped, piece_start, end - piece_start);
3686
3687   return g_string_free (escaped, FALSE);
3688 }