#include <string.h>
-#include "glib.h"
-#include "glibintl.h"
-#include "gregex.h"
-
#ifdef USE_SYSTEM_PCRE
#include <pcre.h>
#else
#include "pcre/pcre.h"
#endif
+#include "gtypes.h"
+#include "gregex.h"
+#include "glibintl.h"
+#include "glist.h"
+#include "gmessages.h"
+#include "gstrfuncs.h"
+#include "gatomic.h"
+#include "gthread.h"
+
+/**
+ * SECTION:gregex
+ * @title: Perl-compatible regular expressions
+ * @short_description: matches strings against regular expressions
+ * @see_also: <xref linkend="glib-regex-syntax"/>
+ *
+ * The <function>g_regex_*()</function> functions implement regular
+ * expression pattern matching using syntax and semantics similar to
+ * Perl regular expression.
+ *
+ * Some functions accept a @start_position argument, setting it differs
+ * from just passing over a shortened string and setting #G_REGEX_MATCH_NOTBOL
+ * in the case of a pattern that begins with any kind of lookbehind assertion.
+ * For example, consider the pattern "\Biss\B" which finds occurrences of "iss"
+ * in the middle of words. ("\B" matches only if the current position in the
+ * subject is not a word boundary.) When applied to the string "Mississipi"
+ * from the fourth byte, namely "issipi", it does not match, because "\B" is
+ * always false at the start of the subject, which is deemed to be a word
+ * boundary. However, if the entire string is passed , but with
+ * @start_position set to 4, it finds the second occurrence of "iss" because
+ * it is able to look behind the starting point to discover that it is
+ * preceded by a letter.
+ *
+ * Note that, unless you set the #G_REGEX_RAW flag, all the strings passed
+ * to these functions must be encoded in UTF-8. The lengths and the positions
+ * inside the strings are in bytes and not in characters, so, for instance,
+ * "\xc3\xa0" (i.e. "à") is two bytes long but it is treated as a
+ * single character. If you set #G_REGEX_RAW the strings can be non-valid
+ * UTF-8 strings and a byte is treated as a character, so "\xc3\xa0" is two
+ * bytes and two characters long.
+ *
+ * When matching a pattern, "\n" matches only against a "\n" character in
+ * the string, and "\r" matches only a "\r" character. To match any newline
+ * sequence use "\R". This particular group matches either the two-character
+ * sequence CR + LF ("\r\n"), or one of the single characters LF (linefeed,
+ * U+000A, "\n"), VT vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"),
+ * CR (carriage return, U+000D, "\r"), NEL (next line, U+0085), LS (line
+ * separator, U+2028), or PS (paragraph separator, U+2029).
+ *
+ * The behaviour of the dot, circumflex, and dollar metacharacters are
+ * affected by newline characters, the default is to recognize any newline
+ * character (the same characters recognized by "\R"). This can be changed
+ * with #G_REGEX_NEWLINE_CR, #G_REGEX_NEWLINE_LF and #G_REGEX_NEWLINE_CRLF
+ * compile options, and with #G_REGEX_MATCH_NEWLINE_ANY,
+ * #G_REGEX_MATCH_NEWLINE_CR, #G_REGEX_MATCH_NEWLINE_LF and
+ * #G_REGEX_MATCH_NEWLINE_CRLF match options. These settings are also
+ * relevant when compiling a pattern if #G_REGEX_EXTENDED is set, and an
+ * unescaped "#" outside a character class is encountered. This indicates
+ * a comment that lasts until after the next newline.
+ *
+ * Creating and manipulating the same #GRegex structure from different
+ * threads is not a problem as #GRegex does not modify its internal
+ * state between creation and destruction, on the other hand #GMatchInfo
+ * is not threadsafe.
+ *
+ * The regular expressions low-level functionalities are obtained through
+ * the excellent <ulink url="http://www.pcre.org/">PCRE</ulink> library
+ * written by Philip Hazel.
+ */
+
/* Mask of all the possible values for GRegexCompileFlags. */
#define G_REGEX_COMPILE_MASK (G_REGEX_CASELESS | \
G_REGEX_MULTILINE | \
struct _GMatchInfo
{
+ volatile gint ref_count; /* the ref count */
GRegex *regex; /* the regex */
GRegexMatchFlags match_opts; /* options used at match time on the regex */
gint matches; /* number of matching sub patterns */
gint pos; /* position in the string where last match left off */
+ gint n_offsets; /* number of offsets */
gint *offsets; /* array of offsets paired 0,1 ; 2,3 ; 3,4 etc */
- gint n_offsets; /* number of offsets */
gint *workspace; /* workspace for pcre_dfa_exec() */
gint n_workspace; /* number of workspace elements */
const gchar *string; /* string passed to the match function */
return _("workspace limit for empty substrings reached");
case PCRE_ERROR_BADNEWLINE:
return _("invalid combination of newline flags");
+ case PCRE_ERROR_BADOFFSET:
+ return _("bad offset");
+ case PCRE_ERROR_SHORTUTF8:
+ return _("short utf8");
default:
break;
}
string_len = strlen (string);
match_info = g_new0 (GMatchInfo, 1);
+ match_info->ref_count = 1;
match_info->regex = g_regex_ref ((GRegex *)regex);
match_info->string = string;
match_info->string_len = string_len;
}
/**
- * g_match_info_free:
+ * g_match_info_ref:
* @match_info: a #GMatchInfo
*
- * Frees all the memory associated with the #GMatchInfo structure.
+ * Increases reference count of @match_info by 1.
*
- * Since: 2.14
+ * Returns: @match_info
+ *
+ * Since: 2.30
+ */
+GMatchInfo *
+g_match_info_ref (GMatchInfo *match_info)
+{
+ g_return_val_if_fail (match_info != NULL, NULL);
+ g_atomic_int_inc (&match_info->ref_count);
+ return match_info;
+}
+
+/**
+ * g_match_info_unref:
+ * @match_info: a #GMatchInfo
+ *
+ * Decreases reference count of @match_info by 1. When reference count drops
+ * to zero, it frees all the memory associated with the match_info structure.
+ *
+ * Since: 2.30
*/
void
-g_match_info_free (GMatchInfo *match_info)
+g_match_info_unref (GMatchInfo *match_info)
{
- if (match_info)
+ if (g_atomic_int_dec_and_test (&match_info->ref_count))
{
g_regex_unref (match_info->regex);
g_free (match_info->offsets);
}
/**
+ * g_match_info_free:
+ * @match_info: (allow-none): a #GMatchInfo, or %NULL
+ *
+ * If @match_info is not %NULL, calls g_match_info_unref(); otherwise does
+ * nothing.
+ *
+ * Since: 2.14
+ */
+void
+g_match_info_free (GMatchInfo *match_info)
+{
+ if (match_info == NULL)
+ return;
+
+ g_match_info_unref (match_info);
+}
+
+/**
* g_match_info_next:
* @match_info: a #GMatchInfo structure
- * @error: location to store the error occuring, or %NULL to ignore errors
+ * @error: location to store the error occurring, or %NULL to ignore errors
*
* Scans for the next match using the same parameters of the previous
* call to g_regex_match_full() or g_regex_match() that returned
prev_match_start = match_info->offsets[0];
prev_match_end = match_info->offsets[1];
+ if (match_info->pos > match_info->string_len)
+ {
+ /* we have reached the end of the string */
+ match_info->pos = -1;
+ match_info->matches = PCRE_ERROR_NOMATCH;
+ return FALSE;
+ }
+
match_info->matches = pcre_exec (match_info->regex->pcre_re,
match_info->regex->extra,
match_info->string,
* g_match_info_expand_references:
* @match_info: a #GMatchInfo or %NULL
* @string_to_expand: the string to expand
- * @error: location to store the error occuring, or %NULL to ignore errors
+ * @error: location to store the error occurring, or %NULL to ignore errors
*
* Returns a new string containing the text in @string_to_expand with
* references and escape sequences expanded. References refer to the last
{
g_return_if_fail (regex != NULL);
- if (g_atomic_int_exchange_and_add (®ex->ref_count, -1) - 1 == 0)
+ if (g_atomic_int_dec_and_test (®ex->ref_count))
{
g_free (regex->pattern);
if (regex->pcre_re != NULL)
gint erroffset;
gint errcode;
gboolean optimize = FALSE;
- static gboolean initialized = FALSE;
+ static gsize initialised;
unsigned long int pcre_compile_options;
g_return_val_if_fail (pattern != NULL, NULL);
g_return_val_if_fail ((compile_options & ~G_REGEX_COMPILE_MASK) == 0, NULL);
g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
- if (!initialized)
+ if (g_once_init_enter (&initialised))
{
gint support;
const gchar *msg;
return NULL;
}
- initialized = TRUE;
+ g_once_init_leave (&initialised, TRUE);
}
/* G_REGEX_OPTIMIZE has the same numeric value of PCRE_NO_UTF8_CHECK,
compile_options |= PCRE_NEWLINE_ANY;
}
+ compile_options |= PCRE_UCP;
+
/* compile the pattern */
re = pcre_compile2 (pattern, compile_options, &errcode,
&errmsg, &erroffset, NULL);
* @match_options: match options
* @match_info: (out) (allow-none): pointer to location where to store
* the #GMatchInfo, or %NULL if you do not need it
- * @error: location to store the error occuring, or %NULL to ignore errors
+ * @error: location to store the error occurring, or %NULL to ignore errors
*
* Scans for a match in string for the pattern in @regex.
* The @match_options are combined with the match options specified
* @match_options: match options
* @match_info: (out) (allow-none): pointer to location where to store
* the #GMatchInfo, or %NULL if you do not need it
- * @error: location to store the error occuring, or %NULL to ignore errors
+ * @error: location to store the error occurring, or %NULL to ignore errors
*
* Using the standard algorithm for regular expression matching only
- * the longest match in the string is retrieved, it is not possibile
+ * the longest match in the string is retrieved, it is not possible
* to obtain all the available matches. For instance matching
* "<a> <b> <c>" against the pattern "<.*>"
* you get "<a> <b> <c>".
regex = g_regex_new (pattern, compile_options, 0, NULL);
if (!regex)
return NULL;
+
result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL);
g_regex_unref (regex);
return result;
match_ok = g_regex_match_full (regex, string, string_len, start_position,
match_options, &match_info, &tmp_error);
+
while (tmp_error == NULL)
{
if (match_ok)
* @start_position: starting index of the string to match
* @replacement: text to replace each match with
* @match_options: options for the match
- * @error: location to store the error occuring, or %NULL to ignore errors
+ * @error: location to store the error occurring, or %NULL to ignore errors
*
* Replaces all occurrences of the pattern in @regex with the
* replacement text. Backreferences of the form '\number' or
* @start_position: starting index of the string to match
* @replacement: text to replace each match with
* @match_options: options for the match
- * @error: location to store the error occuring, or %NULL to ignore errors
+ * @error: location to store the error occurring, or %NULL to ignore errors
*
* Replaces all occurrences of the pattern in @regex with the
* replacement text. @replacement is replaced literally, to
/**
* g_regex_replace_eval:
* @regex: a #GRegex structure from g_regex_new()
- * @string (array length=string_len): string to perform matches against
+ * @string: (array length=string_len): string to perform matches against
* @string_len: the length of @string, or -1 if @string is nul-terminated
* @start_position: starting index of the string to match
* @match_options: options for the match
* @eval: a function to call for each match
* @user_data: user data to pass to the function
- * @error: location to store the error occuring, or %NULL to ignore errors
+ * @error: location to store the error occurring, or %NULL to ignore errors
*
* Replaces occurrences of the pattern in regex with the output of
* @eval for that occurrence.
}
/**
+ * g_regex_escape_nul:
+ * @string: the string to escape
+ * @length: the length of @string
+ *
+ * Escapes the nul characters in @string to "\x00". It can be used
+ * to compile a regex with embedded nul characters.
+ *
+ * For completeness, @length can be -1 for a nul-terminated string.
+ * In this case the output string will be of course equal to @string.
+ *
+ * Returns: a newly-allocated escaped string
+ *
+ * Since: 2.30
+ */
+gchar *
+g_regex_escape_nul (const gchar *string,
+ gint length)
+{
+ GString *escaped;
+ const gchar *p, *piece_start, *end;
+ gint backslashes;
+
+ g_return_val_if_fail (string != NULL, NULL);
+
+ if (length < 0)
+ return g_strdup (string);
+
+ end = string + length;
+ p = piece_start = string;
+ escaped = g_string_sized_new (length + 1);
+
+ backslashes = 0;
+ while (p < end)
+ {
+ switch (*p)
+ {
+ case '\0':
+ if (p != piece_start)
+ {
+ /* copy the previous piece. */
+ g_string_append_len (escaped, piece_start, p - piece_start);
+ }
+ if ((backslashes & 1) == 0)
+ g_string_append_c (escaped, '\\');
+ g_string_append_c (escaped, 'x');
+ g_string_append_c (escaped, '0');
+ g_string_append_c (escaped, '0');
+ piece_start = ++p;
+ backslashes = 0;
+ break;
+ case '\\':
+ backslashes++;
+ ++p;
+ break;
+ default:
+ backslashes = 0;
+ p = g_utf8_next_char (p);
+ break;
+ }
+ }
+
+ if (piece_start < end)
+ g_string_append_len (escaped, piece_start, end - piece_start);
+
+ return g_string_free (escaped, FALSE);
+}
+
+/**
* g_regex_escape_string:
* @string: (array length=length): the string to escape
* @length: the length of @string, or -1 if @string is nul-terminated