From: Marco Barisione Date: Thu, 15 Mar 2007 13:01:31 +0000 (+0000) Subject: Add GRegex for regular expression matching. (#50075) X-Git-Tag: GLIB_2_13_0~9 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0196d639753247cb0d8aca289155154ef6daa561;p=platform%2Fupstream%2Fglib.git Add GRegex for regular expression matching. (#50075) 2007-03-15 Marco Barisione Add GRegex for regular expression matching. (#50075) * configure.in: Handle GRegex compilation. * glib/gregex.c: * glib/gregex.h: Code for GRegex. * glib/Makefile.am: * glib/makefile.msc.in: Updated makefiles. * glib/pcre/*: Internal copy of PCRE. * glib/update-pcre/*: Stuff to automatically update the internal PCRE to a newer version. * tests/regex-test.c: * tests/Makefile.am: * tests/makefile.msc.in: Add tests for GRegex. svn path=/trunk/; revision=5408 --- diff --git a/ChangeLog b/ChangeLog index 6212cbe..1967453 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,24 @@ +2007-03-15 Marco Barisione + + Add GRegex for regular expression matching. (#50075) + + * configure.in: Handle GRegex compilation. + + * glib/gregex.c: + * glib/gregex.h: Code for GRegex. + + * glib/Makefile.am: + * glib/makefile.msc.in: Updated makefiles. + + * glib/pcre/*: Internal copy of PCRE. + + * glib/update-pcre/*: Stuff to automatically update the internal PCRE + to a newer version. + + * tests/regex-test.c: + * tests/Makefile.am: + * tests/makefile.msc.in: Add tests for GRegex. 
+ 2007-03-15 Chris Wilson * glib/gmain.c (g_main_dispatch): Replace a diff --git a/configure.in b/configure.in index 86e09d7..9f6163c 100644 --- a/configure.in +++ b/configure.in @@ -173,7 +173,7 @@ AM_CONDITIONAL(MS_LIB_AVAILABLE, [test x$ms_librarian = xyes]) if test "$glib_native_win32" != yes; then # libtool option to control which symbols are exported # right now, symbols starting with _ are not exported - LIBTOOL_EXPORT_OPTIONS='-export-symbols-regex "^[[^_]].*"' + LIBTOOL_EXPORT_OPTIONS='-export-symbols-regex "^g.*"' else # We currently use .def files on Windows LIBTOOL_EXPORT_OPTIONS= @@ -2146,6 +2146,74 @@ AC_RUN_IFELSE([AC_LANG_SOURCE([[ [broken_poll="no (cross compiling)"]) AC_MSG_RESULT($broken_poll) +dnl ********************* +dnl *** GRegex checks *** +dnl ********************* +PCRE_REQUIRED_VERSION=7.0 + +# Check if we should compile GRegex +AC_ARG_ENABLE(regex, AC_HELP_STRING([--disable-regex], + [disable the compilation of GRegex]), +[case "${enableval}" in + yes) enable_regex=true ;; + no) enable_regex=false ;; + *) AC_MSG_ERROR(bad value ${enableval} for --enable-regex) ;; +esac], +[enable_regex=true]) + +AM_CONDITIONAL(ENABLE_REGEX, $enable_regex) + +if test x$enable_regex = xtrue; then + # Check if we should use the internal or the system-supplied pcre + AC_ARG_WITH(pcre, + [AC_HELP_STRING([--with-pcre=@<:@internal/system@:>@], + [specify whether to use the internal or the + system-supplied PCRE library])]) + + AM_CONDITIONAL(USE_SYSTEM_PCRE, [test "x$with_pcre" = xsystem]) + + if test "x$with_pcre" = xsystem; then + PKG_CHECK_MODULES(PCRE, + libpcre >= $PCRE_REQUIRED_VERSION) + AC_CACHE_CHECK([for Unicode support in PCRE],glib_cv_pcre_has_unicode,[ + CFLAGS="$PCRE_CFLAGS" LDFLAGS="$PCRE_LIBS" + AC_TRY_RUN([#include + int main () { + int support; + pcre_config (PCRE_CONFIG_UTF8, &support); + if (!support) + return 1; + pcre_config (PCRE_CONFIG_UNICODE_PROPERTIES, &support); + if (!support) + return 1; + return 0; + }], + 
glib_cv_pcre_has_unicode=yes, + glib_cv_pcre_has_unicode=no, + glib_cv_pcre_has_unicode=yes)]) + if test "$glib_cv_pcre_has_unicode" = "no"; then + AC_MSG_ERROR([*** The system-supplied PCRE does not support Unicode properties or UTF-8.]) + fi + AC_SUBST(PCRE_CFLAGS) + AC_SUBST(PCRE_LIBS) + AC_DEFINE(USE_SYSTEM_PCRE, [], [using the system-supplied PCRE library]) + else + # If using gcc 4 pass -Wno-pointer-sign when compiling the internal PCRE + if test x"$GCC" = xyes; then + AC_MSG_CHECKING([whether gcc understands -Wno-pointer-sign]) + if test [`$CC --version | sed -e 's/[^0-9]*\([0-9]\).*/\1/' -e q`] -ge 4; then + PCRE_WARN_CFLAGS="$PCRE_WARN_CFLAGS -Wno-pointer-sign" + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + fi + fi + fi + AC_SUBST(PCRE_WARN_CFLAGS) +else + AM_CONDITIONAL(USE_SYSTEM_PCRE, false]) +fi + dnl ********************** dnl *** Win32 API libs *** dnl ********************** @@ -2864,6 +2932,8 @@ Makefile glib/Makefile glib/libcharset/Makefile glib/gnulib/Makefile +glib/pcre/Makefile +glib/update-pcre/Makefile gmodule/Makefile gmodule/gmoduleconf.h gobject/Makefile diff --git a/docs/reference/ChangeLog b/docs/reference/ChangeLog index 77732d8..eb7cffc 100644 --- a/docs/reference/ChangeLog +++ b/docs/reference/ChangeLog @@ -1,3 +1,17 @@ +2007-03-15 Marco Barisione + + Add GRegex for regular expression matching. (#50075) + + * glib/Makefile.am: + * glib/glib-docs.sgml: + * glib/glib-sections.txt: + * glib/tmpl/glib-unused.sgml: + * glib/regex-syntax.sgml: + * glib/tmpl/gregex-unused.sgml: + * glib/tmpl/gregex.sgml: Add GRegex. + + * glib/building.sgml: Document build options for GRegex. 
+ 2007-03-14 Stefan Kost * gobject/tmpl/gparamspec.sgml: diff --git a/docs/reference/glib/Makefile.am b/docs/reference/glib/Makefile.am index 0ae3eca..d6eb74e 100644 --- a/docs/reference/glib/Makefile.am +++ b/docs/reference/glib/Makefile.am @@ -37,7 +37,9 @@ IGNORE_HFILES= \ gmirroringtable.h \ gscripttable.h \ glib-mirroring-tab \ - gnulib + gnulib \ + pcre \ + update-pcre # Extra options to supply to gtkdoc-mkdb MKDB_OPTIONS=--sgml-mode --output-format=xml --ignore-files=trio @@ -55,6 +57,7 @@ content_files = \ changes.sgml \ compiling.sgml \ resources.sgml \ + regex-syntax.sgml \ version.xml \ glib-gettextize.xml diff --git a/docs/reference/glib/building.sgml b/docs/reference/glib/building.sgml index 02d063d..ccd734e 100644 --- a/docs/reference/glib/building.sgml +++ b/docs/reference/glib/building.sgml @@ -146,6 +146,16 @@ How to compile GLib itself e.g. POSIX threads, DCE threads or Solaris threads. + + + GRegex uses the PCRE library + for regular expression matching. The default is to use the internal + version of PCRE that is patched to use GLib for memory management + and Unicode handling. If you prefer to use the system-supplied PCRE + library you can pass the --with-pcre=system option to configure, + but it is not recommended. + + @@ -177,6 +187,13 @@ How to compile GLib itself --with-threads=[none|posix|dce|win32] + + --disable-regex + --enable-regex + + + --with-pcre=[internal|system] + --disable-included-printf --enable-included-printf @@ -362,6 +379,61 @@ How to compile GLib itself + <systemitem>--disable-regex</systemitem> and + <systemitem>--enable-regex</systemitem> + + + Do not compile GLib with regular expression support. + GLib will be smaller because it will not need the + PCRE library. This is however not recommended, as + programs may need GRegex. + + + + + <systemitem>--with-pcre</systemitem> + + + Specify whether to use the internal or the system-supplied + PCRE library. 
+ + + 'internal' means that GRegex will be compiled to use + the internal PCRE library. + + + + 'system' means that GRegex will be compiled to use + the system-supplied PCRE library. + + + Using the internal PCRE is the preferred solution: + + + + System-supplied PCRE has a separate copy of the big tables + used for Unicode handling. + + + + + Some systems have PCRE libraries compiled without some needed + features, such as UTF-8 and Unicode support. + + + + + PCRE uses some global variables for memory management and + other features. In the rare case of a program using both + GRegex and PCRE (maybe indirectly through a library), + these variables could lead to problems when they are modified. + + + + + + + <systemitem>--disable-included-printf</systemitem> and <systemitem>--enable-included-printf</systemitem> diff --git a/docs/reference/glib/glib-docs.sgml b/docs/reference/glib/glib-docs.sgml index 82b6302..1b12ed5 100644 --- a/docs/reference/glib/glib-docs.sgml +++ b/docs/reference/glib/glib-docs.sgml @@ -61,6 +61,7 @@ + @@ -69,6 +70,7 @@ + @@ -101,6 +103,7 @@ synchronize their operation. &glib-Compiling; &glib-Running; &glib-Changes; + &glib-RegexSyntax; &glib-Resources; @@ -151,6 +154,7 @@ synchronize their operation. &glib-Shell; &glib-Option; &glib-Pattern-Matching; + &glib-Regex; &glib-Markup; &glib-Keyfile; &glib-Bookmarkfile; diff --git a/docs/reference/glib/glib-sections.txt b/docs/reference/glib/glib-sections.txt index 83a1e27..3cb40a3 100644 --- a/docs/reference/glib/glib-sections.txt +++ b/docs/reference/glib/glib-sections.txt @@ -864,6 +864,50 @@ g_pattern_match_simple
+Perl-compatible regular expressions +gregex +GRegexError +G_REGEX_ERROR +GRegexCompileFlags +GRegexMatchFlags +GRegex +GRegexEvalCallback +g_regex_new +g_regex_free +g_regex_optimize +g_regex_copy +g_regex_get_pattern +g_regex_clear +g_regex_match_simple +g_regex_match +g_regex_match_full +g_regex_match_next +g_regex_match_next_full +g_regex_match_all +g_regex_match_all_full +g_regex_get_match_count +g_regex_is_partial_match +g_regex_fetch +g_regex_fetch_pos +g_regex_fetch_named +g_regex_fetch_named_pos +g_regex_fetch_all +g_regex_get_string_number +g_regex_split_simple +g_regex_split +g_regex_split_full +g_regex_split_next +g_regex_split_next_full +g_regex_expand_references +g_regex_replace +g_regex_replace_literal +g_regex_replace_eval +g_regex_escape_string + +g_regex_error_quark +
+ +
Message Logging messages G_LOG_DOMAIN diff --git a/docs/reference/glib/regex-syntax.sgml b/docs/reference/glib/regex-syntax.sgml new file mode 100644 index 0000000..1b9e523 --- /dev/null +++ b/docs/reference/glib/regex-syntax.sgml @@ -0,0 +1,2704 @@ + + +Regular expression syntax + + + + + +Regular expression syntax + +Syntax and semantics of the regular expressions supported by GRegex + + + + +GRegex regular expression details + +A regular expression is a pattern that is matched against a +string from left to right. Most characters stand for themselves in a +pattern, and match the corresponding characters in the string. As a +trivial example, the pattern + + + +The quick brown fox + + + +matches a portion of a string that is identical to itself. When +caseless matching is specified (the G_REGEX_CASELESS flag), letters are +matched independently of case. + + + +The power of regular expressions comes from the ability to include +alternatives and repetitions in the pattern. These are encoded in the +pattern by the use of metacharacters, which do not stand for themselves +but instead are interpreted in some special way. + + + +There are two different sets of metacharacters: those that are recognized +anywhere in the pattern except within square brackets, and those +that are recognized in square brackets. Outside square brackets, the +metacharacters are as follows: + + + +Metacharacters outside square brackets + + + + + Character + Meaning + + + + + \ + general escape character with several uses + + + ^ + assert start of string (or line, in multiline mode) + + + $ + assert end of string (or line, in multiline mode) + + + . + match any character except newline (by default) + + + [ + start character class definition + + + | + start of alternative branch + + + ( + start subpattern + + + ) + end subpattern + + + ? 
+ extends the meaning of (, or 0/1 quantifier, or quantifier minimizer + + + * + 0 or more quantifier + + + + + 1 or more quantifier, also "possessive quantifier" + + + { + start min/max quantifier + + + +
+ + +Part of a pattern that is in square brackets is called a "character +class". In a character class the only metacharacters are: + + + +Metacharacters inside square brackets + + + + + Character + Meaning + + + + + \ + general escape character + + + ^ + negate the class, but only if the first character + + + - + indicates character range + + + [ + POSIX character class (only if followed by POSIX syntax) + + + ] + terminates the character class + + + +
+
+ + +Backslash + +The backslash character has several uses. Firstly, if it is followed by +a non-alphanumeric character, it takes away any special meaning that +character may have. This use of backslash as an escape character +applies both inside and outside character classes. + + + +For example, if you want to match a * character, you write \* in the +pattern. This escaping action applies whether or not the following +character would otherwise be interpreted as a metacharacter, so it is +always safe to precede a non-alphanumeric with backslash to specify +that it stands for itself. In particular, if you want to match a +backslash, you write \\. + + + +If a pattern is compiled with the G_REGEX_EXTENDED +option, whitespace in the pattern (other than in a character class) and +characters between a # outside a character class and the next newline +are ignored. +An escaping backslash can be used to include a whitespace or # character +as part of the pattern. + + + +If you want to remove the special meaning from a sequence of characters, +you can do so by putting them between \Q and \E. +The \Q...\E sequence is recognized both inside and outside character +classes. + + + +Non-printing characters + +A second use of backslash provides a way of encoding non-printing +characters in patterns in a visible manner. 
There is no restriction on the +appearance of non-printing characters, apart from the binary zero that +terminates a pattern, but when a pattern is being prepared by text +editing, it is usually easier to use one of the following escape +sequences than the binary character it represents: + + + +Non-printing characters + + + + + Escape + Meaning + + + + + \a + alarm, that is, the BEL character (hex 07) + + + \cx + "control-x", where x is any character + + + \e + escape (hex 1B) + + + \f + formfeed (hex 0C) + + + \n + newline (hex 0A) + + + \r + carriage return (hex 0D) + + + \t + tab (hex 09) + + + \ddd + character with octal code ddd, or backreference + + + \xhh + character with hex code hh + + + \x{hhh..} + character with hex code hhh.. + + + +
+ + +The precise effect of \cx is as follows: if x is a lower case letter, +it is converted to upper case. Then bit 6 of the character (hex 40) is +inverted. Thus \cz becomes hex 1A, but \c{ becomes hex 3B, while \c; +becomes hex 7B. + + + +After \x, from zero to two hexadecimal digits are read (letters can be +in upper or lower case). Any number of hexadecimal digits may appear +between \x{ and }, but the value of the character code +must be less than 2**31 (that is, the maximum hexadecimal value is +7FFFFFFF). If characters other than hexadecimal digits appear between +\x{ and }, or if there is no terminating }, this form of escape is not +recognized. Instead, the initial \x will be interpreted as a basic hexadecimal +escape, with no following digits, giving a character whose +value is zero. + + + +Characters whose value is less than 256 can be defined by either of the +two syntaxes for \x. There is no difference +in the way they are handled. For example, \xdc is exactly the same as +\x{dc}. + + + +After \0 up to two further octal digits are read. If there are fewer +than two digits, just those that are present are used. +Thus the sequence \0\x\07 specifies two binary zeros followed by a BEL +character (code value 7). Make sure you supply two digits after the +initial zero if the pattern character that follows is itself an octal +digit. + + + +The handling of a backslash followed by a digit other than 0 is complicated. +Outside a character class, GRegex reads it and any following digits as a +decimal number. If the number is less than 10, or if there +have been at least that many previous capturing left parentheses in the +expression, the entire sequence is taken as a back reference. A +description of how this works is given later, following the discussion +of parenthesized subpatterns. 
+ + + +Inside a character class, or if the decimal number is greater than 9 +and there have not been that many capturing subpatterns, GRegex re-reads +up to three octal digits following the backslash, and uses them to generate +a data character. Any subsequent digits stand for themselves. For example: + + + +Non-printing characters + + + + + Escape + Meaning + + + + + \040 + is another way of writing a space + + + \40 + is the same, provided there are fewer than 40 previous capturing subpatterns + + + \7 + is always a back reference + + + \11 + might be a back reference, or another way of writing a tab + + + \011 + is always a tab + + + \0113 + is a tab followed by the character "3" + + + \113 + might be a back reference, otherwise the character with octal code 113 + + + \377 + might be a back reference, otherwise the byte consisting entirely of 1 bits + + + \81 + is either a back reference, or a binary zero followed by the two characters "8" and "1" + + + +
+ + +Note that octal values of 100 or greater must not be introduced by a +leading zero, because no more than three octal digits are ever read. + + + +All the sequences that define a single character can be used both inside +and outside character classes. In addition, inside a character class, the +sequence \b is interpreted as the backspace character (hex 08), and the +sequences \R and \X are interpreted as the characters "R" and "X", respectively. +Outside a character class, these sequences have different meanings (see below). + +
+ + +Absolute and relative back references + +The sequence \g followed by a positive or negative number, optionally enclosed +in braces, is an absolute or relative back reference. Back references are +discussed later, following the discussion of parenthesized subpatterns. + + + + +Generic character types + + +Another use of backslash is for specifying generic character types. +The following are always recognized: + + + +Generic characters + + + + + Escape + Meaning + + + + + \d + any decimal digit + + + \D + any character that is not a decimal digit + + + \s + any whitespace character + + + \S + any character that is not a whitespace character + + + \w + any "word" character + + + \W + any "non-word" character + + + +
+ + +Each pair of escape sequences partitions the complete set of characters +into two disjoint sets. Any given character matches one, and only one, +of each pair. + + + +These character type sequences can appear both inside and outside character +classes. They each match one character of the appropriate type. +If the current matching point is at the end of the passed string, all +of them fail, since there is no character to match. + + + +For compatibility with Perl, \s does not match the VT character (code +11). This makes it different from the POSIX "space" class. The \s +characters are HT (9), LF (10), FF (12), CR (13), and space (32). + + + +A "word" character is an underscore or any character less than 256 that +is a letter or digit. + + +Characters with values greater than 128 never match \d, +\s, or \w, and always match \D, \S, and \W. + 
+ + +Newline sequences +Outside a character class, the escape sequence \R matches any Unicode +newline sequence. +This particular group matches either the two-character sequence CR followed by +LF, or one of the single characters LF (linefeed, U+000A), VT (vertical tab, +U+000B), FF (formfeed, U+000C), CR (carriage return, U+000D), NEL (next +line, U+0085), LS (line separator, U+2028), or PS (paragraph separator, U+2029). +The two-character sequence is treated as a single unit that +cannot be split. Inside a character class, \R matches the letter "R". + + + +Unicode character properties + +To support generic character types there are three additional escape +sequences, they are: + + + +Generic character types + + + + + Escape + Meaning + + + + + \p{xx} + a character with the xx property + + + \P{xx} + a character without the xx property + + + \X + an extended Unicode sequence + + + +
+ + +The property names represented by xx above are limited to the Unicode +script names, the general category properties, and "Any", which matches +any character (including newline). Other properties such as "InMusicalSymbols" +are not currently supported. Note that \P{Any} does not match any characters, +so always causes a match failure. + + + +Sets of Unicode characters are defined as belonging to certain scripts. A +character from one of these sets can be matched using a script name. For +example, \p{Greek} or \P{Han}. + + + +Those that are not part of an identified script are lumped together as +"Common". The current list of scripts is: + + + +Arabic +Armenian +Balinese +Bengali +Bopomofo +Braille +Buginese +Buhid +Canadian_Aboriginal +Cherokee +Common +Coptic +Cuneiform +Cypriot +Cyrillic +Deseret +Devanagari +Ethiopic +Georgian +Glagolitic +Gothic +Greek +Gujarati +Gurmukhi +Han +Hangul +Hanunoo +Hebrew +Hiragana +Inherited +Kannada +Katakana +Kharoshthi +Khmer +Lao +Latin +Limbu +Linear_B +Malayalam +Mongolian +Myanmar +New_Tai_Lue +Nko +Ogham +Old_Italic +Old_Persian +Oriya +Osmanya +Phags_Pa +Phoenician +Runic +Shavian +Sinhala +Syloti_Nagri +Syriac +Tagalog +Tagbanwa +Tai_Le +Tamil +Telugu +Thaana +Thai +Tibetan +Tifinagh +Ugaritic +Yi + + + +Each character has exactly one general category property, specified by a +two-letter abbreviation. For compatibility with Perl, negation can be specified +by including a circumflex between the opening brace and the property name. For +example, \p{^Lu} is the same as \P{Lu}. + + + +If only one letter is specified with \p or \P, it includes all the general +category properties that start with that letter. 
In this case, in the absence +of negation, the curly brackets in the escape sequence are optional; these two +examples have the same effect: + + + +\p{L} +\pL + + + +The following general category property codes are supported: + + + +Property codes + + + + + Code + Meaning + + + + + C + Other + + + Cc + Control + + + Cf + Format + + + Cn + Unassigned + + + Co + Private use + + + Cs + Surrogate + + + L + Letter + + + Ll + Lower case letter + + + Lm + Modifier letter + + + Lo + Other letter + + + Lt + Title case letter + + + Lu + Upper case letter + + + M + Mark + + + Mc + Spacing mark + + + Me + Enclosing mark + + + Mn + Non-spacing mark + + + N + Number + + + Nd + Decimal number + + + Nl + Letter number + + + No + Other number + + + P + Punctuation + + + Pc + Connector punctuation + + + Pd + Dash punctuation + + + Pe + Close punctuation + + + Pf + Final punctuation + + + Pi + Initial punctuation + + + Po + Other punctuation + + + Ps + Open punctuation + + + S + Symbol + + + Sc + Currency symbol + + + Sk + Modifier symbol + + + Sm + Mathematical symbol + + + So + Other symbol + + + Z + Separator + + + Zl + Line separator + + + Zp + Paragraph separator + + + Zs + Space separator + + + +
+ + +The special property L& is also supported: it matches a character that has +the Lu, Ll, or Lt property, in other words, a letter that is not classified as +a modifier or "other". + + + +The long synonyms for these properties that Perl supports (such as \p{Letter}) +are not supported by GRegex, nor is it permitted to prefix any of these +properties with "Is". + + + +No character that is in the Unicode table has the Cn (unassigned) property. +Instead, this property is assumed for any code point that is not in the +Unicode table. + + + +Specifying caseless matching does not affect these escape sequences. +For example, \p{Lu} always matches only upper case letters. + + + +The \X escape matches any number of Unicode characters that form an +extended Unicode sequence. \X is equivalent to + + + +(?>\PM\pM*) + + + +That is, it matches a character without the "mark" property, followed +by zero or more characters with the "mark" property, and treats the +sequence as an atomic group (see below). Characters with the "mark" +property are typically accents that affect the preceding character. + + + +Matching characters by Unicode property is not fast, because GRegex has +to search a structure that contains data for over fifteen thousand +characters. That is why the traditional escape sequences such as \d and +\w do not use Unicode properties. + 
+ + +Simple assertions + +The final use of backslash is for certain simple assertions. An +assertion specifies a condition that has to be met at a particular point in +a match, without consuming any characters from the string. The +use of subpatterns for more complicated assertions is described below. +The backslashed assertions are: + + + +Simple assertions + + + + + Escape + Meaning + + + + + \b + matches at a word boundary + + + \B + matches when not at a word boundary + + + \A + matches at the start of the string + + + \Z + matches at the end of the string or before a newline at the end of the string + + + \z + matches only at the end of the string + + + \G + matches at first matching position in the string + + + +
+ + +These assertions may not appear in character classes (but note that \b +has a different meaning, namely the backspace character, inside a +character class). + + + +A word boundary is a position in the string where the current +character and the previous character do not both match \w or \W (i.e. +one matches \w and the other matches \W), or the start or end of the +string if the first or last character matches \w, respectively. + + + +The \A, \Z, and \z assertions differ from the traditional circumflex +and dollar (described in the next section) in that they only ever match +at the very start and end of the string, whatever options are +set. Thus, they are independent of multiline mode. These three assertions +are not affected by the G_REGEX_MATCH_NOTBOL or G_REGEX_MATCH_NOTEOL options, +which affect only the behaviour of the circumflex and dollar metacharacters. +However, if the start_position argument of a matching function is non-zero, +indicating that matching is to start at a point other than the beginning of +the string, \A can never match. The difference between \Z and \z is +that \Z matches before a newline at the end of the string as well as at the +very end, whereas \z matches only at the end. + + + +The \G assertion is true only when the current matching position is at +the start point of the match, as specified by the start_position argument +to the matching functions. It differs from \A when the value of start_position is +non-zero. + + + +Note, however, that the interpretation of \G, as the start of the +current match, is subtly different from Perl’s, which defines it as the +end of the previous match. In Perl, these can be different when the +previously matched string was empty. + + + +If all the alternatives of a pattern begin with \G, the expression is +anchored to the starting match position, and the "anchored" flag is set +in the compiled regular expression. + 
+
+ + +Circumflex and dollar + +Outside a character class, in the default matching mode, the circumflex +character is an assertion that is true only if the current matching +point is at the start of the string. If the start_position argument to +the matching functions is non-zero, circumflex can never match if the +G_REGEX_MULTILINE option is unset. Inside a character class, circumflex +has an entirely different meaning (see below). + + + +Circumflex need not be the first character of the pattern if a number +of alternatives are involved, but it should be the first thing in each +alternative in which it appears if the pattern is ever to match that +branch. If all possible alternatives start with a circumflex, that is, +if the pattern is constrained to match only at the start of the string, +it is said to be an "anchored" pattern. (There are also other +constructs that can cause a pattern to be anchored.) + + + +A dollar character is an assertion that is true only if the current +matching point is at the end of the string, or immediately +before a newline at the end of the string (by default). Dollar need not +be the last character of the pattern if a number of alternatives are +involved, but it should be the last item in any branch in which it +appears. Dollar has no special meaning in a character class. + + + +The meaning of dollar can be changed so that it matches only at the +very end of the string, by setting the G_REGEX_DOLLAR_ENDONLY option at +compile time. This does not affect the \Z assertion. + + + +The meanings of the circumflex and dollar characters are changed if the +G_REGEX_MULTILINE option is set. When this is the case, +a circumflex matches immediately after internal newlines as well as at the +start of the string. It does not match after a newline that ends the string. +A dollar matches before any newlines in the string, as well as at the very +end, when G_REGEX_MULTILINE is set. 
When newline is +specified as the two-character sequence CRLF, isolated CR and LF characters +do not indicate newlines. + + + +For example, the pattern /^abc$/ matches the string "def\nabc" (where +\n represents a newline) in multiline mode, but not otherwise. Consequently, +patterns that are anchored in single line mode because all branches start with +^ are not anchored in multiline mode, and a match for circumflex is possible +when the start_position argument of a matching function +is non-zero. The G_REGEX_DOLLAR_ENDONLY option is ignored +if G_REGEX_MULTILINE is set. + + + +Note that the sequences \A, \Z, and \z can be used to match the start and +end of the string in both modes, and if all branches of a pattern start with +\A it is always anchored, whether or not G_REGEX_MULTILINE +is set. + + + + +Full stop (period, dot) + +Outside a character class, a dot in the pattern matches any one character +in the string, including a non-printing character, but not (by +default) newline. In UTF-8 a character might be more than one byte long. + + + +When a line ending is defined as a single character, dot never matches that +character; when the two-character sequence CRLF is used, dot does not match CR +if it is immediately followed by LF, but otherwise it matches all characters +(including isolated CRs and LFs). When any Unicode line endings are being +recognized, dot does not match CR or LF or any of the other line ending +characters. + + + +If the G_REGEX_DOTALL flag is set, dots match newlines +as well. The handling of dot is entirely independent of the handling of circumflex +and dollar, the only relationship being that they both involve newline +characters. Dot has no special meaning in a character class. + + + +The behaviour of dot with regard to newlines can be changed. If the +G_REGEX_DOTALL option is set, a dot matches any one +character, without exception. If newline is defined as the two-character +sequence CRLF, it takes two dots to match it. 
+ + + +The handling of dot is entirely independent of the handling of circumflex and +dollar, the only relationship being that they both involve newlines. Dot has no +special meaning in a character class. + + + + +Matching a single byte + +Outside a character class, the escape sequence \C matches any one byte, +both in and out of UTF-8 mode. Unlike a dot, it always matches any line +ending characters. +The feature is provided in Perl in order to match individual bytes in +UTF-8 mode. Because it breaks up UTF-8 characters into individual +bytes, what remains in the string may be a malformed UTF-8 string. For +this reason, the \C escape sequence is best avoided. + + + +GRegex does not allow \C to appear in lookbehind assertions (described +below), because in UTF-8 mode this would make it impossible to calculate +the length of the lookbehind. + + + + +Square brackets and character classes + +An opening square bracket introduces a character class, terminated by a +closing square bracket. A closing square bracket on its own is not special. If a closing square bracket is required as a member of the class, +it should be the first data character in the class (after an initial +circumflex, if present) or escaped with a backslash. + + + +A character class matches a single character in the string. A matched character +must be in the set of characters defined by the class, unless the first +character in the class definition is a circumflex, in which case the +string character must not be in the set defined by the class. If a +circumflex is actually required as a member of the class, ensure it is +not the first character, or escape it with a backslash. + + + +For example, the character class [aeiou] matches any lower case vowel, +while [^aeiou] matches any character that is not a lower case vowel. +Note that a circumflex is just a convenient notation for specifying the +characters that are in the class by enumerating those that are not. 
A +class that starts with a circumflex is not an assertion: it still consumes +a character from the string, and therefore it fails if the current pointer +is at the end of the string. + + + +In UTF-8 mode, characters with values greater than 255 can be included +in a class as a literal string of bytes, or by using the \x{ escaping +mechanism. + + + +When caseless matching is set, any letters in a class represent both +their upper case and lower case versions, so for example, a caseless +[aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not +match "A", whereas a caseful version would. + + + +Characters that might indicate line breaks are never treated +in any special way when matching character classes, whatever line-ending +sequence is in use, and whatever setting of the G_REGEX_DOTALL +and G_REGEX_MULTILINE options is used. A class such as [^a] +always matches one of these characters. + + + +The minus (hyphen) character can be used to specify a range of characters in +a character class. For example, [d-m] matches any letter +between d and m, inclusive. If a minus character is required in a +class, it must be escaped with a backslash or appear in a position +where it cannot be interpreted as indicating a range, typically as the +first or last character in the class. + + + +It is not possible to have the literal character "]" as the end character +of a range. A pattern such as [W-]46] is interpreted as a class of +two characters ("W" and "-") followed by a literal string "46]", so it +would match "W46]" or "-46]". However, if the "]" is escaped with a +backslash it is interpreted as the end of range, so [W-\]46] is interpreted +as a class containing a range followed by two other characters. +The octal or hexadecimal representation of "]" can also be used to end +a range. + + + +Ranges operate in the collating sequence of character values. They can +also be used for characters specified numerically, for example +[\000-\037]. 
In UTF-8 mode, ranges can include characters whose values +are greater than 255, for example [\x{100}-\x{2ff}]. + + + +The character types \d, \D, \p, \P, \s, \S, \w, and \W may also appear +in a character class, and add the characters that they match to the +class. For example, [\dABCDEF] matches any hexadecimal digit. A +circumflex can conveniently be used with the upper case character types to +specify a more restricted set of characters than the matching lower +case type. For example, the class [^\W_] matches any letter or digit, +but not underscore. + + + +The only metacharacters that are recognized in character classes are +backslash, hyphen (only where it can be interpreted as specifying a +range), circumflex (only at the start), opening square bracket (only +when it can be interpreted as introducing a POSIX class name - see the +next section), and the terminating closing square bracket. However, +escaping other non-alphanumeric characters does no harm. + + + + +Posix character classes + +GRegex supports the POSIX notation for character classes. This uses names +enclosed by [: and :] within the enclosing square brackets. For example, + + + +[01[:alpha:]%] + + + +matches "0", "1", any alphabetic character, or "%". The supported class +names are + + + +Posix classes + + + + + Name + Meaning + + + + + alnum + letters and digits + + + alpha + letters + + + ascii + character codes 0 - 127 + + + blank + space or tab only + + + cntrl + control characters + + + digit + decimal digits (same as \d) + + + graph + printing characters, excluding space + + + lower + lower case letters + + + print + printing characters, including space + + + punct + printing characters, excluding letters and digits + + + space + white space (not quite the same as \s) + + + upper + upper case letters + + + word + "word" characters (same as \w) + + + xdigit + hexadecimal digits + + + +
+ + +The "space" characters are HT (9), LF (10), VT (11), FF (12), CR (13), +and space (32). Notice that this list includes the VT character (code +11). This makes "space" different to \s, which does not include VT (for +Perl compatibility). + + + +The name "word" is a Perl extension, and "blank" is a GNU extension. +Another Perl extension is negation, which is indicated by a ^ character +after the colon. For example, + + + +[12[:^digit:]] + + + +matches "1", "2", or any non-digit. GRegex also recognizes the +POSIX syntax [.ch.] and [=ch=] where "ch" is a "collating element", but +these are not supported, and an error is given if they are encountered. + + + +In UTF-8 mode, characters with values greater than 128 do not match any +of the POSIX character classes. + +
+ + +Vertical bar + +Vertical bar characters are used to separate alternative patterns. For +example, the pattern + + + + gilbert|sullivan + + + +matches either "gilbert" or "sullivan". Any number of alternatives may +appear, and an empty alternative is permitted (matching the empty +string). The matching process tries each alternative in turn, from +left to right, and the first one that succeeds is used. If the alternatives are within a subpattern (defined below), "succeeds" means matching the rest of the main pattern as well as the alternative in the subpattern. + + + + +Internal option setting + +The settings of the G_REGEX_CASELESS, G_REGEX_MULTILINE, G_REGEX_DOTALL, +and G_REGEX_EXTENDED options can be changed from within the pattern by a +sequence of Perl-style option letters enclosed between "(?" and ")". The +option letters are + + + +Option settings + + + + + Option + Flag + + + + + i + G_REGEX_CASELESS + + + m + G_REGEX_MULTILINE + + + s + G_REGEX_DOTALL + + + x + G_REGEX_EXTENDED + + + +
+ + +For example, (?im) sets caseless, multiline matching. It is also +possible to unset these options by preceding the letter with a hyphen, and a +combined setting and unsetting such as (?im-sx), which sets G_REGEX_CASELESS +and G_REGEX_MULTILINE while unsetting G_REGEX_DOTALL and G_REGEX_EXTENDED, +is also permitted. If a letter appears both before and after the +hyphen, the option is unset. + + + +When an option change occurs at top level (that is, not inside subpattern +parentheses), the change applies to the remainder of the pattern +that follows. + + + +An option change within a subpattern (see below for a description of subpatterns) +affects only that part of the current pattern that follows it, so + + + +(a(?i)b)c + + + +matches abc and aBc and no other strings (assuming G_REGEX_CASELESS is not +used). By this means, options can be made to have different settings +in different parts of the pattern. Any changes made in one alternative +do carry on into subsequent branches within the same subpattern. For +example, + + + +(a(?i)b|c) + + + +matches "ab", "aB", "c", and "C", even though when matching "C" the +first branch is abandoned before the option setting. This is because +the effects of option settings happen at compile time. There would be +some very weird behaviour otherwise. + + + +The options G_REGEX_UNGREEDY and +G_REGEX_EXTRA and G_REGEX_DUPNAMES +can be changed in the same way as the Perl-compatible options by using +the characters U, X and J respectively. + +
+ + +Subpatterns + +Subpatterns are delimited by parentheses (round brackets), which can be +nested. Turning part of a pattern into a subpattern does two things: + + + + +It localizes a set of alternatives. For example, the pattern +cat(aract|erpillar|) matches one of the words "cat", "cataract", or +"caterpillar". Without the parentheses, it would match "cataract", +"erpillar" or an empty string. + + +It sets up the subpattern as a capturing subpattern. This means +that, when the whole pattern matches, that portion of the +string that matched the subpattern can be obtained using g_regex_fetch(). +Opening parentheses are counted from left to right (starting from 1, as +subpattern 0 is the whole matched string) to obtain numbers for the +capturing subpatterns. + + + + +For example, if the string "the red king" is matched against the pattern + + + +the ((red|white) (king|queen)) + + + +the captured substrings are "red king", "red", and "king", and are numbered 1, 2, and 3, respectively. + + + +The fact that plain parentheses fulfil two functions is not always +helpful. There are often times when a grouping subpattern is required +without a capturing requirement. If an opening parenthesis is followed +by a question mark and a colon, the subpattern does not do any capturing, +and is not counted when computing the number of any subsequent +capturing subpatterns. For example, if the string "the white queen" is +matched against the pattern + + + +the ((?:red|white) (king|queen)) + + + +the captured substrings are "white queen" and "queen", and are numbered +1 and 2. The maximum number of capturing subpatterns is 65535. + + + +As a convenient shorthand, if any option settings are required at the +start of a non-capturing subpattern, the option letters may appear +between the "?" and the ":". Thus the two patterns + + + +(?i:saturday|sunday) +(?:(?i)saturday|sunday) + + + +match exactly the same set of strings. 
Because alternative branches are +tried from left to right, and options are not reset until the end of +the subpattern is reached, an option setting in one branch does affect +subsequent branches, so the above patterns match "SUNDAY" as well as +"Saturday". + + + + +Named subpatterns + +Identifying capturing parentheses by number is simple, but it can be +very hard to keep track of the numbers in complicated regular expressions. +Furthermore, if an expression is modified, the numbers may +change. To help with this difficulty, GRegex supports the naming of +subpatterns. A subpattern can be named in one of three ways: (?<name>...) or +(?'name'...) as in Perl, or (?P<name>...) as in Python. +References to capturing parentheses from other +parts of the pattern, such as backreferences, recursion, and conditions, +can be made by name as well as by number. + + + +Names consist of up to 32 alphanumeric characters and underscores. Named +capturing parentheses are still allocated numbers as well as names, exactly as +if the names were not present. +By default, a name must be unique within a pattern, but it is possible to relax +this constraint by setting the G_REGEX_DUPNAMES option at +compile time. This can be useful for patterns where only one instance of the +named parentheses can match. Suppose you want to match the name of a weekday, +either as a 3-letter abbreviation or as the full name, and in both cases you +want to extract the abbreviation. This pattern (ignoring the line breaks) does +the job: + + + +(?<DN>Mon|Fri|Sun)(?:day)?| +(?<DN>Tue)(?:sday)?| +(?<DN>Wed)(?:nesday)?| +(?<DN>Thu)(?:rsday)?| +(?<DN>Sat)(?:urday)? + + + +There are five capturing substrings, but only one is ever set after a match. +The function for extracting the data by name returns the substring +for the first (and in this example, the only) subpattern of that name that +matched. This saves searching to find which numbered subpattern it was. 
If you +make a reference to a non-unique named subpattern from elsewhere in the +pattern, the one that corresponds to the lowest number is used. + + + + +Repetition + +Repetition is specified by quantifiers, which can follow any of the +following items: + + + +a literal data character +the dot metacharacter +the \C escape sequence +the \X escape sequence (in UTF-8 mode) +the \R escape sequence +an escape such as \d that matches a single character +a character class +a back reference (see next section) +a parenthesized subpattern (unless it is an assertion) + + + +The general repetition quantifier specifies a minimum and maximum number +of permitted matches, by giving the two numbers in curly brackets +(braces), separated by a comma. The numbers must be less than 65536, +and the first must be less than or equal to the second. For example: + + + +z{2,4} + + + +matches "zz", "zzz", or "zzzz". A closing brace on its own is not a +special character. If the second number is omitted, but the comma is +present, there is no upper limit; if the second number and the comma +are both omitted, the quantifier specifies an exact number of required +matches. Thus + + + +[aeiou]{3,} + + + +matches at least 3 successive vowels, but may match many more, while + + + +\d{8} + + + +matches exactly 8 digits. An opening curly bracket that appears in a +position where a quantifier is not allowed, or one that does not match +the syntax of a quantifier, is taken as a literal character. For example, +{,6} is not a quantifier, but a literal string of four characters. + + + +In UTF-8 mode, quantifiers apply to UTF-8 characters rather than to +individual bytes. Thus, for example, \x{100}{2} matches two UTF-8 +characters, each of which is represented by a two-byte sequence. Similarly, +\X{3} matches three Unicode extended sequences, each of which may be +several bytes long (and they may be of different lengths). 
+ + + +The quantifier {0} is permitted, causing the expression to behave as if +the previous item and the quantifier were not present. + + + +For convenience, the three most common quantifiers have single-character +abbreviations: + + + +Abbreviations for quantifiers + + + + + Abbreviation + Meaning + + + + + * + is equivalent to {0,} + + + + + is equivalent to {1,} + + + ? + is equivalent to {0,1} + + + +
+ + +It is possible to construct infinite loops by following a subpattern +that can match no characters with a quantifier that has no upper limit, +for example: + + + +(a?)* + + + +Because there are cases where this can be useful, such patterns are +accepted, but if any repetition of the subpattern does in fact match +no characters, the loop is forcibly broken. + + + +By default, the quantifiers are "greedy", that is, they match as much +as possible (up to the maximum number of permitted times), without +causing the rest of the pattern to fail. The classic example of where +this gives problems is in trying to match comments in C programs. These +appear between /* and */ and within the comment, individual * and / +characters may appear. An attempt to match C comments by applying the +pattern + + + +/\*.*\*/ + + + +to the string + + + +/* first comment */ not comment /* second comment */ + + + +fails, because it matches the entire string owing to the greediness of +the .* item. + + + +However, if a quantifier is followed by a question mark, it ceases to +be greedy, and instead matches the minimum number of times possible, so +the pattern + + + +/\*.*?\*/ + + + +does the right thing with the C comments. The meaning of the various +quantifiers is not otherwise changed, just the preferred number of +matches. Do not confuse this use of question mark with its use as a +quantifier in its own right. Because it has two uses, it can sometimes +appear doubled, as in + + + +\d??\d + + + +which matches one digit by preference, but can match two if that is the +only way the rest of the pattern matches. + + + +If the G_REGEX_UNGREEDY flag is set, the quantifiers are not greedy +by default, but individual ones can be made greedy by following them with +a question mark. In other words, it inverts the default behaviour. 
+ + + +When a parenthesized subpattern is quantified with a minimum repeat +count that is greater than 1 or with a limited maximum, more memory is +required for the compiled pattern, in proportion to the size of the +minimum or maximum. + + + +If a pattern starts with .* or .{0,} and the G_REGEX_DOTALL flag +is set, thus allowing the dot to match newlines, the +pattern is implicitly anchored, because whatever follows will be tried +against every character position in the string, so there is no +point in retrying the overall match at any position after the first. +GRegex normally treats such a pattern as though it were preceded by \A. + + + +In cases where it is known that the string contains no newlines, it +is worth setting G_REGEX_DOTALL in order to obtain this optimization, +or alternatively using ^ to indicate anchoring explicitly. + + + +However, there is one situation where the optimization cannot be used. +When .* is inside capturing parentheses that are the subject of a +backreference elsewhere in the pattern, a match at the start may fail +where a later one succeeds. Consider, for example: + + + +(.*)abc\1 + + + +If the string is "xyz123abc123" the match point is the fourth character. +For this reason, such a pattern is not implicitly anchored. + + + +When a capturing subpattern is repeated, the value captured is the +substring that matched the final iteration. For example, after + + + +(tweedle[dume]{3}\s*)+ + + + +has matched "tweedledum tweedledee" the value of the captured substring +is "tweedledee". However, if there are nested capturing subpatterns, +the corresponding captured values may have been set in previous iterations. +For example, after + + + +/(a|(b))+/ + + + +matches "aba" the value of the second captured substring is "b". + +
+ + +Atomic grouping and possessive quantifiers + +With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") +repetition, failure of what follows normally causes the repeated +item to be re-evaluated to see if a different number +of repeats allows the rest of the pattern to match. Sometimes it +is useful to prevent this, either to change the nature of the +match, or to cause it to fail earlier than it otherwise might, when the +author of the pattern knows there is no point in carrying on. + + + +Consider, for example, the pattern \d+foo when applied to the string + + + +123456bar + + + +After matching all 6 digits and then failing to match "foo", the normal +action of the matcher is to try again with only 5 digits matching the +\d+ item, and then with 4, and so on, before ultimately failing. +"Atomic grouping" (a term taken from Jeffrey Friedl’s book) provides +the means for specifying that once a subpattern has matched, it is not +to be re-evaluated in this way. + + + +If we use atomic grouping for the previous example, the matcher +gives up immediately on failing to match "foo" the first time. The notation +is a kind of special parenthesis, starting with (?> as in this +example: + + + +(?>\d+)foo + + + +This kind of parenthesis "locks up" the part of the pattern it contains +once it has matched, and a failure further into the pattern is +prevented from backtracking into it. Backtracking past it to previous +items, however, works as normal. + + + +An alternative description is that a subpattern of this type matches +the string of characters that an identical standalone pattern would +match, if anchored at the current point in the string. + + + +Atomic grouping subpatterns are not capturing subpatterns. Simple cases +such as the above example can be thought of as a maximizing repeat that +must swallow everything it can. So, while both \d+ and \d+? 
are prepared +to adjust the number of digits they match in order to make the +rest of the pattern match, (?>\d+) can only match an entire sequence of +digits. + + + +Atomic groups in general can of course contain arbitrarily complicated +subpatterns, and can be nested. However, when the subpattern for an +atomic group is just a single repeated item, as in the example above, a +simpler notation, called a "possessive quantifier" can be used. This +consists of an additional + character following a quantifier. Using +this notation, the previous example can be rewritten as + + + +\d++foo + + + +Possessive quantifiers are always greedy; the setting of the +G_REGEX_UNGREEDY option is ignored. They are a convenient notation for the +simpler forms of atomic group. However, there is no difference in the +meaning of a possessive quantifier and the equivalent +atomic group, though there may be a performance difference; +possessive quantifiers should be slightly faster. + + + +The possessive quantifier syntax is an extension to the Perl syntax. +It was invented by Jeffrey Friedl in the first edition of his book and +then implemented by Mike McCloskey in Sun's Java package. +It ultimately found its way into Perl at release 5.10. + + + +GRegex has an optimization that automatically "possessifies" certain simple +pattern constructs. For example, the sequence A+B is treated as A++B because +there is no point in backtracking into a sequence of A's when B must follow. + + + +When a pattern contains an unlimited repeat inside a subpattern that +can itself be repeated an unlimited number of times, the use of an +atomic group is the only way to avoid some failing matches taking a +very long time indeed. The pattern + + + +(\D+|<\d+>)*[!?] + + + +matches an unlimited number of substrings that either consist of non- +digits, or digits enclosed in <>, followed by either ! or ?. When it +matches, it runs quickly. 
However, if it is applied to + + + +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + + + +it takes a long time before reporting failure. This is because the +string can be divided between the internal \D+ repeat and the external +* repeat in a large number of ways, and all have to be tried. (The +example uses [!?] rather than a single character at the end, because +GRegex has an optimization that allows for fast failure +when a single character is used. It remembers the last single character +that is required for a match, and fails early if it is not present +in the string.) If the pattern is changed so that it uses an atomic +group, like this: + + + +((?>\D+)|<\d+>)*[!?] + + + +sequences of non-digits cannot be broken, and failure happens quickly. + + + + +Back references + +Outside a character class, a backslash followed by a digit greater than +0 (and possibly further digits) is a back reference to a capturing subpattern +earlier (that is, to its left) in the pattern, provided there have been that +many previous capturing left parentheses. + + + +However, if the decimal number following the backslash is less than 10, +it is always taken as a back reference, and causes an error only if +there are not that many capturing left parentheses in the entire pattern. +In other words, the parentheses that are referenced need not be +to the left of the reference for numbers less than 10. A "forward back +reference" of this type can make sense when a repetition is involved and +the subpattern to the right has participated in an earlier iteration. + + + +It is not possible to have a numerical "forward back reference" to a subpattern +whose number is 10 or more using this syntax because a sequence such as \50 is +interpreted as a character defined in octal. See the subsection entitled +"Non-printing characters" above for further details of the handling of digits +following a backslash. There is no such problem when named parentheses are used. 
+A back reference to any subpattern is possible using named parentheses (see below). + + + +Another way of avoiding the ambiguity inherent in the use of digits following a +backslash is to use the \g escape sequence (introduced in Perl 5.10.) +This escape must be followed by a positive or a negative number, +optionally enclosed in braces. + + + +A positive number specifies an absolute reference without the ambiguity that is +present in the older syntax. It is also useful when literal digits follow the +reference. A negative number is a relative reference. Consider "(abc(def)ghi)\g{-1}": +the sequence \g{-1} is a reference to the most recently started capturing +subpattern before \g, that is, it is equivalent to \2. Similarly, \g{-2} +would be equivalent to \1. The use of relative references can be helpful in +long patterns, and also in patterns that are created by joining together +fragments that contain references within themselves. + + + +A back reference matches whatever actually matched the capturing subpattern +in the current string, rather than anything matching +the subpattern itself (see "Subpatterns as subroutines" below for a way +of doing that). So the pattern + + + +(sens|respons)e and \1ibility + + + +matches "sense and sensibility" and "response and responsibility", but +not "sense and responsibility". If caseful matching is in force at the +time of the back reference, the case of letters is relevant. For example, + + + +((?i)rah)\s+\1 + + + +matches "rah rah" and "RAH RAH", but not "RAH rah", even though the +original capturing subpattern is matched caselessly. + + + +Back references to named subpatterns use the Perl syntax \k<name> or \k'name' +or the Python syntax (?P=name). We could rewrite the above example in either of +the following ways: + + + +(?<p1>(?i)rah)\s+\k<p1> +(?P<p1>(?i)rah)\s+(?P=p1) + + + +A subpattern that is referenced by name may appear in the pattern before or +after the reference. 
+ + + +There may be more than one back reference to the same subpattern. If a +subpattern has not actually been used in a particular match, any back +references to it always fail. For example, the pattern + + + +(a|(bc))\2 + + + +always fails if it starts to match "a" rather than "bc". Because there +may be many capturing parentheses in a pattern, all digits following +the backslash are taken as part of a potential back reference number. +If the pattern continues with a digit character, some delimiter must be +used to terminate the back reference. If the G_REGEX_EXTENDED flag is +set, this can be whitespace. Otherwise an empty comment (see "Comments" below) can be used. + + + +A back reference that occurs inside the parentheses to which it refers +fails when the subpattern is first used, so, for example, (a\1) never +matches. However, such references can be useful inside repeated subpatterns. +For example, the pattern + + + +(a|b\1)+ + + + +matches any number of "a"s and also "aba", "ababbaa" etc. At each iteration +of the subpattern, the back reference matches the character +string corresponding to the previous iteration. In order for this to +work, the pattern must be such that the first iteration does not need +to match the back reference. This can be done using alternation, as in +the example above, or by a quantifier with a minimum of zero. + + + + +Assertions + +An assertion is a test on the characters following or preceding the +current matching point that does not actually consume any characters. +The simple assertions coded as \b, \B, \A, \G, \Z, \z, ^ and $ are +described above. + + + +More complicated assertions are coded as subpatterns. There are two +kinds: those that look ahead of the current position in the +string, and those that look behind it. An assertion subpattern is +matched in the normal way, except that it does not cause the current +matching position to be changed. 
+ + + +Assertion subpatterns are not capturing subpatterns, and may not be +repeated, because it makes no sense to assert the same thing several +times. If any kind of assertion contains capturing subpatterns within +it, these are counted for the purposes of numbering the capturing +subpatterns in the whole pattern. However, substring capturing is carried +out only for positive assertions, because it does not make sense for +negative assertions. + + + +Lookahead assertions + +Lookahead assertions start with (?= for positive assertions and (?! for +negative assertions. For example, + + + +\w+(?=;) + + + +matches a word followed by a semicolon, but does not include the semicolon +in the match, and + + + +foo(?!bar) + + + +matches any occurrence of "foo" that is not followed by "bar". Note +that the apparently similar pattern + + + +(?!foo)bar + + + +does not find an occurrence of "bar" that is preceded by something +other than "foo"; it finds any occurrence of "bar" whatsoever, because +the assertion (?!foo) is always true when the next three characters are +"bar". A lookbehind assertion is needed to achieve the other effect. + + + +If you want to force a matching failure at some point in a pattern, the +most convenient way to do it is with (?!) because an empty string +always matches, so an assertion that requires there not to be an empty +string must always fail. + + + + +Lookbehind assertions + +Lookbehind assertions start with (?<= for positive assertions and (?<! +for negative assertions. For example, + + + +(?<!foo)bar + + + +does find an occurrence of "bar" that is not preceded by "foo". The +contents of a lookbehind assertion are restricted such that all the +strings it matches must have a fixed length. However, if there are +several top-level alternatives, they do not all have to have the same +fixed length. Thus + + + +(?<=bullock|donkey) + + + +is permitted, but + + + +(?<!dogs?|cats?) + + + +causes an error at compile time. 
Branches that match different length +strings are permitted only at the top level of a lookbehind assertion. +An assertion such as + + + +(?<=ab(c|de)) + + + +is not permitted, because its single top-level branch can match two +different lengths, but it is acceptable if rewritten to use two top- +level branches: + + + +(?<=abc|abde) + + + +The implementation of lookbehind assertions is, for each alternative, +to temporarily move the current position back by the fixed length and +then try to match. If there are insufficient characters before the +current position, the assertion fails. + + + +GRegex does not allow the \C escape (which matches a single byte in UTF-8 +mode) to appear in lookbehind assertions, because it makes it impossible +to calculate the length of the lookbehind. The \X and \R escapes, which can +match different numbers of bytes, are also not permitted. + + + +Possessive quantifiers can be used in conjunction with lookbehind assertions to +specify efficient matching at the end of the subject string. Consider a simple +pattern such as + + + +abcd$ + + + +when applied to a long string that does not match. Because matching +proceeds from left to right, GRegex will look for each "a" in the string +and then see if what follows matches the rest of the pattern. If the +pattern is specified as + + + +^.*abcd$ + + + +the initial .* matches the entire string at first, but when this fails +(because there is no following "a"), it backtracks to match all but the +last character, then all but the last two characters, and so on. Once +again the search for "a" covers the entire string, from right to left, +so we are no better off. However, if the pattern is written as + + + +^.*+(?<=abcd) + + + +there can be no backtracking for the .*+ item; it can match only the +entire string. The subsequent lookbehind assertion does a single test +on the last four characters. If it fails, the match fails immediately. 
+For long strings, this approach makes a significant difference to the +processing time. + + + + +Using multiple assertions + +Several assertions (of any sort) may occur in succession. For example, + + + +(?<=\d{3})(?<!999)foo + + + +matches "foo" preceded by three digits that are not "999". Notice that +each of the assertions is applied independently at the same point in +the string. First there is a check that the previous three +characters are all digits, and then there is a check that the same +three characters are not "999". This pattern does not match "foo" preceded +by six characters, the first of which are digits and the last +three of which are not "999". For example, it doesn’t match "123abcfoo". +A pattern to do that is + + + +(?<=\d{3}...)(?<!999)foo + + + +This time the first assertion looks at the preceding six characters, +checking that the first three are digits, and then the second assertion +checks that the preceding three characters are not "999". + + + +Assertions can be nested in any combination. For example, + + + +(?<=(?<!foo)bar)baz + + + +matches an occurrence of "baz" that is preceded by "bar" which in turn +is not preceded by "foo", while + + + +(?<=\d{3}(?!999)...)foo + + + +is another pattern that matches "foo" preceded by three digits and any +three characters that are not "999". + + + + + +Conditional subpatterns + +It is possible to cause the matching process to obey a subpattern +conditionally or to choose between two alternative subpatterns, depending +on the result of an assertion, or whether a previous capturing subpattern +matched or not. The two possible forms of conditional subpattern are + + + +(?(condition)yes-pattern) +(?(condition)yes-pattern|no-pattern) + + + +If the condition is satisfied, the yes-pattern is used; otherwise the +no-pattern (if present) is used. If there are more than two alternatives +in the subpattern, a compile-time error occurs. 
+ + + +There are four kinds of condition: references to subpatterns, references to +recursion, a pseudo-condition called DEFINE, and assertions. + + + +Checking for a used subpattern by number + +If the text between the parentheses consists of a sequence of digits, the +condition is true if the capturing subpattern of that number has previously +matched. + + + +Consider the following pattern, which contains non-significant white space +to make it more readable (assume the G_REGEX_EXTENDED option is set) +and to divide it into three parts for ease of discussion: + + + +( \( )? [^()]+ (?(1) \) ) + + + +The first part matches an optional opening parenthesis, and if that +character is present, sets it as the first captured substring. The second +part matches one or more characters that are not parentheses. The +third part is a conditional subpattern that tests whether the first set +of parentheses matched or not. If they did, that is, if the string started +with an opening parenthesis, the condition is true, and so the yes-pattern +is executed and a closing parenthesis is required. Otherwise, +since no-pattern is not present, the subpattern matches nothing. In +other words, this pattern matches a sequence of non-parentheses, +optionally enclosed in parentheses. + + + + +Checking for a used subpattern by name + +Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a used +subpattern by name; the Python syntax (?(name)...) is also recognized. However, +there is a possible ambiguity with this syntax, because subpattern names may +consist entirely of digits. GRegex looks first for a named subpattern; if it +cannot find one and the name consists entirely of digits, GRegex looks for a +subpattern of that number, which must be greater than zero. Using subpattern +names that consist entirely of digits is not recommended. + + + +Rewriting the above example to use a named subpattern gives this: + + + +(?<OPEN> \( )? 
[^()]+ (?(<OPEN>) \) ) + + + + +Checking for pattern recursion + +If the condition is the string (R), and there is no subpattern with the name R, +the condition is true if a recursive call to the whole pattern or any +subpattern has been made. If digits or a name preceded by ampersand follow the +letter R, for example: + + + +(?(R3)...) +(?(R&name)...) + + + +the condition is true if the most recent recursion is into the subpattern whose +number or name is given. This condition does not check the entire recursion +stack. + + + +At "top level", all these recursion test conditions are false. Recursive +patterns are described below. + + + + +Defining subpatterns for use by reference only + +If the condition is the string (DEFINE), and there is no subpattern with the +name DEFINE, the condition is always false. In this case, there may be only one +alternative in the subpattern. It is always skipped if control reaches this +point in the pattern; the idea of DEFINE is that it can be used to define +"subroutines" that can be referenced from elsewhere. (The use of "subroutines" +is described below.) For example, a pattern to match an IPv4 address could be +written like this (ignore whitespace and line breaks): + + + +(?(DEFINE) (?<byte> 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) ) +\b (?&byte) (\.(?&byte)){3} \b + + + +The first part of the pattern is a DEFINE group inside which another group +named "byte" is defined. This matches an individual component of an IPv4 +address (a number less than 256). When matching takes place, this part of the +pattern is skipped because DEFINE acts like a false condition. + + + +The rest of the pattern uses references to the named group to match the four +dot-separated components of an IPv4 address, insisting on a word boundary at +each end. + + + + +Assertion conditions + +If the condition is not in any of the above formats, it must be an +assertion. This may be a positive or negative lookahead or lookbehind +assertion. 
Consider this pattern, again containing non-significant +white space, and with the two alternatives on the second line: + + + +(?(?=[^a-z]*[a-z]) +\d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} ) + + + +The condition is a positive lookahead assertion that matches an +optional sequence of non-letters followed by a letter. In other words, +it tests for the presence of at least one letter in the string. If a +letter is found, the string is matched against the first alternative; +otherwise it is matched against the second. This pattern matches +strings in one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are +letters and dd are digits. + + + + + +Comments + +The sequence (?# marks the start of a comment that continues up to the +next closing parenthesis. Nested parentheses are not permitted. The +characters that make up a comment play no part in the pattern matching +at all. + + + +If the G_REGEX_EXTENDED option is set, an unescaped # +character outside a character class introduces a comment that continues to +immediately after the next newline in the pattern. + + + + +Recursive patterns + +Consider the problem of matching a string in parentheses, allowing for +unlimited nested parentheses. Without the use of recursion, the best +that can be done is to use a pattern that matches up to some fixed +depth of nesting. It is not possible to handle an arbitrary nesting +depth. + + + +For some time, Perl has provided a facility that allows regular expressions to +recurse (amongst other things). It does this by interpolating Perl code in the +expression at run time, and the code can refer to the expression itself. A Perl +pattern using code interpolation to solve the parentheses problem can be +created like this: + + + +$re = qr{\( (?: (?>[^()]+) | (?p{$re}) )* \)}x; + + + +The (?p{...}) item interpolates Perl code at run time, and in this case refers +recursively to the pattern in which it appears. + + + +Obviously, GRegex cannot support the interpolation of Perl code. 
Instead, it +supports special syntax for recursion of the entire pattern, and also for +individual subpattern recursion. This kind of recursion was introduced into +Perl at release 5.10. + + + +A special item that consists of (? followed by a number greater than zero and a +closing parenthesis is a recursive call of the subpattern of the given number, +provided that it occurs inside that subpattern. (If not, it is a "subroutine" +call, which is described in the next section.) The special item (?R) or (?0) is +a recursive call of the entire regular expression. + + + +In GRegex (like Python, but unlike Perl), a recursive subpattern call is always +treated as an atomic group. That is, once it has matched some of the subject +string, it is never re-entered, even if it contains untried alternatives and +there is a subsequent matching failure. + + + +This pattern solves the nested parentheses problem (assume the +G_REGEX_EXTENDED option is set so that white space is +ignored): + + + +\( ( (?>[^()]+) | (?R) )* \) + + + +First it matches an opening parenthesis. Then it matches any number of +substrings which can either be a sequence of non-parentheses, or a +recursive match of the pattern itself (that is, a correctly parenthesized +substring). Finally there is a closing parenthesis. + + + +If this were part of a larger pattern, you would not want to recurse +the entire pattern, so instead you could use this: + + + +( \( ( (?>[^()]+) | (?1) )* \) ) + + + +We have put the pattern into parentheses, and caused the recursion to +refer to them instead of the whole pattern. In a larger pattern, keeping +track of parenthesis numbers can be tricky. It may be more convenient to +use named parentheses instead. +The Perl syntax for this is (?&name); GRegex also supports the (?P>name) +syntax. We could rewrite the above example as follows: + + + +(?<pn> \( ( (?>[^()]+) | (?&pn) )* \) ) + + + +If there is more than one subpattern with the same name, the earliest one is +used. 
This particular example pattern contains nested unlimited repeats, and so +the use of atomic grouping for matching strings of non-parentheses is important +when applying the pattern to strings that do not match. +For example, when this pattern is applied to + + + +(aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa() + + + +it yields "no match" quickly. However, if atomic grouping is not used, +the match runs for a very long time indeed because there are so many +different ways the + and * repeats can carve up the string, and all +have to be tested before failure can be reported. + + + +At the end of a match, the values set for any capturing subpatterns are +those from the outermost level of the recursion at which the subpattern +value is set. + + + +If the pattern above is matched against + + + +(ab(cd)ef) + + + +the value for the capturing parentheses is "ef", which is the last +value taken on at the top level. If additional parentheses are added, +giving + + + +\( ( ( (?>[^()]+) | (?R) )* ) \) + ^ ^ + ^ ^ + + + +the string they capture is "ab(cd)ef", the contents of the top level +parentheses. + + + +Do not confuse the (?R) item with the condition (R), which tests for +recursion. Consider this pattern, which matches text in angle brackets, +allowing for arbitrary nesting. Only digits are allowed in nested +brackets (that is, when recursing), whereas any characters are permitted +at the outer level. + + + +< (?: (?(R) \d++ | [^<>]*+) | (?R)) * > + + + +In this pattern, (?(R) is the start of a conditional subpattern, with +two different alternatives for the recursive and non-recursive cases. +The (?R) item is the actual recursive call. + + + + +Subpatterns as subroutines + +If the syntax for a recursive subpattern reference (either by number or +by name) is used outside the parentheses to which it refers, it operates +like a subroutine in a programming language. The "called" subpattern may +be defined before or after the reference. 
An earlier example pointed out +that the pattern + + + +(sens|respons)e and \1ibility + + + +matches "sense and sensibility" and "response and responsibility", but +not "sense and responsibility". If instead the pattern + + + +(sens|respons)e and (?1)ibility + + + +is used, it does match "sense and responsibility" as well as the other +two strings. Another example is given in the discussion of DEFINE above. + + + +Like recursive subpatterns, a "subroutine" call is always treated as an atomic +group. That is, once it has matched some of the string, it is never +re-entered, even if it contains untried alternatives and there is a subsequent +matching failure. + + + +When a subpattern is used as a subroutine, processing options such as +case-independence are fixed when the subpattern is defined. They cannot be +changed for different calls. For example, consider this pattern: + + + +(abc)(?i:(?1)) + + + +It matches "abcabc". It does not match "abcABC" because the change of +processing option does not affect the called subpattern. + + + + + + +Copyright + +This document was copied and adapted from the PCRE documentation, +specifically from the man page for pcrepattern. +The original copyright note is: + + + +Copyright (c) 1997-2006 University of Cambridge. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the name of Google + Inc. 
nor the names of their contributors may be used to endorse or + promote products derived from this software without specific prior + written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + + +
diff --git a/docs/reference/glib/tmpl/glib-unused.sgml b/docs/reference/glib/tmpl/glib-unused.sgml index b089e66..ab408f5 100644 --- a/docs/reference/glib/tmpl/glib-unused.sgml +++ b/docs/reference/glib/tmpl/glib-unused.sgml @@ -712,6 +712,13 @@ To use this function you must configure glib with the flag @mem: the memory to check. + + + + + +@Returns: + Gets the file attributes. diff --git a/docs/reference/glib/tmpl/gregex.sgml b/docs/reference/glib/tmpl/gregex.sgml new file mode 100644 index 0000000..222a30b --- /dev/null +++ b/docs/reference/glib/tmpl/gregex.sgml @@ -0,0 +1,578 @@ + +Perl-compatible regular expressions + + +matches strings against regular expressions. + + + +The g_regex_*() functions implement regular +expression pattern matching using syntax and semantics similar to +Perl regular expressions. + + +Some functions accept a start_position argument; +setting it differs from just passing over a shortened string and setting +#G_REGEX_MATCH_NOTBOL in the case of a pattern that begins with any kind +of lookbehind assertion. +For example, consider the pattern "\Biss\B" which finds occurrences of "iss" +in the middle of words. ("\B" matches only if the current position in the +subject is not a word boundary.) When applied to the string "Mississippi" +from the fourth byte, namely "issippi", it does not match, because "\B" is +always false at the start of the subject, which is deemed to be a word +boundary. However, if the entire string is passed, but with +start_position set to 4, it finds the second +occurrence of "iss" because it is able to look behind the starting point +to discover that it is preceded by a letter. + + +Note that, unless you set the #G_REGEX_RAW flag, all the strings passed +to these functions must be encoded in UTF-8. The lengths and the positions +inside the strings are in bytes and not in characters, so, for instance, +"\xc3\xa0" (i.e. "à") is two bytes long but it is treated as a single +character. 
If you set #G_REGEX_RAW the strings can be non-valid UTF-8 +strings and a byte is treated as a character, so "\xc3\xa0" is two bytes +and two characters long. + + +When matching a pattern, "\n" matches only against a "\n" character in the +string, and "\r" matches only a "\r" character. To match any newline sequence +use "\R". This particular group matches either the two-character sequence +CR + LF ("\r\n"), or one of the single characters LF (linefeed, U+000A, "\n"), VT +(vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"), CR (carriage return, +U+000D, "\r"), NEL (next line, U+0085), LS (line separator, U+2028), or PS +(paragraph separator, U+2029). + + +The behaviour of the dot, circumflex, and dollar metacharacters are affected by +newline characters, the default is to recognize any newline character (the same +characters recognized by "\R"). This can be changed with #G_REGEX_NEWLINE_CR, +#G_REGEX_NEWLINE_LF and #G_REGEX_NEWLINE_CRLF compile options, +and with #G_REGEX_MATCH_NEWLINE_ANY, #G_REGEX_MATCH_NEWLINE_CR, +#G_REGEX_MATCH_NEWLINE_LF and #G_REGEX_MATCH_NEWLINE_CRLF match options. +These settings are also relevant when compiling a pattern if +#G_REGEX_EXTENDED is set, and an unescaped "#" outside a character class is +encountered. This indicates a comment that lasts until after the next +newline. + + +If you have two threads manipulating the same #GRegex, they must use a +lock to synchronize their operation, as these functions are not threadsafe. +Creating and manipulating different #GRegex structures from different +threads is not a problem. + + +The regular expressions low level functionalities are obtained through +the excellent PCRE library +written by Philip Hazel. + + + + + + + + + + + + +Error codes returned by regular expressions functions. + + +@G_REGEX_ERROR_COMPILE: Compilation of the regular expression in g_regex_new() failed. +@G_REGEX_ERROR_OPTIMIZE: Optimization of the regular expression in g_regex_optimize() failed. 
+@G_REGEX_ERROR_REPLACE: Replacement failed due to an ill-formed replacement string. +@G_REGEX_ERROR_MATCH: The match process failed. +@Since: 2.14 + + + +Error domain for regular expressions. Errors in this domain will be from the #GRegexError enumeration. See #GError for information on error domains. + + +@Since: 2.14 + + + + +Flags specifying compile-time options. + + +@G_REGEX_CASELESS: Letters in the pattern match both upper and lower case +letters. It can be changed within a pattern by a "(?i)" option setting. +@G_REGEX_MULTILINE: By default, GRegex treats the strings as consisting +of a single line of characters (even if it actually contains newlines). +The "start of line" metacharacter ("^") matches only at the start of the +string, while the "end of line" metacharacter ("$") matches only at the +end of the string, or before a terminating newline (unless +#G_REGEX_DOLLAR_ENDONLY is set). When #G_REGEX_MULTILINE is set, +the "start of line" and "end of line" constructs match immediately following +or immediately before any newline in the string, respectively, as well +as at the very start and end. This can be changed within a pattern by a +"(?m)" option setting. +@G_REGEX_DOTALL: A dot metacharacter (".") in the pattern matches all +characters, including newlines. Without it, newlines are excluded. This +option can be changed within a pattern by a "(?s)" option setting. +@G_REGEX_EXTENDED: Whitespace data characters in the pattern are +totally ignored except when escaped or inside a character class. +Whitespace does not include the VT character (code 11). In addition, +characters between an unescaped "#" outside a character class and +the next newline character, inclusive, are also ignored. This can be +changed within a pattern by a "(?x)" option setting. +@G_REGEX_ANCHORED: The pattern is forced to be "anchored", that is, +it is constrained to match only at the first matching point in the string +that is being searched. 
This effect can also be achieved by appropriate +constructs in the pattern itself such as the "^" metacharacter. +@G_REGEX_DOLLAR_ENDONLY: A dollar metacharacter ("$") in the pattern +matches only at the end of the string. Without this option, a dollar also +matches immediately before the final character if it is a newline (but +not before any other newlines). This option is ignored if +#G_REGEX_MULTILINE is set. +@G_REGEX_UNGREEDY: Inverts the "greediness" of the +quantifiers so that they are not greedy by default, but become greedy +if followed by "?". It can also be set by a "(?U)" option setting within +the pattern. +@G_REGEX_RAW: Usually strings must be valid UTF-8 strings, using this +flag they are considered as a raw sequence of bytes. +@G_REGEX_NO_AUTO_CAPTURE: Disables the use of numbered capturing +parentheses in the pattern. Any opening parenthesis that is not followed +by "?" behaves as if it were followed by "?:" but named parentheses can +still be used for capturing (and they acquire numbers in the usual way). +@G_REGEX_DUPNAMES: Names used to identify capturing subpatterns need not +be unique. This can be helpful for certain types of pattern when it is known +that only one instance of the named subpattern can ever be matched. +@G_REGEX_NEWLINE_CR: Usually any newline character is recognized, if this +option is set, the only recognized newline character is '\r'. +@G_REGEX_NEWLINE_LF: Usually any newline character is recognized, if this +option is set, the only recognized newline character is '\n'. +@G_REGEX_NEWLINE_CRLF: Usually any newline character is recognized, if this +option is set, the only recognized newline character sequence is '\r\n'. +@Since: 2.14 + + + +Flags specifying match-time options. + + +@G_REGEX_MATCH_ANCHORED: The pattern is forced to be "anchored", that is, +it is constrained to match only at the first matching point in the string +that is being searched. 
This effect can also be achieved by appropriate +constructs in the pattern itself such as the "^" metacharacter. +@G_REGEX_MATCH_NOTBOL: Specifies that the first character of the string is +not the beginning of a line, so the circumflex metacharacter should not +match before it. Setting this without G_REGEX_MULTILINE (at compile time) +causes circumflex never to match. This option affects only the behaviour of +the circumflex metacharacter, it does not affect "\A". +@G_REGEX_MATCH_NOTEOL: Specifies that the end of the subject string is +not the end of a line, so the dollar metacharacter should not match it nor +(except in multiline mode) a newline immediately before it. Setting this +without G_REGEX_MULTILINE (at compile time) causes dollar never to match. +This option affects only the behaviour of the dollar metacharacter, it does +not affect "\Z" or "\z". +@G_REGEX_MATCH_NOTEMPTY: An empty string is not considered to be a valid +match if this option is set. If there are alternatives in the pattern, they +are tried. If all the alternatives match the empty string, the entire match +fails. For example, if the pattern "a?b?" is applied to a string not beginning +with "a" or "b", it matches the empty string at the start of the string. +With this flag set, this match is not valid, so GRegex searches further +into the string for occurrences of "a" or "b". +@G_REGEX_MATCH_PARTIAL: Turns on the partial matching feature, for more +documentation on partial matching see g_regex_is_partial_match(). +@G_REGEX_MATCH_NEWLINE_CR: Overrides the newline definition set when creating +a new #GRegex, setting the '\r' character as line terminator. +@G_REGEX_MATCH_NEWLINE_LF: Overrides the newline definition set when creating +a new #GRegex, setting the '\n' character as line terminator. +@G_REGEX_MATCH_NEWLINE_CRLF: Overrides the newline definition set when creating +a new #GRegex, setting the '\r\n' characters as line terminator. 
+@G_REGEX_MATCH_NEWLINE_ANY: Overrides the newline definition set when creating +a new #GRegex, any newline character or character sequence is recognized. +@Since: 2.14 + + + +A GRegex is the "compiled" form of a regular expression pattern. This +structure is opaque and its fields cannot be accessed directly. + + +@Since: 2.14 + + + +Specifies the type of the function passed to g_regex_replace_eval(). +It is called for each occurrence of the pattern @regex in @string, and it +should append the replacement to @result. + + + +Do not call on @regex functions that modify its internal state, such as +g_regex_match(); if you need it you can create a temporary copy of +@regex using g_regex_copy(). + + +@Param1: a #GRegex. +@Param2: the string used to perform matches against. +@Param3: a #GString containing the new string. +@Param4: user data passed to g_regex_replace_eval(). +@Returns: %FALSE to continue the replacement process, %TRUE to stop it. +@Since: 2.14 + + + + + + + +@pattern: +@compile_options: +@match_options: +@error: +@Returns: + + + + + + + +@regex: + + + + + + + +@regex: +@error: +@Returns: + + + + + + + +@regex: +@Returns: + + + + + + + +@regex: +@Returns: + + + + + + + +@regex: + + + + + + + +@pattern: +@string: +@compile_options: +@match_options: +@Returns: + + + + + + + +@regex: +@string: +@match_options: +@Returns: + + + + + + + +@regex: +@string: +@string_len: +@start_position: +@match_options: +@error: +@Returns: + + + + + + + +@regex: +@string: +@match_options: +@Returns: + + + + + + + +@regex: +@string: +@string_len: +@start_position: +@match_options: +@error: +@Returns: + + + + + + + +@regex: +@string: +@match_options: +@Returns: + + + + + + + +@regex: +@string: +@string_len: +@start_position: +@match_options: +@error: +@Returns: + + + + + + + +@regex: +@Returns: + + + + + + + +@regex: +@Returns: + + + + + + + +@regex: +@match_num: +@string: +@Returns: + + + + + + + +@regex: +@match_num: +@start_pos: +@end_pos: +@Returns: + + + + + + + +@regex: 
+@name: +@string: +@Returns: + + + + + + + +@regex: +@name: +@start_pos: +@end_pos: +@Returns: + + + + + + + +@regex: +@string: +@Returns: + + + + + + + +@regex: +@name: +@Returns: + + + + + + + +@pattern: +@string: +@compile_options: +@match_options: +@Returns: + + + + + + + +@regex: +@string: +@match_options: +@Returns: + + + + + + + +@regex: +@string: +@string_len: +@start_position: +@match_options: +@max_tokens: +@error: +@Returns: + + + + + + + +@regex: +@string: +@match_options: +@Returns: + + + + + + + +@regex: +@string: +@string_len: +@start_position: +@match_options: +@error: +@Returns: + + + + + + + +@regex: +@string: +@string_to_expand: +@error: +@Returns: + + + + + + + +@regex: +@string: +@string_len: +@start_position: +@replacement: +@match_options: +@error: +@Returns: + + + + + + + +@regex: +@string: +@string_len: +@start_position: +@replacement: +@match_options: +@error: +@Returns: + + + + + + + +@regex: +@string: +@string_len: +@start_position: +@match_options: +@eval: +@user_data: +@error: +@Returns: + + + + + + + +@string: +@length: +@Returns: + + diff --git a/glib/Makefile.am b/glib/Makefile.am index 4d3b2ad..b1f8cc6 100644 --- a/glib/Makefile.am +++ b/glib/Makefile.am @@ -6,9 +6,21 @@ PRINTF_SUBDIR = gnulib printf_la = gnulib/libgnulib.la endif -SUBDIRS = libcharset $(PRINTF_SUBDIR) +if ENABLE_REGEX +if USE_SYSTEM_PCRE +else +MAYBE_PCRE = pcre +endif +gregex_c = gregex.c +gregex_h = gregex.h +else +gregex_c = +gregex_h = +endif + +SUBDIRS = libcharset $(PRINTF_SUBDIR) $(MAYBE_PCRE) update-pcre -DIST_SUBDIRS = libcharset gnulib +DIST_SUBDIRS = libcharset gnulib pcre update-pcre INCLUDES = -I$(top_srcdir) -DG_LOG_DOMAIN=\"GLib\" \ $(GLIB_DEBUG_FLAGS) -DG_DISABLE_DEPRECATED -DGLIB_COMPILATION @@ -36,6 +48,8 @@ MIRRORING_TAB_SOURCES = \ glib-mirroring-tab/packtab.h \ glib-mirroring-tab/packtab.c +# The compilation of GRegex can be disabled, but the source files must +# be distributed. 
EXTRA_DIST = \ makefile.msc.in \ glib.rc.in \ @@ -45,6 +59,8 @@ EXTRA_DIST = \ abicheck.sh \ pltcheck.sh \ glib.symbols \ + gregex.c \ + gregex.h \ $(MIRRORING_TAB_SOURCES) # These may be in the builddir too @@ -106,6 +122,7 @@ libglib_2_0_la_SOURCES = \ gqueue.c \ grel.c \ grand.c \ + $(gregex_c) \ gscanner.c \ gscripttable.h \ gsequence.c \ @@ -185,6 +202,7 @@ glibsubinclude_HEADERS = \ gquark.h \ gqueue.h \ grand.h \ + $(gregex_h) \ grel.h \ gscanner.h \ gsequence.h \ @@ -239,7 +257,17 @@ glib_win32_res = glib-win32-res.o glib_win32_res_ldflag = -Wl,$(glib_win32_res) endif -libglib_2_0_la_LIBADD = libcharset/libcharset.la $(printf_la) @GIO@ @GSPAWN@ @PLATFORMDEP@ @ICONV_LIBS@ @G_LIBS_EXTRA@ +if ENABLE_REGEX +if USE_SYSTEM_PCRE +pcre_lib = $(PCRE_LIBS) +else +pcre_lib = pcre/libpcre.la +endif +else +pcre_lib = +endif + +libglib_2_0_la_LIBADD = libcharset/libcharset.la $(printf_la) @GIO@ @GSPAWN@ @PLATFORMDEP@ @ICONV_LIBS@ @G_LIBS_EXTRA@ $(pcre_lib) libglib_2_0_la_DEPENDENCIES = libcharset/libcharset.la $(printf_la) @GIO@ @GSPAWN@ @PLATFORMDEP@ $(glib_win32_res) @GLIB_DEF@ libglib_2_0_la_LDFLAGS = \ diff --git a/glib/glib.h b/glib/glib.h index f3395aa..b45d3b7 100644 --- a/glib/glib.h +++ b/glib/glib.h @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include diff --git a/glib/glib.symbols b/glib/glib.symbols index 820ec56..5fc3163 100644 --- a/glib/glib.symbols +++ b/glib/glib.symbols @@ -1416,6 +1416,43 @@ g_get_codeset #endif #endif +#if IN_HEADER(__G_REGEX_H__) +#if IN_FILE(__G_REGEX_C__) +g_regex_error_quark +g_regex_new +g_regex_free +g_regex_optimize +g_regex_copy +g_regex_get_pattern +g_regex_clear +g_regex_match_simple +g_regex_match +g_regex_match_full +g_regex_match_next +g_regex_match_next_full +g_regex_match_all +g_regex_match_all_full +g_regex_get_match_count +g_regex_is_partial_match +g_regex_fetch +g_regex_fetch_pos +g_regex_fetch_named +g_regex_fetch_named_pos +g_regex_fetch_all +g_regex_get_string_number 
+g_regex_split_simple +g_regex_split +g_regex_split_full +g_regex_split_next +g_regex_split_next_full +g_regex_expand_references +g_regex_replace +g_regex_replace_literal +g_regex_replace_eval +g_regex_escape_string +#endif +#endif + #if IN_HEADER(__G_WIN32_H__) #if IN_FILE(__G_WIN32_H__) #ifdef G_OS_WIN32 diff --git a/glib/gregex.c b/glib/gregex.c new file mode 100644 index 0000000..be927aa --- /dev/null +++ b/glib/gregex.c @@ -0,0 +1,2448 @@ +/* GRegex -- regular expression API wrapper around PCRE. + * + * Copyright (C) 1999, 2000 Scott Wimer + * Copyright (C) 2004, Matthias Clasen + * Copyright (C) 2005 - 2007, Marco Barisione + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include "gregex.h" + +#include +#include +#include + +#ifdef USE_SYSTEM_PCRE +#include +#else +#include "pcre/pcre.h" +#endif + +#include "galias.h" + +/* Mask of all the possible values for GRegexCompileFlags. 
*/ +#define G_REGEX_COMPILE_MASK (G_REGEX_CASELESS | \ + G_REGEX_MULTILINE | \ + G_REGEX_DOTALL | \ + G_REGEX_EXTENDED | \ + G_REGEX_ANCHORED | \ + G_REGEX_DOLLAR_ENDONLY | \ + G_REGEX_UNGREEDY | \ + G_REGEX_RAW | \ + G_REGEX_NO_AUTO_CAPTURE | \ + G_REGEX_DUPNAMES | \ + G_REGEX_NEWLINE_CR | \ + G_REGEX_NEWLINE_LF | \ + G_REGEX_NEWLINE_CRLF) + +/* Mask of all the possible values for GRegexMatchFlags. */ +#define G_REGEX_MATCH_MASK (G_REGEX_MATCH_ANCHORED | \ + G_REGEX_MATCH_NOTBOL | \ + G_REGEX_MATCH_NOTEOL | \ + G_REGEX_MATCH_NOTEMPTY | \ + G_REGEX_MATCH_PARTIAL | \ + G_REGEX_MATCH_NEWLINE_CR | \ + G_REGEX_MATCH_NEWLINE_LF | \ + G_REGEX_MATCH_NEWLINE_CRLF | \ + G_REGEX_MATCH_NEWLINE_ANY) + +/* if the pattern was compiled with PCRE_UTF8 step with the g_utf8_ functions, + * otherwise just step by one byte (+/- 1). */ +#define NEXT_CHAR(re, s) (((re)->pattern->compile_opts & PCRE_UTF8) ? \ + g_utf8_next_char (s) : \ + ((s) + 1)) +#define PREV_CHAR(re, s) (((re)->pattern->compile_opts & PCRE_UTF8) ? \ + g_utf8_prev_char (s) : \ + ((s) - 1)) + +#define WORKSPACE_INITIAL 1000 +#define OFFSETS_DFA_MIN_SIZE 21 + +/* atomically returns the pcre_extra struct in the regex. 
*/ +#define REGEX_GET_EXTRA(re) ((pcre_extra *)g_atomic_pointer_get (&(re)->pattern->extra)) + +/* this struct can be shared by multiple regexes */ +typedef struct +{ + volatile guint ref_count; /* the ref count for the immutable part */ + gchar *pattern; /* the pattern */ + pcre *pcre_re; /* compiled form of the pattern */ + GRegexCompileFlags compile_opts; /* options used at compile time on the pattern */ + GRegexMatchFlags match_opts; /* options used at match time on the regex */ + pcre_extra *extra; /* data stored when g_regex_optimize() is used */ +} GRegexPattern; + +/* this struct is used only by a single regex */ +typedef struct +{ + gint matches; /* number of matching sub patterns */ + gint pos; /* position in the string where last match left off */ + gint *offsets; /* array of offsets paired 0,1 ; 2,3 ; 4,5 etc */ + gint n_offsets; /* number of offsets */ + gint *workspace; /* workspace for pcre_dfa_exec() */ + gint n_workspace; /* number of workspace elements */ + gssize string_len; /* length of the string last used against */ + GSList *delims; /* delimiter sub strings from split next */ + gint last_separator_end; /* position of the last separator for split_next_full() */ + gboolean last_match_is_empty; /* was the last match in split_next_full() 0 bytes long? */ +} GRegexMatch; + +struct _GRegex +{ + GRegexPattern *pattern; /* immutable part, shared */ + GRegexMatch *match; /* mutable part, not shared */ +}; + +/* TRUE if ret is an error code, FALSE otherwise. 
*/ +#define IS_PCRE_ERROR(ret) ((ret) < PCRE_ERROR_NOMATCH && (ret) != PCRE_ERROR_PARTIAL) + +static const gchar * +match_error (gint errcode) +{ + switch (errcode) + { + case PCRE_ERROR_NOMATCH: + /* not an error */ + break; + case PCRE_ERROR_NULL: + /* NULL argument, this should not happen in GRegex */ + g_warning ("A NULL argument was passed to PCRE"); + break; + case PCRE_ERROR_BADOPTION: + return "bad options"; + case PCRE_ERROR_BADMAGIC: + return _("corrupted object"); + case PCRE_ERROR_UNKNOWN_OPCODE: + return N_("internal error or corrupted object"); + case PCRE_ERROR_NOMEMORY: + return _("out of memory"); + case PCRE_ERROR_NOSUBSTRING: + /* not used by pcre_exec() */ + break; + case PCRE_ERROR_MATCHLIMIT: + return _("backtracking limit reached"); + case PCRE_ERROR_CALLOUT: + /* callouts are not implemented */ + break; + case PCRE_ERROR_BADUTF8: + case PCRE_ERROR_BADUTF8_OFFSET: + /* we do not check if strings are valid */ + break; + case PCRE_ERROR_PARTIAL: + /* not an error */ + break; + case PCRE_ERROR_BADPARTIAL: + return _("the pattern contains items not supported for partial matching"); + case PCRE_ERROR_INTERNAL: + return _("internal error"); + case PCRE_ERROR_BADCOUNT: + /* negative ovecsize, this should not happen in GRegex */ + g_warning ("A negative ovecsize was passed to PCRE"); + break; + case PCRE_ERROR_DFA_UITEM: + return _("the pattern contains items not supported for partial matching"); + case PCRE_ERROR_DFA_UCOND: + return _("back references as conditions are not supported for partial matching"); + case PCRE_ERROR_DFA_UMLIMIT: + /* the match_field field is not udes in GRegex */ + break; + case PCRE_ERROR_DFA_WSSIZE: + /* handled expanding the workspace */ + break; + case PCRE_ERROR_DFA_RECURSE: + case PCRE_ERROR_RECURSIONLIMIT: + return _("recursion limit reached"); + case PCRE_ERROR_NULLWSLIMIT: + return _("workspace limit for empty substrings reached"); + case PCRE_ERROR_BADNEWLINE: + return _("invalid combination of newline flags"); + 
default: + break; + } + return _("unknown error"); +} + +GQuark +g_regex_error_quark (void) +{ + static GQuark error_quark = 0; + + if (error_quark == 0) + error_quark = g_quark_from_static_string ("g-regex-error-quark"); + + return error_quark; +} + +static GRegexPattern * +regex_pattern_new (pcre *re, + const gchar *pattern, + GRegexCompileFlags compile_options, + GRegexMatchFlags match_options) +{ + GRegexPattern *rp = g_new0 (GRegexPattern, 1); + rp->ref_count = 1; + rp->pcre_re = re; + rp->pattern = g_strdup (pattern); + rp->compile_opts = compile_options; + rp->match_opts = match_options; + return rp; +} + +static GRegexPattern * +regex_pattern_ref (GRegexPattern *rp) +{ + /* increases the ref count of the immutable part of the GRegex */ + g_atomic_int_inc ((gint*) &rp->ref_count); + return rp; +} + +static void +regex_pattern_unref (GRegexPattern *rp) +{ + /* decreases the ref count of the immutable part of the GRegex + * and deletes it if the ref count went to 0 */ + if (g_atomic_int_exchange_and_add ((gint *) &rp->ref_count, -1) - 1 == 0) + { + g_free (rp->pattern); + if (rp->pcre_re != NULL) + pcre_free (rp->pcre_re); + if (rp->extra != NULL) + pcre_free (rp->extra); + g_free (rp); + } +} + +static void +regex_match_free (GRegexMatch *rm) +{ + if (rm == NULL) + return; + + g_slist_foreach (rm->delims, (GFunc) g_free, NULL); + g_slist_free (rm->delims); + g_free (rm->offsets); + g_free (rm->workspace); + g_free (rm); +} + +static void +regex_lazy_init_match (GRegex *regex, + gint min_offsets) +{ + gint n_offsets; + + if (regex->match != NULL) + return; + + pcre_fullinfo (regex->pattern->pcre_re, + REGEX_GET_EXTRA (regex), + PCRE_INFO_CAPTURECOUNT, &n_offsets); + n_offsets = (MAX (n_offsets, min_offsets) + 1) * 3; + + regex->match = g_new0 (GRegexMatch, 1); + regex->match->string_len = -1; + regex->match->matches = -1000; + regex->match->n_offsets = n_offsets; + regex->match->offsets = g_new0 (gint, n_offsets); +} + +/** + * g_regex_new: + * @pattern: the 
regular expression. + * @compile_options: compile options for the regular expression. + * @match_options: match options for the regular expression. + * @error: return location for a #GError. + * + * Compiles the regular expression to an internal form, and does the initial + * setup of the #GRegex structure. + * + * Returns: a #GRegex structure. + * + * Since: 2.14 + */ +GRegex * +g_regex_new (const gchar *pattern, + GRegexCompileFlags compile_options, + GRegexMatchFlags match_options, + GError **error) +{ + pcre *re; + const gchar *errmsg; + gint erroffset; + static gboolean initialized = FALSE; + + g_return_val_if_fail (pattern != NULL, NULL); + g_return_val_if_fail (error == NULL || *error == NULL, NULL); + g_return_val_if_fail ((compile_options & ~G_REGEX_COMPILE_MASK) == 0, NULL); + g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); + + if (!initialized) + { + gint support; + const gchar *msg; + + pcre_config (PCRE_CONFIG_UTF8, &support); + if (!support) + { + msg = N_("PCRE library is compiled without UTF8 support"); + g_critical (msg); + g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg)); + return NULL; + } + + pcre_config (PCRE_CONFIG_UNICODE_PROPERTIES, &support); + if (!support) + { + msg = N_("PCRE library is compiled without UTF8 properties support"); + g_critical (msg); + g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, gettext (msg)); + return NULL; + } + + initialized = TRUE; + } + + /* In GRegex the string are, by default, UTF-8 encoded. PCRE + * instead uses UTF-8 only if required with PCRE_UTF8. 
*/ + if (compile_options & G_REGEX_RAW) + { + /* disable utf-8 */ + compile_options &= ~G_REGEX_RAW; + } + else + { + /* enable utf-8 */ + compile_options |= PCRE_UTF8 | PCRE_NO_UTF8_CHECK; + match_options |= PCRE_NO_UTF8_CHECK; + } + + /* compile the pattern */ + re = pcre_compile (pattern, compile_options, &errmsg, &erroffset, NULL); + + /* if the compilation failed, set the error member and return + * immediately */ + if (re == NULL) + { + GError *tmp_error = g_error_new (G_REGEX_ERROR, + G_REGEX_ERROR_COMPILE, + _("Error while compiling regular " + "expression %s at char %d: %s"), + pattern, erroffset, errmsg); + g_propagate_error (error, tmp_error); + + return NULL; + } + else + { + GRegex *regex = g_new0 (GRegex, 1); + regex->pattern = regex_pattern_new (re, pattern, + compile_options, match_options); + return regex; + } +} + +/** + * g_regex_free: + * @regex: a #GRegex. + * + * Frees all the memory associated with the regex structure. + * + * Since: 2.14 + */ +void +g_regex_free (GRegex *regex) +{ + if (regex == NULL) + return; + + regex_pattern_unref (regex->pattern); + regex_match_free (regex->match); + g_free (regex); +} + +/** + * g_regex_copy: + * @regex: a #GRegex structure from g_regex_new(). + * + * Copies a #GRegex. The returned #Gregex is in the same state as after + * a call to g_regex_clear(), so it does not contain information on the + * last match. If @regex is %NULL it returns %NULL. + * + * The returned copy shares some of its internal state with the original + * @regex, and the other internal variables are created only when needed, + * so the copy is a lightweight operation. + * + * Returns: a newly allocated copy of @regex, or %NULL if an error + * occurred. 
+ * + * Since: 2.14 + */ +GRegex * +g_regex_copy (const GRegex *regex) +{ + GRegex *copy; + + if (regex == NULL) + return NULL; + + copy = g_new0 (GRegex, 1); + copy->pattern = regex_pattern_ref (regex->pattern); + + return copy; +} + +/** + * g_regex_get_pattern: + * @regex: a #GRegex structure. + * + * Gets the pattern string associated with @regex, i.e. a copy of the string passed + * to g_regex_new(). + * + * Returns: the pattern of @regex. + * + * Since: 2.14 + */ +const gchar * +g_regex_get_pattern (const GRegex *regex) +{ + g_return_val_if_fail (regex != NULL, NULL); + + return regex->pattern->pattern; +} + +/** + * g_regex_clear: + * @regex: a #GRegex structure. + * + * Clears out the members of @regex that are holding information about the + * last set of matches for this pattern. g_regex_clear() needs to be + * called between uses of g_regex_match_next() or g_regex_match_next_full() + * against new target strings. + * + * Since: 2.14 + */ +void +g_regex_clear (GRegex *regex) +{ + g_return_if_fail (regex != NULL); + + if (regex->match == NULL) + return; + + regex->match->matches = -1000; /* an error code not used by PCRE */ + regex->match->string_len = -1; + regex->match->pos = 0; + + /* if the pattern was used with g_regex_split_next(), it may have + * delimiter offsets stored. Free up those guys as well. */ + if (regex->match->delims != NULL) + { + g_slist_foreach (regex->match->delims, (GFunc) g_free, NULL); + g_slist_free (regex->match->delims); + regex->match->delims = NULL; + } +} + +/** + * g_regex_optimize: + * @regex: a #GRegex structure. + * @error: return location for a #GError. + * + * If the pattern will be used many times, then it may be worth the + * effort to optimize it to improve the speed of matches. + * + * Returns: %TRUE if @regex has been optimized or was already optimized, + * %FALSE otherwise. 
+ * + * Since: 2.14 + */ +gboolean +g_regex_optimize (GRegex *regex, + GError **error) +{ + const gchar *errmsg; + pcre_extra *extra; + pcre_extra G_GNUC_MAY_ALIAS **extra_p; + + g_return_val_if_fail (regex != NULL, FALSE); + g_return_val_if_fail (error == NULL || *error == NULL, FALSE); + + if (REGEX_GET_EXTRA (regex) != NULL) + /* already optimized. */ + return TRUE; + + extra = pcre_study (regex->pattern->pcre_re, 0, &errmsg); + + if (errmsg != NULL) + { + GError *tmp_error = g_error_new (G_REGEX_ERROR, + G_REGEX_ERROR_OPTIMIZE, + _("Error while optimizing " + "regular expression %s: %s"), + regex->pattern->pattern, + errmsg); + g_propagate_error (error, tmp_error); + return FALSE; + } + + if (extra == NULL) + return TRUE; + + extra_p = ®ex->pattern->extra; + if (!g_atomic_pointer_compare_and_exchange ((gpointer *)extra_p, NULL, extra)) + /* someone else has optimized the regex while this function was running */ + pcre_free (extra); + + return TRUE; +} + +/** + * g_regex_match_simple: + * @pattern: the regular expression. + * @string: the string to scan for matches. + * @compile_options: compile options for the regular expression. + * @match_options: match options. + * + * Scans for a match in @string for @pattern. + * + * This function is equivalent to g_regex_match() but it does not + * require to compile the pattern with g_regex_new(), avoiding some + * lines of code when you need just to do a match without extracting + * substrings, capture counts, and so on. + * + * If this function is to be called on the same @pattern more than + * once, it's more efficient to compile the pattern once with + * g_regex_new() and then use g_regex_match(). + * + * Returns: %TRUE is the string matched, %FALSE otherwise. 
+ * + * Since: 2.14 + */ +gboolean +g_regex_match_simple (const gchar *pattern, + const gchar *string, + GRegexCompileFlags compile_options, + GRegexMatchFlags match_options) +{ + GRegex *regex; + gboolean result; + + regex = g_regex_new (pattern, compile_options, 0, NULL); + if (!regex) + return FALSE; + result = g_regex_match_full (regex, string, -1, 0, match_options, NULL); + g_regex_free (regex); + return result; +} + +/** + * g_regex_match: + * @regex: a #GRegex structure from g_regex_new(). + * @string: the string to scan for matches. + * @match_options: match options. + * + * Scans for a match in string for the pattern in @regex. The @match_options + * are combined with the match options specified when the @regex structure + * was created, letting you have more flexibility in reusing #GRegex + * structures. + * + * Returns: %TRUE is the string matched, %FALSE otherwise. + * + * Since: 2.14 + */ +gboolean +g_regex_match (GRegex *regex, + const gchar *string, + GRegexMatchFlags match_options) +{ + return g_regex_match_full (regex, string, -1, 0, + match_options, NULL); +} + +/** + * g_regex_match_full: + * @regex: a #GRegex structure from g_regex_new(). + * @string: the string to scan for matches. + * @string_len: the length of @string, or -1 if @string is nul-terminated. + * @start_position: starting index of the string to match. + * @match_options: match options. + * @error: location to store the error occuring, or NULL to ignore errors. + * + * Scans for a match in string for the pattern in @regex. The @match_options + * are combined with the match options specified when the @regex structure + * was created, letting you have more flexibility in reusing #GRegex + * structures. + * + * Setting @start_position differs from just passing over a shortened string + * and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins + * with any kind of lookbehind assertion, such as "\b". + * + * Returns: %TRUE is the string matched, %FALSE otherwise. 
+ * + * Since: 2.14 + */ +gboolean +g_regex_match_full (GRegex *regex, + const gchar *string, + gssize string_len, + gint start_position, + GRegexMatchFlags match_options, + GError **error) +{ + g_return_val_if_fail (regex != NULL, FALSE); + g_return_val_if_fail (string != NULL, FALSE); + g_return_val_if_fail (start_position >= 0, FALSE); + g_return_val_if_fail (error == NULL || *error == NULL, FALSE); + g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE); + + regex_lazy_init_match (regex, 0); + + if (string_len < 0) + string_len = strlen(string); + + regex->match->string_len = string_len; + + /* create regex->match->offsets if it does not exist */ + regex_lazy_init_match (regex, 0); + + /* perform the match */ + regex->match->matches = pcre_exec (regex->pattern->pcre_re, + REGEX_GET_EXTRA (regex), + string, regex->match->string_len, + start_position, + regex->pattern->match_opts | match_options, + regex->match->offsets, regex->match->n_offsets); + if (IS_PCRE_ERROR (regex->match->matches)) + { + g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, + _("Error while matching regular expression %s: %s"), + regex->pattern->pattern, match_error (regex->match->matches)); + return FALSE; + } + + /* set regex->match->pos to -1 so that a call to g_regex_match_next() + * fails without a previous call to g_regex_clear(). */ + regex->match->pos = -1; + + return regex->match->matches >= 0; +} + +/** + * g_regex_match_next: + * @regex: a #GRegex structure. + * @string: the string to scan for matches. + * @match_options: the match options. + * + * Scans for the next match in @string of the pattern in @regex. + * array. The match options are combined with the match options set when + * the @regex was created. + * + * You have to call g_regex_clear() to reuse the same pattern on a new + * string. + * + * Returns: %TRUE is the string matched, %FALSE otherwise. 
+ * + * Since: 2.14 + */ +gboolean +g_regex_match_next (GRegex *regex, + const gchar *string, + GRegexMatchFlags match_options) +{ + return g_regex_match_next_full (regex, string, -1, 0, + match_options, NULL); +} + +/** + * g_regex_match_next_full: + * @regex: a #GRegex structure. + * @string: the string to scan for matches. + * @string_len: the length of @string, or -1 if @string is nul-terminated. + * @start_position: starting index of the string to match. + * @match_options: the match options. + * @error: location to store the error occuring, or NULL to ignore errors. + * + * Scans for the next match in @string of the pattern in @regex. Calling + * g_regex_match_next_full() until it returns %FALSE, you can retrieve + * all the non-overlapping matches of the pattern in @string. Empty matches + * are included, so matching the string "ab" with the pattern "b*" will + * find three matches: "" at position 0, "b" from position 1 to 2 and + * "" at position 2. + * + * The match options are combined with the match options set when the + * @regex was created. + * + * You have to call g_regex_clear() to reuse the same pattern on a new + * string. + * + * Setting @start_position differs from just passing over a shortened string + * and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins + * with any kind of lookbehind assertion, such as "\b". + * + * Returns: %TRUE is the string matched, %FALSE otherwise. 
+ * + * Since: 2.14 + */ +gboolean +g_regex_match_next_full (GRegex *regex, + const gchar *string, + gssize string_len, + gint start_position, + GRegexMatchFlags match_options, + GError **error) +{ + g_return_val_if_fail (regex != NULL, FALSE); + g_return_val_if_fail (string != NULL, FALSE); + g_return_val_if_fail (start_position >= 0, FALSE); + g_return_val_if_fail (error == NULL || *error == NULL, FALSE); + g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE); + + regex_lazy_init_match (regex, 0); + + if (G_UNLIKELY (regex->match->pos < 0)) + { + const gchar *msg = _("g_regex_match_next_full: called without a " + "previous call to g_regex_clear()"); + g_log (G_LOG_DOMAIN, G_LOG_LEVEL_CRITICAL, msg); + g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, msg); + return FALSE; + } + + /* if this regex hasn't been used on this string before, then we + * need to calculate the length of the string, and set pos to the + * start of it. + * Knowing if this regex has been used on this string is a bit of + * a challenge. For now, we require the user to call g_regex_clear() + * in between usages on a new string. Not perfect, but not such a + * bad solution either. 
+ */ + if (regex->match->string_len == -1) + { + if (string_len < 0) + string_len = strlen(string); + regex->match->string_len = string_len; + + regex->match->pos = start_position; + } + + /* create regex->match->offsets if it does not exist */ + regex_lazy_init_match (regex, 0); + + /* perform the match */ + regex->match->matches = pcre_exec (regex->pattern->pcre_re, + REGEX_GET_EXTRA (regex), + string, regex->match->string_len, + regex->match->pos, + regex->pattern->match_opts | match_options, + regex->match->offsets, regex->match->n_offsets); + if (IS_PCRE_ERROR (regex->match->matches)) + { + g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, + _("Error while matching regular expression %s: %s"), + regex->pattern->pattern, match_error (regex->match->matches)); + return FALSE; + } + + /* avoid infinite loops if regex is an empty string or something + * equivalent */ + if (regex->match->pos == regex->match->offsets[1]) + { + if (regex->match->pos > regex->match->string_len) + { + /* we have reached the end of the string */ + regex->match->pos = -1; + return FALSE; + } + regex->match->pos = NEXT_CHAR (regex, &string[regex->match->pos]) - string; + } + else + { + regex->match->pos = regex->match->offsets[1]; + } + + return regex->match->matches >= 0; +} + +/** + * g_regex_match_all: + * @regex: a #GRegex structure from g_regex_new(). + * @string: the string to scan for matches. + * @match_options: match options. + * + * Using the standard algorithm for regular expression matching only the + * longest match in the string is retrieved. This function uses a + * different algorithm so it can retrieve all the possible matches. + * For more documentation see g_regex_match_all_full(). + * + * Returns: %TRUE is the string matched, %FALSE otherwise. 
+ * + * Since: 2.14 + */ +gboolean +g_regex_match_all (GRegex *regex, + const gchar *string, + GRegexMatchFlags match_options) +{ + return g_regex_match_all_full (regex, string, -1, 0, + match_options, NULL); +} + +/** + * g_regex_match_all_full: + * @regex: a #GRegex structure from g_regex_new(). + * @string: the string to scan for matches. + * @string_len: the length of @string, or -1 if @string is nul-terminated. + * @start_position: starting index of the string to match. + * @match_options: match options. + * @error: location to store the error occuring, or NULL to ignore errors. + * + * Using the standard algorithm for regular expression matching only the + * longest match in the string is retrieved, it is not possibile to obtain + * all the available matches. For instance matching + * "<a> <b> <c>" against the pattern "<.*>" you get + * "<a> <b> <c>". + * + * This function uses a different algorithm (called DFA, i.e. deterministic + * finite automaton), so it can retrieve all the possible matches, all + * starting at the same point in the string. For instance matching + * "<a> <b> <c>" against the pattern "<.*>" you + * would obtain three matches: "<a> <b> <c>", + * "<a> <b>" and "<a>". + * + * The number of matched strings is retrieved using + * g_regex_get_match_count(). + * To obtain the matched strings and their position you can use, + * respectively, g_regex_fetch() and g_regex_fetch_pos(). Note that the + * strings are returned in reverse order of length; that is, the longest + * matching string is given first. + * + * Note that the DFA algorithm is slower than the standard one and it is not + * able to capture substrings, so backreferences do not work. + * + * Setting @start_position differs from just passing over a shortened string + * and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins + * with any kind of lookbehind assertion, such as "\b". + * + * Returns: %TRUE is the string matched, %FALSE otherwise. 
+ * + * Since: 2.14 + */ +gboolean +g_regex_match_all_full (GRegex *regex, + const gchar *string, + gssize string_len, + gint start_position, + GRegexMatchFlags match_options, + GError **error) +{ + g_return_val_if_fail (regex != NULL, FALSE); + g_return_val_if_fail (string != NULL, FALSE); + g_return_val_if_fail (start_position >= 0, FALSE); + g_return_val_if_fail (error == NULL || *error == NULL, FALSE); + g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE); + + regex_lazy_init_match (regex, 0); + + if (string_len < 0) + string_len = strlen(string); + + regex->match->string_len = string_len; + + if (regex->match->workspace == NULL) + { + regex->match->n_workspace = WORKSPACE_INITIAL; + regex->match->workspace = g_new (gint, regex->match->n_workspace); + } + + if (regex->match->n_offsets < OFFSETS_DFA_MIN_SIZE) + { + regex->match->n_offsets = OFFSETS_DFA_MIN_SIZE; + regex->match->offsets = g_realloc (regex->match->offsets, + regex->match->n_offsets * sizeof(gint)); + } + + /* perform the match */ + regex->match->matches = pcre_dfa_exec (regex->pattern->pcre_re, + REGEX_GET_EXTRA (regex), + string, regex->match->string_len, + start_position, + regex->pattern->match_opts | match_options, + regex->match->offsets, regex->match->n_offsets, + regex->match->workspace, + regex->match->n_workspace); + if (regex->match->matches == PCRE_ERROR_DFA_WSSIZE) + { + /* regex->match->workspace is too small. */ + regex->match->n_workspace *= 2; + regex->match->workspace = g_realloc (regex->match->workspace, + regex->match->n_workspace * sizeof(gint)); + return g_regex_match_all_full (regex, string, string_len, + start_position, match_options, error); + } + else if (regex->match->matches == 0) + { + /* regex->match->offsets is too small. 
*/ + regex->match->n_offsets *= 2; + regex->match->offsets = g_realloc (regex->match->offsets, + regex->match->n_offsets * sizeof(gint)); + return g_regex_match_all_full (regex, string, string_len, + start_position, match_options, error); + } + else if (IS_PCRE_ERROR (regex->match->matches)) + { + g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, + _("Error while matching regular expression %s: %s"), + regex->pattern->pattern, match_error (regex->match->matches)); + return FALSE; + } + + /* set regex->match->pos to -1 so that a call to g_regex_match_next() + * fails without a previous call to g_regex_clear(). */ + regex->match->pos = -1; + + return regex->match->matches >= 0; +} + +/** + * g_regex_get_match_count: + * @regex: a #GRegex structure. + * + * Retrieves the number of matched substrings (including substring 0, that + * is the whole matched text) in the last call to g_regex_match*(), so 1 + * is returned if the pattern has no substrings in it and 0 is returned if + * the match failed. + * + * If the last match was obtained using the DFA algorithm, that is using + * g_regex_match_all() or g_regex_match_all_full(), the retrieved + * count is not that of the number of capturing parentheses but that of + * the number of matched substrings. + * + * Returns: Number of matched substrings, or -1 if an error occurred. + * + * Since: 2.14 + */ +gint +g_regex_get_match_count (const GRegex *regex) +{ + g_return_val_if_fail (regex != NULL, -1); + + if (regex->match == NULL) + return -1; + + if (regex->match->matches == PCRE_ERROR_NOMATCH) + /* no match */ + return 0; + else if (regex->match->matches < PCRE_ERROR_NOMATCH) + /* error */ + return -1; + else + /* match */ + return regex->match->matches; +} + +/** + * g_regex_is_partial_match: + * @regex: a #GRegex structure. + * + * Usually if the string passed to g_regex_match*() matches as far as + * it goes, but is too short to match the entire pattern, %FALSE is + * returned. 
There are circumstances where it might be helpful to + * distinguish this case from other cases in which there is no match. + * + * Consider, for example, an application where a human is required to + * type in data for a field with specific formatting requirements. An + * example might be a date in the form ddmmmyy, defined by the pattern + * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$". + * If the application sees the user’s keystrokes one by one, and can + * check that what has been typed so far is potentially valid, it is + * able to raise an error as soon as a mistake is made. + * + * GRegex supports the concept of partial matching by means of the + * #G_REGEX_MATCH_PARTIAL flag. When this is set the return code for + * g_regex_match() or g_regex_match_full() is, as usual, %TRUE + * for a complete match, %FALSE otherwise. But, when this functions + * returns %FALSE, you can check if the match was partial calling + * g_regex_is_partial_match(). + * + * When using partial matching you cannot use g_regex_fetch*(). + * + * Because of the way certain internal optimizations are implemented the + * partial matching algorithm cannot be used with all patterns. So repeated + * single characters such as "a{2,4}" and repeated single metasequences such + * as "\d+" are not permitted if the maximum number of occurrences is + * greater than one. Optional items such as "\d?" (where the maximum is one) + * are permitted. Quantifiers with any values are permitted after + * parentheses, so the invalid examples above can be coded thus "(a){2,4}" + * and "(\d)+". If #G_REGEX_MATCH_PARTIAL is set for a pattern that does + * not conform to the restrictions, matching functions return an error. + * + * Returns: %TRUE if the match was partial, %FALSE otherwise. 
+ * + * Since: 2.14 + */ +gboolean +g_regex_is_partial_match (const GRegex *regex) +{ + g_return_val_if_fail (regex != NULL, FALSE); + + if (regex->match == NULL) + return FALSE; + + return regex->match->matches == PCRE_ERROR_PARTIAL; +} + +/** + * g_regex_fetch: + * @regex: #GRegex structure used in last match. + * @match_num: number of the sub expression. + * @string: the string on which the last match was made. + * + * Retrieves the text matching the @match_num'th capturing parentheses. + * 0 is the full text of the match, 1 is the first paren set, 2 the second, + * and so on. + * + * If @match_num is a valid sub pattern but it didn't match anything (e.g. + * sub pattern 1, matching "b" against "(a)?b") then an empty string is + * returned. + * + * If the last match was obtained using the DFA algorithm, that is using + * g_regex_match_all() or g_regex_match_all_full(), the retrieved + * string is not that of a set of parentheses but that of a matched + * substring. Substrings are matched in reverse order of length, so 0 is + * the longest match. + * + * Returns: The matched substring, or %NULL if an error occurred. + * You have to free the string yourself. + * + * Since: 2.14 + */ +gchar * +g_regex_fetch (const GRegex *regex, + gint match_num, + const gchar *string) +{ + /* we cannot use pcre_get_substring() because it allocates the + * string using pcre_malloc(). */ + gchar *match = NULL; + gint start, end; + + g_return_val_if_fail (regex != NULL, NULL); + g_return_val_if_fail (match_num >= 0, NULL); + + if (regex->match == NULL) + return NULL; + + if (regex->match->string_len < 0) + return NULL; + + /* match_num does not exist or it didn't matched, i.e. matching "b" + * against "(a)?b" then group 0 is empty. 
*/ + if (!g_regex_fetch_pos (regex, match_num, &start, &end)) + match = NULL; + else if (start == -1) + match = g_strdup (""); + else + match = g_strndup (&string[start], end - start); + + return match; +} + +/** + * g_regex_fetch_pos: + * @regex: #GRegex structure used in last match. + * @match_num: number of the sub expression. + * @start_pos: pointer to location where to store the start position. + * @end_pos: pointer to location where to store the end position. + * + * Retrieves the position of the @match_num'th capturing parentheses. + * 0 is the full text of the match, 1 is the first paren set, 2 the second, + * and so on. + * + * If @match_num is a valid sub pattern but it didn't match anything (e.g. + * sub pattern 1, matching "b" against "(a)?b") then @start_pos and @end_pos + * are set to -1 and %TRUE is returned. + * + * If the last match was obtained using the DFA algorithm, that is using + * g_regex_match_all() or g_regex_match_all_full(), the retrieved + * position is not that of a set of parentheses but that of a matched + * substring. Substrings are matched in reverse order of length, so 0 is + * the longest match. + * + * Returns: %TRUE if the position was fetched, %FALSE otherwise. If the + * position cannot be fetched, @start_pos and @end_pos are left + * unchanged. + * + * Since: 2.14 + */ +gboolean +g_regex_fetch_pos (const GRegex *regex, + gint match_num, + gint *start_pos, + gint *end_pos) +{ + g_return_val_if_fail (regex != NULL, FALSE); + g_return_val_if_fail (match_num >= 0, FALSE); + + if (regex->match == NULL) + return FALSE; + + /* make sure the sub expression number they're requesting is less than + * the total number of sub expressions that were matched. 
*/ + if (match_num >= regex->match->matches) + return FALSE; + + if (start_pos != NULL) + { + *start_pos = regex->match->offsets[2 * match_num]; + } + + if (end_pos != NULL) + { + *end_pos = regex->match->offsets[2 * match_num + 1]; + } + + return TRUE; +} + +/** + * g_regex_fetch_named: + * @regex: #GRegex structure used in last match. + * @name: name of the subexpression. + * @string: the string on which the last match was made. + * + * Retrieves the text matching the capturing parentheses named @name. + * + * If @name is a valid sub pattern name but it didn't match anything (e.g. + * sub pattern "X", matching "b" against "(?P<X>a)?b") then an empty + * string is returned. + * + * Returns: The matched substring, or %NULL if an error occurred. + * You have to free the string yourself. + * + * Since: 2.14 + */ +gchar * +g_regex_fetch_named (const GRegex *regex, + const gchar *name, + const gchar *string) +{ + /* we cannot use pcre_get_named_substring() because it allocates the + * string using pcre_malloc(). */ + gint num; + + g_return_val_if_fail (regex != NULL, NULL); + g_return_val_if_fail (string != NULL, NULL); + g_return_val_if_fail (name != NULL, NULL); + + num = g_regex_get_string_number (regex, name); + if (num == -1) + return NULL; + else + return g_regex_fetch (regex, num, string); +} + +/** + * g_regex_fetch_named_pos: + * @regex: #GRegex structure used in last match. + * @name: name of the subexpression. + * @start_pos: pointer to location where to store the start position. + * @end_pos: pointer to location where to store the end position. + * + * Retrieves the position of the capturing parentheses named @name. + * + * If @name is a valid sub pattern name but it didn't match anything (e.g. + * sub pattern "X", matching "b" against "(?P<X>a)?b") then @start_pos and + * @end_pos are set to -1 and %TRUE is returned. + * + * Returns: %TRUE if the position was fetched, %FALSE otherwise. 
If the + * position cannot be fetched, @start_pos and @end_pos are left + * unchanged. + * + * Since: 2.14 + */ +gboolean +g_regex_fetch_named_pos (const GRegex *regex, + const gchar *name, + gint *start_pos, + gint *end_pos) +{ + gint num; + + num = g_regex_get_string_number (regex, name); + if (num == -1) + return FALSE; + + return g_regex_fetch_pos (regex, num, start_pos, end_pos); +} + +/** + * g_regex_fetch_all: + * @regex: a #GRegex structure. + * @string: the string on which the last match was made. + * + * Bundles up pointers to each of the matching substrings from a match + * and stores them in an array of gchar pointers. The first element in + * the returned array is the match number 0, i.e. the entire matched + * text. + * + * If a sub pattern didn't match anything (e.g. sub pattern 1, matching + * "b" against "(a)?b") then an empty string is inserted. + * + * If the last match was obtained using the DFA algorithm, that is using + * g_regex_match_all() or g_regex_match_all_full(), the retrieved + * strings are not that matched by sets of parentheses but that of the + * matched substring. Substrings are matched in reverse order of length, + * so the first one is the longest match. + * + * Returns: a %NULL-terminated array of gchar * pointers. It must be freed + * using g_strfreev(). If the memory can't be allocated, returns + * %NULL. + * + * Since: 2.14 + */ +gchar ** +g_regex_fetch_all (const GRegex *regex, + const gchar *string) +{ + /* we cannot use pcre_get_substring_list() because the returned value + * isn't suitable for g_strfreev(). 
*/ + gchar **result; + gint i; + + g_return_val_if_fail (regex != NULL, FALSE); + g_return_val_if_fail (string != NULL, FALSE); + + if (regex->match == NULL) + return NULL; + + if (regex->match->matches < 0) + return NULL; + + result = g_new (gchar *, regex->match->matches + 1); + for (i = 0; i < regex->match->matches; i++) + result[i] = g_regex_fetch (regex, i, string); + result[i] = NULL; + + return result; +} + +/** + * g_regex_get_string_number: + * @regex: #GRegex structure. + * @name: name of the subexpression. + * + * Retrieves the number of the subexpression named @name. + * + * Returns: The number of the subexpression or -1 if @name does not exists. + * + * Since: 2.14 + */ +gint +g_regex_get_string_number (const GRegex *regex, + const gchar *name) +{ + gint num; + + g_return_val_if_fail (regex != NULL, -1); + g_return_val_if_fail (name != NULL, -1); + + num = pcre_get_stringnumber (regex->pattern->pcre_re, name); + if (num == PCRE_ERROR_NOSUBSTRING) + num = -1; + + return num; +} + +/** + * g_regex_split_simple: + * @pattern: the regular expression. + * @string: the string to scan for matches. + * @compile_options: compile options for the regular expression. + * @match_options: match options. + * + * Breaks the string on the pattern, and returns an array of the tokens. + * If the pattern contains capturing parentheses, then the text for each + * of the substrings will also be returned. If the pattern does not match + * anywhere in the string, then the whole string is returned as the first + * token. + * + * This function is equivalent to g_regex_split() but it does not + * require to compile the pattern with g_regex_new(), avoiding some + * lines of code when you need just to do a split without extracting + * substrings, capture counts, and so on. + * + * If this function is to be called on the same @pattern more than + * once, it's more efficient to compile the pattern once with + * g_regex_new() and then use g_regex_split(). 
+ * + * As a special case, the result of splitting the empty string "" is an + * empty vector, not a vector containing a single string. The reason for + * this special case is that being able to represent a empty vector is + * typically more useful than consistent handling of empty elements. If + * you do need to represent empty elements, you'll need to check for the + * empty string before calling this function. + * + * A pattern that can match empty strings splits @string into separate + * characters wherever it matches the empty string between characters. + * For example splitting "ab c" using as a separator "\s*", you will get + * "a", "b" and "c". + * + * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev(). + * + * Since: 2.14 + **/ +gchar ** +g_regex_split_simple (const gchar *pattern, + const gchar *string, + GRegexCompileFlags compile_options, + GRegexMatchFlags match_options) +{ + GRegex *regex; + gchar **result; + + regex = g_regex_new (pattern, compile_options, 0, NULL); + if (!regex) + return NULL; + result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL); + g_regex_free (regex); + return result; +} + +/** + * g_regex_split: + * @regex: a #GRegex structure. + * @string: the string to split with the pattern. + * @match_options: match time option flags. + * + * Breaks the string on the pattern, and returns an array of the tokens. + * If the pattern contains capturing parentheses, then the text for each + * of the substrings will also be returned. If the pattern does not match + * anywhere in the string, then the whole string is returned as the first + * token. + * + * As a special case, the result of splitting the empty string "" is an + * empty vector, not a vector containing a single string. The reason for + * this special case is that being able to represent a empty vector is + * typically more useful than consistent handling of empty elements. 
If + * you do need to represent empty elements, you'll need to check for the + * empty string before calling this function. + * + * A pattern that can match empty strings splits @string into separate + * characters wherever it matches the empty string between characters. + * For example splitting "ab c" using as a separator "\s*", you will get + * "a", "b" and "c". + * + * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev(). + * + * Since: 2.14 + **/ +gchar ** +g_regex_split (GRegex *regex, + const gchar *string, + GRegexMatchFlags match_options) +{ + return g_regex_split_full (regex, string, -1, 0, + match_options, 0, NULL); +} + +/** + * g_regex_split_full: + * @regex: a #GRegex structure. + * @string: the string to split with the pattern. + * @string_len: the length of @string, or -1 if @string is nul-terminated. + * @start_position: starting index of the string to match. + * @match_options: match time option flags. + * @max_tokens: the maximum number of tokens to split @string into. If this + * is less than 1, the string is split completely. + * @error: return location for a #GError. + * + * Breaks the string on the pattern, and returns an array of the tokens. + * If the pattern contains capturing parentheses, then the text for each + * of the substrings will also be returned. If the pattern does not match + * anywhere in the string, then the whole string is returned as the first + * token. + * + * As a special case, the result of splitting the empty string "" is an + * empty vector, not a vector containing a single string. The reason for + * this special case is that being able to represent a empty vector is + * typically more useful than consistent handling of empty elements. If + * you do need to represent empty elements, you'll need to check for the + * empty string before calling this function. 
+ * + * A pattern that can match empty strings splits @string into separate + * characters wherever it matches the empty string between characters. + * For example splitting "ab c" using as a separator "\s*", you will get + * "a", "b" and "c". + * + * Setting @start_position differs from just passing over a shortened string + * and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins + * with any kind of lookbehind assertion, such as "\b". + * + * Returns: a %NULL-terminated gchar ** array. Free it using g_strfreev(). + * + * Since: 2.14 + **/ +gchar ** +g_regex_split_full (GRegex *regex, + const gchar *string, + gssize string_len, + gint start_position, + GRegexMatchFlags match_options, + gint max_tokens, + GError **error) +{ + gchar **string_list; /* The array of char **s worked on */ + gint pos; + gint tokens; + GList *list, *last; + GError *tmp_error = NULL; + + g_return_val_if_fail (regex != NULL, NULL); + g_return_val_if_fail (string != NULL, NULL); + g_return_val_if_fail (start_position >= 0, NULL); + g_return_val_if_fail (error == NULL || *error == NULL, NULL); + g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); + + regex_lazy_init_match (regex, 0); + + if (max_tokens <= 0) + max_tokens = G_MAXINT; + + if (string_len < 0) + string_len = strlen(string); + + if (string_len - start_position == 0) + return g_new0 (gchar *, 1); + + /* clear out the regex for reuse, just in case */ + g_regex_clear (regex); + + list = NULL; + tokens = 0; + while (TRUE) + { + gchar *token; + + /* -1 to leave room for the last part. */ + if (tokens >= max_tokens - 1) + { + /* we have reached the maximum number of tokens, so we copy + * the remaining part of the string. */ + if (regex->match->last_match_is_empty) + { + /* the last match was empty, so we have moved one char + * after the real position to avoid empty matches at the + * same position. 
*/ + regex->match->pos = PREV_CHAR (regex, &string[regex->match->pos]) - string; + } + /* the if is needed in the case we have terminated the available + * tokens, but we are at the end of the string, so there are no + * characters left to copy. */ + if (string_len > regex->match->pos) + { + token = g_strndup (string + regex->match->pos, + string_len - regex->match->pos); + list = g_list_prepend (list, token); + } + /* end the loop. */ + break; + } + + token = g_regex_split_next_full (regex, string, string_len, start_position, + match_options, &tmp_error); + if (tmp_error != NULL) + { + g_propagate_error (error, tmp_error); + g_list_foreach (list, (GFunc)g_free, NULL); + g_list_free (list); + regex->match->pos = -1; + return NULL; + } + + if (token == NULL) + /* no more tokens. */ + break; + + tokens++; + list = g_list_prepend (list, token); + } + + string_list = g_new (gchar *, g_list_length (list) + 1); + pos = 0; + for (last = g_list_last (list); last; last = g_list_previous (last)) + string_list[pos++] = last->data; + string_list[pos] = 0; + + regex->match->pos = -1; + g_list_free (list); + + return string_list; +} + +/** + * g_regex_split_next: + * @regex: a #GRegex structure from g_regex_new(). + * @string: the string to split on pattern. + * @match_options: match time options for the regex. + * + * g_regex_split_next() breaks the string on pattern, and returns the + * tokens, one per call. If the pattern contains capturing parentheses, + * then the text for each of the substrings will also be returned. + * If the pattern does not match anywhere in the string, then the whole + * string is returned as the first token. + * + * A pattern that can match empty strings splits @string into separate + * characters wherever it matches the empty string between characters. + * For example splitting "ab c" using as a separator "\s*", you will get + * "a", "b" and "c". + * + * You have to call g_regex_clear() to reuse the same pattern on a new + * string. 
+ * + * Returns: a gchar * to the next token of the string. + * + * Since: 2.14 + */ +gchar * +g_regex_split_next (GRegex *regex, + const gchar *string, + GRegexMatchFlags match_options) +{ + return g_regex_split_next_full (regex, string, -1, 0, match_options, + NULL); +} + +/** + * g_regex_split_next_full: + * @regex: a #GRegex structure from g_regex_new(). + * @string: the string to split on pattern. + * @string_len: the length of @string, or -1 if @string is nul-terminated. + * @start_position: starting index of the string to match. + * @match_options: match time options for the regex. + * @error: return location for a #GError. + * + * g_regex_split_next_full() breaks the string on pattern, and returns + * the tokens, one per call. If the pattern contains capturing parentheses, + * then the text for each of the substrings will also be returned. + * If the pattern does not match anywhere in the string, then the whole + * string is returned as the first token. + * + * A pattern that can match empty strings splits @string into separate + * characters wherever it matches the empty string between characters. + * For example splitting "ab c" using as a separator "\s*", you will get + * "a", "b" and "c". + * + * You have to call g_regex_clear() to reuse the same pattern on a new + * string. + * + * Setting @start_position differs from just passing over a shortened string + * and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins + * with any kind of lookbehind assertion, such as "\b". + * + * Returns: a gchar * to the next token of the string. 
+ * + * Since: 2.14 + */ +gchar * +g_regex_split_next_full (GRegex *regex, + const gchar *string, + gssize string_len, + gint start_position, + GRegexMatchFlags match_options, + GError **error) +{ + gint new_pos; + gchar *token = NULL; + gboolean match_ok; + gint match_count; + GError *tmp_error = NULL; + + g_return_val_if_fail (regex != NULL, NULL); + g_return_val_if_fail (string != NULL, NULL); + g_return_val_if_fail (error == NULL || *error == NULL, NULL); + g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); + + regex_lazy_init_match (regex, 0); + + new_pos = MAX (regex->match->pos, start_position); + if (regex->match->last_match_is_empty) + /* if the last match was empty, g_regex_match_next_full() has moved + * forward to avoid infinite loops, but we still need to copy that + * character. */ + new_pos = PREV_CHAR(regex, &string[new_pos]) - string; + + /* if there are delimiter substrings stored, return those one at a + * time. + */ + if (regex->match->delims != NULL) + { + token = regex->match->delims->data; + regex->match->delims = g_slist_remove (regex->match->delims, token); + return token; + } + + if (regex->match->pos == -1) + /* the last call to g_regex_match_next_full() returned NULL. */ + return NULL; + + if (regex->match->string_len < 0) + { + regex->match->last_match_is_empty = FALSE; + /* initialize last_separator_end to start_position to skip the + * empty token at the beginning of the string. */ + regex->match->last_separator_end = start_position; + } + + /* use g_regex_match_next() to find the next occurance of the pattern + * in the string. We use new_pos to keep track of where the stuff + * up to the current match starts. Copy that token of the string off + * and append it to the buffer using g_strndup. 
*/ + match_ok = g_regex_match_next_full (regex, string, string_len, + start_position, match_options, + &tmp_error); + if (tmp_error != NULL) + { + g_propagate_error (error, tmp_error); + return NULL; + } + + if (match_ok) + { + regex->match->last_match_is_empty = + (regex->match->offsets[0] == regex->match->offsets[1]); + + /* we need to skip empty separators at the same position of the end + * of another separator. e.g. the string is "a b" and the separator + * is "*", so from 1 to 2 we have a match and at position 2 we have + * an empty match. */ + if (regex->match->last_separator_end != regex->match->offsets[1]) + { + token = g_strndup (string + new_pos, regex->match->offsets[0] - new_pos); + + /* if there were substrings, these need to get added to the + * list of delims */ + match_count = g_regex_get_match_count (regex); + if (match_count > 1) + { + gint i; + for (i = 1; i < match_count; i++) + regex->match->delims = g_slist_append (regex->match->delims, + g_regex_fetch (regex, i, string)); + } + + regex->match->last_separator_end = regex->match->offsets[1]; + } + else + { + /* we have skipped an empty separator so we need to find the + * next match. */ + return g_regex_split_next_full (regex, string, string_len, + start_position, match_options, + error); + } + } + else + { + /* if there was no match, copy to end of string. 
*/ + if (!regex->match->last_match_is_empty) + token = g_strndup (string + new_pos, regex->match->string_len - new_pos); + else + token = NULL; + } + + return token; +} + +enum +{ + REPL_TYPE_STRING, + REPL_TYPE_CHARACTER, + REPL_TYPE_SYMBOLIC_REFERENCE, + REPL_TYPE_NUMERIC_REFERENCE, + REPL_TYPE_CHANGE_CASE +}; + +typedef enum +{ + CHANGE_CASE_NONE = 1 << 0, + CHANGE_CASE_UPPER = 1 << 1, + CHANGE_CASE_LOWER = 1 << 2, + CHANGE_CASE_UPPER_SINGLE = 1 << 3, + CHANGE_CASE_LOWER_SINGLE = 1 << 4, + CHANGE_CASE_SINGLE_MASK = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_LOWER_SINGLE, + CHANGE_CASE_LOWER_MASK = CHANGE_CASE_LOWER | CHANGE_CASE_LOWER_SINGLE, + CHANGE_CASE_UPPER_MASK = CHANGE_CASE_UPPER | CHANGE_CASE_UPPER_SINGLE +} ChangeCase; + +typedef struct +{ + gchar *text; + gint type; + gint num; + gchar c; + ChangeCase change_case; +} InterpolationData; + +static void +free_interpolation_data (InterpolationData *data) +{ + g_free (data->text); + g_free (data); +} + +static const gchar * +expand_escape (const gchar *replacement, + const gchar *p, + InterpolationData *data, + GError **error) +{ + const gchar *q, *r; + gint x, d, h, i; + const gchar *error_detail; + gint base = 0; + GError *tmp_error = NULL; + + p++; + switch (*p) + { + case 't': + p++; + data->c = '\t'; + data->type = REPL_TYPE_CHARACTER; + break; + case 'n': + p++; + data->c = '\n'; + data->type = REPL_TYPE_CHARACTER; + break; + case 'v': + p++; + data->c = '\v'; + data->type = REPL_TYPE_CHARACTER; + break; + case 'r': + p++; + data->c = '\r'; + data->type = REPL_TYPE_CHARACTER; + break; + case 'f': + p++; + data->c = '\f'; + data->type = REPL_TYPE_CHARACTER; + break; + case 'a': + p++; + data->c = '\a'; + data->type = REPL_TYPE_CHARACTER; + break; + case 'b': + p++; + data->c = '\b'; + data->type = REPL_TYPE_CHARACTER; + break; + case '\\': + p++; + data->c = '\\'; + data->type = REPL_TYPE_CHARACTER; + break; + case 'x': + p++; + x = 0; + if (*p == '{') + { + p++; + do + { + h = g_ascii_xdigit_value (*p); + 
if (h < 0) + { + error_detail = _("hexadecimal digit or '}' expected"); + goto error; + } + x = x * 16 + h; + p++; + } + while (*p != '}'); + p++; + } + else + { + for (i = 0; i < 2; i++) + { + h = g_ascii_xdigit_value (*p); + if (h < 0) + { + error_detail = _("hexadecimal digit expected"); + goto error; + } + x = x * 16 + h; + p++; + } + } + data->type = REPL_TYPE_STRING; + data->text = g_new0 (gchar, 8); + g_unichar_to_utf8 (x, data->text); + break; + case 'l': + p++; + data->type = REPL_TYPE_CHANGE_CASE; + data->change_case = CHANGE_CASE_LOWER_SINGLE; + break; + case 'u': + p++; + data->type = REPL_TYPE_CHANGE_CASE; + data->change_case = CHANGE_CASE_UPPER_SINGLE; + break; + case 'L': + p++; + data->type = REPL_TYPE_CHANGE_CASE; + data->change_case = CHANGE_CASE_LOWER; + break; + case 'U': + p++; + data->type = REPL_TYPE_CHANGE_CASE; + data->change_case = CHANGE_CASE_UPPER; + break; + case 'E': + p++; + data->type = REPL_TYPE_CHANGE_CASE; + data->change_case = CHANGE_CASE_NONE; + break; + case 'g': + p++; + if (*p != '<') + { + error_detail = _("missing '<' in symbolic reference"); + goto error; + } + q = p + 1; + do + { + p++; + if (!*p) + { + error_detail = _("unfinished symbolic reference"); + goto error; + } + } + while (*p != '>'); + if (p - q == 0) + { + error_detail = _("zero-length symbolic reference"); + goto error; + } + if (g_ascii_isdigit (*q)) + { + x = 0; + do + { + h = g_ascii_digit_value (*q); + if (h < 0) + { + error_detail = _("digit expected"); + p = q; + goto error; + } + x = x * 10 + h; + q++; + } + while (q != p); + data->num = x; + data->type = REPL_TYPE_NUMERIC_REFERENCE; + } + else + { + r = q; + do + { + if (!g_ascii_isalnum (*r)) + { + error_detail = _("illegal symbolic reference"); + p = r; + goto error; + } + r++; + } + while (r != p); + data->text = g_strndup (q, p - q); + data->type = REPL_TYPE_SYMBOLIC_REFERENCE; + } + p++; + break; + case '0': + /* if \0 is followed by a number is an octal number representing a + * character, else 
it is a numeric reference. */ + if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0) + { + base = 8; + p = g_utf8_next_char (p); + } + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + x = 0; + d = 0; + for (i = 0; i < 3; i++) + { + h = g_ascii_digit_value (*p); + if (h < 0) + break; + if (h > 7) + { + if (base == 8) + break; + else + base = 10; + } + if (i == 2 && base == 10) + break; + x = x * 8 + h; + d = d * 10 + h; + p++; + } + if (base == 8 || i == 3) + { + data->type = REPL_TYPE_STRING; + data->text = g_new0 (gchar, 8); + g_unichar_to_utf8 (x, data->text); + } + else + { + data->type = REPL_TYPE_NUMERIC_REFERENCE; + data->num = d; + } + break; + case 0: + error_detail = _("stray final '\\'"); + goto error; + break; + default: + error_detail = _("unknown escape sequence"); + goto error; + } + + return p; + + error: + /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */ + tmp_error = g_error_new (G_REGEX_ERROR, + G_REGEX_ERROR_REPLACE, + _("Error while parsing replacement " + "text \"%s\" at char %lu: %s"), + replacement, + (gulong)(p - replacement), + error_detail); + g_propagate_error (error, tmp_error); + + return NULL; +} + +static GList * +split_replacement (const gchar *replacement, + GError **error) +{ + GList *list = NULL; + InterpolationData *data; + const gchar *p, *start; + + start = p = replacement; + while (*p) + { + if (*p == '\\') + { + data = g_new0 (InterpolationData, 1); + start = p = expand_escape (replacement, p, data, error); + if (p == NULL) + { + g_list_foreach (list, (GFunc)free_interpolation_data, NULL); + g_list_free (list); + free_interpolation_data (data); + + return NULL; + } + list = g_list_prepend (list, data); + } + else + { + p++; + if (*p == '\\' || *p == '\0') + { + if (p - start > 0) + { + data = g_new0 (InterpolationData, 1); + data->text = g_strndup (start, p - start); + data->type = REPL_TYPE_STRING; + list = g_list_prepend (list, data); + } + } + } + } 
+ + return g_list_reverse (list); +} + +/* Change the case of c based on change_case. */ +#define CHANGE_CASE(c, change_case) \ + (((change_case) & CHANGE_CASE_LOWER_MASK) ? \ + g_unichar_tolower (c) : \ + g_unichar_toupper (c)) + +static void +string_append (GString *string, + const gchar *text, + ChangeCase *change_case) +{ + gunichar c; + + if (text[0] == '\0') + return; + + if (*change_case == CHANGE_CASE_NONE) + { + g_string_append (string, text); + } + else if (*change_case & CHANGE_CASE_SINGLE_MASK) + { + c = g_utf8_get_char (text); + g_string_append_unichar (string, CHANGE_CASE (c, *change_case)); + g_string_append (string, g_utf8_next_char (text)); + *change_case = CHANGE_CASE_NONE; + } + else + { + while (*text != '\0') + { + c = g_utf8_get_char (text); + g_string_append_unichar (string, CHANGE_CASE (c, *change_case)); + text = g_utf8_next_char (text); + } + } +} + +static gboolean +interpolate_replacement (const GRegex *regex, + const gchar *string, + GString *result, + gpointer data) +{ + GList *list; + InterpolationData *idata; + gchar *match; + ChangeCase change_case = CHANGE_CASE_NONE; + + for (list = data; list; list = list->next) + { + idata = list->data; + switch (idata->type) + { + case REPL_TYPE_STRING: + string_append (result, idata->text, &change_case); + break; + case REPL_TYPE_CHARACTER: + g_string_append_c (result, CHANGE_CASE (idata->c, change_case)); + if (change_case & CHANGE_CASE_SINGLE_MASK) + change_case = CHANGE_CASE_NONE; + break; + case REPL_TYPE_NUMERIC_REFERENCE: + match = g_regex_fetch (regex, idata->num, string); + if (match) + { + string_append (result, match, &change_case); + g_free (match); + } + break; + case REPL_TYPE_SYMBOLIC_REFERENCE: + match = g_regex_fetch_named (regex, idata->text, string); + if (match) + { + string_append (result, match, &change_case); + g_free (match); + } + break; + case REPL_TYPE_CHANGE_CASE: + change_case = idata->change_case; + break; + } + } + + return FALSE; +} + +/** + * 
g_regex_expand_references: + * @regex: #GRegex structure used in last match. + * @string: the string on which the last match was made. + * @string_to_expand: the string to expand. + * @error: location to store the error occuring, or NULL to ignore errors. + * + * Returns a new string containing the text in @string_to_expand with + * references expanded. References refer to the last match done with + * @string against @regex and have the same syntax used by g_regex_replace(). + * + * The @string_to_expand must be UTF-8 encoded even if #G_REGEX_RAW was + * passed to g_regex_new(). + * + * Returns: the expanded string, or %NULL if an error occurred. + * + * Since: 2.14 + */ +gchar * +g_regex_expand_references (GRegex *regex, + const gchar *string, + const gchar *string_to_expand, + GError **error) +{ + GString *result; + GList *list; + GError *tmp_error = NULL; + + g_return_val_if_fail (regex != NULL, NULL); + g_return_val_if_fail (string != NULL, NULL); + g_return_val_if_fail (string_to_expand != NULL, NULL); + g_return_val_if_fail (error == NULL || *error == NULL, NULL); + + list = split_replacement (string_to_expand, &tmp_error); + if (tmp_error != NULL) + { + g_propagate_error (error, tmp_error); + return NULL; + } + + result = g_string_sized_new (strlen (string_to_expand)); + interpolate_replacement (regex, string, result, list); + + g_list_foreach (list, (GFunc)free_interpolation_data, NULL); + g_list_free (list); + + return g_string_free (result, FALSE); +} + +/** + * g_regex_replace: + * @regex: a #GRegex structure. + * @string: the string to perform matches against. + * @string_len: the length of @string, or -1 if @string is nul-terminated. + * @start_position: starting index of the string to match. + * @replacement: text to replace each match with. + * @match_options: options for the match. + * @error: location to store the error occuring, or NULL to ignore errors. + * + * Replaces all occurances of the pattern in @regex with the + * replacement text. 
Backreferences of the form '\number' or '\g<number>' + * in the replacement text are interpolated by the number-th captured + * subexpression of the match, '\g<name>' refers to the captured subexpression + * with the given name. '\0' refers to the complete match, but '\0' followed + * by a number is the octal representation of a character. To include a + * literal '\' in the replacement, write '\\'. + * There are also escapes that changes the case of the following text: + * + * + * \l + * + * Convert to lower case the next character + * + * + * \u + * + * Convert to upper case the next character + * + * + * \L + * + * Convert to lower case till \E + * + * + * \U + * + * Convert to upper case till \E + * + * + * \E + * + * End case modification + * + * + * + * + * If you do not need to use backreferences use g_regex_replace_literal(). + * + * The @replacement string must be UTF-8 encoded even if #G_REGEX_RAW was + * passed to g_regex_new(). If you want to use not UTF-8 encoded stings + * you can use g_regex_replace_literal(). + * + * Setting @start_position differs from just passing over a shortened string + * and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins + * with any kind of lookbehind assertion, such as "\b". + * + * Returns: a newly allocated string containing the replacements. 
+ * + * Since: 2.14 + */ +gchar * +g_regex_replace (GRegex *regex, + const gchar *string, + gssize string_len, + gint start_position, + const gchar *replacement, + GRegexMatchFlags match_options, + GError **error) +{ + gchar *result; + GList *list; + GError *tmp_error = NULL; + + g_return_val_if_fail (regex != NULL, NULL); + g_return_val_if_fail (string != NULL, NULL); + g_return_val_if_fail (start_position >= 0, NULL); + g_return_val_if_fail (replacement != NULL, NULL); + g_return_val_if_fail (error == NULL || *error == NULL, NULL); + g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); + + list = split_replacement (replacement, &tmp_error); + if (tmp_error != NULL) + { + g_propagate_error (error, tmp_error); + return NULL; + } + + result = g_regex_replace_eval (regex, + string, string_len, start_position, + match_options, + interpolate_replacement, + (gpointer)list, + &tmp_error); + if (tmp_error != NULL) + g_propagate_error (error, tmp_error); + + g_list_foreach (list, (GFunc)free_interpolation_data, NULL); + g_list_free (list); + + return result; +} + +static gboolean +literal_replacement (const GRegex *regex, + const gchar *string, + GString *result, + gpointer data) +{ + g_string_append (result, data); + return FALSE; +} + +/** + * g_regex_replace_literal: + * @regex: a #GRegex structure. + * @string: the string to perform matches against. + * @string_len: the length of @string, or -1 if @string is nul-terminated. + * @start_position: starting index of the string to match. + * @replacement: text to replace each match with. + * @match_options: options for the match. + * @error: location to store the error occuring, or NULL to ignore errors. + * + * Replaces all occurances of the pattern in @regex with the + * replacement text. @replacement is replaced literally, to + * include backreferences use g_regex_replace(). 
+ * + * Setting @start_position differs from just passing over a shortened string + * and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins + * with any kind of lookbehind assertion, such as "\b". + * + * Returns: a newly allocated string containing the replacements. + * + * Since: 2.14 + */ +gchar * +g_regex_replace_literal (GRegex *regex, + const gchar *string, + gssize string_len, + gint start_position, + const gchar *replacement, + GRegexMatchFlags match_options, + GError **error) +{ + g_return_val_if_fail (replacement != NULL, NULL); + g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); + + return g_regex_replace_eval (regex, + string, string_len, start_position, + match_options, + literal_replacement, + (gpointer)replacement, + error); +} + +/** + * g_regex_replace_eval: + * @regex: a #GRegex structure from g_regex_new(). + * @string: string to perform matches against. + * @string_len: the length of @string, or -1 if @string is nul-terminated. + * @start_position: starting index of the string to match. + * @match_options: Options for the match. + * @eval: a function to call for each match. + * @user_data: user data to pass to the function. + * @error: location to store the error occuring, or NULL to ignore errors. + * + * Replaces occurances of the pattern in regex with the output of @eval + * for that occurance. + * + * Setting @start_position differs from just passing over a shortened string + * and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that begins + * with any kind of lookbehind assertion, such as "\b". + * + * Returns: a newly allocated string containing the replacements. 
+ * + * Since: 2.14 + */ +gchar * +g_regex_replace_eval (GRegex *regex, + const gchar *string, + gssize string_len, + gint start_position, + GRegexMatchFlags match_options, + GRegexEvalCallback eval, + gpointer user_data, + GError **error) +{ + GString *result; + gint str_pos = 0; + gboolean done = FALSE; + GError *tmp_error = NULL; + + g_return_val_if_fail (regex != NULL, NULL); + g_return_val_if_fail (string != NULL, NULL); + g_return_val_if_fail (start_position >= 0, NULL); + g_return_val_if_fail (eval != NULL, NULL); + g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); + + regex_lazy_init_match (regex, 0); + + if (string_len < 0) + string_len = strlen(string); + + /* clear out the regex for reuse, just in case */ + g_regex_clear (regex); + + result = g_string_sized_new (string_len); + + /* run down the string making matches. */ + while (!done && + g_regex_match_next_full (regex, string, string_len, + start_position, match_options, &tmp_error)) + { + g_string_append_len (result, + string + str_pos, + regex->match->offsets[0] - str_pos); + done = (*eval) (regex, string, result, user_data); + str_pos = regex->match->offsets[1]; + } + + if (tmp_error != NULL) + { + g_propagate_error (error, tmp_error); + g_string_free (result, TRUE); + return NULL; + } + + g_string_append_len (result, string + str_pos, string_len - str_pos); + + return g_string_free (result, FALSE); +} + +/** + * g_regex_escape_string: + * @string: the string to escape. + * @length: the length of @string, or -1 if @string is nul-terminated. + * + * Escapes the special characters used for regular expressions in @string, + * for instance "a.b*c" becomes "a\.b\*c". This function is useful to + * dynamically generate regular expressions. + * + * @string can contain NULL characters that are replaced with "\0", in this + * case remember to specify the correct length of @string in @length. + * + * Returns: a newly allocated escaped string. 
+ * + * Since: 2.14 + */ +gchar * +g_regex_escape_string (const gchar *string, + gint length) +{ + GString *escaped; + const char *p, *piece_start, *end; + + g_return_val_if_fail (string != NULL, NULL); + + if (length < 0) + length = strlen (string); + + end = string + length; + p = piece_start = string; + escaped = g_string_sized_new (length + 1); + + while (p < end) + { + switch (*p) + { + case '\0': + case '\\': + case '|': + case '(': + case ')': + case '[': + case ']': + case '{': + case '}': + case '^': + case '$': + case '*': + case '+': + case '?': + case '.': + if (p != piece_start) + /* copy the previous piece. */ + g_string_append_len (escaped, piece_start, p - piece_start); + g_string_append_c (escaped, '\\'); + if (*p == '\0') + g_string_append_c (escaped, '0'); + else + g_string_append_c (escaped, *p); + piece_start = ++p; + break; + default: + p = g_utf8_next_char (p); + } + } + + if (piece_start < end) + g_string_append_len (escaped, piece_start, end - piece_start); + + return g_string_free (escaped, FALSE); +} + +#define __G_REGEX_C__ +#include "galiasdef.c" diff --git a/glib/gregex.h b/glib/gregex.h new file mode 100644 index 0000000..b9f8860 --- /dev/null +++ b/glib/gregex.h @@ -0,0 +1,197 @@ +/* GRegex -- regular expression API wrapper around PCRE. + * + * Copyright (C) 1999, 2000 Scott Wimer + * Copyright (C) 2004, Matthias Clasen + * Copyright (C) 2005 - 2006, Marco Barisione + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef __G_REGEX_H__ +#define __G_REGEX_H__ + +#include + +G_BEGIN_DECLS + +typedef enum +{ + G_REGEX_ERROR_COMPILE, + G_REGEX_ERROR_OPTIMIZE, + G_REGEX_ERROR_REPLACE, + G_REGEX_ERROR_MATCH +} GRegexError; + +#define G_REGEX_ERROR g_regex_error_quark () + +GQuark g_regex_error_quark (void); + +/* Remember to update G_REGEX_COMPILE_MASK in gregex.c after + * adding a new flag. */ +typedef enum +{ + G_REGEX_CASELESS = 1 << 0, + G_REGEX_MULTILINE = 1 << 1, + G_REGEX_DOTALL = 1 << 2, + G_REGEX_EXTENDED = 1 << 3, + G_REGEX_ANCHORED = 1 << 4, + G_REGEX_DOLLAR_ENDONLY = 1 << 5, + G_REGEX_UNGREEDY = 1 << 9, + G_REGEX_RAW = 1 << 11, + G_REGEX_NO_AUTO_CAPTURE = 1 << 12, + G_REGEX_DUPNAMES = 1 << 19, + G_REGEX_NEWLINE_CR = 1 << 20, + G_REGEX_NEWLINE_LF = 1 << 21, + G_REGEX_NEWLINE_CRLF = G_REGEX_NEWLINE_CR | G_REGEX_NEWLINE_LF +} GRegexCompileFlags; + +/* Remember to update G_REGEX_MATCH_MASK in gregex.c after + * adding a new flag. 
*/
+/* Options accepted by the matching functions. The bit values are the same
+ * as those of the corresponding PCRE_* options in the bundled pcre.h.
+ * The last enumerator carries no trailing comma: C89 and C++98 compilers
+ * reject one in pedantic mode and this is a public header. */
+typedef enum
+{
+  G_REGEX_MATCH_ANCHORED     = 1 << 4,
+  G_REGEX_MATCH_NOTBOL       = 1 << 7,
+  G_REGEX_MATCH_NOTEOL       = 1 << 8,
+  G_REGEX_MATCH_NOTEMPTY     = 1 << 10,
+  G_REGEX_MATCH_PARTIAL      = 1 << 15,
+  G_REGEX_MATCH_NEWLINE_CR   = 1 << 20,
+  G_REGEX_MATCH_NEWLINE_LF   = 1 << 21,
+  G_REGEX_MATCH_NEWLINE_CRLF = G_REGEX_MATCH_NEWLINE_CR | G_REGEX_MATCH_NEWLINE_LF,
+  G_REGEX_MATCH_NEWLINE_ANY  = 1 << 22
+} GRegexMatchFlags;
+
+typedef struct _GRegex GRegex;
+
+typedef gboolean (*GRegexEvalCallback) (const GRegex*, const gchar*, GString*, gpointer);
+
+
+GRegex *g_regex_new (const gchar *pattern,
+ GRegexCompileFlags compile_options,
+ GRegexMatchFlags match_options,
+ GError **error);
+void g_regex_free (GRegex *regex);
+gboolean g_regex_optimize (GRegex *regex,
+ GError **error);
+GRegex *g_regex_copy (const GRegex *regex);
+const gchar *g_regex_get_pattern (const GRegex *regex);
+void g_regex_clear (GRegex *regex);
+gboolean g_regex_match_simple (const gchar *pattern,
+ const gchar *string,
+ GRegexCompileFlags compile_options,
+ GRegexMatchFlags match_options);
+gboolean g_regex_match (GRegex *regex,
+ const gchar *string,
+ GRegexMatchFlags match_options);
+gboolean g_regex_match_full (GRegex *regex,
+ const gchar *string,
+ gssize string_len,
+ gint start_position,
+ GRegexMatchFlags match_options,
+ GError **error);
+gboolean g_regex_match_next (GRegex *regex,
+ const gchar *string,
+ GRegexMatchFlags match_options);
+gboolean g_regex_match_next_full (GRegex *regex,
+ const gchar *string,
+ gssize string_len,
+ gint start_position,
+ GRegexMatchFlags match_options,
+ GError **error);
+gboolean g_regex_match_all (GRegex *regex,
+ const gchar *string,
+ GRegexMatchFlags match_options);
+gboolean g_regex_match_all_full (GRegex *regex,
+ const gchar *string,
+ gssize string_len,
+ gint start_position,
+ GRegexMatchFlags match_options,
+ GError **error);
+gint g_regex_get_match_count (const GRegex *regex);
+gboolean g_regex_is_partial_match (const GRegex *regex);
+gchar
*g_regex_fetch (const GRegex *regex, + gint match_num, + const gchar *string); +gboolean g_regex_fetch_pos (const GRegex *regex, + gint match_num, + gint *start_pos, + gint *end_pos); +gchar *g_regex_fetch_named (const GRegex *regex, + const gchar *name, + const gchar *string); +gboolean g_regex_fetch_named_pos (const GRegex *regex, + const gchar *name, + gint *start_pos, + gint *end_pos); +gchar **g_regex_fetch_all (const GRegex *regex, + const gchar *string); +gint g_regex_get_string_number (const GRegex *regex, + const gchar *name); +gchar **g_regex_split_simple (const gchar *pattern, + const gchar *string, + GRegexCompileFlags compile_options, + GRegexMatchFlags match_options); +gchar **g_regex_split (GRegex *regex, + const gchar *string, + GRegexMatchFlags match_options); +gchar **g_regex_split_full (GRegex *regex, + const gchar *string, + gssize string_len, + gint start_position, + GRegexMatchFlags match_options, + gint max_tokens, + GError **error); +gchar *g_regex_split_next (GRegex *regex, + const gchar *string, + GRegexMatchFlags match_options); +gchar *g_regex_split_next_full (GRegex *regex, + const gchar *string, + gssize string_len, + gint start_position, + GRegexMatchFlags match_options, + GError **error); +gchar *g_regex_expand_references (GRegex *regex, + const gchar *string, + const gchar *string_to_expand, + GError **error); +gchar *g_regex_replace (GRegex *regex, + const gchar *string, + gssize string_len, + gint start_position, + const gchar *replacement, + GRegexMatchFlags match_options, + GError **error); +gchar *g_regex_replace_literal (GRegex *regex, + const gchar *string, + gssize string_len, + gint start_position, + const gchar *replacement, + GRegexMatchFlags match_options, + GError **error); +gchar *g_regex_replace_eval (GRegex *regex, + const gchar *string, + gssize string_len, + gint start_position, + GRegexMatchFlags match_options, + GRegexEvalCallback eval, + gpointer user_data, + GError **error); +gchar *g_regex_escape_string (const 
gchar *string, + gint length); + + +G_END_DECLS + + +#endif /* __G_REGEX_H__ */ diff --git a/glib/makefile.msc.in b/glib/makefile.msc.in index fc72078..189a430 100644 --- a/glib/makefile.msc.in +++ b/glib/makefile.msc.in @@ -17,6 +17,7 @@ all : \ galias.h \ galiasdef.c \ gnulib\gnulib.lib \ + pcre\pcre.lib \ libglib-2.0-0.dll \ glib-@GLIB_MAJOR_VERSION@.@GLIB_MINOR_VERSION@s.lib \ gspawn-win32-helper.exe \ @@ -27,6 +28,11 @@ gnulib\gnulib.lib : nmake -f makefile.msc cd .. +pcre\pcre.lib : + cd pcre + nmake -f makefile.msc + cd .. + glib_OBJECTS = \ garray.obj \ gasyncqueue.obj \ @@ -61,6 +67,7 @@ glib_OBJECTS = \ gpattern.obj \ gprintf.obj \ grand.obj \ + gregex.obj \ grel.obj \ gscanner.obj \ gsequence.obj \ @@ -112,12 +119,12 @@ glib.res : glib.rc # create a static libary # static library can well have the real version number in the name -glib-@GLIB_MAJOR_VERSION@.@GLIB_MINOR_VERSION@s.lib : $(glib_OBJECTS) gnulib\gnulib.lib - lib /out:glib-@GLIB_MAJOR_VERSION@.@GLIB_MINOR_VERSION@s.lib $(glib_OBJECTS) gnulib\gnulib.lib +glib-@GLIB_MAJOR_VERSION@.@GLIB_MINOR_VERSION@s.lib : $(glib_OBJECTS) gnulib\gnulib.lib pcre\pcre.lib + lib /out:glib-@GLIB_MAJOR_VERSION@.@GLIB_MINOR_VERSION@s.lib $(glib_OBJECTS) gnulib\gnulib.lib pcre\pcre.lib -libglib-2.0-0.dll : $(glib_OBJECTS) gnulib\gnulib.lib glib.def glib.res +libglib-2.0-0.dll : $(glib_OBJECTS) gnulib\gnulib.lib pcre\pcre.lib glib.def glib.res $(CC) $(CFLAGS) -LD -Fe$@ $(glib_OBJECTS) glib.res $(LIBICONV_LIBS) $(INTL_LIBS) \ - gnulib\gnulib.lib $(DIRENT_LIBS) user32.lib advapi32.lib shell32.lib wsock32.lib ole32.lib ws2_32.lib \ + gnulib\gnulib.lib pcre\pcre.lib $(DIRENT_LIBS) user32.lib advapi32.lib shell32.lib wsock32.lib ole32.lib ws2_32.lib \ $(LDFLAGS) /implib:glib-2.0.lib /def:glib.def gspawn-win32-helper.exe : gspawn-win32-helper.c libglib-2.0-@LT_CURRENT_MINUS_AGE@.dll diff --git a/glib/pcre/COPYING b/glib/pcre/COPYING new file mode 100644 index 0000000..58241b2 --- /dev/null +++ b/glib/pcre/COPYING @@ -0,0 
+1,68 @@ +PCRE LICENCE +------------ + +PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + +Release 7 of PCRE is distributed under the terms of the "BSD" licence, as +specified below. The documentation for PCRE, supplied in the "doc" +directory, is distributed under the same terms as the software itself. + +The basic library functions are written in C and are freestanding. Also +included in the distribution is a set of C++ wrapper functions. + + +THE BASIC LIBRARY FUNCTIONS +--------------------------- + +Written by: Philip Hazel +Email local part: ph10 +Email domain: cam.ac.uk + +University of Cambridge Computing Service, +Cambridge, England. Phone: +44 1223 334714. + +Copyright (c) 1997-2006 University of Cambridge +All rights reserved. + + +THE C++ WRAPPER FUNCTIONS +------------------------- + +Contributed by: Google Inc. + +Copyright (c) 2006, Google Inc. +All rights reserved. + + +THE "BSD" LICENCE +----------------- + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the name of Google + Inc. nor the names of their contributors may be used to endorse or + promote products derived from this software without specific prior + written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +End diff --git a/glib/pcre/Makefile.am b/glib/pcre/Makefile.am new file mode 100644 index 0000000..61f8aa2 --- /dev/null +++ b/glib/pcre/Makefile.am @@ -0,0 +1,62 @@ +INCLUDES = \ + -DG_LOG_DOMAIN=\"GLib-GRegex\" \ + -DSUPPORT_UCP \ + -DSUPPORT_UTF8 \ + -DNEWLINE=-1 \ + -DMATCH_LIMIT=10000000 \ + -DMATCH_LIMIT_RECURSION=10000000 \ + -DMAX_NAME_SIZE=32 \ + -DMAX_NAME_COUNT=10000 \ + -DMAX_DUPLENGTH=30000 \ + -DLINK_SIZE=2 \ + -DEBCDIC=0 \ + -DPOSIX_MALLOC_THRESHOLD=10 \ + -I$(top_srcdir) \ + -I$(srcdir) \ + -I$(top_srcdir)/glib \ + @GLIB_DEBUG_FLAGS@ \ + -DG_DISABLE_DEPRECATED \ + $(DEPRECATED_FLAGS)\ + $(WARN_CFLAGS) \ + $(PCRE_WARN_CFLAGS) \ + $(DEP_CFLAGS) + +noinst_LTLIBRARIES = libpcre.la + +libpcre_headers = + +libpcre_la_SOURCES = \ + pcre_chartables.c \ + pcre_compile.c \ + pcre_config.c \ + pcre_dfa_exec.c \ + pcre_exec.c \ + pcre_fullinfo.c \ + pcre_get.c \ + pcre_globals.c \ + pcre_info.c \ + pcre_maketables.c \ + pcre_newline.c \ + pcre_ord2utf8.c \ + pcre_refcount.c \ + pcre_study.c \ + pcre_tables.c \ + pcre_try_flipped.c \ + pcre_ucp_searchfuncs.c \ + pcre_valid_utf8.c \ + pcre_version.c \ + pcre_xclass.c \ + pcre.h \ + pcre_internal.h \ + ucp.h \ + ucpinternal.h \ + $(libpcre_headers) + 
+libpcre_la_LIBADD = $(DEP_LIBS) + +libpcre_la_LDFLAGS = -no-undefined + +EXTRA_DIST = \ + COPYING \ + makefile.msc + diff --git a/glib/pcre/makefile.msc b/glib/pcre/makefile.msc new file mode 100644 index 0000000..9e4371a --- /dev/null +++ b/glib/pcre/makefile.msc @@ -0,0 +1,49 @@ +!IFDEF DEBUG +CRT=-MDd +!ELSE +CRT=-MD +!ENDIF + +CFLAGS = \ + -I ..\.. \ + -DHAVE_CONFIG_H \ + -DHAVE_LONG_LONG_FORMAT \ + -DSUPPORT_UCP \ + -DSUPPORT_UTF8 \ + -DNEWLINE=10 \ + -DMATCH_LIMIT=10000000 \ + -DMATCH_LIMIT_RECURSION=10000000 \ + -DMAX_NAME_SIZE=32 \ + -DMAX_NAME_COUNT=10000 \ + -DMAX_DUPLENGTH=30000 \ + -DLINK_SIZE=2 \ + -DEBCDIC=0 \ + -DPOSIX_MALLOC_THRESHOLD=10 + +OBJECTS = \ + pcre_chartables.obj \ + pcre_compile.obj \ + pcre_config.obj \ + pcre_dfa_exec.obj \ + pcre_exec.obj \ + pcre_fullinfo.obj \ + pcre_get.obj \ + pcre_globals.obj \ + pcre_info.obj \ + pcre_maketables.obj \ + pcre_newline.obj \ + pcre_ord2utf8.obj \ + pcre_refcount.obj \ + pcre_study.obj \ + pcre_tables.obj \ + pcre_try_flipped.obj \ + pcre_ucp_searchfuncs.obj \ + pcre_valid_utf8.obj \ + pcre_version.obj \ + pcre_xclass.obj \ + +pcre.lib : $(OBJECTS) + lib -out:pcre.lib $(OBJECTS) + +.c.obj: + $(CC) $(CRT) $(CFLAGS) -Ox -GD -c $< diff --git a/glib/pcre/pcre.h b/glib/pcre/pcre.h new file mode 100644 index 0000000..1b29a18 --- /dev/null +++ b/glib/pcre/pcre.h @@ -0,0 +1,283 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* This is the public header file for the PCRE library, to be #included by +applications that call the PCRE functions. 
+ + Copyright (c) 1997-2006 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +#ifndef _PCRE_H +#define _PCRE_H + +/* The current PCRE version information. */ + +/* NOTES FOR FUTURE MAINTAINERS: Do not use numbers with leading zeros, because +they may be treated as octal constants. The PCRE_PRERELEASE feature is for +identifying release candidates. It might be defined as -RC2, for example. 
In +real releases, it should be defined empty. Do not change the alignment of these +statments. The code in ./configure greps out the version numbers by using "cut" +to get values from column 29 onwards. These are substituted into pcre-config +and libpcre.pc. The values are not put into configure.ac and substituted here +(which would simplify this issue) because that makes life harder for those who +cannot run ./configure. As it now stands, this file need not be edited in that +circumstance. */ + +#define PCRE_MAJOR 7 +#define PCRE_MINOR 0 +#define PCRE_PRERELEASE +#define PCRE_DATE 18-Dec-2006 + +/* Win32 uses DLL by default; it needs special stuff for exported functions +when building PCRE. */ + +#ifdef _WIN32 +# ifdef PCRE_DEFINITION +# ifdef DLL_EXPORT +# define PCRE_DATA_SCOPE __declspec(dllexport) +# endif +# else +# ifndef PCRE_STATIC +# define PCRE_DATA_SCOPE extern __declspec(dllimport) +# endif +# endif +#endif + +/* Otherwise, we use the standard "extern". */ + +#ifndef PCRE_DATA_SCOPE +# ifdef __cplusplus +# define PCRE_DATA_SCOPE extern "C" +# else +# define PCRE_DATA_SCOPE extern +# endif +#endif + +/* Have to include stdlib.h in order to ensure that size_t is defined; +it is needed here for malloc. 
*/ + +#include + +/* Allow for C++ users */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Options */ + +#define PCRE_CASELESS 0x00000001 +#define PCRE_MULTILINE 0x00000002 +#define PCRE_DOTALL 0x00000004 +#define PCRE_EXTENDED 0x00000008 +#define PCRE_ANCHORED 0x00000010 +#define PCRE_DOLLAR_ENDONLY 0x00000020 +#define PCRE_EXTRA 0x00000040 +#define PCRE_NOTBOL 0x00000080 +#define PCRE_NOTEOL 0x00000100 +#define PCRE_UNGREEDY 0x00000200 +#define PCRE_NOTEMPTY 0x00000400 +#define PCRE_UTF8 0x00000800 +#define PCRE_NO_AUTO_CAPTURE 0x00001000 +#define PCRE_NO_UTF8_CHECK 0x00002000 +#define PCRE_AUTO_CALLOUT 0x00004000 +#define PCRE_PARTIAL 0x00008000 +#define PCRE_DFA_SHORTEST 0x00010000 +#define PCRE_DFA_RESTART 0x00020000 +#define PCRE_FIRSTLINE 0x00040000 +#define PCRE_DUPNAMES 0x00080000 +#define PCRE_NEWLINE_CR 0x00100000 +#define PCRE_NEWLINE_LF 0x00200000 +#define PCRE_NEWLINE_CRLF 0x00300000 +#define PCRE_NEWLINE_ANY 0x00400000 + +/* Exec-time and get/set-time error codes */ + +#define PCRE_ERROR_NOMATCH (-1) +#define PCRE_ERROR_NULL (-2) +#define PCRE_ERROR_BADOPTION (-3) +#define PCRE_ERROR_BADMAGIC (-4) +#define PCRE_ERROR_UNKNOWN_OPCODE (-5) +#define PCRE_ERROR_UNKNOWN_NODE (-5) /* For backward compatibility */ +#define PCRE_ERROR_NOMEMORY (-6) +#define PCRE_ERROR_NOSUBSTRING (-7) +#define PCRE_ERROR_MATCHLIMIT (-8) +#define PCRE_ERROR_CALLOUT (-9) /* Never used by PCRE itself */ +#define PCRE_ERROR_BADUTF8 (-10) +#define PCRE_ERROR_BADUTF8_OFFSET (-11) +#define PCRE_ERROR_PARTIAL (-12) +#define PCRE_ERROR_BADPARTIAL (-13) +#define PCRE_ERROR_INTERNAL (-14) +#define PCRE_ERROR_BADCOUNT (-15) +#define PCRE_ERROR_DFA_UITEM (-16) +#define PCRE_ERROR_DFA_UCOND (-17) +#define PCRE_ERROR_DFA_UMLIMIT (-18) +#define PCRE_ERROR_DFA_WSSIZE (-19) +#define PCRE_ERROR_DFA_RECURSE (-20) +#define PCRE_ERROR_RECURSIONLIMIT (-21) +#define PCRE_ERROR_NULLWSLIMIT (-22) +#define PCRE_ERROR_BADNEWLINE (-23) + +/* Request types for pcre_fullinfo() */ + +#define 
PCRE_INFO_OPTIONS 0 +#define PCRE_INFO_SIZE 1 +#define PCRE_INFO_CAPTURECOUNT 2 +#define PCRE_INFO_BACKREFMAX 3 +#define PCRE_INFO_FIRSTBYTE 4 +#define PCRE_INFO_FIRSTCHAR 4 /* For backwards compatibility */ +#define PCRE_INFO_FIRSTTABLE 5 +#define PCRE_INFO_LASTLITERAL 6 +#define PCRE_INFO_NAMEENTRYSIZE 7 +#define PCRE_INFO_NAMECOUNT 8 +#define PCRE_INFO_NAMETABLE 9 +#define PCRE_INFO_STUDYSIZE 10 +#define PCRE_INFO_DEFAULT_TABLES 11 + +/* Request types for pcre_config(). Do not re-arrange, in order to remain +compatible. */ + +#define PCRE_CONFIG_UTF8 0 +#define PCRE_CONFIG_NEWLINE 1 +#define PCRE_CONFIG_LINK_SIZE 2 +#define PCRE_CONFIG_POSIX_MALLOC_THRESHOLD 3 +#define PCRE_CONFIG_MATCH_LIMIT 4 +#define PCRE_CONFIG_STACKRECURSE 5 +#define PCRE_CONFIG_UNICODE_PROPERTIES 6 +#define PCRE_CONFIG_MATCH_LIMIT_RECURSION 7 + +/* Bit flags for the pcre_extra structure. Do not re-arrange or redefine +these bits, just add new ones on the end, in order to remain compatible. */ + +#define PCRE_EXTRA_STUDY_DATA 0x0001 +#define PCRE_EXTRA_MATCH_LIMIT 0x0002 +#define PCRE_EXTRA_CALLOUT_DATA 0x0004 +#define PCRE_EXTRA_TABLES 0x0008 +#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0x0010 + +/* Types */ + +struct real_pcre; /* declaration; the definition is private */ +typedef struct real_pcre pcre; + +/* When PCRE is compiled as a C++ library, the subject pointer type can be +replaced with a custom type. For conventional use, the public interface is a +const char *. */ + +#ifndef PCRE_SPTR +#define PCRE_SPTR const char * +#endif + +/* The structure for passing additional data to pcre_exec(). This is defined in +such as way as to be extensible. Always add new fields at the end, in order to +remain compatible. 
*/ + +typedef struct pcre_extra { + unsigned long int flags; /* Bits for which fields are set */ + void *study_data; /* Opaque data from pcre_study() */ + unsigned long int match_limit; /* Maximum number of calls to match() */ + void *callout_data; /* Data passed back in callouts */ + const unsigned char *tables; /* Pointer to character tables */ + unsigned long int match_limit_recursion; /* Max recursive calls to match() */ +} pcre_extra; + +/* The structure for passing out data via the pcre_callout_function. We use a +structure so that new fields can be added on the end in future versions, +without changing the API of the function, thereby allowing old clients to work +without modification. */ + +typedef struct pcre_callout_block { + int version; /* Identifies version of block */ + /* ------------------------ Version 0 ------------------------------- */ + int callout_number; /* Number compiled into pattern */ + int *offset_vector; /* The offset vector */ + PCRE_SPTR subject; /* The subject being matched */ + int subject_length; /* The length of the subject */ + int start_match; /* Offset to start of this match attempt */ + int current_position; /* Where we currently are in the subject */ + int capture_top; /* Max current capture */ + int capture_last; /* Most recently closed capture */ + void *callout_data; /* Data passed in with the call */ + /* ------------------- Added for Version 1 -------------------------- */ + int pattern_position; /* Offset to next item in the pattern */ + int next_item_length; /* Length of next item in the pattern */ + /* ------------------------------------------------------------------ */ +} pcre_callout_block; + +#include "glib.h" +#include "galias.h" + +#define pcre_malloc g_try_malloc +#define pcre_free g_free +#define pcre_stack_malloc g_try_malloc + +PCRE_DATA_SCOPE int (*pcre_callout)(pcre_callout_block *); + +/* Exported PCRE functions */ + +PCRE_DATA_SCOPE pcre *pcre_compile(const char *, int, const char **, int *, + const 
unsigned char *); +PCRE_DATA_SCOPE pcre *pcre_compile2(const char *, int, int *, const char **, + int *, const unsigned char *); +PCRE_DATA_SCOPE int pcre_config(int, void *); +PCRE_DATA_SCOPE int pcre_copy_named_substring(const pcre *, const char *, + int *, int, const char *, char *, int); +PCRE_DATA_SCOPE int pcre_copy_substring(const char *, int *, int, int, char *, + int); +PCRE_DATA_SCOPE int pcre_dfa_exec(const pcre *, const pcre_extra *, + const char *, int, int, int, int *, int , int *, int); +PCRE_DATA_SCOPE int pcre_exec(const pcre *, const pcre_extra *, PCRE_SPTR, + int, int, int, int *, int); +PCRE_DATA_SCOPE void pcre_free_substring(const char *); +PCRE_DATA_SCOPE void pcre_free_substring_list(const char **); +PCRE_DATA_SCOPE int pcre_fullinfo(const pcre *, const pcre_extra *, int, + void *); +PCRE_DATA_SCOPE int pcre_get_named_substring(const pcre *, const char *, + int *, int, const char *, const char **); +PCRE_DATA_SCOPE int pcre_get_stringnumber(const pcre *, const char *); +PCRE_DATA_SCOPE int pcre_get_stringtable_entries(const pcre *, const char *, + char **, char **); +PCRE_DATA_SCOPE int pcre_get_substring(const char *, int *, int, int, + const char **); +PCRE_DATA_SCOPE int pcre_get_substring_list(const char *, int *, int, + const char ***); +PCRE_DATA_SCOPE int pcre_info(const pcre *, int *, int *); +PCRE_DATA_SCOPE const unsigned char *pcre_maketables(void); +PCRE_DATA_SCOPE int pcre_refcount(pcre *, int); +PCRE_DATA_SCOPE pcre_extra *pcre_study(const pcre *, int, const char **); +PCRE_DATA_SCOPE const char *pcre_version(void); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* End of pcre.h */ diff --git a/glib/pcre/pcre_chartables.c b/glib/pcre/pcre_chartables.c new file mode 100644 index 0000000..e89c71a --- /dev/null +++ b/glib/pcre/pcre_chartables.c @@ -0,0 +1,195 @@ +/* This file is autogenerated by ../update-pcre/update.sh during + * the update of the local copy of PCRE. 
+ */ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* This file is automatically written by the dftables auxiliary +program. If you edit it by hand, you might like to edit the Makefile to +prevent its ever being regenerated. + +This file contains the default tables for characters with codes less than +128 (ASCII characters). These tables are used when no external tables are +passed to PCRE. + +The following #include is present because without it gcc 4.x may remove +the array definition from the final binary if PCRE is built into a static +library and dead code stripping is activated. This leads to link errors. +Pulling in the header ensures that the array gets flagged as "someone +outside this compilation unit might reference this" and so it will always +be supplied to the linker. */ + +#include "pcre_internal.h" + +const unsigned char _pcre_default_tables[] = { + +/* This table is a lower casing table. 
*/ + + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 97, 98, 99,100,101,102,103, + 104,105,106,107,108,109,110,111, + 112,113,114,115,116,117,118,119, + 120,121,122, 91, 92, 93, 94, 95, + 96, 97, 98, 99,100,101,102,103, + 104,105,106,107,108,109,110,111, + 112,113,114,115,116,117,118,119, + 120,121,122,123,124,125,126,127, + 128,129,130,131,132,133,134,135, + 136,137,138,139,140,141,142,143, + 144,145,146,147,148,149,150,151, + 152,153,154,155,156,157,158,159, + 160,161,162,163,164,165,166,167, + 168,169,170,171,172,173,174,175, + 176,177,178,179,180,181,182,183, + 184,185,186,187,188,189,190,191, + 192,193,194,195,196,197,198,199, + 200,201,202,203,204,205,206,207, + 208,209,210,211,212,213,214,215, + 216,217,218,219,220,221,222,223, + 224,225,226,227,228,229,230,231, + 232,233,234,235,236,237,238,239, + 240,241,242,243,244,245,246,247, + 248,249,250,251,252,253,254,255, + +/* This table is a case flipping table. 
*/ + + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 97, 98, 99,100,101,102,103, + 104,105,106,107,108,109,110,111, + 112,113,114,115,116,117,118,119, + 120,121,122, 91, 92, 93, 94, 95, + 96, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90,123,124,125,126,127, + 128,129,130,131,132,133,134,135, + 136,137,138,139,140,141,142,143, + 144,145,146,147,148,149,150,151, + 152,153,154,155,156,157,158,159, + 160,161,162,163,164,165,166,167, + 168,169,170,171,172,173,174,175, + 176,177,178,179,180,181,182,183, + 184,185,186,187,188,189,190,191, + 192,193,194,195,196,197,198,199, + 200,201,202,203,204,205,206,207, + 208,209,210,211,212,213,214,215, + 216,217,218,219,220,221,222,223, + 224,225,226,227,228,229,230,231, + 232,233,234,235,236,237,238,239, + 240,241,242,243,244,245,246,247, + 248,249,250,251,252,253,254,255, + +/* This table contains bit maps for various character classes. +Each map is 32 bytes long and the bits run from the least +significant end of each byte. The classes that have their own +maps are: space, xdigit, digit, upper, lower, word, graph +print, punct, and cntrl. Other classes are built from combinations. 
*/ + + 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + + 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, + 0x7e,0x00,0x00,0x00,0x7e,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + + 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0xfe,0xff,0xff,0x07,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0x07, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + + 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, + 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + + 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + + 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + + 0x00,0x00,0x00,0x00,0xfe,0xff,0x00,0xfc, + 0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0x78, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + + 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + +/* This table identifies various classes of character by individual bits: + 0x01 white space character + 0x02 letter + 0x04 decimal digit + 0x08 hexadecimal digit + 0x10 alphanumeric or '_' + 0x80 regular expression metacharacter or binary zero +*/ + + 
0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ + 0x00,0x01,0x01,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ + 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */ + 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */ + 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */ + 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */ + 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */ + 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */ + 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */ + 0x12,0x12,0x12,0x80,0x80,0x00,0x80,0x10, /* X - _ */ + 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */ + 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */ + 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */ + 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ + +/* End of chartables.c */ diff --git a/glib/pcre/pcre_compile.c b/glib/pcre/pcre_compile.c new file mode 100644 index 0000000..6eaeac1 --- /dev/null +++ b/glib/pcre/pcre_compile.c @@ -0,0 +1,5385 @@ 
+/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2006 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+----------------------------------------------------------------------------- +*/ + + +/* This module contains the external function pcre_compile(), along with +supporting internal functions that are not used by other modules. */ + + +#define NLBLOCK cd /* Block containing newline information */ +#define PSSTART start_pattern /* Field containing processed string start */ +#define PSEND end_pattern /* Field containing processed string end */ + + +#include "pcre_internal.h" + + +/* When DEBUG is defined, we need the pcre_printint() function, which is also +used by pcretest. DEBUG is not defined when building a production library. */ + +#ifdef DEBUG +#include "pcre_printint.src" +#endif + + +/************************************************* +* Code parameters and static tables * +*************************************************/ + +/* This value specifies the size of stack workspace that is used during the +first pre-compile phase that determines how much memory is required. The regex +is partly compiled into this space, but the compiled parts are discarded as +soon as they can be, so that hopefully there will never be an overrun. The code +does, however, check for an overrun. The largest amount I've seen used is 218, +so this number is very generous. + +The same workspace is used during the second, actual compile phase for +remembering forward references to groups so that they can be filled in at the +end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE +is 4 there is plenty of room. */ + +#define COMPILE_WORK_SIZE (4096) + + +/* Table for handling escaped characters in the range '0'-'z'. Positive returns +are simple data values; negative values are for special things like \d and so +on. Zero means further processing is needed (for things like \x), or the escape +is invalid. 
*/ + +#if !EBCDIC /* This is the "normal" table for ASCII systems */ +static const short int escapes[] = { + 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ + 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ + '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */ + 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */ +-ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */ +-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ + '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */ + 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */ +-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */ + 0, 0, -ESC_z /* x - z */ +}; + +#else /* This is the "abnormal" table for EBCDIC systems */ +static const short int escapes[] = { +/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|', +/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0, +/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~', +/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0, +/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?', +/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, +/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"', +/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, +/* 88 */ 0, 0, 0, '{', 0, 0, 0, 0, +/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p, +/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, +/* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0, +/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, +/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, +/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', +/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, +/* C8 */ 0, 0, 0, 0, 0, 0, 0, 0, +/* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P, +/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0, +/* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X, +/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, +/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, +/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0 +}; +#endif + + +/* Tables of names of POSIX character classes and their lengths. The list is +terminated by a zero length entry. The first three must be alpha, lower, upper, +as this is assumed for handling case independence. 
*/ + +static const char posix_names[] = + "alpha\0" + "lower\0" + "upper\0" + "alnum\0" + "ascii\0" + "blank\0" + "cntrl\0" + "digit\0" + "graph\0" + "print\0" + "punct\0" + "space\0" + "word\0" + "xdigit"; + +static const uschar posix_name_lengths[] = { + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; + +/* Table of class bit maps for each POSIX class. Each class is formed from a +base map, with an optional addition or removal of another map. Then, for some +classes, there is some additional tweaking: for [:blank:] the vertical space +characters are removed, and for [:alpha:] and [:alnum:] the underscore +character is removed. The triples in the table consist of the base map offset, +second map offset or -1 if no second map, and a non-negative value for map +addition or a negative value for map subtraction (if there are two maps). The +absolute value of the third field has these meanings: 0 => no tweaking, 1 => +remove vertical space characters, 2 => remove underscore. */ + +static const int posix_class_maps[] = { + cbit_word, cbit_digit, -2, /* alpha */ + cbit_lower, -1, 0, /* lower */ + cbit_upper, -1, 0, /* upper */ + cbit_word, -1, 2, /* alnum - word without underscore */ + cbit_print, cbit_cntrl, 0, /* ascii */ + cbit_space, -1, 1, /* blank - a GNU extension */ + cbit_cntrl, -1, 0, /* cntrl */ + cbit_digit, -1, 0, /* digit */ + cbit_graph, -1, 0, /* graph */ + cbit_print, -1, 0, /* print */ + cbit_punct, -1, 0, /* punct */ + cbit_space, -1, 0, /* space */ + cbit_word, -1, 0, /* word - a Perl extension */ + cbit_xdigit,-1, 0 /* xdigit */ +}; + + +#define STRING(a) # a +#define XSTRING(s) STRING(s) + +/* The texts of compile-time error messages. These are "char *" because they +are passed to the outside world. Do not ever re-use any error number, because +they are documented. Always add a new error instead. Messages marked DEAD below +are no longer used. 
*/ + +#define DEAD(s) "\0" + +static const char error_texts[] = + "no error\0" + "\\ at end of pattern\0" + "\\c at end of pattern\0" + "unrecognized character follows \\\0" + "numbers out of order in {} quantifier\0" + /* 5 */ + "number too big in {} quantifier\0" + "missing terminating ] for character class\0" + "invalid escape sequence in character class\0" + "range out of order in character class\0" + "nothing to repeat\0" + /* 10 */ + DEAD("operand of unlimited repeat could match the empty string") + "internal error: unexpected repeat\0" + "unrecognized character after (?\0" + "POSIX named classes are supported only within a class\0" + "missing )\0" + /* 15 */ + "reference to non-existent subpattern\0" + "erroffset passed as NULL\0" + "unknown option bit(s) set\0" + "missing ) after comment\0" + DEAD("parentheses nested too deeply") + /* 20 */ + "regular expression too large\0" + "failed to get memory\0" + "unmatched parentheses\0" + "internal error: code overflow\0" + "unrecognized character after (?<\0" + /* 25 */ + "lookbehind assertion is not fixed length\0" + "malformed number or name after (?(\0" + "conditional group contains more than two branches\0" + "assertion expected after (?(\0" + "(?R or (?digits must be followed by )\0" + /* 30 */ + "unknown POSIX class name\0" + "POSIX collating elements are not supported\0" + "this version of PCRE is not compiled with PCRE_UTF8 support\0" + DEAD("spare error") + "character value in \\x{...} sequence is too large\0" + /* 35 */ + "invalid condition (?(0)\0" + "\\C not allowed in lookbehind assertion\0" + "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0" + "number after (?C is > 255\0" + "closing ) for (?C expected\0" + /* 40 */ + "recursive call could loop indefinitely\0" + "unrecognized character after (?P\0" + "syntax error in subpattern name (missing terminator)\0" + "two named subpatterns have the same name\0" + "invalid UTF-8 string\0" + /* 45 */ + "support for \\P, \\p, and \\X has not been compiled\0" 
+ "malformed \\P or \\p sequence\0" + "unknown property name after \\P or \\p\0" + "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0" + "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0" + /* 50 */ + "repeated subpattern is too long\0" + "octal value is greater than \\377 (not in UTF-8 mode)\0" + "internal error: overran compiling workspace\0" + "internal error: previously-checked referenced subpattern not found\0" + "DEFINE group contains more than one branch\0" + /* 55 */ + "repeating a DEFINE group is not allowed\0" + "inconsistent NEWLINE options\0" + "\\g is not followed by an (optionally braced) non-zero number"; + +static const int error_texts_offsets[] = { + 0, + 9, + 29, + 50, + 83, + 121, + 153, + 195, + 238, + 276, + 294, + 295, + 329, + 361, + 415, + 425, + 462, + 487, + 513, + 537, + 538, + 567, + 588, + 610, + 640, + 673, + 714, + 749, + 799, + 828, + 866, + 891, + 934, + 994, + 995, + 1044, + 1068, + 1107, + 1151, + 1177, + 1204, + 1243, + 1276, + 1329, + 1370, + 1391, + 1440, + 1468, + 1505, + 1557, + 1600, + 1632, + 1685, + 1729, + 1796, + 1839, + 1879, + 1908 +}; + + +/* Definition to allow mutual recursion */ + +static BOOL + compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *, + int *, branch_chain *, compile_data *, int *); + + + +/************************************************* +* Handle escapes * +*************************************************/ + +/* This function is called when a \ has been encountered. It either returns a +positive value for a simple escape such as \n, or a negative value which +encodes one of the more complicated things such as \d. A backreference to group +n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When +UTF-8 is enabled, a positive value greater than 255 may be returned. On entry, +ptr is pointing at the \. On exit, it is on the final character of the escape +sequence. 
+ +Arguments: + ptrptr points to the pattern position pointer + errorcodeptr points to the errorcode variable + bracount number of previous extracting brackets + options the options bits + isclass TRUE if inside a character class + +Returns: zero or positive => a data character + negative => a special escape sequence + on error, errorptr is set +*/ + +static int +check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount, + int options, BOOL isclass) +{ +BOOL utf8 = (options & PCRE_UTF8) != 0; +const uschar *ptr = *ptrptr + 1; +int c, i; + +GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ +ptr--; /* Set pointer back to the last byte */ + +/* If backslash is at the end of the pattern, it's an error. */ + +if (c == 0) *errorcodeptr = ERR1; + +/* Non-alphamerics are literals. For digits or letters, do an initial lookup in +a table. A non-zero result is something that can be returned immediately. +Otherwise further processing may be required. */ + +#if !EBCDIC /* ASCII coding */ +else if (c < '0' || c > 'z') {} /* Not alphameric */ +else if ((i = escapes[c - '0']) != 0) c = i; + +#else /* EBCDIC coding */ +else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */ +else if ((i = escapes[c - 0x48]) != 0) c = i; +#endif + +/* Escapes that need further processing, or are illegal. */ + +else + { + const uschar *oldptr; + BOOL braced, negated; + + switch (c) + { + /* A number of Perl escapes are not handled by PCRE. We give an explicit + error. */ + + case 'l': + case 'L': + case 'N': + case 'u': + case 'U': + *errorcodeptr = ERR37; + break; + + /* \g must be followed by a number, either plain or braced. If positive, it + is an absolute backreference. If negative, it is a relative backreference. + This is a Perl 5.10 feature. 
*/ + + case 'g': + if (ptr[1] == '{') + { + braced = TRUE; + ptr++; + } + else braced = FALSE; + + if (ptr[1] == '-') + { + negated = TRUE; + ptr++; + } + else negated = FALSE; + + c = 0; + while (g_ascii_isdigit(ptr[1]) != 0) + c = c * 10 + *(++ptr) - '0'; + + if (c == 0 || (braced && *(++ptr) != '}')) + { + *errorcodeptr = ERR57; + return 0; + } + + if (negated) + { + if (c > bracount) + { + *errorcodeptr = ERR15; + return 0; + } + c = bracount - (c - 1); + } + + c = -(ESC_REF + c); + break; + + /* The handling of escape sequences consisting of a string of digits + starting with one that is not zero is not straightforward. By experiment, + the way Perl works seems to be as follows: + + Outside a character class, the digits are read as a decimal number. If the + number is less than 10, or if there are that many previous extracting + left brackets, then it is a back reference. Otherwise, up to three octal + digits are read to form an escaped byte. Thus \123 is likely to be octal + 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal + value is greater than 377, the least significant 8 bits are taken. Inside a + character class, \ followed by a digit is always an octal number. */ + + case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': + + if (!isclass) + { + oldptr = ptr; + c -= '0'; + while (g_ascii_isdigit(ptr[1]) != 0) + c = c * 10 + *(++ptr) - '0'; + if (c < 10 || c <= bracount) + { + c = -(ESC_REF + c); + break; + } + ptr = oldptr; /* Put the pointer back and fall through */ + } + + /* Handle an octal number following \. If the first digit is 8 or 9, Perl + generates a binary zero byte and treats the digit as a following literal. + Thus we have to pull back the pointer by one. */ + + if ((c = *ptr) >= '8') + { + ptr--; + c = 0; + break; + } + + /* \0 always starts an octal number, but we may drop through to here with a + larger first octal digit. 
The original code used just to take the least + significant 8 bits of octal numbers (I think this is what early Perls used + to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more + than 3 octal digits. */ + + case '0': + c -= '0'; + while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7') + c = c * 8 + *(++ptr) - '0'; + if (!utf8 && c > 255) *errorcodeptr = ERR51; + break; + + /* \x is complicated. \x{ddd} is a character number which can be greater + than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is + treated as a data character. */ + + case 'x': + if (ptr[1] == '{') + { + const uschar *pt = ptr + 2; + int count = 0; + + c = 0; + while (g_ascii_isxdigit(*pt) != 0) + { + register int cc = *pt++; + if (c == 0 && cc == '0') continue; /* Leading zeroes */ + count++; + +#if !EBCDIC /* ASCII coding */ + if (cc >= 'a') cc -= 32; /* Convert to upper case */ + c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10)); +#else /* EBCDIC coding */ + if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */ + c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10)); +#endif + } + + if (*pt == '}') + { + if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34; + ptr = pt; + break; + } + + /* If the sequence of hex digits does not end with '}', then we don't + recognize this construct; fall through to the normal \x handling. */ + } + + /* Read just a single-byte hex-defined char */ + + c = 0; + while (i++ < 2 && g_ascii_isxdigit(ptr[1]) != 0) + { + int cc; /* Some compilers don't like ++ */ + cc = *(++ptr); /* in initializers */ +#if !EBCDIC /* ASCII coding */ + if (cc >= 'a') cc -= 32; /* Convert to upper case */ + c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10)); +#else /* EBCDIC coding */ + if (cc <= 'z') cc += 64; /* Convert to upper case */ + c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10)); +#endif + } + break; + + /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped. 
+ This coding is ASCII-specific, but then the whole concept of \cx is + ASCII-specific. (However, an EBCDIC equivalent has now been added.) */ + + case 'c': + c = *(++ptr); + if (c == 0) + { + *errorcodeptr = ERR2; + return 0; + } + +#if !EBCDIC /* ASCII coding */ + if (c >= 'a' && c <= 'z') c -= 32; + c ^= 0x40; +#else /* EBCDIC coding */ + if (c >= 'a' && c <= 'z') c += 64; + c ^= 0xC0; +#endif + break; + + /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any + other alphameric following \ is an error if PCRE_EXTRA was set; otherwise, + for Perl compatibility, it is a literal. This code looks a bit odd, but + there used to be some cases other than the default, and there may be again + in future, so I haven't "optimized" it. */ + + default: + if ((options & PCRE_EXTRA) != 0) switch(c) + { + default: + *errorcodeptr = ERR3; + break; + } + break; + } + } + +*ptrptr = ptr; +return c; +} + + + +#ifdef SUPPORT_UCP +/************************************************* +* Handle \P and \p * +*************************************************/ + +/* This function is called after \P or \p has been encountered, provided that +PCRE is compiled with support for Unicode properties. On entry, ptrptr is +pointing at the P or p. On exit, it is pointing at the final character of the +escape sequence. + +Argument: + ptrptr points to the pattern position pointer + negptr points to a boolean that is set TRUE for negation else FALSE + dptr points to an int that is set to the detailed property value + errorcodeptr points to the error code variable + +Returns: type value from ucp_type_table, or -1 for an invalid type +*/ + +static int +get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr) +{ +int c, i, bot, top; +const uschar *ptr = *ptrptr; +char name[32]; + +c = *(++ptr); +if (c == 0) goto ERROR_RETURN; + +*negptr = FALSE; + +/* \P or \p can be followed by a name in {}, optionally preceded by ^ for +negation. 
*/ + +if (c == '{') + { + if (ptr[1] == '^') + { + *negptr = TRUE; + ptr++; + } + for (i = 0; i < sizeof(name) - 1; i++) + { + c = *(++ptr); + if (c == 0) goto ERROR_RETURN; + if (c == '}') break; + name[i] = c; + } + if (c !='}') goto ERROR_RETURN; + name[i] = 0; + } + +/* Otherwise there is just one following character */ + +else + { + name[0] = c; + name[1] = 0; + } + +*ptrptr = ptr; + +/* Search for a recognized property name using binary chop */ + +bot = 0; +top = _pcre_utt_size; + +while (bot < top) + { + i = (bot + top) >> 1; + c = strcmp(name, &_pcre_ucp_names[_pcre_utt[i].offset]); + if (c == 0) + { + *dptr = _pcre_utt[i].value; + return _pcre_utt[i].type; + } + if (c > 0) bot = i + 1; else top = i; + } + +*errorcodeptr = ERR47; +*ptrptr = ptr; +return -1; + +ERROR_RETURN: +*errorcodeptr = ERR46; +*ptrptr = ptr; +return -1; +} +#endif + + + + +/************************************************* +* Check for counted repeat * +*************************************************/ + +/* This function is called when a '{' is encountered in a place where it might +start a quantifier. It looks ahead to see if it really is a quantifier or not. +It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd} +where the ddds are digits. + +Arguments: + p pointer to the first char after '{' + +Returns: TRUE or FALSE +*/ + +static BOOL +is_counted_repeat(const uschar *p) +{ +if (g_ascii_isdigit(*p++) == 0) return FALSE; +while (g_ascii_isdigit(*p) != 0) p++; +if (*p == '}') return TRUE; + +if (*p++ != ',') return FALSE; +if (*p == '}') return TRUE; + +if (g_ascii_isdigit(*p++) == 0) return FALSE; +while (g_ascii_isdigit(*p) != 0) p++; + +return (*p == '}'); +} + + + +/************************************************* +* Read repeat counts * +*************************************************/ + +/* Read an item of the form {n,m} and return the values. 
This is called only +after is_counted_repeat() has confirmed that a repeat-count quantifier exists, +so the syntax is guaranteed to be correct, but we need to check the values. + +Arguments: + p pointer to first char after '{' + minp pointer to int for min + maxp pointer to int for max + returned as -1 if no max + errorcodeptr points to error code variable + +Returns: pointer to '}' on success; + current ptr on error, with errorcodeptr set non-zero +*/ + +static const uschar * +read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr) +{ +int min = 0; +int max = -1; + +/* Read the minimum value and do a paranoid check: a negative value indicates +an integer overflow. */ + +while (g_ascii_isdigit(*p) != 0) min = min * 10 + *p++ - '0'; +if (min < 0 || min > 65535) + { + *errorcodeptr = ERR5; + return p; + } + +/* Read the maximum value if there is one, and again do a paranoid on its size. +Also, max must not be less than min. */ + +if (*p == '}') max = min; else + { + if (*(++p) != '}') + { + max = 0; + while(g_ascii_isdigit(*p) != 0) max = max * 10 + *p++ - '0'; + if (max < 0 || max > 65535) + { + *errorcodeptr = ERR5; + return p; + } + if (max < min) + { + *errorcodeptr = ERR4; + return p; + } + } + } + +/* Fill in the required variables, and pass back the pointer to the terminating +'}'. */ + +*minp = min; +*maxp = max; +return p; +} + + + +/************************************************* +* Find forward referenced subpattern * +*************************************************/ + +/* This function scans along a pattern's text looking for capturing +subpatterns, and counting them. If it finds a named pattern that matches the +name it is given, it returns its number. Alternatively, if the name is NULL, it +returns when it reaches a given numbered subpattern. This is used for forward +references to subpatterns. We know that if (?P< is encountered, the name will +be terminated by '>' because that is checked in the first pass. 
+ +Arguments: + ptr current position in the pattern + count current count of capturing parens so far encountered + name name to seek, or NULL if seeking a numbered subpattern + lorn name length, or subpattern number if name is NULL + xmode TRUE if we are in /x mode + +Returns: the number of the named subpattern, or -1 if not found +*/ + +static int +find_parens(const uschar *ptr, int count, const uschar *name, int lorn, + BOOL xmode) +{ +const uschar *thisname; + +for (; *ptr != 0; ptr++) + { + int term; + + /* Skip over backslashed characters and also entire \Q...\E */ + + if (*ptr == '\\') + { + if (*(++ptr) == 0) return -1; + if (*ptr == 'Q') for (;;) + { + while (*(++ptr) != 0 && *ptr != '\\'); + if (*ptr == 0) return -1; + if (*(++ptr) == 'E') break; + } + continue; + } + + /* Skip over character classes */ + + if (*ptr == '[') + { + while (*(++ptr) != ']') + { + if (*ptr == '\\') + { + if (*(++ptr) == 0) return -1; + if (*ptr == 'Q') for (;;) + { + while (*(++ptr) != 0 && *ptr != '\\'); + if (*ptr == 0) return -1; + if (*(++ptr) == 'E') break; + } + continue; + } + } + continue; + } + + /* Skip comments in /x mode */ + + if (xmode && *ptr == '#') + { + while (*(++ptr) != 0 && *ptr != '\n'); + if (*ptr == 0) return -1; + continue; + } + + /* An opening parens must now be a real metacharacter */ + + if (*ptr != '(') continue; + if (ptr[1] != '?') + { + count++; + if (name == NULL && count == lorn) return count; + continue; + } + + ptr += 2; + if (*ptr == 'P') ptr++; /* Allow optional P */ + + /* We have to disambiguate (? */ + + if ((*ptr != '<' || ptr[1] == '!' 
|| ptr[1] == '=') && + *ptr != '\'') + continue; + + count++; + + if (name == NULL && count == lorn) return count; + term = *ptr++; + if (term == '<') term = '>'; + thisname = ptr; + while (*ptr != term) ptr++; + if (name != NULL && lorn == ptr - thisname && + strncmp((const char *)name, (const char *)thisname, lorn) == 0) + return count; + } + +return -1; +} + + + +/************************************************* +* Find first significant op code * +*************************************************/ + +/* This is called by several functions that scan a compiled expression looking +for a fixed first character, or an anchoring op code etc. It skips over things +that do not influence this. For some calls, a change of option is important. +For some calls, it makes sense to skip negative forward and all backward +assertions, and also the \b assertion; for others it does not. + +Arguments: + code pointer to the start of the group + options pointer to external options + optbit the option bit whose changing is significant, or + zero if none are + skipassert TRUE if certain assertions are to be skipped + +Returns: pointer to the first significant opcode +*/ + +static const uschar* +first_significant_code(const uschar *code, int *options, int optbit, + BOOL skipassert) +{ +for (;;) + { + switch ((int)*code) + { + case OP_OPT: + if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit)) + *options = (int)code[1]; + code += 2; + break; + + case OP_ASSERT_NOT: + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + if (!skipassert) return code; + do code += GET(code, 1); while (*code == OP_ALT); + code += _pcre_OP_lengths[*code]; + break; + + case OP_WORD_BOUNDARY: + case OP_NOT_WORD_BOUNDARY: + if (!skipassert) return code; + /* Fall through */ + + case OP_CALLOUT: + case OP_CREF: + case OP_RREF: + case OP_DEF: + code += _pcre_OP_lengths[*code]; + break; + + default: + return code; + } + } +/* Control never reaches here */ +} + + + + 
+/************************************************* +* Find the fixed length of a pattern * +*************************************************/ + +/* Scan a pattern and compute the fixed length of subject that will match it, +if the length is fixed. This is needed for dealing with backward assertions. +In UTF8 mode, the result is in characters rather than bytes. + +Arguments: + code points to the start of the pattern (the bracket) + options the compiling options + +Returns: the fixed length, or -1 if there is no fixed length, + or -2 if \C was encountered +*/ + +static int +find_fixedlength(uschar *code, int options) +{ +int length = -1; + +register int branchlength = 0; +register uschar *cc = code + 1 + LINK_SIZE; + +/* Scan along the opcodes for this branch. If we get to the end of the +branch, check the length against that of the other branches. */ + +for (;;) + { + int d; + register int op = *cc; + + switch (op) + { + case OP_CBRA: + case OP_BRA: + case OP_ONCE: + case OP_COND: + d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options); + if (d < 0) return d; + branchlength += d; + do cc += GET(cc, 1); while (*cc == OP_ALT); + cc += 1 + LINK_SIZE; + break; + + /* Reached end of a branch; if it's a ket it is the end of a nested + call. If it's ALT it is an alternation in a nested call. If it is + END it's the end of the outer call. All can be handled by the same code. 
*/ + + case OP_ALT: + case OP_KET: + case OP_KETRMAX: + case OP_KETRMIN: + case OP_END: + if (length < 0) length = branchlength; + else if (length != branchlength) return -1; + if (*cc != OP_ALT) return length; + cc += 1 + LINK_SIZE; + branchlength = 0; + break; + + /* Skip over assertive subpatterns */ + + case OP_ASSERT: + case OP_ASSERT_NOT: + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + do cc += GET(cc, 1); while (*cc == OP_ALT); + /* Fall through */ + + /* Skip over things that don't match chars */ + + case OP_REVERSE: + case OP_CREF: + case OP_RREF: + case OP_DEF: + case OP_OPT: + case OP_CALLOUT: + case OP_SOD: + case OP_SOM: + case OP_EOD: + case OP_EODN: + case OP_CIRC: + case OP_DOLL: + case OP_NOT_WORD_BOUNDARY: + case OP_WORD_BOUNDARY: + cc += _pcre_OP_lengths[*cc]; + break; + + /* Handle literal characters */ + + case OP_CHAR: + case OP_CHARNC: + case OP_NOT: + branchlength++; + cc += 2; +#ifdef SUPPORT_UTF8 + if ((options & PCRE_UTF8) != 0) + { + while ((*cc & 0xc0) == 0x80) cc++; + } +#endif + break; + + /* Handle exact repetitions. The count is already in characters, but we + need to skip over a multibyte character in UTF8 mode. 
*/ + + case OP_EXACT: + branchlength += GET2(cc,1); + cc += 4; +#ifdef SUPPORT_UTF8 + if ((options & PCRE_UTF8) != 0) + { + while((*cc & 0x80) == 0x80) cc++; + } +#endif + break; + + case OP_TYPEEXACT: + branchlength += GET2(cc,1); + cc += 4; + break; + + /* Handle single-char matchers */ + + case OP_PROP: + case OP_NOTPROP: + cc += 2; + /* Fall through */ + + case OP_NOT_DIGIT: + case OP_DIGIT: + case OP_NOT_WHITESPACE: + case OP_WHITESPACE: + case OP_NOT_WORDCHAR: + case OP_WORDCHAR: + case OP_ANY: + branchlength++; + cc++; + break; + + /* The single-byte matcher isn't allowed */ + + case OP_ANYBYTE: + return -2; + + /* Check a class for variable quantification */ + +#ifdef SUPPORT_UTF8 + case OP_XCLASS: + cc += GET(cc, 1) - 33; + /* Fall through */ +#endif + + case OP_CLASS: + case OP_NCLASS: + cc += 33; + + switch (*cc) + { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRQUERY: + case OP_CRMINQUERY: + return -1; + + case OP_CRRANGE: + case OP_CRMINRANGE: + if (GET2(cc,1) != GET2(cc,3)) return -1; + branchlength += GET2(cc,1); + cc += 5; + break; + + default: + branchlength++; + } + break; + + /* Anything else is variable length */ + + default: + return -1; + } + } +/* Control never gets here */ +} + + + + +/************************************************* +* Scan compiled regex for numbered bracket * +*************************************************/ + +/* This little function scans through a compiled pattern until it finds a +capturing bracket with the given number. + +Arguments: + code points to start of expression + utf8 TRUE in UTF-8 mode + number the required bracket number + +Returns: pointer to the opcode for the bracket, or NULL if not found +*/ + +static const uschar * +find_bracket(const uschar *code, BOOL utf8, int number) +{ +for (;;) + { + register int c = *code; + if (c == OP_END) return NULL; + + /* XCLASS is used for classes that cannot be represented just by a bit + map. This includes negated single high-valued characters. 
The length in + the table is zero; the actual length is stored in the compiled code. */ + + if (c == OP_XCLASS) code += GET(code, 1); + + /* Handle capturing bracket */ + + else if (c == OP_CBRA) + { + int n = GET2(code, 1+LINK_SIZE); + if (n == number) return (uschar *)code; + code += _pcre_OP_lengths[c]; + } + + /* In UTF-8 mode, opcodes that are followed by a character may be followed by + a multi-byte character. The length in the table is a minimum, so we have to + arrange to skip the extra bytes. */ + + else + { + code += _pcre_OP_lengths[c]; + if (utf8) switch(c) + { + case OP_CHAR: + case OP_CHARNC: + case OP_EXACT: + case OP_UPTO: + case OP_MINUPTO: + case OP_POSUPTO: + case OP_STAR: + case OP_MINSTAR: + case OP_POSSTAR: + case OP_PLUS: + case OP_MINPLUS: + case OP_POSPLUS: + case OP_QUERY: + case OP_MINQUERY: + case OP_POSQUERY: + if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; + break; + } + } + } +} + + + +/************************************************* +* Scan compiled regex for recursion reference * +*************************************************/ + +/* This little function scans through a compiled pattern until it finds an +instance of OP_RECURSE. + +Arguments: + code points to start of expression + utf8 TRUE in UTF-8 mode + +Returns: pointer to the opcode for OP_RECURSE, or NULL if not found +*/ + +static const uschar * +find_recurse(const uschar *code, BOOL utf8) +{ +for (;;) + { + register int c = *code; + if (c == OP_END) return NULL; + if (c == OP_RECURSE) return code; + + /* XCLASS is used for classes that cannot be represented just by a bit + map. This includes negated single high-valued characters. The length in + the table is zero; the actual length is stored in the compiled code. */ + + if (c == OP_XCLASS) code += GET(code, 1); + + /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes + that are followed by a character may be followed by a multi-byte character. 
+ The length in the table is a minimum, so we have to arrange to skip the extra + bytes. */ + + else + { + code += _pcre_OP_lengths[c]; + if (utf8) switch(c) + { + case OP_CHAR: + case OP_CHARNC: + case OP_EXACT: + case OP_UPTO: + case OP_MINUPTO: + case OP_POSUPTO: + case OP_STAR: + case OP_MINSTAR: + case OP_POSSTAR: + case OP_PLUS: + case OP_MINPLUS: + case OP_POSPLUS: + case OP_QUERY: + case OP_MINQUERY: + case OP_POSQUERY: + if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; + break; + } + } + } +} + + + +/************************************************* +* Scan compiled branch for non-emptiness * +*************************************************/ + +/* This function scans through a branch of a compiled pattern to see whether it +can match the empty string or not. It is called from could_be_empty() +below and from compile_branch() when checking for an unlimited repeat of a +group that can match nothing. Note that first_significant_code() skips over +assertions. If we hit an unclosed bracket, we return "empty" - this means we've +struck an inner bracket whose current branch will already have been scanned. 
+ +Arguments: + code points to start of search + endcode points to where to stop + utf8 TRUE if in UTF8 mode + +Returns: TRUE if what is matched could be empty +*/ + +static BOOL +could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8) +{ +register int c; +for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE); + code < endcode; + code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE)) + { + const uschar *ccode; + + c = *code; + + if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE) + { + BOOL empty_branch; + if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ + + /* Scan a closed bracket */ + + empty_branch = FALSE; + do + { + if (!empty_branch && could_be_empty_branch(code, endcode, utf8)) + empty_branch = TRUE; + code += GET(code, 1); + } + while (*code == OP_ALT); + if (!empty_branch) return FALSE; /* All branches are non-empty */ + + /* Move past the KET and fudge things so that the increment in the "for" + above has no effect. 
*/ + + c = OP_END; + code += 1 + LINK_SIZE - _pcre_OP_lengths[c]; + continue; + } + + /* Handle the other opcodes */ + + switch (c) + { + /* Check for quantifiers after a class */ + +#ifdef SUPPORT_UTF8 + case OP_XCLASS: + ccode = code + GET(code, 1); + goto CHECK_CLASS_REPEAT; +#endif + + case OP_CLASS: + case OP_NCLASS: + ccode = code + 33; + +#ifdef SUPPORT_UTF8 + CHECK_CLASS_REPEAT: +#endif + + switch (*ccode) + { + case OP_CRSTAR: /* These could be empty; continue */ + case OP_CRMINSTAR: + case OP_CRQUERY: + case OP_CRMINQUERY: + break; + + default: /* Non-repeat => class must match */ + case OP_CRPLUS: /* These repeats aren't empty */ + case OP_CRMINPLUS: + return FALSE; + + case OP_CRRANGE: + case OP_CRMINRANGE: + if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */ + break; + } + break; + + /* Opcodes that must match a character */ + + case OP_PROP: + case OP_NOTPROP: + case OP_EXTUNI: + case OP_NOT_DIGIT: + case OP_DIGIT: + case OP_NOT_WHITESPACE: + case OP_WHITESPACE: + case OP_NOT_WORDCHAR: + case OP_WORDCHAR: + case OP_ANY: + case OP_ANYBYTE: + case OP_CHAR: + case OP_CHARNC: + case OP_NOT: + case OP_PLUS: + case OP_MINPLUS: + case OP_POSPLUS: + case OP_EXACT: + case OP_NOTPLUS: + case OP_NOTMINPLUS: + case OP_NOTPOSPLUS: + case OP_NOTEXACT: + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEPOSPLUS: + case OP_TYPEEXACT: + return FALSE; + + /* End of branch */ + + case OP_KET: + case OP_KETRMAX: + case OP_KETRMIN: + case OP_ALT: + return TRUE; + + /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO, + MINUPTO, and POSUPTO may be followed by a multibyte character */ + +#ifdef SUPPORT_UTF8 + case OP_STAR: + case OP_MINSTAR: + case OP_POSSTAR: + case OP_QUERY: + case OP_MINQUERY: + case OP_POSQUERY: + case OP_UPTO: + case OP_MINUPTO: + case OP_POSUPTO: + if (utf8) while ((code[2] & 0xc0) == 0x80) code++; + break; +#endif + } + } + +return TRUE; +} + + + +/************************************************* +* Scan compiled 
regex for non-emptiness * +*************************************************/ + +/* Used when checking for left-recursive calls. Each branch on the chain of +currently unclosed branches is tested in turn to see whether it could match the +empty string. The walk moves outwards along the chain and stops when the chain +is exhausted or a branch that starts before the recursion target is reached. + +Arguments: + code points to start of the recursion + endcode points to where to stop (current RECURSE item) + bcptr points to the chain of current (unclosed) branch starts + utf8 TRUE if in UTF-8 mode + +Returns: TRUE if every branch examined could match the empty string +*/ + +static BOOL +could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr, + BOOL utf8) +{ +for (; bcptr != NULL && bcptr->current >= code; bcptr = bcptr->outer) + { + /* One branch that must consume something settles the answer */ + if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE; + } +return TRUE; +} + + + +/************************************************* +* Check for POSIX class syntax * +*************************************************/ + +/* This function is called when the sequence "[:" or "[." or "[=" is +encountered in a character class. It checks whether this is followed by an +optional ^ and then a sequence of letters, terminated by a matching ":]" or +".]" or "=]". + +Argument: + ptr pointer to the initial [ + endptr where to return the end pointer + cd pointer to compile data + +Returns: TRUE or FALSE +*/ + +static BOOL +check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd) +{ +int terminator; /* Don't combine these lines; the Solaris cc */ +terminator = *(++ptr); /* compiler warns about "non-constant" initializer.
*/ +if (*(++ptr) == '^') ptr++; +while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++; +if (*ptr == terminator && ptr[1] == ']') + { + *endptr = ptr; + return TRUE; + } +return FALSE; +} + + + + +/************************************************* +* Check POSIX class name * +*************************************************/ + +/* This function is called to check the name given in a POSIX-style class entry +such as [:alnum:]. + +Arguments: + ptr points to the first letter + len the length of the name + +Returns: a value representing the name, or -1 if unknown +*/ + +static int +check_posix_name(const uschar *ptr, int len) +{ + int offset = 0; + int yield = 0; + while (posix_name_lengths[yield] != 0) + { + if (len == posix_name_lengths[yield] && + strcmp((const char *)ptr, posix_names + offset) == 0) return yield; + offset += posix_name_lengths[yield] + 1; + yield++; + } + return -1; +} + + +/************************************************* +* Adjust OP_RECURSE items in repeated group * +*************************************************/ + +/* OP_RECURSE items contain an offset from the start of the regex to the group +that is referenced. This means that groups can be replicated for fixed +repetition simply by copying (because the recursion is allowed to refer to +earlier groups that are outside the current group). However, when a group is +optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before +it, after it has been compiled. This means that any OP_RECURSE items within it +that refer to the group itself or any contained groups have to have their +offsets adjusted. That one of the jobs of this function. Before it is called, +the partially compiled regex must be temporarily terminated with OP_END. + +This function has been extended with the possibility of forward references for +recursions and subroutine calls. It must also check the list of such references +for the group we are dealing with. 
If it finds that one of the recursions in +the current group is on this list, it adjusts the offset in the list, not the +value in the reference (which is a group number). + +Arguments: + group points to the start of the group + adjust the amount by which the group is to be moved + utf8 TRUE in UTF-8 mode + cd contains pointers to tables etc. + save_hwm the hwm forward reference pointer at the start of the group + +Returns: nothing +*/ + +static void +adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd, + uschar *save_hwm) +{ +uschar *ptr = group; +while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL) + { + int offset; + uschar *hc; + + /* See if this recursion is on the forward reference list, scanning the + entries between save_hwm and cd->hwm. An entry matches when the offset it + holds, taken from cd->start_code, lands on the byte just after this + OP_RECURSE opcode. If so, adjust the list entry and stop looking. */ + + for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE) + { + offset = GET(hc, 0); + if (cd->start_code + offset == ptr + 1) + { + PUT(hc, 0, offset + adjust); + break; + } + } + + /* Otherwise (hc reached cd->hwm, so no list entry matched), adjust the + offset stored in the item itself, but only if it refers to a position at or + after the start of this group. */ + + if (hc >= cd->hwm) + { + offset = GET(ptr, 1); + if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust); + } + + ptr += 1 + LINK_SIZE; /* Step over the opcode byte and the offset that follows it */ + } +} + + + +/************************************************* +* Insert an automatic callout point * +*************************************************/ + +/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert +callout points before each pattern item.
+ +Arguments: + code current code pointer + ptr current pattern pointer + cd pointers to tables etc + +Returns: new code pointer +*/ + +static uschar * +auto_callout(uschar *code, const uschar *ptr, compile_data *cd) +{ +uschar *item = code; + +item[0] = OP_CALLOUT; +item[1] = 255; +PUT(item, 2, ptr - cd->start_pattern); /* Offset of the next pattern item */ +PUT(item, 2 + LINK_SIZE, 0); /* Length; complete_callout() fills it in */ +return item + 2 + 2*LINK_SIZE; +} + + + +/************************************************* +* Complete a callout item * +*************************************************/ + +/* The length field of a callout item describes the pattern item that follows +the callout, so it can only be written once the end of that item has been +reached. This routine computes the length from the pattern offset already +stored in the callout and writes it into place. It serves automatic and manual +callouts alike. + +Arguments: + previous_callout points to previous callout item + ptr current pattern pointer + cd pointers to tables etc + +Returns: nothing +*/ + +static void +complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd) +{ +int item_offset = GET(previous_callout, 2); /* Pattern offset saved in the callout */ +int item_length = (ptr - cd->start_pattern) - item_offset; +PUT(previous_callout, 2 + LINK_SIZE, item_length); +} + + + +#ifdef SUPPORT_UCP +/************************************************* +* Get othercase range * +*************************************************/ + +/* This function is passed the start and end of a class range, in UTF-8 mode +with UCP support. It searches up the characters, looking for internal ranges of +characters in the "other" case. Each call returns the next one, updating the +start address.
+ +Arguments: + cptr points to starting character value; updated + d end value + ocptr where to put start of othercase range + odptr where to put end of othercase range + +Yield: TRUE when range returned; FALSE when no more +*/ + +static BOOL +get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr, + unsigned int *odptr) +{ +unsigned int ch = *cptr; +unsigned int first_other, expected; + +/* Advance to the first character in the range that has an other case */ + +while (ch <= d && (first_other = _pcre_ucp_othercase(ch)) == NOTACHAR) ch++; +if (ch > d) return FALSE; + +*ocptr = first_other; +expected = first_other + 1; + +/* Extend the run for as long as the other-case values stay consecutive */ + +ch++; +while (ch <= d && _pcre_ucp_othercase(ch) == expected) + { + expected++; + ch++; + } + +*odptr = expected - 1; +*cptr = ch; + +return TRUE; +} +#endif /* SUPPORT_UCP */ + + + +/************************************************* +* Check if auto-possessifying is possible * +*************************************************/ + +/* This function is called for unlimited repeats of certain items, to see +whether the next thing could possibly match the repeated item. If not, it makes +sense to automatically possessify the repeated item. + +Arguments: + op_code the repeated op code + this data for this item, depends on the opcode + utf8 TRUE in UTF-8 mode + utf8_char used for utf8 character bytes, NULL if not relevant + ptr next character in pattern + options options bits + cd contains pointers to tables etc. + +Returns: TRUE if possessifying is wanted +*/ + +static BOOL +check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char, + const uschar *ptr, int options, compile_data *cd) +{ +int next; + +/* Skip whitespace and comments in extended mode */ + +if ((options & PCRE_EXTENDED) != 0) + { + for (;;) + { + while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; + if (*ptr == '#') + { + while (*(++ptr) != 0) + if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } + } + else break; + } + } + +/* If the next item is one that we can handle, get its value.
A non-negative +value is a character, a negative value is an escape value. */ + +if (*ptr == '\\') + { + int temperrorcode = 0; + next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE); + if (temperrorcode != 0) return FALSE; + ptr++; /* Point after the escape sequence */ + } + +else if ((cd->ctypes[*ptr] & ctype_meta) == 0) + { +#ifdef SUPPORT_UTF8 + if (utf8) { GETCHARINC(next, ptr); } else +#endif + next = *ptr++; + } + +else return FALSE; + +/* Skip whitespace and comments in extended mode */ + +if ((options & PCRE_EXTENDED) != 0) + { + for (;;) + { + while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; + if (*ptr == '#') + { + while (*(++ptr) != 0) + if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } + } + else break; + } + } + +/* If the next thing is itself optional, we have to give up. */ + +if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0) + return FALSE; + +/* Now compare the next item with the previous opcode. If the previous is a +positive single character match, "item" either contains the character or, if +"item" is greater than 127 in utf8 mode, the character's bytes are in +utf8_char. */ + + +/* Handle cases when the next item is a character. */ + +if (next >= 0) switch(op_code) + { + case OP_CHAR: +#ifdef SUPPORT_UTF8 + if (utf8 && item > 127) { GETCHAR(item, utf8_char); } +#endif + return item != next; + + /* For CHARNC (caseless character) we must check the other case. If we have + Unicode property support, we can use it to test the other case of + high-valued characters. 
*/ + + case OP_CHARNC: +#ifdef SUPPORT_UTF8 + if (utf8 && item > 127) { GETCHAR(item, utf8_char); } +#endif + if (item == next) return FALSE; +#ifdef SUPPORT_UTF8 + if (utf8) + { + unsigned int othercase; + if (next < 128) othercase = cd->fcc[next]; else +#ifdef SUPPORT_UCP + othercase = _pcre_ucp_othercase((unsigned int)next); +#else + othercase = NOTACHAR; +#endif + return (unsigned int)item != othercase; + } + else +#endif /* SUPPORT_UTF8 */ + return (item != cd->fcc[next]); /* Non-UTF-8 mode */ + + /* For OP_NOT, "item" must be a single-byte character. */ + + case OP_NOT: + if (next < 0) return FALSE; /* Not a character */ + if (item == next) return TRUE; + if ((options & PCRE_CASELESS) == 0) return FALSE; +#ifdef SUPPORT_UTF8 + if (utf8) + { + unsigned int othercase; + if (next < 128) othercase = cd->fcc[next]; else +#ifdef SUPPORT_UCP + othercase = _pcre_ucp_othercase(next); +#else + othercase = NOTACHAR; +#endif + return (unsigned int)item == othercase; + } + else +#endif /* SUPPORT_UTF8 */ + return (item == cd->fcc[next]); /* Non-UTF-8 mode */ + + case OP_DIGIT: + return next > 127 || (cd->ctypes[next] & ctype_digit) == 0; + + case OP_NOT_DIGIT: + return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0; + + case OP_WHITESPACE: + return next > 127 || (cd->ctypes[next] & ctype_space) == 0; + + case OP_NOT_WHITESPACE: + return next <= 127 && (cd->ctypes[next] & ctype_space) != 0; + + case OP_WORDCHAR: + return next > 127 || (cd->ctypes[next] & ctype_word) == 0; + + case OP_NOT_WORDCHAR: + return next <= 127 && (cd->ctypes[next] & ctype_word) != 0; + + default: + return FALSE; + } + + +/* Handle the case when the next item is \d, \s, etc. 
*/ + +switch(op_code) + { + case OP_CHAR: + case OP_CHARNC: +#ifdef SUPPORT_UTF8 + if (utf8 && item > 127) { GETCHAR(item, utf8_char); } +#endif + switch(-next) + { + case ESC_d: + return item > 127 || (cd->ctypes[item] & ctype_digit) == 0; + + case ESC_D: + return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0; + + case ESC_s: + return item > 127 || (cd->ctypes[item] & ctype_space) == 0; + + case ESC_S: + return item <= 127 && (cd->ctypes[item] & ctype_space) != 0; + + case ESC_w: + return item > 127 || (cd->ctypes[item] & ctype_word) == 0; + + case ESC_W: + return item <= 127 && (cd->ctypes[item] & ctype_word) != 0; + + default: + return FALSE; + } + + case OP_DIGIT: + return next == -ESC_D || next == -ESC_s || next == -ESC_W; + + case OP_NOT_DIGIT: + return next == -ESC_d; + + case OP_WHITESPACE: + return next == -ESC_S || next == -ESC_d || next == -ESC_w; + + case OP_NOT_WHITESPACE: + return next == -ESC_s; + + case OP_WORDCHAR: + return next == -ESC_W || next == -ESC_s; + + case OP_NOT_WORDCHAR: + return next == -ESC_w || next == -ESC_d; + + default: + return FALSE; + } + +/* Control does not reach here */ +} + + + +/************************************************* +* Compile one branch * +*************************************************/ + +/* Scan the pattern, compiling it into the a vector. If the options are +changed during the branch, the pointer is used to change the external options +bits. This function is used during the pre-compile phase when we are trying +to find out the amount of memory needed, as well as during the real compile +phase. The value of lengthptr distinguishes the two phases. 
+ +Arguments: + optionsptr pointer to the option bits + codeptr points to the pointer to the current code point + ptrptr points to the current pattern pointer + errorcodeptr points to error code variable + firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE) + reqbyteptr set to the last literal character required, else < 0 + bcptr points to current branch chain + cd contains pointers to tables etc. + lengthptr NULL during the real compile phase + points to length accumulator during pre-compile phase + +Returns: TRUE on success + FALSE, with *errorcodeptr set non-zero on error +*/ + +static BOOL +compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr, + int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, + compile_data *cd, int *lengthptr) +{ +int repeat_type, op_type; +int repeat_min = 0, repeat_max = 0; /* To please picky compilers */ +int bravalue = 0; +int greedy_default, greedy_non_default; +int firstbyte, reqbyte; +int zeroreqbyte, zerofirstbyte; +int req_caseopt, reqvary, tempreqvary; +int options = *optionsptr; +int after_manual_callout = 0; +int length_prevgroup = 0; +register int c; +register uschar *code = *codeptr; +uschar *last_code = code; +uschar *orig_code = code; +uschar *tempcode; +BOOL inescq = FALSE; +BOOL groupsetfirstbyte = FALSE; +const uschar *ptr = *ptrptr; +const uschar *tempptr; +uschar *previous = NULL; +uschar *previous_callout = NULL; +uschar *save_hwm = NULL; +uschar classbits[32]; + +#ifdef SUPPORT_UTF8 +BOOL class_utf8; +BOOL utf8 = (options & PCRE_UTF8) != 0; +uschar *class_utf8data; +uschar utf8_char[6]; +#else +BOOL utf8 = FALSE; +uschar *utf8_char = NULL; +#endif + +#ifdef DEBUG +if (lengthptr != NULL) DPRINTF((">> start branch\n")); +#endif + +/* Set up the default and non-default settings for greediness */ + +greedy_default = ((options & PCRE_UNGREEDY) != 0); +greedy_non_default = greedy_default ^ 1; + +/* Initialize no first byte, no required byte. 
REQ_UNSET means "no char +matching encountered yet". It gets changed to REQ_NONE if we hit something that +matches a non-fixed char first char; reqbyte just remains unset if we never +find one. + +When we hit a repeat whose minimum is zero, we may have to adjust these values +to take the zero repeat into account. This is implemented by setting them to +zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual +item types that can be repeated set these backoff variables appropriately. */ + +firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET; + +/* The variable req_caseopt contains either the REQ_CASELESS value or zero, +according to the current setting of the caseless flag. REQ_CASELESS is a bit +value > 255. It is added into the firstbyte or reqbyte variables to record the +case status of the value. This is used only for ASCII characters. */ + +req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; + +/* Switch on next character until the end of the branch */ + +for (;; ptr++) + { + BOOL negate_class; + BOOL possessive_quantifier; + BOOL is_quantifier; + BOOL is_recurse; + int class_charcount; + int class_lastchar; + int newoptions; + int recno; + int skipbytes; + int subreqbyte; + int subfirstbyte; + int terminator; + int mclength; + uschar mcbuffer[8]; + + /* Get next byte in the pattern */ + + c = *ptr; + + /* If we are in the pre-compile phase, accumulate the length used for the + previous cycle of this loop. */ + + if (lengthptr != NULL) + { +#ifdef DEBUG + if (code > cd->hwm) cd->hwm = code; /* High water info */ +#endif + if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */ + { + *errorcodeptr = ERR52; + goto FAILED; + } + + /* There is at least one situation where code goes backwards: this is the + case of a zero quantifier after a class (e.g. [ab]{0}). At compile time, + the class is simply eliminated. However, it is created first, so we have to + allow memory for it. 
Therefore, don't ever reduce the length at this point. + */ + + if (code < last_code) code = last_code; + *lengthptr += code - last_code; + DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c)); + + /* If "previous" is set and it is not at the start of the work space, move + it back to there, in order to avoid filling up the work space. Otherwise, + if "previous" is NULL, reset the current code pointer to the start. */ + + if (previous != NULL) + { + if (previous > orig_code) + { + memmove(orig_code, previous, code - previous); + code -= previous - orig_code; + previous = orig_code; + } + } + else code = orig_code; + + /* Remember where this code item starts so we can pick up the length + next time round. */ + + last_code = code; + } + + /* In the real compile phase, just check the workspace used by the forward + reference list. */ + + else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE) + { + *errorcodeptr = ERR52; + goto FAILED; + } + + /* If in \Q...\E, check for the end; if not, we have a literal */ + + if (inescq && c != 0) + { + if (c == '\\' && ptr[1] == 'E') + { + inescq = FALSE; + ptr++; + continue; + } + else + { + if (previous_callout != NULL) + { + if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ + complete_callout(previous_callout, ptr, cd); + previous_callout = NULL; + } + if ((options & PCRE_AUTO_CALLOUT) != 0) + { + previous_callout = code; + code = auto_callout(code, ptr, cd); + } + goto NORMAL_CHAR; + } + } + + /* Fill in length of a previous callout, except when the next thing is + a quantifier. */ + + is_quantifier = c == '*' || c == '+' || c == '?' 
|| + (c == '{' && is_counted_repeat(ptr+1)); + + if (!is_quantifier && previous_callout != NULL && + after_manual_callout-- <= 0) + { + if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ + complete_callout(previous_callout, ptr, cd); + previous_callout = NULL; + } + + /* In extended mode, skip white space and comments */ + + if ((options & PCRE_EXTENDED) != 0) + { + if ((cd->ctypes[c] & ctype_space) != 0) continue; + if (c == '#') + { + while (*(++ptr) != 0) + { + if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } + } + if (*ptr != 0) continue; + + /* Else fall through to handle end of string */ + c = 0; + } + } + + /* No auto callout for quantifiers. */ + + if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier) + { + previous_callout = code; + code = auto_callout(code, ptr, cd); + } + + switch(c) + { + /* ===================================================================*/ + case 0: /* The branch terminates at string end */ + case '|': /* or | or ) */ + case ')': + *firstbyteptr = firstbyte; + *reqbyteptr = reqbyte; + *codeptr = code; + *ptrptr = ptr; + if (lengthptr != NULL) + { + *lengthptr += code - last_code; /* To include callout length */ + DPRINTF((">> end branch\n")); + } + return TRUE; + + + /* ===================================================================*/ + /* Handle single-character metacharacters. In multiline mode, ^ disables + the setting of any following char as a first character. */ + + case '^': + if ((options & PCRE_MULTILINE) != 0) + { + if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; + } + previous = NULL; + *code++ = OP_CIRC; + break; + + case '$': + previous = NULL; + *code++ = OP_DOLL; + break; + + /* There can never be a first char if '.' is first, whatever happens about + repeats. The value of reqbyte doesn't change either. 
*/ + + case '.': + if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; + zerofirstbyte = firstbyte; + zeroreqbyte = reqbyte; + previous = code; + *code++ = OP_ANY; + break; + + + /* ===================================================================*/ + /* Character classes. If the included characters are all < 256, we build a + 32-byte bitmap of the permitted characters, except in the special case + where there is only one such character. For negated classes, we build the + map as usual, then invert it at the end. However, we use a different opcode + so that data characters > 255 can be handled correctly. + + If the class contains characters outside the 0-255 range, a different + opcode is compiled. It may optionally have a bit map for characters < 256, + but those above are are explicitly listed afterwards. A flag byte tells + whether the bitmap is present, and whether this is a negated class or not. + */ + + case '[': + previous = code; + + /* PCRE supports POSIX class stuff inside a class. Perl gives an error if + they are encountered at the top level, so we'll do that too. */ + + if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && + check_posix_syntax(ptr, &tempptr, cd)) + { + *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31; + goto FAILED; + } + + /* If the first character is '^', set the negation flag and skip it. */ + + if ((c = *(++ptr)) == '^') + { + negate_class = TRUE; + c = *(++ptr); + } + else + { + negate_class = FALSE; + } + + /* Keep a count of chars with values < 256 so that we can optimize the case + of just a single character (as long as it's < 256). However, For higher + valued UTF-8 characters, we don't yet do any optimization. */ + + class_charcount = 0; + class_lastchar = -1; + + /* Initialize the 32-char bit map to all zeros. We build the map in a + temporary bit of memory, in case the class contains only 1 character (less + than 256), because in that case the compiled code doesn't use the bit map. 
+ */ + + memset(classbits, 0, 32 * sizeof(uschar)); + +#ifdef SUPPORT_UTF8 + class_utf8 = FALSE; /* No chars >= 256 */ + class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */ +#endif + + /* Process characters until ] is reached. By writing this as a "do" it + means that an initial ] is taken as a data character. At the start of the + loop, c contains the first byte of the character. */ + + if (c != 0) do + { + const uschar *oldptr; + +#ifdef SUPPORT_UTF8 + if (utf8 && c > 127) + { /* Braces are required because the */ + GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ + } +#endif + + /* Inside \Q...\E everything is literal except \E */ + + if (inescq) + { + if (c == '\\' && ptr[1] == 'E') /* If we are at \E */ + { + inescq = FALSE; /* Reset literal state */ + ptr++; /* Skip the 'E' */ + continue; /* Carry on with next */ + } + goto CHECK_RANGE; /* Could be range if \E follows */ + } + + /* Handle POSIX class names. Perl allows a negation extension of the + form [:^name:]. A square bracket that doesn't match the syntax is + treated as a literal. We also recognize the POSIX constructions + [.ch.] and [=ch=] ("collating elements") and fault them, as Perl + 5.6 and 5.8 do. */ + + if (c == '[' && + (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && + check_posix_syntax(ptr, &tempptr, cd)) + { + BOOL local_negate = FALSE; + int posix_class, taboffset, tabopt; + register const uschar *cbits = cd->cbits; + uschar pbits[32]; + + if (ptr[1] != ':') + { + *errorcodeptr = ERR31; + goto FAILED; + } + + ptr += 2; + if (*ptr == '^') + { + local_negate = TRUE; + ptr++; + } + + posix_class = check_posix_name(ptr, tempptr - ptr); + if (posix_class < 0) + { + *errorcodeptr = ERR30; + goto FAILED; + } + + /* If matching is caseless, upper and lower are converted to + alpha. This relies on the fact that the class table starts with + alpha, lower, upper as the first 3 entries. 
*/ + + if ((options & PCRE_CASELESS) != 0 && posix_class <= 2) + posix_class = 0; + + /* We build the bit map for the POSIX class in a chunk of local store + because we may be adding and subtracting from it, and we don't want to + subtract bits that may be in the main map already. At the end we or the + result into the bit map that is being built. */ + + posix_class *= 3; + + /* Copy in the first table (always present) */ + + memcpy(pbits, cbits + posix_class_maps[posix_class], + 32 * sizeof(uschar)); + + /* If there is a second table, add or remove it as required. */ + + taboffset = posix_class_maps[posix_class + 1]; + tabopt = posix_class_maps[posix_class + 2]; + + if (taboffset >= 0) + { + if (tabopt >= 0) + for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset]; + else + for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset]; + } + + /* Not see if we need to remove any special characters. An option + value of 1 removes vertical space and 2 removes underscore. */ + + if (tabopt < 0) tabopt = -tabopt; + if (tabopt == 1) pbits[1] &= ~0x3c; + else if (tabopt == 2) pbits[11] &= 0x7f; + + /* Add the POSIX table or its complement into the main table that is + being built and we are done. */ + + if (local_negate) + for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c]; + else + for (c = 0; c < 32; c++) classbits[c] |= pbits[c]; + + ptr = tempptr + 1; + class_charcount = 10; /* Set > 1; assumes more than 1 per class */ + continue; /* End of POSIX syntax handling */ + } + + /* Backslash may introduce a single character, or it may introduce one + of the specials, which just set a flag. The sequence \b is a special + case. Inside a class (and only there) it is treated as backspace. + Elsewhere it marks a word boundary. Other escapes have preset maps ready + to or into the one we are building. We assume they have more than one + character in them, so set class_charcount bigger than one. 
*/ + + if (c == '\\') + { + c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); + if (*errorcodeptr != 0) goto FAILED; + + if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */ + else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */ + else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */ + else if (-c == ESC_Q) /* Handle start of quoted string */ + { + if (ptr[1] == '\\' && ptr[2] == 'E') + { + ptr += 2; /* avoid empty string */ + } + else inescq = TRUE; + continue; + } + + if (c < 0) + { + register const uschar *cbits = cd->cbits; + class_charcount += 2; /* Greater than 1 is what matters */ + + /* Save time by not doing this in the pre-compile phase. */ + + if (lengthptr == NULL) switch (-c) + { + case ESC_d: + for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit]; + continue; + + case ESC_D: + for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit]; + continue; + + case ESC_w: + for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word]; + continue; + + case ESC_W: + for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word]; + continue; + + case ESC_s: + for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space]; + classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */ + continue; + + case ESC_S: + for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space]; + classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ + continue; + + case ESC_E: /* Perl ignores an orphan \E */ + continue; + + default: /* Not recognized; fall through */ + break; /* Need "default" setting to stop compiler warning. */ + } + + /* In the pre-compile phase, just do the recognition. */ + + else if (c == -ESC_d || c == -ESC_D || c == -ESC_w || + c == -ESC_W || c == -ESC_s || c == -ESC_S) continue; + + /* We need to deal with \P and \p in both phases. 
*/ + +#ifdef SUPPORT_UCP + if (-c == ESC_p || -c == ESC_P) + { + BOOL negated; + int pdata; + int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); + if (ptype < 0) goto FAILED; + class_utf8 = TRUE; + *class_utf8data++ = ((-c == ESC_p) != negated)? + XCL_PROP : XCL_NOTPROP; + *class_utf8data++ = ptype; + *class_utf8data++ = pdata; + class_charcount -= 2; /* Not a < 256 character */ + continue; + } +#endif + /* Unrecognized escapes are faulted if PCRE is running in its + strict mode. By default, for compatibility with Perl, they are + treated as literals. */ + + if ((options & PCRE_EXTRA) != 0) + { + *errorcodeptr = ERR7; + goto FAILED; + } + + class_charcount -= 2; /* Undo the default count from above */ + c = *ptr; /* Get the final character and fall through */ + } + + /* Fall through if we have a single character (c >= 0). This may be + greater than 256 in UTF-8 mode. */ + + } /* End of backslash handling */ + + /* A single character may be followed by '-' to form a range. However, + Perl does not permit ']' to be the end of the range. A '-' character + at the end is treated as a literal. Perl ignores orphaned \E sequences + entirely. The code for handling \Q and \E is messy. */ + + CHECK_RANGE: + while (ptr[1] == '\\' && ptr[2] == 'E') + { + inescq = FALSE; + ptr += 2; + } + + oldptr = ptr; + + if (!inescq && ptr[1] == '-') + { + int d; + ptr += 2; + while (*ptr == '\\' && ptr[1] == 'E') ptr += 2; + + /* If we hit \Q (not followed by \E) at this point, go into escaped + mode. 
*/ + + while (*ptr == '\\' && ptr[1] == 'Q') + { + ptr += 2; + if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; } + inescq = TRUE; + break; + } + + if (*ptr == 0 || (!inescq && *ptr == ']')) + { + ptr = oldptr; + goto LONE_SINGLE_CHARACTER; + } + +#ifdef SUPPORT_UTF8 + if (utf8) + { /* Braces are required because the */ + GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */ + } + else +#endif + d = *ptr; /* Not UTF-8 mode */ + + /* The second part of a range can be a single-character escape, but + not any of the other escapes. Perl 5.6 treats a hyphen as a literal + in such circumstances. */ + + if (!inescq && d == '\\') + { + d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); + if (*errorcodeptr != 0) goto FAILED; + + /* \b is backslash; \X is literal X; \R is literal R; any other + special means the '-' was literal */ + + if (d < 0) + { + if (d == -ESC_b) d = '\b'; + else if (d == -ESC_X) d = 'X'; + else if (d == -ESC_R) d = 'R'; else + { + ptr = oldptr; + goto LONE_SINGLE_CHARACTER; /* A few lines below */ + } + } + } + + /* Check that the two values are in the correct order. Optimize + one-character ranges */ + + if (d < c) + { + *errorcodeptr = ERR8; + goto FAILED; + } + + if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */ + + /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless + matching, we have to use an XCLASS with extra data items. Caseless + matching for characters > 127 is available only if UCP support is + available. */ + +#ifdef SUPPORT_UTF8 + if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127))) + { + class_utf8 = TRUE; + + /* With UCP support, we can find the other case equivalents of + the relevant characters. There may be several ranges. Optimize how + they fit with the basic range. 
*/ + +#ifdef SUPPORT_UCP + if ((options & PCRE_CASELESS) != 0) + { + unsigned int occ, ocd; + unsigned int cc = c; + unsigned int origd = d; + while (get_othercase_range(&cc, origd, &occ, &ocd)) + { + if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */ + + if (occ < c && ocd >= c - 1) /* Extend the basic range */ + { /* if there is overlap, */ + c = occ; /* noting that if occ < c */ + continue; /* we can't have ocd > d */ + } /* because a subrange is */ + if (ocd > d && occ <= d + 1) /* always shorter than */ + { /* the basic range. */ + d = ocd; + continue; + } + + if (occ == ocd) + { + *class_utf8data++ = XCL_SINGLE; + } + else + { + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8(occ, class_utf8data); + } + class_utf8data += _pcre_ord2utf8(ocd, class_utf8data); + } + } +#endif /* SUPPORT_UCP */ + + /* Now record the original range, possibly modified for UCP caseless + overlapping ranges. */ + + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8(c, class_utf8data); + class_utf8data += _pcre_ord2utf8(d, class_utf8data); + + /* With UCP support, we are done. Without UCP support, there is no + caseless matching for UTF-8 characters > 127; we can use the bit map + for the smaller ones. */ + +#ifdef SUPPORT_UCP + continue; /* With next character in the class */ +#else + if ((options & PCRE_CASELESS) == 0 || c > 127) continue; + + /* Adjust upper limit and fall through to set up the map */ + + d = 127; + +#endif /* SUPPORT_UCP */ + } +#endif /* SUPPORT_UTF8 */ + + /* We use the bit map for all cases when not in UTF-8 mode; else + ranges that lie entirely within 0-127 when there is UCP support; else + for partial ranges without UCP support. */ + + class_charcount += d - c + 1; + class_lastchar = d; + + /* We can save a bit of time by skipping this in the pre-compile. 
*/ + + if (lengthptr == NULL) for (; c <= d; c++) + { + classbits[c/8] |= (1 << (c&7)); + if ((options & PCRE_CASELESS) != 0) + { + int uc = cd->fcc[c]; /* flip case */ + classbits[uc/8] |= (1 << (uc&7)); + } + } + + continue; /* Go get the next char in the class */ + } + + /* Handle a lone single character - we can get here for a normal + non-escape char, or after \ that introduces a single character or for an + apparent range that isn't. */ + + LONE_SINGLE_CHARACTER: + + /* Handle a character that cannot go in the bit map */ + +#ifdef SUPPORT_UTF8 + if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127))) + { + class_utf8 = TRUE; + *class_utf8data++ = XCL_SINGLE; + class_utf8data += _pcre_ord2utf8(c, class_utf8data); + +#ifdef SUPPORT_UCP + if ((options & PCRE_CASELESS) != 0) + { + unsigned int othercase; + if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) + { + *class_utf8data++ = XCL_SINGLE; + class_utf8data += _pcre_ord2utf8(othercase, class_utf8data); + } + } +#endif /* SUPPORT_UCP */ + + } + else +#endif /* SUPPORT_UTF8 */ + + /* Handle a single-byte character */ + { + classbits[c/8] |= (1 << (c&7)); + if ((options & PCRE_CASELESS) != 0) + { + c = cd->fcc[c]; /* flip case */ + classbits[c/8] |= (1 << (c&7)); + } + class_charcount++; + class_lastchar = c; + } + } + + /* Loop until ']' reached. This "while" is the end of the "do" above. */ + + while ((c = *(++ptr)) != 0 && (c != ']' || inescq)); + + if (c == 0) /* Missing terminating ']' */ + { + *errorcodeptr = ERR6; + goto FAILED; + } + + /* If class_charcount is 1, we saw precisely one character whose value is + less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we + can optimize the negative case only if there were no characters >= 128 + because OP_NOT and the related opcodes like OP_NOTSTAR operate on + single-bytes only. This is an historical hangover. Maybe one day we can + tidy these opcodes to handle multi-byte characters. 
+ + The optimization throws away the bit map. We turn the item into a + 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note + that OP_NOT does not support multibyte characters. In the positive case, it + can cause firstbyte to be set. Otherwise, there can be no first char if + this item is first, whatever repeat count may follow. In the case of + reqbyte, save the previous value for reinstating. */ + +#ifdef SUPPORT_UTF8 + if (class_charcount == 1 && + (!utf8 || + (!class_utf8 && (!negate_class || class_lastchar < 128)))) + +#else + if (class_charcount == 1) +#endif + { + zeroreqbyte = reqbyte; + + /* The OP_NOT opcode works on one-byte characters only. */ + + if (negate_class) + { + if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; + zerofirstbyte = firstbyte; + *code++ = OP_NOT; + *code++ = class_lastchar; + break; + } + + /* For a single, positive character, get the value into mcbuffer, and + then we can handle this with the normal one-character code. */ + +#ifdef SUPPORT_UTF8 + if (utf8 && class_lastchar > 127) + mclength = _pcre_ord2utf8(class_lastchar, mcbuffer); + else +#endif + { + mcbuffer[0] = class_lastchar; + mclength = 1; + } + goto ONE_CHAR; + } /* End of 1-char optimization */ + + /* The general case - not the one-char optimization. If this is the first + thing in the branch, there can be no first char setting, whatever the + repeat count. Any reqbyte setting must remain unchanged after any kind of + repeat. */ + + if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; + zerofirstbyte = firstbyte; + zeroreqbyte = reqbyte; + + /* If there are characters with values > 255, we have to compile an + extended class, with its own opcode. If there are no characters < 256, + we can omit the bitmap in the actual compiled code. */ + +#ifdef SUPPORT_UTF8 + if (class_utf8) + { + *class_utf8data++ = XCL_END; /* Marks the end of extra data */ + *code++ = OP_XCLASS; + code += LINK_SIZE; + *code = negate_class? 
XCL_NOT : 0; + + /* If the map is required, move up the extra data to make room for it; + otherwise just move the code pointer to the end of the extra data. */ + + if (class_charcount > 0) + { + *code++ |= XCL_MAP; + memmove(code + 32, code, class_utf8data - code); + memcpy(code, classbits, 32); + code = class_utf8data + 32; + } + else code = class_utf8data; + + /* Now fill in the complete length of the item */ + + PUT(previous, 1, code - previous); + break; /* End of class handling */ + } +#endif + + /* If there are no characters > 255, negate the 32-byte map if necessary, + and copy it into the code vector. If this is the first thing in the branch, + there can be no first char setting, whatever the repeat count. Any reqbyte + setting must remain unchanged after any kind of repeat. */ + + if (negate_class) + { + *code++ = OP_NCLASS; + if (lengthptr == NULL) /* Save time in the pre-compile phase */ + for (c = 0; c < 32; c++) code[c] = ~classbits[c]; + } + else + { + *code++ = OP_CLASS; + memcpy(code, classbits, 32); + } + code += 32; + break; + + + /* ===================================================================*/ + /* Various kinds of repeat; '{' is not necessarily a quantifier, but this + has been tested above. */ + + case '{': + if (!is_quantifier) goto NORMAL_CHAR; + ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr); + if (*errorcodeptr != 0) goto FAILED; + goto REPEAT; + + case '*': + repeat_min = 0; + repeat_max = -1; + goto REPEAT; + + case '+': + repeat_min = 1; + repeat_max = -1; + goto REPEAT; + + case '?': + repeat_min = 0; + repeat_max = 1; + + REPEAT: + if (previous == NULL) + { + *errorcodeptr = ERR9; + goto FAILED; + } + + if (repeat_min == 0) + { + firstbyte = zerofirstbyte; /* Adjust for zero repeat */ + reqbyte = zeroreqbyte; /* Ditto */ + } + + /* Remember whether this is a variable length repeat */ + + reqvary = (repeat_min == repeat_max)? 
0 : REQ_VARY; + + op_type = 0; /* Default single-char op codes */ + possessive_quantifier = FALSE; /* Default not possessive quantifier */ + + /* Save start of previous item, in case we have to move it up to make space + for an inserted OP_ONCE for the additional '+' extension. */ + + tempcode = previous; + + /* If the next character is '+', we have a possessive quantifier. This + implies greediness, whatever the setting of the PCRE_UNGREEDY option. + If the next character is '?' this is a minimizing repeat, by default, + but if PCRE_UNGREEDY is set, it works the other way round. We change the + repeat type to the non-default. */ + + if (ptr[1] == '+') + { + repeat_type = 0; /* Force greedy */ + possessive_quantifier = TRUE; + ptr++; + } + else if (ptr[1] == '?') + { + repeat_type = greedy_non_default; + ptr++; + } + else repeat_type = greedy_default; + + /* If previous was a character match, abolish the item and generate a + repeat item instead. If a char item has a minumum of more than one, ensure + that it is set in reqbyte - it might not be if a sequence such as x{3} is + the first thing in a branch because the x will have gone into firstbyte + instead. */ + + if (*previous == OP_CHAR || *previous == OP_CHARNC) + { + /* Deal with UTF-8 characters that take up more than one byte. It's + easier to write this out separately than try to macrify it. Use c to + hold the length of the character in bytes, plus 0x80 to flag that it's a + length rather than a small character. */ + +#ifdef SUPPORT_UTF8 + if (utf8 && (code[-1] & 0x80) != 0) + { + uschar *lastchar = code - 1; + while((*lastchar & 0xc0) == 0x80) lastchar--; + c = code - lastchar; /* Length of UTF-8 character */ + memcpy(utf8_char, lastchar, c); /* Save the char */ + c |= 0x80; /* Flag c as a length */ + } + else +#endif + + /* Handle the case of a single byte - either with no UTF8 support, or + with UTF-8 disabled, or for a UTF-8 character < 128. 
*/ + + { + c = code[-1]; + if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt; + } + + /* If the repetition is unlimited, it pays to see if the next thing on + the line is something that cannot possibly match this character. If so, + automatically possessifying this item gains some performance in the case + where the match fails. */ + + if (!possessive_quantifier && + repeat_max < 0 && + check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1, + options, cd)) + { + repeat_type = 0; /* Force greedy */ + possessive_quantifier = TRUE; + } + + goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ + } + + /* If previous was a single negated character ([^a] or similar), we use + one of the special opcodes, replacing it. The code is shared with single- + character repeats by setting opt_type to add a suitable offset into + repeat_type. We can also test for auto-possessification. OP_NOT is + currently used only for single-byte chars. */ + + else if (*previous == OP_NOT) + { + op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */ + c = previous[1]; + if (!possessive_quantifier && + repeat_max < 0 && + check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd)) + { + repeat_type = 0; /* Force greedy */ + possessive_quantifier = TRUE; + } + goto OUTPUT_SINGLE_REPEAT; + } + + /* If previous was a character type match (\d or similar), abolish it and + create a suitable repeat item. The code is shared with single-character + repeats by setting op_type to add a suitable offset into repeat_type. Note + the the Unicode property types will be present only when SUPPORT_UCP is + defined, but we don't wrap the little bits of code here because it just + makes it horribly messy. 
*/ + + else if (*previous < OP_EODN) + { + uschar *oldcode; + int prop_type, prop_value; + op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ + c = *previous; + + if (!possessive_quantifier && + repeat_max < 0 && + check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd)) + { + repeat_type = 0; /* Force greedy */ + possessive_quantifier = TRUE; + } + + OUTPUT_SINGLE_REPEAT: + if (*previous == OP_PROP || *previous == OP_NOTPROP) + { + prop_type = previous[1]; + prop_value = previous[2]; + } + else prop_type = prop_value = -1; + + oldcode = code; + code = previous; /* Usually overwrite previous item */ + + /* If the maximum is zero then the minimum must also be zero; Perl allows + this case, so we do too - by simply omitting the item altogether. */ + + if (repeat_max == 0) goto END_REPEAT; + + /* All real repeats make it impossible to handle partial matching (maybe + one day we will be able to remove this restriction). */ + + if (repeat_max != 1) cd->nopartial = TRUE; + + /* Combine the op_type with the repeat_type */ + + repeat_type += op_type; + + /* A minimum of zero is handled either as the special case * or ?, or as + an UPTO, with the maximum given. */ + + if (repeat_min == 0) + { + if (repeat_max == -1) *code++ = OP_STAR + repeat_type; + else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type; + else + { + *code++ = OP_UPTO + repeat_type; + PUT2INC(code, 0, repeat_max); + } + } + + /* A repeat minimum of 1 is optimized into some special cases. If the + maximum is unlimited, we use OP_PLUS. Otherwise, the original item is + left in place and, if the maximum is greater than 1, we use OP_UPTO with + one less than the maximum. 
*/ + + else if (repeat_min == 1) + { + if (repeat_max == -1) + *code++ = OP_PLUS + repeat_type; + else + { + code = oldcode; /* leave previous item in place */ + if (repeat_max == 1) goto END_REPEAT; + *code++ = OP_UPTO + repeat_type; + PUT2INC(code, 0, repeat_max - 1); + } + } + + /* The case {n,n} is just an EXACT, while the general case {n,m} is + handled as an EXACT followed by an UPTO. */ + + else + { + *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ + PUT2INC(code, 0, repeat_min); + + /* If the maximum is unlimited, insert an OP_STAR. Before doing so, + we have to insert the character for the previous code. For a repeated + Unicode property match, there are two extra bytes that define the + required property. In UTF-8 mode, long characters have their length in + c, with the 0x80 bit as a flag. */ + + if (repeat_max < 0) + { +#ifdef SUPPORT_UTF8 + if (utf8 && c >= 128) + { + memcpy(code, utf8_char, c & 7); + code += c & 7; + } + else +#endif + { + *code++ = c; + if (prop_type >= 0) + { + *code++ = prop_type; + *code++ = prop_value; + } + } + *code++ = OP_STAR + repeat_type; + } + + /* Else insert an UPTO if the max is greater than the min, again + preceded by the character, for the previously inserted code. If the + UPTO is just for 1 instance, we can use QUERY instead. */ + + else if (repeat_max != repeat_min) + { +#ifdef SUPPORT_UTF8 + if (utf8 && c >= 128) + { + memcpy(code, utf8_char, c & 7); + code += c & 7; + } + else +#endif + *code++ = c; + if (prop_type >= 0) + { + *code++ = prop_type; + *code++ = prop_value; + } + repeat_max -= repeat_min; + + if (repeat_max == 1) + { + *code++ = OP_QUERY + repeat_type; + } + else + { + *code++ = OP_UPTO + repeat_type; + PUT2INC(code, 0, repeat_max); + } + } + } + + /* The character or character type itself comes last in all cases. 
*/ + +#ifdef SUPPORT_UTF8 + if (utf8 && c >= 128) + { + memcpy(code, utf8_char, c & 7); + code += c & 7; + } + else +#endif + *code++ = c; + + /* For a repeated Unicode property match, there are two extra bytes that + define the required property. */ + +#ifdef SUPPORT_UCP + if (prop_type >= 0) + { + *code++ = prop_type; + *code++ = prop_value; + } +#endif + } + + /* If previous was a character class or a back reference, we put the repeat + stuff after it, but just skip the item if the repeat was {0,0}. */ + + else if (*previous == OP_CLASS || + *previous == OP_NCLASS || +#ifdef SUPPORT_UTF8 + *previous == OP_XCLASS || +#endif + *previous == OP_REF) + { + if (repeat_max == 0) + { + code = previous; + goto END_REPEAT; + } + + /* All real repeats make it impossible to handle partial matching (maybe + one day we will be able to remove this restriction). */ + + if (repeat_max != 1) cd->nopartial = TRUE; + + if (repeat_min == 0 && repeat_max == -1) + *code++ = OP_CRSTAR + repeat_type; + else if (repeat_min == 1 && repeat_max == -1) + *code++ = OP_CRPLUS + repeat_type; + else if (repeat_min == 0 && repeat_max == 1) + *code++ = OP_CRQUERY + repeat_type; + else + { + *code++ = OP_CRRANGE + repeat_type; + PUT2INC(code, 0, repeat_min); + if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */ + PUT2INC(code, 0, repeat_max); + } + } + + /* If previous was a bracket group, we may have to replicate it in certain + cases. 
*/ + + else if (*previous == OP_BRA || *previous == OP_CBRA || + *previous == OP_ONCE || *previous == OP_COND) + { + register int i; + int ketoffset = 0; + int len = code - previous; + uschar *bralink = NULL; + + /* Repeating a DEFINE group is pointless */ + + if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF) + { + *errorcodeptr = ERR55; + goto FAILED; + } + + /* This is a paranoid check to stop integer overflow later on */ + + if (len > MAX_DUPLENGTH) + { + *errorcodeptr = ERR50; + goto FAILED; + } + + /* If the maximum repeat count is unlimited, find the end of the bracket + by scanning through from the start, and compute the offset back to it + from the current code pointer. There may be an OP_OPT setting following + the final KET, so we can't find the end just by going back from the code + pointer. */ + + if (repeat_max == -1) + { + register uschar *ket = previous; + do ket += GET(ket, 1); while (*ket != OP_KET); + ketoffset = code - ket; + } + + /* The case of a zero minimum is special because of the need to stick + OP_BRAZERO in front of it, and because the group appears once in the + data, whereas in other cases it appears the minimum number of times. For + this reason, it is simplest to treat this case separately, as otherwise + the code gets far too messy. There are several special subcases when the + minimum is zero. */ + + if (repeat_min == 0) + { + /* If the maximum is also zero, we just omit the group from the output + altogether. */ + + if (repeat_max == 0) + { + code = previous; + goto END_REPEAT; + } + + /* If the maximum is 1 or unlimited, we just have to stick in the + BRAZERO and do no more at this point. However, we do need to adjust + any OP_RECURSE calls inside the group that refer to the group itself or + any internal or forward referenced group, because the offset is from + the start of the whole regex. Temporarily terminate the pattern while + doing this. 
*/ + + if (repeat_max <= 1) + { + *code = OP_END; + adjust_recurse(previous, 1, utf8, cd, save_hwm); + memmove(previous+1, previous, len); + code++; + *previous++ = OP_BRAZERO + repeat_type; + } + + /* If the maximum is greater than 1 and limited, we have to replicate + in a nested fashion, sticking OP_BRAZERO before each set of brackets. + The first one has to be handled carefully because it's the original + copy, which has to be moved up. The remainder can be handled by code + that is common with the non-zero minimum case below. We have to + adjust the value or repeat_max, since one less copy is required. Once + again, we may have to adjust any OP_RECURSE calls inside the group. */ + + else + { + int offset; + *code = OP_END; + adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm); + memmove(previous + 2 + LINK_SIZE, previous, len); + code += 2 + LINK_SIZE; + *previous++ = OP_BRAZERO + repeat_type; + *previous++ = OP_BRA; + + /* We chain together the bracket offset fields that have to be + filled in later when the ends of the brackets are reached. */ + + offset = (bralink == NULL)? 0 : previous - bralink; + bralink = previous; + PUTINC(previous, 0, offset); + } + + repeat_max--; + } + + /* If the minimum is greater than zero, replicate the group as many + times as necessary, and adjust the maximum to the number of subsequent + copies that we need. If we set a first char from the group, and didn't + set a required char, copy the latter from the former. If there are any + forward reference subroutine calls in the group, there will be entries on + the workspace list; replicate these with an appropriate increment. */ + + else + { + if (repeat_min > 1) + { + /* In the pre-compile phase, we don't actually do the replication. We + just adjust the length as if we had. 
*/ + + if (lengthptr != NULL) + *lengthptr += (repeat_min - 1)*length_prevgroup; + + /* This is compiling for real */ + + else + { + if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte; + for (i = 1; i < repeat_min; i++) + { + uschar *hc; + uschar *this_hwm = cd->hwm; + memcpy(code, previous, len); + for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) + { + PUT(cd->hwm, 0, GET(hc, 0) + len); + cd->hwm += LINK_SIZE; + } + save_hwm = this_hwm; + code += len; + } + } + } + + if (repeat_max > 0) repeat_max -= repeat_min; + } + + /* This code is common to both the zero and non-zero minimum cases. If + the maximum is limited, it replicates the group in a nested fashion, + remembering the bracket starts on a stack. In the case of a zero minimum, + the first one was set up above. In all cases the repeat_max now specifies + the number of additional copies needed. Again, we must remember to + replicate entries on the forward reference list. */ + + if (repeat_max >= 0) + { + /* In the pre-compile phase, we don't actually do the replication. We + just adjust the length as if we had. For each repetition we must add 1 + to the length for BRAZERO and for all but the last repetition we must + add 2 + 2*LINKSIZE to allow for the nesting that occurs. */ + + if (lengthptr != NULL && repeat_max > 0) + *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) - + 2 - 2*LINK_SIZE; /* Last one doesn't nest */ + + /* This is compiling for real */ + + else for (i = repeat_max - 1; i >= 0; i--) + { + uschar *hc; + uschar *this_hwm = cd->hwm; + + *code++ = OP_BRAZERO + repeat_type; + + /* All but the final copy start a new nesting, maintaining the + chain of brackets outstanding. */ + + if (i != 0) + { + int offset; + *code++ = OP_BRA; + offset = (bralink == NULL)? 0 : code - bralink; + bralink = code; + PUTINC(code, 0, offset); + } + + memcpy(code, previous, len); + for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) + { + PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 
2+LINK_SIZE : 1)); + cd->hwm += LINK_SIZE; + } + save_hwm = this_hwm; + code += len; + } + + /* Now chain through the pending brackets, and fill in their length + fields (which are holding the chain links pro tem). */ + + while (bralink != NULL) + { + int oldlinkoffset; + int offset = code - bralink + 1; + uschar *bra = code - offset; + oldlinkoffset = GET(bra, 1); + bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset; + *code++ = OP_KET; + PUTINC(code, 0, offset); + PUT(bra, 1, offset); + } + } + + /* If the maximum is unlimited, set a repeater in the final copy. We + can't just offset backwards from the current code point, because we + don't know if there's been an options resetting after the ket. The + correct offset was computed above. + + Then, when we are doing the actual compile phase, check to see whether + this group is a non-atomic one that could match an empty string. If so, + convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so + that runtime checking can be done. [This check is also applied to + atomic groups at runtime, but in a different way.] */ + + else + { + uschar *ketcode = code - ketoffset; + uschar *bracode = ketcode - GET(ketcode, 1); + *ketcode = OP_KETRMAX + repeat_type; + if (lengthptr == NULL && *bracode != OP_ONCE) + { + uschar *scode = bracode; + do + { + if (could_be_empty_branch(scode, ketcode, utf8)) + { + *bracode += OP_SBRA - OP_BRA; + break; + } + scode += GET(scode, 1); + } + while (*scode == OP_ALT); + } + } + } + + /* Else there's some kind of shambles */ + + else + { + *errorcodeptr = ERR11; + goto FAILED; + } + + /* If the character following a repeat is '+', or if certain optimization + tests above succeeded, possessive_quantifier is TRUE. For some of the + simpler opcodes, there is an special alternative opcode for this. For + anything else, we wrap the entire repeated item inside OP_ONCE brackets. 
+ The '+' notation is just syntactic sugar, taken from Sun's Java package, + but the special opcodes can optimize it a bit. The repeated item starts at + tempcode, not at previous, which might be the first part of a string whose + (former) last char we repeated. + + Possessifying an 'exact' quantifier has no effect, so we can ignore it. But + an 'upto' may follow. We skip over an 'exact' item, and then test the + length of what remains before proceeding. */ + + if (possessive_quantifier) + { + int len; + if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT || + *tempcode == OP_NOTEXACT) + tempcode += _pcre_OP_lengths[*tempcode]; + len = code - tempcode; + if (len > 0) switch (*tempcode) + { + case OP_STAR: *tempcode = OP_POSSTAR; break; + case OP_PLUS: *tempcode = OP_POSPLUS; break; + case OP_QUERY: *tempcode = OP_POSQUERY; break; + case OP_UPTO: *tempcode = OP_POSUPTO; break; + + case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break; + case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break; + case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break; + case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break; + + case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break; + case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break; + case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break; + case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break; + + default: + memmove(tempcode + 1+LINK_SIZE, tempcode, len); + code += 1 + LINK_SIZE; + len += 1 + LINK_SIZE; + tempcode[0] = OP_ONCE; + *code++ = OP_KET; + PUTINC(code, 0, len); + PUT(tempcode, 1, len); + break; + } + } + + /* In all case we no longer have a previous item. We also set the + "follows varying string" flag for subsequently encountered reqbytes if + it isn't already set and we have just passed a varying length item. 
*/ + + END_REPEAT: + previous = NULL; + cd->req_varyopt |= reqvary; + break; + + + /* ===================================================================*/ + /* Start of nested parenthesized sub-expression, or comment or lookahead or + lookbehind or option setting or condition or all the other extended + parenthesis forms. First deal with the specials; all are introduced by ?, + and the appearance of any of them means that this is not a capturing + group. */ + + case '(': + newoptions = options; + skipbytes = 0; + bravalue = OP_CBRA; + save_hwm = cd->hwm; + + if (*(++ptr) == '?') + { + int i, set, unset, namelen; + int *optset; + const uschar *name; + uschar *slot; + + switch (*(++ptr)) + { + case '#': /* Comment; skip to ket */ + ptr++; + while (*ptr != 0 && *ptr != ')') ptr++; + if (*ptr == 0) + { + *errorcodeptr = ERR18; + goto FAILED; + } + continue; + + + /* ------------------------------------------------------------ */ + case ':': /* Non-capturing bracket */ + bravalue = OP_BRA; + ptr++; + break; + + + /* ------------------------------------------------------------ */ + case '(': + bravalue = OP_COND; /* Conditional group */ + + /* A condition can be an assertion, a number (referring to a numbered + group), a name (referring to a named group), or 'R', referring to + recursion. R and R&name are also permitted for recursion tests. + + There are several syntaxes for testing a named group: (?(name)) is used + by Python; Perl 5.10 onwards uses (?() or (?('name')). + + There are two unfortunate ambiguities, caused by history. (a) 'R' can + be the recursive thing or the name 'R' (and similarly for 'R' followed + by digits), and (b) a number could be a name that consists of digits. + In both cases, we look for a name first; if not found, we try the other + cases. */ + + /* For conditions that are assertions, check the syntax, and then exit + the switch. This will take control down to where bracketed groups, + including assertions, are processed. 
*/ + + if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<')) + break; + + /* Most other conditions use OP_CREF (a couple change to OP_RREF + below), and all need to skip 3 bytes at the start of the group. */ + + code[1+LINK_SIZE] = OP_CREF; + skipbytes = 3; + + /* Check for a test for recursion in a named group. */ + + if (ptr[1] == 'R' && ptr[2] == '&') + { + terminator = -1; + ptr += 2; + code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */ + } + + /* Check for a test for a named group's having been set, using the Perl + syntax (?() or (?('name') */ + + else if (ptr[1] == '<') + { + terminator = '>'; + ptr++; + } + else if (ptr[1] == '\'') + { + terminator = '\''; + ptr++; + } + else terminator = 0; + + /* We now expect to read a name; any thing else is an error */ + + if ((cd->ctypes[ptr[1]] & ctype_word) == 0) + { + ptr += 1; /* To get the right offset */ + *errorcodeptr = ERR28; + goto FAILED; + } + + /* Read the name, but also get it as a number if it's all digits */ + + recno = 0; + name = ++ptr; + while ((cd->ctypes[*ptr] & ctype_word) != 0) + { + if (recno >= 0) + recno = (g_ascii_isdigit(*ptr) != 0)? + recno * 10 + *ptr - '0' : -1; + ptr++; + } + namelen = ptr - name; + + if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')') + { + ptr--; /* Error offset */ + *errorcodeptr = ERR26; + goto FAILED; + } + + /* Do no further checking in the pre-compile phase. */ + + if (lengthptr != NULL) break; + + /* In the real compile we do the work of looking for the actual + reference. 
*/ + + slot = cd->name_table; + for (i = 0; i < cd->names_found; i++) + { + if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; + slot += cd->name_entry_size; + } + + /* Found a previous named subpattern */ + + if (i < cd->names_found) + { + recno = GET2(slot, 0); + PUT2(code, 2+LINK_SIZE, recno); + } + + /* Search the pattern for a forward reference */ + + else if ((i = find_parens(ptr, cd->bracount, name, namelen, + (options & PCRE_EXTENDED) != 0)) > 0) + { + PUT2(code, 2+LINK_SIZE, i); + } + + /* If terminator == 0 it means that the name followed directly after + the opening parenthesis [e.g. (?(abc)...] and in this case there are + some further alternatives to try. For the cases where terminator != 0 + [things like (?(... or (?('name')... or (?(R&name)... ] we have + now checked all the possibilities, so give an error. */ + + else if (terminator != 0) + { + *errorcodeptr = ERR15; + goto FAILED; + } + + /* Check for (?(R) for recursion. Allow digits after R to specify a + specific group number. */ + + else if (*name == 'R') + { + recno = 0; + for (i = 1; i < namelen; i++) + { + if (g_ascii_isdigit(name[i]) == 0) + { + *errorcodeptr = ERR15; + goto FAILED; + } + recno = recno * 10 + name[i] - '0'; + } + if (recno == 0) recno = RREF_ANY; + code[1+LINK_SIZE] = OP_RREF; /* Change test type */ + PUT2(code, 2+LINK_SIZE, recno); + } + + /* Similarly, check for the (?(DEFINE) "condition", which is always + false. */ + + else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0) + { + code[1+LINK_SIZE] = OP_DEF; + skipbytes = 1; + } + + /* Check for the "name" actually being a subpattern number. */ + + else if (recno > 0) + { + PUT2(code, 2+LINK_SIZE, recno); + } + + /* Either an unidentified subpattern, or a reference to (?(0) */ + + else + { + *errorcodeptr = (recno == 0)? 
ERR35: ERR15; + goto FAILED; + } + break; + + + /* ------------------------------------------------------------ */ + case '=': /* Positive lookahead */ + bravalue = OP_ASSERT; + ptr++; + break; + + + /* ------------------------------------------------------------ */ + case '!': /* Negative lookahead */ + bravalue = OP_ASSERT_NOT; + ptr++; + break; + + + /* ------------------------------------------------------------ */ + case '<': /* Lookbehind or named define */ + switch (ptr[1]) + { + case '=': /* Positive lookbehind */ + bravalue = OP_ASSERTBACK; + ptr += 2; + break; + + case '!': /* Negative lookbehind */ + bravalue = OP_ASSERTBACK_NOT; + ptr += 2; + break; + + default: /* Could be name define, else bad */ + if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME; + ptr++; /* Correct offset for error */ + *errorcodeptr = ERR24; + goto FAILED; + } + break; + + + /* ------------------------------------------------------------ */ + case '>': /* One-time brackets */ + bravalue = OP_ONCE; + ptr++; + break; + + + /* ------------------------------------------------------------ */ + case 'C': /* Callout - may be followed by digits; */ + previous_callout = code; /* Save for later completion */ + after_manual_callout = 1; /* Skip one item before completing */ + *code++ = OP_CALLOUT; + { + int n = 0; + while (g_ascii_isdigit(*(++ptr)) != 0) + n = n * 10 + *ptr - '0'; + if (*ptr != ')') + { + *errorcodeptr = ERR39; + goto FAILED; + } + if (n > 255) + { + *errorcodeptr = ERR38; + goto FAILED; + } + *code++ = n; + PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */ + PUT(code, LINK_SIZE, 0); /* Default length */ + code += 2 * LINK_SIZE; + } + previous = NULL; + continue; + + + /* ------------------------------------------------------------ */ + case 'P': /* Python-style named subpattern handling */ + if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */ + { + is_recurse = *ptr == '>'; + terminator = ')'; + goto NAMED_REF_OR_RECURSE; + } + else if 
(*ptr != '<') /* Test for Python-style definition */ + { + *errorcodeptr = ERR41; + goto FAILED; + } + /* Fall through to handle (?P< as (?< is handled */ + + + /* ------------------------------------------------------------ */ + DEFINE_NAME: /* Come here from (?< handling */ + case '\'': + { + terminator = (*ptr == '<')? '>' : '\''; + name = ++ptr; + + while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; + namelen = ptr - name; + + /* In the pre-compile phase, just do a syntax check. */ + + if (lengthptr != NULL) + { + if (*ptr != terminator) + { + *errorcodeptr = ERR42; + goto FAILED; + } + if (cd->names_found >= MAX_NAME_COUNT) + { + *errorcodeptr = ERR49; + goto FAILED; + } + if (namelen + 3 > cd->name_entry_size) + { + cd->name_entry_size = namelen + 3; + if (namelen > MAX_NAME_SIZE) + { + *errorcodeptr = ERR48; + goto FAILED; + } + } + } + + /* In the real compile, create the entry in the table */ + + else + { + slot = cd->name_table; + for (i = 0; i < cd->names_found; i++) + { + int crc = memcmp(name, slot+2, namelen); + if (crc == 0) + { + if (slot[2+namelen] == 0) + { + if ((options & PCRE_DUPNAMES) == 0) + { + *errorcodeptr = ERR43; + goto FAILED; + } + } + else crc = -1; /* Current name is substring */ + } + if (crc < 0) + { + memmove(slot + cd->name_entry_size, slot, + (cd->names_found - i) * cd->name_entry_size); + break; + } + slot += cd->name_entry_size; + } + + PUT2(slot, 0, cd->bracount + 1); + memcpy(slot + 2, name, namelen); + slot[2+namelen] = 0; + } + } + + /* In both cases, count the number of names we've encountered. 
*/ + + ptr++; /* Move past > or ' */ + cd->names_found++; + goto NUMBERED_GROUP; + + + /* ------------------------------------------------------------ */ + case '&': /* Perl recursion/subroutine syntax */ + terminator = ')'; + is_recurse = TRUE; + /* Fall through */ + + /* We come here from the Python syntax above that handles both + references (?P=name) and recursion (?P>name), as well as falling + through from the Perl recursion syntax (?&name). */ + + NAMED_REF_OR_RECURSE: + name = ++ptr; + while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; + namelen = ptr - name; + + /* In the pre-compile phase, do a syntax check and set a dummy + reference number. */ + + if (lengthptr != NULL) + { + if (*ptr != terminator) + { + *errorcodeptr = ERR42; + goto FAILED; + } + if (namelen > MAX_NAME_SIZE) + { + *errorcodeptr = ERR48; + goto FAILED; + } + recno = 0; + } + + /* In the real compile, seek the name in the table */ + + else + { + slot = cd->name_table; + for (i = 0; i < cd->names_found; i++) + { + if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; + slot += cd->name_entry_size; + } + + if (i < cd->names_found) /* Back reference */ + { + recno = GET2(slot, 0); + } + else if ((recno = /* Forward back reference */ + find_parens(ptr, cd->bracount, name, namelen, + (options & PCRE_EXTENDED) != 0)) <= 0) + { + *errorcodeptr = ERR15; + goto FAILED; + } + } + + /* In both phases, we can now go to the code than handles numerical + recursion or backreferences. 
*/ + + if (is_recurse) goto HANDLE_RECURSION; + else goto HANDLE_REFERENCE; + + + /* ------------------------------------------------------------ */ + case 'R': /* Recursion */ + ptr++; /* Same as (?0) */ + /* Fall through */ + + + /* ------------------------------------------------------------ */ + case '0': case '1': case '2': case '3': case '4': /* Recursion or */ + case '5': case '6': case '7': case '8': case '9': /* subroutine */ + { + const uschar *called; + recno = 0; + while(g_ascii_isdigit(*ptr) != 0) + recno = recno * 10 + *ptr++ - '0'; + if (*ptr != ')') + { + *errorcodeptr = ERR29; + goto FAILED; + } + + /* Come here from code above that handles a named recursion */ + + HANDLE_RECURSION: + + previous = code; + called = cd->start_code; + + /* When we are actually compiling, find the bracket that is being + referenced. Temporarily end the regex in case it doesn't exist before + this point. If we end up with a forward reference, first check that + the bracket does occur later so we can give the error (and position) + now. Then remember this forward reference in the workspace so it can + be filled in at the end. */ + + if (lengthptr == NULL) + { + *code = OP_END; + if (recno != 0) called = find_bracket(cd->start_code, utf8, recno); + + /* Forward reference */ + + if (called == NULL) + { + if (find_parens(ptr, cd->bracount, NULL, recno, + (options & PCRE_EXTENDED) != 0) < 0) + { + *errorcodeptr = ERR15; + goto FAILED; + } + called = cd->start_code + recno; + PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code); + } + + /* If not a forward reference, and the subpattern is still open, + this is a recursive call. We check to see if this is a left + recursion that could loop for ever, and diagnose that case. */ + + else if (GET(called, 1) == 0 && + could_be_empty(called, code, bcptr, utf8)) + { + *errorcodeptr = ERR40; + goto FAILED; + } + } + + /* Insert the recursion/subroutine item, automatically wrapped inside + "once" brackets. 
Set up a "previous group" length so that a + subsequent quantifier will work. */ + + *code = OP_ONCE; + PUT(code, 1, 2 + 2*LINK_SIZE); + code += 1 + LINK_SIZE; + + *code = OP_RECURSE; + PUT(code, 1, called - cd->start_code); + code += 1 + LINK_SIZE; + + *code = OP_KET; + PUT(code, 1, 2 + 2*LINK_SIZE); + code += 1 + LINK_SIZE; + + length_prevgroup = 3 + 3*LINK_SIZE; + } + + /* Can't determine a first byte now */ + + if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; + continue; + + + /* ------------------------------------------------------------ */ + default: /* Other characters: check option setting */ + set = unset = 0; + optset = &set; + + while (*ptr != ')' && *ptr != ':') + { + switch (*ptr++) + { + case '-': optset = &unset; break; + + case 'J': /* Record that it changed in the external options */ + *optset |= PCRE_DUPNAMES; + cd->external_options |= PCRE_JCHANGED; + break; + + case 'i': *optset |= PCRE_CASELESS; break; + case 'm': *optset |= PCRE_MULTILINE; break; + case 's': *optset |= PCRE_DOTALL; break; + case 'x': *optset |= PCRE_EXTENDED; break; + case 'U': *optset |= PCRE_UNGREEDY; break; + case 'X': *optset |= PCRE_EXTRA; break; + + default: *errorcodeptr = ERR12; + ptr--; /* Correct the offset */ + goto FAILED; + } + } + + /* Set up the changed option bits, but don't change anything yet. */ + + newoptions = (options | set) & (~unset); + + /* If the options ended with ')' this is not the start of a nested + group with option changes, so the options change at this level. If this + item is right at the start of the pattern, the options can be + abstracted and made external in the pre-compile phase, and ignored in + the compile phase. This can be helpful when matching -- for instance in + caseless checking of required bytes. + + If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are + definitely *not* at the start of the pattern because something has been + compiled. 
In the pre-compile phase, however, the code pointer can have + that value after the start, because it gets reset as code is discarded + during the pre-compile. However, this can happen only at top level - if + we are within parentheses, the starting BRA will still be present. At + any parenthesis level, the length value can be used to test if anything + has been compiled at that level. Thus, a test for both these conditions + is necessary to ensure we correctly detect the start of the pattern in + both phases. + + If we are not at the pattern start, compile code to change the ims + options if this setting actually changes any of them. We also pass the + new setting back so that it can be put at the start of any following + branches, and when this group ends (if we are in a group), a resetting + item can be compiled. */ + + if (*ptr == ')') + { + if (code == cd->start_code + 1 + LINK_SIZE && + (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE)) + { + cd->external_options = newoptions; + options = newoptions; + } + else + { + if ((options & PCRE_IMS) != (newoptions & PCRE_IMS)) + { + *code++ = OP_OPT; + *code++ = newoptions & PCRE_IMS; + } + + /* Change options at this level, and pass them back for use + in subsequent branches. Reset the greedy defaults and the case + value for firstbyte and reqbyte. */ + + *optionsptr = options = newoptions; + greedy_default = ((newoptions & PCRE_UNGREEDY) != 0); + greedy_non_default = greedy_default ^ 1; + req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; + } + + previous = NULL; /* This item can't be repeated */ + continue; /* It is complete */ + } + + /* If the options ended with ':' we are heading into a nested group + with possible change of options. Such groups are non-capturing and are + not assertions of any kind. All we need to do is skip over the ':'; + the newoptions value is handled below. */ + + bravalue = OP_BRA; + ptr++; + } /* End of switch for character following (? */ + } /* End of (? 
handling */ + + /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set, + all unadorned brackets become non-capturing and behave like (?:...) + brackets. */ + + else if ((options & PCRE_NO_AUTO_CAPTURE) != 0) + { + bravalue = OP_BRA; + } + + /* Else we have a capturing group. */ + + else + { + NUMBERED_GROUP: + cd->bracount += 1; + PUT2(code, 1+LINK_SIZE, cd->bracount); + skipbytes = 2; + } + + /* Process nested bracketed regex. Assertions may not be repeated, but + other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a + non-register variable in order to be able to pass its address because some + compilers complain otherwise. Pass in a new setting for the ims options if + they have changed. */ + + previous = (bravalue >= OP_ONCE)? code : NULL; + *code = bravalue; + tempcode = code; + tempreqvary = cd->req_varyopt; /* Save value before bracket */ + length_prevgroup = 0; /* Initialize for pre-compile phase */ + + if (!compile_regex( + newoptions, /* The complete new option state */ + options & PCRE_IMS, /* The previous ims option state */ + &tempcode, /* Where to put code (updated) */ + &ptr, /* Input pointer (updated) */ + errorcodeptr, /* Where to put an error message */ + (bravalue == OP_ASSERTBACK || + bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */ + skipbytes, /* Skip over bracket number */ + &subfirstbyte, /* For possible first char */ + &subreqbyte, /* For possible last char */ + bcptr, /* Current branch chain */ + cd, /* Tables block */ + (lengthptr == NULL)? NULL : /* Actual compile phase */ + &length_prevgroup /* Pre-compile phase */ + )) + goto FAILED; + + /* At the end of compiling, code is still pointing to the start of the + group, while tempcode has been updated to point past the end of the group + and any option resetting that may follow it. The pattern pointer (ptr) + is on the bracket. 
*/ + + /* If this is a conditional bracket, check that there are no more than + two branches in the group, or just one if it's a DEFINE group. */ + + if (bravalue == OP_COND) + { + uschar *tc = code; + int condcount = 0; + + do { + condcount++; + tc += GET(tc,1); + } + while (*tc != OP_KET); + + /* A DEFINE group is never obeyed inline (the "condition" is always + false). It must have only one branch. */ + + if (code[LINK_SIZE+1] == OP_DEF) + { + if (condcount > 1) + { + *errorcodeptr = ERR54; + goto FAILED; + } + bravalue = OP_DEF; /* Just a flag to suppress char handling below */ + } + + /* A "normal" conditional group. If there is just one branch, we must not + make use of its firstbyte or reqbyte, because this is equivalent to an + empty second branch. */ + + else + { + if (condcount > 2) + { + *errorcodeptr = ERR27; + goto FAILED; + } + if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE; + } + } + + /* Error if hit end of pattern */ + + if (*ptr != ')') + { + *errorcodeptr = ERR14; + goto FAILED; + } + + /* In the pre-compile phase, update the length by the length of the nested + group, less the brackets at either end. Then reduce the compiled code to + just the brackets so that it doesn't use much memory if it is duplicated by + a quantifier. */ + + if (lengthptr != NULL) + { + *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; + code++; + PUTINC(code, 0, 1 + LINK_SIZE); + *code++ = OP_KET; + PUTINC(code, 0, 1 + LINK_SIZE); + } + + /* Otherwise update the main code pointer to the end of the group. */ + + else code = tempcode; + + /* For a DEFINE group, required and first character settings are not + relevant. */ + + if (bravalue == OP_DEF) break; + + /* Handle updating of the required and first characters for other types of + group. Update for normal brackets of all kinds, and conditions with two + branches (see code above). If the bracket is followed by a quantifier with + zero repeat, we have to back off. 
Hence the definition of zeroreqbyte and + zerofirstbyte outside the main loop so that they can be accessed for the + back off. */ + + zeroreqbyte = reqbyte; + zerofirstbyte = firstbyte; + groupsetfirstbyte = FALSE; + + if (bravalue >= OP_ONCE) + { + /* If we have not yet set a firstbyte in this branch, take it from the + subpattern, remembering that it was set here so that a repeat of more + than one can replicate it as reqbyte if necessary. If the subpattern has + no firstbyte, set "none" for the whole branch. In both cases, a zero + repeat forces firstbyte to "none". */ + + if (firstbyte == REQ_UNSET) + { + if (subfirstbyte >= 0) + { + firstbyte = subfirstbyte; + groupsetfirstbyte = TRUE; + } + else firstbyte = REQ_NONE; + zerofirstbyte = REQ_NONE; + } + + /* If firstbyte was previously set, convert the subpattern's firstbyte + into reqbyte if there wasn't one, using the vary flag that was in + existence beforehand. */ + + else if (subfirstbyte >= 0 && subreqbyte < 0) + subreqbyte = subfirstbyte | tempreqvary; + + /* If the subpattern set a required byte (or set a first byte that isn't + really the first byte - see above), set it. */ + + if (subreqbyte >= 0) reqbyte = subreqbyte; + } + + /* For a forward assertion, we take the reqbyte, if set. This can be + helpful if the pattern that follows the assertion doesn't set a different + char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte + for an assertion, however because it leads to incorrect effect for patterns + such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead + of a firstbyte. This is overcome by a scan at the end if there's no + firstbyte, looking for an asserted first char. */ + + else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte; + break; /* End of processing '(' */ + + + /* ===================================================================*/ + /* Handle metasequences introduced by \. 
For ones like \d, the ESC_ values + are arranged to be the negation of the corresponding OP_values. For the + back references, the values are ESC_REF plus the reference number. Only + back references and those types that consume a character may be repeated. + We can test for values between ESC_b and ESC_Z for the latter; this may + have to change if any new ones are ever created. */ + + case '\\': + tempptr = ptr; + c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE); + if (*errorcodeptr != 0) goto FAILED; + + if (c < 0) + { + if (-c == ESC_Q) /* Handle start of quoted string */ + { + if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */ + else inescq = TRUE; + continue; + } + + if (-c == ESC_E) continue; /* Perl ignores an orphan \E */ + + /* For metasequences that actually match a character, we disable the + setting of a first character if it hasn't already been set. */ + + if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z) + firstbyte = REQ_NONE; + + /* Set values to reset to if this is followed by a zero repeat. */ + + zerofirstbyte = firstbyte; + zeroreqbyte = reqbyte; + + /* \k or \k'name' is a back reference by name (Perl syntax) */ + + if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'')) + { + is_recurse = FALSE; + terminator = (*(++ptr) == '<')? '>' : '\''; + goto NAMED_REF_OR_RECURSE; + } + + /* Back references are handled specially; must disable firstbyte if + not set to cope with cases like (?=(\w+))\1: which would otherwise set + ':' later. */ + + if (-c >= ESC_REF) + { + recno = -c - ESC_REF; + + HANDLE_REFERENCE: /* Come here from named backref handling */ + if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; + previous = code; + *code++ = OP_REF; + PUT2INC(code, 0, recno); + cd->backref_map |= (recno < 32)? (1 << recno) : 1; + if (recno > cd->top_backref) cd->top_backref = recno; + } + + /* So are Unicode property matches, if supported. 
*/ + +#ifdef SUPPORT_UCP + else if (-c == ESC_P || -c == ESC_p) + { + BOOL negated; + int pdata; + int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); + if (ptype < 0) goto FAILED; + previous = code; + *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP; + *code++ = ptype; + *code++ = pdata; + } +#else + + /* If Unicode properties are not supported, \X, \P, and \p are not + allowed. */ + + else if (-c == ESC_X || -c == ESC_P || -c == ESC_p) + { + *errorcodeptr = ERR45; + goto FAILED; + } +#endif + + /* For the rest (including \X when Unicode properties are supported), we + can obtain the OP value by negating the escape value. */ + + else + { + previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; + *code++ = -c; + } + continue; + } + + /* We have a data character whose value is in c. In UTF-8 mode it may have + a value > 127. We set its representation in the length/buffer, and then + handle it as a data character. */ + +#ifdef SUPPORT_UTF8 + if (utf8 && c > 127) + mclength = _pcre_ord2utf8(c, mcbuffer); + else +#endif + + { + mcbuffer[0] = c; + mclength = 1; + } + goto ONE_CHAR; + + + /* ===================================================================*/ + /* Handle a literal character. It is guaranteed not to be whitespace or # + when the extended flag is set. If we are in UTF-8 mode, it may be a + multi-byte literal character. */ + + default: + NORMAL_CHAR: + mclength = 1; + mcbuffer[0] = c; + +#ifdef SUPPORT_UTF8 + if (utf8 && c >= 0xc0) + { + while ((ptr[1] & 0xc0) == 0x80) + mcbuffer[mclength++] = *(++ptr); + } +#endif + + /* At this point we have the character's bytes in mcbuffer, and the length + in mclength. When not in UTF-8 mode, the length is always 1. */ + + ONE_CHAR: + previous = code; + *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR; + for (c = 0; c < mclength; c++) *code++ = mcbuffer[c]; + + /* Set the first and required bytes appropriately. 
If no previous first + byte, set it from this character, but revert to none on a zero repeat. + Otherwise, leave the firstbyte value alone, and don't change it on a zero + repeat. */ + + if (firstbyte == REQ_UNSET) + { + zerofirstbyte = REQ_NONE; + zeroreqbyte = reqbyte; + + /* If the character is more than one byte long, we can set firstbyte + only if it is not to be matched caselessly. */ + + if (mclength == 1 || req_caseopt == 0) + { + firstbyte = mcbuffer[0] | req_caseopt; + if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt; + } + else firstbyte = reqbyte = REQ_NONE; + } + + /* firstbyte was previously set; we can set reqbyte only the length is + 1 or the matching is caseful. */ + + else + { + zerofirstbyte = firstbyte; + zeroreqbyte = reqbyte; + if (mclength == 1 || req_caseopt == 0) + reqbyte = code[-1] | req_caseopt | cd->req_varyopt; + } + + break; /* End of literal character handling */ + } + } /* end of big loop */ + + +/* Control never reaches here by falling through, only by a goto for all the +error states. Pass back the position in the pattern so that it can be displayed +to the user for diagnosing the error. */ + +FAILED: +*ptrptr = ptr; +return FALSE; +} + + + + +/************************************************* +* Compile sequence of alternatives * +*************************************************/ + +/* On entry, ptr is pointing past the bracket character, but on return it +points to the closing bracket, or vertical bar, or end of string. The code +variable is pointing at the byte into which the BRA operator has been stored. +If the ims options are changed at the start (for a (?ims: group) or during any +branch, we need to insert an OP_OPT item at the start of every following branch +to ensure they get set correctly at run time, and also pass the new options +into every subsequent branch compile. 
+ +This function is used during the pre-compile phase when we are trying to find +out the amount of memory needed, as well as during the real compile phase. The +value of lengthptr distinguishes the two phases. + +Argument: + options option bits, including any changes for this subpattern + oldims previous settings of ims option bits + codeptr -> the address of the current code pointer + ptrptr -> the address of the current pattern pointer + errorcodeptr -> pointer to error code variable + lookbehind TRUE if this is a lookbehind assertion + skipbytes skip this many bytes at start (for brackets and OP_COND) + firstbyteptr place to put the first required character, or a negative number + reqbyteptr place to put the last required character, or a negative number + bcptr pointer to the chain of currently open branches + cd points to the data block with tables pointers etc. + lengthptr NULL during the real compile phase + points to length accumulator during pre-compile phase + +Returns: TRUE on success +*/ + +static BOOL +compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr, + int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr, + int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr) +{ +const uschar *ptr = *ptrptr; +uschar *code = *codeptr; +uschar *last_branch = code; +uschar *start_bracket = code; +uschar *reverse_count = NULL; +int firstbyte, reqbyte; +int branchfirstbyte, branchreqbyte; +int length; +branch_chain bc; + +bc.outer = bcptr; +bc.current = code; + +firstbyte = reqbyte = REQ_UNSET; + +/* Accumulate the length for use in the pre-compile phase. Start with the +length of the BRA and KET and any extra bytes that are required at the +beginning. We accumulate in a local variable to save frequent testing of +lengthptr for NULL.
We cannot do this by looking at the value of code at the +start and end of each alternative, because compiled items are discarded during +the pre-compile phase so that the work space is not exceeded. */ + +length = 2 + 2*LINK_SIZE + skipbytes; + +/* WARNING: If the above line is changed for any reason, you must also change +the code that abstracts option settings at the start of the pattern and makes +them global. It tests the value of length for (2 + 2*LINK_SIZE) in the +pre-compile phase to find out whether anything has yet been compiled or not. */ + +/* Offset is set to zero to mark that this bracket is still open */ + +PUT(code, 1, 0); +code += 1 + LINK_SIZE + skipbytes; + +/* Loop for each alternative branch */ + +for (;;) + { + /* Handle a change of ims options at the start of the branch */ + + if ((options & PCRE_IMS) != oldims) + { + *code++ = OP_OPT; + *code++ = options & PCRE_IMS; + length += 2; + } + + /* Set up dummy OP_REVERSE if lookbehind assertion; reverse_count records where its length field is so that it can be filled in once the branch has been compiled */ + + if (lookbehind) + { + *code++ = OP_REVERSE; + reverse_count = code; + PUTINC(code, 0, 0); + length += 1 + LINK_SIZE; + } + + /* Now compile the branch; in the pre-compile phase its length gets added + into the length. */ + + if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte, + &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length)) + { + *ptrptr = ptr; + return FALSE; + } + + /* In the real compile phase, there is some post-processing to be done. */ + + if (lengthptr == NULL) + { + /* If this is the first branch (last_branch still points at the opening bracket rather than at an OP_ALT), the firstbyte and reqbyte values for the + branch become the values for the regex. */ + + if (*last_branch != OP_ALT) + { + firstbyte = branchfirstbyte; + reqbyte = branchreqbyte; + } + + /* If this is not the first branch, the first char and reqbyte have to + match the values from all the previous branches, except that if the + previous value for reqbyte didn't have REQ_VARY set, it can still match, + and we set REQ_VARY for the regex.
*/ + + else + { + /* If we previously had a firstbyte, but it doesn't match the new branch, + we have to abandon the firstbyte for the regex, but if there was + previously no reqbyte, it takes on the value of the old firstbyte. */ + + if (firstbyte >= 0 && firstbyte != branchfirstbyte) + { + if (reqbyte < 0) reqbyte = firstbyte; + firstbyte = REQ_NONE; + } + + /* If we (now or from before) have no firstbyte, a firstbyte from the + branch becomes a reqbyte if there isn't a branch reqbyte. */ + + if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0) + branchreqbyte = branchfirstbyte; + + /* Now ensure that the reqbytes match, ignoring the REQ_VARY flag in the comparison */ + + if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY)) + reqbyte = REQ_NONE; + else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */ + } + + /* If lookbehind, check that this branch matches a fixed-length string, and + put the length into the OP_REVERSE item. Temporarily mark the end of the + branch with OP_END. */ + + if (lookbehind) + { + int fixed_length; + *code = OP_END; + fixed_length = find_fixedlength(last_branch, options); + DPRINTF(("fixed length = %d\n", fixed_length)); + if (fixed_length < 0) + { + *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25; + *ptrptr = ptr; + return FALSE; + } + PUT(reverse_count, 0, fixed_length); + } + } + + /* Reached end of expression, either ')' or end of pattern. Go back through + the alternative branches and reverse the chain of offsets, with the field in + the BRA item now becoming an offset to the first alternative. If there are + no alternatives, it points to the end of the group. The length in the + terminating ket is always the length of the whole bracketed item. If any of + the ims options were changed inside the group, compile a resetting op-code + following, except at the very end of the pattern. Return leaving the pointer + at the terminating char.
*/ + + if (*ptr != '|') + { + int branch_length = code - last_branch; + do + { + int prev_length = GET(last_branch, 1); + PUT(last_branch, 1, branch_length); + branch_length = prev_length; + last_branch -= branch_length; + } + while (branch_length > 0); + + /* Fill in the ket; its link field is the distance back to the start of the bracket */ + + *code = OP_KET; + PUT(code, 1, code - start_bracket); + code += 1 + LINK_SIZE; + + /* Compile an OP_OPT that restores the previous ims options if they were changed and the group was closed by ')' rather than by the end of the pattern */ + + if ((options & PCRE_IMS) != oldims && *ptr == ')') + { + *code++ = OP_OPT; + *code++ = oldims; + length += 2; + } + + /* Set values to pass back; the accumulated length is added to *lengthptr only in the pre-compile phase */ + + *codeptr = code; + *ptrptr = ptr; + *firstbyteptr = firstbyte; + *reqbyteptr = reqbyte; + if (lengthptr != NULL) *lengthptr += length; + return TRUE; + } + + /* Another branch follows; insert an "or" node. Its length field points back + to the previous branch while the bracket remains open. At the end the chain + is reversed. It's done like this so that the start of the bracket has a + zero offset until it is closed, making it possible to detect recursion. */ + + *code = OP_ALT; + PUT(code, 1, code - last_branch); + bc.current = last_branch = code; + code += 1 + LINK_SIZE; + ptr++; + length += 1 + LINK_SIZE; + } +/* Control never reaches here */ +} + + + + +/************************************************* +* Check for anchored expression * +*************************************************/ + +/* Try to find out if this is an anchored regular expression. Consider each +alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket +all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then +it's anchored. However, if this is a multiline pattern, then only OP_SOD +counts, since OP_CIRC can match in the middle. + +We can also consider a regex to be anchored if OP_SOM starts all its branches. +This is the code for \G, which means "match at start of match position, taking +into account the match offset".
+ +A branch is also implicitly anchored if it starts with .* and DOTALL is set, +because that will try the rest of the pattern at all possible matching points, +so there is no point trying again.... er .... + +.... except when the .* appears inside capturing parentheses, and there is a +subsequent back reference to those parentheses. We haven't enough information +to catch that case precisely. + +At first, the best we could do was to detect when .* was in capturing brackets +and the highest back reference was greater than or equal to that level. +However, by keeping a bitmap of the first 31 back references, we can catch some +of the more common cases more precisely. + +Arguments: + code points to start of expression (the bracket) + options points to the options setting + bracket_map a bitmap of which brackets we are inside while testing; this + handles up to substring 31; after that we just have to take + the less precise approach + backref_map the back reference bitmap + +Returns: TRUE or FALSE +*/ + +static BOOL +is_anchored(register const uschar *code, int *options, unsigned int bracket_map, + unsigned int backref_map) +{ +do { + const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code], + options, PCRE_MULTILINE, FALSE); + register int op = *scode; + + /* Non-capturing brackets */ + + if (op == OP_BRA) + { + if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; + } + + /* Capturing brackets */ + + else if (op == OP_CBRA) + { + int n = GET2(scode, 1+LINK_SIZE); + int new_map = bracket_map | ((n < 32)? (1 << n) : 1); + if (!is_anchored(scode, options, new_map, backref_map)) return FALSE; + } + + /* Other brackets */ + + else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND) + { + if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; + } + + /* .* is not anchored unless DOTALL is set and it isn't in brackets that + are or may be referenced.
*/ + + else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || + op == OP_TYPEPOSSTAR) && + (*options & PCRE_DOTALL) != 0) + { + if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE; + } + + /* Check for explicit anchoring: OP_SOD and OP_SOM always qualify, OP_CIRC only when PCRE_MULTILINE is not set; any other opcode means this branch is not anchored */ + + else if (op != OP_SOD && op != OP_SOM && + ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC)) + return FALSE; + code += GET(code, 1); /* Move on to the next alternative */ + } +while (*code == OP_ALT); /* Loop for each alternative */ +return TRUE; +} + + + +/************************************************* +* Check for starting with ^ or .* * +*************************************************/ + +/* This is called to find out if every branch starts with ^ or .* so that +"first char" processing can be done to speed things up in multiline +matching and for non-DOTALL patterns that start with .* (which must start at +the beginning or after \n). As in the case of is_anchored() (see above), we +have to take account of back references to capturing brackets that contain .* +because in that case we can't make the assumption. + +Arguments: + code points to start of expression (the bracket) + bracket_map a bitmap of which brackets we are inside while testing; this + handles up to substring 31; after that we just have to take + the less precise approach + backref_map the back reference bitmap + +Returns: TRUE if every alternative starts (possibly inside nested brackets) with ^ or with a .* that is outside any possibly back-referenced capturing group; FALSE otherwise +*/ + +static BOOL +is_startline(const uschar *code, unsigned int bracket_map, + unsigned int backref_map) +{ +do { + const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code], + NULL, 0, FALSE); + register int op = *scode; + + /* Non-capturing brackets: every alternative inside must pass the same test */ + + if (op == OP_BRA) + { + if (!is_startline(scode, bracket_map, backref_map)) return FALSE; + } + + /* Capturing brackets: add this group to the bracket map before recursing; bit 0 stands in for every group numbered 32 or more. NOTE(review): (1 << n) with n == 31 shifts into the sign bit if int is 32 bits wide; an unsigned constant would avoid that - confirm before changing */ + + else if (op == OP_CBRA) + { + int n = GET2(scode, 1+LINK_SIZE); + int new_map = bracket_map | ((n < 32)?
(1 << n) : 1); + if (!is_startline(scode, new_map, backref_map)) return FALSE; + } + + /* Other brackets */ + + else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND) + { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; } + + /* .* means "start at start or after \n" if it isn't in brackets that + may be referenced. */ + + else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) + { + if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE; + } + + /* Check for explicit circumflex */ + + else if (op != OP_CIRC) return FALSE; + + /* Move on to the next alternative */ + + code += GET(code, 1); + } +while (*code == OP_ALT); /* Loop for each alternative */ +return TRUE; +} + + + +/************************************************* +* Check for asserted fixed first char * +*************************************************/ + +/* During compilation, the "first char" settings from forward assertions are +discarded, because they can cause conflicts with actual literals that follow. +However, if we end up without a first char setting for an unanchored pattern, +it is worth scanning the regex to see if there is an initial asserted first +char. If all branches start with the same asserted char, or with a bracket all +of whose alternatives start with the same asserted char (recurse ad lib), then +we return that char, otherwise -1. 
+ +Arguments: + code points to start of expression (the bracket) + options pointer to the options (used to check casing changes) + inassert TRUE if in an assertion + +Returns: -1 or the fixed first char +*/ + +static int +find_firstassertedchar(const uschar *code, int *options, BOOL inassert) +{ +register int c = -1; +do { + int d; + const uschar *scode = + first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE); + register int op = *scode; + + switch(op) + { + default: + return -1; + + case OP_BRA: + case OP_CBRA: + case OP_ASSERT: + case OP_ONCE: + case OP_COND: + if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0) + return -1; + if (c < 0) c = d; else if (c != d) return -1; + break; + + case OP_EXACT: /* Fall through */ + scode += 2; + + case OP_CHAR: + case OP_CHARNC: + case OP_PLUS: + case OP_MINPLUS: + case OP_POSPLUS: + if (!inassert) return -1; + if (c < 0) + { + c = scode[1]; + if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS; + } + else if (c != scode[1]) return -1; + break; + } + + code += GET(code, 1); + } +while (*code == OP_ALT); +return c; +} + + + +/************************************************* +* Compile a Regular Expression * +*************************************************/ + +/* This function takes a string and returns a pointer to a block of store +holding a compiled version of the expression. The original API for this +function had no error code return variable; it is retained for backwards +compatibility. The new function is given a new name. 
+
+Arguments:
+  pattern       the regular expression
+  options       various option bits
+  errorcodeptr  pointer to error code variable (pcre_compile2() only)
+                  can be NULL if you don't want a code value
+  errorptr      pointer to pointer to error text
+  erroroffset   ptr offset in pattern where error was detected
+  tables        pointer to character tables or NULL
+
+Returns:        pointer to compiled data block, or NULL on error,
+                with errorptr and erroroffset set
+*/
+
+PCRE_DATA_SCOPE pcre *
+pcre_compile(const char *pattern, int options, const char **errorptr,
+  int *erroroffset, const unsigned char *tables)
+{
+return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
+}
+
+
+/* The compile proper. It runs compile_regex() twice: once into a stack
+workspace purely to compute the length of the compiled code, and once into the
+block obtained from pcre_malloc. All failures leave through the labels inside
+the "if (errorcode != 0)" block further down. A NULL errorptr yields NULL with
+no message; a NULL erroroffset yields NULL with the ERR16 message and without
+any store through erroroffset. */
+
+PCRE_DATA_SCOPE pcre *
+pcre_compile2(const char *pattern, int options, int *errorcodeptr,
+  const char **errorptr, int *erroroffset, const unsigned char *tables)
+{
+real_pcre *re;
+int length = 1;  /* For final END opcode */
+int firstbyte, reqbyte, newline;
+int errorcode = 0;
+#ifdef SUPPORT_UTF8
+BOOL utf8;
+#endif
+size_t size;
+uschar *code;
+const uschar *codestart;
+const uschar *ptr;
+compile_data compile_block;
+compile_data *cd = &compile_block;
+
+/* This space is used for "compiling" into during the first phase, when we are
+computing the amount of memory that is needed. Compiled items are thrown away
+as soon as possible, so that a fairly large buffer should be sufficient for
+this purpose. The same space is used in the second phase for remembering where
+to fill in forward references to subpatterns. */
+
+uschar cworkspace[COMPILE_WORK_SIZE];
+
+
+/* Set this early so that early errors get offset 0. */
+
+ptr = (const uschar *)pattern;
+
+/* We can't pass back an error message if errorptr is NULL; I guess the best we
+can do is just return NULL, but we can set a code value if there is a code
+pointer. */
+
+if (errorptr == NULL)
+  {
+  if (errorcodeptr != NULL) *errorcodeptr = 99;
+  return NULL;
+  }
+
+*errorptr = NULL;
+if (errorcodeptr != NULL) *errorcodeptr = ERR0;
+
+/* However, we can give a message for this error. The jump must land after the
+store through erroroffset in the error exit, because erroroffset is NULL
+here. */
+
+if (erroroffset == NULL)
+  {
+  errorcode = ERR16;
+  goto PCRE_EARLY_ERROR_RETURN2;
+  }
+
+*erroroffset = 0;
+
+/* Can't support UTF8 unless PCRE has been compiled to include the code. */
+
+#ifdef SUPPORT_UTF8
+utf8 = (options & PCRE_UTF8) != 0;
+if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
+     (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
+  {
+  errorcode = ERR44;
+  goto PCRE_UTF8_ERROR_RETURN;
+  }
+#else
+if ((options & PCRE_UTF8) != 0)
+  {
+  errorcode = ERR32;
+  goto PCRE_EARLY_ERROR_RETURN;
+  }
+#endif
+
+if ((options & ~PUBLIC_OPTIONS) != 0)
+  {
+  errorcode = ERR17;
+  goto PCRE_EARLY_ERROR_RETURN;
+  }
+
+/* Set up pointers to the individual character tables */
+
+if (tables == NULL) tables = _pcre_default_tables;
+cd->lcc = tables + lcc_offset;
+cd->fcc = tables + fcc_offset;
+cd->cbits = tables + cbits_offset;
+cd->ctypes = tables + ctypes_offset;
+
+/* Handle different types of newline. The three bits give seven cases. The
+current code allows for fixed one- or two-byte sequences, plus "any". */
+
+switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
+  {
+  case 0: newline = NEWLINE; break;   /* Compile-time default */
+  case PCRE_NEWLINE_CR: newline = '\r'; break;
+  case PCRE_NEWLINE_LF: newline = '\n'; break;
+  case PCRE_NEWLINE_CR+
+       PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
+  case PCRE_NEWLINE_ANY: newline = -1; break;
+  default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
+  }
+
+if (newline < 0)
+  {
+  cd->nltype = NLTYPE_ANY;
+  }
+else
+  {
+  cd->nltype = NLTYPE_FIXED;
+  if (newline > 255)
+    {
+    cd->nllen = 2;
+    cd->nl[0] = (newline >> 8) & 255;
+    cd->nl[1] = newline & 255;
+    }
+  else
+    {
+    cd->nllen = 1;
+    cd->nl[0] = newline;
+    }
+  }
+
+/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
+references to help in deciding whether (.*) can be treated as anchored or not.
+*/
+
+cd->top_backref = 0;
+cd->backref_map = 0;
+
+/* Reflect pattern for debugging output */
+
+DPRINTF(("------------------------------------------------------------------\n"));
+DPRINTF(("%s\n", pattern));
+
+/* Pretend to compile the pattern while actually just accumulating the length
+of memory required. This behaviour is triggered by passing a non-NULL final
+argument to compile_regex(). We pass a block of workspace (cworkspace) for it
+to compile parts of the pattern into; the compiled code is discarded when it is
+no longer needed, so hopefully this workspace will never overflow, though there
+is a test for its doing so. */
+
+cd->bracount = 0;
+cd->names_found = 0;
+cd->name_entry_size = 0;
+cd->name_table = NULL;
+cd->start_workspace = cworkspace;
+cd->start_code = cworkspace;
+cd->hwm = cworkspace;
+cd->start_pattern = (const uschar *)pattern;
+cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
+cd->req_varyopt = 0;
+cd->nopartial = FALSE;
+cd->external_options = options;
+
+/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
+don't need to look at the result of the function here. The initial options have
+been put into the cd block so that they can be changed if an option setting is
+found within the regex right at the beginning. Bringing initial option settings
+outside can help speed up starting point checks. */
+
+code = cworkspace;
+*code = OP_BRA;
+(void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
+  &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);
+if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
+
+DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
+  cd->hwm - cworkspace));
+
+if (length > MAX_PATTERN_SIZE)
+  {
+  errorcode = ERR20;
+  goto PCRE_EARLY_ERROR_RETURN;
+  }
+
+/* Compute the size of data block needed and get it, either from malloc or
+externally provided function. Integer overflow should no longer be possible
+because nowadays we limit the maximum value of cd->names_found and
+cd->name_entry_size. */
+
+size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
+re = (real_pcre *)(pcre_malloc)(size);
+
+if (re == NULL)
+  {
+  errorcode = ERR21;
+  goto PCRE_EARLY_ERROR_RETURN;
+  }
+
+/* Put in the magic number, and save the sizes, initial options, and character
+table pointer. NULL is used for the default character tables. The nullpad field
+is at the end; it's there to help in the case when a regex compiled on a system
+with 4-byte pointers is run on another with 8-byte pointers. */
+
+re->magic_number = MAGIC_NUMBER;
+re->size = size;
+re->options = cd->external_options;
+re->dummy1 = 0;
+re->first_byte = 0;
+re->req_byte = 0;
+re->name_table_offset = sizeof(real_pcre);
+re->name_entry_size = cd->name_entry_size;
+re->name_count = cd->names_found;
+re->ref_count = 0;
+re->tables = (tables == _pcre_default_tables)? NULL : tables;
+re->nullpad = NULL;
+
+/* The starting points of the name/number translation table and of the code are
+passed around in the compile data block. The start/end pattern and initial
+options are already set from the pre-compile phase, as is the name_entry_size
+field. Reset the bracket count and the names_found field. Also reset the hwm
+field; this time it's used for remembering forward references to subpatterns.
+*/
+
+cd->bracount = 0;
+cd->names_found = 0;
+cd->name_table = (uschar *)re + re->name_table_offset;
+codestart = cd->name_table + re->name_entry_size * re->name_count;
+cd->start_code = codestart;
+cd->hwm = cworkspace;
+cd->req_varyopt = 0;
+cd->nopartial = FALSE;
+
+/* Set up a starting, non-extracting bracket, then compile the expression. On
+error, errorcode will be set non-zero, so we don't need to look at the result
+of the function here. */
+
+ptr = (const uschar *)pattern;
+code = (uschar *)codestart;
+*code = OP_BRA;
+(void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
+  &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
+re->top_bracket = cd->bracount;
+re->top_backref = cd->top_backref;
+
+if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
+
+/* If not reached end of pattern on success, there's an excess bracket. */
+
+if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
+
+/* Fill in the terminating state and check for disastrous overflow, but
+if debugging, leave the test till after things are printed out. */
+
+*code++ = OP_END;
+
+#ifndef DEBUG
+if (code - codestart > length) errorcode = ERR23;
+#endif
+
+/* Fill in any forward references that are required. */
+
+while (errorcode == 0 && cd->hwm > cworkspace)
+  {
+  int offset, recno;
+  const uschar *groupptr;
+  cd->hwm -= LINK_SIZE;
+  offset = GET(cd->hwm, 0);
+  recno = GET(codestart, offset);
+  groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
+  if (groupptr == NULL) errorcode = ERR53;
+    else PUT(((uschar *)codestart), offset, groupptr - codestart);
+  }
+
+/* Give an error if there's back reference to a non-existent capturing
+subpattern. */
+
+if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
+
+/* Failed to compile, or error while post-processing. The labels below are
+entry points for errors detected before "re" was allocated: they skip the
+pcre_free() call. PCRE_UTF8_ERROR_RETURN and PCRE_EARLY_ERROR_RETURN2
+additionally skip the store through erroroffset, the former because the
+offset has already been set by the UTF-8 check, the latter because
+erroroffset is NULL. */
+
+if (errorcode != 0)
+  {
+  (pcre_free)(re);
+  PCRE_EARLY_ERROR_RETURN:
+  *erroroffset = ptr - (const uschar *)pattern;
+#ifdef SUPPORT_UTF8
+  PCRE_UTF8_ERROR_RETURN:
+#endif
+  PCRE_EARLY_ERROR_RETURN2:
+  *errorptr = error_texts + error_texts_offsets[errorcode];
+  if (errorcodeptr != NULL) *errorcodeptr = errorcode;
+  return NULL;
+  }
+
+/* If the anchored option was not passed, set the flag if we can determine that
+the pattern is anchored by virtue of ^ characters or \A or anything else (such
+as starting with .* when DOTALL is set).
+
+Otherwise, if we know what the first byte has to be, save it, because that
+speeds up unanchored matches no end. If not, see if we can set the
+PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
+start with ^. and also when all branches start with .* for non-DOTALL matches.
+*/
+
+if ((re->options & PCRE_ANCHORED) == 0)
+  {
+  int temp_options = re->options;   /* May get changed during these scans */
+  if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
+    re->options |= PCRE_ANCHORED;
+  else
+    {
+    if (firstbyte < 0)
+      firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
+    if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
+      {
+      int ch = firstbyte & 255;
+      re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
+         cd->fcc[ch] == ch)? ch : firstbyte;
+      re->options |= PCRE_FIRSTSET;
+      }
+    else if (is_startline(codestart, 0, cd->backref_map))
+      re->options |= PCRE_STARTLINE;
+    }
+  }
+
+/* For an anchored pattern, we use the "required byte" only if it follows a
+variable length item in the regex. Remove the caseless flag for non-caseable
+bytes. */
+
+if (reqbyte >= 0 &&
+     ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
+  {
+  int ch = reqbyte & 255;
+  re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
+    cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
+  re->options |= PCRE_REQCHSET;
+  }
+
+/* Print out the compiled data if debugging is enabled. This is never the
+case when building a production library. */
+
+#ifdef DEBUG
+
+printf("Length = %d top_bracket = %d top_backref = %d\n",
+  length, re->top_bracket, re->top_backref);
+
+if (re->options != 0)
+  {
+  printf("%s%s%s%s%s%s%s%s%s\n",
+    ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
+    ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
+    ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
+    ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
+    ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
+    ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
+    ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
+    ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
+    ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
+  }
+
+if ((re->options & PCRE_FIRSTSET) != 0)
+  {
+  int ch = re->first_byte & 255;
+  const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
+    "" : " (caseless)";
+  if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
+    else printf("First char = \\x%02x%s\n", ch, caseless);
+  }
+
+if ((re->options & PCRE_REQCHSET) != 0)
+  {
+  int ch = re->req_byte & 255;
+  const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
+    "" : " (caseless)";
+  if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
+    else printf("Req char = \\x%02x%s\n", ch, caseless);
+  }
+
+pcre_printint(re, stdout);
+
+/* This check is done here in the debugging case so that the code that
+was compiled can be seen. */
+
+if (code - codestart > length)
+  {
+  (pcre_free)(re);
+  *errorptr = error_texts + error_texts_offsets[ERR23];
+  *erroroffset = ptr - (uschar *)pattern;
+  if (errorcodeptr != NULL) *errorcodeptr = ERR23;
+  return NULL;
+  }
+#endif   /* DEBUG */
+
+return (pcre *)re;
+}
+
+/* End of pcre_compile.c */
diff --git a/glib/pcre/pcre_config.c b/glib/pcre/pcre_config.c
new file mode 100644
index 0000000..29e6c1a
--- /dev/null
+++ b/glib/pcre/pcre_config.c
@@ -0,0 +1,116 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+           Copyright (c) 1997-2006 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This module contains the external function pcre_config(). */
+
+
+#include "pcre_internal.h"
+
+
+/*************************************************
+* Return info about what features are configured *
+*************************************************/
+
+/* Reports one build-time configuration value of the library. The set of
+recognised requests can grow without breaking existing callers.
+
+The value is stored through "where" as an int, except for the two match-limit
+requests, which store an unsigned int. Nothing is stored for an unrecognised
+request.
+
+Arguments:
+  what             what information is required
+  where            where to put the information
+
+Returns:           0 if data returned, negative on error
+*/
+
+PCRE_DATA_SCOPE int
+pcre_config(int what, void *where)
+{
+/* Resolve the compile-time feature switches once, up front. */
+
+#ifdef SUPPORT_UTF8
+const int has_utf8 = 1;
+#else
+const int has_utf8 = 0;
+#endif
+
+#ifdef SUPPORT_UCP
+const int has_ucp = 1;
+#else
+const int has_ucp = 0;
+#endif
+
+#ifdef NO_RECURSE
+const int stack_recurse = 0;
+#else
+const int stack_recurse = 1;
+#endif
+
+/* Dispatch on the request; anything unrecognised is rejected before any
+store is made. */
+
+if (what == PCRE_CONFIG_UTF8)
+  *((int *)where) = has_utf8;
+else if (what == PCRE_CONFIG_UNICODE_PROPERTIES)
+  *((int *)where) = has_ucp;
+else if (what == PCRE_CONFIG_NEWLINE)
+  *((int *)where) = NEWLINE;
+else if (what == PCRE_CONFIG_LINK_SIZE)
+  *((int *)where) = LINK_SIZE;
+else if (what == PCRE_CONFIG_POSIX_MALLOC_THRESHOLD)
+  *((int *)where) = POSIX_MALLOC_THRESHOLD;
+else if (what == PCRE_CONFIG_MATCH_LIMIT)
+  *((unsigned int *)where) = MATCH_LIMIT;
+else if (what == PCRE_CONFIG_MATCH_LIMIT_RECURSION)
+  *((unsigned int *)where) = MATCH_LIMIT_RECURSION;
+else if (what == PCRE_CONFIG_STACKRECURSE)
+  *((int *)where) = stack_recurse;
+else
+  return PCRE_ERROR_BADOPTION;
+
+return 0;
+}
+
+/* End of pcre_config.c */
diff --git a/glib/pcre/pcre_dfa_exec.c b/glib/pcre/pcre_dfa_exec.c
new file mode 100644
index 0000000..5f1301f
--- /dev/null
+++ b/glib/pcre/pcre_dfa_exec.c
@@ -0,0 +1,2433 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+           Copyright (c) 1997-2006 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This module contains the external function pcre_dfa_exec(), which is an
+alternative matching function that uses a sort of DFA algorithm (not a true
+FSM). This is NOT Perl-compatible, but it has advantages in certain
+applications. */
+
+
+#define NLBLOCK md             /* Block containing newline information */
+#define PSSTART start_subject  /* Field containing processed string start */
+#define PSEND   end_subject    /* Field containing processed string end */
+
+#include "pcre_internal.h"
+
+
+/* For use to indent debugging output */
+
+#define SP " "
+
+
+
+/*************************************************
+*      Code parameters and static tables         *
+*************************************************/
+
+/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
+into others, under special conditions. A gap of 20 between the blocks should be
+enough. */
+
+#define OP_PROP_EXTRA 100
+#define OP_EXTUNI_EXTRA 120
+#define OP_ANYNL_EXTRA 140
+
+
+/* This table identifies those opcodes that are followed immediately by a
+character that is to be tested in some way. This makes it possible to
+centralize the loading of these characters. In the case of Type * etc, the
+"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
+small value. The table has one entry per opcode, in the order named by the
+comment on each row; an entry of 0 marks an opcode with no such character. */
+
+static uschar coptable[] = {
+  0,                             /* End                                    */
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */
+  0, 0,                          /* Any, Anybyte                           */
+  0, 0, 0, 0,                    /* NOTPROP, PROP, EXTUNI, ANYNL           */
+  0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
+  1,                             /* Char                                   */
+  1,                             /* Charnc                                 */
+  1,                             /* not                                    */
+  /* Positive single-char repeats                                          */
+  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
+  3, 3, 3,                       /* upto, minupto, exact                   */
+  1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
+  /* Negative single-char repeats - only for chars < 256                   */
+  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
+  3, 3, 3,                       /* NOT upto, minupto, exact               */
+  1, 1, 1, 3,                    /* NOT *+, ++, ?+, upto+                  */
+  /* Positive type repeats                                                 */
+  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
+  3, 3, 3,                       /* Type upto, minupto, exact              */
+  1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
+  /* Character class & ref repeats                                         */
+  0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
+  0, 0,                          /* CRRANGE, CRMINRANGE                    */
+  0,                             /* CLASS                                  */
+  0,                             /* NCLASS                                 */
+  0,                             /* XCLASS - variable length               */
+  0,                             /* REF                                    */
+  0,                             /* RECURSE                                */
+  0,                             /* CALLOUT                                */
+  0,                             /* Alt                                    */
+  0,                             /* Ket                                    */
+  0,                             /* KetRmax                                */
+  0,                             /* KetRmin                                */
+  0,                             /* Assert                                 */
+  0,                             /* Assert not                             */
+  0,                             /* Assert behind                          */
+  0,                             /* Assert behind not                      */
+  0,                             /* Reverse                                */
+  0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
+  0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
+  0,                             /* CREF                                   */
+  0,                             /* RREF                                   */
+  0,                             /* DEF                                    */
+  0, 0                           /* BRAZERO, BRAMINZERO                    */
+};
+
+/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
+and \w */
+
+static uschar toptable1[] = {
+  0, 0, 0, 0, 0,
+  ctype_digit, ctype_digit,
+  ctype_space, ctype_space,
+  ctype_word,  ctype_word,
+  0                               /* OP_ANY */
+};
+
+static uschar toptable2[] = {
+  0, 0, 0, 0, 0,
+  ctype_digit, 0,
+  ctype_space, 0,
+  ctype_word,  0,
+  1                               /* OP_ANY */
+};
+
+
+/* Structure for holding data about a particular state, which is in effect the
+current data for an active path through the match tree. It must consist
+entirely of ints because the working vector we are passed, and which we put
+these structures in, is a vector of ints. */
+
+typedef struct stateblock {
+  int offset;                     /* Offset to opcode */
+  int count;                      /* Count for repeats */
+  int ims;                        /* ims flag bits */
+  int data;                       /* Some use extra data */
+} stateblock;
+
+#define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
+
+
+#ifdef DEBUG
+/*************************************************
+*             Print character string             *
+*************************************************/
+
+/* Character string printing function for debugging. Each byte for which
+isprint() is true is written as itself; any other byte is written as a
+backslash-x escape followed by two hex digits.
+
+Arguments:
+  p            points to string
+  length       number of bytes
+  f            where to print
+
+Returns:       nothing
+*/
+
+static void
+pchars(unsigned char *p, int length, FILE *f)
+{
+int c;
+while (length-- > 0)
+  {
+  if (isprint(c = *(p++)))
+    fprintf(f, "%c", c);
+  else
+    fprintf(f, "\\x%02x", c);
+  }
+}
+#endif
+
+
+
+/*************************************************
+*    Execute a Regular Expression - DFA engine   *
+*************************************************/
+
+/* This internal function applies a compiled pattern to a subject string,
+starting at a given point, using a DFA engine. This function is called from the
+external one, possibly multiple times if the pattern is not anchored. The
+function calls itself recursively for some kinds of subpattern.
+ +Arguments: + md the match_data block with fixed information + this_start_code the opening bracket of this subexpression's code + current_subject where we currently are in the subject string + start_offset start offset in the subject string + offsets vector to contain the matching string offsets + offsetcount size of same + workspace vector of workspace + wscount size of same + ims the current ims flags + rlevel function call recursion level + recursing regex recursive call level + +Returns: > 0 => + = 0 => + -1 => failed to match + < -1 => some kind of unexpected problem + +The following macros are used for adding states to the two state vectors (one +for the current character, one for the following character). */ + +#define ADD_ACTIVE(x,y) \ + if (active_count++ < wscount) \ + { \ + next_active_state->offset = (x); \ + next_active_state->count = (y); \ + next_active_state->ims = ims; \ + next_active_state++; \ + DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \ + } \ + else return PCRE_ERROR_DFA_WSSIZE + +#define ADD_ACTIVE_DATA(x,y,z) \ + if (active_count++ < wscount) \ + { \ + next_active_state->offset = (x); \ + next_active_state->count = (y); \ + next_active_state->ims = ims; \ + next_active_state->data = (z); \ + next_active_state++; \ + DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \ + } \ + else return PCRE_ERROR_DFA_WSSIZE + +#define ADD_NEW(x,y) \ + if (new_count++ < wscount) \ + { \ + next_new_state->offset = (x); \ + next_new_state->count = (y); \ + next_new_state->ims = ims; \ + next_new_state++; \ + DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \ + } \ + else return PCRE_ERROR_DFA_WSSIZE + +#define ADD_NEW_DATA(x,y,z) \ + if (new_count++ < wscount) \ + { \ + next_new_state->offset = (x); \ + next_new_state->count = (y); \ + next_new_state->ims = ims; \ + next_new_state->data = (z); \ + next_new_state++; \ + DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \ + } \ 
+ else return PCRE_ERROR_DFA_WSSIZE + +/* And now, here is the code */ + +static int +internal_dfa_exec( + dfa_match_data *md, + const uschar *this_start_code, + const uschar *current_subject, + int start_offset, + int *offsets, + int offsetcount, + int *workspace, + int wscount, + int ims, + int rlevel, + int recursing) +{ +stateblock *active_states, *new_states, *temp_states; +stateblock *next_active_state, *next_new_state; + +const uschar *ctypes, *lcc, *fcc; +const uschar *ptr; +const uschar *end_code, *first_op; + +int active_count, new_count, match_count; + +/* Some fields in the md block are frequently referenced, so we load them into +independent variables in the hope that this will perform better. */ + +const uschar *start_subject = md->start_subject; +const uschar *end_subject = md->end_subject; +const uschar *start_code = md->start_code; + +#ifdef SUPPORT_UTF8 +BOOL utf8 = (md->poptions & PCRE_UTF8) != 0; +#else +BOOL utf8 = FALSE; +#endif + +rlevel++; +offsetcount &= (-2); + +wscount -= 2; +wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) / + (2 * INTS_PER_STATEBLOCK); + +DPRINTF(("\n%.*s---------------------\n" + "%.*sCall to internal_dfa_exec f=%d r=%d\n", + rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing)); + +ctypes = md->tables + ctypes_offset; +lcc = md->tables + lcc_offset; +fcc = md->tables + fcc_offset; + +match_count = PCRE_ERROR_NOMATCH; /* A negative number */ + +active_states = (stateblock *)(workspace + 2); +next_new_state = new_states = active_states + wscount; +new_count = 0; + +first_op = this_start_code + 1 + LINK_SIZE + + ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0); + +/* The first thing in any (sub) pattern is a bracket of some sort. Push all +the alternative states onto the list, and find out where the end is. This +makes is possible to use this function recursively, when we want to stop at a +matching internal ket rather than at the end. 
+ +If the first opcode in the first alternative is OP_REVERSE, we are dealing with +a backward assertion. In that case, we have to find out the maximum amount to +move back, and set up each alternative appropriately. */ + +if (*first_op == OP_REVERSE) + { + int max_back = 0; + int gone_back; + + end_code = this_start_code; + do + { + int back = GET(end_code, 2+LINK_SIZE); + if (back > max_back) max_back = back; + end_code += GET(end_code, 1); + } + while (*end_code == OP_ALT); + + /* If we can't go back the amount required for the longest lookbehind + pattern, go back as far as we can; some alternatives may still be viable. */ + +#ifdef SUPPORT_UTF8 + /* In character mode we have to step back character by character */ + + if (utf8) + { + for (gone_back = 0; gone_back < max_back; gone_back++) + { + if (current_subject <= start_subject) break; + current_subject--; + while (current_subject > start_subject && + (*current_subject & 0xc0) == 0x80) + current_subject--; + } + } + else +#endif + + /* In byte-mode we can do this quickly. */ + + { + gone_back = (current_subject - max_back < start_subject)? + current_subject - start_subject : max_back; + current_subject -= gone_back; + } + + /* Now we can process the individual branches. */ + + end_code = this_start_code; + do + { + int back = GET(end_code, 2+LINK_SIZE); + if (back <= gone_back) + { + int bstate = end_code - start_code + 2 + 2*LINK_SIZE; + ADD_NEW_DATA(-bstate, 0, gone_back - back); + } + end_code += GET(end_code, 1); + } + while (*end_code == OP_ALT); + } + +/* This is the code for a "normal" subpattern (not a backward assertion). The +start of a whole pattern is always one of these. If we are at the top level, +we may be asked to restart matching from the same point that we reached for a +previous partial match. We still have to scan through the top-level branches to +find the end state. 
*/ + +else + { + end_code = this_start_code; + + /* Restarting */ + + if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0) + { + do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT); + new_count = workspace[1]; + if (!workspace[0]) + memcpy(new_states, active_states, new_count * sizeof(stateblock)); + } + + /* Not restarting */ + + else + { + int length = 1 + LINK_SIZE + + ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0); + do + { + ADD_NEW(end_code - start_code + length, 0); + end_code += GET(end_code, 1); + length = 1 + LINK_SIZE; + } + while (*end_code == OP_ALT); + } + } + +workspace[0] = 0; /* Bit indicating which vector is current */ + +DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code)); + +/* Loop for scanning the subject */ + +ptr = current_subject; +for (;;) + { + int i, j; + int clen, dlen; + unsigned int c, d; + + /* Make the new state list into the active state list and empty the + new state list. */ + + temp_states = active_states; + active_states = new_states; + new_states = temp_states; + active_count = new_count; + new_count = 0; + + workspace[0] ^= 1; /* Remember for the restarting feature */ + workspace[1] = active_count; + +#ifdef DEBUG + printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP); + pchars((uschar *)ptr, strlen((char *)ptr), stdout); + printf("\"\n"); + + printf("%.*sActive states: ", rlevel*2-2, SP); + for (i = 0; i < active_count; i++) + printf("%d/%d ", active_states[i].offset, active_states[i].count); + printf("\n"); +#endif + + /* Set the pointers for adding new states */ + + next_active_state = active_states + active_count; + next_new_state = new_states; + + /* Load the current character from the subject outside the loop, as many + different states may want to look at it, and we assume that at least one + will. 
*/ + + if (ptr < end_subject) + { + clen = 1; /* Number of bytes in the character */ +#ifdef SUPPORT_UTF8 + if (utf8) { GETCHARLEN(c, ptr, clen); } else +#endif /* SUPPORT_UTF8 */ + c = *ptr; + } + else + { + clen = 0; /* This indicates the end of the subject */ + c = NOTACHAR; /* This value should never actually be used */ + } + + /* Scan up the active states and act on each one. The result of an action + may be to add more states to the currently active list (e.g. on hitting a + parenthesis) or it may be to put states on the new list, for considering + when we move the character pointer on. */ + + for (i = 0; i < active_count; i++) + { + stateblock *current_state = active_states + i; + const uschar *code; + int state_offset = current_state->offset; + int count, codevalue; + int chartype, script; + +#ifdef DEBUG + printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); + if (clen == 0) printf("EOL\n"); + else if (c > 32 && c < 127) printf("'%c'\n", c); + else printf("0x%02x\n", c); +#endif + + /* This variable is referred to implicity in the ADD_xxx macros. */ + + ims = current_state->ims; + + /* A negative offset is a special case meaning "hold off going to this + (negated) state until the number of characters in the data field have + been skipped". */ + + if (state_offset < 0) + { + if (current_state->data > 0) + { + DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP)); + ADD_NEW_DATA(state_offset, current_state->count, + current_state->data - 1); + continue; + } + else + { + current_state->offset = state_offset = -state_offset; + } + } + + /* Check for a duplicate state with the same count, and skip if found. 
*/ + + for (j = 0; j < i; j++) + { + if (active_states[j].offset == state_offset && + active_states[j].count == current_state->count) + { + DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP)); + goto NEXT_ACTIVE_STATE; + } + } + + /* The state offset is the offset to the opcode */ + + code = start_code + state_offset; + codevalue = *code; + + /* If this opcode is followed by an inline character, load it. It is + tempting to test for the presence of a subject character here, but that + is wrong, because sometimes zero repetitions of the subject are + permitted. + + We also use this mechanism for opcodes such as OP_TYPEPLUS that take an + argument that is not a data character - but is always one byte long. + Unfortunately, we have to take special action to deal with \P, \p, and + \X in this case. To keep the other cases fast, convert these ones to new + opcodes. */ + + if (coptable[codevalue] > 0) + { + dlen = 1; +#ifdef SUPPORT_UTF8 + if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else +#endif /* SUPPORT_UTF8 */ + d = code[coptable[codevalue]]; + if (codevalue >= OP_TYPESTAR) + { + switch(d) + { + case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM; + case OP_NOTPROP: + case OP_PROP: codevalue += OP_PROP_EXTRA; break; + case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break; + case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break; + default: break; + } + } + } + else + { + dlen = 0; /* Not strictly necessary, but compilers moan */ + d = NOTACHAR; /* if these variables are not set. */ + } + + + /* Now process the individual opcodes */ + + switch (codevalue) + { + +/* ========================================================================== */ + /* Reached a closing bracket. If not at the end of the pattern, carry + on with the next opcode. Otherwise, unless we have an empty string and + PCRE_NOTEMPTY is set, save the match data, shifting up all previous + matches so we always have the longest first. 
*/ + + case OP_KET: + case OP_KETRMIN: + case OP_KETRMAX: + if (code != end_code) + { + ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0); + if (codevalue != OP_KET) + { + ADD_ACTIVE(state_offset - GET(code, 1), 0); + } + } + else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0) + { + if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0; + else if (match_count > 0 && ++match_count * 2 >= offsetcount) + match_count = 0; + count = ((match_count == 0)? offsetcount : match_count * 2) - 2; + if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int)); + if (offsetcount >= 2) + { + offsets[0] = current_subject - start_subject; + offsets[1] = ptr - start_subject; + DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP, + offsets[1] - offsets[0], current_subject)); + } + if ((md->moptions & PCRE_DFA_SHORTEST) != 0) + { + DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" + "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, + match_count, rlevel*2-2, SP)); + return match_count; + } + } + break; + +/* ========================================================================== */ + /* These opcodes add to the current list of states without looking + at the current character. 
*/ + + /*-----------------------------------------------------------------*/ + case OP_ALT: + do { code += GET(code, 1); } while (*code == OP_ALT); + ADD_ACTIVE(code - start_code, 0); + break; + + /*-----------------------------------------------------------------*/ + case OP_BRA: + case OP_SBRA: + do + { + ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); + code += GET(code, 1); + } + while (*code == OP_ALT); + break; + + /*-----------------------------------------------------------------*/ + case OP_CBRA: + case OP_SCBRA: + ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0); + code += GET(code, 1); + while (*code == OP_ALT) + { + ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); + code += GET(code, 1); + } + break; + + /*-----------------------------------------------------------------*/ + case OP_BRAZERO: + case OP_BRAMINZERO: + ADD_ACTIVE(state_offset + 1, 0); + code += 1 + GET(code, 2); + while (*code == OP_ALT) code += GET(code, 1); + ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); + break; + + /*-----------------------------------------------------------------*/ + case OP_CIRC: + if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) || + ((ims & PCRE_MULTILINE) != 0 && + ptr != end_subject && + WAS_NEWLINE(ptr))) + { ADD_ACTIVE(state_offset + 1, 0); } + break; + + /*-----------------------------------------------------------------*/ + case OP_EOD: + if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); } + break; + + /*-----------------------------------------------------------------*/ + case OP_OPT: + ims = code[1]; + ADD_ACTIVE(state_offset + 2, 0); + break; + + /*-----------------------------------------------------------------*/ + case OP_SOD: + if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); } + break; + + /*-----------------------------------------------------------------*/ + case OP_SOM: + if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); } + break; + + +/* 
========================================================================== */ + /* These opcodes inspect the next subject character, and sometimes + the previous one as well, but do not have an argument. The variable + clen contains the length of the current character and is zero if we are + at the end of the subject. */ + + /*-----------------------------------------------------------------*/ + case OP_ANY: + if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr))) + { ADD_NEW(state_offset + 1, 0); } + break; + + /*-----------------------------------------------------------------*/ + case OP_EODN: + if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen)) + { ADD_ACTIVE(state_offset + 1, 0); } + break; + + /*-----------------------------------------------------------------*/ + case OP_DOLL: + if ((md->moptions & PCRE_NOTEOL) == 0) + { + if (clen == 0 || + (IS_NEWLINE(ptr) && + ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen) + )) + { ADD_ACTIVE(state_offset + 1, 0); } + } + else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr)) + { ADD_ACTIVE(state_offset + 1, 0); } + break; + + /*-----------------------------------------------------------------*/ + + case OP_DIGIT: + case OP_WHITESPACE: + case OP_WORDCHAR: + if (clen > 0 && c < 256 && + ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0) + { ADD_NEW(state_offset + 1, 0); } + break; + + /*-----------------------------------------------------------------*/ + case OP_NOT_DIGIT: + case OP_NOT_WHITESPACE: + case OP_NOT_WORDCHAR: + if (clen > 0 && (c >= 256 || + ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)) + { ADD_NEW(state_offset + 1, 0); } + break; + + /*-----------------------------------------------------------------*/ + case OP_WORD_BOUNDARY: + case OP_NOT_WORD_BOUNDARY: + { + int left_word, right_word; + + if (ptr > start_subject) + { + const uschar *temp = ptr - 1; +#ifdef SUPPORT_UTF8 + if (utf8) BACKCHAR(temp); +#endif + GETCHARTEST(d, 
temp); + left_word = d < 256 && (ctypes[d] & ctype_word) != 0; + } + else left_word = 0; + + if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0; + else right_word = 0; + + if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY)) + { ADD_ACTIVE(state_offset + 1, 0); } + } + break; + + +#ifdef SUPPORT_UCP + + /*-----------------------------------------------------------------*/ + /* Check the next character by Unicode property. We will get here only + if the support is in the binary; otherwise a compile-time error occurs. + */ + + case OP_PROP: + case OP_NOTPROP: + if (clen > 0) + { + BOOL OK; + int category = _pcre_ucp_findprop(c, &chartype, &script); + switch(code[1]) + { + case PT_ANY: + OK = TRUE; + break; + + case PT_LAMP: + OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; + break; + + case PT_GC: + OK = category == code[2]; + break; + + case PT_PC: + OK = chartype == code[2]; + break; + + case PT_SC: + OK = script == code[2]; + break; + + /* Should never occur, but keep compilers from grumbling. */ + + default: + OK = codevalue != OP_PROP; + break; + } + + if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); } + } + break; +#endif + + + +/* ========================================================================== */ + /* These opcodes likewise inspect the subject character, but have an + argument that is not a data character. It is one of these opcodes: + OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR, + OP_NOT_WORDCHAR. The value is loaded into d. 
*/ + + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEPOSPLUS: + count = current_state->count; /* Already matched */ + if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } + if (clen > 0) + { + if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || + (c < 256 && + (d != OP_ANY || + (ims & PCRE_DOTALL) != 0 || + !IS_NEWLINE(ptr) + ) && + ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) + { + if (count > 0 && codevalue == OP_TYPEPOSPLUS) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + count++; + ADD_NEW(state_offset, count); + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEPOSQUERY: + ADD_ACTIVE(state_offset + 2, 0); + if (clen > 0) + { + if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || + (c < 256 && + (d != OP_ANY || + (ims & PCRE_DOTALL) != 0 || + !IS_NEWLINE(ptr) + ) && + ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) + { + if (codevalue == OP_TYPEPOSQUERY) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW(state_offset + 2, 0); + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPOSSTAR: + ADD_ACTIVE(state_offset + 2, 0); + if (clen > 0) + { + if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || + (c < 256 && + (d != OP_ANY || + (ims & PCRE_DOTALL) != 0 || + !IS_NEWLINE(ptr) + ) && + ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) + { + if (codevalue == OP_TYPEPOSSTAR) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW(state_offset, 0); + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_TYPEEXACT: + count = current_state->count; /* Number already matched */ + if (clen > 0) + { + if ((c >= 256 && d != OP_DIGIT 
&& d != OP_WHITESPACE && d != OP_WORDCHAR) || + (c < 256 && + (d != OP_ANY || + (ims & PCRE_DOTALL) != 0 || + !IS_NEWLINE(ptr) + ) && + ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) + { + if (++count >= GET2(code, 1)) + { ADD_NEW(state_offset + 4, 0); } + else + { ADD_NEW(state_offset, count); } + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEPOSUPTO: + ADD_ACTIVE(state_offset + 4, 0); + count = current_state->count; /* Number already matched */ + if (clen > 0) + { + if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || + (c < 256 && + (d != OP_ANY || + (ims & PCRE_DOTALL) != 0 || + !IS_NEWLINE(ptr) + ) && + ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) + { + if (codevalue == OP_TYPEPOSUPTO) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + if (++count >= GET2(code, 1)) + { ADD_NEW(state_offset + 4, 0); } + else + { ADD_NEW(state_offset, count); } + } + } + break; + +/* ========================================================================== */ + /* These are virtual opcodes that are used when something like + OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its + argument. It keeps the code above fast for the other cases. The argument + is in the d variable. 
*/ + + case OP_PROP_EXTRA + OP_TYPEPLUS: + case OP_PROP_EXTRA + OP_TYPEMINPLUS: + case OP_PROP_EXTRA + OP_TYPEPOSPLUS: + count = current_state->count; /* Already matched */ + if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); } + if (clen > 0) + { + BOOL OK; + int category = _pcre_ucp_findprop(c, &chartype, &script); + switch(code[2]) + { + case PT_ANY: + OK = TRUE; + break; + + case PT_LAMP: + OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; + break; + + case PT_GC: + OK = category == code[3]; + break; + + case PT_PC: + OK = chartype == code[3]; + break; + + case PT_SC: + OK = script == code[3]; + break; + + /* Should never occur, but keep compilers from grumbling. */ + + default: + OK = codevalue != OP_PROP; + break; + } + + if (OK == (d == OP_PROP)) + { + if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + count++; + ADD_NEW(state_offset, count); + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_EXTUNI_EXTRA + OP_TYPEPLUS: + case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: + case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS: + count = current_state->count; /* Already matched */ + if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } + if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) + { + const uschar *nptr = ptr + clen; + int ncount = 0; + if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + while (nptr < end_subject) + { + int nd; + int ndlen = 1; + GETCHARLEN(nd, nptr, ndlen); + if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break; + ncount++; + nptr += ndlen; + } + count++; + ADD_NEW_DATA(-state_offset, count, ncount); + } + break; + + /*-----------------------------------------------------------------*/ + case OP_ANYNL_EXTRA + OP_TYPEPLUS: + case OP_ANYNL_EXTRA + OP_TYPEMINPLUS: + case 
OP_ANYNL_EXTRA + OP_TYPEPOSPLUS: + count = current_state->count; /* Already matched */ + if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } + if (clen > 0) + { + int ncount = 0; + switch (c) + { + case 0x000d: + if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; + /* Fall through */ + case 0x000a: + case 0x000b: + case 0x000c: + case 0x0085: + case 0x2028: + case 0x2029: + if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + count++; + ADD_NEW_DATA(-state_offset, count, ncount); + break; + default: + break; + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_PROP_EXTRA + OP_TYPEQUERY: + case OP_PROP_EXTRA + OP_TYPEMINQUERY: + case OP_PROP_EXTRA + OP_TYPEPOSQUERY: + count = 4; + goto QS1; + + case OP_PROP_EXTRA + OP_TYPESTAR: + case OP_PROP_EXTRA + OP_TYPEMINSTAR: + case OP_PROP_EXTRA + OP_TYPEPOSSTAR: + count = 0; + + QS1: + + ADD_ACTIVE(state_offset + 4, 0); + if (clen > 0) + { + BOOL OK; + int category = _pcre_ucp_findprop(c, &chartype, &script); + switch(code[2]) + { + case PT_ANY: + OK = TRUE; + break; + + case PT_LAMP: + OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; + break; + + case PT_GC: + OK = category == code[3]; + break; + + case PT_PC: + OK = chartype == code[3]; + break; + + case PT_SC: + OK = script == code[3]; + break; + + /* Should never occur, but keep compilers from grumbling. 
*/ + + default: + OK = codevalue != OP_PROP; + break; + } + + if (OK == (d == OP_PROP)) + { + if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR || + codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW(state_offset + count, 0); + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_EXTUNI_EXTRA + OP_TYPEQUERY: + case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY: + case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY: + count = 2; + goto QS2; + + case OP_EXTUNI_EXTRA + OP_TYPESTAR: + case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR: + case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR: + count = 0; + + QS2: + + ADD_ACTIVE(state_offset + 2, 0); + if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) + { + const uschar *nptr = ptr + clen; + int ncount = 0; + if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR || + codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + while (nptr < end_subject) + { + int nd; + int ndlen = 1; + GETCHARLEN(nd, nptr, ndlen); + if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break; + ncount++; + nptr += ndlen; + } + ADD_NEW_DATA(-(state_offset + count), 0, ncount); + } + break; + + /*-----------------------------------------------------------------*/ + case OP_ANYNL_EXTRA + OP_TYPEQUERY: + case OP_ANYNL_EXTRA + OP_TYPEMINQUERY: + case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY: + count = 2; + goto QS3; + + case OP_ANYNL_EXTRA + OP_TYPESTAR: + case OP_ANYNL_EXTRA + OP_TYPEMINSTAR: + case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR: + count = 0; + + QS3: + ADD_ACTIVE(state_offset + 2, 0); + if (clen > 0) + { + int ncount = 0; + switch (c) + { + case 0x000d: + if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; + /* Fall through */ + case 0x000a: + case 0x000b: + case 0x000c: + case 0x0085: + case 0x2028: + case 0x2029: + if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR || + 
codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW_DATA(-(state_offset + count), 0, ncount); + break; + default: + break; + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_PROP_EXTRA + OP_TYPEEXACT: + case OP_PROP_EXTRA + OP_TYPEUPTO: + case OP_PROP_EXTRA + OP_TYPEMINUPTO: + case OP_PROP_EXTRA + OP_TYPEPOSUPTO: + if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) + { ADD_ACTIVE(state_offset + 6, 0); } + count = current_state->count; /* Number already matched */ + if (clen > 0) + { + BOOL OK; + int category = _pcre_ucp_findprop(c, &chartype, &script); + switch(code[4]) + { + case PT_ANY: + OK = TRUE; + break; + + case PT_LAMP: + OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; + break; + + case PT_GC: + OK = category == code[5]; + break; + + case PT_PC: + OK = chartype == code[5]; + break; + + case PT_SC: + OK = script == code[5]; + break; + + /* Should never occur, but keep compilers from grumbling. 
*/ + + default: + OK = codevalue != OP_PROP; + break; + } + + if (OK == (d == OP_PROP)) + { + if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + if (++count >= GET2(code, 1)) + { ADD_NEW(state_offset + 6, 0); } + else + { ADD_NEW(state_offset, count); } + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_EXTUNI_EXTRA + OP_TYPEEXACT: + case OP_EXTUNI_EXTRA + OP_TYPEUPTO: + case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO: + case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO: + if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) + { ADD_ACTIVE(state_offset + 4, 0); } + count = current_state->count; /* Number already matched */ + if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) + { + const uschar *nptr = ptr + clen; + int ncount = 0; + if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + while (nptr < end_subject) + { + int nd; + int ndlen = 1; + GETCHARLEN(nd, nptr, ndlen); + if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break; + ncount++; + nptr += ndlen; + } + if (++count >= GET2(code, 1)) + { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); } + else + { ADD_NEW_DATA(-state_offset, count, ncount); } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_ANYNL_EXTRA + OP_TYPEEXACT: + case OP_ANYNL_EXTRA + OP_TYPEUPTO: + case OP_ANYNL_EXTRA + OP_TYPEMINUPTO: + case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO: + if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT) + { ADD_ACTIVE(state_offset + 4, 0); } + count = current_state->count; /* Number already matched */ + if (clen > 0) + { + int ncount = 0; + switch (c) + { + case 0x000d: + if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; + /* Fall through */ + case 0x000a: + case 0x000b: + case 0x000c: + case 0x0085: + case 0x2028: + case 0x2029: + if (codevalue == OP_ANYNL_EXTRA 
+ OP_TYPEPOSUPTO) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + if (++count >= GET2(code, 1)) + { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); } + else + { ADD_NEW_DATA(-state_offset, count, ncount); } + break; + default: + break; + } + } + break; + +/* ========================================================================== */ + /* These opcodes are followed by a character that is usually compared + to the current subject character; it is loaded into d. We still get + here even if there is no subject character, because in some cases zero + repetitions are permitted. */ + + /*-----------------------------------------------------------------*/ + case OP_CHAR: + if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); } + break; + + /*-----------------------------------------------------------------*/ + case OP_CHARNC: + if (clen == 0) break; + +#ifdef SUPPORT_UTF8 + if (utf8) + { + if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else + { + unsigned int othercase; + if (c < 128) othercase = fcc[c]; else + + /* If we have Unicode property support, we can use it to test the + other case of the character. */ + +#ifdef SUPPORT_UCP + othercase = _pcre_ucp_othercase(c); +#else + othercase = NOTACHAR; +#endif + + if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } + } + } + else +#endif /* SUPPORT_UTF8 */ + + /* Non-UTF-8 mode */ + { + if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); } + } + break; + + +#ifdef SUPPORT_UCP + /*-----------------------------------------------------------------*/ + /* This is a tricky one because it can match more than one character. + Find out how many characters to skip, and then set up a negative state + to wait for them to pass before continuing. 
*/ + + case OP_EXTUNI: + if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) + { + const uschar *nptr = ptr + clen; + int ncount = 0; + while (nptr < end_subject) + { + int nclen = 1; + GETCHARLEN(c, nptr, nclen); + if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break; + ncount++; + nptr += nclen; + } + ADD_NEW_DATA(-(state_offset + 1), 0, ncount); + } + break; +#endif + + /*-----------------------------------------------------------------*/ + /* This is a tricky like EXTUNI because it too can match more than one + character (when CR is followed by LF). In this case, set up a negative + state to wait for one character to pass before continuing. */ + + case OP_ANYNL: + if (clen > 0) switch(c) + { + case 0x000a: + case 0x000b: + case 0x000c: + case 0x0085: + case 0x2028: + case 0x2029: + ADD_NEW(state_offset + 1, 0); + break; + case 0x000d: + if (ptr + 1 < end_subject && ptr[1] == 0x0a) + { + ADD_NEW_DATA(-(state_offset + 1), 0, 1); + } + else + { + ADD_NEW(state_offset + 1, 0); + } + break; + } + break; + + /*-----------------------------------------------------------------*/ + /* Match a negated single character. This is only used for one-byte + characters, that is, we know that d < 256. The character we are + checking (c) can be multibyte. */ + + case OP_NOT: + if (clen > 0) + { + unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? 
fcc[d] : d; + if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_PLUS: + case OP_MINPLUS: + case OP_POSPLUS: + case OP_NOTPLUS: + case OP_NOTMINPLUS: + case OP_NOTPOSPLUS: + count = current_state->count; /* Already matched */ + if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); } + if (clen > 0) + { + unsigned int otherd = NOTACHAR; + if ((ims & PCRE_CASELESS) != 0) + { +#ifdef SUPPORT_UTF8 + if (utf8 && d >= 128) + { +#ifdef SUPPORT_UCP + otherd = _pcre_ucp_othercase(d); +#endif /* SUPPORT_UCP */ + } + else +#endif /* SUPPORT_UTF8 */ + otherd = fcc[d]; + } + if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) + { + if (count > 0 && + (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS)) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + count++; + ADD_NEW(state_offset, count); + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_QUERY: + case OP_MINQUERY: + case OP_POSQUERY: + case OP_NOTQUERY: + case OP_NOTMINQUERY: + case OP_NOTPOSQUERY: + ADD_ACTIVE(state_offset + dlen + 1, 0); + if (clen > 0) + { + unsigned int otherd = NOTACHAR; + if ((ims & PCRE_CASELESS) != 0) + { +#ifdef SUPPORT_UTF8 + if (utf8 && d >= 128) + { +#ifdef SUPPORT_UCP + otherd = _pcre_ucp_othercase(d); +#endif /* SUPPORT_UCP */ + } + else +#endif /* SUPPORT_UTF8 */ + otherd = fcc[d]; + } + if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) + { + if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW(state_offset + dlen + 1, 0); + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_STAR: + case OP_MINSTAR: + case OP_POSSTAR: + case OP_NOTSTAR: + case OP_NOTMINSTAR: + case OP_NOTPOSSTAR: + ADD_ACTIVE(state_offset + dlen + 1, 0); + if 
(clen > 0) + { + unsigned int otherd = NOTACHAR; + if ((ims & PCRE_CASELESS) != 0) + { +#ifdef SUPPORT_UTF8 + if (utf8 && d >= 128) + { +#ifdef SUPPORT_UCP + otherd = _pcre_ucp_othercase(d); +#endif /* SUPPORT_UCP */ + } + else +#endif /* SUPPORT_UTF8 */ + otherd = fcc[d]; + } + if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) + { + if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW(state_offset, 0); + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_EXACT: + case OP_NOTEXACT: + count = current_state->count; /* Number already matched */ + if (clen > 0) + { + unsigned int otherd = NOTACHAR; + if ((ims & PCRE_CASELESS) != 0) + { +#ifdef SUPPORT_UTF8 + if (utf8 && d >= 128) + { +#ifdef SUPPORT_UCP + otherd = _pcre_ucp_othercase(d); +#endif /* SUPPORT_UCP */ + } + else +#endif /* SUPPORT_UTF8 */ + otherd = fcc[d]; + } + if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) + { + if (++count >= GET2(code, 1)) + { ADD_NEW(state_offset + dlen + 3, 0); } + else + { ADD_NEW(state_offset, count); } + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_UPTO: + case OP_MINUPTO: + case OP_POSUPTO: + case OP_NOTUPTO: + case OP_NOTMINUPTO: + case OP_NOTPOSUPTO: + ADD_ACTIVE(state_offset + dlen + 3, 0); + count = current_state->count; /* Number already matched */ + if (clen > 0) + { + unsigned int otherd = NOTACHAR; + if ((ims & PCRE_CASELESS) != 0) + { +#ifdef SUPPORT_UTF8 + if (utf8 && d >= 128) + { +#ifdef SUPPORT_UCP + otherd = _pcre_ucp_othercase(d); +#endif /* SUPPORT_UCP */ + } + else +#endif /* SUPPORT_UTF8 */ + otherd = fcc[d]; + } + if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) + { + if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + if (++count >= GET2(code, 
1)) + { ADD_NEW(state_offset + dlen + 3, 0); } + else + { ADD_NEW(state_offset, count); } + } + } + break; + + +/* ========================================================================== */ + /* These are the class-handling opcodes */ + + case OP_CLASS: + case OP_NCLASS: + case OP_XCLASS: + { + BOOL isinclass = FALSE; + int next_state_offset; + const uschar *ecode; + + /* For a simple class, there is always just a 32-byte table, and we + can set isinclass from it. */ + + if (codevalue != OP_XCLASS) + { + ecode = code + 33; + if (clen > 0) + { + isinclass = (c > 255)? (codevalue == OP_NCLASS) : + ((code[1 + c/8] & (1 << (c&7))) != 0); + } + } + + /* An extended class may have a table or a list of single characters, + ranges, or both, and it may be positive or negative. There's a + function that sorts all this out. */ + + else + { + ecode = code + GET(code, 1); + if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE); + } + + /* At this point, isinclass is set for all kinds of class, and ecode + points to the byte after the end of the class. If there is a + quantifier, this is where it will be. 
*/ + + next_state_offset = ecode - start_code; + + switch (*ecode) + { + case OP_CRSTAR: + case OP_CRMINSTAR: + ADD_ACTIVE(next_state_offset + 1, 0); + if (isinclass) { ADD_NEW(state_offset, 0); } + break; + + case OP_CRPLUS: + case OP_CRMINPLUS: + count = current_state->count; /* Already matched */ + if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); } + if (isinclass) { count++; ADD_NEW(state_offset, count); } + break; + + case OP_CRQUERY: + case OP_CRMINQUERY: + ADD_ACTIVE(next_state_offset + 1, 0); + if (isinclass) { ADD_NEW(next_state_offset + 1, 0); } + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + count = current_state->count; /* Already matched */ + if (count >= GET2(ecode, 1)) + { ADD_ACTIVE(next_state_offset + 5, 0); } + if (isinclass) + { + int max = GET2(ecode, 3); + if (++count >= max && max != 0) /* Max 0 => no limit */ + { ADD_NEW(next_state_offset + 5, 0); } + else + { ADD_NEW(state_offset, count); } + } + break; + + default: + if (isinclass) { ADD_NEW(next_state_offset, 0); } + break; + } + } + break; + +/* ========================================================================== */ + /* These are the opcodes for fancy brackets of various kinds. We have + to use recursion in order to handle them. 
*/ + + case OP_ASSERT: + case OP_ASSERT_NOT: + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + { + int rc; + int local_offsets[2]; + int local_workspace[1000]; + const uschar *endasscode = code + GET(code, 1); + + while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); + + rc = internal_dfa_exec( + md, /* static match data */ + code, /* this subexpression's code */ + ptr, /* where we currently are */ + ptr - start_subject, /* start offset */ + local_offsets, /* offset vector */ + sizeof(local_offsets)/sizeof(int), /* size of same */ + local_workspace, /* workspace vector */ + sizeof(local_workspace)/sizeof(int), /* size of same */ + ims, /* the current ims flags */ + rlevel, /* function recursion level */ + recursing); /* pass on regex recursion */ + + if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK)) + { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_COND: + case OP_SCOND: + { + int local_offsets[1000]; + int local_workspace[1000]; + int condcode = code[LINK_SIZE+1]; + + /* Back reference conditions are not supported */ + + if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND; + + /* The DEFINE condition is always false */ + + if (condcode == OP_DEF) + { + ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); + } + + /* The only supported version of OP_RREF is for the value RREF_ANY, + which means "test if in any recursion". We can't test for specifically + recursed groups. 
*/ + + else if (condcode == OP_RREF) + { + int value = GET2(code, LINK_SIZE+2); + if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND; + if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); } + else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); } + } + + /* Otherwise, the condition is an assertion */ + + else + { + int rc; + const uschar *asscode = code + LINK_SIZE + 1; + const uschar *endasscode = asscode + GET(asscode, 1); + + while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); + + rc = internal_dfa_exec( + md, /* fixed match data */ + asscode, /* this subexpression's code */ + ptr, /* where we currently are */ + ptr - start_subject, /* start offset */ + local_offsets, /* offset vector */ + sizeof(local_offsets)/sizeof(int), /* size of same */ + local_workspace, /* workspace vector */ + sizeof(local_workspace)/sizeof(int), /* size of same */ + ims, /* the current ims flags */ + rlevel, /* function recursion level */ + recursing); /* pass on regex recursion */ + + if ((rc >= 0) == + (condcode == OP_ASSERT || condcode == OP_ASSERTBACK)) + { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); } + else + { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); } + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_RECURSE: + { + int local_offsets[1000]; + int local_workspace[1000]; + int rc; + + DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP, + recursing + 1)); + + rc = internal_dfa_exec( + md, /* fixed match data */ + start_code + GET(code, 1), /* this subexpression's code */ + ptr, /* where we currently are */ + ptr - start_subject, /* start offset */ + local_offsets, /* offset vector */ + sizeof(local_offsets)/sizeof(int), /* size of same */ + local_workspace, /* workspace vector */ + sizeof(local_workspace)/sizeof(int), /* size of same */ + ims, /* the current ims flags */ + rlevel, /* function recursion level */ + recursing + 1); /* regex recurse level 
*/ + + DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP, + recursing + 1, rc)); + + /* Ran out of internal offsets */ + + if (rc == 0) return PCRE_ERROR_DFA_RECURSE; + + /* For each successful matched substring, set up the next state with a + count of characters to skip before trying it. Note that the count is in + characters, not bytes. */ + + if (rc > 0) + { + for (rc = rc*2 - 2; rc >= 0; rc -= 2) + { + const uschar *p = start_subject + local_offsets[rc]; + const uschar *pp = start_subject + local_offsets[rc+1]; + int charcount = local_offsets[rc+1] - local_offsets[rc]; + while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--; + if (charcount > 0) + { + ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1)); + } + else + { + ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0); + } + } + } + else if (rc != PCRE_ERROR_NOMATCH) return rc; + } + break; + + /*-----------------------------------------------------------------*/ + case OP_ONCE: + { + int local_offsets[2]; + int local_workspace[1000]; + + int rc = internal_dfa_exec( + md, /* fixed match data */ + code, /* this subexpression's code */ + ptr, /* where we currently are */ + ptr - start_subject, /* start offset */ + local_offsets, /* offset vector */ + sizeof(local_offsets)/sizeof(int), /* size of same */ + local_workspace, /* workspace vector */ + sizeof(local_workspace)/sizeof(int), /* size of same */ + ims, /* the current ims flags */ + rlevel, /* function recursion level */ + recursing); /* pass on regex recursion */ + + if (rc >= 0) + { + const uschar *end_subpattern = code; + int charcount = local_offsets[1] - local_offsets[0]; + int next_state_offset, repeat_state_offset; + + do { end_subpattern += GET(end_subpattern, 1); } + while (*end_subpattern == OP_ALT); + next_state_offset = end_subpattern - start_code + LINK_SIZE + 1; + + /* If the end of this subpattern is KETRMAX or KETRMIN, we must + arrange for the repeat state also to be added to the relevant list. 
+ Calculate the offset, or set -1 for no repeat. */ + + repeat_state_offset = (*end_subpattern == OP_KETRMAX || + *end_subpattern == OP_KETRMIN)? + end_subpattern - start_code - GET(end_subpattern, 1) : -1; + + /* If we have matched an empty string, add the next state at the + current character pointer. This is important so that the duplicate + checking kicks in, which is what breaks infinite loops that match an + empty string. */ + + if (charcount == 0) + { + ADD_ACTIVE(next_state_offset, 0); + } + + /* Optimization: if there are no more active states, and there + are no new states yet set up, then skip over the subject string + right here, to save looping. Otherwise, set up the new state to swing + into action when the end of the substring is reached. */ + + else if (i + 1 >= active_count && new_count == 0) + { + ptr += charcount; + clen = 0; + ADD_NEW(next_state_offset, 0); + + /* If we are adding a repeat state at the new character position, + we must fudge things so that it is the only current state. + Otherwise, it might be a duplicate of one we processed before, and + that would cause it to be skipped. 
*/ + + if (repeat_state_offset >= 0) + { + next_active_state = active_states; + active_count = 0; + i = -1; + ADD_ACTIVE(repeat_state_offset, 0); + } + } + else + { + const uschar *p = start_subject + local_offsets[0]; + const uschar *pp = start_subject + local_offsets[1]; + while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--; + ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); + if (repeat_state_offset >= 0) + { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); } + } + + } + else if (rc != PCRE_ERROR_NOMATCH) return rc; + } + break; + + +/* ========================================================================== */ + /* Handle callouts */ + + case OP_CALLOUT: + if (pcre_callout != NULL) + { + int rrc; + pcre_callout_block cb; + cb.version = 1; /* Version 1 of the callout block */ + cb.callout_number = code[1]; + cb.offset_vector = offsets; + cb.subject = (PCRE_SPTR)start_subject; + cb.subject_length = end_subject - start_subject; + cb.start_match = current_subject - start_subject; + cb.current_position = ptr - start_subject; + cb.pattern_position = GET(code, 2); + cb.next_item_length = GET(code, 2 + LINK_SIZE); + cb.capture_top = 1; + cb.capture_last = -1; + cb.callout_data = md->callout_data; + if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */ + if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); } + } + break; + + +/* ========================================================================== */ + default: /* Unsupported opcode */ + return PCRE_ERROR_DFA_UITEM; + } + + NEXT_ACTIVE_STATE: continue; + + } /* End of loop scanning active states */ + + /* We have finished the processing at the current subject character. If no + new states have been set for the next character, we have found all the + matches that we are going to find. If we are at the top level and partial + matching has been requested, check for appropriate conditions. 
*/ + + if (new_count <= 0) + { + if (match_count < 0 && /* No matches found */ + rlevel == 1 && /* Top level match function */ + (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */ + ptr >= end_subject && /* Reached end of subject */ + ptr > current_subject) /* Matched non-empty string */ + { + if (offsetcount >= 2) + { + offsets[0] = current_subject - start_subject; + offsets[1] = end_subject - start_subject; + } + match_count = PCRE_ERROR_PARTIAL; + } + + DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" + "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, + rlevel*2-2, SP)); + break; /* In effect, "return", but see the comment below */ + } + + /* One or more states are active for the next character. */ + + ptr += clen; /* Advance to next subject character */ + } /* Loop to move along the subject string */ + +/* Control gets here from "break" a few lines above. We do it this way because +if we use "return" above, we have compiler trouble. Some compilers warn if +there's nothing here because they think the function doesn't return a value. On +the other hand, if we put a dummy statement here, some more clever compilers +complain that it can't be reached. Sigh. */ + +return match_count; +} + + + + +/************************************************* +* Execute a Regular Expression - DFA engine * +*************************************************/ + +/* This external function applies a compiled re to a subject string using a DFA +engine. This function calls the internal function multiple times if the pattern +is not anchored. 
+
+Arguments:
+  argument_re     points to the compiled expression
+  extra_data      points to extra data or is NULL; when present, its flags
+                    select study data, callout data and character tables,
+                    and a match limit or recursion limit is rejected
+  subject         points to the subject string
+  length          length of subject string (may contain binary zeros)
+  start_offset    where to start in the subject string
+  options         option bits
+  offsets         vector of match offsets
+  offsetcount     size of same
+  workspace       workspace vector
+  wscount         size of same (at least 20)
+
+Returns:          > 0 => number of match offset pairs placed in offsets
+                  = 0 => offsets overflowed; longest matches are present
+                   -1 => failed to match
+                 < -1 => some kind of unexpected problem
+*/
+
+PCRE_DATA_SCOPE int
+pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
+  const char *subject, int length, int start_offset, int options, int *offsets,
+  int offsetcount, int *workspace, int wscount)
+{
+real_pcre *re = (real_pcre *)argument_re;
+dfa_match_data match_block;
+dfa_match_data *md = &match_block;
+BOOL utf8, anchored, startline, firstline;
+const uschar *current_subject, *end_subject, *lcc;
+
+pcre_study_data internal_study;
+const pcre_study_data *study = NULL;
+real_pcre internal_re;
+
+const uschar *req_byte_ptr;
+const uschar *start_bits = NULL;
+BOOL first_byte_caseless = FALSE;
+BOOL req_byte_caseless = FALSE;
+int first_byte = -1;
+int req_byte = -1;
+int req_byte2 = -1;
+int newline;
+
+/* Plausibility checks */
+
+if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
+if (re == NULL || subject == NULL || workspace == NULL ||
+   (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
+if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
+if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
+
+/* We need to find the pointer to any study data before we test for byte
+flipping, so we scan the extra_data block first. This may set two fields in the
+match block, so we must initialize them beforehand. However, the other fields
+in the match block must not be set until after the byte flipping. */
+
+md->tables = re->tables;
+md->callout_data = NULL;
+
+if (extra_data != NULL)
+  {
+  unsigned int flags = extra_data->flags;
+  if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
+    study = (const pcre_study_data *)extra_data->study_data;
+  if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
+  if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
+    return PCRE_ERROR_DFA_UMLIMIT;
+  if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
+    md->callout_data = extra_data->callout_data;
+  if ((flags & PCRE_EXTRA_TABLES) != 0)
+    md->tables = extra_data->tables;
+  }
+
+/* Check that the first field in the block is the magic number. If it is not,
+test for a regex that was compiled on a host of opposite endianness. If this is
+the case, flipped values are put in internal_re and internal_study if there was
+study data too. */
+
+if (re->magic_number != MAGIC_NUMBER)
+  {
+  re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
+  if (re == NULL) return PCRE_ERROR_BADMAGIC;
+  if (study != NULL) study = &internal_study;
+  }
+
+/* Set some local values */
+
+current_subject = (const unsigned char *)subject + start_offset;
+end_subject = (const unsigned char *)subject + length;
+req_byte_ptr = current_subject - 1;
+
+#ifdef SUPPORT_UTF8
+utf8 = (re->options & PCRE_UTF8) != 0;
+#else
+utf8 = FALSE;
+#endif
+
+anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
+  (re->options & PCRE_ANCHORED) != 0;
+
+/* The remaining fixed data for passing around. */
+
+md->start_code = (const uschar *)argument_re +
+    re->name_table_offset + re->name_count * re->name_entry_size;
+md->start_subject = (const unsigned char *)subject;
+md->end_subject = end_subject;
+md->moptions = options;
+md->poptions = re->options;
+
+/* Handle different types of newline. The two bits give four cases. If nothing
+is set at run time, whatever was used at compile time applies. */
+
+switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) &
+         PCRE_NEWLINE_BITS)
+  {
+  case 0: newline = NEWLINE; break;   /* Compile-time default */
+  case PCRE_NEWLINE_CR: newline = '\r'; break;
+  case PCRE_NEWLINE_LF: newline = '\n'; break;
+  case PCRE_NEWLINE_CR+
+       PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
+  case PCRE_NEWLINE_ANY: newline = -1; break;
+  default: return PCRE_ERROR_BADNEWLINE;
+  }
+
+if (newline < 0)
+  {
+  md->nltype = NLTYPE_ANY;
+  }
+else
+  {
+  md->nltype = NLTYPE_FIXED;
+  if (newline > 255)
+    {
+    md->nllen = 2;
+    md->nl[0] = (newline >> 8) & 255;
+    md->nl[1] = newline & 255;
+    }
+  else
+    {
+    md->nllen = 1;
+    md->nl[0] = newline;
+    }
+  }
+
+/* Check a UTF-8 string if required. Unfortunately there's no way of passing
+back the character offset. */
+
+#ifdef SUPPORT_UTF8
+if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
+  {
+  if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
+    return PCRE_ERROR_BADUTF8;
+  if (start_offset > 0 && start_offset < length)
+    {
+    int tb = ((uschar *)subject)[start_offset];
+    if (tb > 127)
+      {
+      tb &= 0xc0;
+      if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
+      }
+    }
+  }
+#endif
+
+/* If the exec call supplied NULL for tables, use the inbuilt ones. This
+is a feature that makes it possible to save compiled regex and re-use them
+in other programs later. */
+
+if (md->tables == NULL) md->tables = _pcre_default_tables;
+
+/* The lower casing table and the "must be at the start of a line" flag are
+used in a loop when finding where to start. */
+
+lcc = md->tables + lcc_offset;
+startline = (re->options & PCRE_STARTLINE) != 0;
+firstline = (re->options & PCRE_FIRSTLINE) != 0;
+
+/* Set up the first character to match, if available. The first_byte value is
+never set for an anchored regular expression, but the anchoring may be forced
+at run time, so we have to test for anchoring. The first char may be unset for
+an unanchored pattern, of course. If there's no first char and the pattern was
+studied, there may be a bitmap of possible first characters. */
+
+if (!anchored)
+  {
+  if ((re->options & PCRE_FIRSTSET) != 0)
+    {
+    first_byte = re->first_byte & 255;
+    if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
+      first_byte = lcc[first_byte];
+    }
+  else
+    {
+    /* The bitmap is wanted only when the pattern is NOT tied to the start of
+    a line: the bumpalong loop below tests startline before it looks at
+    start_bits, so with the test the other way round the bitmap could never
+    be consulted. */
+
+    if (!startline && study != NULL &&
+         (study->options & PCRE_STUDY_MAPPED) != 0)
+      start_bits = study->start_bits;
+    }
+  }
+
+/* For anchored or unanchored matches, there may be a "last known required
+character" set. */
+
+if ((re->options & PCRE_REQCHSET) != 0)
+  {
+  req_byte = re->req_byte & 255;
+  req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
+  req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
+  }
+
+/* Call the main matching function, looping for a non-anchored regex after a
+failed match. Unless restarting, optimize by moving to the first match
+character if possible, when not anchored. Then unless wanting a partial match,
+check for a required later character. */
+
+for (;;)
+  {
+  int rc;
+
+  if ((options & PCRE_DFA_RESTART) == 0)
+    {
+    const uschar *save_end_subject = end_subject;
+
+    /* Advance to a unique first char if possible. If firstline is TRUE, the
+    start of the match is constrained to the first line of a multiline string.
+    Implement this by temporarily adjusting end_subject so that we stop
+    scanning at a newline. If the match fails at the newline, later code breaks
+    this loop. */
+
+    if (firstline)
+      {
+      const uschar *t = current_subject;
+      while (t < md->end_subject && !IS_NEWLINE(t)) t++;
+      end_subject = t;
+      }
+
+    if (first_byte >= 0)
+      {
+      if (first_byte_caseless)
+        while (current_subject < end_subject &&
+               lcc[*current_subject] != first_byte)
+          current_subject++;
+      else
+        while (current_subject < end_subject && *current_subject != first_byte)
+          current_subject++;
+      }
+
+    /* Or to just after a linebreak for a multiline match if possible.
+    NOTE(review): the "<=" lets current_subject step to end_subject + 1 when
+    no newline is found; confirm that the matcher tolerates that position. */
+
+    else if (startline)
+      {
+      if (current_subject > md->start_subject + start_offset)
+        {
+        while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
+          current_subject++;
+        }
+      }
+
+    /* Or to a non-unique first char after study */
+
+    else if (start_bits != NULL)
+      {
+      while (current_subject < end_subject)
+        {
+        register unsigned int c = *current_subject;
+        if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
+          else break;
+        }
+      }
+
+    /* Restore fudged end_subject */
+
+    end_subject = save_end_subject;
+    }
+
+  /* If req_byte is set, we know that that character must appear in the subject
+  for the match to succeed. If the first character is set, req_byte must be
+  later in the subject; otherwise the test starts at the match point. This
+  optimization can save a huge amount of work in patterns with nested unlimited
+  repeats that aren't going to match. Writing separate code for cased/caseless
+  versions makes it go faster, as does using an autoincrement and backing off
+  on a match.
+
+  HOWEVER: when the subject string is very, very long, searching to its end can
+  take a long time, and give bad performance on quite ordinary patterns. This
+  showed up when somebody was matching /^C/ on a 32-megabyte string... so we
+  don't do this when the string is sufficiently long.
+
+  ALSO: this processing is disabled when partial matching is requested.
+  */
+
+  if (req_byte >= 0 &&
+      end_subject - current_subject < REQ_BYTE_MAX &&
+      (options & PCRE_PARTIAL) == 0)
+    {
+    register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
+
+    /* We don't need to repeat the search if we haven't yet reached the
+    place we found it at last time. */
+
+    if (p > req_byte_ptr)
+      {
+      if (req_byte_caseless)
+        {
+        while (p < end_subject)
+          {
+          register int pp = *p++;
+          if (pp == req_byte || pp == req_byte2) { p--; break; }
+          }
+        }
+      else
+        {
+        while (p < end_subject)
+          {
+          if (*p++ == req_byte) { p--; break; }
+          }
+        }
+
+      /* If we can't find the required character, break the matching loop,
+      which will cause a return of PCRE_ERROR_NOMATCH. */
+
+      if (p >= end_subject) break;
+
+      /* If we have found the required character, save the point where we
+      found it, so that we don't search again next time round the loop if
+      the start hasn't passed this character yet. */
+
+      req_byte_ptr = p;
+      }
+    }
+
+  /* OK, now we can do the business */
+
+  rc = internal_dfa_exec(
+    md,                                /* fixed match data */
+    md->start_code,                    /* this subexpression's code */
+    current_subject,                   /* where we currently are */
+    start_offset,                      /* start offset in subject */
+    offsets,                           /* offset vector */
+    offsetcount,                       /* size of same */
+    workspace,                         /* workspace vector */
+    wscount,                           /* size of same */
+    re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
+    0,                                 /* function recurse level */
+    0);                                /* regex recurse level */
+
+  /* Anything other than "no match" means we are done, always; otherwise, carry
+  on only if not anchored. */
+
+  if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
+
+  /* Advance to the next subject character unless we are at the end of a line
+  and firstline is set. */
+
+  if (firstline && IS_NEWLINE(current_subject)) break;
+  current_subject++;
+  if (utf8)
+    {
+    while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
+      current_subject++;
+    }
+  if (current_subject > end_subject) break;
+
+  /* If we have just passed a CR and the newline option is CRLF or ANY, and we
+  are now at a LF, advance the match position by one more character. */
+
+  if (current_subject[-1] == '\r' &&
+       (md->nltype == NLTYPE_ANY || md->nllen == 2) &&
+       current_subject < end_subject &&
+       *current_subject == '\n')
+    current_subject++;
+
+  }   /* "Bumpalong" loop */
+
+return PCRE_ERROR_NOMATCH;
+}
+
+/* End of pcre_dfa_exec.c */
diff --git a/glib/pcre/pcre_exec.c b/glib/pcre/pcre_exec.c
new file mode 100644
index 0000000..890e0f7
--- /dev/null
+++ b/glib/pcre/pcre_exec.c
@@ -0,0 +1,4199 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+           Copyright (c) 1997-2006 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains pcre_exec(), the externally visible function that does +pattern matching using an NFA algorithm, trying to mimic Perl as closely as +possible. There are also some static supporting functions. */ + +#define NLBLOCK md /* Block containing newline information */ +#define PSSTART start_subject /* Field containing processed string start */ +#define PSEND end_subject /* Field containing processed string end */ + +#include "pcre_internal.h" + +/* The chain of eptrblocks for tail recursions uses memory in stack workspace, +obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */ + +#define EPTR_WORK_SIZE (1000) + +/* Flag bits for the match() function */ + +#define match_condassert 0x01 /* Called to check a condition assertion */ +#define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */ +#define match_tail_recursed 0x04 /* Tail recursive call */ + +/* Non-error returns from the match() function. Error returns are externally +defined PCRE_ERROR_xxx codes, which are all negative. 
*/
+
+#define MATCH_MATCH        1
+#define MATCH_NOMATCH      0
+
+/* Maximum number of ints of offset to save on the stack for recursive calls.
+If the offset vector is bigger, malloc is used. This should be a multiple of 3,
+because the offset vector is always a multiple of 3 long. */
+
+#define REC_STACK_SAVE_MAX 30
+
+/* Min and max values for the common repeats; for the maxima, 0 => infinity */
+
+static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
+static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
+
+
+
+#ifdef DEBUG
+/*************************************************
+*        Debugging function to print chars       *
+*************************************************/
+
+/* Write a run of characters to stdout in printable form: printable characters
+are output as themselves, anything else as a \xNN hexadecimal escape. When the
+characters come from the subject, the run is cut short at the end of the
+subject.
+
+Arguments:
+  p           points to characters
+  length      number to print
+  is_subject  TRUE if printing from within md->start_subject
+  md          pointer to matching data block, if is_subject is TRUE
+
+Returns:     nothing
+*/
+
+static void
+pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
+{
+/* Never run past the end of the subject */
+
+if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
+
+for (; length > 0; length--)
+  {
+  unsigned int ch = *p++;
+  if (isprint(ch)) printf("%c", ch);
+  else printf("\\x%02x", ch);
+  }
+}
+#endif
+
+
+
+/*************************************************
+*          Match a back-reference                *
+*************************************************/
+
+/* If a back reference hasn't been set, the length that is passed is greater
+than the number of characters left in the string, so the match fails.
+
+Arguments:
+  offset      index into the offset vector
+  eptr        points into the subject
+  length      length to be matched
+  md          points to match data block
+  ims         the ims flags
+
+Returns:      TRUE if matched
+*/
+
+static BOOL
+match_ref(int offset, register USPTR eptr, int length, match_data *md,
+  unsigned long int ims)
+{
+USPTR ref = md->start_subject + md->offset_vector[offset];
+
+#ifdef DEBUG
+if (eptr < md->end_subject)
+  {
+  printf("matching subject ");
+  pchars(eptr, length, TRUE, md);
+  }
+else
+  printf("matching subject ");
+printf(" against backref ");
+pchars(ref, length, FALSE, md);
+printf("\n");
+#endif
+
+/* A reference longer than what is left of the subject can never match */
+
+if (length > md->end_subject - eptr) return FALSE;
+
+/* Compare character by character. The caseless comparison maps both sides
+through md->lcc and has a loop of its own, for speed. */
+
+if ((ims & PCRE_CASELESS) == 0)
+  {
+  for (; length > 0; length--)
+    if (*ref++ != *eptr++) return FALSE;
+  }
+else
+  {
+  for (; length > 0; length--)
+    if (md->lcc[*ref++] != md->lcc[*eptr++]) return FALSE;
+  }
+
+return TRUE;
+}
+
+
+
+/***************************************************************************
+****************************************************************************
+                   RECURSION IN THE match() FUNCTION
+
+The match() function is highly recursive, though not every recursive call
+increases the recursive depth. Nevertheless, some regular expressions can cause
+it to recurse to a great depth. I was writing for Unix, so I just let it call
+itself recursively. This uses the stack for saving everything that has to be
+saved for a recursive call. On Unix, the stack can be large, and this works
+fine.
+
+It turns out that on some non-Unix-like systems there are problems with
+programs that use a lot of stack. (This despite the fact that every last chip
+has oodles of memory these days, and techniques for extending the stack have
+been known for decades.) So....
+
+There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
+calls by keeping local variables that need to be preserved in blocks of memory
+obtained from malloc() instead of on the stack. Macros are used to
+achieve this so that the actual code doesn't look very different to what it
+always used to.
+****************************************************************************
+***************************************************************************/
+
+
+/* These versions of the macros use the stack, as normal. There are debugging
+versions and production versions.
+
+RMATCH(rx,ra,...,rg) calls match() with ra to rg as its first seven arguments
+and rdepth+1 as the eighth, and stores the result in rx; rdepth is taken from
+the scope in which the macro is expanded. RRETURN(ra) returns ra from the
+enclosing function. The debugging versions also print the source line number
+before and after the call, and the returned value. */
+
+#ifndef NO_RECURSE
+#define REGISTER register
+#ifdef DEBUG
+#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
+  { \
+  printf("match() called in line %d\n", __LINE__); \
+  rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1); \
+  printf("to line %d\n", __LINE__); \
+  }
+#define RRETURN(ra) \
+  { \
+  printf("match() returned %d from line %d ", ra, __LINE__); \
+  return ra; \
+  }
+#else
+#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
+  rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1)
+#define RRETURN(ra) return ra
+#endif
+
+#else
+
+
+/* These versions of the macros manage a private stack on the heap. Note
+that the rd argument of RMATCH isn't actually used. It's the md argument of
+match(), which never changes.
*/ + +#define REGISTER + +#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\ + {\ + heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\ + if (setjmp(frame->Xwhere) == 0)\ + {\ + newframe->Xeptr = ra;\ + newframe->Xecode = rb;\ + newframe->Xoffset_top = rc;\ + newframe->Xims = re;\ + newframe->Xeptrb = rf;\ + newframe->Xflags = rg;\ + newframe->Xrdepth = frame->Xrdepth + 1;\ + newframe->Xprevframe = frame;\ + frame = newframe;\ + DPRINTF(("restarting from line %d\n", __LINE__));\ + goto HEAP_RECURSE;\ + }\ + else\ + {\ + DPRINTF(("longjumped back to line %d\n", __LINE__));\ + frame = md->thisframe;\ + rx = frame->Xresult;\ + }\ + } + +#define RRETURN(ra)\ + {\ + heapframe *newframe = frame;\ + frame = newframe->Xprevframe;\ + (pcre_stack_free)(newframe);\ + if (frame != NULL)\ + {\ + frame->Xresult = ra;\ + md->thisframe = frame;\ + longjmp(frame->Xwhere, 1);\ + }\ + return ra;\ + } + + +/* Structure for remembering the local variables in a private frame */ + +typedef struct heapframe { + struct heapframe *Xprevframe; + + /* Function arguments that may change */ + + const uschar *Xeptr; + const uschar *Xecode; + int Xoffset_top; + long int Xims; + eptrblock *Xeptrb; + int Xflags; + unsigned int Xrdepth; + + /* Function local variables */ + + const uschar *Xcallpat; + const uschar *Xcharptr; + const uschar *Xdata; + const uschar *Xnext; + const uschar *Xpp; + const uschar *Xprev; + const uschar *Xsaved_eptr; + + recursion_info Xnew_recursive; + + BOOL Xcur_is_word; + BOOL Xcondition; + BOOL Xprev_is_word; + + unsigned long int Xoriginal_ims; + +#ifdef SUPPORT_UCP + int Xprop_type; + int Xprop_value; + int Xprop_fail_result; + int Xprop_category; + int Xprop_chartype; + int Xprop_script; +#endif + + int Xctype; + unsigned int Xfc; + int Xfi; + int Xlength; + int Xmax; + int Xmin; + int Xnumber; + int Xoffset; + int Xop; + int Xsave_capture_last; + int Xsave_offset1, Xsave_offset2, Xsave_offset3; + int Xstacksave[REC_STACK_SAVE_MAX]; + + eptrblock Xnewptrb; + + /* 
Place to pass back result, and where to jump back to */ + + int Xresult; + jmp_buf Xwhere; + +} heapframe; + +#endif + + +/*************************************************************************** +***************************************************************************/ + + + +/************************************************* +* Match from current position * +*************************************************/ + +/* This function is called recursively in many circumstances. Whenever it +returns a negative (error) response, the outer incarnation must also return the +same response. + +Performance note: It might be tempting to extract commonly used fields from the +md structure (e.g. utf8, end_subject) into individual variables to improve +performance. Tests using gcc on a SPARC disproved this; in the first case, it +made performance worse. + +Arguments: + eptr pointer to current character in subject + ecode pointer to current position in compiled code + offset_top current top pointer + md pointer to "static" info for the match + ims current /i, /m, and /s options + eptrb pointer to chain of blocks containing eptr at start of + brackets - for testing for empty matches + flags can contain + match_condassert - this is an assertion condition + match_cbegroup - this is the start of an unlimited repeat + group that can match an empty string + match_tail_recursed - this is a tail_recursed group + rdepth the recursion depth + +Returns: MATCH_MATCH if matched ) these values are >= 0 + MATCH_NOMATCH if failed to match ) + a negative PCRE_ERROR_xxx value if aborted by an error condition + (e.g. stopped by repeated call or recursion limit) +*/ + +static int +match(REGISTER USPTR eptr, REGISTER const uschar *ecode, + int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, + int flags, unsigned int rdepth) +{ +/* These variables do not need to be preserved over recursion in this function, +so they can be ordinary variables in all cases. 
Mark some of them with +"register" because they are used a lot in loops. */ + +register int rrc; /* Returns from recursive calls */ +register int i; /* Used for loops not involving calls to RMATCH() */ +register unsigned int c; /* Character values not kept over RMATCH() calls */ +register BOOL utf8; /* Local copy of UTF-8 flag for speed */ + +BOOL minimize, possessive; /* Quantifier options */ + +/* When recursion is not being used, all "local" variables that have to be +preserved over calls to RMATCH() are part of a "frame" which is obtained from +heap storage. Set up the top-level frame here; others are obtained from the +heap whenever RMATCH() does a "recursion". See the macro definitions above. */ + +#ifdef NO_RECURSE +heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe)); +frame->Xprevframe = NULL; /* Marks the top level */ + +/* Copy in the original argument variables */ + +frame->Xeptr = eptr; +frame->Xecode = ecode; +frame->Xoffset_top = offset_top; +frame->Xims = ims; +frame->Xeptrb = eptrb; +frame->Xflags = flags; +frame->Xrdepth = rdepth; + +/* This is where control jumps back to to effect "recursion" */ + +HEAP_RECURSE: + +/* Macros make the argument variables come from the current frame */ + +#define eptr frame->Xeptr +#define ecode frame->Xecode +#define offset_top frame->Xoffset_top +#define ims frame->Xims +#define eptrb frame->Xeptrb +#define flags frame->Xflags +#define rdepth frame->Xrdepth + +/* Ditto for the local variables */ + +#ifdef SUPPORT_UTF8 +#define charptr frame->Xcharptr +#endif +#define callpat frame->Xcallpat +#define data frame->Xdata +#define next frame->Xnext +#define pp frame->Xpp +#define prev frame->Xprev +#define saved_eptr frame->Xsaved_eptr + +#define new_recursive frame->Xnew_recursive + +#define cur_is_word frame->Xcur_is_word +#define condition frame->Xcondition +#define prev_is_word frame->Xprev_is_word + +#define original_ims frame->Xoriginal_ims + +#ifdef SUPPORT_UCP +#define prop_type frame->Xprop_type +#define 
prop_value frame->Xprop_value +#define prop_fail_result frame->Xprop_fail_result +#define prop_category frame->Xprop_category +#define prop_chartype frame->Xprop_chartype +#define prop_script frame->Xprop_script +#endif + +#define ctype frame->Xctype +#define fc frame->Xfc +#define fi frame->Xfi +#define length frame->Xlength +#define max frame->Xmax +#define min frame->Xmin +#define number frame->Xnumber +#define offset frame->Xoffset +#define op frame->Xop +#define save_capture_last frame->Xsave_capture_last +#define save_offset1 frame->Xsave_offset1 +#define save_offset2 frame->Xsave_offset2 +#define save_offset3 frame->Xsave_offset3 +#define stacksave frame->Xstacksave + +#define newptrb frame->Xnewptrb + +/* When recursion is being used, local variables are allocated on the stack and +get preserved during recursion in the normal way. In this environment, fi and +i, and fc and c, can be the same variables. */ + +#else /* NO_RECURSE not defined */ +#define fi i +#define fc c + + +#ifdef SUPPORT_UTF8 /* Many of these variables are used only */ +const uschar *charptr; /* in small blocks of the code. My normal */ +#endif /* style of coding would have declared */ +const uschar *callpat; /* them within each of those blocks. */ +const uschar *data; /* However, in order to accommodate the */ +const uschar *next; /* version of this code that uses an */ +USPTR pp; /* external "stack" implemented on the */ +const uschar *prev; /* heap, it is easier to declare them all */ +USPTR saved_eptr; /* here, so the declarations can be cut */ + /* out in a block. The only declarations */ +recursion_info new_recursive; /* within blocks below are for variables */ + /* that do not have to be preserved over */ +BOOL cur_is_word; /* a recursive call to RMATCH(). 
*/ +BOOL condition; +BOOL prev_is_word; + +unsigned long int original_ims; + +#ifdef SUPPORT_UCP +int prop_type; +int prop_value; +int prop_fail_result; +int prop_category; +int prop_chartype; +int prop_script; +#endif + +int ctype; +int length; +int max; +int min; +int number; +int offset; +int op; +int save_capture_last; +int save_offset1, save_offset2, save_offset3; +int stacksave[REC_STACK_SAVE_MAX]; + +eptrblock newptrb; +#endif /* NO_RECURSE */ + +/* These statements are here to stop the compiler complaining about unitialized +variables. */ + +#ifdef SUPPORT_UCP +prop_value = 0; +prop_fail_result = 0; +#endif + + +/* This label is used for tail recursion, which is used in a few cases even +when NO_RECURSE is not defined, in order to reduce the amount of stack that is +used. Thanks to Ian Taylor for noticing this possibility and sending the +original patch. */ + +TAIL_RECURSE: + +/* OK, now we can get on with the real code of the function. Recursive calls +are specified by the macro RMATCH and RRETURN is used to return. When +NO_RECURSE is *not* defined, these just turn into a recursive call to match() +and a "return", respectively (possibly with some debugging if DEBUG is +defined). However, RMATCH isn't like a function call because it's quite a +complicated macro. It has to be used in one particular way. This shouldn't, +however, impact performance when true recursion is being used. */ + +/* First check that we haven't called match() too many times, or that we +haven't exceeded the recursive call limit. */ + +if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT); +if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT); + +original_ims = ims; /* Save for resetting on ')' */ + +#ifdef SUPPORT_UTF8 +utf8 = md->utf8; /* Local copy of the flag */ +#else +utf8 = FALSE; +#endif + +/* At the start of a group with an unlimited repeat that may match an empty +string, the match_cbegroup flag is set. 
When this is the case, add the current +subject pointer to the chain of such remembered pointers, to be checked when we +hit the closing ket, in order to break infinite loops that match no characters. +When match() is called in other circumstances, don't add to the chain. If this +is a tail recursion, use a block from the workspace, as the one on the stack is +already used. */ + +if ((flags & match_cbegroup) != 0) + { + eptrblock *p; + if ((flags & match_tail_recursed) != 0) + { + if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT); + p = md->eptrchain + md->eptrn++; + } + else p = &newptrb; + p->epb_saved_eptr = eptr; + p->epb_prev = eptrb; + eptrb = p; + } + +/* Now start processing the opcodes. */ + +for (;;) + { + minimize = possessive = FALSE; + op = *ecode; + + /* For partial matching, remember if we ever hit the end of the subject after + matching at least one subject character. */ + + if (md->partial && + eptr >= md->end_subject && + eptr > md->start_match) + md->hitend = TRUE; + + switch(op) + { + /* Handle a capturing bracket. If there is space in the offset vector, save + the current subject position in the working slot at the top of the vector. + We mustn't change the current values of the data slot, because they may be + set from a previous iteration of this group, and be referred to by a + reference inside the group. + + If the bracket fails to match, we need to restore this value and also the + values of the final offsets, in case they were set by a previous iteration + of the same bracket. + + If there isn't enough space in the offset vector, treat this as if it were + a non-capturing bracket. Don't worry about setting the flag for the error + case here; that is handled in the code for KET. 
*/ + + case OP_CBRA: + case OP_SCBRA: + number = GET2(ecode, 1+LINK_SIZE); + offset = number << 1; + +#ifdef DEBUG + printf("start bracket %d\n", number); + printf("subject="); + pchars(eptr, 16, TRUE, md); + printf("\n"); +#endif + + if (offset < md->offset_max) + { + save_offset1 = md->offset_vector[offset]; + save_offset2 = md->offset_vector[offset+1]; + save_offset3 = md->offset_vector[md->offset_end - number]; + save_capture_last = md->capture_last; + + DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); + md->offset_vector[md->offset_end - number] = eptr - md->start_subject; + + flags = (op == OP_SCBRA)? match_cbegroup : 0; + do + { + RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, + ims, eptrb, flags); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + md->capture_last = save_capture_last; + ecode += GET(ecode, 1); + } + while (*ecode == OP_ALT); + + DPRINTF(("bracket %d failed\n", number)); + + md->offset_vector[offset] = save_offset1; + md->offset_vector[offset+1] = save_offset2; + md->offset_vector[md->offset_end - number] = save_offset3; + + RRETURN(MATCH_NOMATCH); + } + + /* Insufficient room for saving captured contents. Treat as a non-capturing + bracket. */ + + DPRINTF(("insufficient capture room: treat as non-capturing\n")); + + /* Non-capturing bracket. Loop for all the alternatives. When we get to the + final alternative within the brackets, we would return the result of a + recursive call to match() whatever happened. We can reduce stack usage by + turning this into a tail recursion. */ + + case OP_BRA: + case OP_SBRA: + DPRINTF(("start non-capturing bracket\n")); + flags = (op >= OP_SBRA)? match_cbegroup : 0; + for (;;) + { + if (ecode[GET(ecode, 1)] != OP_ALT) + { + ecode += _pcre_OP_lengths[*ecode]; + flags |= match_tail_recursed; + DPRINTF(("bracket 0 tail recursion\n")); + goto TAIL_RECURSE; + } + + /* For non-final alternatives, continue the loop for a NOMATCH result; + otherwise return. 
*/ + + RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, + eptrb, flags); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode += GET(ecode, 1); + } + /* Control never reaches here. */ + + /* Conditional group: compilation checked that there are no more than + two branches. If the condition is false, skipping the first branch takes us + past the end if there is only one branch, but that's OK because that is + exactly what going to the ket would do. As there is only one branch to be + obeyed, we can use tail recursion to avoid using another stack frame. */ + + case OP_COND: + case OP_SCOND: + if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */ + { + offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ + condition = md->recursive != NULL && + (offset == RREF_ANY || offset == md->recursive->group_num); + ecode += condition? 3 : GET(ecode, 1); + } + + else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */ + { + offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ + condition = offset < offset_top && md->offset_vector[offset] >= 0; + ecode += condition? 3 : GET(ecode, 1); + } + + else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */ + { + condition = FALSE; + ecode += GET(ecode, 1); + } + + /* The condition is an assertion. Call match() to evaluate it - setting + the final argument match_condassert causes it to stop at the end of an + assertion. */ + + else + { + RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, + match_condassert); + if (rrc == MATCH_MATCH) + { + condition = TRUE; + ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2); + while (*ecode == OP_ALT) ecode += GET(ecode, 1); + } + else if (rrc != MATCH_NOMATCH) + { + RRETURN(rrc); /* Need braces because of following else */ + } + else + { + condition = FALSE; + ecode += GET(ecode, 1); + } + } + + /* We are now at the branch that is to be obeyed. 
As there is only one, + we can use tail recursion to avoid using another stack frame. If the second + alternative doesn't exist, we can just plough on. */ + + if (condition || *ecode == OP_ALT) + { + ecode += 1 + LINK_SIZE; + flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0); + goto TAIL_RECURSE; + } + else + { + ecode += 1 + LINK_SIZE; + } + break; + + + /* End of the pattern. If we are in a top-level recursion, we should + restore the offsets appropriately and continue from after the call. */ + + case OP_END: + if (md->recursive != NULL && md->recursive->group_num == 0) + { + recursion_info *rec = md->recursive; + DPRINTF(("End of pattern in a (?0) recursion\n")); + md->recursive = rec->prevrec; + memmove(md->offset_vector, rec->offset_save, + rec->saved_max * sizeof(int)); + md->start_match = rec->save_start; + ims = original_ims; + ecode = rec->after_call; + break; + } + + /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty + string - backtracking will then try other alternatives, if any. */ + + if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH); + md->end_match_ptr = eptr; /* Record where we ended */ + md->end_offset_top = offset_top; /* and how many extracts were taken */ + RRETURN(MATCH_MATCH); + + /* Change option settings */ + + case OP_OPT: + ims = ecode[1]; + ecode += 2; + DPRINTF(("ims set to %02lx\n", ims)); + break; + + /* Assertion brackets. Check the alternative branches in turn - the + matching won't pass the KET for an assertion. If any one branch matches, + the assertion is true. Lookbehind assertions have an OP_REVERSE item at the + start of each branch to move the current point backwards, so the code at + this level is identical to the lookahead case. 
*/ + + case OP_ASSERT: + case OP_ASSERTBACK: + do + { + RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0); + if (rrc == MATCH_MATCH) break; + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode += GET(ecode, 1); + } + while (*ecode == OP_ALT); + if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); + + /* If checking an assertion for a condition, return MATCH_MATCH. */ + + if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); + + /* Continue from after the assertion, updating the offsets high water + mark, since extracts may have been taken during the assertion. */ + + do ecode += GET(ecode,1); while (*ecode == OP_ALT); + ecode += 1 + LINK_SIZE; + offset_top = md->end_offset_top; + continue; + + /* Negative assertion: all branches must fail to match */ + + case OP_ASSERT_NOT: + case OP_ASSERTBACK_NOT: + do + { + RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0); + if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode += GET(ecode,1); + } + while (*ecode == OP_ALT); + + if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); + + ecode += 1 + LINK_SIZE; + continue; + + /* Move the subject pointer back. This occurs only at the start of + each branch of a lookbehind assertion. If we are too close to the start to + move back, this match function fails. When working with UTF-8 we move + back a number of characters, not bytes. */ + + case OP_REVERSE: +#ifdef SUPPORT_UTF8 + if (utf8) + { + i = GET(ecode, 1); + while (i-- > 0) + { + eptr--; + if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); + BACKCHAR(eptr) + } + } + else +#endif + + /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ + + { + eptr -= GET(ecode, 1); + if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); + } + + /* Skip to next op code */ + + ecode += 1 + LINK_SIZE; + break; + + /* The callout item calls an external function, if one is provided, passing + details of the match so far. 
This is mainly for debugging, though the + function is able to force a failure. */ + + case OP_CALLOUT: + if (pcre_callout != NULL) + { + pcre_callout_block cb; + cb.version = 1; /* Version 1 of the callout block */ + cb.callout_number = ecode[1]; + cb.offset_vector = md->offset_vector; + cb.subject = (PCRE_SPTR)md->start_subject; + cb.subject_length = md->end_subject - md->start_subject; + cb.start_match = md->start_match - md->start_subject; + cb.current_position = eptr - md->start_subject; + cb.pattern_position = GET(ecode, 2); + cb.next_item_length = GET(ecode, 2 + LINK_SIZE); + cb.capture_top = offset_top/2; + cb.capture_last = md->capture_last; + cb.callout_data = md->callout_data; + if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); + if (rrc < 0) RRETURN(rrc); + } + ecode += 2 + 2*LINK_SIZE; + break; + + /* Recursion either matches the current regex, or some subexpression. The + offset data is the offset to the starting bracket from the start of the + whole pattern. (This is so that it works from duplicated subpatterns.) + + If there are any capturing brackets started but not finished, we have to + save their starting points and reinstate them after the recursion. However, + we don't know how many such there are (offset_top records the completed + total) so we just have to save all the potential data. There may be up to + 65535 such values, which is too large to put on the stack, but using malloc + for small numbers seems expensive. As a compromise, the stack is used when + there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc + is used. A problem is what to do if the malloc fails ... there is no way of + returning to the top level with an error. Save the top REC_STACK_SAVE_MAX + values on the stack, and accept that the rest may be wrong. + + There are also other values that have to be saved. We use a chained + sequence of blocks that actually live on the stack. 
Thanks to Robin Houston + for the original version of this logic. */ + + case OP_RECURSE: + { + callpat = md->start_code + GET(ecode, 1); + new_recursive.group_num = (callpat == md->start_code)? 0 : + GET2(callpat, 1 + LINK_SIZE); + + /* Add to "recursing stack" */ + + new_recursive.prevrec = md->recursive; + md->recursive = &new_recursive; + + /* Find where to continue from afterwards */ + + ecode += 1 + LINK_SIZE; + new_recursive.after_call = ecode; + + /* Now save the offset data. */ + + new_recursive.saved_max = md->offset_end; + if (new_recursive.saved_max <= REC_STACK_SAVE_MAX) + new_recursive.offset_save = stacksave; + else + { + new_recursive.offset_save = + (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int)); + if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY); + } + + memcpy(new_recursive.offset_save, md->offset_vector, + new_recursive.saved_max * sizeof(int)); + new_recursive.save_start = md->start_match; + md->start_match = eptr; + + /* OK, now we can do the recursion. For each top-level alternative we + restore the offset and recursion data. */ + + DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); + flags = (*callpat >= OP_SBRA)? 
match_cbegroup : 0; + do + { + RMATCH(rrc, eptr, callpat + _pcre_OP_lengths[*callpat], offset_top, + md, ims, eptrb, flags); + if (rrc == MATCH_MATCH) + { + DPRINTF(("Recursion matched\n")); + md->recursive = new_recursive.prevrec; + if (new_recursive.offset_save != stacksave) + (pcre_free)(new_recursive.offset_save); + RRETURN(MATCH_MATCH); + } + else if (rrc != MATCH_NOMATCH) + { + DPRINTF(("Recursion gave error %d\n", rrc)); + RRETURN(rrc); + } + + md->recursive = &new_recursive; + memcpy(md->offset_vector, new_recursive.offset_save, + new_recursive.saved_max * sizeof(int)); + callpat += GET(callpat, 1); + } + while (*callpat == OP_ALT); + + DPRINTF(("Recursion didn't match\n")); + md->recursive = new_recursive.prevrec; + if (new_recursive.offset_save != stacksave) + (pcre_free)(new_recursive.offset_save); + RRETURN(MATCH_NOMATCH); + } + /* Control never reaches here */ + + /* "Once" brackets are like assertion brackets except that after a match, + the point in the subject string is not moved back. Thus there can never be + a move back into the brackets. Friedl calls these "atomic" subpatterns. + Check the alternative branches in turn - the matching won't pass the KET + for this kind of subpattern. If any one branch matches, we carry on as at + the end of a normal bracket, leaving the subject pointer. */ + + case OP_ONCE: + prev = ecode; + saved_eptr = eptr; + + do + { + RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, + eptrb, 0); + if (rrc == MATCH_MATCH) break; + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode += GET(ecode,1); + } + while (*ecode == OP_ALT); + + /* If hit the end of the group (which could be repeated), fail */ + + if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH); + + /* Continue as from after the assertion, updating the offsets high water + mark, since extracts may have been taken. 
*/ + + do ecode += GET(ecode, 1); while (*ecode == OP_ALT); + + offset_top = md->end_offset_top; + eptr = md->end_match_ptr; + + /* For a non-repeating ket, just continue at this level. This also + happens for a repeating ket if no characters were matched in the group. + This is the forcible breaking of infinite loops as implemented in Perl + 5.005. If there is an options reset, it will get obeyed in the normal + course of events. */ + + if (*ecode == OP_KET || eptr == saved_eptr) + { + ecode += 1+LINK_SIZE; + break; + } + + /* The repeating kets try the rest of the pattern or restart from the + preceding bracket, in the appropriate order. The second "call" of match() + uses tail recursion, to avoid using another stack frame. We need to reset + any options that changed within the bracket before re-running it, so + check the next opcode. */ + + if (ecode[1+LINK_SIZE] == OP_OPT) + { + ims = (ims & ~PCRE_IMS) | ecode[4]; + DPRINTF(("ims set to %02lx at group repeat\n", ims)); + } + + if (*ecode == OP_KETRMIN) + { + RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode = prev; + flags = match_tail_recursed; + goto TAIL_RECURSE; + } + else /* OP_KETRMAX */ + { + RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_cbegroup); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode += 1 + LINK_SIZE; + flags = match_tail_recursed; + goto TAIL_RECURSE; + } + /* Control never gets here */ + + /* An alternation is the end of a branch; scan along to find the end of the + bracketed group and go to there. */ + + case OP_ALT: + do ecode += GET(ecode,1); while (*ecode == OP_ALT); + break; + + /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating + that it may occur zero times. It may repeat infinitely, or not at all - + i.e. it could be ()* or ()? in the pattern. 
Brackets with fixed upper + repeat limits are compiled as a number of copies, with the optional ones + preceded by BRAZERO or BRAMINZERO. */ + + case OP_BRAZERO: + { + next = ecode+1; + RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + do next += GET(next,1); while (*next == OP_ALT); + ecode = next + 1 + LINK_SIZE; + } + break; + + case OP_BRAMINZERO: + { + next = ecode+1; + do next += GET(next, 1); while (*next == OP_ALT); + RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode++; + } + break; + + /* End of a group, repeated or non-repeating. */ + + case OP_KET: + case OP_KETRMIN: + case OP_KETRMAX: + prev = ecode - GET(ecode, 1); + + /* If this was a group that remembered the subject start, in order to break + infinite repeats of empty string matches, retrieve the subject start from + the chain. Otherwise, set it NULL. */ + + if (*prev >= OP_SBRA) + { + saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */ + eptrb = eptrb->epb_prev; /* Backup to previous group */ + } + else saved_eptr = NULL; + + /* If we are at the end of an assertion group, stop matching and return + MATCH_MATCH, but record the current high water mark for use by positive + assertions. Do this also for the "once" (atomic) groups. */ + + if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || + *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT || + *prev == OP_ONCE) + { + md->end_match_ptr = eptr; /* For ONCE */ + md->end_offset_top = offset_top; + RRETURN(MATCH_MATCH); + } + + /* For capturing groups we have to check the group number back at the start + and if necessary complete handling an extraction by setting the offsets and + bumping the high water mark. Note that whole-pattern recursion is coded as + a recurse into group 0, so it won't be picked up here. Instead, we catch it + when the OP_END is reached. Other recursion is handled here. 
*/ + + if (*prev == OP_CBRA || *prev == OP_SCBRA) + { + number = GET2(prev, 1+LINK_SIZE); + offset = number << 1; + +#ifdef DEBUG + printf("end bracket %d", number); + printf("\n"); +#endif + + md->capture_last = number; + if (offset >= md->offset_max) md->offset_overflow = TRUE; else + { + md->offset_vector[offset] = + md->offset_vector[md->offset_end - number]; + md->offset_vector[offset+1] = eptr - md->start_subject; + if (offset_top <= offset) offset_top = offset + 2; + } + + /* Handle a recursively called group. Restore the offsets + appropriately and continue from after the call. */ + + if (md->recursive != NULL && md->recursive->group_num == number) + { + recursion_info *rec = md->recursive; + DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); + md->recursive = rec->prevrec; + md->start_match = rec->save_start; + memcpy(md->offset_vector, rec->offset_save, + rec->saved_max * sizeof(int)); + ecode = rec->after_call; + ims = original_ims; + break; + } + } + + /* For both capturing and non-capturing groups, reset the value of the ims + flags, in case they got changed during the group. */ + + ims = original_ims; + DPRINTF(("ims reset to %02lx\n", ims)); + + /* For a non-repeating ket, just continue at this level. This also + happens for a repeating ket if no characters were matched in the group. + This is the forcible breaking of infinite loops as implemented in Perl + 5.005. If there is an options reset, it will get obeyed in the normal + course of events. */ + + if (*ecode == OP_KET || eptr == saved_eptr) + { + ecode += 1 + LINK_SIZE; + break; + } + + /* The repeating kets try the rest of the pattern or restart from the + preceding bracket, in the appropriate order. In the second case, we can use + tail recursion to avoid using another stack frame. */ + + flags = (*prev >= OP_SBRA)? 
match_cbegroup : 0; + + if (*ecode == OP_KETRMIN) + { + RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode = prev; + flags |= match_tail_recursed; + goto TAIL_RECURSE; + } + else /* OP_KETRMAX */ + { + RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, flags); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode += 1 + LINK_SIZE; + flags = match_tail_recursed; + goto TAIL_RECURSE; + } + /* Control never gets here */ + + /* Start of subject unless notbol, or after internal newline if multiline */ + + case OP_CIRC: + if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); + if ((ims & PCRE_MULTILINE) != 0) + { + if (eptr != md->start_subject && + (eptr == md->end_subject || !WAS_NEWLINE(eptr))) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + } + /* ... else fall through */ + + /* Start of subject assertion */ + + case OP_SOD: + if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH); + ecode++; + break; + + /* Start of match assertion */ + + case OP_SOM: + if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH); + ecode++; + break; + + /* Assert before internal newline if multiline, or before a terminating + newline unless endonly is set, else end of subject unless noteol is set. */ + + case OP_DOLL: + if ((ims & PCRE_MULTILINE) != 0) + { + if (eptr < md->end_subject) + { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); } + else + { if (md->noteol) RRETURN(MATCH_NOMATCH); } + ecode++; + break; + } + else + { + if (md->noteol) RRETURN(MATCH_NOMATCH); + if (!md->endonly) + { + if (eptr != md->end_subject && + (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + } + } + /* ... 
else fall through for endonly */ + + /* End of subject assertion (\z) */ + + case OP_EOD: + if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH); + ecode++; + break; + + /* End of subject or ending \n assertion (\Z) */ + + case OP_EODN: + if (eptr != md->end_subject && + (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + + /* Word boundary assertions */ + + case OP_NOT_WORD_BOUNDARY: + case OP_WORD_BOUNDARY: + { + + /* Find out if the previous and current characters are "word" characters. + It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to + be "non-word" characters. */ + +#ifdef SUPPORT_UTF8 + if (utf8) + { + if (eptr == md->start_subject) prev_is_word = FALSE; else + { + const uschar *lastptr = eptr - 1; + while((*lastptr & 0xc0) == 0x80) lastptr--; + GETCHAR(c, lastptr); + prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; + } + if (eptr >= md->end_subject) cur_is_word = FALSE; else + { + GETCHAR(c, eptr); + cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; + } + } + else +#endif + + /* More streamlined when not in UTF-8 mode */ + + { + prev_is_word = (eptr != md->start_subject) && + ((md->ctypes[eptr[-1]] & ctype_word) != 0); + cur_is_word = (eptr < md->end_subject) && + ((md->ctypes[*eptr] & ctype_word) != 0); + } + + /* Now see if the situation is what we want */ + + if ((*ecode++ == OP_WORD_BOUNDARY)? + cur_is_word == prev_is_word : cur_is_word != prev_is_word) + RRETURN(MATCH_NOMATCH); + } + break; + + /* Match a single character type; inline for speed */ + + case OP_ANY: + if ((ims & PCRE_DOTALL) == 0) + { + if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); + } + if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (utf8) + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + ecode++; + break; + + /* Match a single byte, even in UTF-8 mode. This opcode really does match + any byte, even newline, independent of the setting of PCRE_DOTALL. 
*/ + + case OP_ANYBYTE: + if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); + ecode++; + break; + + case OP_NOT_DIGIT: + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + if ( +#ifdef SUPPORT_UTF8 + c < 256 && +#endif + (md->ctypes[c] & ctype_digit) != 0 + ) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + + case OP_DIGIT: + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + if ( +#ifdef SUPPORT_UTF8 + c >= 256 || +#endif + (md->ctypes[c] & ctype_digit) == 0 + ) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + + case OP_NOT_WHITESPACE: + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + if ( +#ifdef SUPPORT_UTF8 + c < 256 && +#endif + (md->ctypes[c] & ctype_space) != 0 + ) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + + case OP_WHITESPACE: + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + if ( +#ifdef SUPPORT_UTF8 + c >= 256 || +#endif + (md->ctypes[c] & ctype_space) == 0 + ) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + + case OP_NOT_WORDCHAR: + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + if ( +#ifdef SUPPORT_UTF8 + c < 256 && +#endif + (md->ctypes[c] & ctype_word) != 0 + ) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + + case OP_WORDCHAR: + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + if ( +#ifdef SUPPORT_UTF8 + c >= 256 || +#endif + (md->ctypes[c] & ctype_word) == 0 + ) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + + case OP_ANYNL: + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + switch(c) + { + default: RRETURN(MATCH_NOMATCH); + case 0x000d: + if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + break; + case 0x000a: + case 0x000b: + case 0x000c: + case 0x0085: + case 0x2028: + case 0x2029: + break; + } + ecode++; + break; + +#ifdef SUPPORT_UCP + /* Check the next character by Unicode property. 
We will get here only + if the support is in the binary; otherwise a compile-time error occurs. */ + + case OP_PROP: + case OP_NOTPROP: + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + { + int chartype, script; + int category = _pcre_ucp_findprop(c, &chartype, &script); + + switch(ecode[1]) + { + case PT_ANY: + if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); + break; + + case PT_LAMP: + if ((chartype == ucp_Lu || + chartype == ucp_Ll || + chartype == ucp_Lt) == (op == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + break; + + case PT_GC: + if ((ecode[2] != category) == (op == OP_PROP)) + RRETURN(MATCH_NOMATCH); + break; + + case PT_PC: + if ((ecode[2] != chartype) == (op == OP_PROP)) + RRETURN(MATCH_NOMATCH); + break; + + case PT_SC: + if ((ecode[2] != script) == (op == OP_PROP)) + RRETURN(MATCH_NOMATCH); + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); + } + + ecode += 3; + } + break; + + /* Match an extended Unicode sequence. We will get here only if the support + is in the binary; otherwise a compile-time error occurs. */ + + case OP_EXTUNI: + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + { + int chartype, script; + int category = _pcre_ucp_findprop(c, &chartype, &script); + if (category == ucp_M) RRETURN(MATCH_NOMATCH); + while (eptr < md->end_subject) + { + int len = 1; + if (!utf8) c = *eptr; else + { + GETCHARLEN(c, eptr, len); + } + category = _pcre_ucp_findprop(c, &chartype, &script); + if (category != ucp_M) break; + eptr += len; + } + } + ecode++; + break; +#endif + + + /* Match a back reference, possibly repeatedly. Look past the end of the + item to see if there is repeat information following. The code is similar + to that for character classes, but repeated for efficiency. Then obey + similar code to character type repeats - written out again for speed. 
+ However, if the referenced string is the empty string, always treat + it as matched, any number of times (otherwise there could be infinite + loops). */ + + case OP_REF: + { + offset = GET2(ecode, 1) << 1; /* Doubled ref number */ + ecode += 3; /* Advance past item */ + + /* If the reference is unset, set the length to be longer than the amount + of subject left; this ensures that every attempt at a match fails. We + can't just fail here, because of the possibility of quantifiers with zero + minima. */ + + length = (offset >= offset_top || md->offset_vector[offset] < 0)? + md->end_subject - eptr + 1 : + md->offset_vector[offset+1] - md->offset_vector[offset]; + + /* Set up for repetition, or handle the non-repeated case */ + + switch (*ecode) + { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRQUERY: + case OP_CRMINQUERY: + c = *ecode++ - OP_CRSTAR; + minimize = (c & 1) != 0; + min = rep_min[c]; /* Pick up values from tables; */ + max = rep_max[c]; /* zero for max => infinity */ + if (max == 0) max = INT_MAX; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + minimize = (*ecode == OP_CRMINRANGE); + min = GET2(ecode, 1); + max = GET2(ecode, 3); + if (max == 0) max = INT_MAX; + ecode += 5; + break; + + default: /* No repeat follows */ + if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); + eptr += length; + continue; /* With the main loop */ + } + + /* If the length of the reference is zero, just continue with the + main loop. */ + + if (length == 0) continue; + + /* First, ensure the minimum number of matches are present. We get back + the length of the reference string explicitly rather than passing the + address of eptr, so that eptr can be a register variable. */ + + for (i = 1; i <= min; i++) + { + if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); + eptr += length; + } + + /* If min = max, continue at the same level without recursion. + They are not both allowed to be zero. 
*/ + + if (min == max) continue; + + /* If minimizing, keep trying and advancing the pointer */ + + if (minimize) + { + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || !match_ref(offset, eptr, length, md, ims)) + RRETURN(MATCH_NOMATCH); + eptr += length; + } + /* Control never gets here */ + } + + /* If maximizing, find the longest string and work backwards */ + + else + { + pp = eptr; + for (i = min; i < max; i++) + { + if (!match_ref(offset, eptr, length, md, ims)) break; + eptr += length; + } + while (eptr >= pp) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + eptr -= length; + } + RRETURN(MATCH_NOMATCH); + } + } + /* Control never gets here */ + + + + /* Match a bit-mapped character class, possibly repeatedly. This op code is + used when all the characters in the class have values in the range 0-255, + and either the matching is caseful, or the characters are in the range + 0-127 when UTF-8 processing is enabled. The only difference between + OP_CLASS and OP_NCLASS occurs when a data character outside the range is + encountered. + + First, look past the end of the item to see if there is repeat information + following. Then obey similar code to character type repeats - written out + again for speed. 
*/ + + case OP_NCLASS: + case OP_CLASS: + { + data = ecode + 1; /* Save for matching */ + ecode += 33; /* Advance past the item */ + + switch (*ecode) + { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRQUERY: + case OP_CRMINQUERY: + c = *ecode++ - OP_CRSTAR; + minimize = (c & 1) != 0; + min = rep_min[c]; /* Pick up values from tables; */ + max = rep_max[c]; /* zero for max => infinity */ + if (max == 0) max = INT_MAX; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + minimize = (*ecode == OP_CRMINRANGE); + min = GET2(ecode, 1); + max = GET2(ecode, 3); + if (max == 0) max = INT_MAX; + ecode += 5; + break; + + default: /* No repeat follows */ + min = max = 1; + break; + } + + /* First, ensure the minimum number of matches are present. */ + +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) + { + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + if (c > 255) + { + if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); + } + else + { + if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); + } + } + } + else +#endif + /* Not UTF-8 mode */ + { + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + c = *eptr++; + if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); + } + } + + /* If max == min we can continue with the main loop without the + need to recurse. */ + + if (min == max) continue; + + /* If minimizing, keep testing the rest of the expression and advancing + the pointer while it matches the class. 
*/ + + if (minimize) + { +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) + { + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + if (c > 255) + { + if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); + } + else + { + if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); + } + } + } + else +#endif + /* Not UTF-8 mode */ + { + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + c = *eptr++; + if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); + } + } + /* Control never gets here */ + } + + /* If maximizing, find the longest possible run, then work backwards. */ + + else + { + pp = eptr; + +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) + { + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(c, eptr, len); + if (c > 255) + { + if (op == OP_CLASS) break; + } + else + { + if ((data[c/8] & (1 << (c&7))) == 0) break; + } + eptr += len; + } + for (;;) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (eptr-- == pp) break; /* Stop if tried at original pos */ + BACKCHAR(eptr); + } + } + else +#endif + /* Not UTF-8 mode */ + { + for (i = min; i < max; i++) + { + if (eptr >= md->end_subject) break; + c = *eptr; + if ((data[c/8] & (1 << (c&7))) == 0) break; + eptr++; + } + while (eptr >= pp) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + eptr--; + } + } + + RRETURN(MATCH_NOMATCH); + } + } + /* Control never gets here */ + + + /* Match an extended character class. This opcode is encountered only + in UTF-8 mode, because that's the only time it is compiled. 
*/ + +#ifdef SUPPORT_UTF8 + case OP_XCLASS: + { + data = ecode + 1 + LINK_SIZE; /* Save for matching */ + ecode += GET(ecode, 1); /* Advance past the item */ + + switch (*ecode) + { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRQUERY: + case OP_CRMINQUERY: + c = *ecode++ - OP_CRSTAR; + minimize = (c & 1) != 0; + min = rep_min[c]; /* Pick up values from tables; */ + max = rep_max[c]; /* zero for max => infinity */ + if (max == 0) max = INT_MAX; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + minimize = (*ecode == OP_CRMINRANGE); + min = GET2(ecode, 1); + max = GET2(ecode, 3); + if (max == 0) max = INT_MAX; + ecode += 5; + break; + + default: /* No repeat follows */ + min = max = 1; + break; + } + + /* First, ensure the minimum number of matches are present. */ + + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); + } + + /* If max == min we can continue with the main loop without the + need to recurse. */ + + if (min == max) continue; + + /* If minimizing, keep testing the rest of the expression and advancing + the pointer while it matches the class. */ + + if (minimize) + { + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + } + + /* If maximizing, find the longest possible run, then work backwards. 
*/ + + else + { + pp = eptr; + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(c, eptr, len); + if (!_pcre_xclass(c, data)) break; + eptr += len; + } + for(;;) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (eptr-- == pp) break; /* Stop if tried at original pos */ + BACKCHAR(eptr) + } + RRETURN(MATCH_NOMATCH); + } + + /* Control never gets here */ + } +#endif /* End of XCLASS */ + + /* Match a single character, casefully */ + + case OP_CHAR: +#ifdef SUPPORT_UTF8 + if (utf8) + { + length = 1; + ecode++; + GETCHARLEN(fc, ecode, length); + if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); + while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH); + } + else +#endif + + /* Non-UTF-8 mode */ + { + if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH); + if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH); + ecode += 2; + } + break; + + /* Match a single character, caselessly */ + + case OP_CHARNC: +#ifdef SUPPORT_UTF8 + if (utf8) + { + length = 1; + ecode++; + GETCHARLEN(fc, ecode, length); + + if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); + + /* If the pattern character's value is < 128, we have only one byte, and + can use the fast lookup table. */ + + if (fc < 128) + { + if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + } + + /* Otherwise we must pick up the subject character */ + + else + { + unsigned int dc; + GETCHARINC(dc, eptr); + ecode += length; + + /* If we have Unicode property support, we can use it to test the other + case of the character, if there is one. 
*/ + + if (fc != dc) + { +#ifdef SUPPORT_UCP + if (dc != _pcre_ucp_othercase(fc)) +#endif + RRETURN(MATCH_NOMATCH); + } + } + } + else +#endif /* SUPPORT_UTF8 */ + + /* Non-UTF-8 mode */ + { + if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH); + if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + ecode += 2; + } + break; + + /* Match a single character repeatedly. */ + + case OP_EXACT: + min = max = GET2(ecode, 1); + ecode += 3; + goto REPEATCHAR; + + case OP_POSUPTO: + possessive = TRUE; + /* Fall through */ + + case OP_UPTO: + case OP_MINUPTO: + min = 0; + max = GET2(ecode, 1); + minimize = *ecode == OP_MINUPTO; + ecode += 3; + goto REPEATCHAR; + + case OP_POSSTAR: + possessive = TRUE; + min = 0; + max = INT_MAX; + ecode++; + goto REPEATCHAR; + + case OP_POSPLUS: + possessive = TRUE; + min = 1; + max = INT_MAX; + ecode++; + goto REPEATCHAR; + + case OP_POSQUERY: + possessive = TRUE; + min = 0; + max = 1; + ecode++; + goto REPEATCHAR; + + case OP_STAR: + case OP_MINSTAR: + case OP_PLUS: + case OP_MINPLUS: + case OP_QUERY: + case OP_MINQUERY: + c = *ecode++ - OP_STAR; + minimize = (c & 1) != 0; + min = rep_min[c]; /* Pick up values from tables; */ + max = rep_max[c]; /* zero for max => infinity */ + if (max == 0) max = INT_MAX; + + /* Common code for all repeated single-character matches. We can give + up quickly if there are fewer than the minimum number of characters left in + the subject. */ + + REPEATCHAR: +#ifdef SUPPORT_UTF8 + if (utf8) + { + length = 1; + charptr = ecode; + GETCHARLEN(fc, ecode, length); + if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); + ecode += length; + + /* Handle multibyte character matching specially here. There is + support for caseless matching if UCP support is present. 
*/ + + if (length > 1) + { + int oclength = 0; + uschar occhars[8]; + +#ifdef SUPPORT_UCP + unsigned int othercase; + if ((ims & PCRE_CASELESS) != 0 && + (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR) + oclength = _pcre_ord2utf8(othercase, occhars); +#endif /* SUPPORT_UCP */ + + for (i = 1; i <= min; i++) + { + if (memcmp(eptr, charptr, length) == 0) eptr += length; + /* Need braces because of following else */ + else if (oclength == 0) { RRETURN(MATCH_NOMATCH); } + else + { + if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH); + eptr += oclength; + } + } + + if (min == max) continue; + + if (minimize) + { + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (memcmp(eptr, charptr, length) == 0) eptr += length; + /* Need braces because of following else */ + else if (oclength == 0) { RRETURN(MATCH_NOMATCH); } + else + { + if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH); + eptr += oclength; + } + } + /* Control never gets here */ + } + + else /* Maximize */ + { + pp = eptr; + for (i = min; i < max; i++) + { + if (eptr > md->end_subject - length) break; + if (memcmp(eptr, charptr, length) == 0) eptr += length; + else if (oclength == 0) break; + else + { + if (memcmp(eptr, occhars, oclength) != 0) break; + eptr += oclength; + } + } + + if (possessive) continue; + while (eptr >= pp) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + eptr -= length; + } + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + } + + /* If the length of a UTF-8 character is 1, we fall through here, and + obey the code as for non-UTF-8 characters below, though in this case the + value of fc will always be < 128. */ + } + else +#endif /* SUPPORT_UTF8 */ + + /* When not in UTF-8 mode, load a single-byte character. 
*/ + { + if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); + fc = *ecode++; + } + + /* The value of fc at this point is always less than 256, though we may or + may not be in UTF-8 mode. The code is duplicated for the caseless and + caseful cases, for speed, since matching characters is likely to be quite + common. First, ensure the minimum number of matches are present. If min = + max, continue at the same level without recursing. Otherwise, if + minimizing, keep trying the rest of the expression and advancing one + matching character if failing, up to the maximum. Alternatively, if + maximizing, find the maximum number of characters and work backwards. */ + + DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max, + max, eptr)); + + if ((ims & PCRE_CASELESS) != 0) + { + fc = md->lcc[fc]; + for (i = 1; i <= min; i++) + if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + if (min == max) continue; + if (minimize) + { + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject || + fc != md->lcc[*eptr++]) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + } + else /* Maximize */ + { + pp = eptr; + for (i = min; i < max; i++) + { + if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break; + eptr++; + } + if (possessive) continue; + while (eptr >= pp) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + eptr--; + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + } + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + } + + /* Caseful comparisons (includes all multi-byte characters) */ + + else + { + for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH); + if (min == max) continue; + if (minimize) + { + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject || fc != *eptr++) + 
RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + } + else /* Maximize */ + { + pp = eptr; + for (i = min; i < max; i++) + { + if (eptr >= md->end_subject || fc != *eptr) break; + eptr++; + } + if (possessive) continue; + while (eptr >= pp) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + eptr--; + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + } + RRETURN(MATCH_NOMATCH); + } + } + /* Control never gets here */ + + /* Match a negated single one-byte character. The character we are + checking can be multibyte. */ + + case OP_NOT: + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + ecode++; + GETCHARINCTEST(c, eptr); + if ((ims & PCRE_CASELESS) != 0) + { +#ifdef SUPPORT_UTF8 + if (c < 256) +#endif + c = md->lcc[c]; + if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH); + } + else + { + if (*ecode++ == c) RRETURN(MATCH_NOMATCH); + } + break; + + /* Match a negated single one-byte character repeatedly. This is almost a + repeat of the code for a repeated single character, but I haven't found a + nice way of commoning these up that doesn't require a test of the + positive/negative option for each character match. Maybe that wouldn't add + very much to the time taken, but character matching *is* what this is all + about... 
*/ + + case OP_NOTEXACT: + min = max = GET2(ecode, 1); + ecode += 3; + goto REPEATNOTCHAR; + + case OP_NOTUPTO: + case OP_NOTMINUPTO: + min = 0; + max = GET2(ecode, 1); + minimize = *ecode == OP_NOTMINUPTO; + ecode += 3; + goto REPEATNOTCHAR; + + case OP_NOTPOSSTAR: + possessive = TRUE; + min = 0; + max = INT_MAX; + ecode++; + goto REPEATNOTCHAR; + + case OP_NOTPOSPLUS: + possessive = TRUE; + min = 1; + max = INT_MAX; + ecode++; + goto REPEATNOTCHAR; + + case OP_NOTPOSQUERY: + possessive = TRUE; + min = 0; + max = 1; + ecode++; + goto REPEATNOTCHAR; + + case OP_NOTPOSUPTO: + possessive = TRUE; + min = 0; + max = GET2(ecode, 1); + ecode += 3; + goto REPEATNOTCHAR; + + case OP_NOTSTAR: + case OP_NOTMINSTAR: + case OP_NOTPLUS: + case OP_NOTMINPLUS: + case OP_NOTQUERY: + case OP_NOTMINQUERY: + c = *ecode++ - OP_NOTSTAR; + minimize = (c & 1) != 0; + min = rep_min[c]; /* Pick up values from tables; */ + max = rep_max[c]; /* zero for max => infinity */ + if (max == 0) max = INT_MAX; + + /* Common code for all repeated single-byte matches. We can give up quickly + if there are fewer than the minimum number of bytes left in the + subject. */ + + REPEATNOTCHAR: + if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); + fc = *ecode++; + + /* The code is duplicated for the caseless and caseful cases, for speed, + since matching characters is likely to be quite common. First, ensure the + minimum number of matches are present. If min = max, continue at the same + level without recursing. Otherwise, if minimizing, keep trying the rest of + the expression and advancing one matching character if failing, up to the + maximum. Alternatively, if maximizing, find the maximum number of + characters and work backwards. 
*/ + + DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max, + max, eptr)); + + if ((ims & PCRE_CASELESS) != 0) + { + fc = md->lcc[fc]; + +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) + { + register unsigned int d; + for (i = 1; i <= min; i++) + { + GETCHARINC(d, eptr); + if (d < 256) d = md->lcc[d]; + if (fc == d) RRETURN(MATCH_NOMATCH); + } + } + else +#endif + + /* Not UTF-8 mode */ + { + for (i = 1; i <= min; i++) + if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + } + + if (min == max) continue; + + if (minimize) + { +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) + { + register unsigned int d; + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + GETCHARINC(d, eptr); + if (d < 256) d = md->lcc[d]; + if (fi >= max || eptr >= md->end_subject || fc == d) + RRETURN(MATCH_NOMATCH); + } + } + else +#endif + /* Not UTF-8 mode */ + { + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++]) + RRETURN(MATCH_NOMATCH); + } + } + /* Control never gets here */ + } + + /* Maximize case */ + + else + { + pp = eptr; + +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) + { + register unsigned int d; + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(d, eptr, len); + if (d < 256) d = md->lcc[d]; + if (fc == d) break; + eptr += len; + } + if (possessive) continue; + for(;;) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (eptr-- == pp) break; /* Stop if tried at original pos */ + BACKCHAR(eptr); + } + } + else +#endif + /* Not UTF-8 mode */ + { + for (i = min; i < max; i++) + { + if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break; + eptr++; + } + if (possessive) continue; + while (eptr >= pp) + { + RMATCH(rrc, 
eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + eptr--; + } + } + + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + } + + /* Caseful comparisons */ + + else + { +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) + { + register unsigned int d; + for (i = 1; i <= min; i++) + { + GETCHARINC(d, eptr); + if (fc == d) RRETURN(MATCH_NOMATCH); + } + } + else +#endif + /* Not UTF-8 mode */ + { + for (i = 1; i <= min; i++) + if (fc == *eptr++) RRETURN(MATCH_NOMATCH); + } + + if (min == max) continue; + + if (minimize) + { +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) + { + register unsigned int d; + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + GETCHARINC(d, eptr); + if (fi >= max || eptr >= md->end_subject || fc == d) + RRETURN(MATCH_NOMATCH); + } + } + else +#endif + /* Not UTF-8 mode */ + { + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject || fc == *eptr++) + RRETURN(MATCH_NOMATCH); + } + } + /* Control never gets here */ + } + + /* Maximize case */ + + else + { + pp = eptr; + +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) + { + register unsigned int d; + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(d, eptr, len); + if (fc == d) break; + eptr += len; + } + if (possessive) continue; + for(;;) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (eptr-- == pp) break; /* Stop if tried at original pos */ + BACKCHAR(eptr); + } + } + else +#endif + /* Not UTF-8 mode */ + { + for (i = min; i < max; i++) + { + if (eptr >= md->end_subject || fc == *eptr) break; + eptr++; + } + if (possessive) continue; + while (eptr >= pp) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != 
MATCH_NOMATCH) RRETURN(rrc); + eptr--; + } + } + + RRETURN(MATCH_NOMATCH); + } + } + /* Control never gets here */ + + /* Match a single character type repeatedly; several different opcodes + share code. This is very similar to the code for single characters, but we + repeat it in the interests of efficiency. */ + + case OP_TYPEEXACT: + min = max = GET2(ecode, 1); + minimize = TRUE; + ecode += 3; + goto REPEATTYPE; + + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + min = 0; + max = GET2(ecode, 1); + minimize = *ecode == OP_TYPEMINUPTO; + ecode += 3; + goto REPEATTYPE; + + case OP_TYPEPOSSTAR: + possessive = TRUE; + min = 0; + max = INT_MAX; + ecode++; + goto REPEATTYPE; + + case OP_TYPEPOSPLUS: + possessive = TRUE; + min = 1; + max = INT_MAX; + ecode++; + goto REPEATTYPE; + + case OP_TYPEPOSQUERY: + possessive = TRUE; + min = 0; + max = 1; + ecode++; + goto REPEATTYPE; + + case OP_TYPEPOSUPTO: + possessive = TRUE; + min = 0; + max = GET2(ecode, 1); + ecode += 3; + goto REPEATTYPE; + + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + c = *ecode++ - OP_TYPESTAR; + minimize = (c & 1) != 0; + min = rep_min[c]; /* Pick up values from tables; */ + max = rep_max[c]; /* zero for max => infinity */ + if (max == 0) max = INT_MAX; + + /* Common code for all repeated single character type matches. Note that + in UTF-8 mode, '.' matches a character of any length, but for the other + character types, the valid characters are all one-byte long. */ + + REPEATTYPE: + ctype = *ecode++; /* Code for the character type */ + +#ifdef SUPPORT_UCP + if (ctype == OP_PROP || ctype == OP_NOTPROP) + { + prop_fail_result = ctype == OP_NOTPROP; + prop_type = *ecode++; + prop_value = *ecode++; + } + else prop_type = -1; +#endif + + /* First, ensure the minimum number of matches are present. Use inline + code for maximizing the speed, and do the type test once at the start + (i.e. keep it out of the loop). 
Also we can test that there are at least + the minimum number of bytes before we start. This isn't as effective in + UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that + is tidier. Also separate the UCP code, which can be the same for both UTF-8 + and single-bytes. */ + + if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); + if (min > 0) + { +#ifdef SUPPORT_UCP + if (prop_type >= 0) + { + switch(prop_type) + { + case PT_ANY: + if (prop_fail_result) RRETURN(MATCH_NOMATCH); + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + } + break; + + case PT_LAMP: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_chartype == ucp_Lu || + prop_chartype == ucp_Ll || + prop_chartype == ucp_Lt) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + break; + + case PT_GC: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_category == prop_value) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + break; + + case PT_PC: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_chartype == prop_value) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + break; + + case PT_SC: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_script == prop_value) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); + } + } + + /* Match extended Unicode sequences. 
We will get here only if the + support is in the binary; otherwise a compile-time error occurs. */ + + else if (ctype == OP_EXTUNI) + { + for (i = 1; i <= min; i++) + { + GETCHARINCTEST(c, eptr); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); + while (eptr < md->end_subject) + { + int len = 1; + if (!utf8) c = *eptr; else + { + GETCHARLEN(c, eptr, len); + } + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if (prop_category != ucp_M) break; + eptr += len; + } + } + } + + else +#endif /* SUPPORT_UCP */ + +/* Handle all other cases when the coding is UTF-8 */ + +#ifdef SUPPORT_UTF8 + if (utf8) switch(ctype) + { + case OP_ANY: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject || + ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr))) + RRETURN(MATCH_NOMATCH); + eptr++; + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + } + break; + + case OP_ANYBYTE: + eptr += min; + break; + + case OP_ANYNL: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + switch(c) + { + default: RRETURN(MATCH_NOMATCH); + case 0x000d: + if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + break; + case 0x000a: + case 0x000b: + case 0x000c: + case 0x0085: + case 0x2028: + case 0x2029: + break; + } + } + break; + + case OP_NOT_DIGIT: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + if (c < 128 && (md->ctypes[c] & ctype_digit) != 0) + RRETURN(MATCH_NOMATCH); + } + break; + + case OP_DIGIT: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject || + *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0) + RRETURN(MATCH_NOMATCH); + /* No need to skip more bytes - we know it's a 1-byte character */ + } + break; + + case OP_NOT_WHITESPACE: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject || + (*eptr < 128 && (md->ctypes[*eptr++] & 
ctype_space) != 0)) + RRETURN(MATCH_NOMATCH); + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + } + break; + + case OP_WHITESPACE: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject || + *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0) + RRETURN(MATCH_NOMATCH); + /* No need to skip more bytes - we know it's a 1-byte character */ + } + break; + + case OP_NOT_WORDCHAR: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject || + (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0)) + RRETURN(MATCH_NOMATCH); + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + } + break; + + case OP_WORDCHAR: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject || + *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0) + RRETURN(MATCH_NOMATCH); + /* No need to skip more bytes - we know it's a 1-byte character */ + } + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); + } /* End switch(ctype) */ + + else +#endif /* SUPPORT_UTF8 */ + + /* Code for the non-UTF-8 case for minimum matching of operators other + than OP_PROP and OP_NOTPROP. We can assume that there are the minimum + number of bytes present, as this was tested above. */ + + switch(ctype) + { + case OP_ANY: + if ((ims & PCRE_DOTALL) == 0) + { + for (i = 1; i <= min; i++) + { + if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); + eptr++; + } + } + else eptr += min; + break; + + case OP_ANYBYTE: + eptr += min; + break; + + /* Because of the CRLF case, we can't assume the minimum number of + bytes are present in this case. 
*/ + + case OP_ANYNL: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + switch(*eptr++) + { + default: RRETURN(MATCH_NOMATCH); + case 0x000d: + if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + break; + case 0x000a: + case 0x000b: + case 0x000c: + case 0x0085: + break; + } + } + break; + + case OP_NOT_DIGIT: + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); + break; + + case OP_DIGIT: + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); + break; + + case OP_NOT_WHITESPACE: + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); + break; + + case OP_WHITESPACE: + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); + break; + + case OP_NOT_WORDCHAR: + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_word) != 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_WORDCHAR: + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_word) == 0) + RRETURN(MATCH_NOMATCH); + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); + } + } + + /* If min = max, continue at the same level without recursing */ + + if (min == max) continue; + + /* If minimizing, we have to test the rest of the pattern before each + subsequent match. Again, separate the UTF-8 case for speed, and also + separate the UCP cases. 
*/ + + if (minimize) + { +#ifdef SUPPORT_UCP + if (prop_type >= 0) + { + switch(prop_type) + { + case PT_ANY: + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + if (prop_fail_result) RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_LAMP: + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_chartype == ucp_Lu || + prop_chartype == ucp_Ll || + prop_chartype == ucp_Lt) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_GC: + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_category == prop_value) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_PC: + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_chartype == prop_value) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_SC: + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + prop_category 
= _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_script == prop_value) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + default: + RRETURN(PCRE_ERROR_INTERNAL); + } + } + + /* Match extended Unicode sequences. We will get here only if the + support is in the binary; otherwise a compile-time error occurs. */ + + else if (ctype == OP_EXTUNI) + { + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); + while (eptr < md->end_subject) + { + int len = 1; + if (!utf8) c = *eptr; else + { + GETCHARLEN(c, eptr, len); + } + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if (prop_category != ucp_M) break; + eptr += len; + } + } + } + + else +#endif /* SUPPORT_UCP */ + +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) + { + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject || + (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 && + IS_NEWLINE(eptr))) + RRETURN(MATCH_NOMATCH); + + GETCHARINC(c, eptr); + switch(ctype) + { + case OP_ANY: /* This is the DOTALL case */ + break; + + case OP_ANYBYTE: + break; + + case OP_ANYNL: + switch(c) + { + default: RRETURN(MATCH_NOMATCH); + case 0x000d: + if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + break; + case 0x000a: + case 0x000b: + case 0x000c: + case 0x0085: + case 0x2028: + case 0x2029: + break; + } + break; + + case OP_NOT_DIGIT: + if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_DIGIT: + if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0) + RRETURN(MATCH_NOMATCH); + break; + + case 
OP_NOT_WHITESPACE: + if (c < 256 && (md->ctypes[c] & ctype_space) != 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_WHITESPACE: + if (c >= 256 || (md->ctypes[c] & ctype_space) == 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_NOT_WORDCHAR: + if (c < 256 && (md->ctypes[c] & ctype_word) != 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_WORDCHAR: + if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) + RRETURN(MATCH_NOMATCH); + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); + } + } + } + else +#endif + /* Not UTF-8 mode */ + { + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject || + ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr))) + RRETURN(MATCH_NOMATCH); + + c = *eptr++; + switch(ctype) + { + case OP_ANY: /* This is the DOTALL case */ + break; + + case OP_ANYBYTE: + break; + + case OP_ANYNL: + switch(c) + { + default: RRETURN(MATCH_NOMATCH); + case 0x000d: + if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + break; + case 0x000a: + case 0x000b: + case 0x000c: + case 0x0085: + break; + } + break; + + case OP_NOT_DIGIT: + if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); + break; + + case OP_DIGIT: + if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); + break; + + case OP_NOT_WHITESPACE: + if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); + break; + + case OP_WHITESPACE: + if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); + break; + + case OP_NOT_WORDCHAR: + if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); + break; + + case OP_WORDCHAR: + if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); + } + } + } + /* Control never gets here */ + } + + /* If maximizing, it is worth using inline code for speed, doing the type + test once at the start (i.e. keep it out of the loop). 
Again, keep the + UTF-8 and UCP stuff separate. */ + + else + { + pp = eptr; /* Remember where we started */ + +#ifdef SUPPORT_UCP + if (prop_type >= 0) + { + switch(prop_type) + { + case PT_ANY: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(c, eptr, len); + if (prop_fail_result) break; + eptr+= len; + } + break; + + case PT_LAMP: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(c, eptr, len); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_chartype == ucp_Lu || + prop_chartype == ucp_Ll || + prop_chartype == ucp_Lt) == prop_fail_result) + break; + eptr+= len; + } + break; + + case PT_GC: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(c, eptr, len); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_category == prop_value) == prop_fail_result) + break; + eptr+= len; + } + break; + + case PT_PC: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(c, eptr, len); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_chartype == prop_value) == prop_fail_result) + break; + eptr+= len; + } + break; + + case PT_SC: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(c, eptr, len); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_script == prop_value) == prop_fail_result) + break; + eptr+= len; + } + break; + } + + /* eptr is now past the end of the maximum run */ + + if (possessive) continue; + for(;;) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (eptr-- == pp) break; /* Stop if tried at original pos */ + BACKCHAR(eptr); + } + } + + /* Match extended Unicode sequences. 
We will get here only if the + support is in the binary; otherwise a compile-time error occurs. */ + + else if (ctype == OP_EXTUNI) + { + for (i = min; i < max; i++) + { + if (eptr >= md->end_subject) break; + GETCHARINCTEST(c, eptr); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if (prop_category == ucp_M) break; + while (eptr < md->end_subject) + { + int len = 1; + if (!utf8) c = *eptr; else + { + GETCHARLEN(c, eptr, len); + } + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if (prop_category != ucp_M) break; + eptr += len; + } + } + + /* eptr is now past the end of the maximum run */ + + if (possessive) continue; + for(;;) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (eptr-- == pp) break; /* Stop if tried at original pos */ + for (;;) /* Move back over one extended */ + { + int len = 1; + BACKCHAR(eptr); + if (!utf8) c = *eptr; else + { + GETCHARLEN(c, eptr, len); + } + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if (prop_category != ucp_M) break; + eptr--; + } + } + } + + else +#endif /* SUPPORT_UCP */ + +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + + if (utf8) + { + switch(ctype) + { + case OP_ANY: + + /* Special code is required for UTF8, but when the maximum is + unlimited we don't need it, so we repeat the non-UTF8 code. This is + probably worth it, because .* is quite a common idiom. 
*/ + + if (max < INT_MAX) + { + if ((ims & PCRE_DOTALL) == 0) + { + for (i = min; i < max; i++) + { + if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; + eptr++; + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + } + } + else + { + for (i = min; i < max; i++) + { + if (eptr >= md->end_subject) break; + eptr++; + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + } + } + } + + /* Handle unlimited UTF-8 repeat */ + + else + { + if ((ims & PCRE_DOTALL) == 0) + { + for (i = min; i < max; i++) + { + if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; + eptr++; + } + break; + } + else + { + c = max - min; + if (c > (unsigned int)(md->end_subject - eptr)) + c = md->end_subject - eptr; + eptr += c; + } + } + break; + + /* The byte case is the same as non-UTF8 */ + + case OP_ANYBYTE: + c = max - min; + if (c > (unsigned int)(md->end_subject - eptr)) + c = md->end_subject - eptr; + eptr += c; + break; + + case OP_ANYNL: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(c, eptr, len); + if (c == 0x000d) + { + if (++eptr >= md->end_subject) break; + if (*eptr == 0x000a) eptr++; + } + else + { + if (c != 0x000a && c != 0x000b && c != 0x000c && + c != 0x0085 && c != 0x2028 && c != 0x2029) + break; + eptr += len; + } + } + break; + + case OP_NOT_DIGIT: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(c, eptr, len); + if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break; + eptr+= len; + } + break; + + case OP_DIGIT: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(c, eptr, len); + if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break; + eptr+= len; + } + break; + + case OP_NOT_WHITESPACE: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(c, eptr, len); + if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break; + eptr+= len; + } + 
break; + + case OP_WHITESPACE: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(c, eptr, len); + if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break; + eptr+= len; + } + break; + + case OP_NOT_WORDCHAR: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(c, eptr, len); + if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break; + eptr+= len; + } + break; + + case OP_WORDCHAR: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(c, eptr, len); + if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break; + eptr+= len; + } + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); + } + + /* eptr is now past the end of the maximum run */ + + if (possessive) continue; + for(;;) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (eptr-- == pp) break; /* Stop if tried at original pos */ + BACKCHAR(eptr); + } + } + else +#endif + + /* Not UTF-8 mode */ + { + switch(ctype) + { + case OP_ANY: + if ((ims & PCRE_DOTALL) == 0) + { + for (i = min; i < max; i++) + { + if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; + eptr++; + } + break; + } + /* For DOTALL case, fall through and treat as \C */ + + case OP_ANYBYTE: + c = max - min; + if (c > (unsigned int)(md->end_subject - eptr)) + c = md->end_subject - eptr; + eptr += c; + break; + + case OP_ANYNL: + for (i = min; i < max; i++) + { + if (eptr >= md->end_subject) break; + c = *eptr; + if (c == 0x000d) + { + if (++eptr >= md->end_subject) break; + if (*eptr == 0x000a) eptr++; + } + else + { + if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085) + break; + eptr++; + } + } + break; + + case OP_NOT_DIGIT: + for (i = min; i < max; i++) + { + if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0) + break; + eptr++; + } + break; + + case OP_DIGIT: + for (i = min; i < max; i++) + { + if (eptr >= 
md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0) + break; + eptr++; + } + break; + + case OP_NOT_WHITESPACE: + for (i = min; i < max; i++) + { + if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0) + break; + eptr++; + } + break; + + case OP_WHITESPACE: + for (i = min; i < max; i++) + { + if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0) + break; + eptr++; + } + break; + + case OP_NOT_WORDCHAR: + for (i = min; i < max; i++) + { + if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0) + break; + eptr++; + } + break; + + case OP_WORDCHAR: + for (i = min; i < max; i++) + { + if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0) + break; + eptr++; + } + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); + } + + /* eptr is now past the end of the maximum run */ + + if (possessive) continue; + while (eptr >= pp) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + eptr--; + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + } + } + + /* Get here if we can't make it match with any permitted repetitions */ + + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + /* There's been some horrible disaster. Arrival here can only mean there is + something seriously wrong in the code above or the OP_xxx definitions. */ + + default: + DPRINTF(("Unknown opcode %d\n", *ecode)); + RRETURN(PCRE_ERROR_UNKNOWN_OPCODE); + } + + /* Do not stick any code in here without much thought; it is assumed + that "continue" in the code above comes out to here to repeat the main + loop. */ + + } /* End of main loop */ +/* Control never reaches here */ +} + + +/*************************************************************************** +**************************************************************************** + RECURSION IN THE match() FUNCTION + +Undefine all the macros that were defined above to handle this. 
*/ + +#ifdef NO_RECURSE +#undef eptr +#undef ecode +#undef offset_top +#undef ims +#undef eptrb +#undef flags + +#undef callpat +#undef charptr +#undef data +#undef next +#undef pp +#undef prev +#undef saved_eptr + +#undef new_recursive + +#undef cur_is_word +#undef condition +#undef prev_is_word + +#undef original_ims + +#undef ctype +#undef length +#undef max +#undef min +#undef number +#undef offset +#undef op +#undef save_capture_last +#undef save_offset1 +#undef save_offset2 +#undef save_offset3 +#undef stacksave + +#undef newptrb + +#endif /* NO_RECURSE */ + +/* These two are defined as macros in both cases, so they are undefined outside the NO_RECURSE conditional */ + +#undef fc +#undef fi + +/*************************************************************************** +***************************************************************************/ + + + +/************************************************* +* Execute a Regular Expression * +*************************************************/ + +/* This function applies a compiled re to a subject string and picks out +portions of the string if it matches. Two elements in the vector are set for +each substring: the offsets to the start and end of the substring. 
+ +Arguments: + argument_re points to the compiled expression + extra_data points to extra data or is NULL + subject points to the subject string + length length of subject string (may contain binary zeros) + start_offset where to start in the subject string + options option bits + offsets points to a vector of ints to be filled in with offsets + offsetcount the number of elements in the vector + +Returns: > 0 => success; value is the number of elements filled in + = 0 => success, but offsets is not big enough + -1 => failed to match + < -1 => some kind of unexpected problem +*/ + +PCRE_DATA_SCOPE int +pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, + PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, + int offsetcount) +{ +int rc, resetcount, ocount; +int first_byte = -1; +int req_byte = -1; +int req_byte2 = -1; +int newline; +unsigned long int ims; +BOOL using_temporary_offsets = FALSE; +BOOL anchored; +BOOL startline; +BOOL firstline; +BOOL first_byte_caseless = FALSE; +BOOL req_byte_caseless = FALSE; +BOOL utf8; +match_data match_block; +match_data *md = &match_block; +const uschar *tables; +const uschar *start_bits = NULL; +USPTR start_match = (USPTR)subject + start_offset; +USPTR end_subject; +USPTR req_byte_ptr = start_match - 1; /* one byte before the first start point, so the first p > req_byte_ptr test in the main loop is true */ +eptrblock eptrchain[EPTR_WORK_SIZE]; + +pcre_study_data internal_study; +const pcre_study_data *study; + +real_pcre internal_re; +const real_pcre *external_re = (const real_pcre *)argument_re; +const real_pcre *re = external_re; + +/* Plausibility checks */ + +if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; +if (re == NULL || subject == NULL || + (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; +if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; + +/* Fish out the optional data from the extra_data structure, first setting +the default values. 
*/ + +study = NULL; +md->match_limit = MATCH_LIMIT; +md->match_limit_recursion = MATCH_LIMIT_RECURSION; +md->callout_data = NULL; + +/* The table pointer is always in native byte order. */ + +tables = external_re->tables; /* may be overridden just below by extra_data->tables, and falls back to _pcre_default_tables if still NULL */ + +if (extra_data != NULL) + { + register unsigned int flags = extra_data->flags; + if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) + study = (const pcre_study_data *)extra_data->study_data; + if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) + md->match_limit = extra_data->match_limit; + if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) + md->match_limit_recursion = extra_data->match_limit_recursion; + if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) + md->callout_data = extra_data->callout_data; + if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables; + } + +/* If the exec call supplied NULL for tables, use the inbuilt ones. This +is a feature that makes it possible to save compiled regex and re-use them +in other programs later. */ + +if (tables == NULL) tables = _pcre_default_tables; + +/* Check that the first field in the block is the magic number. If it is not, +test for a regex that was compiled on a host of opposite endianness. If this is +the case, flipped values are put in internal_re and internal_study if there was +study data too. */ + +if (re->magic_number != MAGIC_NUMBER) + { + re = _pcre_try_flipped(re, &internal_re, study, &internal_study); + if (re == NULL) return PCRE_ERROR_BADMAGIC; + if (study != NULL) study = &internal_study; + } + +/* Set up other data */ + +anchored = ((re->options | options) & PCRE_ANCHORED) != 0; +startline = (re->options & PCRE_STARTLINE) != 0; +firstline = (re->options & PCRE_FIRSTLINE) != 0; + +/* The code starts after the real_pcre block and the capture name table. 
*/ + +md->start_code = (const uschar *)external_re + re->name_table_offset + + re->name_count * re->name_entry_size; /* sizes are read through re, which may be the flipped copy, but the address is computed from the caller-supplied block */ + +md->start_subject = (USPTR)subject; +md->start_offset = start_offset; +md->end_subject = md->start_subject + length; +end_subject = md->end_subject; + +md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; +utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0; + +md->notbol = (options & PCRE_NOTBOL) != 0; +md->noteol = (options & PCRE_NOTEOL) != 0; +md->notempty = (options & PCRE_NOTEMPTY) != 0; +md->partial = (options & PCRE_PARTIAL) != 0; +md->hitend = FALSE; + +md->recursive = NULL; /* No recursion at top level */ +md->eptrchain = eptrchain; /* Make workspace generally available */ + +md->lcc = tables + lcc_offset; +md->ctypes = tables + ctypes_offset; + +/* Handle different types of newline. The two bits give four cases. If nothing +is set at run time, whatever was used at compile time applies. */ + +switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) & + PCRE_NEWLINE_BITS) + { + case 0: newline = NEWLINE; break; /* Compile-time default */ + case PCRE_NEWLINE_CR: newline = '\r'; break; + case PCRE_NEWLINE_LF: newline = '\n'; break; + case PCRE_NEWLINE_CR+ + PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; /* CR in the high byte, LF in the low byte; split into md->nl[] below */ + case PCRE_NEWLINE_ANY: newline = -1; break; /* negative value selects NLTYPE_ANY below */ + default: return PCRE_ERROR_BADNEWLINE; + } + +if (newline < 0) + { + md->nltype = NLTYPE_ANY; + } +else + { + md->nltype = NLTYPE_FIXED; + if (newline > 255) /* a value above 255 carries two packed newline bytes */ + { + md->nllen = 2; + md->nl[0] = (newline >> 8) & 255; + md->nl[1] = newline & 255; + } + else + { + md->nllen = 1; + md->nl[0] = newline; + } + } + +/* Partial matching is supported only for a restricted set of regexes at the +moment. 
*/ + +if (md->partial && (re->options & PCRE_NOPARTIAL) != 0) + return PCRE_ERROR_BADPARTIAL; + +/* Check a UTF-8 string if required. Unfortunately there's no way of passing +back the character offset. 
*/ + +#ifdef SUPPORT_UTF8 +if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) + { + if (_pcre_valid_utf8((uschar *)subject, length) >= 0) /* a non-negative result is treated as invalid UTF-8 */ + return PCRE_ERROR_BADUTF8; + if (start_offset > 0 && start_offset < length) + { + int tb = ((uschar *)subject)[start_offset]; + if (tb > 127) + { + tb &= 0xc0; + if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET; /* only 0x80 reaches the return: the byte at start_offset has the form 10xxxxxx */ + } + } + } +#endif /* SUPPORT_UTF8 */ + +/* The ims options can vary during the matching as a result of the presence +of (?ims) items in the pattern. They are kept in a local variable so that +restoring at the exit of a group is easy. */ + +ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL); + +/* If the expression has got more back references than the offsets supplied can +hold, we get a temporary chunk of working store to use during the matching. +Otherwise, we can use the vector supplied, rounding down its size to a multiple +of 3. */ + +ocount = offsetcount - (offsetcount % 3); + +if (re->top_backref > 0 && re->top_backref >= ocount/3) + { + ocount = re->top_backref * 3 + 3; + md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int)); + if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY; + using_temporary_offsets = TRUE; /* both exit paths at the end of the function free this vector */ + DPRINTF(("Got memory to hold back references\n")); + } +else md->offset_vector = offsets; + +md->offset_end = ocount; +md->offset_max = (2*ocount)/3; +md->offset_overflow = FALSE; +md->capture_last = -1; + +/* Compute the minimum number of offsets that we need to reset each time. Doing +this makes a huge difference to execution time when there aren't many brackets +in the pattern. 
*/ + +resetcount = 2 + re->top_bracket * 2; +if (resetcount > offsetcount) resetcount = ocount; /* cap at the size of the working vector */ + +/* Reset the working variable associated with each extraction. These should +never be used unless previously set, but they get saved and restored, and so we +initialize them to avoid reading uninitialized locations. 
*/ + +if (md->offset_vector != NULL) + { + register int *iptr = md->offset_vector + ocount; + register int *iend = iptr - resetcount/2 + 1; + while (--iptr >= iend) *iptr = -1; /* fills the top resetcount/2 - 1 slots of the vector, working downwards */ + } + +/* Set up the first character to match, if available. The first_byte value is +never set for an anchored regular expression, but the anchoring may be forced +at run time, so we have to test for anchoring. The first char may be unset for +an unanchored pattern, of course. If there's no first char and the pattern was +studied, there may be a bitmap of possible first characters. */ + +if (!anchored) + { + if ((re->options & PCRE_FIRSTSET) != 0) + { + first_byte = re->first_byte & 255; + if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE) + first_byte = md->lcc[first_byte]; /* the scan loop compares md->lcc[*start_match] against this value */ + } + else + if (!startline && study != NULL && + (study->options & PCRE_STUDY_MAPPED) != 0) + start_bits = study->start_bits; + } + +/* For anchored or unanchored matches, there may be a "last known required +character" set. */ + +if ((re->options & PCRE_REQCHSET) != 0) + { + req_byte = re->req_byte & 255; + req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; + req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */ + } + + +/* ==========================================================================*/ + +/* Loop for handling unanchored repeated matching attempts; for anchored regexes +the loop runs just once. */ + +for(;;) + { + USPTR save_end_subject = end_subject; + + /* Reset the maximum number of extractions we might see. */ + + if (md->offset_vector != NULL) + { + register int *iptr = md->offset_vector; + register int *iend = iptr + resetcount; + while (iptr < iend) *iptr++ = -1; + } + + /* Advance to a unique first char if possible. If firstline is TRUE, the + start of the match is constrained to the first line of a multiline string. + That is, the match must be before or at the first newline. 
Implement this by + temporarily adjusting end_subject so that we stop scanning at a newline. If + the match fails at the newline, later code breaks this loop. */ + + if (firstline) + { + USPTR t = start_match; + while (t < md->end_subject && !IS_NEWLINE(t)) t++; + end_subject = t; + } + + /* Now test for a unique first byte */ + + if (first_byte >= 0) + { + if (first_byte_caseless) + while (start_match < end_subject && + md->lcc[*start_match] != first_byte) + start_match++; + else + while (start_match < end_subject && *start_match != first_byte) + start_match++; + } + + /* Or to just after a linebreak for a multiline match if possible */ + + else if (startline) + { + if (start_match > md->start_subject + start_offset) /* false on the first pass, so the original start position is always tried as is */ + { + while (start_match <= end_subject && !WAS_NEWLINE(start_match)) + start_match++; + } + } + + /* Or to a non-unique first char after study */ + + else if (start_bits != NULL) + { + while (start_match < end_subject) + { + register unsigned int c = *start_match; + if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break; /* one bit per byte value: bit (c&7) of start_bits[c/8] */ + } + } + + /* Restore fudged end_subject */ + + end_subject = save_end_subject; + +#ifdef DEBUG /* Sigh. Some compilers never learn. */ + printf(">>>> Match against: "); + pchars(start_match, end_subject - start_match, TRUE, md); + printf("\n"); +#endif /* DEBUG */ + + /* If req_byte is set, we know that that character must appear in the subject + for the match to succeed. If the first character is set, req_byte must be + later in the subject; otherwise the test starts at the match point. This + optimization can save a huge amount of backtracking in patterns with nested + unlimited repeats that aren't going to match. Writing separate code for + cased/caseless versions makes it go faster, as does using an autoincrement + and backing off on a match. + + HOWEVER: when the subject string is very, very long, searching to its end can + take a long time, and give bad performance on quite ordinary patterns. 
This + showed up when somebody was matching something like /^\d+C/ on a 32-megabyte + string... so we don't do this when the string is sufficiently long. + + ALSO: this processing is disabled when partial matching is requested. + */ + + if (req_byte >= 0 && + end_subject - start_match < REQ_BYTE_MAX && + !md->partial) + { + register USPTR p = start_match + ((first_byte >= 0)? 1 : 0); + + /* We don't need to repeat the search if we haven't yet reached the + place we found it at last time. */ + + if (p > req_byte_ptr) + { + if (req_byte_caseless) + { + while (p < end_subject) + { + register int pp = *p++; + if (pp == req_byte || pp == req_byte2) { p--; break; } + } + } + else + { + while (p < end_subject) + { + if (*p++ == req_byte) { p--; break; } + } + } + + /* If we can't find the required character, break the matching loop, + forcing a match failure. */ + + if (p >= end_subject) + { + rc = MATCH_NOMATCH; + break; + } + + /* If we have found the required character, save the point where we + found it, so that we don't search again next time round the loop if + the start hasn't passed this character yet. */ + + req_byte_ptr = p; + } + } + + /* OK, we can now run the match. */ + + md->start_match = start_match; + md->match_call_count = 0; + md->eptrn = 0; /* Next free eptrchain slot */ + rc = match(start_match, md->start_code, 2, md, ims, NULL, 0, 0); + + /* Any return other than MATCH_NOMATCH breaks the loop. */ + + if (rc != MATCH_NOMATCH) break; + + /* If PCRE_FIRSTLINE is set, the match must happen before or at the first + newline in the subject (though it may continue over the newline). Therefore, + if we have just failed to match, starting at a newline, do not continue. */ + + if (firstline && IS_NEWLINE(start_match)) break; + + /* Advance the match position by one character. 
*/ + + start_match++; +#ifdef SUPPORT_UTF8 + if (utf8) + while(start_match < end_subject && (*start_match & 0xc0) == 0x80) /* step over bytes of the form 10xxxxxx */ + start_match++; +#endif /* SUPPORT_UTF8 */ + + /* Break the loop if the pattern is anchored or if we have passed the end of + the subject. */ + + if (anchored || start_match > end_subject) break; /* start_match == end_subject still gets one more attempt */ + + /* If we have just passed a CR and the newline option is CRLF or ANY, and we + are now at a LF, advance the match position by one more character. */ + + if (start_match[-1] == '\r' && + (md->nltype == NLTYPE_ANY || md->nllen == 2) && + start_match < end_subject && + *start_match == '\n') + start_match++; + + } /* End of for(;;) "bumpalong" loop */ + +/* ==========================================================================*/ + +/* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping +conditions is true: + +(1) The pattern is anchored; + +(2) We are past the end of the subject; + +(3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because + this option requests that a match occur at or before the first newline in + the subject. + +When we have a match and the offset vector is big enough to deal with any +backreferences, captured substring offsets will already be set up. In the case +where we had to get some local store to hold offsets for backreference +processing, copy those that we can. In this case there need not be overflow if +certain parts of the pattern were not used, even though there are more +capturing parentheses than vector slots. 
*/ + +if (rc == MATCH_MATCH) + { + if (using_temporary_offsets) + { + if (offsetcount >= 4) + { + memcpy(offsets + 2, md->offset_vector + 2, + (offsetcount - 2) * sizeof(int)); /* slots 0 and 1 are filled in below from start_match and md->end_match_ptr */ + DPRINTF(("Copied offsets from temporary memory\n")); + } + if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE; + DPRINTF(("Freeing temporary memory\n")); + (pcre_free)(md->offset_vector); + } + + /* Set the return code to the number of captured strings, or 0 if there are + too many to fit into the vector. */ + + rc = md->offset_overflow? 0 : md->end_offset_top/2; + + /* If there is space, set up the whole thing as substring 0. */ + + if (offsetcount < 2) rc = 0; else + { + offsets[0] = start_match - md->start_subject; + offsets[1] = md->end_match_ptr - md->start_subject; + } + + DPRINTF((">>>> returning %d\n", rc)); + return rc; + } + +/* Control gets here if there has been an error, or if the overall match +attempt has failed at all permitted starting positions. */ + +if (using_temporary_offsets) + { + DPRINTF(("Freeing temporary memory\n")); + (pcre_free)(md->offset_vector); + } + +if (rc != MATCH_NOMATCH) + { + DPRINTF((">>>> error: returning %d\n", rc)); + return rc; + } +else if (md->partial && md->hitend) + { + DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); + return PCRE_ERROR_PARTIAL; + } +else + { + DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); + return PCRE_ERROR_NOMATCH; + } +} + +/* End of pcre_exec.c */ diff --git a/glib/pcre/pcre_fullinfo.c b/glib/pcre/pcre_fullinfo.c new file mode 100644 index 0000000..4a8edc6 --- /dev/null +++ b/glib/pcre/pcre_fullinfo.c @@ -0,0 +1,149 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/*PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel + Copyright (c) 1997-2006 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains the external function pcre_fullinfo(), which returns +information about a compiled pattern. 
*/ + + +#include "pcre_internal.h" + + +/************************************************* +* Return info about compiled pattern * +*************************************************/ + +/* This is a newer "info" function which has an extensible interface so +that additional items can be added compatibly. + +Arguments: + argument_re points to compiled code + extra_data points extra data, or NULL + what what information is required + where where to put the information + +Returns: 0 if data returned, negative on error +*/ + +PCRE_DATA_SCOPE int +pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what, + void *where) +{ +real_pcre internal_re; +pcre_study_data internal_study; +const real_pcre *re = (const real_pcre *)argument_re; +const pcre_study_data *study = NULL; + +if (re == NULL || where == NULL) return PCRE_ERROR_NULL; + +if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0) + study = (const pcre_study_data *)extra_data->study_data; + +if (re->magic_number != MAGIC_NUMBER) + { + re = _pcre_try_flipped(re, &internal_re, study, &internal_study); + if (re == NULL) return PCRE_ERROR_BADMAGIC; + if (study != NULL) study = &internal_study; + } + +switch (what) + { + case PCRE_INFO_OPTIONS: + *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS; + break; + + case PCRE_INFO_SIZE: + *((size_t *)where) = re->size; + break; + + case PCRE_INFO_STUDYSIZE: + *((size_t *)where) = (study == NULL)? 0 : study->size; + break; + + case PCRE_INFO_CAPTURECOUNT: + *((int *)where) = re->top_bracket; + break; + + case PCRE_INFO_BACKREFMAX: + *((int *)where) = re->top_backref; + break; + + case PCRE_INFO_FIRSTBYTE: + *((int *)where) = + ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte : + ((re->options & PCRE_STARTLINE) != 0)? -1 : -2; + break; + + /* Make sure we pass back the pointer to the bit vector in the external + block, not the internal copy (with flipped integer fields). 
*/ + + case PCRE_INFO_FIRSTTABLE: + *((const uschar **)where) = + (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)? + ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL; + break; + + case PCRE_INFO_LASTLITERAL: + *((int *)where) = + ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1; + break; + + case PCRE_INFO_NAMEENTRYSIZE: + *((int *)where) = re->name_entry_size; + break; + + case PCRE_INFO_NAMECOUNT: + *((int *)where) = re->name_count; + break; + + case PCRE_INFO_NAMETABLE: + *((const uschar **)where) = (const uschar *)re + re->name_table_offset; + break; + + case PCRE_INFO_DEFAULT_TABLES: + *((const uschar **)where) = (const uschar *)(_pcre_default_tables); + break; + + default: return PCRE_ERROR_BADOPTION; + } + +return 0; +} + +/* End of pcre_fullinfo.c */ diff --git a/glib/pcre/pcre_get.c b/glib/pcre/pcre_get.c new file mode 100644 index 0000000..856e955 --- /dev/null +++ b/glib/pcre/pcre_get.c @@ -0,0 +1,461 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2006 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains some convenience functions for extracting substrings +from the subject string after a regex match has succeeded. The original idea +for these functions came from Scott Wimer. */ + + +#include "pcre_internal.h" + + +/************************************************* +* Find number for named string * +*************************************************/ + +/* This function is used by the get_first_set() function below, as well +as being generally available. It assumes that names are unique. 
+
+Arguments:
+  code          the compiled regex
+  stringname    the name whose number is required
+
+Returns:        the number of the named parentheses, or a negative number:
+                  PCRE_ERROR_NOSUBSTRING if the pattern has no names or the
+                  name is not found, otherwise the code given back by
+                  pcre_fullinfo()
+
+The name table is obtained through pcre_fullinfo() and searched by bisection,
+so the lookup relies on the entries being in strcmp() order. Each entry holds
+the group number in its first two bytes, high byte first, with the name
+starting at offset 2.
+*/
+
+int
+pcre_get_stringnumber(const pcre *code, const char *stringname)
+{
+uschar *table;
+int status, count, width;
+int lo, hi;
+
+status = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &count);
+if (status != 0) return status;
+if (count <= 0) return PCRE_ERROR_NOSUBSTRING;
+
+status = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &width);
+if (status != 0) return status;
+status = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &table);
+if (status != 0) return status;
+
+/* Bisect the half-open index range [lo, hi). */
+
+for (lo = 0, hi = count; hi > lo; )
+  {
+  int mid = (hi + lo) / 2;
+  uschar *entry = table + width*mid;
+  int cmp = strcmp(stringname, (char *)(entry + 2));
+  if (cmp > 0) lo = mid + 1;
+  else if (cmp < 0) hi = mid;
+  else return (entry[0] << 8) + entry[1];
+  }
+
+return PCRE_ERROR_NOSUBSTRING;
+}
+
+
+
+/*************************************************
+*   Find (multiple) entries for named string     *
+*************************************************/
+
+/* This is used by the get_first_set() function below, as well as being
+generally available. It is used when duplicated names are permitted.
+ +Arguments: + code the compiled regex + stringname the name whose entries required + firstptr where to put the pointer to the first entry + lastptr where to put the pointer to the last entry + +Returns: the length of each entry, or a negative number + (PCRE_ERROR_NOSUBSTRING) if not found +*/ + +int +pcre_get_stringtable_entries(const pcre *code, const char *stringname, + char **firstptr, char **lastptr) +{ +int rc; +int entrysize; +int top, bot; +uschar *nametable, *lastentry; + +if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0) + return rc; +if (top <= 0) return PCRE_ERROR_NOSUBSTRING; + +if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0) + return rc; +if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0) + return rc; + +lastentry = nametable + entrysize * (top - 1); +bot = 0; +while (top > bot) + { + int mid = (top + bot) / 2; + uschar *entry = nametable + entrysize*mid; + int c = strcmp(stringname, (char *)(entry + 2)); + if (c == 0) + { + uschar *first = entry; + uschar *last = entry; + while (first > nametable) + { + if (strcmp(stringname, (char *)(first - entrysize + 2)) != 0) break; + first -= entrysize; + } + while (last < lastentry) + { + if (strcmp(stringname, (char *)(last + entrysize + 2)) != 0) break; + last += entrysize; + } + *firstptr = (char *)first; + *lastptr = (char *)last; + return entrysize; + } + if (c > 0) bot = mid + 1; else top = mid; + } + +return PCRE_ERROR_NOSUBSTRING; +} + + + +/************************************************* +* Find first set of multiple named strings * +*************************************************/ + +/* This function allows for duplicate names in the table of named substrings. +It returns the number of the first one that was set in a pattern match. 
+ +Arguments: + code the compiled regex + stringname the name of the capturing substring + ovector the vector of matched substrings + +Returns: the number of the first that is set, + or the number of the last one if none are set, + or a negative number on error +*/ + +static int +get_first_set(const pcre *code, const char *stringname, int *ovector) +{ +const real_pcre *re = (const real_pcre *)code; +int entrysize; +char *first, *last; +uschar *entry; +if ((re->options & (PCRE_DUPNAMES | PCRE_JCHANGED)) == 0) + return pcre_get_stringnumber(code, stringname); +entrysize = pcre_get_stringtable_entries(code, stringname, &first, &last); +if (entrysize <= 0) return entrysize; +for (entry = (uschar *)first; entry <= (uschar *)last; entry += entrysize) + { + int n = (entry[0] << 8) + entry[1]; + if (ovector[n*2] >= 0) return n; + } +return (first[0] << 8) + first[1]; +} + + + + +/************************************************* +* Copy captured string to given buffer * +*************************************************/ + +/* This function copies a single captured substring into a given buffer. +Note that we use memcpy() rather than strncpy() in case there are binary zeros +in the string. + +Arguments: + subject the subject string that was matched + ovector pointer to the offsets table + stringcount the number of substrings that were captured + (i.e. 
the yield of the pcre_exec call, unless + that was zero, in which case it should be 1/3 + of the offset table size) + stringnumber the number of the required substring + buffer where to put the substring + size the size of the buffer + +Returns: if successful: + the length of the copied string, not including the zero + that is put on the end; can be zero + if not successful: + PCRE_ERROR_NOMEMORY (-6) buffer too small + PCRE_ERROR_NOSUBSTRING (-7) no such captured substring +*/ + +int +pcre_copy_substring(const char *subject, int *ovector, int stringcount, + int stringnumber, char *buffer, int size) +{ +int yield; +if (stringnumber < 0 || stringnumber >= stringcount) + return PCRE_ERROR_NOSUBSTRING; +stringnumber *= 2; +yield = ovector[stringnumber+1] - ovector[stringnumber]; +if (size < yield + 1) return PCRE_ERROR_NOMEMORY; +memcpy(buffer, subject + ovector[stringnumber], yield); +buffer[yield] = 0; +return yield; +} + + + +/************************************************* +* Copy named captured string to given buffer * +*************************************************/ + +/* This function copies a single captured substring into a given buffer, +identifying it by name. If the regex permits duplicate names, the first +substring that is set is chosen. + +Arguments: + code the compiled regex + subject the subject string that was matched + ovector pointer to the offsets table + stringcount the number of substrings that were captured + (i.e. 
the yield of the pcre_exec call, unless + that was zero, in which case it should be 1/3 + of the offset table size) + stringname the name of the required substring + buffer where to put the substring + size the size of the buffer + +Returns: if successful: + the length of the copied string, not including the zero + that is put on the end; can be zero + if not successful: + PCRE_ERROR_NOMEMORY (-6) buffer too small + PCRE_ERROR_NOSUBSTRING (-7) no such captured substring +*/ + +int +pcre_copy_named_substring(const pcre *code, const char *subject, int *ovector, + int stringcount, const char *stringname, char *buffer, int size) +{ +int n = get_first_set(code, stringname, ovector); +if (n <= 0) return n; +return pcre_copy_substring(subject, ovector, stringcount, n, buffer, size); +} + + + +/************************************************* +* Copy all captured strings to new store * +*************************************************/ + +/* This function gets one chunk of store and builds a list of pointers and all +of the captured substrings in it. A NULL pointer is put on the end of the list. + +Arguments: + subject the subject string that was matched + ovector pointer to the offsets table + stringcount the number of substrings that were captured + (i.e. 
the yield of the pcre_exec call, unless + that was zero, in which case it should be 1/3 + of the offset table size) + listptr set to point to the list of pointers + +Returns: if successful: 0 + if not successful: + PCRE_ERROR_NOMEMORY (-6) failed to get store +*/ + +int +pcre_get_substring_list(const char *subject, int *ovector, int stringcount, + const char ***listptr) +{ +int i; +int size = sizeof(char *); +int double_count = stringcount * 2; +char **stringlist; +char *p; + +for (i = 0; i < double_count; i += 2) + size += sizeof(char *) + ovector[i+1] - ovector[i] + 1; + +stringlist = (char **)(pcre_malloc)(size); +if (stringlist == NULL) return PCRE_ERROR_NOMEMORY; + +*listptr = (const char **)stringlist; +p = (char *)(stringlist + stringcount + 1); + +for (i = 0; i < double_count; i += 2) + { + int len = ovector[i+1] - ovector[i]; + memcpy(p, subject + ovector[i], len); + *stringlist++ = p; + p += len; + *p++ = 0; + } + +*stringlist = NULL; +return 0; +} + + + +/************************************************* +* Free store obtained by get_substring_list * +*************************************************/ + +/* This function exists for the benefit of people calling PCRE from non-C +programs that can call its functions, but not free() or (pcre_free)() directly. + +Argument: the result of a previous pcre_get_substring_list() +Returns: nothing +*/ + +void +pcre_free_substring_list(const char **pointer) +{ +(pcre_free)((void *)pointer); +} + + + +/************************************************* +* Copy captured string to new store * +*************************************************/ + +/* This function copies a single captured substring into a piece of new +store + +Arguments: + subject the subject string that was matched + ovector pointer to the offsets table + stringcount the number of substrings that were captured + (i.e. 
the yield of the pcre_exec call, unless + that was zero, in which case it should be 1/3 + of the offset table size) + stringnumber the number of the required substring + stringptr where to put a pointer to the substring + +Returns: if successful: + the length of the string, not including the zero that + is put on the end; can be zero + if not successful: + PCRE_ERROR_NOMEMORY (-6) failed to get store + PCRE_ERROR_NOSUBSTRING (-7) substring not present +*/ + +int +pcre_get_substring(const char *subject, int *ovector, int stringcount, + int stringnumber, const char **stringptr) +{ +int yield; +char *substring; +if (stringnumber < 0 || stringnumber >= stringcount) + return PCRE_ERROR_NOSUBSTRING; +stringnumber *= 2; +yield = ovector[stringnumber+1] - ovector[stringnumber]; +substring = (char *)(pcre_malloc)(yield + 1); +if (substring == NULL) return PCRE_ERROR_NOMEMORY; +memcpy(substring, subject + ovector[stringnumber], yield); +substring[yield] = 0; +*stringptr = substring; +return yield; +} + + + +/************************************************* +* Copy named captured string to new store * +*************************************************/ + +/* This function copies a single captured substring, identified by name, into +new store. If the regex permits duplicate names, the first substring that is +set is chosen. + +Arguments: + code the compiled regex + subject the subject string that was matched + ovector pointer to the offsets table + stringcount the number of substrings that were captured + (i.e. 
the yield of the pcre_exec call, unless + that was zero, in which case it should be 1/3 + of the offset table size) + stringname the name of the required substring + stringptr where to put the pointer + +Returns: if successful: + the length of the copied string, not including the zero + that is put on the end; can be zero + if not successful: + PCRE_ERROR_NOMEMORY (-6) couldn't get memory + PCRE_ERROR_NOSUBSTRING (-7) no such captured substring +*/ + +int +pcre_get_named_substring(const pcre *code, const char *subject, int *ovector, + int stringcount, const char *stringname, const char **stringptr) +{ +int n = get_first_set(code, stringname, ovector); +if (n <= 0) return n; +return pcre_get_substring(subject, ovector, stringcount, n, stringptr); +} + + + + +/************************************************* +* Free store obtained by get_substring * +*************************************************/ + +/* This function exists for the benefit of people calling PCRE from non-C +programs that can call its functions, but not free() or (pcre_free)() directly. + +Argument: the result of a previous pcre_get_substring() +Returns: nothing +*/ + +void +pcre_free_substring(const char *pointer) +{ +(pcre_free)((void *)pointer); +} + +/* End of pcre_get.c */ diff --git a/glib/pcre/pcre_globals.c b/glib/pcre/pcre_globals.c new file mode 100644 index 0000000..8c0b3e6 --- /dev/null +++ b/glib/pcre/pcre_globals.c @@ -0,0 +1,59 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel + Copyright (c) 1997-2006 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains global variables that are exported by the PCRE library. +PCRE is thread-clean and doesn't use any global variables in the normal sense. 
+However, it calls memory allocation and freeing functions via the four
+indirections below, and it can optionally do callouts, using the fifth
+indirection. These values can be changed by the caller, but are shared between
+all threads. However, when compiling for Virtual Pascal, things are done
+differently, and global variables are not used (see pcre.in). */
+
+
+#include "pcre_internal.h"
+
+
+/* Callout hook; it is NULL until a caller stores a function pointer here.
+NOTE(review): the comment above speaks of four memory-management indirections
+"below", but this pointer is the only global defined in this file. Presumably
+the allocator hooks are supplied elsewhere in this copy of PCRE - confirm. */
+
+#ifdef __cplusplus
+extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL;
+#else
+int (*pcre_callout)(pcre_callout_block *) = NULL;
+#endif
+
+/* End of pcre_globals.c */
diff --git a/glib/pcre/pcre_info.c b/glib/pcre/pcre_info.c
new file mode 100644
index 0000000..b318b93
--- /dev/null
+++ b/glib/pcre/pcre_info.c
@@ -0,0 +1,89 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+           Copyright (c) 1997-2006 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains the external function pcre_info(), which gives some +information about a compiled pattern. However, use of this function is now +deprecated, as it has been superseded by pcre_fullinfo(). */ + + +#include "pcre_internal.h" + + +/************************************************* +* (Obsolete) Return info about compiled pattern * +*************************************************/ + +/* This is the original "info" function. It picks potentially useful data out +of the private structure, but its interface was too rigid. It remains for +backwards compatibility. The public options are passed back in an int - though +the re->options field has been expanded to a long int, all the public options +at the low end of it, and so even on 16-bit systems this will still be OK. +Therefore, I haven't changed the API for pcre_info(). 
+ +Arguments: + argument_re points to compiled code + optptr where to pass back the options + first_byte where to pass back the first character, + or -1 if multiline and all branches start ^, + or -2 otherwise + +Returns: number of capturing subpatterns + or negative values on error +*/ + +PCRE_DATA_SCOPE int +pcre_info(const pcre *argument_re, int *optptr, int *first_byte) +{ +real_pcre internal_re; +const real_pcre *re = (const real_pcre *)argument_re; +if (re == NULL) return PCRE_ERROR_NULL; +if (re->magic_number != MAGIC_NUMBER) + { + re = _pcre_try_flipped(re, &internal_re, NULL, NULL); + if (re == NULL) return PCRE_ERROR_BADMAGIC; + } +if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS); +if (first_byte != NULL) + *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte : + ((re->options & PCRE_STARTLINE) != 0)? -1 : -2; +return re->top_bracket; +} + +/* End of pcre_info.c */ diff --git a/glib/pcre/pcre_internal.h b/glib/pcre/pcre_internal.h new file mode 100644 index 0000000..2b50902 --- /dev/null +++ b/glib/pcre/pcre_internal.h @@ -0,0 +1,1041 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2006 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* This header contains definitions that are shared between the different +modules, but which are not relevant to the exported API. This includes some +functions whose names all begin with "_pcre_". */ + +#ifndef PCRE_INTERNAL_H +#define PCRE_INTERNAL_H + +/* Define DEBUG to get debugging output on stdout. */ + +#if 0 +#define DEBUG +#endif + +/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef +inline, and there are *still* stupid compilers about that don't like indented +pre-processor statements, or at least there were when I first wrote this. After +all, it had only been about 10 years then... 
+ +It turns out that the Mac Debugging.h header also defines the macro DPRINTF, so +be absolutely sure we get our version. */ + +#undef DPRINTF +#ifdef DEBUG +#define DPRINTF(p) printf p +#else +#define DPRINTF(p) /* Nothing */ +#endif + + +/* Get the definitions provided by running "configure" */ + +#include "config.h" + +/* Standard C headers plus the external interface definition. The only time +setjmp and stdarg are used is when NO_RECURSE is set. */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef PCRE_SPY +#define PCRE_DEFINITION /* Win32 __declspec(export) trigger for .dll */ +#endif + +/* We need to have types that specify unsigned 16-bit and 32-bit integers. We +cannot determine these outside the compilation (e.g. by running a program as +part of "configure") because PCRE is often cross-compiled for use on other +systems. Instead we make use of the maximum sizes that are available at +preprocessor time in standard C environments. */ + +#if USHRT_MAX == 65535 + typedef unsigned short pcre_uint16; +#elif UINT_MAX == 65535 + typedef unsigned int pcre_uint16; +#else + #error Cannot determine a type for 16-bit unsigned integers +#endif + +#if UINT_MAX == 4294967295 + typedef unsigned int pcre_uint32; +#elif ULONG_MAX == 4294967295 + typedef unsigned long int pcre_uint32; +#else + #error Cannot determine a type for 32-bit unsigned integers +#endif + +/* All character handling must be done as unsigned characters. Otherwise there +are problems with top-bit-set characters and functions such as isspace(). +However, we leave the interface to the outside world as char *, because that +should make things easier for callers. We define a short type for unsigned char +to save lots of typing. I tried "uchar", but it causes problems on Digital +Unix, where it is defined in sys/types, so use "uschar" instead. */ + +typedef unsigned char uschar; + +/* This is an unsigned int value that no character can ever have. 
UTF-8 +characters only go up to 0x7fffffff (though Unicode doesn't go beyond +0x0010ffff). */ + +#define NOTACHAR 0xffffffff + +/* PCRE is able to support several different kinds of newline (CR, LF, CRLF, +and "all" at present). The following macros are used to package up testing for +newlines. NLBLOCK, PSSTART, and PSEND are defined in the various modules to +indicate in which datablock the parameters exist, and what the start/end of +string field names are. */ + +#define NLTYPE_FIXED 0 /* Newline is a fixed length string */ +#define NLTYPE_ANY 1 /* Newline is any Unicode line ending */ + +/* This macro checks for a newline at the given position */ + +#define IS_NEWLINE(p) \ + ((NLBLOCK->nltype != NLTYPE_FIXED)? \ + ((p) < NLBLOCK->PSEND && \ + _pcre_is_newline((p), NLBLOCK->PSEND, &(NLBLOCK->nllen), utf8) \ + ) \ + : \ + ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \ + (p)[0] == NLBLOCK->nl[0] && \ + (NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1]) \ + ) \ + ) + +/* This macro checks for a newline immediately preceding the given position */ + +#define WAS_NEWLINE(p) \ + ((NLBLOCK->nltype != NLTYPE_FIXED)? \ + ((p) > NLBLOCK->PSSTART && \ + _pcre_was_newline((p), NLBLOCK->PSSTART, &(NLBLOCK->nllen), utf8) \ + ) \ + : \ + ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \ + (p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \ + (NLBLOCK->nllen == 1 || (p)[-NLBLOCK->nllen+1] == NLBLOCK->nl[1]) \ + ) \ + ) + +/* When PCRE is compiled as a C++ library, the subject pointer can be replaced +with a custom type. This makes it possible, for example, to allow pcre_exec() +to process subject strings that are discontinuous by using a smart pointer +class. It must always be possible to inspect all of the subject string in +pcre_exec() because of the way it backtracks. Two macros are required in the +normal case, for sign-unspecified and unsigned char pointers. The former is +used for the external interface and appears in pcre.h, which is why its name +must begin with PCRE_. 
*/ + +#ifdef CUSTOM_SUBJECT_PTR +#define PCRE_SPTR CUSTOM_SUBJECT_PTR +#define USPTR CUSTOM_SUBJECT_PTR +#else +#define PCRE_SPTR const char * +#define USPTR const unsigned char * +#endif + +/* Include the public PCRE header and the definitions of UCP character property +values. */ + +#include "pcre.h" +#include "ucp.h" + +/* When compiling for use with the Virtual Pascal compiler, these functions +need to have their names changed. PCRE must be compiled with the -DVPCOMPAT +option on the command line. */ + +#ifdef VPCOMPAT +#define strncmp(s1,s2,m) _strncmp(s1,s2,m) +#define memcpy(d,s,n) _memcpy(d,s,n) +#define memmove(d,s,n) _memmove(d,s,n) +#define memset(s,c,n) _memset(s,c,n) +#else /* VPCOMPAT */ + +/* To cope with SunOS4 and other systems that lack memmove() but have bcopy(), +define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY +is set. Otherwise, include an emulating function for those systems that have +neither (there are some non-Unix environments where this is the case). The +emulation picks its copy direction from the relative position of the two +pointers: it copies from the top down when the destination is above the source +and from the bottom up otherwise, so an overlapping move is safe in either +direction and not only for strings that are moved upwards in store. It returns +the original destination pointer, as memmove() does. */ + +#if ! HAVE_MEMMOVE +#undef memmove /* some systems may have a macro */ +#if HAVE_BCOPY +#define memmove(a, b, c) bcopy(b, a, c) +#else /* HAVE_BCOPY */ +static void * +pcre_memmove(unsigned char *dest, const unsigned char *src, size_t n) +{ +size_t i; +if (dest > src) + { + /* Destination above source: copy backwards so that an overlapping tail of + the source is read before it is overwritten. */ + dest += n; + src += n; + for (i = 0; i < n; ++i) *(--dest) = *(--src); + return dest; + } +/* Destination at or below source: a forward copy is the safe direction. */ +for (i = 0; i < n; ++i) dest[i] = src[i]; +return dest; +} +#define memmove(a, b, c) pcre_memmove(a, b, c) +#endif /* not HAVE_BCOPY */ +#endif /* not HAVE_MEMMOVE */ +#endif /* not VPCOMPAT */ + + +/* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored +in big-endian order) by default. These are used, for example, to link from the +start of a subpattern to its alternatives and its end. The use of 2 bytes per +offset limits the size of the compiled regex to around 64K, which is big enough +for almost everybody.
However, I received a request for an even bigger limit. +For this reason, and also to make the code easier to maintain, the storing and +loading of offsets from the byte string is now handled by the macros that are +defined here. + +The macros are controlled by the value of LINK_SIZE. This defaults to 2 in +the config.h file, but can be overridden by using -D on the command line. This +is automated on Unix systems via the "configure" command. */ + +#if LINK_SIZE == 2 + +#define PUT(a,n,d) \ + (a[n] = (d) >> 8), \ + (a[(n)+1] = (d) & 255) + +#define GET(a,n) \ + (((a)[n] << 8) | (a)[(n)+1]) + +#define MAX_PATTERN_SIZE (1 << 16) + + +#elif LINK_SIZE == 3 + +#define PUT(a,n,d) \ + (a[n] = (d) >> 16), \ + (a[(n)+1] = (d) >> 8), \ + (a[(n)+2] = (d) & 255) + +#define GET(a,n) \ + (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2]) + +#define MAX_PATTERN_SIZE (1 << 24) + + +#elif LINK_SIZE == 4 + +#define PUT(a,n,d) \ + (a[n] = (d) >> 24), \ + (a[(n)+1] = (d) >> 16), \ + (a[(n)+2] = (d) >> 8), \ + (a[(n)+3] = (d) & 255) + +#define GET(a,n) \ + (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3]) + +#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ + + +#else +#error LINK_SIZE must be either 2, 3, or 4 +#endif + + +/* Convenience macro defined in terms of the others */ + +#define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE + + +/* PCRE uses some other 2-byte quantities that do not change when the size of +offsets changes. There are used for repeat counts and for other things such as +capturing parenthesis numbers in back references. */ + +#define PUT2(a,n,d) \ + a[n] = (d) >> 8; \ + a[(n)+1] = (d) & 255 + +#define GET2(a,n) \ + (((a)[n] << 8) | (a)[(n)+1]) + +#define PUT2INC(a,n,d) PUT2(a,n,d), a += 2 + + +/* When UTF-8 encoding is being used, a character is no longer just a single +byte. The macros for character handling generate simple sequences when used in +byte-mode, and more complicated ones for UTF-8 characters. 
*/ + +#ifndef SUPPORT_UTF8 +#define GETCHAR(c, eptr) c = *eptr; +#define GETCHARTEST(c, eptr) c = *eptr; +#define GETCHARINC(c, eptr) c = *eptr++; +#define GETCHARINCTEST(c, eptr) c = *eptr++; +#define GETCHARLEN(c, eptr, len) c = *eptr; +#define BACKCHAR(eptr) + +#else /* SUPPORT_UTF8 */ + +/* Get the next UTF-8 character, not advancing the pointer. This is called when +we know we are in UTF-8 mode. */ + +#define GETCHAR(c, eptr) \ + c = *eptr; \ + if (c >= 0xc0) \ + { \ + int gcii; \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ + int gcss = 6*gcaa; \ + c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ + for (gcii = 1; gcii <= gcaa; gcii++) \ + { \ + gcss -= 6; \ + c |= (eptr[gcii] & 0x3f) << gcss; \ + } \ + } + +/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the +pointer. */ + +#define GETCHARTEST(c, eptr) \ + c = *eptr; \ + if (utf8 && c >= 0xc0) \ + { \ + int gcii; \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ + int gcss = 6*gcaa; \ + c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ + for (gcii = 1; gcii <= gcaa; gcii++) \ + { \ + gcss -= 6; \ + c |= (eptr[gcii] & 0x3f) << gcss; \ + } \ + } + +/* Get the next UTF-8 character, advancing the pointer. This is called when we +know we are in UTF-8 mode. 
*/ + +#define GETCHARINC(c, eptr) \ + c = *eptr++; \ + if (c >= 0xc0) \ + { \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ + int gcss = 6*gcaa; \ + c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ + while (gcaa-- > 0) \ + { \ + gcss -= 6; \ + c |= (*eptr++ & 0x3f) << gcss; \ + } \ + } + +/* Get the next character, testing for UTF-8 mode, and advancing the pointer */ + +#define GETCHARINCTEST(c, eptr) \ + c = *eptr++; \ + if (utf8 && c >= 0xc0) \ + { \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ + int gcss = 6*gcaa; \ + c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ + while (gcaa-- > 0) \ + { \ + gcss -= 6; \ + c |= (*eptr++ & 0x3f) << gcss; \ + } \ + } + +/* Get the next UTF-8 character, not advancing the pointer, incrementing length +if there are extra bytes. This is called when we know we are in UTF-8 mode. */ + +#define GETCHARLEN(c, eptr, len) \ + c = *eptr; \ + if (c >= 0xc0) \ + { \ + int gcii; \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ + int gcss = 6*gcaa; \ + c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ + for (gcii = 1; gcii <= gcaa; gcii++) \ + { \ + gcss -= 6; \ + c |= (eptr[gcii] & 0x3f) << gcss; \ + } \ + len += gcaa; \ + } + +/* If the pointer is not at the start of a character, move it back until +it is. Called only in UTF-8 mode. */ + +#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--; + +#endif + + +/* In case there is no definition of offsetof() provided - though any proper +Standard C system should have one. */ + +#ifndef offsetof +#define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field)) +#endif + + +/* These are the public options that can change during matching. */ + +#define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL) + +/* Private options flags start at the most significant end of the four bytes. +The public options defined in pcre.h start at the least significant end. Make +sure they don't overlap! 
The bits are getting a bit scarce now -- when we run +out, there is a dummy word in the structure that could be used for the private +bits. */ + +#define PCRE_NOPARTIAL 0x80000000 /* can't use partial with this regex */ +#define PCRE_FIRSTSET 0x40000000 /* first_byte is set */ +#define PCRE_REQCHSET 0x20000000 /* req_byte is set */ +#define PCRE_STARTLINE 0x10000000 /* start after \n for multiline */ +#define PCRE_JCHANGED 0x08000000 /* j option changes within regex */ + +/* Options for the "extra" block produced by pcre_study(). */ + +#define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */ + +/* Masks for identifying the public options that are permitted at compile +time, run time, or study time, respectively. */ + +#define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY) + +#define PUBLIC_OPTIONS \ + (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ + PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \ + PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \ + PCRE_DUPNAMES|PCRE_NEWLINE_BITS) + +#define PUBLIC_EXEC_OPTIONS \ + (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ + PCRE_PARTIAL|PCRE_NEWLINE_BITS) + +#define PUBLIC_DFA_EXEC_OPTIONS \ + (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ + PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS) + +#define PUBLIC_STUDY_OPTIONS 0 /* None defined */ + +/* Magic number to provide a small check against being handed junk. Also used +to detect whether a pattern was compiled on a host of different endianness. */ + +#define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */ + +/* Negative values for the firstchar and reqchar variables */ + +#define REQ_UNSET (-2) +#define REQ_NONE (-1) + +/* The maximum remaining length of subject we are prepared to search for a +req_byte match. 
*/ + +#define REQ_BYTE_MAX 1000 + +/* Flags added to firstbyte or reqbyte; a "non-literal" item is either a +variable-length repeat, or anything other than literal characters. */ + +#define REQ_CASELESS 0x0100 /* indicates caselessness */ +#define REQ_VARY 0x0200 /* reqbyte followed non-literal item */ + +/* Miscellaneous definitions. BOOL is an alias for gboolean in this copy. */ + +typedef gboolean BOOL; + +/* Escape items that are just an encoding of a particular data value. */ + +#ifndef ESC_e +#define ESC_e 27 +#endif + +#ifndef ESC_f +#define ESC_f '\f' +#endif + +#ifndef ESC_n +#define ESC_n '\n' +#endif + +#ifndef ESC_r +#define ESC_r '\r' +#endif + +/* We can't officially use ESC_t because it is a POSIX reserved identifier +(presumably because of all the others like size_t). */ + +#ifndef ESC_tee +#define ESC_tee '\t' +#endif + +/* Codes for different types of Unicode property */ + +#define PT_ANY 0 /* Any property - matches all chars */ +#define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */ +#define PT_GC 2 /* General characteristic (e.g. L) */ +#define PT_PC 3 /* Particular characteristic (e.g. Lu) */ +#define PT_SC 4 /* Script (e.g. Han) */ + +/* Flag bits and data types for the extended class (OP_XCLASS) for classes that +contain UTF-8 characters with values greater than 255. */ + +#define XCL_NOT 0x01 /* Flag: this is a negative class */ +#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */ + +#define XCL_END 0 /* Marks end of individual items */ +#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */ +#define XCL_RANGE 2 /* A range (two multibyte chars) follows */ +#define XCL_PROP 3 /* Unicode property (2-byte property code follows) */ +#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */ + +/* These are escaped items that aren't just an encoding of a particular data +value such as \n. They must have non-zero values, as check_escape() returns +their negation. Also, they must appear in the same order as in the opcode +definitions below, up to ESC_z.
There's a dummy for OP_ANY because it +corresponds to "." rather than an escape sequence. The final one must be +ESC_REF as subsequent values are used for backreferences (\1, \2, \3, etc). +There are two tests in the code for an escape greater than ESC_b and less than +ESC_Z to detect the types that may be repeated. These are the types that +consume characters. If any new escapes are put in between that don't consume a +character, that code will have to change. */ + +enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, + ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_R, ESC_X, ESC_Z, ESC_z, + ESC_E, ESC_Q, ESC_k, ESC_REF }; + + +/* Opcode table: OP_BRA must be last, as all values >= it are used for brackets +that extract substrings. Starting from 1 (i.e. after OP_END), the values up to +OP_EOD must correspond in order to the list of escapes immediately above. + +NOTE(review): in the list below OP_BRA is 88 and is followed by OP_CBRA through +OP_BRAMINZERO, and capturing brackets have their own opcode (OP_CBRA), so the +"OP_BRA must be last" sentence above looks out of date - confirm against +upstream before relying on it. + +To keep stored, compiled patterns compatible, new opcodes should be added +immediately before OP_BRA, where (since release 7.0) a gap is left for this +purpose. + +*** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions +that follow must also be updated to match. There is also a table called +"coptable" in pcre_dfa_exec.c that must be updated. 
*/ + +enum { + OP_END, /* 0 End of pattern */ + + /* Values corresponding to backslashed metacharacters */ + + OP_SOD, /* 1 Start of data: \A */ + OP_SOM, /* 2 Start of match (subject + offset): \G */ + OP_NOT_WORD_BOUNDARY, /* 3 \B */ + OP_WORD_BOUNDARY, /* 4 \b */ + OP_NOT_DIGIT, /* 5 \D */ + OP_DIGIT, /* 6 \d */ + OP_NOT_WHITESPACE, /* 7 \S */ + OP_WHITESPACE, /* 8 \s */ + OP_NOT_WORDCHAR, /* 9 \W */ + OP_WORDCHAR, /* 10 \w */ + OP_ANY, /* 11 Match any character */ + OP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */ + OP_NOTPROP, /* 13 \P (not Unicode property) */ + OP_PROP, /* 14 \p (Unicode property) */ + OP_ANYNL, /* 15 \R (any newline sequence) */ + OP_EXTUNI, /* 16 \X (extended Unicode sequence) */ + OP_EODN, /* 17 End of data or \n at end of data: \Z. */ + OP_EOD, /* 18 End of data: \z */ + + OP_OPT, /* 19 Set runtime options */ + OP_CIRC, /* 20 Start of line - varies with multiline switch */ + OP_DOLL, /* 21 End of line - varies with multiline switch */ + OP_CHAR, /* 22 Match one character, casefully */ + OP_CHARNC, /* 23 Match one character, caselessly */ + OP_NOT, /* 24 Match one character, not the following one */ + + OP_STAR, /* 25 The maximizing and minimizing versions of */ + OP_MINSTAR, /* 26 these six opcodes must come in pairs, with */ + OP_PLUS, /* 27 the minimizing one second. */ + OP_MINPLUS, /* 28 This first set applies to single characters.*/ + OP_QUERY, /* 29 */ + OP_MINQUERY, /* 30 */ + + OP_UPTO, /* 31 From 0 to n matches */ + OP_MINUPTO, /* 32 */ + OP_EXACT, /* 33 Exactly n matches */ + + OP_POSSTAR, /* 34 Possessified star */ + OP_POSPLUS, /* 35 Possessified plus */ + OP_POSQUERY, /* 36 Possessified query */ + OP_POSUPTO, /* 37 Possessified upto */ + + OP_NOTSTAR, /* 38 The maximizing and minimizing versions of */ + OP_NOTMINSTAR, /* 39 these six opcodes must come in pairs, with */ + OP_NOTPLUS, /* 40 the minimizing one second. They must be in */ + OP_NOTMINPLUS, /* 41 exactly the same order as those above. 
*/ + OP_NOTQUERY, /* 42 This set applies to "not" single characters. */ + OP_NOTMINQUERY, /* 43 */ + + OP_NOTUPTO, /* 44 From 0 to n matches */ + OP_NOTMINUPTO, /* 45 */ + OP_NOTEXACT, /* 46 Exactly n matches */ + + OP_NOTPOSSTAR, /* 47 Possessified versions */ + OP_NOTPOSPLUS, /* 48 */ + OP_NOTPOSQUERY, /* 49 */ + OP_NOTPOSUPTO, /* 50 */ + + OP_TYPESTAR, /* 51 The maximizing and minimizing versions of */ + OP_TYPEMINSTAR, /* 52 these six opcodes must come in pairs, with */ + OP_TYPEPLUS, /* 53 the minimizing one second. These codes must */ + OP_TYPEMINPLUS, /* 54 be in exactly the same order as those above. */ + OP_TYPEQUERY, /* 55 This set applies to character types such as \d */ + OP_TYPEMINQUERY, /* 56 */ + + OP_TYPEUPTO, /* 57 From 0 to n matches */ + OP_TYPEMINUPTO, /* 58 */ + OP_TYPEEXACT, /* 59 Exactly n matches */ + + OP_TYPEPOSSTAR, /* 60 Possessified versions */ + OP_TYPEPOSPLUS, /* 61 */ + OP_TYPEPOSQUERY, /* 62 */ + OP_TYPEPOSUPTO, /* 63 */ + + OP_CRSTAR, /* 64 The maximizing and minimizing versions of */ + OP_CRMINSTAR, /* 65 all these opcodes must come in pairs, with */ + OP_CRPLUS, /* 66 the minimizing one second. These codes must */ + OP_CRMINPLUS, /* 67 be in exactly the same order as those above. */ + OP_CRQUERY, /* 68 These are for character classes and back refs */ + OP_CRMINQUERY, /* 69 */ + OP_CRRANGE, /* 70 These are different to the three sets above. */ + OP_CRMINRANGE, /* 71 */ + + OP_CLASS, /* 72 Match a character class, chars < 256 only */ + OP_NCLASS, /* 73 Same, but the bitmap was created from a negative + class - the difference is relevant only when a UTF-8 + character > 255 is encountered. */ + + OP_XCLASS, /* 74 Extended class for handling UTF-8 chars within the + class. This does both positive and negative. 
*/ + + OP_REF, /* 75 Match a back reference */ + OP_RECURSE, /* 76 Match a numbered subpattern (possibly recursive) */ + OP_CALLOUT, /* 77 Call out to external function if provided */ + + OP_ALT, /* 78 Start of alternation */ + OP_KET, /* 79 End of group that doesn't have an unbounded repeat */ + OP_KETRMAX, /* 80 These two must remain together and in this */ + OP_KETRMIN, /* 81 order. They are for groups that repeat for ever. */ + + /* The assertions must come before BRA, CBRA, ONCE, and COND.*/ + + OP_ASSERT, /* 82 Positive lookahead */ + OP_ASSERT_NOT, /* 83 Negative lookahead */ + OP_ASSERTBACK, /* 84 Positive lookbehind */ + OP_ASSERTBACK_NOT, /* 85 Negative lookbehind */ + OP_REVERSE, /* 86 Move pointer back - used in lookbehind assertions */ + + /* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first, + as there's a test for >= ONCE for a subpattern that isn't an assertion. */ + + OP_ONCE, /* 87 Atomic group */ + OP_BRA, /* 88 Start of non-capturing bracket */ + OP_CBRA, /* 89 Start of capturing bracket */ + OP_COND, /* 90 Conditional group */ + + /* These three must follow the previous three, in the same order. There's a + check for >= SBRA to distinguish the two sets. */ + + OP_SBRA, /* 91 Start of non-capturing bracket, check empty */ + OP_SCBRA, /* 92 Start of capturing bracket, check empty */ + OP_SCOND, /* 93 Conditional group, check empty */ + + OP_CREF, /* 94 Used to hold a capture number as condition */ + OP_RREF, /* 95 Used to hold a recursion number as condition */ + OP_DEF, /* 96 The DEFINE condition */ + + OP_BRAZERO, /* 97 These two must remain together and in this */ + OP_BRAMINZERO /* 98 order. */ +}; + + +/* This macro defines textual names for all the opcodes. These are used only +for debugging. The macro is referenced only in pcre_printint.c. 
*/ + +#define OP_NAME_LIST \ + "End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", \ + "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \ + "notprop", "prop", "anynl", "extuni", \ + "\\Z", "\\z", \ + "Opt", "^", "$", "char", "charnc", "not", \ + "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ + "*+","++", "?+", "{", \ + "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ + "*+","++", "?+", "{", \ + "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ + "*+","++", "?+", "{", \ + "*", "*?", "+", "+?", "?", "??", "{", "{", \ + "class", "nclass", "xclass", "Ref", "Recurse", "Callout", \ + "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \ + "AssertB", "AssertB not", "Reverse", \ + "Once", "Bra 0", "Bra", "Cond", "SBra 0", "SBra", "SCond", \ + "Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero" + + +/* This macro defines the length of fixed length operations in the compiled +regex. The lengths are used when searching for specific things, and also in the +debugging printing of a compiled regex. We use a macro so that it can be +defined close to the definitions of the opcodes themselves. + +As things have been extended, some of these are no longer fixed lengths, but are +minima instead. For example, the length of a single-character repeat may vary +in UTF-8 mode. The code that uses this table must know about such things. */ + +#define OP_LENGTHS \ + 1, /* End */ \ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */ \ + 1, 1, /* Any, Anybyte */ \ + 3, 3, 1, 1, /* NOTPROP, PROP, ANYNL, EXTUNI */ \ + 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \ + 2, /* Char - the minimum length */ \ + 2, /* Charnc - the minimum length */ \ + 2, /* not */ \ + /* Positive single-char repeats ** These are */ \ + 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? 
** minima in */ \ + 4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \ + 2, 2, 2, 4, /* *+, ++, ?+, upto+ */ \ + /* Negative single-char repeats - only for chars < 256 */ \ + 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \ + 4, 4, 4, /* NOT upto, minupto, exact */ \ + 2, 2, 2, 4, /* Possessive *, +, ?, upto */ \ + /* Positive type repeats */ \ + 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \ + 4, 4, 4, /* Type upto, minupto, exact */ \ + 2, 2, 2, 4, /* Possessive *+, ++, ?+, upto+ */ \ + /* Character class & ref repeats */ \ + 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \ + 5, 5, /* CRRANGE, CRMINRANGE */ \ + 33, /* CLASS */ \ + 33, /* NCLASS */ \ + 0, /* XCLASS - variable length */ \ + 3, /* REF */ \ + 1+LINK_SIZE, /* RECURSE */ \ + 2+2*LINK_SIZE, /* CALLOUT */ \ + 1+LINK_SIZE, /* Alt */ \ + 1+LINK_SIZE, /* Ket */ \ + 1+LINK_SIZE, /* KetRmax */ \ + 1+LINK_SIZE, /* KetRmin */ \ + 1+LINK_SIZE, /* Assert */ \ + 1+LINK_SIZE, /* Assert not */ \ + 1+LINK_SIZE, /* Assert behind */ \ + 1+LINK_SIZE, /* Assert behind not */ \ + 1+LINK_SIZE, /* Reverse */ \ + 1+LINK_SIZE, /* ONCE */ \ + 1+LINK_SIZE, /* BRA */ \ + 3+LINK_SIZE, /* CBRA */ \ + 1+LINK_SIZE, /* COND */ \ + 1+LINK_SIZE, /* SBRA */ \ + 3+LINK_SIZE, /* SCBRA */ \ + 1+LINK_SIZE, /* SCOND */ \ + 3, /* CREF */ \ + 3, /* RREF */ \ + 1, /* DEF */ \ + 1, 1, /* BRAZERO, BRAMINZERO */ \ + + +/* A magic value for OP_RREF to indicate the "any recursion" condition. */ + +#define RREF_ANY 0xffff + +/* Error code numbers. They are given names so that they can more easily be +tracked. 
*/ + +enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, + ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, + ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, + ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, + ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, + ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57 }; + +/* The real format of the start of the pcre block; the index of names and the +code vector run on as long as necessary after the end. We store an explicit +offset to the name table so that if a regex is compiled on one host, saved, and +then run on another where the size of pointers is different, all might still +be well. For the case of compiled-on-4 and run-on-8, we include an extra +pointer that is always NULL. For future-proofing, a few dummy fields were +originally included - even though you can never get this planning right - but +there is only one left now. + +NOTE NOTE NOTE: +Because people can now save and re-use compiled patterns, any additions to this +structure should be made at the end, and something earlier (e.g. a new +flag in the options or one of the dummy fields) should indicate that the new +fields are present. Currently PCRE always sets the dummy fields to zero. 
+NOTE NOTE NOTE: +*/ + +typedef struct real_pcre { + pcre_uint32 magic_number; + pcre_uint32 size; /* Total that was malloced */ + pcre_uint32 options; + pcre_uint32 dummy1; /* For future use, maybe */ + + pcre_uint16 top_bracket; + pcre_uint16 top_backref; + pcre_uint16 first_byte; + pcre_uint16 req_byte; + pcre_uint16 name_table_offset; /* Offset to name table that follows */ + pcre_uint16 name_entry_size; /* Size of any name items */ + pcre_uint16 name_count; /* Number of name items */ + pcre_uint16 ref_count; /* Reference count */ + + const unsigned char *tables; /* Pointer to tables or NULL for std */ + const unsigned char *nullpad; /* NULL padding */ +} real_pcre; + +/* The format of the block used to store data from pcre_study(). The same +remark (see NOTE above) about extending this structure applies. */ + +typedef struct pcre_study_data { + pcre_uint32 size; /* Total that was malloced */ + pcre_uint32 options; + uschar start_bits[32]; +} pcre_study_data; + +/* Structure for passing "static" information around between the functions +doing the compiling, so that they are thread-safe. 
*/ + +typedef struct compile_data { + const uschar *lcc; /* Points to lower casing table */ + const uschar *fcc; /* Points to case-flipping table */ + const uschar *cbits; /* Points to character type table */ + const uschar *ctypes; /* Points to table of type maps */ + const uschar *start_workspace;/* The start of working space */ + const uschar *start_code; /* The start of the compiled code */ + const uschar *start_pattern; /* The start of the pattern */ + const uschar *end_pattern; /* The end of the pattern */ + uschar *hwm; /* High watermark of workspace */ + uschar *name_table; /* The name/number table */ + int names_found; /* Number of entries so far */ + int name_entry_size; /* Size of each entry */ + int bracount; /* Count of capturing parens */ + int top_backref; /* Maximum back reference */ + unsigned int backref_map; /* Bitmap of low back refs */ + int external_options; /* External (initial) options */ + int req_varyopt; /* "After variable item" flag for reqbyte */ + BOOL nopartial; /* Set TRUE if partial won't work */ + int nltype; /* Newline type */ + int nllen; /* Newline string length */ + uschar nl[4]; /* Newline string when fixed length */ +} compile_data; + +/* Structure for maintaining a chain of pointers to the currently incomplete +branches, for testing for left recursion. */ + +typedef struct branch_chain { + struct branch_chain *outer; + uschar *current; +} branch_chain; + +/* Structure for items in a linked list that represents an explicit recursive +call within the pattern. 
*/ + +typedef struct recursion_info { + struct recursion_info *prevrec; /* Previous recursion record (or NULL) */ + int group_num; /* Number of group that was called */ + const uschar *after_call; /* "Return value": points after the call in the expr */ + USPTR save_start; /* Old value of md->start_match */ + int *offset_save; /* Pointer to start of saved offsets */ + int saved_max; /* Number of saved offsets */ +} recursion_info; + +/* When compiling in a mode that doesn't use recursive calls to match(), +a structure is used to remember local variables on the heap. It is defined in +pcre_exec.c, close to the match() function, so that it is easy to keep it in +step with any changes of local variable. However, the pointer to the current +frame must be saved in some "static" place over a longjmp(). We declare the +structure here so that we can put a pointer in the match_data structure. NOTE: +This isn't used for a "normal" compilation of pcre. */ + +struct heapframe; + +/* Structure for building a chain of data for holding the values of the subject +pointer at the start of each subpattern, so as to detect when an empty string +has been matched by a subpattern - to break infinite loops. */ + +typedef struct eptrblock { + struct eptrblock *epb_prev; + USPTR epb_saved_eptr; +} eptrblock; + + +/* Structure for passing "static" information around between the functions +doing traditional NFA matching, so that they are thread-safe. 
*/ + +typedef struct match_data { + unsigned long int match_call_count; /* As it says */ + unsigned long int match_limit; /* As it says */ + unsigned long int match_limit_recursion; /* As it says */ + int *offset_vector; /* Offset vector */ + int offset_end; /* One past the end */ + int offset_max; /* The maximum usable for return data */ + int nltype; /* Newline type */ + int nllen; /* Newline string length */ + uschar nl[4]; /* Newline string when fixed */ + const uschar *lcc; /* Points to lower casing table */ + const uschar *ctypes; /* Points to table of type maps */ + BOOL offset_overflow; /* Set if too many extractions */ + BOOL notbol; /* NOTBOL flag */ + BOOL noteol; /* NOTEOL flag */ + BOOL utf8; /* UTF8 flag */ + BOOL endonly; /* Dollar not before final \n */ + BOOL notempty; /* Empty string match not wanted */ + BOOL partial; /* PARTIAL flag */ + BOOL hitend; /* Hit the end of the subject at some point */ + const uschar *start_code; /* For use when recursing */ + USPTR start_subject; /* Start of the subject string */ + USPTR end_subject; /* End of the subject string */ + USPTR start_match; /* Start of this match attempt */ + USPTR end_match_ptr; /* Subject position at end match */ + int end_offset_top; /* Highwater mark at end of match */ + int capture_last; /* Most recent capture number */ + int start_offset; /* The start offset value */ + eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */ + int eptrn; /* Next free eptrblock */ + recursion_info *recursive; /* Linked list of recursion data */ + void *callout_data; /* To pass back to callouts */ + struct heapframe *thisframe; /* Used only when compiling for no recursion */ +} match_data; + +/* A similar structure is used for the same purpose by the DFA matching +functions. 
*/ + +typedef struct dfa_match_data { + const uschar *start_code; /* Start of the compiled pattern */ + const uschar *start_subject; /* Start of the subject string */ + const uschar *end_subject; /* End of subject string */ + const uschar *tables; /* Character tables */ + int moptions; /* Match options */ + int poptions; /* Pattern options */ + int nltype; /* Newline type */ + int nllen; /* Newline string length */ + uschar nl[4]; /* Newline string when fixed */ + void *callout_data; /* To pass back to callouts */ +} dfa_match_data; + +/* Bit definitions for entries in the pcre_ctypes table. */ + +#define ctype_space 0x01 +#define ctype_letter 0x02 +#define ctype_digit 0x04 +#define ctype_xdigit 0x08 +#define ctype_word 0x10 /* alphameric or '_' */ +#define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */ + +/* Offsets for the bitmap tables in pcre_cbits. Each table contains a set +of bits for a class map. Some classes are built by combining these tables. */ + +#define cbit_space 0 /* [:space:] or \s */ +#define cbit_xdigit 32 /* [:xdigit:] */ +#define cbit_digit 64 /* [:digit:] or \d */ +#define cbit_upper 96 /* [:upper:] */ +#define cbit_lower 128 /* [:lower:] */ +#define cbit_word 160 /* [:word:] or \w */ +#define cbit_graph 192 /* [:graph:] */ +#define cbit_print 224 /* [:print:] */ +#define cbit_punct 256 /* [:punct:] */ +#define cbit_cntrl 288 /* [:cntrl:] */ +#define cbit_length 320 /* Length of the cbits table */ + +/* Offsets of the various tables from the base tables pointer, and +total length. */ + +#define lcc_offset 0 +#define fcc_offset 256 +#define cbits_offset 512 +#define ctypes_offset (cbits_offset + cbit_length) +#define tables_length (ctypes_offset + 256) + +/* Layout of the UCP type table that translates property names into types and +codes. */ + +typedef struct { + pcre_uint16 offset; + pcre_uint16 type; + pcre_uint16 value; +} ucp_type_table; + + +/* Internal shared data tables. 
These are tables that are used by more than one +of the exported public functions. They have to be "external" in the C sense, +but are not part of the PCRE public API. The data for these tables is in the +pcre_tables.c module. */ + +extern const int _pcre_utf8_table1[]; +extern const int _pcre_utf8_table2[]; +extern const int _pcre_utf8_table3[]; +extern const uschar _pcre_utf8_table4[]; + +extern const int _pcre_utf8_table1_size; + +extern const char _pcre_ucp_names[]; +extern const ucp_type_table _pcre_utt[]; +extern const int _pcre_utt_size; + +extern const uschar _pcre_default_tables[]; + +extern const uschar _pcre_OP_lengths[]; + + +/* Internal shared functions. These are functions that are used by more than +one of the exported public functions. They have to be "external" in the C +sense, but are not part of the PCRE public API. */ + +extern BOOL _pcre_is_newline(const uschar *, const uschar *, int *, + BOOL); +extern int _pcre_ord2utf8(int, uschar *); +extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *, + const pcre_study_data *, pcre_study_data *); +extern int _pcre_ucp_findprop(const unsigned int, int *, int *); +extern unsigned int _pcre_ucp_othercase(const unsigned int); +extern int _pcre_valid_utf8(const uschar *, int); +extern BOOL _pcre_was_newline(const uschar *, const uschar *, int *, + BOOL); +extern BOOL _pcre_xclass(int, const uschar *); + +#endif + +/* End of pcre_internal.h */ diff --git a/glib/pcre/pcre_maketables.c b/glib/pcre/pcre_maketables.c new file mode 100644 index 0000000..29e4098 --- /dev/null +++ b/glib/pcre/pcre_maketables.c @@ -0,0 +1,140 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel + Copyright (c) 1997-2006 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains the external function pcre_maketables(), which builds +character tables for PCRE in the current locale. The file is compiled on its +own as part of the PCRE library. 
However, it is also included in the +compilation of dftables.c, in which case the macro DFTABLES is defined. */ + + +#ifndef DFTABLES +#include "pcre_internal.h" +#endif + + +/************************************************* +* Create PCRE character tables * +*************************************************/ + +/* This function builds a set of character tables for use by PCRE and returns +a pointer to them. They are build using the ctype functions, and consequently +their contents will depend upon the current locale setting. When compiled as +part of the library, the store is obtained via pcre_malloc(), but when compiled +inside dftables, use malloc(). + +Arguments: none +Returns: pointer to the contiguous block of data +*/ + +const unsigned char * +pcre_maketables(void) +{ +unsigned char *yield, *p; +int i; + +#ifndef DFTABLES +yield = (unsigned char*)(pcre_malloc)(tables_length); +#else +yield = (unsigned char*)malloc(tables_length); +#endif + +if (yield == NULL) return NULL; +p = yield; + +/* First comes the lower casing table */ + +for (i = 0; i < 256; i++) *p++ = tolower(i); + +/* Next the case-flipping table */ + +for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i); + +/* Then the character class tables. Don't try to be clever and save effort on +exclusive ones - in some locales things may be different. Note that the table +for "space" includes everything "isspace" gives, including VT in the default +locale. This makes it work for the POSIX class [:space:]. Note also that it is +possible for a character to be alnum or alpha without being lower or upper, +such as "male and female ordinals" (\xAA and \xBA) in the fr_FR locale (at +least under Debian Linux's locales as of 12/2005). So we must test for alnum +specially. 
*/ + +memset(p, 0, cbit_length); +for (i = 0; i < 256; i++) + { + if (isdigit(i)) p[cbit_digit + i/8] |= 1 << (i&7); + if (isupper(i)) p[cbit_upper + i/8] |= 1 << (i&7); + if (islower(i)) p[cbit_lower + i/8] |= 1 << (i&7); + if (isalnum(i)) p[cbit_word + i/8] |= 1 << (i&7); + if (i == '_') p[cbit_word + i/8] |= 1 << (i&7); + if (isspace(i)) p[cbit_space + i/8] |= 1 << (i&7); + if (isxdigit(i))p[cbit_xdigit + i/8] |= 1 << (i&7); + if (isgraph(i)) p[cbit_graph + i/8] |= 1 << (i&7); + if (isprint(i)) p[cbit_print + i/8] |= 1 << (i&7); + if (ispunct(i)) p[cbit_punct + i/8] |= 1 << (i&7); + if (iscntrl(i)) p[cbit_cntrl + i/8] |= 1 << (i&7); + } +p += cbit_length; + +/* Finally, the character type table. In this, we exclude VT from the white +space chars, because Perl doesn't recognize it as such for \s and for comments +within regexes. */ + +for (i = 0; i < 256; i++) + { + int x = 0; + if (i != 0x0b && isspace(i)) x += ctype_space; + if (isalpha(i)) x += ctype_letter; + if (isdigit(i)) x += ctype_digit; + if (isxdigit(i)) x += ctype_xdigit; + if (isalnum(i) || i == '_') x += ctype_word; + + /* Note: strchr includes the terminating zero in the characters it considers. + In this instance, that is ok because we want binary zero to be flagged as a + meta-character, which in this sense is any character that terminates a run + of data characters. */ + + if (strchr("\\*+?{^.$|()[", i) != 0) x += ctype_meta; + *p++ = x; + } + +return yield; +} + +/* End of pcre_maketables.c */ diff --git a/glib/pcre/pcre_newline.c b/glib/pcre/pcre_newline.c new file mode 100644 index 0000000..348791b --- /dev/null +++ b/glib/pcre/pcre_newline.c @@ -0,0 +1,135 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel + Copyright (c) 1997-2006 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains internal functions for testing newlines when more than +one kind of newline is to be recognized. When a newline is found, its length is +returned. In principle, we could implement several newline "types", each +referring to a different set of newline characters. 
At present, PCRE supports +only NLTYPE_FIXED, which gets handled without these functions, and NLTYPE_ALL, +so for now the type isn't passed into the functions. It can easily be added +later if required. The full list of Unicode newline characters is taken from +http://unicode.org/unicode/reports/tr18/. */ + + +#include "pcre_internal.h" + + + +/************************************************* +* Check for newline at given position * +*************************************************/ + +/* It is guaranteed that the initial value of ptr is less than the end of the +string that is being processed. + +Arguments: + ptr pointer to possible newline + endptr pointer to the end of the string + lenptr where to return the length + utf8 TRUE if in utf8 mode + +Returns: TRUE or FALSE +*/ + +BOOL +_pcre_is_newline(const uschar *ptr, const uschar *endptr, int *lenptr, + BOOL utf8) +{ +int c; +if (utf8) { GETCHAR(c, ptr); } else c = *ptr; +switch(c) + { + case 0x000a: /* LF */ + case 0x000b: /* VT */ + case 0x000c: *lenptr = 1; return TRUE; /* FF */ + case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1; + return TRUE; /* CR */ + case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */ + case 0x2028: /* LS */ + case 0x2029: *lenptr = 3; return TRUE; /* PS */ + default: return FALSE; + } +} + + + +/************************************************* +* Check for newline at previous position * +*************************************************/ + +/* It is guaranteed that the initial value of ptr is greater than the start of +the string that is being processed. 
+ +Arguments: + ptr pointer to possible newline + startptr pointer to the start of the string + lenptr where to return the length + utf8 TRUE if in utf8 mode + +Returns: TRUE or FALSE +*/ + +BOOL +_pcre_was_newline(const uschar *ptr, const uschar *startptr, int *lenptr, + BOOL utf8) +{ +int c; +ptr--; +if (utf8) + { + BACKCHAR(ptr); + GETCHAR(c, ptr); + } +else c = *ptr; +switch(c) + { + case 0x000a: *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1; + return TRUE; /* LF */ + case 0x000b: /* VT */ + case 0x000c: /* FF */ + case 0x000d: *lenptr = 1; return TRUE; /* CR */ + case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */ + case 0x2028: /* LS */ + case 0x2029: *lenptr = 3; return TRUE; /* PS */ + default: return FALSE; + } +} + +/* End of pcre_newline.c */ diff --git a/glib/pcre/pcre_ord2utf8.c b/glib/pcre/pcre_ord2utf8.c new file mode 100644 index 0000000..fc4d6de --- /dev/null +++ b/glib/pcre/pcre_ord2utf8.c @@ -0,0 +1,78 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2006 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This file contains a private PCRE function that converts an ordinal
+character value into a UTF8 string. */
+
+
+#include "pcre_internal.h"
+
+
+/*************************************************
+* Convert character value to UTF-8 *
+*************************************************/
+
+/* This function takes an integer value in the range 0 - 0x7fffffff
+and encodes it as a UTF-8 character in 1 to 6 bytes. The result is not
+zero-terminated.
+
+Arguments:
+  cvalue     the character value
+  buffer     pointer to buffer for result - at least 6 bytes long
+
+Returns:     number of bytes placed in the buffer (always at least 1)
+*/
+
+int
+_pcre_ord2utf8(int cvalue, uschar *buffer)
+{
+register int i, j;
+
+/* Find the first table entry that is not below cvalue; its index i is the
+number of bytes that follow the first byte. (The table lives in pcre_tables.c;
+presumably it holds the largest value for each encoded length - confirm
+there.) */
+
+for (i = 0; i < _pcre_utf8_table1_size; i++)
+  if (cvalue <= _pcre_utf8_table1[i]) break;
+
+/* Fill the buffer from its last byte backwards: each trailing byte carries
+the next 6 low-order bits of the value, tagged with 0x80. */
+
+buffer += i;
+for (j = i; j > 0; j--)
+ {
+ *buffer-- = 0x80 | (cvalue & 0x3f);
+ cvalue >>= 6;
+ }
+
+/* The first byte combines the remaining high-order bits with the table2
+entry for this length. */
+
+*buffer = _pcre_utf8_table2[i] | cvalue;
+return i + 1;
+}
+
+/* End of pcre_ord2utf8.c */
diff --git a/glib/pcre/pcre_refcount.c b/glib/pcre/pcre_refcount.c
new file mode 100644
index 0000000..e696581
--- /dev/null
+++ b/glib/pcre/pcre_refcount.c
@@ -0,0 +1,77 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+       Written by Philip Hazel
+   Copyright (c) 1997-2006 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains the external function pcre_refcount(), which is an +auxiliary function that can be used to maintain a reference count in a compiled +pattern data block. This might be helpful in applications where the block is +shared by different users. */ + +#include "pcre_internal.h" + + +/************************************************* +* Maintain reference count * +*************************************************/ + +/* The reference count is a 16-bit field, initialized to zero. It is not +possible to transfer a non-zero count from one host to a different host that +has a different byte order - though I can't see why anyone in their right mind +would ever want to do that! + +Arguments: + argument_re points to compiled code + adjust value to add to the count + +Returns: the (possibly updated) count value (a non-negative number), or + a negative error number +*/ + +PCRE_DATA_SCOPE int +pcre_refcount(pcre *argument_re, int adjust) +{ +real_pcre *re = (real_pcre *)argument_re; +if (re == NULL) return PCRE_ERROR_NULL; +re->ref_count = (-adjust > re->ref_count)? 0 : + (adjust + re->ref_count > 65535)? 
65535 : + re->ref_count + adjust; +return re->ref_count; +} + +/* End of pcre_refcount.c */ diff --git a/glib/pcre/pcre_study.c b/glib/pcre/pcre_study.c new file mode 100644 index 0000000..87f8c6f --- /dev/null +++ b/glib/pcre/pcre_study.c @@ -0,0 +1,570 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2006 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains the external function pcre_study(), along with local +supporting functions. */ + + +#include "pcre_internal.h" + + +/* Returns from set_start_bits() */ + +enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE }; + + +/************************************************* +* Set a bit and maybe its alternate case * +*************************************************/ + +/* Given a character, set its bit in the table, and also the bit for the other +version of a letter if we are caseless. + +Arguments: + start_bits points to the bit map + c is the character + caseless the caseless flag + cd the block with char table pointers + +Returns: nothing +*/ + +static void +set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd) +{ +start_bits[c/8] |= (1 << (c&7)); +if (caseless && (cd->ctypes[c] & ctype_letter) != 0) + start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7)); +} + + + +/************************************************* +* Create bitmap of starting bytes * +*************************************************/ + +/* This function scans a compiled unanchored expression recursively and +attempts to build a bitmap of the set of possible starting bytes. As time goes +by, we may be able to get more clever at doing this. 
The SSB_CONTINUE return is +useful for parenthesized groups in patterns such as (a*)b where the group +provides some optional starting bytes but scanning must continue at the outer +level to find at least one mandatory byte. At the outermost level, this +function fails unless the result is SSB_DONE. + +Arguments: + code points to an expression + start_bits points to a 32-byte table, initialized to 0 + caseless the current state of the caseless flag + utf8 TRUE if in UTF-8 mode + cd the block with char table pointers + +Returns: SSB_FAIL => Failed to find any starting bytes + SSB_DONE => Found mandatory starting bytes + SSB_CONTINUE => Found optional starting bytes +*/ + +static int +set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless, + BOOL utf8, compile_data *cd) +{ +register int c; +int yield = SSB_DONE; + +#if 0 +/* ========================================================================= */ +/* The following comment and code was inserted in January 1999. In May 2006, +when it was observed to cause compiler warnings about unused values, I took it +out again. If anybody is still using OS/2, they will have to put it back +manually. */ + +/* This next statement and the later reference to dummy are here in order to +trick the optimizer of the IBM C compiler for OS/2 into generating correct +code. Apparently IBM isn't going to fix the problem, and we would rather not +disable optimization (in this module it actually makes a big difference, and +the pcre module can use all the optimization it can get). */ + +volatile int dummy; +/* ========================================================================= */ +#endif + +do + { + const uschar *tcode = code + (((int)*code == OP_CBRA)? 
3:1) + LINK_SIZE; + BOOL try_next = TRUE; + + while (try_next) /* Loop for items in this branch */ + { + int rc; + switch(*tcode) + { + /* Fail if we reach something we don't understand */ + + default: + return SSB_FAIL; + + /* If we hit a bracket or a positive lookahead assertion, recurse to set + bits from within the subpattern. If it can't find anything, we have to + give up. If it finds some mandatory character(s), we are done for this + branch. Otherwise, carry on scanning after the subpattern. */ + + case OP_BRA: + case OP_SBRA: + case OP_CBRA: + case OP_SCBRA: + case OP_ONCE: + case OP_ASSERT: + rc = set_start_bits(tcode, start_bits, caseless, utf8, cd); + if (rc == SSB_FAIL) return SSB_FAIL; + if (rc == SSB_DONE) try_next = FALSE; else + { + do tcode += GET(tcode, 1); while (*tcode == OP_ALT); + tcode += 1 + LINK_SIZE; + } + break; + + /* If we hit ALT or KET, it means we haven't found anything mandatory in + this branch, though we might have found something optional. For ALT, we + continue with the next alternative, but we have to arrange that the final + result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET, + return SSB_CONTINUE: if this is the top level, that indicates failure, + but after a nested subpattern, it causes scanning to continue. */ + + case OP_ALT: + yield = SSB_CONTINUE; + try_next = FALSE; + break; + + case OP_KET: + case OP_KETRMAX: + case OP_KETRMIN: + return SSB_CONTINUE; + + /* Skip over callout */ + + case OP_CALLOUT: + tcode += 2 + 2*LINK_SIZE; + break; + + /* Skip over lookbehind and negative lookahead assertions */ + + case OP_ASSERT_NOT: + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + do tcode += GET(tcode, 1); while (*tcode == OP_ALT); + tcode += 1 + LINK_SIZE; + break; + + /* Skip over an option setting, changing the caseless flag */ + + case OP_OPT: + caseless = (tcode[1] & PCRE_CASELESS) != 0; + tcode += 2; + break; + + /* BRAZERO does the bracket, but carries on. 
*/ + + case OP_BRAZERO: + case OP_BRAMINZERO: + if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL) + return SSB_FAIL; +/* ========================================================================= + See the comment at the head of this function concerning the next line, + which was an old fudge for the benefit of OS/2. + dummy = 1; + ========================================================================= */ + do tcode += GET(tcode,1); while (*tcode == OP_ALT); + tcode += 1 + LINK_SIZE; + break; + + /* Single-char * or ? sets the bit and tries the next item */ + + case OP_STAR: + case OP_MINSTAR: + case OP_POSSTAR: + case OP_QUERY: + case OP_MINQUERY: + case OP_POSQUERY: + set_bit(start_bits, tcode[1], caseless, cd); + tcode += 2; +#ifdef SUPPORT_UTF8 + if (utf8 && tcode[-1] >= 0xc0) + tcode += _pcre_utf8_table4[tcode[-1] & 0x3f]; +#endif + break; + + /* Single-char upto sets the bit and tries the next */ + + case OP_UPTO: + case OP_MINUPTO: + case OP_POSUPTO: + set_bit(start_bits, tcode[3], caseless, cd); + tcode += 4; +#ifdef SUPPORT_UTF8 + if (utf8 && tcode[-1] >= 0xc0) + tcode += _pcre_utf8_table4[tcode[-1] & 0x3f]; +#endif + break; + + /* At least one single char sets the bit and stops */ + + case OP_EXACT: /* Fall through */ + tcode += 2; + + case OP_CHAR: + case OP_CHARNC: + case OP_PLUS: + case OP_MINPLUS: + case OP_POSPLUS: + set_bit(start_bits, tcode[1], caseless, cd); + try_next = FALSE; + break; + + /* Single character type sets the bits and stops */ + + case OP_NOT_DIGIT: + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_digit]; + try_next = FALSE; + break; + + case OP_DIGIT: + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_digit]; + try_next = FALSE; + break; + + /* The cbit_space table has vertical tab as whitespace; we have to + discard it. 
*/ + + case OP_NOT_WHITESPACE: + for (c = 0; c < 32; c++) + { + int d = cd->cbits[c+cbit_space]; + if (c == 1) d &= ~0x08; + start_bits[c] |= ~d; + } + try_next = FALSE; + break; + + /* The cbit_space table has vertical tab as whitespace; we have to + discard it. */ + + case OP_WHITESPACE: + for (c = 0; c < 32; c++) + { + int d = cd->cbits[c+cbit_space]; + if (c == 1) d &= ~0x08; + start_bits[c] |= d; + } + try_next = FALSE; + break; + + case OP_NOT_WORDCHAR: + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_word]; + try_next = FALSE; + break; + + case OP_WORDCHAR: + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_word]; + try_next = FALSE; + break; + + /* One or more character type fudges the pointer and restarts, knowing + it will hit a single character type and stop there. */ + + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + tcode++; + break; + + case OP_TYPEEXACT: + tcode += 3; + break; + + /* Zero or more repeats of character types set the bits and then + try again. */ + + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEPOSUPTO: + tcode += 2; /* Fall through */ + + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPOSSTAR: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEPOSQUERY: + switch(tcode[1]) + { + case OP_ANY: + return SSB_FAIL; + + case OP_NOT_DIGIT: + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_digit]; + break; + + case OP_DIGIT: + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_digit]; + break; + + /* The cbit_space table has vertical tab as whitespace; we have to + discard it. */ + + case OP_NOT_WHITESPACE: + for (c = 0; c < 32; c++) + { + int d = cd->cbits[c+cbit_space]; + if (c == 1) d &= ~0x08; + start_bits[c] |= ~d; + } + break; + + /* The cbit_space table has vertical tab as whitespace; we have to + discard it. 
*/ + + case OP_WHITESPACE: + for (c = 0; c < 32; c++) + { + int d = cd->cbits[c+cbit_space]; + if (c == 1) d &= ~0x08; + start_bits[c] |= d; + } + break; + + case OP_NOT_WORDCHAR: + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_word]; + break; + + case OP_WORDCHAR: + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_word]; + break; + } + + tcode += 2; + break; + + /* Character class where all the information is in a bit map: set the + bits and either carry on or not, according to the repeat count. If it was + a negative class, and we are operating with UTF-8 characters, any byte + with a value >= 0xc4 is a potentially valid starter because it starts a + character with a value > 255. */ + + case OP_NCLASS: + if (utf8) + { + start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */ + memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */ + } + /* Fall through */ + + case OP_CLASS: + { + tcode++; + + /* In UTF-8 mode, the bits in a bit map correspond to character + values, not to byte values. However, the bit map we are constructing is + for byte values. So we have to do a conversion for characters whose + value is > 127. In fact, there are only two possible starting bytes for + characters in the range 128 - 255. */ + + if (utf8) + { + for (c = 0; c < 16; c++) start_bits[c] |= tcode[c]; + for (c = 128; c < 256; c++) + { + if ((tcode[c/8] && (1 << (c&7))) != 0) + { + int d = (c >> 6) | 0xc0; /* Set bit for this starter */ + start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */ + c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */ + } + } + } + + /* In non-UTF-8 mode, the two bit maps are completely compatible. 
*/ + + else + { + for (c = 0; c < 32; c++) start_bits[c] |= tcode[c]; + } + + /* Advance past the bit map, and act on what follows */ + + tcode += 32; + switch (*tcode) + { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRQUERY: + case OP_CRMINQUERY: + tcode++; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5; + else try_next = FALSE; + break; + + default: + try_next = FALSE; + break; + } + } + break; /* End of bitmap class handling */ + + } /* End of switch */ + } /* End of try_next loop */ + + code += GET(code, 1); /* Advance to next branch */ + } +while (*code == OP_ALT); +return yield; +} + + + +/************************************************* +* Study a compiled expression * +*************************************************/ + +/* This function is handed a compiled expression that it must study to produce +information that will speed up the matching. It returns a pcre_extra block +which then gets handed back to pcre_exec(). 
+ +Arguments: + re points to the compiled expression + options contains option bits + errorptr points to where to place error messages; + set NULL unless error + +Returns: pointer to a pcre_extra block, with study_data filled in and the + appropriate flag set; + NULL on error or if no optimization possible +*/ + +PCRE_DATA_SCOPE pcre_extra * +pcre_study(const pcre *external_re, int options, const char **errorptr) +{ +uschar start_bits[32]; +pcre_extra *extra; +pcre_study_data *study; +const uschar *tables; +uschar *code; +compile_data compile_block; +const real_pcre *re = (const real_pcre *)external_re; + +*errorptr = NULL; + +if (re == NULL || re->magic_number != MAGIC_NUMBER) + { + *errorptr = "argument is not a compiled regular expression"; + return NULL; + } + +if ((options & ~PUBLIC_STUDY_OPTIONS) != 0) + { + *errorptr = "unknown or incorrect option bit(s) set"; + return NULL; + } + +code = (uschar *)re + re->name_table_offset + + (re->name_count * re->name_entry_size); + +/* For an anchored pattern, or an unanchored pattern that has a first char, or +a multiline pattern that matches only at "line starts", no further processing +at present. */ + +if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0) + return NULL; + +/* Set the character tables in the block that is passed around */ + +tables = re->tables; +if (tables == NULL) + (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, + (void *)(&tables)); + +compile_block.lcc = tables + lcc_offset; +compile_block.fcc = tables + fcc_offset; +compile_block.cbits = tables + cbits_offset; +compile_block.ctypes = tables + ctypes_offset; + +/* See if we can find a fixed set of initial characters for the pattern. */ + +memset(start_bits, 0, 32 * sizeof(uschar)); +if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, + (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL; + +/* Get a pcre_extra block and a pcre_study_data block. 
The study data is put in +the latter, which is pointed to by the former, which may also get additional +data set later by the calling program. At the moment, the size of +pcre_study_data is fixed. We nevertheless save it in a field for returning via +the pcre_fullinfo() function so that if it becomes variable in the future, we +don't have to change that code. */ + +extra = (pcre_extra *)(pcre_malloc) + (sizeof(pcre_extra) + sizeof(pcre_study_data)); + +if (extra == NULL) + { + *errorptr = "failed to get memory"; + return NULL; + } + +study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra)); +extra->flags = PCRE_EXTRA_STUDY_DATA; +extra->study_data = study; + +study->size = sizeof(pcre_study_data); +study->options = PCRE_STUDY_MAPPED; +memcpy(study->start_bits, start_bits, sizeof(start_bits)); + +return extra; +} + +/* End of pcre_study.c */ diff --git a/glib/pcre/pcre_tables.c b/glib/pcre/pcre_tables.c new file mode 100644 index 0000000..b764b1d --- /dev/null +++ b/glib/pcre/pcre_tables.c @@ -0,0 +1,304 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2006 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
/* This module contains some fixed tables that are used by more than one of the
PCRE code modules. The tables are also #included by the pcretest program, which
uses macros to change their names from _pcre_xxx to xxxx, thereby avoiding name
clashes with the library. */


#include "pcre_internal.h"


/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
the definition is next to the definition of the opcodes in pcre_internal.h. */

const uschar _pcre_OP_lengths[] = { OP_LENGTHS };



/*************************************************
*           Tables for UTF-8 support             *
*************************************************/

/* These are the breakpoints for different numbers of bytes in a UTF-8
character: entry i is the largest value that fits in i+1 bytes. */

const int _pcre_utf8_table1[] =
  { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};

/* Number of entries in _pcre_utf8_table1, computed from the array itself so it
cannot drift out of step. */

const int _pcre_utf8_table1_size = sizeof(_pcre_utf8_table1)/sizeof(int);

/* These are the indicator bits and the mask for the data bits to set in the
first byte of a character, indexed by the number of additional bytes. */

const int _pcre_utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
const int _pcre_utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};

/* Table of the number of extra bytes, indexed by the first byte masked with
0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */

const uschar _pcre_utf8_table4[] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };

/* This table translates Unicode property names into type and code values. It
is searched by binary chop, so must be in collating sequence of name.

The names are packed into one string, each terminated by its own \0. The first
field of every _pcre_utt entry below is the byte offset of the corresponding
name inside this string (0 is "Any", 4 is "Arabic", 11 is "Armenian", ...), so
adding, removing or renaming an entry here means recomputing every later offset
in _pcre_utt. */

const char _pcre_ucp_names[] =
  "Any\0"
  "Arabic\0"
  "Armenian\0"
  "Balinese\0"
  "Bengali\0"
  "Bopomofo\0"
  "Braille\0"
  "Buginese\0"
  "Buhid\0"
  "C\0"
  "Canadian_Aboriginal\0"
  "Cc\0"
  "Cf\0"
  "Cherokee\0"
  "Cn\0"
  "Co\0"
  "Common\0"
  "Coptic\0"
  "Cs\0"
  "Cuneiform\0"
  "Cypriot\0"
  "Cyrillic\0"
  "Deseret\0"
  "Devanagari\0"
  "Ethiopic\0"
  "Georgian\0"
  "Glagolitic\0"
  "Gothic\0"
  "Greek\0"
  "Gujarati\0"
  "Gurmukhi\0"
  "Han\0"
  "Hangul\0"
  "Hanunoo\0"
  "Hebrew\0"
  "Hiragana\0"
  "Inherited\0"
  "Kannada\0"
  "Katakana\0"
  "Kharoshthi\0"
  "Khmer\0"
  "L\0"
  "L&\0"
  "Lao\0"
  "Latin\0"
  "Limbu\0"
  "Linear_B\0"
  "Ll\0"
  "Lm\0"
  "Lo\0"
  "Lt\0"
  "Lu\0"
  "M\0"
  "Malayalam\0"
  "Mc\0"
  "Me\0"
  "Mn\0"
  "Mongolian\0"
  "Myanmar\0"
  "N\0"
  "Nd\0"
  "New_Tai_Lue\0"
  "Nko\0"
  "Nl\0"
  "No\0"
  "Ogham\0"
  "Old_Italic\0"
  "Old_Persian\0"
  "Oriya\0"
  "Osmanya\0"
  "P\0"
  "Pc\0"
  "Pd\0"
  "Pe\0"
  "Pf\0"
  "Phags_Pa\0"
  "Phoenician\0"
  "Pi\0"
  "Po\0"
  "Ps\0"
  "Runic\0"
  "S\0"
  "Sc\0"
  "Shavian\0"
  "Sinhala\0"
  "Sk\0"
  "Sm\0"
  "So\0"
  "Syloti_Nagri\0"
  "Syriac\0"
  "Tagalog\0"
  "Tagbanwa\0"
  "Tai_Le\0"
  "Tamil\0"
  "Telugu\0"
  "Thaana\0"
  "Thai\0"
  "Tibetan\0"
  "Tifinagh\0"
  "Ugaritic\0"
  "Yi\0"
  "Z\0"
  "Zl\0"
  "Zp\0"
  "Zs\0";

/* One entry per name above, in the same order: { offset of the name in
_pcre_ucp_names, property type, value }. The property type says what the value
is compared against when matching (see _pcre_xclass): PT_GC entries carry a
general category (ucp_L, ...), PT_PC entries a particular character type
(ucp_Lu, ...), PT_SC entries a script (ucp_Latin, ...). PT_ANY and PT_LAMP take
no value, so theirs is 0. */

const ucp_type_table _pcre_utt[] = {
  { 0,   PT_ANY,  0 },
  { 4,   PT_SC,   ucp_Arabic },
  { 11,  PT_SC,   ucp_Armenian },
  { 20,  PT_SC,   ucp_Balinese },
  { 29,  PT_SC,   ucp_Bengali },
  { 37,  PT_SC,   ucp_Bopomofo },
  { 46,  PT_SC,   ucp_Braille },
  { 54,  PT_SC,   ucp_Buginese },
  { 63,  PT_SC,   ucp_Buhid },
  { 69,  PT_GC,   ucp_C },
  { 71,  PT_SC,   ucp_Canadian_Aboriginal },
  { 91,  PT_PC,   ucp_Cc },
  { 94,  PT_PC,   ucp_Cf },
  { 97,  PT_SC,   ucp_Cherokee },
  { 106, PT_PC,   ucp_Cn },
  { 109, PT_PC,   ucp_Co },
  { 112, PT_SC,   ucp_Common },
  { 119, PT_SC,   ucp_Coptic },
  { 126, PT_PC,   ucp_Cs },
  { 129, PT_SC,   ucp_Cuneiform },
  { 139, PT_SC,   ucp_Cypriot },
  { 147, PT_SC,   ucp_Cyrillic },
  { 156, PT_SC,   ucp_Deseret },
  { 164, PT_SC,   ucp_Devanagari },
  { 175, PT_SC,   ucp_Ethiopic },
  { 184, PT_SC,   ucp_Georgian },
  { 193, PT_SC,   ucp_Glagolitic },
  { 204, PT_SC,   ucp_Gothic },
  { 211, PT_SC,   ucp_Greek },
  { 217, PT_SC,   ucp_Gujarati },
  { 226, PT_SC,   ucp_Gurmukhi },
  { 235, PT_SC,   ucp_Han },
  { 239, PT_SC,   ucp_Hangul },
  { 246, PT_SC,   ucp_Hanunoo },
  { 254, PT_SC,   ucp_Hebrew },
  { 261, PT_SC,   ucp_Hiragana },
  { 270, PT_SC,   ucp_Inherited },
  { 280, PT_SC,   ucp_Kannada },
  { 288, PT_SC,   ucp_Katakana },
  { 297, PT_SC,   ucp_Kharoshthi },
  { 308, PT_SC,   ucp_Khmer },
  { 314, PT_GC,   ucp_L },
  { 316, PT_LAMP, 0 },
  { 319, PT_SC,   ucp_Lao },
  { 323, PT_SC,   ucp_Latin },
  { 329, PT_SC,   ucp_Limbu },
  { 335, PT_SC,   ucp_Linear_B },
  { 344, PT_PC,   ucp_Ll },
  { 347, PT_PC,   ucp_Lm },
  { 350, PT_PC,   ucp_Lo },
  { 353, PT_PC,   ucp_Lt },
  { 356, PT_PC,   ucp_Lu },
  { 359, PT_GC,   ucp_M },
  { 361, PT_SC,   ucp_Malayalam },
  { 371, PT_PC,   ucp_Mc },
  { 374, PT_PC,   ucp_Me },
  { 377, PT_PC,   ucp_Mn },
  { 380, PT_SC,   ucp_Mongolian },
  { 390, PT_SC,   ucp_Myanmar },
  { 398, PT_GC,   ucp_N },
  { 400, PT_PC,   ucp_Nd },
  { 403, PT_SC,   ucp_New_Tai_Lue },
  { 415, PT_SC,   ucp_Nko },
  { 419, PT_PC,   ucp_Nl },
  { 422, PT_PC,   ucp_No },
  { 425, PT_SC,   ucp_Ogham },
  { 431, PT_SC,   ucp_Old_Italic },
  { 442, PT_SC,   ucp_Old_Persian },
  { 454, PT_SC,   ucp_Oriya },
  { 460, PT_SC,   ucp_Osmanya },
  { 468, PT_GC,   ucp_P },
  { 470, PT_PC,   ucp_Pc },
  { 473, PT_PC,   ucp_Pd },
  { 476, PT_PC,   ucp_Pe },
  { 479, PT_PC,   ucp_Pf },
  { 482, PT_SC,   ucp_Phags_Pa },
  { 491, PT_SC,   ucp_Phoenician },
  { 502, PT_PC,   ucp_Pi },
  { 505, PT_PC,   ucp_Po },
  { 508, PT_PC,   ucp_Ps },
  { 511, PT_SC,   ucp_Runic },
  { 517, PT_GC,   ucp_S },
  { 519, PT_PC,   ucp_Sc },
  { 522, PT_SC,   ucp_Shavian },
  { 530, PT_SC,   ucp_Sinhala },
  { 538, PT_PC,   ucp_Sk },
  { 541, PT_PC,   ucp_Sm },
  { 544, PT_PC,   ucp_So },
  { 547, PT_SC,   ucp_Syloti_Nagri },
  { 560, PT_SC,   ucp_Syriac },
  { 567, PT_SC,   ucp_Tagalog },
  { 575, PT_SC,   ucp_Tagbanwa },
  { 584, PT_SC,   ucp_Tai_Le },
  { 591, PT_SC,   ucp_Tamil },
  { 597, PT_SC,   ucp_Telugu },
  { 604, PT_SC,   ucp_Thaana },
  { 611, PT_SC,   ucp_Thai },
  { 616, PT_SC,   ucp_Tibetan },
  { 624, PT_SC,   ucp_Tifinagh },
  { 633, PT_SC,   ucp_Ugaritic },
  { 642, PT_SC,   ucp_Yi },
  { 645, PT_GC,   ucp_Z },
  { 647, PT_PC,   ucp_Zl },
  { 650, PT_PC,   ucp_Zp },
  { 653, PT_PC,   ucp_Zs }
};

/* Number of entries in _pcre_utt, i.e. the upper bound for the binary chop. */

const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table);

/* End of pcre_tables.c */
+ + Written by Philip Hazel + Copyright (c) 1997-2006 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains an internal function that tests a compiled pattern to +see if it was compiled with the opposite endianness. If so, it uses an +auxiliary local function to flip the appropriate bytes. 
/*************************************************
*             Flip bytes in an integer           *
*************************************************/

/* Reverse the byte order of the low 2 or 4 bytes of a value. It is used when a
regex's magic number does not match, first to see whether the pattern was
compiled on a host of the opposite endianness and then to convert its other
multi-byte fields.

Arguments:
  value        the number to flip
  n            the number of bytes to flip; 2 selects a 16-bit swap and every
                 other value is handled as a 32-bit swap

Returns:       the flipped value; bits of the input above the flipped width do
                 not contribute to the result
*/

static unsigned long int
byteflip(unsigned long int value, int n)
{
/* Pull the bytes apart first, lowest byte in b0, then reassemble them in the
opposite order. */
const unsigned long int b0 = value & 0xff;
const unsigned long int b1 = (value >> 8) & 0xff;
unsigned long int b2, b3;

if (n == 2) return (b0 << 8) | b1;

b2 = (value >> 16) & 0xff;
b3 = (value >> 24) & 0xff;
return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
}
+ +Arguments: + re points to the regex + study points to study data, or NULL + internal_re points to a new regex block + internal_study points to a new study block + +Returns: the new block if is is indeed a byte-flipped regex + NULL if it is not +*/ + +real_pcre * +_pcre_try_flipped(const real_pcre *re, real_pcre *internal_re, + const pcre_study_data *study, pcre_study_data *internal_study) +{ +if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER) + return NULL; + +*internal_re = *re; /* To copy other fields */ +internal_re->size = byteflip(re->size, sizeof(re->size)); +internal_re->options = byteflip(re->options, sizeof(re->options)); +internal_re->top_bracket = + (pcre_uint16)byteflip(re->top_bracket, sizeof(re->top_bracket)); +internal_re->top_backref = + (pcre_uint16)byteflip(re->top_backref, sizeof(re->top_backref)); +internal_re->first_byte = + (pcre_uint16)byteflip(re->first_byte, sizeof(re->first_byte)); +internal_re->req_byte = + (pcre_uint16)byteflip(re->req_byte, sizeof(re->req_byte)); +internal_re->name_table_offset = + (pcre_uint16)byteflip(re->name_table_offset, sizeof(re->name_table_offset)); +internal_re->name_entry_size = + (pcre_uint16)byteflip(re->name_entry_size, sizeof(re->name_entry_size)); +internal_re->name_count = + (pcre_uint16)byteflip(re->name_count, sizeof(re->name_count)); + +if (study != NULL) + { + *internal_study = *study; /* To copy other fields */ + internal_study->size = byteflip(study->size, sizeof(study->size)); + internal_study->options = byteflip(study->options, sizeof(study->options)); + } + +return internal_re; +} + +/* End of pcre_tryflipped.c */ diff --git a/glib/pcre/pcre_ucp_searchfuncs.c b/glib/pcre/pcre_ucp_searchfuncs.c new file mode 100644 index 0000000..b95d279 --- /dev/null +++ b/glib/pcre/pcre_ucp_searchfuncs.c @@ -0,0 +1,126 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a 
library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2006 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+----------------------------------------------------------------------------- +*/ + +/* This file has been modified to use glib instead of the internal table + * in ucptable.c -- Marco Barisione */ + +/* This module contains code for searching the table of Unicode character +properties. */ + +#include "pcre_internal.h" + +#include "ucp.h" /* Category definitions */ +#include "ucpinternal.h" /* Internal table details */ + + +/* Table to translate from particular type value to the general value. */ + +static int ucp_gentype[] = { + ucp_C, ucp_C, ucp_C, ucp_C, ucp_C, /* Cc, Cf, Cn, Co, Cs */ + ucp_L, ucp_L, ucp_L, ucp_L, ucp_L, /* Ll, Lu, Lm, Lo, Lt */ + ucp_M, ucp_M, ucp_M, /* Mc, Me, Mn */ + ucp_N, ucp_N, ucp_N, /* Nd, Nl, No */ + ucp_P, ucp_P, ucp_P, ucp_P, ucp_P, /* Pc, Pd, Pe, Pf, Pi */ + ucp_P, ucp_P, /* Ps, Po */ + ucp_S, ucp_S, ucp_S, ucp_S, /* Sc, Sk, Sm, So */ + ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */ +}; + + + +/************************************************* +* Search table and return type * +*************************************************/ + +/* Three values are returned: the category is ucp_C, ucp_L, etc. The detailed +character type is ucp_Lu, ucp_Nd, etc. The script is ucp_Latin, etc. + +Arguments: + c the character value + type_ptr the detailed character type is returned here + script_ptr the script is returned here + +Returns: the character type category +*/ + +int +_pcre_ucp_findprop(const unsigned int c, int *type_ptr, int *script_ptr) +{ +/* Note that the Unicode types have the same values in glib and in + * PCRE, so ucp_Ll == G_UNICODE_LOWERCASE_LETTER, + * ucp_Zs == G_UNICODE_SPACE_SEPARATOR, and so on. 
*/ +*type_ptr = g_unichar_type(c); +*script_ptr = g_unichar_get_script(c); +return ucp_gentype[*type_ptr]; +} + + + + +/************************************************* +* Search table and return other case * +*************************************************/ + +/* If the given character is a letter, and there is another case for the +letter, return the other case. Otherwise, return -1. + +Arguments: + c the character value + +Returns: the other case or NOTACHAR if none +*/ + +unsigned int +_pcre_ucp_othercase(const unsigned int c) +{ +int other_case = NOTACHAR; + +if (g_unichar_islower(c)) + other_case = g_unichar_toupper(c); +else if (g_unichar_isupper(c)) + other_case = g_unichar_tolower(c); + +if (other_case == c) + other_case = NOTACHAR; + +return other_case; +} + + +/* End of pcre_ucp_searchfuncs.c */ diff --git a/glib/pcre/pcre_valid_utf8.c b/glib/pcre/pcre_valid_utf8.c new file mode 100644 index 0000000..a5766b4 --- /dev/null +++ b/glib/pcre/pcre_valid_utf8.c @@ -0,0 +1,13 @@ +#include "pcre_internal.h" + +/* + * This function is not needed by GRegex, so print an error and + * return always -1, that is the string is a valid UTF-8 encoded + * string. + */ +int +_pcre_valid_utf8(const uschar *string, int length) +{ +g_warning ("%s: this function should not be called", G_STRLOC); +return -1; +} diff --git a/glib/pcre/pcre_version.c b/glib/pcre/pcre_version.c new file mode 100644 index 0000000..9edf3e0 --- /dev/null +++ b/glib/pcre/pcre_version.c @@ -0,0 +1,86 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel + Copyright (c) 1997-2006 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains the external function pcre_version(), which returns a +string that identifies the PCRE version that is in use. 
*/ + + +#include "pcre_internal.h" + + +/************************************************* +* Return version string * +*************************************************/ + +/* These macros are the standard way of turning unquoted text into C strings. +They allow macros like PCRE_MAJOR to be defined without quotes, which is +convenient for user programs that want to test its value. */ + +#define STRING(a) # a +#define XSTRING(s) STRING(s) + +/* A problem turned up with PCRE_PRERELEASE, which is defined empty for +production releases. Originally, it was used naively in this code: + + return XSTRING(PCRE_MAJOR) + "." XSTRING(PCRE_MINOR) + XSTRING(PCRE_PRERELEASE) + " " XSTRING(PCRE_DATE); + +However, when PCRE_PRERELEASE is empty, this leads to an attempted expansion of +STRING(). The C standard states: "If (before argument substitution) any +argument consists of no preprocessing tokens, the behavior is undefined." It +turns out the gcc treats this case as a single empty string - which is what we +really want - but Visual C grumbles about the lack of an argument for the +macro. Unfortunately, both are within their rights. To cope with both ways of +handling this, I had resort to some messy hackery that does a test at run time. +I could find no way of detecting that a macro is defined as an empty string at +pre-processor time. This hack uses a standard trick for avoiding calling +the STRING macro with an empty argument when doing the test. */ + +PCRE_DATA_SCOPE const char * +pcre_version(void) +{ +return (XSTRING(Z PCRE_PRERELEASE)[1] == 0)? 
+ XSTRING(PCRE_MAJOR.PCRE_MINOR PCRE_DATE) : + XSTRING(PCRE_MAJOR.PCRE_MINOR) XSTRING(PCRE_PRERELEASE PCRE_DATE); +} + +/* End of pcre_version.c */ diff --git a/glib/pcre/pcre_xclass.c b/glib/pcre/pcre_xclass.c new file mode 100644 index 0000000..57c514b --- /dev/null +++ b/glib/pcre/pcre_xclass.c @@ -0,0 +1,144 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2006 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains an internal function that is used to match an extended +class (one that contains characters whose values are > 255). It is used by both +pcre_exec() and pcre_def_exec(). */ + + +#include "pcre_internal.h" + + +/************************************************* +* Match character against an XCLASS * +*************************************************/ + +/* This function is called to match a character against an extended class that +might contain values > 255. + +Arguments: + c the character + data points to the flag byte of the XCLASS data + +Returns: TRUE if character matches, else FALSE +*/ + +BOOL +_pcre_xclass(int c, const uschar *data) +{ +int t; +BOOL negated = (*data & XCL_NOT) != 0; + +/* Character values < 256 are matched against a bitmap, if one is present. If +not, we still carry on, because there may be ranges that start below 256 in the +additional data. */ + +if (c < 256) + { + if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0) + return !negated; /* char found */ + } + +/* First skip the bit map if present. Then match against the list of Unicode +properties or large chars or ranges that end with a large char. We won't ever +encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. 
*/ + +if ((*data++ & XCL_MAP) != 0) data += 32; + +while ((t = *data++) != XCL_END) + { + int x, y; + if (t == XCL_SINGLE) + { + GETCHARINC(x, data); + if (c == x) return !negated; + } + else if (t == XCL_RANGE) + { + GETCHARINC(x, data); + GETCHARINC(y, data); + if (c >= x && c <= y) return !negated; + } + +#ifdef SUPPORT_UCP + else /* XCL_PROP & XCL_NOTPROP */ + { + int chartype, script; + int category = _pcre_ucp_findprop(c, &chartype, &script); + + switch(*data) + { + case PT_ANY: + if (t == XCL_PROP) return !negated; + break; + + case PT_LAMP: + if ((chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt) == + (t == XCL_PROP)) return !negated; + break; + + case PT_GC: + if ((data[1] == category) == (t == XCL_PROP)) return !negated; + break; + + case PT_PC: + if ((data[1] == chartype) == (t == XCL_PROP)) return !negated; + break; + + case PT_SC: + if ((data[1] == script) == (t == XCL_PROP)) return !negated; + break; + + /* This should never occur, but compilers may mutter if there is no + default. */ + + default: + return FALSE; + } + + data += 2; + } +#endif /* SUPPORT_UCP */ + } + +return negated; /* char did not match */ +} + +/* End of pcre_xclass.c */ diff --git a/glib/pcre/ucp.h b/glib/pcre/ucp.h new file mode 100644 index 0000000..b2616b2 --- /dev/null +++ b/glib/pcre/ucp.h @@ -0,0 +1,133 @@ +/************************************************* +* Unicode Property Table handler * +*************************************************/ + +#ifndef _UCP_H +#define _UCP_H + +/* This file contains definitions of the property values that are returned by +the function _pcre_ucp_findprop(). New values that are added for new releases +of Unicode should always be at the end of each enum, for backwards +compatibility. */ + +/* These are the general character categories. 
/* This file contains definitions of the property values that are returned by
the function _pcre_ucp_findprop(). New values that are added for new releases
of Unicode should always be at the end of each enum, for backwards
compatibility. */

/* These are the general character categories. They are the values stored in
the ucp_gentype table in pcre_ucp_searchfuncs.c and returned by
_pcre_ucp_findprop(). */

enum {
  ucp_C,     /* Other */
  ucp_L,     /* Letter */
  ucp_M,     /* Mark */
  ucp_N,     /* Number */
  ucp_P,     /* Punctuation */
  ucp_S,     /* Symbol */
  ucp_Z      /* Separator */
};

/* These are the particular character types. The order is significant:
_pcre_ucp_findprop() stores the result of g_unichar_type() directly as one of
these values and uses it to index ucp_gentype, so each enumerator must keep the
same numeric value as the corresponding glib type (ucp_Ll ==
G_UNICODE_LOWERCASE_LETTER, ucp_Zs == G_UNICODE_SPACE_SEPARATOR, and so on). */

enum {
  ucp_Cc,    /* Control */
  ucp_Cf,    /* Format */
  ucp_Cn,    /* Unassigned */
  ucp_Co,    /* Private use */
  ucp_Cs,    /* Surrogate */
  ucp_Ll,    /* Lower case letter */
  ucp_Lm,    /* Modifier letter */
  ucp_Lo,    /* Other letter */
  ucp_Lt,    /* Title case letter */
  ucp_Lu,    /* Upper case letter */
  ucp_Mc,    /* Spacing mark */
  ucp_Me,    /* Enclosing mark */
  ucp_Mn,    /* Non-spacing mark */
  ucp_Nd,    /* Decimal number */
  ucp_Nl,    /* Letter number */
  ucp_No,    /* Other number */
  ucp_Pc,    /* Connector punctuation */
  ucp_Pd,    /* Dash punctuation */
  ucp_Pe,    /* Close punctuation */
  ucp_Pf,    /* Final punctuation */
  ucp_Pi,    /* Initial punctuation */
  ucp_Po,    /* Other punctuation */
  ucp_Ps,    /* Open punctuation */
  ucp_Sc,    /* Currency symbol */
  ucp_Sk,    /* Modifier symbol */
  ucp_Sm,    /* Mathematical symbol */
  ucp_So,    /* Other symbol */
  ucp_Zl,    /* Line separator */
  ucp_Zp,    /* Paragraph separator */
  ucp_Zs     /* Space separator */
};

/* These are the script identifications. Each one is defined as the matching
glib script constant rather than by position, so the value that
_pcre_ucp_findprop() gets from g_unichar_get_script() can be compared with
these directly and the order of the list does not affect the values. */

enum {
  ucp_Arabic = G_UNICODE_SCRIPT_ARABIC,
  ucp_Armenian = G_UNICODE_SCRIPT_ARMENIAN,
  ucp_Bengali = G_UNICODE_SCRIPT_BENGALI,
  ucp_Bopomofo = G_UNICODE_SCRIPT_BOPOMOFO,
  ucp_Braille = G_UNICODE_SCRIPT_BRAILLE,
  ucp_Buginese = G_UNICODE_SCRIPT_BUGINESE,
  ucp_Buhid = G_UNICODE_SCRIPT_BUHID,
  ucp_Canadian_Aboriginal = G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL,
  ucp_Cherokee = G_UNICODE_SCRIPT_CHEROKEE,
  ucp_Common = G_UNICODE_SCRIPT_COMMON,
  ucp_Coptic = G_UNICODE_SCRIPT_COPTIC,
  ucp_Cypriot = G_UNICODE_SCRIPT_CYPRIOT,
  ucp_Cyrillic = G_UNICODE_SCRIPT_CYRILLIC,
  ucp_Deseret = G_UNICODE_SCRIPT_DESERET,
  ucp_Devanagari = G_UNICODE_SCRIPT_DEVANAGARI,
  ucp_Ethiopic = G_UNICODE_SCRIPT_ETHIOPIC,
  ucp_Georgian = G_UNICODE_SCRIPT_GEORGIAN,
  ucp_Glagolitic = G_UNICODE_SCRIPT_GLAGOLITIC,
  ucp_Gothic = G_UNICODE_SCRIPT_GOTHIC,
  ucp_Greek = G_UNICODE_SCRIPT_GREEK,
  ucp_Gujarati = G_UNICODE_SCRIPT_GUJARATI,
  ucp_Gurmukhi = G_UNICODE_SCRIPT_GURMUKHI,
  ucp_Han = G_UNICODE_SCRIPT_HAN,
  ucp_Hangul = G_UNICODE_SCRIPT_HANGUL,
  ucp_Hanunoo = G_UNICODE_SCRIPT_HANUNOO,
  ucp_Hebrew = G_UNICODE_SCRIPT_HEBREW,
  ucp_Hiragana = G_UNICODE_SCRIPT_HIRAGANA,
  ucp_Inherited = G_UNICODE_SCRIPT_INHERITED,
  ucp_Kannada = G_UNICODE_SCRIPT_KANNADA,
  ucp_Katakana = G_UNICODE_SCRIPT_KATAKANA,
  ucp_Kharoshthi = G_UNICODE_SCRIPT_KHAROSHTHI,
  ucp_Khmer = G_UNICODE_SCRIPT_KHMER,
  ucp_Lao = G_UNICODE_SCRIPT_LAO,
  ucp_Latin = G_UNICODE_SCRIPT_LATIN,
  ucp_Limbu = G_UNICODE_SCRIPT_LIMBU,
  ucp_Linear_B = G_UNICODE_SCRIPT_LINEAR_B,
  ucp_Malayalam = G_UNICODE_SCRIPT_MALAYALAM,
  ucp_Mongolian = G_UNICODE_SCRIPT_MONGOLIAN,
  ucp_Myanmar = G_UNICODE_SCRIPT_MYANMAR,
  ucp_New_Tai_Lue = G_UNICODE_SCRIPT_NEW_TAI_LUE,
  ucp_Ogham = G_UNICODE_SCRIPT_OGHAM,
  ucp_Old_Italic = G_UNICODE_SCRIPT_OLD_ITALIC,
  ucp_Old_Persian = G_UNICODE_SCRIPT_OLD_PERSIAN,
  ucp_Oriya = G_UNICODE_SCRIPT_ORIYA,
  ucp_Osmanya = G_UNICODE_SCRIPT_OSMANYA,
  ucp_Runic = G_UNICODE_SCRIPT_RUNIC,
  ucp_Shavian = G_UNICODE_SCRIPT_SHAVIAN,
  ucp_Sinhala = G_UNICODE_SCRIPT_SINHALA,
  ucp_Syloti_Nagri = G_UNICODE_SCRIPT_SYLOTI_NAGRI,
  ucp_Syriac = G_UNICODE_SCRIPT_SYRIAC,
  ucp_Tagalog = G_UNICODE_SCRIPT_TAGALOG,
  ucp_Tagbanwa = G_UNICODE_SCRIPT_TAGBANWA,
  ucp_Tai_Le = G_UNICODE_SCRIPT_TAI_LE,
  ucp_Tamil = G_UNICODE_SCRIPT_TAMIL,
  ucp_Telugu = G_UNICODE_SCRIPT_TELUGU,
  ucp_Thaana = G_UNICODE_SCRIPT_THAANA,
  ucp_Thai = G_UNICODE_SCRIPT_THAI,
  ucp_Tibetan = G_UNICODE_SCRIPT_TIBETAN,
  ucp_Tifinagh = G_UNICODE_SCRIPT_TIFINAGH,
  ucp_Ugaritic = G_UNICODE_SCRIPT_UGARITIC,
  ucp_Yi = G_UNICODE_SCRIPT_YI,
  ucp_Balinese = G_UNICODE_SCRIPT_BALINESE,      /* New for Unicode 5.0.0 */
  ucp_Cuneiform = G_UNICODE_SCRIPT_CUNEIFORM,    /* New for Unicode 5.0.0 */
  ucp_Nko = G_UNICODE_SCRIPT_NKO,                /* New for Unicode 5.0.0 */
  ucp_Phags_Pa = G_UNICODE_SCRIPT_PHAGS_PA,      /* New for Unicode 5.0.0 */
  ucp_Phoenician = G_UNICODE_SCRIPT_PHOENICIAN   /* New for Unicode 5.0.0 */
};

#endif

/* End of ucp.h */
*/ + +typedef struct cnode { + pcre_uint32 f0; + pcre_uint32 f1; +} cnode; + +/* Things for the f0 field */ + +#define f0_scriptmask 0xff000000 /* Mask for script field */ +#define f0_scriptshift 24 /* Shift for script value */ +#define f0_rangeflag 0x00f00000 /* Flag for a range item */ +#define f0_charmask 0x001fffff /* Mask for code point value */ + +/* Things for the f1 field */ + +#define f1_typemask 0xfc000000 /* Mask for char type field */ +#define f1_typeshift 26 /* Shift for the type field */ +#define f1_rangemask 0x0000ffff /* Mask for a range offset */ +#define f1_casemask 0x0000ffff /* Mask for a case offset */ +#define f1_caseneg 0xffff8000 /* Bits for negation */ + +/* The data consists of a vector of structures of type cnode. The two unsigned +32-bit integers are used as follows: + +(f0) (1) The most significant byte holds the script number. The numbers are + defined by the enum in ucp.h. + + (2) The 0x00800000 bit is set if this entry defines a range of characters. + It is not set if this entry defines a single character + + (3) The 0x00600000 bits are spare. + + (4) The 0x001fffff bits contain the code point. No Unicode code point will + ever be greater than 0x0010ffff, so this should be OK for ever. + +(f1) (1) The 0xfc000000 bits contain the character type number. The numbers are + defined by an enum in ucp.h. + + (2) The 0x03ff0000 bits are spare. + + (3) The 0x0000ffff bits contain EITHER the unsigned offset to the top of + range if this entry defines a range, OR the *signed* offset to the + character's "other case" partner if this entry defines a single + character. There is no partner if the value is zero. 
+ +------------------------------------------------------------------------------- +| script (8) |.|.|.| codepoint (21) || type (6) |.|.| spare (8) | offset (16) | +------------------------------------------------------------------------------- + | | | | | + | | |-> spare | |-> spare + | | | + | |-> spare |-> spare + | + |-> range flag + +The upper/lower casing information is set only for characters that come in +pairs. The non-one-to-one mappings in the Unicode data are ignored. + +When searching the data, proceed as follows: + +(1) Set up for a binary chop search. + +(2) If the top is not greater than the bottom, the character is not in the + table. Its type must therefore be "Cn" ("Undefined"). + +(3) Find the middle vector element. + +(4) Extract the code point and compare. If equal, we are done. + +(5) If the test character is smaller, set the top to the current point, and + goto (2). + +(6) If the current entry defines a range, compute the last character by adding + the offset, and see if the test character is within the range. If it is, + we are done. + +(7) Otherwise, set the bottom to one element past the current point and goto + (2). 
+*/ + +#endif /* _UCPINTERNAL_H */ + +/* End of ucpinternal.h */ diff --git a/glib/update-pcre/Makefile.am b/glib/update-pcre/Makefile.am new file mode 100644 index 0000000..ac7f82c --- /dev/null +++ b/glib/update-pcre/Makefile.am @@ -0,0 +1,8 @@ +EXTRA_DIST = \ + update.sh \ + Makefile.am-1 \ + Makefile.am-2 \ + digitab.patch \ + memory.patch \ + pcre_ucp_searchfuncs.c \ + pcre_valid_utf8.c diff --git a/glib/update-pcre/Makefile.am-1 b/glib/update-pcre/Makefile.am-1 new file mode 100644 index 0000000..fb71e28 --- /dev/null +++ b/glib/update-pcre/Makefile.am-1 @@ -0,0 +1,28 @@ +INCLUDES = \ + -DG_LOG_DOMAIN=\"GLib-GRegex\" \ + -DSUPPORT_UCP \ + -DSUPPORT_UTF8 \ + -DNEWLINE=-1 \ + -DMATCH_LIMIT=10000000 \ + -DMATCH_LIMIT_RECURSION=10000000 \ + -DMAX_NAME_SIZE=32 \ + -DMAX_NAME_COUNT=10000 \ + -DMAX_DUPLENGTH=30000 \ + -DLINK_SIZE=2 \ + -DEBCDIC=0 \ + -DPOSIX_MALLOC_THRESHOLD=10 \ + -I$(top_srcdir) \ + -I$(srcdir) \ + -I$(top_srcdir)/glib \ + @GLIB_DEBUG_FLAGS@ \ + -DG_DISABLE_DEPRECATED \ + $(DEPRECATED_FLAGS)\ + $(WARN_CFLAGS) \ + $(PCRE_WARN_CFLAGS) \ + $(DEP_CFLAGS) + +noinst_LTLIBRARIES = libpcre.la + +libpcre_headers = + +libpcre_la_SOURCES = \ diff --git a/glib/update-pcre/Makefile.am-2 b/glib/update-pcre/Makefile.am-2 new file mode 100644 index 0000000..94d4d3c --- /dev/null +++ b/glib/update-pcre/Makefile.am-2 @@ -0,0 +1,10 @@ + $(libpcre_headers) + +libpcre_la_LIBADD = $(DEP_LIBS) + +libpcre_la_LDFLAGS = -no-undefined + +EXTRA_DIST = \ + COPYING \ + makefile.msc + diff --git a/glib/update-pcre/digitab.patch b/glib/update-pcre/digitab.patch new file mode 100644 index 0000000..a745fbb --- /dev/null +++ b/glib/update-pcre/digitab.patch @@ -0,0 +1,133 @@ +--- pcre_compile.c 2006-10-10 12:00:00.000000000 +0200 ++++ pcre_compile.c 2006-10-10 12:00:00.000000000 +0200 +@@ -246,130 +246,6 @@ static const char *error_texts[] = { + }; + + +-/* Table to identify digits and hex digits. This is used when compiling +-patterns. 
Note that the tables in chartables are dependent on the locale, and +-may mark arbitrary characters as digits - but the PCRE compiling code expects +-to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have +-a private table here. It costs 256 bytes, but it is a lot faster than doing +-character value tests (at least in some simple cases I timed), and in some +-applications one wants PCRE to compile efficiently as well as match +-efficiently. +- +-For convenience, we use the same bit definitions as in chartables: +- +- 0x04 decimal digit +- 0x08 hexadecimal digit +- +-Then we can use ctype_digit and ctype_xdigit in the code. */ +- +-#if !EBCDIC /* This is the "normal" case, for ASCII systems */ +-static const unsigned char digitab[] = +- { +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */ +- 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */ +- 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? 
*/ +- 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */ +- 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ +- +-#else /* This is the "abnormal" case, for EBCDIC systems */ +-static const unsigned char digitab[] = +- { +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 
/* - 71 40 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- ¬ */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */ +- 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ +- 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */ +- 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */ +- 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */ +- +-static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */ +- 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */ +- 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */ +- 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ +- 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ +- 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */ +- 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */ 
+- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */ +- 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- ¬ */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */ +- 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */ +- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */ +- 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */ +- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */ +- 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */ +- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */ +- 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */ +- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */ +- 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */ +- 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ +- 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */ +- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */ +- 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */ +- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */ +- 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */ +- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */ +- 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */ +- 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */ +-#endif +- +- + /* Definition to allow mutual recursion */ + + static BOOL diff --git a/glib/update-pcre/memory.patch b/glib/update-pcre/memory.patch new file mode 100644 index 0000000..65b7b97 --- /dev/null +++ b/glib/update-pcre/memory.patch @@ -0,0 +1,87 @@ +diff -r 0f4042339eb5 pcre/pcre.h +--- pcre/pcre.h Tue Jul 25 22:39:16 2006 +0200 ++++ pcre/pcre.h Tue Jul 25 22:52:10 2006 +0200 +@@ -233,25 +233,14 @@ typedef struct pcre_callout_block { + /* ------------------------------------------------------------------ */ + } pcre_callout_block; + +-/* Indirection for store get and free functions. These can be set to +-alternative malloc/free functions if required. Special ones are used in the +-non-recursive case for "frames". 
There is also an optional callout function +-that is triggered by the (?) regex item. For Virtual Pascal, these definitions +-have to take another form. */ +- +-#ifndef VPCOMPAT +-PCRE_DATA_SCOPE void *(*pcre_malloc)(size_t); +-PCRE_DATA_SCOPE void (*pcre_free)(void *); +-PCRE_DATA_SCOPE void *(*pcre_stack_malloc)(size_t); +-PCRE_DATA_SCOPE void (*pcre_stack_free)(void *); ++#include "glib.h" ++#include "galias.h" ++ ++#define pcre_malloc g_try_malloc ++#define pcre_free g_free ++#define pcre_stack_malloc g_try_malloc ++ + PCRE_DATA_SCOPE int (*pcre_callout)(pcre_callout_block *); +-#else /* VPCOMPAT */ +-PCRE_DATA_SCOPE void *pcre_malloc(size_t); +-PCRE_DATA_SCOPE void pcre_free(void *); +-PCRE_DATA_SCOPE void *pcre_stack_malloc(size_t); +-PCRE_DATA_SCOPE void pcre_stack_free(void *); +-PCRE_DATA_SCOPE int pcre_callout(pcre_callout_block *); +-#endif /* VPCOMPAT */ + + /* Exported PCRE functions */ + +diff -r 0f4042339eb5 pcre/pcre_globals.c +--- pcre/pcre_globals.c Tue Jul 25 22:39:16 2006 +0200 ++++ pcre/pcre_globals.c Tue Jul 25 22:52:10 2006 +0200 +@@ -50,32 +50,9 @@ differently, and global variables are no + #include "pcre_internal.h" + + +-#ifndef VPCOMPAT +- +-/************************************************************************** +-This code used to be here for use when compiling as a C++ library. However, +-according to Dair Grant it is not needed: " +- +- Including 'extern "C"' in the declaration generates an "initialized and +- declared `extern'" warning from gcc 4.0.1. Since we include pcre_internal.h, +- which includes pcre.h, which declares these prototypes within an extern "C" {} +- block, we shouldn't need the prefix here. +- +-So, from Release 7.0 I have cut this out. 
+- + #ifdef __cplusplus +-extern "C" void *(*pcre_malloc)(size_t) = malloc; +-extern "C" void (*pcre_free)(void *) = free; +-extern "C" void *(*pcre_stack_malloc)(size_t) = malloc; +-extern "C" void (*pcre_stack_free)(void *) = free; + extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL; + #else +-**************************************************************************/ +- +-void *(*pcre_malloc)(size_t) = malloc; +-void (*pcre_free)(void *) = free; +-void *(*pcre_stack_malloc)(size_t) = malloc; +-void (*pcre_stack_free)(void *) = free; + int (*pcre_callout)(pcre_callout_block *) = NULL; + #endif + +diff -r 0f4042339eb5 pcre/pcre_internal.h +--- pcre/pcre_internal.h Tue Jul 25 22:39:16 2006 +0200 ++++ pcre/pcre_internal.h Tue Jul 25 22:52:10 2006 +0200 +@@ -480,10 +480,7 @@ variable-length repeat, or a anything ot + + /* Miscellaneous definitions */ + +-typedef int BOOL; +- +-#define FALSE 0 +-#define TRUE 1 ++typedef gboolean BOOL; + + /* Escape items that are just an encoding of a particular data value. */ + diff --git a/glib/update-pcre/pcre_ucp_searchfuncs.c b/glib/update-pcre/pcre_ucp_searchfuncs.c new file mode 100644 index 0000000..b95d279 --- /dev/null +++ b/glib/update-pcre/pcre_ucp_searchfuncs.c @@ -0,0 +1,126 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2006 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* This file has been modified to use glib instead of the internal table + * in ucptable.c -- Marco Barisione */ + +/* This module contains code for searching the table of Unicode character +properties. */ + +#include "pcre_internal.h" + +#include "ucp.h" /* Category definitions */ +#include "ucpinternal.h" /* Internal table details */ + + +/* Table to translate from particular type value to the general value. 
*/ + +/* The table is only ever read, so keep it const. */ +static const int ucp_gentype[] = { + ucp_C, ucp_C, ucp_C, ucp_C, ucp_C, /* Cc, Cf, Cn, Co, Cs */ + ucp_L, ucp_L, ucp_L, ucp_L, ucp_L, /* Ll, Lu, Lm, Lo, Lt */ + ucp_M, ucp_M, ucp_M, /* Mc, Me, Mn */ + ucp_N, ucp_N, ucp_N, /* Nd, Nl, No */ + ucp_P, ucp_P, ucp_P, ucp_P, ucp_P, /* Pc, Pd, Pe, Pf, Pi */ + ucp_P, ucp_P, /* Ps, Po */ + ucp_S, ucp_S, ucp_S, ucp_S, /* Sc, Sk, Sm, So */ + ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */ +}; + + + +/************************************************* +* Search table and return type * +*************************************************/ + +/* Three values are returned: the category is ucp_C, ucp_L, etc. The detailed +character type is ucp_Lu, ucp_Nd, etc. The script is ucp_Latin, etc. + +Arguments: + c the character value + type_ptr the detailed character type is returned here + script_ptr the script is returned here + +Returns: the character type category +*/ + +int +_pcre_ucp_findprop(const unsigned int c, int *type_ptr, int *script_ptr) +{ +/* Note that the Unicode types have the same values in glib and in + * PCRE, so ucp_Ll == G_UNICODE_LOWERCASE_LETTER, + * ucp_Zs == G_UNICODE_SPACE_SEPARATOR, and so on. */ +*type_ptr = g_unichar_type(c); +*script_ptr = g_unichar_get_script(c); +/* The glib type value is used directly as the index into ucp_gentype. */ +return ucp_gentype[*type_ptr]; +} + + + + +/************************************************* +* Search table and return other case * +*************************************************/ + +/* If the given character is a letter, and there is another case for the +letter, return the other case. Otherwise, return NOTACHAR.
+ +Arguments: + c the character value + +Returns: the other case or NOTACHAR if none +*/ + +unsigned int +_pcre_ucp_othercase(const unsigned int c) +{ +int other_case = NOTACHAR; + +if (g_unichar_islower(c)) + other_case = g_unichar_toupper(c); +else if (g_unichar_isupper(c)) + other_case = g_unichar_tolower(c); + +if (other_case == c) + other_case = NOTACHAR; + +return other_case; +} + + +/* End of pcre_ucp_searchfuncs.c */ diff --git a/glib/update-pcre/pcre_valid_utf8.c b/glib/update-pcre/pcre_valid_utf8.c new file mode 100644 index 0000000..a5766b4 --- /dev/null +++ b/glib/update-pcre/pcre_valid_utf8.c @@ -0,0 +1,13 @@ +#include "pcre_internal.h" + +/* + * This function is not needed by GRegex, so print an error and + * return always -1, that is the string is a valid UTF-8 encoded + * string. + */ +int +_pcre_valid_utf8(const uschar *string, int length) +{ +g_warning ("%s: this function should not be called", G_STRLOC); +return -1; +} diff --git a/glib/update-pcre/ucp.patch b/glib/update-pcre/ucp.patch new file mode 100644 index 0000000..8abd812 --- /dev/null +++ b/glib/update-pcre/ucp.patch @@ -0,0 +1,141 @@ +--- pcre/ucp.h 2006-07-05 13:28:01.000000000 +0200 ++++ pcre/ucp.h 2006-10-09 16:27:19.000000000 +0200 +@@ -60,72 +60,72 @@ enum { + /* These are the script identifications. 
*/ + + enum { +- ucp_Arabic, +- ucp_Armenian, +- ucp_Bengali, +- ucp_Bopomofo, +- ucp_Braille, +- ucp_Buginese, +- ucp_Buhid, +- ucp_Canadian_Aboriginal, +- ucp_Cherokee, +- ucp_Common, +- ucp_Coptic, +- ucp_Cypriot, +- ucp_Cyrillic, +- ucp_Deseret, +- ucp_Devanagari, +- ucp_Ethiopic, +- ucp_Georgian, +- ucp_Glagolitic, +- ucp_Gothic, +- ucp_Greek, +- ucp_Gujarati, +- ucp_Gurmukhi, +- ucp_Han, +- ucp_Hangul, +- ucp_Hanunoo, +- ucp_Hebrew, +- ucp_Hiragana, +- ucp_Inherited, +- ucp_Kannada, +- ucp_Katakana, +- ucp_Kharoshthi, +- ucp_Khmer, +- ucp_Lao, +- ucp_Latin, +- ucp_Limbu, +- ucp_Linear_B, +- ucp_Malayalam, +- ucp_Mongolian, +- ucp_Myanmar, +- ucp_New_Tai_Lue, +- ucp_Ogham, +- ucp_Old_Italic, +- ucp_Old_Persian, +- ucp_Oriya, +- ucp_Osmanya, +- ucp_Runic, +- ucp_Shavian, +- ucp_Sinhala, +- ucp_Syloti_Nagri, +- ucp_Syriac, +- ucp_Tagalog, +- ucp_Tagbanwa, +- ucp_Tai_Le, +- ucp_Tamil, +- ucp_Telugu, +- ucp_Thaana, +- ucp_Thai, +- ucp_Tibetan, +- ucp_Tifinagh, +- ucp_Ugaritic, +- ucp_Yi, +- ucp_Balinese, /* New for Unicode 5.0.0 */ +- ucp_Cuneiform, /* New for Unicode 5.0.0 */ +- ucp_Nko, /* New for Unicode 5.0.0 */ +- ucp_Phags_Pa, /* New for Unicode 5.0.0 */ +- ucp_Phoenician /* New for Unicode 5.0.0 */ ++ ucp_Arabic = G_UNICODE_SCRIPT_ARABIC, ++ ucp_Armenian = G_UNICODE_SCRIPT_ARMENIAN, ++ ucp_Bengali = G_UNICODE_SCRIPT_BENGALI, ++ ucp_Bopomofo = G_UNICODE_SCRIPT_BOPOMOFO, ++ ucp_Braille = G_UNICODE_SCRIPT_BRAILLE, ++ ucp_Buginese = G_UNICODE_SCRIPT_BUGINESE, ++ ucp_Buhid = G_UNICODE_SCRIPT_BUHID, ++ ucp_Canadian_Aboriginal = G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL, ++ ucp_Cherokee = G_UNICODE_SCRIPT_CHEROKEE, ++ ucp_Common = G_UNICODE_SCRIPT_COMMON, ++ ucp_Coptic = G_UNICODE_SCRIPT_COPTIC, ++ ucp_Cypriot = G_UNICODE_SCRIPT_CYPRIOT, ++ ucp_Cyrillic = G_UNICODE_SCRIPT_CYRILLIC, ++ ucp_Deseret = G_UNICODE_SCRIPT_DESERET, ++ ucp_Devanagari = G_UNICODE_SCRIPT_DEVANAGARI, ++ ucp_Ethiopic = G_UNICODE_SCRIPT_ETHIOPIC, ++ ucp_Georgian = G_UNICODE_SCRIPT_GEORGIAN, ++ 
ucp_Glagolitic = G_UNICODE_SCRIPT_GLAGOLITIC, ++ ucp_Gothic = G_UNICODE_SCRIPT_GOTHIC, ++ ucp_Greek = G_UNICODE_SCRIPT_GREEK, ++ ucp_Gujarati = G_UNICODE_SCRIPT_GUJARATI, ++ ucp_Gurmukhi = G_UNICODE_SCRIPT_GURMUKHI, ++ ucp_Han = G_UNICODE_SCRIPT_HAN, ++ ucp_Hangul = G_UNICODE_SCRIPT_HANGUL, ++ ucp_Hanunoo = G_UNICODE_SCRIPT_HANUNOO, ++ ucp_Hebrew = G_UNICODE_SCRIPT_HEBREW, ++ ucp_Hiragana = G_UNICODE_SCRIPT_HIRAGANA, ++ ucp_Inherited = G_UNICODE_SCRIPT_INHERITED, ++ ucp_Kannada = G_UNICODE_SCRIPT_KANNADA, ++ ucp_Katakana = G_UNICODE_SCRIPT_KATAKANA, ++ ucp_Kharoshthi = G_UNICODE_SCRIPT_KHAROSHTHI, ++ ucp_Khmer = G_UNICODE_SCRIPT_KHMER, ++ ucp_Lao = G_UNICODE_SCRIPT_LAO, ++ ucp_Latin = G_UNICODE_SCRIPT_LATIN, ++ ucp_Limbu = G_UNICODE_SCRIPT_LIMBU, ++ ucp_Linear_B = G_UNICODE_SCRIPT_LINEAR_B, ++ ucp_Malayalam = G_UNICODE_SCRIPT_MALAYALAM, ++ ucp_Mongolian = G_UNICODE_SCRIPT_MONGOLIAN, ++ ucp_Myanmar = G_UNICODE_SCRIPT_MYANMAR, ++ ucp_New_Tai_Lue = G_UNICODE_SCRIPT_NEW_TAI_LUE, ++ ucp_Ogham = G_UNICODE_SCRIPT_OGHAM, ++ ucp_Old_Italic = G_UNICODE_SCRIPT_OLD_ITALIC, ++ ucp_Old_Persian = G_UNICODE_SCRIPT_OLD_PERSIAN, ++ ucp_Oriya = G_UNICODE_SCRIPT_ORIYA, ++ ucp_Osmanya = G_UNICODE_SCRIPT_OSMANYA, ++ ucp_Runic = G_UNICODE_SCRIPT_RUNIC, ++ ucp_Shavian = G_UNICODE_SCRIPT_SHAVIAN, ++ ucp_Sinhala = G_UNICODE_SCRIPT_SINHALA, ++ ucp_Syloti_Nagri = G_UNICODE_SCRIPT_SYLOTI_NAGRI, ++ ucp_Syriac = G_UNICODE_SCRIPT_SYRIAC, ++ ucp_Tagalog = G_UNICODE_SCRIPT_TAGALOG, ++ ucp_Tagbanwa = G_UNICODE_SCRIPT_TAGBANWA, ++ ucp_Tai_Le = G_UNICODE_SCRIPT_TAI_LE, ++ ucp_Tamil = G_UNICODE_SCRIPT_TAMIL, ++ ucp_Telugu = G_UNICODE_SCRIPT_TELUGU, ++ ucp_Thaana = G_UNICODE_SCRIPT_THAANA, ++ ucp_Thai = G_UNICODE_SCRIPT_THAI, ++ ucp_Tibetan = G_UNICODE_SCRIPT_TIBETAN, ++ ucp_Tifinagh = G_UNICODE_SCRIPT_TIFINAGH, ++ ucp_Ugaritic = G_UNICODE_SCRIPT_UGARITIC, ++ ucp_Yi = G_UNICODE_SCRIPT_YI, ++ ucp_Balinese = G_UNICODE_SCRIPT_BALINESE, /* New for Unicode 5.0.0 */ ++ ucp_Cuneiform = 
G_UNICODE_SCRIPT_CUNEIFORM, /* New for Unicode 5.0.0 */ ++ ucp_Nko = G_UNICODE_SCRIPT_NKO, /* New for Unicode 5.0.0 */ ++ ucp_Phags_Pa = G_UNICODE_SCRIPT_PHAGS_PA, /* New for Unicode 5.0.0 */ ++ ucp_Phoenician = G_UNICODE_SCRIPT_PHOENICIAN /* New for Unicode 5.0.0 */ + }; + + #endif diff --git a/glib/update-pcre/update.sh b/glib/update-pcre/update.sh new file mode 100644 index 0000000..a566e2d --- /dev/null +++ b/glib/update-pcre/update.sh @@ -0,0 +1,159 @@ +#! /bin/sh + +IN="../update-pcre" +PCRE=$1 + +if [ "x$PCRE" = x -o "x$PCRE" = x--help -o "x$PCRE" = x-h ]; then + cat >&2 << EOF + +$0 PCRE-DIR + + Updates the local PCRE copy with a different version of the library, + contained in the directory PCRE-DIR. + + This will delete the content of the local pcre directory, copy the + necessary files from PCRE-DIR, and generate other needed files, such + as Makefile.am +EOF + exit +fi + +# Error messages go to stderr, like the usage text. +if [ ! -f gregex.h ]; then + echo "This script should be executed from the directory containing gregex.c." >&2 + exit 1 +fi + +if [ ! -f $PCRE/Makefile.in -o ! -f $PCRE/pcre_compile.c ]; then + echo "'$PCRE' does not contain a valid PCRE version." >&2 + exit 1 +fi + + +echo "Deleting old PCRE library" +mv pcre/.svn tmp-pcre-svn +rm -R pcre 2> /dev/null +mkdir pcre +cd pcre + +# pcre_chartables.c is generated by dfatables. +# We do not want to compile and execute dfatables.c every time, because +# this could be a problem (e.g. when cross-compiling), so now generate +# the file and then distribute it with GRegex. +echo "Generating pcre_chartables.c" +cp -R $PCRE tmp-build +cd tmp-build +./configure --enable-utf8 --enable-unicode-properties --disable-cpp > /dev/null +make pcre_chartables.c > /dev/null +cat > ../pcre_chartables.c << \EOF +/* This file is autogenerated by ../update-pcre/update.sh during + * the update of the local copy of PCRE. + */ +EOF +cat pcre_chartables.c >> ../pcre_chartables.c +cd .. +rm -R tmp-build + +# Compiled C files.
+echo "Generating makefiles" +all_files=`awk '/^OBJ = /, /^\\s*$/ \ + { \ + sub("^OBJ = ", ""); \ + sub(".@OBJEXT@[[:blank:]]*\\\\\\\\", ""); \ + sub("\\\\$\\\\(POSIX_OBJ\\\\)", ""); \ + print; \ + }' \ + $PCRE/Makefile.in` + +# Headers. +included_files="pcre.h pcre_internal.h ucp.h ucpinternal.h" + +# Generate Makefile.am. +cat $IN/Makefile.am-1 > Makefile.am +for name in $all_files; do + echo " $name.c \\" >> Makefile.am + if [ $name != pcre_chartables ]; then + # pcre_chartables.c is a generated file. + cp $PCRE/$name.c . + fi +done +for f in $included_files; do + echo " $f \\" >> Makefile.am + cp $PCRE/$f . +done +cat $IN/Makefile.am-2 >> Makefile.am + +# Generate makefile.msc +cat > makefile.msc << EOF +!IFDEF DEBUG +CRT=-MDd +!ELSE +CRT=-MD +!ENDIF + +CFLAGS = \\ + -I ..\\.. \\ + -DHAVE_CONFIG_H \\ + -DHAVE_LONG_LONG_FORMAT \\ + -DSUPPORT_UCP \\ + -DSUPPORT_UTF8 \\ + -DNEWLINE=10 \\ + -DMATCH_LIMIT=10000000 \\ + -DMATCH_LIMIT_RECURSION=10000000 \\ + -DMAX_NAME_SIZE=32 \\ + -DMAX_NAME_COUNT=10000 \\ + -DMAX_DUPLENGTH=30000 \\ + -DLINK_SIZE=2 \\ + -DEBCDIC=0 \\ + -DPOSIX_MALLOC_THRESHOLD=10 + +OBJECTS = \\ +` +for f in $all_files; do + echo " $f.obj \\\\" +done +` + +pcre.lib : \$(OBJECTS) + lib -out:pcre.lib \$(OBJECTS) + +.c.obj: + \$(CC) \$(CRT) \$(CFLAGS) -Ox -GD -c $< +EOF + +echo "Patching PCRE" + +# Copy the license. +cp $PCRE/COPYING . + +# Use glib for memory allocation. +patch > /dev/null < $IN/memory.patch + +# Copy the modified version of pcre_valid_utf8.c. +cp $IN/pcre_valid_utf8.c . + +# Copy the modified version of pcre_ucp_searchfuncs.c that uses glib +# for Unicode properties. +cp $IN/pcre_ucp_searchfuncs.c . +patch > /dev/null < $IN/ucp.patch + +# Remove the digitab array in pcre_compile.c. +patch > /dev/null < $IN/digitab.patch +sed -i -e 's/(digitab\[\(.*\)\] & ctype_digit)/g_ascii_isdigit(\1)/' pcre_compile.c +sed -i -e 's/(digitab\[\(.*\)\] & ctype_xdigit)/g_ascii_isxdigit(\1)/' pcre_compile.c + +# Reduce the number of relocations. 
+$IN/make_utt.py +patch > /dev/null < $IN/utt.patch +patch > /dev/null < $IN/table-reduction.patch + +# Copy back the old SVN directory. +mv ../tmp-pcre-svn .svn + + +cat << EOF + +Update completed. You now should check that everything is working. +Remember to update the regex syntax doc with the new features +(docs/reference/glib/regex-syntax.sgml) and to run the tests. +EOF + diff --git a/tests/Makefile.am b/tests/Makefile.am index 2964bd1..f4ab9b3 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -1,6 +1,12 @@ SUBDIRS=gobject refcount -INCLUDES = -g -I$(top_srcdir) -I$(top_srcdir)/glib -I$(top_srcdir)/gmodule $(GLIB_DEBUG_FLAGS) +if ENABLE_REGEX +enable_regex = -DENABLE_REGEX +else +enable_regex = +endif + +INCLUDES = -g -I$(top_srcdir) -I$(top_srcdir)/glib -I$(top_srcdir)/gmodule $(GLIB_DEBUG_FLAGS) $(enable_regex) EFENCE= @@ -112,7 +118,8 @@ test_programs = \ unicode-encoding \ utf8-validate \ utf8-pointer \ - uri-test + uri-test \ + regex-test test_scripts = run-markup-tests.sh run-collate-tests.sh run-bookmark-test.sh @@ -183,6 +190,7 @@ unicode_collate_LDADD = $(progs_ldadd) utf8_validate_LDADD = $(progs_ldadd) utf8_pointer_LDADD = $(progs_ldadd) uri_test_LDADD = $(progs_ldadd) +regex_test_LDADD = $(progs_ldadd) lib_LTLIBRARIES = libmoduletestplugin_a.la libmoduletestplugin_b.la diff --git a/tests/regex-test.c b/tests/regex-test.c new file mode 100644 index 0000000..1c0854d --- /dev/null +++ b/tests/regex-test.c @@ -0,0 +1,2607 @@ +/* + * Copyright (C) 2005 - 2006, Marco Barisione + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +#undef G_DISABLE_ASSERT +#undef G_LOG_DOMAIN + +#include +#include +#include "glib.h" + +#ifdef ENABLE_REGEX + +/* U+20AC EURO SIGN (symbol, currency) */ +#define EURO "\xe2\x82\xac" +/* U+00E0 LATIN SMALL LETTER A WITH GRAVE (letter, lowercase) */ +#define AGRAVE "\xc3\xa0" +/* U+00C0 LATIN CAPITAL LETTER A WITH GRAVE (letter, uppercase) */ +#define AGRAVE_UPPER "\xc3\x80" +/* U+00E8 LATIN SMALL LETTER E WITH GRAVE (letter, lowercase) */ +#define EGRAVE "\xc3\xa8" +/* U+00F2 LATIN SMALL LETTER O WITH GRAVE (letter, lowercase) */ +#define OGRAVE "\xc3\xb2" +/* U+014B LATIN SMALL LETTER ENG (letter, lowercase) */ +#define ENG "\xc5\x8b" +/* U+0127 LATIN SMALL LETTER H WITH STROKE (letter, lowercase) */ +#define HSTROKE "\xc4\xa7" +/* U+0634 ARABIC LETTER SHEEN (letter, other) */ +#define SHEEN "\xd8\xb4" +/* U+1374 ETHIOPIC NUMBER THIRTY (number, other) */ +#define ETH30 "\xe1\x8d\xb4" + +/* A random value used to mark untouched integer variables. */ +#define UNTOUCHED -559038737 + +static gboolean noisy = FALSE; +static gboolean abort_on_fail = FALSE; + +#define PASS passed++ +#define FAIL \ + G_STMT_START \ + { \ + failed++; \ + if (abort_on_fail) \ + goto end; \ + } \ + G_STMT_END + +/* A replacement for strcmp that doesn't crash with null pointers. */ +static gboolean +streq (const gchar *s1, const gchar *s2) +{ + if (s1 == NULL && s2 == NULL) + return TRUE; + else if (s1 == NULL) + return FALSE; + else if (s2 == NULL) + return FALSE; + else + return strcmp (s1, s2) == 0; +} + +static void +verbose (const gchar *format, ...) +{ + /* Function copied from glib/tests/patterntest.c by Matthias Clasen.
*/ + gchar *msg; + va_list args; + + va_start (args, format); + msg = g_strdup_vprintf (format, args); + va_end (args); + + if (noisy) + g_print (msg); + g_free (msg); +} + +static gboolean +test_new (const gchar *pattern, + GRegexCompileFlags compile_opts, + GRegexMatchFlags match_opts) +{ + GRegex *regex; + + verbose ("compiling \"%s\" \t", pattern); + + regex = g_regex_new (pattern, compile_opts, match_opts, NULL); + if (regex == NULL) + { + g_print ("failed \t(pattern: \"%s\", compile: %d, match %d)\n", + pattern, compile_opts, match_opts); + return FALSE; + } + + if (!g_regex_optimize (regex, NULL)) + { + g_print ("failed optimization \t(pattern: \"%s\", compile: %d, match %d)\n", + pattern, compile_opts, match_opts); + return FALSE; + } + + if (!streq (g_regex_get_pattern (regex), pattern)) + { + g_print ("failed \t(pattern: \"%s\")\n", + pattern); + g_regex_free (regex); + return FALSE; + } + + g_regex_free (regex); + /* Free a null string. */ + g_regex_free (NULL); + + verbose ("passed\n"); + return TRUE; +} + +#define TEST_NEW(pattern, compile_opts, match_opts) { \ + total++; \ + if (test_new (pattern, compile_opts, match_opts)) \ + PASS; \ + else \ + FAIL; \ +} + +static gboolean +test_new_fail (const gchar *pattern, + GRegexCompileFlags compile_opts) +{ + GRegex *regex; + + verbose ("compiling \"%s\" (expected a failure) \t", pattern); + + regex = g_regex_new (pattern, compile_opts, 0, NULL); + + if (regex != NULL) + { + g_print ("failed \t(pattern: \"%s\", compile: %d)\n", + pattern, compile_opts); + g_regex_free (regex); + return FALSE; + } + + verbose ("passed\n"); + return TRUE; +} + +#define TEST_NEW_FAIL(pattern, compile_opts) { \ + total++; \ + if (test_new_fail (pattern, compile_opts)) \ + PASS; \ + else \ + FAIL; \ +} + +static gboolean +test_copy (const gchar *pattern) +{ + GRegex *regex1, *regex2, *regex3; + + verbose ("copying \"%s\" \t", pattern); + + regex1 = g_regex_new (pattern, 0, 0, NULL); + if (regex1 != NULL) + /* pattern can be not 
valid as we want to test what happens + * when the regex passed to g_regex_copy() is null */ + g_regex_optimize (regex1, NULL); + regex2 = g_regex_copy (regex1); + + if (regex1 != NULL && + !streq (g_regex_get_pattern (regex1), g_regex_get_pattern(regex2))) + { + g_print ("failed \t(pattern: \"%s\")\n", pattern); + g_regex_free (regex1); + g_regex_free (regex2); + return FALSE; + } + + g_regex_free (regex1); + + /* force the creation of the internal GRegexMatch */ + if (regex2 != NULL) + g_regex_match (regex2, "a", 0); + regex3 = g_regex_copy (regex2); + g_regex_free (regex2); + + if (regex3 != NULL && + !streq (g_regex_get_pattern (regex3), pattern)) + { + g_print ("failed \t(pattern: \"%s\")\n", pattern); + g_regex_free (regex3); + return FALSE; + } + + g_regex_free (regex3); + + verbose ("passed\n"); + return TRUE; +} + +#define TEST_COPY(pattern) { \ + total++; \ + if (test_copy (pattern)) \ + PASS; \ + else \ + FAIL; \ +} + +static gboolean +test_match_simple (const gchar *pattern, + const gchar *string, + GRegexCompileFlags compile_opts, + GRegexMatchFlags match_opts, + gboolean expected) +{ + gboolean match; + + verbose ("matching \"%s\" against \"%s\" \t", string, pattern); + + match = g_regex_match_simple (pattern, string, compile_opts, match_opts); + if (match != expected) + { + g_print ("failed \t(unexpected %s)\n", match ? "match" : "mismatch"); + return FALSE; + } + else + { + verbose ("passed (%s)\n", match ? 
"match" : "nomatch"); + return TRUE; + } +} + +#define TEST_MATCH_SIMPLE(pattern, string, compile_opts, match_opts, expected) { \ + total++; \ + if (test_match_simple (pattern, string, compile_opts, match_opts, expected)) \ + PASS; \ + else \ + FAIL; \ +} + +static gboolean +test_match (const gchar *pattern, + GRegexCompileFlags compile_opts, + GRegexMatchFlags match_opts, + const gchar *string, + gssize string_len, + gint start_position, + GRegexMatchFlags match_opts2, + gboolean expected) +{ + GRegex *regex; + gboolean match; + + verbose ("matching \"%s\" against \"%s\" (start: %d, len: %d) \t", + string, pattern, start_position, string_len); + + regex = g_regex_new (pattern, compile_opts, match_opts, NULL); + match = g_regex_match_full (regex, string, string_len, + start_position, match_opts2, NULL); + if (match != expected) + { + g_print ("failed \t(unexpected %s)\n", match ? "match" : "mismatch"); + g_regex_free (regex); + return FALSE; + } + + /* Repeat the test to verify that g_regex_clear() is not needed. */ + match = g_regex_match_full (regex, string, string_len, + start_position, match_opts2, NULL); + if (match != expected) + { + g_print ("failed \t(second match != first match)\n"); + g_regex_free (regex); + return FALSE; + } + + if (string_len == -1 && start_position == 0) + { + match = g_regex_match (regex, string, match_opts2); + if (match != expected) + { + g_print ("failed \t(pattern: \"%s\", string: \"%s\")\n", + pattern, string); + g_regex_free (regex); + return FALSE; + } + } + + g_regex_free (regex); + + verbose ("passed (%s)\n", match ? 
"match" : "nomatch"); + return TRUE; +} + +#define TEST_MATCH(pattern, compile_opts, match_opts, string, \ + string_len, start_position, match_opts2, expected) { \ + total++; \ + if (test_match (pattern, compile_opts, match_opts, string, \ + string_len, start_position, match_opts2, expected)) \ + PASS; \ + else \ + FAIL; \ +} + +struct _Match +{ + gchar *string; + gint start, end; +}; +typedef struct _Match Match; + +static void +free_match (gpointer data, gpointer user_data) +{ + Match *match = data; + if (match == NULL) + return; + g_free (match->string); + g_free (match); +} + +static gboolean +test_match_next_full (const gchar *pattern, + const gchar *string, + gssize string_len, + gint start_position, + ...) +{ + GRegex *regex; + va_list args; + GSList *matches = NULL; + GSList *expected = NULL; + GSList *l_exp, *l_match; + gboolean ret = TRUE; + + verbose ("matching \"%s\" against \"%s\" (start: %d, len: %d) \t", + string, pattern, start_position, string_len); + + /* The va_list is a NULL-terminated sequence of: extected matched string, + * expected start and expected end. 
*/ + va_start (args, start_position); + while (TRUE) + { + Match *match; + const gchar *expected_string = va_arg (args, const gchar *); + if (expected_string == NULL) + break; + match = g_new0 (Match, 1); + match->string = g_strdup (expected_string); + match->start = va_arg (args, gint); + match->end = va_arg (args, gint); + expected = g_slist_prepend (expected, match); + } + expected = g_slist_reverse (expected); + va_end (args); + + regex = g_regex_new (pattern, 0, 0, NULL); + + while (g_regex_match_next_full (regex, string, string_len, + start_position, 0, NULL)) + { + Match *match = g_new0 (Match, 1); + match->string = g_regex_fetch (regex, 0, string); + match->start = UNTOUCHED; + match->end = UNTOUCHED; + g_regex_fetch_pos (regex, 0, &match->start, &match->end); + matches = g_slist_prepend (matches, match); + } + matches = g_slist_reverse (matches); + + if (g_slist_length (matches) != g_slist_length (expected)) + { + gint match_count = g_slist_length (matches); + g_print ("failed \t(got %d %s, expected %d)\n", match_count, + match_count == 1 ? "match" : "matches", + g_slist_length (expected)); + ret = FALSE; + goto exit; + } + + l_exp = expected; + l_match = matches; + while (l_exp != NULL) + { + Match *exp = l_exp->data; + Match *match = l_match->data; + + if (!streq(exp->string, match->string)) + { + g_print ("failed \t(got \"%s\", expected \"%s\")\n", + match->string, exp->string); + ret = FALSE; + goto exit; + } + + if (exp->start != match->start || exp->end != match->end) + { + g_print ("failed \t(got [%d, %d], expected [%d, %d])\n", + match->start, match->end, exp->start, exp->end); + ret = FALSE; + goto exit; + } + + l_exp = g_slist_next (l_exp); + l_match = g_slist_next (l_match); + } + +exit: + if (ret) + { + gint count = g_slist_length (matches); + verbose ("passed (%d %s)\n", count, count == 1 ? 
"match" : "matches"); + } + + g_regex_free (regex); + g_slist_foreach (expected, free_match, NULL); + g_slist_free (expected); + g_slist_foreach (matches, free_match, NULL); + g_slist_free (matches); + + return ret; +} + +static gboolean +test_match_next (const gchar *pattern, + const gchar *string, + ...) +{ + GRegex *regex; + va_list args; + GSList *matches = NULL; + GSList *expected = NULL; + GSList *l_exp, *l_match; + gboolean ret = TRUE; + + verbose ("matching \"%s\" against \"%s\" \t", string, pattern); + + /* The va_list is a NULL-terminated sequence of: extected matched string, + * expected start and expected end. */ + va_start (args, string); + while (TRUE) + { + Match *match; + const gchar *expected_string = va_arg (args, const gchar *); + if (expected_string == NULL) + break; + match = g_new0 (Match, 1); + match->string = g_strdup (expected_string); + match->start = va_arg (args, gint); + match->end = va_arg (args, gint); + expected = g_slist_prepend (expected, match); + } + expected = g_slist_reverse (expected); + va_end (args); + + regex = g_regex_new (pattern, 0, 0, NULL); + + while (g_regex_match_next (regex, string, 0)) + { + Match *match = g_new0 (Match, 1); + match->string = g_regex_fetch (regex, 0, string); + match->start = UNTOUCHED; + match->end = UNTOUCHED; + g_regex_fetch_pos (regex, 0, &match->start, &match->end); + matches = g_slist_prepend (matches, match); + } + matches = g_slist_reverse (matches); + + if (g_slist_length (matches) != g_slist_length (expected)) + { + gint match_count = g_slist_length (matches); + g_print ("failed \t(got %d %s, expected %d)\n", match_count, + match_count == 1 ? 
"match" : "matches", + g_slist_length (expected)); + ret = FALSE; + goto exit; + } + + l_exp = expected; + l_match = matches; + while (l_exp != NULL) + { + Match *exp = l_exp->data; + Match *match = l_match->data; + + if (!streq(exp->string, match->string)) + { + g_print ("failed \t(got \"%s\", expected \"%s\")\n", + match->string, exp->string); + ret = FALSE; + goto exit; + } + + if (exp->start != match->start || exp->end != match->end) + { + g_print ("failed \t(got [%d, %d], expected [%d, %d])\n", + match->start, match->end, exp->start, exp->end); + ret = FALSE; + goto exit; + } + + l_exp = g_slist_next (l_exp); + l_match = g_slist_next (l_match); + } + +exit: + if (ret) + { + gint count = g_slist_length (matches); + verbose ("passed (%d %s)\n", count, count == 1 ? "match" : "matches"); + } + + g_regex_free (regex); + g_slist_foreach (expected, free_match, NULL); + g_slist_free (expected); + g_slist_foreach (matches, free_match, NULL); + g_slist_free (matches); + + return ret; +} + +#define TEST_MATCH_NEXT0(pattern, string, string_len, start_position) { \ + total++; \ + if (test_match_next_full (pattern, string, string_len, start_position, NULL)) \ + PASS; \ + else \ + FAIL; \ + if (string_len == -1 && start_position == 0) \ + { \ + total++; \ + if (test_match_next (pattern, string, NULL)) \ + PASS; \ + else \ + FAIL; \ + } \ +} + +#define TEST_MATCH_NEXT1(pattern, string, string_len, start_position, \ + t1, s1, e1) { \ + total++; \ + if (test_match_next_full (pattern, string, string_len, start_position, \ + t1, s1, e1, NULL)) \ + PASS; \ + else \ + FAIL; \ + if (string_len == -1 && start_position == 0) \ + { \ + total++; \ + if (test_match_next (pattern, string, t1, s1, e1, NULL)) \ + PASS; \ + else \ + FAIL; \ + } \ +} + +#define TEST_MATCH_NEXT2(pattern, string, string_len, start_position, \ + t1, s1, e1, t2, s2, e2) { \ + total++; \ + if (test_match_next_full (pattern, string, string_len, start_position, \ + t1, s1, e1, t2, s2, e2, NULL)) \ + PASS; \ + else \ 
+ FAIL; \ + if (string_len == -1 && start_position == 0) \ + { \ + total++; \ + if (test_match_next (pattern, string, t1, s1, e1, t2, s2, e2, NULL)) \ + PASS; \ + else \ + FAIL; \ + } \ +} + +#define TEST_MATCH_NEXT3(pattern, string, string_len, start_position, \ + t1, s1, e1, t2, s2, e2, t3, s3, e3) { \ + total++; \ + if (test_match_next_full (pattern, string, string_len, start_position, \ + t1, s1, e1, t2, s2, e2, t3, s3, e3, NULL)) \ + PASS; \ + else \ + FAIL; \ + if (string_len == -1 && start_position == 0) \ + { \ + total++; \ + if (test_match_next (pattern, string, t1, s1, e1, t2, s2, e2, t3, s3, e3, NULL)) \ + PASS; \ + else \ + FAIL; \ + } \ +} + +#define TEST_MATCH_NEXT4(pattern, string, string_len, start_position, \ + t1, s1, e1, t2, s2, e2, t3, s3, e3, t4, s4, e4) { \ + total++; \ + if (test_match_next_full (pattern, string, string_len, start_position, \ + t1, s1, e1, t2, s2, e2, t3, s3, e3, t4, s4, e4, NULL)) \ + PASS; \ + else \ + FAIL; \ + if (string_len == -1 && start_position == 0) \ + { \ + total++;\ + if (test_match_next (pattern, string, t1, s1, e1, t2, s2, e2, t3, s3, e3, \ + t4, s4, e4, NULL)) \ + PASS; \ + else \ + FAIL; \ + } \ +} + +static gboolean +test_match_count (const gchar *pattern, + const gchar *string, + gint start_position, + GRegexMatchFlags match_opts, + gint expected_count) +{ + GRegex *regex; + gint count; + + verbose ("fetching match count (string: \"%s\", pattern: \"%s\", start: %d) \t", + string, pattern, start_position); + + regex = g_regex_new (pattern, 0, 0, NULL); + + g_regex_match_next_full (regex, string, -1, start_position, + match_opts, NULL); + count = g_regex_get_match_count (regex); + + if (count != expected_count) + { + g_print ("failed \t(got %d, expected: %d)\n", count, expected_count); + return FALSE; + } + + g_regex_free (regex); + + verbose ("passed\n"); + return TRUE; +} + +#define TEST_MATCH_COUNT(pattern, string, start_position, match_opts, expected_count) { \ + total++; \ + if (test_match_count (pattern, 
string, start_position, match_opts, expected_count)) \ + PASS; \ + else \ + FAIL; \ +} + +static gboolean +test_partial (const gchar *pattern, + const gchar *string, + gboolean expected) +{ + GRegex *regex; + + verbose ("partial matching (string: \"%s\", pattern: \"%s\") \t", + string, pattern); + + regex = g_regex_new (pattern, 0, 0, NULL); + + g_regex_match (regex, string, G_REGEX_MATCH_PARTIAL); + if (expected != g_regex_is_partial_match (regex)) + { + g_print ("failed \t(got %d, expected: %d)\n", !expected, expected); + g_regex_free (regex); + return FALSE; + } + + if (expected && g_regex_fetch_pos (regex, 0, NULL, NULL)) + { + g_print ("failed \t(got sub-pattern 0)\n"); + g_regex_free (regex); + return FALSE; + } + + if (expected && g_regex_fetch_pos (regex, 1, NULL, NULL)) + { + g_print ("failed \t(got sub-pattern 1)\n"); + g_regex_free (regex); + return FALSE; + } + + g_regex_free (regex); + + verbose ("passed\n"); + return TRUE; +} + +#define TEST_PARTIAL(pattern, string, expected) { \ + total++; \ + if (test_partial (pattern, string, expected)) \ + PASS; \ + else \ + FAIL; \ +} + +static gboolean +test_clear (const gchar *pattern, + const gchar *string, + gint start_position) +{ + GRegex *regex; + gboolean match1, match2; + gint start1 = UNTOUCHED; + gint end1 = UNTOUCHED; + gint start2 = UNTOUCHED; + gint end2 = UNTOUCHED; + gchar *text1 = NULL; + gchar *text2 = NULL; + gboolean ret = TRUE; + + verbose ("testing clear with \"%s\" against \"%s\" (start: %d) \t", + string, pattern, start_position); + + regex = g_regex_new (pattern, 0, 0, NULL); + + match1 = g_regex_match_next_full (regex, string, UNTOUCHED, start_position, + 0, NULL); + if (match1) + { + text1 = g_regex_fetch (regex, 0, string); + g_regex_fetch_pos (regex, 0, &start1, &end1); + } + + g_regex_clear (regex); + + match2 = g_regex_match_next_full (regex, string, UNTOUCHED, start_position, + 0, NULL); + if (match2) + { + text2 = g_regex_fetch (regex, 0, string); + g_regex_fetch_pos (regex, 0, 
&start2, &end2); + } + + if (match1 != match2) + { + g_print ("failed \t(different matches)\n"); + ret = FALSE; + } + else if (match1) + { + if (!streq (text1, text2)) + { + g_print ("failed \t(first: \"%s\", second: \"%s\")\n", + text1, text2); + ret = FALSE; + } + if (start1 != start2 || end1 != end2) + { + g_print ("failed \t(first: [%d, %d], second: [%d, %d])\n", + start1, end1, start2, end2); + ret = FALSE; + } + } + + g_regex_free (regex); + g_free (text1); + g_free (text2); + + if (ret) + verbose ("passed\n"); + + return ret; +} + +#define TEST_CLEAR(pattern, string, start_position) { \ + total++; \ + if (test_clear (pattern, string, start_position)) \ + PASS; \ + else \ + FAIL; \ +} + +static gboolean +test_sub_pattern (const gchar *pattern, + const gchar *string, + gint start_position, + gint sub_n, + const gchar *expected_sub, + gint expected_start, + gint expected_end) +{ + GRegex *regex; + gchar *sub_expr; + gint start = UNTOUCHED, end = UNTOUCHED; + + verbose ("fetching sub-pattern %d from \"%s\" (pattern: \"%s\") \t", + sub_n, string, pattern); + + regex = g_regex_new (pattern, 0, 0, NULL); + g_regex_match_full (regex, string, -1, start_position, 0, NULL); + + sub_expr = g_regex_fetch (regex, sub_n, string); + if (!streq(sub_expr, expected_sub)) + { + g_print ("failed \t(got \"%s\", expected \"%s\")\n", + sub_expr, expected_sub); + g_free (sub_expr); + g_regex_free (regex); + return FALSE; + } + g_free (sub_expr); + + g_regex_fetch_pos (regex, sub_n, &start, &end); + if (start != expected_start || end != expected_end) + { + g_print ("failed \t(got [%d, %d], expected [%d, %d])\n", + start, end, expected_start, expected_end); + g_regex_free (regex); + return FALSE; + } + + /* Repeat the test to verify that g_regex_clear() is not needed. 
*/ + g_regex_match_full (regex, string, -1, start_position, 0, NULL); + + sub_expr = g_regex_fetch (regex, sub_n, string); + if (!streq(sub_expr, expected_sub)) + { + g_print ("failed \t(second match != first matchs)\n"); + g_free (sub_expr); + g_regex_free (regex); + return FALSE; + } + g_free (sub_expr); + + g_regex_fetch_pos (regex, sub_n, &start, &end); + if (start != expected_start || end != expected_end) + { + g_print ("failed \t(second match != first matchs)\n"); + g_regex_free (regex); + return FALSE; + } + + + g_regex_free (regex); + + verbose ("passed\n"); + return TRUE; +} + +#define TEST_SUB_PATTERN(pattern, string, start_position, sub_n, expected_sub, \ + expected_start, expected_end) { \ + total++; \ + if (test_sub_pattern (pattern, string, start_position, sub_n, expected_sub, \ + expected_start, expected_end)) \ + PASS; \ + else \ + FAIL; \ +} + +static gboolean +test_named_sub_pattern (const gchar *pattern, + const gchar *string, + gint start_position, + const gchar *sub_name, + const gchar *expected_sub, + gint expected_start, + gint expected_end) +{ + GRegex *regex; + gint start = UNTOUCHED, end = UNTOUCHED; + gchar *sub_expr; + + verbose ("fetching sub-pattern \"%s\" from \"%s\" (pattern: \"%s\") \t", + sub_name, string, pattern); + + regex = g_regex_new (pattern, 0, 0, NULL); + + g_regex_match_full (regex, string, -1, start_position, 0, NULL); + sub_expr = g_regex_fetch_named (regex, sub_name, string); + if (!streq (sub_expr, expected_sub)) + { + g_print ("failed \t(got \"%s\", expected \"%s\")\n", + sub_expr, expected_sub); + g_free (sub_expr); + g_regex_free (regex); + return FALSE; + } + g_free (sub_expr); + + g_regex_fetch_named_pos (regex, sub_name, &start, &end); + if (start != expected_start || end != expected_end) + { + g_print ("failed \t(got [%d, %d], expected [%d, %d])\n", + start, end, expected_start, expected_end); + g_regex_free (regex); + return FALSE; + } + + g_regex_free (regex); + + verbose ("passed\n"); + return TRUE; +} + 
+#define TEST_NAMED_SUB_PATTERN(pattern, string, start_position, sub_name, \ + expected_sub, expected_start, expected_end) { \ + total++; \ + if (test_named_sub_pattern (pattern, string, start_position, sub_name, \ + expected_sub, expected_start, expected_end)) \ + PASS; \ + else \ + FAIL; \ +} + +static gboolean +test_fetch_all (const gchar *pattern, + const gchar *string, + ...) +{ + GRegex *regex; + va_list args; + GSList *expected = NULL; + GSList *l_exp; + gchar **matches; + gint match_count; + gboolean ret = TRUE; + gint i; + + verbose ("fetching all sub-patterns from \"%s\" (pattern: \"%s\") \t", + string, pattern); + + /* The va_list is a NULL-terminated sequence of extected strings. */ + va_start (args, string); + while (TRUE) + { + gchar *expected_string = va_arg (args, gchar *); + if (expected_string == NULL) + break; + else + expected = g_slist_prepend (expected, g_strdup (expected_string)); + } + expected = g_slist_reverse (expected); + va_end (args); + + regex = g_regex_new (pattern, 0, 0, NULL); + g_regex_match (regex, string, 0); + matches = g_regex_fetch_all (regex, string); + if (matches) + match_count = g_strv_length (matches); + else + match_count = 0; + + if (match_count != g_slist_length (expected)) + { + g_print ("failed \t(got %d %s, expected %d)\n", match_count, + match_count == 1 ? "match" : "matches", + g_slist_length (expected)); + ret = FALSE; + goto exit; + } + + l_exp = expected; + for (i = 0; l_exp != NULL; i++, l_exp = g_slist_next (l_exp)) + { + if (!streq(l_exp->data, matches [i])) + { + g_print ("failed \t(got \"%s\", expected \"%s\")\n", + matches [i], (gchar *)l_exp->data); + ret = FALSE; + goto exit; + } + } + + verbose ("passed (%d %s)\n", match_count, + match_count == 1 ? 
"match" : "matches"); + +exit: + g_regex_free (regex); + g_slist_foreach (expected, (GFunc)g_free, NULL); + g_slist_free (expected); + g_strfreev (matches); + + return ret; +} + +#define TEST_FETCH_ALL0(pattern, string) { \ + total++; \ + if (test_fetch_all (pattern, string, NULL)) \ + PASS; \ + else \ + FAIL; \ +} + +#define TEST_FETCH_ALL1(pattern, string, e1) { \ + total++; \ + if (test_fetch_all (pattern, string, e1, NULL)) \ + PASS; \ + else \ + FAIL; \ +} + +#define TEST_FETCH_ALL2(pattern, string, e1, e2) { \ + total++; \ + if (test_fetch_all (pattern, string, e1, e2, NULL)) \ + PASS; \ + else \ + FAIL; \ +} + +#define TEST_FETCH_ALL3(pattern, string, e1, e2, e3) { \ + total++; \ + if (test_fetch_all (pattern, string, e1, e2, e3, NULL)) \ + PASS; \ + else \ + FAIL; \ +} + +static gboolean +test_split_simple (const gchar *pattern, + const gchar *string, + ...) +{ + va_list args; + GSList *expected = NULL; + GSList *l_exp; + gchar **tokens; + gint token_count; + gboolean ret = TRUE; + gint i; + + verbose ("splitting \"%s\" against \"%s\" \t", string, pattern); + + /* The va_list is a NULL-terminated sequence of extected strings. */ + va_start (args, string); + while (TRUE) + { + gchar *expected_string = va_arg (args, gchar *); + if (expected_string == NULL) + break; + else + expected = g_slist_prepend (expected, g_strdup (expected_string)); + } + expected = g_slist_reverse (expected); + va_end (args); + + tokens = g_regex_split_simple (pattern, string, 0, 0); + if (tokens) + token_count = g_strv_length (tokens); + else + token_count = 0; + + if (token_count != g_slist_length (expected)) + { + g_print ("failed \t(got %d %s, expected %d)\n", token_count, + token_count == 1 ? 
"match" : "matches", + g_slist_length (expected)); + ret = FALSE; + goto exit; + } + + l_exp = expected; + for (i = 0; l_exp != NULL; i++, l_exp = g_slist_next (l_exp)) + { + if (!streq(l_exp->data, tokens [i])) + { + g_print ("failed \t(got \"%s\", expected \"%s\")\n", + tokens[i], (gchar *)l_exp->data); + ret = FALSE; + goto exit; + } + } + + verbose ("passed (%d %s)\n", token_count, + token_count == 1 ? "token" : "tokens"); + +exit: + g_slist_foreach (expected, (GFunc)g_free, NULL); + g_slist_free (expected); + g_strfreev (tokens); + + return ret; +} + +#define TEST_SPLIT_SIMPLE0(pattern, string) { \ + total++; \ + if (test_split_simple (pattern, string, NULL)) \ + PASS; \ + else \ + FAIL; \ +} + +#define TEST_SPLIT_SIMPLE1(pattern, string, e1) { \ + total++; \ + if (test_split_simple (pattern, string, e1, NULL)) \ + PASS; \ + else \ + FAIL; \ +} + +#define TEST_SPLIT_SIMPLE2(pattern, string, e1, e2) { \ + total++; \ + if (test_split_simple (pattern, string, e1, e2, NULL)) \ + PASS; \ + else \ + FAIL; \ +} + +#define TEST_SPLIT_SIMPLE3(pattern, string, e1, e2, e3) { \ + total++; \ + if (test_split_simple (pattern, string, e1, e2, e3, NULL)) \ + PASS; \ + else \ + FAIL; \ +} + +static gboolean +test_split_full (const gchar *pattern, + const gchar *string, + gint start_position, + gint max_tokens, + ...) +{ + GRegex *regex; + va_list args; + GSList *expected = NULL; + GSList *l_exp; + gchar **tokens; + gint token_count; + gboolean ret = TRUE; + gint i; + + verbose ("splitting \"%s\" against \"%s\" (start: %d, max: %d) \t", + string, pattern, start_position, max_tokens); + + /* The va_list is a NULL-terminated sequence of extected strings. 
*/ + va_start (args, max_tokens); + while (TRUE) + { + gchar *expected_string = va_arg (args, gchar *); + if (expected_string == NULL) + break; + else + expected = g_slist_prepend (expected, g_strdup (expected_string)); + } + expected = g_slist_reverse (expected); + va_end (args); + + regex = g_regex_new (pattern, 0, 0, NULL); + tokens = g_regex_split_full (regex, string, -1, start_position, + 0, max_tokens, NULL); + if (tokens) + token_count = g_strv_length (tokens); + else + token_count = 0; + + if (token_count != g_slist_length (expected)) + { + g_print ("failed \t(got %d %s, expected %d)\n", token_count, + token_count == 1 ? "match" : "matches", + g_slist_length (expected)); + ret = FALSE; + goto exit; + } + + l_exp = expected; + for (i = 0; l_exp != NULL; i++, l_exp = g_slist_next (l_exp)) + { + if (!streq(l_exp->data, tokens [i])) + { + g_print ("failed \t(got \"%s\", expected \"%s\")\n", + tokens[i], (gchar *)l_exp->data); + ret = FALSE; + goto exit; + } + } + + verbose ("passed (%d %s)\n", token_count, + token_count == 1 ? "token" : "tokens"); + +exit: + g_regex_free (regex); + g_slist_foreach (expected, (GFunc)g_free, NULL); + g_slist_free (expected); + g_strfreev (tokens); + + return ret; +} + +static gboolean +test_split (const gchar *pattern, + const gchar *string, + ...) +{ + GRegex *regex; + va_list args; + GSList *expected = NULL; + GSList *l_exp; + gchar **tokens; + gint token_count; + gboolean ret = TRUE; + gint i; + + verbose ("splitting \"%s\" against \"%s\" \t", string, pattern); + + /* The va_list is a NULL-terminated sequence of extected strings. 
*/ + va_start (args, string); + while (TRUE) + { + gchar *expected_string = va_arg (args, gchar *); + if (expected_string == NULL) + break; + else + expected = g_slist_prepend (expected, g_strdup (expected_string)); + } + expected = g_slist_reverse (expected); + va_end (args); + + regex = g_regex_new (pattern, 0, 0, NULL); + tokens = g_regex_split (regex, string, 0); + if (tokens) + token_count = g_strv_length (tokens); + else + token_count = 0; + + if (token_count != g_slist_length (expected)) + { + g_print ("failed \t(got %d %s, expected %d)\n", token_count, + token_count == 1 ? "match" : "matches", + g_slist_length (expected)); + ret = FALSE; + goto exit; + } + + l_exp = expected; + for (i = 0; l_exp != NULL; i++, l_exp = g_slist_next (l_exp)) + { + if (!streq(l_exp->data, tokens [i])) + { + g_print ("failed \t(got \"%s\", expected \"%s\")\n", + tokens[i], (gchar *)l_exp->data); + ret = FALSE; + goto exit; + } + } + + verbose ("passed (%d %s)\n", token_count, + token_count == 1 ? "token" : "tokens"); + +exit: + g_regex_free (regex); + g_slist_foreach (expected, (GFunc)g_free, NULL); + g_slist_free (expected); + g_strfreev (tokens); + + return ret; +} + +#define TEST_SPLIT0(pattern, string, start_position, max_tokens) { \ + total++; \ + if (test_split_full (pattern, string, start_position, max_tokens, NULL)) \ + PASS; \ + else \ + FAIL; \ + if (start_position == 0 && max_tokens <= 0) \ + { \ + total++; \ + if (test_split (pattern, string, NULL)) \ + PASS; \ + else \ + FAIL; \ + } \ +} + +#define TEST_SPLIT1(pattern, string, start_position, max_tokens, e1) { \ + total++; \ + if (test_split_full (pattern, string, start_position, max_tokens, e1, NULL)) \ + PASS; \ + else \ + FAIL; \ + if (start_position == 0 && max_tokens <= 0) \ + { \ + total++; \ + if (test_split (pattern, string, e1, NULL)) \ + PASS; \ + else \ + FAIL; \ + } \ +} + +#define TEST_SPLIT2(pattern, string, start_position, max_tokens, e1, e2) { \ + total++; \ + if (test_split_full (pattern, string, 
start_position, max_tokens, e1, e2, NULL)) \ + PASS; \ + else \ + FAIL; \ + if (start_position == 0 && max_tokens <= 0) \ + { \ + total++; \ + if (test_split (pattern, string, e1, e2, NULL)) \ + PASS; \ + else \ + FAIL; \ + } \ +} + +#define TEST_SPLIT3(pattern, string, start_position, max_tokens, e1, e2, e3) { \ + total++; \ + if (test_split_full (pattern, string, start_position, max_tokens, e1, e2, e3, NULL)) \ + PASS; \ + else \ + FAIL; \ + if (start_position == 0 && max_tokens <= 0) \ + { \ + total++; \ + if (test_split (pattern, string, e1, e2, e3, NULL)) \ + PASS; \ + else \ + FAIL; \ + } \ +} + +static gboolean +test_split_next_full (const gchar *pattern, + const gchar *string, + gint start_position, + ...) +{ + GRegex *regex; + va_list args; + GSList *expected = NULL; + GSList *tokens; + GSList *l_exp, *l_token; + gint token_count; + gchar *token; + gboolean ret = TRUE; + + verbose ("splitting \"%s\" against \"%s\" (start: %d) \t", + string, pattern, start_position); + + /* The va_list is a NULL-terminated sequence of extected strings. */ + va_start (args, start_position); + while (TRUE) + { + gchar *expected_string = va_arg (args, gchar *); + if (expected_string == NULL) + break; + else + expected = g_slist_prepend (expected, g_strdup (expected_string)); + } + expected = g_slist_reverse (expected); + va_end (args); + + regex = g_regex_new (pattern, 0, 0, NULL); + + tokens = NULL; + while ((token = g_regex_split_next_full (regex, string, -1, + start_position, 0, NULL))) + { + tokens = g_slist_prepend (tokens, token); + } + tokens = g_slist_reverse (tokens); + token_count = g_slist_length (tokens); + + if (token_count != g_slist_length (expected)) + { + g_print ("failed \t(got %d %s, expected %d)\n", token_count, + token_count == 1 ? 
"match" : "matches", + g_slist_length (expected)); + ret = FALSE; + goto exit; + } + + l_exp = expected; + l_token = tokens; + while (l_exp != NULL) + { + if (!streq(l_exp->data, l_token->data)) + { + g_print ("failed \t(got \"%s\", expected \"%s\")\n", + (gchar *)l_token->data, (gchar *)l_exp->data); + ret = FALSE; + goto exit; + } + + l_exp = g_slist_next (l_exp); + l_token = g_slist_next (l_token); + } + + verbose ("passed (%d %s)\n", token_count, + token_count == 1 ? "token" : "tokens"); + +exit: + g_regex_free (regex); + g_slist_foreach (expected, (GFunc)g_free, NULL); + g_slist_free (expected); + g_slist_foreach (tokens, (GFunc)g_free, NULL); + g_slist_free (tokens); + + return ret; +} + +static gboolean +test_split_next (const gchar *pattern, + const gchar *string, + ...) +{ + GRegex *regex; + va_list args; + GSList *expected = NULL; + GSList *tokens; + GSList *l_exp, *l_token; + gint token_count; + gchar *token; + gboolean ret = TRUE; + + verbose ("splitting \"%s\" against \"%s\" \t", string, pattern); + + /* The va_list is a NULL-terminated sequence of extected strings. */ + va_start (args, string); + while (TRUE) + { + gchar *expected_string = va_arg (args, gchar *); + if (expected_string == NULL) + break; + else + expected = g_slist_prepend (expected, g_strdup (expected_string)); + } + expected = g_slist_reverse (expected); + va_end (args); + + regex = g_regex_new (pattern, 0, 0, NULL); + + tokens = NULL; + while ((token = g_regex_split_next (regex, string, 0))) + { + tokens = g_slist_prepend (tokens, token); + } + tokens = g_slist_reverse (tokens); + token_count = g_slist_length (tokens); + + if (token_count != g_slist_length (expected)) + { + g_print ("failed \t(got %d %s, expected %d)\n", token_count, + token_count == 1 ? 
"match" : "matches", + g_slist_length (expected)); + ret = FALSE; + goto exit; + } + + l_exp = expected; + l_token = tokens; + while (l_exp != NULL) + { + if (!streq(l_exp->data, l_token->data)) + { + g_print ("failed \t(got \"%s\", expected \"%s\")\n", + (gchar *)l_token->data, (gchar *)l_exp->data); + ret = FALSE; + goto exit; + } + + l_exp = g_slist_next (l_exp); + l_token = g_slist_next (l_token); + } + + verbose ("passed (%d %s)\n", token_count, + token_count == 1 ? "token" : "tokens"); + +exit: + g_regex_free (regex); + g_slist_foreach (expected, (GFunc)g_free, NULL); + g_slist_free (expected); + g_slist_foreach (tokens, (GFunc)g_free, NULL); + g_slist_free (tokens); + + return ret; +} + +#define TEST_SPLIT_NEXT1(pattern, string, start_position, e1) { \ + total++; \ + if (test_split_next_full (pattern, string, start_position, e1, NULL)) \ + PASS; \ + else \ + FAIL; \ + if (start_position == 0) \ + { \ + total++; \ + if (test_split_next (pattern, string, e1, NULL)) \ + PASS; \ + else \ + FAIL; \ + } \ +} + +#define TEST_SPLIT_NEXT2(pattern, string, start_position, e1, e2) { \ + total++; \ + if (test_split_next_full (pattern, string, start_position, e1, e2, NULL)) \ + PASS; \ + else \ + FAIL; \ + if (start_position == 0) \ + { \ + total++; \ + if (test_split_next (pattern, string, e1, e2, NULL)) \ + PASS; \ + else \ + FAIL; \ + } \ +} + +#define TEST_SPLIT_NEXT3(pattern, string, start_position, e1, e2, e3) { \ + total++; \ + if (test_split_next_full (pattern, string, start_position, e1, e2, e3, NULL)) \ + PASS; \ + else \ + FAIL; \ + if (start_position == 0) \ + { \ + total++; \ + if (test_split_next (pattern, string, e1, e2, e3, NULL)) \ + PASS; \ + else \ + FAIL; \ + } \ +} + +static gboolean +test_expand (const gchar *pattern, + const gchar *string, + const gchar *string_to_expand, + gboolean raw, + const gchar *expected) +{ + GRegex *regex; + gchar *res; + + verbose ("expanding the references in \"%s\" (pattern: \"%s\", string: \"%s\") \t", + 
string_to_expand, pattern, string); + + regex = g_regex_new (pattern, raw ? G_REGEX_RAW : 0, 0, NULL); + g_regex_match (regex, string, 0); + res = g_regex_expand_references (regex, string, string_to_expand, NULL); + if (!streq (res, expected)) + { + g_print ("failed \t(got \"%s\", expected \"%s\")\n", res, expected); + g_free (res); + g_regex_free (regex); + return FALSE; + } + + g_free (res); + g_regex_free (regex); + + verbose ("passed\n"); + return TRUE; +} + +#define TEST_EXPAND(pattern, string, string_to_expand, raw, expected) { \ + total++; \ + if (test_expand (pattern, string, string_to_expand, raw, expected)) \ + PASS; \ + else \ + FAIL; \ +} + +static gboolean +test_replace (const gchar *pattern, + const gchar *string, + gint start_position, + const gchar *replacement, + const gchar *expected) +{ + GRegex *regex; + gchar *res; + + verbose ("replacing \"%s\" in \"%s\" (pattern: \"%s\", start: %d) \t", + replacement, string, pattern, start_position); + + regex = g_regex_new (pattern, 0, 0, NULL); + res = g_regex_replace (regex, string, -1, start_position, replacement, 0, NULL); + if (!streq (res, expected)) + { + g_print ("failed \t(got \"%s\", expected \"%s\")\n", res, expected); + g_free (res); + g_regex_free (regex); + return FALSE; + } + + g_free (res); + g_regex_free (regex); + + verbose ("passed\n"); + return TRUE; +} + +#define TEST_REPLACE(pattern, string, start_position, replacement, expected) { \ + total++; \ + if (test_replace (pattern, string, start_position, replacement, expected)) \ + PASS; \ + else \ + FAIL; \ +} + +static gboolean +test_replace_lit (const gchar *pattern, + const gchar *string, + gint start_position, + const gchar *replacement, + const gchar *expected) +{ + GRegex *regex; + gchar *res; + + verbose ("replacing literally \"%s\" in \"%s\" (pattern: \"%s\", start: %d) \t", + replacement, string, pattern, start_position); + + regex = g_regex_new (pattern, 0, 0, NULL); + res = g_regex_replace_literal (regex, string, -1, 
start_position, + replacement, 0, NULL); + if (!streq (res, expected)) + { + g_print ("failed \t(got \"%s\", expected \"%s\")\n", res, expected); + g_free (res); + g_regex_free (regex); + return FALSE; + } + + g_free (res); + g_regex_free (regex); + + verbose ("passed\n"); + return TRUE; +} + +#define TEST_REPLACE_LIT(pattern, string, start_position, replacement, expected) { \ + total++; \ + if (test_replace_lit (pattern, string, start_position, replacement, expected)) \ + PASS; \ + else \ + FAIL; \ +} + +static gboolean +test_get_string_number (const gchar *pattern, + const gchar *name, + gint expected_num) +{ + GRegex *regex; + gint num; + + verbose ("getting the number of \"%s\" (pattern: \"%s\") \t", + name, pattern); + + regex = g_regex_new (pattern, 0, 0, NULL); + num = g_regex_get_string_number (regex, name); + g_regex_free (regex); + + if (num != expected_num) + { + g_print ("failed \t(got %d, expected %d)\n", num, expected_num); + return FALSE; + } + else + { + verbose ("passed\n"); + return TRUE; + } +} + +#define TEST_GET_STRING_NUMBER(pattern, name, expected_num) { \ + total++; \ + if (test_get_string_number (pattern, name, expected_num)) \ + PASS; \ + else \ + FAIL; \ +} + +static gboolean +test_escape (const gchar *string, + gint length, + const gchar *expected) +{ + gchar *escaped; + + verbose ("escaping \"%s\" (len: %d) \t", string, length); + + escaped = g_regex_escape_string (string, length); + + if (!streq (escaped, expected)) + { + g_print ("failed \t(got \"%s\", expected \"%s\")\n", escaped, expected); + g_free (escaped); + return FALSE; + } + + g_free (escaped); + + verbose ("passed\n"); + return TRUE; +} + +#define TEST_ESCAPE(string, length, expected) { \ + total++; \ + if (test_escape (string, length, expected)) \ + PASS; \ + else \ + FAIL; \ +} + +static gboolean +test_match_all_full (const gchar *pattern, + const gchar *string, + gssize string_len, + gint start_position, + ...) 
+{ + GRegex *regex; + va_list args; + GSList *expected = NULL; + GSList *l_exp; + gboolean match_ok; + gboolean ret = TRUE; + gint match_count; + gint i; + + verbose ("matching all in \"%s\" against \"%s\" (start: %d, len: %d) \t", + string, pattern, start_position, string_len); + + /* The va_list is a NULL-terminated sequence of: extected matched string, + * expected start and expected end. */ + va_start (args, start_position); + while (TRUE) + { + Match *match; + const gchar *expected_string = va_arg (args, const gchar *); + if (expected_string == NULL) + break; + match = g_new0 (Match, 1); + match->string = g_strdup (expected_string); + match->start = va_arg (args, gint); + match->end = va_arg (args, gint); + expected = g_slist_prepend (expected, match); + } + expected = g_slist_reverse (expected); + va_end (args); + + regex = g_regex_new (pattern, 0, 0, NULL); + match_ok = g_regex_match_all_full (regex, string, string_len, + start_position, 0, NULL); + + if (match_ok && g_slist_length (expected) == 0) + { + g_print ("failed\n"); + ret = FALSE; + goto exit; + } + if (!match_ok && g_slist_length (expected) != 0) + { + g_print ("failed\n"); + ret = FALSE; + goto exit; + } + + match_count = g_regex_get_match_count (regex); + if (match_count != g_slist_length (expected)) + { + g_print ("failed \t(got %d %s, expected %d)\n", match_count, + match_count == 1 ? 
"match" : "matches", + g_slist_length (expected)); + ret = FALSE; + goto exit; + } + + l_exp = expected; + for (i = 0; i < match_count; i++) + { + gint start, end; + gchar *matched_string; + Match *exp = l_exp->data; + + matched_string = g_regex_fetch (regex, i, string); + g_regex_fetch_pos (regex, i, &start, &end); + + if (!streq(exp->string, matched_string)) + { + g_print ("failed \t(got \"%s\", expected \"%s\")\n", + matched_string, exp->string); + g_free (matched_string); + ret = FALSE; + goto exit; + } + g_free (matched_string); + + if (exp->start != start || exp->end != end) + { + g_print ("failed \t(got [%d, %d], expected [%d, %d])\n", + start, end, exp->start, exp->end); + ret = FALSE; + goto exit; + } + + l_exp = g_slist_next (l_exp); + } + +exit: + if (ret) + { + verbose ("passed (%d %s)\n", match_count, match_count == 1 ? "match" : "matches"); + } + + g_regex_free (regex); + g_slist_foreach (expected, free_match, NULL); + g_slist_free (expected); + + return ret; +} + +static gboolean +test_match_all (const gchar *pattern, + const gchar *string, + ...) +{ + GRegex *regex; + va_list args; + GSList *expected = NULL; + GSList *l_exp; + gboolean match_ok; + gboolean ret = TRUE; + gint match_count; + gint i; + + verbose ("matching all in \"%s\" against \"%s\" \t", string, pattern); + + /* The va_list is a NULL-terminated sequence of: extected matched string, + * expected start and expected end. 
*/ + va_start (args, string); + while (TRUE) + { + Match *match; + const gchar *expected_string = va_arg (args, const gchar *); + if (expected_string == NULL) + break; + match = g_new0 (Match, 1); + match->string = g_strdup (expected_string); + match->start = va_arg (args, gint); + match->end = va_arg (args, gint); + expected = g_slist_prepend (expected, match); + } + expected = g_slist_reverse (expected); + va_end (args); + + regex = g_regex_new (pattern, 0, 0, NULL); + match_ok = g_regex_match_all (regex, string, 0); + + if (match_ok && g_slist_length (expected) == 0) + { + g_print ("failed\n"); + ret = FALSE; + goto exit; + } + if (!match_ok && g_slist_length (expected) != 0) + { + g_print ("failed\n"); + ret = FALSE; + goto exit; + } + + match_count = g_regex_get_match_count (regex); + if (match_count != g_slist_length (expected)) + { + g_print ("failed \t(got %d %s, expected %d)\n", match_count, + match_count == 1 ? "match" : "matches", + g_slist_length (expected)); + ret = FALSE; + goto exit; + } + + l_exp = expected; + for (i = 0; i < match_count; i++) + { + gint start, end; + gchar *matched_string; + Match *exp = l_exp->data; + + matched_string = g_regex_fetch (regex, i, string); + g_regex_fetch_pos (regex, i, &start, &end); + + if (!streq(exp->string, matched_string)) + { + g_print ("failed \t(got \"%s\", expected \"%s\")\n", + matched_string, exp->string); + g_free (matched_string); + ret = FALSE; + goto exit; + } + g_free (matched_string); + + if (exp->start != start || exp->end != end) + { + g_print ("failed \t(got [%d, %d], expected [%d, %d])\n", + start, end, exp->start, exp->end); + ret = FALSE; + goto exit; + } + + l_exp = g_slist_next (l_exp); + } + +exit: + if (ret) + { + verbose ("passed (%d %s)\n", match_count, match_count == 1 ? 
"match" : "matches"); + } + + g_regex_free (regex); + g_slist_foreach (expected, free_match, NULL); + g_slist_free (expected); + + return ret; +} + +#define TEST_MATCH_ALL0(pattern, string, string_len, start_position) { \ + total++; \ + if (test_match_all_full (pattern, string, string_len, start_position, NULL)) \ + PASS; \ + else \ + FAIL; \ + if (string_len == -1 && start_position == 0) \ + { \ + total++; \ + if (test_match_all (pattern, string, NULL)) \ + PASS; \ + else \ + FAIL; \ + } \ +} + +#define TEST_MATCH_ALL1(pattern, string, string_len, start_position, \ + t1, s1, e1) { \ + total++; \ + if (test_match_all_full (pattern, string, string_len, start_position, \ + t1, s1, e1, NULL)) \ + PASS; \ + else \ + FAIL; \ + if (string_len == -1 && start_position == 0) \ + { \ + total++; \ + if (test_match_all (pattern, string, t1, s1, e1, NULL)) \ + PASS; \ + else \ + FAIL; \ + } \ +} + +#define TEST_MATCH_ALL2(pattern, string, string_len, start_position, \ + t1, s1, e1, t2, s2, e2) { \ + total++; \ + if (test_match_all_full (pattern, string, string_len, start_position, \ + t1, s1, e1, t2, s2, e2, NULL)) \ + PASS; \ + else \ + FAIL; \ + if (string_len == -1 && start_position == 0) \ + { \ + total++; \ + if (test_match_all (pattern, string, t1, s1, e1, t2, s2, e2, NULL)) \ + PASS; \ + else \ + FAIL; \ + } \ +} + +#define TEST_MATCH_ALL3(pattern, string, string_len, start_position, \ + t1, s1, e1, t2, s2, e2, t3, s3, e3) { \ + total++; \ + if (test_match_all_full (pattern, string, string_len, start_position, \ + t1, s1, e1, t2, s2, e2, t3, s3, e3, NULL)) \ + PASS; \ + else \ + FAIL; \ + if (string_len == -1 && start_position == 0) \ + { \ + total++; \ + if (test_match_all (pattern, string, t1, s1, e1, t2, s2, e2, t3, s3, e3, NULL)) \ + PASS; \ + else \ + FAIL; \ + } \ +} + +#define TEST_NULL_MATCH(code) \ + G_STMT_START \ + { \ + GRegex *re = g_regex_new ("a", 0, 0, NULL); \ + verbose ("trying '" #code "' on a clean regex \t"); \ + code; \ + g_regex_free (re); \ + re = 
g_regex_new ("a", 0, 0, NULL); \ + g_regex_match (re, "b", 0); \ + g_regex_clear (re); \ + code; \ + g_regex_free (re); \ + /* this test always passes if the code does not crash */ \ + PASS; \ + verbose ("passed\n"); \ + } \ + G_STMT_END + +#define TEST_NULL_MATCH_RET(code, expected, type, format) \ + G_STMT_START \ + { \ + type ret; \ + GRegex *re = g_regex_new ("a", 0, 0, NULL); \ + verbose ("trying '" #code "' on a clean regex \t"); \ + ret = code; \ + g_regex_free (re); \ + if (ret != expected) \ + { \ + g_print ("failed \t(got '" format "', expected '" format \ + "', with a newly created regex)\n", ret, expected); \ + FAIL; \ + } \ + else \ + { \ + re = g_regex_new ("a", 0, 0, NULL); \ + g_regex_match (re, "a", 0); \ + g_regex_clear (re); \ + ret = code; \ + g_regex_free (re); \ + if (ret != expected) \ + { \ + g_print ("failed \t(got " format ", expected " format \ + ", with a cleared regex)\n", ret, expected); \ + FAIL; \ + } \ + else \ + { \ + verbose ("passed\n"); \ + PASS; \ + } \ + } \ + } \ + G_STMT_END + +int +main (int argc, char *argv[]) +{ + gint total = 0; + gint passed = 0; + gint failed = 0; + gint i = 0; + + setlocale (LC_ALL, ""); + + for (i = 1; i < argc; i++) + { + if (streq ("--noisy", argv[i])) + noisy = TRUE; + else if (streq ("--abort", argv[i])) + abort_on_fail = TRUE; + } + + g_setenv ("G_DEBUG", "fatal_warnings", TRUE); + + /* TEST_NEW(pattern, compile_opts, match_opts) */ + TEST_NEW("", 0, 0); + TEST_NEW(".*", 0, 0); + TEST_NEW(".*", G_REGEX_MULTILINE, 0); + TEST_NEW(".*", G_REGEX_DOTALL, 0); + TEST_NEW(".*", G_REGEX_DOTALL, G_REGEX_MATCH_NOTBOL); + TEST_NEW("(123\\d*)[a-zA-Z]+(?P.*)", 0, 0); + TEST_NEW("(123\\d*)[a-zA-Z]+(?P.*)", G_REGEX_CASELESS, 0); + TEST_NEW("(?Px)|(?Py)", G_REGEX_DUPNAMES, 0); + /* This gives "internal error: code overflow" with pcre 6.0 */ + TEST_NEW("(?i)(?-i)", 0, 0); + + /* TEST_NEW_FAIL(pattern, compile_opts) */ + TEST_NEW_FAIL("(", 0); + TEST_NEW_FAIL(")", 0); + TEST_NEW_FAIL("[", 0); + TEST_NEW_FAIL("*", 
0); + TEST_NEW_FAIL("?", 0); + TEST_NEW_FAIL("(?Px)|(?Py)", 0); + + /* TEST_COPY(pattern) */ + TEST_COPY(""); + TEST_COPY(".*"); + TEST_COPY("a|b"); + TEST_COPY("(123\\d*)[a-zA-Z]+(?P.*)"); + /* Test if g_regex_copy() works with null regexes. */ + TEST_COPY("("); + + /* TEST_MATCH_SIMPLE(pattern, string, compile_opts, match_opts, expected) */ + TEST_MATCH_SIMPLE("a", "", 0, 0, FALSE); + TEST_MATCH_SIMPLE("a", "a", 0, 0, TRUE); + TEST_MATCH_SIMPLE("a", "ba", 0, 0, TRUE); + TEST_MATCH_SIMPLE("^a", "ba", 0, 0, FALSE); + TEST_MATCH_SIMPLE("a", "ba", G_REGEX_ANCHORED, 0, FALSE); + TEST_MATCH_SIMPLE("a", "ba", 0, G_REGEX_MATCH_ANCHORED, FALSE); + TEST_MATCH_SIMPLE("a", "ab", G_REGEX_ANCHORED, 0, TRUE); + TEST_MATCH_SIMPLE("a", "ab", 0, G_REGEX_MATCH_ANCHORED, TRUE); + TEST_MATCH_SIMPLE("a", "a", G_REGEX_CASELESS, 0, TRUE); + TEST_MATCH_SIMPLE("a", "A", G_REGEX_CASELESS, 0, TRUE); + /* These are needed to test extended properties. */ + TEST_MATCH_SIMPLE(AGRAVE, AGRAVE, G_REGEX_CASELESS, 0, TRUE); + TEST_MATCH_SIMPLE(AGRAVE, AGRAVE_UPPER, G_REGEX_CASELESS, 0, TRUE); + TEST_MATCH_SIMPLE("\\p{L}", "a", 0, 0, TRUE); + TEST_MATCH_SIMPLE("\\p{L}", "1", 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{L}", AGRAVE, 0, 0, TRUE); + TEST_MATCH_SIMPLE("\\p{L}", AGRAVE_UPPER, 0, 0, TRUE); + TEST_MATCH_SIMPLE("\\p{L}", SHEEN, 0, 0, TRUE); + TEST_MATCH_SIMPLE("\\p{L}", ETH30, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Ll}", "a", 0, 0, TRUE); + TEST_MATCH_SIMPLE("\\p{Ll}", AGRAVE, 0, 0, TRUE); + TEST_MATCH_SIMPLE("\\p{Ll}", AGRAVE_UPPER, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Ll}", ETH30, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Sc}", AGRAVE, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Sc}", EURO, 0, 0, TRUE); + TEST_MATCH_SIMPLE("\\p{Sc}", ETH30, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{N}", "a", 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{N}", "1", 0, 0, TRUE); + TEST_MATCH_SIMPLE("\\p{N}", AGRAVE, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{N}", AGRAVE_UPPER, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{N}", SHEEN, 0, 0, 
FALSE); + TEST_MATCH_SIMPLE("\\p{N}", ETH30, 0, 0, TRUE); + TEST_MATCH_SIMPLE("\\p{Nd}", "a", 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Nd}", "1", 0, 0, TRUE); + TEST_MATCH_SIMPLE("\\p{Nd}", AGRAVE, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Nd}", AGRAVE_UPPER, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Nd}", SHEEN, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Nd}", ETH30, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Common}", SHEEN, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Common}", "a", 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Common}", AGRAVE, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Common}", AGRAVE_UPPER, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Common}", ETH30, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Common}", "%", 0, 0, TRUE); + TEST_MATCH_SIMPLE("\\p{Common}", "1", 0, 0, TRUE); + TEST_MATCH_SIMPLE("\\p{Arabic}", SHEEN, 0, 0, TRUE); + TEST_MATCH_SIMPLE("\\p{Arabic}", "a", 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Arabic}", AGRAVE, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Arabic}", AGRAVE_UPPER, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Arabic}", ETH30, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Arabic}", "%", 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Arabic}", "1", 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Latin}", SHEEN, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Latin}", "a", 0, 0, TRUE); + TEST_MATCH_SIMPLE("\\p{Latin}", AGRAVE, 0, 0, TRUE); + TEST_MATCH_SIMPLE("\\p{Latin}", AGRAVE_UPPER, 0, 0, TRUE); + TEST_MATCH_SIMPLE("\\p{Latin}", ETH30, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Latin}", "%", 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Latin}", "1", 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Ethiopic}", SHEEN, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Ethiopic}", "a", 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Ethiopic}", AGRAVE, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Ethiopic}", AGRAVE_UPPER, 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Ethiopic}", ETH30, 0, 0, TRUE); + TEST_MATCH_SIMPLE("\\p{Ethiopic}", "%", 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{Ethiopic}", "1", 0, 0, FALSE); + TEST_MATCH_SIMPLE("\\p{L}(?<=\\p{Arabic})", 
SHEEN, 0, 0, TRUE); + TEST_MATCH_SIMPLE("\\p{L}(?<=\\p{Latin})", SHEEN, 0, 0, FALSE); + /* Invalid patterns. */ + TEST_MATCH_SIMPLE("\\", "a", 0, 0, FALSE); + TEST_MATCH_SIMPLE("[", "", 0, 0, FALSE); + + /* TEST_MATCH(pattern, compile_opts, match_opts, string, + * string_len, start_position, match_opts2, expected) */ + TEST_MATCH("a", 0, 0, "a", -1, 0, 0, TRUE); + TEST_MATCH("a", 0, 0, "A", -1, 0, 0, FALSE); + TEST_MATCH("a", G_REGEX_CASELESS, 0, "A", -1, 0, 0, TRUE); + TEST_MATCH("a", 0, 0, "ab", -1, 1, 0, FALSE); + TEST_MATCH("a", 0, 0, "ba", 1, 0, 0, FALSE); + TEST_MATCH("a", 0, 0, "bab", -1, 0, 0, TRUE); + TEST_MATCH("a", 0, 0, "b", -1, 0, 0, FALSE); + TEST_MATCH("a", 0, G_REGEX_ANCHORED, "a", -1, 0, 0, TRUE); + TEST_MATCH("a", 0, G_REGEX_ANCHORED, "ab", -1, 1, 0, FALSE); + TEST_MATCH("a", 0, G_REGEX_ANCHORED, "ba", 1, 0, 0, FALSE); + TEST_MATCH("a", 0, G_REGEX_ANCHORED, "bab", -1, 0, 0, FALSE); + TEST_MATCH("a", 0, G_REGEX_ANCHORED, "b", -1, 0, 0, FALSE); + TEST_MATCH("a", 0, 0, "a", -1, 0, G_REGEX_ANCHORED, TRUE); + TEST_MATCH("a", 0, 0, "ab", -1, 1, G_REGEX_ANCHORED, FALSE); + TEST_MATCH("a", 0, 0, "ba", 1, 0, G_REGEX_ANCHORED, FALSE); + TEST_MATCH("a", 0, 0, "bab", -1, 0, G_REGEX_ANCHORED, FALSE); + TEST_MATCH("a", 0, 0, "b", -1, 0, G_REGEX_ANCHORED, FALSE); + TEST_MATCH("a|b", 0, 0, "a", -1, 0, 0, TRUE); + TEST_MATCH("\\d", 0, 0, EURO, -1, 0, 0, FALSE); + TEST_MATCH("^.$", 0, 0, EURO, -1, 0, 0, TRUE); + TEST_MATCH("^.{3}$", 0, 0, EURO, -1, 0, 0, FALSE); + TEST_MATCH("^.$", G_REGEX_RAW, 0, EURO, -1, 0, 0, FALSE); + TEST_MATCH("^.{3}$", G_REGEX_RAW, 0, EURO, -1, 0, 0, TRUE); + TEST_MATCH(AGRAVE, G_REGEX_CASELESS, 0, AGRAVE_UPPER, -1, 0, 0, TRUE); + + /* New lines handling. 
*/ + TEST_MATCH("^a\\Rb$", 0, 0, "a\r\nb", -1, 0, 0, TRUE); + TEST_MATCH("^a\\Rb$", 0, 0, "a\nb", -1, 0, 0, TRUE); + TEST_MATCH("^a\\Rb$", 0, 0, "a\rb", -1, 0, 0, TRUE); + TEST_MATCH("^a\\Rb$", 0, 0, "a\n\rb", -1, 0, 0, FALSE); + TEST_MATCH("^a\\R\\Rb$", 0, 0, "a\n\rb", -1, 0, 0, TRUE); + TEST_MATCH("^a\\nb$", 0, 0, "a\r\nb", -1, 0, 0, FALSE); + TEST_MATCH("^a\\r\\nb$", 0, 0, "a\r\nb", -1, 0, 0, TRUE); + + TEST_MATCH("^b$", 0, 0, "a\nb\nc", -1, 0, 0, FALSE); + TEST_MATCH("^b$", G_REGEX_MULTILINE, 0, "a\nb\nc", -1, 0, 0, TRUE); + TEST_MATCH("^b$", G_REGEX_MULTILINE, 0, "a\r\nb\r\nc", -1, 0, 0, TRUE); + TEST_MATCH("^b$", G_REGEX_MULTILINE, 0, "a\rb\rc", -1, 0, 0, TRUE); + TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_CR, 0, "a\nb\nc", -1, 0, 0, FALSE); + TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_LF, 0, "a\nb\nc", -1, 0, 0, TRUE); + TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_CRLF, 0, "a\nb\nc", -1, 0, 0, FALSE); + TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_CR, 0, "a\r\nb\r\nc", -1, 0, 0, FALSE); + TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_LF, 0, "a\r\nb\r\nc", -1, 0, 0, FALSE); + TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_CRLF, 0, "a\r\nb\r\nc", -1, 0, 0, TRUE); + TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_CR, 0, "a\rb\rc", -1, 0, 0, TRUE); + TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_LF, 0, "a\rb\rc", -1, 0, 0, FALSE); + TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_CRLF, 0, "a\rb\rc", -1, 0, 0, FALSE); + TEST_MATCH("^b$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_CR, "a\nb\nc", -1, 0, 0, FALSE); + TEST_MATCH("^b$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_LF, "a\nb\nc", -1, 0, 0, TRUE); + TEST_MATCH("^b$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_CRLF, "a\nb\nc", -1, 0, 0, FALSE); + TEST_MATCH("^b$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_CR, "a\r\nb\r\nc", -1, 0, 0, FALSE); + TEST_MATCH("^b$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_LF, "a\r\nb\r\nc", -1, 0, 0, FALSE); 
+ TEST_MATCH("^b$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_CRLF, "a\r\nb\r\nc", -1, 0, 0, TRUE); + TEST_MATCH("^b$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_CR, "a\rb\rc", -1, 0, 0, TRUE); + TEST_MATCH("^b$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_LF, "a\rb\rc", -1, 0, 0, FALSE); + TEST_MATCH("^b$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_CRLF, "a\rb\rc", -1, 0, 0, FALSE); + + TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_CR, G_REGEX_MATCH_NEWLINE_ANY, "a\nb\nc", -1, 0, 0, TRUE); + TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_CR, G_REGEX_MATCH_NEWLINE_ANY, "a\rb\rc", -1, 0, 0, TRUE); + TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_CR, G_REGEX_MATCH_NEWLINE_ANY, "a\r\nb\r\nc", -1, 0, 0, TRUE); + TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_CR, G_REGEX_MATCH_NEWLINE_LF, "a\nb\nc", -1, 0, 0, TRUE); + TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_CR, G_REGEX_MATCH_NEWLINE_LF, "a\rb\rc", -1, 0, 0, FALSE); + TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_CR, G_REGEX_MATCH_NEWLINE_CRLF, "a\r\nb\r\nc", -1, 0, 0, TRUE); + TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_CR, G_REGEX_MATCH_NEWLINE_CRLF, "a\rb\rc", -1, 0, 0, FALSE); + + TEST_MATCH("a#\nb", G_REGEX_EXTENDED, 0, "a", -1, 0, 0, FALSE); + TEST_MATCH("a#\r\nb", G_REGEX_EXTENDED, 0, "a", -1, 0, 0, FALSE); + TEST_MATCH("a#\rb", G_REGEX_EXTENDED, 0, "a", -1, 0, 0, FALSE); + TEST_MATCH("a#\nb", G_REGEX_EXTENDED, G_REGEX_MATCH_NEWLINE_CR, "a", -1, 0, 0, FALSE); + TEST_MATCH("a#\nb", G_REGEX_EXTENDED | G_REGEX_NEWLINE_CR, 0, "a", -1, 0, 0, TRUE); + + /* TEST_MATCH_NEXT#(pattern, string, string_len, start_position, ...) 
*/ + TEST_MATCH_NEXT0("a", "x", -1, 0); + TEST_MATCH_NEXT0("a", "ax", -1, 1); + TEST_MATCH_NEXT0("a", "xa", 1, 0); + TEST_MATCH_NEXT0("a", "axa", 1, 2); + TEST_MATCH_NEXT1("a", "a", -1, 0, "a", 0, 1); + TEST_MATCH_NEXT1("a", "xax", -1, 0, "a", 1, 2); + TEST_MATCH_NEXT1(EURO, ENG EURO, -1, 0, EURO, 2, 5); + TEST_MATCH_NEXT1("a*", "", -1, 0, "", 0, 0); + TEST_MATCH_NEXT2("a*", "aa", -1, 0, "aa", 0, 2, "", 2, 2); + TEST_MATCH_NEXT2(EURO "*", EURO EURO, -1, 0, EURO EURO, 0, 6, "", 6, 6); + TEST_MATCH_NEXT2("a", "axa", -1, 0, "a", 0, 1, "a", 2, 3); + TEST_MATCH_NEXT2("a+", "aaxa", -1, 0, "aa", 0, 2, "a", 3, 4); + TEST_MATCH_NEXT2("a", "aa", -1, 0, "a", 0, 1, "a", 1, 2); + TEST_MATCH_NEXT2("a", "ababa", -1, 2, "a", 2, 3, "a", 4, 5); + TEST_MATCH_NEXT2(EURO "+", EURO "-" EURO, -1, 0, EURO, 0, 3, EURO, 4, 7); + TEST_MATCH_NEXT3("", "ab", -1, 0, "", 0, 0, "", 1, 1, "", 2, 2); + TEST_MATCH_NEXT3("", AGRAVE "b", -1, 0, "", 0, 0, "", 2, 2, "", 3, 3); + TEST_MATCH_NEXT3("a", "aaxa", -1, 0, "a", 0, 1, "a", 1, 2, "a", 3, 4); + TEST_MATCH_NEXT3("a", "aa" OGRAVE "a", -1, 0, "a", 0, 1, "a", 1, 2, "a", 4, 5); + TEST_MATCH_NEXT3("a*", "aax", -1, 0, "aa", 0, 2, "", 2, 2, "", 3, 3); + TEST_MATCH_NEXT4("a*", "aaxa", -1, 0, "aa", 0, 2, "", 2, 2, "a", 3, 4, "", 4, 4); + + /* TEST_MATCH_COUNT(pattern, string, start_position, match_opts, expected_count) */ + TEST_MATCH_COUNT("a", "", 0, 0, 0); + TEST_MATCH_COUNT("a", "a", 0, 0, 1); + TEST_MATCH_COUNT("a", "a", 1, 0, 0); + TEST_MATCH_COUNT("(.)", "a", 0, 0, 2); + TEST_MATCH_COUNT("(.)", EURO, 0, 0, 2); + TEST_MATCH_COUNT("(?:.)", "a", 0, 0, 1); + TEST_MATCH_COUNT("(?P.)", "a", 0, 0, 2); + TEST_MATCH_COUNT("a$", "a", 0, G_REGEX_MATCH_NOTEOL, 0); + TEST_MATCH_COUNT("(a)?(b)", "b", 0, 0, 3); + TEST_MATCH_COUNT("(a)?(b)", "ab", 0, 0, 3); + + /* TEST_PARTIAL(pattern, string, expected) */ + TEST_PARTIAL("^ab", "a", TRUE); + TEST_PARTIAL("^ab", "xa", FALSE); + TEST_PARTIAL("ab", "xa", TRUE); + TEST_PARTIAL("ab", "ab", FALSE); /* normal match. 
*/ + TEST_PARTIAL("a+b", "aa", FALSE); /* PCRE_ERROR_BAD_PARTIAL */ + TEST_PARTIAL("(a)+b", "aa", TRUE); + TEST_PARTIAL("a?b", "a", TRUE); + + /* TEST_CLEAR(pattern, string, start_position) */ + TEST_CLEAR("$^", "aaa", 0); + TEST_CLEAR("a", "xax", 0); + TEST_CLEAR("a", "xax", 1); + TEST_CLEAR("a", "xax", 2); + TEST_CLEAR("a", "aa", 0); + TEST_CLEAR(HSTROKE, HSTROKE, 0); + + /* TEST_SUB_PATTERN(pattern, string, start_position, sub_n, expected_sub, + * expected_start, expected_end) */ + TEST_SUB_PATTERN("a", "a", 0, 0, "a", 0, 1); + TEST_SUB_PATTERN("a(.)", "ab", 0, 1, "b", 1, 2); + TEST_SUB_PATTERN("a(.)", "a" EURO, 0, 1, EURO, 1, 4); + TEST_SUB_PATTERN("(?:.*)(a)(.)", "xxa" ENG, 0, 2, ENG, 3, 5); + TEST_SUB_PATTERN("(" HSTROKE ")", "a" HSTROKE ENG, 0, 1, HSTROKE, 1, 3); + TEST_SUB_PATTERN("a", "a", 0, 1, NULL, UNTOUCHED, UNTOUCHED); + TEST_SUB_PATTERN("a", "a", 0, 1, NULL, UNTOUCHED, UNTOUCHED); + TEST_SUB_PATTERN("(a)?(b)", "b", 0, 0, "b", 0, 1); + TEST_SUB_PATTERN("(a)?(b)", "b", 0, 1, "", -1, -1); + TEST_SUB_PATTERN("(a)?(b)", "b", 0, 2, "b", 0, 1); + + /* TEST_NAMED_SUB_PATTERN(pattern, string, start_position, sub_name, + * expected_sub, expected_start, expected_end) */ + TEST_NAMED_SUB_PATTERN("a(?P.)(?P.)?", "ab", 0, "A", "b", 1, 2); + TEST_NAMED_SUB_PATTERN("a(?P.)(?P.)?", "aab", 1, "A", "b", 2, 3); + TEST_NAMED_SUB_PATTERN("a(?P.)(?P.)?", EURO "ab", 0, "A", "b", 4, 5); + TEST_NAMED_SUB_PATTERN("a(?P.)(?P.)?", EURO "ab", 0, "B", NULL, UNTOUCHED, UNTOUCHED); + TEST_NAMED_SUB_PATTERN("a(?P.)(?P.)?", EURO "ab", 0, "C", NULL, UNTOUCHED, UNTOUCHED); + TEST_NAMED_SUB_PATTERN("a(?P.)(?P.)?", "a" EGRAVE "x", 0, "A", EGRAVE, 1, 3); + TEST_NAMED_SUB_PATTERN("a(?P.)(?P.)?", "a" EGRAVE "x", 0, "B", "x", 3, 4); + TEST_NAMED_SUB_PATTERN("(?Pa)?(?Pb)", "b", 0, "A", "", -1, -1); + TEST_NAMED_SUB_PATTERN("(?Pa)?(?Pb)", "b", 0, "B", "b", 0, 1); + + /* TEST_FETCH_ALL#(pattern, string, ...) 
*/ + TEST_FETCH_ALL0("a", ""); + TEST_FETCH_ALL0("a", "b"); + TEST_FETCH_ALL1("a", "a", "a"); + TEST_FETCH_ALL1("a+", "aa", "aa"); + TEST_FETCH_ALL1("(?:a)", "a", "a"); + TEST_FETCH_ALL2("(a)", "a", "a", "a"); + TEST_FETCH_ALL2("a(.)", "ab", "ab", "b"); + TEST_FETCH_ALL2("a(.)", "a" HSTROKE, "a" HSTROKE, HSTROKE); + TEST_FETCH_ALL3("(?:.*)(a)(.)", "xyazk", "xyaz", "a", "z"); + TEST_FETCH_ALL3("(?P.)(a)", "xa", "xa", "x", "a"); + TEST_FETCH_ALL3("(?P.)(a)", ENG "a", ENG "a", ENG, "a"); + TEST_FETCH_ALL3("(a)?(b)", "b", "b", "", "b"); + TEST_FETCH_ALL3("(a)?(b)", "ab", "ab", "a", "b"); + + /* TEST_SPLIT_SIMPLE#(pattern, string, ...) */ + TEST_SPLIT_SIMPLE0("", ""); + TEST_SPLIT_SIMPLE0("a", ""); + TEST_SPLIT_SIMPLE1(",", "a", "a"); + TEST_SPLIT_SIMPLE1("(,)\\s*", "a", "a"); + TEST_SPLIT_SIMPLE2(",", "a,b", "a", "b"); + TEST_SPLIT_SIMPLE3(",", "a,b,c", "a", "b", "c"); + TEST_SPLIT_SIMPLE3(",\\s*", "a,b,c", "a", "b", "c"); + TEST_SPLIT_SIMPLE3(",\\s*", "a, b, c", "a", "b", "c"); + TEST_SPLIT_SIMPLE3("(,)\\s*", "a,b", "a", ",", "b"); + TEST_SPLIT_SIMPLE3("(,)\\s*", "a, b", "a", ",", "b"); + /* Not matched sub-strings. */ + TEST_SPLIT_SIMPLE2("a|(b)", "xay", "x", "y"); + TEST_SPLIT_SIMPLE3("a|(b)", "xby", "x", "b", "y"); + /* Empty matches. */ + TEST_SPLIT_SIMPLE3("", "abc", "a", "b", "c"); + TEST_SPLIT_SIMPLE3(" *", "ab c", "a", "b", "c"); + /* Invalid patterns. */ + TEST_SPLIT_SIMPLE0("\\", ""); + TEST_SPLIT_SIMPLE0("[", ""); + + /* TEST_SPLIT#(pattern, string, start_position, max_tokens, ...) 
*/ + TEST_SPLIT0("", "", 0, 0); + TEST_SPLIT0("a", "", 0, 0); + TEST_SPLIT0("a", "", 0, 1); + TEST_SPLIT0("a", "", 0, 2); + TEST_SPLIT0("a", "a", 1, 0); + TEST_SPLIT1(",", "a", 0, 0, "a"); + TEST_SPLIT1(",", "a,b", 0, 1, "a,b"); + TEST_SPLIT1("(,)\\s*", "a", 0, 0, "a"); + TEST_SPLIT1(",", "a,b", 2, 0, "b"); + TEST_SPLIT2(",", "a,b", 0, 0, "a", "b"); + TEST_SPLIT2(",", "a,b,c", 0, 2, "a", "b,c"); + TEST_SPLIT2(",", "a,b", 1, 0, "", "b"); + TEST_SPLIT2(",", "a,", 0, 0, "a", ""); + TEST_SPLIT3(",", "a,b,c", 0, 0, "a", "b", "c"); + TEST_SPLIT3(",\\s*", "a,b,c", 0, 0, "a", "b", "c"); + TEST_SPLIT3(",\\s*", "a, b, c", 0, 0, "a", "b", "c"); + TEST_SPLIT3("(,)\\s*", "a,b", 0, 0, "a", ",", "b"); + TEST_SPLIT3("(,)\\s*", "a, b", 0, 0, "a", ",", "b"); + /* Not matched sub-strings. */ + TEST_SPLIT2("a|(b)", "xay", 0, 0, "x", "y"); + TEST_SPLIT3("a|(b)", "xby", 0, -1, "x", "b", "y"); + /* Empty matches. */ + TEST_SPLIT2(" *", "ab c", 1, 0, "b", "c"); + TEST_SPLIT3("", "abc", 0, 0, "a", "b", "c"); + TEST_SPLIT3(" *", "ab c", 0, 0, "a", "b", "c"); + TEST_SPLIT1(" *", "ab c", 0, 1, "ab c"); + TEST_SPLIT2(" *", "ab c", 0, 2, "a", "b c"); + TEST_SPLIT3(" *", "ab c", 0, 3, "a", "b", "c"); + TEST_SPLIT3(" *", "ab c", 0, 4, "a", "b", "c"); + + /* TEST_SPLIT_NEXT#(pattern, string, start_position, ...) */ + TEST_SPLIT_NEXT1(",", "a", 0, "a"); + TEST_SPLIT_NEXT1("(,)\\s*", "a", 0, "a"); + TEST_SPLIT_NEXT1(",", "a,b", 2, "b"); + TEST_SPLIT_NEXT2(",", "a,b", 0, "a", "b"); + TEST_SPLIT_NEXT2(",", "a,b", 1, "", "b"); + TEST_SPLIT_NEXT2(",", "a,", 0, "a", ""); + TEST_SPLIT_NEXT3(",", "a,b,c", 0, "a", "b", "c"); + TEST_SPLIT_NEXT3(",\\s*", "a,b,c", 0, "a", "b", "c"); + TEST_SPLIT_NEXT3(",\\s*", "a, b, c", 0, "a", "b", "c"); + TEST_SPLIT_NEXT3("(,)\\s*", "a,b", 0, "a", ",", "b"); + TEST_SPLIT_NEXT3("(,)\\s*", "a, b", 0, "a", ",", "b"); + /* Not matched sub-strings. */ + TEST_SPLIT_NEXT2("a|(b)", "xay", 0, "x", "y"); + TEST_SPLIT_NEXT3("a|(b)", "xby", 0, "x", "b", "y"); + /* Empty matches. 
*/ + TEST_SPLIT_NEXT2(" *", "ab c", 1, "b", "c"); + TEST_SPLIT_NEXT3("", "abc", 0, "a", "b", "c"); + TEST_SPLIT_NEXT3(" *", "ab c", 0, "a", "b", "c"); + + /* TEST_EXPAND(pattern, string, string_to_expand, raw, expected) */ + TEST_EXPAND("a", "a", "", FALSE, ""); + TEST_EXPAND("a", "a", "\\0", FALSE, "a"); + TEST_EXPAND("a", "a", "\\1", FALSE, ""); + TEST_EXPAND("(a)", "ab", "\\1", FALSE, "a"); + TEST_EXPAND("(a)", "a", "\\1", FALSE, "a"); + TEST_EXPAND("(a)", "a", "\\g<1>", FALSE, "a"); + TEST_EXPAND("a", "a", "\\0130", FALSE, "X"); + TEST_EXPAND("a", "a", "\\\\\\0", FALSE, "\\a"); + TEST_EXPAND("a(?P.)c", "xabcy", "X\\gX", FALSE, "XbX"); + TEST_EXPAND("(.)(?P<1>.)", "ab", "\\1", FALSE, "a"); + TEST_EXPAND("(.)(?P<1>.)", "ab", "\\g<1>", FALSE, "a"); + TEST_EXPAND(".", EURO, "\\0", FALSE, EURO); + TEST_EXPAND("(.)", EURO, "\\1", FALSE, EURO); + TEST_EXPAND("(?P.)", EURO, "\\g", FALSE, EURO); + TEST_EXPAND(".", "a", EURO, FALSE, EURO); + TEST_EXPAND(".", "a", EURO "\\0", FALSE, EURO "a"); + TEST_EXPAND(".", "", "\\Lab\\Ec", FALSE, "abc"); + TEST_EXPAND(".", "", "\\LaB\\EC", FALSE, "abC"); + TEST_EXPAND(".", "", "\\Uab\\Ec", FALSE, "ABc"); + TEST_EXPAND(".", "", "a\\ubc", FALSE, "aBc"); + TEST_EXPAND(".", "", "a\\lbc", FALSE, "abc"); + TEST_EXPAND(".", "", "A\\uBC", FALSE, "ABC"); + TEST_EXPAND(".", "", "A\\lBC", FALSE, "AbC"); + TEST_EXPAND(".", "", "A\\l\\\\BC", FALSE, "A\\BC"); + TEST_EXPAND(".", "", "\\L" AGRAVE "\\E", FALSE, AGRAVE); + TEST_EXPAND(".", "", "\\U" AGRAVE "\\E", FALSE, AGRAVE_UPPER); + TEST_EXPAND(".", "", "\\u" AGRAVE "a", FALSE, AGRAVE_UPPER "a"); + TEST_EXPAND(".", "ab", "x\\U\\0y\\Ez", FALSE, "xAYz"); + TEST_EXPAND(".(.)", "AB", "x\\L\\1y\\Ez", FALSE, "xbyz"); + TEST_EXPAND(".", "ab", "x\\u\\0y\\Ez", FALSE, "xAyz"); + TEST_EXPAND(".(.)", "AB", "x\\l\\1y\\Ez", FALSE, "xbyz"); + TEST_EXPAND(".(.)", "a" AGRAVE_UPPER, "x\\l\\1y", FALSE, "x" AGRAVE "y"); + TEST_EXPAND("a", "bab", "\\x{61}", FALSE, "a"); + TEST_EXPAND("a", "bab", "\\x61", FALSE, "a"); 
+ TEST_EXPAND("a", "bab", "\\x5a", FALSE, "Z"); + TEST_EXPAND("a", "bab", "\\0\\x5A", FALSE, "aZ"); + TEST_EXPAND("a", "bab", "\\1\\x{5A}", FALSE, "Z"); + TEST_EXPAND("a", "bab", "\\x{00E0}", FALSE, AGRAVE); + TEST_EXPAND("", "bab", "\\x{0634}", FALSE, SHEEN); + TEST_EXPAND("", "bab", "\\x{634}", FALSE, SHEEN); + TEST_EXPAND("", "", "\\t", FALSE, "\t"); + TEST_EXPAND("", "", "\\v", FALSE, "\v"); + TEST_EXPAND("", "", "\\r", FALSE, "\r"); + TEST_EXPAND("", "", "\\n", FALSE, "\n"); + TEST_EXPAND("", "", "\\f", FALSE, "\f"); + TEST_EXPAND("", "", "\\a", FALSE, "\a"); + TEST_EXPAND("", "", "\\b", FALSE, "\b"); + TEST_EXPAND("a(.)", "abc", "\\0\\b\\1", FALSE, "ab\bb"); + TEST_EXPAND("a(.)", "abc", "\\0141", FALSE, "a"); + TEST_EXPAND("a(.)", "abc", "\\078", FALSE, "\a8"); + TEST_EXPAND("a(.)", "abc", "\\077", FALSE, "?"); + TEST_EXPAND("a(.)", "abc", "\\0778", FALSE, "?8"); + TEST_EXPAND("a(.)", "a" AGRAVE "b", "\\1", FALSE, AGRAVE); + TEST_EXPAND("a(.)", "a" AGRAVE "b", "\\1", TRUE, "\xc3"); + TEST_EXPAND("a(.)", "a" AGRAVE "b", "\\0", TRUE, "a\xc3"); + /* Invalid strings. 
*/ + TEST_EXPAND("", "", "\\Q", FALSE, NULL); + TEST_EXPAND("", "", "x\\Ay", FALSE, NULL); + TEST_EXPAND("", "", "\\g<", FALSE, NULL); + TEST_EXPAND("", "", "\\g<>", FALSE, NULL); + TEST_EXPAND("", "", "\\g<1a>", FALSE, NULL); + TEST_EXPAND("", "", "\\g", FALSE, NULL); + TEST_EXPAND("", "", "\\", FALSE, NULL); + TEST_EXPAND("a", "a", "\\x{61", FALSE, NULL); + TEST_EXPAND("a", "a", "\\x6X", FALSE, NULL); + + /* TEST_REPLACE(pattern, string, start_position, replacement, expected) */ + TEST_REPLACE("a", "ababa", 0, "A", "AbAbA"); + TEST_REPLACE("a", "ababa", 1, "A", "abAbA"); + TEST_REPLACE("a", "ababa", 2, "A", "abAbA"); + TEST_REPLACE("a", "ababa", 3, "A", "ababA"); + TEST_REPLACE("a", "ababa", 4, "A", "ababA"); + TEST_REPLACE("a", "abababa", 2, "A", "abAbAbA"); + TEST_REPLACE("$^", "abc", 0, "X", "abc"); + TEST_REPLACE("(.)a", "ciao", 0, "a\\1", "caio"); + TEST_REPLACE("a.", "abc", 0, "\\0\\0", "ababc"); + TEST_REPLACE("a", "asd", 0, "\\0101", "Asd"); + TEST_REPLACE("(a).\\1", "aba cda", 0, "\\1\\n", "a\n cda"); + TEST_REPLACE("a" AGRAVE "a", "a" AGRAVE "a", 0, "x", "x"); + TEST_REPLACE("a" AGRAVE "a", "a" AGRAVE "a", 0, OGRAVE, OGRAVE); + TEST_REPLACE("[^-]", "-" EURO "-x-" HSTROKE, 0, "a", "-a-a-a"); + TEST_REPLACE("[^-]", "-" EURO "-" HSTROKE, 0, "a\\g<0>a", "-a" EURO "a-a" HSTROKE "a"); + TEST_REPLACE("-", "-" EURO "-" HSTROKE, 0, "", EURO HSTROKE); + TEST_REPLACE(".*", "hello", 0, "\\U\\0\\E", "HELLO"); + TEST_REPLACE(".*", "hello", 0, "\\u\\0", "Hello"); + TEST_REPLACE("\\S+", "hello world", 0, "\\U-\\0-", "-HELLO- -WORLD-"); + TEST_REPLACE(".", "a", 0, "\\A", NULL); + TEST_REPLACE(".", "a", 0, "\\g", NULL); + + /* TEST_REPLACE_LIT(pattern, string, start_position, replacement, expected) */ + TEST_REPLACE_LIT("a", "ababa", 0, "A", "AbAbA"); + TEST_REPLACE_LIT("a", "ababa", 1, "A", "abAbA"); + TEST_REPLACE_LIT("a", "ababa", 2, "A", "abAbA"); + TEST_REPLACE_LIT("a", "ababa", 3, "A", "ababA"); + TEST_REPLACE_LIT("a", "ababa", 4, "A", "ababA"); + 
TEST_REPLACE_LIT("a", "abababa", 2, "A", "abAbAbA"); + TEST_REPLACE_LIT("a", "abcadaa", 0, "A", "AbcAdAA"); + TEST_REPLACE_LIT("$^", "abc", 0, "X", "abc"); + TEST_REPLACE_LIT("(.)a", "ciao", 0, "a\\1", "ca\\1o"); + TEST_REPLACE_LIT("a.", "abc", 0, "\\0\\0\\n", "\\0\\0\\nc"); + TEST_REPLACE_LIT("a" AGRAVE "a", "a" AGRAVE "a", 0, "x", "x"); + TEST_REPLACE_LIT("a" AGRAVE "a", "a" AGRAVE "a", 0, OGRAVE, OGRAVE); + TEST_REPLACE_LIT(AGRAVE, "-" AGRAVE "-" HSTROKE, 0, "a" ENG "a", "-a" ENG "a-" HSTROKE); + TEST_REPLACE_LIT("[^-]", "-" EURO "-" AGRAVE "-" HSTROKE, 0, "a", "-a-a-a"); + TEST_REPLACE_LIT("[^-]", "-" EURO "-" AGRAVE, 0, "a\\g<0>a", "-a\\g<0>a-a\\g<0>a"); + TEST_REPLACE_LIT("-", "-" EURO "-" AGRAVE "-" HSTROKE, 0, "", EURO AGRAVE HSTROKE); + + /* TEST_GET_STRING_NUMBER(pattern, name, expected_num) */ + TEST_GET_STRING_NUMBER("", "A", -1); + TEST_GET_STRING_NUMBER("(?P.)", "A", 1); + TEST_GET_STRING_NUMBER("(?P.)", "B", -1); + TEST_GET_STRING_NUMBER("(?P.)(?Pa)", "A", 1); + TEST_GET_STRING_NUMBER("(?P.)(?Pa)", "B", 2); + TEST_GET_STRING_NUMBER("(?P.)(?Pa)", "C", -1); + TEST_GET_STRING_NUMBER("(?P.)(.)(?Pa)", "A", 1); + TEST_GET_STRING_NUMBER("(?P.)(.)(?Pa)", "B", 3); + TEST_GET_STRING_NUMBER("(?P.)(.)(?Pa)", "C", -1); + TEST_GET_STRING_NUMBER("(?:a)(?P.)", "A", 1); + TEST_GET_STRING_NUMBER("(?:a)(?P.)", "B", -1); + + /* TEST_ESCAPE(string, length, expected) */ + TEST_ESCAPE("hello world", -1, "hello world"); + TEST_ESCAPE("hello world", 5, "hello"); + TEST_ESCAPE("hello.world", -1, "hello\\.world"); + TEST_ESCAPE("a(b\\b.$", -1, "a\\(b\\\\b\\.\\$"); + TEST_ESCAPE("hello\0world", -1, "hello"); + TEST_ESCAPE("hello\0world", 11, "hello\\0world"); + TEST_ESCAPE(EURO "*" ENG, -1, EURO "\\*" ENG); + TEST_ESCAPE("a$", -1, "a\\$"); + TEST_ESCAPE("$a", -1, "\\$a"); + TEST_ESCAPE("a$a", -1, "a\\$a"); + TEST_ESCAPE("$a$", -1, "\\$a\\$"); + TEST_ESCAPE("$a$", 0, ""); + TEST_ESCAPE("$a$", 1, "\\$"); + TEST_ESCAPE("$a$", 2, "\\$a"); + TEST_ESCAPE("$a$", 3, "\\$a\\$"); + 
TEST_ESCAPE("$a$", 4, "\\$a\\$\\0"); + TEST_ESCAPE("|()[]{}^$*+?.", -1, "\\|\\(\\)\\[\\]\\{\\}\\^\\$\\*\\+\\?\\."); + TEST_ESCAPE("a|a(a)a[a]a{a}a^a$a*a+a?a.a", -1, + "a\\|a\\(a\\)a\\[a\\]a\\{a\\}a\\^a\\$a\\*a\\+a\\?a\\.a"); + + /* TEST_MATCH_ALL#(pattern, string, string_len, start_position, ...) */ + TEST_MATCH_ALL0("<.*>", "", -1, 0); + TEST_MATCH_ALL0("a+", "", -1, 0); + TEST_MATCH_ALL0("a+", "a", 0, 0); + TEST_MATCH_ALL0("a+", "a", -1, 1); + TEST_MATCH_ALL1("<.*>", "", -1, 0, "", 0, 3); + TEST_MATCH_ALL1("a+", "a", -1, 0, "a", 0, 1); + TEST_MATCH_ALL1("a+", "aa", 1, 0, "a", 0, 1); + TEST_MATCH_ALL1("a+", "aa", -1, 1, "a", 1, 2); + TEST_MATCH_ALL1("a+", "aa", 2, 1, "a", 1, 2); + TEST_MATCH_ALL1(".+", ENG, -1, 0, ENG, 0, 2); + TEST_MATCH_ALL2("<.*>", "", -1, 0, "", 0, 6, "", 0, 3); + TEST_MATCH_ALL2("a+", "aa", -1, 0, "aa", 0, 2, "a", 0, 1); + TEST_MATCH_ALL2(".+", ENG EURO, -1, 0, ENG EURO, 0, 5, ENG, 0, 2); + TEST_MATCH_ALL3("<.*>", "", -1, 0, "", 0, 9, + "", 0, 6, "", 0, 3); + TEST_MATCH_ALL3("a+", "aaa", -1, 0, "aaa", 0, 3, "aa", 0, 2, "a", 0, 1); + + /* TEST_NULL_MATCH(code) */ + /* TEST_NULL_MATCH_RET(code, expected, type) */ + /* Test to see what happens if a function needing GRegexMatch is called + * when GRegexMatch is NULL. The result should be the same when the function + * is called after g_regex_clear. + * "re" is a GRegex, the pattern is "a". 
*/ + TEST_NULL_MATCH(g_regex_clear (re)); + TEST_NULL_MATCH(g_regex_get_pattern (re)); + TEST_NULL_MATCH_RET(g_regex_optimize (re, NULL), TRUE, gboolean, "%d"); + TEST_NULL_MATCH_RET(g_regex_match (re, "a", 0), TRUE, gboolean, "%d"); + TEST_NULL_MATCH_RET(g_regex_match (re, "b", 0), FALSE, gboolean, "%d"); + TEST_NULL_MATCH_RET(g_regex_match_full (re, "a", -1, 0, 0, NULL), TRUE, gboolean, "%d"); + TEST_NULL_MATCH_RET(g_regex_match_full (re, "a", -1, 1, 0, NULL), FALSE, gboolean, "%d"); + TEST_NULL_MATCH_RET(g_regex_match_full (re, "b", -1, 0, 0, NULL), FALSE, gboolean, "%d"); + TEST_NULL_MATCH_RET(g_regex_get_match_count (re), -1, gint, "%d"); + TEST_NULL_MATCH_RET(g_regex_is_partial_match (re), FALSE, gboolean, "%d"); + TEST_NULL_MATCH_RET(g_regex_fetch (re, 0, "abc"), NULL, gchar *, "%p"); + TEST_NULL_MATCH_RET(g_regex_fetch_pos (re, 0, NULL, NULL), FALSE, gboolean, "%d"); + TEST_NULL_MATCH_RET(g_regex_fetch_all (re, "b"), NULL, gchar **, "%p"); + TEST_NULL_MATCH_RET(g_regex_get_string_number (re, "X"), -1, gint, "%d"); + +end: /* if abort_on_fail is TRUE the flow passes to this label. */ + verbose ("\n%u tests passed, %u failed\n", passed, failed); + return failed; +} + +#else /* ENABLE_REGEX false */ + /* Stub entry point compiled when ENABLE_REGEX is off: it runs no tests, prints a notice that GRegex is disabled, and returns 0. argc and argv are unused. */ +int +main (int argc, char *argv[]) +{ + g_print ("GRegex is disabled.\n"); + return 0; +} + +#endif /* ENABLE_REGEX */