+
+#include "pcre_internal.h"
+
+#define DFTABLES /* pcre_maketables.c notices this */
+#include "pcre_maketables.c"
+
+
+/* Entry point. Usage: dftables [-L] <file>. Writes the character tables to
+<file> as C source. Yields 0 on success, or 1 after a message on stderr. */
+int main(int argc, char **argv)
+{
+FILE *f;
+int i = 1; /* Index of the filename in argv; reused below as a loop counter */
+const unsigned char *tables; /* Advances as each table is written */
+const unsigned char *base_of_tables; /* Start of the tables, for free() */
+
+/* By default, the default C locale is used rather than what the building user
+happens to have set. However, if the -L option is given, set the locale from
+the LC_xxx environment variables. */
+if (argc > 1 && strcmp(argv[1], "-L") == 0)
+ {
+ setlocale(LC_ALL, ""); /* Set from environment variables */
+ i++;
+ }
+
+if (argc < i + 1)
+ {
+ fprintf(stderr, "dftables: one filename argument is required\n");
+ return 1;
+ }
+
+tables = base_of_tables = pcre_maketables();
+if (tables == NULL) /* Nothing to write; do not dereference NULL below */
+ {
+ fprintf(stderr, "dftables: failed to create the character tables\n");
+ return 1;
+ }
+
+f = fopen(argv[i], "wb");
+if (f == NULL)
+ {
+ fprintf(stderr, "dftables: failed to open %s for writing\n", argv[i]);
+ free((void *)base_of_tables);
+ return 1;
+ }
+
+/* There are several fprintf() calls here, because gcc in pedantic mode
+complains about the very long string otherwise. */
+fprintf(f,
+ "/*************************************************\n"
+ "* Perl-Compatible Regular Expressions *\n"
+ "*************************************************/\n\n"
+ "/* This file was automatically written by the dftables auxiliary\n"
+ "program. It contains character tables that are used when no external\n"
+ "tables are passed to PCRE by the application that calls it. The tables\n"
+ "are used only for characters whose code values are less than 256.\n\n");
+fprintf(f,
+ "The following #includes are present because without them gcc 4.x may remove\n"
+ "the array definition from the final binary if PCRE is built into a static\n"
+ "library and dead code stripping is activated. This leads to link errors.\n"
+ "Pulling in the header ensures that the array gets flagged as \"someone\n"
+ "outside this compilation unit might reference this\" and so it will always\n"
+ "be supplied to the linker. */\n\n"
+ "#ifdef HAVE_CONFIG_H\n"
+ "#include \"config.h\"\n"
+ "#endif\n\n"
+ "#include \"pcre_internal.h\"\n\n");
+fprintf(f,
+ "const unsigned char _pcre_default_tables[] = {\n\n"
+ "/* This table is a lower casing table. */\n\n");
+fprintf(f, " ");
+for (i = 0; i < 256; i++)
+ {
+ if ((i & 7) == 0 && i != 0) fprintf(f, "\n "); /* Eight values per line */
+ fprintf(f, "%3d", *tables++);
+ if (i != 255) fprintf(f, ",");
+ }
+fprintf(f, ",\n\n");
+
+fprintf(f, "/* This table is a case flipping table. */\n\n");
+fprintf(f, " ");
+for (i = 0; i < 256; i++)
+ {
+ if ((i & 7) == 0 && i != 0) fprintf(f, "\n "); /* Eight values per line */
+ fprintf(f, "%3d", *tables++);
+ if (i != 255) fprintf(f, ",");
+ }
+fprintf(f, ",\n\n");
+
+fprintf(f,
+ "/* This table contains bit maps for various character classes.\n"
+ "Each map is 32 bytes long and the bits run from the least\n"
+ "significant end of each byte. The classes that have their own\n"
+ "maps are: space, xdigit, digit, upper, lower, word, graph\n"
+ "print, punct, and cntrl. Other classes are built from combinations. */\n\n");
+fprintf(f, " ");
+for (i = 0; i < cbit_length; i++)
+ {
+ if ((i & 7) == 0 && i != 0)
+ {
+ if ((i & 31) == 0) fprintf(f, "\n"); /* Gap between 32-byte maps */
+ fprintf(f, "\n ");
+ }
+ fprintf(f, "0x%02x", *tables++);
+ if (i != cbit_length - 1) fprintf(f, ",");
+ }
+fprintf(f, ",\n\n");
+
+fprintf(f,
+ "/* This table identifies various classes of character by individual bits:\n"
+ " 0x%02x white space character\n"
+ " 0x%02x letter\n"
+ " 0x%02x decimal digit\n"
+ " 0x%02x hexadecimal digit\n"
+ " 0x%02x alphanumeric or '_'\n"
+ " 0x%02x regular expression metacharacter or binary zero\n*/\n\n",
+ ctype_space, ctype_letter, ctype_digit, ctype_xdigit, ctype_word,
+ ctype_meta);
+fprintf(f, " ");
+for (i = 0; i < 256; i++)
+ {
+ if ((i & 7) == 0 && i != 0) /* Annotate the finished row with its range */
+ {
+ fprintf(f, " /* ");
+ if (isprint(i-8)) fprintf(f, " %c -", i-8);
+ else fprintf(f, "%3d-", i-8);
+ if (isprint(i-1)) fprintf(f, " %c ", i-1);
+ else fprintf(f, "%3d", i-1);
+ fprintf(f, " */\n ");
+ }
+ fprintf(f, "0x%02x", *tables++);
+ if (i != 255) fprintf(f, ",");
+ }
+fprintf(f, "};/* "); /* End of array; i is 256, so this is row 248-255 */
+if (isprint(i-8)) fprintf(f, " %c -", i-8);
+ else fprintf(f, "%3d-", i-8);
+if (isprint(i-1)) fprintf(f, " %c ", i-1);
+ else fprintf(f, "%3d", i-1);
+fprintf(f, " */\n\n/* End of pcre_chartables.c */\n");
+
+fclose(f);
+free((void *)base_of_tables);
+return 0;
+}
+
+/* End of dftables.c */
diff --git a/src/doc/html/index.html b/src/doc/html/index.html
new file mode 100644
index 0000000..8a7174e
--- /dev/null
+++ b/src/doc/html/index.html
@@ -0,0 +1,140 @@
+
+
+
+PCRE specification
+
+
+Perl-compatible Regular Expressions (PCRE)
+
+The HTML documentation for PCRE comprises the following pages:
+
+
+
+pcre |
+ Introductory page |
+
+pcre-config |
+ Information about the installation configuration |
+
+pcreapi |
+ PCRE's native API |
+
+pcrebuild |
+ Options for building PCRE |
+
+pcrecallout |
+ The callout facility |
+
+pcrecompat |
+ Compatibility with Perl |
+
+pcrecpp |
+ The C++ wrapper for the PCRE library |
+
+pcregrep |
+ The pcregrep command |
+
+pcrematching |
+ Discussion of the two matching algorithms |
+
+pcrepartial |
+ Using PCRE for partial matching |
+
+pcrepattern |
+ Specification of the regular expressions supported by PCRE |
+
+pcreperform |
+ Some comments on performance |
+
+pcreposix |
+ The POSIX API to the PCRE library |
+
+pcreprecompile |
+ How to save and re-use compiled patterns |
+
+pcresample |
+ Description of the sample program |
+
+pcrestack |
+ Discussion of PCRE's stack usage |
+
+pcresyntax |
+ Syntax quick-reference summary |
+
+pcretest |
+ The pcretest command for testing PCRE |
+
+
+
+There are also individual pages that summarize the interface for each function
+in the library:
+
+
+
+
+
diff --git a/src/doc/html/pcre-config.html b/src/doc/html/pcre-config.html
new file mode 100644
index 0000000..0987745
--- /dev/null
+++ b/src/doc/html/pcre-config.html
@@ -0,0 +1,88 @@
+
+
+pcre-config specification
+
+
+pcre-config man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+
SYNOPSIS
+
+pcre-config [--prefix] [--exec-prefix] [--version] [--libs]
+[--libs-posix] [--cflags] [--cflags-posix]
+
+
DESCRIPTION
+
+pcre-config returns the configuration of the installed PCRE
+libraries and the options required to compile a program to use them.
+
+
OPTIONS
+
+--prefix
+Writes the directory prefix used in the PCRE installation for architecture
+independent files (/usr on many systems, /usr/local on some
+systems) to the standard output.
+
+
+--exec-prefix
+Writes the directory prefix used in the PCRE installation for architecture
+dependent files (normally the same as --prefix) to the standard output.
+
+
+--version
+Writes the version number of the installed PCRE libraries to the standard
+output.
+
+
+--libs
+Writes to the standard output the command line options required to link
+with PCRE (-lpcre on many systems).
+
+
+--libs-posix
+Writes to the standard output the command line options required to link with
+the PCRE posix emulation library (-lpcreposix -lpcre on many
+systems).
+
+
+--cflags
+Writes to the standard output the command line options required to compile
+files that use PCRE (this may include some -I options, but is blank on
+many systems).
+
+
+--cflags-posix
+Writes to the standard output the command line options required to compile
+files that use the PCRE posix emulation library (this may include some -I
+options, but is blank on many systems).
+
+
SEE ALSO
+
+pcre(3)
+
+
AUTHOR
+
+This manual page was originally written by Mark Baker for the Debian GNU/Linux
+system. It has been slightly revised as a generic PCRE man page.
+
+
REVISION
+
+Last updated: 18 April 2007
+
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre.html b/src/doc/html/pcre.html
new file mode 100644
index 0000000..e71edee
--- /dev/null
+++ b/src/doc/html/pcre.html
@@ -0,0 +1,304 @@
+
+
+pcre specification
+
+
+pcre man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+
INTRODUCTION
+
+The PCRE library is a set of functions that implement regular expression
+pattern matching using the same syntax and semantics as Perl, with just a few
+differences. Certain features that appeared in Python and PCRE before they
+appeared in Perl are also available using the Python syntax. There is also some
+support for certain .NET and Oniguruma syntax items, and there is an option for
+requesting some minor changes that give better JavaScript compatibility.
+
+
+The current implementation of PCRE (release 7.x) corresponds approximately with
+Perl 5.10, including support for UTF-8 encoded strings and Unicode general
+category properties. However, UTF-8 and Unicode support has to be explicitly
+enabled; it is not the default. The Unicode tables correspond to Unicode
+release 5.0.0.
+
+
+In addition to the Perl-compatible matching function, PCRE contains an
+alternative matching function that matches the same compiled patterns in a
+different way. In certain circumstances, the alternative function has some
+advantages. For a discussion of the two matching algorithms, see the
+pcrematching
+page.
+
+
+PCRE is written in C and released as a C library. A number of people have
+written wrappers and interfaces of various kinds. In particular, Google Inc.
+have provided a comprehensive C++ wrapper. This is now included as part of the
+PCRE distribution. The
+pcrecpp
+page has details of this interface. Other people's contributions can be found
+in the Contrib directory at the primary FTP site, which is:
+ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre
+
+
+Details of exactly which Perl regular expression features are and are not
+supported by PCRE are given in separate documents. See the
+pcrepattern
+and
+pcrecompat
+pages. There is a syntax summary in the
+pcresyntax
+page.
+
+
+Some features of PCRE can be included, excluded, or changed when the library is
+built. The
+pcre_config()
+function makes it possible for a client to discover which features are
+available. The features themselves are described in the
+pcrebuild
+page. Documentation about building PCRE for various operating systems can be
+found in the README file in the source distribution.
+
+
+The library contains a number of undocumented internal functions and data
+tables that are used by more than one of the exported external functions, but
+which are not intended for use by external callers. Their names all begin with
+"_pcre_", which hopefully will not provoke any name clashes. In some
+environments, it is possible to control which external symbols are exported
+when a shared library is built, and in these cases the undocumented symbols are
+not exported.
+
+
USER DOCUMENTATION
+
+The user documentation for PCRE comprises a number of different sections. In
+the "man" format, each of these is a separate "man page". In the HTML format,
+each is a separate page, linked from the index page. In the plain text format,
+all the sections are concatenated, for ease of searching. The sections are as
+follows:
+
+ pcre this document
+ pcre-config show PCRE installation configuration information
+ pcreapi details of PCRE's native C API
+ pcrebuild options for building PCRE
+ pcrecallout details of the callout feature
+ pcrecompat discussion of Perl compatibility
+ pcrecpp details of the C++ wrapper
+ pcregrep description of the pcregrep command
+ pcrematching discussion of the two matching algorithms
+ pcrepartial details of the partial matching facility
+ pcrepattern syntax and semantics of supported regular expressions
+ pcresyntax quick syntax reference
+ pcreperform discussion of performance issues
+ pcreposix the POSIX-compatible C API
+ pcreprecompile details of saving and re-using precompiled patterns
+ pcresample discussion of the sample program
+ pcrestack discussion of stack usage
+ pcretest description of the pcretest testing command
+
+In addition, in the "man" and HTML formats, there is a short page for each
+C library function, listing its arguments and results.
+
+
LIMITATIONS
+
+There are some size limitations in PCRE but it is hoped that they will never in
+practice be relevant.
+
+
+The maximum length of a compiled pattern is 65539 (sic) bytes if PCRE is
+compiled with the default internal linkage size of 2. If you want to process
+regular expressions that are truly enormous, you can compile PCRE with an
+internal linkage size of 3 or 4 (see the README file in the source
+distribution and the
+pcrebuild
+documentation for details). In these cases the limit is substantially larger.
+However, the speed of execution is slower.
+
+
+All values in repeating quantifiers must be less than 65536.
+
+
+There is no limit to the number of parenthesized subpatterns, but there can be
+no more than 65535 capturing subpatterns.
+
+
+The maximum length of a name for a named subpattern is 32 characters, and the
+maximum number of named subpatterns is 10000.
+
+
+The maximum length of a subject string is the largest positive number that an
+integer variable can hold. However, when using the traditional matching
+function, PCRE uses recursion to handle subpatterns and indefinite repetition.
+This means that the available stack space may limit the size of a subject
+string that can be processed by certain patterns. For a discussion of stack
+issues, see the
+pcrestack
+documentation.
+
+
UTF-8 AND UNICODE PROPERTY SUPPORT
+
+From release 3.3, PCRE has had some support for character strings encoded in
+the UTF-8 format. For release 4.0 this was greatly extended to cover most
+common requirements, and in release 5.0 additional support for Unicode general
+category properties was added.
+
+
+In order to process UTF-8 strings, you must build PCRE to include UTF-8 support in
+the code, and, in addition, you must call
+pcre_compile()
+with the PCRE_UTF8 option flag. When you do this, both the pattern and any
+subject strings that are matched against it are treated as UTF-8 strings
+instead of just strings of bytes.
+
+
+If you compile PCRE with UTF-8 support, but do not use it at run time, the
+library will be a bit bigger, but the additional run time overhead is limited
+to testing the PCRE_UTF8 flag occasionally, so should not be very big.
+
+
+If PCRE is built with Unicode character property support (which implies UTF-8
+support), the escape sequences \p{..}, \P{..}, and \X are supported.
+The available properties that can be tested are limited to the general
+category properties such as Lu for an upper case letter or Nd for a decimal
+number, the Unicode script names such as Arabic or Han, and the derived
+properties Any and L&. A full list is given in the
+pcrepattern
+documentation. Only the short names for properties are supported. For example,
+\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported.
+Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
+compatibility with Perl 5.6. PCRE does not support this.
+
+
+Validity of UTF-8 strings
+
+
+When you set the PCRE_UTF8 flag, the strings passed as patterns and subjects
+are (by default) checked for validity on entry to the relevant functions. From
+release 7.3 of PCRE, the check is according to the rules of RFC 3629, which are
+themselves derived from the Unicode specification. Earlier releases of PCRE
+followed the rules of RFC 2279, which allows the full range of 31-bit values (0
+to 0x7FFFFFFF). The current check allows only values in the range U+0 to
+U+10FFFF, excluding U+D800 to U+DFFF.
+
+
+The excluded code points are the "Low Surrogate Area" of Unicode, of which the
+Unicode Standard says this: "The Low Surrogate Area does not contain any
+character assignments, consequently no character code charts or namelists are
+provided for this area. Surrogates are reserved for use with UTF-16 and then
+must be used in pairs." The code points that are encoded by UTF-16 pairs are
+available as independent code points in the UTF-8 encoding. (In other words,
+the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up
+UTF-8.)
+
+
+If an invalid UTF-8 string is passed to PCRE, an error return
+(PCRE_ERROR_BADUTF8) is given. In some situations, you may already know that
+your strings are valid, and therefore want to skip these checks in order to
+improve performance. If you set the PCRE_NO_UTF8_CHECK flag at compile time or
+at run time, PCRE assumes that the pattern or subject it is given
+(respectively) contains only valid UTF-8 codes. In this case, it does not
+diagnose an invalid UTF-8 string.
+
+
+If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set, what
+happens depends on why the string is invalid. If the string conforms to the
+"old" definition of UTF-8 (RFC 2279), it is processed as a string of characters
+in the range 0 to 0x7FFFFFFF. In other words, apart from the initial validity
+test, PCRE (when in UTF-8 mode) handles strings according to the more liberal
+rules of RFC 2279. However, if the string does not even conform to RFC 2279,
+the result is undefined. Your program may crash.
+
+
+If you want to process strings of values in the full range 0 to 0x7FFFFFFF,
+encoded in a UTF-8-like manner as per the old RFC, you can set
+PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in this
+situation, you will have to apply your own validity check.
+
+
+General comments about UTF-8 mode
+
+
+1. An unbraced hexadecimal escape sequence (such as \xb3) matches a two-byte
+UTF-8 character if the value is greater than 127.
+
+
+2. Octal numbers up to \777 are recognized, and match two-byte UTF-8
+characters for values greater than \177.
+
+
+3. Repeat quantifiers apply to complete UTF-8 characters, not to individual
+bytes, for example: \x{100}{3}.
+
+
+4. The dot metacharacter matches one UTF-8 character instead of a single byte.
+
+
+5. The escape sequence \C can be used to match a single byte in UTF-8 mode,
+but its use can lead to some strange effects. This facility is not available in
+the alternative matching function, pcre_dfa_exec().
+
+
+6. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly
+test characters of any code value, but the characters that PCRE recognizes as
+digits, spaces, or word characters remain the same set as before, all with
+values less than 256. This remains true even when PCRE includes Unicode
+property support, because to do otherwise would slow down PCRE in many common
+cases. If you really want to test for a wider sense of, say, "digit", you
+must use Unicode property tests such as \p{Nd}.
+
+
+7. Similarly, characters that match the POSIX named character classes are all
+low-valued characters.
+
+
+8. However, the Perl 5.10 horizontal and vertical whitespace matching escapes
+(\h, \H, \v, and \V) do match all the appropriate Unicode characters.
+
+
+9. Case-insensitive matching applies only to characters whose values are less
+than 128, unless PCRE is built with Unicode property support. Even when Unicode
+property support is available, PCRE still uses its own character tables when
+checking the case of low-valued characters, so as not to degrade performance.
+The Unicode property information is used only for characters with higher
+values. Even when Unicode property support is available, PCRE supports
+case-insensitive matching only when there is a one-to-one mapping between a
+letter's cases. There are a small number of many-to-one mappings in Unicode;
+these are not supported by PCRE.
+
+
AUTHOR
+
+Philip Hazel
+
+University Computing Service
+
+Cambridge CB2 3QH, England.
+
+
+
+Putting an actual email address here seems to have been a spam magnet, so I've
+taken it away. If you want to email me, use my two initials, followed by the
+two digits 10, at the domain cam.ac.uk.
+
+
REVISION
+
+Last updated: 12 April 2008
+
+Copyright © 1997-2008 University of Cambridge.
+
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_compile.html b/src/doc/html/pcre_compile.html
new file mode 100644
index 0000000..396a5fb
--- /dev/null
+++ b/src/doc/html/pcre_compile.html
@@ -0,0 +1,89 @@
+
+
+pcre_compile specification
+
+
+pcre_compile man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+pcre *pcre_compile(const char *pattern, int options,
+const char **errptr, int *erroffset,
+const unsigned char *tableptr);
+
+
+DESCRIPTION
+
+
+This function compiles a regular expression into an internal form. It is the
+same as pcre_compile2(), except for the absence of the errorcodeptr
+argument. Its arguments are:
+
+ pattern A zero-terminated string containing the
+ regular expression to be compiled
+ options Zero or more option bits
+ errptr Where to put an error message
+ erroffset Offset in pattern where error was found
+ tableptr Pointer to character tables, or NULL to
+ use the built-in default
+
+The option bits are:
+
+ PCRE_ANCHORED Force pattern anchoring
+ PCRE_AUTO_CALLOUT Compile automatic callouts
+ PCRE_BSR_ANYCRLF \R matches only CR, LF, or CRLF
+ PCRE_BSR_UNICODE \R matches all Unicode line endings
+ PCRE_CASELESS Do caseless matching
+ PCRE_DOLLAR_ENDONLY $ not to match newline at end
+ PCRE_DOTALL . matches anything including NL
+ PCRE_DUPNAMES Allow duplicate names for subpatterns
+ PCRE_EXTENDED Ignore whitespace and # comments
+ PCRE_EXTRA PCRE extra features
+ (not much use currently)
+ PCRE_FIRSTLINE Force matching to be before newline
+ PCRE_JAVASCRIPT_COMPAT JavaScript compatibility
+ PCRE_MULTILINE ^ and $ match newlines within data
+ PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
+ PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline
+ sequences
+ PCRE_NEWLINE_CR Set CR as the newline sequence
+ PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
+ PCRE_NEWLINE_LF Set LF as the newline sequence
+ PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
+ theses (named ones available)
+ PCRE_UNGREEDY Invert greediness of quantifiers
+ PCRE_UTF8 Run in UTF-8 mode
+ PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
+ validity (only relevant if
+ PCRE_UTF8 is set)
+
+PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
+PCRE_NO_UTF8_CHECK.
+
+
+The yield of the function is a pointer to a private data structure that
+contains the compiled pattern, or NULL if an error was detected. Note that
+compiling regular expressions with one version of PCRE for use with a different
+version is not guaranteed to work and may cause crashes.
+
+
+There is a complete description of the PCRE native API in the
+pcreapi
+page and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_compile2.html b/src/doc/html/pcre_compile2.html
new file mode 100644
index 0000000..8d743c1
--- /dev/null
+++ b/src/doc/html/pcre_compile2.html
@@ -0,0 +1,89 @@
+
+
+pcre_compile2 specification
+
+
+pcre_compile2 man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+pcre *pcre_compile2(const char *pattern, int options,
+int *errorcodeptr,
+const char **errptr, int *erroffset,
+const unsigned char *tableptr);
+
+
+DESCRIPTION
+
+
+This function compiles a regular expression into an internal form. It is the
+same as pcre_compile(), except for the addition of the errorcodeptr
+argument. The arguments are:
+
+
+
+ pattern A zero-terminated string containing the
+ regular expression to be compiled
+ options Zero or more option bits
+ errorcodeptr Where to put an error code
+ errptr Where to put an error message
+ erroffset Offset in pattern where error was found
+ tableptr Pointer to character tables, or NULL to
+ use the built-in default
+
+The option bits are:
+
+ PCRE_ANCHORED Force pattern anchoring
+ PCRE_AUTO_CALLOUT Compile automatic callouts
+ PCRE_CASELESS Do caseless matching
+ PCRE_DOLLAR_ENDONLY $ not to match newline at end
+ PCRE_DOTALL . matches anything including NL
+ PCRE_DUPNAMES Allow duplicate names for subpatterns
+ PCRE_EXTENDED Ignore whitespace and # comments
+ PCRE_EXTRA PCRE extra features
+ (not much use currently)
+ PCRE_FIRSTLINE Force matching to be before newline
+ PCRE_MULTILINE ^ and $ match newlines within data
+ PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
+ PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
+ PCRE_NEWLINE_CR Set CR as the newline sequence
+ PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
+ PCRE_NEWLINE_LF Set LF as the newline sequence
+ PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
+ theses (named ones available)
+ PCRE_UNGREEDY Invert greediness of quantifiers
+ PCRE_UTF8 Run in UTF-8 mode
+ PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
+ validity (only relevant if
+ PCRE_UTF8 is set)
+
+PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
+PCRE_NO_UTF8_CHECK.
+
+
+The yield of the function is a pointer to a private data structure that
+contains the compiled pattern, or NULL if an error was detected. Note that
+compiling regular expressions with one version of PCRE for use with a different
+version is not guaranteed to work and may cause crashes.
+
+
+There is a complete description of the PCRE native API in the
+pcreapi
+page and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_config.html b/src/doc/html/pcre_config.html
new file mode 100644
index 0000000..40dee37
--- /dev/null
+++ b/src/doc/html/pcre_config.html
@@ -0,0 +1,70 @@
+
+
+pcre_config specification
+
+
+pcre_config man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+int pcre_config(int what, void *where);
+
+
+DESCRIPTION
+
+
+This function makes it possible for a client program to find out which optional
+features are available in the version of the PCRE library it is using. Its
+arguments are as follows:
+
+ what A code specifying what information is required
+ where Points to where to put the data
+
+The available codes are:
+
+ PCRE_CONFIG_LINK_SIZE Internal link size: 2, 3, or 4
+ PCRE_CONFIG_MATCH_LIMIT Internal resource limit
+ PCRE_CONFIG_MATCH_LIMIT_RECURSION
+ Internal recursion depth limit
+ PCRE_CONFIG_NEWLINE Value of the default newline sequence:
+ 13 (0x000d) for CR
+ 10 (0x000a) for LF
+ 3338 (0x0d0a) for CRLF
+ -2 for ANYCRLF
+ -1 for ANY
+ PCRE_CONFIG_BSR Indicates what \R matches by default:
+ 0 all Unicode line endings
+ 1 CR, LF, or CRLF only
+ PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
+ Threshold of return slots, above
+ which malloc() is used by
+ the POSIX API
+ PCRE_CONFIG_STACKRECURSE Recursion implementation (1=stack 0=heap)
+ PCRE_CONFIG_UTF8 Availability of UTF-8 support (1=yes 0=no)
+ PCRE_CONFIG_UNICODE_PROPERTIES
+ Availability of Unicode property support
+ (1=yes 0=no)
+
+The function yields 0 on success or PCRE_ERROR_BADOPTION otherwise.
+
+
+There is a complete description of the PCRE native API in the
+pcreapi
+page and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_copy_named_substring.html b/src/doc/html/pcre_copy_named_substring.html
new file mode 100644
index 0000000..2185518
--- /dev/null
+++ b/src/doc/html/pcre_copy_named_substring.html
@@ -0,0 +1,53 @@
+
+
+pcre_copy_named_substring specification
+
+
+pcre_copy_named_substring man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+int pcre_copy_named_substring(const pcre *code,
+const char *subject, int *ovector,
+int stringcount, const char *stringname,
+char *buffer, int buffersize);
+
+
+DESCRIPTION
+
+
+This is a convenience function for extracting a captured substring, identified
+by name, into a given buffer. The arguments are:
+
+ code Pattern that was successfully matched
+ subject Subject that has been successfully matched
+ ovector Offset vector that pcre_exec() used
+ stringcount Value returned by pcre_exec()
+ stringname Name of the required substring
+ buffer Buffer to receive the string
+ buffersize Size of buffer
+
+The yield is the length of the substring, PCRE_ERROR_NOMEMORY if the buffer was
+too small, or PCRE_ERROR_NOSUBSTRING if the string name is invalid.
+
+
+There is a complete description of the PCRE native API in the
+pcreapi
+page and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_copy_substring.html b/src/doc/html/pcre_copy_substring.html
new file mode 100644
index 0000000..b7d2341
--- /dev/null
+++ b/src/doc/html/pcre_copy_substring.html
@@ -0,0 +1,51 @@
+
+
+pcre_copy_substring specification
+
+
+pcre_copy_substring man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+int pcre_copy_substring(const char *subject, int *ovector,
+int stringcount, int stringnumber, char *buffer,
+int buffersize);
+
+
+DESCRIPTION
+
+
+This is a convenience function for extracting a captured substring into a given
+buffer. The arguments are:
+
+ subject Subject that has been successfully matched
+ ovector Offset vector that pcre_exec() used
+ stringcount Value returned by pcre_exec()
+ stringnumber Number of the required substring
+ buffer Buffer to receive the string
+ buffersize Size of buffer
+
+The yield is the length of the string, PCRE_ERROR_NOMEMORY if the buffer was
+too small, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
+
+
+There is a complete description of the PCRE native API in the
+pcreapi
+page and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_dfa_exec.html b/src/doc/html/pcre_dfa_exec.html
new file mode 100644
index 0000000..02e7c8d
--- /dev/null
+++ b/src/doc/html/pcre_dfa_exec.html
@@ -0,0 +1,97 @@
+
+
+pcre_dfa_exec specification
+
+
+pcre_dfa_exec man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+int pcre_dfa_exec(const pcre *code, const pcre_extra *extra,
+const char *subject, int length, int startoffset,
+int options, int *ovector, int ovecsize,
+int *workspace, int wscount);
+
+
+DESCRIPTION
+
+
+This function matches a compiled regular expression against a given subject
+string, using an alternative matching algorithm that scans the subject string
+just once (not Perl-compatible). Note that the main, Perl-compatible,
+matching function is pcre_exec(). The arguments for this function are:
+
+ code Points to the compiled pattern
+ extra Points to an associated pcre_extra structure,
+ or is NULL
+ subject Points to the subject string
+ length Length of the subject string, in bytes
+ startoffset Offset in bytes in the subject at which to
+ start matching
+ options Option bits
+ ovector Points to a vector of ints for result offsets
+ ovecsize Number of elements in the vector
+ workspace Points to a vector of ints used as working space
+ wscount Number of elements in the vector
+
+The options are:
+
+ PCRE_ANCHORED Match only at the first position
+ PCRE_BSR_ANYCRLF \R matches only CR, LF, or CRLF
+ PCRE_BSR_UNICODE \R matches all Unicode line endings
+ PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
+ PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
+ PCRE_NEWLINE_CR Set CR as the newline sequence
+ PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
+ PCRE_NEWLINE_LF Set LF as the newline sequence
+ PCRE_NOTBOL Subject is not the beginning of a line
+ PCRE_NOTEOL Subject is not the end of a line
+ PCRE_NOTEMPTY An empty string is not a valid match
+ PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
+ validity (only relevant if PCRE_UTF8
+ was set at compile time)
+ PCRE_PARTIAL Return PCRE_ERROR_PARTIAL for a partial match
+ PCRE_DFA_SHORTEST Return only the shortest match
+ PCRE_DFA_RESTART This is a restart after a partial match
+
+There are restrictions on what may appear in a pattern when using this matching
+function. Details are given in the
+pcrematching
+documentation.
+
+
+A pcre_extra structure contains the following fields:
+
+ flags Bits indicating which fields are set
+ study_data Opaque data from pcre_study()
+ match_limit Limit on internal resource use
+ match_limit_recursion Limit on internal recursion depth
+ callout_data Opaque data passed back to callouts
+ tables Points to character tables or is NULL
+
+The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
+PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
+PCRE_EXTRA_TABLES. For this matching function, the match_limit and
+match_limit_recursion fields are not used, and must not be set.
+
+
+There is a complete description of the PCRE native API in the
+pcreapi
+page and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_exec.html b/src/doc/html/pcre_exec.html
new file mode 100644
index 0000000..86dcc02
--- /dev/null
+++ b/src/doc/html/pcre_exec.html
@@ -0,0 +1,90 @@
+
+
+pcre_exec specification
+
+
+pcre_exec man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+int pcre_exec(const pcre *code, const pcre_extra *extra,
+const char *subject, int length, int startoffset,
+int options, int *ovector, int ovecsize);
+
+
+DESCRIPTION
+
+
+This function matches a compiled regular expression against a given subject
+string, using a matching algorithm that is similar to Perl's. It returns
+offsets to captured substrings. Its arguments are:
+
+ code Points to the compiled pattern
+ extra Points to an associated pcre_extra structure,
+ or is NULL
+ subject Points to the subject string
+ length Length of the subject string, in bytes
+ startoffset Offset in bytes in the subject at which to
+ start matching
+ options Option bits
+ ovector Points to a vector of ints for result offsets
+ ovecsize Number of elements in the vector (a multiple of 3)
+
+The options are:
+
+ PCRE_ANCHORED Match only at the first position
+ PCRE_BSR_ANYCRLF \R matches only CR, LF, or CRLF
+ PCRE_BSR_UNICODE \R matches all Unicode line endings
+ PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
+ PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
+ PCRE_NEWLINE_CR Set CR as the newline sequence
+ PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
+ PCRE_NEWLINE_LF Set LF as the newline sequence
+ PCRE_NOTBOL Subject is not the beginning of a line
+ PCRE_NOTEOL Subject is not the end of a line
+ PCRE_NOTEMPTY An empty string is not a valid match
+ PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
+ validity (only relevant if PCRE_UTF8
+ was set at compile time)
+ PCRE_PARTIAL Return PCRE_ERROR_PARTIAL for a partial match
+
+There are restrictions on what may appear in a pattern when partial matching is
+requested. For details, see the
+pcrepartial
+page.
+
+
+A pcre_extra structure contains the following fields:
+
+ flags Bits indicating which fields are set
+ study_data Opaque data from pcre_study()
+ match_limit Limit on internal resource use
+ match_limit_recursion Limit on internal recursion depth
+ callout_data Opaque data passed back to callouts
+ tables Points to character tables or is NULL
+
+The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
+PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
+PCRE_EXTRA_TABLES.
+
+
+There is a complete description of the PCRE native API in the
+pcreapi
+page and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_free_substring.html b/src/doc/html/pcre_free_substring.html
new file mode 100644
index 0000000..fe62614
--- /dev/null
+++ b/src/doc/html/pcre_free_substring.html
@@ -0,0 +1,40 @@
+
+
+pcre_free_substring specification
+
+
+pcre_free_substring man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+void pcre_free_substring(const char *stringptr);
+
+
+DESCRIPTION
+
+
+This is a convenience function for freeing the store obtained by a previous
+call to pcre_get_substring() or pcre_get_named_substring(). Its
+only argument is a pointer to the string.
+
+
+There is a complete description of the PCRE native API in the
+pcreapi
+page and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_free_substring_list.html b/src/doc/html/pcre_free_substring_list.html
new file mode 100644
index 0000000..a92c960
--- /dev/null
+++ b/src/doc/html/pcre_free_substring_list.html
@@ -0,0 +1,40 @@
+
+
+pcre_free_substring_list specification
+
+
+pcre_free_substring_list man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+void pcre_free_substring_list(const char **stringptr);
+
+
+DESCRIPTION
+
+
+This is a convenience function for freeing the store obtained by a previous
+call to pcre_get_substring_list(). Its only argument is a pointer to the
+list of string pointers.
+
+
+There is a complete description of the PCRE native API in the
+pcreapi
+page and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_fullinfo.html b/src/doc/html/pcre_fullinfo.html
new file mode 100644
index 0000000..48fddf5
--- /dev/null
+++ b/src/doc/html/pcre_fullinfo.html
@@ -0,0 +1,72 @@
+
+
+pcre_fullinfo specification
+
+
+pcre_fullinfo man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
+int what, void *where);
+
+
+DESCRIPTION
+
+
+This function returns information about a compiled pattern. Its arguments are:
+
+ code Compiled regular expression
+ extra Result of pcre_study() or NULL
+ what What information is required
+ where Where to put the information
+
+The following information is available:
+
+ PCRE_INFO_BACKREFMAX Number of highest back reference
+ PCRE_INFO_CAPTURECOUNT Number of capturing subpatterns
+ PCRE_INFO_DEFAULT_TABLES Pointer to default tables
+ PCRE_INFO_FIRSTBYTE Fixed first byte for a match, or
+ -1 for start of string
+ or after newline, or
+ -2 otherwise
+ PCRE_INFO_FIRSTTABLE Table of first bytes (after studying)
+ PCRE_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
+ PCRE_INFO_LASTLITERAL Literal last byte required
+ PCRE_INFO_NAMECOUNT Number of named subpatterns
+ PCRE_INFO_NAMEENTRYSIZE Size of name table entry
+ PCRE_INFO_NAMETABLE Pointer to name table
+ PCRE_INFO_OKPARTIAL Return 1 if partial matching can be tried
+ PCRE_INFO_OPTIONS Option bits used for compilation
+ PCRE_INFO_SIZE Size of compiled pattern
+ PCRE_INFO_STUDYSIZE Size of study data
+
+The yield of the function is zero on success or:
+
+ PCRE_ERROR_NULL the argument code was NULL
+ the argument where was NULL
+ PCRE_ERROR_BADMAGIC the "magic number" was not found
+ PCRE_ERROR_BADOPTION the value of what was invalid
+
+
+
+There is a complete description of the PCRE native API in the
+pcreapi
+page and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_get_named_substring.html b/src/doc/html/pcre_get_named_substring.html
new file mode 100644
index 0000000..24dc058
--- /dev/null
+++ b/src/doc/html/pcre_get_named_substring.html
@@ -0,0 +1,55 @@
+
+
+pcre_get_named_substring specification
+
+
+pcre_get_named_substring man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+int pcre_get_named_substring(const pcre *code,
+const char *subject, int *ovector,
+int stringcount, const char *stringname,
+const char **stringptr);
+
+
+DESCRIPTION
+
+
+This is a convenience function for extracting a captured substring by name. The
+arguments are:
+
+ code Compiled pattern
+ subject Subject that has been successfully matched
+ ovector Offset vector that pcre_exec() used
+ stringcount Value returned by pcre_exec()
+ stringname Name of the required substring
+ stringptr Where to put the string pointer
+
+The memory in which the substring is placed is obtained by calling
+pcre_malloc(). The convenience function pcre_free_substring() can
+be used to free it when it is no longer needed. The yield of the function is
+the length of the extracted substring, PCRE_ERROR_NOMEMORY if sufficient memory
+could not be obtained, or PCRE_ERROR_NOSUBSTRING if the string name is invalid.
+
+
+There is a complete description of the PCRE native API in the
+pcreapi
+page and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_get_stringnumber.html b/src/doc/html/pcre_get_stringnumber.html
new file mode 100644
index 0000000..43af3aa
--- /dev/null
+++ b/src/doc/html/pcre_get_stringnumber.html
@@ -0,0 +1,49 @@
+
+
+pcre_get_stringnumber specification
+
+
+pcre_get_stringnumber man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+int pcre_get_stringnumber(const pcre *code,
+const char *name);
+
+
+DESCRIPTION
+
+
+This convenience function finds the number of a named substring capturing
+parenthesis in a compiled pattern. Its arguments are:
+
+ code Compiled regular expression
+ name Name whose number is required
+
+The yield of the function is the number of the parenthesis if the name is
+found, or PCRE_ERROR_NOSUBSTRING otherwise. When duplicate names are allowed
+(PCRE_DUPNAMES is set), it is not defined which of the numbers is returned by
+pcre_get_stringnumber(). You can obtain the complete list by calling
+pcre_get_stringtable_entries().
+
+
+There is a complete description of the PCRE native API in the
+pcreapi
+page and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_get_stringtable_entries.html b/src/doc/html/pcre_get_stringtable_entries.html
new file mode 100644
index 0000000..dc20ffd
--- /dev/null
+++ b/src/doc/html/pcre_get_stringtable_entries.html
@@ -0,0 +1,52 @@
+
+
+pcre_get_stringtable_entries specification
+
+
+pcre_get_stringtable_entries man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+int pcre_get_stringtable_entries(const pcre *code,
+const char *name, char **first, char **last);
+
+
+DESCRIPTION
+
+
+This convenience function finds, for a compiled pattern, the first and last
+entries for a given name in the table that translates capturing parenthesis
+names into numbers. When names are required to be unique (PCRE_DUPNAMES is
+not set), it is usually easier to use pcre_get_stringnumber()
+instead.
+
+ code Compiled regular expression
+ name Name whose entries are required
+ first Where to return a pointer to the first entry
+ last Where to return a pointer to the last entry
+
+The yield of the function is the length of each entry, or
+PCRE_ERROR_NOSUBSTRING if none are found.
+
+
+There is a complete description of the PCRE native API, including the format of
+the table entries, in the
+pcreapi
+page, and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_get_substring.html b/src/doc/html/pcre_get_substring.html
new file mode 100644
index 0000000..9b40e4d
--- /dev/null
+++ b/src/doc/html/pcre_get_substring.html
@@ -0,0 +1,53 @@
+
+
+pcre_get_substring specification
+
+
+pcre_get_substring man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+int pcre_get_substring(const char *subject, int *ovector,
+int stringcount, int stringnumber,
+const char **stringptr);
+
+
+DESCRIPTION
+
+
+This is a convenience function for extracting a captured substring. The
+arguments are:
+
+ subject Subject that has been successfully matched
+ ovector Offset vector that pcre_exec() used
+ stringcount Value returned by pcre_exec()
+ stringnumber Number of the required substring
+ stringptr Where to put the string pointer
+
+The memory in which the substring is placed is obtained by calling
+pcre_malloc(). The convenience function pcre_free_substring() can
+be used to free it when it is no longer needed. The yield of the function is
+the length of the substring, PCRE_ERROR_NOMEMORY if sufficient memory could not
+be obtained, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
+
+
+There is a complete description of the PCRE native API in the
+pcreapi
+page and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_get_substring_list.html b/src/doc/html/pcre_get_substring_list.html
new file mode 100644
index 0000000..617a315
--- /dev/null
+++ b/src/doc/html/pcre_get_substring_list.html
@@ -0,0 +1,53 @@
+
+
+pcre_get_substring_list specification
+
+
+pcre_get_substring_list man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+int pcre_get_substring_list(const char *subject,
+int *ovector, int stringcount, const char ***listptr);
+
+
+DESCRIPTION
+
+
+This is a convenience function for extracting a list of all the captured
+substrings. The arguments are:
+
+ subject Subject that has been successfully matched
+ ovector Offset vector that pcre_exec() used
+ stringcount Value returned by pcre_exec()
+ listptr Where to put a pointer to the list
+
+The memory in which the substrings and the list are placed is obtained by
+calling pcre_malloc(). The convenience function
+pcre_free_substring_list() can be used to free it when it is no longer
+needed. A pointer to a list of pointers is put in the variable whose address is
+in listptr. The list is terminated by a NULL pointer. The yield of the
+function is zero on success or PCRE_ERROR_NOMEMORY if sufficient memory could
+not be obtained.
+
+
+There is a complete description of the PCRE native API in the
+pcreapi
+page and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_info.html b/src/doc/html/pcre_info.html
new file mode 100644
index 0000000..6693ffe
--- /dev/null
+++ b/src/doc/html/pcre_info.html
@@ -0,0 +1,39 @@
+
+
+pcre_info specification
+
+
+pcre_info man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+int pcre_info(const pcre *code, int *optptr, int
+*firstcharptr);
+
+
+DESCRIPTION
+
+
+This function is obsolete. You should be using pcre_fullinfo() instead.
+
+
+There is a complete description of the PCRE native API in the
+pcreapi
+page and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_maketables.html b/src/doc/html/pcre_maketables.html
new file mode 100644
index 0000000..cf8d69e
--- /dev/null
+++ b/src/doc/html/pcre_maketables.html
@@ -0,0 +1,42 @@
+
+
+pcre_maketables specification
+
+
+pcre_maketables man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+const unsigned char *pcre_maketables(void);
+
+
+DESCRIPTION
+
+
+This function builds a set of character tables for character values less than
+256. These can be passed to pcre_compile() to override PCRE's internal,
+built-in tables (which were made by pcre_maketables() when PCRE was
+compiled). You might want to do this if you are using a non-standard locale.
+The function yields a pointer to the tables.
+
+
+There is a complete description of the PCRE native API in the
+pcreapi
+page and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_refcount.html b/src/doc/html/pcre_refcount.html
new file mode 100644
index 0000000..b748df2
--- /dev/null
+++ b/src/doc/html/pcre_refcount.html
@@ -0,0 +1,45 @@
+
+
+pcre_refcount specification
+
+
+pcre_refcount man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+int pcre_refcount(pcre *code, int adjust);
+
+
+DESCRIPTION
+
+
+This function is used to maintain a reference count inside a data block that
+contains a compiled pattern. Its arguments are:
+
+ code Compiled regular expression
+ adjust Adjustment to reference value
+
+The yield of the function is the adjusted reference value, which is constrained
+to lie between 0 and 65535.
+
+
+There is a complete description of the PCRE native API in the
+pcreapi
+page and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_study.html b/src/doc/html/pcre_study.html
new file mode 100644
index 0000000..d290420
--- /dev/null
+++ b/src/doc/html/pcre_study.html
@@ -0,0 +1,56 @@
+
+
+pcre_study specification
+
+
+pcre_study man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+pcre_extra *pcre_study(const pcre *code, int options,
+const char **errptr);
+
+
+DESCRIPTION
+
+
+This function studies a compiled pattern, to see if additional information can
+be extracted that might speed up matching. Its arguments are:
+
+ code A compiled regular expression
+ options Options for pcre_study()
+ errptr Where to put an error message
+
+If the function succeeds, it returns a value that can be passed to
+pcre_exec() via its extra argument.
+
+
+If the function returns NULL, either it could not find any additional
+information, or there was an error. You can tell the difference by looking at
+the error value. It is NULL in the first case.
+
+
+There are currently no options defined; the value of the second argument should
+always be zero.
+
+
+There is a complete description of the PCRE native API in the
+pcreapi
+page and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcre_version.html b/src/doc/html/pcre_version.html
new file mode 100644
index 0000000..7bc8f86
--- /dev/null
+++ b/src/doc/html/pcre_version.html
@@ -0,0 +1,39 @@
+
+
+pcre_version specification
+
+
+pcre_version man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre.h>
+
+
+char *pcre_version(void);
+
+
+DESCRIPTION
+
+
+This function returns a character string that gives the version number of the
+PCRE library and the date of its release.
+
+
+There is a complete description of the PCRE native API in the
+pcreapi
+page and a description of the POSIX API in the
+pcreposix
+page.
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcreapi.html b/src/doc/html/pcreapi.html
new file mode 100644
index 0000000..266e2e6
--- /dev/null
+++ b/src/doc/html/pcreapi.html
@@ -0,0 +1,1981 @@
+
+
+pcreapi specification
+
+
+pcreapi man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+
PCRE NATIVE API
+
+#include <pcre.h>
+
+
+pcre *pcre_compile(const char *pattern, int options,
+const char **errptr, int *erroffset,
+const unsigned char *tableptr);
+
+
+pcre *pcre_compile2(const char *pattern, int options,
+int *errorcodeptr,
+const char **errptr, int *erroffset,
+const unsigned char *tableptr);
+
+
+pcre_extra *pcre_study(const pcre *code, int options,
+const char **errptr);
+
+
+int pcre_exec(const pcre *code, const pcre_extra *extra,
+const char *subject, int length, int startoffset,
+int options, int *ovector, int ovecsize);
+
+
+int pcre_dfa_exec(const pcre *code, const pcre_extra *extra,
+const char *subject, int length, int startoffset,
+int options, int *ovector, int ovecsize,
+int *workspace, int wscount);
+
+
+int pcre_copy_named_substring(const pcre *code,
+const char *subject, int *ovector,
+int stringcount, const char *stringname,
+char *buffer, int buffersize);
+
+
+int pcre_copy_substring(const char *subject, int *ovector,
+int stringcount, int stringnumber, char *buffer,
+int buffersize);
+
+
+int pcre_get_named_substring(const pcre *code,
+const char *subject, int *ovector,
+int stringcount, const char *stringname,
+const char **stringptr);
+
+
+int pcre_get_stringnumber(const pcre *code,
+const char *name);
+
+
+int pcre_get_stringtable_entries(const pcre *code,
+const char *name, char **first, char **last);
+
+
+int pcre_get_substring(const char *subject, int *ovector,
+int stringcount, int stringnumber,
+const char **stringptr);
+
+
+int pcre_get_substring_list(const char *subject,
+int *ovector, int stringcount, const char ***listptr);
+
+
+void pcre_free_substring(const char *stringptr);
+
+
+void pcre_free_substring_list(const char **stringptr);
+
+
+const unsigned char *pcre_maketables(void);
+
+
+int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
+int what, void *where);
+
+
+int pcre_info(const pcre *code, int *optptr, int
+*firstcharptr);
+
+
+int pcre_refcount(pcre *code, int adjust);
+
+
+int pcre_config(int what, void *where);
+
+
+char *pcre_version(void);
+
+
+void *(*pcre_malloc)(size_t);
+
+
+void (*pcre_free)(void *);
+
+
+void *(*pcre_stack_malloc)(size_t);
+
+
+void (*pcre_stack_free)(void *);
+
+
+int (*pcre_callout)(pcre_callout_block *);
+
+
PCRE API OVERVIEW
+
+PCRE has its own native API, which is described in this document. There are
+also some wrapper functions that correspond to the POSIX regular expression
+API. These are described in the
+pcreposix
+documentation. Both of these APIs define a set of C function calls. A C++
+wrapper is distributed with PCRE. It is documented in the
+pcrecpp
+page.
+
+
+The native API C function prototypes are defined in the header file
+pcre.h, and on Unix systems the library itself is called libpcre.
+It can normally be accessed by adding -lpcre to the command for linking
+an application that uses PCRE. The header file defines the macros PCRE_MAJOR
+and PCRE_MINOR to contain the major and minor release numbers for the library.
+Applications can use these to include support for different releases of PCRE.
+
+
+The functions pcre_compile(), pcre_compile2(), pcre_study(),
+and pcre_exec() are used for compiling and matching regular expressions
+in a Perl-compatible manner. A sample program that demonstrates the simplest
+way of using them is provided in the file called pcredemo.c in the source
+distribution. The
+pcresample
+documentation describes how to compile and run it.
+
+
+A second matching function, pcre_dfa_exec(), which is not
+Perl-compatible, is also provided. This uses a different algorithm for the
+matching. The alternative algorithm finds all possible matches (at a given
+point in the subject), and scans the subject just once. However, this algorithm
+does not return captured substrings. A description of the two matching
+algorithms and their advantages and disadvantages is given in the
+pcrematching
+documentation.
+
+
+In addition to the main compiling and matching functions, there are convenience
+functions for extracting captured substrings from a subject string that is
+matched by pcre_exec(). They are:
+
+ pcre_copy_substring()
+ pcre_copy_named_substring()
+ pcre_get_substring()
+ pcre_get_named_substring()
+ pcre_get_substring_list()
+ pcre_get_stringnumber()
+ pcre_get_stringtable_entries()
+
+pcre_free_substring() and pcre_free_substring_list() are also
+provided, to free the memory used for extracted strings.
+
+
+The function pcre_maketables() is used to build a set of character tables
+in the current locale for passing to pcre_compile(), pcre_exec(),
+or pcre_dfa_exec(). This is an optional facility that is provided for
+specialist use. Most commonly, no special tables are passed, in which case
+internal tables that are generated when PCRE is built are used.
+
+
+The function pcre_fullinfo() is used to find out information about a
+compiled pattern; pcre_info() is an obsolete version that returns only
+some of the available information, but is retained for backwards compatibility.
+The function pcre_version() returns a pointer to a string containing the
+version of PCRE and its date of release.
+
+
+The function pcre_refcount() maintains a reference count in a data block
+containing a compiled pattern. This is provided for the benefit of
+object-oriented applications.
+
+
+The global variables pcre_malloc and pcre_free initially contain
+the entry points of the standard malloc() and free() functions,
+respectively. PCRE calls the memory management functions via these variables,
+so a calling program can replace them if it wishes to intercept the calls. This
+should be done before calling any PCRE functions.
+
+
+The global variables pcre_stack_malloc and pcre_stack_free are also
+indirections to memory management functions. These special functions are used
+only when PCRE is compiled to use the heap for remembering data, instead of
+recursive function calls, when running the pcre_exec() function. See the
+pcrebuild
+documentation for details of how to do this. It is a non-standard way of
+building PCRE, for use in environments that have limited stacks. Because of the
+greater use of memory management, it runs more slowly. Separate functions are
+provided so that special-purpose external code can be used for this case. When
+used, these functions are always called in a stack-like manner (last obtained,
+first freed), and always for memory blocks of the same size. There is a
+discussion about PCRE's stack usage in the
+pcrestack
+documentation.
+
+
+The global variable pcre_callout initially contains NULL. It can be set
+by the caller to a "callout" function, which PCRE will then call at specified
+points during a matching operation. Details are given in the
+pcrecallout
+documentation.
+
+
NEWLINES
+
+PCRE supports five different conventions for indicating line breaks in
+strings: a single CR (carriage return) character, a single LF (linefeed)
+character, the two-character sequence CRLF, any of the three preceding, or any
+Unicode newline sequence. The Unicode newline sequences are the three just
+mentioned, plus the single characters VT (vertical tab, U+000B), FF (formfeed,
+U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
+(paragraph separator, U+2029).
+
+
+Each of the first three conventions is used by at least one operating system as
+its standard newline sequence. When PCRE is built, a default can be specified.
+If none is specified, LF (the Unix standard) is used. When PCRE is run, the
+default can be overridden, either when a pattern is compiled, or when it is
+matched.
+
+
+At compile time, the newline convention can be specified by the options
+argument of pcre_compile(), or it can be specified by special text at the
+start of the pattern itself; this overrides any other settings. See the
+pcrepattern
+page for details of the special character sequences.
+
+
+In the PCRE documentation the word "newline" is used to mean "the character or
+pair of characters that indicate a line break". The choice of newline
+convention affects the handling of the dot, circumflex, and dollar
+metacharacters, the handling of #-comments in /x mode, and, when CRLF is a
+recognized line ending sequence, the match position advancement for a
+non-anchored pattern. There is more detail about this in the
+section on pcre_exec() options
+below.
+
+
+The choice of newline convention does not affect the interpretation of
+the \n or \r escape sequences, nor does it affect what \R matches, which is
+controlled in a similar way, but by separate options.
+
+
MULTITHREADING
+
+The PCRE functions can be used in multi-threading applications, with the
+proviso that the memory management functions pointed to by pcre_malloc,
+pcre_free, pcre_stack_malloc, and pcre_stack_free, and the
+callout function pointed to by pcre_callout, are shared by all threads.
+
+
+The compiled form of a regular expression is not altered during matching, so
+the same compiled pattern can safely be used by several threads at once.
+
+
SAVING PRECOMPILED PATTERNS FOR LATER USE
+
+The compiled form of a regular expression can be saved and re-used at a later
+time, possibly by a different program, and even on a host other than the one on
+which it was compiled. Details are given in the
+pcreprecompile
+documentation. However, compiling a regular expression with one version of PCRE
+for use with a different version is not guaranteed to work and may cause
+crashes.
+
+
CHECKING BUILD-TIME OPTIONS
+
+int pcre_config(int what, void *where);
+
+
+The function pcre_config() makes it possible for a PCRE client to
+discover which optional features have been compiled into the PCRE library. The
+pcrebuild
+documentation has more details about these optional features.
+
+
+The first argument for pcre_config() is an integer, specifying which
+information is required; the second argument is a pointer to a variable into
+which the information is placed. The following information is available:
+
+ PCRE_CONFIG_UTF8
+
+The output is an integer that is set to one if UTF-8 support is available;
+otherwise it is set to zero.
+
+ PCRE_CONFIG_UNICODE_PROPERTIES
+
+The output is an integer that is set to one if support for Unicode character
+properties is available; otherwise it is set to zero.
+
+ PCRE_CONFIG_NEWLINE
+
+The output is an integer whose value specifies the default character sequence
+that is recognized as meaning "newline". The five values that are supported
+are: 10 for LF, 13 for CR, 3338 for CRLF, -2 for ANYCRLF, and -1 for ANY. The
+default should normally be the standard sequence for your operating system.
+
+ PCRE_CONFIG_BSR
+
+The output is an integer whose value indicates what character sequences the \R
+escape sequence matches by default. A value of 0 means that \R matches any
+Unicode line ending sequence; a value of 1 means that \R matches only CR, LF,
+or CRLF. The default can be overridden when a pattern is compiled or matched.
+
+ PCRE_CONFIG_LINK_SIZE
+
+The output is an integer that contains the number of bytes used for internal
+linkage in compiled regular expressions. The value is 2, 3, or 4. Larger values
+allow larger regular expressions to be compiled, at the expense of slower
+matching. The default value of 2 is sufficient for all but the most massive
+patterns, since it allows the compiled pattern to be up to 64K in size.
+
+ PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
+
+The output is an integer that contains the threshold above which the POSIX
+interface uses malloc() for output vectors. Further details are given in
+the
+pcreposix
+documentation.
+
+ PCRE_CONFIG_MATCH_LIMIT
+
+The output is an integer that gives the default limit for the number of
+internal matching function calls in a pcre_exec() execution. Further
+details are given with pcre_exec() below.
+
+ PCRE_CONFIG_MATCH_LIMIT_RECURSION
+
+The output is an integer that gives the default limit for the depth of
+recursion when calling the internal matching function in a pcre_exec()
+execution. Further details are given with pcre_exec() below.
+
+ PCRE_CONFIG_STACKRECURSE
+
+The output is an integer that is set to one if internal recursion when running
+pcre_exec() is implemented by recursive function calls that use the stack
+to remember their state. This is the usual way that PCRE is compiled. The
+output is zero if PCRE was compiled to use blocks of data on the heap instead
+of recursive function calls. In this case, pcre_stack_malloc and
+pcre_stack_free are called to manage memory blocks on the heap, thus
+avoiding the use of the stack.
+
+
COMPILING A PATTERN
+
+pcre *pcre_compile(const char *pattern, int options,
+const char **errptr, int *erroffset,
+const unsigned char *tableptr);
+pcre *pcre_compile2(const char *pattern, int options,
+int *errorcodeptr,
+const char **errptr, int *erroffset,
+const unsigned char *tableptr);
+
+
+Either of the functions pcre_compile() or pcre_compile2() can be
+called to compile a pattern into an internal form. The only difference between
+the two interfaces is that pcre_compile2() has an additional argument,
+errorcodeptr, via which a numerical error code can be returned.
+
+
+The pattern is a C string terminated by a binary zero, and is passed in the
+pattern argument. A pointer to a single block of memory that is obtained
+via pcre_malloc is returned. This contains the compiled code and related
+data. The pcre type is defined for the returned block; this is a typedef
+for a structure whose contents are not externally defined. It is up to the
+caller to free the memory (via pcre_free) when it is no longer required.
+
+
+Although the compiled code of a PCRE regex is relocatable, that is, it does not
+depend on memory location, the complete pcre data block is not
+fully relocatable, because it may contain a copy of the tableptr
+argument, which is an address (see below).
+
+
+The options argument contains various bit settings that affect the
+compilation. It should be zero if no options are required. The available
+options are described below. Some of them, in particular, those that are
+compatible with Perl, can also be set and unset from within the pattern (see
+the detailed description in the
+pcrepattern
+documentation). For these options, the contents of the options argument
+specifies their initial settings at the start of compilation and execution. The
+PCRE_ANCHORED and PCRE_NEWLINE_xxx options can be set at the time of
+matching as well as at compile time.
+
+
+If errptr is NULL, pcre_compile() returns NULL immediately.
+Otherwise, if compilation of a pattern fails, pcre_compile() returns
+NULL, and sets the variable pointed to by errptr to point to a textual
+error message. This is a static string that is part of the library. You must
+not try to free it. The offset from the start of the pattern to the character
+where the error was discovered is placed in the variable pointed to by
+erroffset, which must not be NULL. If it is, an immediate error is given.
+
+
+If pcre_compile2() is used instead of pcre_compile(), and the
+errorcodeptr argument is not NULL, a non-zero error code number is
+returned via this argument in the event of an error. This is in addition to the
+textual error message. Error codes and messages are listed below.
+
+
+If the final argument, tableptr, is NULL, PCRE uses a default set of
+character tables that are built when PCRE is compiled, using the default C
+locale. Otherwise, tableptr must be an address that is the result of a
+call to pcre_maketables(). This value is stored with the compiled
+pattern, and used again by pcre_exec(), unless another table pointer is
+passed to it. For more discussion, see the section on locale support below.
+
+
+This code fragment shows a typical straightforward call to pcre_compile():
+
+ pcre *re;
+ const char *error;
+ int erroffset;
+ re = pcre_compile(
+ "^A.*Z", /* the pattern */
+ 0, /* default options */
+ &error, /* for error message */
+ &erroffset, /* for error offset */
+ NULL); /* use default character tables */
+
+The following names for option bits are defined in the pcre.h header
+file:
+
+ PCRE_ANCHORED
+
+If this bit is set, the pattern is forced to be "anchored", that is, it is
+constrained to match only at the first matching point in the string that is
+being searched (the "subject string"). This effect can also be achieved by
+appropriate constructs in the pattern itself, which is the only way to do it in
+Perl.
+
+ PCRE_AUTO_CALLOUT
+
+If this bit is set, pcre_compile() automatically inserts callout items,
+all with number 255, before each pattern item. For discussion of the callout
+facility, see the
+pcrecallout
+documentation.
+
+ PCRE_BSR_ANYCRLF
+ PCRE_BSR_UNICODE
+
+These options (which are mutually exclusive) control what the \R escape
+sequence matches. The choice is either to match only CR, LF, or CRLF, or to
+match any Unicode newline sequence. The default is specified when PCRE is
+built. It can be overridden from within the pattern, or by setting an option
+when a compiled pattern is matched.
+
+ PCRE_CASELESS
+
+If this bit is set, letters in the pattern match both upper and lower case
+letters. It is equivalent to Perl's /i option, and it can be changed within a
+pattern by a (?i) option setting. In UTF-8 mode, PCRE always understands the
+concept of case for characters whose values are less than 128, so caseless
+matching is always possible. For characters with higher values, the concept of
+case is supported if PCRE is compiled with Unicode property support, but not
+otherwise. If you want to use caseless matching for characters 128 and above,
+you must ensure that PCRE is compiled with Unicode property support as well as
+with UTF-8 support.
+
+ PCRE_DOLLAR_ENDONLY
+
+If this bit is set, a dollar metacharacter in the pattern matches only at the
+end of the subject string. Without this option, a dollar also matches
+immediately before a newline at the end of the string (but not before any other
+newlines). The PCRE_DOLLAR_ENDONLY option is ignored if PCRE_MULTILINE is set.
+There is no equivalent to this option in Perl, and no way to set it within a
+pattern.
+
+ PCRE_DOTALL
+
+If this bit is set, a dot metacharacter in the pattern matches all characters,
+including those that indicate newline. Without it, a dot does not match when
+the current position is at a newline. This option is equivalent to Perl's /s
+option, and it can be changed within a pattern by a (?s) option setting. A
+negative class such as [^a] always matches newline characters, independent of
+the setting of this option.
+
+ PCRE_DUPNAMES
+
+If this bit is set, names used to identify capturing subpatterns need not be
+unique. This can be helpful for certain types of pattern when it is known that
+only one instance of the named subpattern can ever be matched. There are more
+details of named subpatterns below; see also the
+pcrepattern
+documentation.
+
+ PCRE_EXTENDED
+
+If this bit is set, whitespace data characters in the pattern are totally
+ignored except when escaped or inside a character class. Whitespace does not
+include the VT character (code 11). In addition, characters between an
+unescaped # outside a character class and the next newline, inclusive, are also
+ignored. This is equivalent to Perl's /x option, and it can be changed within a
+pattern by a (?x) option setting.
+
+
+This option makes it possible to include comments inside complicated patterns.
+Note, however, that this applies only to data characters. Whitespace characters
+may never appear within special character sequences in a pattern, for example
+within the sequence (?( which introduces a conditional subpattern.
+
+ PCRE_EXTRA
+
+This option was invented in order to turn on additional functionality of PCRE
+that is incompatible with Perl, but it is currently of very little use. When
+set, any backslash in a pattern that is followed by a letter that has no
+special meaning causes an error, thus reserving these combinations for future
+expansion. By default, as in Perl, a backslash followed by a letter with no
+special meaning is treated as a literal. (Perl can, however, be persuaded to
+give a warning for this.) There are at present no other features controlled by
+this option. It can also be set by a (?X) option setting within a pattern.
+
+ PCRE_FIRSTLINE
+
+If this option is set, an unanchored pattern is required to match before or at
+the first newline in the subject string, though the matched text may continue
+over the newline.
+
+ PCRE_JAVASCRIPT_COMPAT
+
+If this option is set, PCRE's behaviour is changed in some ways so that it is
+compatible with JavaScript rather than Perl. The changes are as follows:
+
+
+(1) A lone closing square bracket in a pattern causes a compile-time error,
+because this is illegal in JavaScript (by default it is treated as a data
+character). Thus, the pattern AB]CD becomes illegal when this option is set.
+
+
+(2) At run time, a back reference to an unset subpattern group matches an empty
+string (by default this causes the current matching alternative to fail). A
+pattern such as (\1)(a) succeeds when this option is set (assuming it can find
+an "a" in the subject), whereas it fails by default, for Perl compatibility.
+
+ PCRE_MULTILINE
+
+By default, PCRE treats the subject string as consisting of a single line of
+characters (even if it actually contains newlines). The "start of line"
+metacharacter (^) matches only at the start of the string, while the "end of
+line" metacharacter ($) matches only at the end of the string, or before a
+terminating newline (unless PCRE_DOLLAR_ENDONLY is set). This is the same as
+Perl.
+
+
+When PCRE_MULTILINE is set, the "start of line" and "end of line" constructs
+match immediately following or immediately before internal newlines in the
+subject string, respectively, as well as at the very start and end. This is
+equivalent to Perl's /m option, and it can be changed within a pattern by a
+(?m) option setting. If there are no newlines in a subject string, or no
+occurrences of ^ or $ in a pattern, setting PCRE_MULTILINE has no effect.
+
+ PCRE_NEWLINE_CR
+ PCRE_NEWLINE_LF
+ PCRE_NEWLINE_CRLF
+ PCRE_NEWLINE_ANYCRLF
+ PCRE_NEWLINE_ANY
+
+These options override the default newline definition that was chosen when PCRE
+was built. Setting the first or the second specifies that a newline is
+indicated by a single character (CR or LF, respectively). Setting
+PCRE_NEWLINE_CRLF specifies that a newline is indicated by the two-character
+CRLF sequence. Setting PCRE_NEWLINE_ANYCRLF specifies that any of the three
+preceding sequences should be recognized. Setting PCRE_NEWLINE_ANY specifies
+that any Unicode newline sequence should be recognized. The Unicode newline
+sequences are the three just mentioned, plus the single characters VT (vertical
+tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line
+separator, U+2028), and PS (paragraph separator, U+2029). The last two are
+recognized only in UTF-8 mode.
+
+
+The newline setting in the options word uses three bits that are treated
+as a number, giving eight possibilities. Currently only six are used (default
+plus the five values above). This means that if you set more than one newline
+option, the combination may or may not be sensible. For example,
+PCRE_NEWLINE_CR with PCRE_NEWLINE_LF is equivalent to PCRE_NEWLINE_CRLF, but
+other combinations may yield unused numbers and cause an error.
+
+
+The only time that a line break is specially recognized when compiling a
+pattern is if PCRE_EXTENDED is set, and an unescaped # outside a character
+class is encountered. This indicates a comment that lasts until after the next
+line break sequence. In other circumstances, line break sequences are treated
+as literal data, except that in PCRE_EXTENDED mode, both CR and LF are treated
+as whitespace characters and are therefore ignored.
+
+
+The newline option that is set at compile time becomes the default that is used
+for pcre_exec() and pcre_dfa_exec(), but it can be overridden.
+
+ PCRE_NO_AUTO_CAPTURE
+
+If this option is set, it disables the use of numbered capturing parentheses in
+the pattern. Any opening parenthesis that is not followed by ? behaves as if it
+were followed by ?: but named parentheses can still be used for capturing (and
+they acquire numbers in the usual way). There is no equivalent of this option
+in Perl.
+
+ PCRE_UNGREEDY
+
+This option inverts the "greediness" of the quantifiers so that they are not
+greedy by default, but become greedy if followed by "?". It is not compatible
+with Perl. It can also be set by a (?U) option setting within the pattern.
+
+ PCRE_UTF8
+
+This option causes PCRE to regard both the pattern and the subject as strings
+of UTF-8 characters instead of single-byte character strings. However, it is
+available only when PCRE is built to include UTF-8 support. If not, the use
+of this option provokes an error. Details of how this option changes the
+behaviour of PCRE are given in the
+section on UTF-8 support
+in the main
+pcre
+page.
+
+ PCRE_NO_UTF8_CHECK
+
+When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 string is
+automatically checked. There is a discussion about the
+validity of UTF-8 strings
+in the main
+pcre
+page. If an invalid UTF-8 sequence of bytes is found, pcre_compile()
+returns an error. If you already know that your pattern is valid, and you want
+to skip this check for performance reasons, you can set the PCRE_NO_UTF8_CHECK
+option. When it is set, the effect of passing an invalid UTF-8 string as a
+pattern is undefined. It may cause your program to crash. Note that this option
+can also be passed to pcre_exec() and pcre_dfa_exec(), to suppress
+the UTF-8 validity checking of subject strings.
+
+
COMPILATION ERROR CODES
+
+The following table lists the error codes that may be returned by
+pcre_compile2(), along with the error messages that may be returned by
+both compiling functions. As PCRE has developed, some error codes have fallen
+out of use. To avoid confusion, they have not been re-used.
+
+ 0 no error
+ 1 \ at end of pattern
+ 2 \c at end of pattern
+ 3 unrecognized character follows \
+ 4 numbers out of order in {} quantifier
+ 5 number too big in {} quantifier
+ 6 missing terminating ] for character class
+ 7 invalid escape sequence in character class
+ 8 range out of order in character class
+ 9 nothing to repeat
+ 10 [this code is not in use]
+ 11 internal error: unexpected repeat
+ 12 unrecognized character after (? or (?-
+ 13 POSIX named classes are supported only within a class
+ 14 missing )
+ 15 reference to non-existent subpattern
+ 16 erroffset passed as NULL
+ 17 unknown option bit(s) set
+ 18 missing ) after comment
+ 19 [this code is not in use]
+ 20 regular expression is too large
+ 21 failed to get memory
+ 22 unmatched parentheses
+ 23 internal error: code overflow
+ 24 unrecognized character after (?<
+ 25 lookbehind assertion is not fixed length
+ 26 malformed number or name after (?(
+ 27 conditional group contains more than two branches
+ 28 assertion expected after (?(
+ 29 (?R or (?[+-]digits must be followed by )
+ 30 unknown POSIX class name
+ 31 POSIX collating elements are not supported
+ 32 this version of PCRE is not compiled with PCRE_UTF8 support
+ 33 [this code is not in use]
+ 34 character value in \x{...} sequence is too large
+ 35 invalid condition (?(0)
+ 36 \C not allowed in lookbehind assertion
+ 37 PCRE does not support \L, \l, \N, \U, or \u
+ 38 number after (?C is > 255
+ 39 closing ) for (?C expected
+ 40 recursive call could loop indefinitely
+ 41 unrecognized character after (?P
+ 42 syntax error in subpattern name (missing terminator)
+ 43 two named subpatterns have the same name
+ 44 invalid UTF-8 string
+ 45 support for \P, \p, and \X has not been compiled
+ 46 malformed \P or \p sequence
+ 47 unknown property name after \P or \p
+ 48 subpattern name is too long (maximum 32 characters)
+ 49 too many named subpatterns (maximum 10000)
+ 50 [this code is not in use]
+ 51 octal value is greater than \377 (not in UTF-8 mode)
+ 52 internal error: overran compiling workspace
+ 53 internal error: previously-checked referenced subpattern not found
+ 54 DEFINE group contains more than one branch
+ 55 repeating a DEFINE group is not allowed
+ 56 inconsistent NEWLINE options
+ 57 \g is not followed by a braced, angle-bracketed, or quoted
+ name/number or by a plain number
+ 58 a numbered reference must not be zero
+ 59 (*VERB) with an argument is not supported
+ 60 (*VERB) not recognized
+ 61 number is too big
+ 62 subpattern name expected
+ 63 digit expected after (?+
+ 64 ] is an invalid data character in JavaScript compatibility mode
+
+The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may
+be used if the limits were changed when PCRE was built.
+
+
STUDYING A PATTERN
+
+pcre_extra *pcre_study(const pcre *code, int options,
+const char **errptr);
+
+
+If a compiled pattern is going to be used several times, it is worth spending
+more time analyzing it in order to speed up the time taken for matching. The
+function pcre_study() takes a pointer to a compiled pattern as its first
+argument. If studying the pattern produces additional information that will
+help speed up matching, pcre_study() returns a pointer to a
+pcre_extra block, in which the study_data field points to the
+results of the study.
+
+
+The returned value from pcre_study() can be passed directly to
+pcre_exec(). However, a pcre_extra block also contains other
+fields that can be set by the caller before the block is passed; these are
+described
+below
+in the section on matching a pattern.
+
+
+If studying the pattern does not produce any additional information
+pcre_study() returns NULL. In that circumstance, if the calling program
+wants to pass any of the other fields to pcre_exec(), it must set up its
+own pcre_extra block.
+
+
+The second argument of pcre_study() contains option bits. At present, no
+options are defined, and this argument should always be zero.
+
+
+The third argument for pcre_study() is a pointer for an error message. If
+studying succeeds (even if no data is returned), the variable it points to is
+set to NULL. Otherwise it is set to point to a textual error message. This is a
+static string that is part of the library. You must not try to free it. You
+should test the error pointer for NULL after calling pcre_study(), to be
+sure that it has run successfully.
+
+
+This is a typical call to pcre_study():
+
+ pcre_extra *pe;
+ pe = pcre_study(
+ re, /* result of pcre_compile() */
+ 0, /* no options exist */
+ &error); /* set to NULL or points to a message */
+
+At present, studying a pattern is useful only for non-anchored patterns that do
+not have a single fixed starting character. A bitmap of possible starting
+bytes is created.
+
+
LOCALE SUPPORT
+
+PCRE handles caseless matching, and determines whether characters are letters,
+digits, or whatever, by reference to a set of tables, indexed by character
+value. When running in UTF-8 mode, this applies only to characters with codes
+less than 128. Higher-valued codes never match escapes such as \w or \d, but
+can be tested with \p if PCRE is built with Unicode character property
+support. The use of locales with Unicode is discouraged. If you are handling
+characters with codes of 128 or above, you should either use UTF-8 and
+Unicode, or use locales, but not try to mix the two.
+
+
+PCRE contains an internal set of tables that are used when the final argument
+of pcre_compile() is NULL. These are sufficient for many applications.
+Normally, the internal tables recognize only ASCII characters. However, when
+PCRE is built, it is possible to cause the internal tables to be rebuilt in the
+default "C" locale of the local system, which may cause them to be different.
+
+
+The internal tables can always be overridden by tables supplied by the
+application that calls PCRE. These may be created in a different locale from
+the default. As more and more applications change to using Unicode, the need
+for this locale support is expected to die away.
+
+
+External tables are built by calling the pcre_maketables() function,
+which has no arguments, in the relevant locale. The result can then be passed
+to pcre_compile() or pcre_exec() as often as necessary. For
+example, to build and use tables that are appropriate for the French locale
+(where accented characters with values greater than 128 are treated as letters),
+the following code could be used:
+
+ setlocale(LC_CTYPE, "fr_FR");
+ tables = pcre_maketables();
+ re = pcre_compile(..., tables);
+
+The locale name "fr_FR" is used on Linux and other Unix-like systems; if you
+are using Windows, the name for the French locale is "french".
+
+
+When pcre_maketables() runs, the tables are built in memory that is
+obtained via pcre_malloc. It is the caller's responsibility to ensure
+that the memory containing the tables remains available for as long as it is
+needed.
+
+
+The pointer that is passed to pcre_compile() is saved with the compiled
+pattern, and the same tables are used via this pointer by pcre_study()
+and normally also by pcre_exec(). Thus, by default, for any single
+pattern, compilation, studying and matching all happen in the same locale, but
+different patterns can be compiled in different locales.
+
+
+It is possible to pass a table pointer or NULL (indicating the use of the
+internal tables) to pcre_exec(). Although not intended for this purpose,
+this facility could be used to match a pattern in a different locale from the
+one in which it was compiled. Passing table pointers at run time is discussed
+below in the section on matching a pattern.
+
+
INFORMATION ABOUT A PATTERN
+
+int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
+int what, void *where);
+
+
+The pcre_fullinfo() function returns information about a compiled
+pattern. It replaces the obsolete pcre_info() function, which is
+nevertheless retained for backwards compatibility (and is documented below).
+
+
+The first argument for pcre_fullinfo() is a pointer to the compiled
+pattern. The second argument is the result of pcre_study(), or NULL if
+the pattern was not studied. The third argument specifies which piece of
+information is required, and the fourth argument is a pointer to a variable
+to receive the data. The yield of the function is zero for success, or one of
+the following negative numbers:
+
+ PCRE_ERROR_NULL the argument code was NULL
+ the argument where was NULL
+ PCRE_ERROR_BADMAGIC the "magic number" was not found
+ PCRE_ERROR_BADOPTION the value of what was invalid
+
+The "magic number" is placed at the start of each compiled pattern as a simple
+check against passing an arbitrary memory pointer. Here is a typical call of
+pcre_fullinfo(), to obtain the length of the compiled pattern:
+
+ int rc;
+ size_t length;
+ rc = pcre_fullinfo(
+ re, /* result of pcre_compile() */
+ pe, /* result of pcre_study(), or NULL */
+ PCRE_INFO_SIZE, /* what is required */
+ &length); /* where to put the data */
+
+The possible values for the third argument are defined in pcre.h, and are
+as follows:
+
+ PCRE_INFO_BACKREFMAX
+
+Return the number of the highest back reference in the pattern. The fourth
+argument should point to an int variable. Zero is returned if there are
+no back references.
+
+ PCRE_INFO_CAPTURECOUNT
+
+Return the number of capturing subpatterns in the pattern. The fourth argument
+should point to an int variable.
+
+ PCRE_INFO_DEFAULT_TABLES
+
+Return a pointer to the internal default character tables within PCRE. The
+fourth argument should point to an unsigned char * variable. This
+information call is provided for internal use by the pcre_study()
+function. External callers can cause PCRE to use its internal tables by passing
+a NULL table pointer.
+
+ PCRE_INFO_FIRSTBYTE
+
+Return information about the first byte of any matched string, for a
+non-anchored pattern. The fourth argument should point to an int
+variable. (This option used to be called PCRE_INFO_FIRSTCHAR; the old name is
+still recognized for backwards compatibility.)
+
+
+If there is a fixed first byte, for example, from a pattern such as
+(cat|cow|coyote), its value is returned. Otherwise, if either
+
+
+(a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
+starts with "^", or
+
+
+(b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set
+(if it were set, the pattern would be anchored),
+
+
+-1 is returned, indicating that the pattern matches only at the start of a
+subject string or after any newline within the string. Otherwise -2 is
+returned. For anchored patterns, -2 is returned.
+
+ PCRE_INFO_FIRSTTABLE
+
+If the pattern was studied, and this resulted in the construction of a 256-bit
+table indicating a fixed set of bytes for the first byte in any matching
+string, a pointer to the table is returned. Otherwise NULL is returned. The
+fourth argument should point to an unsigned char * variable.
+
+ PCRE_INFO_HASCRORLF
+
+Return 1 if the pattern contains any explicit matches for CR or LF characters,
+otherwise 0. The fourth argument should point to an int variable. An
+explicit match is either a literal CR or LF character, or \r or \n.
+
+ PCRE_INFO_JCHANGED
+
+Return 1 if the (?J) or (?-J) option setting is used in the pattern, otherwise
+0. The fourth argument should point to an int variable. (?J) and
+(?-J) set and unset the local PCRE_DUPNAMES option, respectively.
+
+ PCRE_INFO_LASTLITERAL
+
+Return the value of the rightmost literal byte that must exist in any matched
+string, other than at its start, if such a byte has been recorded. The fourth
+argument should point to an int variable. If there is no such byte, -1 is
+returned. For anchored patterns, a last literal byte is recorded only if it
+follows something of variable length. For example, for the pattern
+/^a\d+z\d+/ the returned value is "z", but for /^a\dz\d/ the returned value
+is -1.
+
+ PCRE_INFO_NAMECOUNT
+ PCRE_INFO_NAMEENTRYSIZE
+ PCRE_INFO_NAMETABLE
+
+PCRE supports the use of named as well as numbered capturing parentheses. The
+names are just an additional way of identifying the parentheses, which still
+acquire numbers. Several convenience functions such as
+pcre_get_named_substring() are provided for extracting captured
+substrings by name. It is also possible to extract the data directly, by first
+converting the name to a number in order to access the correct pointers in the
+output vector (described with pcre_exec() below). To do the conversion,
+you need to use the name-to-number map, which is described by these three
+values.
+
+
+The map consists of a number of fixed-size entries. PCRE_INFO_NAMECOUNT gives
+the number of entries, and PCRE_INFO_NAMEENTRYSIZE gives the size of each
+entry; both of these return an int value. The entry size depends on the
+length of the longest name. PCRE_INFO_NAMETABLE returns a pointer to the first
+entry of the table (a pointer to char). The first two bytes of each entry
+are the number of the capturing parenthesis, most significant byte first. The
+rest of the entry is the corresponding name, zero terminated. The names are in
+alphabetical order. When PCRE_DUPNAMES is set, duplicate names are in order of
+their parentheses numbers. For example, consider the following pattern (assume
+PCRE_EXTENDED is set, so white space - including newlines - is ignored):
+
+ (?<date> (?<year>(\d\d)?\d\d) - (?<month>\d\d) - (?<day>\d\d) )
+
+There are four named subpatterns, so the table has four entries, and each entry
+in the table is eight bytes long. The table is as follows, with non-printing
+bytes shown in hexadecimal, and undefined bytes shown as ??:
+
+ 00 01 d a t e 00 ??
+ 00 05 d a y 00 ?? ??
+ 00 04 m o n t h 00
+ 00 02 y e a r 00 ??
+
+When writing code to extract data from named subpatterns using the
+name-to-number map, remember that the length of the entries is likely to be
+different for each compiled pattern.
+
+ PCRE_INFO_OKPARTIAL
+
+Return 1 if the pattern can be used for partial matching, otherwise 0. The
+fourth argument should point to an int variable. The
+pcrepartial
+documentation lists the restrictions that apply to patterns when partial
+matching is used.
+
+ PCRE_INFO_OPTIONS
+
+Return a copy of the options with which the pattern was compiled. The fourth
+argument should point to an unsigned long int variable. These option bits
+are those specified in the call to pcre_compile(), modified by any
+top-level option settings at the start of the pattern itself. In other words,
+they are the options that will be in force when matching starts. For example,
+if the pattern /(?im)abc(?-i)d/ is compiled with the PCRE_EXTENDED option, the
+result is PCRE_CASELESS, PCRE_MULTILINE, and PCRE_EXTENDED.
+
+
+A pattern is automatically anchored by PCRE if all of its top-level
+alternatives begin with one of the following:
+
+ ^ unless PCRE_MULTILINE is set
+ \A always
+ \G always
+ .* if PCRE_DOTALL is set and there are no back references to the subpattern in which .* appears
+
+For such patterns, the PCRE_ANCHORED bit is set in the options returned by
+pcre_fullinfo().
+
+ PCRE_INFO_SIZE
+
+Return the size of the compiled pattern, that is, the value that was passed as
+the argument to pcre_malloc() when PCRE was getting memory in which to
+place the compiled data. The fourth argument should point to a size_t
+variable.
+
+ PCRE_INFO_STUDYSIZE
+
+Return the size of the data block pointed to by the study_data field in
+a pcre_extra block. That is, it is the value that was passed to
+pcre_malloc() when PCRE was getting memory into which to place the data
+created by pcre_study(). The fourth argument should point to a
+size_t variable.
+
+
OBSOLETE INFO FUNCTION
+
+int pcre_info(const pcre *code, int *optptr, int
+*firstcharptr);
+
+
+The pcre_info() function is now obsolete because its interface is too
+restrictive to return all the available data about a compiled pattern. New
+programs should use pcre_fullinfo() instead. The yield of
+pcre_info() is the number of capturing subpatterns, or one of the
+following negative numbers:
+
+ PCRE_ERROR_NULL the argument code was NULL
+ PCRE_ERROR_BADMAGIC the "magic number" was not found
+
+If the optptr argument is not NULL, a copy of the options with which the
+pattern was compiled is placed in the integer it points to (see
+PCRE_INFO_OPTIONS above).
+
+
+If the pattern is not anchored and the firstcharptr argument is not NULL,
+it is used to pass back information about the first character of any matched
+string (see PCRE_INFO_FIRSTBYTE above).
+
+
REFERENCE COUNTS
+
+int pcre_refcount(pcre *code, int adjust);
+
+
+The pcre_refcount() function is used to maintain a reference count in the
+data block that contains a compiled pattern. It is provided for the benefit of
+applications that operate in an object-oriented manner, where different parts
+of the application may be using the same compiled pattern, but you want to free
+the block when they are all done.
+
+
+When a pattern is compiled, the reference count field is initialized to zero.
+It is changed only by calling this function, whose action is to add the
+adjust value (which may be positive or negative) to it. The yield of the
+function is the new value. However, the value of the count is constrained to
+lie between 0 and 65535, inclusive. If the new value is outside these limits,
+it is forced to the appropriate limit value.
+
+
+Except when it is zero, the reference count is not correctly preserved if a
+pattern is compiled on one host and then transferred to a host whose byte-order
+is different. (This seems a highly unlikely scenario.)
+
+
MATCHING A PATTERN: THE TRADITIONAL FUNCTION
+
+int pcre_exec(const pcre *code, const pcre_extra *extra,
+const char *subject, int length, int startoffset,
+int options, int *ovector, int ovecsize);
+
+
+The function pcre_exec() is called to match a subject string against a
+compiled pattern, which is passed in the code argument. If the
+pattern has been studied, the result of the study should be passed in the
+extra argument. This function is the main matching facility of the
+library, and it operates in a Perl-like manner. For specialist use there is
+also an alternative matching function, which is described
+below
+in the section about the pcre_dfa_exec() function.
+
+
+In most applications, the pattern will have been compiled (and optionally
+studied) in the same process that calls pcre_exec(). However, it is
+possible to save compiled patterns and study data, and then use them later
+in different processes, possibly even on different hosts. For a discussion
+about this, see the
+pcreprecompile
+documentation.
+
+
+Here is an example of a simple call to pcre_exec():
+
+ int rc;
+ int ovector[30];
+ rc = pcre_exec(
+ re, /* result of pcre_compile() */
+ NULL, /* we didn't study the pattern */
+ "some string", /* the subject string */
+ 11, /* the length of the subject string */
+ 0, /* start at offset 0 in the subject */
+ 0, /* default options */
+ ovector, /* vector of integers for substring information */
+ 30); /* number of elements (NOT size in bytes) */
+
+
+
+Extra data for pcre_exec()
+
+
+If the extra argument is not NULL, it must point to a pcre_extra
+data block. The pcre_study() function returns such a block (when it
+doesn't return NULL), but you can also create one for yourself, and pass
+additional information in it. The pcre_extra block contains the following
+fields (not necessarily in this order):
+
+ unsigned long int flags;
+ void *study_data;
+ unsigned long int match_limit;
+ unsigned long int match_limit_recursion;
+ void *callout_data;
+ const unsigned char *tables;
+
+The flags field is a bitmap that specifies which of the other fields
+are set. The flag bits are:
+
+ PCRE_EXTRA_STUDY_DATA
+ PCRE_EXTRA_MATCH_LIMIT
+ PCRE_EXTRA_MATCH_LIMIT_RECURSION
+ PCRE_EXTRA_CALLOUT_DATA
+ PCRE_EXTRA_TABLES
+
+Other flag bits should be set to zero. The study_data field is set in the
+pcre_extra block that is returned by pcre_study(), together with
+the appropriate flag bit. You should not set this yourself, but you may add to
+the block by setting the other fields and their corresponding flag bits.
+
+
+The match_limit field provides a means of preventing PCRE from using up a
+vast amount of resources when running patterns that are not going to match,
+but which have a very large number of possibilities in their search trees. The
+classic example is the use of nested unlimited repeats.
+
+
+Internally, PCRE uses a function called match() which it calls repeatedly
+(sometimes recursively). The limit set by match_limit is imposed on the
+number of times this function is called during a match, which has the effect of
+limiting the amount of backtracking that can take place. For patterns that are
+not anchored, the count restarts from zero for each position in the subject
+string.
+
+
+The default value for the limit can be set when PCRE is built; the default
+default is 10 million, which handles all but the most extreme cases. You can
+override the default by supplying pcre_exec() with a pcre_extra
+block in which match_limit is set, and PCRE_EXTRA_MATCH_LIMIT is set in
+the flags field. If the limit is exceeded, pcre_exec() returns
+PCRE_ERROR_MATCHLIMIT.
+
+
+The match_limit_recursion field is similar to match_limit, but
+instead of limiting the total number of times that match() is called, it
+limits the depth of recursion. The recursion depth is a smaller number than the
+total number of calls, because not all calls to match() are recursive.
+This limit is of use only if it is set smaller than match_limit.
+
+
+Limiting the recursion depth limits the amount of stack that can be used, or,
+when PCRE has been compiled to use memory on the heap instead of the stack, the
+amount of heap memory that can be used.
+
+
+The default value for match_limit_recursion can be set when PCRE is
+built; the default default is the same value as the default for
+match_limit. You can override the default by supplying pcre_exec()
+with a pcre_extra block in which match_limit_recursion is set, and
+PCRE_EXTRA_MATCH_LIMIT_RECURSION is set in the flags field. If the limit
+is exceeded, pcre_exec() returns PCRE_ERROR_RECURSIONLIMIT.
+
+
+The callout_data field is used in conjunction with the "callout" feature,
+which is described in the
+pcrecallout
+documentation.
+
+
+The tables field is used to pass a character tables pointer to
+pcre_exec(); this overrides the value that is stored with the compiled
+pattern. A non-NULL value is stored with the compiled pattern only if custom
+tables were supplied to pcre_compile() via its tableptr argument.
+If NULL is passed to pcre_exec() using this mechanism, it forces PCRE's
+internal tables to be used. This facility is helpful when re-using patterns
+that have been saved after compiling with an external set of tables, because
+the external tables might be at a different address when pcre_exec() is
+called. See the
+pcreprecompile
+documentation for a discussion of saving compiled patterns for later use.
+
+
+Option bits for pcre_exec()
+
+
+The unused bits of the options argument for pcre_exec() must be
+zero. The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_xxx,
+PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_UTF8_CHECK and PCRE_PARTIAL.
+
+ PCRE_ANCHORED
+
+The PCRE_ANCHORED option limits pcre_exec() to matching at the first
+matching position. If a pattern was compiled with PCRE_ANCHORED, or turned out
+to be anchored by virtue of its contents, it cannot be made unanchored at
+matching time.
+
+ PCRE_BSR_ANYCRLF
+ PCRE_BSR_UNICODE
+
+These options (which are mutually exclusive) control what the \R escape
+sequence matches. The choice is either to match only CR, LF, or CRLF, or to
+match any Unicode newline sequence. These options override the choice that was
+made or defaulted when the pattern was compiled.
+
+ PCRE_NEWLINE_CR
+ PCRE_NEWLINE_LF
+ PCRE_NEWLINE_CRLF
+ PCRE_NEWLINE_ANYCRLF
+ PCRE_NEWLINE_ANY
+
+These options override the newline definition that was chosen or defaulted when
+the pattern was compiled. For details, see the description of
+pcre_compile() above. During matching, the newline choice affects the
+behaviour of the dot, circumflex, and dollar metacharacters. It may also alter
+the way the match position is advanced after a match failure for an unanchored
+pattern.
+
+
+When PCRE_NEWLINE_CRLF, PCRE_NEWLINE_ANYCRLF, or PCRE_NEWLINE_ANY is set, and a
+match attempt for an unanchored pattern fails when the current position is at a
+CRLF sequence, and the pattern contains no explicit matches for CR or LF
+characters, the match position is advanced by two characters instead of one, in
+other words, to after the CRLF.
+
+
+The above rule is a compromise that makes the most common cases work as
+expected. For example, if the pattern is .+A (and the PCRE_DOTALL option is not
+set), it does not match the string "\r\nA" because, after failing at the
+start, it skips both the CR and the LF before retrying. However, the pattern
+[\r\n]A does match that string, because it contains an explicit CR or LF
+reference, and so advances only by one character after the first failure.
+
+
+An explicit match for CR or LF is either a literal appearance of one of those
+characters, or one of the \r or \n escape sequences. Implicit matches such as
+[^X] do not count, nor does \s (which includes CR and LF in the characters
+that it matches).
+
+
+Notwithstanding the above, anomalous effects may still occur when CRLF is a
+valid newline sequence and explicit \r or \n escapes appear in the pattern.
+
+ PCRE_NOTBOL
+
+This option specifies that the first character of the subject string is not the
+beginning of a line, so the circumflex metacharacter should not match before
+it. Setting this without PCRE_MULTILINE (at compile time) causes circumflex
+never to match. This option affects only the behaviour of the circumflex
+metacharacter. It does not affect \A.
+
+ PCRE_NOTEOL
+
+This option specifies that the end of the subject string is not the end of a
+line, so the dollar metacharacter should not match it nor (except in multiline
+mode) a newline immediately before it. Setting this without PCRE_MULTILINE (at
+compile time) causes dollar never to match. This option affects only the
+behaviour of the dollar metacharacter. It does not affect \Z or \z.
+
+ PCRE_NOTEMPTY
+
+An empty string is not considered to be a valid match if this option is set. If
+there are alternatives in the pattern, they are tried. If all the alternatives
+match the empty string, the entire match fails. For example, if the pattern
+
+ a?b?
+
+is applied to a string not beginning with "a" or "b", it matches the empty
+string at the start of the subject. With PCRE_NOTEMPTY set, this match is not
+valid, so PCRE searches further into the string for occurrences of "a" or "b".
+
+
+Perl has no direct equivalent of PCRE_NOTEMPTY, but it does make a special case
+of a pattern match of the empty string within its split() function, and
+when using the /g modifier. It is possible to emulate Perl's behaviour after
+matching a null string by first trying the match again at the same offset with
+PCRE_NOTEMPTY and PCRE_ANCHORED, and then if that fails by advancing the
+starting offset (see below) and trying an ordinary match again. There is some
+code that demonstrates how to do this in the pcredemo.c sample program.
+
+ PCRE_NO_UTF8_CHECK
+
+When PCRE_UTF8 is set at compile time, the validity of the subject as a UTF-8
+string is automatically checked when pcre_exec() is subsequently called.
+The value of startoffset is also checked to ensure that it points to the
+start of a UTF-8 character. There is a discussion about the validity of UTF-8
+strings in the
+section on UTF-8 support
+in the main
+pcre
+page. If an invalid UTF-8 sequence of bytes is found, pcre_exec() returns
+the error PCRE_ERROR_BADUTF8. If startoffset contains an invalid value,
+PCRE_ERROR_BADUTF8_OFFSET is returned.
+
+
+If you already know that your subject is valid, and you want to skip these
+checks for performance reasons, you can set the PCRE_NO_UTF8_CHECK option when
+calling pcre_exec(). You might want to do this for the second and
+subsequent calls to pcre_exec() if you are making repeated calls to find
+all the matches in a single subject string. However, you should be sure that
+the value of startoffset points to the start of a UTF-8 character. When
+PCRE_NO_UTF8_CHECK is set, the effect of passing an invalid UTF-8 string as a
+subject, or a value of startoffset that does not point to the start of a
+UTF-8 character, is undefined. Your program may crash.
+
+ PCRE_PARTIAL
+
+This option turns on the partial matching feature. If the subject string fails
+to match the pattern, but at some point during the matching process the end of
+the subject was reached (that is, the subject partially matches the pattern and
+the failure to match occurred only because there were not enough subject
+characters), pcre_exec() returns PCRE_ERROR_PARTIAL instead of
+PCRE_ERROR_NOMATCH. When PCRE_PARTIAL is used, there are restrictions on what
+may appear in the pattern. These are discussed in the
+pcrepartial
+documentation.
+
+
+The string to be matched by pcre_exec()
+
+
+The subject string is passed to pcre_exec() as a pointer in
+subject, a length in length, and a starting byte offset in
+startoffset. In UTF-8 mode, the byte offset must point to the start of a
+UTF-8 character. Unlike the pattern string, the subject may contain binary zero
+bytes. When the starting offset is zero, the search for a match starts at the
+beginning of the subject, and this is by far the most common case.
+
+
+A non-zero starting offset is useful when searching for another match in the
+same subject by calling pcre_exec() again after a previous success.
+Setting startoffset differs from just passing over a shortened string and
+setting PCRE_NOTBOL in the case of a pattern that begins with any kind of
+lookbehind. For example, consider the pattern
+
+ \Biss\B
+
+which finds occurrences of "iss" in the middle of words. (\B matches only if
+the current position in the subject is not a word boundary.) When applied to
+the string "Mississippi" the first call to pcre_exec() finds the first
+occurrence. If pcre_exec() is called again with just the remainder of the
+subject, namely "issippi", it does not match, because \B is always false at the
+start of the subject, which is deemed to be a word boundary. However, if
+pcre_exec() is passed the entire string again, but with startoffset
+set to 4, it finds the second occurrence of "iss" because it is able to look
+behind the starting point to discover that it is preceded by a letter.
+
+
+If a non-zero starting offset is passed when the pattern is anchored, one
+attempt to match at the given offset is made. This can only succeed if the
+pattern does not require the match to be at the start of the subject.
+
+
+How pcre_exec() returns captured substrings
+
+
+In general, a pattern matches a certain portion of the subject, and in
+addition, further substrings from the subject may be picked out by parts of the
+pattern. Following the usage in Jeffrey Friedl's book, this is called
+"capturing" in what follows, and the phrase "capturing subpattern" is used for
+a fragment of a pattern that picks out a substring. PCRE supports several other
+kinds of parenthesized subpattern that do not cause substrings to be captured.
+
+
+Captured substrings are returned to the caller via a vector of integer offsets
+whose address is passed in ovector. The number of elements in the vector
+is passed in ovecsize, which must be a non-negative number. Note:
+this argument is NOT the size of ovector in bytes.
+
+
+The first two-thirds of the vector is used to pass back captured substrings,
+each substring using a pair of integers. The remaining third of the vector is
+used as workspace by pcre_exec() while matching capturing subpatterns,
+and is not available for passing back information. The length passed in
+ovecsize should always be a multiple of three. If it is not, it is
+rounded down.
+
+
+When a match is successful, information about captured substrings is returned
+in pairs of integers, starting at the beginning of ovector, and
+continuing up to two-thirds of its length at the most. The first element of a
+pair is set to the offset of the first character in a substring, and the second
+is set to the offset of the first character after the end of a substring. The
+first pair, ovector[0] and ovector[1], identify the portion of the
+subject string matched by the entire pattern. The next pair is used for the
+first capturing subpattern, and so on. The value returned by pcre_exec()
+is one more than the highest numbered pair that has been set. For example, if
+two substrings have been captured, the returned value is 3. If there are no
+capturing subpatterns, the return value from a successful match is 1,
+indicating that just the first pair of offsets has been set.
+
+
+If a capturing subpattern is matched repeatedly, it is the last portion of the
+string that it matched that is returned.
+
+
+If the vector is too small to hold all the captured substring offsets, it is
+used as far as possible (up to two-thirds of its length), and the function
+returns a value of zero. In particular, if the substring offsets are not of
+interest, pcre_exec() may be called with ovector passed as NULL and
+ovecsize as zero. However, if the pattern contains back references and
+the ovector is not big enough to remember the related substrings, PCRE
+has to get additional memory for use during matching. Thus it is usually
+advisable to supply an ovector.
+
+
+The pcre_info() function can be used to find out how many capturing
+subpatterns there are in a compiled pattern. The smallest size for
+ovector that will allow for n captured substrings, in addition to
+the offsets of the substring matched by the whole pattern, is (n+1)*3.
+
+
+It is possible for capturing subpattern number n+1 to match some part of
+the subject when subpattern n has not been used at all. For example, if
+the string "abc" is matched against the pattern (a|(z))(bc) the return from the
+function is 4, and subpatterns 1 and 3 are matched, but 2 is not. When this
+happens, both values in the offset pairs corresponding to unused subpatterns
+are set to -1.
+
+
+Offset values that correspond to unused subpatterns at the end of the
+expression are also set to -1. For example, if the string "abc" is matched
+against the pattern (abc)(x(yz)?)? subpatterns 2 and 3 are not matched. The
+return from the function is 2, because the highest used capturing subpattern
+number is 1. However, you can refer to the offsets for the second and third
+capturing subpatterns if you wish (assuming the vector is large enough, of
+course).
+
+
+Some convenience functions are provided for extracting the captured substrings
+as separate strings. These are described below.
+
+
+Error return values from pcre_exec()
+
+
+If pcre_exec() fails, it returns a negative number. The following are
+defined in the header file:
+
+ PCRE_ERROR_NOMATCH (-1)
+
+The subject string did not match the pattern.
+
+ PCRE_ERROR_NULL (-2)
+
+Either code or subject was passed as NULL, or ovector was
+NULL and ovecsize was not zero.
+
+ PCRE_ERROR_BADOPTION (-3)
+
+An unrecognized bit was set in the options argument.
+
+ PCRE_ERROR_BADMAGIC (-4)
+
+PCRE stores a 4-byte "magic number" at the start of the compiled code, to catch
+the case when it is passed a junk pointer and to detect when a pattern that was
+compiled in an environment of one endianness is run in an environment with the
+other endianness. This is the error that PCRE gives when the magic number is
+not present.
+
+ PCRE_ERROR_UNKNOWN_OPCODE (-5)
+
+While running the pattern match, an unknown item was encountered in the
+compiled pattern. This error could be caused by a bug in PCRE or by overwriting
+of the compiled pattern.
+
+ PCRE_ERROR_NOMEMORY (-6)
+
+If a pattern contains back references, but the ovector that is passed to
+pcre_exec() is not big enough to remember the referenced substrings, PCRE
+gets a block of memory at the start of matching to use for this purpose. If the
+call via pcre_malloc() fails, this error is given. The memory is
+automatically freed at the end of matching.
+
+ PCRE_ERROR_NOSUBSTRING (-7)
+
+This error is used by the pcre_copy_substring(),
+pcre_get_substring(), and pcre_get_substring_list() functions (see
+below). It is never returned by pcre_exec().
+
+ PCRE_ERROR_MATCHLIMIT (-8)
+
+The backtracking limit, as specified by the match_limit field in a
+pcre_extra structure (or defaulted) was reached. See the description
+above.
+
+ PCRE_ERROR_CALLOUT (-9)
+
+This error is never generated by pcre_exec() itself. It is provided for
+use by callout functions that want to yield a distinctive error code. See the
+pcrecallout
+documentation for details.
+
+ PCRE_ERROR_BADUTF8 (-10)
+
+A string that contains an invalid UTF-8 byte sequence was passed as a subject.
+
+ PCRE_ERROR_BADUTF8_OFFSET (-11)
+
+The UTF-8 byte sequence that was passed as a subject was valid, but the value
+of startoffset did not point to the beginning of a UTF-8 character.
+
+ PCRE_ERROR_PARTIAL (-12)
+
+The subject string did not match, but it did match partially. See the
+pcrepartial
+documentation for details of partial matching.
+
+ PCRE_ERROR_BADPARTIAL (-13)
+
+The PCRE_PARTIAL option was used with a compiled pattern containing items that
+are not supported for partial matching. See the
+pcrepartial
+documentation for details of partial matching.
+
+ PCRE_ERROR_INTERNAL (-14)
+
+An unexpected internal error has occurred. This error could be caused by a bug
+in PCRE or by overwriting of the compiled pattern.
+
+ PCRE_ERROR_BADCOUNT (-15)
+
+This error is given if the value of the ovecsize argument is negative.
+
+ PCRE_ERROR_RECURSIONLIMIT (-21)
+
+The internal recursion limit, as specified by the match_limit_recursion
+field in a pcre_extra structure (or defaulted) was reached. See the
+description above.
+
+ PCRE_ERROR_BADNEWLINE (-23)
+
+An invalid combination of PCRE_NEWLINE_xxx options was given.
+
+
+Error numbers -16 to -20 and -22 are not used by pcre_exec().
+
+
EXTRACTING CAPTURED SUBSTRINGS BY NUMBER
+
+int pcre_copy_substring(const char *subject, int *ovector,
+int stringcount, int stringnumber, char *buffer,
+int buffersize);
+
+
+int pcre_get_substring(const char *subject, int *ovector,
+int stringcount, int stringnumber,
+const char **stringptr);
+
+
+int pcre_get_substring_list(const char *subject,
+int *ovector, int stringcount, const char ***listptr);
+
+
+Captured substrings can be accessed directly by using the offsets returned by
+pcre_exec() in ovector. For convenience, the functions
+pcre_copy_substring(), pcre_get_substring(), and
+pcre_get_substring_list() are provided for extracting captured substrings
+as new, separate, zero-terminated strings. These functions identify substrings
+by number. The next section describes functions for extracting named
+substrings.
+
+
+A substring that contains a binary zero is correctly extracted and has a
+further zero added on the end, but the result is not, of course, a C string.
+However, you can process such a string by referring to the length that is
+returned by pcre_copy_substring() and pcre_get_substring().
+Unfortunately, the interface to pcre_get_substring_list() is not adequate
+for handling strings containing binary zeros, because the end of the final
+string is not independently indicated.
+
+
+The first three arguments are the same for all three of these functions:
+subject is the subject string that has just been successfully matched,
+ovector is a pointer to the vector of integer offsets that was passed to
+pcre_exec(), and stringcount is the number of substrings that were
+captured by the match, including the substring that matched the entire regular
+expression. This is the value returned by pcre_exec() if it is greater
+than zero. If pcre_exec() returned zero, indicating that it ran out of
+space in ovector, the value passed as stringcount should be the
+number of elements in the vector divided by three.
+
+
+The functions pcre_copy_substring() and pcre_get_substring()
+extract a single substring, whose number is given as stringnumber. A
+value of zero extracts the substring that matched the entire pattern, whereas
+higher values extract the captured substrings. For pcre_copy_substring(),
+the string is placed in buffer, whose length is given by
+buffersize, while for pcre_get_substring() a new block of memory is
+obtained via pcre_malloc, and its address is returned via
+stringptr. The yield of the function is the length of the string, not
+including the terminating zero, or one of these error codes:
+
+ PCRE_ERROR_NOMEMORY (-6)
+
+The buffer was too small for pcre_copy_substring(), or the attempt to get
+memory failed for pcre_get_substring().
+
+ PCRE_ERROR_NOSUBSTRING (-7)
+
+There is no substring whose number is stringnumber.
+
+
+The pcre_get_substring_list() function extracts all available substrings
+and builds a list of pointers to them. All this is done in a single block of
+memory that is obtained via pcre_malloc. The address of the memory block
+is returned via listptr, which is also the start of the list of string
+pointers. The end of the list is marked by a NULL pointer. The yield of the
+function is zero if all went well, or the error code
+
+ PCRE_ERROR_NOMEMORY (-6)
+
+if the attempt to get the memory block failed.
+
+
+When any of these functions encounter a substring that is unset, which can
+happen when capturing subpattern number n+1 matches some part of the
+subject, but subpattern n has not been used at all, they return an empty
+string. This can be distinguished from a genuine zero-length substring by
+inspecting the appropriate offset in ovector, which is negative for unset
+substrings.
+
+
+The two convenience functions pcre_free_substring() and
+pcre_free_substring_list() can be used to free the memory returned by
+a previous call of pcre_get_substring() or
+pcre_get_substring_list(), respectively. They do nothing more than call
+the function pointed to by pcre_free, which of course could be called
+directly from a C program. However, PCRE is used in some situations where it is
+linked via a special interface to another programming language that cannot use
+pcre_free directly; it is for these cases that the functions are
+provided.
+
+
EXTRACTING CAPTURED SUBSTRINGS BY NAME
+
+int pcre_get_stringnumber(const pcre *code,
+const char *name);
+
+
+int pcre_copy_named_substring(const pcre *code,
+const char *subject, int *ovector,
+int stringcount, const char *stringname,
+char *buffer, int buffersize);
+
+
+int pcre_get_named_substring(const pcre *code,
+const char *subject, int *ovector,
+int stringcount, const char *stringname,
+const char **stringptr);
+
+
+To extract a substring by name, you first have to find the associated number.
+For example, for this pattern
+
+ (a+)b(?<xxx>\d+)...
+
+the number of the subpattern called "xxx" is 2. If the name is known to be
+unique (PCRE_DUPNAMES was not set), you can find the number from the name by
+calling pcre_get_stringnumber(). The first argument is the compiled
+pattern, and the second is the name. The yield of the function is the
+subpattern number, or PCRE_ERROR_NOSUBSTRING (-7) if there is no subpattern of
+that name.
+
+
+Given the number, you can extract the substring directly, or use one of the
+functions described in the previous section. For convenience, there are also
+two functions that do the whole job.
+
+
+Most of the arguments of pcre_copy_named_substring() and
+pcre_get_named_substring() are the same as those for the similarly named
+functions that extract by number. As these are described in the previous
+section, they are not re-described here. There are just two differences:
+
+
+First, instead of a substring number, a substring name is given. Second, there
+is an extra argument, given at the start, which is a pointer to the compiled
+pattern. This is needed in order to gain access to the name-to-number
+translation table.
+
+
+These functions call pcre_get_stringnumber(), and if it succeeds, they
+then call pcre_copy_substring() or pcre_get_substring(), as
+appropriate. NOTE: If PCRE_DUPNAMES is set and there are duplicate names,
+the behaviour may not be what you want (see the next section).
+
+
DUPLICATE SUBPATTERN NAMES
+
+int pcre_get_stringtable_entries(const pcre *code,
+const char *name, char **first, char **last);
+
+
+When a pattern is compiled with the PCRE_DUPNAMES option, names for subpatterns
+are not required to be unique. Normally, patterns with duplicate names are such
+that in any one match, only one of the named subpatterns participates. An
+example is shown in the
+pcrepattern
+documentation.
+
+
+When duplicates are present, pcre_copy_named_substring() and
+pcre_get_named_substring() return the first substring corresponding to
+the given name that is set. If none are set, PCRE_ERROR_NOSUBSTRING (-7) is
+returned; no data is returned. The pcre_get_stringnumber() function
+returns one of the numbers that are associated with the name, but it is not
+defined which it is.
+
+
+If you want to get full details of all captured substrings for a given name,
+you must use the pcre_get_stringtable_entries() function. The first
+argument is the compiled pattern, and the second is the name. The third and
+fourth are pointers to variables which are updated by the function. After it
+has run, they point to the first and last entries in the name-to-number table
+for the given name. The function itself returns the length of each entry, or
+PCRE_ERROR_NOSUBSTRING (-7) if there are none. The format of the table is
+described above in the section entitled Information about a pattern.
+Given all the relevant entries for the name, you can extract each of their
+numbers, and hence the captured data, if any.
+
+
FINDING ALL POSSIBLE MATCHES
+
+The traditional matching function uses a similar algorithm to Perl, which stops
+when it finds the first match, starting at a given point in the subject. If you
+want to find all possible matches, or the longest possible match, consider
+using the alternative matching function (see below) instead. If you cannot use
+the alternative function, but still need to find all possible matches, you
+can kludge it up by making use of the callout facility, which is described in
+the
+pcrecallout
+documentation.
+
+
+What you have to do is to insert a callout right at the end of the pattern.
+When your callout function is called, extract and save the current matched
+substring. Then return 1, which forces pcre_exec() to backtrack and try
+other alternatives. Ultimately, when it runs out of matches, pcre_exec()
+will yield PCRE_ERROR_NOMATCH.
+
+
MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
+
+int pcre_dfa_exec(const pcre *code, const pcre_extra *extra,
+const char *subject, int length, int startoffset,
+int options, int *ovector, int ovecsize,
+int *workspace, int wscount);
+
+
+The function pcre_dfa_exec() is called to match a subject string against
+a compiled pattern, using a matching algorithm that scans the subject string
+just once, and does not backtrack. This has different characteristics to the
+normal algorithm, and is not compatible with Perl. Some of the features of PCRE
+patterns are not supported. Nevertheless, there are times when this kind of
+matching can be useful. For a discussion of the two matching algorithms, see
+the
+pcrematching
+documentation.
+
+
+The arguments for the pcre_dfa_exec() function are the same as for
+pcre_exec(), plus two extras. The ovector argument is used in a
+different way, and this is described below. The other common arguments are used
+in the same way as for pcre_exec(), so their description is not repeated
+here.
+
+
+The two additional arguments provide workspace for the function. The workspace
+vector should contain at least 20 elements. It is used for keeping track of
+multiple paths through the pattern tree. More workspace will be needed for
+patterns and subjects where there are a lot of potential matches.
+
+
+Here is an example of a simple call to pcre_dfa_exec():
+
+ int rc;
+ int ovector[10];
+ int wspace[20];
+ rc = pcre_dfa_exec(
+ re, /* result of pcre_compile() */
+ NULL, /* we didn't study the pattern */
+ "some string", /* the subject string */
+ 11, /* the length of the subject string */
+ 0, /* start at offset 0 in the subject */
+ 0, /* default options */
+ ovector, /* vector of integers for substring information */
+ 10, /* number of elements (NOT size in bytes) */
+ wspace, /* working space vector */
+ 20); /* number of elements (NOT size in bytes) */
+
+
+
+Option bits for pcre_dfa_exec()
+
+
+The unused bits of the options argument for pcre_dfa_exec() must be
+zero. The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_xxx,
+PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_UTF8_CHECK, PCRE_PARTIAL,
+PCRE_DFA_SHORTEST, and PCRE_DFA_RESTART. All but the last three of these are
+the same as for pcre_exec(), so their description is not repeated here.
+
+ PCRE_PARTIAL
+
+This has the same general effect as it does for pcre_exec(), but the
+details are slightly different. When PCRE_PARTIAL is set for
+pcre_dfa_exec(), the return code PCRE_ERROR_NOMATCH is converted into
+PCRE_ERROR_PARTIAL if the end of the subject is reached, there have been no
+complete matches, but there is still at least one matching possibility. The
+portion of the string that provided the partial match is set as the first
+matching string.
+
+ PCRE_DFA_SHORTEST
+
+Setting the PCRE_DFA_SHORTEST option causes the matching algorithm to stop as
+soon as it has found one match. Because of the way the alternative algorithm
+works, this is necessarily the shortest possible match at the first possible
+matching point in the subject string.
+
+ PCRE_DFA_RESTART
+
+When pcre_dfa_exec() is called with the PCRE_PARTIAL option, and returns
+a partial match, it is possible to call it again, with additional subject
+characters, and have it continue with the same match. The PCRE_DFA_RESTART
+option requests this action; when it is set, the workspace and
+wscount options must reference the same vector as before because data
+about the match so far is left in them after a partial match. There is more
+discussion of this facility in the
+pcrepartial
+documentation.
+
+
+Successful returns from pcre_dfa_exec()
+
+
+When pcre_dfa_exec() succeeds, it may have matched more than one
+substring in the subject. Note, however, that all the matches from one run of
+the function start at the same point in the subject. The shorter matches are
+all initial substrings of the longer matches. For example, if the pattern
+
+ <.*>
+
+is matched against the string
+
+ This is <something> <something else> <something further> no more
+
+the three matched strings are
+
+ <something>
+ <something> <something else>
+ <something> <something else> <something further>
+
+On success, the yield of the function is a number greater than zero, which is
+the number of matched substrings. The substrings themselves are returned in
+ovector. Each string uses two elements; the first is the offset to the
+start, and the second is the offset to the end. In fact, all the strings have
+the same start offset. (Space could have been saved by giving this only once,
+but it was decided to retain some compatibility with the way pcre_exec()
+returns data, even though the meaning of the strings is different.)
+
+
+The strings are returned in reverse order of length; that is, the longest
+matching string is given first. If there were too many matches to fit into
+ovector, the yield of the function is zero, and the vector is filled with
+the longest matches.
+
+
+Error returns from pcre_dfa_exec()
+
+
+The pcre_dfa_exec() function returns a negative number when it fails.
+Many of the errors are the same as for pcre_exec(), and these are
+described
+above.
+There are in addition the following errors that are specific to
+pcre_dfa_exec():
+
+ PCRE_ERROR_DFA_UITEM (-16)
+
+This return is given if pcre_dfa_exec() encounters an item in the pattern
+that it does not support, for instance, the use of \C or a back reference.
+
+ PCRE_ERROR_DFA_UCOND (-17)
+
+This return is given if pcre_dfa_exec() encounters a condition item that
+uses a back reference for the condition, or a test for recursion in a specific
+group. These are not supported.
+
+ PCRE_ERROR_DFA_UMLIMIT (-18)
+
+This return is given if pcre_dfa_exec() is called with an extra
+block that contains a setting of the match_limit field. This is not
+supported (it is meaningless).
+
+ PCRE_ERROR_DFA_WSSIZE (-19)
+
+This return is given if pcre_dfa_exec() runs out of space in the
+workspace vector.
+
+ PCRE_ERROR_DFA_RECURSE (-20)
+
+When a recursive subpattern is processed, the matching function calls itself
+recursively, using private vectors for ovector and workspace. This
+error is given if the output vector is not large enough. This should be
+extremely rare, as a vector of size 1000 is used.
+
+
SEE ALSO
+
+pcrebuild(3), pcrecallout(3), pcrecpp(3),
+pcrematching(3), pcrepartial(3), pcreposix(3),
+pcreprecompile(3), pcresample(3), pcrestack(3).
+
+
AUTHOR
+
+Philip Hazel
+
+University Computing Service
+
+Cambridge CB2 3QH, England.
+
+
+
REVISION
+
+Last updated: 12 April 2008
+
+Copyright © 1997-2008 University of Cambridge.
+
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcrebuild.html b/src/doc/html/pcrebuild.html
new file mode 100644
index 0000000..6fe0c67
--- /dev/null
+++ b/src/doc/html/pcrebuild.html
@@ -0,0 +1,340 @@
+
+
+pcrebuild specification
+
+
+pcrebuild man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+
PCRE BUILD-TIME OPTIONS
+
+This document describes the optional features of PCRE that can be selected when
+the library is compiled. It assumes use of the configure script, where
+the optional features are selected or deselected by providing options to
+configure before running the make command. However, the same
+options can be selected in both Unix-like and non-Unix-like environments using
+the GUI facility of CMakeSetup if you are using CMake instead of
+configure to build PCRE.
+
+
+The complete list of options for configure (which includes the standard
+ones such as the selection of the installation directory) can be obtained by
+running
+
+ ./configure --help
+
+The following sections include descriptions of options whose names begin with
+--enable or --disable. These settings specify changes to the defaults for the
+configure command. Because of the way that configure works,
+--enable and --disable always come in pairs, so the complementary option always
+exists as well, but as it specifies the default, it is not described.
+
+
C++ SUPPORT
+
+By default, the configure script will search for a C++ compiler and C++
+header files. If it finds them, it automatically builds the C++ wrapper library
+for PCRE. You can disable this by adding
+
+ --disable-cpp
+
+to the configure command.
+
+
UTF-8 SUPPORT
+
+To build PCRE with support for UTF-8 character strings, add
+
+ --enable-utf8
+
+to the configure command. Of itself, this does not make PCRE treat
+strings as UTF-8. As well as compiling PCRE with this option, you also have
+to set the PCRE_UTF8 option when you call the pcre_compile()
+function.
+
+
UNICODE CHARACTER PROPERTY SUPPORT
+
+UTF-8 support allows PCRE to process character values greater than 255 in the
+strings that it handles. On its own, however, it does not provide any
+facilities for accessing the properties of such characters. If you want to be
+able to use the pattern escapes \P, \p, and \X, which refer to Unicode
+character properties, you must add
+
+ --enable-unicode-properties
+
+to the configure command. This implies UTF-8 support, even if you have
+not explicitly requested it.
+
+
+Including Unicode property support adds around 30K of tables to the PCRE
+library. Only the general category properties such as Lu and Nd are
+supported. Details are given in the
+pcrepattern
+documentation.
+
+
CODE VALUE OF NEWLINE
+
+By default, PCRE interprets character 10 (linefeed, LF) as indicating the end
+of a line. This is the normal newline character on Unix-like systems. You can
+compile PCRE to use character 13 (carriage return, CR) instead, by adding
+
+ --enable-newline-is-cr
+
+to the configure command. There is also a --enable-newline-is-lf option,
+which explicitly specifies linefeed as the newline character.
+
+
+Alternatively, you can specify that line endings are to be indicated by the two
+character sequence CRLF. If you want this, add
+
+ --enable-newline-is-crlf
+
+to the configure command. There is a fourth option, specified by
+
+ --enable-newline-is-anycrlf
+
+which causes PCRE to recognize any of the three sequences CR, LF, or CRLF as
+indicating a line ending. Finally, a fifth option, specified by
+
+ --enable-newline-is-any
+
+causes PCRE to recognize any Unicode newline sequence.
+
+
+Whatever line ending convention is selected when PCRE is built can be
+overridden when the library functions are called. At build time it is
+conventional to use the standard for your operating system.
+
+
WHAT \R MATCHES
+
+By default, the sequence \R in a pattern matches any Unicode newline sequence,
+whatever has been selected as the line ending sequence. If you specify
+
+ --enable-bsr-anycrlf
+
+the default is changed so that \R matches only CR, LF, or CRLF. Whatever is
+selected when PCRE is built can be overridden when the library functions are
+called.
+
+
BUILDING SHARED AND STATIC LIBRARIES
+
+The PCRE building process uses libtool to build both shared and static
+Unix libraries by default. You can suppress one of these by adding one of
+
+ --disable-shared
+ --disable-static
+
+to the configure command, as required.
+
+
POSIX MALLOC USAGE
+
+When PCRE is called through the POSIX interface (see the
+pcreposix
+documentation), additional working storage is required for holding the pointers
+to capturing substrings, because PCRE requires three integers per substring,
+whereas the POSIX interface provides only two. If the number of expected
+substrings is small, the wrapper function uses space on the stack, because this
+is faster than using malloc() for each call. The default threshold above
+which the stack is no longer used is 10; it can be changed by adding a setting
+such as
+
+ --with-posix-malloc-threshold=20
+
+to the configure command.
+
+
HANDLING VERY LARGE PATTERNS
+
+Within a compiled pattern, offset values are used to point from one part to
+another (for example, from an opening parenthesis to an alternation
+metacharacter). By default, two-byte values are used for these offsets, leading
+to a maximum size for a compiled pattern of around 64K. This is sufficient to
+handle all but the most gigantic patterns. Nevertheless, some people do want to
+process enormous patterns, so it is possible to compile PCRE to use three-byte
+or four-byte offsets by adding a setting such as
+
+ --with-link-size=3
+
+to the configure command. The value given must be 2, 3, or 4. Using
+longer offsets slows down the operation of PCRE because it has to load
+additional bytes when handling them.
+
+
AVOIDING EXCESSIVE STACK USAGE
+
+When matching with the pcre_exec() function, PCRE implements backtracking
+by making recursive calls to an internal function called match(). In
+environments where the size of the stack is limited, this can severely limit
+PCRE's operation. (The Unix environment does not usually suffer from this
+problem, but it may sometimes be necessary to increase the maximum stack size.
+There is a discussion in the
+pcrestack
+documentation.) An alternative approach to recursion that uses memory from the
+heap to remember data, instead of using recursive function calls, has been
+implemented to work round the problem of limited stack size. If you want to
+build a version of PCRE that works this way, add
+
+ --disable-stack-for-recursion
+
+to the configure command. With this configuration, PCRE will use the
+pcre_stack_malloc and pcre_stack_free variables to call memory
+management functions. By default these point to malloc() and
+free(), but you can replace the pointers so that your own functions are
+used.
+
+
+Separate functions are provided rather than using pcre_malloc and
+pcre_free because the usage is very predictable: the block sizes
+requested are always the same, and the blocks are always freed in reverse
+order. A calling program might be able to implement optimized functions that
+perform better than malloc() and free(). PCRE runs noticeably more
+slowly when built in this way. This option affects only the pcre_exec()
+function; it is not relevant for the pcre_dfa_exec() function.
+
+
LIMITING PCRE RESOURCE USAGE
+
+Internally, PCRE has a function called match(), which it calls repeatedly
+(sometimes recursively) when matching a pattern with the pcre_exec()
+function. By controlling the maximum number of times this function may be
+called during a single matching operation, a limit can be placed on the
+resources used by a single call to pcre_exec(). The limit can be changed
+at run time, as described in the
+pcreapi
+documentation. The default is 10 million, but this can be changed by adding a
+setting such as
+
+ --with-match-limit=500000
+
+to the configure command. This setting has no effect on the
+pcre_dfa_exec() matching function.
+
+
+In some environments it is desirable to limit the depth of recursive calls of
+match() more strictly than the total number of calls, in order to
+restrict the maximum amount of stack (or heap, if --disable-stack-for-recursion
+is specified) that is used. A second limit controls this; it defaults to the
+value that is set for --with-match-limit, which imposes no additional
+constraints. However, you can set a lower limit by adding, for example,
+
+ --with-match-limit-recursion=10000
+
+to the configure command. This value can also be overridden at run time.
+
+
CREATING CHARACTER TABLES AT BUILD TIME
+
+PCRE uses fixed tables for processing characters whose code values are less
+than 256. By default, PCRE is built with a set of tables that are distributed
+in the file pcre_chartables.c.dist. These tables are for ASCII codes
+only. If you add
+
+ --enable-rebuild-chartables
+
+to the configure command, the distributed tables are no longer used.
+Instead, a program called dftables is compiled and run. This outputs the
+source for a new set of tables, created in the default locale of your C runtime
+system. (This method of replacing the tables does not work if you are cross
+compiling, because dftables is run on the local host. If you need to
+create alternative tables when cross compiling, you will have to do so "by
+hand".)
+
+
USING EBCDIC CODE
+
+PCRE assumes by default that it will run in an environment where the character
+code is ASCII (or Unicode, which is a superset of ASCII). This is the case for
+most computer operating systems. PCRE can, however, be compiled to run in an
+EBCDIC environment by adding
+
+ --enable-ebcdic
+
+to the configure command. This setting implies
+--enable-rebuild-chartables. You should only use it if you know that you are in
+an EBCDIC environment (for example, an IBM mainframe operating system).
+
+
PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT
+
+By default, pcregrep reads all files as plain text. You can build it so
+that it recognizes files whose names end in .gz or .bz2, and reads
+them with libz or libbz2, respectively, by adding one or both of
+
+ --enable-pcregrep-libz
+ --enable-pcregrep-libbz2
+
+to the configure command. These options naturally require that the
+relevant libraries are installed on your system. Configuration will fail if
+they are not.
+
+
PCRETEST OPTION FOR LIBREADLINE SUPPORT
+
+If you add
+
+ --enable-pcretest-libreadline
+
+to the configure command, pcretest is linked with the
+libreadline library, and when its input is from a terminal, it reads it
+using the readline() function. This provides line-editing and history
+facilities. Note that libreadline is GPL-licenced, so if you distribute a
+binary of pcretest linked in this way, there may be licensing issues.
+
+
+Setting this option causes the -lreadline option to be added to the
+pcretest build. In many operating environments with a system-installed
+libreadline this is sufficient. However, in some environments (e.g.
+if an unmodified distribution version of readline is in use), some extra
+configuration may be necessary. The INSTALL file for libreadline says
+this:
+
+ "Readline uses the termcap functions, but does not link with the
+ termcap or curses library itself, allowing applications which link
+ with readline the to choose an appropriate library."
+
+If your environment has not been set up so that an appropriate library is
+automatically included, you may need to add something like
+
+ LIBS="-lncurses"
+
+immediately before the configure command.
+
+
SEE ALSO
+
+pcreapi(3), pcre_config(3).
+
+
AUTHOR
+
+Philip Hazel
+
+University Computing Service
+
+Cambridge CB2 3QH, England.
+
+
+
REVISION
+
+Last updated: 13 April 2008
+
+Copyright © 1997-2008 University of Cambridge.
+
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcrecallout.html b/src/doc/html/pcrecallout.html
new file mode 100644
index 0000000..f8b5e2e
--- /dev/null
+++ b/src/doc/html/pcrecallout.html
@@ -0,0 +1,201 @@
+
+
+pcrecallout specification
+
+
+pcrecallout man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+
PCRE CALLOUTS
+
+int (*pcre_callout)(pcre_callout_block *);
+
+
+PCRE provides a feature called "callout", which is a means of temporarily
+passing control to the caller of PCRE in the middle of pattern matching. The
+caller of PCRE provides an external function by putting its entry point in the
+global variable pcre_callout. By default, this variable contains NULL,
+which disables all calling out.
+
+
+Within a regular expression, (?C) indicates the points at which the external
+function is to be called. Different callout points can be identified by putting
+a number less than 256 after the letter C. The default value is zero.
+For example, this pattern has two callout points:
+
+ (?C1)abc(?C2)def
+
+If the PCRE_AUTO_CALLOUT option bit is set when pcre_compile() is called,
+PCRE automatically inserts callouts, all with number 255, before each item in
+the pattern. For example, if PCRE_AUTO_CALLOUT is used with the pattern
+
+ A(\d{2}|--)
+
+it is processed as if it were
+
+
+(?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
+
+
+Notice that there is a callout before and after each parenthesis and
+alternation bar. Automatic callouts can be used for tracking the progress of
+pattern matching. The
+pcretest
+command has an option that sets automatic callouts; when it is used, the output
+indicates how the pattern is matched. This is useful information when you are
+trying to optimize the performance of a particular pattern.
+
+
MISSING CALLOUTS
+
+You should be aware that, because of optimizations in the way PCRE matches
+patterns, callouts sometimes do not happen. For example, if the pattern is
+
+ ab(?C4)cd
+
+PCRE knows that any matching string must contain the letter "d". If the subject
+string is "abyz", the lack of "d" means that matching doesn't ever start, and
+the callout is never reached. However, with "abyd", though the result is still
+no match, the callout is obeyed.
+
+
THE CALLOUT INTERFACE
+
+During matching, when PCRE reaches a callout point, the external function
+defined by pcre_callout is called (if it is set). This applies to both
+the pcre_exec() and the pcre_dfa_exec() matching functions. The
+only argument to the callout function is a pointer to a pcre_callout
+block. This structure contains the following fields:
+
+ int version;
+ int callout_number;
+ int *offset_vector;
+ const char *subject;
+ int subject_length;
+ int start_match;
+ int current_position;
+ int capture_top;
+ int capture_last;
+ void *callout_data;
+ int pattern_position;
+ int next_item_length;
+
+The version field is an integer containing the version number of the
+block format. The initial version was 0; the current version is 1. The version
+number will change again in future if additional fields are added, but the
+intention is never to remove any of the existing fields.
+
+
+The callout_number field contains the number of the callout, as compiled
+into the pattern (that is, the number after ?C for manual callouts, and 255 for
+automatically generated callouts).
+
+
+The offset_vector field is a pointer to the vector of offsets that was
+passed by the caller to pcre_exec() or pcre_dfa_exec(). When
+pcre_exec() is used, the contents can be inspected in order to extract
+substrings that have been matched so far, in the same way as for extracting
+substrings after a match has completed. For pcre_dfa_exec() this field is
+not useful.
+
+
+The subject and subject_length fields contain copies of the values
+that were passed to pcre_exec().
+
+
+The start_match field normally contains the offset within the subject at
+which the current match attempt started. However, if the escape sequence \K
+has been encountered, this value is changed to reflect the modified starting
+point. If the pattern is not anchored, the callout function may be called
+several times from the same point in the pattern for different starting points
+in the subject.
+
+
+The current_position field contains the offset within the subject of the
+current match pointer.
+
+
+When the pcre_exec() function is used, the capture_top field
+contains one more than the number of the highest numbered captured substring so
+far. If no substrings have been captured, the value of capture_top is
+one. This is always the case when pcre_dfa_exec() is used, because it
+does not support captured substrings.
+
+
+The capture_last field contains the number of the most recently captured
+substring. If no substrings have been captured, its value is -1. This is always
+the case when pcre_dfa_exec() is used.
+
+
+The callout_data field contains a value that is passed to
+pcre_exec() or pcre_dfa_exec() specifically so that it can be
+passed back in callouts. It is passed in the callout_data field of the
+pcre_extra data structure. If no such data was passed, the value of
+callout_data in a pcre_callout block is NULL. There is a
+description of the pcre_extra structure in the
+pcreapi
+documentation.
+
+
+The pattern_position field is present from version 1 of the
+pcre_callout structure. It contains the offset to the next item to be
+matched in the pattern string.
+
+
+The next_item_length field is present from version 1 of the
+pcre_callout structure. It contains the length of the next item to be
+matched in the pattern string. When the callout immediately precedes an
+alternation bar, a closing parenthesis, or the end of the pattern, the length
+is zero. When the callout precedes an opening parenthesis, the length is that
+of the entire subpattern.
+
+
+The pattern_position and next_item_length fields are intended to
+help in distinguishing between different automatic callouts, which all have the
+same callout number. However, they are set for all callouts.
+
+
RETURN VALUES
+
+The external callout function returns an integer to PCRE. If the value is zero,
+matching proceeds as normal. If the value is greater than zero, matching fails
+at the current point, but the testing of other matching possibilities goes
+ahead, just as if a lookahead assertion had failed. If the value is less than
+zero, the match is abandoned, and pcre_exec() (or pcre_dfa_exec())
+returns the negative value.
+
+
+Negative values should normally be chosen from the set of PCRE_ERROR_xxx
+values. In particular, PCRE_ERROR_NOMATCH forces a standard "no match" failure.
+The error number PCRE_ERROR_CALLOUT is reserved for use by callout functions;
+it will never be used by PCRE itself.
+
+
AUTHOR
+
+Philip Hazel
+
+University Computing Service
+
+Cambridge CB2 3QH, England.
+
+
+
REVISION
+
+Last updated: 29 May 2007
+
+Copyright © 1997-2007 University of Cambridge.
+
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcrecompat.html b/src/doc/html/pcrecompat.html
new file mode 100644
index 0000000..d1b93d0
--- /dev/null
+++ b/src/doc/html/pcrecompat.html
@@ -0,0 +1,179 @@
+
+
+pcrecompat specification
+
+
+pcrecompat man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+DIFFERENCES BETWEEN PCRE AND PERL
+
+
+This document describes the differences in the ways that PCRE and Perl handle
+regular expressions. The differences described here are mainly with respect to
+Perl 5.8, though PCRE versions 7.0 and later contain some features that are
+expected to be in the forthcoming Perl 5.10.
+
+
+1. PCRE has only a subset of Perl's UTF-8 and Unicode support. Details of what
+it does have are given in the
+section on UTF-8 support
+in the main
+pcre
+page.
+
+
+2. PCRE does not allow repeat quantifiers on lookahead assertions. Perl permits
+them, but they do not mean what you might think. For example, (?!a){3} does
+not assert that the next three characters are not "a". It just asserts that the
+next character is not "a" three times.
+
+
+3. Capturing subpatterns that occur inside negative lookahead assertions are
+counted, but their entries in the offsets vector are never set. Perl sets its
+numerical variables from any such patterns that are matched before the
+assertion fails to match something (thereby succeeding), but only if the
+negative lookahead assertion contains just one branch.
+
+
+4. Though binary zero characters are supported in the subject string, they are
+not allowed in a pattern string because it is passed as a normal C string,
+terminated by zero. The escape sequence \0 can be used in the pattern to
+represent a binary zero.
+
+
+5. The following Perl escape sequences are not supported: \l, \u, \L,
+\U, and \N. In fact these are implemented by Perl's general string-handling
+and are not part of its pattern matching engine. If any of these are
+encountered by PCRE, an error is generated.
+
+
+6. The Perl escape sequences \p, \P, and \X are supported only if PCRE is
+built with Unicode character property support. The properties that can be
+tested with \p and \P are limited to the general category properties such as
+Lu and Nd, script names such as Greek or Han, and the derived properties Any
+and L&.
+
+
+7. PCRE does support the \Q...\E escape for quoting substrings. Characters in
+between are treated as literals. This is slightly different from Perl in that $
+and @ are also handled as literals inside the quotes. In Perl, they cause
+variable interpolation (but of course PCRE does not have variables). Note the
+following examples:
+
+ Pattern PCRE matches Perl matches
+
+ \Qabc$xyz\E abc$xyz abc followed by the contents of $xyz
+ \Qabc\$xyz\E abc\$xyz abc\$xyz
+ \Qabc\E\$\Qxyz\E abc$xyz abc$xyz
+
+The \Q...\E sequence is recognized both inside and outside character classes.
+
+
+8. Fairly obviously, PCRE does not support the (?{code}) and (??{code})
+constructions. However, there is support for recursive patterns. This is not
+available in Perl 5.8, but will be in Perl 5.10. Also, the PCRE "callout"
+feature allows an external function to be called during pattern matching. See
+the
+pcrecallout
+documentation for details.
+
+
+9. Subpatterns that are called recursively or as "subroutines" are always
+treated as atomic groups in PCRE. This is like Python, but unlike Perl.
+
+
+10. There are some differences that are concerned with the settings of captured
+strings when part of a pattern is repeated. For example, matching "aba" against
+the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE it is set to "b".
+
+
+11. PCRE does support Perl 5.10's backtracking verbs (*ACCEPT), (*FAIL), (*F),
+(*COMMIT), (*PRUNE), (*SKIP), and (*THEN), but only in the forms without an
+argument. PCRE does not support (*MARK). If (*ACCEPT) is within capturing
+parentheses, PCRE does not set that capture group; this is different to Perl.
+
+
+12. PCRE provides some extensions to the Perl regular expression facilities.
+Perl 5.10 will include new features that are not in earlier versions, some of
+which (such as named parentheses) have been in PCRE for some time. This list is
+with respect to Perl 5.10:
+
+
+(a) Although lookbehind assertions must match fixed length strings, each
+alternative branch of a lookbehind assertion can match a different length of
+string. Perl requires them all to have the same length.
+
+
+(b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not set, the $
+meta-character matches only at the very end of the string.
+
+
+(c) If PCRE_EXTRA is set, a backslash followed by a letter with no special
+meaning is faulted. Otherwise, like Perl, the backslash is quietly ignored.
+(Perl can be made to issue a warning.)
+
+
+(d) If PCRE_UNGREEDY is set, the greediness of the repetition quantifiers is
+inverted, that is, by default they are not greedy, but if followed by a
+question mark they are.
+
+
+(e) PCRE_ANCHORED can be used at matching time to force a pattern to be tried
+only at the first matching position in the subject string.
+
+
+(f) The PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, and PCRE_NO_AUTO_CAPTURE
+options for pcre_exec() have no Perl equivalents.
+
+
+(g) The \R escape sequence can be restricted to match only CR, LF, or CRLF
+by the PCRE_BSR_ANYCRLF option.
+
+
+(h) The callout facility is PCRE-specific.
+
+
+(i) The partial matching facility is PCRE-specific.
+
+
+(j) Patterns compiled by PCRE can be saved and re-used at a later time, even on
+different hosts that have the other endianness.
+
+
+(k) The alternative matching function (pcre_dfa_exec()) matches in a
+different way and is not Perl-compatible.
+
+
+(l) PCRE recognizes some special sequences such as (*CR) at the start of
+a pattern that set overall options that cannot be changed within the pattern.
+
+
+AUTHOR
+
+
+Philip Hazel
+
+University Computing Service
+
+Cambridge CB2 3QH, England.
+
+
+
+REVISION
+
+
+Last updated: 11 September 2007
+
+Copyright © 1997-2007 University of Cambridge.
+
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcrecpp.html b/src/doc/html/pcrecpp.html
new file mode 100644
index 0000000..bda1675
--- /dev/null
+++ b/src/doc/html/pcrecpp.html
@@ -0,0 +1,365 @@
+
+
+pcrecpp specification
+
+
+pcrecpp man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+
SYNOPSIS OF C++ WRAPPER
+
+#include <pcrecpp.h>
+
+
DESCRIPTION
+
+The C++ wrapper for PCRE was provided by Google Inc. Some additional
+functionality was added by Giuseppe Maxia. This brief man page was constructed
+from the notes in the pcrecpp.h file, which should be consulted for
+further details.
+
+
MATCHING INTERFACE
+
+The "FullMatch" operation checks that supplied text matches a supplied pattern
+exactly. If pointer arguments are supplied, it copies matched sub-strings that
+match sub-patterns into them.
+
+ Example: successful match
+ pcrecpp::RE re("h.*o");
+ re.FullMatch("hello");
+
+ Example: unsuccessful match (requires full match):
+ pcrecpp::RE re("e");
+ !re.FullMatch("hello");
+
+ Example: creating a temporary RE object:
+ pcrecpp::RE("h.*o").FullMatch("hello");
+
+You can pass in a "const char*" or a "string" for "text". The examples below
+tend to use a const char*. You can, as in the different examples above, store
+the RE object explicitly in a variable or use a temporary RE object. The
+examples below use one mode or the other arbitrarily. Either could correctly be
+used for any of these examples.
+
+
+You must supply extra pointer arguments to extract matched subpieces.
+
+ Example: extracts "ruby" into "s" and 1234 into "i"
+ int i;
+ string s;
+ pcrecpp::RE re("(\\w+):(\\d+)");
+ re.FullMatch("ruby:1234", &s, &i);
+
+ Example: does not try to extract any extra sub-patterns
+ re.FullMatch("ruby:1234", &s);
+
+ Example: does not try to extract into NULL
+ re.FullMatch("ruby:1234", NULL, &i);
+
+ Example: integer overflow causes failure
+ !re.FullMatch("ruby:1234567891234", NULL, &i);
+
+ Example: fails because there aren't enough sub-patterns:
+ !pcrecpp::RE("\\w+:\\d+").FullMatch("ruby:1234", &s);
+
+ Example: fails because string cannot be stored in integer
+ !pcrecpp::RE("(.*)").FullMatch("ruby", &i);
+
+The provided pointer arguments can be pointers to any scalar numeric
+type, or one of:
+
+ string (matched piece is copied to string)
+ StringPiece (StringPiece is mutated to point to matched piece)
+ T (where "bool T::ParseFrom(const char*, int)" exists)
+ NULL (the corresponding matched sub-pattern is not copied)
+
+The function returns true iff all of the following conditions are satisfied:
+
+ a. "text" matches "pattern" exactly;
+
+ b. The number of matched sub-patterns is >= number of supplied
+ pointers;
+
+ c. The "i"th argument has a suitable type for holding the
+ string captured as the "i"th sub-pattern. If you pass in
+ void * NULL for the "i"th argument, or a non-void * NULL
+ of the correct type, or pass fewer arguments than the
+ number of sub-patterns, "i"th captured sub-pattern is
+ ignored.
+
+CAVEAT: An optional sub-pattern that does not exist in the matched
+string is assigned the empty string. Therefore, the following will
+return false (because the empty string is not a valid number):
+
+ int number;
+ pcrecpp::RE::FullMatch("abc", "[a-z]+(\\d+)?", &number);
+
+The matching interface supports at most 16 arguments per call.
+If you need more, consider using the more general interface
+pcrecpp::RE::DoMatch. See pcrecpp.h for the signature for
+DoMatch.
+
+
QUOTING METACHARACTERS
+
+You can use the "QuoteMeta" operation to insert backslashes before all
+potentially meaningful characters in a string. The returned string, used as a
+regular expression, will exactly match the original string.
+
+ Example:
+ string quoted = RE::QuoteMeta(unquoted);
+
+Note that it's legal to escape a character even if it has no special meaning in
+a regular expression -- so this function does that. (This also makes it
+identical to the perl function of the same name; see "perldoc -f quotemeta".)
+For example, "1.5-2.0?" becomes "1\.5\-2\.0\?".
+
+
PARTIAL MATCHES
+
+You can use the "PartialMatch" operation when you want the pattern
+to match any substring of the text.
+
+ Example: simple search for a string:
+ pcrecpp::RE("ell").PartialMatch("hello");
+
+ Example: find first number in a string:
+ int number;
+ pcrecpp::RE re("(\\d+)");
+ re.PartialMatch("x*100 + 20", &number);
+ assert(number == 100);
+
+
+
UTF-8 AND THE MATCHING INTERFACE
+
+By default, pattern and text are plain text, one byte per character. The UTF8
+flag, passed to the constructor, causes both pattern and string to be treated
+as UTF-8 text, still a byte stream but potentially multiple bytes per
+character. In practice, the text is likelier to be UTF-8 than the pattern, but
+the match returned may depend on the UTF8 flag, so always use it when matching
+UTF8 text. For example, "." will match one byte normally but with UTF8 set may
+match up to three bytes of a multi-byte character.
+
+ Example:
+ pcrecpp::RE_Options options;
+ options.set_utf8();
+ pcrecpp::RE re(utf8_pattern, options);
+ re.FullMatch(utf8_string);
+
+ Example: using the convenience function UTF8():
+ pcrecpp::RE re(utf8_pattern, pcrecpp::UTF8());
+ re.FullMatch(utf8_string);
+
+NOTE: The UTF8 flag is ignored if pcre was not configured with the
+
+ --enable-utf8 flag.
+
+
+
PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE
+
+PCRE defines some modifiers to change the behavior of the regular expression
+engine. The C++ wrapper defines an auxiliary class, RE_Options, as a vehicle to
+pass such modifiers to a RE class. Currently, the following modifiers are
+supported:
+
+ modifier description Perl corresponding
+
+ PCRE_CASELESS case insensitive match /i
+ PCRE_MULTILINE multiple lines match /m
+ PCRE_DOTALL dot matches newlines /s
+ PCRE_DOLLAR_ENDONLY $ matches only at end N/A
+ PCRE_EXTRA strict escape parsing N/A
+ PCRE_EXTENDED ignore whitespaces /x
+ PCRE_UTF8 handles UTF8 chars built-in
+ PCRE_UNGREEDY reverses * and *? N/A
+ PCRE_NO_AUTO_CAPTURE disables capturing parens N/A (*)
+
+(*) Both Perl and PCRE allow non capturing parentheses by means of the
+"?:" modifier within the pattern itself. e.g. (?:ab|cd) does not
+capture, while (ab|cd) does.
+
+
+For a full account on how each modifier works, please check the
+PCRE API reference page.
+
+
+For each modifier, there are two member functions whose name is made
+out of the modifier in lowercase, without the "PCRE_" prefix. For
+instance, PCRE_CASELESS is handled by
+
+ bool caseless()
+
+which returns true if the modifier is set, and
+
+ RE_Options & set_caseless(bool)
+
+which sets or unsets the modifier. Moreover, PCRE_EXTRA_MATCH_LIMIT can be
+accessed through the set_match_limit() and match_limit() member
+functions. Setting match_limit to a non-zero value will limit the
+execution of pcre to keep it from doing bad things like blowing the stack or
+taking an eternity to return a result. A value of 5000 is good enough to stop
+stack blowup in a 2MB thread stack. Setting match_limit to zero disables
+match limiting. Alternatively, you can call match_limit_recursion()
+which uses PCRE_EXTRA_MATCH_LIMIT_RECURSION to limit how much PCRE
+recurses. match_limit() limits the number of matches PCRE does;
+match_limit_recursion() limits the depth of internal recursion, and
+therefore the amount of stack that is used.
+
+
+Normally, to pass one or more modifiers to a RE class, you declare
+a RE_Options object, set the appropriate options, and pass this
+object to a RE constructor. Example:
+
+ RE_Options opt;
+ opt.set_caseless(true);
+ if (RE("HELLO", opt).PartialMatch("hello world")) ...
+
+RE_Options has two constructors. The default constructor takes no arguments and
+creates a set of flags that are off by default. The optional parameter
+option_flags is to facilitate transfer of legacy code from C programs.
+This lets you do
+
+ RE(pattern,
+ RE_Options(PCRE_CASELESS|PCRE_MULTILINE)).PartialMatch(str);
+
+However, new code is better off doing
+
+ RE(pattern,
+ RE_Options().set_caseless(true).set_multiline(true))
+ .PartialMatch(str);
+
+If you are going to pass one of the most used modifiers, there are some
+convenience functions that return a RE_Options class with the
+appropriate modifier already set: CASELESS(), UTF8(),
+MULTILINE(), DOTALL(), and EXTENDED().
+
+
+If you need to set several options at once, and you don't want to go through
+the pains of declaring a RE_Options object and setting several options, there
+is a parallel method that gives you such ability on the fly. You can concatenate
+several set_xxxxx() member functions, since each of them returns a
+reference to its class object. For example, to pass PCRE_CASELESS,
+PCRE_EXTENDED, and PCRE_MULTILINE to a RE with one statement, you may write:
+
+ RE(" ^ xyz \\s+ .* blah$",
+ RE_Options()
+ .set_caseless(true)
+ .set_extended(true)
+ .set_multiline(true)).PartialMatch(sometext);
+
+
+
+
SCANNING TEXT INCREMENTALLY
+
+The "Consume" operation may be useful if you want to repeatedly
+match regular expressions at the front of a string and skip over
+them as they match. This requires use of the "StringPiece" type,
+which represents a sub-range of a real string. Like RE, StringPiece
+is defined in the pcrecpp namespace.
+
+ Example: read lines of the form "var = value" from a string.
+ string contents = ...; // Fill string somehow
+ pcrecpp::StringPiece input(contents); // Wrap in a StringPiece
+
+
+
+
+ string var;
+ int value;
+ pcrecpp::RE re("(\\w+) = (\\d+)\n");
+ while (re.Consume(&input, &var, &value)) {
+ ...;
+ }
+
+Each successful call to "Consume" will set "var/value", and also
+advance "input" so it points past the matched text.
+
+
+The "FindAndConsume" operation is similar to "Consume" but does not
+anchor your match at the beginning of the string. For example, you
+could extract all words from a string by repeatedly calling
+
+ pcrecpp::RE("(\\w+)").FindAndConsume(&input, &word)
+
+
+
PARSING HEX/OCTAL/C-RADIX NUMBERS
+
+By default, if you pass a pointer to a numeric value, the
+corresponding text is interpreted as a base-10 number. You can
+instead wrap the pointer with a call to one of the operators Hex(),
+Octal(), or CRadix() to interpret the text in another base. The
+CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
+prefixes, but defaults to base-10.
+
+ Example:
+ int a, b, c, d;
+ pcrecpp::RE re("(.*) (.*) (.*) (.*)");
+ re.FullMatch("100 40 0100 0x40",
+ pcrecpp::Octal(&a), pcrecpp::Hex(&b),
+ pcrecpp::CRadix(&c), pcrecpp::CRadix(&d));
+
+will leave 64 in a, b, c, and d.
+
+
REPLACING PARTS OF STRINGS
+
+You can replace the first match of "pattern" in "str" with "rewrite".
+Within "rewrite", backslash-escaped digits (\1 to \9) can be
+used to insert text matching corresponding parenthesized group
+from the pattern. \0 in "rewrite" refers to the entire matching
+text. For example:
+
+ string s = "yabba dabba doo";
+ pcrecpp::RE("b+").Replace("d", &s);
+
+will leave "s" containing "yada dabba doo". The result is true if the pattern
+matches and a replacement occurs, false otherwise.
+
+
+GlobalReplace is like Replace except that it replaces all
+occurrences of the pattern in the string with the rewrite. Replacements are
+not subject to re-matching. For example:
+
+ string s = "yabba dabba doo";
+ pcrecpp::RE("b+").GlobalReplace("d", &s);
+
+will leave "s" containing "yada dada doo". It returns the number of
+replacements made.
+
+
+Extract is like Replace, except that if the pattern matches,
+"rewrite" is copied into "out" (an additional argument) with substitutions.
+The non-matching portions of "text" are ignored. Returns true iff a match
+occurred and the extraction happened successfully; if no match occurs, the
+string is left unaffected.
+
+
AUTHOR
+
+The C++ wrapper was contributed by Google Inc.
+
+Copyright © 2007 Google Inc.
+
+
+
REVISION
+
+Last updated: 12 November 2007
+
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcregrep.html b/src/doc/html/pcregrep.html
new file mode 100644
index 0000000..4367031
--- /dev/null
+++ b/src/doc/html/pcregrep.html
@@ -0,0 +1,516 @@
+
+
+pcregrep specification
+
+
+pcregrep man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+
SYNOPSIS
+
+pcregrep [options] [long options] [pattern] [path1 path2 ...]
+
+
DESCRIPTION
+
+pcregrep searches files for character patterns, in the same way as other
+grep commands do, but it uses the PCRE regular expression library to support
+patterns that are compatible with the regular expressions of Perl 5. See
+pcrepattern(3)
+for a full description of syntax and semantics of the regular expressions
+that PCRE supports.
+
+
+Patterns, whether supplied on the command line or in a separate file, are given
+without delimiters. For example:
+
+ pcregrep Thursday /etc/motd
+
+If you attempt to use delimiters (for example, by surrounding a pattern with
+slashes, as is common in Perl scripts), they are interpreted as part of the
+pattern. Quotes can of course be used to delimit patterns on the command line
+because they are interpreted by the shell, and indeed they are required if a
+pattern contains white space or shell metacharacters.
+
+
+The first argument that follows any option settings is treated as the single
+pattern to be matched when neither -e nor -f is present.
+Conversely, when one or both of these options are used to specify patterns, all
+arguments are treated as path names. At least one of -e, -f, or an
+argument pattern must be provided.
+
+
+If no files are specified, pcregrep reads the standard input. The
+standard input can also be referenced by a name consisting of a single hyphen.
+For example:
+
+ pcregrep some-pattern /file1 - /file3
+
+By default, each line that matches a pattern is copied to the standard
+output, and if there is more than one file, the file name is output at the
+start of each line, followed by a colon. However, there are options that can
+change how pcregrep behaves. In particular, the -M option makes it
+possible to search for patterns that span line boundaries. What defines a line
+boundary is controlled by the -N (--newline) option.
+
+
+Patterns are limited to 8K or BUFSIZ characters, whichever is the greater.
+BUFSIZ is defined in <stdio.h>. When there is more than one pattern
+(specified by the use of -e and/or -f), each pattern is applied to
+each line in the order in which they are defined, except that all the -e
+patterns are tried before the -f patterns. As soon as one pattern matches
+(or fails to match when -v is used), no further patterns are considered.
+
+
+When --only-matching, --file-offsets, or --line-offsets
+is used, the output is the part of the line that matched (either shown
+literally, or as an offset). In this case, scanning resumes immediately
+following the match, so that further matches on the same line can be found.
+If there are multiple patterns, they are all tried on the remainder of the
+line. However, patterns that follow the one that matched are not tried on the
+earlier part of the line.
+
+
+If the LC_ALL or LC_CTYPE environment variable is set,
+pcregrep uses the value to set a locale when calling the PCRE library.
+The --locale option can be used to override this.
+
+
SUPPORT FOR COMPRESSED FILES
+
+It is possible to compile pcregrep so that it uses libz or
+libbz2 to read files whose names end in .gz or .bz2,
+respectively. You can find out whether your binary has support for one or both
+of these file types by running it with the --help option. If the
+appropriate support is not present, files are treated as plain text. The
+standard input is always so treated.
+
+
OPTIONS
+
+--
+This terminates the list of options. It is useful if the next item on the
+command line starts with a hyphen but is not an option. This allows for the
+processing of patterns and filenames that start with hyphens.
+
+
+-A number, --after-context=number
+Output number lines of context after each matching line. If filenames
+and/or line numbers are being output, a hyphen separator is used instead of a
+colon for the context lines. A line containing "--" is output between each
+group of lines, unless they are in fact contiguous in the input file. The value
+of number is expected to be relatively small. However, pcregrep
+guarantees to have up to 8K of following text available for context output.
+
+
+-B number, --before-context=number
+Output number lines of context before each matching line. If filenames
+and/or line numbers are being output, a hyphen separator is used instead of a
+colon for the context lines. A line containing "--" is output between each
+group of lines, unless they are in fact contiguous in the input file. The value
+of number is expected to be relatively small. However, pcregrep
+guarantees to have up to 8K of preceding text available for context output.
+
+
+-C number, --context=number
+Output number lines of context both before and after each matching line.
+This is equivalent to setting both -A and -B to the same value.
+
+
+-c, --count
+Do not output individual lines; instead just output a count of the number of
+lines that would otherwise have been output. If several files are given, a
+count is output for each of them. In this mode, the -A, -B, and
+-C options are ignored.
+
+
+--colour, --color
+If this option is given without any data, it is equivalent to "--colour=auto".
+If data is required, it must be given in the same shell item, separated by an
+equals sign.
+
+
+--colour=value, --color=value
+This option specifies under what circumstances the part of a line that matched
+a pattern should be coloured in the output. The value may be "never" (the
+default), "always", or "auto". In the latter case, colouring happens only if
+the standard output is connected to a terminal. The colour can be specified by
+setting the environment variable PCREGREP_COLOUR or PCREGREP_COLOR. The value
+of this variable should be a string of two numbers, separated by a semicolon.
+They are copied directly into the control string for setting colour on a
+terminal, so it is your responsibility to ensure that they make sense. If
+neither of the environment variables is set, the default is "1;31", which gives
+red.
+
+
+-D action, --devices=action
+If an input path is not a regular file or a directory, "action" specifies how
+it is to be processed. Valid values are "read" (the default) or "skip"
+(silently skip the path).
+
+
+-d action, --directories=action
+If an input path is a directory, "action" specifies how it is to be processed.
+Valid values are "read" (the default), "recurse" (equivalent to the -r
+option), or "skip" (silently skip the path). In the default case, directories
+are read as if they were ordinary files. In some operating systems the effect
+of reading a directory like this is an immediate end-of-file.
+
+
+-e pattern, --regex=pattern, --regexp=pattern
+Specify a pattern to be matched. This option can be used multiple times in
+order to specify several patterns. It can also be used as a way of specifying a
+single pattern that starts with a hyphen. When -e is used, no argument
+pattern is taken from the command line; all arguments are treated as file
+names. There is an overall maximum of 100 patterns. They are applied to each
+line in the order in which they are defined until one matches (or fails to
+match if -v is used). If -f is used with -e, the command line
+patterns are matched first, followed by the patterns from the file, independent
+of the order in which these options are specified. Note that multiple use of
+-e is not the same as a single pattern with alternatives. For example,
+X|Y finds the first character in a line that is X or Y, whereas if the two
+patterns are given separately, pcregrep finds X if it is present, even if
+it follows Y in the line. It finds Y only if there is no X in the line. This
+really matters only if you are using -o to show the part(s) of the line
+that matched.
+
+
+--exclude=pattern
+When pcregrep is searching the files in a directory as a consequence of
+the -r (recursive search) option, any regular files whose names match the
+pattern are excluded. Subdirectories are not excluded by this option; they are
+searched recursively, subject to the --exclude_dir and
+--include_dir options. The pattern is a PCRE regular expression, and is
+matched against the final component of the file name (not the entire path). If
+a file name matches both --include and --exclude, it is excluded.
+There is no short form for this option.
+
+
+--exclude_dir=pattern
+When pcregrep is searching the contents of a directory as a consequence
+of the -r (recursive search) option, any subdirectories whose names match
+the pattern are excluded. (Note that the --exclude option does not affect
+subdirectories.) The pattern is a PCRE regular expression, and is matched
+against the final component of the name (not the entire path). If a
+subdirectory name matches both --include_dir and --exclude_dir, it
+is excluded. There is no short form for this option.
+
+
+-F, --fixed-strings
+Interpret each pattern as a list of fixed strings, separated by newlines,
+instead of as a regular expression. The -w (match as a word) and -x
+(match whole line) options can be used with -F. They apply to each of the
+fixed strings. A line is selected if any of the fixed strings are found in it
+(subject to -w or -x, if present).
+
+
+-f filename, --file=filename
+Read a number of patterns from the file, one per line, and match them against
+each line of input. A data line is output if any of the patterns match it. The
+filename can be given as "-" to refer to the standard input. When -f is
+used, patterns specified on the command line using -e may also be
+present; they are tested before the file's patterns. However, no other pattern
+is taken from the command line; all arguments are treated as file names. There
+is an overall maximum of 100 patterns. Trailing white space is removed from
+each line, and blank lines are ignored. An empty file contains no patterns and
+therefore matches nothing. See also the comments about multiple patterns versus
+a single pattern with alternatives in the description of -e above.
+
+
+--file-offsets
+Instead of showing lines or parts of lines that match, show each match as an
+offset from the start of the file and a length, separated by a comma. In this
+mode, no context is shown. That is, the -A, -B, and -C
+options are ignored. If there is more than one match in a line, each of them is
+shown separately. This option is mutually exclusive with --line-offsets
+and --only-matching.
+
+
+-H, --with-filename
+Force the inclusion of the filename at the start of output lines when searching
+a single file. By default, the filename is not shown in this case. For matching
+lines, the filename is followed by a colon and a space; for context lines, a
+hyphen separator is used. If a line number is also being output, it follows the
+file name without a space.
+
+
+-h, --no-filename
+Suppress the output filenames when searching multiple files. By default,
+filenames are shown when multiple files are searched. For matching lines, the
+filename is followed by a colon and a space; for context lines, a hyphen
+separator is used. If a line number is also being output, it follows the file
+name without a space.
+
+
+--help
+Output a help message, giving brief details of the command options and file
+type support, and then exit.
+
+
+-i, --ignore-case
+Ignore upper/lower case distinctions during comparisons.
+
+
+--include=pattern
+When pcregrep is searching the files in a directory as a consequence of
+the -r (recursive search) option, only those regular files whose names
+match the pattern are included. Subdirectories are always included and searched
+recursively, subject to the --include_dir and --exclude_dir
+options. The pattern is a PCRE regular expression, and is matched against the
+final component of the file name (not the entire path). If a file name matches
+both --include and --exclude, it is excluded. There is no short
+form for this option.
+
+
+--include_dir=pattern
+When pcregrep is searching the contents of a directory as a consequence
+of the -r (recursive search) option, only those subdirectories whose
+names match the pattern are included. (Note that the --include option
+does not affect subdirectories.) The pattern is a PCRE regular expression, and
+is matched against the final component of the name (not the entire path). If a
+subdirectory name matches both --include_dir and --exclude_dir, it
+is excluded. There is no short form for this option.
+
+
+-L, --files-without-match
+Instead of outputting lines from the files, just output the names of the files
+that do not contain any lines that would have been output. Each file name is
+output once, on a separate line.
+
+
+-l, --files-with-matches
+Instead of outputting lines from the files, just output the names of the files
+containing lines that would have been output. Each file name is output
+once, on a separate line. Searching stops as soon as a matching line is found
+in a file.
+
+
+--label=name
+This option supplies a name to be used for the standard input when file names
+are being output. If not supplied, "(standard input)" is used. There is no
+short form for this option.
+
+
+--line-offsets
+Instead of showing lines or parts of lines that match, show each match as a
+line number, the offset from the start of the line, and a length. The line
+number is terminated by a colon (as usual; see the -n option), and the
+offset and length are separated by a comma. In this mode, no context is shown.
+That is, the -A, -B, and -C options are ignored. If there is
+more than one match in a line, each of them is shown separately. This option is
+mutually exclusive with --file-offsets and --only-matching.
+
+
+--locale=locale-name
+This option specifies a locale to be used for pattern matching. It overrides
+the value in the LC_ALL or LC_CTYPE environment variables. If no
+locale is specified, the PCRE library's default (usually the "C" locale) is
+used. There is no short form for this option.
+
+
+-M, --multiline
+Allow patterns to match more than one line. When this option is given, patterns
+may usefully contain literal newline characters and internal occurrences of ^
+and $ characters. The output for any one match may consist of more than one
+line. When this option is set, the PCRE library is called in "multiline" mode.
+There is a limit to the number of lines that can be matched, imposed by the way
+that pcregrep buffers the input file as it scans it. However,
+pcregrep ensures that at least 8K characters or the rest of the document
+(whichever is the shorter) are available for forward matching, and similarly
+the previous 8K characters (or all the previous characters, if fewer than 8K)
+are guaranteed to be available for lookbehind assertions.
+
+
+-N newline-type, --newline=newline-type
+The PCRE library supports five different conventions for indicating
+the ends of lines. They are the single-character sequences CR (carriage return)
+and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
+which recognizes any of the preceding three types, and an "any" convention, in
+which any Unicode line ending sequence is assumed to end a line. The Unicode
+sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
+(formfeed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
+PS (paragraph separator, U+2029).
+
+
+When the PCRE library is built, a default line-ending sequence is specified.
+This is normally the standard sequence for the operating system. Unless
+otherwise specified by this option, pcregrep uses the library's default.
+The possible values for this option are CR, LF, CRLF, ANYCRLF, or ANY. This
+makes it possible to use pcregrep on files that have come from other
+environments without having to modify their line endings. If the data that is
+being scanned does not agree with the convention set by this option,
+pcregrep may behave in strange ways.
+
+
+-n, --line-number
+Precede each output line by its line number in the file, followed by a colon
+and a space for matching lines or a hyphen and a space for context lines. If
+the filename is also being output, it precedes the line number. This option is
+forced if --line-offsets is used.
+
+
+-o, --only-matching
+Show only the part of the line that matched a pattern. In this mode, no
+context is shown. That is, the -A, -B, and -C options are
+ignored. If there is more than one match in a line, each of them is shown
+separately. If -o is combined with -v (invert the sense of the
+match to find non-matching lines), no output is generated, but the return code
+is set appropriately. This option is mutually exclusive with
+--file-offsets and --line-offsets.
+
+
+-q, --quiet
+Work quietly, that is, display nothing except error messages. The exit
+status indicates whether or not any matches were found.
+
+
+-r, --recursive
+If any given path is a directory, recursively scan the files it contains,
+taking note of any --include and --exclude settings. By default, a
+directory is read as a normal file; in some operating systems this gives an
+immediate end-of-file. This option is a shorthand for setting the -d
+option to "recurse".
+
+
+-s, --no-messages
+Suppress error messages about non-existent or unreadable files. Such files are
+quietly skipped. However, the return code is still 2, even if matches were
+found in other files.
+
+
+-u, --utf-8
+Operate in UTF-8 mode. This option is available only if PCRE has been compiled
+with UTF-8 support. Both patterns and subject lines must be valid strings of
+UTF-8 characters.
+
+
+-V, --version
+Write the version numbers of pcregrep and the PCRE library that is being
+used to the standard error stream.
+
+
+-v, --invert-match
+Invert the sense of the match, so that lines which do not match any of
+the patterns are the ones that are found.
+
+
+-w, --word-regex, --word-regexp
+Force the patterns to match only whole words. This is equivalent to having \b
+at the start and end of the pattern.
+
+
+-x, --line-regex, --line-regexp
+Force the patterns to be anchored (each must start matching at the beginning of
+a line) and in addition, require them to match entire lines. This is
+equivalent to having ^ and $ characters at the start and end of each
+alternative branch in every pattern.
+
+
ENVIRONMENT VARIABLES
+
+The environment variables LC_ALL and LC_CTYPE are examined, in that
+order, for a locale. The first one that is set is used. This can be overridden
+by the --locale option. If no locale is set, the PCRE library's default
+(usually the "C" locale) is used.
+
+
NEWLINES
+
+The -N (--newline) option allows pcregrep to scan files with
+different newline conventions from the default. However, the setting of this
+option does not affect the way in which pcregrep writes information to
+the standard error and output streams. It uses the string "\n" in C
+printf() calls to indicate newlines, relying on the C I/O library to
+convert this to an appropriate sequence if the output is sent to a file.
+
+
OPTIONS COMPATIBILITY
+
+The majority of short and long forms of pcregrep's options are the same
+as in the GNU grep program. Any long option of the form
+--xxx-regexp (GNU terminology) is also available as --xxx-regex
+(PCRE terminology). However, the --locale, -M, --multiline,
+-u, and --utf-8 options are specific to pcregrep.
+
+
OPTIONS WITH DATA
+
+There are four different ways in which an option with data can be specified.
+If a short form option is used, the data may follow immediately, or in the next
+command line item. For example:
+
+ -f/some/file
+ -f /some/file
+
+If a long form option is used, the data may appear in the same command line
+item, separated by an equals character, or (with one exception) it may appear
+in the next command line item. For example:
+
+ --file=/some/file
+ --file /some/file
+
+Note, however, that if you want to supply a file name beginning with ~ as data
+in a shell command, and have the shell expand ~ to a home directory, you must
+separate the file name from the option, because the shell does not treat ~
+specially unless it is at the start of an item.
+
+
+The exception to the above is the --colour (or --color) option,
+for which the data is optional. If this option does have data, it must be given
+in the first form, using an equals character. Otherwise it will be assumed that
+it has no data.
+
+
MATCHING ERRORS
+
+It is possible to supply a regular expression that takes a very long time to
+fail to match certain lines. Such patterns normally involve nested indefinite
+repeats, for example: (a+)*\d when matched against a line of a's with no final
+digit. The PCRE matching function has a resource limit that causes it to abort
+in these circumstances. If this happens, pcregrep outputs an error
+message and the line that caused the problem to the standard error stream. If
+there are more than 20 such errors, pcregrep gives up.
+
+
DIAGNOSTICS
+
+Exit status is 0 if any matches were found, 1 if no matches were found, and 2
+for syntax errors and non-existent or inaccessible files (even if matches were
+found in other files) or too many matching errors. Using the -s option to
+suppress error messages about inaccessible files does not affect the return
+code.
+
+
SEE ALSO
+
+pcrepattern(3), pcretest(1).
+
+
AUTHOR
+
+Philip Hazel
+
+University Computing Service
+
+Cambridge CB2 3QH, England.
+
+
+
REVISION
+
+Last updated: 08 March 2008
+
+Copyright © 1997-2008 University of Cambridge.
+
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcrematching.html b/src/doc/html/pcrematching.html
new file mode 100644
index 0000000..2cad88b
--- /dev/null
+++ b/src/doc/html/pcrematching.html
@@ -0,0 +1,224 @@
+
+
+pcrematching specification
+
+
+pcrematching man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+
PCRE MATCHING ALGORITHMS
+
+This document describes the two different algorithms that are available in PCRE
+for matching a compiled regular expression against a given subject string. The
+"standard" algorithm is the one provided by the pcre_exec() function.
+This works in the same way as Perl's matching function, and provides a
+Perl-compatible matching operation.
+
+
+An alternative algorithm is provided by the pcre_dfa_exec() function;
+this operates in a different way, and is not Perl-compatible. It has advantages
+and disadvantages compared with the standard algorithm, and these are described
+below.
+
+
+When there is only one possible way in which a given subject string can match a
+pattern, the two algorithms give the same answer. A difference arises, however,
+when there are multiple possibilities. For example, if the pattern
+
+ ^<.*>
+
+is matched against the string
+
+ <something> <something else> <something further>
+
+there are three possible answers. The standard algorithm finds only one of
+them, whereas the alternative algorithm finds all three.
+
+
REGULAR EXPRESSIONS AS TREES
+
+The set of strings that are matched by a regular expression can be represented
+as a tree structure. An unlimited repetition in the pattern makes the tree of
+infinite size, but it is still a tree. Matching the pattern to a given subject
+string (from a given starting point) can be thought of as a search of the tree.
+There are two ways to search a tree: depth-first and breadth-first, and these
+correspond to the two matching algorithms provided by PCRE.
+
+
THE STANDARD MATCHING ALGORITHM
+
+In the terminology of Jeffrey Friedl's book "Mastering Regular
+Expressions", the standard algorithm is an "NFA algorithm". It conducts a
+depth-first search of the pattern tree. That is, it proceeds along a single
+path through the tree, checking that the subject matches what is required. When
+there is a mismatch, the algorithm tries any alternatives at the current point,
+and if they all fail, it backs up to the previous branch point in the tree, and
+tries the next alternative branch at that level. This often involves backing up
+(moving to the left) in the subject string as well. The order in which
+repetition branches are tried is controlled by the greedy or ungreedy nature of
+the quantifier.
+
+
+If a leaf node is reached, a matching string has been found, and at that point
+the algorithm stops. Thus, if there is more than one possible match, this
+algorithm returns the first one that it finds. Whether this is the shortest,
+the longest, or some intermediate length depends on the way the greedy and
+ungreedy repetition quantifiers are specified in the pattern.
+
+
+Because it ends up with a single path through the tree, it is relatively
+straightforward for this algorithm to keep track of the substrings that are
+matched by portions of the pattern in parentheses. This provides support for
+capturing parentheses and back references.
+
+
THE ALTERNATIVE MATCHING ALGORITHM
+
+This algorithm conducts a breadth-first search of the tree. Starting from the
+first matching point in the subject, it scans the subject string from left to
+right, once, character by character, and as it does this, it remembers all the
+paths through the tree that represent valid matches. In Friedl's terminology,
+this is a kind of "DFA algorithm", though it is not implemented as a
+traditional finite state machine (it keeps multiple states active
+simultaneously).
+
+
+The scan continues until either the end of the subject is reached, or there are
+no more unterminated paths. At this point, terminated paths represent the
+different matching possibilities (if there are none, the match has failed).
+Thus, if there is more than one possible match, this algorithm finds all of
+them, and in particular, it finds the longest. In PCRE, there is an option to
+stop the algorithm after the first match (which is necessarily the shortest)
+has been found.
+
+
+Note that all the matches that are found start at the same point in the
+subject. If the pattern
+
+ cat(er(pillar)?)?
+
+is matched against the string "the caterpillar catchment", the result will be
+the three strings "cat", "cater", and "caterpillar" that start at the fifth
+character of the subject. The algorithm does not automatically move on to find
+matches that start at later positions.
+
+
+There are a number of features of PCRE regular expressions that are not
+supported by the alternative matching algorithm. They are as follows:
+
+
+1. Because the algorithm finds all possible matches, the greedy or ungreedy
+nature of repetition quantifiers is not relevant. Greedy and ungreedy
+quantifiers are treated in exactly the same way. However, possessive
+quantifiers can make a difference when what follows could also match what is
+quantified, for example in a pattern like this:
+
+ ^a++\w!
+
+This pattern matches "aaab!" but not "aaa!", which would be matched by a
+non-possessive quantifier. Similarly, if an atomic group is present, it is
+matched as if it were a standalone pattern at the current point, and the
+longest match is then "locked in" for the rest of the overall pattern.
+
+
+2. When dealing with multiple paths through the tree simultaneously, it is not
+straightforward to keep track of captured substrings for the different matching
+possibilities, and PCRE's implementation of this algorithm does not attempt to
+do this. This means that no captured substrings are available.
+
+
+3. Because no substrings are captured, back references within the pattern are
+not supported, and cause errors if encountered.
+
+
+4. For the same reason, conditional expressions that use a backreference as the
+condition or test for a specific group recursion are not supported.
+
+
+5. Because many paths through the tree may be active, the \K escape sequence,
+which resets the start of the match when encountered (but may be on some paths
+and not on others), is not supported. It causes an error if encountered.
+
+
+6. Callouts are supported, but the value of the capture_top field is
+always 1, and the value of the capture_last field is always -1.
+
+
+7. The \C escape sequence, which (in the standard algorithm) matches a single
+byte, even in UTF-8 mode, is not supported because the alternative algorithm
+moves through the subject string one character at a time, for all active paths
+through the tree.
+
+
+8. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not
+supported. (*FAIL) is supported, and behaves like a failing negative assertion.
+
+
ADVANTAGES OF THE ALTERNATIVE ALGORITHM
+
+Using the alternative matching algorithm provides the following advantages:
+
+
+1. All possible matches (at a single point in the subject) are automatically
+found, and in particular, the longest match is found. To find more than one
+match using the standard algorithm, you have to do kludgy things with
+callouts.
+
+
+2. There is much better support for partial matching. The restrictions on the
+content of the pattern that apply when using the standard algorithm for partial
+matching do not apply to the alternative algorithm. For non-anchored patterns,
+the starting position of a partial match is available.
+
+
+3. Because the alternative algorithm scans the subject string just once, and
+never needs to backtrack, it is possible to pass very long subject strings to
+the matching function in several pieces, checking for partial matching each
+time.
+
+
DISADVANTAGES OF THE ALTERNATIVE ALGORITHM
+
+The alternative algorithm suffers from a number of disadvantages:
+
+
+1. It is substantially slower than the standard algorithm. This is partly
+because it has to search for all possible matches, but is also because it is
+less susceptible to optimization.
+
+
+2. Capturing parentheses and back references are not supported.
+
+
+3. Although atomic groups are supported, their use does not provide the
+performance advantage that it does for the standard algorithm.
+
+
AUTHOR
+
+Philip Hazel
+
+University Computing Service
+
+Cambridge CB2 3QH, England.
+
+
+
REVISION
+
+Last updated: 19 April 2008
+
+Copyright © 1997-2008 University of Cambridge.
+
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcrepartial.html b/src/doc/html/pcrepartial.html
new file mode 100644
index 0000000..1fab23c
--- /dev/null
+++ b/src/doc/html/pcrepartial.html
@@ -0,0 +1,242 @@
+
+
+pcrepartial specification
+
+
+pcrepartial man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+
PARTIAL MATCHING IN PCRE
+
+In normal use of PCRE, if the subject string that is passed to
+pcre_exec() or pcre_dfa_exec() matches as far as it goes, but is
+too short to match the entire pattern, PCRE_ERROR_NOMATCH is returned. There
+are circumstances where it might be helpful to distinguish this case from other
+cases in which there is no match.
+
+
+Consider, for example, an application where a human is required to type in data
+for a field with specific formatting requirements. An example might be a date
+in the form ddmmmyy, defined by this pattern:
+
+ ^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$
+
+If the application sees the user's keystrokes one by one, and can check that
+what has been typed so far is potentially valid, it is able to raise an error
+as soon as a mistake is made, possibly beeping and not reflecting the
+character that has been typed. This immediate feedback is likely to be a better
+user interface than a check that is delayed until the entire string has been
+entered.
+
+
+PCRE supports the concept of partial matching by means of the PCRE_PARTIAL
+option, which can be set when calling pcre_exec() or
+pcre_dfa_exec(). When this flag is set for pcre_exec(), the return
+code PCRE_ERROR_NOMATCH is converted into PCRE_ERROR_PARTIAL if at any time
+during the matching process the last part of the subject string matched part of
+the pattern. Unfortunately, for non-anchored matching, it is not possible to
+obtain the position of the start of the partial match. No captured data is set
+when PCRE_ERROR_PARTIAL is returned.
+
+
+When PCRE_PARTIAL is set for pcre_dfa_exec(), the return code
+PCRE_ERROR_NOMATCH is converted into PCRE_ERROR_PARTIAL if the end of the
+subject is reached, there have been no complete matches, but there is still at
+least one matching possibility. The portion of the string that provided the
+partial match is set as the first matching string.
+
+
+Using PCRE_PARTIAL disables one of PCRE's optimizations. PCRE remembers the
+last literal byte in a pattern, and abandons matching immediately if such a
+byte is not present in the subject string. This optimization cannot be used
+for a subject string that might match only partially.
+
+
RESTRICTED PATTERNS FOR PCRE_PARTIAL
+
+Because of the way certain internal optimizations are implemented in the
+pcre_exec() function, the PCRE_PARTIAL option cannot be used with all
+patterns. These restrictions do not apply when pcre_dfa_exec() is used.
+For pcre_exec(), repeated single characters such as
+
+ a{2,4}
+
+and repeated single metasequences such as
+
+ \d+
+
+are not permitted if the maximum number of occurrences is greater than one.
+Optional items such as \d? (where the maximum is one) are permitted.
+Quantifiers with any values are permitted after parentheses, so the invalid
+examples above can be coded thus:
+
+ (a){2,4}
+ (\d)+
+
+These constructions run more slowly, but for the kinds of application that are
+envisaged for this facility, this is not felt to be a major restriction.
+
+
+If PCRE_PARTIAL is set for a pattern that does not conform to the restrictions,
+pcre_exec() returns the error code PCRE_ERROR_BADPARTIAL (-13).
+You can use the PCRE_INFO_OKPARTIAL call to pcre_fullinfo() to find out
+if a compiled pattern can be used for partial matching.
+
+
EXAMPLE OF PARTIAL MATCHING USING PCRETEST
+
+If the escape sequence \P is present in a pcretest data line, the
+PCRE_PARTIAL flag is used for the match. Here is a run of pcretest that
+uses the date example quoted above:
+
+ re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
+ data> 25jun04\P
+ 0: 25jun04
+ 1: jun
+ data> 25dec3\P
+ Partial match
+ data> 3ju\P
+ Partial match
+ data> 3juj\P
+ No match
+ data> j\P
+ No match
+
+The first data string is matched completely, so pcretest shows the
+matched substrings. The remaining four strings do not match the complete
+pattern, but the first two are partial matches. The same test, using
+pcre_dfa_exec() matching (by means of the \D escape sequence), produces
+the following output:
+
+ re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
+ data> 25jun04\P\D
+ 0: 25jun04
+ data> 23dec3\P\D
+ Partial match: 23dec3
+ data> 3ju\P\D
+ Partial match: 3ju
+ data> 3juj\P\D
+ No match
+ data> j\P\D
+ No match
+
+Notice that in this case the portion of the string that was matched is made
+available.
+
+
MULTI-SEGMENT MATCHING WITH pcre_dfa_exec()
+
+When a partial match has been found using pcre_dfa_exec(), it is possible
+to continue the match by providing additional subject data and calling
+pcre_dfa_exec() again with the same compiled regular expression, this
+time setting the PCRE_DFA_RESTART option. You must also pass the same working
+space as before, because this is where details of the previous partial match
+are stored. Here is an example using pcretest, using the \R escape
+sequence to set the PCRE_DFA_RESTART option (\P and \D are as above):
+
+ re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
+ data> 23ja\P\D
+ Partial match: 23ja
+ data> n05\R\D
+ 0: n05
+
+The first call has "23ja" as the subject, and requests partial matching; the
+second call has "n05" as the subject for the continued (restarted) match.
+Notice that when the match is complete, only the last part is shown; PCRE does
+not retain the previously partially-matched string. It is up to the calling
+program to do that if it needs to.
+
+
+You can set PCRE_PARTIAL with PCRE_DFA_RESTART to continue partial matching
+over multiple segments. This facility can be used to pass very long subject
+strings to pcre_dfa_exec(). However, some care is needed for certain
+types of pattern.
+
+
+1. If the pattern contains tests for the beginning or end of a line, you need
+to pass the PCRE_NOTBOL or PCRE_NOTEOL options, as appropriate, when the
+subject string for any call does not contain the beginning or end of a line.
+
+
+2. If the pattern contains backward assertions (including \b or \B), you need
+to arrange for some overlap in the subject strings to allow for this. For
+example, you could pass the subject in chunks that are 500 bytes long, but in
+a buffer of 700 bytes, with the starting offset set to 200 and the previous 200
+bytes at the start of the buffer.
+
+
+3. Matching a subject string that is split into multiple segments does not
+always produce exactly the same result as matching over one single long string.
+The difference arises when there are multiple matching possibilities, because a
+partial match result is given only when there are no completed matches in a
+call to pcre_dfa_exec(). This means that as soon as the shortest match has
+been found, continuation to a new subject segment is no longer possible.
+Consider this pcretest example:
+
+ re> /dog(sbody)?/
+ data> do\P\D
+ Partial match: do
+ data> gsb\R\P\D
+ 0: g
+ data> dogsbody\D
+ 0: dogsbody
+ 1: dog
+
+The pattern matches the words "dog" or "dogsbody". When the subject is
+presented in several parts ("do" and "gsb" being the first two) the match stops
+when "dog" has been found, and it is not possible to continue. On the other
+hand, if "dogsbody" is presented as a single string, both matches are found.
+
+
+Because of this phenomenon, it does not usually make sense to end a pattern
+that is going to be matched in this way with a variable repeat.
+
+
+4. Patterns that contain alternatives at the top level which do not all
+start with the same pattern item may not work as expected. For example,
+consider this pattern:
+
+ 1234|3789
+
+If the first part of the subject is "ABC123", a partial match of the first
+alternative is found at offset 3. There is no partial match for the second
+alternative, because such a match does not start at the same point in the
+subject string. Attempting to continue with the string "789" does not yield a
+match because only those alternatives that match at one point in the subject
+are remembered. The problem arises because the start of the second alternative
+matches within the first alternative. There is no problem with anchored
+patterns or patterns such as:
+
+ 1234|ABCD
+
+where no string can be a partial match for both alternatives.
+
+
AUTHOR
+
+Philip Hazel
+
+University Computing Service
+
+Cambridge CB2 3QH, England.
+
+
+
REVISION
+
+Last updated: 04 June 2007
+
+Copyright © 1997-2007 University of Cambridge.
+
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcrepattern.html b/src/doc/html/pcrepattern.html
new file mode 100644
index 0000000..9cc055c
--- /dev/null
+++ b/src/doc/html/pcrepattern.html
@@ -0,0 +1,2247 @@
+
+
+pcrepattern specification
+
+
+pcrepattern man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+
PCRE REGULAR EXPRESSION DETAILS
+
+The syntax and semantics of the regular expressions that are supported by PCRE
+are described in detail below. There is a quick-reference syntax summary in the
+pcresyntax
+page. PCRE tries to match Perl syntax and semantics as closely as it can. PCRE
+also supports some alternative regular expression syntax (which does not
+conflict with the Perl syntax) in order to provide some compatibility with
+regular expressions in Python, .NET, and Oniguruma.
+
+
+Perl's regular expressions are described in its own documentation, and
+regular expressions in general are covered in a number of books, some of which
+have copious examples. Jeffrey Friedl's "Mastering Regular Expressions",
+published by O'Reilly, covers regular expressions in great detail. This
+description of PCRE's regular expressions is intended as reference material.
+
+
+The original operation of PCRE was on strings of one-byte characters. However,
+there is now also support for UTF-8 character strings. To use this, you must
+build PCRE to include UTF-8 support, and then call pcre_compile() with
+the PCRE_UTF8 option. How this affects pattern matching is mentioned in several
+places below. There is also a summary of UTF-8 features in the
+section on UTF-8 support
+in the main
+pcre
+page.
+
+
+The remainder of this document discusses the patterns that are supported by
+PCRE when its main matching function, pcre_exec(), is used.
+From release 6.0, PCRE offers a second matching function,
+pcre_dfa_exec(), which matches using a different algorithm that is not
+Perl-compatible. Some of the features discussed below are not available when
+pcre_dfa_exec() is used. The advantages and disadvantages of the
+alternative function, and how it differs from the normal function, are
+discussed in the
+pcrematching
+page.
+
+
NEWLINE CONVENTIONS
+
+PCRE supports five different conventions for indicating line breaks in
+strings: a single CR (carriage return) character, a single LF (linefeed)
+character, the two-character sequence CRLF, any of the three preceding, or any
+Unicode newline sequence. The
+pcreapi
+page has
+further discussion
+about newlines, and shows how to set the newline convention in the
+options arguments for the compiling and matching functions.
+
+
+It is also possible to specify a newline convention by starting a pattern
+string with one of the following five sequences:
+
+ (*CR) carriage return
+ (*LF) linefeed
+ (*CRLF) carriage return, followed by linefeed
+ (*ANYCRLF) any of the three above
+ (*ANY) all Unicode newline sequences
+
+These override the default and the options given to pcre_compile(). For
+example, on a Unix system where LF is the default newline sequence, the pattern
+
+ (*CR)a.b
+
+changes the convention to CR. That pattern matches "a\nb" because LF is no
+longer a newline. Note that these special settings, which are not
+Perl-compatible, are recognized only at the very start of a pattern, and that
+they must be in upper case. If more than one of them is present, the last one
+is used.
+
+
+The newline convention does not affect what the \R escape sequence matches. By
+default, this is any Unicode newline sequence, for Perl compatibility. However,
+this can be changed; see the description of \R in the section entitled
+"Newline sequences"
+below. A change of \R setting can be combined with a change of newline
+convention.
+
+
CHARACTERS AND METACHARACTERS
+
+A regular expression is a pattern that is matched against a subject string from
+left to right. Most characters stand for themselves in a pattern, and match the
+corresponding characters in the subject. As a trivial example, the pattern
+
+ The quick brown fox
+
+matches a portion of a subject string that is identical to itself. When
+caseless matching is specified (the PCRE_CASELESS option), letters are matched
+independently of case. In UTF-8 mode, PCRE always understands the concept of
+case for characters whose values are less than 128, so caseless matching is
+always possible. For characters with higher values, the concept of case is
+supported if PCRE is compiled with Unicode property support, but not otherwise.
+If you want to use caseless matching for characters 128 and above, you must
+ensure that PCRE is compiled with Unicode property support as well as with
+UTF-8 support.
+
+
+The power of regular expressions comes from the ability to include alternatives
+and repetitions in the pattern. These are encoded in the pattern by the use of
+metacharacters, which do not stand for themselves but instead are
+interpreted in some special way.
+
+
+There are two different sets of metacharacters: those that are recognized
+anywhere in the pattern except within square brackets, and those that are
+recognized within square brackets. Outside square brackets, the metacharacters
+are as follows:
+
+ \ general escape character with several uses
+ ^ assert start of string (or line, in multiline mode)
+ $ assert end of string (or line, in multiline mode)
+ . match any character except newline (by default)
+ [ start character class definition
+ | start of alternative branch
+ ( start subpattern
+ ) end subpattern
+ ? extends the meaning of (
+ also 0 or 1 quantifier
+ also quantifier minimizer
+ * 0 or more quantifier
+ + 1 or more quantifier
+ also "possessive quantifier"
+ { start min/max quantifier
+
+Part of a pattern that is in square brackets is called a "character class". In
+a character class the only metacharacters are:
+
+ \ general escape character
+ ^ negate the class, but only if the first character
+ - indicates character range
+ [ POSIX character class (only if followed by POSIX syntax)
+ ] terminates the character class
+
+The following sections describe the use of each of the metacharacters.
+
+
BACKSLASH
+
+The backslash character has several uses. Firstly, if it is followed by a
+non-alphanumeric character, it takes away any special meaning that character
+may have. This use of backslash as an escape character applies both inside and
+outside character classes.
+
+
+For example, if you want to match a * character, you write \* in the pattern.
+This escaping action applies whether or not the following character would
+otherwise be interpreted as a metacharacter, so it is always safe to precede a
+non-alphanumeric with backslash to specify that it stands for itself. In
+particular, if you want to match a backslash, you write \\.
+
+
+If a pattern is compiled with the PCRE_EXTENDED option, whitespace in the
+pattern (other than in a character class) and characters between a # outside
+a character class and the next newline are ignored. An escaping backslash can
+be used to include a whitespace or # character as part of the pattern.
+
+
+If you want to remove the special meaning from a sequence of characters, you
+can do so by putting them between \Q and \E. This is different from Perl in
+that $ and @ are handled as literals in \Q...\E sequences in PCRE, whereas in
+Perl, $ and @ cause variable interpolation. Note the following examples:
+
+ Pattern PCRE matches Perl matches
+
+ \Qabc$xyz\E abc$xyz abc followed by the contents of $xyz
+ \Qabc\$xyz\E abc\$xyz abc\$xyz
+ \Qabc\E\$\Qxyz\E abc$xyz abc$xyz
+
+The \Q...\E sequence is recognized both inside and outside character classes.
+
+
+Non-printing characters
+
+
+A second use of backslash provides a way of encoding non-printing characters
+in patterns in a visible manner. There is no restriction on the appearance of
+non-printing characters, apart from the binary zero that terminates a pattern,
+but when a pattern is being prepared by text editing, it is usually easier to
+use one of the following escape sequences than the binary character it
+represents:
+
+ \a alarm, that is, the BEL character (hex 07)
+ \cx "control-x", where x is any character
+ \e escape (hex 1B)
+ \f formfeed (hex 0C)
+ \n linefeed (hex 0A)
+ \r carriage return (hex 0D)
+ \t tab (hex 09)
+ \ddd character with octal code ddd, or backreference
+ \xhh character with hex code hh
+ \x{hhh..} character with hex code hhh..
+
+The precise effect of \cx is as follows: if x is a lower case letter, it
+is converted to upper case. Then bit 6 of the character (hex 40) is inverted.
+Thus \cz becomes hex 1A, but \c{ becomes hex 3B, while \c; becomes hex
+7B.
+
+
+After \x, from zero to two hexadecimal digits are read (letters can be in
+upper or lower case). Any number of hexadecimal digits may appear between \x{
+and }, but the value of the character code must be less than 256 in non-UTF-8
+mode, and less than 2**31 in UTF-8 mode. That is, the maximum value in
+hexadecimal is 7FFFFFFF. Note that this is bigger than the largest Unicode code
+point, which is 10FFFF.
+
+
+If characters other than hexadecimal digits appear between \x{ and }, or if
+there is no terminating }, this form of escape is not recognized. Instead, the
+initial \x will be interpreted as a basic hexadecimal escape, with no
+following digits, giving a character whose value is zero.
+
+
+Characters whose value is less than 256 can be defined by either of the two
+syntaxes for \x. There is no difference in the way they are handled. For
+example, \xdc is exactly the same as \x{dc}.
+
+
+After \0 up to two further octal digits are read. If there are fewer than two
+digits, just those that are present are used. Thus the sequence \0\x\07
+specifies two binary zeros followed by a BEL character (code value 7). Make
+sure you supply two digits after the initial zero if the pattern character that
+follows is itself an octal digit.
+
+
+The handling of a backslash followed by a digit other than 0 is complicated.
+Outside a character class, PCRE reads it and any following digits as a decimal
+number. If the number is less than 10, or if there have been at least that many
+previous capturing left parentheses in the expression, the entire sequence is
+taken as a back reference. A description of how this works is given
+later,
+following the discussion of
+parenthesized subpatterns.
+
+
+Inside a character class, or if the decimal number is greater than 9 and there
+have not been that many capturing subpatterns, PCRE re-reads up to three octal
+digits following the backslash, and uses them to generate a data character. Any
+subsequent digits stand for themselves. In non-UTF-8 mode, the value of a
+character specified in octal must be less than \400. In UTF-8 mode, values up
+to \777 are permitted. For example:
+
+ \040 is another way of writing a space
+ \40 is the same, provided there are fewer than 40 previous capturing subpatterns
+ \7 is always a back reference
+ \11 might be a back reference, or another way of writing a tab
+ \011 is always a tab
+ \0113 is a tab followed by the character "3"
+ \113 might be a back reference, otherwise the character with octal code 113
+ \377 might be a back reference, otherwise the byte consisting entirely of 1 bits
+ \81 is either a back reference, or a binary zero followed by the two characters "8" and "1"
+
+Note that octal values of 100 or greater must not be introduced by a leading
+zero, because no more than three octal digits are ever read.
+
+
+All the sequences that define a single character value can be used both inside
+and outside character classes. In addition, inside a character class, the
+sequence \b is interpreted as the backspace character (hex 08), and the
+sequences \R and \X are interpreted as the characters "R" and "X",
+respectively. Outside a character class, these sequences have different
+meanings
+(see below).
+
+
+Absolute and relative back references
+
+
+The sequence \g followed by an unsigned or a negative number, optionally
+enclosed in braces, is an absolute or relative back reference. A named back
+reference can be coded as \g{name}. Back references are discussed
+later,
+following the discussion of
+parenthesized subpatterns.
+
+
+Absolute and relative subroutine calls
+
+
+For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or
+a number enclosed either in angle brackets or single quotes, is an alternative
+syntax for referencing a subpattern as a "subroutine". Details are discussed
+later.
+Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not
+synonymous. The former is a back reference; the latter is a subroutine call.
+
+
+Generic character types
+
+
+Another use of backslash is for specifying generic character types. The
+following are always recognized:
+
+ \d any decimal digit
+ \D any character that is not a decimal digit
+ \h any horizontal whitespace character
+ \H any character that is not a horizontal whitespace character
+ \s any whitespace character
+ \S any character that is not a whitespace character
+ \v any vertical whitespace character
+ \V any character that is not a vertical whitespace character
+ \w any "word" character
+ \W any "non-word" character
+
+Each pair of escape sequences partitions the complete set of characters into
+two disjoint sets. Any given character matches one, and only one, of each pair.
+
+
+These character type sequences can appear both inside and outside character
+classes. They each match one character of the appropriate type. If the current
+matching point is at the end of the subject string, all of them fail, since
+there is no character to match.
+
+
+For compatibility with Perl, \s does not match the VT character (code 11).
+This makes it different from the POSIX "space" class. The \s characters
+are HT (9), LF (10), FF (12), CR (13), and space (32). If "use locale;" is
+included in a Perl script, \s may match the VT character. In PCRE, it never
+does.
+
+
+In UTF-8 mode, characters with values greater than 128 never match \d, \s, or
+\w, and always match \D, \S, and \W. This is true even when Unicode
+character property support is available. These sequences retain their original
+meanings from before UTF-8 support was available, mainly for efficiency
+reasons.
+
+
+The sequences \h, \H, \v, and \V are Perl 5.10 features. In contrast to the
+other sequences, these do match certain high-valued codepoints in UTF-8 mode.
+The horizontal space characters are:
+
+ U+0009 Horizontal tab
+ U+0020 Space
+ U+00A0 Non-break space
+ U+1680 Ogham space mark
+ U+180E Mongolian vowel separator
+ U+2000 En quad
+ U+2001 Em quad
+ U+2002 En space
+ U+2003 Em space
+ U+2004 Three-per-em space
+ U+2005 Four-per-em space
+ U+2006 Six-per-em space
+ U+2007 Figure space
+ U+2008 Punctuation space
+ U+2009 Thin space
+ U+200A Hair space
+ U+202F Narrow no-break space
+ U+205F Medium mathematical space
+ U+3000 Ideographic space
+
+The vertical space characters are:
+
+ U+000A Linefeed
+ U+000B Vertical tab
+ U+000C Formfeed
+ U+000D Carriage return
+ U+0085 Next line
+ U+2028 Line separator
+ U+2029 Paragraph separator
+
+
+
+A "word" character is an underscore or any character less than 256 that is a
+letter or digit. The definition of letters and digits is controlled by PCRE's
+low-valued character tables, and may vary if locale-specific matching is taking
+place (see
+"Locale support"
+in the
+pcreapi
+page). For example, in a French locale such as "fr_FR" in Unix-like systems,
+or "french" in Windows, some character codes greater than 128 are used for
+accented letters, and these are matched by \w. The use of locales with Unicode
+is discouraged.
+
+
+Newline sequences
+
+
+Outside a character class, by default, the escape sequence \R matches any
+Unicode newline sequence. This is a Perl 5.10 feature. In non-UTF-8 mode \R is
+equivalent to the following:
+
+ (?>\r\n|\n|\x0b|\f|\r|\x85)
+
+This is an example of an "atomic group", details of which are given
+below.
+This particular group matches either the two-character sequence CR followed by
+LF, or one of the single characters LF (linefeed, U+000A), VT (vertical tab,
+U+000B), FF (formfeed, U+000C), CR (carriage return, U+000D), or NEL (next
+line, U+0085). The two-character sequence is treated as a single unit that
+cannot be split.
+
+
+In UTF-8 mode, two additional characters whose codepoints are greater than 255
+are added: LS (line separator, U+2028) and PS (paragraph separator, U+2029).
+Unicode character property support is not needed for these characters to be
+recognized.
+
+
+It is possible to restrict \R to match only CR, LF, or CRLF (instead of the
+complete set of Unicode line endings) by setting the option PCRE_BSR_ANYCRLF
+either at compile time or when the pattern is matched. (BSR is an abbreviation
+for "backslash R".) This can be made the default when PCRE is built; if this is
+the case, the other behaviour can be requested via the PCRE_BSR_UNICODE option.
+It is also possible to specify these settings by starting a pattern string with
+one of the following sequences:
+
+ (*BSR_ANYCRLF) CR, LF, or CRLF only
+ (*BSR_UNICODE) any Unicode newline sequence
+
+These override the default and the options given to pcre_compile(), but
+they can be overridden by options given to pcre_exec(). Note that these
+special settings, which are not Perl-compatible, are recognized only at the
+very start of a pattern, and that they must be in upper case. If more than one
+of them is present, the last one is used. They can be combined with a change of
+newline convention; for example, a pattern can start with:
+
+ (*ANY)(*BSR_ANYCRLF)
+
+Inside a character class, \R matches the letter "R".
+
+
+Unicode character properties
+
+
+When PCRE is built with Unicode character property support, three additional
+escape sequences that match characters with specific properties are available.
+When not in UTF-8 mode, these sequences are of course limited to testing
+characters whose codepoints are less than 256, but they do work in this mode.
+The extra escape sequences are:
+
+ \p{xx} a character with the xx property
+ \P{xx} a character without the xx property
+ \X an extended Unicode sequence
+
+The property names represented by xx above are limited to the Unicode
+script names, the general category properties, and "Any", which matches any
+character (including newline). Other properties such as "InMusicalSymbols" are
+not currently supported by PCRE. Note that \P{Any} does not match any
+characters, so always causes a match failure.
+
+
+Sets of Unicode characters are defined as belonging to certain scripts. A
+character from one of these sets can be matched using a script name. For
+example:
+
+ \p{Greek}
+ \P{Han}
+
+Those that are not part of an identified script are lumped together as
+"Common". The current list of scripts is:
+
+
+Arabic,
+Armenian,
+Balinese,
+Bengali,
+Bopomofo,
+Braille,
+Buginese,
+Buhid,
+Canadian_Aboriginal,
+Cherokee,
+Common,
+Coptic,
+Cuneiform,
+Cypriot,
+Cyrillic,
+Deseret,
+Devanagari,
+Ethiopic,
+Georgian,
+Glagolitic,
+Gothic,
+Greek,
+Gujarati,
+Gurmukhi,
+Han,
+Hangul,
+Hanunoo,
+Hebrew,
+Hiragana,
+Inherited,
+Kannada,
+Katakana,
+Kharoshthi,
+Khmer,
+Lao,
+Latin,
+Limbu,
+Linear_B,
+Malayalam,
+Mongolian,
+Myanmar,
+New_Tai_Lue,
+Nko,
+Ogham,
+Old_Italic,
+Old_Persian,
+Oriya,
+Osmanya,
+Phags_Pa,
+Phoenician,
+Runic,
+Shavian,
+Sinhala,
+Syloti_Nagri,
+Syriac,
+Tagalog,
+Tagbanwa,
+Tai_Le,
+Tamil,
+Telugu,
+Thaana,
+Thai,
+Tibetan,
+Tifinagh,
+Ugaritic,
+Yi.
+
+
+Each character has exactly one general category property, specified by a
+two-letter abbreviation. For compatibility with Perl, negation can be specified
+by including a circumflex between the opening brace and the property name. For
+example, \p{^Lu} is the same as \P{Lu}.
+
+
+If only one letter is specified with \p or \P, it includes all the general
+category properties that start with that letter. In this case, in the absence
+of negation, the curly brackets in the escape sequence are optional; these two
+examples have the same effect:
+
+ \p{L}
+ \pL
+
+The following general category property codes are supported:
+
+ C Other
+ Cc Control
+ Cf Format
+ Cn Unassigned
+ Co Private use
+ Cs Surrogate
+
+ L Letter
+ Ll Lower case letter
+ Lm Modifier letter
+ Lo Other letter
+ Lt Title case letter
+ Lu Upper case letter
+
+ M Mark
+ Mc Spacing mark
+ Me Enclosing mark
+ Mn Non-spacing mark
+
+ N Number
+ Nd Decimal number
+ Nl Letter number
+ No Other number
+
+ P Punctuation
+ Pc Connector punctuation
+ Pd Dash punctuation
+ Pe Close punctuation
+ Pf Final punctuation
+ Pi Initial punctuation
+ Po Other punctuation
+ Ps Open punctuation
+
+ S Symbol
+ Sc Currency symbol
+ Sk Modifier symbol
+ Sm Mathematical symbol
+ So Other symbol
+
+ Z Separator
+ Zl Line separator
+ Zp Paragraph separator
+ Zs Space separator
+
+The special property L& is also supported: it matches a character that has
+the Lu, Ll, or Lt property, in other words, a letter that is not classified as
+a modifier or "other".
+
+
+The Cs (Surrogate) property applies only to characters in the range U+D800 to
+U+DFFF. Such characters are not valid in UTF-8 strings (see RFC 3629) and so
+cannot be tested by PCRE, unless UTF-8 validity checking has been turned off
+(see the discussion of PCRE_NO_UTF8_CHECK in the
+pcreapi
+page).
+
+
+The long synonyms for these properties that Perl supports (such as \p{Letter})
+are not supported by PCRE, nor is it permitted to prefix any of these
+properties with "Is".
+
+
+No character that is in the Unicode table has the Cn (unassigned) property.
+Instead, this property is assumed for any code point that is not in the
+Unicode table.
+
+
+Specifying caseless matching does not affect these escape sequences. For
+example, \p{Lu} always matches only upper case letters.
+
+
+The \X escape matches any number of Unicode characters that form an extended
+Unicode sequence. \X is equivalent to
+
+ (?>\PM\pM*)
+
+That is, it matches a character without the "mark" property, followed by zero
+or more characters with the "mark" property, and treats the sequence as an
+atomic group
+(see below).
+Characters with the "mark" property are typically accents that affect the
+preceding character. None of them have codepoints less than 256, so in
+non-UTF-8 mode \X matches any one character.
+
+
+Matching characters by Unicode property is not fast, because PCRE has to search
+a structure that contains data for over fifteen thousand characters. That is
+why the traditional escape sequences such as \d and \w do not use Unicode
+properties in PCRE.
+
+
+Resetting the match start
+
+
+The escape sequence \K, which is a Perl 5.10 feature, causes any previously
+matched characters not to be included in the final matched sequence. For
+example, the pattern:
+
+ foo\Kbar
+
+matches "foobar", but reports that it has matched "bar". This feature is
+similar to a lookbehind assertion
+(described below).
+However, in this case, the part of the subject before the real match does not
+have to be of fixed length, as lookbehind assertions do. The use of \K does
+not interfere with the setting of
+captured substrings.
+For example, when the pattern
+
+ (foo)\Kbar
+
+matches "foobar", the first substring is still set to "foo".
+
+
+Simple assertions
+
+
+The final use of backslash is for certain simple assertions. An assertion
+specifies a condition that has to be met at a particular point in a match,
+without consuming any characters from the subject string. The use of
+subpatterns for more complicated assertions is described
+below.
+The backslashed assertions are:
+
+ \b matches at a word boundary
+ \B matches when not at a word boundary
+ \A matches at the start of the subject
+ \Z matches at the end of the subject
+ also matches before a newline at the end of the subject
+ \z matches only at the end of the subject
+ \G matches at the first matching position in the subject
+
+These assertions may not appear in character classes (but note that \b has a
+different meaning, namely the backspace character, inside a character class).
+
+
+A word boundary is a position in the subject string where the current character
+and the previous character do not both match \w or \W (i.e. one matches
+\w and the other matches \W), or the start or end of the string if the
+first or last character matches \w, respectively.
+
+
+The \A, \Z, and \z assertions differ from the traditional circumflex and
+dollar (described in the next section) in that they only ever match at the very
+start and end of the subject string, whatever options are set. Thus, they are
+independent of multiline mode. These three assertions are not affected by the
+PCRE_NOTBOL or PCRE_NOTEOL options, which affect only the behaviour of the
+circumflex and dollar metacharacters. However, if the startoffset
+argument of pcre_exec() is non-zero, indicating that matching is to start
+at a point other than the beginning of the subject, \A can never match. The
+difference between \Z and \z is that \Z matches before a newline at the end
+of the string as well as at the very end, whereas \z matches only at the end.
+
+
+The \G assertion is true only when the current matching position is at the
+start point of the match, as specified by the startoffset argument of
+pcre_exec(). It differs from \A when the value of startoffset is
+non-zero. By calling pcre_exec() multiple times with appropriate
+arguments, you can mimic Perl's /g option, and it is in this kind of
+implementation where \G can be useful.
+
+
+Note, however, that PCRE's interpretation of \G, as the start of the current
+match, is subtly different from Perl's, which defines it as the end of the
+previous match. In Perl, these can be different when the previously matched
+string was empty. Because PCRE does just one match at a time, it cannot
+reproduce this behaviour.
+
+
+If all the alternatives of a pattern begin with \G, the expression is anchored
+to the starting match position, and the "anchored" flag is set in the compiled
+regular expression.
+
+
CIRCUMFLEX AND DOLLAR
+
+Outside a character class, in the default matching mode, the circumflex
+character is an assertion that is true only if the current matching point is
+at the start of the subject string. If the startoffset argument of
+pcre_exec() is non-zero, circumflex can never match if the PCRE_MULTILINE
+option is unset. Inside a character class, circumflex has an entirely different
+meaning
+(see below).
+
+
+Circumflex need not be the first character of the pattern if a number of
+alternatives are involved, but it should be the first thing in each alternative
+in which it appears if the pattern is ever to match that branch. If all
+possible alternatives start with a circumflex, that is, if the pattern is
+constrained to match only at the start of the subject, it is said to be an
+"anchored" pattern. (There are also other constructs that can cause a pattern
+to be anchored.)
+
+
+A dollar character is an assertion that is true only if the current matching
+point is at the end of the subject string, or immediately before a newline
+at the end of the string (by default). Dollar need not be the last character of
+the pattern if a number of alternatives are involved, but it should be the last
+item in any branch in which it appears. Dollar has no special meaning in a
+character class.
+
+
+The meaning of dollar can be changed so that it matches only at the very end of
+the string, by setting the PCRE_DOLLAR_ENDONLY option at compile time. This
+does not affect the \Z assertion.
+
+
+The meanings of the circumflex and dollar characters are changed if the
+PCRE_MULTILINE option is set. When this is the case, a circumflex matches
+immediately after internal newlines as well as at the start of the subject
+string. It does not match after a newline that ends the string. A dollar
+matches before any newlines in the string, as well as at the very end, when
+PCRE_MULTILINE is set. When newline is specified as the two-character
+sequence CRLF, isolated CR and LF characters do not indicate newlines.
+
+
+For example, the pattern /^abc$/ matches the subject string "def\nabc" (where
+\n represents a newline) in multiline mode, but not otherwise. Consequently,
+patterns that are anchored in single line mode because all branches start with
+^ are not anchored in multiline mode, and a match for circumflex is possible
+when the startoffset argument of pcre_exec() is non-zero. The
+PCRE_DOLLAR_ENDONLY option is ignored if PCRE_MULTILINE is set.
+
+
+Note that the sequences \A, \Z, and \z can be used to match the start and
+end of the subject in both modes, and if all branches of a pattern start with
+\A it is always anchored, whether or not PCRE_MULTILINE is set.
+
+
FULL STOP (PERIOD, DOT)
+
+Outside a character class, a dot in the pattern matches any one character in
+the subject string except (by default) a character that signifies the end of a
+line. In UTF-8 mode, the matched character may be more than one byte long.
+
+
+When a line ending is defined as a single character, dot never matches that
+character; when the two-character sequence CRLF is used, dot does not match CR
+if it is immediately followed by LF, but otherwise it matches all characters
+(including isolated CRs and LFs). When any Unicode line endings are being
+recognized, dot does not match CR or LF or any of the other line ending
+characters.
+
+
+The behaviour of dot with regard to newlines can be changed. If the PCRE_DOTALL
+option is set, a dot matches any one character, without exception. If the
+two-character sequence CRLF is present in the subject string, it takes two dots
+to match it.
+
+
+The handling of dot is entirely independent of the handling of circumflex and
+dollar, the only relationship being that they both involve newlines. Dot has no
+special meaning in a character class.
+
+
MATCHING A SINGLE BYTE
+
+Outside a character class, the escape sequence \C matches any one byte, both
+in and out of UTF-8 mode. Unlike a dot, it always matches any line-ending
+characters. The feature is provided in Perl in order to match individual bytes
+in UTF-8 mode. Because it breaks up UTF-8 characters into individual bytes,
+what remains in the string may be a malformed UTF-8 string. For this reason,
+the \C escape sequence is best avoided.
+
+
+PCRE does not allow \C to appear in lookbehind assertions
+(described below),
+because in UTF-8 mode this would make it impossible to calculate the length of
+the lookbehind.
+
+
SQUARE BRACKETS AND CHARACTER CLASSES
+
+An opening square bracket introduces a character class, terminated by a closing
+square bracket. A closing square bracket on its own is not special. If a
+closing square bracket is required as a member of the class, it should be the
+first data character in the class (after an initial circumflex, if present) or
+escaped with a backslash.
+
+
+A character class matches a single character in the subject. In UTF-8 mode, the
+character may occupy more than one byte. A matched character must be in the set
+of characters defined by the class, unless the first character in the class
+definition is a circumflex, in which case the subject character must not be in
+the set defined by the class. If a circumflex is actually required as a member
+of the class, ensure it is not the first character, or escape it with a
+backslash.
+
+
+For example, the character class [aeiou] matches any lower case vowel, while
+[^aeiou] matches any character that is not a lower case vowel. Note that a
+circumflex is just a convenient notation for specifying the characters that
+are in the class by enumerating those that are not. A class that starts with a
+circumflex is not an assertion: it still consumes a character from the subject
+string, and therefore it fails if the current pointer is at the end of the
+string.
+
+
+In UTF-8 mode, characters with values greater than 255 can be included in a
+class as a literal string of bytes, or by using the \x{ escaping mechanism.
+
+
+When caseless matching is set, any letters in a class represent both their
+upper case and lower case versions, so for example, a caseless [aeiou] matches
+"A" as well as "a", and a caseless [^aeiou] does not match "A", whereas a
+caseful version would. In UTF-8 mode, PCRE always understands the concept of
+case for characters whose values are less than 128, so caseless matching is
+always possible. For characters with higher values, the concept of case is
+supported if PCRE is compiled with Unicode property support, but not otherwise.
+If you want to use caseless matching for characters 128 and above, you must
+ensure that PCRE is compiled with Unicode property support as well as with
+UTF-8 support.
+
+
+Characters that might indicate line breaks are never treated in any special way
+when matching character classes, whatever line-ending sequence is in use, and
+whatever setting of the PCRE_DOTALL and PCRE_MULTILINE options is used. A class
+such as [^a] always matches one of these characters.
+
+
+The minus (hyphen) character can be used to specify a range of characters in a
+character class. For example, [d-m] matches any letter between d and m,
+inclusive. If a minus character is required in a class, it must be escaped with
+a backslash or appear in a position where it cannot be interpreted as
+indicating a range, typically as the first or last character in the class.
+
+
+It is not possible to have the literal character "]" as the end character of a
+range. A pattern such as [W-]46] is interpreted as a class of two characters
+("W" and "-") followed by a literal string "46]", so it would match "W46]" or
+"-46]". However, if the "]" is escaped with a backslash it is interpreted as
+the end of range, so [W-\]46] is interpreted as a class containing a range
+followed by two other characters. The octal or hexadecimal representation of
+"]" can also be used to end a range.
+
+
+Ranges operate in the collating sequence of character values. They can also be
+used for characters specified numerically, for example [\000-\037]. In UTF-8
+mode, ranges can include characters whose values are greater than 255, for
+example [\x{100}-\x{2ff}].
+
+
+If a range that includes letters is used when caseless matching is set, it
+matches the letters in either case. For example, [W-c] is equivalent to
+[][\\^_`wxyzabc], matched caselessly, and in non-UTF-8 mode, if character
+tables for a French locale are in use, [\xc8-\xcb] matches accented E
+characters in both cases. In UTF-8 mode, PCRE supports the concept of case for
+characters with values greater than 128 only when it is compiled with Unicode
+property support.
+
+
+The character types \d, \D, \p, \P, \s, \S, \w, and \W may also appear
+in a character class, and add the characters that they match to the class. For
+example, [\dABCDEF] matches any hexadecimal digit. A circumflex can
+conveniently be used with the upper case character types to specify a more
+restricted set of characters than the matching lower case type. For example,
+the class [^\W_] matches any letter or digit, but not underscore.
+
+
+The only metacharacters that are recognized in character classes are backslash,
+hyphen (only where it can be interpreted as specifying a range), circumflex
+(only at the start), opening square bracket (only when it can be interpreted as
+introducing a POSIX class name - see the next section), and the terminating
+closing square bracket. However, escaping other non-alphanumeric characters
+does no harm.
+
+
POSIX CHARACTER CLASSES
+
+Perl supports the POSIX notation for character classes. This uses names
+enclosed by [: and :] within the enclosing square brackets. PCRE also supports
+this notation. For example,
+
+ [01[:alpha:]%]
+
+matches "0", "1", any alphabetic character, or "%". The supported class names
+are
+
+ alnum letters and digits
+ alpha letters
+ ascii character codes 0 - 127
+ blank space or tab only
+ cntrl control characters
+ digit decimal digits (same as \d)
+ graph printing characters, excluding space
+ lower lower case letters
+ print printing characters, including space
+ punct printing characters, excluding letters and digits
+ space white space (not quite the same as \s)
+ upper upper case letters
+ word "word" characters (same as \w)
+ xdigit hexadecimal digits
+
+The "space" characters are HT (9), LF (10), VT (11), FF (12), CR (13), and
+space (32). Notice that this list includes the VT character (code 11). This
+makes "space" different to \s, which does not include VT (for Perl
+compatibility).
+
+
+The name "word" is a Perl extension, and "blank" is a GNU extension from Perl
+5.8. Another Perl extension is negation, which is indicated by a ^ character
+after the colon. For example,
+
+ [12[:^digit:]]
+
+matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the POSIX
+syntax [.ch.] and [=ch=] where "ch" is a "collating element", but these are not
+supported, and an error is given if they are encountered.
+
+
+In UTF-8 mode, characters with values greater than 128 do not match any of
+the POSIX character classes.
+
+
VERTICAL BAR
+
+Vertical bar characters are used to separate alternative patterns. For example,
+the pattern
+
+ gilbert|sullivan
+
+matches either "gilbert" or "sullivan". Any number of alternatives may appear,
+and an empty alternative is permitted (matching the empty string). The matching
+process tries each alternative in turn, from left to right, and the first one
+that succeeds is used. If the alternatives are within a subpattern
+(defined below),
+"succeeds" means matching the rest of the main pattern as well as the
+alternative in the subpattern.
+
+
INTERNAL OPTION SETTING
+
+The settings of the PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, and
+PCRE_EXTENDED options (which are Perl-compatible) can be changed from within
+the pattern by a sequence of Perl option letters enclosed between "(?" and ")".
+The option letters are
+
+ i for PCRE_CASELESS
+ m for PCRE_MULTILINE
+ s for PCRE_DOTALL
+ x for PCRE_EXTENDED
+
+For example, (?im) sets caseless, multiline matching. It is also possible to
+unset these options by preceding the letter with a hyphen, and a combined
+setting and unsetting such as (?im-sx), which sets PCRE_CASELESS and
+PCRE_MULTILINE while unsetting PCRE_DOTALL and PCRE_EXTENDED, is also
+permitted. If a letter appears both before and after the hyphen, the option is
+unset.
+
+
+The PCRE-specific options PCRE_DUPNAMES, PCRE_UNGREEDY, and PCRE_EXTRA can be
+changed in the same way as the Perl-compatible options by using the characters
+J, U and X respectively.
+
+
+When an option change occurs at top level (that is, not inside subpattern
+parentheses), the change applies to the remainder of the pattern that follows.
+If the change is placed right at the start of a pattern, PCRE extracts it into
+the global options (and it will therefore show up in data extracted by the
+pcre_fullinfo() function).
+
+
+An option change within a subpattern (see below for a description of
+subpatterns) affects only that part of the current pattern that follows it, so
+
+ (a(?i)b)c
+
+matches abc and aBc and no other strings (assuming PCRE_CASELESS is not used).
+By this means, options can be made to have different settings in different
+parts of the pattern. Any changes made in one alternative do carry on
+into subsequent branches within the same subpattern. For example,
+
+ (a(?i)b|c)
+
+matches "ab", "aB", "c", and "C", even though when matching "C" the first
+branch is abandoned before the option setting. This is because the effects of
+option settings happen at compile time. There would be some very weird
+behaviour otherwise.
+
+
+Note: There are other PCRE-specific options that can be set by the
+application when the compile or match functions are called. In some cases the
+pattern can contain special leading sequences to override what the application
+has set or what has been defaulted. Details are given in the section entitled
+"Newline sequences"
+above.
+
+
SUBPATTERNS
+
+Subpatterns are delimited by parentheses (round brackets), which can be nested.
+Turning part of a pattern into a subpattern does two things:
+
+
+1. It localizes a set of alternatives. For example, the pattern
+
+ cat(aract|erpillar|)
+
+matches one of the words "cat", "cataract", or "caterpillar". Without the
+parentheses, it would match "cataract", "erpillar" or an empty string.
+
+
+2. It sets up the subpattern as a capturing subpattern. This means that, when
+the whole pattern matches, that portion of the subject string that matched the
+subpattern is passed back to the caller via the ovector argument of
+pcre_exec(). Opening parentheses are counted from left to right (starting
+from 1) to obtain numbers for the capturing subpatterns.
+
+
+For example, if the string "the red king" is matched against the pattern
+
+ the ((red|white) (king|queen))
+
+the captured substrings are "red king", "red", and "king", and are numbered 1,
+2, and 3, respectively.
+
+
+The fact that plain parentheses fulfil two functions is not always helpful.
+There are often times when a grouping subpattern is required without a
+capturing requirement. If an opening parenthesis is followed by a question mark
+and a colon, the subpattern does not do any capturing, and is not counted when
+computing the number of any subsequent capturing subpatterns. For example, if
+the string "the white queen" is matched against the pattern
+
+ the ((?:red|white) (king|queen))
+
+the captured substrings are "white queen" and "queen", and are numbered 1 and
+2. The maximum number of capturing subpatterns is 65535.
+
+
+As a convenient shorthand, if any option settings are required at the start of
+a non-capturing subpattern, the option letters may appear between the "?" and
+the ":". Thus the two patterns
+
+ (?i:saturday|sunday)
+ (?:(?i)saturday|sunday)
+
+match exactly the same set of strings. Because alternative branches are tried
+from left to right, and options are not reset until the end of the subpattern
+is reached, an option setting in one branch does affect subsequent branches, so
+the above patterns match "SUNDAY" as well as "Saturday".
+
+
DUPLICATE SUBPATTERN NUMBERS
+
+Perl 5.10 introduced a feature whereby each alternative in a subpattern uses
+the same numbers for its capturing parentheses. Such a subpattern starts with
+(?| and is itself a non-capturing subpattern. For example, consider this
+pattern:
+
+ (?|(Sat)ur|(Sun))day
+
+Because the two alternatives are inside a (?| group, both sets of capturing
+parentheses are numbered one. Thus, when the pattern matches, you can look
+at captured substring number one, whichever alternative matched. This construct
+is useful when you want to capture part, but not all, of one of a number of
+alternatives. Inside a (?| group, parentheses are numbered as usual, but the
+number is reset at the start of each branch. The numbers of any capturing
+buffers that follow the subpattern start after the highest number used in any
+branch. The following example is taken from the Perl documentation.
+The numbers underneath show in which buffer the captured content will be
+stored.
+
+ # before ---------------branch-reset----------- after
+ / ( a ) (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x
+ # 1           2         2  3        2     3     4
+
+A backreference or a recursive call to a numbered subpattern always refers to
+the first one in the pattern with the given number.
+
+
+An alternative approach to using this "branch reset" feature is to use
+duplicate named subpatterns, as described in the next section.
+
+
NAMED SUBPATTERNS
+
+Identifying capturing parentheses by number is simple, but it can be very hard
+to keep track of the numbers in complicated regular expressions. Furthermore,
+if an expression is modified, the numbers may change. To help with this
+difficulty, PCRE supports the naming of subpatterns. This feature was not
+added to Perl until release 5.10. Python had the feature earlier, and PCRE
+introduced it at release 4.0, using the Python syntax. PCRE now supports both
+the Perl and the Python syntax.
+
+
+In PCRE, a subpattern can be named in one of three ways: (?<name>...) or
+(?'name'...) as in Perl, or (?P<name>...) as in Python. References to capturing
+parentheses from other parts of the pattern, such as
+backreferences,
+recursion,
+and
+conditions,
+can be made by name as well as by number.
+
+
+Names consist of up to 32 alphanumeric characters and underscores. Named
+capturing parentheses are still allocated numbers as well as names, exactly as
+if the names were not present. The PCRE API provides function calls for
+extracting the name-to-number translation table from a compiled pattern. There
+is also a convenience function for extracting a captured substring by name.
+
+
+By default, a name must be unique within a pattern, but it is possible to relax
+this constraint by setting the PCRE_DUPNAMES option at compile time. This can
+be useful for patterns where only one instance of the named parentheses can
+match. Suppose you want to match the name of a weekday, either as a 3-letter
+abbreviation or as the full name, and in both cases you want to extract the
+abbreviation. This pattern (ignoring the line breaks) does the job:
+
+ (?<DN>Mon|Fri|Sun)(?:day)?|
+ (?<DN>Tue)(?:sday)?|
+ (?<DN>Wed)(?:nesday)?|
+ (?<DN>Thu)(?:rsday)?|
+ (?<DN>Sat)(?:urday)?
+
+There are five capturing substrings, but only one is ever set after a match.
+(An alternative way of solving this problem is to use a "branch reset"
+subpattern, as described in the previous section.)
+
+
+The convenience function for extracting the data by name returns the substring
+for the first (and in this example, the only) subpattern of that name that
+matched. This saves searching to find which numbered subpattern it was. If you
+make a reference to a non-unique named subpattern from elsewhere in the
+pattern, the one that corresponds to the lowest number is used. For further
+details of the interfaces for handling named subpatterns, see the
+pcreapi
+documentation.
+
+
REPETITION
+
+Repetition is specified by quantifiers, which can follow any of the following
+items:
+
+ a literal data character
+ the dot metacharacter
+ the \C escape sequence
+ the \X escape sequence (in UTF-8 mode with Unicode properties)
+ the \R escape sequence
+ an escape such as \d that matches a single character
+ a character class
+ a back reference (see next section)
+ a parenthesized subpattern (unless it is an assertion)
+
+The general repetition quantifier specifies a minimum and maximum number of
+permitted matches, by giving the two numbers in curly brackets (braces),
+separated by a comma. The numbers must be less than 65536, and the first must
+be less than or equal to the second. For example:
+
+ z{2,4}
+
+matches "zz", "zzz", or "zzzz". A closing brace on its own is not a special
+character. If the second number is omitted, but the comma is present, there is
+no upper limit; if the second number and the comma are both omitted, the
+quantifier specifies an exact number of required matches. Thus
+
+ [aeiou]{3,}
+
+matches at least 3 successive vowels, but may match many more, while
+
+ \d{8}
+
+matches exactly 8 digits. An opening curly bracket that appears in a position
+where a quantifier is not allowed, or one that does not match the syntax of a
+quantifier, is taken as a literal character. For example, {,6} is not a
+quantifier, but a literal string of four characters.
+
+
+In UTF-8 mode, quantifiers apply to UTF-8 characters rather than to individual
+bytes. Thus, for example, \x{100}{2} matches two UTF-8 characters, each of
+which is represented by a two-byte sequence. Similarly, when Unicode property
+support is available, \X{3} matches three Unicode extended sequences, each of
+which may be several bytes long (and they may be of different lengths).
+
+
+The quantifier {0} is permitted, causing the expression to behave as if the
+previous item and the quantifier were not present. This may be useful for
+subpatterns that are referenced as
+subroutines
+from elsewhere in the pattern. Items other than subpatterns that have a {0}
+quantifier are omitted from the compiled pattern.
+
+
+For convenience, the three most common quantifiers have single-character
+abbreviations:
+
+ * is equivalent to {0,}
+ + is equivalent to {1,}
+ ? is equivalent to {0,1}
+
+It is possible to construct infinite loops by following a subpattern that can
+match no characters with a quantifier that has no upper limit, for example:
+
+ (a?)*
+
+Earlier versions of Perl and PCRE used to give an error at compile time for
+such patterns. However, because there are cases where this can be useful, such
+patterns are now accepted, but if any repetition of the subpattern does in fact
+match no characters, the loop is forcibly broken.
+
+
+By default, the quantifiers are "greedy", that is, they match as much as
+possible (up to the maximum number of permitted times), without causing the
+rest of the pattern to fail. The classic example of where this gives problems
+is in trying to match comments in C programs. These appear between /* and */
+and within the comment, individual * and / characters may appear. An attempt to
+match C comments by applying the pattern
+
+ /\*.*\*/
+
+to the string
+
+ /* first comment */ not comment /* second comment */
+
+fails, because it matches the entire string owing to the greediness of the .*
+item.
+
+
+However, if a quantifier is followed by a question mark, it ceases to be
+greedy, and instead matches the minimum number of times possible, so the
+pattern
+
+ /\*.*?\*/
+
+does the right thing with the C comments. The meaning of the various
+quantifiers is not otherwise changed, just the preferred number of matches.
+Do not confuse this use of question mark with its use as a quantifier in its
+own right. Because it has two uses, it can sometimes appear doubled, as in
+
+ \d??\d
+
+which matches one digit by preference, but can match two if that is the only
+way the rest of the pattern matches.
+
+
+If the PCRE_UNGREEDY option is set (an option that is not available in Perl),
+the quantifiers are not greedy by default, but individual ones can be made
+greedy by following them with a question mark. In other words, it inverts the
+default behaviour.
+
+
+When a parenthesized subpattern is quantified with a minimum repeat count that
+is greater than 1 or with a limited maximum, more memory is required for the
+compiled pattern, in proportion to the size of the minimum or maximum.
+
+
+If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equivalent
+to Perl's /s) is set, thus allowing the dot to match newlines, the pattern is
+implicitly anchored, because whatever follows will be tried against every
+character position in the subject string, so there is no point in retrying the
+overall match at any position after the first. PCRE normally treats such a
+pattern as though it were preceded by \A.
+
+
+In cases where it is known that the subject string contains no newlines, it is
+worth setting PCRE_DOTALL in order to obtain this optimization, or
+alternatively using ^ to indicate anchoring explicitly.
+
+
+However, there is one situation where the optimization cannot be used. When .*
+is inside capturing parentheses that are the subject of a backreference
+elsewhere in the pattern, a match at the start may fail where a later one
+succeeds. Consider, for example:
+
+ (.*)abc\1
+
+If the subject is "xyz123abc123" the match point is the fourth character. For
+this reason, such a pattern is not implicitly anchored.
+
+
+When a capturing subpattern is repeated, the value captured is the substring
+that matched the final iteration. For example, after
+
+ (tweedle[dume]{3}\s*)+
+
+has matched "tweedledum tweedledee" the value of the captured substring is
+"tweedledee". However, if there are nested capturing subpatterns, the
+corresponding captured values may have been set in previous iterations. For
+example, after
+
+ /(a|(b))+/
+
+matches "aba" the value of the second captured substring is "b".
+
+
ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS
+
+With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy")
+repetition, failure of what follows normally causes the repeated item to be
+re-evaluated to see if a different number of repeats allows the rest of the
+pattern to match. Sometimes it is useful to prevent this, either to change the
+nature of the match, or to cause it to fail earlier than it otherwise might, when
+the author of the pattern knows there is no point in carrying on.
+
+
+Consider, for example, the pattern \d+foo when applied to the subject line
+
+ 123456bar
+
+After matching all 6 digits and then failing to match "foo", the normal
+action of the matcher is to try again with only 5 digits matching the \d+
+item, and then with 4, and so on, before ultimately failing. "Atomic grouping"
+(a term taken from Jeffrey Friedl's book) provides the means for specifying
+that once a subpattern has matched, it is not to be re-evaluated in this way.
+
+
+If we use atomic grouping for the previous example, the matcher gives up
+immediately on failing to match "foo" the first time. The notation is a kind of
+special parenthesis, starting with (?> as in this example:
+
+ (?>\d+)foo
+
+This kind of parenthesis "locks up" the part of the pattern it contains once
+it has matched, and a failure further into the pattern is prevented from
+backtracking into it. Backtracking past it to previous items, however, works as
+normal.
+
+
+An alternative description is that a subpattern of this type matches the string
+of characters that an identical standalone pattern would match, if anchored at
+the current point in the subject string.
+
+
+Atomic grouping subpatterns are not capturing subpatterns. Simple cases such as
+the above example can be thought of as a maximizing repeat that must swallow
+everything it can. So, while both \d+ and \d+? are prepared to adjust the
+number of digits they match in order to make the rest of the pattern match,
+(?>\d+) can only match an entire sequence of digits.
+
+
+Atomic groups in general can of course contain arbitrarily complicated
+subpatterns, and can be nested. However, when the subpattern for an atomic
+group is just a single repeated item, as in the example above, a simpler
+notation, called a "possessive quantifier" can be used. This consists of an
+additional + character following a quantifier. Using this notation, the
+previous example can be rewritten as
+
+ \d++foo
+
+Note that a possessive quantifier can be used with an entire group, for
+example:
+
+ (abc|xyz){2,3}+
+
+Possessive quantifiers are always greedy; the setting of the PCRE_UNGREEDY
+option is ignored. They are a convenient notation for the simpler forms of
+atomic group. However, there is no difference in the meaning of a possessive
+quantifier and the equivalent atomic group, though there may be a performance
+difference; possessive quantifiers should be slightly faster.
+
+
+The possessive quantifier syntax is an extension to the Perl 5.8 syntax.
+Jeffrey Friedl originated the idea (and the name) in the first edition of his
+book. Mike McCloskey liked it, so implemented it when he built Sun's Java
+package, and PCRE copied it from there. It ultimately found its way into Perl
+at release 5.10.
+
+
+PCRE has an optimization that automatically "possessifies" certain simple
+pattern constructs. For example, the sequence A+B is treated as A++B because
+there is no point in backtracking into a sequence of A's when B must follow.
+
+
+When a pattern contains an unlimited repeat inside a subpattern that can itself
+be repeated an unlimited number of times, the use of an atomic group is the
+only way to avoid some failing matches taking a very long time indeed. The
+pattern
+
+ (\D+|<\d+>)*[!?]
+
+matches an unlimited number of substrings that either consist of non-digits, or
+digits enclosed in <>, followed by either ! or ?. When it matches, it runs
+quickly. However, if it is applied to
+
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+
+it takes a long time before reporting failure. This is because the string can
+be divided between the internal \D+ repeat and the external * repeat in a
+large number of ways, and all have to be tried. (The example uses [!?] rather
+than a single character at the end, because both PCRE and Perl have an
+optimization that allows for fast failure when a single character is used. They
+remember the last single character that is required for a match, and fail early
+if it is not present in the string.) If the pattern is changed so that it uses
+an atomic group, like this:
+
+ ((?>\D+)|<\d+>)*[!?]
+
+sequences of non-digits cannot be broken, and failure happens quickly.
+
+
BACK REFERENCES
+
+Outside a character class, a backslash followed by a digit greater than 0 (and
+possibly further digits) is a back reference to a capturing subpattern earlier
+(that is, to its left) in the pattern, provided there have been that many
+previous capturing left parentheses.
+
+
+However, if the decimal number following the backslash is less than 10, it is
+always taken as a back reference, and causes an error only if there are not
+that many capturing left parentheses in the entire pattern. In other words, the
+parentheses that are referenced need not be to the left of the reference for
+numbers less than 10. A "forward back reference" of this type can make sense
+when a repetition is involved and the subpattern to the right has participated
+in an earlier iteration.
+
+
+It is not possible to have a numerical "forward back reference" to a subpattern
+whose number is 10 or more using this syntax because a sequence such as \50 is
+interpreted as a character defined in octal. See the subsection entitled
+"Non-printing characters"
+above
+for further details of the handling of digits following a backslash. There is
+no such problem when named parentheses are used. A back reference to any
+subpattern is possible using named parentheses (see below).
+
+
+Another way of avoiding the ambiguity inherent in the use of digits following a
+backslash is to use the \g escape sequence, which is a feature introduced in
+Perl 5.10. This escape must be followed by an unsigned number or a negative
+number, optionally enclosed in braces. These examples are all identical:
+
+ (ring), \1
+ (ring), \g1
+ (ring), \g{1}
+
+An unsigned number specifies an absolute reference without the ambiguity that
+is present in the older syntax. It is also useful when literal digits follow
+the reference. A negative number is a relative reference. Consider this
+example:
+
+ (abc(def)ghi)\g{-1}
+
+The sequence \g{-1} is a reference to the most recently started capturing
+subpattern before \g, that is, it is equivalent to \2. Similarly, \g{-2}
+would be equivalent to \1. The use of relative references can be helpful in
+long patterns, and also in patterns that are created by joining together
+fragments that contain references within themselves.
+
+
+A back reference matches whatever actually matched the capturing subpattern in
+the current subject string, rather than anything matching the subpattern
+itself (see
+"Subpatterns as subroutines"
+below for a way of doing that). So the pattern
+
+ (sens|respons)e and \1ibility
+
+matches "sense and sensibility" and "response and responsibility", but not
+"sense and responsibility". If caseful matching is in force at the time of the
+back reference, the case of letters is relevant. For example,
+
+ ((?i)rah)\s+\1
+
+matches "rah rah" and "RAH RAH", but not "RAH rah", even though the original
+capturing subpattern is matched caselessly.
+
+
+There are several different ways of writing back references to named
+subpatterns. The .NET syntax \k{name} and the Perl syntax \k<name> or
+\k'name' are supported, as is the Python syntax (?P=name). Perl 5.10's unified
+back reference syntax, in which \g can be used for both numeric and named
+references, is also supported. We could rewrite the above example in any of
+the following ways:
+
+ (?<p1>(?i)rah)\s+\k<p1>
+ (?'p1'(?i)rah)\s+\k{p1}
+ (?P<p1>(?i)rah)\s+(?P=p1)
+ (?<p1>(?i)rah)\s+\g{p1}
+
+A subpattern that is referenced by name may appear in the pattern before or
+after the reference.
+
+
+There may be more than one back reference to the same subpattern. If a
+subpattern has not actually been used in a particular match, any back
+references to it always fail. For example, the pattern
+
+ (a|(bc))\2
+
+always fails if it starts to match "a" rather than "bc". Because there may be
+many capturing parentheses in a pattern, all digits following the backslash are
+taken as part of a potential back reference number. If the pattern continues
+with a digit character, some delimiter must be used to terminate the back
+reference. If the PCRE_EXTENDED option is set, this can be whitespace.
+Otherwise an empty comment (see
+"Comments"
+below) can be used.
+
+
+A back reference that occurs inside the parentheses to which it refers fails
+when the subpattern is first used, so, for example, (a\1) never matches.
+However, such references can be useful inside repeated subpatterns. For
+example, the pattern
+
+ (a|b\1)+
+
+matches any number of "a"s and also "aba", "ababbaa" etc. At each iteration of
+the subpattern, the back reference matches the character string corresponding
+to the previous iteration. In order for this to work, the pattern must be such
+that the first iteration does not need to match the back reference. This can be
+done using alternation, as in the example above, or by a quantifier with a
+minimum of zero.
+
+
ASSERTIONS
+
+An assertion is a test on the characters following or preceding the current
+matching point that does not actually consume any characters. The simple
+assertions coded as \b, \B, \A, \G, \Z, \z, ^ and $ are described
+above.
+
+
+More complicated assertions are coded as subpatterns. There are two kinds:
+those that look ahead of the current position in the subject string, and those
+that look behind it. An assertion subpattern is matched in the normal way,
+except that it does not cause the current matching position to be changed.
+
+
+Assertion subpatterns are not capturing subpatterns, and may not be repeated,
+because it makes no sense to assert the same thing several times. If any kind
+of assertion contains capturing subpatterns within it, these are counted for
+the purposes of numbering the capturing subpatterns in the whole pattern.
+However, substring capturing is carried out only for positive assertions,
+because it does not make sense for negative assertions.
+
+
+Lookahead assertions
+
+
+Lookahead assertions start with (?= for positive assertions and (?! for
+negative assertions. For example,
+
+ \w+(?=;)
+
+matches a word followed by a semicolon, but does not include the semicolon in
+the match, and
+
+ foo(?!bar)
+
+matches any occurrence of "foo" that is not followed by "bar". Note that the
+apparently similar pattern
+
+ (?!foo)bar
+
+does not find an occurrence of "bar" that is preceded by something other than
+"foo"; it finds any occurrence of "bar" whatsoever, because the assertion
+(?!foo) is always true when the next three characters are "bar". A
+lookbehind assertion is needed to achieve the other effect.
+
+
+If you want to force a matching failure at some point in a pattern, the most
+convenient way to do it is with (?!) because an empty string always matches, so
+an assertion that requires there not to be an empty string must always fail.
+
+
+Lookbehind assertions
+
+
+Lookbehind assertions start with (?<= for positive assertions and (?<! for
+negative assertions. For example,
+
+ (?<!foo)bar
+
+does find an occurrence of "bar" that is not preceded by "foo". The contents of
+a lookbehind assertion are restricted such that all the strings it matches must
+have a fixed length. However, if there are several top-level alternatives, they
+do not all have to have the same fixed length. Thus
+
+ (?<=bullock|donkey)
+
+is permitted, but
+
+ (?<!dogs?|cats?)
+
+causes an error at compile time. Branches that match different length strings
+are permitted only at the top level of a lookbehind assertion. This is an
+extension compared with Perl (at least for 5.8), which requires all branches to
+match the same length of string. An assertion such as
+
+ (?<=ab(c|de))
+
+is not permitted, because its single top-level branch can match two different
+lengths, but it is acceptable if rewritten to use two top-level branches:
+
+ (?<=abc|abde)
+
+In some cases, the Perl 5.10 escape sequence \K
+(see above)
+can be used instead of a lookbehind assertion; this is not restricted to a
+fixed length.
+
+
+The implementation of lookbehind assertions is, for each alternative, to
+temporarily move the current position back by the fixed length and then try to
+match. If there are insufficient characters before the current position, the
+assertion fails.
+
+
+PCRE does not allow the \C escape (which matches a single byte in UTF-8 mode)
+to appear in lookbehind assertions, because it makes it impossible to calculate
+the length of the lookbehind. The \X and \R escapes, which can match
+different numbers of bytes, are also not permitted.
+
+
+Possessive quantifiers can be used in conjunction with lookbehind assertions to
+specify efficient matching at the end of the subject string. Consider a simple
+pattern such as
+
+ abcd$
+
+when applied to a long string that does not match. Because matching proceeds
+from left to right, PCRE will look for each "a" in the subject and then see if
+what follows matches the rest of the pattern. If the pattern is specified as
+
+ ^.*abcd$
+
+the initial .* matches the entire string at first, but when this fails (because
+there is no following "a"), it backtracks to match all but the last character,
+then all but the last two characters, and so on. Once again the search for "a"
+covers the entire string, from right to left, so we are no better off. However,
+if the pattern is written as
+
+ ^.*+(?<=abcd)
+
+there can be no backtracking for the .*+ item; it can match only the entire
+string. The subsequent lookbehind assertion does a single test on the last four
+characters. If it fails, the match fails immediately. For long strings, this
+approach makes a significant difference to the processing time.
+
+
+Using multiple assertions
+
+
+Several assertions (of any sort) may occur in succession. For example,
+
+ (?<=\d{3})(?<!999)foo
+
+matches "foo" preceded by three digits that are not "999". Notice that each of
+the assertions is applied independently at the same point in the subject
+string. First there is a check that the previous three characters are all
+digits, and then there is a check that the same three characters are not "999".
+This pattern does not match "foo" preceded by six characters, the first three
+of which are digits and the last three of which are not "999". For example, it
+doesn't match "123abcfoo". A pattern to do that is
+
+ (?<=\d{3}...)(?<!999)foo
+
+This time the first assertion looks at the preceding six characters, checking
+that the first three are digits, and then the second assertion checks that the
+preceding three characters are not "999".
+
+
+Assertions can be nested in any combination. For example,
+
+ (?<=(?<!foo)bar)baz
+
+matches an occurrence of "baz" that is preceded by "bar" which in turn is not
+preceded by "foo", while
+
+ (?<=\d{3}(?!999)...)foo
+
+is another pattern that matches "foo" preceded by three digits and any three
+characters that are not "999".
+
+
CONDITIONAL SUBPATTERNS
+
+It is possible to cause the matching process to obey a subpattern
+conditionally or to choose between two alternative subpatterns, depending on
+the result of an assertion, or whether a previous capturing subpattern matched
+or not. The two possible forms of conditional subpattern are
+
+ (?(condition)yes-pattern)
+ (?(condition)yes-pattern|no-pattern)
+
+If the condition is satisfied, the yes-pattern is used; otherwise the
+no-pattern (if present) is used. If there are more than two alternatives in the
+subpattern, a compile-time error occurs.
+
+
+There are four kinds of condition: references to subpatterns, references to
+recursion, a pseudo-condition called DEFINE, and assertions.
+
+
+Checking for a used subpattern by number
+
+
+If the text between the parentheses consists of a sequence of digits, the
+condition is true if the capturing subpattern of that number has previously
+matched. An alternative notation is to precede the digits with a plus or minus
+sign. In this case, the subpattern number is relative rather than absolute.
+The most recently opened parentheses can be referenced by (?(-1), the next most
+recent by (?(-2), and so on. In looping constructs it can also make sense to
+refer to subsequent groups with constructs such as (?(+2).
+
+
+Consider the following pattern, which contains non-significant white space to
+make it more readable (assume the PCRE_EXTENDED option) and to divide it into
+three parts for ease of discussion:
+
+ ( \( )? [^()]+ (?(1) \) )
+
+The first part matches an optional opening parenthesis, and if that
+character is present, sets it as the first captured substring. The second part
+matches one or more characters that are not parentheses. The third part is a
+conditional subpattern that tests whether the first set of parentheses matched
+or not. If they did, that is, if the subject started with an opening parenthesis,
+the condition is true, and so the yes-pattern is executed and a closing
+parenthesis is required. Otherwise, since no-pattern is not present, the
+subpattern matches nothing. In other words, this pattern matches a sequence of
+non-parentheses, optionally enclosed in parentheses.
+
+
+If you were embedding this pattern in a larger one, you could use a relative
+reference:
+
+ ...other stuff... ( \( )? [^()]+ (?(-1) \) ) ...
+
+This makes the fragment independent of the parentheses in the larger pattern.
+
+
+Checking for a used subpattern by name
+
+
+Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a used
+subpattern by name. For compatibility with earlier versions of PCRE, which had
+this facility before Perl, the syntax (?(name)...) is also recognized. However,
+there is a possible ambiguity with this syntax, because subpattern names may
+consist entirely of digits. PCRE looks first for a named subpattern; if it
+cannot find one and the name consists entirely of digits, PCRE looks for a
+subpattern of that number, which must be greater than zero. Using subpattern
+names that consist entirely of digits is not recommended.
+
+
+Rewriting the above example to use a named subpattern gives this:
+
+ (?<OPEN> \( )? [^()]+ (?(<OPEN>) \) )
+
+
+
+
+Checking for pattern recursion
+
+
+If the condition is the string (R), and there is no subpattern with the name R,
+the condition is true if a recursive call to the whole pattern or any
+subpattern has been made. If digits or a name preceded by ampersand follow the
+letter R, for example:
+
+ (?(R3)...) or (?(R&name)...)
+
+the condition is true if the most recent recursion is into the subpattern whose
+number or name is given. This condition does not check the entire recursion
+stack.
+
+
+At "top level", all these recursion test conditions are false. Recursive
+patterns are described below.
+
+
+Defining subpatterns for use by reference only
+
+
+If the condition is the string (DEFINE), and there is no subpattern with the
+name DEFINE, the condition is always false. In this case, there may be only one
+alternative in the subpattern. It is always skipped if control reaches this
+point in the pattern; the idea of DEFINE is that it can be used to define
+"subroutines" that can be referenced from elsewhere. (The use of "subroutines"
+is described below.) For example, a pattern to match an IPv4 address could be
+written like this (ignore whitespace and line breaks):
+
+ (?(DEFINE) (?<byte> 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) )
+ \b (?&byte) (\.(?&byte)){3} \b
+
+The first part of the pattern is a DEFINE group inside which another group
+named "byte" is defined. This matches an individual component of an IPv4
+address (a number less than 256). When matching takes place, this part of the
+pattern is skipped because DEFINE acts like a false condition.
+
+
+The rest of the pattern uses references to the named group to match the four
+dot-separated components of an IPv4 address, insisting on a word boundary at
+each end.
+
+
+Assertion conditions
+
+
+If the condition is not in any of the above formats, it must be an assertion.
+This may be a positive or negative lookahead or lookbehind assertion. Consider
+this pattern, again containing non-significant white space, and with the two
+alternatives on the second line:
+
+ (?(?=[^a-z]*[a-z])
+ \d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} )
+
+The condition is a positive lookahead assertion that matches an optional
+sequence of non-letters followed by a letter. In other words, it tests for the
+presence of at least one letter in the subject. If a letter is found, the
+subject is matched against the first alternative; otherwise it is matched
+against the second. This pattern matches strings in one of the two forms
+dd-aaa-dd or dd-dd-dd, where aaa are letters and dd are digits.
+
+
COMMENTS
+
+The sequence (?# marks the start of a comment that continues up to the next
+closing parenthesis. Nested parentheses are not permitted. The characters
+that make up a comment play no part in the pattern matching at all.
+
+
+If the PCRE_EXTENDED option is set, an unescaped # character outside a
+character class introduces a comment that continues to immediately after the
+next newline in the pattern.
+
+
RECURSIVE PATTERNS
+
+Consider the problem of matching a string in parentheses, allowing for
+unlimited nested parentheses. Without the use of recursion, the best that can
+be done is to use a pattern that matches up to some fixed depth of nesting. It
+is not possible to handle an arbitrary nesting depth.
+
+
+For some time, Perl has provided a facility that allows regular expressions to
+recurse (amongst other things). It does this by interpolating Perl code in the
+expression at run time, and the code can refer to the expression itself. A Perl
+pattern using code interpolation to solve the parentheses problem can be
+created like this:
+
+ $re = qr{\( (?: (?>[^()]+) | (?p{$re}) )* \)}x;
+
+The (?p{...}) item interpolates Perl code at run time, and in this case refers
+recursively to the pattern in which it appears.
+
+
+Obviously, PCRE cannot support the interpolation of Perl code. Instead, it
+supports special syntax for recursion of the entire pattern, and also for
+individual subpattern recursion. After its introduction in PCRE and Python,
+this kind of recursion was introduced into Perl at release 5.10.
+
+
+A special item that consists of (? followed by a number greater than zero and a
+closing parenthesis is a recursive call of the subpattern of the given number,
+provided that it occurs inside that subpattern. (If not, it is a "subroutine"
+call, which is described in the next section.) The special item (?R) or (?0) is
+a recursive call of the entire regular expression.
+
+
+In PCRE (like Python, but unlike Perl), a recursive subpattern call is always
+treated as an atomic group. That is, once it has matched some of the subject
+string, it is never re-entered, even if it contains untried alternatives and
+there is a subsequent matching failure.
+
+
+This PCRE pattern solves the nested parentheses problem (assume the
+PCRE_EXTENDED option is set so that white space is ignored):
+
+ \( ( (?>[^()]+) | (?R) )* \)
+
+First it matches an opening parenthesis. Then it matches any number of
+substrings which can either be a sequence of non-parentheses, or a recursive
+match of the pattern itself (that is, a correctly parenthesized substring).
+Finally there is a closing parenthesis.
+
+
+If this were part of a larger pattern, you would not want to recurse the entire
+pattern, so instead you could use this:
+
+ ( \( ( (?>[^()]+) | (?1) )* \) )
+
+We have put the pattern into parentheses, and caused the recursion to refer to
+them instead of the whole pattern.
+
+
+In a larger pattern, keeping track of parenthesis numbers can be tricky. This
+is made easier by the use of relative references. (A Perl 5.10 feature.)
+Instead of (?1) in the pattern above you can write (?-2) to refer to the second
+most recently opened parentheses preceding the recursion. In other words, a
+negative number counts capturing parentheses leftwards from the point at which
+it is encountered.
+
+
+It is also possible to refer to subsequently opened parentheses, by writing
+references such as (?+2). However, these cannot be recursive because the
+reference is not inside the parentheses that are referenced. They are always
+"subroutine" calls, as described in the next section.
+
+
+An alternative approach is to use named parentheses instead. The Perl syntax
+for this is (?&name); PCRE's earlier syntax (?P>name) is also supported. We
+could rewrite the above example as follows:
+
+ (?<pn> \( ( (?>[^()]+) | (?&pn) )* \) )
+
+If there is more than one subpattern with the same name, the earliest one is
+used.
+
+
+This particular example pattern that we have been looking at contains nested
+unlimited repeats, and so the use of atomic grouping for matching strings of
+non-parentheses is important when applying the pattern to strings that do not
+match. For example, when this pattern is applied to
+
+ (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
+
+it yields "no match" quickly. However, if atomic grouping is not used,
+the match runs for a very long time indeed because there are so many different
+ways the + and * repeats can carve up the subject, and all have to be tested
+before failure can be reported.
+
+
+At the end of a match, the values set for any capturing subpatterns are those
+from the outermost level of the recursion at which the subpattern value is set.
+If you want to obtain intermediate values, a callout function can be used (see
+below and the
+pcrecallout
+documentation). If the pattern above is matched against
+
+ (ab(cd)ef)
+
+the value for the capturing parentheses is "ef", which is the last value taken
+on at the top level. If additional parentheses are added, giving
+
+ \( ( ( (?>[^()]+) | (?R) )* ) \)
+ ^ ^
+ ^ ^
+
+the string they capture is "ab(cd)ef", the contents of the top level
+parentheses. If there are more than 15 capturing parentheses in a pattern, PCRE
+has to obtain extra memory to store data during a recursion, which it does by
+using pcre_malloc, freeing it via pcre_free afterwards. If no
+memory can be obtained, the match fails with the PCRE_ERROR_NOMEMORY error.
+
+
+Do not confuse the (?R) item with the condition (R), which tests for recursion.
+Consider this pattern, which matches text in angle brackets, allowing for
+arbitrary nesting. Only digits are allowed in nested brackets (that is, when
+recursing), whereas any characters are permitted at the outer level.
+
+ < (?: (?(R) \d++ | [^<>]*+) | (?R)) * >
+
+In this pattern, (?(R) is the start of a conditional subpattern, with two
+different alternatives for the recursive and non-recursive cases. The (?R) item
+is the actual recursive call.
+
+
SUBPATTERNS AS SUBROUTINES
+
+If the syntax for a recursive subpattern reference (either by number or by
+name) is used outside the parentheses to which it refers, it operates like a
+subroutine in a programming language. The "called" subpattern may be defined
+before or after the reference. A numbered reference can be absolute or
+relative, as in these examples:
+
+ (...(absolute)...)...(?2)...
+ (...(relative)...)...(?-1)...
+ (...(?+1)...(relative)...
+
+An earlier example pointed out that the pattern
+
+ (sens|respons)e and \1ibility
+
+matches "sense and sensibility" and "response and responsibility", but not
+"sense and responsibility". If instead the pattern
+
+ (sens|respons)e and (?1)ibility
+
+is used, it does match "sense and responsibility" as well as the other two
+strings. Another example is given in the discussion of DEFINE above.
+
+
+Like recursive subpatterns, a "subroutine" call is always treated as an atomic
+group. That is, once it has matched some of the subject string, it is never
+re-entered, even if it contains untried alternatives and there is a subsequent
+matching failure.
+
+
+When a subpattern is used as a subroutine, processing options such as
+case-independence are fixed when the subpattern is defined. They cannot be
+changed for different calls. For example, consider this pattern:
+
+ (abc)(?i:(?-1))
+
+It matches "abcabc". It does not match "abcABC" because the change of
+processing option does not affect the called subpattern.
+
+
ONIGURUMA SUBROUTINE SYNTAX
+
+For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or
+a number enclosed either in angle brackets or single quotes, is an alternative
+syntax for referencing a subpattern as a subroutine, possibly recursively. Here
+are two of the examples used above, rewritten using this syntax:
+
+ (?<pn> \( ( (?>[^()]+) | \g<pn> )* \) )
+ (sens|respons)e and \g'1'ibility
+
+PCRE supports an extension to Oniguruma: if a number is preceded by a
+plus or a minus sign it is taken as a relative reference. For example:
+
+ (abc)(?i:\g<-1>)
+
+Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not
+synonymous. The former is a back reference; the latter is a subroutine call.
+
+
CALLOUTS
+
+Perl has a feature whereby using the sequence (?{...}) causes arbitrary Perl
+code to be obeyed in the middle of matching a regular expression. This makes it
+possible, amongst other things, to extract different substrings that match the
+same pair of parentheses when there is a repetition.
+
+
+PCRE provides a similar feature, but of course it cannot obey arbitrary Perl
+code. The feature is called "callout". The caller of PCRE provides an external
+function by putting its entry point in the global variable pcre_callout.
+By default, this variable contains NULL, which disables all calling out.
+
+
+Within a regular expression, (?C) indicates the points at which the external
+function is to be called. If you want to identify different callout points, you
+can put a number less than 256 after the letter C. The default value is zero.
+For example, this pattern has two callout points:
+
+ (?C1)abc(?C2)def
+
+If the PCRE_AUTO_CALLOUT flag is passed to pcre_compile(), callouts are
+automatically installed before each item in the pattern. They are all numbered
+255.
+
+
+During matching, when PCRE reaches a callout point (and pcre_callout is
+set), the external function is called. It is provided with the number of the
+callout, the position in the pattern, and, optionally, one item of data
+originally supplied by the caller of pcre_exec(). The callout function
+may cause matching to proceed, to backtrack, or to fail altogether. A complete
+description of the interface to the callout function is given in the
+pcrecallout
+documentation.
+
+
BACKTRACKING CONTROL
+
+Perl 5.10 introduced a number of "Special Backtracking Control Verbs", which
+are described in the Perl documentation as "experimental and subject to change
+or removal in a future version of Perl". It goes on to say: "Their usage in
+production code should be noted to avoid problems during upgrades." The same
+remarks apply to the PCRE features described in this section.
+
+
+Since these verbs are specifically related to backtracking, most of them can be
+used only when the pattern is to be matched using pcre_exec(), which uses
+a backtracking algorithm. With the exception of (*FAIL), which behaves like a
+failing negative assertion, they cause an error if encountered by
+pcre_dfa_exec().
+
+
+The new verbs make use of what was previously invalid syntax: an opening
+parenthesis followed by an asterisk. In Perl, they are generally of the form
+(*VERB:ARG) but PCRE does not support the use of arguments, so its general
+form is just (*VERB). Any number of these verbs may occur in a pattern. There
+are two kinds:
+
+
+Verbs that act immediately
+
+
+The following verbs act as soon as they are encountered:
+
+ (*ACCEPT)
+
+This verb causes the match to end successfully, skipping the remainder of the
+pattern. When inside a recursion, only the innermost pattern is ended
+immediately. PCRE differs from Perl in what happens if the (*ACCEPT) is inside
+capturing parentheses. In Perl, the data so far is captured: in PCRE no data is
+captured. For example:
+
+ A(A|B(*ACCEPT)|C)D
+
+This matches "AB", "AAD", or "ACD", but when it matches "AB", no data is
+captured.
+
+ (*FAIL) or (*F)
+
+This verb causes the match to fail, forcing backtracking to occur. It is
+equivalent to (?!) but easier to read. The Perl documentation notes that it is
+probably useful only when combined with (?{}) or (??{}). Those are, of course,
+Perl features that are not present in PCRE. The nearest equivalent is the
+callout feature, as for example in this pattern:
+
+ a+(?C)(*FAIL)
+
+A match with the string "aaaa" always fails, but the callout is taken before
+each backtrack happens (in this example, 10 times).
+
+
+Verbs that act after backtracking
+
+
+The following verbs do nothing when they are encountered. Matching continues
+with what follows, but if there is no subsequent match, a failure is forced.
+The verbs differ in exactly what kind of failure occurs.
+
+ (*COMMIT)
+
+This verb causes the whole match to fail outright if the rest of the pattern
+does not match. Even if the pattern is unanchored, no further attempts to find
+a match by advancing the start point take place. Once (*COMMIT) has been
+passed, pcre_exec() is committed to finding a match at the current
+starting point, or not at all. For example:
+
+ a+(*COMMIT)b
+
+This matches "xxaab" but not "aacaab". It can be thought of as a kind of
+dynamic anchor, or "I've started, so I must finish."
+
+ (*PRUNE)
+
+This verb causes the match to fail at the current position if the rest of the
+pattern does not match. If the pattern is unanchored, the normal "bumpalong"
+advance to the next starting character then happens. Backtracking can occur as
+usual to the left of (*PRUNE), or when matching to the right of (*PRUNE), but
+if there is no match to the right, backtracking cannot cross (*PRUNE).
+In simple cases, the use of (*PRUNE) is just an alternative to an atomic
+group or possessive quantifier, but there are some uses of (*PRUNE) that cannot
+be expressed in any other way.
+
+ (*SKIP)
+
+This verb is like (*PRUNE), except that if the pattern is unanchored, the
+"bumpalong" advance is not to the next character, but to the position in the
+subject where (*SKIP) was encountered. (*SKIP) signifies that whatever text
+was matched leading up to it cannot be part of a successful match. Consider:
+
+ a+(*SKIP)b
+
+If the subject is "aaaac...", after the first match attempt fails (starting at
+the first character in the string), the starting point skips on to start the
+next attempt at "c". Note that a possessive quantifier does not have the same
+effect in this example; although it would suppress backtracking during the
+first match attempt, the second attempt would start at the second character
+instead of skipping on to "c".
+
+ (*THEN)
+
+This verb causes a skip to the next alternation if the rest of the pattern does
+not match. That is, it cancels pending backtracking, but only within the
+current alternation. Its name comes from the observation that it can be used
+for a pattern-based if-then-else block:
+
+ ( COND1 (*THEN) FOO | COND2 (*THEN) BAR | COND3 (*THEN) BAZ ) ...
+
+If the COND1 pattern matches, FOO is tried (and possibly further items after
+the end of the group if FOO succeeds); on failure the matcher skips to the
+second alternative and tries COND2, without backtracking into COND1. If (*THEN)
+is used outside of any alternation, it acts exactly like (*PRUNE).
+
+
SEE ALSO
+
+pcreapi(3), pcrecallout(3), pcrematching(3), pcre(3).
+
+
AUTHOR
+
+Philip Hazel
+
+University Computing Service
+
+Cambridge CB2 3QH, England.
+
+
+
REVISION
+
+Last updated: 19 April 2008
+
+Copyright © 1997-2008 University of Cambridge.
+
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcreperform.html b/src/doc/html/pcreperform.html
new file mode 100644
index 0000000..41d893d
--- /dev/null
+++ b/src/doc/html/pcreperform.html
@@ -0,0 +1,173 @@
+
+
+pcreperform specification
+
+
+pcreperform man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+PCRE PERFORMANCE
+
+
+Two aspects of performance are discussed below: memory usage and processing
+time. The way you express your pattern as a regular expression can affect both
+of them.
+
+
+MEMORY USAGE
+
+
+Patterns are compiled by PCRE into a reasonably efficient byte code, so that
+most simple patterns do not use much memory. However, there is one case where
+memory usage can be unexpectedly large. When a parenthesized subpattern has a
+quantifier with a minimum greater than 1 and/or a limited maximum, the whole
+subpattern is repeated in the compiled code. For example, the pattern
+
+ (abc|def){2,4}
+
+is compiled as if it were
+
+ (abc|def)(abc|def)((abc|def)(abc|def)?)?
+
+(Technical aside: It is done this way so that backtrack points within each of
+the repetitions can be independently maintained.)
+
+
+For regular expressions whose quantifiers use only small numbers, this is not
+usually a problem. However, if the numbers are large, and particularly if such
+repetitions are nested, the memory usage can become an embarrassment. For
+example, the very simple pattern
+
+ ((ab){1,1000}c){1,3}
+
+uses 51K bytes when compiled. When PCRE is compiled with its default internal
+pointer size of two bytes, the size limit on a compiled pattern is 64K, and
+this is reached with the above pattern if the outer repetition is increased
+from 3 to 4. PCRE can be compiled to use larger internal pointers and thus
+handle larger compiled patterns, but it is better to try to rewrite your
+pattern to use less memory if you can.
+
+
+One way of reducing the memory usage for such patterns is to make use of PCRE's
+"subroutine"
+facility. Re-writing the above pattern as
+
+ ((ab)(?2){0,999}c)(?1){0,2}
+
+reduces the memory requirements to 18K, and indeed it remains under 20K even
+with the outer repetition increased to 100. However, this pattern is not
+exactly equivalent, because the "subroutine" calls are treated as
+atomic groups
+into which there can be no backtracking if there is a subsequent matching
+failure. Therefore, PCRE cannot do this kind of rewriting automatically.
+Furthermore, there is a noticeable loss of speed when executing the modified
+pattern. Nevertheless, if the atomic grouping is not a problem and the loss of
+speed is acceptable, this kind of rewriting will allow you to process patterns
+that PCRE cannot otherwise handle.
+
+
+PROCESSING TIME
+
+
+Certain items in regular expression patterns are processed more efficiently
+than others. It is more efficient to use a character class like [aeiou] than a
+set of single-character alternatives such as (a|e|i|o|u). In general, the
+simplest construction that provides the required behaviour is usually the most
+efficient. Jeffrey Friedl's book contains a lot of useful general discussion
+about optimizing regular expressions for efficient performance. This document
+contains a few observations about PCRE.
+
+
+Using Unicode character properties (the \p, \P, and \X escapes) is slow,
+because PCRE has to scan a structure that contains data for over fifteen
+thousand characters whenever it needs a character's property. If you can find
+an alternative pattern that does not use character properties, it will probably
+be faster.
+
+
+When a pattern begins with .* not in parentheses, or in parentheses that are
+not the subject of a backreference, and the PCRE_DOTALL option is set, the
+pattern is implicitly anchored by PCRE, since it can match only at the start of
+a subject string. However, if PCRE_DOTALL is not set, PCRE cannot make this
+optimization, because the . metacharacter does not then match a newline, and if
+the subject string contains newlines, the pattern may match from the character
+immediately following one of them instead of from the very start. For example,
+the pattern
+
+ .*second
+
+matches the subject "first\nand second" (where \n stands for a newline
+character), with the match starting at the seventh character. In order to do
+this, PCRE has to retry the match starting after every newline in the subject.
+
+
+If you are using such a pattern with subject strings that do not contain
+newlines, the best performance is obtained by setting PCRE_DOTALL, or starting
+the pattern with ^.* or ^.*? to indicate explicit anchoring. That saves PCRE
+from having to scan along the subject looking for a newline to restart at.
+
+
+Beware of patterns that contain nested indefinite repeats. These can take a
+long time to run when applied to a string that does not match. Consider the
+pattern fragment
+
+ ^(a+)*
+
+This can match "aaaa" in 16 different ways, and this number increases very
+rapidly as the string gets longer. (The * repeat can match 0, 1, 2, 3, or 4
+times, and for each of those cases other than 0 or 4, the + repeats can match
+different numbers of times.) When the remainder of the pattern is such that the
+entire match is going to fail, PCRE has in principle to try every possible
+variation, and this can take an extremely long time, even for relatively short
+strings.
+
+
+An optimization catches some of the more simple cases such as
+
+ (a+)*b
+
+where a literal character follows. Before embarking on the standard matching
+procedure, PCRE checks that there is a "b" later in the subject string, and if
+there is not, it fails the match immediately. However, when there is no
+following literal this optimization cannot be used. You can see the difference
+by comparing the behaviour of
+
+ (a+)*\d
+
+with the pattern above. The earlier pattern, (a+)*b, fails almost instantly when
+applied to a whole line of "a" characters, whereas (a+)*\d takes an
+appreciable time with strings longer than about 20 characters.
+
+
+In many cases, the solution to this kind of performance issue is to use an
+atomic group or a possessive quantifier.
+
+
+AUTHOR
+
+
+Philip Hazel
+
+University Computing Service
+
+Cambridge CB2 3QH, England.
+
+
+
+REVISION
+
+
+Last updated: 06 March 2007
+
+Copyright © 1997-2007 University of Cambridge.
+
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcreposix.html b/src/doc/html/pcreposix.html
new file mode 100644
index 0000000..e52261f
--- /dev/null
+++ b/src/doc/html/pcreposix.html
@@ -0,0 +1,260 @@
+
+
+pcreposix specification
+
+
+pcreposix man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+
SYNOPSIS OF POSIX API
+
+#include <pcreposix.h>
+
+
+int regcomp(regex_t *preg, const char *pattern,
+int cflags);
+
+
+int regexec(regex_t *preg, const char *string,
+size_t nmatch, regmatch_t pmatch[], int eflags);
+
+
+size_t regerror(int errcode, const regex_t *preg,
+char *errbuf, size_t errbuf_size);
+
+
+void regfree(regex_t *preg);
+
+
DESCRIPTION
+
+This set of functions provides a POSIX-style API to the PCRE regular expression
+package. See the
+pcreapi
+documentation for a description of PCRE's native API, which contains much
+additional functionality.
+
+
+The functions described here are just wrapper functions that ultimately call
+the PCRE native API. Their prototypes are defined in the pcreposix.h
+header file, and on Unix systems the library itself is called
+pcreposix.a, so can be accessed by adding -lpcreposix to the
+command for linking an application that uses them. Because the POSIX functions
+call the native ones, it is also necessary to add -lpcre.
+
+
+I have implemented only those option bits that can be reasonably mapped to PCRE
+native options. In addition, the option REG_EXTENDED is defined with the value
+zero. This has no effect, but since programs that are written to the POSIX
+interface often use it, this makes it easier to slot in PCRE as a replacement
+library. Other POSIX options are not even defined.
+
+
+When PCRE is called via these functions, it is only the API that is POSIX-like
+in style. The syntax and semantics of the regular expressions themselves are
+still those of Perl, subject to the setting of various PCRE options, as
+described below. "POSIX-like in style" means that the API approximates to the
+POSIX definition; it is not fully POSIX-compatible, and in multi-byte encoding
+domains it is probably even less compatible.
+
+
+The header for these functions is supplied as pcreposix.h to avoid any
+potential clash with other POSIX libraries. It can, of course, be renamed or
+aliased as regex.h, which is the "correct" name. It provides two
+structure types, regex_t for compiled internal forms, and
+regmatch_t for returning captured substrings. It also defines some
+constants whose names start with "REG_"; these are used for setting options and
+identifying error codes.
+
+
+
+
COMPILING A PATTERN
+
+The function regcomp() is called to compile a pattern into an
+internal form. The pattern is a C string terminated by a binary zero, and
+is passed in the argument pattern. The preg argument is a pointer
+to a regex_t structure that is used as a base for storing information
+about the compiled regular expression.
+
+
+The argument cflags is either zero, or contains one or more of the bits
+defined by the following macros:
+
+ REG_DOTALL
+
+The PCRE_DOTALL option is set when the regular expression is passed for
+compilation to the native function. Note that REG_DOTALL is not part of the
+POSIX standard.
+
+ REG_ICASE
+
+The PCRE_CASELESS option is set when the regular expression is passed for
+compilation to the native function.
+
+ REG_NEWLINE
+
+The PCRE_MULTILINE option is set when the regular expression is passed for
+compilation to the native function. Note that this does not mimic the
+defined POSIX behaviour for REG_NEWLINE (see the following section).
+
+ REG_NOSUB
+
+The PCRE_NO_AUTO_CAPTURE option is set when the regular expression is passed
+for compilation to the native function. In addition, when a pattern that is
+compiled with this flag is passed to regexec() for matching, the
+nmatch and pmatch arguments are ignored, and no captured strings
+are returned.
+
+ REG_UTF8
+
+The PCRE_UTF8 option is set when the regular expression is passed for
+compilation to the native function. This causes the pattern itself and all data
+strings used for matching it to be treated as UTF-8 strings. Note that REG_UTF8
+is not part of the POSIX standard.
+
+
+In the absence of these flags, no options are passed to the native function.
+This means that the regex is compiled with PCRE default semantics. In
+particular, the way it handles newline characters in the subject string is the
+Perl way, not the POSIX way. Note that setting PCRE_MULTILINE has only
+some of the effects specified for REG_NEWLINE. It does not affect the way
+newlines are matched by . (they aren't) or by a negative class such as [^a]
+(they are).
+
+
+The yield of regcomp() is zero on success, and non-zero otherwise. The
+preg structure is filled in on success, and one member of the structure
+is public: re_nsub contains the number of capturing subpatterns in
+the regular expression. Various error codes are defined in the header file.
+
+
MATCHING NEWLINE CHARACTERS
+
+This area is not simple, because POSIX and Perl take different views of things.
+It is not possible to get PCRE to obey POSIX semantics, but then PCRE was never
+intended to be a POSIX engine. The following table lists the different
+possibilities for matching newline characters in PCRE:
+
+ Default Change with
+
+ . matches newline no PCRE_DOTALL
+ newline matches [^a] yes not changeable
+ $ matches \n at end yes PCRE_DOLLAR_ENDONLY
+ $ matches \n in middle no PCRE_MULTILINE
+ ^ matches \n in middle no PCRE_MULTILINE
+
+This is the equivalent table for POSIX:
+
+ Default Change with
+
+ . matches newline yes REG_NEWLINE
+ newline matches [^a] yes REG_NEWLINE
+ $ matches \n at end no REG_NEWLINE
+ $ matches \n in middle no REG_NEWLINE
+ ^ matches \n in middle no REG_NEWLINE
+
+PCRE's behaviour is the same as Perl's, except that there is no equivalent for
+PCRE_DOLLAR_ENDONLY in Perl. In both PCRE and Perl, there is no way to stop
+newline from matching [^a].
+
+
+The default POSIX newline handling can be obtained by setting PCRE_DOTALL and
+PCRE_DOLLAR_ENDONLY, but there is no way to make PCRE behave exactly as for the
+REG_NEWLINE action.
+
+
MATCHING A PATTERN
+
+The function regexec() is called to match a compiled pattern preg
+against a given string, which is by default terminated by a zero byte
+(but see REG_STARTEND below), subject to the options in eflags. These can
+be:
+
+ REG_NOTBOL
+
+The PCRE_NOTBOL option is set when calling the underlying PCRE matching
+function.
+
+ REG_NOTEOL
+
+The PCRE_NOTEOL option is set when calling the underlying PCRE matching
+function.
+
+ REG_STARTEND
+
+The string is considered to start at string + pmatch[0].rm_so and
+to have a terminating NUL located at string + pmatch[0].rm_eo
+(there need not actually be a NUL at that location), regardless of the value of
+nmatch. This is a BSD extension, compatible with but not specified by
+IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software
+intended to be portable to other systems. Note that a non-zero rm_so does
+not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not
+how it is matched.
+
+
+If the pattern was compiled with the REG_NOSUB flag, no data about any matched
+strings is returned. The nmatch and pmatch arguments of
+regexec() are ignored.
+
+
+Otherwise, the portion of the string that was matched, and also any captured
+substrings, are returned via the pmatch argument, which points to an
+array of nmatch structures of type regmatch_t, containing the
+members rm_so and rm_eo. These contain the offset to the first
+character of each substring and the offset to the first character after the end
+of each substring, respectively. The 0th element of the vector relates to the
+entire portion of string that was matched; subsequent elements relate to
+the capturing subpatterns of the regular expression. Unused entries in the
+array have both structure members set to -1.
+
+
+A successful match yields a zero return; various error codes are defined in the
+header file, of which REG_NOMATCH is the "expected" failure code.
+
+
ERROR MESSAGES
+
+The regerror() function maps a non-zero errorcode from either
+regcomp() or regexec() to a printable message. If preg is not
+NULL, the error should have arisen from the use of that structure. A message
+terminated by a binary zero is placed in errbuf. The length of the
+message, including the zero, is limited to errbuf_size. The yield of the
+function is the size of buffer needed to hold the whole message.
+
+
MEMORY USAGE
+
+Compiling a regular expression causes memory to be allocated and associated
+with the preg structure. The function regfree() frees all such
+memory, after which preg may no longer be used as a compiled expression.
+
+
AUTHOR
+
+Philip Hazel
+
+University Computing Service
+
+Cambridge CB2 3QH, England.
+
+
+
REVISION
+
+Last updated: 05 April 2008
+
+Copyright © 1997-2008 University of Cambridge.
+
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcreprecompile.html b/src/doc/html/pcreprecompile.html
new file mode 100644
index 0000000..83da226
--- /dev/null
+++ b/src/doc/html/pcreprecompile.html
@@ -0,0 +1,148 @@
+
+
+pcreprecompile specification
+
+
+pcreprecompile man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+
SAVING AND RE-USING PRECOMPILED PCRE PATTERNS
+
+If you are running an application that uses a large number of regular
+expression patterns, it may be useful to store them in a precompiled form
+instead of having to compile them every time the application is run.
+If you are not using any private character tables (see the
+pcre_maketables()
+documentation), this is relatively straightforward. If you are using private
+tables, it is a little bit more complicated.
+
+
+If you save compiled patterns to a file, you can copy them to a different host
+and run them there. This works even if the new host has the opposite endianness
+to the one on which the patterns were compiled. There may be a small
+performance penalty, but it should be insignificant. However, compiling regular
+expressions with one version of PCRE for use with a different version is not
+guaranteed to work and may cause crashes.
+
+
SAVING A COMPILED PATTERN
+
+The value returned by pcre_compile() points to a single block of memory
+that holds the compiled pattern and associated data. You can find the length of
+this block in bytes by calling pcre_fullinfo() with an argument of
+PCRE_INFO_SIZE. You can then save the data in any appropriate manner. Here is
+sample code that compiles a pattern and writes it to a file. It assumes that
+the variable fd refers to a file that is open for output:
+
+ int erroroffset, rc, size;
+ const char *error;
+ pcre *re;
+
+ re = pcre_compile("my pattern", 0, &error, &erroroffset, NULL);
+ if (re == NULL) { ... handle errors ... }
+ rc = pcre_fullinfo(re, NULL, PCRE_INFO_SIZE, &size);
+ if (rc < 0) { ... handle errors ... }
+ rc = fwrite(re, 1, size, fd);
+ if (rc != size) { ... handle errors ... }
+
+In this example, the bytes that comprise the compiled pattern are copied
+exactly. Note that this is binary data that may contain any of the 256 possible
+byte values. On systems that make a distinction between binary and non-binary
+data, be sure that the file is opened for binary output.
+
+
+If you want to write more than one pattern to a file, you will have to devise a
+way of separating them. For binary data, preceding each pattern with its length
+is probably the most straightforward approach. Another possibility is to write
+out the data in hexadecimal instead of binary, one pattern to a line.
+
+
+Saving compiled patterns in a file is only one possible way of storing them for
+later use. They could equally well be saved in a database, or in the memory of
+some daemon process that passes them via sockets to the processes that want
+them.
+
+
+If the pattern has been studied, it is also possible to save the study data in
+a similar way to the compiled pattern itself. When studying generates
+additional information, pcre_study() returns a pointer to a
+pcre_extra data block. Its format is defined in the
+section on matching a pattern
+in the
+pcreapi
+documentation. The study_data field points to the binary study data, and
+this is what you must save (not the pcre_extra block itself). The length
+of the study data can be obtained by calling pcre_fullinfo() with an
+argument of PCRE_INFO_STUDYSIZE. Remember to check that pcre_study() did
+return a non-NULL value before trying to save the study data.
+
+
RE-USING A PRECOMPILED PATTERN
+
+Re-using a precompiled pattern is straightforward. Having reloaded it into main
+memory, you pass its pointer to pcre_exec() or pcre_dfa_exec() in
+the usual way. This should work even on another host, and even if that host has
+the opposite endianness to the one where the pattern was compiled.
+
+
+However, if you passed a pointer to custom character tables when the pattern
+was compiled (the tableptr argument of pcre_compile()), you must
+now pass a similar pointer to pcre_exec() or pcre_dfa_exec(),
+because the value saved with the compiled pattern will obviously be nonsense. A
+field in a pcre_extra block is used to pass this data, as described in
+the
+section on matching a pattern
+in the
+pcreapi
+documentation.
+
+
+If you did not provide custom character tables when the pattern was compiled,
+the pointer in the compiled pattern is NULL, which causes pcre_exec() to
+use PCRE's internal tables. Thus, you do not need to take any special action at
+run time in this case.
+
+
+If you saved study data with the compiled pattern, you need to create your own
+pcre_extra data block and set the study_data field to point to the
+reloaded study data. You must also set the PCRE_EXTRA_STUDY_DATA bit in the
+flags field to indicate that study data is present. Then pass the
+pcre_extra block to pcre_exec() or pcre_dfa_exec() in the
+usual way.
+
+
COMPATIBILITY WITH DIFFERENT PCRE RELEASES
+
+In general, it is safest to recompile all saved patterns when you update to a
+new PCRE release, though not all updates actually require this. Recompiling is
+definitely needed for release 7.2.
+
+
AUTHOR
+
+Philip Hazel
+
+University Computing Service
+
+Cambridge CB2 3QH, England.
+
+
+
REVISION
+
+Last updated: 13 June 2007
+
+Copyright © 1997-2007 University of Cambridge.
+
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcresample.html b/src/doc/html/pcresample.html
new file mode 100644
index 0000000..6243be6
--- /dev/null
+++ b/src/doc/html/pcresample.html
@@ -0,0 +1,96 @@
+
+
+pcresample specification
+
+
+pcresample man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+PCRE SAMPLE PROGRAM
+
+
+A simple, complete demonstration program, to get you started with using PCRE,
+is supplied in the file pcredemo.c in the PCRE distribution.
+
+
+The program compiles the regular expression that is its first argument, and
+matches it against the subject string in its second argument. No PCRE options
+are set, and default character tables are used. If matching succeeds, the
+program outputs the portion of the subject that matched, together with the
+contents of any captured substrings.
+
+
+If the -g option is given on the command line, the program then goes on to
+check for further matches of the same regular expression in the same subject
+string. The logic is a little bit tricky because of the possibility of matching
+an empty string. Comments in the code explain what is going on.
+
+
+If PCRE is installed in the standard include and library directories for your
+system, you should be able to compile the demonstration program using this
+command:
+
+ gcc -o pcredemo pcredemo.c -lpcre
+
+If PCRE is installed elsewhere, you may need to add additional options to the
+command line. For example, on a Unix-like system that has PCRE installed in
+/usr/local, you can compile the demonstration program using a command
+like this:
+
+ gcc -o pcredemo -I/usr/local/include pcredemo.c -L/usr/local/lib -lpcre
+
+Once you have compiled the demonstration program, you can run simple tests like
+this:
+
+ ./pcredemo 'cat|dog' 'the cat sat on the mat'
+ ./pcredemo -g 'cat|dog' 'the dog sat on the cat'
+
+Note that there is a much more comprehensive test program, called
+pcretest,
+which supports many more facilities for testing regular expressions and the
+PCRE library. The pcredemo program is provided as a simple coding
+example.
+
+
+On some operating systems (e.g. Solaris), when PCRE is not installed in the
+standard library directory, you may get an error like this when you try to run
+pcredemo:
+
+ ld.so.1: a.out: fatal: libpcre.so.0: open failed: No such file or directory
+
+This is caused by the way shared library support works on those systems. You
+need to add
+
+ -R/usr/local/lib
+
+(for example) to the compile command to get round this problem.
+
+
+AUTHOR
+
+
+Philip Hazel
+
+University Computing Service
+
+Cambridge CB2 3QH, England.
+
+
+
+REVISION
+
+
+Last updated: 23 January 2008
+
+Copyright © 1997-2008 University of Cambridge.
+
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcrestack.html b/src/doc/html/pcrestack.html
new file mode 100644
index 0000000..2cc7d26
--- /dev/null
+++ b/src/doc/html/pcrestack.html
@@ -0,0 +1,154 @@
+
+
+pcrestack specification
+
+
+pcrestack man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+PCRE DISCUSSION OF STACK USAGE
+
+
+When you call pcre_exec(), it makes use of an internal function called
+match(). This calls itself recursively at branch points in the pattern,
+in order to remember the state of the match so that it can back up and try a
+different alternative if the first one fails. As matching proceeds deeper and
+deeper into the tree of possibilities, the recursion depth increases.
+
+
+Not all calls of match() increase the recursion depth; for an item such
+as a* it may be called several times at the same level, after matching
+different numbers of a's. Furthermore, in a number of cases where the result of
+the recursive call would immediately be passed back as the result of the
+current call (a "tail recursion"), the function is just restarted instead.
+
+
+The pcre_dfa_exec() function operates in an entirely different way, and
+hardly uses recursion at all. The limit on its complexity is the amount of
+workspace it is given. The comments that follow do NOT apply to
+pcre_dfa_exec(); they are relevant only for pcre_exec().
+
+
+You can set limits on the number of times that match() is called, both in
+total and recursively. If the limit is exceeded, an error occurs. For details,
+see the
+section on extra data for pcre_exec()
+in the
+pcreapi
+documentation.
+
+
+Each time that match() is actually called recursively, it uses memory
+from the process stack. For certain kinds of pattern and data, very large
+amounts of stack may be needed, despite the recognition of "tail recursion".
+You can often reduce the amount of recursion, and therefore the amount of stack
+used, by modifying the pattern that is being matched. Consider, for example,
+this pattern:
+
+ ([^<]|<(?!inet))+
+
+It matches from wherever it starts until it encounters "<inet" or the end of
+the data, and is the kind of pattern that might be used when processing an XML
+file. Each iteration of the outer parentheses matches either one character that
+is not "<" or a "<" that is not followed by "inet". However, each time a
+parenthesis is processed, a recursion occurs, so this formulation uses a stack
+frame for each matched character. For a long string, a lot of stack is
+required. Consider now this rewritten pattern, which matches exactly the same
+strings:
+
+ ([^<]++|<(?!inet))+
+
+This uses very much less stack, because runs of characters that do not contain
+"<" are "swallowed" in one item inside the parentheses. Recursion happens only
+when a "<" character that is not followed by "inet" is encountered (and we
+assume this is relatively rare). A possessive quantifier is used to stop any
+backtracking into the runs of non-"<" characters, but that is not related to
+stack usage.
+
+
+This example shows that one way of avoiding stack problems when matching long
+subject strings is to write repeated parenthesized subpatterns to match more
+than one character whenever possible.
+
+
+In environments where stack memory is constrained, you might want to compile
+PCRE to use heap memory instead of stack for remembering back-up points. This
+makes it run a lot more slowly, however. Details of how to do this are given in
+the
+pcrebuild
+documentation. When built in this way, instead of using the stack, PCRE obtains
+and frees memory by calling the functions that are pointed to by the
+pcre_stack_malloc and pcre_stack_free variables. By default, these
+point to malloc() and free(), but you can replace the pointers to
+cause PCRE to use your own functions. Since the block sizes are always the
+same, and are always freed in reverse order, it may be possible to implement
+customized memory handlers that are more efficient than the standard functions.
+
+
+In Unix-like environments, there is not often a problem with the stack unless
+very long strings are involved, though the default limit on stack size varies
+from system to system. Values from 8Mb to 64Mb are common. You can find your
+default limit by running the command:
+
+ ulimit -s
+
+Unfortunately, the effect of running out of stack is often SIGSEGV, though
+sometimes a more explicit error message is given. You can normally increase the
+limit on stack size by code such as this:
+
+ struct rlimit rlim;
+ getrlimit(RLIMIT_STACK, &rlim);
+ rlim.rlim_cur = 100*1024*1024;
+ setrlimit(RLIMIT_STACK, &rlim);
+
+This reads the current limits (soft and hard) using getrlimit(), then
+attempts to increase the soft limit to 100Mb using setrlimit(). You must
+do this before calling pcre_exec().
+
+
+PCRE has an internal counter that can be used to limit the depth of recursion,
+and thus cause pcre_exec() to give an error code before it runs out of
+stack. By default, the limit is very large, and unlikely ever to operate. It
+can be changed when PCRE is built, and it can also be set when
+pcre_exec() is called. For details of these interfaces, see the
+pcrebuild
+and
+pcreapi
+documentation.
+
+
+As a very rough rule of thumb, you should reckon on about 500 bytes per
+recursion. Thus, if you want to limit your stack usage to 8Mb, you
+should set the limit at 16000 recursions. A 64Mb stack, on the other hand, can
+support around 128000 recursions. The pcretest test program has a command
+line option (-S) that can be used to increase the size of its stack.
+
+
+AUTHOR
+
+
+Philip Hazel
+
+University Computing Service
+
+Cambridge CB2 3QH, England.
+
+
+
+REVISION
+
+
+Last updated: 05 June 2007
+
+Copyright © 1997-2007 University of Cambridge.
+
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcresyntax.html b/src/doc/html/pcresyntax.html
new file mode 100644
index 0000000..29173c7
--- /dev/null
+++ b/src/doc/html/pcresyntax.html
@@ -0,0 +1,457 @@
+
+
+pcresyntax specification
+
+
+pcresyntax man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+
PCRE REGULAR EXPRESSION SYNTAX SUMMARY
+
+The full syntax and semantics of the regular expressions that are supported by
+PCRE are described in the
+pcrepattern
+documentation. This document contains just a quick-reference summary of the
+syntax.
+
+
QUOTING
+
+
+ \x where x is non-alphanumeric is a literal x
+ \Q...\E treat enclosed characters as literal
+
+
+
CHARACTERS
+
+
+ \a alarm, that is, the BEL character (hex 07)
+ \cx "control-x", where x is any character
+ \e escape (hex 1B)
+ \f formfeed (hex 0C)
+ \n newline (hex 0A)
+ \r carriage return (hex 0D)
+ \t tab (hex 09)
+ \ddd character with octal code ddd, or backreference
+ \xhh character with hex code hh
+ \x{hhh..} character with hex code hhh..
+
+
+
CHARACTER TYPES
+
+
+ . any character except newline;
+ in dotall mode, any character whatsoever
+ \C one byte, even in UTF-8 mode (best avoided)
+ \d a decimal digit
+ \D a character that is not a decimal digit
+ \h a horizontal whitespace character
+ \H a character that is not a horizontal whitespace character
+ \p{xx} a character with the xx property
+ \P{xx} a character without the xx property
+ \R a newline sequence
+ \s a whitespace character
+ \S a character that is not a whitespace character
+ \v a vertical whitespace character
+ \V a character that is not a vertical whitespace character
+ \w a "word" character
+ \W a "non-word" character
+ \X an extended Unicode sequence
+
+In PCRE, \d, \D, \s, \S, \w, and \W recognize only ASCII characters.
+
+
GENERAL CATEGORY PROPERTY CODES FOR \p and \P
+
+
+ C Other
+ Cc Control
+ Cf Format
+ Cn Unassigned
+ Co Private use
+ Cs Surrogate
+
+ L Letter
+ Ll Lower case letter
+ Lm Modifier letter
+ Lo Other letter
+ Lt Title case letter
+ Lu Upper case letter
+ L& Ll, Lu, or Lt
+
+ M Mark
+ Mc Spacing mark
+ Me Enclosing mark
+ Mn Non-spacing mark
+
+ N Number
+ Nd Decimal number
+ Nl Letter number
+ No Other number
+
+ P Punctuation
+ Pc Connector punctuation
+ Pd Dash punctuation
+ Pe Close punctuation
+ Pf Final punctuation
+ Pi Initial punctuation
+ Po Other punctuation
+ Ps Open punctuation
+
+ S Symbol
+ Sc Currency symbol
+ Sk Modifier symbol
+ Sm Mathematical symbol
+ So Other symbol
+
+ Z Separator
+ Zl Line separator
+ Zp Paragraph separator
+ Zs Space separator
+
+
+
SCRIPT NAMES FOR \p AND \P
+
+Arabic,
+Armenian,
+Balinese,
+Bengali,
+Bopomofo,
+Braille,
+Buginese,
+Buhid,
+Canadian_Aboriginal,
+Cherokee,
+Common,
+Coptic,
+Cuneiform,
+Cypriot,
+Cyrillic,
+Deseret,
+Devanagari,
+Ethiopic,
+Georgian,
+Glagolitic,
+Gothic,
+Greek,
+Gujarati,
+Gurmukhi,
+Han,
+Hangul,
+Hanunoo,
+Hebrew,
+Hiragana,
+Inherited,
+Kannada,
+Katakana,
+Kharoshthi,
+Khmer,
+Lao,
+Latin,
+Limbu,
+Linear_B,
+Malayalam,
+Mongolian,
+Myanmar,
+New_Tai_Lue,
+Nko,
+Ogham,
+Old_Italic,
+Old_Persian,
+Oriya,
+Osmanya,
+Phags_Pa,
+Phoenician,
+Runic,
+Shavian,
+Sinhala,
+Syloti_Nagri,
+Syriac,
+Tagalog,
+Tagbanwa,
+Tai_Le,
+Tamil,
+Telugu,
+Thaana,
+Thai,
+Tibetan,
+Tifinagh,
+Ugaritic,
+Yi.
+
+
CHARACTER CLASSES
+
+
+ [...] positive character class
+ [^...] negative character class
+ [x-y] range (can be used for hex characters)
+ [[:xxx:]] positive POSIX named set
+ [[:^xxx:]] negative POSIX named set
+
+ alnum alphanumeric
+ alpha alphabetic
+ ascii 0-127
+ blank space or tab
+ cntrl control character
+ digit decimal digit
+ graph printing, excluding space
+ lower lower case letter
+ print printing, including space
+ punct printing, excluding alphanumeric
+ space whitespace
+ upper upper case letter
+ word same as \w
+ xdigit hexadecimal digit
+
+In PCRE, POSIX character set names recognize only ASCII characters. You can use
+\Q...\E inside a character class.
+
+
QUANTIFIERS
+
+
+ ? 0 or 1, greedy
+ ?+ 0 or 1, possessive
+ ?? 0 or 1, lazy
+ * 0 or more, greedy
+ *+ 0 or more, possessive
+ *? 0 or more, lazy
+ + 1 or more, greedy
+ ++ 1 or more, possessive
+ +? 1 or more, lazy
+ {n} exactly n
+ {n,m} at least n, no more than m, greedy
+ {n,m}+ at least n, no more than m, possessive
+ {n,m}? at least n, no more than m, lazy
+ {n,} n or more, greedy
+ {n,}+ n or more, possessive
+ {n,}? n or more, lazy
+
+
+
ANCHORS AND SIMPLE ASSERTIONS
+
+
+ \b word boundary
+ \B not a word boundary
+ ^ start of subject
+ also after internal newline in multiline mode
+ \A start of subject
+ $ end of subject
+ also before newline at end of subject
+ also before internal newline in multiline mode
+ \Z end of subject
+ also before newline at end of subject
+ \z end of subject
+ \G first matching position in subject
+
+
+
MATCH POINT RESET
+
+
+ \K reset start of match
+
+
+
ALTERNATION
+
+
+ expr|expr|expr...
+
+
+
CAPTURING
+
+
+ (...) capturing group
+ (?<name>...) named capturing group (Perl)
+ (?'name'...) named capturing group (Perl)
+ (?P<name>...) named capturing group (Python)
+ (?:...) non-capturing group
+ (?|...) non-capturing group; reset group numbers for
+ capturing groups in each alternative
+
+
+
ATOMIC GROUPS
+
+
+ (?>...) atomic, non-capturing group
+
+
+
COMMENT
+
+
+ (?#....) comment (not nestable)
+
+
+
OPTION SETTING
+
+
+ (?i) caseless
+ (?J) allow duplicate names
+ (?m) multiline
+ (?s) single line (dotall)
+ (?U) default ungreedy (lazy)
+ (?x) extended (ignore white space)
+ (?-...) unset option(s)
+
+
+
LOOKAHEAD AND LOOKBEHIND ASSERTIONS
+
+
+ (?=...) positive look ahead
+ (?!...) negative look ahead
+ (?<=...) positive look behind
+ (?<!...) negative look behind
+
+Each top-level branch of a look behind must be of a fixed length.
+
+
BACKREFERENCES
+
+
+ \n reference by number (can be ambiguous)
+ \gn reference by number
+ \g{n} reference by number
+ \g{-n} relative reference by number
+ \k<name> reference by name (Perl)
+ \k'name' reference by name (Perl)
+ \g{name} reference by name (Perl)
+ \k{name} reference by name (.NET)
+ (?P=name) reference by name (Python)
+
+
+
SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)
+
+
+ (?R) recurse whole pattern
+ (?n) call subpattern by absolute number
+ (?+n) call subpattern by relative number
+ (?-n) call subpattern by relative number
+ (?&name) call subpattern by name (Perl)
+ (?P>name) call subpattern by name (Python)
+ \g<name> call subpattern by name (Oniguruma)
+ \g'name' call subpattern by name (Oniguruma)
+ \g<n> call subpattern by absolute number (Oniguruma)
+ \g'n' call subpattern by absolute number (Oniguruma)
+ \g<+n> call subpattern by relative number (PCRE extension)
+ \g'+n' call subpattern by relative number (PCRE extension)
+ \g<-n> call subpattern by relative number (PCRE extension)
+ \g'-n' call subpattern by relative number (PCRE extension)
+
+
+
CONDITIONAL PATTERNS
+
+
+ (?(condition)yes-pattern)
+ (?(condition)yes-pattern|no-pattern)
+
+ (?(n)... absolute reference condition
+ (?(+n)... relative reference condition
+ (?(-n)... relative reference condition
+ (?(<name>)... named reference condition (Perl)
+ (?('name')... named reference condition (Perl)
+ (?(name)... named reference condition (PCRE)
+ (?(R)... overall recursion condition
+ (?(Rn)... specific group recursion condition
+ (?(R&name)... specific recursion condition
+ (?(DEFINE)... define subpattern for reference
+ (?(assert)... assertion condition
+
+
+
BACKTRACKING CONTROL
+
+The following act immediately they are reached:
+
+ (*ACCEPT) force successful match
+ (*FAIL) force backtrack; synonym (*F)
+
+The following act only when a subsequent match failure causes a backtrack to
+reach them. They all force a match failure, but they differ in what happens
+afterwards. Those that advance the start-of-match point do so only if the
+pattern is not anchored.
+
+ (*COMMIT) overall failure, no advance of starting point
+ (*PRUNE) advance to next starting character
+ (*SKIP) advance start to current matching position
+ (*THEN) local failure, backtrack to next alternation
+
+
+
NEWLINE CONVENTIONS
+
+These are recognized only at the very start of the pattern or after a
+(*BSR_...) option.
+
+ (*CR)
+ (*LF)
+ (*CRLF)
+ (*ANYCRLF)
+ (*ANY)
+
+
+
WHAT \R MATCHES
+
+These are recognized only at the very start of the pattern or after a
+(*...) option that sets the newline convention.
+
+ (*BSR_ANYCRLF)
+ (*BSR_UNICODE)
+
+
+
CALLOUTS
+
+
+ (?C) callout
+ (?Cn) callout with data n
+
+
+
SEE ALSO
+
+pcrepattern(3), pcreapi(3), pcrecallout(3),
+pcrematching(3), pcre(3).
+
+
AUTHOR
+
+Philip Hazel
+
+University Computing Service
+
+Cambridge CB2 3QH, England.
+
+
+
REVISION
+
+Last updated: 09 April 2008
+
+Copyright © 1997-2008 University of Cambridge.
+
+
+Return to the PCRE index page.
+
diff --git a/src/doc/html/pcretest.html b/src/doc/html/pcretest.html
new file mode 100644
index 0000000..5482eae
--- /dev/null
+++ b/src/doc/html/pcretest.html
@@ -0,0 +1,706 @@
+
+
+pcretest specification
+
+
+pcretest man page
+
+Return to the PCRE index page.
+
+
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+
+
+
SYNOPSIS
+
+pcretest [options] [source] [destination]
+
+
+pcretest was written as a test program for the PCRE regular expression
+library itself, but it can also be used for experimenting with regular
+expressions. This document describes the features of the test program; for
+details of the regular expressions themselves, see the
+pcrepattern
+documentation. For details of the PCRE library function calls and their
+options, see the
+pcreapi
+documentation.
+
+
OPTIONS
+
+-b
+Behave as if each regex has the /B (show bytecode) modifier; the internal
+form is output after compilation.
+
+
+-C
+Output the version number of the PCRE library, and all available information
+about the optional features that are included, and then exit.
+
+
+-d
+Behave as if each regex has the /D (debug) modifier; the internal
+form and information about the compiled pattern are output after compilation;
+-d is equivalent to -b -i.
+
+
+-dfa
+Behave as if each data line contains the \D escape sequence; this causes the
+alternative matching function, pcre_dfa_exec(), to be used instead of the
+standard pcre_exec() function (more detail is given below).
+
+
+-help
+Output a brief summary of these options and then exit.
+
+
+-i
+Behave as if each regex has the /I modifier; information about the
+compiled pattern is given after compilation.
+
+
+-m
+Output the size of each compiled pattern after it has been compiled. This is
+equivalent to adding /M to each regular expression. For compatibility
+with earlier versions of pcretest, -s is a synonym for -m.
+
+
+-o osize
+Set the number of elements in the output vector that is used when calling
+pcre_exec() or pcre_dfa_exec() to be osize. The default value
+is 45, which is enough for 14 capturing subexpressions for pcre_exec() or
+22 different matches for pcre_dfa_exec(). The vector size can be
+changed for individual matching calls by including \O in the data line (see
+below).
+
+
+-p
+Behave as if each regex has the /P modifier; the POSIX wrapper API is
+used to call PCRE. None of the other options has any effect when -p is
+set.
+
+
+-q
+Do not output the version number of pcretest at the start of execution.
+
+
+-S size
+On Unix-like systems, set the size of the runtime stack to size
+megabytes.
+
+
+-t
+Run each compile, study, and match many times with a timer, and output
+resulting time per compile or match (in milliseconds). Do not set -m with
+-t, because you will then get the size output a zillion times, and the
+timing will be distorted. You can control the number of iterations that are
+used for timing by following -t with a number (as a separate item on the
+command line). For example, "-t 1000" would iterate 1000 times. The default is
+to iterate 500000 times.
+
+
+-tm
+This is like -t except that it times only the matching phase, not the
+compile or study phases.
+
+
DESCRIPTION
+
+If pcretest is given two filename arguments, it reads from the first and
+writes to the second. If it is given only one filename argument, it reads from
+that file and writes to stdout. Otherwise, it reads from stdin and writes to
+stdout, and prompts for each line of input, using "re>" to prompt for regular
+expressions, and "data>" to prompt for data lines.
+
+
+When pcretest is built, a configuration option can specify that it should
+be linked with the libreadline library. When this is done, if the input
+is from a terminal, it is read using the readline() function. This
+provides line-editing and history facilities. The output from the -help
+option states whether or not readline() will be used.
+
+
+The program handles any number of sets of input on a single input file. Each
+set starts with a regular expression, and continues with any number of data
+lines to be matched against the pattern.
+
+
+Each data line is matched separately and independently. If you want to do
+multi-line matches, you have to use the \n escape sequence (or \r or \r\n,
+etc., depending on the newline setting) in a single line of input to encode the
+newline sequences. There is no limit on the length of data lines; the input
+buffer is automatically extended if it is too small.
+
+
+An empty line signals the end of the data lines, at which point a new regular
+expression is read. The regular expressions are given enclosed in any
+non-alphanumeric delimiters other than backslash, for example:
+
+ /(a|bc)x+yz/
+
+White space before the initial delimiter is ignored. A regular expression may
+be continued over several input lines, in which case the newline characters are
+included within it. It is possible to include the delimiter within the pattern
+by escaping it, for example
+
+ /abc\/def/
+
+If you do so, the escape and the delimiter form part of the pattern, but since
+delimiters are always non-alphanumeric, this does not affect its interpretation.
+If the terminating delimiter is immediately followed by a backslash, for
+example,
+
+ /abc/\
+
+then a backslash is added to the end of the pattern. This is done to provide a
+way of testing the error condition that arises if a pattern finishes with a
+backslash, because
+
+ /abc\/
+
+is interpreted as the first line of a pattern that starts with "abc/", causing
+pcretest to read the next line as a continuation of the regular expression.
+
+
PATTERN MODIFIERS
+
+A pattern may be followed by any number of modifiers, which are mostly single
+characters. Following Perl usage, these are referred to below as, for example,
+"the /i modifier", even though the delimiter of the pattern need not
+always be a slash, and no slash is used when writing modifiers. Whitespace may
+appear between the final pattern delimiter and the first modifier, and between
+the modifiers themselves.
+
+
+The /i, /m, /s, and /x modifiers set the PCRE_CASELESS,
+PCRE_MULTILINE, PCRE_DOTALL, or PCRE_EXTENDED options, respectively, when
+pcre_compile() is called. These four modifier letters have the same
+effect as they do in Perl. For example:
+
+ /caseless/i
+
+The following table shows additional modifiers for setting PCRE options that do
+not correspond to anything in Perl:
+
+ /A PCRE_ANCHORED
+ /C PCRE_AUTO_CALLOUT
+ /E PCRE_DOLLAR_ENDONLY
+ /f PCRE_FIRSTLINE
+ /J PCRE_DUPNAMES
+ /N PCRE_NO_AUTO_CAPTURE
+ /U PCRE_UNGREEDY
+ /X PCRE_EXTRA
+ /<JS> PCRE_JAVASCRIPT_COMPAT
+ /<cr> PCRE_NEWLINE_CR
+ /<lf> PCRE_NEWLINE_LF
+ /<crlf> PCRE_NEWLINE_CRLF
+ /<anycrlf> PCRE_NEWLINE_ANYCRLF
+ /<any> PCRE_NEWLINE_ANY
+ /<bsr_anycrlf> PCRE_BSR_ANYCRLF
+ /<bsr_unicode> PCRE_BSR_UNICODE
+
+Those specifying line ending sequences are literal strings as shown, but the
+letters can be in either case. This example sets multiline matching with CRLF
+as the line ending sequence:
+
+ /^abc/m<crlf>
+
+Details of the meanings of these PCRE options are given in the
+pcreapi
+documentation.
+
+
+Finding all matches in a string
+
+
+Searching for all possible matches within each subject string can be requested
+by the /g or /G modifier. After finding a match, PCRE is called
+again to search the remainder of the subject string. The difference between
+/g and /G is that the former uses the startoffset argument to
+pcre_exec() to start searching at a new point within the entire string
+(which is in effect what Perl does), whereas the latter passes over a shortened
+substring. This makes a difference to the matching process if the pattern
+begins with a lookbehind assertion (including \b or \B).
+
+
+If any call to pcre_exec() in a /g or /G sequence matches an
+empty string, the next call is done with the PCRE_NOTEMPTY and PCRE_ANCHORED
+flags set in order to search for another, non-empty, match at the same point.
+If this second match fails, the start offset is advanced by one, and the normal
+match is retried. This imitates the way Perl handles such cases when using the
+/g modifier or the split() function.
+
+
+Other modifiers
+
+
+There are yet more modifiers for controlling the way pcretest
+operates.
+
+
+The /+ modifier requests that as well as outputting the substring that
+matched the entire pattern, pcretest should in addition output the remainder of
+the subject string. This is useful for tests where the subject contains
+multiple copies of the same substring.
+
+
+The /B modifier is a debugging feature. It requests that pcretest
+output a representation of the compiled byte code after compilation. Normally
+this information contains length and offset values; however, if /Z is
+also present, this data is replaced by spaces. This is a special feature for
+use in the automatic test scripts; it ensures that the same output is generated
+for different internal link sizes.
+
+
+The /L modifier must be followed directly by the name of a locale, for
+example,
+
+ /pattern/Lfr_FR
+
+For this reason, it must be the last modifier. The given locale is set,
+pcre_maketables() is called to build a set of character tables for the
+locale, and this is then passed to pcre_compile() when compiling the
+regular expression. Without an /L modifier, NULL is passed as the tables
+pointer; that is, /L applies only to the expression on which it appears.
+
+
+The /I modifier requests that pcretest output information about the
+compiled pattern (whether it is anchored, has a fixed first character, and
+so on). It does this by calling pcre_fullinfo() after compiling a
+pattern. If the pattern is studied, the results of that are also output.
+
+
+The /D modifier is a PCRE debugging feature, and is equivalent to
+/BI, that is, both the /B and the /I modifiers.
+
+
+The /F modifier causes pcretest to flip the byte order of the
+fields in the compiled pattern that contain 2-byte and 4-byte numbers. This
+facility is for testing the feature in PCRE that allows it to execute patterns
+that were compiled on a host with a different endianness. This feature is not
+available when the POSIX interface to PCRE is being used, that is, when the
+/P pattern modifier is specified. See also the section about saving and
+reloading compiled patterns below.
+
+
+The /S modifier causes pcre_study() to be called after the
+expression has been compiled, and the results used when the expression is
+matched.
+
+
+The /M modifier causes the size of the memory block used to hold the compiled
+pattern to be output.
+
+
+The /P modifier causes pcretest to call PCRE via the POSIX wrapper
+API rather than its native API. When this is done, all other modifiers except
+/i, /m, and /+ are ignored. REG_ICASE is set if /i is
+present, and REG_NEWLINE is set if /m is present. The wrapper functions
+force PCRE_DOLLAR_ENDONLY always, and PCRE_DOTALL unless REG_NEWLINE is set.
+
+
+The /8 modifier causes pcretest to call PCRE with the PCRE_UTF8
+option set. This turns on support for UTF-8 character handling in PCRE,
+provided that it was compiled with this support enabled. This modifier also
+causes any non-printing characters in output strings to be printed using the
+\x{hh...} notation if they are valid UTF-8 sequences.
+
+
+If the /? modifier is used with /8, it causes pcretest to
+call pcre_compile() with the PCRE_NO_UTF8_CHECK option, to suppress the
+checking of the string for UTF-8 validity.
+
+
DATA LINES
+
+Before each data line is passed to pcre_exec(), leading and trailing
+whitespace is removed, and it is then scanned for \ escapes. Some of these are
+pretty esoteric features, intended for checking out some of the more
+complicated features of PCRE. If you are just testing "ordinary" regular
+expressions, you probably don't need any of these. The following escapes are
+recognized:
+
+ \a alarm (BEL, \x07)
+ \b backspace (\x08)
+ \e escape (\x1b)
+ \f formfeed (\x0c)
+ \n newline (\x0a)
+ \qdd set the PCRE_MATCH_LIMIT limit to dd (any number of digits)
+ \r carriage return (\x0d)
+ \t tab (\x09)
+ \v vertical tab (\x0b)
+ \nnn octal character (up to 3 octal digits)
+ \xhh hexadecimal character (up to 2 hex digits)
+ \x{hh...} hexadecimal character, any number of digits in UTF-8 mode
+ \A pass the PCRE_ANCHORED option to pcre_exec() or pcre_dfa_exec()
+ \B pass the PCRE_NOTBOL option to pcre_exec() or pcre_dfa_exec()
+ \Cdd call pcre_copy_substring() for substring dd after a successful match (number less than 32)
+ \Cname call pcre_copy_named_substring() for substring "name" after a successful match (name terminated
+ by next non-alphanumeric character)
+ \C+ show the current captured substrings at callout time
+ \C- do not supply a callout function
+ \C!n return 1 instead of 0 when callout number n is reached
+ \C!n!m return 1 instead of 0 when callout number n is reached for the mth time
+ \C*n pass the number n (may be negative) as callout data; this is used as the callout return value
+ \D use the pcre_dfa_exec() match function
+ \F only shortest match for pcre_dfa_exec()
+ \Gdd call pcre_get_substring() for substring dd after a successful match (number less than 32)
+ \Gname call pcre_get_named_substring() for substring "name" after a successful match (name terminated
+ by next non-alphanumeric character)
+ \L call pcre_get_substringlist() after a successful match
+ \M discover the minimum MATCH_LIMIT and MATCH_LIMIT_RECURSION settings
+ \N pass the PCRE_NOTEMPTY option to pcre_exec() or pcre_dfa_exec()
+ \Odd set the size of the output vector passed to pcre_exec() to dd (any number of digits)
+ \P pass the PCRE_PARTIAL option to pcre_exec() or pcre_dfa_exec()
+ \Qdd set the PCRE_MATCH_LIMIT_RECURSION limit to dd (any number of digits)
+ \R pass the PCRE_DFA_RESTART option to pcre_dfa_exec()
+ \S output details of memory get/free calls during matching
+ \Z pass the PCRE_NOTEOL option to pcre_exec() or pcre_dfa_exec()
+ \? pass the PCRE_NO_UTF8_CHECK option to pcre_exec() or pcre_dfa_exec()
+ \>dd start the match at offset dd (any number of digits);
+ this sets the startoffset argument for pcre_exec() or pcre_dfa_exec()
+ \<cr> pass the PCRE_NEWLINE_CR option to pcre_exec() or pcre_dfa_exec()
+ \<lf> pass the PCRE_NEWLINE_LF option to pcre_exec() or pcre_dfa_exec()
+ \<crlf> pass the PCRE_NEWLINE_CRLF option to pcre_exec() or pcre_dfa_exec()
+ \<anycrlf> pass the PCRE_NEWLINE_ANYCRLF option to pcre_exec() or pcre_dfa_exec()
+ \<any> pass the PCRE_NEWLINE_ANY option to pcre_exec() or pcre_dfa_exec()
+
+The escapes that specify line ending sequences are literal strings, exactly as
+shown. No more than one newline setting should be present in any data line.
+
+
+A backslash followed by anything else just escapes the anything else. If
+the very last character is a backslash, it is ignored. This gives a way of
+passing an empty line as data, since a real empty line terminates the data
+input.
+
+
+If \M is present, pcretest calls pcre_exec() several times, with
+different values in the match_limit and match_limit_recursion
+fields of the pcre_extra data structure, until it finds the minimum
+numbers for each parameter that allow pcre_exec() to complete. The
+match_limit number is a measure of the amount of backtracking that takes
+place, and checking it out can be instructive. For most simple matches, the
+number is quite small, but for patterns with very large numbers of matching
+possibilities, it can become large very quickly with increasing length of
+subject string. The match_limit_recursion number is a measure of how much
+stack (or, if PCRE is compiled with NO_RECURSE, how much heap) memory is needed
+to complete the match attempt.
+
+
+When \O is used, the value specified may be higher or lower than the size set
+by the -O command line option (or defaulted to 45); \O applies only to
+the call of pcre_exec() for the line in which it appears.
+
+
+If the /P modifier was present on the pattern, causing the POSIX wrapper
+API to be used, the only option-setting sequences that have any effect are \B
+and \Z, causing REG_NOTBOL and REG_NOTEOL, respectively, to be passed to
+regexec().
+
+
+The use of \x{hh...} to represent UTF-8 characters is not dependent on the use
+of the /8 modifier on the pattern. It is recognized always. There may be
+any number of hexadecimal digits inside the braces. The result is from one to
+six bytes, encoded according to the original UTF-8 rules of RFC 2279. This
+allows for values in the range 0 to 0x7FFFFFFF. Note that not all of those are
+valid Unicode code points, or indeed valid UTF-8 characters according to the
+later rules in RFC 3629.
+
+
THE ALTERNATIVE MATCHING FUNCTION
+
+By default, pcretest uses the standard PCRE matching function,
+pcre_exec() to match each data line. From release 6.0, PCRE supports an
+alternative matching function, pcre_dfa_exec(), which operates in a
+different way, and has some restrictions. The differences between the two
+functions are described in the
+pcrematching
+documentation.
+
+
+If a data line contains the \D escape sequence, or if the command line
+contains the -dfa option, the alternative matching function is called.
+This function finds all possible matches at a given point. If, however, the \F
+escape sequence is present in the data line, it stops after the first match is
+found. This is always the shortest possible match.
+
+
DEFAULT OUTPUT FROM PCRETEST
+
+This section describes the output when the normal matching function,
+pcre_exec(), is being used.
+
+
+When a match succeeds, pcretest outputs the list of captured substrings that
+pcre_exec() returns, starting with number 0 for the string that matched
+the whole pattern. Otherwise, it outputs "No match" or "Partial match"
+when pcre_exec() returns PCRE_ERROR_NOMATCH or PCRE_ERROR_PARTIAL,
+respectively, and otherwise the PCRE negative error number. Here is an example
+of an interactive pcretest run.
+
+ $ pcretest
+ PCRE version 7.0 30-Nov-2006
+
+ re> /^abc(\d+)/
+ data> abc123
+ 0: abc123
+ 1: 123
+ data> xyz
+ No match
+
+Note that unset capturing substrings that are not followed by one that is set
+are not returned by pcre_exec(), and are not shown by pcretest. In
+the following example, there are two capturing substrings, but when the first
+data line is matched, the second, unset substring is not shown. An "internal"
+unset substring is shown as "<unset>", as for the second data line.
+
+ re> /(a)|(b)/
+ data> a
+ 0: a
+ 1: a
+ data> b
+ 0: b
+ 1: <unset>
+ 2: b
+
+If the strings contain any non-printing characters, they are output as \xhh
+escapes, or as \x{...} escapes if the /8 modifier was present on the
+pattern. See below for the definition of non-printing characters. If the
+pattern has the /+ modifier, the output for substring 0 is followed by
+the rest of the subject string, identified by "0+" like this:
+
+ re> /cat/+
+ data> cataract
+ 0: cat
+ 0+ aract
+
+If the pattern has the /g or /G modifier, the results of successive
+matching attempts are output in sequence, like this:
+
+ re> /\Bi(\w\w)/g
+ data> Mississippi
+ 0: iss
+ 1: ss
+ 0: iss
+ 1: ss
+ 0: ipp
+ 1: pp
+
+"No match" is output only if the first match attempt fails.
+
+
+If any of the sequences \C, \G, or \L are present in a
+data line that is successfully matched, the substrings extracted by the
+convenience functions are output with C, G, or L after the string number
+instead of a colon. This is in addition to the normal full list. The string
+length (that is, the return from the extraction function) is given in
+parentheses after each string for \C and \G.
+
+
+Note that whereas patterns can be continued over several lines (a plain ">"
+prompt is used for continuations), data lines may not. However newlines can be
+included in data by means of the \n escape (or \r, \r\n, etc., depending on
+the newline sequence setting).
+
+
OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
+
+When the alternative matching function, pcre_dfa_exec(), is used (by
+means of the \D escape sequence or the -dfa command line option), the
+output consists of a list of all the matches that start at the first point in
+the subject where there is at least one match. For example:
+
+ re> /(tang|tangerine|tan)/
+ data> yellow tangerine\D
+ 0: tangerine
+ 1: tang
+ 2: tan
+
+(Using the normal matching function on this data finds only "tang".) The
+longest matching string is always given first (and numbered zero).
+
+
+If /g is present on the pattern, the search for further matches resumes
+at the end of the longest match. For example:
+
+ re> /(tang|tangerine|tan)/g
+ data> yellow tangerine and tangy sultana\D
+ 0: tangerine
+ 1: tang
+ 2: tan
+ 0: tang
+ 1: tan
+ 0: tan
+
+Since the matching function does not support substring capture, the escape
+sequences that are concerned with captured substrings are not relevant.
+
+
RESTARTING AFTER A PARTIAL MATCH
+
+When the alternative matching function has given the PCRE_ERROR_PARTIAL return,
+indicating that the subject partially matched the pattern, you can restart the
+match with additional subject data by means of the \R escape sequence. For
+example:
+
+ re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
+ data> 23ja\P\D
+ Partial match: 23ja
+ data> n05\R\D
+ 0: n05
+
+For further information about partial matching, see the
+pcrepartial
+documentation.
+
+
CALLOUTS
+
+If the pattern contains any callout requests, pcretest's callout function
+is called during matching. This works with both matching functions. By default,
+the called function displays the callout number, the start and current
+positions in the text at the callout time, and the next pattern item to be
+tested. For example, the output
+
+ --->pqrabcdef
+ 0 ^ ^ \d
+
+indicates that callout number 0 occurred for a match attempt starting at the
+fourth character of the subject string, when the pointer was at the seventh
+character of the data, and when the next pattern item was \d. Just one
+circumflex is output if the start and current positions are the same.
+
+
+Callouts numbered 255 are assumed to be automatic callouts, inserted as a
+result of the /C pattern modifier. In this case, instead of showing the
+callout number, the offset in the pattern, preceded by a plus, is output. For
+example:
+
+ re> /\d?[A-E]\*/C
+ data> E*
+ --->E*
+ +0 ^ \d?
+ +3 ^ [A-E]
+ +8 ^^ \*
+ +10 ^ ^
+ 0: E*
+
+The callout function in pcretest returns zero (carry on matching) by
+default, but you can use a \C item in a data line (as described above) to
+change this.
+
+
+Inserting callouts can be helpful when using pcretest to check
+complicated regular expressions. For further information about callouts, see
+the
+pcrecallout
+documentation.
+
+
NON-PRINTING CHARACTERS
+
+When pcretest is outputting text in the compiled version of a pattern,
+bytes other than 32-126 are always treated as non-printing characters and are
+therefore shown as hex escapes.
+
+
+When pcretest is outputting text that is a matched part of a subject
+string, it behaves in the same way, unless a different locale has been set for
+the pattern (using the /L modifier). In this case, the isprint()
+function is used to distinguish printing and non-printing characters.
+
+
SAVING AND RELOADING COMPILED PATTERNS
+
+The facilities described in this section are not available when the POSIX
+interface to PCRE is being used, that is, when the /P pattern modifier is
+specified.
+
+
+When the POSIX interface is not in use, you can cause pcretest to write a
+compiled pattern to a file, by following the modifiers with > and a file name.
+For example:
+
+ /pattern/im >/some/file
+
+See the
+pcreprecompile
+documentation for a discussion about saving and re-using compiled patterns.
+
+
+The data that is written is binary. The first eight bytes are the length of the
+compiled pattern data followed by the length of the optional study data, each
+written as four bytes in big-endian order (most significant byte first). If
+there is no study data (either the pattern was not studied, or studying did not
+return any data), the second length is zero. The lengths are followed by an
+exact copy of the compiled pattern. If there is additional study data, this
+follows immediately after the compiled pattern. After writing the file,
+pcretest expects to read a new pattern.
+
+
+A saved pattern can be reloaded into pcretest by specifying < and a file
+name instead of a pattern. The name of the file must not contain a < character,
+as otherwise pcretest will interpret the line as a pattern delimited by <
+characters.
+For example:
+
+ re> </some/file
+ Compiled regex loaded from /some/file
+ No study data
+
+When the pattern has been loaded, pcretest proceeds to read data lines in
+the usual way.
+
+
+You can copy a file written by pcretest to a different host and reload it
+there, even if the new host has opposite endianness to the one on which the
+pattern was compiled. For example, you can compile on an i86 machine and run on
+a SPARC machine.
+
+
+File names for saving and reloading can be absolute or relative, but note that
+the shell facility of expanding a file name that starts with a tilde (~) is not
+available.
+
+
+The ability to save and reload files in pcretest is intended for testing
+and experimentation. It is not intended for production use because only a
+single pattern can be written to a file. Furthermore, there is no facility for
+supplying custom character tables for use with a reloaded pattern. If the
+original pattern was compiled with custom tables, an attempt to match a subject
+string using a reloaded pattern is likely to cause pcretest to crash.
+Finally, if you attempt to load a file that is not in the correct format, the
+result is undefined.
+
+
SEE ALSO
+
+pcre(3), pcreapi(3), pcrecallout(3), pcrematching(3),
+pcrepartial(3), pcrepattern(3), pcreprecompile(3).
+
+
AUTHOR
+
+Philip Hazel
+
+University Computing Service
+
+Cambridge CB2 3QH, England.
+
+
+
REVISION
+
+Last updated: 12 April 2008
+
+Copyright © 1997-2008 University of Cambridge.
+
+
+Return to the PCRE index page.
+
diff --git a/src/doc/index.html.src b/src/doc/index.html.src
new file mode 100644
index 0000000..888471f
--- /dev/null
+++ b/src/doc/index.html.src
@@ -0,0 +1,140 @@
+
+
+
+PCRE specification
+
+
+Perl-compatible Regular Expressions (PCRE)
+
+The HTML documentation for PCRE comprises the following pages:
+
+
+
+pcre |
+ Introductory page |
+
+pcre-config |
+ Information about the installation configuration |
+
+pcreapi |
+ PCRE's native API |
+
+pcrebuild |
+ Options for building PCRE |
+
+pcrecallout |
+ The callout facility |
+
+pcrecompat |
+ Compatibility with Perl |
+
+pcrecpp |
+ The C++ wrapper for the PCRE library |
+
+pcregrep |
+ The pcregrep command |
+
+pcrematching |
+ Discussion of the two matching algorithms |
+
+pcrepartial |
+ Using PCRE for partial matching |
+
+pcrepattern |
+ Specification of the regular expressions supported by PCRE |
+
+pcreperform |
+ Some comments on performance |
+
+pcreposix |
+ The POSIX API to the PCRE library |
+
+pcreprecompile |
+ How to save and re-use compiled patterns |
+
+pcresample |
+ Description of the sample program |
+
+pcrestack |
+ Discussion of PCRE's stack usage |
+
+pcresyntax |
+ Syntax quick-reference summary |
+
+pcretest |
+ The pcretest command for testing PCRE |
+
+
+
+There are also individual pages that summarize the interface for each function
+in the library:
+
+
+
+
+
diff --git a/src/doc/pcre-config.1 b/src/doc/pcre-config.1
new file mode 100644
index 0000000..afbd3a0
--- /dev/null
+++ b/src/doc/pcre-config.1
@@ -0,0 +1,73 @@
+.TH PCRE-CONFIG 1
+.SH NAME
+pcre-config - program to return PCRE configuration
+.SH SYNOPSIS
+.rs
+.sp
+.B pcre-config [--prefix] [--exec-prefix] [--version] [--libs]
+.ti +5n
+.B [--libs-posix] [--cflags] [--cflags-posix]
+.
+.
+.SH DESCRIPTION
+.rs
+.sp
+\fBpcre-config\fP returns the configuration of the installed PCRE
+libraries and the options required to compile a program to use them.
+.
+.
+.SH OPTIONS
+.rs
+.TP 10
+\fB--prefix\fP
+Writes the directory prefix used in the PCRE installation for architecture
+independent files (\fI/usr\fP on many systems, \fI/usr/local\fP on some
+systems) to the standard output.
+.TP 10
+\fB--exec-prefix\fP
+Writes the directory prefix used in the PCRE installation for architecture
+dependent files (normally the same as \fB--prefix\fP) to the standard output.
+.TP 10
+\fB--version\fP
+Writes the version number of the installed PCRE libraries to the standard
+output.
+.TP 10
+\fB--libs\fP
+Writes to the standard output the command line options required to link
+with PCRE (\fB-lpcre\fP on many systems).
+.TP 10
+\fB--libs-posix\fP
+Writes to the standard output the command line options required to link with
+the PCRE posix emulation library (\fB-lpcreposix\fP \fB-lpcre\fP on many
+systems).
+.TP 10
+\fB--cflags\fP
+Writes to the standard output the command line options required to compile
+files that use PCRE (this may include some \fB-I\fP options, but is blank on
+many systems).
+.TP 10
+\fB--cflags-posix\fP
+Writes to the standard output the command line options required to compile
+files that use the PCRE posix emulation library (this may include some \fB-I\fP
+options, but is blank on many systems).
+.
+.
+.SH "SEE ALSO"
+.rs
+.sp
+\fBpcre(3)\fP
+.
+.
+.SH AUTHOR
+.rs
+.sp
+This manual page was originally written by Mark Baker for the Debian GNU/Linux
+system. It has been slightly revised as a generic PCRE man page.
+.
+.
+.SH REVISION
+.rs
+.sp
+.nf
+Last updated: 18 April 2007
+.fi
diff --git a/src/doc/pcre-config.txt b/src/doc/pcre-config.txt
new file mode 100644
index 0000000..c979d45
--- /dev/null
+++ b/src/doc/pcre-config.txt
@@ -0,0 +1,67 @@
+PCRE-CONFIG(1) PCRE-CONFIG(1)
+
+
+
+NAME
+ pcre-config - program to return PCRE configuration
+
+SYNOPSIS
+
+ pcre-config [--prefix] [--exec-prefix] [--version] [--libs]
+ [--libs-posix] [--cflags] [--cflags-posix]
+
+
+DESCRIPTION
+
+ pcre-config returns the configuration of the installed PCRE libraries
+ and the options required to compile a program to use them.
+
+
+OPTIONS
+
+ --prefix Writes the directory prefix used in the PCRE installation for
+ architecture independent files (/usr on many systems,
+ /usr/local on some systems) to the standard output.
+
+ --exec-prefix
+ Writes the directory prefix used in the PCRE installation for
+ architecture dependent files (normally the same as --prefix)
+ to the standard output.
+
+ --version Writes the version number of the installed PCRE libraries to
+ the standard output.
+
+ --libs Writes to the standard output the command line options
+ required to link with PCRE (-lpcre on many systems).
+
+ --libs-posix
+ Writes to the standard output the command line options
+ required to link with the PCRE posix emulation library
+ (-lpcreposix -lpcre on many systems).
+
+ --cflags Writes to the standard output the command line options
+ required to compile files that use PCRE (this may include
+ some -I options, but is blank on many systems).
+
+ --cflags-posix
+ Writes to the standard output the command line options
+ required to compile files that use the PCRE posix emulation
+ library (this may include some -I options, but is blank on
+ many systems).
+
+
+SEE ALSO
+
+ pcre(3)
+
+
+AUTHOR
+
+ This manual page was originally written by Mark Baker for the Debian
+ GNU/Linux system. It has been slightly revised as a generic PCRE man
+ page.
+
+
+REVISION
+
+ Last updated: 18 April 2007
diff --git a/src/doc/pcre.3 b/src/doc/pcre.3
new file mode 100644
index 0000000..4f738d9
--- /dev/null
+++ b/src/doc/pcre.3
@@ -0,0 +1,294 @@
+.TH PCRE 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH INTRODUCTION
+.rs
+.sp
+The PCRE library is a set of functions that implement regular expression
+pattern matching using the same syntax and semantics as Perl, with just a few
+differences. Certain features that appeared in Python and PCRE before they
+appeared in Perl are also available using the Python syntax. There is also some
+support for certain .NET and Oniguruma syntax items, and there is an option for
+requesting some minor changes that give better JavaScript compatibility.
+.P
+The current implementation of PCRE (release 7.x) corresponds approximately with
+Perl 5.10, including support for UTF-8 encoded strings and Unicode general
+category properties. However, UTF-8 and Unicode support has to be explicitly
+enabled; it is not the default. The Unicode tables correspond to Unicode
+release 5.0.0.
+.P
+In addition to the Perl-compatible matching function, PCRE contains an
+alternative matching function that matches the same compiled patterns in a
+different way. In certain circumstances, the alternative function has some
+advantages. For a discussion of the two matching algorithms, see the
+.\" HREF
+\fBpcrematching\fP
+.\"
+page.
+.P
+PCRE is written in C and released as a C library. A number of people have
+written wrappers and interfaces of various kinds. In particular, Google Inc.
+have provided a comprehensive C++ wrapper. This is now included as part of the
+PCRE distribution. The
+.\" HREF
+\fBpcrecpp\fP
+.\"
+page has details of this interface. Other people's contributions can be found
+in the \fIContrib\fR directory at the primary FTP site, which is:
+.sp
+.\" HTML
+.\"
+ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre
+.P
+Details of exactly which Perl regular expression features are and are not
+supported by PCRE are given in separate documents. See the
+.\" HREF
+\fBpcrepattern\fR
+.\"
+and
+.\" HREF
+\fBpcrecompat\fR
+.\"
+pages. There is a syntax summary in the
+.\" HREF
+\fBpcresyntax\fR
+.\"
+page.
+.P
+Some features of PCRE can be included, excluded, or changed when the library is
+built. The
+.\" HREF
+\fBpcre_config()\fR
+.\"
+function makes it possible for a client to discover which features are
+available. The features themselves are described in the
+.\" HREF
+\fBpcrebuild\fP
+.\"
+page. Documentation about building PCRE for various operating systems can be
+found in the \fBREADME\fP file in the source distribution.
+.P
+The library contains a number of undocumented internal functions and data
+tables that are used by more than one of the exported external functions, but
+which are not intended for use by external callers. Their names all begin with
+"_pcre_", which hopefully will not provoke any name clashes. In some
+environments, it is possible to control which external symbols are exported
+when a shared library is built, and in these cases the undocumented symbols are
+not exported.
+.
+.
+.SH "USER DOCUMENTATION"
+.rs
+.sp
+The user documentation for PCRE comprises a number of different sections. In
+the "man" format, each of these is a separate "man page". In the HTML format,
+each is a separate page, linked from the index page. In the plain text format,
+all the sections are concatenated, for ease of searching. The sections are as
+follows:
+.sp
+ pcre this document
+ pcre-config show PCRE installation configuration information
+ pcreapi details of PCRE's native C API
+ pcrebuild options for building PCRE
+ pcrecallout details of the callout feature
+ pcrecompat discussion of Perl compatibility
+ pcrecpp details of the C++ wrapper
+ pcregrep description of the \fBpcregrep\fP command
+ pcrematching discussion of the two matching algorithms
+ pcrepartial details of the partial matching facility
+.\" JOIN
+ pcrepattern syntax and semantics of supported
+ regular expressions
+ pcresyntax quick syntax reference
+ pcreperform discussion of performance issues
+ pcreposix the POSIX-compatible C API
+ pcreprecompile details of saving and re-using precompiled patterns
+ pcresample discussion of the sample program
+ pcrestack discussion of stack usage
+ pcretest description of the \fBpcretest\fP testing command
+.sp
+In addition, in the "man" and HTML formats, there is a short page for each
+C library function, listing its arguments and results.
+.
+.
+.SH LIMITATIONS
+.rs
+.sp
+There are some size limitations in PCRE but it is hoped that they will never in
+practice be relevant.
+.P
+The maximum length of a compiled pattern is 65539 (sic) bytes if PCRE is
+compiled with the default internal linkage size of 2. If you want to process
+regular expressions that are truly enormous, you can compile PCRE with an
+internal linkage size of 3 or 4 (see the \fBREADME\fP file in the source
+distribution and the
+.\" HREF
+\fBpcrebuild\fP
+.\"
+documentation for details). In these cases the limit is substantially larger.
+However, the speed of execution is slower.
+.P
+All values in repeating quantifiers must be less than 65536.
+.P
+There is no limit to the number of parenthesized subpatterns, but there can be
+no more than 65535 capturing subpatterns.
+.P
+The maximum length of name for a named subpattern is 32 characters, and the
+maximum number of named subpatterns is 10000.
+.P
+The maximum length of a subject string is the largest positive number that an
+integer variable can hold. However, when using the traditional matching
+function, PCRE uses recursion to handle subpatterns and indefinite repetition.
+This means that the available stack space may limit the size of a subject
+string that can be processed by certain patterns. For a discussion of stack
+issues, see the
+.\" HREF
+\fBpcrestack\fP
+.\"
+documentation.
+.
+.\" HTML
+.
+.
+.SH "UTF-8 AND UNICODE PROPERTY SUPPORT"
+.rs
+.sp
+From release 3.3, PCRE has had some support for character strings encoded in
+the UTF-8 format. For release 4.0 this was greatly extended to cover most
+common requirements, and in release 5.0 additional support for Unicode general
+category properties was added.
+.P
+In order to process UTF-8 strings, you must build PCRE to include UTF-8 support in
+the code, and, in addition, you must call
+.\" HREF
+\fBpcre_compile()\fP
+.\"
+with the PCRE_UTF8 option flag. When you do this, both the pattern and any
+subject strings that are matched against it are treated as UTF-8 strings
+instead of just strings of bytes.
+.P
+If you compile PCRE with UTF-8 support, but do not use it at run time, the
+library will be a bit bigger, but the additional run time overhead is limited
+to testing the PCRE_UTF8 flag occasionally, so should not be very big.
+.P
+If PCRE is built with Unicode character property support (which implies UTF-8
+support), the escape sequences \ep{..}, \eP{..}, and \eX are supported.
+The available properties that can be tested are limited to the general
+category properties such as Lu for an upper case letter or Nd for a decimal
+number, the Unicode script names such as Arabic or Han, and the derived
+properties Any and L&. A full list is given in the
+.\" HREF
+\fBpcrepattern\fP
+.\"
+documentation. Only the short names for properties are supported. For example,
+\ep{L} matches a letter. Its Perl synonym, \ep{Letter}, is not supported.
+Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
+compatibility with Perl 5.6. PCRE does not support this.
+.
+.\" HTML
+.
+.SS "Validity of UTF-8 strings"
+.rs
+.sp
+When you set the PCRE_UTF8 flag, the strings passed as patterns and subjects
+are (by default) checked for validity on entry to the relevant functions. From
+release 7.3 of PCRE, the check is according to the rules of RFC 3629, which are
+themselves derived from the Unicode specification. Earlier releases of PCRE
+followed the rules of RFC 2279, which allows the full range of 31-bit values (0
+to 0x7FFFFFFF). The current check allows only values in the range U+0 to
+U+10FFFF, excluding U+D800 to U+DFFF.
+.P
+The excluded code points are the "Low Surrogate Area" of Unicode, of which the
+Unicode Standard says this: "The Low Surrogate Area does not contain any
+character assignments, consequently no character code charts or namelists are
+provided for this area. Surrogates are reserved for use with UTF-16 and then
+must be used in pairs." The code points that are encoded by UTF-16 pairs are
+available as independent code points in the UTF-8 encoding. (In other words,
+the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up
+UTF-8.)
+.P
+If an invalid UTF-8 string is passed to PCRE, an error return
+(PCRE_ERROR_BADUTF8) is given. In some situations, you may already know that
+your strings are valid, and therefore want to skip these checks in order to
+improve performance. If you set the PCRE_NO_UTF8_CHECK flag at compile time or
+at run time, PCRE assumes that the pattern or subject it is given
+(respectively) contains only valid UTF-8 codes. In this case, it does not
+diagnose an invalid UTF-8 string.
+.P
+If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set, what
+happens depends on why the string is invalid. If the string conforms to the
+"old" definition of UTF-8 (RFC 2279), it is processed as a string of characters
+in the range 0 to 0x7FFFFFFF. In other words, apart from the initial validity
+test, PCRE (when in UTF-8 mode) handles strings according to the more liberal
+rules of RFC 2279. However, if the string does not even conform to RFC 2279,
+the result is undefined. Your program may crash.
+.P
+If you want to process strings of values in the full range 0 to 0x7FFFFFFF,
+encoded in a UTF-8-like manner as per the old RFC, you can set
+PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in this
+situation, you will have to apply your own validity check.
+.
+.SS "General comments about UTF-8 mode"
+.rs
+.sp
+1. An unbraced hexadecimal escape sequence (such as \exb3) matches a two-byte
+UTF-8 character if the value is greater than 127.
+.P
+2. Octal numbers up to \e777 are recognized, and match two-byte UTF-8
+characters for values greater than \e177.
+.P
+3. Repeat quantifiers apply to complete UTF-8 characters, not to individual
+bytes, for example: \ex{100}{3}.
+.P
+4. The dot metacharacter matches one UTF-8 character instead of a single byte.
+.P
+5. The escape sequence \eC can be used to match a single byte in UTF-8 mode,
+but its use can lead to some strange effects. This facility is not available in
+the alternative matching function, \fBpcre_dfa_exec()\fP.
+.P
+6. The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly
+test characters of any code value, but the characters that PCRE recognizes as
+digits, spaces, or word characters remain the same set as before, all with
+values less than 256. This remains true even when PCRE includes Unicode
+property support, because to do otherwise would slow down PCRE in many common
+cases. If you really want to test for a wider sense of, say, "digit", you
+must use Unicode property tests such as \ep{Nd}.
+.P
+7. Similarly, characters that match the POSIX named character classes are all
+low-valued characters.
+.P
+8. However, the Perl 5.10 horizontal and vertical whitespace matching escapes
+(\eh, \eH, \ev, and \eV) do match all the appropriate Unicode characters.
+.P
+9. Case-insensitive matching applies only to characters whose values are less
+than 128, unless PCRE is built with Unicode property support. Even when Unicode
+property support is available, PCRE still uses its own character tables when
+checking the case of low-valued characters, so as not to degrade performance.
+The Unicode property information is used only for characters with higher
+values. Even when Unicode property support is available, PCRE supports
+case-insensitive matching only when there is a one-to-one mapping between a
+letter's cases. There are a small number of many-to-one mappings in Unicode;
+these are not supported by PCRE.
+.
+.
+.SH AUTHOR
+.rs
+.sp
+.nf
+Philip Hazel
+University Computing Service
+Cambridge CB2 3QH, England.
+.fi
+.P
+Putting an actual email address here seems to have been a spam magnet, so I've
+taken it away. If you want to email me, use my two initials, followed by the
+two digits 10, at the domain cam.ac.uk.
+.
+.
+.SH REVISION
+.rs
+.sp
+.nf
+Last updated: 12 April 2008
+Copyright (c) 1997-2008 University of Cambridge.
+.fi
diff --git a/src/doc/pcre.txt b/src/doc/pcre.txt
new file mode 100644
index 0000000..1a328ce
--- /dev/null
+++ b/src/doc/pcre.txt
@@ -0,0 +1,6599 @@
+-----------------------------------------------------------------------------
+This file contains a concatenation of the PCRE man pages, converted to plain
+text format for ease of searching with a text editor, or for use on systems
+that do not have a man page processor. The small individual files that give
+synopses of each function in the library have not been included. There are
+separate text files for the pcregrep and pcretest commands.
+-----------------------------------------------------------------------------
+
+
+PCRE(3) PCRE(3)
+
+
+NAME
+ PCRE - Perl-compatible regular expressions
+
+
+INTRODUCTION
+
+ The PCRE library is a set of functions that implement regular expres-
+ sion pattern matching using the same syntax and semantics as Perl, with
+ just a few differences. Certain features that appeared in Python and
+ PCRE before they appeared in Perl are also available using the Python
+ syntax. There is also some support for certain .NET and Oniguruma syn-
+ tax items, and there is an option for requesting some minor changes
+ that give better JavaScript compatibility.
+
+ The current implementation of PCRE (release 7.x) corresponds approxi-
+ mately with Perl 5.10, including support for UTF-8 encoded strings and
+ Unicode general category properties. However, UTF-8 and Unicode support
+ has to be explicitly enabled; it is not the default. The Unicode tables
+ correspond to Unicode release 5.0.0.
+
+ In addition to the Perl-compatible matching function, PCRE contains an
+ alternative matching function that matches the same compiled patterns
+ in a different way. In certain circumstances, the alternative function
+ has some advantages. For a discussion of the two matching algorithms,
+ see the pcrematching page.
+
+ PCRE is written in C and released as a C library. A number of people
+ have written wrappers and interfaces of various kinds. In particular,
+ Google Inc. have provided a comprehensive C++ wrapper. This is now
+ included as part of the PCRE distribution. The pcrecpp page has details
+ of this interface. Other people's contributions can be found in the
+ Contrib directory at the primary FTP site, which is:
+
+ ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre
+
+ Details of exactly which Perl regular expression features are and are
+ not supported by PCRE are given in separate documents. See the pcrepat-
+ tern and pcrecompat pages. There is a syntax summary in the pcresyntax
+ page.
+
+ Some features of PCRE can be included, excluded, or changed when the
+ library is built. The pcre_config() function makes it possible for a
+ client to discover which features are available. The features them-
+ selves are described in the pcrebuild page. Documentation about build-
+ ing PCRE for various operating systems can be found in the README file
+ in the source distribution.
+
+ The library contains a number of undocumented internal functions and
+ data tables that are used by more than one of the exported external
+ functions, but which are not intended for use by external callers.
+ Their names all begin with "_pcre_", which hopefully will not provoke
+ any name clashes. In some environments, it is possible to control which
+ external symbols are exported when a shared library is built, and in
+ these cases the undocumented symbols are not exported.
+
+
+USER DOCUMENTATION
+
+ The user documentation for PCRE comprises a number of different sec-
+ tions. In the "man" format, each of these is a separate "man page". In
+ the HTML format, each is a separate page, linked from the index page.
+ In the plain text format, all the sections are concatenated, for ease
+ of searching. The sections are as follows:
+
+ pcre this document
+ pcre-config show PCRE installation configuration information
+ pcreapi details of PCRE's native C API
+ pcrebuild options for building PCRE
+ pcrecallout details of the callout feature
+ pcrecompat discussion of Perl compatibility
+ pcrecpp details of the C++ wrapper
+ pcregrep description of the pcregrep command
+ pcrematching discussion of the two matching algorithms
+ pcrepartial details of the partial matching facility
+ pcrepattern syntax and semantics of supported
+ regular expressions
+ pcresyntax quick syntax reference
+ pcreperform discussion of performance issues
+ pcreposix the POSIX-compatible C API
+ pcreprecompile details of saving and re-using precompiled patterns
+ pcresample discussion of the sample program
+ pcrestack discussion of stack usage
+ pcretest description of the pcretest testing command
+
+ In addition, in the "man" and HTML formats, there is a short page for
+ each C library function, listing its arguments and results.
+
+
+LIMITATIONS
+
+ There are some size limitations in PCRE but it is hoped that they will
+ never in practice be relevant.
+
+ The maximum length of a compiled pattern is 65539 (sic) bytes if PCRE
+ is compiled with the default internal linkage size of 2. If you want to
+ process regular expressions that are truly enormous, you can compile
+ PCRE with an internal linkage size of 3 or 4 (see the README file in
+ the source distribution and the pcrebuild documentation for details).
+ In these cases the limit is substantially larger. However, the speed
+ of execution is slower.
+
+ All values in repeating quantifiers must be less than 65536.
+
+ There is no limit to the number of parenthesized subpatterns, but there
+ can be no more than 65535 capturing subpatterns.
+
+ The maximum length of name for a named subpattern is 32 characters, and
+ the maximum number of named subpatterns is 10000.
+
+ The maximum length of a subject string is the largest positive number
+ that an integer variable can hold. However, when using the traditional
+ matching function, PCRE uses recursion to handle subpatterns and indef-
+ inite repetition. This means that the available stack space may limit
+ the size of a subject string that can be processed by certain patterns.
+ For a discussion of stack issues, see the pcrestack documentation.
+
+
+UTF-8 AND UNICODE PROPERTY SUPPORT
+
+ From release 3.3, PCRE has had some support for character strings
+ encoded in the UTF-8 format. For release 4.0 this was greatly extended
+ to cover most common requirements, and in release 5.0 additional sup-
+ port for Unicode general category properties was added.
+
+ In order to process UTF-8 strings, you must build PCRE to include UTF-8
+ support in the code, and, in addition, you must call pcre_compile()
+ with the PCRE_UTF8 option flag. When you do this, both the pattern and
+ any subject strings that are matched against it are treated as UTF-8
+ strings instead of just strings of bytes.
+
+ If you compile PCRE with UTF-8 support, but do not use it at run time,
+ the library will be a bit bigger, but the additional run time overhead
+ is limited to testing the PCRE_UTF8 flag occasionally, so should not be
+ very big.
+
+ If PCRE is built with Unicode character property support (which implies
+ UTF-8 support), the escape sequences \p{..}, \P{..}, and \X are sup-
+ ported. The available properties that can be tested are limited to the
+ general category properties such as Lu for an upper case letter or Nd
+ for a decimal number, the Unicode script names such as Arabic or Han,
+ and the derived properties Any and L&. A full list is given in the
+ pcrepattern documentation. Only the short names for properties are sup-
+ ported. For example, \p{L} matches a letter. Its Perl synonym, \p{Let-
+ ter}, is not supported. Furthermore, in Perl, many properties may
+ optionally be prefixed by "Is", for compatibility with Perl 5.6. PCRE
+ does not support this.
+
+ Validity of UTF-8 strings
+
+ When you set the PCRE_UTF8 flag, the strings passed as patterns and
+ subjects are (by default) checked for validity on entry to the relevant
+ functions. From release 7.3 of PCRE, the check is according to the rules
+ of RFC 3629, which are themselves derived from the Unicode specifica-
+ tion. Earlier releases of PCRE followed the rules of RFC 2279, which
+ allows the full range of 31-bit values (0 to 0x7FFFFFFF). The current
+ check allows only values in the range U+0 to U+10FFFF, excluding U+D800
+ to U+DFFF.
+
+ The excluded code points are the "Low Surrogate Area" of Unicode, of
+ which the Unicode Standard says this: "The Low Surrogate Area does not
+ contain any character assignments, consequently no character code
+ charts or namelists are provided for this area. Surrogates are reserved
+ for use with UTF-16 and then must be used in pairs." The code points
+ that are encoded by UTF-16 pairs are available as independent code
+ points in the UTF-8 encoding. (In other words, the whole surrogate
+ thing is a fudge for UTF-16 which unfortunately messes up UTF-8.)
+
+ If an invalid UTF-8 string is passed to PCRE, an error return
+ (PCRE_ERROR_BADUTF8) is given. In some situations, you may already know
+ that your strings are valid, and therefore want to skip these checks in
+ order to improve performance. If you set the PCRE_NO_UTF8_CHECK flag at
+ compile time or at run time, PCRE assumes that the pattern or subject
+ it is given (respectively) contains only valid UTF-8 codes. In this
+ case, it does not diagnose an invalid UTF-8 string.
+
+ If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set,
+ what happens depends on why the string is invalid. If the string con-
+ forms to the "old" definition of UTF-8 (RFC 2279), it is processed as a
+ string of characters in the range 0 to 0x7FFFFFFF. In other words,
+ apart from the initial validity test, PCRE (when in UTF-8 mode) handles
+ strings according to the more liberal rules of RFC 2279. However, if
+ the string does not even conform to RFC 2279, the result is undefined.
+ Your program may crash.
+
+ If you want to process strings of values in the full range 0 to
+ 0x7FFFFFFF, encoded in a UTF-8-like manner as per the old RFC, you can
+ set PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in
+ this situation, you will have to apply your own validity check.
+
+ General comments about UTF-8 mode
+
+ 1. An unbraced hexadecimal escape sequence (such as \xb3) matches a
+ two-byte UTF-8 character if the value is greater than 127.
+
+ 2. Octal numbers up to \777 are recognized, and match two-byte UTF-8
+ characters for values greater than \177.
+
+ 3. Repeat quantifiers apply to complete UTF-8 characters, not to indi-
+ vidual bytes, for example: \x{100}{3}.
+
+ 4. The dot metacharacter matches one UTF-8 character instead of a sin-
+ gle byte.
+
+ 5. The escape sequence \C can be used to match a single byte in UTF-8
+ mode, but its use can lead to some strange effects. This facility is
+ not available in the alternative matching function, pcre_dfa_exec().
+
+ 6. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly
+ test characters of any code value, but the characters that PCRE recog-
+ nizes as digits, spaces, or word characters remain the same set as
+ before, all with values less than 256. This remains true even when PCRE
+ includes Unicode property support, because to do otherwise would slow
+ down PCRE in many common cases. If you really want to test for a wider
+ sense of, say, "digit", you must use Unicode property tests such as
+ \p{Nd}.
+
+ 7. Similarly, characters that match the POSIX named character classes
+ are all low-valued characters.
+
+ 8. However, the Perl 5.10 horizontal and vertical whitespace matching
+ escapes (\h, \H, \v, and \V) do match all the appropriate Unicode char-
+ acters.
+
+ 9. Case-insensitive matching applies only to characters whose values
+ are less than 128, unless PCRE is built with Unicode property support.
+ Even when Unicode property support is available, PCRE still uses its
+ own character tables when checking the case of low-valued characters,
+ so as not to degrade performance. The Unicode property information is
+ used only for characters with higher values. Even when Unicode property
+ support is available, PCRE supports case-insensitive matching only when
+ there is a one-to-one mapping between a letter's cases. There are a
+ small number of many-to-one mappings in Unicode; these are not sup-
+ ported by PCRE.
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+ Putting an actual email address here seems to have been a spam magnet,
+ so I've taken it away. If you want to email me, use my two initials,
+ followed by the two digits 10, at the domain cam.ac.uk.
+
+
+REVISION
+
+ Last updated: 12 April 2008
+ Copyright (c) 1997-2008 University of Cambridge.
+------------------------------------------------------------------------------
+
+
+PCREBUILD(3) PCREBUILD(3)
+
+
+NAME
+ PCRE - Perl-compatible regular expressions
+
+
+PCRE BUILD-TIME OPTIONS
+
+ This document describes the optional features of PCRE that can be
+ selected when the library is compiled. It assumes use of the configure
+ script, where the optional features are selected or deselected by pro-
+ viding options to configure before running the make command. However,
+ the same options can be selected in both Unix-like and non-Unix-like
+ environments using the GUI facility of CMakeSetup if you are using
+ CMake instead of configure to build PCRE.
+
+ The complete list of options for configure (which includes the standard
+ ones such as the selection of the installation directory) can be
+ obtained by running
+
+ ./configure --help
+
+ The following sections include descriptions of options whose names
+ begin with --enable or --disable. These settings specify changes to the
+ defaults for the configure command. Because of the way that configure
+ works, --enable and --disable always come in pairs, so the complemen-
+ tary option always exists as well, but as it specifies the default, it
+ is not described.
+
+
+C++ SUPPORT
+
+ By default, the configure script will search for a C++ compiler and C++
+ header files. If it finds them, it automatically builds the C++ wrapper
+ library for PCRE. You can disable this by adding
+
+ --disable-cpp
+
+ to the configure command.
+
+
+UTF-8 SUPPORT
+
+ To build PCRE with support for UTF-8 character strings, add
+
+ --enable-utf8
+
+ to the configure command. Of itself, this does not make PCRE treat
+ strings as UTF-8. As well as compiling PCRE with this option, you also
+ have to set the PCRE_UTF8 option when you call the pcre_compile()
+ function.
+
+
+UNICODE CHARACTER PROPERTY SUPPORT
+
+ UTF-8 support allows PCRE to process character values greater than 255
+ in the strings that it handles. On its own, however, it does not pro-
+ vide any facilities for accessing the properties of such characters. If
+ you want to be able to use the pattern escapes \P, \p, and \X, which
+ refer to Unicode character properties, you must add
+
+ --enable-unicode-properties
+
+ to the configure command. This implies UTF-8 support, even if you have
+ not explicitly requested it.
+
+ Including Unicode property support adds around 30K of tables to the
+ PCRE library. Only the general category properties such as Lu and Nd
+ are supported. Details are given in the pcrepattern documentation.
+
+
+CODE VALUE OF NEWLINE
+
+ By default, PCRE interprets character 10 (linefeed, LF) as indicating
+ the end of a line. This is the normal newline character on Unix-like
+ systems. You can compile PCRE to use character 13 (carriage return, CR)
+ instead, by adding
+
+ --enable-newline-is-cr
+
+ to the configure command. There is also a --enable-newline-is-lf
+ option, which explicitly specifies linefeed as the newline character.
+
+ Alternatively, you can specify that line endings are to be indicated by
+ the two character sequence CRLF. If you want this, add
+
+ --enable-newline-is-crlf
+
+ to the configure command. There is a fourth option, specified by
+
+ --enable-newline-is-anycrlf
+
+ which causes PCRE to recognize any of the three sequences CR, LF, or
+ CRLF as indicating a line ending. Finally, a fifth option, specified by
+
+ --enable-newline-is-any
+
+ causes PCRE to recognize any Unicode newline sequence.
+
+ Whatever line ending convention is selected when PCRE is built can be
+ overridden when the library functions are called. At build time it is
+ conventional to use the standard for your operating system.
+
+
+WHAT \R MATCHES
+
+ By default, the sequence \R in a pattern matches any Unicode newline
+ sequence, whatever has been selected as the line ending sequence. If
+ you specify
+
+ --enable-bsr-anycrlf
+
+ the default is changed so that \R matches only CR, LF, or CRLF. What-
+ ever is selected when PCRE is built can be overridden when the library
+ functions are called.
+
+
+BUILDING SHARED AND STATIC LIBRARIES
+
+ The PCRE building process uses libtool to build both shared and static
+ Unix libraries by default. You can suppress one of these by adding one
+ of
+
+ --disable-shared
+ --disable-static
+
+ to the configure command, as required.
+
+
+POSIX MALLOC USAGE
+
+ When PCRE is called through the POSIX interface (see the pcreposix doc-
+ umentation), additional working storage is required for holding the
+ pointers to capturing substrings, because PCRE requires three integers
+ per substring, whereas the POSIX interface provides only two. If the
+ number of expected substrings is small, the wrapper function uses space
+ on the stack, because this is faster than using malloc() for each call.
+ The default threshold above which the stack is no longer used is 10; it
+ can be changed by adding a setting such as
+
+ --with-posix-malloc-threshold=20
+
+ to the configure command.
+
+
+HANDLING VERY LARGE PATTERNS
+
+ Within a compiled pattern, offset values are used to point from one
+ part to another (for example, from an opening parenthesis to an alter-
+ nation metacharacter). By default, two-byte values are used for these
+ offsets, leading to a maximum size for a compiled pattern of around
+ 64K. This is sufficient to handle all but the most gigantic patterns.
+ Nevertheless, some people do want to process enormous patterns, so it
+ is possible to compile PCRE to use three-byte or four-byte offsets by
+ adding a setting such as
+
+ --with-link-size=3
+
+ to the configure command. The value given must be 2, 3, or 4. Using
+ longer offsets slows down the operation of PCRE because it has to load
+ additional bytes when handling them.
+
+
+AVOIDING EXCESSIVE STACK USAGE
+
+ When matching with the pcre_exec() function, PCRE implements backtrack-
+ ing by making recursive calls to an internal function called match().
+ In environments where the size of the stack is limited, this can se-
+ verely limit PCRE's operation. (The Unix environment does not usually
+ suffer from this problem, but it may sometimes be necessary to increase
+ the maximum stack size. There is a discussion in the pcrestack docu-
+ mentation.) An alternative approach to recursion that uses memory from
+ the heap to remember data, instead of using recursive function calls,
+ has been implemented to work round the problem of limited stack size.
+ If you want to build a version of PCRE that works this way, add
+
+ --disable-stack-for-recursion
+
+ to the configure command. With this configuration, PCRE will use the
+ pcre_stack_malloc and pcre_stack_free variables to call memory manage-
+ ment functions. By default these point to malloc() and free(), but you
+ can replace the pointers so that your own functions are used.
+
+ Separate functions are provided rather than using pcre_malloc and
+ pcre_free because the usage is very predictable: the block sizes
+ requested are always the same, and the blocks are always freed in
+ reverse order. A calling program might be able to implement optimized
+ functions that perform better than malloc() and free(). PCRE runs
+ noticeably more slowly when built in this way. This option affects only
+ the pcre_exec() function; it is not relevant for the
+ pcre_dfa_exec() function.
+
+
+LIMITING PCRE RESOURCE USAGE
+
+ Internally, PCRE has a function called match(), which it calls repeat-
+ edly (sometimes recursively) when matching a pattern with the
+ pcre_exec() function. By controlling the maximum number of times this
+ function may be called during a single matching operation, a limit can
+ be placed on the resources used by a single call to pcre_exec(). The
+ limit can be changed at run time, as described in the pcreapi documen-
+ tation. The default is 10 million, but this can be changed by adding a
+ setting such as
+
+ --with-match-limit=500000
+
+ to the configure command. This setting has no effect on the
+ pcre_dfa_exec() matching function.
+
+ In some environments it is desirable to limit the depth of recursive
+ calls of match() more strictly than the total number of calls, in order
+ to restrict the maximum amount of stack (or heap, if --disable-stack-
+ for-recursion is specified) that is used. A second limit controls this;
+ it defaults to the value that is set for --with-match-limit, which
+ imposes no additional constraints. However, you can set a lower limit
+ by adding, for example,
+
+ --with-match-limit-recursion=10000
+
+ to the configure command. This value can also be overridden at run
+ time.
+
+
+CREATING CHARACTER TABLES AT BUILD TIME
+
+ PCRE uses fixed tables for processing characters whose code values are
+ less than 256. By default, PCRE is built with a set of tables that are
+ distributed in the file pcre_chartables.c.dist. These tables are for
+ ASCII codes only. If you add
+
+ --enable-rebuild-chartables
+
+ to the configure command, the distributed tables are no longer used.
+ Instead, a program called dftables is compiled and run. This outputs
+ the source for a new set of tables, created in the default locale of your
+ C runtime system. (This method of replacing the tables does not work if
+ you are cross compiling, because dftables is run on the local host. If
+ you need to create alternative tables when cross compiling, you will
+ have to do so "by hand".)
+
+
+USING EBCDIC CODE
+
+ PCRE assumes by default that it will run in an environment where the
+ character code is ASCII (or Unicode, which is a superset of ASCII).
+ This is the case for most computer operating systems. PCRE can, how-
+ ever, be compiled to run in an EBCDIC environment by adding
+
+ --enable-ebcdic
+
+ to the configure command. This setting implies --enable-rebuild-charta-
+ bles. You should only use it if you know that you are in an EBCDIC
+ environment (for example, an IBM mainframe operating system).
+
+
+PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT
+
+ By default, pcregrep reads all files as plain text. You can build it so
+ that it recognizes files whose names end in .gz or .bz2, and reads them
+ with libz or libbz2, respectively, by adding one or both of
+
+ --enable-pcregrep-libz
+ --enable-pcregrep-libbz2
+
+ to the configure command. These options naturally require that the rel-
+ evant libraries are installed on your system. Configuration will fail
+ if they are not.
+
+
+PCRETEST OPTION FOR LIBREADLINE SUPPORT
+
+ If you add
+
+ --enable-pcretest-libreadline
+
+ to the configure command, pcretest is linked with the libreadline
+ library, and when its input is from a terminal, it reads it using the
+ readline() function. This provides line-editing and history facilities.
+ Note that libreadline is GPL-licenced, so if you distribute a binary of
+ pcretest linked in this way, there may be licensing issues.
+
+ Setting this option causes the -lreadline option to be added to the
+ pcretest build. In many operating environments with a system-installed
+ libreadline this is sufficient. However, in some environments (e.g. if
+ an unmodified distribution version of readline is in use), some extra
+ configuration may be necessary. The INSTALL file for libreadline says
+ this:
+
+ "Readline uses the termcap functions, but does not link with the
+ termcap or curses library itself, allowing applications which link
+ with readline the to choose an appropriate library."
+
+ If your environment has not been set up so that an appropriate library
+ is automatically included, you may need to add something like
+
+ LIBS="-lncurses"
+
+ immediately before the configure command.
+
+
+SEE ALSO
+
+ pcreapi(3), pcre_config(3).
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 13 April 2008
+ Copyright (c) 1997-2008 University of Cambridge.
+------------------------------------------------------------------------------
+
+
+PCREMATCHING(3) PCREMATCHING(3)
+
+
+NAME
+ PCRE - Perl-compatible regular expressions
+
+
+PCRE MATCHING ALGORITHMS
+
+ This document describes the two different algorithms that are available
+ in PCRE for matching a compiled regular expression against a given sub-
+ ject string. The "standard" algorithm is the one provided by the
+ pcre_exec() function. This works in the same way as Perl's matching
+ function, and provides a Perl-compatible matching operation.
+
+ An alternative algorithm is provided by the pcre_dfa_exec() function;
+ this operates in a different way, and is not Perl-compatible. It has
+ advantages and disadvantages compared with the standard algorithm, and
+ these are described below.
+
+ When there is only one possible way in which a given subject string can
+ match a pattern, the two algorithms give the same answer. A difference
+ arises, however, when there are multiple possibilities. For example, if
+ the pattern
+
+ ^<.*>
+
+ is matched against the string
+
+ <something> <something else> <something further>
+
+ there are three possible answers. The standard algorithm finds only one
+ of them, whereas the alternative algorithm finds all three.
+
+
+REGULAR EXPRESSIONS AS TREES
+
+ The set of strings that are matched by a regular expression can be rep-
+ resented as a tree structure. An unlimited repetition in the pattern
+ makes the tree of infinite size, but it is still a tree. Matching the
+ pattern to a given subject string (from a given starting point) can be
+ thought of as a search of the tree. There are two ways to search a
+ tree: depth-first and breadth-first, and these correspond to the two
+ matching algorithms provided by PCRE.
+
+
+THE STANDARD MATCHING ALGORITHM
+
+ In the terminology of Jeffrey Friedl's book "Mastering Regular Expres-
+ sions", the standard algorithm is an "NFA algorithm". It conducts a
+ depth-first search of the pattern tree. That is, it proceeds along a
+ single path through the tree, checking that the subject matches what is
+ required. When there is a mismatch, the algorithm tries any alterna-
+ tives at the current point, and if they all fail, it backs up to the
+ previous branch point in the tree, and tries the next alternative
+ branch at that level. This often involves backing up (moving to the
+ left) in the subject string as well. The order in which repetition
+ branches are tried is controlled by the greedy or ungreedy nature of
+ the quantifier.
+
+ If a leaf node is reached, a matching string has been found, and at
+ that point the algorithm stops. Thus, if there is more than one possi-
+ ble match, this algorithm returns the first one that it finds. Whether
+ this is the shortest, the longest, or some intermediate length depends
+ on the way the greedy and ungreedy repetition quantifiers are specified
+ in the pattern.
+
+ Because it ends up with a single path through the tree, it is rela-
+ tively straightforward for this algorithm to keep track of the sub-
+ strings that are matched by portions of the pattern in parentheses.
+ This provides support for capturing parentheses and back references.
+
+
+THE ALTERNATIVE MATCHING ALGORITHM
+
+ This algorithm conducts a breadth-first search of the tree. Starting
+ from the first matching point in the subject, it scans the subject
+ string from left to right, once, character by character, and as it does
+ this, it remembers all the paths through the tree that represent valid
+ matches. In Friedl's terminology, this is a kind of "DFA algorithm",
+ though it is not implemented as a traditional finite state machine (it
+ keeps multiple states active simultaneously).
+
+ The scan continues until either the end of the subject is reached, or
+ there are no more unterminated paths. At this point, terminated paths
+ represent the different matching possibilities (if there are none, the
+ match has failed). Thus, if there is more than one possible match,
+ this algorithm finds all of them, and in particular, it finds the long-
+ est. In PCRE, there is an option to stop the algorithm after the first
+ match (which is necessarily the shortest) has been found.
+
+ Note that all the matches that are found start at the same point in the
+ subject. If the pattern
+
+ cat(er(pillar)?)
+
+ is matched against the string "the caterpillar catchment", the result
+ will be the three strings "cat", "cater", and "caterpillar" that start
+ at the fourth character of the subject. The algorithm does not automat-
+ ically move on to find matches that start at later positions.
+
+ There are a number of features of PCRE regular expressions that are not
+ supported by the alternative matching algorithm. They are as follows:
+
+ 1. Because the algorithm finds all possible matches, the greedy or
+ ungreedy nature of repetition quantifiers is not relevant. Greedy and
+ ungreedy quantifiers are treated in exactly the same way. However, pos-
+ sessive quantifiers can make a difference when what follows could also
+ match what is quantified, for example in a pattern like this:
+
+ ^a++\w!
+
+ This pattern matches "aaab!" but not "aaa!", which would be matched by
+ a non-possessive quantifier. Similarly, if an atomic group is present,
+ it is matched as if it were a standalone pattern at the current point,
+ and the longest match is then "locked in" for the rest of the overall
+ pattern.
+
+ 2. When dealing with multiple paths through the tree simultaneously, it
+ is not straightforward to keep track of captured substrings for the
+ different matching possibilities, and PCRE's implementation of this
+ algorithm does not attempt to do this. This means that no captured sub-
+ strings are available.
+
+ 3. Because no substrings are captured, back references within the pat-
+ tern are not supported, and cause errors if encountered.
+
+ 4. For the same reason, conditional expressions that use a backrefer-
+ ence as the condition or test for a specific group recursion are not
+ supported.
+
+ 5. Because many paths through the tree may be active, the \K escape
+ sequence, which resets the start of the match when encountered (but may
+ be on some paths and not on others), is not supported. It causes an
+ error if encountered.
+
+ 6. Callouts are supported, but the value of the capture_top field is
+ always 1, and the value of the capture_last field is always -1.
+
+ 7. The \C escape sequence, which (in the standard algorithm) matches a
+ single byte, even in UTF-8 mode, is not supported because the alterna-
+ tive algorithm moves through the subject string one character at a
+ time, for all active paths through the tree.
+
+ 8. Except for (*FAIL), the backtracking control verbs such as (*PRUNE)
+ are not supported. (*FAIL) is supported, and behaves like a failing
+ negative assertion.
+
+
+ADVANTAGES OF THE ALTERNATIVE ALGORITHM
+
+ Using the alternative matching algorithm provides the following advan-
+ tages:
+
+ 1. All possible matches (at a single point in the subject) are automat-
+ ically found, and in particular, the longest match is found. To find
+ more than one match using the standard algorithm, you have to do kludgy
+ things with callouts.
+
+ 2. There is much better support for partial matching. The restrictions
+ on the content of the pattern that apply when using the standard algo-
+ rithm for partial matching do not apply to the alternative algorithm.
+ For non-anchored patterns, the starting position of a partial match is
+ available.
+
+ 3. Because the alternative algorithm scans the subject string just
+ once, and never needs to backtrack, it is possible to pass very long
+ subject strings to the matching function in several pieces, checking
+ for partial matching each time.
+
+
+DISADVANTAGES OF THE ALTERNATIVE ALGORITHM
+
+ The alternative algorithm suffers from a number of disadvantages:
+
+ 1. It is substantially slower than the standard algorithm. This is
+ partly because it has to search for all possible matches, but is also
+ because it is less susceptible to optimization.
+
+ 2. Capturing parentheses and back references are not supported.
+
+ 3. Although atomic groups are supported, their use does not provide the
+ performance advantage that it does for the standard algorithm.
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 19 April 2008
+ Copyright (c) 1997-2008 University of Cambridge.
+------------------------------------------------------------------------------
+
+
+PCREAPI(3) PCREAPI(3)
+
+
+NAME
+ PCRE - Perl-compatible regular expressions
+
+
+PCRE NATIVE API
+
+ #include <pcre.h>
+
+ pcre *pcre_compile(const char *pattern, int options,
+ const char **errptr, int *erroffset,
+ const unsigned char *tableptr);
+
+ pcre *pcre_compile2(const char *pattern, int options,
+ int *errorcodeptr,
+ const char **errptr, int *erroffset,
+ const unsigned char *tableptr);
+
+ pcre_extra *pcre_study(const pcre *code, int options,
+ const char **errptr);
+
+ int pcre_exec(const pcre *code, const pcre_extra *extra,
+ const char *subject, int length, int startoffset,
+ int options, int *ovector, int ovecsize);
+
+ int pcre_dfa_exec(const pcre *code, const pcre_extra *extra,
+ const char *subject, int length, int startoffset,
+ int options, int *ovector, int ovecsize,
+ int *workspace, int wscount);
+
+ int pcre_copy_named_substring(const pcre *code,
+ const char *subject, int *ovector,
+ int stringcount, const char *stringname,
+ char *buffer, int buffersize);
+
+ int pcre_copy_substring(const char *subject, int *ovector,
+ int stringcount, int stringnumber, char *buffer,
+ int buffersize);
+
+ int pcre_get_named_substring(const pcre *code,
+ const char *subject, int *ovector,
+ int stringcount, const char *stringname,
+ const char **stringptr);
+
+ int pcre_get_stringnumber(const pcre *code,
+ const char *name);
+
+ int pcre_get_stringtable_entries(const pcre *code,
+ const char *name, char **first, char **last);
+
+ int pcre_get_substring(const char *subject, int *ovector,
+ int stringcount, int stringnumber,
+ const char **stringptr);
+
+ int pcre_get_substring_list(const char *subject,
+ int *ovector, int stringcount, const char ***listptr);
+
+ void pcre_free_substring(const char *stringptr);
+
+ void pcre_free_substring_list(const char **stringptr);
+
+ const unsigned char *pcre_maketables(void);
+
+ int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
+ int what, void *where);
+
+ int pcre_info(const pcre *code, int *optptr, int *firstcharptr);
+
+ int pcre_refcount(pcre *code, int adjust);
+
+ int pcre_config(int what, void *where);
+
+ char *pcre_version(void);
+
+ void *(*pcre_malloc)(size_t);
+
+ void (*pcre_free)(void *);
+
+ void *(*pcre_stack_malloc)(size_t);
+
+ void (*pcre_stack_free)(void *);
+
+ int (*pcre_callout)(pcre_callout_block *);
+
+
+PCRE API OVERVIEW
+
+ PCRE has its own native API, which is described in this document. There
+ are also some wrapper functions that correspond to the POSIX regular
+ expression API. These are described in the pcreposix documentation.
+ Both of these APIs define a set of C function calls. A C++ wrapper is
+ distributed with PCRE. It is documented in the pcrecpp page.
+
+ The native API C function prototypes are defined in the header file
+ pcre.h, and on Unix systems the library itself is called libpcre. It
+ can normally be accessed by adding -lpcre to the command for linking an
+ application that uses PCRE. The header file defines the macros
+ PCRE_MAJOR and PCRE_MINOR to contain the major and minor release num-
+ bers for the library. Applications can use these to include support
+ for different releases of PCRE.
+
+ The functions pcre_compile(), pcre_compile2(), pcre_study(), and
+ pcre_exec() are used for compiling and matching regular expressions in
+ a Perl-compatible manner. A sample program that demonstrates the sim-
+ plest way of using them is provided in the file called pcredemo.c in
+ the source distribution. The pcresample documentation describes how to
+ compile and run it.
+
+ A second matching function, pcre_dfa_exec(), which is not Perl-compati-
+ ble, is also provided. This uses a different algorithm for the match-
+ ing. The alternative algorithm finds all possible matches (at a given
+ point in the subject), and scans the subject just once. However, this
+ algorithm does not return captured substrings. A description of the two
+ matching algorithms and their advantages and disadvantages is given in
+ the pcrematching documentation.
+
+ In addition to the main compiling and matching functions, there are
+ convenience functions for extracting captured substrings from a subject
+ string that is matched by pcre_exec(). They are:
+
+ pcre_copy_substring()
+ pcre_copy_named_substring()
+ pcre_get_substring()
+ pcre_get_named_substring()
+ pcre_get_substring_list()
+ pcre_get_stringnumber()
+ pcre_get_stringtable_entries()
+
+ pcre_free_substring() and pcre_free_substring_list() are also provided,
+ to free the memory used for extracted strings.
+
+ The function pcre_maketables() is used to build a set of character
+ tables in the current locale for passing to pcre_compile(),
+ pcre_exec(), or pcre_dfa_exec(). This is an optional facility that is
+ provided for specialist use. Most commonly, no special tables are
+ passed, in which case internal tables that are generated when PCRE is
+ built are used.
+
+ The function pcre_fullinfo() is used to find out information about a
+ compiled pattern; pcre_info() is an obsolete version that returns only
+ some of the available information, but is retained for backwards com-
+ patibility. The function pcre_version() returns a pointer to a string
+ containing the version of PCRE and its date of release.
+
+ The function pcre_refcount() maintains a reference count in a data
+ block containing a compiled pattern. This is provided for the benefit
+ of object-oriented applications.
+
+ The global variables pcre_malloc and pcre_free initially contain the
+ entry points of the standard malloc() and free() functions, respec-
+ tively. PCRE calls the memory management functions via these variables,
+ so a calling program can replace them if it wishes to intercept the
+ calls. This should be done before calling any PCRE functions.
+
+ The global variables pcre_stack_malloc and pcre_stack_free are also
+ indirections to memory management functions. These special functions
+ are used only when PCRE is compiled to use the heap for remembering
+ data, instead of recursive function calls, when running the pcre_exec()
+ function. See the pcrebuild documentation for details of how to do
+ this. It is a non-standard way of building PCRE, for use in environ-
+ ments that have limited stacks. Because of the greater use of memory
+ management, it runs more slowly. Separate functions are provided so
+ that special-purpose external code can be used for this case. When
+ used, these functions are always called in a stack-like manner (last
+ obtained, first freed), and always for memory blocks of the same size.
+ There is a discussion about PCRE's stack usage in the pcrestack docu-
+ mentation.
+
+ The global variable pcre_callout initially contains NULL. It can be set
+ by the caller to a "callout" function, which PCRE will then call at
+ specified points during a matching operation. Details are given in the
+ pcrecallout documentation.
+
+
+NEWLINES
+
+ PCRE supports five different conventions for indicating line breaks in
+ strings: a single CR (carriage return) character, a single LF (line-
+ feed) character, the two-character sequence CRLF, any of the three pre-
+ ceding, or any Unicode newline sequence. The Unicode newline sequences
+ are the three just mentioned, plus the single characters VT (vertical
+ tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line
+ separator, U+2028), and PS (paragraph separator, U+2029).
+
+ Each of the first three conventions is used by at least one operating
+ system as its standard newline sequence. When PCRE is built, a default
+ can be specified. The default default is LF, which is the Unix stan-
+ dard. When PCRE is run, the default can be overridden, either when a
+ pattern is compiled, or when it is matched.
+
+ At compile time, the newline convention can be specified by the options
+ argument of pcre_compile(), or it can be specified by special text at
+ the start of the pattern itself; this overrides any other settings. See
+ the pcrepattern page for details of the special character sequences.
+
+ In the PCRE documentation the word "newline" is used to mean "the char-
+ acter or pair of characters that indicate a line break". The choice of
+ newline convention affects the handling of the dot, circumflex, and
+ dollar metacharacters, the handling of #-comments in /x mode, and, when
+ CRLF is a recognized line ending sequence, the match position advance-
+ ment for a non-anchored pattern. There is more detail about this in the
+ section on pcre_exec() options below.
+
+ The choice of newline convention does not affect the interpretation of
+ the \n or \r escape sequences, nor does it affect what \R matches,
+ which is controlled in a similar way, but by separate options.
+
+
+MULTITHREADING
+
+ The PCRE functions can be used in multi-threading applications, with
+ the proviso that the memory management functions pointed to by
+ pcre_malloc, pcre_free, pcre_stack_malloc, and pcre_stack_free, and the
+ callout function pointed to by pcre_callout, are shared by all threads.
+
+ The compiled form of a regular expression is not altered during match-
+ ing, so the same compiled pattern can safely be used by several threads
+ at once.
+
+
+SAVING PRECOMPILED PATTERNS FOR LATER USE
+
+ The compiled form of a regular expression can be saved and re-used at a
+ later time, possibly by a different program, and even on a host other
+ than the one on which it was compiled. Details are given in the
+ pcreprecompile documentation. However, compiling a regular expression
+ with one version of PCRE for use with a different version is not guar-
+ anteed to work and may cause crashes.
+
+
+CHECKING BUILD-TIME OPTIONS
+
+ int pcre_config(int what, void *where);
+
+ The function pcre_config() makes it possible for a PCRE client to dis-
+ cover which optional features have been compiled into the PCRE library.
+ The pcrebuild documentation has more details about these optional fea-
+ tures.
+
+ The first argument for pcre_config() is an integer, specifying which
+ information is required; the second argument is a pointer to a variable
+ into which the information is placed. The following information is
+ available:
+
+ PCRE_CONFIG_UTF8
+
+ The output is an integer that is set to one if UTF-8 support is avail-
+ able; otherwise it is set to zero.
+
+ PCRE_CONFIG_UNICODE_PROPERTIES
+
+ The output is an integer that is set to one if support for Unicode
+ character properties is available; otherwise it is set to zero.
+
+ PCRE_CONFIG_NEWLINE
+
+ The output is an integer whose value specifies the default character
+ sequence that is recognized as meaning "newline". The five values that
+ are supported are: 10 for LF, 13 for CR, 3338 for CRLF, -2 for ANYCRLF,
+ and -1 for ANY. The default should normally be the standard sequence
+ for your operating system.
+
+ PCRE_CONFIG_BSR
+
+ The output is an integer whose value indicates what character sequences
+ the \R escape sequence matches by default. A value of 0 means that \R
+ matches any Unicode line ending sequence; a value of 1 means that \R
+ matches only CR, LF, or CRLF. The default can be overridden when a pat-
+ tern is compiled or matched.
+
+ PCRE_CONFIG_LINK_SIZE
+
+ The output is an integer that contains the number of bytes used for
+ internal linkage in compiled regular expressions. The value is 2, 3, or
+ 4. Larger values allow larger regular expressions to be compiled, at
+ the expense of slower matching. The default value of 2 is sufficient
+ for all but the most massive patterns, since it allows the compiled
+ pattern to be up to 64K in size.
+
+ PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
+
+ The output is an integer that contains the threshold above which the
+ POSIX interface uses malloc() for output vectors. Further details are
+ given in the pcreposix documentation.
+
+ PCRE_CONFIG_MATCH_LIMIT
+
+ The output is an integer that gives the default limit for the number of
+ internal matching function calls in a pcre_exec() execution. Further
+ details are given with pcre_exec() below.
+
+ PCRE_CONFIG_MATCH_LIMIT_RECURSION
+
+ The output is an integer that gives the default limit for the depth of
+ recursion when calling the internal matching function in a pcre_exec()
+ execution. Further details are given with pcre_exec() below.
+
+ PCRE_CONFIG_STACKRECURSE
+
+ The output is an integer that is set to one if internal recursion when
+ running pcre_exec() is implemented by recursive function calls that use
+ the stack to remember their state. This is the usual way that PCRE is
+ compiled. The output is zero if PCRE was compiled to use blocks of data
+ on the heap instead of recursive function calls. In this case,
+ pcre_stack_malloc and pcre_stack_free are called to manage memory
+ blocks on the heap, thus avoiding the use of the stack.
+
+
+COMPILING A PATTERN
+
+ pcre *pcre_compile(const char *pattern, int options,
+ const char **errptr, int *erroffset,
+ const unsigned char *tableptr);
+
+ pcre *pcre_compile2(const char *pattern, int options,
+ int *errorcodeptr,
+ const char **errptr, int *erroffset,
+ const unsigned char *tableptr);
+
+ Either of the functions pcre_compile() or pcre_compile2() can be called
+ to compile a pattern into an internal form. The only difference between
+ the two interfaces is that pcre_compile2() has an additional argument,
+ errorcodeptr, via which a numerical error code can be returned.
+
+ The pattern is a C string terminated by a binary zero, and is passed in
+ the pattern argument. A pointer to a single block of memory that is
+ obtained via pcre_malloc is returned. This contains the compiled code
+ and related data. The pcre type is defined for the returned block; this
+ is a typedef for a structure whose contents are not externally defined.
+ It is up to the caller to free the memory (via pcre_free) when it is no
+ longer required.
+
+ Although the compiled code of a PCRE regex is relocatable, that is, it
+ does not depend on memory location, the complete pcre data block is not
+ fully relocatable, because it may contain a copy of the tableptr argu-
+ ment, which is an address (see below).
+
+ The options argument contains various bit settings that affect the com-
+ pilation. It should be zero if no options are required. The available
+ options are described below. Some of them, in particular, those that
+ are compatible with Perl, can also be set and unset from within the
+ pattern (see the detailed description in the pcrepattern documenta-
+ tion). For these options, the contents of the options argument speci-
+ fies their initial settings at the start of compilation and execution.
+ The PCRE_ANCHORED and PCRE_NEWLINE_xxx options can be set at the time
+ of matching as well as at compile time.
+
+ If errptr is NULL, pcre_compile() returns NULL immediately. Otherwise,
+ if compilation of a pattern fails, pcre_compile() returns NULL, and
+ sets the variable pointed to by errptr to point to a textual error mes-
+ sage. This is a static string that is part of the library. You must not
+ try to free it. The offset from the start of the pattern to the charac-
+ ter where the error was discovered is placed in the variable pointed to
+ by erroffset, which must not be NULL. If it is, an immediate error is
+ given.
+
+ If pcre_compile2() is used instead of pcre_compile(), and the error-
+ codeptr argument is not NULL, a non-zero error code number is returned
+ via this argument in the event of an error. This is in addition to the
+ textual error message. Error codes and messages are listed below.
+
+ If the final argument, tableptr, is NULL, PCRE uses a default set of
+ character tables that are built when PCRE is compiled, using the
+ default C locale. Otherwise, tableptr must be an address that is the
+ result of a call to pcre_maketables(). This value is stored with the
+ compiled pattern, and used again by pcre_exec(), unless another table
+ pointer is passed to it. For more discussion, see the section on locale
+ support below.
+
+ This code fragment shows a typical straightforward call to pcre_com-
+ pile():
+
+ pcre *re;
+ const char *error;
+ int erroffset;
+ re = pcre_compile(
+ "^A.*Z", /* the pattern */
+ 0, /* default options */
+ &error, /* for error message */
+ &erroffset, /* for error offset */
+ NULL); /* use default character tables */
+
+ The following names for option bits are defined in the pcre.h header
+ file:
+
+ PCRE_ANCHORED
+
+ If this bit is set, the pattern is forced to be "anchored", that is, it
+ is constrained to match only at the first matching point in the string
+ that is being searched (the "subject string"). This effect can also be
+ achieved by appropriate constructs in the pattern itself, which is the
+ only way to do it in Perl.
+
+ PCRE_AUTO_CALLOUT
+
+ If this bit is set, pcre_compile() automatically inserts callout items,
+ all with number 255, before each pattern item. For discussion of the
+ callout facility, see the pcrecallout documentation.
+
+ PCRE_BSR_ANYCRLF
+ PCRE_BSR_UNICODE
+
+ These options (which are mutually exclusive) control what the \R escape
+ sequence matches. The choice is either to match only CR, LF, or CRLF,
+ or to match any Unicode newline sequence. The default is specified when
+ PCRE is built. It can be overridden from within the pattern, or by set-
+ ting an option when a compiled pattern is matched.
+
+ PCRE_CASELESS
+
+ If this bit is set, letters in the pattern match both upper and lower
+ case letters. It is equivalent to Perl's /i option, and it can be
+ changed within a pattern by a (?i) option setting. In UTF-8 mode, PCRE
+ always understands the concept of case for characters whose values are
+ less than 128, so caseless matching is always possible. For characters
+ with higher values, the concept of case is supported if PCRE is com-
+ piled with Unicode property support, but not otherwise. If you want to
+ use caseless matching for characters 128 and above, you must ensure
+ that PCRE is compiled with Unicode property support as well as with
+ UTF-8 support.
+
+ PCRE_DOLLAR_ENDONLY
+
+ If this bit is set, a dollar metacharacter in the pattern matches only
+ at the end of the subject string. Without this option, a dollar also
+ matches immediately before a newline at the end of the string (but not
+ before any other newlines). The PCRE_DOLLAR_ENDONLY option is ignored
+ if PCRE_MULTILINE is set. There is no equivalent to this option in
+ Perl, and no way to set it within a pattern.
+
+ PCRE_DOTALL
+
+ If this bit is set, a dot metacharacter in the pattern matches all char-
+ acters, including those that indicate newline. Without it, a dot does
+ not match when the current position is at a newline. This option is
+ equivalent to Perl's /s option, and it can be changed within a pattern
+ by a (?s) option setting. A negative class such as [^a] always matches
+ newline characters, independent of the setting of this option.
+
+ PCRE_DUPNAMES
+
+ If this bit is set, names used to identify capturing subpatterns need
+ not be unique. This can be helpful for certain types of pattern when it
+ is known that only one instance of the named subpattern can ever be
+ matched. There are more details of named subpatterns below; see also
+ the pcrepattern documentation.
+
+ PCRE_EXTENDED
+
+ If this bit is set, whitespace data characters in the pattern are
+ totally ignored except when escaped or inside a character class. White-
+ space does not include the VT character (code 11). In addition, charac-
+ ters between an unescaped # outside a character class and the next new-
+ line, inclusive, are also ignored. This is equivalent to Perl's /x
+ option, and it can be changed within a pattern by a (?x) option set-
+ ting.
+
+ This option makes it possible to include comments inside complicated
+ patterns. Note, however, that this applies only to data characters.
+ Whitespace characters may never appear within special character
+ sequences in a pattern, for example within the sequence (?( which
+ introduces a conditional subpattern.
+
+ PCRE_EXTRA
+
+ This option was invented in order to turn on additional functionality
+ of PCRE that is incompatible with Perl, but it is currently of very
+ little use. When set, any backslash in a pattern that is followed by a
+ letter that has no special meaning causes an error, thus reserving
+ these combinations for future expansion. By default, as in Perl, a
+ backslash followed by a letter with no special meaning is treated as a
+ literal. (Perl can, however, be persuaded to give a warning for this.)
+ There are at present no other features controlled by this option. It
+ can also be set by a (?X) option setting within a pattern.
+
+ PCRE_FIRSTLINE
+
+ If this option is set, an unanchored pattern is required to match
+ before or at the first newline in the subject string, though the
+ matched text may continue over the newline.
+
+ PCRE_JAVASCRIPT_COMPAT
+
+ If this option is set, PCRE's behaviour is changed in some ways so that
+ it is compatible with JavaScript rather than Perl. The changes are as
+ follows:
+
+ (1) A lone closing square bracket in a pattern causes a compile-time
+ error, because this is illegal in JavaScript (by default it is treated
+ as a data character). Thus, the pattern AB]CD becomes illegal when this
+ option is set.
+
+ (2) At run time, a back reference to an unset subpattern group matches
+ an empty string (by default this causes the current matching alterna-
+ tive to fail). A pattern such as (\1)(a) succeeds when this option is
+ set (assuming it can find an "a" in the subject), whereas it fails by
+ default, for Perl compatibility.
+
+ PCRE_MULTILINE
+
+ By default, PCRE treats the subject string as consisting of a single
+ line of characters (even if it actually contains newlines). The "start
+ of line" metacharacter (^) matches only at the start of the string,
+ while the "end of line" metacharacter ($) matches only at the end of
+ the string, or before a terminating newline (unless PCRE_DOLLAR_ENDONLY
+ is set). This is the same as Perl.
+
+ When PCRE_MULTILINE is set, the "start of line" and "end of line"
+ constructs match immediately following or immediately before internal
+ newlines in the subject string, respectively, as well as at the very
+ start and end. This is equivalent to Perl's /m option, and it can be
+ changed within a pattern by a (?m) option setting. If there are no new-
+ lines in a subject string, or no occurrences of ^ or $ in a pattern,
+ setting PCRE_MULTILINE has no effect.
+
+ PCRE_NEWLINE_CR
+ PCRE_NEWLINE_LF
+ PCRE_NEWLINE_CRLF
+ PCRE_NEWLINE_ANYCRLF
+ PCRE_NEWLINE_ANY
+
+ These options override the default newline definition that was chosen
+ when PCRE was built. Setting the first or the second specifies that a
+ newline is indicated by a single character (CR or LF, respectively).
+ Setting PCRE_NEWLINE_CRLF specifies that a newline is indicated by the
+ two-character CRLF sequence. Setting PCRE_NEWLINE_ANYCRLF specifies
+ that any of the three preceding sequences should be recognized. Setting
+ PCRE_NEWLINE_ANY specifies that any Unicode newline sequence should be
+ recognized. The Unicode newline sequences are the three just mentioned,
+ plus the single characters VT (vertical tab, U+000B), FF (formfeed,
+ U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
+ (paragraph separator, U+2029). The last two are recognized only in
+ UTF-8 mode.
+
+ The newline setting in the options word uses three bits that are
+ treated as a number, giving eight possibilities. Currently only six are
+ used (default plus the five values above). This means that if you set
+ more than one newline option, the combination may or may not be sensi-
+ ble. For example, PCRE_NEWLINE_CR with PCRE_NEWLINE_LF is equivalent to
+ PCRE_NEWLINE_CRLF, but other combinations may yield unused numbers and
+ cause an error.
+
+ The only time that a line break is specially recognized when compiling
+ a pattern is if PCRE_EXTENDED is set, and an unescaped # outside a
+ character class is encountered. This indicates a comment that lasts
+ until after the next line break sequence. In other circumstances, line
+ break sequences are treated as literal data, except that in
+ PCRE_EXTENDED mode, both CR and LF are treated as whitespace characters
+ and are therefore ignored.
+
+ The newline option that is set at compile time becomes the default that
+ is used for pcre_exec() and pcre_dfa_exec(), but it can be overridden.
+
+ PCRE_NO_AUTO_CAPTURE
+
+ If this option is set, it disables the use of numbered capturing paren-
+ theses in the pattern. Any opening parenthesis that is not followed by
+ ? behaves as if it were followed by ?: but named parentheses can still
+ be used for capturing (and they acquire numbers in the usual way).
+ There is no equivalent of this option in Perl.
+
+ PCRE_UNGREEDY
+
+ This option inverts the "greediness" of the quantifiers so that they
+ are not greedy by default, but become greedy if followed by "?". It is
+ not compatible with Perl. It can also be set by a (?U) option setting
+ within the pattern.
+
+ PCRE_UTF8
+
+ This option causes PCRE to regard both the pattern and the subject as
+ strings of UTF-8 characters instead of single-byte character strings.
+ However, it is available only when PCRE is built to include UTF-8 sup-
+ port. If not, the use of this option provokes an error. Details of how
+ this option changes the behaviour of PCRE are given in the section on
+ UTF-8 support in the main pcre page.
+
+ PCRE_NO_UTF8_CHECK
+
+ When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 string is
+ automatically checked. There is a discussion about the validity of
+ UTF-8 strings in the main pcre page. If an invalid UTF-8 sequence of
+ bytes is found, pcre_compile() returns an error. If you already know
+ that your pattern is valid, and you want to skip this check for perfor-
+ mance reasons, you can set the PCRE_NO_UTF8_CHECK option. When it is
+ set, the effect of passing an invalid UTF-8 string as a pattern is
+ undefined. It may cause your program to crash. Note that this option
+ can also be passed to pcre_exec() and pcre_dfa_exec(), to suppress the
+ UTF-8 validity checking of subject strings.
+
+
+COMPILATION ERROR CODES
+
+ The following table lists the error codes that may be returned by
+ pcre_compile2(), along with the error messages that may be returned by
+ both compiling functions. As PCRE has developed, some error codes have
+ fallen out of use. To avoid confusion, they have not been re-used.
+
+ 0 no error
+ 1 \ at end of pattern
+ 2 \c at end of pattern
+ 3 unrecognized character follows \
+ 4 numbers out of order in {} quantifier
+ 5 number too big in {} quantifier
+ 6 missing terminating ] for character class
+ 7 invalid escape sequence in character class
+ 8 range out of order in character class
+ 9 nothing to repeat
+ 10 [this code is not in use]
+ 11 internal error: unexpected repeat
+ 12 unrecognized character after (? or (?-
+ 13 POSIX named classes are supported only within a class
+ 14 missing )
+ 15 reference to non-existent subpattern
+ 16 erroffset passed as NULL
+ 17 unknown option bit(s) set
+ 18 missing ) after comment
+ 19 [this code is not in use]
+ 20 regular expression is too large
+ 21 failed to get memory
+ 22 unmatched parentheses
+ 23 internal error: code overflow
+ 24 unrecognized character after (?<
+ 25 lookbehind assertion is not fixed length
+ 26 malformed number or name after (?(
+ 27 conditional group contains more than two branches
+ 28 assertion expected after (?(
+ 29 (?R or (?[+-]digits must be followed by )
+ 30 unknown POSIX class name
+ 31 POSIX collating elements are not supported
+ 32 this version of PCRE is not compiled with PCRE_UTF8 support
+ 33 [this code is not in use]
+ 34 character value in \x{...} sequence is too large
+ 35 invalid condition (?(0)
+ 36 \C not allowed in lookbehind assertion
+ 37 PCRE does not support \L, \l, \N, \U, or \u
+ 38 number after (?C is > 255
+ 39 closing ) for (?C expected
+ 40 recursive call could loop indefinitely
+ 41 unrecognized character after (?P
+ 42 syntax error in subpattern name (missing terminator)
+ 43 two named subpatterns have the same name
+ 44 invalid UTF-8 string
+ 45 support for \P, \p, and \X has not been compiled
+ 46 malformed \P or \p sequence
+ 47 unknown property name after \P or \p
+ 48 subpattern name is too long (maximum 32 characters)
+ 49 too many named subpatterns (maximum 10000)
+ 50 [this code is not in use]
+ 51 octal value is greater than \377 (not in UTF-8 mode)
+ 52 internal error: overran compiling workspace
+ 53 internal error: previously-checked referenced subpattern not
+ found
+ 54 DEFINE group contains more than one branch
+ 55 repeating a DEFINE group is not allowed
+ 56 inconsistent NEWLINE options
+ 57 \g is not followed by a braced, angle-bracketed, or quoted
+ name/number or by a plain number
+ 58 a numbered reference must not be zero
+ 59 (*VERB) with an argument is not supported
+ 60 (*VERB) not recognized
+ 61 number is too big
+ 62 subpattern name expected
+ 63 digit expected after (?+
+ 64 ] is an invalid data character in JavaScript compatibility mode
+
+ The numbers 32 and 10000 in errors 48 and 49 are defaults; different
+ values may be used if the limits were changed when PCRE was built.
+
+
+STUDYING A PATTERN
+
+ pcre_extra *pcre_study(const pcre *code, int options,
+ const char **errptr);
+
+ If a compiled pattern is going to be used several times, it is worth
+ spending more time analyzing it in order to speed up the time taken for
+ matching. The function pcre_study() takes a pointer to a compiled pat-
+ tern as its first argument. If studying the pattern produces additional
+ information that will help speed up matching, pcre_study() returns a
+ pointer to a pcre_extra block, in which the study_data field points to
+ the results of the study.
+
+ The returned value from pcre_study() can be passed directly to
+ pcre_exec(). However, a pcre_extra block also contains other fields
+ that can be set by the caller before the block is passed; these are
+ described below in the section on matching a pattern.
+
+ If studying the pattern does not produce any additional information
+ pcre_study() returns NULL. In that circumstance, if the calling program
+ wants to pass any of the other fields to pcre_exec(), it must set up
+ its own pcre_extra block.
+
+ The second argument of pcre_study() contains option bits. At present,
+ no options are defined, and this argument should always be zero.
+
+ The third argument for pcre_study() is a pointer for an error message.
+ If studying succeeds (even if no data is returned), the variable it
+ points to is set to NULL. Otherwise it is set to point to a textual
+ error message. This is a static string that is part of the library. You
+ must not try to free it. You should test the error pointer for NULL
+ after calling pcre_study(), to be sure that it has run successfully.
+
+ This is a typical call to pcre_study():
+
+ pcre_extra *pe;
+ pe = pcre_study(
+ re, /* result of pcre_compile() */
+ 0, /* no options exist */
+ &error); /* set to NULL or points to a message */
+
+ At present, studying a pattern is useful only for non-anchored patterns
+ that do not have a single fixed starting character. A bitmap of possi-
+ ble starting bytes is created.
+
+
+LOCALE SUPPORT
+
+ PCRE handles caseless matching, and determines whether characters are
+ letters, digits, or whatever, by reference to a set of tables, indexed
+ by character value. When running in UTF-8 mode, this applies only to
+ characters with codes less than 128. Higher-valued codes never match
+ escapes such as \w or \d, but can be tested with \p if PCRE is built
+ with Unicode character property support. The use of locales with Uni-
+ code is discouraged. If you are handling characters with codes greater
+ than 128, you should either use UTF-8 and Unicode, or use locales, but
+ not try to mix the two.
+
+ PCRE contains an internal set of tables that are used when the final
+ argument of pcre_compile() is NULL. These are sufficient for many
+ applications. Normally, the internal tables recognize only ASCII char-
+ acters. However, when PCRE is built, it is possible to cause the inter-
+ nal tables to be rebuilt in the default "C" locale of the local system,
+ which may cause them to be different.
+
+ The internal tables can always be overridden by tables supplied by the
+ application that calls PCRE. These may be created in a different locale
+ from the default. As more and more applications change to using Uni-
+ code, the need for this locale support is expected to die away.
+
+ External tables are built by calling the pcre_maketables() function,
+ which has no arguments, in the relevant locale. The result can then be
+ passed to pcre_compile() or pcre_exec() as often as necessary. For
+ example, to build and use tables that are appropriate for the French
+ locale (where accented characters with values greater than 128 are
+ treated as letters), the following code could be used:
+
+ setlocale(LC_CTYPE, "fr_FR");
+ tables = pcre_maketables();
+ re = pcre_compile(..., tables);
+
+ The locale name "fr_FR" is used on Linux and other Unix-like systems;
+ if you are using Windows, the name for the French locale is "french".
+
+ When pcre_maketables() runs, the tables are built in memory that is
+ obtained via pcre_malloc. It is the caller's responsibility to ensure
+ that the memory containing the tables remains available for as long as
+ it is needed.
+
+ The pointer that is passed to pcre_compile() is saved with the compiled
+ pattern, and the same tables are used via this pointer by pcre_study()
+ and normally also by pcre_exec(). Thus, by default, for any single pat-
+ tern, compilation, studying and matching all happen in the same locale,
+ but different patterns can be compiled in different locales.
+
+ It is possible to pass a table pointer or NULL (indicating the use of
+ the internal tables) to pcre_exec(). Although not intended for this
+ purpose, this facility could be used to match a pattern in a different
+ locale from the one in which it was compiled. Passing table pointers at
+ run time is discussed below in the section on matching a pattern.
+
+
+INFORMATION ABOUT A PATTERN
+
+ int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
+ int what, void *where);
+
+ The pcre_fullinfo() function returns information about a compiled pat-
+ tern. It replaces the obsolete pcre_info() function, which is neverthe-
+ less retained for backwards compatibility (and is documented below).
+
+ The first argument for pcre_fullinfo() is a pointer to the compiled
+ pattern. The second argument is the result of pcre_study(), or NULL if
+ the pattern was not studied. The third argument specifies which piece
+ of information is required, and the fourth argument is a pointer to a
+ variable to receive the data. The yield of the function is zero for
+ success, or one of the following negative numbers:
+
+ PCRE_ERROR_NULL the argument code was NULL
+ the argument where was NULL
+ PCRE_ERROR_BADMAGIC the "magic number" was not found
+ PCRE_ERROR_BADOPTION the value of what was invalid
+
+ The "magic number" is placed at the start of each compiled pattern as
+ a simple check against passing an arbitrary memory pointer. Here is a
+ typical call of pcre_fullinfo(), to obtain the length of the compiled
+ pattern:
+
+ int rc;
+ size_t length;
+ rc = pcre_fullinfo(
+ re, /* result of pcre_compile() */
+ pe, /* result of pcre_study(), or NULL */
+ PCRE_INFO_SIZE, /* what is required */
+ &length); /* where to put the data */
+
+ The possible values for the third argument are defined in pcre.h, and
+ are as follows:
+
+ PCRE_INFO_BACKREFMAX
+
+ Return the number of the highest back reference in the pattern. The
+ fourth argument should point to an int variable. Zero is returned if
+ there are no back references.
+
+ PCRE_INFO_CAPTURECOUNT
+
+ Return the number of capturing subpatterns in the pattern. The fourth
+ argument should point to an int variable.
+
+ PCRE_INFO_DEFAULT_TABLES
+
+ Return a pointer to the internal default character tables within PCRE.
+ The fourth argument should point to an unsigned char * variable. This
+ information call is provided for internal use by the pcre_study() func-
+ tion. External callers can cause PCRE to use its internal tables by
+ passing a NULL table pointer.
+
+ PCRE_INFO_FIRSTBYTE
+
+ Return information about the first byte of any matched string, for a
+ non-anchored pattern. The fourth argument should point to an int vari-
+ able. (This option used to be called PCRE_INFO_FIRSTCHAR; the old name
+ is still recognized for backwards compatibility.)
+
+ If there is a fixed first byte, for example, from a pattern such as
+ (cat|cow|coyote), its value is returned. Otherwise, if either
+
+ (a) the pattern was compiled with the PCRE_MULTILINE option, and every
+ branch starts with "^", or
+
+ (b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not
+ set (if it were set, the pattern would be anchored),
+
+ -1 is returned, indicating that the pattern matches only at the start
+ of a subject string or after any newline within the string. Otherwise
+ -2 is returned. For anchored patterns, -2 is returned.
+
+ PCRE_INFO_FIRSTTABLE
+
+ If the pattern was studied, and this resulted in the construction of a
+ 256-bit table indicating a fixed set of bytes for the first byte in any
+ matching string, a pointer to the table is returned. Otherwise NULL is
+ returned. The fourth argument should point to an unsigned char * vari-
+ able.
+
+ PCRE_INFO_HASCRORLF
+
+ Return 1 if the pattern contains any explicit matches for CR or LF
+ characters, otherwise 0. The fourth argument should point to an int
+ variable. An explicit match is either a literal CR or LF character, or
+ \r or \n.
+
+ PCRE_INFO_JCHANGED
+
+ Return 1 if the (?J) or (?-J) option setting is used in the pattern,
+ otherwise 0. The fourth argument should point to an int variable. (?J)
+ and (?-J) set and unset the local PCRE_DUPNAMES option, respectively.
+
+ PCRE_INFO_LASTLITERAL
+
+ Return the value of the rightmost literal byte that must exist in any
+ matched string, other than at its start, if such a byte has been
+ recorded. The fourth argument should point to an int variable. If there
+ is no such byte, -1 is returned. For anchored patterns, a last literal
+ byte is recorded only if it follows something of variable length. For
+ example, for the pattern /^a\d+z\d+/ the returned value is "z", but for
+ /^a\dz\d/ the returned value is -1.
+
+ PCRE_INFO_NAMECOUNT
+ PCRE_INFO_NAMEENTRYSIZE
+ PCRE_INFO_NAMETABLE
+
+ PCRE supports the use of named as well as numbered capturing parenthe-
+ ses. The names are just an additional way of identifying the parenthe-
+ ses, which still acquire numbers. Several convenience functions such as
+ pcre_get_named_substring() are provided for extracting captured sub-
+ strings by name. It is also possible to extract the data directly, by
+ first converting the name to a number in order to access the correct
+ pointers in the output vector (described with pcre_exec() below). To do
+ the conversion, you need to use the name-to-number map, which is
+ described by these three values.
+
+ The map consists of a number of fixed-size entries. PCRE_INFO_NAMECOUNT
+ gives the number of entries, and PCRE_INFO_NAMEENTRYSIZE gives the size
+ of each entry; both of these return an int value. The entry size
+ depends on the length of the longest name. PCRE_INFO_NAMETABLE returns
+ a pointer to the first entry of the table (a pointer to char). The
+ first two bytes of each entry are the number of the capturing parenthe-
+ sis, most significant byte first. The rest of the entry is the corre-
+ sponding name, zero terminated. The names are in alphabetical order.
+ When PCRE_DUPNAMES is set, duplicate names are in order of their paren-
+ theses numbers. For example, consider the following pattern (assume
+ PCRE_EXTENDED is set, so white space - including newlines - is
+ ignored):
+
+ (?<date> (?<year>(\d\d)?\d\d) -
+ (?<month>\d\d) - (?<day>\d\d) )
+
+ There are four named subpatterns, so the table has four entries, and
+ each entry in the table is eight bytes long. The table is as follows,
+ with non-printing bytes shown in hexadecimal, and undefined bytes shown
+ as ??:
+
+ 00 01 d a t e 00 ??
+ 00 05 d a y 00 ?? ??
+ 00 04 m o n t h 00
+ 00 02 y e a r 00 ??
+
+ When writing code to extract data from named subpatterns using the
+ name-to-number map, remember that the length of the entries is likely
+ to be different for each compiled pattern.
+
+ PCRE_INFO_OKPARTIAL
+
+ Return 1 if the pattern can be used for partial matching, otherwise 0.
+ The fourth argument should point to an int variable. The pcrepartial
+ documentation lists the restrictions that apply to patterns when par-
+ tial matching is used.
+
+ PCRE_INFO_OPTIONS
+
+ Return a copy of the options with which the pattern was compiled. The
+ fourth argument should point to an unsigned long int variable. These
+ option bits are those specified in the call to pcre_compile(), modified
+ by any top-level option settings at the start of the pattern itself. In
+ other words, they are the options that will be in force when matching
+ starts. For example, if the pattern /(?im)abc(?-i)d/ is compiled with
+ the PCRE_EXTENDED option, the result is PCRE_CASELESS, PCRE_MULTILINE,
+ and PCRE_EXTENDED.
+
+ A pattern is automatically anchored by PCRE if all of its top-level
+ alternatives begin with one of the following:
+
+ ^ unless PCRE_MULTILINE is set
+ \A always
+ \G always
+ .* if PCRE_DOTALL is set and there are no back
+ references to the subpattern in which .* appears
+
+ For such patterns, the PCRE_ANCHORED bit is set in the options returned
+ by pcre_fullinfo().
+
+ PCRE_INFO_SIZE
+
+ Return the size of the compiled pattern, that is, the value that was
+ passed as the argument to pcre_malloc() when PCRE was getting memory in
+ which to place the compiled data. The fourth argument should point to a
+ size_t variable.
+
+ PCRE_INFO_STUDYSIZE
+
+ Return the size of the data block pointed to by the study_data field in
+ a pcre_extra block. That is, it is the value that was passed to
+ pcre_malloc() when PCRE was getting memory into which to place the data
+ created by pcre_study(). The fourth argument should point to a size_t
+ variable.
+
+
+OBSOLETE INFO FUNCTION
+
+ int pcre_info(const pcre *code, int *optptr, int *firstcharptr);
+
+ The pcre_info() function is now obsolete because its interface is too
+ restrictive to return all the available data about a compiled pattern.
+ New programs should use pcre_fullinfo() instead. The yield of
+ pcre_info() is the number of capturing subpatterns, or one of the fol-
+ lowing negative numbers:
+
+ PCRE_ERROR_NULL the argument code was NULL
+ PCRE_ERROR_BADMAGIC the "magic number" was not found
+
+ If the optptr argument is not NULL, a copy of the options with which
+ the pattern was compiled is placed in the integer it points to (see
+ PCRE_INFO_OPTIONS above).
+
+ If the pattern is not anchored and the firstcharptr argument is not
+ NULL, it is used to pass back information about the first character of
+ any matched string (see PCRE_INFO_FIRSTBYTE above).
+
+
+REFERENCE COUNTS
+
+ int pcre_refcount(pcre *code, int adjust);
+
+ The pcre_refcount() function is used to maintain a reference count in
+ the data block that contains a compiled pattern. It is provided for the
+ benefit of applications that operate in an object-oriented manner,
+ where different parts of the application may be using the same compiled
+ pattern, but you want to free the block when they are all done.
+
+ When a pattern is compiled, the reference count field is initialized to
+ zero. It is changed only by calling this function, whose action is to
+ add the adjust value (which may be positive or negative) to it. The
+ yield of the function is the new value. However, the value of the count
+ is constrained to lie between 0 and 65535, inclusive. If the new value
+ is outside these limits, it is forced to the appropriate limit value.
+
+ Except when it is zero, the reference count is not correctly preserved
+ if a pattern is compiled on one host and then transferred to a host
+ whose byte-order is different. (This seems a highly unlikely scenario.)
+
+
+MATCHING A PATTERN: THE TRADITIONAL FUNCTION
+
+ int pcre_exec(const pcre *code, const pcre_extra *extra,
+ const char *subject, int length, int startoffset,
+ int options, int *ovector, int ovecsize);
+
+ The function pcre_exec() is called to match a subject string against a
+ compiled pattern, which is passed in the code argument. If the pattern
+ has been studied, the result of the study should be passed in the extra
+ argument. This function is the main matching facility of the library,
+ and it operates in a Perl-like manner. For specialist use there is also
+ an alternative matching function, which is described below in the sec-
+ tion about the pcre_dfa_exec() function.
+
+ In most applications, the pattern will have been compiled (and option-
+ ally studied) in the same process that calls pcre_exec(). However, it
+ is possible to save compiled patterns and study data, and then use them
+ later in different processes, possibly even on different hosts. For a
+ discussion about this, see the pcreprecompile documentation.
+
+ Here is an example of a simple call to pcre_exec():
+
+ int rc;
+ int ovector[30];
+ rc = pcre_exec(
+ re, /* result of pcre_compile() */
+ NULL, /* we didn't study the pattern */
+ "some string", /* the subject string */
+ 11, /* the length of the subject string */
+ 0, /* start at offset 0 in the subject */
+ 0, /* default options */
+ ovector, /* vector of integers for substring information */
+ 30); /* number of elements (NOT size in bytes) */
+
+ Extra data for pcre_exec()
+
+ If the extra argument is not NULL, it must point to a pcre_extra data
+ block. The pcre_study() function returns such a block (when it doesn't
+ return NULL), but you can also create one for yourself, and pass addi-
+ tional information in it. The pcre_extra block contains the following
+ fields (not necessarily in this order):
+
+ unsigned long int flags;
+ void *study_data;
+ unsigned long int match_limit;
+ unsigned long int match_limit_recursion;
+ void *callout_data;
+ const unsigned char *tables;
+
+ The flags field is a bitmap that specifies which of the other fields
+ are set. The flag bits are:
+
+ PCRE_EXTRA_STUDY_DATA
+ PCRE_EXTRA_MATCH_LIMIT
+ PCRE_EXTRA_MATCH_LIMIT_RECURSION
+ PCRE_EXTRA_CALLOUT_DATA
+ PCRE_EXTRA_TABLES
+
+ Other flag bits should be set to zero. The study_data field is set in
+ the pcre_extra block that is returned by pcre_study(), together with
+ the appropriate flag bit. You should not set this yourself, but you may
+ add to the block by setting the other fields and their corresponding
+ flag bits.
+
+ The match_limit field provides a means of preventing PCRE from using up
+ a vast amount of resources when running patterns that are not going to
+ match, but which have a very large number of possibilities in their
+ search trees. The classic example is the use of nested unlimited
+ repeats.
+
+ Internally, PCRE uses a function called match() which it calls repeat-
+ edly (sometimes recursively). The limit set by match_limit is imposed
+ on the number of times this function is called during a match, which
+ has the effect of limiting the amount of backtracking that can take
+ place. For patterns that are not anchored, the count restarts from zero
+ for each position in the subject string.
+
+ The default value for the limit can be set when PCRE is built; the
+ default default is 10 million, which handles all but the most extreme
+ cases. You can override the default by supplying pcre_exec() with a
+ pcre_extra block in which match_limit is set, and
+ PCRE_EXTRA_MATCH_LIMIT is set in the flags field. If the limit is
+ exceeded, pcre_exec() returns PCRE_ERROR_MATCHLIMIT.
+
+ The match_limit_recursion field is similar to match_limit, but instead
+ of limiting the total number of times that match() is called, it limits
+ the depth of recursion. The recursion depth is a smaller number than
+ the total number of calls, because not all calls to match() are recur-
+ sive. This limit is of use only if it is set smaller than match_limit.
+
+ Limiting the recursion depth limits the amount of stack that can be
+ used, or, when PCRE has been compiled to use memory on the heap instead
+ of the stack, the amount of heap memory that can be used.
+
+ The default value for match_limit_recursion can be set when PCRE is
+ built; the default default is the same value as the default for
+ match_limit. You can override the default by supplying pcre_exec() with
+ a pcre_extra block in which match_limit_recursion is set, and
+ PCRE_EXTRA_MATCH_LIMIT_RECURSION is set in the flags field. If the
+ limit is exceeded, pcre_exec() returns PCRE_ERROR_RECURSIONLIMIT.
+
+ The callout_data field is used in conjunction with the "callout" fea-
+ ture, which is described in the pcrecallout documentation.
+
+ The tables field is used to pass a character tables pointer to
+ pcre_exec(); this overrides the value that is stored with the compiled
+ pattern. A non-NULL value is stored with the compiled pattern only if
+ custom tables were supplied to pcre_compile() via its tableptr argu-
+ ment. If NULL is passed to pcre_exec() using this mechanism, it forces
+ PCRE's internal tables to be used. This facility is helpful when re-
+ using patterns that have been saved after compiling with an external
+ set of tables, because the external tables might be at a different
+ address when pcre_exec() is called. See the pcreprecompile documenta-
+ tion for a discussion of saving compiled patterns for later use.
+
+ Option bits for pcre_exec()
+
+ The unused bits of the options argument for pcre_exec() must be zero.
+ The only bits that may be set are PCRE_ANCHORED, PCRE_BSR_ANYCRLF,
+ PCRE_BSR_UNICODE, PCRE_NEWLINE_xxx, PCRE_NOTBOL, PCRE_NOTEOL,
+ PCRE_NOTEMPTY, PCRE_NO_UTF8_CHECK and PCRE_PARTIAL.
+
+ PCRE_ANCHORED
+
+ The PCRE_ANCHORED option limits pcre_exec() to matching at the first
+ matching position. If a pattern was compiled with PCRE_ANCHORED, or
+ turned out to be anchored by virtue of its contents, it cannot be made
+ unanchored at matching time.
+
+ PCRE_BSR_ANYCRLF
+ PCRE_BSR_UNICODE
+
+ These options (which are mutually exclusive) control what the \R escape
+ sequence matches. The choice is either to match only CR, LF, or CRLF,
+ or to match any Unicode newline sequence. These options override the
+ choice that was made or defaulted when the pattern was compiled.
+
+ PCRE_NEWLINE_CR
+ PCRE_NEWLINE_LF
+ PCRE_NEWLINE_CRLF
+ PCRE_NEWLINE_ANYCRLF
+ PCRE_NEWLINE_ANY
+
+ These options override the newline definition that was chosen or
+ defaulted when the pattern was compiled. For details, see the descrip-
+ tion of pcre_compile() above. During matching, the newline choice
+ affects the behaviour of the dot, circumflex, and dollar metacharac-
+ ters. It may also alter the way the match position is advanced after a
+ match failure for an unanchored pattern.
+
+ When PCRE_NEWLINE_CRLF, PCRE_NEWLINE_ANYCRLF, or PCRE_NEWLINE_ANY is
+ set, and a match attempt for an unanchored pattern fails when the cur-
+ rent position is at a CRLF sequence, and the pattern contains no
+ explicit matches for CR or LF characters, the match position is
+ advanced by two characters instead of one, in other words, to after the
+ CRLF.
+
+ The above rule is a compromise that makes the most common cases work as
+ expected. For example, if the pattern is .+A (and the PCRE_DOTALL
+ option is not set), it does not match the string "\r\nA" because, after
+ failing at the start, it skips both the CR and the LF before retrying.
+ However, the pattern [\r\n]A does match that string, because it con-
+ tains an explicit CR or LF reference, and so advances only by one char-
+ acter after the first failure.
+
+ An explicit match for CR or LF is either a literal appearance of one of
+ those characters, or one of the \r or \n escape sequences. Implicit
+ matches such as [^X] do not count, nor does \s (which includes CR and
+ LF in the characters that it matches).
+
+ Notwithstanding the above, anomalous effects may still occur when CRLF
+ is a valid newline sequence and explicit \r or \n escapes appear in the
+ pattern.
+
+ PCRE_NOTBOL
+
+ This option specifies that the first character of the subject string is not
+ the beginning of a line, so the circumflex metacharacter should not
+ match before it. Setting this without PCRE_MULTILINE (at compile time)
+ causes circumflex never to match. This option affects only the behav-
+ iour of the circumflex metacharacter. It does not affect \A.
+
+ PCRE_NOTEOL
+
+ This option specifies that the end of the subject string is not the end
+ of a line, so the dollar metacharacter should not match it nor (except
+ in multiline mode) a newline immediately before it. Setting this with-
+ out PCRE_MULTILINE (at compile time) causes dollar never to match. This
+ option affects only the behaviour of the dollar metacharacter. It does
+ not affect \Z or \z.
+
+ PCRE_NOTEMPTY
+
+ An empty string is not considered to be a valid match if this option is
+ set. If there are alternatives in the pattern, they are tried. If all
+ the alternatives match the empty string, the entire match fails. For
+ example, if the pattern
+
+ a?b?
+
+ is applied to a string not beginning with "a" or "b", it matches the
+ empty string at the start of the subject. With PCRE_NOTEMPTY set, this
+ match is not valid, so PCRE searches further into the string for occur-
+ rences of "a" or "b".
+
+ Perl has no direct equivalent of PCRE_NOTEMPTY, but it does make a spe-
+ cial case of a pattern match of the empty string within its split()
+ function, and when using the /g modifier. It is possible to emulate
+ Perl's behaviour after matching a null string by first trying the match
+ again at the same offset with PCRE_NOTEMPTY and PCRE_ANCHORED, and then
+ if that fails by advancing the starting offset (see below) and trying
+ an ordinary match again. There is some code that demonstrates how to do
+ this in the pcredemo.c sample program.
+
+ PCRE_NO_UTF8_CHECK
+
+ When PCRE_UTF8 is set at compile time, the validity of the subject as a
+ UTF-8 string is automatically checked when pcre_exec() is subsequently
+ called. The value of startoffset is also checked to ensure that it
+ points to the start of a UTF-8 character. There is a discussion about
+ the validity of UTF-8 strings in the section on UTF-8 support in the
+ main pcre page. If an invalid UTF-8 sequence of bytes is found,
+ pcre_exec() returns the error PCRE_ERROR_BADUTF8. If startoffset con-
+ tains an invalid value, PCRE_ERROR_BADUTF8_OFFSET is returned.
+
+ If you already know that your subject is valid, and you want to skip
+ these checks for performance reasons, you can set the
+ PCRE_NO_UTF8_CHECK option when calling pcre_exec(). You might want to
+ do this for the second and subsequent calls to pcre_exec() if you are
+ making repeated calls to find all the matches in a single subject
+ string. However, you should be sure that the value of startoffset
+ points to the start of a UTF-8 character. When PCRE_NO_UTF8_CHECK is
+ set, the effect of passing an invalid UTF-8 string as a subject, or a
+ value of startoffset that does not point to the start of a UTF-8 char-
+ acter, is undefined. Your program may crash.
+
+ PCRE_PARTIAL
+
+ This option turns on the partial matching feature. If the subject
+ string fails to match the pattern, but at some point during the match-
+ ing process the end of the subject was reached (that is, the subject
+ partially matches the pattern and the failure to match occurred only
+ because there were not enough subject characters), pcre_exec() returns
+ PCRE_ERROR_PARTIAL instead of PCRE_ERROR_NOMATCH. When PCRE_PARTIAL is
+ used, there are restrictions on what may appear in the pattern. These
+ are discussed in the pcrepartial documentation.
+
+ The string to be matched by pcre_exec()
+
+ The subject string is passed to pcre_exec() as a pointer in subject, a
+ length in length, and a starting byte offset in startoffset. In UTF-8
+ mode, the byte offset must point to the start of a UTF-8 character.
+ Unlike the pattern string, the subject may contain binary zero bytes.
+ When the starting offset is zero, the search for a match starts at the
+ beginning of the subject, and this is by far the most common case.
+
+ A non-zero starting offset is useful when searching for another match
+ in the same subject by calling pcre_exec() again after a previous suc-
+ cess. Setting startoffset differs from just passing over a shortened
+ string and setting PCRE_NOTBOL in the case of a pattern that begins
+ with any kind of lookbehind. For example, consider the pattern
+
+ \Biss\B
+
+ which finds occurrences of "iss" in the middle of words. (\B matches
+ only if the current position in the subject is not a word boundary.)
+ When applied to the string "Mississippi" the first call to pcre_exec()
+ finds the first occurrence. If pcre_exec() is called again with just
+ the remainder of the subject, namely "issippi", it does not match,
+ because \B is always false at the start of the subject, which is deemed
+ to be a word boundary. However, if pcre_exec() is passed the entire
+ string again, but with startoffset set to 4, it finds the second occur-
+ rence of "iss" because it is able to look behind the starting point to
+ discover that it is preceded by a letter.
+
+ If a non-zero starting offset is passed when the pattern is anchored,
+ one attempt to match at the given offset is made. This can only succeed
+ if the pattern does not require the match to be at the start of the
+ subject.
+
+ How pcre_exec() returns captured substrings
+
+ In general, a pattern matches a certain portion of the subject, and in
+ addition, further substrings from the subject may be picked out by
+ parts of the pattern. Following the usage in Jeffrey Friedl's book,
+ this is called "capturing" in what follows, and the phrase "capturing
+ subpattern" is used for a fragment of a pattern that picks out a sub-
+ string. PCRE supports several other kinds of parenthesized subpattern
+ that do not cause substrings to be captured.
+
+ Captured substrings are returned to the caller via a vector of integer
+ offsets whose address is passed in ovector. The number of elements in
+ the vector is passed in ovecsize, which must be a non-negative number.
+ Note: this argument is NOT the size of ovector in bytes.
+
+ The first two-thirds of the vector is used to pass back captured sub-
+ strings, each substring using a pair of integers. The remaining third
+ of the vector is used as workspace by pcre_exec() while matching cap-
+ turing subpatterns, and is not available for passing back information.
+ The length passed in ovecsize should always be a multiple of three. If
+ it is not, it is rounded down.
+
+ When a match is successful, information about captured substrings is
+ returned in pairs of integers, starting at the beginning of ovector,
+ and continuing up to two-thirds of its length at the most. The first
+ element of a pair is set to the offset of the first character in a sub-
+ string, and the second is set to the offset of the first character
+ after the end of a substring. The first pair, ovector[0] and ovec-
+ tor[1], identify the portion of the subject string matched by the
+ entire pattern. The next pair is used for the first capturing subpat-
+ tern, and so on. The value returned by pcre_exec() is one more than the
+ highest numbered pair that has been set. For example, if two substrings
+ have been captured, the returned value is 3. If there are no capturing
+ subpatterns, the return value from a successful match is 1, indicating
+ that just the first pair of offsets has been set.
+
+ If a capturing subpattern is matched repeatedly, it is the last portion
+ of the string that it matched that is returned.
+
+ If the vector is too small to hold all the captured substring offsets,
+ it is used as far as possible (up to two-thirds of its length), and the
+ function returns a value of zero. In particular, if the substring off-
+ sets are not of interest, pcre_exec() may be called with ovector passed
+ as NULL and ovecsize as zero. However, if the pattern contains back
+ references and the ovector is not big enough to remember the related
+ substrings, PCRE has to get additional memory for use during matching.
+ Thus it is usually advisable to supply an ovector.
+
+ The pcre_fullinfo() function can be used to find out how many capturing
+ subpatterns there are in a compiled pattern. The smallest size for
+ ovector that will allow for n captured substrings, in addition to the
+ offsets of the substring matched by the whole pattern, is (n+1)*3.
+
+ It is possible for capturing subpattern number n+1 to match some part
+ of the subject when subpattern n has not been used at all. For example,
+ if the string "abc" is matched against the pattern (a|(z))(bc) the
+ return from the function is 4, and subpatterns 1 and 3 are matched, but
+ 2 is not. When this happens, both values in the offset pairs corre-
+ sponding to unused subpatterns are set to -1.
+
+ Offset values that correspond to unused subpatterns at the end of the
+ expression are also set to -1. For example, if the string "abc" is
+ matched against the pattern (abc)(x(yz)?)? subpatterns 2 and 3 are not
+ matched. The return from the function is 2, because the highest used
+ capturing subpattern number is 1. However, you can refer to the offsets
+ for the second and third capturing subpatterns if you wish (assuming
+ the vector is large enough, of course).
+
+ Some convenience functions are provided for extracting the captured
+ substrings as separate strings. These are described below.
+
+ Error return values from pcre_exec()
+
+ If pcre_exec() fails, it returns a negative number. The following are
+ defined in the header file:
+
+ PCRE_ERROR_NOMATCH (-1)
+
+ The subject string did not match the pattern.
+
+ PCRE_ERROR_NULL (-2)
+
+ Either code or subject was passed as NULL, or ovector was NULL and
+ ovecsize was not zero.
+
+ PCRE_ERROR_BADOPTION (-3)
+
+ An unrecognized bit was set in the options argument.
+
+ PCRE_ERROR_BADMAGIC (-4)
+
+ PCRE stores a 4-byte "magic number" at the start of the compiled code,
+ to catch the case when it is passed a junk pointer and to detect when a
+ pattern that was compiled in an environment of one endianness is run in
+ an environment with the other endianness. This is the error that PCRE
+ gives when the magic number is not present.
+
+ PCRE_ERROR_UNKNOWN_OPCODE (-5)
+
+ While running the pattern match, an unknown item was encountered in the
+ compiled pattern. This error could be caused by a bug in PCRE or by
+ overwriting of the compiled pattern.
+
+ PCRE_ERROR_NOMEMORY (-6)
+
+ If a pattern contains back references, but the ovector that is passed
+ to pcre_exec() is not big enough to remember the referenced substrings,
+ PCRE gets a block of memory at the start of matching to use for this
+ purpose. If the call via pcre_malloc() fails, this error is given. The
+ memory is automatically freed at the end of matching.
+
+ PCRE_ERROR_NOSUBSTRING (-7)
+
+ This error is used by the pcre_copy_substring(), pcre_get_substring(),
+ and pcre_get_substring_list() functions (see below). It is never
+ returned by pcre_exec().
+
+ PCRE_ERROR_MATCHLIMIT (-8)
+
+ The backtracking limit, as specified by the match_limit field in a
+ pcre_extra structure (or defaulted) was reached. See the description
+ above.
+
+ PCRE_ERROR_CALLOUT (-9)
+
+ This error is never generated by pcre_exec() itself. It is provided for
+ use by callout functions that want to yield a distinctive error code.
+ See the pcrecallout documentation for details.
+
+ PCRE_ERROR_BADUTF8 (-10)
+
+ A string that contains an invalid UTF-8 byte sequence was passed as a
+ subject.
+
+ PCRE_ERROR_BADUTF8_OFFSET (-11)
+
+ The UTF-8 byte sequence that was passed as a subject was valid, but the
+ value of startoffset did not point to the beginning of a UTF-8 charac-
+ ter.
+
+ PCRE_ERROR_PARTIAL (-12)
+
+ The subject string did not match, but it did match partially. See the
+ pcrepartial documentation for details of partial matching.
+
+ PCRE_ERROR_BADPARTIAL (-13)
+
+ The PCRE_PARTIAL option was used with a compiled pattern containing
+ items that are not supported for partial matching. See the pcrepartial
+ documentation for details of partial matching.
+
+ PCRE_ERROR_INTERNAL (-14)
+
+ An unexpected internal error has occurred. This error could be caused
+ by a bug in PCRE or by overwriting of the compiled pattern.
+
+ PCRE_ERROR_BADCOUNT (-15)
+
+ This error is given if the value of the ovecsize argument is negative.
+
+ PCRE_ERROR_RECURSIONLIMIT (-21)
+
+ The internal recursion limit, as specified by the match_limit_recursion
+ field in a pcre_extra structure (or defaulted) was reached. See the
+ description above.
+
+ PCRE_ERROR_BADNEWLINE (-23)
+
+ An invalid combination of PCRE_NEWLINE_xxx options was given.
+
+ Error numbers -16 to -20 and -22 are not used by pcre_exec().
+
+
+EXTRACTING CAPTURED SUBSTRINGS BY NUMBER
+
+ int pcre_copy_substring(const char *subject, int *ovector,
+ int stringcount, int stringnumber, char *buffer,
+ int buffersize);
+
+ int pcre_get_substring(const char *subject, int *ovector,
+ int stringcount, int stringnumber,
+ const char **stringptr);
+
+ int pcre_get_substring_list(const char *subject,
+ int *ovector, int stringcount, const char ***listptr);
+
+ Captured substrings can be accessed directly by using the offsets
+ returned by pcre_exec() in ovector. For convenience, the functions
+ pcre_copy_substring(), pcre_get_substring(), and pcre_get_sub-
+ string_list() are provided for extracting captured substrings as new,
+ separate, zero-terminated strings. These functions identify substrings
+ by number. The next section describes functions for extracting named
+ substrings.
+
+ A substring that contains a binary zero is correctly extracted and has
+ a further zero added on the end, but the result is not, of course, a C
+ string. However, you can process such a string by referring to the
+ length that is returned by pcre_copy_substring() and pcre_get_sub-
+ string(). Unfortunately, the interface to pcre_get_substring_list() is
+ not adequate for handling strings containing binary zeros, because the
+ end of the final string is not independently indicated.
+
+ The first three arguments are the same for all three of these func-
+ tions: subject is the subject string that has just been successfully
+ matched, ovector is a pointer to the vector of integer offsets that was
+ passed to pcre_exec(), and stringcount is the number of substrings that
+ were captured by the match, including the substring that matched the
+ entire regular expression. This is the value returned by pcre_exec() if
+ it is greater than zero. If pcre_exec() returned zero, indicating that
+ it ran out of space in ovector, the value passed as stringcount should
+ be the number of elements in the vector divided by three.
+
+ The functions pcre_copy_substring() and pcre_get_substring() extract a
+ single substring, whose number is given as stringnumber. A value of
+ zero extracts the substring that matched the entire pattern, whereas
+ higher values extract the captured substrings. For pcre_copy_sub-
+ string(), the string is placed in buffer, whose length is given by
+ buffersize, while for pcre_get_substring() a new block of memory is
+ obtained via pcre_malloc, and its address is returned via stringptr.
+ The yield of the function is the length of the string, not including
+ the terminating zero, or one of these error codes:
+
+ PCRE_ERROR_NOMEMORY (-6)
+
+ The buffer was too small for pcre_copy_substring(), or the attempt to
+ get memory failed for pcre_get_substring().
+
+ PCRE_ERROR_NOSUBSTRING (-7)
+
+ There is no substring whose number is stringnumber.
+
+ The pcre_get_substring_list() function extracts all available sub-
+ strings and builds a list of pointers to them. All this is done in a
+ single block of memory that is obtained via pcre_malloc. The address of
+ the memory block is returned via listptr, which is also the start of
+ the list of string pointers. The end of the list is marked by a NULL
+ pointer. The yield of the function is zero if all went well, or the
+ error code
+
+ PCRE_ERROR_NOMEMORY (-6)
+
+ if the attempt to get the memory block failed.
+
+ When any of these functions encounters a substring that is unset, which
+ can happen when capturing subpattern number n+1 matches some part of
+ the subject, but subpattern n has not been used at all, they return an
+ empty string. This can be distinguished from a genuine zero-length sub-
+ string by inspecting the appropriate offset in ovector, which is nega-
+ tive for unset substrings.
+
+ The two convenience functions pcre_free_substring() and pcre_free_sub-
+ string_list() can be used to free the memory returned by a previous
+ call of pcre_get_substring() or pcre_get_substring_list(), respec-
+ tively. They do nothing more than call the function pointed to by
+ pcre_free, which of course could be called directly from a C program.
+ However, PCRE is used in some situations where it is linked via a spe-
+ cial interface to another programming language that cannot use
+ pcre_free directly; it is for these cases that the functions are pro-
+ vided.
+
+
+EXTRACTING CAPTURED SUBSTRINGS BY NAME
+
+ int pcre_get_stringnumber(const pcre *code,
+ const char *name);
+
+ int pcre_copy_named_substring(const pcre *code,
+ const char *subject, int *ovector,
+ int stringcount, const char *stringname,
+ char *buffer, int buffersize);
+
+ int pcre_get_named_substring(const pcre *code,
+ const char *subject, int *ovector,
+ int stringcount, const char *stringname,
+ const char **stringptr);
+
+ To extract a substring by name, you first have to find the associated
+ number. For example, for this pattern
+
+ (a+)b(?<xxx>\d+)...
+
+ the number of the subpattern called "xxx" is 2. If the name is known to
+ be unique (PCRE_DUPNAMES was not set), you can find the number from the
+ name by calling pcre_get_stringnumber(). The first argument is the com-
+ piled pattern, and the second is the name. The yield of the function is
+ the subpattern number, or PCRE_ERROR_NOSUBSTRING (-7) if there is no
+ subpattern of that name.
+
+ Given the number, you can extract the substring directly, or use one of
+ the functions described in the previous section. For convenience, there
+ are also two functions that do the whole job.
+
+ Most of the arguments of pcre_copy_named_substring() and
+ pcre_get_named_substring() are the same as those for the similarly
+ named functions that extract by number. As these are described in the
+ previous section, they are not re-described here. There are just two
+ differences:
+
+ First, instead of a substring number, a substring name is given. Sec-
+ ond, there is an extra argument, given at the start, which is a pointer
+ to the compiled pattern. This is needed in order to gain access to the
+ name-to-number translation table.
+
+ These functions call pcre_get_stringnumber(), and if it succeeds, they
+ then call pcre_copy_substring() or pcre_get_substring(), as appropri-
+ ate. NOTE: If PCRE_DUPNAMES is set and there are duplicate names, the
+ behaviour may not be what you want (see the next section).
+
+
+DUPLICATE SUBPATTERN NAMES
+
+ int pcre_get_stringtable_entries(const pcre *code,
+ const char *name, char **first, char **last);
+
+ When a pattern is compiled with the PCRE_DUPNAMES option, names for
+ subpatterns are not required to be unique. Normally, patterns with
+ duplicate names are such that in any one match, only one of the named
+ subpatterns participates. An example is shown in the pcrepattern docu-
+ mentation.
+
+ When duplicates are present, pcre_copy_named_substring() and
+ pcre_get_named_substring() return the first substring corresponding to
+ the given name that is set. If none are set, PCRE_ERROR_NOSUBSTRING
+ (-7) is returned; no data is returned. The pcre_get_stringnumber()
+ function returns one of the numbers that are associated with the name,
+ but it is not defined which it is.
+
+ If you want to get full details of all captured substrings for a given
+ name, you must use the pcre_get_stringtable_entries() function. The
+ first argument is the compiled pattern, and the second is the name. The
+ third and fourth are pointers to variables which are updated by the
+ function. After it has run, they point to the first and last entries in
+ the name-to-number table for the given name. The function itself
+ returns the length of each entry, or PCRE_ERROR_NOSUBSTRING (-7) if
+ there are none. The format of the table is described above in the sec-
+ tion entitled Information about a pattern. Given all the relevant
+ entries for the name, you can extract each of their numbers, and hence
+ the captured data, if any.
+
+
+FINDING ALL POSSIBLE MATCHES
+
+ The traditional matching function uses a similar algorithm to Perl,
+ which stops when it finds the first match, starting at a given point in
+ the subject. If you want to find all possible matches, or the longest
+ possible match, consider using the alternative matching function (see
+ below) instead. If you cannot use the alternative function, but still
+ need to find all possible matches, you can kludge it up by making use
+ of the callout facility, which is described in the pcrecallout documen-
+ tation.
+
+ What you have to do is to insert a callout right at the end of the pat-
+ tern. When your callout function is called, extract and save the cur-
+ rent matched substring. Then return 1, which forces pcre_exec() to
+ backtrack and try other alternatives. Ultimately, when it runs out of
+ matches, pcre_exec() will yield PCRE_ERROR_NOMATCH.
+
+
+MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
+
+ int pcre_dfa_exec(const pcre *code, const pcre_extra *extra,
+ const char *subject, int length, int startoffset,
+ int options, int *ovector, int ovecsize,
+ int *workspace, int wscount);
+
+ The function pcre_dfa_exec() is called to match a subject string
+ against a compiled pattern, using a matching algorithm that scans the
+ subject string just once, and does not backtrack. This has different
+ characteristics to the normal algorithm, and is not compatible with
+ Perl. Some of the features of PCRE patterns are not supported. Never-
+ theless, there are times when this kind of matching can be useful. For
+ a discussion of the two matching algorithms, see the pcrematching docu-
+ mentation.
+
+ The arguments for the pcre_dfa_exec() function are the same as for
+ pcre_exec(), plus two extras. The ovector argument is used in a differ-
+ ent way, and this is described below. The other common arguments are
+ used in the same way as for pcre_exec(), so their description is not
+ repeated here.
+
+ The two additional arguments provide workspace for the function. The
+ workspace vector should contain at least 20 elements. It is used for
+ keeping track of multiple paths through the pattern tree. More
+ workspace will be needed for patterns and subjects where there are a
+ lot of potential matches.
+
+ Here is an example of a simple call to pcre_dfa_exec():
+
+ int rc;
+ int ovector[10];
+ int wspace[20];
+ rc = pcre_dfa_exec(
+ re, /* result of pcre_compile() */
+ NULL, /* we didn't study the pattern */
+ "some string", /* the subject string */
+ 11, /* the length of the subject string */
+ 0, /* start at offset 0 in the subject */
+ 0, /* default options */
+ ovector, /* vector of integers for substring information */
+ 10, /* number of elements (NOT size in bytes) */
+ wspace, /* working space vector */
+ 20); /* number of elements (NOT size in bytes) */
+
+ Option bits for pcre_dfa_exec()
+
+ The unused bits of the options argument for pcre_dfa_exec() must be
+ zero. The only bits that may be set are PCRE_ANCHORED, PCRE_NEW-
+ LINE_xxx, PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_UTF8_CHECK,
+ PCRE_PARTIAL, PCRE_DFA_SHORTEST, and PCRE_DFA_RESTART. All but the last
+ three of these are the same as for pcre_exec(), so their description is
+ not repeated here.
+
+ PCRE_PARTIAL
+
+ This has the same general effect as it does for pcre_exec(), but the
+ details are slightly different. When PCRE_PARTIAL is set for
+ pcre_dfa_exec(), the return code PCRE_ERROR_NOMATCH is converted into
+ PCRE_ERROR_PARTIAL if the end of the subject is reached, there have
+ been no complete matches, but there is still at least one matching pos-
+ sibility. The portion of the string that provided the partial match is
+ set as the first matching string.
+
+ PCRE_DFA_SHORTEST
+
+ Setting the PCRE_DFA_SHORTEST option causes the matching algorithm to
+ stop as soon as it has found one match. Because of the way the alterna-
+ tive algorithm works, this is necessarily the shortest possible match
+ at the first possible matching point in the subject string.
+
+ PCRE_DFA_RESTART
+
+ When pcre_dfa_exec() is called with the PCRE_PARTIAL option, and
+ returns a partial match, it is possible to call it again, with addi-
+ tional subject characters, and have it continue with the same match.
+ The PCRE_DFA_RESTART option requests this action; when it is set, the
+ workspace and wscount arguments must reference the same vector as before
+ because data about the match so far is left in them after a partial
+ match. There is more discussion of this facility in the pcrepartial
+ documentation.
+
+ Successful returns from pcre_dfa_exec()
+
+ When pcre_dfa_exec() succeeds, it may have matched more than one sub-
+ string in the subject. Note, however, that all the matches from one run
+ of the function start at the same point in the subject. The shorter
+ matches are all initial substrings of the longer matches. For example,
+ if the pattern
+
+ <.*>
+
+ is matched against the string
+
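+ This is <something> <something else> <something further> no more
+
+ the three matched strings are
+
+ <something>
+ <something> <something else>
+ <something> <something else> <something further>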
+
+ On success, the yield of the function is a number greater than zero,
+ which is the number of matched substrings. The substrings themselves
+ are returned in ovector. Each string uses two elements; the first is
+ the offset to the start, and the second is the offset to the end. In
+ fact, all the strings have the same start offset. (Space could have
+ been saved by giving this only once, but it was decided to retain some
+ compatibility with the way pcre_exec() returns data, even though the
+ meaning of the strings is different.)
+
+ The strings are returned in reverse order of length; that is, the long-
+ est matching string is given first. If there were too many matches to
+ fit into ovector, the yield of the function is zero, and the vector is
+ filled with the longest matches.
+
+ Error returns from pcre_dfa_exec()
+
+ The pcre_dfa_exec() function returns a negative number when it fails.
+ Many of the errors are the same as for pcre_exec(), and these are
+ described above. There are in addition the following errors that are
+ specific to pcre_dfa_exec():
+
+ PCRE_ERROR_DFA_UITEM (-16)
+
+ This return is given if pcre_dfa_exec() encounters an item in the pat-
+ tern that it does not support, for instance, the use of \C or a back
+ reference.
+
+ PCRE_ERROR_DFA_UCOND (-17)
+
+ This return is given if pcre_dfa_exec() encounters a condition item
+ that uses a back reference for the condition, or a test for recursion
+ in a specific group. These are not supported.
+
+ PCRE_ERROR_DFA_UMLIMIT (-18)
+
+ This return is given if pcre_dfa_exec() is called with an extra block
+ that contains a setting of the match_limit field. This is not supported
+ (it is meaningless).
+
+ PCRE_ERROR_DFA_WSSIZE (-19)
+
+ This return is given if pcre_dfa_exec() runs out of space in the
+ workspace vector.
+
+ PCRE_ERROR_DFA_RECURSE (-20)
+
+ When a recursive subpattern is processed, the matching function calls
+ itself recursively, using private vectors for ovector and workspace.
+ This error is given if the output vector is not large enough. This
+ should be extremely rare, as a vector of size 1000 is used.
+
+
+SEE ALSO
+
+ pcrebuild(3), pcrecallout(3), pcrecpp(3), pcrematching(3), pcrepar-
+ tial(3), pcreposix(3), pcreprecompile(3), pcresample(3), pcrestack(3).
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 12 April 2008
+ Copyright (c) 1997-2008 University of Cambridge.
+------------------------------------------------------------------------------
+
+
+PCRECALLOUT(3) PCRECALLOUT(3)
+
+
+NAME
+ PCRE - Perl-compatible regular expressions
+
+
+PCRE CALLOUTS
+
+ int (*pcre_callout)(pcre_callout_block *);
+
+ PCRE provides a feature called "callout", which is a means of temporar-
+ ily passing control to the caller of PCRE in the middle of pattern
+ matching. The caller of PCRE provides an external function by putting
+ its entry point in the global variable pcre_callout. By default, this
+ variable contains NULL, which disables all calling out.
+
+ Within a regular expression, (?C) indicates the points at which the
+ external function is to be called. Different callout points can be
+ identified by putting a number less than 256 after the letter C. The
+ default value is zero. For example, this pattern has two callout
+ points:
+
+ (?C1)abc(?C2)def
+
+ If the PCRE_AUTO_CALLOUT option bit is set when pcre_compile() is
+ called, PCRE automatically inserts callouts, all with number 255,
+ before each item in the pattern. For example, if PCRE_AUTO_CALLOUT is
+ used with the pattern
+
+ A(\d{2}|--)
+
+ it is processed as if it were
+
+ (?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
+
+ Notice that there is a callout before and after each parenthesis and
+ alternation bar. Automatic callouts can be used for tracking the
+ progress of pattern matching. The pcretest command has an option that
+ sets automatic callouts; when it is used, the output indicates how the
+ pattern is matched. This is useful information when you are trying to
+ optimize the performance of a particular pattern.
+
+
+MISSING CALLOUTS
+
+ You should be aware that, because of optimizations in the way PCRE
+ matches patterns, callouts sometimes do not happen. For example, if the
+ pattern is
+
+ ab(?C4)cd
+
+ PCRE knows that any matching string must contain the letter "d". If the
+ subject string is "abyz", the lack of "d" means that matching doesn't
+ ever start, and the callout is never reached. However, with "abyd",
+ though the result is still no match, the callout is obeyed.
+
+
+THE CALLOUT INTERFACE
+
+ During matching, when PCRE reaches a callout point, the external func-
+ tion defined by pcre_callout is called (if it is set). This applies to
+ both the pcre_exec() and the pcre_dfa_exec() matching functions. The
+ only argument to the callout function is a pointer to a pcre_callout
+ block. This structure contains the following fields:
+
+ int version;
+ int callout_number;
+ int *offset_vector;
+ const char *subject;
+ int subject_length;
+ int start_match;
+ int current_position;
+ int capture_top;
+ int capture_last;
+ void *callout_data;
+ int pattern_position;
+ int next_item_length;
+
+ The version field is an integer containing the version number of the
+ block format. The initial version was 0; the current version is 1. The
+ version number will change again in future if additional fields are
+ added, but the intention is never to remove any of the existing fields.
+
+ The callout_number field contains the number of the callout, as com-
+ piled into the pattern (that is, the number after ?C for manual call-
+ outs, and 255 for automatically generated callouts).
+
+ The offset_vector field is a pointer to the vector of offsets that was
+ passed by the caller to pcre_exec() or pcre_dfa_exec(). When
+ pcre_exec() is used, the contents can be inspected in order to extract
+ substrings that have been matched so far, in the same way as for
+ extracting substrings after a match has completed. For pcre_dfa_exec()
+ this field is not useful.
+
+ The subject and subject_length fields contain copies of the values that
+ were passed to pcre_exec().
+
+ The start_match field normally contains the offset within the subject
+ at which the current match attempt started. However, if the escape
+ sequence \K has been encountered, this value is changed to reflect the
+ modified starting point. If the pattern is not anchored, the callout
+ function may be called several times from the same point in the pattern
+ for different starting points in the subject.
+
+ The current_position field contains the offset within the subject of
+ the current match pointer.
+
+ When the pcre_exec() function is used, the capture_top field contains
+ one more than the number of the highest numbered captured substring so
+ far. If no substrings have been captured, the value of capture_top is
+ one. This is always the case when pcre_dfa_exec() is used, because it
+ does not support captured substrings.
+
+ The capture_last field contains the number of the most recently cap-
+ tured substring. If no substrings have been captured, its value is -1.
+ This is always the case when pcre_dfa_exec() is used.
+
+ The callout_data field contains a value that is passed to pcre_exec()
+ or pcre_dfa_exec() specifically so that it can be passed back in call-
+ outs. It is passed in the callout_data field of the pcre_extra data
+ structure. If no such data was passed, the value of callout_data in a
+ pcre_callout block is NULL. There is a description of the pcre_extra
+ structure in the pcreapi documentation.
+
+ The pattern_position field is present from version 1 of the pcre_call-
+ out structure. It contains the offset to the next item to be matched in
+ the pattern string.
+
+ The next_item_length field is present from version 1 of the pcre_call-
+ out structure. It contains the length of the next item to be matched in
+ the pattern string. When the callout immediately precedes an alterna-
+ tion bar, a closing parenthesis, or the end of the pattern, the length
+ is zero. When the callout precedes an opening parenthesis, the length
+ is that of the entire subpattern.
+
+ The pattern_position and next_item_length fields are intended to help
+ in distinguishing between different automatic callouts, which all have
+ the same callout number. However, they are set for all callouts.
+
+
+RETURN VALUES
+
+ The external callout function returns an integer to PCRE. If the value
+ is zero, matching proceeds as normal. If the value is greater than
+ zero, matching fails at the current point, but the testing of other
+ matching possibilities goes ahead, just as if a lookahead assertion had
+ failed. If the value is less than zero, the match is abandoned, and
+ pcre_exec() (or pcre_dfa_exec()) returns the negative value.
+
+ Negative values should normally be chosen from the set of
+ PCRE_ERROR_xxx values. In particular, PCRE_ERROR_NOMATCH forces a stan-
+ dard "no match" failure. The error number PCRE_ERROR_CALLOUT is
+ reserved for use by callout functions; it will never be used by PCRE
+ itself.
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 29 May 2007
+ Copyright (c) 1997-2007 University of Cambridge.
+------------------------------------------------------------------------------
+
+
+PCRECOMPAT(3) PCRECOMPAT(3)
+
+
+NAME
+ PCRE - Perl-compatible regular expressions
+
+
+DIFFERENCES BETWEEN PCRE AND PERL
+
+ This document describes the differences in the ways that PCRE and Perl
+ handle regular expressions. The differences described here are mainly
+ with respect to Perl 5.8, though PCRE versions 7.0 and later contain
+ some features that are expected to be in the forthcoming Perl 5.10.
+
+ 1. PCRE has only a subset of Perl's UTF-8 and Unicode support. Details
+ of what it does have are given in the section on UTF-8 support in the
+ main pcre page.
+
+ 2. PCRE does not allow repeat quantifiers on lookahead assertions. Perl
+ permits them, but they do not mean what you might think. For example,
+ (?!a){3} does not assert that the next three characters are not "a". It
+ just asserts that the next character is not "a" three times.
+
+ 3. Capturing subpatterns that occur inside negative lookahead asser-
+ tions are counted, but their entries in the offsets vector are never
+ set. Perl sets its numerical variables from any such patterns that are
+ matched before the assertion fails to match something (thereby succeed-
+ ing), but only if the negative lookahead assertion contains just one
+ branch.
+
+ 4. Though binary zero characters are supported in the subject string,
+ they are not allowed in a pattern string because it is passed as a nor-
+ mal C string, terminated by zero. The escape sequence \0 can be used in
+ the pattern to represent a binary zero.
+
+ 5. The following Perl escape sequences are not supported: \l, \u, \L,
+ \U, and \N. In fact these are implemented by Perl's general string-han-
+ dling and are not part of its pattern matching engine. If any of these
+ are encountered by PCRE, an error is generated.
+
+ 6. The Perl escape sequences \p, \P, and \X are supported only if PCRE
+ is built with Unicode character property support. The properties that
+ can be tested with \p and \P are limited to the general category prop-
+ erties such as Lu and Nd, script names such as Greek or Han, and the
+ derived properties Any and L&.
+
+ 7. PCRE does support the \Q...\E escape for quoting substrings. Charac-
+ ters in between are treated as literals. This is slightly different
+ from Perl in that $ and @ are also handled as literals inside the
+ quotes. In Perl, they cause variable interpolation (but of course PCRE
+ does not have variables). Note the following examples:
+
+ Pattern PCRE matches Perl matches
+
+ \Qabc$xyz\E abc$xyz abc followed by the
+ contents of $xyz
+ \Qabc\$xyz\E abc\$xyz abc\$xyz
+ \Qabc\E\$\Qxyz\E abc$xyz abc$xyz
+
+ The \Q...\E sequence is recognized both inside and outside character
+ classes.
+
+ 8. Fairly obviously, PCRE does not support the (?{code}) and (??{code})
+ constructions. However, there is support for recursive patterns. This
+ is not available in Perl 5.8, but will be in Perl 5.10. Also, the PCRE
+ "callout" feature allows an external function to be called during pat-
+ tern matching. See the pcrecallout documentation for details.
+
+ 9. Subpatterns that are called recursively or as "subroutines" are
+ always treated as atomic groups in PCRE. This is like Python, but
+ unlike Perl.
+
+ 10. There are some differences that are concerned with the settings of
+ captured strings when part of a pattern is repeated. For example,
+ matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2
+ unset, but in PCRE it is set to "b".
+
+ 11. PCRE does support Perl 5.10's backtracking verbs (*ACCEPT),
+ (*FAIL), (*F), (*COMMIT), (*PRUNE), (*SKIP), and (*THEN), but only in
+ the forms without an argument. PCRE does not support (*MARK). If
+ (*ACCEPT) is within capturing parentheses, PCRE does not set that cap-
+ ture group; this is different to Perl.
+
+ 12. PCRE provides some extensions to the Perl regular expression facil-
+ ities. Perl 5.10 will include new features that are not in earlier
+ versions, some of which (such as named parentheses) have been in PCRE
+ for some time. This list is with respect to Perl 5.10:
+
+ (a) Although lookbehind assertions must match fixed length strings,
+ each alternative branch of a lookbehind assertion can match a different
+ length of string. Perl requires them all to have the same length.
+
+ (b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not set, the $
+ meta-character matches only at the very end of the string.
+
+ (c) If PCRE_EXTRA is set, a backslash followed by a letter with no spe-
+ cial meaning is faulted. Otherwise, like Perl, the backslash is quietly
+ ignored. (Perl can be made to issue a warning.)
+
+ (d) If PCRE_UNGREEDY is set, the greediness of the repetition quanti-
+ fiers is inverted, that is, by default they are not greedy, but if fol-
+ lowed by a question mark they are.
+
+ (e) PCRE_ANCHORED can be used at matching time to force a pattern to be
+ tried only at the first matching position in the subject string.
+
+ (f) The PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, and PCRE_NO_AUTO_CAP-
+ TURE options for pcre_exec() have no Perl equivalents.
+
+ (g) The \R escape sequence can be restricted to match only CR, LF, or
+ CRLF by the PCRE_BSR_ANYCRLF option.
+
+ (h) The callout facility is PCRE-specific.
+
+ (i) The partial matching facility is PCRE-specific.
+
+ (j) Patterns compiled by PCRE can be saved and re-used at a later time,
+ even on different hosts that have the other endianness.
+
+ (k) The alternative matching function (pcre_dfa_exec()) matches in a
+ different way and is not Perl-compatible.
+
+ (l) PCRE recognizes some special sequences such as (*CR) at the start
+ of a pattern that set overall options that cannot be changed within the
+ pattern.
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 11 September 2007
+ Copyright (c) 1997-2007 University of Cambridge.
+------------------------------------------------------------------------------
+
+
+PCREPATTERN(3) PCREPATTERN(3)
+
+
+NAME
+ PCRE - Perl-compatible regular expressions
+
+
+PCRE REGULAR EXPRESSION DETAILS
+
+ The syntax and semantics of the regular expressions that are supported
+ by PCRE are described in detail below. There is a quick-reference syn-
+ tax summary in the pcresyntax page. PCRE tries to match Perl syntax and
+ semantics as closely as it can. PCRE also supports some alternative
+ regular expression syntax (which does not conflict with the Perl syn-
+ tax) in order to provide some compatibility with regular expressions in
+ Python, .NET, and Oniguruma.
+
+ Perl's regular expressions are described in its own documentation, and
+ regular expressions in general are covered in a number of books, some
+ of which have copious examples. Jeffrey Friedl's "Mastering Regular
+ Expressions", published by O'Reilly, covers regular expressions in
+ great detail. This description of PCRE's regular expressions is
+ intended as reference material.
+
+ The original operation of PCRE was on strings of one-byte characters.
+ However, there is now also support for UTF-8 character strings. To use
+ this, you must build PCRE to include UTF-8 support, and then call
+ pcre_compile() with the PCRE_UTF8 option. How this affects pattern
+ matching is mentioned in several places below. There is also a summary
+ of UTF-8 features in the section on UTF-8 support in the main pcre
+ page.
+
+ The remainder of this document discusses the patterns that are sup-
+ ported by PCRE when its main matching function, pcre_exec(), is used.
+ From release 6.0, PCRE offers a second matching function,
+ pcre_dfa_exec(), which matches using a different algorithm that is not
+ Perl-compatible. Some of the features discussed below are not available
+ when pcre_dfa_exec() is used. The advantages and disadvantages of the
+ alternative function, and how it differs from the normal function, are
+ discussed in the pcrematching page.
+
+
+NEWLINE CONVENTIONS
+
+ PCRE supports five different conventions for indicating line breaks in
+ strings: a single CR (carriage return) character, a single LF (line-
+ feed) character, the two-character sequence CRLF, any of the three pre-
+ ceding, or any Unicode newline sequence. The pcreapi page has further
+ discussion about newlines, and shows how to set the newline convention
+ in the options arguments for the compiling and matching functions.
+
+ It is also possible to specify a newline convention by starting a pat-
+ tern string with one of the following five sequences:
+
+ (*CR) carriage return
+ (*LF) linefeed
+ (*CRLF) carriage return, followed by linefeed
+ (*ANYCRLF) any of the three above
+ (*ANY) all Unicode newline sequences
+
+ These override the default and the options given to pcre_compile(). For
+ example, on a Unix system where LF is the default newline sequence, the
+ pattern
+
+ (*CR)a.b
+
+ changes the convention to CR. That pattern matches "a\nb" because LF is
+ no longer a newline. Note that these special settings, which are not
+ Perl-compatible, are recognized only at the very start of a pattern,
+ and that they must be in upper case. If more than one of them is
+ present, the last one is used.
+
+ The newline convention does not affect what the \R escape sequence
+ matches. By default, this is any Unicode newline sequence, for Perl
+ compatibility. However, this can be changed; see the description of \R
+ in the section entitled "Newline sequences" below. A change of \R set-
+ ting can be combined with a change of newline convention.
+
+
+CHARACTERS AND METACHARACTERS
+
+ A regular expression is a pattern that is matched against a subject
+ string from left to right. Most characters stand for themselves in a
+ pattern, and match the corresponding characters in the subject. As a
+ trivial example, the pattern
+
+ The quick brown fox
+
+ matches a portion of a subject string that is identical to itself. When
+ caseless matching is specified (the PCRE_CASELESS option), letters are
+ matched independently of case. In UTF-8 mode, PCRE always understands
+ the concept of case for characters whose values are less than 128, so
+ caseless matching is always possible. For characters with higher val-
+ ues, the concept of case is supported if PCRE is compiled with Unicode
+ property support, but not otherwise. If you want to use caseless
+ matching for characters 128 and above, you must ensure that PCRE is
+ compiled with Unicode property support as well as with UTF-8 support.
+
+ The power of regular expressions comes from the ability to include
+ alternatives and repetitions in the pattern. These are encoded in the
+ pattern by the use of metacharacters, which do not stand for themselves
+ but instead are interpreted in some special way.
+
+ There are two different sets of metacharacters: those that are recog-
+ nized anywhere in the pattern except within square brackets, and those
+ that are recognized within square brackets. Outside square brackets,
+ the metacharacters are as follows:
+
+ \ general escape character with several uses
+ ^ assert start of string (or line, in multiline mode)
+ $ assert end of string (or line, in multiline mode)
+ . match any character except newline (by default)
+ [ start character class definition
+ | start of alternative branch
+ ( start subpattern
+ ) end subpattern
+ ? extends the meaning of (
+ also 0 or 1 quantifier
+ also quantifier minimizer
+ * 0 or more quantifier
+ + 1 or more quantifier
+ also "possessive quantifier"
+ { start min/max quantifier
+
+ Part of a pattern that is in square brackets is called a "character
+ class". In a character class the only metacharacters are:
+
+ \ general escape character
+ ^ negate the class, but only if the first character
+ - indicates character range
+ [ POSIX character class (only if followed by POSIX
+ syntax)
+ ] terminates the character class
+
+ The following sections describe the use of each of the metacharacters.
+
+
+BACKSLASH
+
+ The backslash character has several uses. Firstly, if it is followed by
+ a non-alphanumeric character, it takes away any special meaning that
+ character may have. This use of backslash as an escape character
+ applies both inside and outside character classes.
+
+ For example, if you want to match a * character, you write \* in the
+ pattern. This escaping action applies whether or not the following
+ character would otherwise be interpreted as a metacharacter, so it is
+ always safe to precede a non-alphanumeric with backslash to specify
+ that it stands for itself. In particular, if you want to match a back-
+ slash, you write \\.
+
+ If a pattern is compiled with the PCRE_EXTENDED option, whitespace in
+ the pattern (other than in a character class) and characters between a
+ # outside a character class and the next newline are ignored. An escap-
+ ing backslash can be used to include a whitespace or # character as
+ part of the pattern.
+
+ If you want to remove the special meaning from a sequence of charac-
+ ters, you can do so by putting them between \Q and \E. This is differ-
+ ent from Perl in that $ and @ are handled as literals in \Q...\E
+ sequences in PCRE, whereas in Perl, $ and @ cause variable interpola-
+ tion. Note the following examples:
+
+ Pattern PCRE matches Perl matches
+
+ \Qabc$xyz\E abc$xyz abc followed by the
+ contents of $xyz
+ \Qabc\$xyz\E abc\$xyz abc\$xyz
+ \Qabc\E\$\Qxyz\E abc$xyz abc$xyz
+
+ The \Q...\E sequence is recognized both inside and outside character
+ classes.
+
+ Non-printing characters
+
+ A second use of backslash provides a way of encoding non-printing char-
+ acters in patterns in a visible manner. There is no restriction on the
+ appearance of non-printing characters, apart from the binary zero that
+ terminates a pattern, but when a pattern is being prepared by text
+ editing, it is usually easier to use one of the following escape
+ sequences than the binary character it represents:
+
+ \a alarm, that is, the BEL character (hex 07)
+ \cx "control-x", where x is any character
+ \e escape (hex 1B)
+ \f formfeed (hex 0C)
+ \n linefeed (hex 0A)
+ \r carriage return (hex 0D)
+ \t tab (hex 09)
+ \ddd character with octal code ddd, or backreference
+ \xhh character with hex code hh
+ \x{hhh..} character with hex code hhh..
+
+ The precise effect of \cx is as follows: if x is a lower case letter,
+ it is converted to upper case. Then bit 6 of the character (hex 40) is
+ inverted. Thus \cz becomes hex 1A, but \c{ becomes hex 3B, while \c;
+ becomes hex 7B.
+
+ After \x, from zero to two hexadecimal digits are read (letters can be
+ in upper or lower case). Any number of hexadecimal digits may appear
+ between \x{ and }, but the value of the character code must be less
+ than 256 in non-UTF-8 mode, and less than 2**31 in UTF-8 mode. That is,
+ the maximum value in hexadecimal is 7FFFFFFF. Note that this is bigger
+ than the largest Unicode code point, which is 10FFFF.
+
+ If characters other than hexadecimal digits appear between \x{ and },
+ or if there is no terminating }, this form of escape is not recognized.
+ Instead, the initial \x will be interpreted as a basic hexadecimal
+ escape, with no following digits, giving a character whose value is
+ zero.
+
+ Characters whose value is less than 256 can be defined by either of the
+ two syntaxes for \x. There is no difference in the way they are han-
+ dled. For example, \xdc is exactly the same as \x{dc}.
+
+ After \0 up to two further octal digits are read. If there are fewer
+ than two digits, just those that are present are used. Thus the
+ sequence \0\x\07 specifies two binary zeros followed by a BEL character
+ (code value 7). Make sure you supply two digits after the initial zero
+ if the pattern character that follows is itself an octal digit.
+
+ The handling of a backslash followed by a digit other than 0 is compli-
+ cated. Outside a character class, PCRE reads it and any following dig-
+ its as a decimal number. If the number is less than 10, or if there
+ have been at least that many previous capturing left parentheses in the
+ expression, the entire sequence is taken as a back reference. A
+ description of how this works is given later, following the discussion
+ of parenthesized subpatterns.
+
+ Inside a character class, or if the decimal number is greater than 9
+ and there have not been that many capturing subpatterns, PCRE re-reads
+ up to three octal digits following the backslash, and uses them to gen-
+ erate a data character. Any subsequent digits stand for themselves. In
+ non-UTF-8 mode, the value of a character specified in octal must be
+ less than \400. In UTF-8 mode, values up to \777 are permitted. For
+ example:
+
+ \040 is another way of writing a space
+ \40 is the same, provided there are fewer than 40
+ previous capturing subpatterns
+ \7 is always a back reference
+ \11 might be a back reference, or another way of
+ writing a tab
+ \011 is always a tab
+ \0113 is a tab followed by the character "3"
+ \113 might be a back reference, otherwise the
+ character with octal code 113
+ \377 might be a back reference, otherwise
+ the byte consisting entirely of 1 bits
+ \81 is either a back reference, or a binary zero
+ followed by the two characters "8" and "1"
+
+ Note that octal values of 100 or greater must not be introduced by a
+ leading zero, because no more than three octal digits are ever read.
+
+ All the sequences that define a single character value can be used both
+ inside and outside character classes. In addition, inside a character
+ class, the sequence \b is interpreted as the backspace character (hex
+ 08), and the sequences \R and \X are interpreted as the characters "R"
+ and "X", respectively. Outside a character class, these sequences have
+ different meanings (see below).
+
+ Absolute and relative back references
+
+ The sequence \g followed by an unsigned or a negative number, option-
+ ally enclosed in braces, is an absolute or relative back reference. A
+ named back reference can be coded as \g{name}. Back references are dis-
+ cussed later, following the discussion of parenthesized subpatterns.
+
+ Absolute and relative subroutine calls
+
+ For compatibility with Oniguruma, the non-Perl syntax \g followed by a
+ name or a number enclosed either in angle brackets or single quotes, is
+ an alternative syntax for referencing a subpattern as a "subroutine".
+ Details are discussed later. Note that \g{...} (Perl syntax) and
+ \g<...> (Oniguruma syntax) are not synonymous. The former is a back
+ reference; the latter is a subroutine call.
+
+ Generic character types
+
+ Another use of backslash is for specifying generic character types. The
+ following are always recognized:
+
+ \d any decimal digit
+ \D any character that is not a decimal digit
+ \h any horizontal whitespace character
+ \H any character that is not a horizontal whitespace character
+ \s any whitespace character
+ \S any character that is not a whitespace character
+ \v any vertical whitespace character
+ \V any character that is not a vertical whitespace character
+ \w any "word" character
+ \W any "non-word" character
+
+ Each pair of escape sequences partitions the complete set of characters
+ into two disjoint sets. Any given character matches one, and only one,
+ of each pair.
+
+ These character type sequences can appear both inside and outside char-
+ acter classes. They each match one character of the appropriate type.
+ If the current matching point is at the end of the subject string, all
+ of them fail, since there is no character to match.
+
+ For compatibility with Perl, \s does not match the VT character (code
+ 11). This makes it different from the POSIX "space" class. The \s
+ characters are HT (9), LF (10), FF (12), CR (13), and space (32). If
+ "use locale;" is included in a Perl script, \s may match the VT charac-
+ ter. In PCRE, it never does.
+
+ In UTF-8 mode, characters with values greater than 128 never match \d,
+ \s, or \w, and always match \D, \S, and \W. This is true even when Uni-
+ code character property support is available. These sequences retain
+ their original meanings from before UTF-8 support was available, mainly
+ for efficiency reasons.
+
+ The sequences \h, \H, \v, and \V are Perl 5.10 features. In contrast to
+ the other sequences, these do match certain high-valued codepoints in
+ UTF-8 mode. The horizontal space characters are:
+
+ U+0009 Horizontal tab
+ U+0020 Space
+ U+00A0 Non-break space
+ U+1680 Ogham space mark
+ U+180E Mongolian vowel separator
+ U+2000 En quad
+ U+2001 Em quad
+ U+2002 En space
+ U+2003 Em space
+ U+2004 Three-per-em space
+ U+2005 Four-per-em space
+ U+2006 Six-per-em space
+ U+2007 Figure space
+ U+2008 Punctuation space
+ U+2009 Thin space
+ U+200A Hair space
+ U+202F Narrow no-break space
+ U+205F Medium mathematical space
+ U+3000 Ideographic space
+
+ The vertical space characters are:
+
+ U+000A Linefeed
+ U+000B Vertical tab
+ U+000C Formfeed
+ U+000D Carriage return
+ U+0085 Next line
+ U+2028 Line separator
+ U+2029 Paragraph separator
+
+ A "word" character is an underscore or any character less than 256 that
+ is a letter or digit. The definition of letters and digits is con-
+ trolled by PCRE's low-valued character tables, and may vary if locale-
+ specific matching is taking place (see "Locale support" in the pcreapi
+ page). For example, in a French locale such as "fr_FR" in Unix-like
+ systems, or "french" in Windows, some character codes greater than 128
+ are used for accented letters, and these are matched by \w. The use of
+ locales with Unicode is discouraged.
+
+ Newline sequences
+
+ Outside a character class, by default, the escape sequence \R matches
+ any Unicode newline sequence. This is a Perl 5.10 feature. In non-UTF-8
+ mode \R is equivalent to the following:
+
+ (?>\r\n|\n|\x0b|\f|\r|\x85)
+
+ This is an example of an "atomic group", details of which are given
+ below. This particular group matches either the two-character sequence
+ CR followed by LF, or one of the single characters LF (linefeed,
+ U+000A), VT (vertical tab, U+000B), FF (formfeed, U+000C), CR (carriage
+ return, U+000D), or NEL (next line, U+0085). The two-character sequence
+ is treated as a single unit that cannot be split.
+
+ In UTF-8 mode, two additional characters whose codepoints are greater
+ than 255 are added: LS (line separator, U+2028) and PS (paragraph sepa-
+ rator, U+2029). Unicode character property support is not needed for
+ these characters to be recognized.
+
+ It is possible to restrict \R to match only CR, LF, or CRLF (instead of
+ the complete set of Unicode line endings) by setting the option
+ PCRE_BSR_ANYCRLF either at compile time or when the pattern is matched.
+ (BSR is an abbreviation for "backslash R".) This can be made the default
+ when PCRE is built; if this is the case, the other behaviour can be
+ requested via the PCRE_BSR_UNICODE option. It is also possible to
+ specify these settings by starting a pattern string with one of the
+ following sequences:
+
+ (*BSR_ANYCRLF) CR, LF, or CRLF only
+ (*BSR_UNICODE) any Unicode newline sequence
+
+ These override the default and the options given to pcre_compile(), but
+ they can be overridden by options given to pcre_exec(). Note that these
+ special settings, which are not Perl-compatible, are recognized only at
+ the very start of a pattern, and that they must be in upper case. If
+ more than one of them is present, the last one is used. They can be
+ combined with a change of newline convention, for example, a pattern
+ can start with:
+
+ (*ANY)(*BSR_ANYCRLF)
+
+ Inside a character class, \R matches the letter "R".
+
+ Unicode character properties
+
+ When PCRE is built with Unicode character property support, three addi-
+ tional escape sequences that match characters with specific properties
+ are available. When not in UTF-8 mode, these sequences are of course
+ limited to testing characters whose codepoints are less than 256, but
+ they do work in this mode. The extra escape sequences are:
+
+ \p{xx} a character with the xx property
+ \P{xx} a character without the xx property
+ \X an extended Unicode sequence
+
+ The property names represented by xx above are limited to the Unicode
+ script names, the general category properties, and "Any", which matches
+ any character (including newline). Other properties such as "InMusical-
+ Symbols" are not currently supported by PCRE. Note that \P{Any} does
+ not match any characters, so always causes a match failure.
+
+ Sets of Unicode characters are defined as belonging to certain scripts.
+ A character from one of these sets can be matched using a script name.
+ For example:
+
+ \p{Greek}
+ \P{Han}
+
+ Those that are not part of an identified script are lumped together as
+ "Common". The current list of scripts is:
+
+ Arabic, Armenian, Balinese, Bengali, Bopomofo, Braille, Buginese,
+ Buhid, Canadian_Aboriginal, Cherokee, Common, Coptic, Cuneiform,
+ Cypriot, Cyrillic, Deseret, Devanagari, Ethiopic, Georgian, Glagolitic,
+ Gothic, Greek, Gujarati, Gurmukhi, Han, Hangul, Hanunoo, Hebrew, Hira-
+ gana, Inherited, Kannada, Katakana, Kharoshthi, Khmer, Lao, Latin,
+ Limbu, Linear_B, Malayalam, Mongolian, Myanmar, New_Tai_Lue, Nko,
+ Ogham, Old_Italic, Old_Persian, Oriya, Osmanya, Phags_Pa, Phoenician,
+ Runic, Shavian, Sinhala, Syloti_Nagri, Syriac, Tagalog, Tagbanwa,
+ Tai_Le, Tamil, Telugu, Thaana, Thai, Tibetan, Tifinagh, Ugaritic, Yi.
+
+ Each character has exactly one general category property, specified by
+ a two-letter abbreviation. For compatibility with Perl, negation can be
+ specified by including a circumflex between the opening brace and the
+ property name. For example, \p{^Lu} is the same as \P{Lu}.
+
+ If only one letter is specified with \p or \P, it includes all the gen-
+ eral category properties that start with that letter. In this case, in
+ the absence of negation, the curly brackets in the escape sequence are
+ optional; these two examples have the same effect:
+
+ \p{L}
+ \pL
+
+ The following general category property codes are supported:
+
+ C Other
+ Cc Control
+ Cf Format
+ Cn Unassigned
+ Co Private use
+ Cs Surrogate
+
+ L Letter
+ Ll Lower case letter
+ Lm Modifier letter
+ Lo Other letter
+ Lt Title case letter
+ Lu Upper case letter
+
+ M Mark
+ Mc Spacing mark
+ Me Enclosing mark
+ Mn Non-spacing mark
+
+ N Number
+ Nd Decimal number
+ Nl Letter number
+ No Other number
+
+ P Punctuation
+ Pc Connector punctuation
+ Pd Dash punctuation
+ Pe Close punctuation
+ Pf Final punctuation
+ Pi Initial punctuation
+ Po Other punctuation
+ Ps Open punctuation
+
+ S Symbol
+ Sc Currency symbol
+ Sk Modifier symbol
+ Sm Mathematical symbol
+ So Other symbol
+
+ Z Separator
+ Zl Line separator
+ Zp Paragraph separator
+ Zs Space separator
+
+ The special property L& is also supported: it matches a character that
+ has the Lu, Ll, or Lt property, in other words, a letter that is not
+ classified as a modifier or "other".
+
+ The Cs (Surrogate) property applies only to characters in the range
+ U+D800 to U+DFFF. Such characters are not valid in UTF-8 strings (see
+ RFC 3629) and so cannot be tested by PCRE, unless UTF-8 validity check-
+ ing has been turned off (see the discussion of PCRE_NO_UTF8_CHECK in
+ the pcreapi page).
+
+ The long synonyms for these properties that Perl supports (such as
+ \p{Letter}) are not supported by PCRE, nor is it permitted to prefix
+ any of these properties with "Is".
+
+ No character that is in the Unicode table has the Cn (unassigned) prop-
+ erty. Instead, this property is assumed for any code point that is not
+ in the Unicode table.
+
+ Specifying caseless matching does not affect these escape sequences.
+ For example, \p{Lu} always matches only upper case letters.
+
+ The \X escape matches any number of Unicode characters that form an
+ extended Unicode sequence. \X is equivalent to
+
+ (?>\PM\pM*)
+
+ That is, it matches a character without the "mark" property, followed
+ by zero or more characters with the "mark" property, and treats the
+ sequence as an atomic group (see below). Characters with the "mark"
+ property are typically accents that affect the preceding character.
+ None of them have codepoints less than 256, so in non-UTF-8 mode \X
+ matches any one character.
+
+ Matching characters by Unicode property is not fast, because PCRE has
+ to search a structure that contains data for over fifteen thousand
+ characters. That is why the traditional escape sequences such as \d and
+ \w do not use Unicode properties in PCRE.
+
+ Resetting the match start
+
+ The escape sequence \K, which is a Perl 5.10 feature, causes any previ-
+ ously matched characters not to be included in the final matched
+ sequence. For example, the pattern:
+
+ foo\Kbar
+
+ matches "foobar", but reports that it has matched "bar". This feature
+ is similar to a lookbehind assertion (described below). However, in
+ this case, the part of the subject before the real match does not have
+ to be of fixed length, as lookbehind assertions do. The use of \K does
+ not interfere with the setting of captured substrings. For example,
+ when the pattern
+
+ (foo)\Kbar
+
+ matches "foobar", the first substring is still set to "foo".
+
+ Simple assertions
+
+ The final use of backslash is for certain simple assertions. An asser-
+ tion specifies a condition that has to be met at a particular point in
+ a match, without consuming any characters from the subject string. The
+ use of subpatterns for more complicated assertions is described below.
+ The backslashed assertions are:
+
+ \b matches at a word boundary
+ \B matches when not at a word boundary
+ \A matches at the start of the subject
+ \Z matches at the end of the subject
+ also matches before a newline at the end of the subject
+ \z matches only at the end of the subject
+ \G matches at the first matching position in the subject
+
+ These assertions may not appear in character classes (but note that \b
+ has a different meaning, namely the backspace character, inside a char-
+ acter class).
+
+ A word boundary is a position in the subject string where the current
+ character and the previous character do not both match \w or \W (i.e.
+ one matches \w and the other matches \W), or the start or end of the
+ string if the first or last character matches \w, respectively.
+
+ The \A, \Z, and \z assertions differ from the traditional circumflex
+ and dollar (described in the next section) in that they only ever match
+ at the very start and end of the subject string, whatever options are
+ set. Thus, they are independent of multiline mode. These three asser-
+ tions are not affected by the PCRE_NOTBOL or PCRE_NOTEOL options, which
+ affect only the behaviour of the circumflex and dollar metacharacters.
+ However, if the startoffset argument of pcre_exec() is non-zero, indi-
+ cating that matching is to start at a point other than the beginning of
+ the subject, \A can never match. The difference between \Z and \z is
+ that \Z matches before a newline at the end of the string as well as at
+ the very end, whereas \z matches only at the end.
+
+ The \G assertion is true only when the current matching position is at
+ the start point of the match, as specified by the startoffset argument
+ of pcre_exec(). It differs from \A when the value of startoffset is
+ non-zero. By calling pcre_exec() multiple times with appropriate argu-
+ ments, you can mimic Perl's /g option, and it is in this kind of imple-
+ mentation where \G can be useful.
+
+ Note, however, that PCRE's interpretation of \G, as the start of the
+ current match, is subtly different from Perl's, which defines it as the
+ end of the previous match. In Perl, these can be different when the
+ previously matched string was empty. Because PCRE does just one match
+ at a time, it cannot reproduce this behaviour.
+
+ If all the alternatives of a pattern begin with \G, the expression is
+ anchored to the starting match position, and the "anchored" flag is set
+ in the compiled regular expression.
+
+
+CIRCUMFLEX AND DOLLAR
+
+ Outside a character class, in the default matching mode, the circumflex
+ character is an assertion that is true only if the current matching
+ point is at the start of the subject string. If the startoffset argu-
+ ment of pcre_exec() is non-zero, circumflex can never match if the
+ PCRE_MULTILINE option is unset. Inside a character class, circumflex
+ has an entirely different meaning (see below).
+
+ Circumflex need not be the first character of the pattern if a number
+ of alternatives are involved, but it should be the first thing in each
+ alternative in which it appears if the pattern is ever to match that
+ branch. If all possible alternatives start with a circumflex, that is,
+ if the pattern is constrained to match only at the start of the sub-
+ ject, it is said to be an "anchored" pattern. (There are also other
+ constructs that can cause a pattern to be anchored.)
+
+ A dollar character is an assertion that is true only if the current
+ matching point is at the end of the subject string, or immediately
+ before a newline at the end of the string (by default). Dollar need not
+ be the last character of the pattern if a number of alternatives are
+ involved, but it should be the last item in any branch in which it
+ appears. Dollar has no special meaning in a character class.
+
+ The meaning of dollar can be changed so that it matches only at the
+ very end of the string, by setting the PCRE_DOLLAR_ENDONLY option at
+ compile time. This does not affect the \Z assertion.
+
+ The meanings of the circumflex and dollar characters are changed if the
+ PCRE_MULTILINE option is set. When this is the case, a circumflex
+ matches immediately after internal newlines as well as at the start of
+ the subject string. It does not match after a newline that ends the
+ string. A dollar matches before any newlines in the string, as well as
+ at the very end, when PCRE_MULTILINE is set. When newline is specified
+ as the two-character sequence CRLF, isolated CR and LF characters do
+ not indicate newlines.
+
+ For example, the pattern /^abc$/ matches the subject string "def\nabc"
+ (where \n represents a newline) in multiline mode, but not otherwise.
+ Consequently, patterns that are anchored in single line mode because
+ all branches start with ^ are not anchored in multiline mode, and a
+ match for circumflex is possible when the startoffset argument of
+ pcre_exec() is non-zero. The PCRE_DOLLAR_ENDONLY option is ignored if
+ PCRE_MULTILINE is set.
+
+ Note that the sequences \A, \Z, and \z can be used to match the start
+ and end of the subject in both modes, and if all branches of a pattern
+ start with \A it is always anchored, whether or not PCRE_MULTILINE is
+ set.
+
+
+FULL STOP (PERIOD, DOT)
+
+ Outside a character class, a dot in the pattern matches any one charac-
+ ter in the subject string except (by default) a character that signi-
+ fies the end of a line. In UTF-8 mode, the matched character may be
+ more than one byte long.
+
+ When a line ending is defined as a single character, dot never matches
+ that character; when the two-character sequence CRLF is used, dot does
+ not match CR if it is immediately followed by LF, but otherwise it
+ matches all characters (including isolated CRs and LFs). When any Uni-
+ code line endings are being recognized, dot does not match CR or LF or
+ any of the other line ending characters.
+
+ The behaviour of dot with regard to newlines can be changed. If the
+ PCRE_DOTALL option is set, a dot matches any one character, without
+ exception. If the two-character sequence CRLF is present in the subject
+ string, it takes two dots to match it.
+
+ The handling of dot is entirely independent of the handling of circum-
+ flex and dollar, the only relationship being that they both involve
+ newlines. Dot has no special meaning in a character class.
+
+
+MATCHING A SINGLE BYTE
+
+ Outside a character class, the escape sequence \C matches any one byte,
+ both in and out of UTF-8 mode. Unlike a dot, it always matches any
+ line-ending characters. The feature is provided in Perl in order to
+ match individual bytes in UTF-8 mode. Because it breaks up UTF-8 char-
+ acters into individual bytes, what remains in the string may be a mal-
+ formed UTF-8 string. For this reason, the \C escape sequence is best
+ avoided.
+
+ PCRE does not allow \C to appear in lookbehind assertions (described
+ below), because in UTF-8 mode this would make it impossible to calcu-
+ late the length of the lookbehind.
+
+
+SQUARE BRACKETS AND CHARACTER CLASSES
+
+ An opening square bracket introduces a character class, terminated by a
+ closing square bracket. A closing square bracket on its own is not spe-
+ cial. If a closing square bracket is required as a member of the class,
+ it should be the first data character in the class (after an initial
+ circumflex, if present) or escaped with a backslash.
+
+ A character class matches a single character in the subject. In UTF-8
+ mode, the character may occupy more than one byte. A matched character
+ must be in the set of characters defined by the class, unless the first
+ character in the class definition is a circumflex, in which case the
+ subject character must not be in the set defined by the class. If a
+ circumflex is actually required as a member of the class, ensure it is
+ not the first character, or escape it with a backslash.
+
+ For example, the character class [aeiou] matches any lower case vowel,
+ while [^aeiou] matches any character that is not a lower case vowel.
+ Note that a circumflex is just a convenient notation for specifying the
+ characters that are in the class by enumerating those that are not. A
+ class that starts with a circumflex is not an assertion: it still con-
+ sumes a character from the subject string, and therefore it fails if
+ the current pointer is at the end of the string.
+
+ In UTF-8 mode, characters with values greater than 255 can be included
+ in a class as a literal string of bytes, or by using the \x{ escaping
+ mechanism.
+
+ When caseless matching is set, any letters in a class represent both
+ their upper case and lower case versions, so for example, a caseless
+ [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not
+ match "A", whereas a caseful version would. In UTF-8 mode, PCRE always
+ understands the concept of case for characters whose values are less
+ than 128, so caseless matching is always possible. For characters with
+ higher values, the concept of case is supported if PCRE is compiled
+ with Unicode property support, but not otherwise. If you want to use
+ caseless matching for characters 128 and above, you must ensure that
+ PCRE is compiled with Unicode property support as well as with UTF-8
+ support.
+
+ Characters that might indicate line breaks are never treated in any
+ special way when matching character classes, whatever line-ending
+ sequence is in use, and whatever setting of the PCRE_DOTALL and
+ PCRE_MULTILINE options is used. A class such as [^a] always matches one
+ of these characters.
+
+ The minus (hyphen) character can be used to specify a range of charac-
+ ters in a character class. For example, [d-m] matches any letter
+ between d and m, inclusive. If a minus character is required in a
+ class, it must be escaped with a backslash or appear in a position
+ where it cannot be interpreted as indicating a range, typically as the
+ first or last character in the class.
+
+ It is not possible to have the literal character "]" as the end charac-
+ ter of a range. A pattern such as [W-]46] is interpreted as a class of
+ two characters ("W" and "-") followed by a literal string "46]", so it
+ would match "W46]" or "-46]". However, if the "]" is escaped with a
+ backslash it is interpreted as the end of range, so [W-\]46] is inter-
+ preted as a class containing a range followed by two other characters.
+ The octal or hexadecimal representation of "]" can also be used to end
+ a range.
+
+ Ranges operate in the collating sequence of character values. They can
+ also be used for characters specified numerically, for example
+ [\000-\037]. In UTF-8 mode, ranges can include characters whose values
+ are greater than 255, for example [\x{100}-\x{2ff}].
+
+ If a range that includes letters is used when caseless matching is set,
+ it matches the letters in either case. For example, [W-c] is equivalent
+ to [][\\^_`wxyzabc], matched caselessly, and in non-UTF-8 mode, if
+ character tables for a French locale are in use, [\xc8-\xcb] matches
+ accented E characters in both cases. In UTF-8 mode, PCRE supports the
+ concept of case for characters with values greater than 127 only when
+ it is compiled with Unicode property support.
+
+ The character types \d, \D, \p, \P, \s, \S, \w, and \W may also appear
+ in a character class, and add the characters that they match to the
+ class. For example, [\dABCDEF] matches any hexadecimal digit. A circum-
+ flex can conveniently be used with the upper case character types to
+ specify a more restricted set of characters than the matching lower
+ case type. For example, the class [^\W_] matches any letter or digit,
+ but not underscore.
+
+ The only metacharacters that are recognized in character classes are
+ backslash, hyphen (only where it can be interpreted as specifying a
+ range), circumflex (only at the start), opening square bracket (only
+ when it can be interpreted as introducing a POSIX class name - see the
+ next section), and the terminating closing square bracket. However,
+ escaping other non-alphanumeric characters does no harm.
+
+
+POSIX CHARACTER CLASSES
+
+ Perl supports the POSIX notation for character classes. This uses names
+ enclosed by [: and :] within the enclosing square brackets. PCRE also
+ supports this notation. For example,
+
+ [01[:alpha:]%]
+
+ matches "0", "1", any alphabetic character, or "%". The supported class
+ names are
+
+ alnum letters and digits
+ alpha letters
+ ascii character codes 0 - 127
+ blank space or tab only
+ cntrl control characters
+ digit decimal digits (same as \d)
+ graph printing characters, excluding space
+ lower lower case letters
+ print printing characters, including space
+ punct printing characters, excluding letters and digits
+ space white space (not quite the same as \s)
+ upper upper case letters
+ word "word" characters (same as \w)
+ xdigit hexadecimal digits
+
+ The "space" characters are HT (9), LF (10), VT (11), FF (12), CR (13),
+ and space (32). Notice that this list includes the VT character (code
+ 11). This makes "space" different to \s, which does not include VT (for
+ Perl compatibility).
+
+ The name "word" is a Perl extension, and "blank" is a GNU extension
+ from Perl 5.8. Another Perl extension is negation, which is indicated
+ by a ^ character after the colon. For example,
+
+ [12[:^digit:]]
+
+ matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the
+ POSIX syntax [.ch.] and [=ch=] where "ch" is a "collating element", but
+ these are not supported, and an error is given if they are encountered.
+
+ In UTF-8 mode, characters with values greater than 128 do not match any
+ of the POSIX character classes.
+
+
+VERTICAL BAR
+
+ Vertical bar characters are used to separate alternative patterns. For
+ example, the pattern
+
+ gilbert|sullivan
+
+ matches either "gilbert" or "sullivan". Any number of alternatives may
+ appear, and an empty alternative is permitted (matching the empty
+ string). The matching process tries each alternative in turn, from left
+ to right, and the first one that succeeds is used. If the alternatives
+ are within a subpattern (defined below), "succeeds" means matching the
+ rest of the main pattern as well as the alternative in the subpattern.
+
+
+INTERNAL OPTION SETTING
+
+ The settings of the PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, and
+ PCRE_EXTENDED options (which are Perl-compatible) can be changed from
+ within the pattern by a sequence of Perl option letters enclosed
+ between "(?" and ")". The option letters are
+
+ i for PCRE_CASELESS
+ m for PCRE_MULTILINE
+ s for PCRE_DOTALL
+ x for PCRE_EXTENDED
+
+ For example, (?im) sets caseless, multiline matching. It is also possi-
+ ble to unset these options by preceding the letter with a hyphen, and a
+ combined setting and unsetting such as (?im-sx), which sets PCRE_CASE-
+ LESS and PCRE_MULTILINE while unsetting PCRE_DOTALL and PCRE_EXTENDED,
+ is also permitted. If a letter appears both before and after the
+ hyphen, the option is unset.
+
+ The PCRE-specific options PCRE_DUPNAMES, PCRE_UNGREEDY, and PCRE_EXTRA
+ can be changed in the same way as the Perl-compatible options by using
+ the characters J, U and X respectively.
+
+ When an option change occurs at top level (that is, not inside subpat-
+ tern parentheses), the change applies to the remainder of the pattern
+ that follows. If the change is placed right at the start of a pattern,
+ PCRE extracts it into the global options (and it will therefore show up
+ in data extracted by the pcre_fullinfo() function).
+
+ An option change within a subpattern (see below for a description of
+ subpatterns) affects only that part of the current pattern that follows
+ it, so
+
+ (a(?i)b)c
+
+ matches abc and aBc and no other strings (assuming PCRE_CASELESS is not
+ used). By this means, options can be made to have different settings
+ in different parts of the pattern. Any changes made in one alternative
+ do carry on into subsequent branches within the same subpattern. For
+ example,
+
+ (a(?i)b|c)
+
+ matches "ab", "aB", "c", and "C", even though when matching "C" the
+ first branch is abandoned before the option setting. This is because
+ the effects of option settings happen at compile time. There would be
+ some very weird behaviour otherwise.
+
+ Note: There are other PCRE-specific options that can be set by the
+ application when the compile or match functions are called. In some
+ cases the pattern can contain special leading sequences to override
+ what the application has set or what has been defaulted. Details are
+ given in the section entitled "Newline sequences" above.
+
+
+SUBPATTERNS
+
+ Subpatterns are delimited by parentheses (round brackets), which can be
+ nested. Turning part of a pattern into a subpattern does two things:
+
+ 1. It localizes a set of alternatives. For example, the pattern
+
+ cat(aract|erpillar|)
+
+ matches one of the words "cat", "cataract", or "caterpillar". Without
+ the parentheses, it would match "cataract", "erpillar" or an empty
+ string.
+
+ 2. It sets up the subpattern as a capturing subpattern. This means
+ that, when the whole pattern matches, that portion of the subject
+ string that matched the subpattern is passed back to the caller via the
+ ovector argument of pcre_exec(). Opening parentheses are counted from
+ left to right (starting from 1) to obtain numbers for the capturing
+ subpatterns.
+
+ For example, if the string "the red king" is matched against the pat-
+ tern
+
+ the ((red|white) (king|queen))
+
+ the captured substrings are "red king", "red", and "king", and are num-
+ bered 1, 2, and 3, respectively.
+
+ The fact that plain parentheses fulfil two functions is not always
+ helpful. There are often times when a grouping subpattern is required
+ without a capturing requirement. If an opening parenthesis is followed
+ by a question mark and a colon, the subpattern does not do any captur-
+ ing, and is not counted when computing the number of any subsequent
+ capturing subpatterns. For example, if the string "the white queen" is
+ matched against the pattern
+
+ the ((?:red|white) (king|queen))
+
+ the captured substrings are "white queen" and "queen", and are numbered
+ 1 and 2. The maximum number of capturing subpatterns is 65535.
+
+ As a convenient shorthand, if any option settings are required at the
+ start of a non-capturing subpattern, the option letters may appear
+ between the "?" and the ":". Thus the two patterns
+
+ (?i:saturday|sunday)
+ (?:(?i)saturday|sunday)
+
+ match exactly the same set of strings. Because alternative branches are
+ tried from left to right, and options are not reset until the end of
+ the subpattern is reached, an option setting in one branch does affect
+ subsequent branches, so the above patterns match "SUNDAY" as well as
+ "Saturday".
+
+
+DUPLICATE SUBPATTERN NUMBERS
+
+ Perl 5.10 introduced a feature whereby each alternative in a subpattern
+ uses the same numbers for its capturing parentheses. Such a subpattern
+ starts with (?| and is itself a non-capturing subpattern. For example,
+ consider this pattern:
+
+ (?|(Sat)ur|(Sun))day
+
+ Because the two alternatives are inside a (?| group, both sets of cap-
+ turing parentheses are numbered one. Thus, when the pattern matches,
+ you can look at captured substring number one, whichever alternative
+ matched. This construct is useful when you want to capture part, but
+ not all, of one of a number of alternatives. Inside a (?| group, paren-
+ theses are numbered as usual, but the number is reset at the start of
+ each branch. The numbers of any capturing buffers that follow the sub-
+ pattern start after the highest number used in any branch. The follow-
+ ing example is taken from the Perl documentation. The numbers under-
+ neath show in which buffer the captured content will be stored.
+
+ # before ---------------branch-reset----------- after
+ / ( a ) (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x
+ # 1 2 2 3 2 3 4
+
+ A backreference or a recursive call to a numbered subpattern always
+ refers to the first one in the pattern with the given number.
+
+ An alternative approach to using this "branch reset" feature is to use
+ duplicate named subpatterns, as described in the next section.
+
+
+NAMED SUBPATTERNS
+
+ Identifying capturing parentheses by number is simple, but it can be
+ very hard to keep track of the numbers in complicated regular expres-
+ sions. Furthermore, if an expression is modified, the numbers may
+ change. To help with this difficulty, PCRE supports the naming of sub-
+ patterns. This feature was not added to Perl until release 5.10. Python
+ had the feature earlier, and PCRE introduced it at release 4.0, using
+ the Python syntax. PCRE now supports both the Perl and the Python syn-
+ tax.
+
+ In PCRE, a subpattern can be named in one of three ways: (?<name>...)
+ or (?'name'...) as in Perl, or (?P<name>...) as in Python. References
+ to capturing parentheses from other parts of the pattern, such as back-
+ references, recursion, and conditions, can be made by name as well as
+ by number.
+
+ Names consist of up to 32 alphanumeric characters and underscores.
+ Named capturing parentheses are still allocated numbers as well as
+ names, exactly as if the names were not present. The PCRE API provides
+ function calls for extracting the name-to-number translation table from
+ a compiled pattern. There is also a convenience function for extracting
+ a captured substring by name.
+
+ By default, a name must be unique within a pattern, but it is possible
+ to relax this constraint by setting the PCRE_DUPNAMES option at compile
+ time. This can be useful for patterns where only one instance of the
+ named parentheses can match. Suppose you want to match the name of a
+ weekday, either as a 3-letter abbreviation or as the full name, and in
+ both cases you want to extract the abbreviation. This pattern (ignoring
+ the line breaks) does the job:
+
+ (?<DN>Mon|Fri|Sun)(?:day)?|
+ (?<DN>Tue)(?:sday)?|
+ (?<DN>Wed)(?:nesday)?|
+ (?<DN>Thu)(?:rsday)?|
+ (?<DN>Sat)(?:urday)?
+
+ There are five capturing substrings, but only one is ever set after a
+ match. (An alternative way of solving this problem is to use a "branch
+ reset" subpattern, as described in the previous section.)
+
+ The convenience function for extracting the data by name returns the
+ substring for the first (and in this example, the only) subpattern of
+ that name that matched. This saves searching to find which numbered
+ subpattern it was. If you make a reference to a non-unique named sub-
+ pattern from elsewhere in the pattern, the one that corresponds to the
+ lowest number is used. For further details of the interfaces for han-
+ dling named subpatterns, see the pcreapi documentation.
+
+
+REPETITION
+
+ Repetition is specified by quantifiers, which can follow any of the
+ following items:
+
+ a literal data character
+ the dot metacharacter
+ the \C escape sequence
+ the \X escape sequence (in UTF-8 mode with Unicode properties)
+ the \R escape sequence
+ an escape such as \d that matches a single character
+ a character class
+ a back reference (see next section)
+ a parenthesized subpattern (unless it is an assertion)
+
+ The general repetition quantifier specifies a minimum and maximum num-
+ ber of permitted matches, by giving the two numbers in curly brackets
+ (braces), separated by a comma. The numbers must be less than 65536,
+ and the first must be less than or equal to the second. For example:
+
+ z{2,4}
+
+ matches "zz", "zzz", or "zzzz". A closing brace on its own is not a
+ special character. If the second number is omitted, but the comma is
+ present, there is no upper limit; if the second number and the comma
+ are both omitted, the quantifier specifies an exact number of required
+ matches. Thus
+
+ [aeiou]{3,}
+
+ matches at least 3 successive vowels, but may match many more, while
+
+ \d{8}
+
+ matches exactly 8 digits. An opening curly bracket that appears in a
+ position where a quantifier is not allowed, or one that does not match
+ the syntax of a quantifier, is taken as a literal character. For exam-
+ ple, {,6} is not a quantifier, but a literal string of four characters.
+
+ In UTF-8 mode, quantifiers apply to UTF-8 characters rather than to
+ individual bytes. Thus, for example, \x{100}{2} matches two UTF-8 char-
+ acters, each of which is represented by a two-byte sequence. Similarly,
+ when Unicode property support is available, \X{3} matches three Unicode
+ extended sequences, each of which may be several bytes long (and they
+ may be of different lengths).
+
+ The quantifier {0} is permitted, causing the expression to behave as if
+ the previous item and the quantifier were not present. This may be use-
+ ful for subpatterns that are referenced as subroutines from elsewhere
+ in the pattern. Items other than subpatterns that have a {0} quantifier
+ are omitted from the compiled pattern.
+
+ For convenience, the three most common quantifiers have single-charac-
+ ter abbreviations:
+
+ * is equivalent to {0,}
+ + is equivalent to {1,}
+ ? is equivalent to {0,1}
+
+ It is possible to construct infinite loops by following a subpattern
+ that can match no characters with a quantifier that has no upper limit,
+ for example:
+
+ (a?)*
+
+ Earlier versions of Perl and PCRE used to give an error at compile time
+ for such patterns. However, because there are cases where this can be
+ useful, such patterns are now accepted, but if any repetition of the
+ subpattern does in fact match no characters, the loop is forcibly bro-
+ ken.
+
+ By default, the quantifiers are "greedy", that is, they match as much
+ as possible (up to the maximum number of permitted times), without
+ causing the rest of the pattern to fail. The classic example of where
+ this gives problems is in trying to match comments in C programs. These
+ appear between /* and */ and within the comment, individual * and /
+ characters may appear. An attempt to match C comments by applying the
+ pattern
+
+ /\*.*\*/
+
+ to the string
+
+ /* first comment */ not comment /* second comment */
+
+ fails, because it matches the entire string owing to the greediness of
+ the .* item.
+
+ However, if a quantifier is followed by a question mark, it ceases to
+ be greedy, and instead matches the minimum number of times possible, so
+ the pattern
+
+ /\*.*?\*/
+
+ does the right thing with the C comments. The meaning of the various
+ quantifiers is not otherwise changed, just the preferred number of
+ matches. Do not confuse this use of question mark with its use as a
+ quantifier in its own right. Because it has two uses, it can sometimes
+ appear doubled, as in
+
+ \d??\d
+
+ which matches one digit by preference, but can match two if that is the
+ only way the rest of the pattern matches.
+
+ If the PCRE_UNGREEDY option is set (an option that is not available in
+ Perl), the quantifiers are not greedy by default, but individual ones
+ can be made greedy by following them with a question mark. In other
+ words, it inverts the default behaviour.
+
+ When a parenthesized subpattern is quantified with a minimum repeat
+ count that is greater than 1 or with a limited maximum, more memory is
+ required for the compiled pattern, in proportion to the size of the
+ minimum or maximum.
+
+ If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equiv-
+ alent to Perl's /s) is set, thus allowing the dot to match newlines,
+ the pattern is implicitly anchored, because whatever follows will be
+ tried against every character position in the subject string, so there
+ is no point in retrying the overall match at any position after the
+ first. PCRE normally treats such a pattern as though it were preceded
+ by \A.
+
+ In cases where it is known that the subject string contains no new-
+ lines, it is worth setting PCRE_DOTALL in order to obtain this opti-
+ mization, or alternatively using ^ to indicate anchoring explicitly.
+
+ However, there is one situation where the optimization cannot be used.
+ When .* is inside capturing parentheses that are the subject of a
+ backreference elsewhere in the pattern, a match at the start may fail
+ where a later one succeeds. Consider, for example:
+
+ (.*)abc\1
+
+ If the subject is "xyz123abc123" the match point is the fourth charac-
+ ter. For this reason, such a pattern is not implicitly anchored.
+
+ When a capturing subpattern is repeated, the value captured is the sub-
+ string that matched the final iteration. For example, after
+
+ (tweedle[dume]{3}\s*)+
+
+ has matched "tweedledum tweedledee" the value of the captured substring
+ is "tweedledee". However, if there are nested capturing subpatterns,
+ the corresponding captured values may have been set in previous itera-
+ tions. For example, after
+
+ /(a|(b))+/
+
+ matches "aba" the value of the second captured substring is "b".
+
+
+ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS
+
+ With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy")
+ repetition, failure of what follows normally causes the repeated item
+ to be re-evaluated to see if a different number of repeats allows the
+ rest of the pattern to match. Sometimes it is useful to prevent this,
+ either to change the nature of the match, or to cause it to fail earlier
+ than it otherwise might, when the author of the pattern knows there is
+ no point in carrying on.
+
+ Consider, for example, the pattern \d+foo when applied to the subject
+ line
+
+ 123456bar
+
+ After matching all 6 digits and then failing to match "foo", the normal
+ action of the matcher is to try again with only 5 digits matching the
+ \d+ item, and then with 4, and so on, before ultimately failing.
+ "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides
+ the means for specifying that once a subpattern has matched, it is not
+ to be re-evaluated in this way.
+
+ If we use atomic grouping for the previous example, the matcher gives
+ up immediately on failing to match "foo" the first time. The notation
+ is a kind of special parenthesis, starting with (?> as in this example:
+
+ (?>\d+)foo
+
+ This kind of parenthesis "locks up" the part of the pattern it con-
+ tains once it has matched, and a failure further into the pattern is
+ prevented from backtracking into it. Backtracking past it to previous
+ items, however, works as normal.
+
+ An alternative description is that a subpattern of this type matches
+ the string of characters that an identical standalone pattern would
+ match, if anchored at the current point in the subject string.
+
+ Atomic grouping subpatterns are not capturing subpatterns. Simple cases
+ such as the above example can be thought of as a maximizing repeat that
+ must swallow everything it can. So, while both \d+ and \d+? are pre-
+ pared to adjust the number of digits they match in order to make the
+ rest of the pattern match, (?>\d+) can only match an entire sequence of
+ digits.
+
+ Atomic groups in general can of course contain arbitrarily complicated
+ subpatterns, and can be nested. However, when the subpattern for an
+ atomic group is just a single repeated item, as in the example above, a
+ simpler notation, called a "possessive quantifier" can be used. This
+ consists of an additional + character following a quantifier. Using
+ this notation, the previous example can be rewritten as
+
+ \d++foo
+
+ Note that a possessive quantifier can be used with an entire group, for
+ example:
+
+ (abc|xyz){2,3}+
+
+ Possessive quantifiers are always greedy; the setting of the
+ PCRE_UNGREEDY option is ignored. They are a convenient notation for the
+ simpler forms of atomic group. However, there is no difference in the
+ meaning of a possessive quantifier and the equivalent atomic group,
+ though there may be a performance difference; possessive quantifiers
+ should be slightly faster.
+
+ The possessive quantifier syntax is an extension to the Perl 5.8 syn-
+ tax. Jeffrey Friedl originated the idea (and the name) in the first
+ edition of his book. Mike McCloskey liked it, so implemented it when he
+ built Sun's Java package, and PCRE copied it from there. It ultimately
+ found its way into Perl at release 5.10.
+
+ PCRE has an optimization that automatically "possessifies" certain sim-
+ ple pattern constructs. For example, the sequence A+B is treated as
+ A++B because there is no point in backtracking into a sequence of A's
+ when B must follow.
+
+ When a pattern contains an unlimited repeat inside a subpattern that
+ can itself be repeated an unlimited number of times, the use of an
+ atomic group is the only way to avoid some failing matches taking a
+ very long time indeed. The pattern
+
+ (\D+|<\d+>)*[!?]
+
+ matches an unlimited number of substrings that either consist of non-
+ digits, or digits enclosed in <>, followed by either ! or ?. When it
+ matches, it runs quickly. However, if it is applied to
+
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+
+ it takes a long time before reporting failure. This is because the
+ string can be divided between the internal \D+ repeat and the external
+ * repeat in a large number of ways, and all have to be tried. (The
+ example uses [!?] rather than a single character at the end, because
+ both PCRE and Perl have an optimization that allows for fast failure
+ when a single character is used. They remember the last single charac-
+ ter that is required for a match, and fail early if it is not present
+ in the string.) If the pattern is changed so that it uses an atomic
+ group, like this:
+
+ ((?>\D+)|<\d+>)*[!?]
+
+ sequences of non-digits cannot be broken, and failure happens quickly.
+
+
+BACK REFERENCES
+
+ Outside a character class, a backslash followed by a digit greater than
+ 0 (and possibly further digits) is a back reference to a capturing sub-
+ pattern earlier (that is, to its left) in the pattern, provided there
+ have been that many previous capturing left parentheses.
+
+ However, if the decimal number following the backslash is less than 10,
+ it is always taken as a back reference, and causes an error only if
+ there are not that many capturing left parentheses in the entire pat-
+ tern. In other words, the parentheses that are referenced need not be
+ to the left of the reference for numbers less than 10. A "forward back
+ reference" of this type can make sense when a repetition is involved
+ and the subpattern to the right has participated in an earlier itera-
+ tion.
+
+ It is not possible to have a numerical "forward back reference" to a
+ subpattern whose number is 10 or more using this syntax because a
+ sequence such as \50 is interpreted as a character defined in octal.
+ See the subsection entitled "Non-printing characters" above for further
+ details of the handling of digits following a backslash. There is no
+ such problem when named parentheses are used. A back reference to any
+ subpattern is possible using named parentheses (see below).
+
+ Another way of avoiding the ambiguity inherent in the use of digits
+ following a backslash is to use the \g escape sequence, which is a fea-
+ ture introduced in Perl 5.10. This escape must be followed by an
+ unsigned number or a negative number, optionally enclosed in braces.
+ These examples are all identical:
+
+ (ring), \1
+ (ring), \g1
+ (ring), \g{1}
+
+ An unsigned number specifies an absolute reference without the ambigu-
+ ity that is present in the older syntax. It is also useful when literal
+ digits follow the reference. A negative number is a relative reference.
+ Consider this example:
+
+ (abc(def)ghi)\g{-1}
+
+ The sequence \g{-1} is a reference to the most recently started captur-
+ ing subpattern before \g, that is, it is equivalent to \2. Similarly,
+ \g{-2} would be equivalent to \1. The use of relative references can be
+ helpful in long patterns, and also in patterns that are created by
+ joining together fragments that contain references within themselves.
+
+ A back reference matches whatever actually matched the capturing sub-
+ pattern in the current subject string, rather than anything matching
+ the subpattern itself (see "Subpatterns as subroutines" below for a way
+ of doing that). So the pattern
+
+ (sens|respons)e and \1ibility
+
+ matches "sense and sensibility" and "response and responsibility", but
+ not "sense and responsibility". If caseful matching is in force at the
+ time of the back reference, the case of letters is relevant. For exam-
+ ple,
+
+ ((?i)rah)\s+\1
+
+ matches "rah rah" and "RAH RAH", but not "RAH rah", even though the
+ original capturing subpattern is matched caselessly.
+
+ There are several different ways of writing back references to named
+ subpatterns. The .NET syntax \k{name} and the Perl syntax \k<name> or
+ \k'name' are supported, as is the Python syntax (?P=name). Perl 5.10's
+ unified back reference syntax, in which \g can be used for both numeric
+ and named references, is also supported. We could rewrite the above
+ example in any of the following ways:
+
+ (?<p1>(?i)rah)\s+\k<p1>
+ (?'p1'(?i)rah)\s+\k{p1}
+ (?P<p1>(?i)rah)\s+(?P=p1)
+ (?<p1>(?i)rah)\s+\g{p1}
+
+ A subpattern that is referenced by name may appear in the pattern
+ before or after the reference.
+
+ There may be more than one back reference to the same subpattern. If a
+ subpattern has not actually been used in a particular match, any back
+ references to it always fail. For example, the pattern
+
+ (a|(bc))\2
+
+ always fails if it starts to match "a" rather than "bc". Because there
+ may be many capturing parentheses in a pattern, all digits following
+ the backslash are taken as part of a potential back reference number.
+ If the pattern continues with a digit character, some delimiter must be
+ used to terminate the back reference. If the PCRE_EXTENDED option is
+ set, this can be whitespace. Otherwise an empty comment (see "Com-
+ ments" below) can be used.
+
+ A back reference that occurs inside the parentheses to which it refers
+ fails when the subpattern is first used, so, for example, (a\1) never
+ matches. However, such references can be useful inside repeated sub-
+ patterns. For example, the pattern
+
+ (a|b\1)+
+
+ matches any number of "a"s and also "aba", "ababbaa" etc. At each iter-
+ ation of the subpattern, the back reference matches the character
+ string corresponding to the previous iteration. In order for this to
+ work, the pattern must be such that the first iteration does not need
+ to match the back reference. This can be done using alternation, as in
+ the example above, or by a quantifier with a minimum of zero.
+
+
+ASSERTIONS
+
+ An assertion is a test on the characters following or preceding the
+ current matching point that does not actually consume any characters.
+ The simple assertions coded as \b, \B, \A, \G, \Z, \z, ^ and $ are
+ described above.
+
+ More complicated assertions are coded as subpatterns. There are two
+ kinds: those that look ahead of the current position in the subject
+ string, and those that look behind it. An assertion subpattern is
+ matched in the normal way, except that it does not cause the current
+ matching position to be changed.
+
+ Assertion subpatterns are not capturing subpatterns, and may not be
+ repeated, because it makes no sense to assert the same thing several
+ times. If any kind of assertion contains capturing subpatterns within
+ it, these are counted for the purposes of numbering the capturing sub-
+ patterns in the whole pattern. However, substring capturing is carried
+ out only for positive assertions, because it does not make sense for
+ negative assertions.
+
+ Lookahead assertions
+
+ Lookahead assertions start with (?= for positive assertions and (?! for
+ negative assertions. For example,
+
+ \w+(?=;)
+
+ matches a word followed by a semicolon, but does not include the semi-
+ colon in the match, and
+
+ foo(?!bar)
+
+ matches any occurrence of "foo" that is not followed by "bar". Note
+ that the apparently similar pattern
+
+ (?!foo)bar
+
+ does not find an occurrence of "bar" that is preceded by something
+ other than "foo"; it finds any occurrence of "bar" whatsoever, because
+ the assertion (?!foo) is always true when the next three characters are
+ "bar". A lookbehind assertion is needed to achieve the other effect.
+
+ If you want to force a matching failure at some point in a pattern, the
+ most convenient way to do it is with (?!) because an empty string
+ always matches, so an assertion that requires there not to be an empty
+ string must always fail.
+
+ Lookbehind assertions
+
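+ Lookbehind assertions start with (?<= for positive assertions and (?<!
+ for negative assertions. For example,
+
+ (?<!foo)bar
+
+ does find an occurrence of "bar" that is not preceded by "foo". The
+ contents of a lookbehind assertion are restricted such that all the
+ strings it matches must have a fixed length.
+
+
+CONDITIONAL SUBPATTERNS
+
+ It is possible to cause the matching process to obey a subpattern con-
+ ditionally or to choose between two alternative subpatterns, depending
+ on the result of an assertion, or whether a previous capturing subpat-
+ tern matched or not. The two possible forms of conditional subpattern
+ are
+
+ (?(condition)yes-pattern)
+ (?(condition)yes-pattern|no-pattern)
+
+ If the condition is satisfied, the yes-pattern is used; otherwise the
+ no-pattern (if present) is used.
+
+ Checking for a used subpattern by number
+
+ If the text between the parentheses consists of a sequence of digits,
+ the condition is true if the capturing subpattern of that number has
+ previously matched. Consider the following pattern, which contains
+ non-significant white space to make it more readable and to divide it
+ into three parts for ease of discussion:
+
+ ( \( )? [^()]+ (?(1) \) )
+
+ The first part matches an optional opening parenthesis, and if that
+ character is present, sets it as the first captured substring. The sec-
+ ond part matches one or more characters that are not parentheses. The
+ third part is a conditional subpattern that tests whether the first set
+ of parentheses matched or not. If they did, the condition is true, and
+ a closing parenthesis is required; otherwise nothing more is matched.
+
+ Checking for a used subpattern by name
+
+ Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a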
+ used subpattern by name. For compatibility with earlier versions of
+ PCRE, which had this facility before Perl, the syntax (?(name)...) is
+ also recognized. However, there is a possible ambiguity with this syn-
+ tax, because subpattern names may consist entirely of digits. PCRE
+ looks first for a named subpattern; if it cannot find one and the name
+ consists entirely of digits, PCRE looks for a subpattern of that num-
+ ber, which must be greater than zero. Using subpattern names that con-
+ sist entirely of digits is not recommended.
+
+ Rewriting the above example to use a named subpattern gives this:
+
+ (?<OPEN> \( )? [^()]+ (?(<OPEN>) \) )
+
+
+ Checking for pattern recursion
+
+ If the condition is the string (R), and there is no subpattern with the
+ name R, the condition is true if a recursive call to the whole pattern
+ or any subpattern has been made. If digits or a name preceded by amper-
+ sand follow the letter R, for example:
+
+ (?(R3)...) or (?(R&name)...)
+
+ the condition is true if the most recent recursion is into the subpat-
+ tern whose number or name is given. This condition does not check the
+ entire recursion stack.
+
+ At "top level", all these recursion test conditions are false. Recur-
+ sive patterns are described below.
+
+ Defining subpatterns for use by reference only
+
+ If the condition is the string (DEFINE), and there is no subpattern
+ with the name DEFINE, the condition is always false. In this case,
+ there may be only one alternative in the subpattern. It is always
+ skipped if control reaches this point in the pattern; the idea of
+ DEFINE is that it can be used to define "subroutines" that can be ref-
+ erenced from elsewhere. (The use of "subroutines" is described below.)
+ For example, a pattern to match an IPv4 address could be written like
+ this (ignore whitespace and line breaks):
+
+ (?(DEFINE) (?<byte> 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) )
+ \b (?&byte) (\.(?&byte)){3} \b
+
+ The first part of the pattern is a DEFINE group inside which another
+ group named "byte" is defined. This matches an individual component of
+ an IPv4 address (a number less than 256). When matching takes place,
+ this part of the pattern is skipped because DEFINE acts like a false
+ condition.
+
+ The rest of the pattern uses references to the named group to match the
+ four dot-separated components of an IPv4 address, insisting on a word
+ boundary at each end.
+
+ Assertion conditions
+
+ If the condition is not in any of the above formats, it must be an
+ assertion. This may be a positive or negative lookahead or lookbehind
+ assertion. Consider this pattern, again containing non-significant
+ white space, and with the two alternatives on the second line:
+
+ (?(?=[^a-z]*[a-z])
+ \d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} )
+
+ The condition is a positive lookahead assertion that matches an
+ optional sequence of non-letters followed by a letter. In other words,
+ it tests for the presence of at least one letter in the subject. If a
+ letter is found, the subject is matched against the first alternative;
+ otherwise it is matched against the second. This pattern matches
+ strings in one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are
+ letters and dd are digits.
+
+
+COMMENTS
+
+ The sequence (?# marks the start of a comment that continues up to the
+ next closing parenthesis. Nested parentheses are not permitted. The
+ characters that make up a comment play no part in the pattern matching
+ at all.
+
+ If the PCRE_EXTENDED option is set, an unescaped # character outside a
+ character class introduces a comment that continues to immediately
+ after the next newline in the pattern.
+
+
+RECURSIVE PATTERNS
+
+ Consider the problem of matching a string in parentheses, allowing for
+ unlimited nested parentheses. Without the use of recursion, the best
+ that can be done is to use a pattern that matches up to some fixed
+ depth of nesting. It is not possible to handle an arbitrary nesting
+ depth.
+
+ For some time, Perl has provided a facility that allows regular expres-
+ sions to recurse (amongst other things). It does this by interpolating
+ Perl code in the expression at run time, and the code can refer to the
+ expression itself. A Perl pattern using code interpolation to solve the
+ parentheses problem can be created like this:
+
+ $re = qr{\( (?: (?>[^()]+) | (?p{$re}) )* \)}x;
+
+ The (?p{...}) item interpolates Perl code at run time, and in this case
+ refers recursively to the pattern in which it appears.
+
+ Obviously, PCRE cannot support the interpolation of Perl code. Instead,
+ it supports special syntax for recursion of the entire pattern, and
+ also for individual subpattern recursion. After its introduction in
+ PCRE and Python, this kind of recursion was introduced into Perl at
+ release 5.10.
+
+ A special item that consists of (? followed by a number greater than
+ zero and a closing parenthesis is a recursive call of the subpattern of
+ the given number, provided that it occurs inside that subpattern. (If
+ not, it is a "subroutine" call, which is described in the next sec-
+ tion.) The special item (?R) or (?0) is a recursive call of the entire
+ regular expression.
+
+ In PCRE (like Python, but unlike Perl), a recursive subpattern call is
+ always treated as an atomic group. That is, once it has matched some of
+ the subject string, it is never re-entered, even if it contains untried
+ alternatives and there is a subsequent matching failure.
+
+ This PCRE pattern solves the nested parentheses problem (assume the
+ PCRE_EXTENDED option is set so that white space is ignored):
+
+ \( ( (?>[^()]+) | (?R) )* \)
+
+ First it matches an opening parenthesis. Then it matches any number of
+ substrings which can either be a sequence of non-parentheses, or a
+ recursive match of the pattern itself (that is, a correctly parenthe-
+ sized substring). Finally there is a closing parenthesis.
+
+ If this were part of a larger pattern, you would not want to recurse
+ the entire pattern, so instead you could use this:
+
+ ( \( ( (?>[^()]+) | (?1) )* \) )
+
+ We have put the pattern into parentheses, and caused the recursion to
+ refer to them instead of the whole pattern.
+
+ In a larger pattern, keeping track of parenthesis numbers can be
+ tricky. This is made easier by the use of relative references. (A Perl
+ 5.10 feature.) Instead of (?1) in the pattern above you can write
+ (?-2) to refer to the second most recently opened parentheses preceding
+ the recursion. In other words, a negative number counts capturing
+ parentheses leftwards from the point at which it is encountered.
+
+ It is also possible to refer to subsequently opened parentheses, by
+ writing references such as (?+2). However, these cannot be recursive
+ because the reference is not inside the parentheses that are refer-
+ enced. They are always "subroutine" calls, as described in the next
+ section.
+
+ An alternative approach is to use named parentheses instead. The Perl
+ syntax for this is (?&name); PCRE's earlier syntax (?P>name) is also
+ supported. We could rewrite the above example as follows:
+
+ (?<pn> \( ( (?>[^()]+) | (?&pn) )* \) )
+
+ If there is more than one subpattern with the same name, the earliest
+ one is used.
+
+ This particular example pattern that we have been looking at contains
+ nested unlimited repeats, and so the use of atomic grouping for match-
+ ing strings of non-parentheses is important when applying the pattern
+ to strings that do not match. For example, when this pattern is applied
+ to
+
+ (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
+
+ it yields "no match" quickly. However, if atomic grouping is not used,
+ the match runs for a very long time indeed because there are so many
+ different ways the + and * repeats can carve up the subject, and all
+ have to be tested before failure can be reported.
+
+ At the end of a match, the values set for any capturing subpatterns are
+ those from the outermost level of the recursion at which the subpattern
+ value is set. If you want to obtain intermediate values, a callout
+ function can be used (see below and the pcrecallout documentation). If
+ the pattern above is matched against
+
+ (ab(cd)ef)
+
+ the value for the capturing parentheses is "ef", which is the last
+ value taken on at the top level. If additional parentheses are added,
+ giving
+
+ \( ( ( (?>[^()]+) | (?R) )* ) \)
+    ^                        ^
+    ^                        ^
+
+ the string they capture is "ab(cd)ef", the contents of the top level
+ parentheses. If there are more than 15 capturing parentheses in a pat-
+ tern, PCRE has to obtain extra memory to store data during a recursion,
+ which it does by using pcre_malloc, freeing it via pcre_free after-
+ wards. If no memory can be obtained, the match fails with the
+ PCRE_ERROR_NOMEMORY error.
+
+ Do not confuse the (?R) item with the condition (R), which tests for
+ recursion. Consider this pattern, which matches text in angle brack-
+ ets, allowing for arbitrary nesting. Only digits are allowed in nested
+ brackets (that is, when recursing), whereas any characters are permit-
+ ted at the outer level.
+
+ < (?: (?(R) \d++ | [^<>]*+) | (?R)) * >
+
+ In this pattern, (?(R) is the start of a conditional subpattern, with
+ two different alternatives for the recursive and non-recursive cases.
+ The (?R) item is the actual recursive call.
+
+
+SUBPATTERNS AS SUBROUTINES
+
+ If the syntax for a recursive subpattern reference (either by number or
+ by name) is used outside the parentheses to which it refers, it oper-
+ ates like a subroutine in a programming language. The "called" subpat-
+ tern may be defined before or after the reference. A numbered reference
+ can be absolute or relative, as in these examples:
+
+ (...(absolute)...)...(?2)...
+ (...(relative)...)...(?-1)...
+ (...(?+1)...(relative)...
+
+ An earlier example pointed out that the pattern
+
+ (sens|respons)e and \1ibility
+
+ matches "sense and sensibility" and "response and responsibility", but
+ not "sense and responsibility". If instead the pattern
+
+ (sens|respons)e and (?1)ibility
+
+ is used, it does match "sense and responsibility" as well as the other
+ two strings. Another example is given in the discussion of DEFINE
+ above.
+
+ Like recursive subpatterns, a "subroutine" call is always treated as an
+ atomic group. That is, once it has matched some of the subject string,
+ it is never re-entered, even if it contains untried alternatives and
+ there is a subsequent matching failure.
+
+ When a subpattern is used as a subroutine, processing options such as
+ case-independence are fixed when the subpattern is defined. They cannot
+ be changed for different calls. For example, consider this pattern:
+
+ (abc)(?i:(?-1))
+
+ It matches "abcabc". It does not match "abcABC" because the change of
+ processing option does not affect the called subpattern.
+
+
+ONIGURUMA SUBROUTINE SYNTAX
+
+ For compatibility with Oniguruma, the non-Perl syntax \g followed by a
+ name or a number enclosed either in angle brackets or single quotes, is
+ an alternative syntax for referencing a subpattern as a subroutine,
+ possibly recursively. Here are two of the examples used above, rewrit-
+ ten using this syntax:
+
+ (?<pn> \( ( (?>[^()]+) | \g<pn> )* \) )
+ (sens|respons)e and \g'1'ibility
+
+ PCRE supports an extension to Oniguruma: if a number is preceded by a
+ plus or a minus sign it is taken as a relative reference. For example:
+
+ (abc)(?i:\g<-1>)
+
+ Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not
+ synonymous. The former is a back reference; the latter is a subroutine
+ call.
+
+
+CALLOUTS
+
+ Perl has a feature whereby using the sequence (?{...}) causes arbitrary
+ Perl code to be obeyed in the middle of matching a regular expression.
+ This makes it possible, amongst other things, to extract different sub-
+ strings that match the same pair of parentheses when there is a repeti-
+ tion.
+
+ PCRE provides a similar feature, but of course it cannot obey arbitrary
+ Perl code. The feature is called "callout". The caller of PCRE provides
+ an external function by putting its entry point in the global variable
+ pcre_callout. By default, this variable contains NULL, which disables
+ all calling out.
+
+ Within a regular expression, (?C) indicates the points at which the
+ external function is to be called. If you want to identify different
+ callout points, you can put a number less than 256 after the letter C.
+ The default value is zero. For example, this pattern has two callout
+ points:
+
+ (?C1)abc(?C2)def
+
+ If the PCRE_AUTO_CALLOUT flag is passed to pcre_compile(), callouts are
+ automatically installed before each item in the pattern. They are all
+ numbered 255.
+
+ During matching, when PCRE reaches a callout point (and pcre_callout is
+ set), the external function is called. It is provided with the number
+ of the callout, the position in the pattern, and, optionally, one item
+ of data originally supplied by the caller of pcre_exec(). The callout
+ function may cause matching to proceed, to backtrack, or to fail alto-
+ gether. A complete description of the interface to the callout function
+ is given in the pcrecallout documentation.
+
+
+BACKTRACKING CONTROL
+
+ Perl 5.10 introduced a number of "Special Backtracking Control Verbs",
+ which are described in the Perl documentation as "experimental and sub-
+ ject to change or removal in a future version of Perl". It goes on to
+ say: "Their usage in production code should be noted to avoid problems
+ during upgrades." The same remarks apply to the PCRE features described
+ in this section.
+
+ Since these verbs are specifically related to backtracking, most of
+ them can be used only when the pattern is to be matched using
+ pcre_exec(), which uses a backtracking algorithm. With the exception of
+ (*FAIL), which behaves like a failing negative assertion, they cause an
+ error if encountered by pcre_dfa_exec().
+
+ The new verbs make use of what was previously invalid syntax: an open-
+ ing parenthesis followed by an asterisk. In Perl, they are generally of
+ the form (*VERB:ARG) but PCRE does not support the use of arguments, so
+ its general form is just (*VERB). Any number of these verbs may occur
+ in a pattern. There are two kinds:
+
+ Verbs that act immediately
+
+ The following verbs act as soon as they are encountered:
+
+ (*ACCEPT)
+
+ This verb causes the match to end successfully, skipping the remainder
+ of the pattern. When inside a recursion, only the innermost pattern is
+ ended immediately. PCRE differs from Perl in what happens if the
+ (*ACCEPT) is inside capturing parentheses. In Perl, the data so far is
+ captured: in PCRE no data is captured. For example:
+
+ A(A|B(*ACCEPT)|C)D
+
+ This matches "AB", "AAD", or "ACD", but when it matches "AB", no data
+ is captured.
+
+ (*FAIL) or (*F)
+
+ This verb causes the match to fail, forcing backtracking to occur. It
+ is equivalent to (?!) but easier to read. The Perl documentation notes
+ that it is probably useful only when combined with (?{}) or (??{}).
+ Those are, of course, Perl features that are not present in PCRE. The
+ nearest equivalent is the callout feature, as for example in this pat-
+ tern:
+
+ a+(?C)(*FAIL)
+
+ A match with the string "aaaa" always fails, but the callout is taken
+ before each backtrack happens (in this example, 10 times).
+
+ Verbs that act after backtracking
+
+ The following verbs do nothing when they are encountered. Matching con-
+ tinues with what follows, but if there is no subsequent match, a fail-
+ ure is forced. The verbs differ in exactly what kind of failure
+ occurs.
+
+ (*COMMIT)
+
+ This verb causes the whole match to fail outright if the rest of the
+ pattern does not match. Even if the pattern is unanchored, no further
+ attempts to find a match by advancing the start point take place. Once
+ (*COMMIT) has been passed, pcre_exec() is committed to finding a match
+ at the current starting point, or not at all. For example:
+
+ a+(*COMMIT)b
+
+ This matches "xxaab" but not "aacaab". It can be thought of as a kind
+ of dynamic anchor, or "I've started, so I must finish."
+
+ (*PRUNE)
+
+ This verb causes the match to fail at the current position if the rest
+ of the pattern does not match. If the pattern is unanchored, the normal
+ "bumpalong" advance to the next starting character then happens. Back-
+ tracking can occur as usual to the left of (*PRUNE), or when matching
+ to the right of (*PRUNE), but if there is no match to the right, back-
+ tracking cannot cross (*PRUNE). In simple cases, the use of (*PRUNE)
+ is just an alternative to an atomic group or possessive quantifier, but
+ there are some uses of (*PRUNE) that cannot be expressed in any other
+ way.
+
+ (*SKIP)
+
+ This verb is like (*PRUNE), except that if the pattern is unanchored,
+ the "bumpalong" advance is not to the next character, but to the posi-
+ tion in the subject where (*SKIP) was encountered. (*SKIP) signifies
+ that whatever text was matched leading up to it cannot be part of a
+ successful match. Consider:
+
+ a+(*SKIP)b
+
+ If the subject is "aaaac...", after the first match attempt fails
+ (starting at the first character in the string), the starting point
+ skips on to start the next attempt at "c". Note that a possessive quan-
+ tifier does not have the same effect in this example; although it would
+ suppress backtracking during the first match attempt, the second
+ attempt would start at the second character instead of skipping on to
+ "c".
+
+ (*THEN)
+
+ This verb causes a skip to the next alternation if the rest of the pat-
+ tern does not match. That is, it cancels pending backtracking, but only
+ within the current alternation. Its name comes from the observation
+ that it can be used for a pattern-based if-then-else block:
+
+ ( COND1 (*THEN) FOO | COND2 (*THEN) BAR | COND3 (*THEN) BAZ ) ...
+
+ If the COND1 pattern matches, FOO is tried (and possibly further items
+ after the end of the group if FOO succeeds); on failure the matcher
+ skips to the second alternative and tries COND2, without backtracking
+ into COND1. If (*THEN) is used outside of any alternation, it acts
+ exactly like (*PRUNE).
+
+
+SEE ALSO
+
+ pcreapi(3), pcrecallout(3), pcrematching(3), pcre(3).
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 19 April 2008
+ Copyright (c) 1997-2008 University of Cambridge.
+------------------------------------------------------------------------------
+
+
+PCRESYNTAX(3) PCRESYNTAX(3)
+
+
+NAME
+ PCRE - Perl-compatible regular expressions
+
+
+PCRE REGULAR EXPRESSION SYNTAX SUMMARY
+
+ The full syntax and semantics of the regular expressions that are sup-
+ ported by PCRE are described in the pcrepattern documentation. This
+ document contains just a quick-reference summary of the syntax.
+
+
+QUOTING
+
+ \x where x is non-alphanumeric is a literal x
+ \Q...\E treat enclosed characters as literal
+
+
+CHARACTERS
+
+ \a alarm, that is, the BEL character (hex 07)
+ \cx "control-x", where x is any character
+ \e escape (hex 1B)
+ \f formfeed (hex 0C)
+ \n newline (hex 0A)
+ \r carriage return (hex 0D)
+ \t tab (hex 09)
+ \ddd character with octal code ddd, or backreference
+ \xhh character with hex code hh
+ \x{hhh..} character with hex code hhh..
+
+
+CHARACTER TYPES
+
+ . any character except newline;
+ in dotall mode, any character whatsoever
+ \C one byte, even in UTF-8 mode (best avoided)
+ \d a decimal digit
+ \D a character that is not a decimal digit
+ \h a horizontal whitespace character
+ \H a character that is not a horizontal whitespace character
+ \p{xx} a character with the xx property
+ \P{xx} a character without the xx property
+ \R a newline sequence
+ \s a whitespace character
+ \S a character that is not a whitespace character
+ \v a vertical whitespace character
+ \V a character that is not a vertical whitespace character
+ \w a "word" character
+ \W a "non-word" character
+ \X an extended Unicode sequence
+
+ In PCRE, \d, \D, \s, \S, \w, and \W recognize only ASCII characters.
+
+
+GENERAL CATEGORY PROPERTY CODES FOR \p and \P
+
+ C Other
+ Cc Control
+ Cf Format
+ Cn Unassigned
+ Co Private use
+ Cs Surrogate
+
+ L Letter
+ Ll Lower case letter
+ Lm Modifier letter
+ Lo Other letter
+ Lt Title case letter
+ Lu Upper case letter
+ L& Ll, Lu, or Lt
+
+ M Mark
+ Mc Spacing mark
+ Me Enclosing mark
+ Mn Non-spacing mark
+
+ N Number
+ Nd Decimal number
+ Nl Letter number
+ No Other number
+
+ P Punctuation
+ Pc Connector punctuation
+ Pd Dash punctuation
+ Pe Close punctuation
+ Pf Final punctuation
+ Pi Initial punctuation
+ Po Other punctuation
+ Ps Open punctuation
+
+ S Symbol
+ Sc Currency symbol
+ Sk Modifier symbol
+ Sm Mathematical symbol
+ So Other symbol
+
+ Z Separator
+ Zl Line separator
+ Zp Paragraph separator
+ Zs Space separator
+
+
+SCRIPT NAMES FOR \p AND \P
+
+ Arabic, Armenian, Balinese, Bengali, Bopomofo, Braille, Buginese,
+ Buhid, Canadian_Aboriginal, Cherokee, Common, Coptic, Cuneiform,
+ Cypriot, Cyrillic, Deseret, Devanagari, Ethiopic, Georgian, Glagolitic,
+ Gothic, Greek, Gujarati, Gurmukhi, Han, Hangul, Hanunoo, Hebrew, Hira-
+ gana, Inherited, Kannada, Katakana, Kharoshthi, Khmer, Lao, Latin,
+ Limbu, Linear_B, Malayalam, Mongolian, Myanmar, New_Tai_Lue, Nko,
+ Ogham, Old_Italic, Old_Persian, Oriya, Osmanya, Phags_Pa, Phoenician,
+ Runic, Shavian, Sinhala, Syloti_Nagri, Syriac, Tagalog, Tagbanwa,
+ Tai_Le, Tamil, Telugu, Thaana, Thai, Tibetan, Tifinagh, Ugaritic, Yi.
+
+
+CHARACTER CLASSES
+
+ [...] positive character class
+ [^...] negative character class
+ [x-y] range (can be used for hex characters)
+ [[:xxx:]] positive POSIX named set
+ [[:^xxx:]] negative POSIX named set
+
+ alnum alphanumeric
+ alpha alphabetic
+ ascii 0-127
+ blank space or tab
+ cntrl control character
+ digit decimal digit
+ graph printing, excluding space
+ lower lower case letter
+ print printing, including space
+ punct printing, excluding alphanumeric
+ space whitespace
+ upper upper case letter
+ word same as \w
+ xdigit hexadecimal digit
+
+ In PCRE, POSIX character set names recognize only ASCII characters. You
+ can use \Q...\E inside a character class.
+
+
+QUANTIFIERS
+
+ ? 0 or 1, greedy
+ ?+ 0 or 1, possessive
+ ?? 0 or 1, lazy
+ * 0 or more, greedy
+ *+ 0 or more, possessive
+ *? 0 or more, lazy
+ + 1 or more, greedy
+ ++ 1 or more, possessive
+ +? 1 or more, lazy
+ {n} exactly n
+ {n,m} at least n, no more than m, greedy
+ {n,m}+ at least n, no more than m, possessive
+ {n,m}? at least n, no more than m, lazy
+ {n,} n or more, greedy
+ {n,}+ n or more, possessive
+ {n,}? n or more, lazy
+
+
+ANCHORS AND SIMPLE ASSERTIONS
+
+ \b word boundary
+ \B not a word boundary
+ ^ start of subject
+ also after internal newline in multiline mode
+ \A start of subject
+ $ end of subject
+ also before newline at end of subject
+ also before internal newline in multiline mode
+ \Z end of subject
+ also before newline at end of subject
+ \z end of subject
+ \G first matching position in subject
+
+
+MATCH POINT RESET
+
+ \K reset start of match
+
+
+ALTERNATION
+
+ expr|expr|expr...
+
+
+CAPTURING
+
+ (...) capturing group
+ (?<name>...) named capturing group (Perl)
+ (?'name'...) named capturing group (Perl)
+ (?P<name>...) named capturing group (Python)
+ (?:...) non-capturing group
+ (?|...) non-capturing group; reset group numbers for
+ capturing groups in each alternative
+
+
+ATOMIC GROUPS
+
+ (?>...) atomic, non-capturing group
+
+
+COMMENT
+
+ (?#....) comment (not nestable)
+
+
+OPTION SETTING
+
+ (?i) caseless
+ (?J) allow duplicate names
+ (?m) multiline
+ (?s) single line (dotall)
+ (?U) default ungreedy (lazy)
+ (?x) extended (ignore white space)
+ (?-...) unset option(s)
+
+
+LOOKAHEAD AND LOOKBEHIND ASSERTIONS
+
+ (?=...) positive look ahead
+ (?!...) negative look ahead
+ (?<=...) positive look behind
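+ (?<!...) negative look behind
+
+ Each top-level branch of a look behind must be of a fixed length.
+
+
+BACKREFERENCES
+
+ \n reference by number (can be ambiguous)
+ \gn reference by number
+ \g{n} reference by number
+ \g{-n} relative reference by number
+ \k<name> reference by name (Perl)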
+ \k'name' reference by name (Perl)
+ \g{name} reference by name (Perl)
+ \k{name} reference by name (.NET)
+ (?P=name) reference by name (Python)
+
+
+SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)
+
+ (?R) recurse whole pattern
+ (?n) call subpattern by absolute number
+ (?+n) call subpattern by relative number
+ (?-n) call subpattern by relative number
+ (?&name) call subpattern by name (Perl)
+ (?P>name) call subpattern by name (Python)
+ \g<name> call subpattern by name (Oniguruma)
+ \g'name' call subpattern by name (Oniguruma)
+ \g<n> call subpattern by absolute number (Oniguruma)
+ \g'n' call subpattern by absolute number (Oniguruma)
+ \g<+n> call subpattern by relative number (PCRE extension)
+ \g'+n' call subpattern by relative number (PCRE extension)
+ \g<-n> call subpattern by relative number (PCRE extension)
+ \g'-n' call subpattern by relative number (PCRE extension)
+
+
+CONDITIONAL PATTERNS
+
+ (?(condition)yes-pattern)
+ (?(condition)yes-pattern|no-pattern)
+
+ (?(n)... absolute reference condition
+ (?(+n)... relative reference condition
+ (?(-n)... relative reference condition
+ (?(<name>)... named reference condition (Perl)
+ (?('name')... named reference condition (Perl)
+ (?(name)... named reference condition (PCRE)
+ (?(R)... overall recursion condition
+ (?(Rn)... specific group recursion condition
+ (?(R&name)... specific recursion condition
+ (?(DEFINE)... define subpattern for reference
+ (?(assert)... assertion condition
+
+
+BACKTRACKING CONTROL
+
+ The following act immediately they are reached:
+
+ (*ACCEPT) force successful match
+ (*FAIL) force backtrack; synonym (*F)
+
+ The following act only when a subsequent match failure causes a back-
+ track to reach them. They all force a match failure, but they differ in
+ what happens afterwards. Those that advance the start-of-match point do
+ so only if the pattern is not anchored.
+
+ (*COMMIT) overall failure, no advance of starting point
+ (*PRUNE) advance to next starting character
+ (*SKIP) advance start to current matching position
+ (*THEN) local failure, backtrack to next alternation
+
+
+NEWLINE CONVENTIONS
+
+ These are recognized only at the very start of the pattern or after a
+ (*BSR_...) option.
+
+ (*CR)
+ (*LF)
+ (*CRLF)
+ (*ANYCRLF)
+ (*ANY)
+
+
+WHAT \R MATCHES
+
+ These are recognized only at the very start of the pattern or after a
+ (*...) option that sets the newline convention.
+
+ (*BSR_ANYCRLF)
+ (*BSR_UNICODE)
+
+
+CALLOUTS
+
+ (?C) callout
+ (?Cn) callout with data n
+
+
+SEE ALSO
+
+ pcrepattern(3), pcreapi(3), pcrecallout(3), pcrematching(3), pcre(3).
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 09 April 2008
+ Copyright (c) 1997-2008 University of Cambridge.
+------------------------------------------------------------------------------
+
+
+PCREPARTIAL(3) PCREPARTIAL(3)
+
+
+NAME
+ PCRE - Perl-compatible regular expressions
+
+
+PARTIAL MATCHING IN PCRE
+
+ In normal use of PCRE, if the subject string that is passed to
+ pcre_exec() or pcre_dfa_exec() matches as far as it goes, but is too
+ short to match the entire pattern, PCRE_ERROR_NOMATCH is returned.
+ There are circumstances where it might be helpful to distinguish this
+ case from other cases in which there is no match.
+
+ Consider, for example, an application where a human is required to type
+ in data for a field with specific formatting requirements. An example
+ might be a date in the form ddmmmyy, defined by this pattern:
+
+ ^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$
+
+ If the application sees the user's keystrokes one by one, and can check
+ that what has been typed so far is potentially valid, it is able to
+ raise an error as soon as a mistake is made, possibly beeping and not
+ reflecting the character that has been typed. This immediate feedback
+ is likely to be a better user interface than a check that is delayed
+ until the entire string has been entered.
+
+ PCRE supports the concept of partial matching by means of the PCRE_PAR-
+ TIAL option, which can be set when calling pcre_exec() or
+ pcre_dfa_exec(). When this flag is set for pcre_exec(), the return code
+ PCRE_ERROR_NOMATCH is converted into PCRE_ERROR_PARTIAL if at any time
+ during the matching process the last part of the subject string matched
+ part of the pattern. Unfortunately, for non-anchored matching, it is
+ not possible to obtain the position of the start of the partial match.
+ No captured data is set when PCRE_ERROR_PARTIAL is returned.
+
+ When PCRE_PARTIAL is set for pcre_dfa_exec(), the return code
+ PCRE_ERROR_NOMATCH is converted into PCRE_ERROR_PARTIAL if the end of
+ the subject is reached, there have been no complete matches, but there
+ is still at least one matching possibility. The portion of the string
+ that provided the partial match is set as the first matching string.
+
+ Using PCRE_PARTIAL disables one of PCRE's optimizations. PCRE remembers
+ the last literal byte in a pattern, and abandons matching immediately
+ if such a byte is not present in the subject string. This optimization
+ cannot be used for a subject string that might match only partially.
+
+
+RESTRICTED PATTERNS FOR PCRE_PARTIAL
+
+ Because of the way certain internal optimizations are implemented in
+ the pcre_exec() function, the PCRE_PARTIAL option cannot be used with
+ all patterns. These restrictions do not apply when pcre_dfa_exec() is
+ used. For pcre_exec(), repeated single characters such as
+
+ a{2,4}
+
+ and repeated single metasequences such as
+
+ \d+
+
+ are not permitted if the maximum number of occurrences is greater than
+ one. Optional items such as \d? (where the maximum is one) are permit-
+ ted. Quantifiers with any values are permitted after parentheses, so
+ the invalid examples above can be coded thus:
+
+ (a){2,4}
+ (\d)+
+
+ These constructions run more slowly, but for the kinds of application
+ that are envisaged for this facility, this is not felt to be a major
+ restriction.
+
+ If PCRE_PARTIAL is set for a pattern that does not conform to the
+ restrictions, pcre_exec() returns the error code PCRE_ERROR_BADPARTIAL
+ (-13). You can use the PCRE_INFO_OKPARTIAL call to pcre_fullinfo() to
+ find out if a compiled pattern can be used for partial matching.
+
+
+EXAMPLE OF PARTIAL MATCHING USING PCRETEST
+
+ If the escape sequence \P is present in a pcretest data line, the
+ PCRE_PARTIAL flag is used for the match. Here is a run of pcretest that
+ uses the date example quoted above:
+
+ re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
+ data> 25jun04\P
+ 0: 25jun04
+ 1: jun
+ data> 25dec3\P
+ Partial match
+ data> 3ju\P
+ Partial match
+ data> 3juj\P
+ No match
+ data> j\P
+ No match
+
+ The first data string is matched completely, so pcretest shows the
+ matched substrings. The remaining four strings do not match the com-
+ plete pattern, but the first two are partial matches. The same test,
+ using pcre_dfa_exec() matching (by means of the \D escape sequence),
+ produces the following output:
+
+ re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
+ data> 25jun04\P\D
+ 0: 25jun04
+ data> 23dec3\P\D
+ Partial match: 23dec3
+ data> 3ju\P\D
+ Partial match: 3ju
+ data> 3juj\P\D
+ No match
+ data> j\P\D
+ No match
+
+ Notice that in this case the portion of the string that was matched is
+ made available.
+
+
+MULTI-SEGMENT MATCHING WITH pcre_dfa_exec()
+
+ When a partial match has been found using pcre_dfa_exec(), it is possi-
+ ble to continue the match by providing additional subject data and
+ calling pcre_dfa_exec() again with the same compiled regular expres-
+ sion, this time setting the PCRE_DFA_RESTART option. You must also pass
+ the same working space as before, because this is where details of the
+ previous partial match are stored. Here is an example using pcretest,
+ using the \R escape sequence to set the PCRE_DFA_RESTART option (\P and
+ \D are as above):
+
+ re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
+ data> 23ja\P\D
+ Partial match: 23ja
+ data> n05\R\D
+ 0: n05
+
+ The first call has "23ja" as the subject, and requests partial match-
+ ing; the second call has "n05" as the subject for the continued
+ (restarted) match. Notice that when the match is complete, only the
+ last part is shown; PCRE does not retain the previously partially-
+ matched string. It is up to the calling program to do that if it needs
+ to.
+
+ You can set PCRE_PARTIAL with PCRE_DFA_RESTART to continue partial
+ matching over multiple segments. This facility can be used to pass very
+ long subject strings to pcre_dfa_exec(). However, some care is needed
+ for certain types of pattern.
+
+ 1. If the pattern contains tests for the beginning or end of a line,
+ you need to pass the PCRE_NOTBOL or PCRE_NOTEOL options, as appropri-
+ ate, when the subject string for any call does not contain the begin-
+ ning or end of a line.
+
+ 2. If the pattern contains backward assertions (including \b or \B),
+ you need to arrange for some overlap in the subject strings to allow
+ for this. For example, you could pass the subject in chunks that are
+ 500 bytes long, but in a buffer of 700 bytes, with the starting offset
+ set to 200 and the previous 200 bytes at the start of the buffer.
+
+ 3. Matching a subject string that is split into multiple segments does
+ not always produce exactly the same result as matching over one single
+ long string. The difference arises when there are multiple matching
+ possibilities, because a partial match result is given only when there
+ are no completed matches in a call to pcre_dfa_exec(). This means that
+ as soon as the shortest match has been found, continuation to a new
+ subject segment is no longer possible. Consider this pcretest example:
+
+ re> /dog(sbody)?/
+ data> do\P\D
+ Partial match: do
+ data> gsb\R\P\D
+ 0: g
+ data> dogsbody\D
+ 0: dogsbody
+ 1: dog
+
+ The pattern matches the words "dog" or "dogsbody". When the subject is
+ presented in several parts ("do" and "gsb" being the first two) the
+ match stops when "dog" has been found, and it is not possible to con-
+ tinue. On the other hand, if "dogsbody" is presented as a single
+ string, both matches are found.
+
+ Because of this phenomenon, it does not usually make sense to end a
+ pattern that is going to be matched in this way with a variable repeat.
+
+ 4. Patterns that contain alternatives at the top level which do not all
+ start with the same pattern item may not work as expected. For example,
+ consider this pattern:
+
+ 1234|3789
+
+ If the first part of the subject is "ABC123", a partial match of the
+ first alternative is found at offset 3. There is no partial match for
+ the second alternative, because such a match does not start at the same
+ point in the subject string. Attempting to continue with the string
+ "789" does not yield a match because only those alternatives that match
+ at one point in the subject are remembered. The problem arises because
+ the start of the second alternative matches within the first alterna-
+ tive. There is no problem with anchored patterns or patterns such as:
+
+ 1234|ABCD
+
+ where no string can be a partial match for both alternatives.
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 04 June 2007
+ Copyright (c) 1997-2007 University of Cambridge.
+------------------------------------------------------------------------------
+
+
+PCREPRECOMPILE(3) PCREPRECOMPILE(3)
+
+
+NAME
+ PCRE - Perl-compatible regular expressions
+
+
+SAVING AND RE-USING PRECOMPILED PCRE PATTERNS
+
+ If you are running an application that uses a large number of regular
+ expression patterns, it may be useful to store them in a precompiled
+ form instead of having to compile them every time the application is
+ run. If you are not using any private character tables (see the
+ pcre_maketables() documentation), this is relatively straightforward.
+ If you are using private tables, it is a little bit more complicated.
+
+ If you save compiled patterns to a file, you can copy them to a differ-
+ ent host and run them there. This works even if the new host has the
+ opposite endianness to the one on which the patterns were compiled.
+ There may be a small performance penalty, but it should be insignifi-
+ cant. However, compiling regular expressions with one version of PCRE
+ for use with a different version is not guaranteed to work and may
+ cause crashes.
+
+
+SAVING A COMPILED PATTERN
+
+ The value returned by pcre_compile() points to a single block of memory
+ that holds the compiled pattern and associated data. You can find the
+ length of this block in bytes by calling pcre_fullinfo() with an argu-
+ ment of PCRE_INFO_SIZE. You can then save the data in any appropriate
+ manner. Here is sample code that compiles a pattern and writes it to a
+ file. It assumes that the variable fd refers to a file that is open for
+ output:
+
+ int erroroffset, rc, size;
+ char *error;
+ pcre *re;
+
+ re = pcre_compile("my pattern", 0, &error, &erroroffset, NULL);
+ if (re == NULL) { ... handle errors ... }
+ rc = pcre_fullinfo(re, NULL, PCRE_INFO_SIZE, &size);
+ if (rc < 0) { ... handle errors ... }
+ rc = fwrite(re, 1, size, fd);
+ if (rc != size) { ... handle errors ... }
+
+ In this example, the bytes that comprise the compiled pattern are
+ copied exactly. Note that this is binary data that may contain any of
+ the 256 possible byte values. On systems that make a distinction
+ between binary and non-binary data, be sure that the file is opened for
+ binary output.
+
+ If you want to write more than one pattern to a file, you will have to
+ devise a way of separating them. For binary data, preceding each pat-
+ tern with its length is probably the most straightforward approach.
+ Another possibility is to write out the data in hexadecimal instead of
+ binary, one pattern to a line.
+
+ Saving compiled patterns in a file is only one possible way of storing
+ them for later use. They could equally well be saved in a database, or
+ in the memory of some daemon process that passes them via sockets to
+ the processes that want them.
+
+ If the pattern has been studied, it is also possible to save the study
+ data in a similar way to the compiled pattern itself. When studying
+ generates additional information, pcre_study() returns a pointer to a
+ pcre_extra data block. Its format is defined in the section on matching
+ a pattern in the pcreapi documentation. The study_data field points to
+ the binary study data, and this is what you must save (not the
+ pcre_extra block itself). The length of the study data can be obtained
+ by calling pcre_fullinfo() with an argument of PCRE_INFO_STUDYSIZE.
+ Remember to check that pcre_study() did return a non-NULL value before
+ trying to save the study data.
+
+
+RE-USING A PRECOMPILED PATTERN
+
+ Re-using a precompiled pattern is straightforward. Having reloaded it
+ into main memory, you pass its pointer to pcre_exec() or
+ pcre_dfa_exec() in the usual way. This should work even on another
+ host, and even if that host has the opposite endianness to the one
+ where the pattern was compiled.
+
+ However, if you passed a pointer to custom character tables when the
+ pattern was compiled (the tableptr argument of pcre_compile()), you
+ must now pass a similar pointer to pcre_exec() or pcre_dfa_exec(),
+ because the value saved with the compiled pattern will obviously be
+ nonsense. A field in a pcre_extra block is used to pass this data, as
+ described in the section on matching a pattern in the pcreapi documen-
+ tation.
+
+ If you did not provide custom character tables when the pattern was
+ compiled, the pointer in the compiled pattern is NULL, which causes
+ pcre_exec() to use PCRE's internal tables. Thus, you do not need to
+ take any special action at run time in this case.
+
+ If you saved study data with the compiled pattern, you need to create
+ your own pcre_extra data block and set the study_data field to point to
+ the reloaded study data. You must also set the PCRE_EXTRA_STUDY_DATA
+ bit in the flags field to indicate that study data is present. Then
+ pass the pcre_extra block to pcre_exec() or pcre_dfa_exec() in the
+ usual way.
+
+
+COMPATIBILITY WITH DIFFERENT PCRE RELEASES
+
+ In general, it is safest to recompile all saved patterns when you
+ update to a new PCRE release, though not all updates actually require
+ this. Recompiling is definitely needed for release 7.2.
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 13 June 2007
+ Copyright (c) 1997-2007 University of Cambridge.
+------------------------------------------------------------------------------
+
+
+PCREPERFORM(3) PCREPERFORM(3)
+
+
+NAME
+ PCRE - Perl-compatible regular expressions
+
+
+PCRE PERFORMANCE
+
+ Two aspects of performance are discussed below: memory usage and pro-
+ cessing time. The way you express your pattern as a regular expression
+ can affect both of them.
+
+
+MEMORY USAGE
+
+ Patterns are compiled by PCRE into a reasonably efficient byte code, so
+ that most simple patterns do not use much memory. However, there is one
+ case where memory usage can be unexpectedly large. When a parenthesized
+ subpattern has a quantifier with a minimum greater than 1 and/or a lim-
+ ited maximum, the whole subpattern is repeated in the compiled code.
+ For example, the pattern
+
+ (abc|def){2,4}
+
+ is compiled as if it were
+
+ (abc|def)(abc|def)((abc|def)(abc|def)?)?
+
+ (Technical aside: It is done this way so that backtrack points within
+ each of the repetitions can be independently maintained.)
+
+ For regular expressions whose quantifiers use only small numbers, this
+ is not usually a problem. However, if the numbers are large, and par-
+ ticularly if such repetitions are nested, the memory usage can become
+ an embarrassment. For example, the very simple pattern
+
+ ((ab){1,1000}c){1,3}
+
+ uses 51K bytes when compiled. When PCRE is compiled with its default
+ internal pointer size of two bytes, the size limit on a compiled pat-
+ tern is 64K, and this is reached with the above pattern if the outer
+ repetition is increased from 3 to 4. PCRE can be compiled to use larger
+ internal pointers and thus handle larger compiled patterns, but it is
+ better to try to rewrite your pattern to use less memory if you can.
+
+ One way of reducing the memory usage for such patterns is to make use
+ of PCRE's "subroutine" facility. Re-writing the above pattern as
+
+ ((ab)(?2){0,999}c)(?1){0,2}
+
+ reduces the memory requirements to 18K, and indeed it remains under 20K
+ even with the outer repetition increased to 100. However, this pattern
+ is not exactly equivalent, because the "subroutine" calls are treated
+ as atomic groups into which there can be no backtracking if there is a
+ subsequent matching failure. Therefore, PCRE cannot do this kind of
+ rewriting automatically. Furthermore, there is a noticeable loss of
+ speed when executing the modified pattern. Nevertheless, if the atomic
+ grouping is not a problem and the loss of speed is acceptable, this
+ kind of rewriting will allow you to process patterns that PCRE cannot
+ otherwise handle.
+
+
+PROCESSING TIME
+
+ Certain items in regular expression patterns are processed more effi-
+ ciently than others. It is more efficient to use a character class like
+ [aeiou] than a set of single-character alternatives such as
+ (a|e|i|o|u). In general, the simplest construction that provides the
+ required behaviour is usually the most efficient. Jeffrey Friedl's book
+ contains a lot of useful general discussion about optimizing regular
+ expressions for efficient performance. This document contains a few
+ observations about PCRE.
+
+ Using Unicode character properties (the \p, \P, and \X escapes) is
+ slow, because PCRE has to scan a structure that contains data for over
+ fifteen thousand characters whenever it needs a character's property.
+ If you can find an alternative pattern that does not use character
+ properties, it will probably be faster.
+
+ When a pattern begins with .* not in parentheses, or in parentheses
+ that are not the subject of a backreference, and the PCRE_DOTALL option
+ is set, the pattern is implicitly anchored by PCRE, since it can match
+ only at the start of a subject string. However, if PCRE_DOTALL is not
+ set, PCRE cannot make this optimization, because the . metacharacter
+ does not then match a newline, and if the subject string contains new-
+ lines, the pattern may match from the character immediately following
+ one of them instead of from the very start. For example, the pattern
+
+ .*second
+
+ matches the subject "first\nand second" (where \n stands for a newline
+ character), with the match starting at the seventh character. In order
+ to do this, PCRE has to retry the match starting after every newline in
+ the subject.
+
+ If you are using such a pattern with subject strings that do not con-
+ tain newlines, the best performance is obtained by setting PCRE_DOTALL,
+ or starting the pattern with ^.* or ^.*? to indicate explicit anchor-
+ ing. That saves PCRE from having to scan along the subject looking for
+ a newline to restart at.
+
+ Beware of patterns that contain nested indefinite repeats. These can
+ take a long time to run when applied to a string that does not match.
+ Consider the pattern fragment
+
+ ^(a+)*
+
+ This can match "aaaa" in 16 different ways, and this number increases
+ very rapidly as the string gets longer. (The * repeat can match 0, 1,
+ 2, 3, or 4 times, and for each of those cases other than 0 or 4, the +
+ repeats can match different numbers of times.) When the remainder of
+ the pattern is such that the entire match is going to fail, PCRE has in
+ principle to try every possible variation, and this can take an
+ extremely long time, even for relatively short strings.
+
+ An optimization catches some of the more simple cases such as
+
+ (a+)*b
+
+ where a literal character follows. Before embarking on the standard
+ matching procedure, PCRE checks that there is a "b" later in the sub-
+ ject string, and if there is not, it fails the match immediately. How-
+ ever, when there is no following literal this optimization cannot be
+ used. You can see the difference by comparing the behaviour of
+
+ (a+)*\d
+
+ with the pattern above. The earlier pattern, (a+)*b, fails almost instantly
+ when applied to a whole line of "a" characters, whereas (a+)*\d takes an
+ appreciable time with strings longer than about 20 characters.
+
+ In many cases, the solution to this kind of performance issue is to use
+ an atomic group or a possessive quantifier.
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 06 March 2007
+ Copyright (c) 1997-2007 University of Cambridge.
+------------------------------------------------------------------------------
+
+
+PCREPOSIX(3) PCREPOSIX(3)
+
+
+NAME
+ PCRE - Perl-compatible regular expressions.
+
+
+SYNOPSIS OF POSIX API
+
+ #include <pcreposix.h>
+
+ int regcomp(regex_t *preg, const char *pattern,
+ int cflags);
+
+ int regexec(regex_t *preg, const char *string,
+ size_t nmatch, regmatch_t pmatch[], int eflags);
+
+ size_t regerror(int errcode, const regex_t *preg,
+ char *errbuf, size_t errbuf_size);
+
+ void regfree(regex_t *preg);
+
+
+DESCRIPTION
+
+ This set of functions provides a POSIX-style API to the PCRE regular
+ expression package. See the pcreapi documentation for a description of
+ PCRE's native API, which contains much additional functionality.
+
+ The functions described here are just wrapper functions that ultimately
+ call the PCRE native API. Their prototypes are defined in the
+ pcreposix.h header file, and on Unix systems the library itself is
+ called pcreposix.a, so can be accessed by adding -lpcreposix to the
+ command for linking an application that uses them. Because the POSIX
+ functions call the native ones, it is also necessary to add -lpcre.
+
+ I have implemented only those option bits that can be reasonably mapped
+ to PCRE native options. In addition, the option REG_EXTENDED is defined
+ with the value zero. This has no effect, but since programs that are
+ written to the POSIX interface often use it, this makes it easier to
+ slot in PCRE as a replacement library. Other POSIX options are not even
+ defined.
+
+ When PCRE is called via these functions, it is only the API that is
+ POSIX-like in style. The syntax and semantics of the regular expres-
+ sions themselves are still those of Perl, subject to the setting of
+ various PCRE options, as described below. "POSIX-like in style" means
+ that the API approximates to the POSIX definition; it is not fully
+ POSIX-compatible, and in multi-byte encoding domains it is probably
+ even less compatible.
+
+ The header for these functions is supplied as pcreposix.h to avoid any
+ potential clash with other POSIX libraries. It can, of course, be
+ renamed or aliased as regex.h, which is the "correct" name. It provides
+ two structure types, regex_t for compiled internal forms, and reg-
+ match_t for returning captured substrings. It also defines some con-
+ stants whose names start with "REG_"; these are used for setting
+ options and identifying error codes.
+
+
+COMPILING A PATTERN
+
+ The function regcomp() is called to compile a pattern into an internal
+ form. The pattern is a C string terminated by a binary zero, and is
+ passed in the argument pattern. The preg argument is a pointer to a
+ regex_t structure that is used as a base for storing information about
+ the compiled regular expression.
+
+ The argument cflags is either zero, or contains one or more of the bits
+ defined by the following macros:
+
+ REG_DOTALL
+
+ The PCRE_DOTALL option is set when the regular expression is passed for
+ compilation to the native function. Note that REG_DOTALL is not part of
+ the POSIX standard.
+
+ REG_ICASE
+
+ The PCRE_CASELESS option is set when the regular expression is passed
+ for compilation to the native function.
+
+ REG_NEWLINE
+
+ The PCRE_MULTILINE option is set when the regular expression is passed
+ for compilation to the native function. Note that this does not mimic
+ the defined POSIX behaviour for REG_NEWLINE (see the following sec-
+ tion).
+
+ REG_NOSUB
+
+ The PCRE_NO_AUTO_CAPTURE option is set when the regular expression is
+ passed for compilation to the native function. In addition, when a pat-
+ tern that is compiled with this flag is passed to regexec() for match-
+ ing, the nmatch and pmatch arguments are ignored, and no captured
+ strings are returned.
+
+ REG_UTF8
+
+ The PCRE_UTF8 option is set when the regular expression is passed for
+ compilation to the native function. This causes the pattern itself and
+ all data strings used for matching it to be treated as UTF-8 strings.
+ Note that REG_UTF8 is not part of the POSIX standard.
+
+ In the absence of these flags, no options are passed to the native
+ function. This means that the regex is compiled with PCRE default
+ semantics. In particular, the way it handles newline characters in the
+ subject string is the Perl way, not the POSIX way. Note that setting
+ PCRE_MULTILINE has only some of the effects specified for REG_NEWLINE.
+ It does not affect the way newlines are matched by . (they aren't) or
+ by a negative class such as [^a] (they are).
+
+ The yield of regcomp() is zero on success, and non-zero otherwise. The
+ preg structure is filled in on success, and one member of the structure
+ is public: re_nsub contains the number of capturing subpatterns in the
+ regular expression. Various error codes are defined in the header file.
+
+
+MATCHING NEWLINE CHARACTERS
+
+ This area is not simple, because POSIX and Perl take different views of
+ things. It is not possible to get PCRE to obey POSIX semantics, but
+ then PCRE was never intended to be a POSIX engine. The following table
+ lists the different possibilities for matching newline characters in
+ PCRE:
+
+ Default Change with
+
+ . matches newline no PCRE_DOTALL
+ newline matches [^a] yes not changeable
+ $ matches \n at end yes PCRE_DOLLAR_ENDONLY
+ $ matches \n in middle no PCRE_MULTILINE
+ ^ matches \n in middle no PCRE_MULTILINE
+
+ This is the equivalent table for POSIX:
+
+ Default Change with
+
+ . matches newline yes REG_NEWLINE
+ newline matches [^a] yes REG_NEWLINE
+ $ matches \n at end no REG_NEWLINE
+ $ matches \n in middle no REG_NEWLINE
+ ^ matches \n in middle no REG_NEWLINE
+
+ PCRE's behaviour is the same as Perl's, except that there is no equiva-
+ lent for PCRE_DOLLAR_ENDONLY in Perl. In both PCRE and Perl, there is
+ no way to stop newline from matching [^a].
+
+ The default POSIX newline handling can be obtained by setting
+ PCRE_DOTALL and PCRE_DOLLAR_ENDONLY, but there is no way to make PCRE
+ behave exactly as for the REG_NEWLINE action.
+
+
+MATCHING A PATTERN
+
+ The function regexec() is called to match a compiled pattern preg
+ against a given string, which is by default terminated by a zero byte
+ (but see REG_STARTEND below), subject to the options in eflags. These
+ can be:
+
+ REG_NOTBOL
+
+ The PCRE_NOTBOL option is set when calling the underlying PCRE matching
+ function.
+
+ REG_NOTEOL
+
+ The PCRE_NOTEOL option is set when calling the underlying PCRE matching
+ function.
+
+ REG_STARTEND
+
+ The string is considered to start at string + pmatch[0].rm_so and to
+ have a terminating NUL located at string + pmatch[0].rm_eo (there need
+ not actually be a NUL at that location), regardless of the value of
+ nmatch. This is a BSD extension, compatible with but not specified by
+ IEEE Standard 1003.2 (POSIX.2), and should be used with caution in
+ software intended to be portable to other systems. Note that a non-zero
+ rm_so does not imply REG_NOTBOL; REG_STARTEND affects only the location
+ of the string, not how it is matched.
+
+ If the pattern was compiled with the REG_NOSUB flag, no data about any
+ matched strings is returned. The nmatch and pmatch arguments of
+ regexec() are ignored.
+
+ Otherwise, the portion of the string that was matched, and also any cap-
+ tured substrings, are returned via the pmatch argument, which points to
+ an array of nmatch structures of type regmatch_t, containing the mem-
+ bers rm_so and rm_eo. These contain the offset to the first character
+ of each substring and the offset to the first character after the end
+ of each substring, respectively. The 0th element of the vector relates
+ to the entire portion of string that was matched; subsequent elements
+ relate to the capturing subpatterns of the regular expression. Unused
+ entries in the array have both structure members set to -1.
+
+ A successful match yields a zero return; various error codes are
+ defined in the header file, of which REG_NOMATCH is the "expected"
+ failure code.
+
+
+ERROR MESSAGES
+
+ The regerror() function maps a non-zero errorcode from either regcomp()
+ or regexec() to a printable message. If preg is not NULL, the error
+ should have arisen from the use of that structure. A message terminated
+ by a binary zero is placed in errbuf. The length of the message,
+ including the zero, is limited to errbuf_size. The yield of the func-
+ tion is the size of buffer needed to hold the whole message.
+
+
+MEMORY USAGE
+
+ Compiling a regular expression causes memory to be allocated and asso-
+ ciated with the preg structure. The function regfree() frees all such
+ memory, after which preg may no longer be used as a compiled expres-
+ sion.
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 05 April 2008
+ Copyright (c) 1997-2008 University of Cambridge.
+------------------------------------------------------------------------------
+
+
+PCRECPP(3) PCRECPP(3)
+
+
+NAME
+ PCRE - Perl-compatible regular expressions.
+
+
+SYNOPSIS OF C++ WRAPPER
+
+ #include <pcrecpp.h>
+
+
+DESCRIPTION
+
+ The C++ wrapper for PCRE was provided by Google Inc. Some additional
+ functionality was added by Giuseppe Maxia. This brief man page was con-
+ structed from the notes in the pcrecpp.h file, which should be con-
+ sulted for further details.
+
+
+MATCHING INTERFACE
+
+ The "FullMatch" operation checks that supplied text matches a supplied
+ pattern exactly. If pointer arguments are supplied, it copies matched
+ sub-strings that match sub-patterns into them.
+
+ Example: successful match
+ pcrecpp::RE re("h.*o");
+ re.FullMatch("hello");
+
+ Example: unsuccessful match (requires full match):
+ pcrecpp::RE re("e");
+ !re.FullMatch("hello");
+
+ Example: creating a temporary RE object:
+ pcrecpp::RE("h.*o").FullMatch("hello");
+
+ You can pass in a "const char*" or a "string" for "text". The examples
+ below tend to use a const char*. You can, as in the different examples
+ above, store the RE object explicitly in a variable or use a temporary
+ RE object. The examples below use one mode or the other arbitrarily.
+ Either could correctly be used for any of these examples.
+
+ You must supply extra pointer arguments to extract matched subpieces.
+
+ Example: extracts "ruby" into "s" and 1234 into "i"
+ int i;
+ string s;
+ pcrecpp::RE re("(\\w+):(\\d+)");
+ re.FullMatch("ruby:1234", &s, &i);
+
+ Example: does not try to extract any extra sub-patterns
+ re.FullMatch("ruby:1234", &s);
+
+ Example: does not try to extract into NULL
+ re.FullMatch("ruby:1234", NULL, &i);
+
+ Example: integer overflow causes failure
+ !re.FullMatch("ruby:1234567891234", NULL, &i);
+
+ Example: fails because there aren't enough sub-patterns:
+ !pcrecpp::RE("\\w+:\\d+").FullMatch("ruby:1234", &s);
+
+ Example: fails because string cannot be stored in integer
+ !pcrecpp::RE("(.*)").FullMatch("ruby", &i);
+
+ The provided pointer arguments can be pointers to any scalar numeric
+ type, or one of:
+
+ string (matched piece is copied to string)
+ StringPiece (StringPiece is mutated to point to matched piece)
+ T (where "bool T::ParseFrom(const char*, int)" exists)
+ NULL (the corresponding matched sub-pattern is not copied)
+
+ The function returns true iff all of the following conditions are sat-
+ isfied:
+
+ a. "text" matches "pattern" exactly;
+
+ b. The number of matched sub-patterns is >= number of supplied
+ pointers;
+
+ c. The "i"th argument has a suitable type for holding the
+ string captured as the "i"th sub-pattern. If you pass in
+ void * NULL for the "i"th argument, or a non-void * NULL
+ of the correct type, or pass fewer arguments than the
+ number of sub-patterns, "i"th captured sub-pattern is
+ ignored.
+
+ CAVEAT: An optional sub-pattern that does not exist in the matched
+ string is assigned the empty string. Therefore, the following will
+ return false (because the empty string is not a valid number):
+
+ int number;
+ pcrecpp::RE::FullMatch("abc", "[a-z]+(\\d+)?", &number);
+
+ The matching interface supports at most 16 arguments per call. If you
+ need more, consider using the more general interface
+ pcrecpp::RE::DoMatch. See pcrecpp.h for the signature for DoMatch.
+
+
+QUOTING METACHARACTERS
+
+ You can use the "QuoteMeta" operation to insert backslashes before all
+ potentially meaningful characters in a string. The returned string,
+ used as a regular expression, will exactly match the original string.
+
+ Example:
+ string quoted = RE::QuoteMeta(unquoted);
+
+ Note that it's legal to escape a character even if it has no special
+ meaning in a regular expression -- so this function does that. (This
+ also makes it identical to the perl function of the same name; see
+ "perldoc -f quotemeta".) For example, "1.5-2.0?" becomes
+ "1\.5\-2\.0\?".
+
+
+PARTIAL MATCHES
+
+ You can use the "PartialMatch" operation when you want the pattern to
+ match any substring of the text.
+
+ Example: simple search for a string:
+ pcrecpp::RE("ell").PartialMatch("hello");
+
+ Example: find first number in a string:
+ int number;
+ pcrecpp::RE re("(\\d+)");
+ re.PartialMatch("x*100 + 20", &number);
+ assert(number == 100);
+
+
+UTF-8 AND THE MATCHING INTERFACE
+
+ By default, pattern and text are plain text, one byte per character.
+ The UTF8 flag, passed to the constructor, causes both pattern and
+ string to be treated as UTF-8 text, still a byte stream but potentially
+ multiple bytes per character. In practice, the text is likelier to be
+ UTF-8 than the pattern, but the match returned may depend on the UTF8
+ flag, so always use it when matching UTF8 text. For example, "." will
+ match one byte normally but with UTF8 set may match up to three bytes
+ of a multi-byte character.
+
+ Example:
+ pcrecpp::RE_Options options;
+ options.set_utf8();
+ pcrecpp::RE re(utf8_pattern, options);
+ re.FullMatch(utf8_string);
+
+ Example: using the convenience function UTF8():
+ pcrecpp::RE re(utf8_pattern, pcrecpp::UTF8());
+ re.FullMatch(utf8_string);
+
+ NOTE: The UTF8 flag is ignored if pcre was not configured with the
+ --enable-utf8 flag.
+
+
+PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE
+
+ PCRE defines some modifiers to change the behavior of the regular
+ expression engine. The C++ wrapper defines an auxiliary class,
+ RE_Options, as a vehicle to pass such modifiers to a RE class. Cur-
+ rently, the following modifiers are supported:
+
+ modifier description Perl corresponding
+
+ PCRE_CASELESS case insensitive match /i
+ PCRE_MULTILINE multiple lines match /m
+ PCRE_DOTALL dot matches newlines /s
+ PCRE_DOLLAR_ENDONLY $ matches only at end N/A
+ PCRE_EXTRA strict escape parsing N/A
+ PCRE_EXTENDED ignore whitespaces /x
+ PCRE_UTF8 handles UTF8 chars built-in
+ PCRE_UNGREEDY reverses * and *? N/A
+ PCRE_NO_AUTO_CAPTURE disables capturing parens N/A (*)
+
+ (*) Both Perl and PCRE allow non-capturing parentheses by means of the
+ "?:" modifier within the pattern itself. e.g. (?:ab|cd) does not cap-
+ ture, while (ab|cd) does.
+
+ For a full account of how each modifier works, please check the PCRE
+ API reference page.
+
+ For each modifier, there are two member functions whose names are made
+ out of the modifier in lowercase, without the "PCRE_" prefix. For
+ instance, PCRE_CASELESS is handled by
+
+ bool caseless()
+
+ which returns true if the modifier is set, and
+
+ RE_Options & set_caseless(bool)
+
+ which sets or unsets the modifier. Moreover, PCRE_EXTRA_MATCH_LIMIT can
+ be accessed through the set_match_limit() and match_limit() member
+ functions. Setting match_limit to a non-zero value will limit the exe-
+ cution of pcre to keep it from doing bad things like blowing the stack
+ or taking an eternity to return a result. A value of 5000 is good
+ enough to stop stack blowup in a 2MB thread stack. Setting match_limit
+ to zero disables match limiting. Alternatively, you can call
+ match_limit_recursion() which uses PCRE_EXTRA_MATCH_LIMIT_RECURSION to
+ limit how much PCRE recurses. match_limit() limits the number of
+ matches PCRE does; match_limit_recursion() limits the depth of internal
+ recursion, and therefore the amount of stack that is used.
+
+ Normally, to pass one or more modifiers to a RE class, you declare a
+ RE_Options object, set the appropriate options, and pass this object to
+ a RE constructor. Example:
+
+ RE_Options opt;
+ opt.set_caseless(true);
+ if (RE("HELLO", opt).PartialMatch("hello world")) ...
+
+ RE_Options has two constructors. The default constructor takes no argu-
+ ments and creates a set of flags that are off by default. The optional
+ parameter option_flags is to facilitate transfer of legacy code from C
+ programs. This lets you do
+
+ RE(pattern,
+ RE_Options(PCRE_CASELESS|PCRE_MULTILINE)).PartialMatch(str);
+
+ However, new code is better off doing
+
+ RE(pattern,
+ RE_Options().set_caseless(true).set_multiline(true))
+ .PartialMatch(str);
+
+ If you are going to pass one of the most used modifiers, there are some
+ convenience functions that return a RE_Options class with the appropri-
+ ate modifier already set: CASELESS(), UTF8(), MULTILINE(), DOTALL(),
+ and EXTENDED().
+
+ If you need to set several options at once, and you don't want to go
+ through the pains of declaring a RE_Options object and setting several
+ options, there is a parallel method that gives you such ability on the
+ fly. You can concatenate several set_xxxxx() member functions, since
+ each of them returns a reference to its class object. For example, to
+ pass PCRE_CASELESS, PCRE_EXTENDED, and PCRE_MULTILINE to a RE with one
+ statement, you may write:
+
+ RE(" ^ xyz \\s+ .* blah$",
+ RE_Options()
+ .set_caseless(true)
+ .set_extended(true)
+ .set_multiline(true)).PartialMatch(sometext);
+
+
+SCANNING TEXT INCREMENTALLY
+
+ The "Consume" operation may be useful if you want to repeatedly match
+ regular expressions at the front of a string and skip over them as they
+ match. This requires use of the "StringPiece" type, which represents a
+ sub-range of a real string. Like RE, StringPiece is defined in the
+ pcrecpp namespace.
+
+ Example: read lines of the form "var = value" from a string.
+ string contents = ...; // Fill string somehow
+ pcrecpp::StringPiece input(contents); // Wrap in a StringPiece
+
+ string var;
+ int value;
+ pcrecpp::RE re("(\\w+) = (\\d+)\n");
+ while (re.Consume(&input, &var, &value)) {
+ ...;
+ }
+
+ Each successful call to "Consume" will set "var/value", and also
+ advance "input" so it points past the matched text.
+
+ The "FindAndConsume" operation is similar to "Consume" but does not
+ anchor your match at the beginning of the string. For example, you
+ could extract all words from a string by repeatedly calling
+
+ pcrecpp::RE("(\\w+)").FindAndConsume(&input, &word)
+
+
+PARSING HEX/OCTAL/C-RADIX NUMBERS
+
+ By default, if you pass a pointer to a numeric value, the corresponding
+ text is interpreted as a base-10 number. You can instead wrap the
+ pointer with a call to one of the operators Hex(), Octal(), or CRadix()
+ to interpret the text in another base. The CRadix operator interprets
+ C-style "0" (base-8) and "0x" (base-16) prefixes, but defaults to
+ base-10.
+
+ Example:
+ int a, b, c, d;
+ pcrecpp::RE re("(.*) (.*) (.*) (.*)");
+ re.FullMatch("100 40 0100 0x40",
+ pcrecpp::Octal(&a), pcrecpp::Hex(&b),
+ pcrecpp::CRadix(&c), pcrecpp::CRadix(&d));
+
+ will leave 64 in a, b, c, and d.
+
+
+REPLACING PARTS OF STRINGS
+
+ You can replace the first match of "pattern" in "str" with "rewrite".
+ Within "rewrite", backslash-escaped digits (\1 to \9) can be used to
+ insert text matching corresponding parenthesized group from the pat-
+ tern. \0 in "rewrite" refers to the entire matching text. For example:
+
+ string s = "yabba dabba doo";
+ pcrecpp::RE("b+").Replace("d", &s);
+
+ will leave "s" containing "yada dabba doo". The result is true if the
+ pattern matches and a replacement occurs, false otherwise.
+
+ GlobalReplace is like Replace except that it replaces all occurrences
+ of the pattern in the string with the rewrite. Replacements are not
+ subject to re-matching. For example:
+
+ string s = "yabba dabba doo";
+ pcrecpp::RE("b+").GlobalReplace("d", &s);
+
+ will leave "s" containing "yada dada doo". It returns the number of
+ replacements made.
+
+ Extract is like Replace, except that if the pattern matches, "rewrite"
+ is copied into "out" (an additional argument) with substitutions. The
+ non-matching portions of "text" are ignored. Returns true iff a match
+ occurred and the extraction happened successfully; if no match occurs,
+ the string is left unaffected.
+
+
+AUTHOR
+
+ The C++ wrapper was contributed by Google Inc.
+ Copyright (c) 2007 Google Inc.
+
+
+REVISION
+
+ Last updated: 12 November 2007
+------------------------------------------------------------------------------
+
+
+PCRESAMPLE(3) PCRESAMPLE(3)
+
+
+NAME
+ PCRE - Perl-compatible regular expressions
+
+
+PCRE SAMPLE PROGRAM
+
+ A simple, complete demonstration program, to get you started with using
+ PCRE, is supplied in the file pcredemo.c in the PCRE distribution.
+
+ The program compiles the regular expression that is its first argument,
+ and matches it against the subject string in its second argument. No
+ PCRE options are set, and default character tables are used. If match-
+ ing succeeds, the program outputs the portion of the subject that
+ matched, together with the contents of any captured substrings.
+
+ If the -g option is given on the command line, the program then goes on
+ to check for further matches of the same regular expression in the same
+ subject string. The logic is a little bit tricky because of the possi-
+ bility of matching an empty string. Comments in the code explain what
+ is going on.
+
+ If PCRE is installed in the standard include and library directories
+ for your system, you should be able to compile the demonstration pro-
+ gram using this command:
+
+ gcc -o pcredemo pcredemo.c -lpcre
+
+ If PCRE is installed elsewhere, you may need to add additional options
+ to the command line. For example, on a Unix-like system that has PCRE
+ installed in /usr/local, you can compile the demonstration program
+ using a command like this:
+
+ gcc -o pcredemo -I/usr/local/include pcredemo.c \
+ -L/usr/local/lib -lpcre
+
+ Once you have compiled the demonstration program, you can run simple
+ tests like this:
+
+ ./pcredemo 'cat|dog' 'the cat sat on the mat'
+ ./pcredemo -g 'cat|dog' 'the dog sat on the cat'
+
+ Note that there is a much more comprehensive test program, called
+ pcretest, which supports many more facilities for testing regular
+ expressions and the PCRE library. The pcredemo program is provided as a
+ simple coding example.
+
+ On some operating systems (e.g. Solaris), when PCRE is not installed in
+ the standard library directory, you may get an error like this when you
+ try to run pcredemo:
+
+ ld.so.1: a.out: fatal: libpcre.so.0: open failed: No such file or
+ directory
+
+ This is caused by the way shared library support works on those sys-
+ tems. You need to add
+
+ -R/usr/local/lib
+
+ (for example) to the compile command to get round this problem.
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 23 January 2008
+ Copyright (c) 1997-2008 University of Cambridge.
+------------------------------------------------------------------------------
+PCRESTACK(3) PCRESTACK(3)
+
+
+NAME
+ PCRE - Perl-compatible regular expressions
+
+
+PCRE DISCUSSION OF STACK USAGE
+
+ When you call pcre_exec(), it makes use of an internal function called
+ match(). This calls itself recursively at branch points in the pattern,
+ in order to remember the state of the match so that it can back up and
+ try a different alternative if the first one fails. As matching pro-
+ ceeds deeper and deeper into the tree of possibilities, the recursion
+ depth increases.
+
+ Not all calls of match() increase the recursion depth; for an item such
+ as a* it may be called several times at the same level, after matching
+ different numbers of a's. Furthermore, in a number of cases where the
+ result of the recursive call would immediately be passed back as the
+ result of the current call (a "tail recursion"), the function is just
+ restarted instead.
+
+ The pcre_dfa_exec() function operates in an entirely different way, and
+ hardly uses recursion at all. The limit on its complexity is the amount
+ of workspace it is given. The comments that follow do NOT apply to
+ pcre_dfa_exec(); they are relevant only for pcre_exec().
+
+ You can set limits on the number of times that match() is called, both
+ in total and recursively. If the limit is exceeded, an error occurs.
+ For details, see the section on extra data for pcre_exec() in the
+ pcreapi documentation.
+
+ Each time that match() is actually called recursively, it uses memory
+ from the process stack. For certain kinds of pattern and data, very
+ large amounts of stack may be needed, despite the recognition of "tail
+ recursion". You can often reduce the amount of recursion, and there-
+ fore the amount of stack used, by modifying the pattern that is being
+ matched. Consider, for example, this pattern:
+
+ ([^<]|<(?!inet))+
+
+ It matches from wherever it starts until it encounters "<inet" or the end of the data.
+.PP
+.SM
+.B pcre *pcre_compile(const char *\fIpattern\fP, int \fIoptions\fP,
+.ti +5n
+.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
+.ti +5n
+.B const unsigned char *\fItableptr\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This function compiles a regular expression into an internal form. It is the
+same as \fBpcre_compile2()\fP, except for the absence of the \fIerrorcodeptr\fP
+argument. Its arguments are:
+.sp
+ \fIpattern\fR A zero-terminated string containing the
+ regular expression to be compiled
+ \fIoptions\fR Zero or more option bits
+ \fIerrptr\fR Where to put an error message
+ \fIerroffset\fR Offset in pattern where error was found
+ \fItableptr\fR Pointer to character tables, or NULL to
+ use the built-in default
+.sp
+The option bits are:
+.sp
+ PCRE_ANCHORED Force pattern anchoring
+ PCRE_AUTO_CALLOUT Compile automatic callouts
+ PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF
+ PCRE_BSR_UNICODE \eR matches all Unicode line endings
+ PCRE_CASELESS Do caseless matching
+ PCRE_DOLLAR_ENDONLY $ not to match newline at end
+ PCRE_DOTALL . matches anything including NL
+ PCRE_DUPNAMES Allow duplicate names for subpatterns
+ PCRE_EXTENDED Ignore whitespace and # comments
+ PCRE_EXTRA PCRE extra features
+ (not much use currently)
+ PCRE_FIRSTLINE Force matching to be before newline
+ PCRE_JAVASCRIPT_COMPAT JavaScript compatibility
+ PCRE_MULTILINE ^ and $ match newlines within data
+ PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
+ PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline
+ sequences
+ PCRE_NEWLINE_CR Set CR as the newline sequence
+ PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
+ PCRE_NEWLINE_LF Set LF as the newline sequence
+ PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
+ theses (named ones available)
+ PCRE_UNGREEDY Invert greediness of quantifiers
+ PCRE_UTF8 Run in UTF-8 mode
+ PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
+ validity (only relevant if
+ PCRE_UTF8 is set)
+.sp
+PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
+PCRE_NO_UTF8_CHECK.
+.P
+The yield of the function is a pointer to a private data structure that
+contains the compiled pattern, or NULL if an error was detected. Note that
+compiling regular expressions with one version of PCRE for use with a different
+version is not guaranteed to work and may cause crashes.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fR
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fR
+.\"
+page.
diff --git a/src/doc/pcre_compile2.3 b/src/doc/pcre_compile2.3
new file mode 100644
index 0000000..1e71aff
--- /dev/null
+++ b/src/doc/pcre_compile2.3
@@ -0,0 +1,77 @@
+.TH PCRE_COMPILE2 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B pcre *pcre_compile2(const char *\fIpattern\fP, int \fIoptions\fP,
+.ti +5n
+.B int *\fIerrorcodeptr\fP,
+.ti +5n
+.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
+.ti +5n
+.B const unsigned char *\fItableptr\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This function compiles a regular expression into an internal form. It is the
+same as \fBpcre_compile()\fP, except for the addition of the \fIerrorcodeptr\fP
+argument. The arguments are:
+
+.sp
+ \fIpattern\fR A zero-terminated string containing the
+ regular expression to be compiled
+ \fIoptions\fR Zero or more option bits
+ \fIerrorcodeptr\fP Where to put an error code
+ \fIerrptr\fR Where to put an error message
+ \fIerroffset\fR Offset in pattern where error was found
+ \fItableptr\fR Pointer to character tables, or NULL to
+ use the built-in default
+.sp
+The option bits are:
+.sp
+ PCRE_ANCHORED Force pattern anchoring
+ PCRE_AUTO_CALLOUT Compile automatic callouts
+ PCRE_CASELESS Do caseless matching
+ PCRE_DOLLAR_ENDONLY $ not to match newline at end
+ PCRE_DOTALL . matches anything including NL
+ PCRE_DUPNAMES Allow duplicate names for subpatterns
+ PCRE_EXTENDED Ignore whitespace and # comments
+ PCRE_EXTRA PCRE extra features
+ (not much use currently)
+ PCRE_FIRSTLINE Force matching to be before newline
+ PCRE_MULTILINE ^ and $ match newlines within data
+ PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
+ PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
+ PCRE_NEWLINE_CR Set CR as the newline sequence
+ PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
+ PCRE_NEWLINE_LF Set LF as the newline sequence
+ PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
+ theses (named ones available)
+ PCRE_UNGREEDY Invert greediness of quantifiers
+ PCRE_UTF8 Run in UTF-8 mode
+ PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
+ validity (only relevant if
+ PCRE_UTF8 is set)
+.sp
+PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
+PCRE_NO_UTF8_CHECK.
+.P
+The yield of the function is a pointer to a private data structure that
+contains the compiled pattern, or NULL if an error was detected. Note that
+compiling regular expressions with one version of PCRE for use with a different
+version is not guaranteed to work and may cause crashes.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fR
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fR
+.\"
+page.
diff --git a/src/doc/pcre_config.3 b/src/doc/pcre_config.3
new file mode 100644
index 0000000..b111a70
--- /dev/null
+++ b/src/doc/pcre_config.3
@@ -0,0 +1,57 @@
+.TH PCRE_CONFIG 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B int pcre_config(int \fIwhat\fP, void *\fIwhere\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This function makes it possible for a client program to find out which optional
+features are available in the version of the PCRE library it is using. Its
+arguments are as follows:
+.sp
+ \fIwhat\fR A code specifying what information is required
+ \fIwhere\fR Points to where to put the data
+.sp
+The available codes are:
+.sp
+ PCRE_CONFIG_LINK_SIZE Internal link size: 2, 3, or 4
+ PCRE_CONFIG_MATCH_LIMIT Internal resource limit
+ PCRE_CONFIG_MATCH_LIMIT_RECURSION
+ Internal recursion depth limit
+ PCRE_CONFIG_NEWLINE Value of the default newline sequence:
+ 13 (0x000d) for CR
+ 10 (0x000a) for LF
+ 3338 (0x0d0a) for CRLF
+ -2 for ANYCRLF
+ -1 for ANY
+ PCRE_CONFIG_BSR Indicates what \eR matches by default:
+ 0 all Unicode line endings
+ 1 CR, LF, or CRLF only
+ PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
+ Threshold of return slots, above
+ which \fBmalloc()\fR is used by
+ the POSIX API
+ PCRE_CONFIG_STACKRECURSE Recursion implementation (1=stack 0=heap)
+ PCRE_CONFIG_UTF8 Availability of UTF-8 support (1=yes 0=no)
+ PCRE_CONFIG_UNICODE_PROPERTIES
+ Availability of Unicode property support
+ (1=yes 0=no)
+.sp
+The function yields 0 on success or PCRE_ERROR_BADOPTION otherwise.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fR
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fR
+.\"
+page.
diff --git a/src/doc/pcre_copy_named_substring.3 b/src/doc/pcre_copy_named_substring.3
new file mode 100644
index 0000000..9ad6826
--- /dev/null
+++ b/src/doc/pcre_copy_named_substring.3
@@ -0,0 +1,43 @@
+.TH PCRE_COPY_NAMED_SUBSTRING 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B int pcre_copy_named_substring(const pcre *\fIcode\fP,
+.ti +5n
+.B const char *\fIsubject\fP, int *\fIovector\fP,
+.ti +5n
+.B int \fIstringcount\fP, const char *\fIstringname\fP,
+.ti +5n
+.B char *\fIbuffer\fP, int \fIbuffersize\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This is a convenience function for extracting a captured substring, identified
+by name, into a given buffer. The arguments are:
+.sp
+ \fIcode\fP Pattern that was successfully matched
+ \fIsubject\fP Subject that has been successfully matched
+ \fIovector\fP Offset vector that \fBpcre_exec()\fP used
+ \fIstringcount\fP Value returned by \fBpcre_exec()\fP
+ \fIstringname\fP Name of the required substring
+ \fIbuffer\fP Buffer to receive the string
+ \fIbuffersize\fP Size of buffer
+.sp
+The yield is the length of the substring, PCRE_ERROR_NOMEMORY if the buffer was
+too small, or PCRE_ERROR_NOSUBSTRING if the string name is invalid.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/src/doc/pcre_copy_substring.3 b/src/doc/pcre_copy_substring.3
new file mode 100644
index 0000000..1910d18
--- /dev/null
+++ b/src/doc/pcre_copy_substring.3
@@ -0,0 +1,40 @@
+.TH PCRE_COPY_SUBSTRING 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B int pcre_copy_substring(const char *\fIsubject\fP, int *\fIovector\fP,
+.ti +5n
+.B int \fIstringcount\fP, int \fIstringnumber\fP, char *\fIbuffer\fP,
+.ti +5n
+.B int \fIbuffersize\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This is a convenience function for extracting a captured substring into a given
+buffer. The arguments are:
+.sp
+ \fIsubject\fP Subject that has been successfully matched
+ \fIovector\fP Offset vector that \fBpcre_exec()\fP used
+ \fIstringcount\fP Value returned by \fBpcre_exec()\fP
+ \fIstringnumber\fP Number of the required substring
+ \fIbuffer\fP Buffer to receive the string
+ \fIbuffersize\fP Size of buffer
+.sp
+The yield is the length of the string, PCRE_ERROR_NOMEMORY if the buffer was
+too small, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/src/doc/pcre_dfa_exec.3 b/src/doc/pcre_dfa_exec.3
new file mode 100644
index 0000000..274b97c
--- /dev/null
+++ b/src/doc/pcre_dfa_exec.3
@@ -0,0 +1,88 @@
+.TH PCRE_DFA_EXEC 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B int pcre_dfa_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
+.ti +5n
+.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
+.ti +5n
+.B int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP,
+.ti +5n
+.B int *\fIworkspace\fP, int \fIwscount\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This function matches a compiled regular expression against a given subject
+string, using an alternative matching algorithm that scans the subject string
+just once (\fInot\fP Perl-compatible). Note that the main, Perl-compatible,
+matching function is \fBpcre_exec()\fP. The arguments for this function are:
+.sp
+ \fIcode\fP Points to the compiled pattern
+ \fIextra\fP Points to an associated \fBpcre_extra\fP structure,
+ or is NULL
+ \fIsubject\fP Points to the subject string
+ \fIlength\fP Length of the subject string, in bytes
+ \fIstartoffset\fP Offset in bytes in the subject at which to
+ start matching
+ \fIoptions\fP Option bits
+ \fIovector\fP Points to a vector of ints for result offsets
+ \fIovecsize\fP Number of elements in the vector
+ \fIworkspace\fP Points to a vector of ints used as working space
+ \fIwscount\fP Number of elements in the vector
+.sp
+The options are:
+.sp
+ PCRE_ANCHORED Match only at the first position
+ PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF
+ PCRE_BSR_UNICODE \eR matches all Unicode line endings
+ PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
+ PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
+ PCRE_NEWLINE_CR Set CR as the newline sequence
+ PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
+ PCRE_NEWLINE_LF Set LF as the newline sequence
+ PCRE_NOTBOL Subject is not the beginning of a line
+ PCRE_NOTEOL Subject is not the end of a line
+ PCRE_NOTEMPTY An empty string is not a valid match
+ PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
+ validity (only relevant if PCRE_UTF8
+ was set at compile time)
+ PCRE_PARTIAL Return PCRE_ERROR_PARTIAL for a partial match
+ PCRE_DFA_SHORTEST Return only the shortest match
+ PCRE_DFA_RESTART This is a restart after a partial match
+.sp
+There are restrictions on what may appear in a pattern when using this matching
+function. Details are given in the
+.\" HREF
+\fBpcrematching\fP
+.\"
+documentation.
+.P
+A \fBpcre_extra\fP structure contains the following fields:
+.sp
+ \fIflags\fP Bits indicating which fields are set
+ \fIstudy_data\fP Opaque data from \fBpcre_study()\fP
+ \fImatch_limit\fP Limit on internal resource use
+ \fImatch_limit_recursion\fP Limit on internal recursion depth
+ \fIcallout_data\fP Opaque data passed back to callouts
+ \fItables\fP Points to character tables or is NULL
+.sp
+The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
+PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
+PCRE_EXTRA_TABLES. For this matching function, the \fImatch_limit\fP and
+\fImatch_limit_recursion\fP fields are not used, and must not be set.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/src/doc/pcre_exec.3 b/src/doc/pcre_exec.3
new file mode 100644
index 0000000..834a1f2
--- /dev/null
+++ b/src/doc/pcre_exec.3
@@ -0,0 +1,80 @@
+.TH PCRE_EXEC 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B int pcre_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
+.ti +5n
+.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
+.ti +5n
+.B int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This function matches a compiled regular expression against a given subject
+string, using a matching algorithm that is similar to Perl's. It returns
+offsets to captured substrings. Its arguments are:
+.sp
+ \fIcode\fP Points to the compiled pattern
+ \fIextra\fP Points to an associated \fBpcre_extra\fP structure,
+ or is NULL
+ \fIsubject\fP Points to the subject string
+ \fIlength\fP Length of the subject string, in bytes
+ \fIstartoffset\fP Offset in bytes in the subject at which to
+ start matching
+ \fIoptions\fP Option bits
+ \fIovector\fP Points to a vector of ints for result offsets
+ \fIovecsize\fP Number of elements in the vector (a multiple of 3)
+.sp
+The options are:
+.sp
+ PCRE_ANCHORED Match only at the first position
+ PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF
+ PCRE_BSR_UNICODE \eR matches all Unicode line endings
+ PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
+ PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
+ PCRE_NEWLINE_CR Set CR as the newline sequence
+ PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
+ PCRE_NEWLINE_LF Set LF as the newline sequence
+ PCRE_NOTBOL Subject is not the beginning of a line
+ PCRE_NOTEOL Subject is not the end of a line
+ PCRE_NOTEMPTY An empty string is not a valid match
+ PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
+ validity (only relevant if PCRE_UTF8
+ was set at compile time)
+ PCRE_PARTIAL Return PCRE_ERROR_PARTIAL for a partial match
+.sp
+There are restrictions on what may appear in a pattern when partial matching is
+requested. For details, see the
+.\" HREF
+\fBpcrepartial\fP
+.\"
+page.
+.P
+A \fBpcre_extra\fP structure contains the following fields:
+.sp
+ \fIflags\fP Bits indicating which fields are set
+ \fIstudy_data\fP Opaque data from \fBpcre_study()\fP
+ \fImatch_limit\fP Limit on internal resource use
+ \fImatch_limit_recursion\fP Limit on internal recursion depth
+ \fIcallout_data\fP Opaque data passed back to callouts
+ \fItables\fP Points to character tables or is NULL
+.sp
+The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
+PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
+PCRE_EXTRA_TABLES.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/src/doc/pcre_free_substring.3 b/src/doc/pcre_free_substring.3
new file mode 100644
index 0000000..ed3999a
--- /dev/null
+++ b/src/doc/pcre_free_substring.3
@@ -0,0 +1,27 @@
+.TH PCRE_FREE_SUBSTRING 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B void pcre_free_substring(const char *\fIstringptr\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This is a convenience function for freeing the store obtained by a previous
+call to \fBpcre_get_substring()\fP or \fBpcre_get_named_substring()\fP. Its
+only argument is a pointer to the string.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/src/doc/pcre_free_substring_list.3 b/src/doc/pcre_free_substring_list.3
new file mode 100644
index 0000000..89b7078
--- /dev/null
+++ b/src/doc/pcre_free_substring_list.3
@@ -0,0 +1,27 @@
+.TH PCRE_FREE_SUBSTRING_LIST 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B void pcre_free_substring_list(const char **\fIstringptr\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This is a convenience function for freeing the store obtained by a previous
+call to \fBpcre_get_substring_list()\fP. Its only argument is a pointer to the
+list of string pointers.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/src/doc/pcre_fullinfo.3 b/src/doc/pcre_fullinfo.3
new file mode 100644
index 0000000..3cf8cbd
--- /dev/null
+++ b/src/doc/pcre_fullinfo.3
@@ -0,0 +1,59 @@
+.TH PCRE_FULLINFO 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B int pcre_fullinfo(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
+.ti +5n
+.B int \fIwhat\fP, void *\fIwhere\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This function returns information about a compiled pattern. Its arguments are:
+.sp
+ \fIcode\fP Compiled regular expression
+ \fIextra\fP Result of \fBpcre_study()\fP or NULL
+ \fIwhat\fP What information is required
+ \fIwhere\fP Where to put the information
+.sp
+The following information is available:
+.sp
+ PCRE_INFO_BACKREFMAX Number of highest back reference
+ PCRE_INFO_CAPTURECOUNT Number of capturing subpatterns
+ PCRE_INFO_DEFAULT_TABLES Pointer to default tables
+ PCRE_INFO_FIRSTBYTE Fixed first byte for a match, or
+ -1 for start of string
+ or after newline, or
+ -2 otherwise
+ PCRE_INFO_FIRSTTABLE Table of first bytes (after studying)
+ PCRE_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
+ PCRE_INFO_LASTLITERAL Literal last byte required
+ PCRE_INFO_NAMECOUNT Number of named subpatterns
+ PCRE_INFO_NAMEENTRYSIZE Size of name table entry
+ PCRE_INFO_NAMETABLE Pointer to name table
+ PCRE_INFO_OKPARTIAL Return 1 if partial matching can be tried
+ PCRE_INFO_OPTIONS Option bits used for compilation
+ PCRE_INFO_SIZE Size of compiled pattern
+ PCRE_INFO_STUDYSIZE Size of study data
+.sp
+The yield of the function is zero on success or:
+.sp
+ PCRE_ERROR_NULL the argument \fIcode\fP was NULL
+ the argument \fIwhere\fP was NULL
+ PCRE_ERROR_BADMAGIC the "magic number" was not found
+ PCRE_ERROR_BADOPTION the value of \fIwhat\fP was invalid
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/src/doc/pcre_get_named_substring.3 b/src/doc/pcre_get_named_substring.3
new file mode 100644
index 0000000..22d0c1b
--- /dev/null
+++ b/src/doc/pcre_get_named_substring.3
@@ -0,0 +1,45 @@
+.TH PCRE_GET_NAMED_SUBSTRING 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B int pcre_get_named_substring(const pcre *\fIcode\fP,
+.ti +5n
+.B const char *\fIsubject\fP, int *\fIovector\fP,
+.ti +5n
+.B int \fIstringcount\fP, const char *\fIstringname\fP,
+.ti +5n
+.B const char **\fIstringptr\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This is a convenience function for extracting a captured substring by name. The
+arguments are:
+.sp
+ \fIcode\fP Compiled pattern
+ \fIsubject\fP Subject that has been successfully matched
+ \fIovector\fP Offset vector that \fBpcre_exec()\fP used
+ \fIstringcount\fP Value returned by \fBpcre_exec()\fP
+ \fIstringname\fP Name of the required substring
+ \fIstringptr\fP Where to put the string pointer
+.sp
+The memory in which the substring is placed is obtained by calling
+\fBpcre_malloc()\fP. The convenience function \fBpcre_free_substring()\fP can
+be used to free it when it is no longer needed. The yield of the function is
+the length of the extracted substring, PCRE_ERROR_NOMEMORY if sufficient memory
+could not be obtained, or PCRE_ERROR_NOSUBSTRING if the string name is invalid.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/src/doc/pcre_get_stringnumber.3 b/src/doc/pcre_get_stringnumber.3
new file mode 100644
index 0000000..f6017ff
--- /dev/null
+++ b/src/doc/pcre_get_stringnumber.3
@@ -0,0 +1,37 @@
+.TH PCRE_GET_STRINGNUMBER 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B int pcre_get_stringnumber(const pcre *\fIcode\fP,
+.ti +5n
+.B const char *\fIname\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This convenience function finds the number of a named substring capturing
+parenthesis in a compiled pattern. Its arguments are:
+.sp
+ \fIcode\fP Compiled regular expression
+ \fIname\fP Name whose number is required
+.sp
+The yield of the function is the number of the parenthesis if the name is
+found, or PCRE_ERROR_NOSUBSTRING otherwise. When duplicate names are allowed
+(PCRE_DUPNAMES is set), it is not defined which of the numbers is returned by
+\fBpcre_get_stringnumber()\fP. You can obtain the complete list by calling
+\fBpcre_get_stringtable_entries()\fP.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/src/doc/pcre_get_stringtable_entries.3 b/src/doc/pcre_get_stringtable_entries.3
new file mode 100644
index 0000000..979c4be
--- /dev/null
+++ b/src/doc/pcre_get_stringtable_entries.3
@@ -0,0 +1,40 @@
+.TH PCRE_GET_STRINGTABLE_ENTRIES 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B int pcre_get_stringtable_entries(const pcre *\fIcode\fP,
+.ti +5n
+.B const char *\fIname\fP, char **\fIfirst\fP, char **\fIlast\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This convenience function finds, for a compiled pattern, the first and last
+entries for a given name in the table that translates capturing parenthesis
+names into numbers. When names are required to be unique (PCRE_DUPNAMES is
+\fInot\fP set), it is usually easier to use \fBpcre_get_stringnumber()\fP
+instead.
+.sp
+ \fIcode\fP Compiled regular expression
+ \fIname\fP Name whose entries are required
+ \fIfirst\fP Where to return a pointer to the first entry
+ \fIlast\fP Where to return a pointer to the last entry
+.sp
+The yield of the function is the length of each entry, or
+PCRE_ERROR_NOSUBSTRING if none are found.
+.P
+There is a complete description of the PCRE native API, including the format of
+the table entries, in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page, and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/src/doc/pcre_get_substring.3 b/src/doc/pcre_get_substring.3
new file mode 100644
index 0000000..8fb11ec
--- /dev/null
+++ b/src/doc/pcre_get_substring.3
@@ -0,0 +1,42 @@
+.TH PCRE_GET_SUBSTRING 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B int pcre_get_substring(const char *\fIsubject\fP, int *\fIovector\fP,
+.ti +5n
+.B int \fIstringcount\fP, int \fIstringnumber\fP,
+.ti +5n
+.B const char **\fIstringptr\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This is a convenience function for extracting a captured substring. The
+arguments are:
+.sp
+ \fIsubject\fP Subject that has been successfully matched
+ \fIovector\fP Offset vector that \fBpcre_exec()\fP used
+ \fIstringcount\fP Value returned by \fBpcre_exec()\fP
+ \fIstringnumber\fP Number of the required substring
+ \fIstringptr\fP Where to put the string pointer
+.sp
+The memory in which the substring is placed is obtained by calling
+\fBpcre_malloc()\fP. The convenience function \fBpcre_free_substring()\fP can
+be used to free it when it is no longer needed. The yield of the function is
+the length of the substring, PCRE_ERROR_NOMEMORY if sufficient memory could not
+be obtained, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/src/doc/pcre_get_substring_list.3 b/src/doc/pcre_get_substring_list.3
new file mode 100644
index 0000000..647ae39
--- /dev/null
+++ b/src/doc/pcre_get_substring_list.3
@@ -0,0 +1,41 @@
+.TH PCRE_GET_SUBSTRING_LIST 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B int pcre_get_substring_list(const char *\fIsubject\fP,
+.ti +5n
+.B int *\fIovector\fP, int \fIstringcount\fP, "const char ***\fIlistptr\fP);"
+.
+.SH DESCRIPTION
+.rs
+.sp
+This is a convenience function for extracting a list of all the captured
+substrings. The arguments are:
+.sp
+ \fIsubject\fP Subject that has been successfully matched
+ \fIovector\fP Offset vector that \fBpcre_exec\fP used
+ \fIstringcount\fP Value returned by \fBpcre_exec\fP
+ \fIlistptr\fP Where to put a pointer to the list
+.sp
+The memory in which the substrings and the list are placed is obtained by
+calling \fBpcre_malloc()\fP. The convenience function
+\fBpcre_free_substring_list()\fP can be used to free it when it is no longer
+needed. A pointer to a list of pointers is put in the variable whose address is
+in \fIlistptr\fP. The list is terminated by a NULL pointer. The yield of the
+function is zero on success or PCRE_ERROR_NOMEMORY if sufficient memory could
+not be obtained.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/src/doc/pcre_info.3 b/src/doc/pcre_info.3
new file mode 100644
index 0000000..8c78121
--- /dev/null
+++ b/src/doc/pcre_info.3
@@ -0,0 +1,26 @@
+.TH PCRE_INFO 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B int pcre_info(const pcre *\fIcode\fP, int *\fIoptptr\fP, int
+.B *\fIfirstcharptr\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This function is obsolete. You should be using \fBpcre_fullinfo()\fP instead.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/src/doc/pcre_maketables.3 b/src/doc/pcre_maketables.3
new file mode 100644
index 0000000..8d3978c
--- /dev/null
+++ b/src/doc/pcre_maketables.3
@@ -0,0 +1,29 @@
+.TH PCRE_MAKETABLES 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B const unsigned char *pcre_maketables(void);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This function builds a set of character tables for character values less than
+256. These can be passed to \fBpcre_compile()\fP to override PCRE's internal,
+built-in tables (which were made by \fBpcre_maketables()\fP when PCRE was
+compiled). You might want to do this if you are using a non-standard locale.
+The function yields a pointer to the tables.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/src/doc/pcre_refcount.3 b/src/doc/pcre_refcount.3
new file mode 100644
index 0000000..6ab9f4f
--- /dev/null
+++ b/src/doc/pcre_refcount.3
@@ -0,0 +1,32 @@
+.TH PCRE_REFCOUNT 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B int pcre_refcount(pcre *\fIcode\fP, int \fIadjust\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This function is used to maintain a reference count inside a data block that
+contains a compiled pattern. Its arguments are:
+.sp
+ \fIcode\fP Compiled regular expression
+ \fIadjust\fP Adjustment to reference value
+.sp
+The yield of the function is the adjusted reference value, which is constrained
+to lie between 0 and 65535.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/src/doc/pcre_study.3 b/src/doc/pcre_study.3
new file mode 100644
index 0000000..53f5bc1
--- /dev/null
+++ b/src/doc/pcre_study.3
@@ -0,0 +1,42 @@
+.TH PCRE_STUDY 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B pcre_extra *pcre_study(const pcre *\fIcode\fP, int \fIoptions\fP,
+.ti +5n
+.B const char **\fIerrptr\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This function studies a compiled pattern, to see if additional information can
+be extracted that might speed up matching. Its arguments are:
+.sp
+ \fIcode\fP A compiled regular expression
+ \fIoptions\fP Options for \fBpcre_study()\fP
+ \fIerrptr\fP Where to put an error message
+.sp
+If the function succeeds, it returns a value that can be passed to
+\fBpcre_exec()\fP via its \fIextra\fP argument.
+.P
+If the function returns NULL, either it could not find any additional
+information, or there was an error. You can tell the difference by looking at
+the error value. It is NULL in the first case.
+.P
+There are currently no options defined; the value of the second argument should
+always be zero.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/src/doc/pcre_version.3 b/src/doc/pcre_version.3
new file mode 100644
index 0000000..f1563fa
--- /dev/null
+++ b/src/doc/pcre_version.3
@@ -0,0 +1,26 @@
+.TH PCRE_VERSION 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B char *pcre_version(void);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This function returns a character string that gives the version number of the
+PCRE library and the date of its release.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/src/doc/pcreapi.3 b/src/doc/pcreapi.3
new file mode 100644
index 0000000..f68d0ed
--- /dev/null
+++ b/src/doc/pcreapi.3
@@ -0,0 +1,1980 @@
+.TH PCREAPI 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH "PCRE NATIVE API"
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B pcre *pcre_compile(const char *\fIpattern\fP, int \fIoptions\fP,
+.ti +5n
+.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
+.ti +5n
+.B const unsigned char *\fItableptr\fP);
+.PP
+.B pcre *pcre_compile2(const char *\fIpattern\fP, int \fIoptions\fP,
+.ti +5n
+.B int *\fIerrorcodeptr\fP,
+.ti +5n
+.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
+.ti +5n
+.B const unsigned char *\fItableptr\fP);
+.PP
+.B pcre_extra *pcre_study(const pcre *\fIcode\fP, int \fIoptions\fP,
+.ti +5n
+.B const char **\fIerrptr\fP);
+.PP
+.B int pcre_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
+.ti +5n
+.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
+.ti +5n
+.B int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP);
+.PP
+.B int pcre_dfa_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
+.ti +5n
+.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
+.ti +5n
+.B int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP,
+.ti +5n
+.B int *\fIworkspace\fP, int \fIwscount\fP);
+.PP
+.B int pcre_copy_named_substring(const pcre *\fIcode\fP,
+.ti +5n
+.B const char *\fIsubject\fP, int *\fIovector\fP,
+.ti +5n
+.B int \fIstringcount\fP, const char *\fIstringname\fP,
+.ti +5n
+.B char *\fIbuffer\fP, int \fIbuffersize\fP);
+.PP
+.B int pcre_copy_substring(const char *\fIsubject\fP, int *\fIovector\fP,
+.ti +5n
+.B int \fIstringcount\fP, int \fIstringnumber\fP, char *\fIbuffer\fP,
+.ti +5n
+.B int \fIbuffersize\fP);
+.PP
+.B int pcre_get_named_substring(const pcre *\fIcode\fP,
+.ti +5n
+.B const char *\fIsubject\fP, int *\fIovector\fP,
+.ti +5n
+.B int \fIstringcount\fP, const char *\fIstringname\fP,
+.ti +5n
+.B const char **\fIstringptr\fP);
+.PP
+.B int pcre_get_stringnumber(const pcre *\fIcode\fP,
+.ti +5n
+.B const char *\fIname\fP);
+.PP
+.B int pcre_get_stringtable_entries(const pcre *\fIcode\fP,
+.ti +5n
+.B const char *\fIname\fP, char **\fIfirst\fP, char **\fIlast\fP);
+.PP
+.B int pcre_get_substring(const char *\fIsubject\fP, int *\fIovector\fP,
+.ti +5n
+.B int \fIstringcount\fP, int \fIstringnumber\fP,
+.ti +5n
+.B const char **\fIstringptr\fP);
+.PP
+.B int pcre_get_substring_list(const char *\fIsubject\fP,
+.ti +5n
+.B int *\fIovector\fP, int \fIstringcount\fP, "const char ***\fIlistptr\fP);"
+.PP
+.B void pcre_free_substring(const char *\fIstringptr\fP);
+.PP
+.B void pcre_free_substring_list(const char **\fIstringptr\fP);
+.PP
+.B const unsigned char *pcre_maketables(void);
+.PP
+.B int pcre_fullinfo(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
+.ti +5n
+.B int \fIwhat\fP, void *\fIwhere\fP);
+.PP
+.B int pcre_info(const pcre *\fIcode\fP, int *\fIoptptr\fP, int
+.B *\fIfirstcharptr\fP);
+.PP
+.B int pcre_refcount(pcre *\fIcode\fP, int \fIadjust\fP);
+.PP
+.B int pcre_config(int \fIwhat\fP, void *\fIwhere\fP);
+.PP
+.B char *pcre_version(void);
+.PP
+.B void *(*pcre_malloc)(size_t);
+.PP
+.B void (*pcre_free)(void *);
+.PP
+.B void *(*pcre_stack_malloc)(size_t);
+.PP
+.B void (*pcre_stack_free)(void *);
+.PP
+.B int (*pcre_callout)(pcre_callout_block *);
+.
+.
+.SH "PCRE API OVERVIEW"
+.rs
+.sp
+PCRE has its own native API, which is described in this document. There are
+also some wrapper functions that correspond to the POSIX regular expression
+API. These are described in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+documentation. Both of these APIs define a set of C function calls. A C++
+wrapper is distributed with PCRE. It is documented in the
+.\" HREF
+\fBpcrecpp\fP
+.\"
+page.
+.P
+The native API C function prototypes are defined in the header file
+\fBpcre.h\fP, and on Unix systems the library itself is called \fBlibpcre\fP.
+It can normally be accessed by adding \fB-lpcre\fP to the command for linking
+an application that uses PCRE. The header file defines the macros PCRE_MAJOR
+and PCRE_MINOR to contain the major and minor release numbers for the library.
+Applications can use these to include support for different releases of PCRE.
+.P
+The functions \fBpcre_compile()\fP, \fBpcre_compile2()\fP, \fBpcre_study()\fP,
+and \fBpcre_exec()\fP are used for compiling and matching regular expressions
+in a Perl-compatible manner. A sample program that demonstrates the simplest
+way of using them is provided in the file called \fIpcredemo.c\fP in the source
+distribution. The
+.\" HREF
+\fBpcresample\fP
+.\"
+documentation describes how to compile and run it.
+.P
+A second matching function, \fBpcre_dfa_exec()\fP, which is not
+Perl-compatible, is also provided. This uses a different algorithm for the
+matching. The alternative algorithm finds all possible matches (at a given
+point in the subject), and scans the subject just once. However, this algorithm
+does not return captured substrings. A description of the two matching
+algorithms and their advantages and disadvantages is given in the
+.\" HREF
+\fBpcrematching\fP
+.\"
+documentation.
+.P
+In addition to the main compiling and matching functions, there are convenience
+functions for extracting captured substrings from a subject string that is
+matched by \fBpcre_exec()\fP. They are:
+.sp
+ \fBpcre_copy_substring()\fP
+ \fBpcre_copy_named_substring()\fP
+ \fBpcre_get_substring()\fP
+ \fBpcre_get_named_substring()\fP
+ \fBpcre_get_substring_list()\fP
+ \fBpcre_get_stringnumber()\fP
+ \fBpcre_get_stringtable_entries()\fP
+.sp
+\fBpcre_free_substring()\fP and \fBpcre_free_substring_list()\fP are also
+provided, to free the memory used for extracted strings.
+.P
+The function \fBpcre_maketables()\fP is used to build a set of character tables
+in the current locale for passing to \fBpcre_compile()\fP, \fBpcre_exec()\fP,
+or \fBpcre_dfa_exec()\fP. This is an optional facility that is provided for
+specialist use. Most commonly, no special tables are passed, in which case
+internal tables that are generated when PCRE is built are used.
+.P
+The function \fBpcre_fullinfo()\fP is used to find out information about a
+compiled pattern; \fBpcre_info()\fP is an obsolete version that returns only
+some of the available information, but is retained for backwards compatibility.
+The function \fBpcre_version()\fP returns a pointer to a string containing the
+version of PCRE and its date of release.
+.P
+The function \fBpcre_refcount()\fP maintains a reference count in a data block
+containing a compiled pattern. This is provided for the benefit of
+object-oriented applications.
+.P
+The global variables \fBpcre_malloc\fP and \fBpcre_free\fP initially contain
+the entry points of the standard \fBmalloc()\fP and \fBfree()\fP functions,
+respectively. PCRE calls the memory management functions via these variables,
+so a calling program can replace them if it wishes to intercept the calls. This
+should be done before calling any PCRE functions.
+.P
+The global variables \fBpcre_stack_malloc\fP and \fBpcre_stack_free\fP are also
+indirections to memory management functions. These special functions are used
+only when PCRE is compiled to use the heap for remembering data, instead of
+recursive function calls, when running the \fBpcre_exec()\fP function. See the
+.\" HREF
+\fBpcrebuild\fP
+.\"
+documentation for details of how to do this. It is a non-standard way of
+building PCRE, for use in environments that have limited stacks. Because of the
+greater use of memory management, it runs more slowly. Separate functions are
+provided so that special-purpose external code can be used for this case. When
+used, these functions are always called in a stack-like manner (last obtained,
+first freed), and always for memory blocks of the same size. There is a
+discussion about PCRE's stack usage in the
+.\" HREF
+\fBpcrestack\fP
+.\"
+documentation.
+.P
+The global variable \fBpcre_callout\fP initially contains NULL. It can be set
+by the caller to a "callout" function, which PCRE will then call at specified
+points during a matching operation. Details are given in the
+.\" HREF
+\fBpcrecallout\fP
+.\"
+documentation.
+.
+.
+.\" HTML
+.SH NEWLINES
+.rs
+.sp
+PCRE supports five different conventions for indicating line breaks in
+strings: a single CR (carriage return) character, a single LF (linefeed)
+character, the two-character sequence CRLF, any of the three preceding, or any
+Unicode newline sequence. The Unicode newline sequences are the three just
+mentioned, plus the single characters VT (vertical tab, U+000B), FF (formfeed,
+U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
+(paragraph separator, U+2029).
+.P
+Each of the first three conventions is used by at least one operating system as
+its standard newline sequence. When PCRE is built, a default can be specified.
+If none is specified, the default is LF, which is the Unix standard. When PCRE is run, the
+default can be overridden, either when a pattern is compiled, or when it is
+matched.
+.P
+At compile time, the newline convention can be specified by the \fIoptions\fP
+argument of \fBpcre_compile()\fP, or it can be specified by special text at the
+start of the pattern itself; this overrides any other settings. See the
+.\" HREF
+\fBpcrepattern\fP
+.\"
+page for details of the special character sequences.
+.P
+In the PCRE documentation the word "newline" is used to mean "the character or
+pair of characters that indicate a line break". The choice of newline
+convention affects the handling of the dot, circumflex, and dollar
+metacharacters, the handling of #-comments in /x mode, and, when CRLF is a
+recognized line ending sequence, the match position advancement for a
+non-anchored pattern. There is more detail about this in the
+.\" HTML
+.\"
+section on \fBpcre_exec()\fP options
+.\"
+below.
+.P
+The choice of newline convention does not affect the interpretation of
+the \en or \er escape sequences, nor does it affect what \eR matches, which is
+controlled in a similar way, but by separate options.
+.
+.
+.SH MULTITHREADING
+.rs
+.sp
+The PCRE functions can be used in multi-threading applications, with the
+proviso that the memory management functions pointed to by \fBpcre_malloc\fP,
+\fBpcre_free\fP, \fBpcre_stack_malloc\fP, and \fBpcre_stack_free\fP, and the
+callout function pointed to by \fBpcre_callout\fP, are shared by all threads.
+.P
+The compiled form of a regular expression is not altered during matching, so
+the same compiled pattern can safely be used by several threads at once.
+.
+.
+.SH "SAVING PRECOMPILED PATTERNS FOR LATER USE"
+.rs
+.sp
+The compiled form of a regular expression can be saved and re-used at a later
+time, possibly by a different program, and even on a host other than the one on
+which it was compiled. Details are given in the
+.\" HREF
+\fBpcreprecompile\fP
+.\"
+documentation. However, compiling a regular expression with one version of PCRE
+for use with a different version is not guaranteed to work and may cause
+crashes.
+.
+.
+.SH "CHECKING BUILD-TIME OPTIONS"
+.rs
+.sp
+.B int pcre_config(int \fIwhat\fP, void *\fIwhere\fP);
+.PP
+The function \fBpcre_config()\fP makes it possible for a PCRE client to
+discover which optional features have been compiled into the PCRE library. The
+.\" HREF
+\fBpcrebuild\fP
+.\"
+documentation has more details about these optional features.
+.P
+The first argument for \fBpcre_config()\fP is an integer, specifying which
+information is required; the second argument is a pointer to a variable into
+which the information is placed. The following information is available:
+.sp
+ PCRE_CONFIG_UTF8
+.sp
+The output is an integer that is set to one if UTF-8 support is available;
+otherwise it is set to zero.
+.sp
+ PCRE_CONFIG_UNICODE_PROPERTIES
+.sp
+The output is an integer that is set to one if support for Unicode character
+properties is available; otherwise it is set to zero.
+.sp
+ PCRE_CONFIG_NEWLINE
+.sp
+The output is an integer whose value specifies the default character sequence
+that is recognized as meaning "newline". The five values that are supported
+are: 10 for LF, 13 for CR, 3338 for CRLF, -2 for ANYCRLF, and -1 for ANY. The
+default should normally be the standard sequence for your operating system.
+.sp
+ PCRE_CONFIG_BSR
+.sp
+The output is an integer whose value indicates what character sequences the \eR
+escape sequence matches by default. A value of 0 means that \eR matches any
+Unicode line ending sequence; a value of 1 means that \eR matches only CR, LF,
+or CRLF. The default can be overridden when a pattern is compiled or matched.
+.sp
+ PCRE_CONFIG_LINK_SIZE
+.sp
+The output is an integer that contains the number of bytes used for internal
+linkage in compiled regular expressions. The value is 2, 3, or 4. Larger values
+allow larger regular expressions to be compiled, at the expense of slower
+matching. The default value of 2 is sufficient for all but the most massive
+patterns, since it allows the compiled pattern to be up to 64K in size.
+.sp
+ PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
+.sp
+The output is an integer that contains the threshold above which the POSIX
+interface uses \fBmalloc()\fP for output vectors. Further details are given in
+the
+.\" HREF
+\fBpcreposix\fP
+.\"
+documentation.
+.sp
+ PCRE_CONFIG_MATCH_LIMIT
+.sp
+The output is an integer that gives the default limit for the number of
+internal matching function calls in a \fBpcre_exec()\fP execution. Further
+details are given with \fBpcre_exec()\fP below.
+.sp
+ PCRE_CONFIG_MATCH_LIMIT_RECURSION
+.sp
+The output is an integer that gives the default limit for the depth of
+recursion when calling the internal matching function in a \fBpcre_exec()\fP
+execution. Further details are given with \fBpcre_exec()\fP below.
+.sp
+ PCRE_CONFIG_STACKRECURSE
+.sp
+The output is an integer that is set to one if internal recursion when running
+\fBpcre_exec()\fP is implemented by recursive function calls that use the stack
+to remember their state. This is the usual way that PCRE is compiled. The
+output is zero if PCRE was compiled to use blocks of data on the heap instead
+of recursive function calls. In this case, \fBpcre_stack_malloc\fP and
+\fBpcre_stack_free\fP are called to manage memory blocks on the heap, thus
+avoiding the use of the stack.
+.
+.
+.SH "COMPILING A PATTERN"
+.rs
+.sp
+.B pcre *pcre_compile(const char *\fIpattern\fP, int \fIoptions\fP,
+.ti +5n
+.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
+.ti +5n
+.B const unsigned char *\fItableptr\fP);
+.sp
+.B pcre *pcre_compile2(const char *\fIpattern\fP, int \fIoptions\fP,
+.ti +5n
+.B int *\fIerrorcodeptr\fP,
+.ti +5n
+.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
+.ti +5n
+.B const unsigned char *\fItableptr\fP);
+.P
+Either of the functions \fBpcre_compile()\fP or \fBpcre_compile2()\fP can be
+called to compile a pattern into an internal form. The only difference between
+the two interfaces is that \fBpcre_compile2()\fP has an additional argument,
+\fIerrorcodeptr\fP, via which a numerical error code can be returned.
+.P
+The pattern is a C string terminated by a binary zero, and is passed in the
+\fIpattern\fP argument. A pointer to a single block of memory that is obtained
+via \fBpcre_malloc\fP is returned. This contains the compiled code and related
+data. The \fBpcre\fP type is defined for the returned block; this is a typedef
+for a structure whose contents are not externally defined. It is up to the
+caller to free the memory (via \fBpcre_free\fP) when it is no longer required.
+.P
+Although the compiled code of a PCRE regex is relocatable, that is, it does not
+depend on memory location, the complete \fBpcre\fP data block is not
+fully relocatable, because it may contain a copy of the \fItableptr\fP
+argument, which is an address (see below).
+.P
+The \fIoptions\fP argument contains various bit settings that affect the
+compilation. It should be zero if no options are required. The available
+options are described below. Some of them, in particular, those that are
+compatible with Perl, can also be set and unset from within the pattern (see
+the detailed description in the
+.\" HREF
+\fBpcrepattern\fP
+.\"
+documentation). For these options, the contents of the \fIoptions\fP argument
+specify their initial settings at the start of compilation and execution. The
+PCRE_ANCHORED and PCRE_NEWLINE_\fIxxx\fP options can be set at the time of
+matching as well as at compile time.
+.P
+If \fIerrptr\fP is NULL, \fBpcre_compile()\fP returns NULL immediately.
+Otherwise, if compilation of a pattern fails, \fBpcre_compile()\fP returns
+NULL, and sets the variable pointed to by \fIerrptr\fP to point to a textual
+error message. This is a static string that is part of the library. You must
+not try to free it. The offset from the start of the pattern to the character
+where the error was discovered is placed in the variable pointed to by
+\fIerroffset\fP, which must not be NULL. If it is, an immediate error is given.
+.P
+If \fBpcre_compile2()\fP is used instead of \fBpcre_compile()\fP, and the
+\fIerrorcodeptr\fP argument is not NULL, a non-zero error code number is
+returned via this argument in the event of an error. This is in addition to the
+textual error message. Error codes and messages are listed below.
+.P
+If the final argument, \fItableptr\fP, is NULL, PCRE uses a default set of
+character tables that are built when PCRE is compiled, using the default C
+locale. Otherwise, \fItableptr\fP must be an address that is the result of a
+call to \fBpcre_maketables()\fP. This value is stored with the compiled
+pattern, and used again by \fBpcre_exec()\fP, unless another table pointer is
+passed to it. For more discussion, see the section on locale support below.
+.P
+This code fragment shows a typical straightforward call to \fBpcre_compile()\fP:
+.sp
+ pcre *re;
+ const char *error;
+ int erroffset;
+ re = pcre_compile(
+ "^A.*Z", /* the pattern */
+ 0, /* default options */
+ &error, /* for error message */
+ &erroffset, /* for error offset */
+ NULL); /* use default character tables */
+.sp
+The following names for option bits are defined in the \fBpcre.h\fP header
+file:
+.sp
+ PCRE_ANCHORED
+.sp
+If this bit is set, the pattern is forced to be "anchored", that is, it is
+constrained to match only at the first matching point in the string that is
+being searched (the "subject string"). This effect can also be achieved by
+appropriate constructs in the pattern itself, which is the only way to do it in
+Perl.
+.sp
+ PCRE_AUTO_CALLOUT
+.sp
+If this bit is set, \fBpcre_compile()\fP automatically inserts callout items,
+all with number 255, before each pattern item. For discussion of the callout
+facility, see the
+.\" HREF
+\fBpcrecallout\fP
+.\"
+documentation.
+.sp
+ PCRE_BSR_ANYCRLF
+ PCRE_BSR_UNICODE
+.sp
+These options (which are mutually exclusive) control what the \eR escape
+sequence matches. The choice is either to match only CR, LF, or CRLF, or to
+match any Unicode newline sequence. The default is specified when PCRE is
+built. It can be overridden from within the pattern, or by setting an option
+when a compiled pattern is matched.
+.sp
+ PCRE_CASELESS
+.sp
+If this bit is set, letters in the pattern match both upper and lower case
+letters. It is equivalent to Perl's /i option, and it can be changed within a
+pattern by a (?i) option setting. In UTF-8 mode, PCRE always understands the
+concept of case for characters whose values are less than 128, so caseless
+matching is always possible. For characters with higher values, the concept of
+case is supported if PCRE is compiled with Unicode property support, but not
+otherwise. If you want to use caseless matching for characters 128 and above,
+you must ensure that PCRE is compiled with Unicode property support as well as
+with UTF-8 support.
+.sp
+ PCRE_DOLLAR_ENDONLY
+.sp
+If this bit is set, a dollar metacharacter in the pattern matches only at the
+end of the subject string. Without this option, a dollar also matches
+immediately before a newline at the end of the string (but not before any other
+newlines). The PCRE_DOLLAR_ENDONLY option is ignored if PCRE_MULTILINE is set.
+There is no equivalent to this option in Perl, and no way to set it within a
+pattern.
+.sp
+ PCRE_DOTALL
+.sp
+If this bit is set, a dot metacharacter in the pattern matches all characters,
+including those that indicate newline. Without it, a dot does not match when
+the current position is at a newline. This option is equivalent to Perl's /s
+option, and it can be changed within a pattern by a (?s) option setting. A
+negative class such as [^a] always matches newline characters, independent of
+the setting of this option.
+.sp
+ PCRE_DUPNAMES
+.sp
+If this bit is set, names used to identify capturing subpatterns need not be
+unique. This can be helpful for certain types of pattern when it is known that
+only one instance of the named subpattern can ever be matched. There are more
+details of named subpatterns below; see also the
+.\" HREF
+\fBpcrepattern\fP
+.\"
+documentation.
+.sp
+ PCRE_EXTENDED
+.sp
+If this bit is set, whitespace data characters in the pattern are totally
+ignored except when escaped or inside a character class. Whitespace does not
+include the VT character (code 11). In addition, characters between an
+unescaped # outside a character class and the next newline, inclusive, are also
+ignored. This is equivalent to Perl's /x option, and it can be changed within a
+pattern by a (?x) option setting.
+.P
+This option makes it possible to include comments inside complicated patterns.
+Note, however, that this applies only to data characters. Whitespace characters
+may never appear within special character sequences in a pattern, for example
+within the sequence (?( which introduces a conditional subpattern.
+.sp
+ PCRE_EXTRA
+.sp
+This option was invented in order to turn on additional functionality of PCRE
+that is incompatible with Perl, but it is currently of very little use. When
+set, any backslash in a pattern that is followed by a letter that has no
+special meaning causes an error, thus reserving these combinations for future
+expansion. By default, as in Perl, a backslash followed by a letter with no
+special meaning is treated as a literal. (Perl can, however, be persuaded to
+give a warning for this.) There are at present no other features controlled by
+this option. It can also be set by a (?X) option setting within a pattern.
+.sp
+ PCRE_FIRSTLINE
+.sp
+If this option is set, an unanchored pattern is required to match before or at
+the first newline in the subject string, though the matched text may continue
+over the newline.
+.sp
+ PCRE_JAVASCRIPT_COMPAT
+.sp
+If this option is set, PCRE's behaviour is changed in some ways so that it is
+compatible with JavaScript rather than Perl. The changes are as follows:
+.P
+(1) A lone closing square bracket in a pattern causes a compile-time error,
+because this is illegal in JavaScript (by default it is treated as a data
+character). Thus, the pattern AB]CD becomes illegal when this option is set.
+.P
+(2) At run time, a back reference to an unset subpattern group matches an empty
+string (by default this causes the current matching alternative to fail). A
+pattern such as (\e1)(a) succeeds when this option is set (assuming it can find
+an "a" in the subject), whereas it fails by default, for Perl compatibility.
+.sp
+ PCRE_MULTILINE
+.sp
+By default, PCRE treats the subject string as consisting of a single line of
+characters (even if it actually contains newlines). The "start of line"
+metacharacter (^) matches only at the start of the string, while the "end of
+line" metacharacter ($) matches only at the end of the string, or before a
+terminating newline (unless PCRE_DOLLAR_ENDONLY is set). This is the same as
+Perl.
+.P
+When PCRE_MULTILINE is set, the "start of line" and "end of line" constructs
+match immediately following or immediately before internal newlines in the
+subject string, respectively, as well as at the very start and end. This is
+equivalent to Perl's /m option, and it can be changed within a pattern by a
+(?m) option setting. If there are no newlines in a subject string, or no
+occurrences of ^ or $ in a pattern, setting PCRE_MULTILINE has no effect.
+.sp
+ PCRE_NEWLINE_CR
+ PCRE_NEWLINE_LF
+ PCRE_NEWLINE_CRLF
+ PCRE_NEWLINE_ANYCRLF
+ PCRE_NEWLINE_ANY
+.sp
+These options override the default newline definition that was chosen when PCRE
+was built. Setting the first or the second specifies that a newline is
+indicated by a single character (CR or LF, respectively). Setting
+PCRE_NEWLINE_CRLF specifies that a newline is indicated by the two-character
+CRLF sequence. Setting PCRE_NEWLINE_ANYCRLF specifies that any of the three
+preceding sequences should be recognized. Setting PCRE_NEWLINE_ANY specifies
+that any Unicode newline sequence should be recognized. The Unicode newline
+sequences are the three just mentioned, plus the single characters VT (vertical
+tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line
+separator, U+2028), and PS (paragraph separator, U+2029). The last two are
+recognized only in UTF-8 mode.
+.P
+The newline setting in the options word uses three bits that are treated
+as a number, giving eight possibilities. Currently only six are used (default
+plus the five values above). This means that if you set more than one newline
+option, the combination may or may not be sensible. For example,
+PCRE_NEWLINE_CR with PCRE_NEWLINE_LF is equivalent to PCRE_NEWLINE_CRLF, but
+other combinations may yield unused numbers and cause an error.
+.P
+The only time that a line break is specially recognized when compiling a
+pattern is if PCRE_EXTENDED is set, and an unescaped # outside a character
+class is encountered. This indicates a comment that lasts until after the next
+line break sequence. In other circumstances, line break sequences are treated
+as literal data, except that in PCRE_EXTENDED mode, both CR and LF are treated
+as whitespace characters and are therefore ignored.
+.P
+The newline option that is set at compile time becomes the default that is used
+for \fBpcre_exec()\fP and \fBpcre_dfa_exec()\fP, but it can be overridden.
+.sp
+ PCRE_NO_AUTO_CAPTURE
+.sp
+If this option is set, it disables the use of numbered capturing parentheses in
+the pattern. Any opening parenthesis that is not followed by ? behaves as if it
+were followed by ?: but named parentheses can still be used for capturing (and
+they acquire numbers in the usual way). There is no equivalent of this option
+in Perl.
+.sp
+ PCRE_UNGREEDY
+.sp
+This option inverts the "greediness" of the quantifiers so that they are not
+greedy by default, but become greedy if followed by "?". It is not compatible
+with Perl. It can also be set by a (?U) option setting within the pattern.
+.sp
+ PCRE_UTF8
+.sp
+This option causes PCRE to regard both the pattern and the subject as strings
+of UTF-8 characters instead of single-byte character strings. However, it is
+available only when PCRE is built to include UTF-8 support. If not, the use
+of this option provokes an error. Details of how this option changes the
+behaviour of PCRE are given in the
+.\" HTML
+.\"
+section on UTF-8 support
+.\"
+in the main
+.\" HREF
+\fBpcre\fP
+.\"
+page.
+.sp
+ PCRE_NO_UTF8_CHECK
+.sp
+When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 string is
+automatically checked. There is a discussion about the
+.\" HTML
+.\"
+validity of UTF-8 strings
+.\"
+in the main
+.\" HREF
+\fBpcre\fP
+.\"
+page. If an invalid UTF-8 sequence of bytes is found, \fBpcre_compile()\fP
+returns an error. If you already know that your pattern is valid, and you want
+to skip this check for performance reasons, you can set the PCRE_NO_UTF8_CHECK
+option. When it is set, the effect of passing an invalid UTF-8 string as a
+pattern is undefined. It may cause your program to crash. Note that this option
+can also be passed to \fBpcre_exec()\fP and \fBpcre_dfa_exec()\fP, to suppress
+the UTF-8 validity checking of subject strings.
+.
+.
+.SH "COMPILATION ERROR CODES"
+.rs
+.sp
+The following table lists the error codes that may be returned by
+\fBpcre_compile2()\fP, along with the error messages that may be returned by
+both compiling functions. As PCRE has developed, some error codes have fallen
+out of use. To avoid confusion, they have not been re-used.
+.sp
+ 0 no error
+ 1 \e at end of pattern
+ 2 \ec at end of pattern
+ 3 unrecognized character follows \e
+ 4 numbers out of order in {} quantifier
+ 5 number too big in {} quantifier
+ 6 missing terminating ] for character class
+ 7 invalid escape sequence in character class
+ 8 range out of order in character class
+ 9 nothing to repeat
+ 10 [this code is not in use]
+ 11 internal error: unexpected repeat
+ 12 unrecognized character after (? or (?-
+ 13 POSIX named classes are supported only within a class
+ 14 missing )
+ 15 reference to non-existent subpattern
+ 16 erroffset passed as NULL
+ 17 unknown option bit(s) set
+ 18 missing ) after comment
+ 19 [this code is not in use]
+ 20 regular expression is too large
+ 21 failed to get memory
+ 22 unmatched parentheses
+ 23 internal error: code overflow
+ 24 unrecognized character after (?<
+ 25 lookbehind assertion is not fixed length
+ 26 malformed number or name after (?(
+ 27 conditional group contains more than two branches
+ 28 assertion expected after (?(
+ 29 (?R or (?[+-]digits must be followed by )
+ 30 unknown POSIX class name
+ 31 POSIX collating elements are not supported
+ 32 this version of PCRE is not compiled with PCRE_UTF8 support
+ 33 [this code is not in use]
+ 34 character value in \ex{...} sequence is too large
+ 35 invalid condition (?(0)
+ 36 \eC not allowed in lookbehind assertion
+ 37 PCRE does not support \eL, \el, \eN, \eU, or \eu
+ 38 number after (?C is > 255
+ 39 closing ) for (?C expected
+ 40 recursive call could loop indefinitely
+ 41 unrecognized character after (?P
+ 42 syntax error in subpattern name (missing terminator)
+ 43 two named subpatterns have the same name
+ 44 invalid UTF-8 string
+ 45 support for \eP, \ep, and \eX has not been compiled
+ 46 malformed \eP or \ep sequence
+ 47 unknown property name after \eP or \ep
+ 48 subpattern name is too long (maximum 32 characters)
+ 49 too many named subpatterns (maximum 10000)
+ 50 [this code is not in use]
+ 51 octal value is greater than \e377 (not in UTF-8 mode)
+ 52 internal error: overran compiling workspace
+ 53 internal error: previously-checked referenced subpattern not found
+ 54 DEFINE group contains more than one branch
+ 55 repeating a DEFINE group is not allowed
+ 56 inconsistent NEWLINE options
+ 57 \eg is not followed by a braced, angle-bracketed, or quoted
+ name/number or by a plain number
+ 58 a numbered reference must not be zero
+ 59 (*VERB) with an argument is not supported
+ 60 (*VERB) not recognized
+ 61 number is too big
+ 62 subpattern name expected
+ 63 digit expected after (?+
+ 64 ] is an invalid data character in JavaScript compatibility mode
+.sp
+The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may
+be used if the limits were changed when PCRE was built.
+.
+.
+.SH "STUDYING A PATTERN"
+.rs
+.sp
+.B pcre_extra *pcre_study(const pcre *\fIcode\fP, int \fIoptions\fP,
+.ti +5n
+.B const char **\fIerrptr\fP);
+.PP
+If a compiled pattern is going to be used several times, it is worth spending
+more time analyzing it in order to speed up the time taken for matching. The
+function \fBpcre_study()\fP takes a pointer to a compiled pattern as its first
+argument. If studying the pattern produces additional information that will
+help speed up matching, \fBpcre_study()\fP returns a pointer to a
+\fBpcre_extra\fP block, in which the \fIstudy_data\fP field points to the
+results of the study.
+.P
+The returned value from \fBpcre_study()\fP can be passed directly to
+\fBpcre_exec()\fP. However, a \fBpcre_extra\fP block also contains other
+fields that can be set by the caller before the block is passed; these are
+described
+.\" HTML
+.\"
+below
+.\"
+in the section on matching a pattern.
+.P
+If studying the pattern does not produce any additional information
+\fBpcre_study()\fP returns NULL. In that circumstance, if the calling program
+wants to pass any of the other fields to \fBpcre_exec()\fP, it must set up its
+own \fBpcre_extra\fP block.
+.P
+The second argument of \fBpcre_study()\fP contains option bits. At present, no
+options are defined, and this argument should always be zero.
+.P
+The third argument for \fBpcre_study()\fP is a pointer for an error message. If
+studying succeeds (even if no data is returned), the variable it points to is
+set to NULL. Otherwise it is set to point to a textual error message. This is a
+static string that is part of the library. You must not try to free it. You
+should test the error pointer for NULL after calling \fBpcre_study()\fP, to be
+sure that it has run successfully.
+.P
+This is a typical call to \fBpcre_study()\fP:
+.sp
+ pcre_extra *pe;
+ pe = pcre_study(
+ re, /* result of pcre_compile() */
+ 0, /* no options exist */
+ &error); /* set to NULL or points to a message */
+.sp
+At present, studying a pattern is useful only for non-anchored patterns that do
+not have a single fixed starting character. A bitmap of possible starting
+bytes is created.
+.
+.
+.\" HTML
+.SH "LOCALE SUPPORT"
+.rs
+.sp
+PCRE handles caseless matching, and determines whether characters are letters,
+digits, or whatever, by reference to a set of tables, indexed by character
+value. When running in UTF-8 mode, this applies only to characters with codes
+less than 128. Higher-valued codes never match escapes such as \ew or \ed, but
+can be tested with \ep if PCRE is built with Unicode character property
+support. The use of locales with Unicode is discouraged. If you are handling
+characters with codes greater than 128, you should either use UTF-8 and
+Unicode, or use locales, but not try to mix the two.
+.P
+PCRE contains an internal set of tables that are used when the final argument
+of \fBpcre_compile()\fP is NULL. These are sufficient for many applications.
+Normally, the internal tables recognize only ASCII characters. However, when
+PCRE is built, it is possible to cause the internal tables to be rebuilt in the
+default "C" locale of the local system, which may cause them to be different.
+.P
+The internal tables can always be overridden by tables supplied by the
+application that calls PCRE. These may be created in a different locale from
+the default. As more and more applications change to using Unicode, the need
+for this locale support is expected to die away.
+.P
+External tables are built by calling the \fBpcre_maketables()\fP function,
+which has no arguments, in the relevant locale. The result can then be passed
+to \fBpcre_compile()\fP or \fBpcre_exec()\fP as often as necessary. For
+example, to build and use tables that are appropriate for the French locale
+(where accented characters with values greater than 128 are treated as letters),
+the following code could be used:
+.sp
+ setlocale(LC_CTYPE, "fr_FR");
+ tables = pcre_maketables();
+ re = pcre_compile(..., tables);
+.sp
+The locale name "fr_FR" is used on Linux and other Unix-like systems; if you
+are using Windows, the name for the French locale is "french".
+.P
+When \fBpcre_maketables()\fP runs, the tables are built in memory that is
+obtained via \fBpcre_malloc\fP. It is the caller's responsibility to ensure
+that the memory containing the tables remains available for as long as it is
+needed.
+.P
+The pointer that is passed to \fBpcre_compile()\fP is saved with the compiled
+pattern, and the same tables are used via this pointer by \fBpcre_study()\fP
+and normally also by \fBpcre_exec()\fP. Thus, by default, for any single
+pattern, compilation, studying and matching all happen in the same locale, but
+different patterns can be compiled in different locales.
+.P
+It is possible to pass a table pointer or NULL (indicating the use of the
+internal tables) to \fBpcre_exec()\fP. Although not intended for this purpose,
+this facility could be used to match a pattern in a different locale from the
+one in which it was compiled. Passing table pointers at run time is discussed
+below in the section on matching a pattern.
+.
+.
+.SH "INFORMATION ABOUT A PATTERN"
+.rs
+.sp
+.B int pcre_fullinfo(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
+.ti +5n
+.B int \fIwhat\fP, void *\fIwhere\fP);
+.PP
+The \fBpcre_fullinfo()\fP function returns information about a compiled
+pattern. It replaces the obsolete \fBpcre_info()\fP function, which is
+nevertheless retained for backwards compatibility (and is documented below).
+.P
+The first argument for \fBpcre_fullinfo()\fP is a pointer to the compiled
+pattern. The second argument is the result of \fBpcre_study()\fP, or NULL if
+the pattern was not studied. The third argument specifies which piece of
+information is required, and the fourth argument is a pointer to a variable
+to receive the data. The yield of the function is zero for success, or one of
+the following negative numbers:
+.sp
+ PCRE_ERROR_NULL the argument \fIcode\fP was NULL
+ the argument \fIwhere\fP was NULL
+ PCRE_ERROR_BADMAGIC the "magic number" was not found
+ PCRE_ERROR_BADOPTION the value of \fIwhat\fP was invalid
+.sp
+The "magic number" is placed at the start of each compiled pattern as a simple
+check against passing an arbitrary memory pointer. Here is a typical call of
+\fBpcre_fullinfo()\fP, to obtain the length of the compiled pattern:
+.sp
+ int rc;
+ size_t length;
+ rc = pcre_fullinfo(
+ re, /* result of pcre_compile() */
+ pe, /* result of pcre_study(), or NULL */
+ PCRE_INFO_SIZE, /* what is required */
+ &length); /* where to put the data */
+.sp
+The possible values for the third argument are defined in \fBpcre.h\fP, and are
+as follows:
+.sp
+ PCRE_INFO_BACKREFMAX
+.sp
+Return the number of the highest back reference in the pattern. The fourth
+argument should point to an \fBint\fP variable. Zero is returned if there are
+no back references.
+.sp
+ PCRE_INFO_CAPTURECOUNT
+.sp
+Return the number of capturing subpatterns in the pattern. The fourth argument
+should point to an \fBint\fP variable.
+.sp
+ PCRE_INFO_DEFAULT_TABLES
+.sp
+Return a pointer to the internal default character tables within PCRE. The
+fourth argument should point to an \fBunsigned char *\fP variable. This
+information call is provided for internal use by the \fBpcre_study()\fP
+function. External callers can cause PCRE to use its internal tables by passing
+a NULL table pointer.
+.sp
+ PCRE_INFO_FIRSTBYTE
+.sp
+Return information about the first byte of any matched string, for a
+non-anchored pattern. The fourth argument should point to an \fBint\fP
+variable. (This option used to be called PCRE_INFO_FIRSTCHAR; the old name is
+still recognized for backwards compatibility.)
+.P
+If there is a fixed first byte, for example, from a pattern such as
+(cat|cow|coyote), its value is returned. Otherwise, if either
+.sp
+(a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
+starts with "^", or
+.sp
+(b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set
+(if it were set, the pattern would be anchored),
+.sp
+-1 is returned, indicating that the pattern matches only at the start of a
+subject string or after any newline within the string. Otherwise -2 is
+returned. For anchored patterns, -2 is returned.
+.sp
+ PCRE_INFO_FIRSTTABLE
+.sp
+If the pattern was studied, and this resulted in the construction of a 256-bit
+table indicating a fixed set of bytes for the first byte in any matching
+string, a pointer to the table is returned. Otherwise NULL is returned. The
+fourth argument should point to an \fBunsigned char *\fP variable.
+.sp
+ PCRE_INFO_HASCRORLF
+.sp
+Return 1 if the pattern contains any explicit matches for CR or LF characters,
+otherwise 0. The fourth argument should point to an \fBint\fP variable. An
+explicit match is either a literal CR or LF character, or \er or \en.
+.sp
+ PCRE_INFO_JCHANGED
+.sp
+Return 1 if the (?J) or (?-J) option setting is used in the pattern, otherwise
+0. The fourth argument should point to an \fBint\fP variable. (?J) and
+(?-J) set and unset the local PCRE_DUPNAMES option, respectively.
+.sp
+ PCRE_INFO_LASTLITERAL
+.sp
+Return the value of the rightmost literal byte that must exist in any matched
+string, other than at its start, if such a byte has been recorded. The fourth
+argument should point to an \fBint\fP variable. If there is no such byte, -1 is
+returned. For anchored patterns, a last literal byte is recorded only if it
+follows something of variable length. For example, for the pattern
+/^a\ed+z\ed+/ the returned value is "z", but for /^a\edz\ed/ the returned value
+is -1.
+.sp
+ PCRE_INFO_NAMECOUNT
+ PCRE_INFO_NAMEENTRYSIZE
+ PCRE_INFO_NAMETABLE
+.sp
+PCRE supports the use of named as well as numbered capturing parentheses. The
+names are just an additional way of identifying the parentheses, which still
+acquire numbers. Several convenience functions such as
+\fBpcre_get_named_substring()\fP are provided for extracting captured
+substrings by name. It is also possible to extract the data directly, by first
+converting the name to a number in order to access the correct pointers in the
+output vector (described with \fBpcre_exec()\fP below). To do the conversion,
+you need to use the name-to-number map, which is described by these three
+values.
+.P
+The map consists of a number of fixed-size entries. PCRE_INFO_NAMECOUNT gives
+the number of entries, and PCRE_INFO_NAMEENTRYSIZE gives the size of each
+entry; both of these return an \fBint\fP value. The entry size depends on the
+length of the longest name. PCRE_INFO_NAMETABLE returns a pointer to the first
+entry of the table (a pointer to \fBchar\fP). The first two bytes of each entry
+are the number of the capturing parenthesis, most significant byte first. The
+rest of the entry is the corresponding name, zero terminated. The names are in
+alphabetical order. When PCRE_DUPNAMES is set, duplicate names are in order of
+their parentheses numbers. For example, consider the following pattern (assume
+PCRE_EXTENDED is set, so white space - including newlines - is ignored):
+.sp
+.\" JOIN
+ (?<date> (?<year>(\ed\ed)?\ed\ed) -
+ (?<month>\ed\ed) - (?<day>\ed\ed) )
+.sp
+There are four named subpatterns, so the table has four entries, and each entry
+in the table is eight bytes long. The table is as follows, with non-printing
+bytes shown in hexadecimal, and undefined bytes shown as ??:
+.sp
+ 00 01 d a t e 00 ??
+ 00 05 d a y 00 ?? ??
+ 00 04 m o n t h 00
+ 00 02 y e a r 00 ??
+.sp
+When writing code to extract data from named subpatterns using the
+name-to-number map, remember that the length of the entries is likely to be
+different for each compiled pattern.
+.sp
+ PCRE_INFO_OKPARTIAL
+.sp
+Return 1 if the pattern can be used for partial matching, otherwise 0. The
+fourth argument should point to an \fBint\fP variable. The
+.\" HREF
+\fBpcrepartial\fP
+.\"
+documentation lists the restrictions that apply to patterns when partial
+matching is used.
+.sp
+ PCRE_INFO_OPTIONS
+.sp
+Return a copy of the options with which the pattern was compiled. The fourth
+argument should point to an \fBunsigned long int\fP variable. These option bits
+are those specified in the call to \fBpcre_compile()\fP, modified by any
+top-level option settings at the start of the pattern itself. In other words,
+they are the options that will be in force when matching starts. For example,
+if the pattern /(?im)abc(?-i)d/ is compiled with the PCRE_EXTENDED option, the
+result is PCRE_CASELESS, PCRE_MULTILINE, and PCRE_EXTENDED.
+.P
+A pattern is automatically anchored by PCRE if all of its top-level
+alternatives begin with one of the following:
+.sp
+ ^ unless PCRE_MULTILINE is set
+ \eA always
+ \eG always
+.\" JOIN
+ .* if PCRE_DOTALL is set and there are no back
+ references to the subpattern in which .* appears
+.sp
+For such patterns, the PCRE_ANCHORED bit is set in the options returned by
+\fBpcre_fullinfo()\fP.
+.sp
+ PCRE_INFO_SIZE
+.sp
+Return the size of the compiled pattern, that is, the value that was passed as
+the argument to \fBpcre_malloc()\fP when PCRE was getting memory in which to
+place the compiled data. The fourth argument should point to a \fBsize_t\fP
+variable.
+.sp
+ PCRE_INFO_STUDYSIZE
+.sp
+Return the size of the data block pointed to by the \fIstudy_data\fP field in
+a \fBpcre_extra\fP block. That is, it is the value that was passed to
+\fBpcre_malloc()\fP when PCRE was getting memory into which to place the data
+created by \fBpcre_study()\fP. The fourth argument should point to a
+\fBsize_t\fP variable.
+.
+.
+.SH "OBSOLETE INFO FUNCTION"
+.rs
+.sp
+.B int pcre_info(const pcre *\fIcode\fP, int *\fIoptptr\fP, int
+.B *\fIfirstcharptr\fP);
+.PP
+The \fBpcre_info()\fP function is now obsolete because its interface is too
+restrictive to return all the available data about a compiled pattern. New
+programs should use \fBpcre_fullinfo()\fP instead. The yield of
+\fBpcre_info()\fP is the number of capturing subpatterns, or one of the
+following negative numbers:
+.sp
+ PCRE_ERROR_NULL the argument \fIcode\fP was NULL
+ PCRE_ERROR_BADMAGIC the "magic number" was not found
+.sp
+If the \fIoptptr\fP argument is not NULL, a copy of the options with which the
+pattern was compiled is placed in the integer it points to (see
+PCRE_INFO_OPTIONS above).
+.P
+If the pattern is not anchored and the \fIfirstcharptr\fP argument is not NULL,
+it is used to pass back information about the first character of any matched
+string (see PCRE_INFO_FIRSTBYTE above).
+.
+.
+.SH "REFERENCE COUNTS"
+.rs
+.sp
+.B int pcre_refcount(pcre *\fIcode\fP, int \fIadjust\fP);
+.PP
+The \fBpcre_refcount()\fP function is used to maintain a reference count in the
+data block that contains a compiled pattern. It is provided for the benefit of
+applications that operate in an object-oriented manner, where different parts
+of the application may be using the same compiled pattern, but you want to free
+the block when they are all done.
+.P
+When a pattern is compiled, the reference count field is initialized to zero.
+It is changed only by calling this function, whose action is to add the
+\fIadjust\fP value (which may be positive or negative) to it. The yield of the
+function is the new value. However, the value of the count is constrained to
+lie between 0 and 65535, inclusive. If the new value is outside these limits,
+it is forced to the appropriate limit value.
+.P
+Except when it is zero, the reference count is not correctly preserved if a
+pattern is compiled on one host and then transferred to a host whose byte-order
+is different. (This seems a highly unlikely scenario.)
+.
+.
+.SH "MATCHING A PATTERN: THE TRADITIONAL FUNCTION"
+.rs
+.sp
+.B int pcre_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
+.ti +5n
+.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
+.ti +5n
+.B int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP);
+.P
+The function \fBpcre_exec()\fP is called to match a subject string against a
+compiled pattern, which is passed in the \fIcode\fP argument. If the
+pattern has been studied, the result of the study should be passed in the
+\fIextra\fP argument. This function is the main matching facility of the
+library, and it operates in a Perl-like manner. For specialist use there is
+also an alternative matching function, which is described
+.\" HTML
+.\"
+below
+.\"
+in the section about the \fBpcre_dfa_exec()\fP function.
+.P
+In most applications, the pattern will have been compiled (and optionally
+studied) in the same process that calls \fBpcre_exec()\fP. However, it is
+possible to save compiled patterns and study data, and then use them later
+in different processes, possibly even on different hosts. For a discussion
+about this, see the
+.\" HREF
+\fBpcreprecompile\fP
+.\"
+documentation.
+.P
+Here is an example of a simple call to \fBpcre_exec()\fP:
+.sp
+ int rc;
+ int ovector[30];
+ rc = pcre_exec(
+ re, /* result of pcre_compile() */
+ NULL, /* we didn't study the pattern */
+ "some string", /* the subject string */
+ 11, /* the length of the subject string */
+ 0, /* start at offset 0 in the subject */
+ 0, /* default options */
+ ovector, /* vector of integers for substring information */
+ 30); /* number of elements (NOT size in bytes) */
+.
+.\" HTML
+.SS "Extra data for \fBpcre_exec()\fR"
+.rs
+.sp
+If the \fIextra\fP argument is not NULL, it must point to a \fBpcre_extra\fP
+data block. The \fBpcre_study()\fP function returns such a block (when it
+doesn't return NULL), but you can also create one for yourself, and pass
+additional information in it. The \fBpcre_extra\fP block contains the following
+fields (not necessarily in this order):
+.sp
+ unsigned long int \fIflags\fP;
+ void *\fIstudy_data\fP;
+ unsigned long int \fImatch_limit\fP;
+ unsigned long int \fImatch_limit_recursion\fP;
+ void *\fIcallout_data\fP;
+ const unsigned char *\fItables\fP;
+.sp
+The \fIflags\fP field is a bitmap that specifies which of the other fields
+are set. The flag bits are:
+.sp
+ PCRE_EXTRA_STUDY_DATA
+ PCRE_EXTRA_MATCH_LIMIT
+ PCRE_EXTRA_MATCH_LIMIT_RECURSION
+ PCRE_EXTRA_CALLOUT_DATA
+ PCRE_EXTRA_TABLES
+.sp
+Other flag bits should be set to zero. The \fIstudy_data\fP field is set in the
+\fBpcre_extra\fP block that is returned by \fBpcre_study()\fP, together with
+the appropriate flag bit. You should not set this yourself, but you may add to
+the block by setting the other fields and their corresponding flag bits.
+.P
+The \fImatch_limit\fP field provides a means of preventing PCRE from using up a
+vast amount of resources when running patterns that are not going to match,
+but which have a very large number of possibilities in their search trees. The
+classic example is the use of nested unlimited repeats.
+.P
+Internally, PCRE uses a function called \fBmatch()\fP which it calls repeatedly
+(sometimes recursively). The limit set by \fImatch_limit\fP is imposed on the
+number of times this function is called during a match, which has the effect of
+limiting the amount of backtracking that can take place. For patterns that are
+not anchored, the count restarts from zero for each position in the subject
+string.
+.P
+The default value for the limit can be set when PCRE is built; the default
+default is 10 million, which handles all but the most extreme cases. You can
+override the default by supplying \fBpcre_exec()\fP with a \fBpcre_extra\fP
+block in which \fImatch_limit\fP is set, and PCRE_EXTRA_MATCH_LIMIT is set in
+the \fIflags\fP field. If the limit is exceeded, \fBpcre_exec()\fP returns
+PCRE_ERROR_MATCHLIMIT.
+.P
+The \fImatch_limit_recursion\fP field is similar to \fImatch_limit\fP, but
+instead of limiting the total number of times that \fBmatch()\fP is called, it
+limits the depth of recursion. The recursion depth is a smaller number than the
+total number of calls, because not all calls to \fBmatch()\fP are recursive.
+This limit is of use only if it is set smaller than \fImatch_limit\fP.
+.P
+Limiting the recursion depth limits the amount of stack that can be used, or,
+when PCRE has been compiled to use memory on the heap instead of the stack, the
+amount of heap memory that can be used.
+.P
+The default value for \fImatch_limit_recursion\fP can be set when PCRE is
+built; the default default is the same value as the default for
+\fImatch_limit\fP. You can override the default by supplying \fBpcre_exec()\fP
+with a \fBpcre_extra\fP block in which \fImatch_limit_recursion\fP is set, and
+PCRE_EXTRA_MATCH_LIMIT_RECURSION is set in the \fIflags\fP field. If the limit
+is exceeded, \fBpcre_exec()\fP returns PCRE_ERROR_RECURSIONLIMIT.
+.P
+The \fIcallout_data\fP field is used in conjunction with the "callout" feature,
+which is described in the
+.\" HREF
+\fBpcrecallout\fP
+.\"
+documentation.
+.P
+The \fItables\fP field is used to pass a character tables pointer to
+\fBpcre_exec()\fP; this overrides the value that is stored with the compiled
+pattern. A non-NULL value is stored with the compiled pattern only if custom
+tables were supplied to \fBpcre_compile()\fP via its \fItableptr\fP argument.
+If NULL is passed to \fBpcre_exec()\fP using this mechanism, it forces PCRE's
+internal tables to be used. This facility is helpful when re-using patterns
+that have been saved after compiling with an external set of tables, because
+the external tables might be at a different address when \fBpcre_exec()\fP is
+called. See the
+.\" HREF
+\fBpcreprecompile\fP
+.\"
+documentation for a discussion of saving compiled patterns for later use.
+.
+.\" HTML
+.SS "Option bits for \fBpcre_exec()\fP"
+.rs
+.sp
+The unused bits of the \fIoptions\fP argument for \fBpcre_exec()\fP must be
+zero. The only bits that may be set are PCRE_ANCHORED, PCRE_BSR_\fIxxx\fP,
+PCRE_NEWLINE_\fIxxx\fP, PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY,
+PCRE_NO_UTF8_CHECK and PCRE_PARTIAL.
+.sp
+ PCRE_ANCHORED
+.sp
+The PCRE_ANCHORED option limits \fBpcre_exec()\fP to matching at the first
+matching position. If a pattern was compiled with PCRE_ANCHORED, or turned out
+to be anchored by virtue of its contents, it cannot be made unanchored at
+matching time.
+.sp
+ PCRE_BSR_ANYCRLF
+ PCRE_BSR_UNICODE
+.sp
+These options (which are mutually exclusive) control what the \eR escape
+sequence matches. The choice is either to match only CR, LF, or CRLF, or to
+match any Unicode newline sequence. These options override the choice that was
+made or defaulted when the pattern was compiled.
+.sp
+ PCRE_NEWLINE_CR
+ PCRE_NEWLINE_LF
+ PCRE_NEWLINE_CRLF
+ PCRE_NEWLINE_ANYCRLF
+ PCRE_NEWLINE_ANY
+.sp
+These options override the newline definition that was chosen or defaulted when
+the pattern was compiled. For details, see the description of
+\fBpcre_compile()\fP above. During matching, the newline choice affects the
+behaviour of the dot, circumflex, and dollar metacharacters. It may also alter
+the way the match position is advanced after a match failure for an unanchored
+pattern.
+.P
+When PCRE_NEWLINE_CRLF, PCRE_NEWLINE_ANYCRLF, or PCRE_NEWLINE_ANY is set, and a
+match attempt for an unanchored pattern fails when the current position is at a
+CRLF sequence, and the pattern contains no explicit matches for CR or LF
+characters, the match position is advanced by two characters instead of one, in
+other words, to after the CRLF.
+.P
+The above rule is a compromise that makes the most common cases work as
+expected. For example, if the pattern is .+A (and the PCRE_DOTALL option is not
+set), it does not match the string "\er\enA" because, after failing at the
+start, it skips both the CR and the LF before retrying. However, the pattern
+[\er\en]A does match that string, because it contains an explicit CR or LF
+reference, and so advances only by one character after the first failure.
+.P
+An explicit match for CR or LF is either a literal appearance of one of those
+characters, or one of the \er or \en escape sequences. Implicit matches such as
+[^X] do not count, nor does \es (which includes CR and LF in the characters
+that it matches).
+.P
+Notwithstanding the above, anomalous effects may still occur when CRLF is a
+valid newline sequence and explicit \er or \en escapes appear in the pattern.
+.sp
+ PCRE_NOTBOL
+.sp
+This option specifies that the first character of the subject string is not the
+beginning of a line, so the circumflex metacharacter should not match before
+it. Setting this without PCRE_MULTILINE (at compile time) causes circumflex
+never to match. This option affects only the behaviour of the circumflex
+metacharacter. It does not affect \eA.
+.sp
+ PCRE_NOTEOL
+.sp
+This option specifies that the end of the subject string is not the end of a
+line, so the dollar metacharacter should not match it nor (except in multiline
+mode) a newline immediately before it. Setting this without PCRE_MULTILINE (at
+compile time) causes dollar never to match. This option affects only the
+behaviour of the dollar metacharacter. It does not affect \eZ or \ez.
+.sp
+ PCRE_NOTEMPTY
+.sp
+An empty string is not considered to be a valid match if this option is set. If
+there are alternatives in the pattern, they are tried. If all the alternatives
+match the empty string, the entire match fails. For example, if the pattern
+.sp
+ a?b?
+.sp
+is applied to a string not beginning with "a" or "b", it matches the empty
+string at the start of the subject. With PCRE_NOTEMPTY set, this match is not
+valid, so PCRE searches further into the string for occurrences of "a" or "b".
+.P
+Perl has no direct equivalent of PCRE_NOTEMPTY, but it does make a special case
+of a pattern match of the empty string within its \fBsplit()\fP function, and
+when using the /g modifier. It is possible to emulate Perl's behaviour after
+matching a null string by first trying the match again at the same offset with
+PCRE_NOTEMPTY and PCRE_ANCHORED, and then if that fails by advancing the
+starting offset (see below) and trying an ordinary match again. There is some
+code that demonstrates how to do this in the \fIpcredemo.c\fP sample program.
+.sp
+ PCRE_NO_UTF8_CHECK
+.sp
+When PCRE_UTF8 is set at compile time, the validity of the subject as a UTF-8
+string is automatically checked when \fBpcre_exec()\fP is subsequently called.
+The value of \fIstartoffset\fP is also checked to ensure that it points to the
+start of a UTF-8 character. There is a discussion about the validity of UTF-8
+strings in the
+.\" HTML
+.\"
+section on UTF-8 support
+.\"
+in the main
+.\" HREF
+\fBpcre\fP
+.\"
+page. If an invalid UTF-8 sequence of bytes is found, \fBpcre_exec()\fP returns
+the error PCRE_ERROR_BADUTF8. If \fIstartoffset\fP contains an invalid value,
+PCRE_ERROR_BADUTF8_OFFSET is returned.
+.P
+If you already know that your subject is valid, and you want to skip these
+checks for performance reasons, you can set the PCRE_NO_UTF8_CHECK option when
+calling \fBpcre_exec()\fP. You might want to do this for the second and
+subsequent calls to \fBpcre_exec()\fP if you are making repeated calls to find
+all the matches in a single subject string. However, you should be sure that
+the value of \fIstartoffset\fP points to the start of a UTF-8 character. When
+PCRE_NO_UTF8_CHECK is set, the effect of passing an invalid UTF-8 string as a
+subject, or a value of \fIstartoffset\fP that does not point to the start of a
+UTF-8 character, is undefined. Your program may crash.
+.sp
+ PCRE_PARTIAL
+.sp
+This option turns on the partial matching feature. If the subject string fails
+to match the pattern, but at some point during the matching process the end of
+the subject was reached (that is, the subject partially matches the pattern and
+the failure to match occurred only because there were not enough subject
+characters), \fBpcre_exec()\fP returns PCRE_ERROR_PARTIAL instead of
+PCRE_ERROR_NOMATCH. When PCRE_PARTIAL is used, there are restrictions on what
+may appear in the pattern. These are discussed in the
+.\" HREF
+\fBpcrepartial\fP
+.\"
+documentation.
+.
+.SS "The string to be matched by \fBpcre_exec()\fP"
+.rs
+.sp
+The subject string is passed to \fBpcre_exec()\fP as a pointer in
+\fIsubject\fP, a length in \fIlength\fP, and a starting byte offset in
+\fIstartoffset\fP. In UTF-8 mode, the byte offset must point to the start of a
+UTF-8 character. Unlike the pattern string, the subject may contain binary zero
+bytes. When the starting offset is zero, the search for a match starts at the
+beginning of the subject, and this is by far the most common case.
+.P
+A non-zero starting offset is useful when searching for another match in the
+same subject by calling \fBpcre_exec()\fP again after a previous success.
+Setting \fIstartoffset\fP differs from just passing over a shortened string and
+setting PCRE_NOTBOL in the case of a pattern that begins with any kind of
+lookbehind. For example, consider the pattern
+.sp
+ \eBiss\eB
+.sp
+which finds occurrences of "iss" in the middle of words. (\eB matches only if
+the current position in the subject is not a word boundary.) When applied to
+the string "Mississippi" the first call to \fBpcre_exec()\fP finds the first
+occurrence. If \fBpcre_exec()\fP is called again with just the remainder of the
+subject, namely "issippi", it does not match, because \eB is always false at the
+start of the subject, which is deemed to be a word boundary. However, if
+\fBpcre_exec()\fP is passed the entire string again, but with \fIstartoffset\fP
+set to 4, it finds the second occurrence of "iss" because it is able to look
+behind the starting point to discover that it is preceded by a letter.
+.P
+If a non-zero starting offset is passed when the pattern is anchored, one
+attempt to match at the given offset is made. This can only succeed if the
+pattern does not require the match to be at the start of the subject.
+.
+.SS "How \fBpcre_exec()\fP returns captured substrings"
+.rs
+.sp
+In general, a pattern matches a certain portion of the subject, and in
+addition, further substrings from the subject may be picked out by parts of the
+pattern. Following the usage in Jeffrey Friedl's book, this is called
+"capturing" in what follows, and the phrase "capturing subpattern" is used for
+a fragment of a pattern that picks out a substring. PCRE supports several other
+kinds of parenthesized subpattern that do not cause substrings to be captured.
+.P
+Captured substrings are returned to the caller via a vector of integer offsets
+whose address is passed in \fIovector\fP. The number of elements in the vector
+is passed in \fIovecsize\fP, which must be a non-negative number. \fBNote\fP:
+this argument is NOT the size of \fIovector\fP in bytes.
+.P
+The first two-thirds of the vector is used to pass back captured substrings,
+each substring using a pair of integers. The remaining third of the vector is
+used as workspace by \fBpcre_exec()\fP while matching capturing subpatterns,
+and is not available for passing back information. The length passed in
+\fIovecsize\fP should always be a multiple of three. If it is not, it is
+rounded down.
+.P
+When a match is successful, information about captured substrings is returned
+in pairs of integers, starting at the beginning of \fIovector\fP, and
+continuing up to two-thirds of its length at the most. The first element of a
+pair is set to the offset of the first character in a substring, and the second
+is set to the offset of the first character after the end of a substring. The
+first pair, \fIovector[0]\fP and \fIovector[1]\fP, identify the portion of the
+subject string matched by the entire pattern. The next pair is used for the
+first capturing subpattern, and so on. The value returned by \fBpcre_exec()\fP
+is one more than the highest numbered pair that has been set. For example, if
+two substrings have been captured, the returned value is 3. If there are no
+capturing subpatterns, the return value from a successful match is 1,
+indicating that just the first pair of offsets has been set.
+.P
+If a capturing subpattern is matched repeatedly, it is the last portion of the
+string that it matched that is returned.
+.P
+If the vector is too small to hold all the captured substring offsets, it is
+used as far as possible (up to two-thirds of its length), and the function
+returns a value of zero. In particular, if the substring offsets are not of
+interest, \fBpcre_exec()\fP may be called with \fIovector\fP passed as NULL and
+\fIovecsize\fP as zero. However, if the pattern contains back references and
+the \fIovector\fP is not big enough to remember the related substrings, PCRE
+has to get additional memory for use during matching. Thus it is usually
+advisable to supply an \fIovector\fP.
+.P
+The \fBpcre_info()\fP function can be used to find out how many capturing
+subpatterns there are in a compiled pattern. The smallest size for
+\fIovector\fP that will allow for \fIn\fP captured substrings, in addition to
+the offsets of the substring matched by the whole pattern, is (\fIn\fP+1)*3.
+.P
+It is possible for capturing subpattern number \fIn+1\fP to match some part of
+the subject when subpattern \fIn\fP has not been used at all. For example, if
+the string "abc" is matched against the pattern (a|(z))(bc) the return from the
+function is 4, and subpatterns 1 and 3 are matched, but 2 is not. When this
+happens, both values in the offset pairs corresponding to unused subpatterns
+are set to -1.
+.P
+Offset values that correspond to unused subpatterns at the end of the
+expression are also set to -1. For example, if the string "abc" is matched
+against the pattern (abc)(x(yz)?)? subpatterns 2 and 3 are not matched. The
+return from the function is 2, because the highest used capturing subpattern
+number is 1. However, you can refer to the offsets for the second and third
+capturing subpatterns if you wish (assuming the vector is large enough, of
+course).
+.P
+Some convenience functions are provided for extracting the captured substrings
+as separate strings. These are described below.
+.
+.\" HTML
+.SS "Error return values from \fBpcre_exec()\fP"
+.rs
+.sp
+If \fBpcre_exec()\fP fails, it returns a negative number. The following are
+defined in the header file:
+.sp
+ PCRE_ERROR_NOMATCH (-1)
+.sp
+The subject string did not match the pattern.
+.sp
+ PCRE_ERROR_NULL (-2)
+.sp
+Either \fIcode\fP or \fIsubject\fP was passed as NULL, or \fIovector\fP was
+NULL and \fIovecsize\fP was not zero.
+.sp
+ PCRE_ERROR_BADOPTION (-3)
+.sp
+An unrecognized bit was set in the \fIoptions\fP argument.
+.sp
+ PCRE_ERROR_BADMAGIC (-4)
+.sp
+PCRE stores a 4-byte "magic number" at the start of the compiled code, to catch
+the case when it is passed a junk pointer and to detect when a pattern that was
+compiled in an environment of one endianness is run in an environment with the
+other endianness. This is the error that PCRE gives when the magic number is
+not present.
+.sp
+ PCRE_ERROR_UNKNOWN_OPCODE (-5)
+.sp
+While running the pattern match, an unknown item was encountered in the
+compiled pattern. This error could be caused by a bug in PCRE or by overwriting
+of the compiled pattern.
+.sp
+ PCRE_ERROR_NOMEMORY (-6)
+.sp
+If a pattern contains back references, but the \fIovector\fP that is passed to
+\fBpcre_exec()\fP is not big enough to remember the referenced substrings, PCRE
+gets a block of memory at the start of matching to use for this purpose. If the
+call via \fBpcre_malloc()\fP fails, this error is given. The memory is
+automatically freed at the end of matching.
+.sp
+ PCRE_ERROR_NOSUBSTRING (-7)
+.sp
+This error is used by the \fBpcre_copy_substring()\fP,
+\fBpcre_get_substring()\fP, and \fBpcre_get_substring_list()\fP functions (see
+below). It is never returned by \fBpcre_exec()\fP.
+.sp
+ PCRE_ERROR_MATCHLIMIT (-8)
+.sp
+The backtracking limit, as specified by the \fImatch_limit\fP field in a
+\fBpcre_extra\fP structure (or defaulted) was reached. See the description
+above.
+.sp
+ PCRE_ERROR_CALLOUT (-9)
+.sp
+This error is never generated by \fBpcre_exec()\fP itself. It is provided for
+use by callout functions that want to yield a distinctive error code. See the
+.\" HREF
+\fBpcrecallout\fP
+.\"
+documentation for details.
+.sp
+ PCRE_ERROR_BADUTF8 (-10)
+.sp
+A string that contains an invalid UTF-8 byte sequence was passed as a subject.
+.sp
+ PCRE_ERROR_BADUTF8_OFFSET (-11)
+.sp
+The UTF-8 byte sequence that was passed as a subject was valid, but the value
+of \fIstartoffset\fP did not point to the beginning of a UTF-8 character.
+.sp
+ PCRE_ERROR_PARTIAL (-12)
+.sp
+The subject string did not match, but it did match partially. See the
+.\" HREF
+\fBpcrepartial\fP
+.\"
+documentation for details of partial matching.
+.sp
+ PCRE_ERROR_BADPARTIAL (-13)
+.sp
+The PCRE_PARTIAL option was used with a compiled pattern containing items that
+are not supported for partial matching. See the
+.\" HREF
+\fBpcrepartial\fP
+.\"
+documentation for details of partial matching.
+.sp
+ PCRE_ERROR_INTERNAL (-14)
+.sp
+An unexpected internal error has occurred. This error could be caused by a bug
+in PCRE or by overwriting of the compiled pattern.
+.sp
+ PCRE_ERROR_BADCOUNT (-15)
+.sp
+This error is given if the value of the \fIovecsize\fP argument is negative.
+.sp
+ PCRE_ERROR_RECURSIONLIMIT (-21)
+.sp
+The internal recursion limit, as specified by the \fImatch_limit_recursion\fP
+field in a \fBpcre_extra\fP structure (or defaulted) was reached. See the
+description above.
+.sp
+ PCRE_ERROR_BADNEWLINE (-23)
+.sp
+An invalid combination of PCRE_NEWLINE_\fIxxx\fP options was given.
+.P
+Error numbers -16 to -20 and -22 are not used by \fBpcre_exec()\fP.
+.
+.
+.SH "EXTRACTING CAPTURED SUBSTRINGS BY NUMBER"
+.rs
+.sp
+.B int pcre_copy_substring(const char *\fIsubject\fP, int *\fIovector\fP,
+.ti +5n
+.B int \fIstringcount\fP, int \fIstringnumber\fP, char *\fIbuffer\fP,
+.ti +5n
+.B int \fIbuffersize\fP);
+.PP
+.B int pcre_get_substring(const char *\fIsubject\fP, int *\fIovector\fP,
+.ti +5n
+.B int \fIstringcount\fP, int \fIstringnumber\fP,
+.ti +5n
+.B const char **\fIstringptr\fP);
+.PP
+.B int pcre_get_substring_list(const char *\fIsubject\fP,
+.ti +5n
+.B int *\fIovector\fP, int \fIstringcount\fP, "const char ***\fIlistptr\fP);"
+.PP
+Captured substrings can be accessed directly by using the offsets returned by
+\fBpcre_exec()\fP in \fIovector\fP. For convenience, the functions
+\fBpcre_copy_substring()\fP, \fBpcre_get_substring()\fP, and
+\fBpcre_get_substring_list()\fP are provided for extracting captured substrings
+as new, separate, zero-terminated strings. These functions identify substrings
+by number. The next section describes functions for extracting named
+substrings.
+.P
+A substring that contains a binary zero is correctly extracted and has a
+further zero added on the end, but the result is not, of course, a C string.
+However, you can process such a string by referring to the length that is
+returned by \fBpcre_copy_substring()\fP and \fBpcre_get_substring()\fP.
+Unfortunately, the interface to \fBpcre_get_substring_list()\fP is not adequate
+for handling strings containing binary zeros, because the end of the final
+string is not independently indicated.
+.P
+The first three arguments are the same for all three of these functions:
+\fIsubject\fP is the subject string that has just been successfully matched,
+\fIovector\fP is a pointer to the vector of integer offsets that was passed to
+\fBpcre_exec()\fP, and \fIstringcount\fP is the number of substrings that were
+captured by the match, including the substring that matched the entire regular
+expression. This is the value returned by \fBpcre_exec()\fP if it is greater
+than zero. If \fBpcre_exec()\fP returned zero, indicating that it ran out of
+space in \fIovector\fP, the value passed as \fIstringcount\fP should be the
+number of elements in the vector divided by three.
+.P
+The functions \fBpcre_copy_substring()\fP and \fBpcre_get_substring()\fP
+extract a single substring, whose number is given as \fIstringnumber\fP. A
+value of zero extracts the substring that matched the entire pattern, whereas
+higher values extract the captured substrings. For \fBpcre_copy_substring()\fP,
+the string is placed in \fIbuffer\fP, whose length is given by
+\fIbuffersize\fP, while for \fBpcre_get_substring()\fP a new block of memory is
+obtained via \fBpcre_malloc\fP, and its address is returned via
+\fIstringptr\fP. The yield of the function is the length of the string, not
+including the terminating zero, or one of these error codes:
+.sp
+ PCRE_ERROR_NOMEMORY (-6)
+.sp
+The buffer was too small for \fBpcre_copy_substring()\fP, or the attempt to get
+memory failed for \fBpcre_get_substring()\fP.
+.sp
+ PCRE_ERROR_NOSUBSTRING (-7)
+.sp
+There is no substring whose number is \fIstringnumber\fP.
+.P
+The \fBpcre_get_substring_list()\fP function extracts all available substrings
+and builds a list of pointers to them. All this is done in a single block of
+memory that is obtained via \fBpcre_malloc\fP. The address of the memory block
+is returned via \fIlistptr\fP, which is also the start of the list of string
+pointers. The end of the list is marked by a NULL pointer. The yield of the
+function is zero if all went well, or the error code
+.sp
+ PCRE_ERROR_NOMEMORY (-6)
+.sp
+if the attempt to get the memory block failed.
+.P
+When any of these functions encounter a substring that is unset, which can
+happen when capturing subpattern number \fIn+1\fP matches some part of the
+subject, but subpattern \fIn\fP has not been used at all, they return an empty
+string. This can be distinguished from a genuine zero-length substring by
+inspecting the appropriate offset in \fIovector\fP, which is negative for unset
+substrings.
+.P
+The two convenience functions \fBpcre_free_substring()\fP and
+\fBpcre_free_substring_list()\fP can be used to free the memory returned by
+a previous call of \fBpcre_get_substring()\fP or
+\fBpcre_get_substring_list()\fP, respectively. They do nothing more than call
+the function pointed to by \fBpcre_free\fP, which of course could be called
+directly from a C program. However, PCRE is used in some situations where it is
+linked via a special interface to another programming language that cannot use
+\fBpcre_free\fP directly; it is for these cases that the functions are
+provided.
+.
+.
+.SH "EXTRACTING CAPTURED SUBSTRINGS BY NAME"
+.rs
+.sp
+.B int pcre_get_stringnumber(const pcre *\fIcode\fP,
+.ti +5n
+.B const char *\fIname\fP);
+.PP
+.B int pcre_copy_named_substring(const pcre *\fIcode\fP,
+.ti +5n
+.B const char *\fIsubject\fP, int *\fIovector\fP,
+.ti +5n
+.B int \fIstringcount\fP, const char *\fIstringname\fP,
+.ti +5n
+.B char *\fIbuffer\fP, int \fIbuffersize\fP);
+.PP
+.B int pcre_get_named_substring(const pcre *\fIcode\fP,
+.ti +5n
+.B const char *\fIsubject\fP, int *\fIovector\fP,
+.ti +5n
+.B int \fIstringcount\fP, const char *\fIstringname\fP,
+.ti +5n
+.B const char **\fIstringptr\fP);
+.PP
+To extract a substring by name, you first have to find the associated number.
+For example, for this pattern
+.sp
+ (a+)b(?<xxx>\ed+)...
+.sp
+the number of the subpattern called "xxx" is 2. If the name is known to be
+unique (PCRE_DUPNAMES was not set), you can find the number from the name by
+calling \fBpcre_get_stringnumber()\fP. The first argument is the compiled
+pattern, and the second is the name. The yield of the function is the
+subpattern number, or PCRE_ERROR_NOSUBSTRING (-7) if there is no subpattern of
+that name.
+.P
+Given the number, you can extract the substring directly, or use one of the
+functions described in the previous section. For convenience, there are also
+two functions that do the whole job.
+.P
+Most of the arguments of \fBpcre_copy_named_substring()\fP and
+\fBpcre_get_named_substring()\fP are the same as those for the similarly named
+functions that extract by number. As these are described in the previous
+section, they are not re-described here. There are just two differences:
+.P
+First, instead of a substring number, a substring name is given. Second, there
+is an extra argument, given at the start, which is a pointer to the compiled
+pattern. This is needed in order to gain access to the name-to-number
+translation table.
+.P
+These functions call \fBpcre_get_stringnumber()\fP, and if it succeeds, they
+then call \fBpcre_copy_substring()\fP or \fBpcre_get_substring()\fP, as
+appropriate. \fBNOTE:\fP If PCRE_DUPNAMES is set and there are duplicate names,
+the behaviour may not be what you want (see the next section).
+.
+.
+.SH "DUPLICATE SUBPATTERN NAMES"
+.rs
+.sp
+.B int pcre_get_stringtable_entries(const pcre *\fIcode\fP,
+.ti +5n
+.B const char *\fIname\fP, char **\fIfirst\fP, char **\fIlast\fP);
+.PP
+When a pattern is compiled with the PCRE_DUPNAMES option, names for subpatterns
+are not required to be unique. Normally, patterns with duplicate names are such
+that in any one match, only one of the named subpatterns participates. An
+example is shown in the
+.\" HREF
+\fBpcrepattern\fP
+.\"
+documentation.
+.P
+When duplicates are present, \fBpcre_copy_named_substring()\fP and
+\fBpcre_get_named_substring()\fP return the first substring corresponding to
+the given name that is set. If none are set, PCRE_ERROR_NOSUBSTRING (-7) is
+returned; no data is returned. The \fBpcre_get_stringnumber()\fP function
+returns one of the numbers that are associated with the name, but it is not
+defined which it is.
+.P
+If you want to get full details of all captured substrings for a given name,
+you must use the \fBpcre_get_stringtable_entries()\fP function. The first
+argument is the compiled pattern, and the second is the name. The third and
+fourth are pointers to variables which are updated by the function. After it
+has run, they point to the first and last entries in the name-to-number table
+for the given name. The function itself returns the length of each entry, or
+PCRE_ERROR_NOSUBSTRING (-7) if there are none. The format of the table is
+described above in the section entitled \fIInformation about a pattern\fP.
+Given all the relevant entries for the name, you can extract each of their
+numbers, and hence the captured data, if any.
+.
+.
+.SH "FINDING ALL POSSIBLE MATCHES"
+.rs
+.sp
+The traditional matching function uses a similar algorithm to Perl, which stops
+when it finds the first match, starting at a given point in the subject. If you
+want to find all possible matches, or the longest possible match, consider
+using the alternative matching function (see below) instead. If you cannot use
+the alternative function, but still need to find all possible matches, you
+can kludge it up by making use of the callout facility, which is described in
+the
+.\" HREF
+\fBpcrecallout\fP
+.\"
+documentation.
+.P
+What you have to do is to insert a callout right at the end of the pattern.
+When your callout function is called, extract and save the current matched
+substring. Then return 1, which forces \fBpcre_exec()\fP to backtrack and try
+other alternatives. Ultimately, when it runs out of matches, \fBpcre_exec()\fP
+will yield PCRE_ERROR_NOMATCH.
+.
+.
+.\" HTML
+.SH "MATCHING A PATTERN: THE ALTERNATIVE FUNCTION"
+.rs
+.sp
+.B int pcre_dfa_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
+.ti +5n
+.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
+.ti +5n
+.B int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP,
+.ti +5n
+.B int *\fIworkspace\fP, int \fIwscount\fP);
+.P
+The function \fBpcre_dfa_exec()\fP is called to match a subject string against
+a compiled pattern, using a matching algorithm that scans the subject string
+just once, and does not backtrack. This has different characteristics to the
+normal algorithm, and is not compatible with Perl. Some of the features of PCRE
+patterns are not supported. Nevertheless, there are times when this kind of
+matching can be useful. For a discussion of the two matching algorithms, see
+the
+.\" HREF
+\fBpcrematching\fP
+.\"
+documentation.
+.P
+The arguments for the \fBpcre_dfa_exec()\fP function are the same as for
+\fBpcre_exec()\fP, plus two extras. The \fIovector\fP argument is used in a
+different way, and this is described below. The other common arguments are used
+in the same way as for \fBpcre_exec()\fP, so their description is not repeated
+here.
+.P
+The two additional arguments provide workspace for the function. The workspace
+vector should contain at least 20 elements. It is used for keeping track of
+multiple paths through the pattern tree. More workspace will be needed for
+patterns and subjects where there are a lot of potential matches.
+.P
+Here is an example of a simple call to \fBpcre_dfa_exec()\fP:
+.sp
+ int rc;
+ int ovector[10];
+ int wspace[20];
+ rc = pcre_dfa_exec(
+ re, /* result of pcre_compile() */
+ NULL, /* we didn't study the pattern */
+ "some string", /* the subject string */
+ 11, /* the length of the subject string */
+ 0, /* start at offset 0 in the subject */
+ 0, /* default options */
+ ovector, /* vector of integers for substring information */
+ 10, /* number of elements (NOT size in bytes) */
+ wspace, /* working space vector */
+ 20); /* number of elements (NOT size in bytes) */
+.
+.SS "Option bits for \fBpcre_dfa_exec()\fP"
+.rs
+.sp
+The unused bits of the \fIoptions\fP argument for \fBpcre_dfa_exec()\fP must be
+zero. The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_\fIxxx\fP,
+PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_UTF8_CHECK, PCRE_PARTIAL,
+PCRE_DFA_SHORTEST, and PCRE_DFA_RESTART. All but the last three of these are
+the same as for \fBpcre_exec()\fP, so their description is not repeated here.
+.sp
+ PCRE_PARTIAL
+.sp
+This has the same general effect as it does for \fBpcre_exec()\fP, but the
+details are slightly different. When PCRE_PARTIAL is set for
+\fBpcre_dfa_exec()\fP, the return code PCRE_ERROR_NOMATCH is converted into
+PCRE_ERROR_PARTIAL if the end of the subject is reached, there have been no
+complete matches, but there is still at least one matching possibility. The
+portion of the string that provided the partial match is set as the first
+matching string.
+.sp
+ PCRE_DFA_SHORTEST
+.sp
+Setting the PCRE_DFA_SHORTEST option causes the matching algorithm to stop as
+soon as it has found one match. Because of the way the alternative algorithm
+works, this is necessarily the shortest possible match at the first possible
+matching point in the subject string.
+.sp
+ PCRE_DFA_RESTART
+.sp
+When \fBpcre_dfa_exec()\fP is called with the PCRE_PARTIAL option, and returns
+a partial match, it is possible to call it again, with additional subject
+characters, and have it continue with the same match. The PCRE_DFA_RESTART
+option requests this action; when it is set, the \fIworkspace\fP and
+\fIwscount\fP arguments must reference the same vector as before because data
+about the match so far is left in them after a partial match. There is more
+discussion of this facility in the
+.\" HREF
+\fBpcrepartial\fP
+.\"
+documentation.
+.
+.SS "Successful returns from \fBpcre_dfa_exec()\fP"
+.rs
+.sp
+When \fBpcre_dfa_exec()\fP succeeds, it may have matched more than one
+substring in the subject. Note, however, that all the matches from one run of
+the function start at the same point in the subject. The shorter matches are
+all initial substrings of the longer matches. For example, if the pattern
+.sp
+ <.*>
+.sp
+is matched against the string
+.sp
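+ This is <something> <something else> <something further> no more
+.sp
+the three matched strings are
+.sp
+ <something>
+ <something> <something else>
+ <something> <something else> <something further>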
+.sp
+On success, the yield of the function is a number greater than zero, which is
+the number of matched substrings. The substrings themselves are returned in
+\fIovector\fP. Each string uses two elements; the first is the offset to the
+start, and the second is the offset to the end. In fact, all the strings have
+the same start offset. (Space could have been saved by giving this only once,
+but it was decided to retain some compatibility with the way \fBpcre_exec()\fP
+returns data, even though the meaning of the strings is different.)
+.P
+The strings are returned in reverse order of length; that is, the longest
+matching string is given first. If there were too many matches to fit into
+\fIovector\fP, the yield of the function is zero, and the vector is filled with
+the longest matches.
+.
+.SS "Error returns from \fBpcre_dfa_exec()\fP"
+.rs
+.sp
+The \fBpcre_dfa_exec()\fP function returns a negative number when it fails.
+Many of the errors are the same as for \fBpcre_exec()\fP, and these are
+described
+.\" HTML
+.\"
+above.
+.\"
+There are in addition the following errors that are specific to
+\fBpcre_dfa_exec()\fP:
+.sp
+ PCRE_ERROR_DFA_UITEM (-16)
+.sp
+This return is given if \fBpcre_dfa_exec()\fP encounters an item in the pattern
+that it does not support, for instance, the use of \eC or a back reference.
+.sp
+ PCRE_ERROR_DFA_UCOND (-17)
+.sp
+This return is given if \fBpcre_dfa_exec()\fP encounters a condition item that
+uses a back reference for the condition, or a test for recursion in a specific
+group. These are not supported.
+.sp
+ PCRE_ERROR_DFA_UMLIMIT (-18)
+.sp
+This return is given if \fBpcre_dfa_exec()\fP is called with an \fIextra\fP
+block that contains a setting of the \fImatch_limit\fP field. This is not
+supported (it is meaningless).
+.sp
+ PCRE_ERROR_DFA_WSSIZE (-19)
+.sp
+This return is given if \fBpcre_dfa_exec()\fP runs out of space in the
+\fIworkspace\fP vector.
+.sp
+ PCRE_ERROR_DFA_RECURSE (-20)
+.sp
+When a recursive subpattern is processed, the matching function calls itself
+recursively, using private vectors for \fIovector\fP and \fIworkspace\fP. This
+error is given if the output vector is not large enough. This should be
+extremely rare, as a vector of size 1000 is used.
+.
+.
+.SH "SEE ALSO"
+.rs
+.sp
+\fBpcrebuild\fP(3), \fBpcrecallout\fP(3), \fBpcrecpp\fP(3),
+\fBpcrematching\fP(3), \fBpcrepartial\fP(3), \fBpcreposix\fP(3),
+\fBpcreprecompile\fP(3), \fBpcresample\fP(3), \fBpcrestack\fP(3).
+.
+.
+.SH AUTHOR
+.rs
+.sp
+.nf
+Philip Hazel
+University Computing Service
+Cambridge CB2 3QH, England.
+.fi
+.
+.
+.SH REVISION
+.rs
+.sp
+.nf
+Last updated: 12 April 2008
+Copyright (c) 1997-2008 University of Cambridge.
+.fi
diff --git a/src/doc/pcrebuild.3 b/src/doc/pcrebuild.3
new file mode 100644
index 0000000..db635e8
--- /dev/null
+++ b/src/doc/pcrebuild.3
@@ -0,0 +1,327 @@
+.TH PCREBUILD 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH "PCRE BUILD-TIME OPTIONS"
+.rs
+.sp
+This document describes the optional features of PCRE that can be selected when
+the library is compiled. It assumes use of the \fBconfigure\fP script, where
+the optional features are selected or deselected by providing options to
+\fBconfigure\fP before running the \fBmake\fP command. However, the same
+options can be selected in both Unix-like and non-Unix-like environments using
+the GUI facility of \fBCMakeSetup\fP if you are using \fBCMake\fP instead of
+\fBconfigure\fP to build PCRE.
+.P
+The complete list of options for \fBconfigure\fP (which includes the standard
+ones such as the selection of the installation directory) can be obtained by
+running
+.sp
+ ./configure --help
+.sp
+The following sections include descriptions of options whose names begin with
+--enable or --disable. These settings specify changes to the defaults for the
+\fBconfigure\fP command. Because of the way that \fBconfigure\fP works,
+--enable and --disable always come in pairs, so the complementary option always
+exists as well, but as it specifies the default, it is not described.
+.
+.SH "C++ SUPPORT"
+.rs
+.sp
+By default, the \fBconfigure\fP script will search for a C++ compiler and C++
+header files. If it finds them, it automatically builds the C++ wrapper library
+for PCRE. You can disable this by adding
+.sp
+ --disable-cpp
+.sp
+to the \fBconfigure\fP command.
+.
+.SH "UTF-8 SUPPORT"
+.rs
+.sp
+To build PCRE with support for UTF-8 character strings, add
+.sp
+ --enable-utf8
+.sp
+to the \fBconfigure\fP command. Of itself, this does not make PCRE treat
+strings as UTF-8. As well as compiling PCRE with this option, you also have
+to set the PCRE_UTF8 option when you call the \fBpcre_compile()\fP
+function.
+.
+.SH "UNICODE CHARACTER PROPERTY SUPPORT"
+.rs
+.sp
+UTF-8 support allows PCRE to process character values greater than 255 in the
+strings that it handles. On its own, however, it does not provide any
+facilities for accessing the properties of such characters. If you want to be
+able to use the pattern escapes \eP, \ep, and \eX, which refer to Unicode
+character properties, you must add
+.sp
+ --enable-unicode-properties
+.sp
+to the \fBconfigure\fP command. This implies UTF-8 support, even if you have
+not explicitly requested it.
+.P
+Including Unicode property support adds around 30K of tables to the PCRE
+library. Only the general category properties such as \fILu\fP and \fINd\fP are
+supported. Details are given in the
+.\" HREF
+\fBpcrepattern\fP
+.\"
+documentation.
+.
+.SH "CODE VALUE OF NEWLINE"
+.rs
+.sp
+By default, PCRE interprets character 10 (linefeed, LF) as indicating the end
+of a line. This is the normal newline character on Unix-like systems. You can
+compile PCRE to use character 13 (carriage return, CR) instead, by adding
+.sp
+ --enable-newline-is-cr
+.sp
+to the \fBconfigure\fP command. There is also a --enable-newline-is-lf option,
+which explicitly specifies linefeed as the newline character.
+.sp
+Alternatively, you can specify that line endings are to be indicated by the two
+character sequence CRLF. If you want this, add
+.sp
+ --enable-newline-is-crlf
+.sp
+to the \fBconfigure\fP command. There is a fourth option, specified by
+.sp
+ --enable-newline-is-anycrlf
+.sp
+which causes PCRE to recognize any of the three sequences CR, LF, or CRLF as
+indicating a line ending. Finally, a fifth option, specified by
+.sp
+ --enable-newline-is-any
+.sp
+causes PCRE to recognize any Unicode newline sequence.
+.P
+Whatever line ending convention is selected when PCRE is built can be
+overridden when the library functions are called. At build time it is
+conventional to use the standard for your operating system.
+.
+.SH "WHAT \eR MATCHES"
+.rs
+.sp
+By default, the sequence \eR in a pattern matches any Unicode newline sequence,
+whatever has been selected as the line ending sequence. If you specify
+.sp
+ --enable-bsr-anycrlf
+.sp
+the default is changed so that \eR matches only CR, LF, or CRLF. Whatever is
+selected when PCRE is built can be overridden when the library functions are
+called.
+.
+.SH "BUILDING SHARED AND STATIC LIBRARIES"
+.rs
+.sp
+The PCRE building process uses \fBlibtool\fP to build both shared and static
+Unix libraries by default. You can suppress one of these by adding one of
+.sp
+ --disable-shared
+ --disable-static
+.sp
+to the \fBconfigure\fP command, as required.
+.
+.SH "POSIX MALLOC USAGE"
+.rs
+.sp
+When PCRE is called through the POSIX interface (see the
+.\" HREF
+\fBpcreposix\fP
+.\"
+documentation), additional working storage is required for holding the pointers
+to capturing substrings, because PCRE requires three integers per substring,
+whereas the POSIX interface provides only two. If the number of expected
+substrings is small, the wrapper function uses space on the stack, because this
+is faster than using \fBmalloc()\fP for each call. The default threshold above
+which the stack is no longer used is 10; it can be changed by adding a setting
+such as
+.sp
+ --with-posix-malloc-threshold=20
+.sp
+to the \fBconfigure\fP command.
+.
+.SH "HANDLING VERY LARGE PATTERNS"
+.rs
+.sp
+Within a compiled pattern, offset values are used to point from one part to
+another (for example, from an opening parenthesis to an alternation
+metacharacter). By default, two-byte values are used for these offsets, leading
+to a maximum size for a compiled pattern of around 64K. This is sufficient to
+handle all but the most gigantic patterns. Nevertheless, some people do want to
+process enormous patterns, so it is possible to compile PCRE to use three-byte
+or four-byte offsets by adding a setting such as
+.sp
+ --with-link-size=3
+.sp
+to the \fBconfigure\fP command. The value given must be 2, 3, or 4. Using
+longer offsets slows down the operation of PCRE because it has to load
+additional bytes when handling them.
+.
+.SH "AVOIDING EXCESSIVE STACK USAGE"
+.rs
+.sp
+When matching with the \fBpcre_exec()\fP function, PCRE implements backtracking
+by making recursive calls to an internal function called \fBmatch()\fP. In
+environments where the size of the stack is limited, this can severely limit
+PCRE's operation. (The Unix environment does not usually suffer from this
+problem, but it may sometimes be necessary to increase the maximum stack size.
+There is a discussion in the
+.\" HREF
+\fBpcrestack\fP
+.\"
+documentation.) An alternative approach to recursion that uses memory from the
+heap to remember data, instead of using recursive function calls, has been
+implemented to work round the problem of limited stack size. If you want to
+build a version of PCRE that works this way, add
+.sp
+ --disable-stack-for-recursion
+.sp
+to the \fBconfigure\fP command. With this configuration, PCRE will use the
+\fBpcre_stack_malloc\fP and \fBpcre_stack_free\fP variables to call memory
+management functions. By default these point to \fBmalloc()\fP and
+\fBfree()\fP, but you can replace the pointers so that your own functions are
+used.
+.P
+Separate functions are provided rather than using \fBpcre_malloc\fP and
+\fBpcre_free\fP because the usage is very predictable: the block sizes
+requested are always the same, and the blocks are always freed in reverse
+order. A calling program might be able to implement optimized functions that
+perform better than \fBmalloc()\fP and \fBfree()\fP. PCRE runs noticeably more
+slowly when built in this way. This option affects only the \fBpcre_exec()\fP
+function; it is not relevant for the \fBpcre_dfa_exec()\fP function.
+.
+.SH "LIMITING PCRE RESOURCE USAGE"
+.rs
+.sp
+Internally, PCRE has a function called \fBmatch()\fP, which it calls repeatedly
+(sometimes recursively) when matching a pattern with the \fBpcre_exec()\fP
+function. By controlling the maximum number of times this function may be
+called during a single matching operation, a limit can be placed on the
+resources used by a single call to \fBpcre_exec()\fP. The limit can be changed
+at run time, as described in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+documentation. The default is 10 million, but this can be changed by adding a
+setting such as
+.sp
+ --with-match-limit=500000
+.sp
+to the \fBconfigure\fP command. This setting has no effect on the
+\fBpcre_dfa_exec()\fP matching function.
+.P
+In some environments it is desirable to limit the depth of recursive calls of
+\fBmatch()\fP more strictly than the total number of calls, in order to
+restrict the maximum amount of stack (or heap, if --disable-stack-for-recursion
+is specified) that is used. A second limit controls this; it defaults to the
+value that is set for --with-match-limit, which imposes no additional
+constraints. However, you can set a lower limit by adding, for example,
+.sp
+ --with-match-limit-recursion=10000
+.sp
+to the \fBconfigure\fP command. This value can also be overridden at run time.
+.
+.SH "CREATING CHARACTER TABLES AT BUILD TIME"
+.rs
+.sp
+PCRE uses fixed tables for processing characters whose code values are less
+than 256. By default, PCRE is built with a set of tables that are distributed
+in the file \fIpcre_chartables.c.dist\fP. These tables are for ASCII codes
+only. If you add
+.sp
+ --enable-rebuild-chartables
+.sp
+to the \fBconfigure\fP command, the distributed tables are no longer used.
+Instead, a program called \fBdftables\fP is compiled and run. This outputs the
+source for a new set of tables, created in the default locale of your C runtime
+system. (This method of replacing the tables does not work if you are cross
+compiling, because \fBdftables\fP is run on the local host. If you need to
+create alternative tables when cross compiling, you will have to do so "by
+hand".)
+.
+.SH "USING EBCDIC CODE"
+.rs
+.sp
+PCRE assumes by default that it will run in an environment where the character
+code is ASCII (or Unicode, which is a superset of ASCII). This is the case for
+most computer operating systems. PCRE can, however, be compiled to run in an
+EBCDIC environment by adding
+.sp
+ --enable-ebcdic
+.sp
+to the \fBconfigure\fP command. This setting implies
+--enable-rebuild-chartables. You should only use it if you know that you are in
+an EBCDIC environment (for example, an IBM mainframe operating system).
+.
+.SH "PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT"
+.rs
+.sp
+By default, \fBpcregrep\fP reads all files as plain text. You can build it so
+that it recognizes files whose names end in \fB.gz\fP or \fB.bz2\fP, and reads
+them with \fBlibz\fP or \fBlibbz2\fP, respectively, by adding one or both of
+.sp
+ --enable-pcregrep-libz
+ --enable-pcregrep-libbz2
+.sp
+to the \fBconfigure\fP command. These options naturally require that the
+relevant libraries are installed on your system. Configuration will fail if
+they are not.
+.
+.SH "PCRETEST OPTION FOR LIBREADLINE SUPPORT"
+.rs
+.sp
+If you add
+.sp
+ --enable-pcretest-libreadline
+.sp
+to the \fBconfigure\fP command, \fBpcretest\fP is linked with the
+\fBlibreadline\fP library, and when its input is from a terminal, it reads it
+using the \fBreadline()\fP function. This provides line-editing and history
+facilities. Note that \fBlibreadline\fP is GPL-licenced, so if you distribute a
+binary of \fBpcretest\fP linked in this way, there may be licensing issues.
+.P
+Setting this option causes the \fB-lreadline\fP option to be added to the
+\fBpcretest\fP build. In many operating environments with a system-installed
+\fBlibreadline\fP this is sufficient. However, in some environments (e.g.
+if an unmodified distribution version of readline is in use), some extra
+configuration may be necessary. The INSTALL file for \fBlibreadline\fP says
+this:
+.sp
+ "Readline uses the termcap functions, but does not link with the
+ termcap or curses library itself, allowing applications which link
+ with readline the to choose an appropriate library."
+.sp
+If your environment has not been set up so that an appropriate library is
+automatically included, you may need to add something like
+.sp
+ LIBS="-lncurses"
+.sp
+immediately before the \fBconfigure\fP command.
+.
+.
+.SH "SEE ALSO"
+.rs
+.sp
+\fBpcreapi\fP(3), \fBpcre_config\fP(3).
+.
+.
+.SH AUTHOR
+.rs
+.sp
+.nf
+Philip Hazel
+University Computing Service
+Cambridge CB2 3QH, England.
+.fi
+.
+.
+.SH REVISION
+.rs
+.sp
+.nf
+Last updated: 13 April 2008
+Copyright (c) 1997-2008 University of Cambridge.
+.fi
diff --git a/src/doc/pcrecallout.3 b/src/doc/pcrecallout.3
new file mode 100644
index 0000000..1258c4e
--- /dev/null
+++ b/src/doc/pcrecallout.3
@@ -0,0 +1,177 @@
+.TH PCRECALLOUT 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH "PCRE CALLOUTS"
+.rs
+.sp
+.B int (*pcre_callout)(pcre_callout_block *);
+.PP
+PCRE provides a feature called "callout", which is a means of temporarily
+passing control to the caller of PCRE in the middle of pattern matching. The
+caller of PCRE provides an external function by putting its entry point in the
+global variable \fIpcre_callout\fP. By default, this variable contains NULL,
+which disables all calling out.
+.P
+Within a regular expression, (?C) indicates the points at which the external
+function is to be called. Different callout points can be identified by putting
+a number less than 256 after the letter C. The default value is zero.
+For example, this pattern has two callout points:
+.sp
+ (?C1)abc(?C2)def
+.sp
+If the PCRE_AUTO_CALLOUT option bit is set when \fBpcre_compile()\fP is called,
+PCRE automatically inserts callouts, all with number 255, before each item in
+the pattern. For example, if PCRE_AUTO_CALLOUT is used with the pattern
+.sp
+ A(\ed{2}|--)
+.sp
+it is processed as if it were
+.sp
+(?C255)A(?C255)((?C255)\ed{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
+.sp
+Notice that there is a callout before and after each parenthesis and
+alternation bar. Automatic callouts can be used for tracking the progress of
+pattern matching. The
+.\" HREF
+\fBpcretest\fP
+.\"
+command has an option that sets automatic callouts; when it is used, the output
+indicates how the pattern is matched. This is useful information when you are
+trying to optimize the performance of a particular pattern.
+.
+.
+.SH "MISSING CALLOUTS"
+.rs
+.sp
+You should be aware that, because of optimizations in the way PCRE matches
+patterns, callouts sometimes do not happen. For example, if the pattern is
+.sp
+ ab(?C4)cd
+.sp
+PCRE knows that any matching string must contain the letter "d". If the subject
+string is "abyz", the lack of "d" means that matching doesn't ever start, and
+the callout is never reached. However, with "abyd", though the result is still
+no match, the callout is obeyed.
+.
+.
+.SH "THE CALLOUT INTERFACE"
+.rs
+.sp
+During matching, when PCRE reaches a callout point, the external function
+defined by \fIpcre_callout\fP is called (if it is set). This applies to both
+the \fBpcre_exec()\fP and the \fBpcre_dfa_exec()\fP matching functions. The
+only argument to the callout function is a pointer to a \fBpcre_callout\fP
+block. This structure contains the following fields:
+.sp
+ int \fIversion\fP;
+ int \fIcallout_number\fP;
+ int *\fIoffset_vector\fP;
+ const char *\fIsubject\fP;
+ int \fIsubject_length\fP;
+ int \fIstart_match\fP;
+ int \fIcurrent_position\fP;
+ int \fIcapture_top\fP;
+ int \fIcapture_last\fP;
+ void *\fIcallout_data\fP;
+ int \fIpattern_position\fP;
+ int \fInext_item_length\fP;
+.sp
+The \fIversion\fP field is an integer containing the version number of the
+block format. The initial version was 0; the current version is 1. The version
+number will change again in future if additional fields are added, but the
+intention is never to remove any of the existing fields.
+.P
+The \fIcallout_number\fP field contains the number of the callout, as compiled
+into the pattern (that is, the number after ?C for manual callouts, and 255 for
+automatically generated callouts).
+.P
+The \fIoffset_vector\fP field is a pointer to the vector of offsets that was
+passed by the caller to \fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP. When
+\fBpcre_exec()\fP is used, the contents can be inspected in order to extract
+substrings that have been matched so far, in the same way as for extracting
+substrings after a match has completed. For \fBpcre_dfa_exec()\fP this field is
+not useful.
+.P
+The \fIsubject\fP and \fIsubject_length\fP fields contain copies of the values
+that were passed to \fBpcre_exec()\fP.
+.P
+The \fIstart_match\fP field normally contains the offset within the subject at
+which the current match attempt started. However, if the escape sequence \eK
+has been encountered, this value is changed to reflect the modified starting
+point. If the pattern is not anchored, the callout function may be called
+several times from the same point in the pattern for different starting points
+in the subject.
+.P
+The \fIcurrent_position\fP field contains the offset within the subject of the
+current match pointer.
+.P
+When the \fBpcre_exec()\fP function is used, the \fIcapture_top\fP field
+contains one more than the number of the highest numbered captured substring so
+far. If no substrings have been captured, the value of \fIcapture_top\fP is
+one. This is always the case when \fBpcre_dfa_exec()\fP is used, because it
+does not support captured substrings.
+.P
+The \fIcapture_last\fP field contains the number of the most recently captured
+substring. If no substrings have been captured, its value is -1. This is always
+the case when \fBpcre_dfa_exec()\fP is used.
+.P
+The \fIcallout_data\fP field contains a value that is passed to
+\fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP specifically so that it can be
+passed back in callouts. It is passed in the \fIcallout_data\fP field of the
+\fBpcre_extra\fP data structure. If no such data was passed, the value of
+\fIcallout_data\fP in a \fBpcre_callout\fP block is NULL. There is a
+description of the \fBpcre_extra\fP structure in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+documentation.
+.P
+The \fIpattern_position\fP field is present from version 1 of the
+\fIpcre_callout\fP structure. It contains the offset to the next item to be
+matched in the pattern string.
+.P
+The \fInext_item_length\fP field is present from version 1 of the
+\fIpcre_callout\fP structure. It contains the length of the next item to be
+matched in the pattern string. When the callout immediately precedes an
+alternation bar, a closing parenthesis, or the end of the pattern, the length
+is zero. When the callout precedes an opening parenthesis, the length is that
+of the entire subpattern.
+.P
+The \fIpattern_position\fP and \fInext_item_length\fP fields are intended to
+help in distinguishing between different automatic callouts, which all have the
+same callout number. However, they are set for all callouts.
+.
+.
+.SH "RETURN VALUES"
+.rs
+.sp
+The external callout function returns an integer to PCRE. If the value is zero,
+matching proceeds as normal. If the value is greater than zero, matching fails
+at the current point, but the testing of other matching possibilities goes
+ahead, just as if a lookahead assertion had failed. If the value is less than
+zero, the match is abandoned, and \fBpcre_exec()\fP (or \fBpcre_dfa_exec()\fP)
+returns the negative value.
+.P
+Negative values should normally be chosen from the set of PCRE_ERROR_xxx
+values. In particular, PCRE_ERROR_NOMATCH forces a standard "no match" failure.
+The error number PCRE_ERROR_CALLOUT is reserved for use by callout functions;
+it will never be used by PCRE itself.
+.
+.
+.SH AUTHOR
+.rs
+.sp
+.nf
+Philip Hazel
+University Computing Service
+Cambridge CB2 3QH, England.
+.fi
+.
+.
+.SH REVISION
+.rs
+.sp
+.nf
+Last updated: 29 May 2007
+Copyright (c) 1997-2007 University of Cambridge.
+.fi
diff --git a/src/doc/pcrecompat.3 b/src/doc/pcrecompat.3
new file mode 100644
index 0000000..3be6a6a
--- /dev/null
+++ b/src/doc/pcrecompat.3
@@ -0,0 +1,148 @@
+.TH PCRECOMPAT 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH "DIFFERENCES BETWEEN PCRE AND PERL"
+.rs
+.sp
+This document describes the differences in the ways that PCRE and Perl handle
+regular expressions. The differences described here are mainly with respect to
+Perl 5.8, though PCRE versions 7.0 and later contain some features that are
+expected to be in the forthcoming Perl 5.10.
+.P
+1. PCRE has only a subset of Perl's UTF-8 and Unicode support. Details of what
+it does have are given in the
+.\" HTML
+.\"
+section on UTF-8 support
+.\"
+in the main
+.\" HREF
+\fBpcre\fP
+.\"
+page.
+.P
+2. PCRE does not allow repeat quantifiers on lookahead assertions. Perl permits
+them, but they do not mean what you might think. For example, (?!a){3} does
+not assert that the next three characters are not "a". It just asserts that the
+next character is not "a" three times.
+.P
+3. Capturing subpatterns that occur inside negative lookahead assertions are
+counted, but their entries in the offsets vector are never set. Perl sets its
+numerical variables from any such patterns that are matched before the
+assertion fails to match something (thereby succeeding), but only if the
+negative lookahead assertion contains just one branch.
+.P
+4. Though binary zero characters are supported in the subject string, they are
+not allowed in a pattern string because it is passed as a normal C string,
+terminated by zero. The escape sequence \e0 can be used in the pattern to
+represent a binary zero.
+.P
+5. The following Perl escape sequences are not supported: \el, \eu, \eL,
+\eU, and \eN. In fact these are implemented by Perl's general string-handling
+and are not part of its pattern matching engine. If any of these are
+encountered by PCRE, an error is generated.
+.P
+6. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE is
+built with Unicode character property support. The properties that can be
+tested with \ep and \eP are limited to the general category properties such as
+Lu and Nd, script names such as Greek or Han, and the derived properties Any
+and L&.
+.P
+7. PCRE does support the \eQ...\eE escape for quoting substrings. Characters in
+between are treated as literals. This is slightly different from Perl in that $
+and @ are also handled as literals inside the quotes. In Perl, they cause
+variable interpolation (but of course PCRE does not have variables). Note the
+following examples:
+.sp
+ Pattern PCRE matches Perl matches
+.sp
+.\" JOIN
+ \eQabc$xyz\eE abc$xyz abc followed by the
+ contents of $xyz
+ \eQabc\e$xyz\eE abc\e$xyz abc\e$xyz
+ \eQabc\eE\e$\eQxyz\eE abc$xyz abc$xyz
+.sp
+The \eQ...\eE sequence is recognized both inside and outside character classes.
+.P
+8. Fairly obviously, PCRE does not support the (?{code}) and (??{code})
+constructions. However, there is support for recursive patterns. This is not
+available in Perl 5.8, but will be in Perl 5.10. Also, the PCRE "callout"
+feature allows an external function to be called during pattern matching. See
+the
+.\" HREF
+\fBpcrecallout\fP
+.\"
+documentation for details.
+.P
+9. Subpatterns that are called recursively or as "subroutines" are always
+treated as atomic groups in PCRE. This is like Python, but unlike Perl.
+.P
+10. There are some differences that are concerned with the settings of captured
+strings when part of a pattern is repeated. For example, matching "aba" against
+the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE it is set to "b".
+.P
+11. PCRE does support Perl 5.10's backtracking verbs (*ACCEPT), (*FAIL), (*F),
+(*COMMIT), (*PRUNE), (*SKIP), and (*THEN), but only in the forms without an
+argument. PCRE does not support (*MARK). If (*ACCEPT) is within capturing
+parentheses, PCRE does not set that capture group; this is different to Perl.
+.P
+12. PCRE provides some extensions to the Perl regular expression facilities.
+Perl 5.10 will include new features that are not in earlier versions, some of
+which (such as named parentheses) have been in PCRE for some time. This list is
+with respect to Perl 5.10:
+.sp
+(a) Although lookbehind assertions must match fixed length strings, each
+alternative branch of a lookbehind assertion can match a different length of
+string. Perl requires them all to have the same length.
+.sp
+(b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not set, the $
+meta-character matches only at the very end of the string.
+.sp
+(c) If PCRE_EXTRA is set, a backslash followed by a letter with no special
+meaning is faulted. Otherwise, like Perl, the backslash is quietly ignored.
+(Perl can be made to issue a warning.)
+.sp
+(d) If PCRE_UNGREEDY is set, the greediness of the repetition quantifiers is
+inverted, that is, by default they are not greedy, but if followed by a
+question mark they are.
+.sp
+(e) PCRE_ANCHORED can be used at matching time to force a pattern to be tried
+only at the first matching position in the subject string.
+.sp
+(f) The PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, and PCRE_NO_AUTO_CAPTURE
+options for \fBpcre_exec()\fP have no Perl equivalents.
+.sp
+(g) The \eR escape sequence can be restricted to match only CR, LF, or CRLF
+by the PCRE_BSR_ANYCRLF option.
+.sp
+(h) The callout facility is PCRE-specific.
+.sp
+(i) The partial matching facility is PCRE-specific.
+.sp
+(j) Patterns compiled by PCRE can be saved and re-used at a later time, even on
+different hosts that have the other endianness.
+.sp
+(k) The alternative matching function (\fBpcre_dfa_exec()\fP) matches in a
+different way and is not Perl-compatible.
+.sp
+(l) PCRE recognizes some special sequences such as (*CR) at the start of
+a pattern that set overall options that cannot be changed within the pattern.
+.
+.
+.SH AUTHOR
+.rs
+.sp
+.nf
+Philip Hazel
+University Computing Service
+Cambridge CB2 3QH, England.
+.fi
+.
+.
+.SH REVISION
+.rs
+.sp
+.nf
+Last updated: 11 September 2007
+Copyright (c) 1997-2007 University of Cambridge.
+.fi
diff --git a/src/doc/pcrecpp.3 b/src/doc/pcrecpp.3
new file mode 100644
index 0000000..e52bd29
--- /dev/null
+++ b/src/doc/pcrecpp.3
@@ -0,0 +1,342 @@
+.TH PCRECPP 3
+.SH NAME
+PCRE - Perl-compatible regular expressions.
+.SH "SYNOPSIS OF C++ WRAPPER"
+.rs
+.sp
+.B #include <pcrecpp.h>
+.
+.SH DESCRIPTION
+.rs
+.sp
+The C++ wrapper for PCRE was provided by Google Inc. Some additional
+functionality was added by Giuseppe Maxia. This brief man page was constructed
+from the notes in the \fIpcrecpp.h\fP file, which should be consulted for
+further details.
+.
+.
+.SH "MATCHING INTERFACE"
+.rs
+.sp
+The "FullMatch" operation checks that supplied text matches a supplied pattern
+exactly. If pointer arguments are supplied, it copies matched sub-strings that
+match sub-patterns into them.
+.sp
+ Example: successful match
+ pcrecpp::RE re("h.*o");
+ re.FullMatch("hello");
+.sp
+ Example: unsuccessful match (requires full match):
+ pcrecpp::RE re("e");
+ !re.FullMatch("hello");
+.sp
+ Example: creating a temporary RE object:
+ pcrecpp::RE("h.*o").FullMatch("hello");
+.sp
+You can pass in a "const char*" or a "string" for "text". The examples below
+tend to use a const char*. You can, as in the different examples above, store
+the RE object explicitly in a variable or use a temporary RE object. The
+examples below use one mode or the other arbitrarily. Either could correctly be
+used for any of these examples.
+.P
+You must supply extra pointer arguments to extract matched subpieces.
+.sp
+ Example: extracts "ruby" into "s" and 1234 into "i"
+ int i;
+ string s;
+ pcrecpp::RE re("(\e\ew+):(\e\ed+)");
+ re.FullMatch("ruby:1234", &s, &i);
+.sp
+ Example: does not try to extract any extra sub-patterns
+ re.FullMatch("ruby:1234", &s);
+.sp
+ Example: does not try to extract into NULL
+ re.FullMatch("ruby:1234", NULL, &i);
+.sp
+ Example: integer overflow causes failure
+ !re.FullMatch("ruby:1234567891234", NULL, &i);
+.sp
+ Example: fails because there aren't enough sub-patterns:
+ !pcrecpp::RE("\e\ew+:\e\ed+").FullMatch("ruby:1234", &s);
+.sp
+ Example: fails because string cannot be stored in integer
+ !pcrecpp::RE("(.*)").FullMatch("ruby", &i);
+.sp
+The provided pointer arguments can be pointers to any scalar numeric
+type, or one of:
+.sp
+ string (matched piece is copied to string)
+ StringPiece (StringPiece is mutated to point to matched piece)
+ T (where "bool T::ParseFrom(const char*, int)" exists)
+ NULL (the corresponding matched sub-pattern is not copied)
+.sp
+The function returns true iff all of the following conditions are satisfied:
+.sp
+ a. "text" matches "pattern" exactly;
+.sp
+ b. The number of matched sub-patterns is >= number of supplied
+ pointers;
+.sp
+ c. The "i"th argument has a suitable type for holding the
+ string captured as the "i"th sub-pattern. If you pass in
+ void * NULL for the "i"th argument, or a non-void * NULL
+ of the correct type, or pass fewer arguments than the
+ number of sub-patterns, "i"th captured sub-pattern is
+ ignored.
+.sp
+CAVEAT: An optional sub-pattern that does not exist in the matched
+string is assigned the empty string. Therefore, the following will
+return false (because the empty string is not a valid number):
+.sp
+ int number;
+ pcrecpp::RE::FullMatch("abc", "[a-z]+(\e\ed+)?", &number);
+.sp
+The matching interface supports at most 16 arguments per call.
+If you need more, consider using the more general interface
+\fBpcrecpp::RE::DoMatch\fP. See \fBpcrecpp.h\fP for the signature for
+\fBDoMatch\fP.
+.
+.SH "QUOTING METACHARACTERS"
+.rs
+.sp
+You can use the "QuoteMeta" operation to insert backslashes before all
+potentially meaningful characters in a string. The returned string, used as a
+regular expression, will exactly match the original string.
+.sp
+ Example:
+ string quoted = RE::QuoteMeta(unquoted);
+.sp
+Note that it's legal to escape a character even if it has no special meaning in
+a regular expression -- so this function does that. (This also makes it
+identical to the perl function of the same name; see "perldoc -f quotemeta".)
+For example, "1.5-2.0?" becomes "1\e.5\e-2\e.0\e?".
+.
+.SH "PARTIAL MATCHES"
+.rs
+.sp
+You can use the "PartialMatch" operation when you want the pattern
+to match any substring of the text.
+.sp
+ Example: simple search for a string:
+ pcrecpp::RE("ell").PartialMatch("hello");
+.sp
+ Example: find first number in a string:
+ int number;
+ pcrecpp::RE re("(\e\ed+)");
+ re.PartialMatch("x*100 + 20", &number);
+ assert(number == 100);
+.
+.
+.SH "UTF-8 AND THE MATCHING INTERFACE"
+.rs
+.sp
+By default, pattern and text are plain text, one byte per character. The UTF8
+flag, passed to the constructor, causes both pattern and string to be treated
+as UTF-8 text, still a byte stream but potentially multiple bytes per
+character. In practice, the text is likelier to be UTF-8 than the pattern, but
+the match returned may depend on the UTF8 flag, so always use it when matching
+UTF8 text. For example, "." will match one byte normally but with UTF8 set may
+match up to three bytes of a multi-byte character.
+.sp
+ Example:
+ pcrecpp::RE_Options options;
+ options.set_utf8();
+ pcrecpp::RE re(utf8_pattern, options);
+ re.FullMatch(utf8_string);
+.sp
+ Example: using the convenience function UTF8():
+ pcrecpp::RE re(utf8_pattern, pcrecpp::UTF8());
+ re.FullMatch(utf8_string);
+.sp
+NOTE: The UTF8 flag is ignored if pcre was not configured with the
+ --enable-utf8 flag.
+.
+.
+.SH "PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE"
+.rs
+.sp
+PCRE defines some modifiers to change the behavior of the regular expression
+engine. The C++ wrapper defines an auxiliary class, RE_Options, as a vehicle to
+pass such modifiers to a RE class. Currently, the following modifiers are
+supported:
+.sp
+ modifier description Perl corresponding
+.sp
+ PCRE_CASELESS case insensitive match /i
+ PCRE_MULTILINE multiple lines match /m
+ PCRE_DOTALL dot matches newlines /s
+ PCRE_DOLLAR_ENDONLY $ matches only at end N/A
+ PCRE_EXTRA strict escape parsing N/A
+ PCRE_EXTENDED ignore whitespaces /x
+ PCRE_UTF8 handles UTF8 chars built-in
+ PCRE_UNGREEDY reverses * and *? N/A
+ PCRE_NO_AUTO_CAPTURE disables capturing parens N/A (*)
+.sp
+(*) Both Perl and PCRE allow non capturing parentheses by means of the
+"?:" modifier within the pattern itself. e.g. (?:ab|cd) does not
+capture, while (ab|cd) does.
+.P
+For a full account on how each modifier works, please check the
+PCRE API reference page.
+.P
+For each modifier, there are two member functions whose name is made
+out of the modifier in lowercase, without the "PCRE_" prefix. For
+instance, PCRE_CASELESS is handled by
+.sp
+ bool caseless()
+.sp
+which returns true if the modifier is set, and
+.sp
+ RE_Options & set_caseless(bool)
+.sp
+which sets or unsets the modifier. Moreover, PCRE_EXTRA_MATCH_LIMIT can be
+accessed through the \fBset_match_limit()\fR and \fBmatch_limit()\fR member
+functions. Setting \fImatch_limit\fR to a non-zero value will limit the
+execution of pcre to keep it from doing bad things like blowing the stack or
+taking an eternity to return a result. A value of 5000 is good enough to stop
+stack blowup in a 2MB thread stack. Setting \fImatch_limit\fR to zero disables
+match limiting. Alternatively, you can call \fBmatch_limit_recursion()\fP
+which uses PCRE_EXTRA_MATCH_LIMIT_RECURSION to limit how much PCRE
+recurses. \fBmatch_limit()\fP limits the number of matches PCRE does;
+\fBmatch_limit_recursion()\fP limits the depth of internal recursion, and
+therefore the amount of stack that is used.
+.P
+Normally, to pass one or more modifiers to a RE class, you declare
+a \fIRE_Options\fR object, set the appropriate options, and pass this
+object to a RE constructor. Example:
+.sp
+ RE_Options opt;
+ opt.set_caseless(true);
+ if (RE("HELLO", opt).PartialMatch("hello world")) ...
+.sp
+RE_Options has two constructors. The default constructor takes no arguments and
+creates a set of flags that are off by default. The optional parameter
+\fIoption_flags\fR is to facilitate transfer of legacy code from C programs.
+This lets you do
+.sp
+ RE(pattern,
+ RE_Options(PCRE_CASELESS|PCRE_MULTILINE)).PartialMatch(str);
+.sp
+However, new code is better off doing
+.sp
+ RE(pattern,
+ RE_Options().set_caseless(true).set_multiline(true))
+ .PartialMatch(str);
+.sp
+If you are going to pass one of the most used modifiers, there are some
+convenience functions that return a RE_Options class with the
+appropriate modifier already set: \fBCASELESS()\fR, \fBUTF8()\fR,
+\fBMULTILINE()\fR, \fBDOTALL()\fR, and \fBEXTENDED()\fR.
+.P
+If you need to set several options at once, and you don't want to go through
+the pains of declaring a RE_Options object and setting several options, there
+is a parallel method that gives you such ability on the fly. You can concatenate
+several \fBset_xxxxx()\fR member functions, since each of them returns a
+reference to its class object. For example, to pass PCRE_CASELESS,
+PCRE_EXTENDED, and PCRE_MULTILINE to a RE with one statement, you may write:
+.sp
+ RE(" ^ xyz \e\es+ .* blah$",
+ RE_Options()
+ .set_caseless(true)
+ .set_extended(true)
+ .set_multiline(true)).PartialMatch(sometext);
+.sp
+.
+.
+.SH "SCANNING TEXT INCREMENTALLY"
+.rs
+.sp
+The "Consume" operation may be useful if you want to repeatedly
+match regular expressions at the front of a string and skip over
+them as they match. This requires use of the "StringPiece" type,
+which represents a sub-range of a real string. Like RE, StringPiece
+is defined in the pcrecpp namespace.
+.sp
+ Example: read lines of the form "var = value" from a string.
+ string contents = ...; // Fill string somehow
+ pcrecpp::StringPiece input(contents); // Wrap in a StringPiece
+
+ string var;
+ int value;
+ pcrecpp::RE re("(\e\ew+) = (\e\ed+)\en");
+ while (re.Consume(&input, &var, &value)) {
+ ...;
+ }
+.sp
+Each successful call to "Consume" will set "var/value", and also
+advance "input" so it points past the matched text.
+.P
+The "FindAndConsume" operation is similar to "Consume" but does not
+anchor your match at the beginning of the string. For example, you
+could extract all words from a string by repeatedly calling
+.sp
+ pcrecpp::RE("(\e\ew+)").FindAndConsume(&input, &word)
+.
+.
+.SH "PARSING HEX/OCTAL/C-RADIX NUMBERS"
+.rs
+.sp
+By default, if you pass a pointer to a numeric value, the
+corresponding text is interpreted as a base-10 number. You can
+instead wrap the pointer with a call to one of the operators Hex(),
+Octal(), or CRadix() to interpret the text in another base. The
+CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
+prefixes, but defaults to base-10.
+.sp
+ Example:
+ int a, b, c, d;
+ pcrecpp::RE re("(.*) (.*) (.*) (.*)");
+ re.FullMatch("100 40 0100 0x40",
+ pcrecpp::Octal(&a), pcrecpp::Hex(&b),
+ pcrecpp::CRadix(&c), pcrecpp::CRadix(&d));
+.sp
+will leave 64 in a, b, c, and d.
+.
+.
+.SH "REPLACING PARTS OF STRINGS"
+.rs
+.sp
+You can replace the first match of "pattern" in "str" with "rewrite".
+Within "rewrite", backslash-escaped digits (\e1 to \e9) can be
+used to insert text matching corresponding parenthesized group
+from the pattern. \e0 in "rewrite" refers to the entire matching
+text. For example:
+.sp
+ string s = "yabba dabba doo";
+ pcrecpp::RE("b+").Replace("d", &s);
+.sp
+will leave "s" containing "yada dabba doo". The result is true if the pattern
+matches and a replacement occurs, false otherwise.
+.P
+\fBGlobalReplace\fP is like \fBReplace\fP except that it replaces all
+occurrences of the pattern in the string with the rewrite. Replacements are
+not subject to re-matching. For example:
+.sp
+ string s = "yabba dabba doo";
+ pcrecpp::RE("b+").GlobalReplace("d", &s);
+.sp
+will leave "s" containing "yada dada doo". It returns the number of
+replacements made.
+.P
+\fBExtract\fP is like \fBReplace\fP, except that if the pattern matches,
+"rewrite" is copied into "out" (an additional argument) with substitutions.
+The non-matching portions of "text" are ignored. Returns true iff a match
+occurred and the extraction happened successfully; if no match occurs, the
+string is left unaffected.
+.
+.
+.SH AUTHOR
+.rs
+.sp
+.nf
+The C++ wrapper was contributed by Google Inc.
+Copyright (c) 2007 Google Inc.
+.fi
+.
+.
+.SH REVISION
+.rs
+.sp
+.nf
+Last updated: 12 November 2007
+.fi
diff --git a/src/doc/pcregrep.1 b/src/doc/pcregrep.1
new file mode 100644
index 0000000..8803cf1
--- /dev/null
+++ b/src/doc/pcregrep.1
@@ -0,0 +1,463 @@
+.TH PCREGREP 1
+.SH NAME
+pcregrep - a grep with Perl-compatible regular expressions.
+.SH SYNOPSIS
+.B pcregrep [options] [long options] [pattern] [path1 path2 ...]
+.
+.SH DESCRIPTION
+.rs
+.sp
+\fBpcregrep\fP searches files for character patterns, in the same way as other
+grep commands do, but it uses the PCRE regular expression library to support
+patterns that are compatible with the regular expressions of Perl 5. See
+.\" HREF
+\fBpcrepattern\fP(3)
+.\"
+for a full description of syntax and semantics of the regular expressions
+that PCRE supports.
+.P
+Patterns, whether supplied on the command line or in a separate file, are given
+without delimiters. For example:
+.sp
+ pcregrep Thursday /etc/motd
+.sp
+If you attempt to use delimiters (for example, by surrounding a pattern with
+slashes, as is common in Perl scripts), they are interpreted as part of the
+pattern. Quotes can of course be used to delimit patterns on the command line
+because they are interpreted by the shell, and indeed they are required if a
+pattern contains white space or shell metacharacters.
+.P
+The first argument that follows any option settings is treated as the single
+pattern to be matched when neither \fB-e\fP nor \fB-f\fP is present.
+Conversely, when one or both of these options are used to specify patterns, all
+arguments are treated as path names. At least one of \fB-e\fP, \fB-f\fP, or an
+argument pattern must be provided.
+.P
+If no files are specified, \fBpcregrep\fP reads the standard input. The
+standard input can also be referenced by a name consisting of a single hyphen.
+For example:
+.sp
+ pcregrep some-pattern /file1 - /file3
+.sp
+By default, each line that matches a pattern is copied to the standard
+output, and if there is more than one file, the file name is output at the
+start of each line, followed by a colon. However, there are options that can
+change how \fBpcregrep\fP behaves. In particular, the \fB-M\fP option makes it
+possible to search for patterns that span line boundaries. What defines a line
+boundary is controlled by the \fB-N\fP (\fB--newline\fP) option.
+.P
+Patterns are limited to 8K or BUFSIZ characters, whichever is the greater.
+BUFSIZ is defined in \fB<stdio.h>\fP. When there is more than one pattern
+(specified by the use of \fB-e\fP and/or \fB-f\fP), each pattern is applied to
+each line in the order in which they are defined, except that all the \fB-e\fP
+patterns are tried before the \fB-f\fP patterns. As soon as one pattern matches
+(or fails to match when \fB-v\fP is used), no further patterns are considered.
+.P
+When \fB--only-matching\fP, \fB--file-offsets\fP, or \fB--line-offsets\fP
+is used, the output is the part of the line that matched (either shown
+literally, or as an offset). In this case, scanning resumes immediately
+following the match, so that further matches on the same line can be found.
+If there are multiple patterns, they are all tried on the remainder of the
+line. However, patterns that follow the one that matched are not tried on the
+earlier part of the line.
+.P
+If the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variable is set,
+\fBpcregrep\fP uses the value to set a locale when calling the PCRE library.
+The \fB--locale\fP option can be used to override this.
+.
+.SH "SUPPORT FOR COMPRESSED FILES"
+.rs
+.sp
+It is possible to compile \fBpcregrep\fP so that it uses \fBlibz\fP or
+\fBlibbz2\fP to read files whose names end in \fB.gz\fP or \fB.bz2\fP,
+respectively. You can find out whether your binary has support for one or both
+of these file types by running it with the \fB--help\fP option. If the
+appropriate support is not present, files are treated as plain text. The
+standard input is always so treated.
+.
+.SH OPTIONS
+.rs
+.TP 10
+\fB--\fP
+This terminates the list of options. It is useful if the next item on the
+command line starts with a hyphen but is not an option. This allows for the
+processing of patterns and filenames that start with hyphens.
+.TP
+\fB-A\fP \fInumber\fP, \fB--after-context=\fP\fInumber\fP
+Output \fInumber\fP lines of context after each matching line. If filenames
+and/or line numbers are being output, a hyphen separator is used instead of a
+colon for the context lines. A line containing "--" is output between each
+group of lines, unless they are in fact contiguous in the input file. The value
+of \fInumber\fP is expected to be relatively small. However, \fBpcregrep\fP
+guarantees to have up to 8K of following text available for context output.
+.TP
+\fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP
+Output \fInumber\fP lines of context before each matching line. If filenames
+and/or line numbers are being output, a hyphen separator is used instead of a
+colon for the context lines. A line containing "--" is output between each
+group of lines, unless they are in fact contiguous in the input file. The value
+of \fInumber\fP is expected to be relatively small. However, \fBpcregrep\fP
+guarantees to have up to 8K of preceding text available for context output.
+.TP
+\fB-C\fP \fInumber\fP, \fB--context=\fP\fInumber\fP
+Output \fInumber\fP lines of context both before and after each matching line.
+This is equivalent to setting both \fB-A\fP and \fB-B\fP to the same value.
+.TP
+\fB-c\fP, \fB--count\fP
+Do not output individual lines; instead just output a count of the number of
+lines that would otherwise have been output. If several files are given, a
+count is output for each of them. In this mode, the \fB-A\fP, \fB-B\fP, and
+\fB-C\fP options are ignored.
+.TP
+\fB--colour\fP, \fB--color\fP
+If this option is given without any data, it is equivalent to "--colour=auto".
+If data is required, it must be given in the same shell item, separated by an
+equals sign.
+.TP
+\fB--colour=\fP\fIvalue\fP, \fB--color=\fP\fIvalue\fP
+This option specifies under what circumstances the part of a line that matched
+a pattern should be coloured in the output. The value may be "never" (the
+default), "always", or "auto". In the latter case, colouring happens only if
+the standard output is connected to a terminal. The colour can be specified by
+setting the environment variable PCREGREP_COLOUR or PCREGREP_COLOR. The value
+of this variable should be a string of two numbers, separated by a semicolon.
+They are copied directly into the control string for setting colour on a
+terminal, so it is your responsibility to ensure that they make sense. If
+neither of the environment variables is set, the default is "1;31", which gives
+red.
+.TP
+\fB-D\fP \fIaction\fP, \fB--devices=\fP\fIaction\fP
+If an input path is not a regular file or a directory, "action" specifies how
+it is to be processed. Valid values are "read" (the default) or "skip"
+(silently skip the path).
+.TP
+\fB-d\fP \fIaction\fP, \fB--directories=\fP\fIaction\fP
+If an input path is a directory, "action" specifies how it is to be processed.
+Valid values are "read" (the default), "recurse" (equivalent to the \fB-r\fP
+option), or "skip" (silently skip the path). In the default case, directories
+are read as if they were ordinary files. In some operating systems the effect
+of reading a directory like this is an immediate end-of-file.
+.TP
+\fB-e\fP \fIpattern\fP, \fB--regex=\fP\fIpattern\fP, \fB--regexp=\fP\fIpattern\fP
+Specify a pattern to be matched. This option can be used multiple times in
+order to specify several patterns. It can also be used as a way of specifying a
+single pattern that starts with a hyphen. When \fB-e\fP is used, no argument
+pattern is taken from the command line; all arguments are treated as file
+names. There is an overall maximum of 100 patterns. They are applied to each
+line in the order in which they are defined until one matches (or fails to
+match if \fB-v\fP is used). If \fB-f\fP is used with \fB-e\fP, the command line
+patterns are matched first, followed by the patterns from the file, independent
+of the order in which these options are specified. Note that multiple use of
+\fB-e\fP is not the same as a single pattern with alternatives. For example,
+X|Y finds the first character in a line that is X or Y, whereas if the two
+patterns are given separately, \fBpcregrep\fP finds X if it is present, even if
+it follows Y in the line. It finds Y only if there is no X in the line. This
+really matters only if you are using \fB-o\fP to show the part(s) of the line
+that matched.
+.TP
+\fB--exclude\fP=\fIpattern\fP
+When \fBpcregrep\fP is searching the files in a directory as a consequence of
+the \fB-r\fP (recursive search) option, any regular files whose names match the
+pattern are excluded. Subdirectories are not excluded by this option; they are
+searched recursively, subject to the \fB--exclude_dir\fP and
+\fB--include_dir\fP options. The pattern is a PCRE regular expression, and is
+matched against the final component of the file name (not the entire path). If
+a file name matches both \fB--include\fP and \fB--exclude\fP, it is excluded.
+There is no short form for this option.
+.TP
+\fB--exclude_dir\fP=\fIpattern\fP
+When \fBpcregrep\fP is searching the contents of a directory as a consequence
+of the \fB-r\fP (recursive search) option, any subdirectories whose names match
+the pattern are excluded. (Note that the \fB--exclude\fP option does not affect
+subdirectories.) The pattern is a PCRE regular expression, and is matched
+against the final component of the name (not the entire path). If a
+subdirectory name matches both \fB--include_dir\fP and \fB--exclude_dir\fP, it
+is excluded. There is no short form for this option.
+.TP
+\fB-F\fP, \fB--fixed-strings\fP
+Interpret each pattern as a list of fixed strings, separated by newlines,
+instead of as a regular expression. The \fB-w\fP (match as a word) and \fB-x\fP
+(match whole line) options can be used with \fB-F\fP. They apply to each of the
+fixed strings. A line is selected if any of the fixed strings are found in it
+(subject to \fB-w\fP or \fB-x\fP, if present).
+.TP
+\fB-f\fP \fIfilename\fP, \fB--file=\fP\fIfilename\fP
+Read a number of patterns from the file, one per line, and match them against
+each line of input. A data line is output if any of the patterns match it. The
+filename can be given as "-" to refer to the standard input. When \fB-f\fP is
+used, patterns specified on the command line using \fB-e\fP may also be
+present; they are tested before the file's patterns. However, no other pattern
+is taken from the command line; all arguments are treated as file names. There
+is an overall maximum of 100 patterns. Trailing white space is removed from
+each line, and blank lines are ignored. An empty file contains no patterns and
+therefore matches nothing. See also the comments about multiple patterns versus
+a single pattern with alternatives in the description of \fB-e\fP above.
+.TP
+\fB--file-offsets\fP
+Instead of showing lines or parts of lines that match, show each match as an
+offset from the start of the file and a length, separated by a comma. In this
+mode, no context is shown. That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP
+options are ignored. If there is more than one match in a line, each of them is
+shown separately. This option is mutually exclusive with \fB--line-offsets\fP
+and \fB--only-matching\fP.
+.TP
+\fB-H\fP, \fB--with-filename\fP
+Force the inclusion of the filename at the start of output lines when searching
+a single file. By default, the filename is not shown in this case. For matching
+lines, the filename is followed by a colon and a space; for context lines, a
+hyphen separator is used. If a line number is also being output, it follows the
+file name without a space.
+.TP
+\fB-h\fP, \fB--no-filename\fP
+Suppress the output filenames when searching multiple files. By default,
+filenames are shown when multiple files are searched. For matching lines, the
+filename is followed by a colon and a space; for context lines, a hyphen
+separator is used. If a line number is also being output, it follows the file
+name without a space.
+.TP
+\fB--help\fP
+Output a help message, giving brief details of the command options and file
+type support, and then exit.
+.TP
+\fB-i\fP, \fB--ignore-case\fP
+Ignore upper/lower case distinctions during comparisons.
+.TP
+\fB--include\fP=\fIpattern\fP
+When \fBpcregrep\fP is searching the files in a directory as a consequence of
+the \fB-r\fP (recursive search) option, only those regular files whose names
+match the pattern are included. Subdirectories are always included and searched
+recursively, subject to the \fB--include_dir\fP and \fB--exclude_dir\fP
+options. The pattern is a PCRE regular expression, and is matched against the
+final component of the file name (not the entire path). If a file name matches
+both \fB--include\fP and \fB--exclude\fP, it is excluded. There is no short
+form for this option.
+.TP
+\fB--include_dir\fP=\fIpattern\fP
+When \fBpcregrep\fP is searching the contents of a directory as a consequence
+of the \fB-r\fP (recursive search) option, only those subdirectories whose
+names match the pattern are included. (Note that the \fB--include\fP option
+does not affect subdirectories.) The pattern is a PCRE regular expression, and
+is matched against the final component of the name (not the entire path). If a
+subdirectory name matches both \fB--include_dir\fP and \fB--exclude_dir\fP, it
+is excluded. There is no short form for this option.
+.TP
+\fB-L\fP, \fB--files-without-match\fP
+Instead of outputting lines from the files, just output the names of the files
+that do not contain any lines that would have been output. Each file name is
+output once, on a separate line.
+.TP
+\fB-l\fP, \fB--files-with-matches\fP
+Instead of outputting lines from the files, just output the names of the files
+containing lines that would have been output. Each file name is output
+once, on a separate line. Searching stops as soon as a matching line is found
+in a file.
+.TP
+\fB--label\fP=\fIname\fP
+This option supplies a name to be used for the standard input when file names
+are being output. If not supplied, "(standard input)" is used. There is no
+short form for this option.
+.TP
+\fB--line-offsets\fP
+Instead of showing lines or parts of lines that match, show each match as a
+line number, the offset from the start of the line, and a length. The line
+number is terminated by a colon (as usual; see the \fB-n\fP option), and the
+offset and length are separated by a comma. In this mode, no context is shown.
+That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are ignored. If there is
+more than one match in a line, each of them is shown separately. This option is
+mutually exclusive with \fB--file-offsets\fP and \fB--only-matching\fP.
+.TP
+\fB--locale\fP=\fIlocale-name\fP
+This option specifies a locale to be used for pattern matching. It overrides
+the value in the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variables. If no
+locale is specified, the PCRE library's default (usually the "C" locale) is
+used. There is no short form for this option.
+.TP
+\fB-M\fP, \fB--multiline\fP
+Allow patterns to match more than one line. When this option is given, patterns
+may usefully contain literal newline characters and internal occurrences of ^
+and $ characters. The output for any one match may consist of more than one
+line. When this option is set, the PCRE library is called in "multiline" mode.
+There is a limit to the number of lines that can be matched, imposed by the way
+that \fBpcregrep\fP buffers the input file as it scans it. However,
+\fBpcregrep\fP ensures that at least 8K characters or the rest of the document
+(whichever is the shorter) are available for forward matching, and similarly
+the previous 8K characters (or all the previous characters, if fewer than 8K)
+are guaranteed to be available for lookbehind assertions.
+.TP
+\fB-N\fP \fInewline-type\fP, \fB--newline=\fP\fInewline-type\fP
+The PCRE library supports five different conventions for indicating
+the ends of lines. They are the single-character sequences CR (carriage return)
+and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
+which recognizes any of the preceding three types, and an "any" convention, in
+which any Unicode line ending sequence is assumed to end a line. The Unicode
+sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
+(formfeed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
+PS (paragraph separator, U+2029).
+.sp
+When the PCRE library is built, a default line-ending sequence is specified.
+This is normally the standard sequence for the operating system. Unless
+otherwise specified by this option, \fBpcregrep\fP uses the library's default.
+The possible values for this option are CR, LF, CRLF, ANYCRLF, or ANY. This
+makes it possible to use \fBpcregrep\fP on files that have come from other
+environments without having to modify their line endings. If the data that is
+being scanned does not agree with the convention set by this option,
+\fBpcregrep\fP may behave in strange ways.
+.TP
+\fB-n\fP, \fB--line-number\fP
+Precede each output line by its line number in the file, followed by a colon
+and a space for matching lines or a hyphen and a space for context lines. If
+the filename is also being output, it precedes the line number. This option is
+forced if \fB--line-offsets\fP is used.
+.TP
+\fB-o\fP, \fB--only-matching\fP
+Show only the part of the line that matched a pattern. In this mode, no
+context is shown. That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are
+ignored. If there is more than one match in a line, each of them is shown
+separately. If \fB-o\fP is combined with \fB-v\fP (invert the sense of the
+match to find non-matching lines), no output is generated, but the return code
+is set appropriately. This option is mutually exclusive with
+\fB--file-offsets\fP and \fB--line-offsets\fP.
+.TP
+\fB-q\fP, \fB--quiet\fP
+Work quietly, that is, display nothing except error messages. The exit
+status indicates whether or not any matches were found.
+.TP
+\fB-r\fP, \fB--recursive\fP
+If any given path is a directory, recursively scan the files it contains,
+taking note of any \fB--include\fP and \fB--exclude\fP settings. By default, a
+directory is read as a normal file; in some operating systems this gives an
+immediate end-of-file. This option is a shorthand for setting the \fB-d\fP
+option to "recurse".
+.TP
+\fB-s\fP, \fB--no-messages\fP
+Suppress error messages about non-existent or unreadable files. Such files are
+quietly skipped. However, the return code is still 2, even if matches were
+found in other files.
+.TP
+\fB-u\fP, \fB--utf-8\fP
+Operate in UTF-8 mode. This option is available only if PCRE has been compiled
+with UTF-8 support. Both patterns and subject lines must be valid strings of
+UTF-8 characters.
+.TP
+\fB-V\fP, \fB--version\fP
+Write the version numbers of \fBpcregrep\fP and the PCRE library that is being
+used to the standard error stream.
+.TP
+\fB-v\fP, \fB--invert-match\fP
+Invert the sense of the match, so that lines which do \fInot\fP match any of
+the patterns are the ones that are found.
+.TP
+\fB-w\fP, \fB--word-regex\fP, \fB--word-regexp\fP
+Force the patterns to match only whole words. This is equivalent to having \eb
+at the start and end of the pattern.
+.TP
+\fB-x\fP, \fB--line-regex\fP, \fB--line-regexp\fP
+Force the patterns to be anchored (each must start matching at the beginning of
+a line) and in addition, require them to match entire lines. This is
+equivalent to having ^ and $ characters at the start and end of each
+alternative branch in every pattern.
+.
+.
+.SH "ENVIRONMENT VARIABLES"
+.rs
+.sp
+The environment variables \fBLC_ALL\fP and \fBLC_CTYPE\fP are examined, in that
+order, for a locale. The first one that is set is used. This can be overridden
+by the \fB--locale\fP option. If no locale is set, the PCRE library's default
+(usually the "C" locale) is used.
+.
+.
+.SH "NEWLINES"
+.rs
+.sp
+The \fB-N\fP (\fB--newline\fP) option allows \fBpcregrep\fP to scan files with
+different newline conventions from the default. However, the setting of this
+option does not affect the way in which \fBpcregrep\fP writes information to
+the standard error and output streams. It uses the string "\en" in C
+\fBprintf()\fP calls to indicate newlines, relying on the C I/O library to
+convert this to an appropriate sequence if the output is sent to a file.
+.
+.
+.SH "OPTIONS COMPATIBILITY"
+.rs
+.sp
+The majority of short and long forms of \fBpcregrep\fP's options are the same
+as in the GNU \fBgrep\fP program. Any long option of the form
+\fB--xxx-regexp\fP (GNU terminology) is also available as \fB--xxx-regex\fP
+(PCRE terminology). However, the \fB--locale\fP, \fB-M\fP, \fB--multiline\fP,
+\fB-u\fP, and \fB--utf-8\fP options are specific to \fBpcregrep\fP.
+.
+.
+.SH "OPTIONS WITH DATA"
+.rs
+.sp
+There are four different ways in which an option with data can be specified.
+If a short form option is used, the data may follow immediately, or in the next
+command line item. For example:
+.sp
+ -f/some/file
+ -f /some/file
+.sp
+If a long form option is used, the data may appear in the same command line
+item, separated by an equals character, or (with one exception) it may appear
+in the next command line item. For example:
+.sp
+ --file=/some/file
+ --file /some/file
+.sp
+Note, however, that if you want to supply a file name beginning with ~ as data
+in a shell command, and have the shell expand ~ to a home directory, you must
+separate the file name from the option, because the shell does not treat ~
+specially unless it is at the start of an item.
+.P
+The exception to the above is the \fB--colour\fP (or \fB--color\fP) option,
+for which the data is optional. If this option does have data, it must be given
+in the first form, using an equals character. Otherwise it will be assumed that
+it has no data.
+.
+.
+.SH "MATCHING ERRORS"
+.rs
+.sp
+It is possible to supply a regular expression that takes a very long time to
+fail to match certain lines. Such patterns normally involve nested indefinite
+repeats, for example: (a+)*\ed when matched against a line of a's with no final
+digit. The PCRE matching function has a resource limit that causes it to abort
+in these circumstances. If this happens, \fBpcregrep\fP outputs an error
+message and the line that caused the problem to the standard error stream. If
+there are more than 20 such errors, \fBpcregrep\fP gives up.
+.
+.
+.SH DIAGNOSTICS
+.rs
+.sp
+Exit status is 0 if any matches were found, 1 if no matches were found, and 2
+for syntax errors and non-existent or inaccessible files (even if matches were
+found in other files) or too many matching errors. Using the \fB-s\fP option to
+suppress error messages about inaccessible files does not affect the return
+code.
+.
+.
+.SH "SEE ALSO"
+.rs
+.sp
+\fBpcrepattern\fP(3), \fBpcretest\fP(1).
+.
+.
+.SH AUTHOR
+.rs
+.sp
+.nf
+Philip Hazel
+University Computing Service
+Cambridge CB2 3QH, England.
+.fi
+.
+.
+.SH REVISION
+.rs
+.sp
+.nf
+Last updated: 08 March 2008
+Copyright (c) 1997-2008 University of Cambridge.
+.fi
diff --git a/src/doc/pcregrep.txt b/src/doc/pcregrep.txt
new file mode 100644
index 0000000..84be6f9
--- /dev/null
+++ b/src/doc/pcregrep.txt
@@ -0,0 +1,498 @@
+PCREGREP(1) PCREGREP(1)
+
+
+NAME
+ pcregrep - a grep with Perl-compatible regular expressions.
+
+
+SYNOPSIS
+ pcregrep [options] [long options] [pattern] [path1 path2 ...]
+
+
+DESCRIPTION
+
+ pcregrep searches files for character patterns, in the same way as
+ other grep commands do, but it uses the PCRE regular expression library
+ to support patterns that are compatible with the regular expressions of
+ Perl 5. See pcrepattern(3) for a full description of syntax and seman-
+ tics of the regular expressions that PCRE supports.
+
+ Patterns, whether supplied on the command line or in a separate file,
+ are given without delimiters. For example:
+
+ pcregrep Thursday /etc/motd
+
+ If you attempt to use delimiters (for example, by surrounding a pattern
+ with slashes, as is common in Perl scripts), they are interpreted as
+ part of the pattern. Quotes can of course be used to delimit patterns
+ on the command line because they are interpreted by the shell, and
+ indeed they are required if a pattern contains white space or shell
+ metacharacters.
+
+ The first argument that follows any option settings is treated as the
+ single pattern to be matched when neither -e nor -f is present. Con-
+ versely, when one or both of these options are used to specify pat-
+ terns, all arguments are treated as path names. At least one of -e, -f,
+ or an argument pattern must be provided.
+
+ If no files are specified, pcregrep reads the standard input. The stan-
+ dard input can also be referenced by a name consisting of a single
+ hyphen. For example:
+
+ pcregrep some-pattern /file1 - /file3
+
+ By default, each line that matches a pattern is copied to the standard
+ output, and if there is more than one file, the file name is output at
+ the start of each line, followed by a colon. However, there are options
+ that can change how pcregrep behaves. In particular, the -M option
+ makes it possible to search for patterns that span line boundaries.
+ What defines a line boundary is controlled by the -N (--newline)
+ option.
+
+ Patterns are limited to 8K or BUFSIZ characters, whichever is the
+ greater. BUFSIZ is defined in <stdio.h>. When there is more than one
+ pattern (specified by the use of -e and/or -f), each pattern is applied
+ to each line in the order in which they are defined, except that all
+ the -e patterns are tried before the -f patterns. As soon as one pat-
+ tern matches (or fails to match when -v is used), no further patterns
+ are considered.
+
+ When --only-matching, --file-offsets, or --line-offsets is used, the
+ output is the part of the line that matched (either shown literally, or
+ as an offset). In this case, scanning resumes immediately following the
+ match, so that further matches on the same line can be found. If there
+ are multiple patterns, they are all tried on the remainder of the line.
+ However, patterns that follow the one that matched are not tried on the
+ earlier part of the line.
+
+ If the LC_ALL or LC_CTYPE environment variable is set, pcregrep uses
+ the value to set a locale when calling the PCRE library. The --locale
+ option can be used to override this.
+
+
+SUPPORT FOR COMPRESSED FILES
+
+ It is possible to compile pcregrep so that it uses libz or libbz2 to
+ read files whose names end in .gz or .bz2, respectively. You can find
+ out whether your binary has support for one or both of these file types
+ by running it with the --help option. If the appropriate support is not
+ present, files are treated as plain text. The standard input is always
+ so treated.
+
+
+OPTIONS
+
+ -- This terminates the list of options. It is useful if the next
+ item on the command line starts with a hyphen but is not an
+ option. This allows for the processing of patterns and file-
+ names that start with hyphens.
+
+ -A number, --after-context=number
+ Output number lines of context after each matching line. If
+ filenames and/or line numbers are being output, a hyphen sep-
+ arator is used instead of a colon for the context lines. A
+ line containing "--" is output between each group of lines,
+ unless they are in fact contiguous in the input file. The
+ value of number is expected to be relatively small. However,
+ pcregrep guarantees to have up to 8K of following text avail-
+ able for context output.
+
+ -B number, --before-context=number
+ Output number lines of context before each matching line. If
+ filenames and/or line numbers are being output, a hyphen sep-
+ arator is used instead of a colon for the context lines. A
+ line containing "--" is output between each group of lines,
+ unless they are in fact contiguous in the input file. The
+ value of number is expected to be relatively small. However,
+ pcregrep guarantees to have up to 8K of preceding text avail-
+ able for context output.
+
+ -C number, --context=number
+ Output number lines of context both before and after each
+ matching line. This is equivalent to setting both -A and -B
+ to the same value.
+
+ -c, --count
+ Do not output individual lines; instead just output a count
+ of the number of lines that would otherwise have been output.
+ If several files are given, a count is output for each of
+ them. In this mode, the -A, -B, and -C options are ignored.
+
+ --colour, --color
+ If this option is given without any data, it is equivalent to
+ "--colour=auto". If data is required, it must be given in
+ the same shell item, separated by an equals sign.
+
+ --colour=value, --color=value
+ This option specifies under what circumstances the part of a
+ line that matched a pattern should be coloured in the output.
+ The value may be "never" (the default), "always", or "auto".
+ In the latter case, colouring happens only if the standard
+ output is connected to a terminal. The colour can be speci-
+ fied by setting the environment variable PCREGREP_COLOUR or
+ PCREGREP_COLOR. The value of this variable should be a string
+ of two numbers, separated by a semicolon. They are copied
+ directly into the control string for setting colour on a ter-
+ minal, so it is your responsibility to ensure that they make
+ sense. If neither of the environment variables is set, the
+ default is "1;31", which gives red.
+
+ -D action, --devices=action
+ If an input path is not a regular file or a directory,
+ "action" specifies how it is to be processed. Valid values
+ are "read" (the default) or "skip" (silently skip the path).
+
+ -d action, --directories=action
+ If an input path is a directory, "action" specifies how it is
+ to be processed. Valid values are "read" (the default),
+ "recurse" (equivalent to the -r option), or "skip" (silently
+ skip the path). In the default case, directories are read as
+ if they were ordinary files. In some operating systems the
+ effect of reading a directory like this is an immediate end-
+ of-file.
+
+ -e pattern, --regex=pattern, --regexp=pattern
+ Specify a pattern to be matched. This option can be used mul-
+ tiple times in order to specify several patterns. It can also
+ be used as a way of specifying a single pattern that starts
+ with a hyphen. When -e is used, no argument pattern is taken
+ from the command line; all arguments are treated as file
+ names. There is an overall maximum of 100 patterns. They are
+ applied to each line in the order in which they are defined
+ until one matches (or fails to match if -v is used). If -f is
+ used with -e, the command line patterns are matched first,
+ followed by the patterns from the file, independent of the
+ order in which these options are specified. Note that multi-
+ ple use of -e is not the same as a single pattern with alter-
+ natives. For example, X|Y finds the first character in a line
+ that is X or Y, whereas if the two patterns are given sepa-
+ rately, pcregrep finds X if it is present, even if it follows
+ Y in the line. It finds Y only if there is no X in the line.
+ This really matters only if you are using -o to show the
+ part(s) of the line that matched.
+
+ --exclude=pattern
+ When pcregrep is searching the files in a directory as a con-
+ sequence of the -r (recursive search) option, any regular
+ files whose names match the pattern are excluded. Subdirecto-
+ ries are not excluded by this option; they are searched
+ recursively, subject to the --exclude_dir and --include_dir
+ options. The pattern is a PCRE regular expression, and is
+ matched against the final component of the file name (not the
+ entire path). If a file name matches both --include and
+ --exclude, it is excluded. There is no short form for this
+ option.
+
+ --exclude_dir=pattern
+ When pcregrep is searching the contents of a directory as a
+ consequence of the -r (recursive search) option, any subdi-
+ rectories whose names match the pattern are excluded. (Note
+ that the --exclude option does not affect subdirectories.)
+ The pattern is a PCRE regular expression, and is matched
+ against the final component of the name (not the entire
+ path). If a subdirectory name matches both --include_dir and
+ --exclude_dir, it is excluded. There is no short form for
+ this option.
+
+ -F, --fixed-strings
+ Interpret each pattern as a list of fixed strings, separated
+ by newlines, instead of as a regular expression. The -w
+ (match as a word) and -x (match whole line) options can be
+ used with -F. They apply to each of the fixed strings. A line
+ is selected if any of the fixed strings are found in it (sub-
+ ject to -w or -x, if present).
+
+ -f filename, --file=filename
+ Read a number of patterns from the file, one per line, and
+ match them against each line of input. A data line is output
+ if any of the patterns match it. The filename can be given as
+ "-" to refer to the standard input. When -f is used, patterns
+ specified on the command line using -e may also be present;
+ they are tested before the file's patterns. However, no other
+ pattern is taken from the command line; all arguments are
+ treated as file names. There is an overall maximum of 100
+ patterns. Trailing white space is removed from each line, and
+ blank lines are ignored. An empty file contains no patterns
+ and therefore matches nothing. See also the comments about
+ multiple patterns versus a single pattern with alternatives
+ in the description of -e above.
+
+ --file-offsets
+ Instead of showing lines or parts of lines that match, show
+ each match as an offset from the start of the file and a
+ length, separated by a comma. In this mode, no context is
+ shown. That is, the -A, -B, and -C options are ignored. If
+ there is more than one match in a line, each of them is shown
+ separately. This option is mutually exclusive with --line-
+ offsets and --only-matching.
+
+ -H, --with-filename
+ Force the inclusion of the filename at the start of output
+ lines when searching a single file. By default, the filename
+ is not shown in this case. For matching lines, the filename
+ is followed by a colon and a space; for context lines, a
+ hyphen separator is used. If a line number is also being out-
+ put, it follows the file name without a space.
+
+ -h, --no-filename
+ Suppress the output filenames when searching multiple files.
+ By default, filenames are shown when multiple files are
+ searched. For matching lines, the filename is followed by a
+ colon and a space; for context lines, a hyphen separator is
+ used. If a line number is also being output, it follows the
+ file name without a space.
+
+ --help Output a help message, giving brief details of the command
+ options and file type support, and then exit.
+
+ -i, --ignore-case
+ Ignore upper/lower case distinctions during comparisons.
+
+ --include=pattern
+ When pcregrep is searching the files in a directory as a con-
+ sequence of the -r (recursive search) option, only those reg-
+ ular files whose names match the pattern are included. Subdi-
+ rectories are always included and searched recursively, sub-
+ ject to the --include_dir and --exclude_dir options. The pat-
+ tern is a PCRE regular expression, and is matched against the
+ final component of the file name (not the entire path). If a
+ file name matches both --include and --exclude, it is
+ excluded. There is no short form for this option.
+
+ --include_dir=pattern
+ When pcregrep is searching the contents of a directory as a
+ consequence of the -r (recursive search) option, only those
+ subdirectories whose names match the pattern are included.
+ (Note that the --include option does not affect subdirecto-
+ ries.) The pattern is a PCRE regular expression, and is
+ matched against the final component of the name (not the
+ entire path). If a subdirectory name matches both
+ --include_dir and --exclude_dir, it is excluded. There is no
+ short form for this option.
+
+ -L, --files-without-match
+ Instead of outputting lines from the files, just output the
+ names of the files that do not contain any lines that would
+ have been output. Each file name is output once, on a sepa-
+ rate line.
+
+ -l, --files-with-matches
+ Instead of outputting lines from the files, just output the
+ names of the files containing lines that would have been out-
+ put. Each file name is output once, on a separate line.
+ Searching stops as soon as a matching line is found in a
+ file.
+
+ --label=name
+ This option supplies a name to be used for the standard input
+ when file names are being output. If not supplied, "(standard
+ input)" is used. There is no short form for this option.
+
+ --line-offsets
+ Instead of showing lines or parts of lines that match, show
+ each match as a line number, the offset from the start of the
+ line, and a length. The line number is terminated by a colon
+ (as usual; see the -n option), and the offset and length are
+ separated by a comma. In this mode, no context is shown.
+ That is, the -A, -B, and -C options are ignored. If there is
+ more than one match in a line, each of them is shown sepa-
+ rately. This option is mutually exclusive with --file-offsets
+ and --only-matching.
+
+ --locale=locale-name
+ This option specifies a locale to be used for pattern match-
+ ing. It overrides the value in the LC_ALL or LC_CTYPE envi-
+ ronment variables. If no locale is specified, the PCRE
+ library's default (usually the "C" locale) is used. There is
+ no short form for this option.
+
+ -M, --multiline
+ Allow patterns to match more than one line. When this option
+ is given, patterns may usefully contain literal newline char-
+ acters and internal occurrences of ^ and $ characters. The
+ output for any one match may consist of more than one line.
+ When this option is set, the PCRE library is called in "mul-
+ tiline" mode. There is a limit to the number of lines that
+ can be matched, imposed by the way that pcregrep buffers the
+ input file as it scans it. However, pcregrep ensures that at
+ least 8K characters or the rest of the document (whichever is
+ the shorter) are available for forward matching, and simi-
+ larly the previous 8K characters (or all the previous charac-
+ ters, if fewer than 8K) are guaranteed to be available for
+ lookbehind assertions.
+
+ -N newline-type, --newline=newline-type
+ The PCRE library supports five different conventions for
+ indicating the ends of lines. They are the single-character
+ sequences CR (carriage return) and LF (linefeed), the two-
+ character sequence CRLF, an "anycrlf" convention, which rec-
+ ognizes any of the preceding three types, and an "any" con-
+ vention, in which any Unicode line ending sequence is assumed
+ to end a line. The Unicode sequences are the three just men-
+ tioned, plus VT (vertical tab, U+000B), FF (formfeed,
+ U+000C), NEL (next line, U+0085), LS (line separator,
+ U+2028), and PS (paragraph separator, U+2029).
+
+ When the PCRE library is built, a default line-ending
+ sequence is specified. This is normally the standard
+ sequence for the operating system. Unless otherwise specified
+ by this option, pcregrep uses the library's default. The
+ possible values for this option are CR, LF, CRLF, ANYCRLF, or
+ ANY. This makes it possible to use pcregrep on files that
+ have come from other environments without having to modify
+ their line endings. If the data that is being scanned does
+ not agree with the convention set by this option, pcregrep
+ may behave in strange ways.
+
+ -n, --line-number
+ Precede each output line by its line number in the file, fol-
+ lowed by a colon and a space for matching lines or a hyphen
+ and a space for context lines. If the filename is also being
+ output, it precedes the line number. This option is forced if
+ --line-offsets is used.
+
+ -o, --only-matching
+ Show only the part of the line that matched a pattern. In
+ this mode, no context is shown. That is, the -A, -B, and -C
+ options are ignored. If there is more than one match in a
+ line, each of them is shown separately. If -o is combined
+ with -v (invert the sense of the match to find non-matching
+ lines), no output is generated, but the return code is set
+ appropriately. This option is mutually exclusive with --file-
+ offsets and --line-offsets.
+
+ -q, --quiet
+ Work quietly, that is, display nothing except error messages.
+ The exit status indicates whether or not any matches were
+ found.
+
+ -r, --recursive
+ If any given path is a directory, recursively scan the files
+ it contains, taking note of any --include and --exclude set-
+ tings. By default, a directory is read as a normal file; in
+ some operating systems this gives an immediate end-of-file.
+ This option is a shorthand for setting the -d option to
+ "recurse".
+
+ -s, --no-messages
+ Suppress error messages about non-existent or unreadable
+ files. Such files are quietly skipped. However, the return
+ code is still 2, even if matches were found in other files.
+
+ -u, --utf-8
+ Operate in UTF-8 mode. This option is available only if PCRE
+ has been compiled with UTF-8 support. Both patterns and sub-
+ ject lines must be valid strings of UTF-8 characters.
+
+ -V, --version
+ Write the version numbers of pcregrep and the PCRE library
+ that is being used to the standard error stream.
+
+ -v, --invert-match
+ Invert the sense of the match, so that lines which do not
+ match any of the patterns are the ones that are found.
+
+ -w, --word-regex, --word-regexp
+ Force the patterns to match only whole words. This is equiva-
+ lent to having \b at the start and end of the pattern.
+
+ -x, --line-regex, --line-regexp
+ Force the patterns to be anchored (each must start matching
+ at the beginning of a line) and in addition, require them to
+ match entire lines. This is equivalent to having ^ and $
+ characters at the start and end of each alternative branch in
+ every pattern.
+
+
+ENVIRONMENT VARIABLES
+
+ The environment variables LC_ALL and LC_CTYPE are examined, in that
+ order, for a locale. The first one that is set is used. This can be
+ overridden by the --locale option. If no locale is set, the PCRE
+ library's default (usually the "C" locale) is used.
+
+
+NEWLINES
+
+ The -N (--newline) option allows pcregrep to scan files with different
+ newline conventions from the default. However, the setting of this
+ option does not affect the way in which pcregrep writes information to
+ the standard error and output streams. It uses the string "\n" in C
+ printf() calls to indicate newlines, relying on the C I/O library to
+ convert this to an appropriate sequence if the output is sent to a
+ file.
+
+
+OPTIONS COMPATIBILITY
+
+ The majority of short and long forms of pcregrep's options are the same
+ as in the GNU grep program. Any long option of the form --xxx-regexp
+ (GNU terminology) is also available as --xxx-regex (PCRE terminology).
+ However, the --locale, -M, --multiline, -u, and --utf-8 options are
+ specific to pcregrep.
+
+
+OPTIONS WITH DATA
+
+ There are four different ways in which an option with data can be spec-
+ ified. If a short form option is used, the data may follow immedi-
+ ately, or in the next command line item. For example:
+
+ -f/some/file
+ -f /some/file
+
+ If a long form option is used, the data may appear in the same command
+ line item, separated by an equals character, or (with one exception) it
+ may appear in the next command line item. For example:
+
+ --file=/some/file
+ --file /some/file
+
+ Note, however, that if you want to supply a file name beginning with ~
+ as data in a shell command, and have the shell expand ~ to a home
+ directory, you must separate the file name from the option, because the
+ shell does not treat ~ specially unless it is at the start of an item.
+
+ The exception to the above is the --colour (or --color) option, for
+ which the data is optional. If this option does have data, it must be
+ given in the first form, using an equals character. Otherwise it will
+ be assumed that it has no data.
+
+
+MATCHING ERRORS
+
+ It is possible to supply a regular expression that takes a very long
+ time to fail to match certain lines. Such patterns normally involve
+ nested indefinite repeats, for example: (a+)*\d when matched against a
+ line of a's with no final digit. The PCRE matching function has a
+ resource limit that causes it to abort in these circumstances. If this
+ happens, pcregrep outputs an error message and the line that caused the
+ problem to the standard error stream. If there are more than 20 such
+ errors, pcregrep gives up.
+
+
+DIAGNOSTICS
+
+ Exit status is 0 if any matches were found, 1 if no matches were found,
+ and 2 for syntax errors and non-existent or inaccessible files (even if
+ matches were found in other files) or too many matching errors. Using
+ the -s option to suppress error messages about inaccessible files does
+ not affect the return code.
+
+
+SEE ALSO
+
+ pcrepattern(3), pcretest(1).
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 08 March 2008
+ Copyright (c) 1997-2008 University of Cambridge.
diff --git a/src/doc/pcrematching.3 b/src/doc/pcrematching.3
new file mode 100644
index 0000000..560a48c
--- /dev/null
+++ b/src/doc/pcrematching.3
@@ -0,0 +1,188 @@
+.TH PCREMATCHING 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH "PCRE MATCHING ALGORITHMS"
+.rs
+.sp
+This document describes the two different algorithms that are available in PCRE
+for matching a compiled regular expression against a given subject string. The
+"standard" algorithm is the one provided by the \fBpcre_exec()\fP function.
+This works in the same way as Perl's matching function, and provides a
+Perl-compatible matching operation.
+.P
+An alternative algorithm is provided by the \fBpcre_dfa_exec()\fP function;
+this operates in a different way, and is not Perl-compatible. It has advantages
+and disadvantages compared with the standard algorithm, and these are described
+below.
+.P
+When there is only one possible way in which a given subject string can match a
+pattern, the two algorithms give the same answer. A difference arises, however,
+when there are multiple possibilities. For example, if the pattern
+.sp
+ ^<.*>
+.sp
+is matched against the string
+.sp
+ <something> <something else> <something further>
+.sp
+there are three possible answers. The standard algorithm finds only one of
+them, whereas the alternative algorithm finds all three.
+.
+.SH "REGULAR EXPRESSIONS AS TREES"
+.rs
+.sp
+The set of strings that are matched by a regular expression can be represented
+as a tree structure. An unlimited repetition in the pattern makes the tree of
+infinite size, but it is still a tree. Matching the pattern to a given subject
+string (from a given starting point) can be thought of as a search of the tree.
+There are two ways to search a tree: depth-first and breadth-first, and these
+correspond to the two matching algorithms provided by PCRE.
+.
+.SH "THE STANDARD MATCHING ALGORITHM"
+.rs
+.sp
+In the terminology of Jeffrey Friedl's book "Mastering Regular
+Expressions", the standard algorithm is an "NFA algorithm". It conducts a
+depth-first search of the pattern tree. That is, it proceeds along a single
+path through the tree, checking that the subject matches what is required. When
+there is a mismatch, the algorithm tries any alternatives at the current point,
+and if they all fail, it backs up to the previous branch point in the tree, and
+tries the next alternative branch at that level. This often involves backing up
+(moving to the left) in the subject string as well. The order in which
+repetition branches are tried is controlled by the greedy or ungreedy nature of
+the quantifier.
+.P
+If a leaf node is reached, a matching string has been found, and at that point
+the algorithm stops. Thus, if there is more than one possible match, this
+algorithm returns the first one that it finds. Whether this is the shortest,
+the longest, or some intermediate length depends on the way the greedy and
+ungreedy repetition quantifiers are specified in the pattern.
+.P
+Because it ends up with a single path through the tree, it is relatively
+straightforward for this algorithm to keep track of the substrings that are
+matched by portions of the pattern in parentheses. This provides support for
+capturing parentheses and back references.
+.
+.SH "THE ALTERNATIVE MATCHING ALGORITHM"
+.rs
+.sp
+This algorithm conducts a breadth-first search of the tree. Starting from the
+first matching point in the subject, it scans the subject string from left to
+right, once, character by character, and as it does this, it remembers all the
+paths through the tree that represent valid matches. In Friedl's terminology,
+this is a kind of "DFA algorithm", though it is not implemented as a
+traditional finite state machine (it keeps multiple states active
+simultaneously).
+.P
+The scan continues until either the end of the subject is reached, or there are
+no more unterminated paths. At this point, terminated paths represent the
+different matching possibilities (if there are none, the match has failed).
+Thus, if there is more than one possible match, this algorithm finds all of
+them, and in particular, it finds the longest. In PCRE, there is an option to
+stop the algorithm after the first match (which is necessarily the shortest)
+has been found.
+.P
+Note that all the matches that are found start at the same point in the
+subject. If the pattern
+.sp
+ cat(er(pillar)?)
+.sp
+is matched against the string "the caterpillar catchment", the result will be
+the three strings "cat", "cater", and "caterpillar" that start at the fourth
+character of the subject. The algorithm does not automatically move on to find
+matches that start at later positions.
+.P
+There are a number of features of PCRE regular expressions that are not
+supported by the alternative matching algorithm. They are as follows:
+.P
+1. Because the algorithm finds all possible matches, the greedy or ungreedy
+nature of repetition quantifiers is not relevant. Greedy and ungreedy
+quantifiers are treated in exactly the same way. However, possessive
+quantifiers can make a difference when what follows could also match what is
+quantified, for example in a pattern like this:
+.sp
+ ^a++\ew!
+.sp
+This pattern matches "aaab!" but not "aaa!", which would be matched by a
+non-possessive quantifier. Similarly, if an atomic group is present, it is
+matched as if it were a standalone pattern at the current point, and the
+longest match is then "locked in" for the rest of the overall pattern.
+.P
+2. When dealing with multiple paths through the tree simultaneously, it is not
+straightforward to keep track of captured substrings for the different matching
+possibilities, and PCRE's implementation of this algorithm does not attempt to
+do this. This means that no captured substrings are available.
+.P
+3. Because no substrings are captured, back references within the pattern are
+not supported, and cause errors if encountered.
+.P
+4. For the same reason, conditional expressions that use a backreference as the
+condition or test for a specific group recursion are not supported.
+.P
+5. Because many paths through the tree may be active, the \eK escape sequence,
+which resets the start of the match when encountered (but may be on some paths
+and not on others), is not supported. It causes an error if encountered.
+.P
+6. Callouts are supported, but the value of the \fIcapture_top\fP field is
+always 1, and the value of the \fIcapture_last\fP field is always -1.
+.P
+7. The \eC escape sequence, which (in the standard algorithm) matches a single
+byte, even in UTF-8 mode, is not supported because the alternative algorithm
+moves through the subject string one character at a time, for all active paths
+through the tree.
+.P
+8. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not
+supported. (*FAIL) is supported, and behaves like a failing negative assertion.
+.
+.SH "ADVANTAGES OF THE ALTERNATIVE ALGORITHM"
+.rs
+.sp
+Using the alternative matching algorithm provides the following advantages:
+.P
+1. All possible matches (at a single point in the subject) are automatically
+found, and in particular, the longest match is found. To find more than one
+match using the standard algorithm, you have to do kludgy things with
+callouts.
+.P
+2. There is much better support for partial matching. The restrictions on the
+content of the pattern that apply when using the standard algorithm for partial
+matching do not apply to the alternative algorithm. For non-anchored patterns,
+the starting position of a partial match is available.
+.P
+3. Because the alternative algorithm scans the subject string just once, and
+never needs to backtrack, it is possible to pass very long subject strings to
+the matching function in several pieces, checking for partial matching each
+time.
+.
+.SH "DISADVANTAGES OF THE ALTERNATIVE ALGORITHM"
+.rs
+.sp
+The alternative algorithm suffers from a number of disadvantages:
+.P
+1. It is substantially slower than the standard algorithm. This is partly
+because it has to search for all possible matches, but is also because it is
+less susceptible to optimization.
+.P
+2. Capturing parentheses and back references are not supported.
+.P
+3. Although atomic groups are supported, their use does not provide the
+performance advantage that it does for the standard algorithm.
+.
+.
+.SH AUTHOR
+.rs
+.sp
+.nf
+Philip Hazel
+University Computing Service
+Cambridge CB2 3QH, England.
+.fi
+.
+.
+.SH REVISION
+.rs
+.sp
+.nf
+Last updated: 19 April 2008
+Copyright (c) 1997-2008 University of Cambridge.
+.fi
diff --git a/src/doc/pcrepartial.3 b/src/doc/pcrepartial.3
new file mode 100644
index 0000000..e418734
--- /dev/null
+++ b/src/doc/pcrepartial.3
@@ -0,0 +1,219 @@
+.TH PCREPARTIAL 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH "PARTIAL MATCHING IN PCRE"
+.rs
+.sp
+In normal use of PCRE, if the subject string that is passed to
+\fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP matches as far as it goes, but is
+too short to match the entire pattern, PCRE_ERROR_NOMATCH is returned. There
+are circumstances where it might be helpful to distinguish this case from other
+cases in which there is no match.
+.P
+Consider, for example, an application where a human is required to type in data
+for a field with specific formatting requirements. An example might be a date
+in the form \fIddmmmyy\fP, defined by this pattern:
+.sp
+ ^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$
+.sp
+If the application sees the user's keystrokes one by one, and can check that
+what has been typed so far is potentially valid, it is able to raise an error
+as soon as a mistake is made, possibly beeping and not reflecting the
+character that has been typed. This immediate feedback is likely to be a better
+user interface than a check that is delayed until the entire string has been
+entered.
+.P
+PCRE supports the concept of partial matching by means of the PCRE_PARTIAL
+option, which can be set when calling \fBpcre_exec()\fP or
+\fBpcre_dfa_exec()\fP. When this flag is set for \fBpcre_exec()\fP, the return
+code PCRE_ERROR_NOMATCH is converted into PCRE_ERROR_PARTIAL if at any time
+during the matching process the last part of the subject string matched part of
+the pattern. Unfortunately, for non-anchored matching, it is not possible to
+obtain the position of the start of the partial match. No captured data is set
+when PCRE_ERROR_PARTIAL is returned.
+.P
+When PCRE_PARTIAL is set for \fBpcre_dfa_exec()\fP, the return code
+PCRE_ERROR_NOMATCH is converted into PCRE_ERROR_PARTIAL if the end of the
+subject is reached, there have been no complete matches, but there is still at
+least one matching possibility. The portion of the string that provided the
+partial match is set as the first matching string.
+.P
+Using PCRE_PARTIAL disables one of PCRE's optimizations. PCRE remembers the
+last literal byte in a pattern, and abandons matching immediately if such a
+byte is not present in the subject string. This optimization cannot be used
+for a subject string that might match only partially.
+.
+.
+.SH "RESTRICTED PATTERNS FOR PCRE_PARTIAL"
+.rs
+.sp
+Because of the way certain internal optimizations are implemented in the
+\fBpcre_exec()\fP function, the PCRE_PARTIAL option cannot be used with all
+patterns. These restrictions do not apply when \fBpcre_dfa_exec()\fP is used.
+For \fBpcre_exec()\fP, repeated single characters such as
+.sp
+ a{2,4}
+.sp
+and repeated single metasequences such as
+.sp
+ \ed+
+.sp
+are not permitted if the maximum number of occurrences is greater than one.
+Optional items such as \ed? (where the maximum is one) are permitted.
+Quantifiers with any values are permitted after parentheses, so the invalid
+examples above can be coded thus:
+.sp
+ (a){2,4}
+ (\ed)+
+.sp
+These constructions run more slowly, but for the kinds of application that are
+envisaged for this facility, this is not felt to be a major restriction.
+.P
+If PCRE_PARTIAL is set for a pattern that does not conform to the restrictions,
+\fBpcre_exec()\fP returns the error code PCRE_ERROR_BADPARTIAL (-13).
+You can use the PCRE_INFO_OKPARTIAL call to \fBpcre_fullinfo()\fP to find out
+if a compiled pattern can be used for partial matching.
+.
+.
+.SH "EXAMPLE OF PARTIAL MATCHING USING PCRETEST"
+.rs
+.sp
+If the escape sequence \eP is present in a \fBpcretest\fP data line, the
+PCRE_PARTIAL flag is used for the match. Here is a run of \fBpcretest\fP that
+uses the date example quoted above:
+.sp
+ re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
+ data> 25jun04\eP
+ 0: 25jun04
+ 1: jun
+ data> 25dec3\eP
+ Partial match
+ data> 3ju\eP
+ Partial match
+ data> 3juj\eP
+ No match
+ data> j\eP
+ No match
+.sp
+The first data string is matched completely, so \fBpcretest\fP shows the
+matched substrings. The remaining four strings do not match the complete
+pattern, but the first two are partial matches. The same test, using
+\fBpcre_dfa_exec()\fP matching (by means of the \eD escape sequence), produces
+the following output:
+.sp
+ re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
+ data> 25jun04\eP\eD
+ 0: 25jun04
+ data> 23dec3\eP\eD
+ Partial match: 23dec3
+ data> 3ju\eP\eD
+ Partial match: 3ju
+ data> 3juj\eP\eD
+ No match
+ data> j\eP\eD
+ No match
+.sp
+Notice that in this case the portion of the string that was matched is made
+available.
+.
+.
+.SH "MULTI-SEGMENT MATCHING WITH pcre_dfa_exec()"
+.rs
+.sp
+When a partial match has been found using \fBpcre_dfa_exec()\fP, it is possible
+to continue the match by providing additional subject data and calling
+\fBpcre_dfa_exec()\fP again with the same compiled regular expression, this
+time setting the PCRE_DFA_RESTART option. You must also pass the same working
+space as before, because this is where details of the previous partial match
+are stored. Here is an example using \fBpcretest\fP, using the \eR escape
+sequence to set the PCRE_DFA_RESTART option (\eP and \eD are as above):
+.sp
+ re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
+ data> 23ja\eP\eD
+ Partial match: 23ja
+ data> n05\eR\eD
+ 0: n05
+.sp
+The first call has "23ja" as the subject, and requests partial matching; the
+second call has "n05" as the subject for the continued (restarted) match.
+Notice that when the match is complete, only the last part is shown; PCRE does
+not retain the previously partially-matched string. It is up to the calling
+program to do that if it needs to.
+.P
+You can set PCRE_PARTIAL with PCRE_DFA_RESTART to continue partial matching
+over multiple segments. This facility can be used to pass very long subject
+strings to \fBpcre_dfa_exec()\fP. However, some care is needed for certain
+types of pattern.
+.P
+1. If the pattern contains tests for the beginning or end of a line, you need
+to pass the PCRE_NOTBOL or PCRE_NOTEOL options, as appropriate, when the
+subject string for any call does not contain the beginning or end of a line.
+.P
+2. If the pattern contains backward assertions (including \eb or \eB), you need
+to arrange for some overlap in the subject strings to allow for this. For
+example, you could pass the subject in chunks that are 500 bytes long, but in
+a buffer of 700 bytes, with the starting offset set to 200 and the previous 200
+bytes at the start of the buffer.
+.P
+3. Matching a subject string that is split into multiple segments does not
+always produce exactly the same result as matching over one single long string.
+The difference arises when there are multiple matching possibilities, because a
+partial match result is given only when there are no completed matches in a
+call to \fBpcre_dfa_exec()\fP. This means that as soon as the shortest match has
+been found, continuation to a new subject segment is no longer possible.
+Consider this \fBpcretest\fP example:
+.sp
+ re> /dog(sbody)?/
+ data> do\eP\eD
+ Partial match: do
+ data> gsb\eR\eP\eD
+ 0: g
+ data> dogsbody\eD
+ 0: dogsbody
+ 1: dog
+.sp
+The pattern matches the words "dog" or "dogsbody". When the subject is
+presented in several parts ("do" and "gsb" being the first two) the match stops
+when "dog" has been found, and it is not possible to continue. On the other
+hand, if "dogsbody" is presented as a single string, both matches are found.
+.P
+Because of this phenomenon, it does not usually make sense to end a pattern
+that is going to be matched in this way with a variable repeat.
+.P
+4. Patterns that contain alternatives at the top level which do not all
+start with the same pattern item may not work as expected. For example,
+consider this pattern:
+.sp
+ 1234|3789
+.sp
+If the first part of the subject is "ABC123", a partial match of the first
+alternative is found at offset 3. There is no partial match for the second
+alternative, because such a match does not start at the same point in the
+subject string. Attempting to continue with the string "789" does not yield a
+match because only those alternatives that match at one point in the subject
+are remembered. The problem arises because the start of the second alternative
+matches within the first alternative. There is no problem with anchored
+patterns or patterns such as:
+.sp
+ 1234|ABCD
+.sp
+where no string can be a partial match for both alternatives.
+.
+.
+.SH AUTHOR
+.rs
+.sp
+.nf
+Philip Hazel
+University Computing Service
+Cambridge CB2 3QH, England.
+.fi
+.
+.
+.SH REVISION
+.rs
+.sp
+.nf
+Last updated: 04 June 2007
+Copyright (c) 1997-2007 University of Cambridge.
+.fi
diff --git a/src/doc/pcrepattern.3 b/src/doc/pcrepattern.3
new file mode 100644
index 0000000..19895b3
--- /dev/null
+++ b/src/doc/pcrepattern.3
@@ -0,0 +1,2245 @@
+.TH PCREPATTERN 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH "PCRE REGULAR EXPRESSION DETAILS"
+.rs
+.sp
+The syntax and semantics of the regular expressions that are supported by PCRE
+are described in detail below. There is a quick-reference syntax summary in the
+.\" HREF
+\fBpcresyntax\fP
+.\"
+page. PCRE tries to match Perl syntax and semantics as closely as it can. PCRE
+also supports some alternative regular expression syntax (which does not
+conflict with the Perl syntax) in order to provide some compatibility with
+regular expressions in Python, .NET, and Oniguruma.
+.P
+Perl's regular expressions are described in its own documentation, and
+regular expressions in general are covered in a number of books, some of which
+have copious examples. Jeffrey Friedl's "Mastering Regular Expressions",
+published by O'Reilly, covers regular expressions in great detail. This
+description of PCRE's regular expressions is intended as reference material.
+.P
+The original operation of PCRE was on strings of one-byte characters. However,
+there is now also support for UTF-8 character strings. To use this, you must
+build PCRE to include UTF-8 support, and then call \fBpcre_compile()\fP with
+the PCRE_UTF8 option. How this affects pattern matching is mentioned in several
+places below. There is also a summary of UTF-8 features in the
+.\" HTML
+.\"
+section on UTF-8 support
+.\"
+in the main
+.\" HREF
+\fBpcre\fP
+.\"
+page.
+.P
+The remainder of this document discusses the patterns that are supported by
+PCRE when its main matching function, \fBpcre_exec()\fP, is used.
+From release 6.0, PCRE offers a second matching function,
+\fBpcre_dfa_exec()\fP, which matches using a different algorithm that is not
+Perl-compatible. Some of the features discussed below are not available when
+\fBpcre_dfa_exec()\fP is used. The advantages and disadvantages of the
+alternative function, and how it differs from the normal function, are
+discussed in the
+.\" HREF
+\fBpcrematching\fP
+.\"
+page.
+.
+.
+.SH "NEWLINE CONVENTIONS"
+.rs
+.sp
+PCRE supports five different conventions for indicating line breaks in
+strings: a single CR (carriage return) character, a single LF (linefeed)
+character, the two-character sequence CRLF, any of the three preceding, or any
+Unicode newline sequence. The
+.\" HREF
+\fBpcreapi\fP
+.\"
+page has
+.\" HTML
+.\"
+further discussion
+.\"
+about newlines, and shows how to set the newline convention in the
+\fIoptions\fP arguments for the compiling and matching functions.
+.P
+It is also possible to specify a newline convention by starting a pattern
+string with one of the following five sequences:
+.sp
+ (*CR) carriage return
+ (*LF) linefeed
+ (*CRLF) carriage return, followed by linefeed
+ (*ANYCRLF) any of the three above
+ (*ANY) all Unicode newline sequences
+.sp
+These override the default and the options given to \fBpcre_compile()\fP. For
+example, on a Unix system where LF is the default newline sequence, the pattern
+.sp
+ (*CR)a.b
+.sp
+changes the convention to CR. That pattern matches "a\enb" because LF is no
+longer a newline. Note that these special settings, which are not
+Perl-compatible, are recognized only at the very start of a pattern, and that
+they must be in upper case. If more than one of them is present, the last one
+is used.
+.P
+The newline convention does not affect what the \eR escape sequence matches. By
+default, this is any Unicode newline sequence, for Perl compatibility. However,
+this can be changed; see the description of \eR in the section entitled
+.\" HTML
+.\"
+"Newline sequences"
+.\"
+below. A change of \eR setting can be combined with a change of newline
+convention.
+.
+.
+.SH "CHARACTERS AND METACHARACTERS"
+.rs
+.sp
+A regular expression is a pattern that is matched against a subject string from
+left to right. Most characters stand for themselves in a pattern, and match the
+corresponding characters in the subject. As a trivial example, the pattern
+.sp
+ The quick brown fox
+.sp
+matches a portion of a subject string that is identical to itself. When
+caseless matching is specified (the PCRE_CASELESS option), letters are matched
+independently of case. In UTF-8 mode, PCRE always understands the concept of
+case for characters whose values are less than 128, so caseless matching is
+always possible. For characters with higher values, the concept of case is
+supported if PCRE is compiled with Unicode property support, but not otherwise.
+If you want to use caseless matching for characters 128 and above, you must
+ensure that PCRE is compiled with Unicode property support as well as with
+UTF-8 support.
+.P
+The power of regular expressions comes from the ability to include alternatives
+and repetitions in the pattern. These are encoded in the pattern by the use of
+\fImetacharacters\fP, which do not stand for themselves but instead are
+interpreted in some special way.
+.P
+There are two different sets of metacharacters: those that are recognized
+anywhere in the pattern except within square brackets, and those that are
+recognized within square brackets. Outside square brackets, the metacharacters
+are as follows:
+.sp
+ \e general escape character with several uses
+ ^ assert start of string (or line, in multiline mode)
+ $ assert end of string (or line, in multiline mode)
+ . match any character except newline (by default)
+ [ start character class definition
+ | start of alternative branch
+ ( start subpattern
+ ) end subpattern
+ ? extends the meaning of (
+ also 0 or 1 quantifier
+ also quantifier minimizer
+ * 0 or more quantifier
+ + 1 or more quantifier
+ also "possessive quantifier"
+ { start min/max quantifier
+.sp
+Part of a pattern that is in square brackets is called a "character class". In
+a character class the only metacharacters are:
+.sp
+ \e general escape character
+ ^ negate the class, but only if the first character
+ - indicates character range
+.\" JOIN
+ [ POSIX character class (only if followed by POSIX
+ syntax)
+ ] terminates the character class
+.sp
+The following sections describe the use of each of the metacharacters.
+.
+.
+.SH BACKSLASH
+.rs
+.sp
+The backslash character has several uses. Firstly, if it is followed by a
+non-alphanumeric character, it takes away any special meaning that character
+may have. This use of backslash as an escape character applies both inside and
+outside character classes.
+.P
+For example, if you want to match a * character, you write \e* in the pattern.
+This escaping action applies whether or not the following character would
+otherwise be interpreted as a metacharacter, so it is always safe to precede a
+non-alphanumeric with backslash to specify that it stands for itself. In
+particular, if you want to match a backslash, you write \e\e.
+.P
+If a pattern is compiled with the PCRE_EXTENDED option, whitespace in the
+pattern (other than in a character class) and characters between a # outside
+a character class and the next newline are ignored. An escaping backslash can
+be used to include a whitespace or # character as part of the pattern.
+.P
+If you want to remove the special meaning from a sequence of characters, you
+can do so by putting them between \eQ and \eE. This is different from Perl in
+that $ and @ are handled as literals in \eQ...\eE sequences in PCRE, whereas in
+Perl, $ and @ cause variable interpolation. Note the following examples:
+.sp
+ Pattern PCRE matches Perl matches
+.sp
+.\" JOIN
+ \eQabc$xyz\eE abc$xyz abc followed by the
+ contents of $xyz
+ \eQabc\e$xyz\eE abc\e$xyz abc\e$xyz
+ \eQabc\eE\e$\eQxyz\eE abc$xyz abc$xyz
+.sp
+The \eQ...\eE sequence is recognized both inside and outside character classes.
+.
+.
+.\" HTML
+.SS "Non-printing characters"
+.rs
+.sp
+A second use of backslash provides a way of encoding non-printing characters
+in patterns in a visible manner. There is no restriction on the appearance of
+non-printing characters, apart from the binary zero that terminates a pattern,
+but when a pattern is being prepared by text editing, it is usually easier to
+use one of the following escape sequences than the binary character it
+represents:
+.sp
+ \ea alarm, that is, the BEL character (hex 07)
+ \ecx "control-x", where x is any character
+ \ee escape (hex 1B)
+ \ef formfeed (hex 0C)
+ \en linefeed (hex 0A)
+ \er carriage return (hex 0D)
+ \et tab (hex 09)
+ \eddd character with octal code ddd, or backreference
+ \exhh character with hex code hh
+ \ex{hhh..} character with hex code hhh..
+.sp
+The precise effect of \ecx is as follows: if x is a lower case letter, it
+is converted to upper case. Then bit 6 of the character (hex 40) is inverted.
+Thus \ecz becomes hex 1A, but \ec{ becomes hex 3B, while \ec; becomes hex
+7B.
+.P
+After \ex, from zero to two hexadecimal digits are read (letters can be in
+upper or lower case). Any number of hexadecimal digits may appear between \ex{
+and }, but the value of the character code must be less than 256 in non-UTF-8
+mode, and less than 2**31 in UTF-8 mode. That is, the maximum value in
+hexadecimal is 7FFFFFFF. Note that this is bigger than the largest Unicode code
+point, which is 10FFFF.
+.P
+If characters other than hexadecimal digits appear between \ex{ and }, or if
+there is no terminating }, this form of escape is not recognized. Instead, the
+initial \ex will be interpreted as a basic hexadecimal escape, with no
+following digits, giving a character whose value is zero.
+.P
+Characters whose value is less than 256 can be defined by either of the two
+syntaxes for \ex. There is no difference in the way they are handled. For
+example, \exdc is exactly the same as \ex{dc}.
+.P
+After \e0 up to two further octal digits are read. If there are fewer than two
+digits, just those that are present are used. Thus the sequence \e0\ex\e07
+specifies two binary zeros followed by a BEL character (code value 7). Make
+sure you supply two digits after the initial zero if the pattern character that
+follows is itself an octal digit.
+.P
+The handling of a backslash followed by a digit other than 0 is complicated.
+Outside a character class, PCRE reads it and any following digits as a decimal
+number. If the number is less than 10, or if there have been at least that many
+previous capturing left parentheses in the expression, the entire sequence is
+taken as a \fIback reference\fP. A description of how this works is given
+.\" HTML
+.\"
+later,
+.\"
+following the discussion of
+.\" HTML
+.\"
+parenthesized subpatterns.
+.\"
+.P
+Inside a character class, or if the decimal number is greater than 9 and there
+have not been that many capturing subpatterns, PCRE re-reads up to three octal
+digits following the backslash, and uses them to generate a data character. Any
+subsequent digits stand for themselves. In non-UTF-8 mode, the value of a
+character specified in octal must be less than \e400. In UTF-8 mode, values up
+to \e777 are permitted. For example:
+.sp
+ \e040 is another way of writing a space
+.\" JOIN
+ \e40 is the same, provided there are fewer than 40
+ previous capturing subpatterns
+ \e7 is always a back reference
+.\" JOIN
+ \e11 might be a back reference, or another way of
+ writing a tab
+ \e011 is always a tab
+ \e0113 is a tab followed by the character "3"
+.\" JOIN
+ \e113 might be a back reference, otherwise the
+ character with octal code 113
+.\" JOIN
+ \e377 might be a back reference, otherwise
+ the byte consisting entirely of 1 bits
+.\" JOIN
+ \e81 is either a back reference, or a binary zero
+ followed by the two characters "8" and "1"
+.sp
+Note that octal values of 100 or greater must not be introduced by a leading
+zero, because no more than three octal digits are ever read.
+.P
+All the sequences that define a single character value can be used both inside
+and outside character classes. In addition, inside a character class, the
+sequence \eb is interpreted as the backspace character (hex 08), and the
+sequences \eR and \eX are interpreted as the characters "R" and "X",
+respectively. Outside a character class, these sequences have different
+meanings
+.\" HTML
+.\"
+(see below).
+.\"
+.
+.
+.SS "Absolute and relative back references"
+.rs
+.sp
+The sequence \eg followed by an unsigned or a negative number, optionally
+enclosed in braces, is an absolute or relative back reference. A named back
+reference can be coded as \eg{name}. Back references are discussed
+.\" HTML
+.\"
+later,
+.\"
+following the discussion of
+.\" HTML
+.\"
+parenthesized subpatterns.
+.\"
+.
+.
+.SS "Absolute and relative subroutine calls"
+.rs
+.sp
+For compatibility with Oniguruma, the non-Perl syntax \eg followed by a name or
+a number enclosed either in angle brackets or single quotes, is an alternative
+syntax for referencing a subpattern as a "subroutine". Details are discussed
+.\" HTML
+.\"
+later.
+.\"
+Note that \eg{...} (Perl syntax) and \eg<...> (Oniguruma syntax) are \fInot\fP
+synonymous. The former is a back reference; the latter is a subroutine call.
+.
+.
+.SS "Generic character types"
+.rs
+.sp
+Another use of backslash is for specifying generic character types. The
+following are always recognized:
+.sp
+ \ed any decimal digit
+ \eD any character that is not a decimal digit
+ \eh any horizontal whitespace character
+ \eH any character that is not a horizontal whitespace character
+ \es any whitespace character
+ \eS any character that is not a whitespace character
+ \ev any vertical whitespace character
+ \eV any character that is not a vertical whitespace character
+ \ew any "word" character
+ \eW any "non-word" character
+.sp
+Each pair of escape sequences partitions the complete set of characters into
+two disjoint sets. Any given character matches one, and only one, of each pair.
+.P
+These character type sequences can appear both inside and outside character
+classes. They each match one character of the appropriate type. If the current
+matching point is at the end of the subject string, all of them fail, since
+there is no character to match.
+.P
+For compatibility with Perl, \es does not match the VT character (code 11).
+This makes it different from the POSIX "space" class. The \es characters
+are HT (9), LF (10), FF (12), CR (13), and space (32). If "use locale;" is
+included in a Perl script, \es may match the VT character. In PCRE, it never
+does.
+.P
+In UTF-8 mode, characters with values greater than 127 never match \ed, \es, or
+\ew, and always match \eD, \eS, and \eW. This is true even when Unicode
+character property support is available. These sequences retain their original
+meanings from before UTF-8 support was available, mainly for efficiency
+reasons.
+.P
+The sequences \eh, \eH, \ev, and \eV are Perl 5.10 features. In contrast to the
+other sequences, these do match certain high-valued codepoints in UTF-8 mode.
+The horizontal space characters are:
+.sp
+ U+0009 Horizontal tab
+ U+0020 Space
+ U+00A0 Non-break space
+ U+1680 Ogham space mark
+ U+180E Mongolian vowel separator
+ U+2000 En quad
+ U+2001 Em quad
+ U+2002 En space
+ U+2003 Em space
+ U+2004 Three-per-em space
+ U+2005 Four-per-em space
+ U+2006 Six-per-em space
+ U+2007 Figure space
+ U+2008 Punctuation space
+ U+2009 Thin space
+ U+200A Hair space
+ U+202F Narrow no-break space
+ U+205F Medium mathematical space
+ U+3000 Ideographic space
+.sp
+The vertical space characters are:
+.sp
+ U+000A Linefeed
+ U+000B Vertical tab
+ U+000C Formfeed
+ U+000D Carriage return
+ U+0085 Next line
+ U+2028 Line separator
+ U+2029 Paragraph separator
+.P
+A "word" character is an underscore or any character less than 256 that is a
+letter or digit. The definition of letters and digits is controlled by PCRE's
+low-valued character tables, and may vary if locale-specific matching is taking
+place (see
+.\" HTML
+.\"
+"Locale support"
+.\"
+in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page). For example, in a French locale such as "fr_FR" in Unix-like systems,
+or "french" in Windows, some character codes greater than 128 are used for
+accented letters, and these are matched by \ew. The use of locales with Unicode
+is discouraged.
+.
+.
+.\" HTML
+.SS "Newline sequences"
+.rs
+.sp
+Outside a character class, by default, the escape sequence \eR matches any
+Unicode newline sequence. This is a Perl 5.10 feature. In non-UTF-8 mode \eR is
+equivalent to the following:
+.sp
+ (?>\er\en|\en|\ex0b|\ef|\er|\ex85)
+.sp
+This is an example of an "atomic group", details of which are given
+.\" HTML
+.\"
+below.
+.\"
+This particular group matches either the two-character sequence CR followed by
+LF, or one of the single characters LF (linefeed, U+000A), VT (vertical tab,
+U+000B), FF (formfeed, U+000C), CR (carriage return, U+000D), or NEL (next
+line, U+0085). The two-character sequence is treated as a single unit that
+cannot be split.
+.P
+In UTF-8 mode, two additional characters whose codepoints are greater than 255
+are added: LS (line separator, U+2028) and PS (paragraph separator, U+2029).
+Unicode character property support is not needed for these characters to be
+recognized.
+.P
+It is possible to restrict \eR to match only CR, LF, or CRLF (instead of the
+complete set of Unicode line endings) by setting the option PCRE_BSR_ANYCRLF
+either at compile time or when the pattern is matched. (BSR is an abbreviation
+for "backslash R".) This can be made the default when PCRE is built; if this is
+the case, the other behaviour can be requested via the PCRE_BSR_UNICODE option.
+It is also possible to specify these settings by starting a pattern string with
+one of the following sequences:
+.sp
+ (*BSR_ANYCRLF) CR, LF, or CRLF only
+ (*BSR_UNICODE) any Unicode newline sequence
+.sp
+These override the default and the options given to \fBpcre_compile()\fP, but
+they can be overridden by options given to \fBpcre_exec()\fP. Note that these
+special settings, which are not Perl-compatible, are recognized only at the
+very start of a pattern, and that they must be in upper case. If more than one
+of them is present, the last one is used. They can be combined with a change of
+newline convention, for example, a pattern can start with:
+.sp
+ (*ANY)(*BSR_ANYCRLF)
+.sp
+Inside a character class, \eR matches the letter "R".
+.
+.
+.\" HTML
+.SS "Unicode character properties"
+.rs
+.sp
+When PCRE is built with Unicode character property support, three additional
+escape sequences that match characters with specific properties are available.
+When not in UTF-8 mode, these sequences are of course limited to testing
+characters whose codepoints are less than 256, but they do work in this mode.
+The extra escape sequences are:
+.sp
+ \ep{\fIxx\fP} a character with the \fIxx\fP property
+ \eP{\fIxx\fP} a character without the \fIxx\fP property
+ \eX an extended Unicode sequence
+.sp
+The property names represented by \fIxx\fP above are limited to the Unicode
+script names, the general category properties, and "Any", which matches any
+character (including newline). Other properties such as "InMusicalSymbols" are
+not currently supported by PCRE. Note that \eP{Any} does not match any
+characters, so always causes a match failure.
+.P
+Sets of Unicode characters are defined as belonging to certain scripts. A
+character from one of these sets can be matched using a script name. For
+example:
+.sp
+ \ep{Greek}
+ \eP{Han}
+.sp
+Those that are not part of an identified script are lumped together as
+"Common". The current list of scripts is:
+.P
+Arabic,
+Armenian,
+Balinese,
+Bengali,
+Bopomofo,
+Braille,
+Buginese,
+Buhid,
+Canadian_Aboriginal,
+Cherokee,
+Common,
+Coptic,
+Cuneiform,
+Cypriot,
+Cyrillic,
+Deseret,
+Devanagari,
+Ethiopic,
+Georgian,
+Glagolitic,
+Gothic,
+Greek,
+Gujarati,
+Gurmukhi,
+Han,
+Hangul,
+Hanunoo,
+Hebrew,
+Hiragana,
+Inherited,
+Kannada,
+Katakana,
+Kharoshthi,
+Khmer,
+Lao,
+Latin,
+Limbu,
+Linear_B,
+Malayalam,
+Mongolian,
+Myanmar,
+New_Tai_Lue,
+Nko,
+Ogham,
+Old_Italic,
+Old_Persian,
+Oriya,
+Osmanya,
+Phags_Pa,
+Phoenician,
+Runic,
+Shavian,
+Sinhala,
+Syloti_Nagri,
+Syriac,
+Tagalog,
+Tagbanwa,
+Tai_Le,
+Tamil,
+Telugu,
+Thaana,
+Thai,
+Tibetan,
+Tifinagh,
+Ugaritic,
+Yi.
+.P
+Each character has exactly one general category property, specified by a
+two-letter abbreviation. For compatibility with Perl, negation can be specified
+by including a circumflex between the opening brace and the property name. For
+example, \ep{^Lu} is the same as \eP{Lu}.
+.P
+If only one letter is specified with \ep or \eP, it includes all the general
+category properties that start with that letter. In this case, in the absence
+of negation, the curly brackets in the escape sequence are optional; these two
+examples have the same effect:
+.sp
+ \ep{L}
+ \epL
+.sp
+The following general category property codes are supported:
+.sp
+ C Other
+ Cc Control
+ Cf Format
+ Cn Unassigned
+ Co Private use
+ Cs Surrogate
+.sp
+ L Letter
+ Ll Lower case letter
+ Lm Modifier letter
+ Lo Other letter
+ Lt Title case letter
+ Lu Upper case letter
+.sp
+ M Mark
+ Mc Spacing mark
+ Me Enclosing mark
+ Mn Non-spacing mark
+.sp
+ N Number
+ Nd Decimal number
+ Nl Letter number
+ No Other number
+.sp
+ P Punctuation
+ Pc Connector punctuation
+ Pd Dash punctuation
+ Pe Close punctuation
+ Pf Final punctuation
+ Pi Initial punctuation
+ Po Other punctuation
+ Ps Open punctuation
+.sp
+ S Symbol
+ Sc Currency symbol
+ Sk Modifier symbol
+ Sm Mathematical symbol
+ So Other symbol
+.sp
+ Z Separator
+ Zl Line separator
+ Zp Paragraph separator
+ Zs Space separator
+.sp
+The special property L& is also supported: it matches a character that has
+the Lu, Ll, or Lt property, in other words, a letter that is not classified as
+a modifier or "other".
+.P
+The Cs (Surrogate) property applies only to characters in the range U+D800 to
+U+DFFF. Such characters are not valid in UTF-8 strings (see RFC 3629) and so
+cannot be tested by PCRE, unless UTF-8 validity checking has been turned off
+(see the discussion of PCRE_NO_UTF8_CHECK in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page).
+.P
+The long synonyms for these properties that Perl supports (such as \ep{Letter})
+are not supported by PCRE, nor is it permitted to prefix any of these
+properties with "Is".
+.P
+No character that is in the Unicode table has the Cn (unassigned) property.
+Instead, this property is assumed for any code point that is not in the
+Unicode table.
+.P
+Specifying caseless matching does not affect these escape sequences. For
+example, \ep{Lu} always matches only upper case letters.
+.P
+The \eX escape matches any number of Unicode characters that form an extended
+Unicode sequence. \eX is equivalent to
+.sp
+ (?>\ePM\epM*)
+.sp
+That is, it matches a character without the "mark" property, followed by zero
+or more characters with the "mark" property, and treats the sequence as an
+atomic group
+.\" HTML
+.\"
+(see below).
+.\"
+Characters with the "mark" property are typically accents that affect the
+preceding character. None of them have codepoints less than 256, so in
+non-UTF-8 mode \eX matches any one character.
+.P
+Matching characters by Unicode property is not fast, because PCRE has to search
+a structure that contains data for over fifteen thousand characters. That is
+why the traditional escape sequences such as \ed and \ew do not use Unicode
+properties in PCRE.
+.
+.
+.\" HTML
+.SS "Resetting the match start"
+.rs
+.sp
+The escape sequence \eK, which is a Perl 5.10 feature, causes any previously
+matched characters not to be included in the final matched sequence. For
+example, the pattern:
+.sp
+ foo\eKbar
+.sp
+matches "foobar", but reports that it has matched "bar". This feature is
+similar to a lookbehind assertion
+.\" HTML
+.\"
+(described below).
+.\"
+However, in this case, the part of the subject before the real match does not
+have to be of fixed length, as lookbehind assertions do. The use of \eK does
+not interfere with the setting of
+.\" HTML
+.\"
+captured substrings.
+.\"
+For example, when the pattern
+.sp
+ (foo)\eKbar
+.sp
+matches "foobar", the first substring is still set to "foo".
+.
+.
+.\" HTML
+.SS "Simple assertions"
+.rs
+.sp
+The final use of backslash is for certain simple assertions. An assertion
+specifies a condition that has to be met at a particular point in a match,
+without consuming any characters from the subject string. The use of
+subpatterns for more complicated assertions is described
+.\" HTML
+.\"
+below.
+.\"
+The backslashed assertions are:
+.sp
+ \eb     matches at a word boundary
+ \eB     matches when not at a word boundary
+ \eA     matches at the start of the subject
+ \eZ     matches at the end of the subject
+         also matches before a newline at the end of the subject
+ \ez     matches only at the end of the subject
+ \eG     matches at the first matching position in the subject
+.sp
+These assertions may not appear in character classes (but note that \eb has a
+different meaning, namely the backspace character, inside a character class).
+.P
+A word boundary is a position in the subject string where the current character
+and the previous character do not both match \ew or \eW (i.e. one matches
+\ew and the other matches \eW), or the start or end of the string if the
+first or last character matches \ew, respectively.
+.P
+The \eA, \eZ, and \ez assertions differ from the traditional circumflex and
+dollar (described in the next section) in that they only ever match at the very
+start and end of the subject string, whatever options are set. Thus, they are
+independent of multiline mode. These three assertions are not affected by the
+PCRE_NOTBOL or PCRE_NOTEOL options, which affect only the behaviour of the
+circumflex and dollar metacharacters. However, if the \fIstartoffset\fP
+argument of \fBpcre_exec()\fP is non-zero, indicating that matching is to start
+at a point other than the beginning of the subject, \eA can never match. The
+difference between \eZ and \ez is that \eZ matches before a newline at the end
+of the string as well as at the very end, whereas \ez matches only at the end.
+.P
+The \eG assertion is true only when the current matching position is at the
+start point of the match, as specified by the \fIstartoffset\fP argument of
+\fBpcre_exec()\fP. It differs from \eA when the value of \fIstartoffset\fP is
+non-zero. By calling \fBpcre_exec()\fP multiple times with appropriate
+arguments, you can mimic Perl's /g option, and it is in this kind of
+implementation where \eG can be useful.
+.P
+Note, however, that PCRE's interpretation of \eG, as the start of the current
+match, is subtly different from Perl's, which defines it as the end of the
+previous match. In Perl, these can be different when the previously matched
+string was empty. Because PCRE does just one match at a time, it cannot
+reproduce this behaviour.
+.P
+If all the alternatives of a pattern begin with \eG, the expression is anchored
+to the starting match position, and the "anchored" flag is set in the compiled
+regular expression.
+.
+.
+.SH "CIRCUMFLEX AND DOLLAR"
+.rs
+.sp
+Outside a character class, in the default matching mode, the circumflex
+character is an assertion that is true only if the current matching point is
+at the start of the subject string. If the \fIstartoffset\fP argument of
+\fBpcre_exec()\fP is non-zero, circumflex can never match if the PCRE_MULTILINE
+option is unset. Inside a character class, circumflex has an entirely different
+meaning
+.\" HTML
+.\"
+(see below).
+.\"
+.P
+Circumflex need not be the first character of the pattern if a number of
+alternatives are involved, but it should be the first thing in each alternative
+in which it appears if the pattern is ever to match that branch. If all
+possible alternatives start with a circumflex, that is, if the pattern is
+constrained to match only at the start of the subject, it is said to be an
+"anchored" pattern. (There are also other constructs that can cause a pattern
+to be anchored.)
+.P
+A dollar character is an assertion that is true only if the current matching
+point is at the end of the subject string, or immediately before a newline
+at the end of the string (by default). Dollar need not be the last character of
+the pattern if a number of alternatives are involved, but it should be the last
+item in any branch in which it appears. Dollar has no special meaning in a
+character class.
+.P
+The meaning of dollar can be changed so that it matches only at the very end of
+the string, by setting the PCRE_DOLLAR_ENDONLY option at compile time. This
+does not affect the \eZ assertion.
+.P
+The meanings of the circumflex and dollar characters are changed if the
+PCRE_MULTILINE option is set. When this is the case, a circumflex matches
+immediately after internal newlines as well as at the start of the subject
+string. It does not match after a newline that ends the string. A dollar
+matches before any newlines in the string, as well as at the very end, when
+PCRE_MULTILINE is set. When newline is specified as the two-character
+sequence CRLF, isolated CR and LF characters do not indicate newlines.
+.P
+For example, the pattern /^abc$/ matches the subject string "def\enabc" (where
+\en represents a newline) in multiline mode, but not otherwise. Consequently,
+patterns that are anchored in single line mode because all branches start with
+^ are not anchored in multiline mode, and a match for circumflex is possible
+when the \fIstartoffset\fP argument of \fBpcre_exec()\fP is non-zero. The
+PCRE_DOLLAR_ENDONLY option is ignored if PCRE_MULTILINE is set.
+.P
+Note that the sequences \eA, \eZ, and \ez can be used to match the start and
+end of the subject in both modes, and if all branches of a pattern start with
+\eA it is always anchored, whether or not PCRE_MULTILINE is set.
+.
+.
+.SH "FULL STOP (PERIOD, DOT)"
+.rs
+.sp
+Outside a character class, a dot in the pattern matches any one character in
+the subject string except (by default) a character that signifies the end of a
+line. In UTF-8 mode, the matched character may be more than one byte long.
+.P
+When a line ending is defined as a single character, dot never matches that
+character; when the two-character sequence CRLF is used, dot does not match CR
+if it is immediately followed by LF, but otherwise it matches all characters
+(including isolated CRs and LFs). When any Unicode line endings are being
+recognized, dot does not match CR or LF or any of the other line ending
+characters.
+.P
+The behaviour of dot with regard to newlines can be changed. If the PCRE_DOTALL
+option is set, a dot matches any one character, without exception. If the
+two-character sequence CRLF is present in the subject string, it takes two dots
+to match it.
+.P
+The handling of dot is entirely independent of the handling of circumflex and
+dollar, the only relationship being that they both involve newlines. Dot has no
+special meaning in a character class.
+.
+.
+.SH "MATCHING A SINGLE BYTE"
+.rs
+.sp
+Outside a character class, the escape sequence \eC matches any one byte, both
+in and out of UTF-8 mode. Unlike a dot, it always matches any line-ending
+characters. The feature is provided in Perl in order to match individual bytes
+in UTF-8 mode. Because it breaks up UTF-8 characters into individual bytes,
+what remains in the string may be a malformed UTF-8 string. For this reason,
+the \eC escape sequence is best avoided.
+.P
+PCRE does not allow \eC to appear in lookbehind assertions
+.\" HTML
+.\"
+(described below),
+.\"
+because in UTF-8 mode this would make it impossible to calculate the length of
+the lookbehind.
+.
+.
+.\" HTML
+.SH "SQUARE BRACKETS AND CHARACTER CLASSES"
+.rs
+.sp
+An opening square bracket introduces a character class, terminated by a closing
+square bracket. A closing square bracket on its own is not special. If a
+closing square bracket is required as a member of the class, it should be the
+first data character in the class (after an initial circumflex, if present) or
+escaped with a backslash.
+.P
+A character class matches a single character in the subject. In UTF-8 mode, the
+character may occupy more than one byte. A matched character must be in the set
+of characters defined by the class, unless the first character in the class
+definition is a circumflex, in which case the subject character must not be in
+the set defined by the class. If a circumflex is actually required as a member
+of the class, ensure it is not the first character, or escape it with a
+backslash.
+.P
+For example, the character class [aeiou] matches any lower case vowel, while
+[^aeiou] matches any character that is not a lower case vowel. Note that a
+circumflex is just a convenient notation for specifying the characters that
+are in the class by enumerating those that are not. A class that starts with a
+circumflex is not an assertion: it still consumes a character from the subject
+string, and therefore it fails if the current pointer is at the end of the
+string.
+.P
+In UTF-8 mode, characters with values greater than 255 can be included in a
+class as a literal string of bytes, or by using the \ex{ escaping mechanism.
+.P
+When caseless matching is set, any letters in a class represent both their
+upper case and lower case versions, so for example, a caseless [aeiou] matches
+"A" as well as "a", and a caseless [^aeiou] does not match "A", whereas a
+caseful version would. In UTF-8 mode, PCRE always understands the concept of
+case for characters whose values are less than 128, so caseless matching is
+always possible. For characters with higher values, the concept of case is
+supported if PCRE is compiled with Unicode property support, but not otherwise.
+If you want to use caseless matching for characters 128 and above, you must
+ensure that PCRE is compiled with Unicode property support as well as with
+UTF-8 support.
+.P
+Characters that might indicate line breaks are never treated in any special way
+when matching character classes, whatever line-ending sequence is in use, and
+whatever setting of the PCRE_DOTALL and PCRE_MULTILINE options is used. A class
+such as [^a] always matches one of these characters.
+.P
+The minus (hyphen) character can be used to specify a range of characters in a
+character class. For example, [d-m] matches any letter between d and m,
+inclusive. If a minus character is required in a class, it must be escaped with
+a backslash or appear in a position where it cannot be interpreted as
+indicating a range, typically as the first or last character in the class.
+.P
+It is not possible to have the literal character "]" as the end character of a
+range. A pattern such as [W-]46] is interpreted as a class of two characters
+("W" and "-") followed by a literal string "46]", so it would match "W46]" or
+"-46]". However, if the "]" is escaped with a backslash it is interpreted as
+the end of range, so [W-\e]46] is interpreted as a class containing a range
+followed by two other characters. The octal or hexadecimal representation of
+"]" can also be used to end a range.
+.P
+Ranges operate in the collating sequence of character values. They can also be
+used for characters specified numerically, for example [\e000-\e037]. In UTF-8
+mode, ranges can include characters whose values are greater than 255, for
+example [\ex{100}-\ex{2ff}].
+.P
+If a range that includes letters is used when caseless matching is set, it
+matches the letters in either case. For example, [W-c] is equivalent to
+[][\e\e^_`wxyzabc], matched caselessly, and in non-UTF-8 mode, if character
+tables for a French locale are in use, [\exc8-\excb] matches accented E
+characters in both cases. In UTF-8 mode, PCRE supports the concept of case for
+characters with values greater than 127 only when it is compiled with Unicode
+property support.
+.P
+The character types \ed, \eD, \ep, \eP, \es, \eS, \ew, and \eW may also appear
+in a character class, and add the characters that they match to the class. For
+example, [\edABCDEF] matches any hexadecimal digit. A circumflex can
+conveniently be used with the upper case character types to specify a more
+restricted set of characters than the matching lower case type. For example,
+the class [^\eW_] matches any letter or digit, but not underscore.
+.P
+The only metacharacters that are recognized in character classes are backslash,
+hyphen (only where it can be interpreted as specifying a range), circumflex
+(only at the start), opening square bracket (only when it can be interpreted as
+introducing a POSIX class name - see the next section), and the terminating
+closing square bracket. However, escaping other non-alphanumeric characters
+does no harm.
+.
+.
+.SH "POSIX CHARACTER CLASSES"
+.rs
+.sp
+Perl supports the POSIX notation for character classes. This uses names
+enclosed by [: and :] within the enclosing square brackets. PCRE also supports
+this notation. For example,
+.sp
+ [01[:alpha:]%]
+.sp
+matches "0", "1", any alphabetic character, or "%". The supported class names
+are
+.sp
+ alnum letters and digits
+ alpha letters
+ ascii character codes 0 - 127
+ blank space or tab only
+ cntrl control characters
+ digit decimal digits (same as \ed)
+ graph printing characters, excluding space
+ lower lower case letters
+ print printing characters, including space
+ punct printing characters, excluding letters and digits
+ space white space (not quite the same as \es)
+ upper upper case letters
+ word "word" characters (same as \ew)
+ xdigit hexadecimal digits
+.sp
+The "space" characters are HT (9), LF (10), VT (11), FF (12), CR (13), and
+space (32). Notice that this list includes the VT character (code 11). This
+makes "space" different to \es, which does not include VT (for Perl
+compatibility).
+.P
+The name "word" is a Perl extension, and "blank" is a GNU extension from Perl
+5.8. Another Perl extension is negation, which is indicated by a ^ character
+after the colon. For example,
+.sp
+ [12[:^digit:]]
+.sp
+matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the POSIX
+syntax [.ch.] and [=ch=] where "ch" is a "collating element", but these are not
+supported, and an error is given if they are encountered.
+.P
+In UTF-8 mode, characters with values greater than 127 do not match any of
+the POSIX character classes.
+.
+.
+.SH "VERTICAL BAR"
+.rs
+.sp
+Vertical bar characters are used to separate alternative patterns. For example,
+the pattern
+.sp
+ gilbert|sullivan
+.sp
+matches either "gilbert" or "sullivan". Any number of alternatives may appear,
+and an empty alternative is permitted (matching the empty string). The matching
+process tries each alternative in turn, from left to right, and the first one
+that succeeds is used. If the alternatives are within a subpattern
+.\" HTML
+.\"
+(defined below),
+.\"
+"succeeds" means matching the rest of the main pattern as well as the
+alternative in the subpattern.
+.
+.
+.SH "INTERNAL OPTION SETTING"
+.rs
+.sp
+The settings of the PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, and
+PCRE_EXTENDED options (which are Perl-compatible) can be changed from within
+the pattern by a sequence of Perl option letters enclosed between "(?" and ")".
+The option letters are
+.sp
+ i for PCRE_CASELESS
+ m for PCRE_MULTILINE
+ s for PCRE_DOTALL
+ x for PCRE_EXTENDED
+.sp
+For example, (?im) sets caseless, multiline matching. It is also possible to
+unset these options by preceding the letter with a hyphen, and a combined
+setting and unsetting such as (?im-sx), which sets PCRE_CASELESS and
+PCRE_MULTILINE while unsetting PCRE_DOTALL and PCRE_EXTENDED, is also
+permitted. If a letter appears both before and after the hyphen, the option is
+unset.
+.P
+The PCRE-specific options PCRE_DUPNAMES, PCRE_UNGREEDY, and PCRE_EXTRA can be
+changed in the same way as the Perl-compatible options by using the characters
+J, U and X respectively.
+.P
+When an option change occurs at top level (that is, not inside subpattern
+parentheses), the change applies to the remainder of the pattern that follows.
+If the change is placed right at the start of a pattern, PCRE extracts it into
+the global options (and it will therefore show up in data extracted by the
+\fBpcre_fullinfo()\fP function).
+.P
+An option change within a subpattern (see below for a description of
+subpatterns) affects only that part of the current pattern that follows it, so
+.sp
+ (a(?i)b)c
+.sp
+matches abc and aBc and no other strings (assuming PCRE_CASELESS is not used).
+By this means, options can be made to have different settings in different
+parts of the pattern. Any changes made in one alternative do carry on
+into subsequent branches within the same subpattern. For example,
+.sp
+ (a(?i)b|c)
+.sp
+matches "ab", "aB", "c", and "C", even though when matching "C" the first
+branch is abandoned before the option setting. This is because the effects of
+option settings happen at compile time. There would be some very weird
+behaviour otherwise.
+.P
+\fBNote:\fP There are other PCRE-specific options that can be set by the
+application when the compile or match functions are called. In some cases the
+pattern can contain special leading sequences to override what the application
+has set or what has been defaulted. Details are given in the section entitled
+.\" HTML
+.\"
+"Newline sequences"
+.\"
+above.
+.
+.
+.\" HTML
+.SH SUBPATTERNS
+.rs
+.sp
+Subpatterns are delimited by parentheses (round brackets), which can be nested.
+Turning part of a pattern into a subpattern does two things:
+.sp
+1. It localizes a set of alternatives. For example, the pattern
+.sp
+ cat(aract|erpillar|)
+.sp
+matches one of the words "cat", "cataract", or "caterpillar". Without the
+parentheses, it would match "cataract", "erpillar" or an empty string.
+.sp
+2. It sets up the subpattern as a capturing subpattern. This means that, when
+the whole pattern matches, that portion of the subject string that matched the
+subpattern is passed back to the caller via the \fIovector\fP argument of
+\fBpcre_exec()\fP. Opening parentheses are counted from left to right (starting
+from 1) to obtain numbers for the capturing subpatterns.
+.P
+For example, if the string "the red king" is matched against the pattern
+.sp
+ the ((red|white) (king|queen))
+.sp
+the captured substrings are "red king", "red", and "king", and are numbered 1,
+2, and 3, respectively.
+.P
+The fact that plain parentheses fulfil two functions is not always helpful.
+There are often times when a grouping subpattern is required without a
+capturing requirement. If an opening parenthesis is followed by a question mark
+and a colon, the subpattern does not do any capturing, and is not counted when
+computing the number of any subsequent capturing subpatterns. For example, if
+the string "the white queen" is matched against the pattern
+.sp
+ the ((?:red|white) (king|queen))
+.sp
+the captured substrings are "white queen" and "queen", and are numbered 1 and
+2. The maximum number of capturing subpatterns is 65535.
+.P
+As a convenient shorthand, if any option settings are required at the start of
+a non-capturing subpattern, the option letters may appear between the "?" and
+the ":". Thus the two patterns
+.sp
+ (?i:saturday|sunday)
+ (?:(?i)saturday|sunday)
+.sp
+match exactly the same set of strings. Because alternative branches are tried
+from left to right, and options are not reset until the end of the subpattern
+is reached, an option setting in one branch does affect subsequent branches, so
+the above patterns match "SUNDAY" as well as "Saturday".
+.
+.
+.SH "DUPLICATE SUBPATTERN NUMBERS"
+.rs
+.sp
+Perl 5.10 introduced a feature whereby each alternative in a subpattern uses
+the same numbers for its capturing parentheses. Such a subpattern starts with
+(?| and is itself a non-capturing subpattern. For example, consider this
+pattern:
+.sp
+ (?|(Sat)ur|(Sun))day
+.sp
+Because the two alternatives are inside a (?| group, both sets of capturing
+parentheses are numbered one. Thus, when the pattern matches, you can look
+at captured substring number one, whichever alternative matched. This construct
+is useful when you want to capture part, but not all, of one of a number of
+alternatives. Inside a (?| group, parentheses are numbered as usual, but the
+number is reset at the start of each branch. The numbers of any capturing
+buffers that follow the subpattern start after the highest number used in any
+branch. The following example is taken from the Perl documentation.
+The numbers underneath show in which buffer the captured content will be
+stored.
+.sp
+ # before  ---------------branch-reset----------- after
+ / ( a )  (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x
+ # 1            2         2  3        2     3     4
+.sp
+A backreference or a recursive call to a numbered subpattern always refers to
+the first one in the pattern with the given number.
+.P
+An alternative approach to using this "branch reset" feature is to use
+duplicate named subpatterns, as described in the next section.
+.
+.
+.SH "NAMED SUBPATTERNS"
+.rs
+.sp
+Identifying capturing parentheses by number is simple, but it can be very hard
+to keep track of the numbers in complicated regular expressions. Furthermore,
+if an expression is modified, the numbers may change. To help with this
+difficulty, PCRE supports the naming of subpatterns. This feature was not
+added to Perl until release 5.10. Python had the feature earlier, and PCRE
+introduced it at release 4.0, using the Python syntax. PCRE now supports both
+the Perl and the Python syntax.
+.P
+In PCRE, a subpattern can be named in one of three ways: (?<name>...) or
+(?'name'...) as in Perl, or (?P<name>...) as in Python. References to capturing
+parentheses from other parts of the pattern, such as
+.\" HTML
+.\"
+backreferences,
+.\"
+.\" HTML
+.\"
+recursion,
+.\"
+and
+.\" HTML
+.\"
+conditions,
+.\"
+can be made by name as well as by number.
+.P
+Names consist of up to 32 alphanumeric characters and underscores. Named
+capturing parentheses are still allocated numbers as well as names, exactly as
+if the names were not present. The PCRE API provides function calls for
+extracting the name-to-number translation table from a compiled pattern. There
+is also a convenience function for extracting a captured substring by name.
+.P
+By default, a name must be unique within a pattern, but it is possible to relax
+this constraint by setting the PCRE_DUPNAMES option at compile time. This can
+be useful for patterns where only one instance of the named parentheses can
+match. Suppose you want to match the name of a weekday, either as a 3-letter
+abbreviation or as the full name, and in both cases you want to extract the
+abbreviation. This pattern (ignoring the line breaks) does the job:
+.sp
+ (?<DN>Mon|Fri|Sun)(?:day)?|
+ (?<DN>Tue)(?:sday)?|
+ (?<DN>Wed)(?:nesday)?|
+ (?<DN>Thu)(?:rsday)?|
+ (?<DN>Sat)(?:urday)?
+.sp
+There are five capturing substrings, but only one is ever set after a match.
+(An alternative way of solving this problem is to use a "branch reset"
+subpattern, as described in the previous section.)
+.P
+The convenience function for extracting the data by name returns the substring
+for the first (and in this example, the only) subpattern of that name that
+matched. This saves searching to find which numbered subpattern it was. If you
+make a reference to a non-unique named subpattern from elsewhere in the
+pattern, the one that corresponds to the lowest number is used. For further
+details of the interfaces for handling named subpatterns, see the
+.\" HREF
+\fBpcreapi\fP
+.\"
+documentation.
+.
+.
+.SH REPETITION
+.rs
+.sp
+Repetition is specified by quantifiers, which can follow any of the following
+items:
+.sp
+ a literal data character
+ the dot metacharacter
+ the \eC escape sequence
+ the \eX escape sequence (in UTF-8 mode with Unicode properties)
+ the \eR escape sequence
+ an escape such as \ed that matches a single character
+ a character class
+ a back reference (see next section)
+ a parenthesized subpattern (unless it is an assertion)
+.sp
+The general repetition quantifier specifies a minimum and maximum number of
+permitted matches, by giving the two numbers in curly brackets (braces),
+separated by a comma. The numbers must be less than 65536, and the first must
+be less than or equal to the second. For example:
+.sp
+ z{2,4}
+.sp
+matches "zz", "zzz", or "zzzz". A closing brace on its own is not a special
+character. If the second number is omitted, but the comma is present, there is
+no upper limit; if the second number and the comma are both omitted, the
+quantifier specifies an exact number of required matches. Thus
+.sp
+ [aeiou]{3,}
+.sp
+matches at least 3 successive vowels, but may match many more, while
+.sp
+ \ed{8}
+.sp
+matches exactly 8 digits. An opening curly bracket that appears in a position
+where a quantifier is not allowed, or one that does not match the syntax of a
+quantifier, is taken as a literal character. For example, {,6} is not a
+quantifier, but a literal string of four characters.
+.P
+In UTF-8 mode, quantifiers apply to UTF-8 characters rather than to individual
+bytes. Thus, for example, \ex{100}{2} matches two UTF-8 characters, each of
+which is represented by a two-byte sequence. Similarly, when Unicode property
+support is available, \eX{3} matches three Unicode extended sequences, each of
+which may be several bytes long (and they may be of different lengths).
+.P
+The quantifier {0} is permitted, causing the expression to behave as if the
+previous item and the quantifier were not present. This may be useful for
+subpatterns that are referenced as
+.\" HTML
+.\"
+subroutines
+.\"
+from elsewhere in the pattern. Items other than subpatterns that have a {0}
+quantifier are omitted from the compiled pattern.
+.P
+For convenience, the three most common quantifiers have single-character
+abbreviations:
+.sp
+ * is equivalent to {0,}
+ + is equivalent to {1,}
+ ? is equivalent to {0,1}
+.sp
+It is possible to construct infinite loops by following a subpattern that can
+match no characters with a quantifier that has no upper limit, for example:
+.sp
+ (a?)*
+.sp
+Earlier versions of Perl and PCRE used to give an error at compile time for
+such patterns. However, because there are cases where this can be useful, such
+patterns are now accepted, but if any repetition of the subpattern does in fact
+match no characters, the loop is forcibly broken.
+.P
+By default, the quantifiers are "greedy", that is, they match as much as
+possible (up to the maximum number of permitted times), without causing the
+rest of the pattern to fail. The classic example of where this gives problems
+is in trying to match comments in C programs. These appear between /* and */
+and within the comment, individual * and / characters may appear. An attempt to
+match C comments by applying the pattern
+.sp
+ /\e*.*\e*/
+.sp
+to the string
+.sp
+ /* first comment */ not comment /* second comment */
+.sp
+fails, because it matches the entire string owing to the greediness of the .*
+item.
+.P
+However, if a quantifier is followed by a question mark, it ceases to be
+greedy, and instead matches the minimum number of times possible, so the
+pattern
+.sp
+ /\e*.*?\e*/
+.sp
+does the right thing with the C comments. The meaning of the various
+quantifiers is not otherwise changed, just the preferred number of matches.
+Do not confuse this use of question mark with its use as a quantifier in its
+own right. Because it has two uses, it can sometimes appear doubled, as in
+.sp
+ \ed??\ed
+.sp
+which matches one digit by preference, but can match two if that is the only
+way the rest of the pattern matches.
+.P
+If the PCRE_UNGREEDY option is set (an option that is not available in Perl),
+the quantifiers are not greedy by default, but individual ones can be made
+greedy by following them with a question mark. In other words, it inverts the
+default behaviour.
+.P
+When a parenthesized subpattern is quantified with a minimum repeat count that
+is greater than 1 or with a limited maximum, more memory is required for the
+compiled pattern, in proportion to the size of the minimum or maximum.
+.P
+If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equivalent
+to Perl's /s) is set, thus allowing the dot to match newlines, the pattern is
+implicitly anchored, because whatever follows will be tried against every
+character position in the subject string, so there is no point in retrying the
+overall match at any position after the first. PCRE normally treats such a
+pattern as though it were preceded by \eA.
+.P
+In cases where it is known that the subject string contains no newlines, it is
+worth setting PCRE_DOTALL in order to obtain this optimization, or
+alternatively using ^ to indicate anchoring explicitly.
+.P
+However, there is one situation where the optimization cannot be used. When .*
+is inside capturing parentheses that are the subject of a backreference
+elsewhere in the pattern, a match at the start may fail where a later one
+succeeds. Consider, for example:
+.sp
+ (.*)abc\e1
+.sp
+If the subject is "xyz123abc123" the match point is the fourth character. For
+this reason, such a pattern is not implicitly anchored.
+.P
+When a capturing subpattern is repeated, the value captured is the substring
+that matched the final iteration. For example, after
+.sp
+ (tweedle[dume]{3}\es*)+
+.sp
+has matched "tweedledum tweedledee" the value of the captured substring is
+"tweedledee". However, if there are nested capturing subpatterns, the
+corresponding captured values may have been set in previous iterations. For
+example, after
+.sp
+ /(a|(b))+/
+.sp
+matches "aba" the value of the second captured substring is "b".
+.
+.
+.\" HTML
+.SH "ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS"
+.rs
+.sp
+With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy")
+repetition, failure of what follows normally causes the repeated item to be
+re-evaluated to see if a different number of repeats allows the rest of the
+pattern to match. Sometimes it is useful to prevent this, either to change the
+nature of the match, or to cause it to fail earlier than it otherwise might, when
+the author of the pattern knows there is no point in carrying on.
+.P
+Consider, for example, the pattern \ed+foo when applied to the subject line
+.sp
+ 123456bar
+.sp
+After matching all 6 digits and then failing to match "foo", the normal
+action of the matcher is to try again with only 5 digits matching the \ed+
+item, and then with 4, and so on, before ultimately failing. "Atomic grouping"
+(a term taken from Jeffrey Friedl's book) provides the means for specifying
+that once a subpattern has matched, it is not to be re-evaluated in this way.
+.P
+If we use atomic grouping for the previous example, the matcher gives up
+immediately on failing to match "foo" the first time. The notation is a kind of
+special parenthesis, starting with (?> as in this example:
+.sp
+ (?>\ed+)foo
+.sp
+This kind of parenthesis "locks up" the part of the pattern it contains once
+it has matched, and a failure further into the pattern is prevented from
+backtracking into it. Backtracking past it to previous items, however, works as
+normal.
+.P
+An alternative description is that a subpattern of this type matches the string
+of characters that an identical standalone pattern would match, if anchored at
+the current point in the subject string.
+.P
+Atomic grouping subpatterns are not capturing subpatterns. Simple cases such as
+the above example can be thought of as a maximizing repeat that must swallow
+everything it can. So, while both \ed+ and \ed+? are prepared to adjust the
+number of digits they match in order to make the rest of the pattern match,
+(?>\ed+) can only match an entire sequence of digits.
+.P
+Atomic groups in general can of course contain arbitrarily complicated
+subpatterns, and can be nested. However, when the subpattern for an atomic
+group is just a single repeated item, as in the example above, a simpler
+notation, called a "possessive quantifier" can be used. This consists of an
+additional + character following a quantifier. Using this notation, the
+previous example can be rewritten as
+.sp
+ \ed++foo
+.sp
+Note that a possessive quantifier can be used with an entire group, for
+example:
+.sp
+ (abc|xyz){2,3}+
+.sp
+Possessive quantifiers are always greedy; the setting of the PCRE_UNGREEDY
+option is ignored. They are a convenient notation for the simpler forms of
+atomic group. However, there is no difference in the meaning of a possessive
+quantifier and the equivalent atomic group, though there may be a performance
+difference; possessive quantifiers should be slightly faster.
+.P
+The possessive quantifier syntax is an extension to the Perl 5.8 syntax.
+Jeffrey Friedl originated the idea (and the name) in the first edition of his
+book. Mike McCloskey liked it, so implemented it when he built Sun's Java
+package, and PCRE copied it from there. It ultimately found its way into Perl
+at release 5.10.
+.P
+PCRE has an optimization that automatically "possessifies" certain simple
+pattern constructs. For example, the sequence A+B is treated as A++B because
+there is no point in backtracking into a sequence of A's when B must follow.
+.P
+When a pattern contains an unlimited repeat inside a subpattern that can itself
+be repeated an unlimited number of times, the use of an atomic group is the
+only way to avoid some failing matches taking a very long time indeed. The
+pattern
+.sp
+ (\eD+|<\ed+>)*[!?]
+.sp
+matches an unlimited number of substrings that either consist of non-digits, or
+digits enclosed in <>, followed by either ! or ?. When it matches, it runs
+quickly. However, if it is applied to
+.sp
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+.sp
+it takes a long time before reporting failure. This is because the string can
+be divided between the internal \eD+ repeat and the external * repeat in a
+large number of ways, and all have to be tried. (The example uses [!?] rather
+than a single character at the end, because both PCRE and Perl have an
+optimization that allows for fast failure when a single character is used. They
+remember the last single character that is required for a match, and fail early
+if it is not present in the string.) If the pattern is changed so that it uses
+an atomic group, like this:
+.sp
+ ((?>\eD+)|<\ed+>)*[!?]
+.sp
+sequences of non-digits cannot be broken, and failure happens quickly.
+.
+.
+.\" HTML
+.SH "BACK REFERENCES"
+.rs
+.sp
+Outside a character class, a backslash followed by a digit greater than 0 (and
+possibly further digits) is a back reference to a capturing subpattern earlier
+(that is, to its left) in the pattern, provided there have been that many
+previous capturing left parentheses.
+.P
+However, if the decimal number following the backslash is less than 10, it is
+always taken as a back reference, and causes an error only if there are not
+that many capturing left parentheses in the entire pattern. In other words, the
+parentheses that are referenced need not be to the left of the reference for
+numbers less than 10. A "forward back reference" of this type can make sense
+when a repetition is involved and the subpattern to the right has participated
+in an earlier iteration.
+.P
+It is not possible to have a numerical "forward back reference" to a subpattern
+whose number is 10 or more using this syntax because a sequence such as \e50 is
+interpreted as a character defined in octal. See the subsection entitled
+"Non-printing characters"
+.\" HTML
+.\"
+above
+.\"
+for further details of the handling of digits following a backslash. There is
+no such problem when named parentheses are used. A back reference to any
+subpattern is possible using named parentheses (see below).
+.P
+Another way of avoiding the ambiguity inherent in the use of digits following a
+backslash is to use the \eg escape sequence, which is a feature introduced in
+Perl 5.10. This escape must be followed by an unsigned number or a negative
+number, optionally enclosed in braces. These examples are all identical:
+.sp
+ (ring), \e1
+ (ring), \eg1
+ (ring), \eg{1}
+.sp
+An unsigned number specifies an absolute reference without the ambiguity that
+is present in the older syntax. It is also useful when literal digits follow
+the reference. A negative number is a relative reference. Consider this
+example:
+.sp
+ (abc(def)ghi)\eg{-1}
+.sp
+The sequence \eg{-1} is a reference to the most recently started capturing
+subpattern before \eg, that is, it is equivalent to \e2. Similarly, \eg{-2}
+would be equivalent to \e1. The use of relative references can be helpful in
+long patterns, and also in patterns that are created by joining together
+fragments that contain references within themselves.
+.P
+A back reference matches whatever actually matched the capturing subpattern in
+the current subject string, rather than anything matching the subpattern
+itself (see
+.\" HTML
+.\"
+"Subpatterns as subroutines"
+.\"
+below for a way of doing that). So the pattern
+.sp
+ (sens|respons)e and \e1ibility
+.sp
+matches "sense and sensibility" and "response and responsibility", but not
+"sense and responsibility". If caseful matching is in force at the time of the
+back reference, the case of letters is relevant. For example,
+.sp
+ ((?i)rah)\es+\e1
+.sp
+matches "rah rah" and "RAH RAH", but not "RAH rah", even though the original
+capturing subpattern is matched caselessly.
+.P
+There are several different ways of writing back references to named
+subpatterns. The .NET syntax \ek{name} and the Perl syntax \ek<name> or
+\ek'name' are supported, as is the Python syntax (?P=name). Perl 5.10's unified
+back reference syntax, in which \eg can be used for both numeric and named
+references, is also supported. We could rewrite the above example in any of
+the following ways:
+.sp
+ (?<p1>(?i)rah)\es+\ek<p1>
+ (?'p1'(?i)rah)\es+\ek{p1}
+ (?P<p1>(?i)rah)\es+(?P=p1)
+ (?<p1>(?i)rah)\es+\eg{p1}
+.sp
+A subpattern that is referenced by name may appear in the pattern before or
+after the reference.
+.P
+There may be more than one back reference to the same subpattern. If a
+subpattern has not actually been used in a particular match, any back
+references to it always fail. For example, the pattern
+.sp
+ (a|(bc))\e2
+.sp
+always fails if it starts to match "a" rather than "bc". Because there may be
+many capturing parentheses in a pattern, all digits following the backslash are
+taken as part of a potential back reference number. If the pattern continues
+with a digit character, some delimiter must be used to terminate the back
+reference. If the PCRE_EXTENDED option is set, this can be whitespace.
+Otherwise an empty comment (see
+.\" HTML
+.\"
+"Comments"
+.\"
+below) can be used.
+.P
+A back reference that occurs inside the parentheses to which it refers fails
+when the subpattern is first used, so, for example, (a\e1) never matches.
+However, such references can be useful inside repeated subpatterns. For
+example, the pattern
+.sp
+ (a|b\e1)+
+.sp
+matches any number of "a"s and also "aba", "ababbaa" etc. At each iteration of
+the subpattern, the back reference matches the character string corresponding
+to the previous iteration. In order for this to work, the pattern must be such
+that the first iteration does not need to match the back reference. This can be
+done using alternation, as in the example above, or by a quantifier with a
+minimum of zero.
+.
+.
+.\" HTML
+.SH ASSERTIONS
+.rs
+.sp
+An assertion is a test on the characters following or preceding the current
+matching point that does not actually consume any characters. The simple
+assertions coded as \eb, \eB, \eA, \eG, \eZ, \ez, ^ and $ are described
+.\" HTML
+.\"
+above.
+.\"
+.P
+More complicated assertions are coded as subpatterns. There are two kinds:
+those that look ahead of the current position in the subject string, and those
+that look behind it. An assertion subpattern is matched in the normal way,
+except that it does not cause the current matching position to be changed.
+.P
+Assertion subpatterns are not capturing subpatterns, and may not be repeated,
+because it makes no sense to assert the same thing several times. If any kind
+of assertion contains capturing subpatterns within it, these are counted for
+the purposes of numbering the capturing subpatterns in the whole pattern.
+However, substring capturing is carried out only for positive assertions,
+because it does not make sense for negative assertions.
+.
+.
+.SS "Lookahead assertions"
+.rs
+.sp
+Lookahead assertions start with (?= for positive assertions and (?! for
+negative assertions. For example,
+.sp
+ \ew+(?=;)
+.sp
+matches a word followed by a semicolon, but does not include the semicolon in
+the match, and
+.sp
+ foo(?!bar)
+.sp
+matches any occurrence of "foo" that is not followed by "bar". Note that the
+apparently similar pattern
+.sp
+ (?!foo)bar
+.sp
+does not find an occurrence of "bar" that is preceded by something other than
+"foo"; it finds any occurrence of "bar" whatsoever, because the assertion
+(?!foo) is always true when the next three characters are "bar". A
+lookbehind assertion is needed to achieve the other effect.
+.P
+If you want to force a matching failure at some point in a pattern, the most
+convenient way to do it is with (?!) because an empty string always matches, so
+an assertion that requires there not to be an empty string must always fail.
+.
+.
+.\" HTML
+.SS "Lookbehind assertions"
+.rs
+.sp
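+Lookbehind assertions start with (?<= for positive assertions and (?<! for
+negative assertions. For example,
+.sp
+ (?<!foo)bar
+.sp
+does find an occurrence of "bar" that is not preceded by "foo". The contents of
+a lookbehind assertion are restricted such that all the strings it matches must
+have a fixed length. However, if there are several top-level alternatives, they
+do not all have to have the same fixed length. Thus
+.sp
+ (?<=bullock|donkey)
+.sp
+is permitted, but
+.sp
+ (?<!dogs?|cats?)
+.sp
+causes an error at compile time. Branches that match different length strings
+are permitted only at the top level of a lookbehind assertion. This is an
+extension compared with Perl (at least for 5.8), which requires all branches to
+match the same length of string. An assertion such as
+.sp
+ (?<=ab(c|de))
+.sp
+is not permitted, because its single top-level branch can match two different
+lengths, but it is acceptable if rewritten to use two top-level branches:
+.sp
+ (?<=abc|abde)
+.sp
+In some cases, the Perl 5.10 escape sequence \eK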
+.\"
+(see above)
+.\"
+can be used instead of a lookbehind assertion; this is not restricted to a
+fixed length.
+.P
+The implementation of lookbehind assertions is, for each alternative, to
+temporarily move the current position back by the fixed length and then try to
+match. If there are insufficient characters before the current position, the
+assertion fails.
+.P
+PCRE does not allow the \eC escape (which matches a single byte in UTF-8 mode)
+to appear in lookbehind assertions, because it makes it impossible to calculate
+the length of the lookbehind. The \eX and \eR escapes, which can match
+different numbers of bytes, are also not permitted.
+.P
+Possessive quantifiers can be used in conjunction with lookbehind assertions to
+specify efficient matching at the end of the subject string. Consider a simple
+pattern such as
+.sp
+ abcd$
+.sp
+when applied to a long string that does not match. Because matching proceeds
+from left to right, PCRE will look for each "a" in the subject and then see if
+what follows matches the rest of the pattern. If the pattern is specified as
+.sp
+ ^.*abcd$
+.sp
+the initial .* matches the entire string at first, but when this fails (because
+there is no following "a"), it backtracks to match all but the last character,
+then all but the last two characters, and so on. Once again the search for "a"
+covers the entire string, from right to left, so we are no better off. However,
+if the pattern is written as
+.sp
+ ^.*+(?<=abcd)
+.sp
+there can be no backtracking for the .*+ item; it can match only the entire
+string. The subsequent lookbehind assertion does a single test on the last four
+characters. If it fails, the match fails immediately. For long strings, this
+approach makes a significant difference to the processing time.
+.
+.
+.SS "Using multiple assertions"
+.rs
+.sp
+Several assertions (of any sort) may occur in succession. For example,
+.sp
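+ (?<=\ed{3})(?<!999)foo
+.sp
+matches "foo" preceded by three digits that are not "999". Notice that each of
+the assertions is applied independently at the same point in the subject
+string. First there is a check that the previous three characters are all
+digits, and then there is a check that the same three characters are not "999".
+This pattern does \fInot\fP match "foo" preceded by six characters, the first
+of which are digits and the last three of which are not "999". For example, it
+doesn't match "123abcfoo". A pattern to do that is
+.sp
+ (?<=\ed{3}...)(?<!999)foo
+.sp
+This time the first assertion looks at the preceding six characters, checking
+that the first three are digits, and then the second assertion checks that the
+preceding three characters are not "999".
+.P
+Assertions can be nested in any combination. For example,
+.sp
+ (?<=(?<!foo)bar)baz
+.sp
+matches an occurrence of "baz" that is preceded by "bar" which in turn is not
+preceded by "foo", while
+.sp
+ (?<=\ed{3}(?!999)...)foo
+.sp
+is another pattern that matches "foo" preceded by three digits and any three
+characters that are not "999".
+.
+.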
+.SH "CONDITIONAL SUBPATTERNS"
+.rs
+.sp
+It is possible to cause the matching process to obey a subpattern
+conditionally or to choose between two alternative subpatterns, depending on
+the result of an assertion, or whether a previous capturing subpattern matched
+or not. The two possible forms of conditional subpattern are
+.sp
+ (?(condition)yes-pattern)
+ (?(condition)yes-pattern|no-pattern)
+.sp
+If the condition is satisfied, the yes-pattern is used; otherwise the
+no-pattern (if present) is used. If there are more than two alternatives in the
+subpattern, a compile-time error occurs.
+.P
+There are four kinds of condition: references to subpatterns, references to
+recursion, a pseudo-condition called DEFINE, and assertions.
+.
+.SS "Checking for a used subpattern by number"
+.rs
+.sp
+If the text between the parentheses consists of a sequence of digits, the
+condition is true if the capturing subpattern of that number has previously
+matched. An alternative notation is to precede the digits with a plus or minus
+sign. In this case, the subpattern number is relative rather than absolute.
+The most recently opened parentheses can be referenced by (?(-1), the next most
+recent by (?(-2), and so on. In looping constructs it can also make sense to
+refer to subsequent groups with constructs such as (?(+2).
+.P
+Consider the following pattern, which contains non-significant white space to
+make it more readable (assume the PCRE_EXTENDED option) and to divide it into
+three parts for ease of discussion:
+.sp
+ ( \e( )? [^()]+ (?(1) \e) )
+.sp
+The first part matches an optional opening parenthesis, and if that
+character is present, sets it as the first captured substring. The second part
+matches one or more characters that are not parentheses. The third part is a
+conditional subpattern that tests whether the first set of parentheses matched
+or not. If they did, that is, if the subject started with an opening parenthesis,
+the condition is true, and so the yes-pattern is executed and a closing
+parenthesis is required. Otherwise, since no-pattern is not present, the
+subpattern matches nothing. In other words, this pattern matches a sequence of
+non-parentheses, optionally enclosed in parentheses.
+.P
+If you were embedding this pattern in a larger one, you could use a relative
+reference:
+.sp
+ ...other stuff... ( \e( )? [^()]+ (?(-1) \e) ) ...
+.sp
+This makes the fragment independent of the parentheses in the larger pattern.
+.
+.SS "Checking for a used subpattern by name"
+.rs
+.sp
+Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a used
+subpattern by name. For compatibility with earlier versions of PCRE, which had
+this facility before Perl, the syntax (?(name)...) is also recognized. However,
+there is a possible ambiguity with this syntax, because subpattern names may
+consist entirely of digits. PCRE looks first for a named subpattern; if it
+cannot find one and the name consists entirely of digits, PCRE looks for a
+subpattern of that number, which must be greater than zero. Using subpattern
+names that consist entirely of digits is not recommended.
+.P
+Rewriting the above example to use a named subpattern gives this:
+.sp
+ (?<OPEN> \e( )? [^()]+ (?(<OPEN>) \e) )
+.sp
+.
+.SS "Checking for pattern recursion"
+.rs
+.sp
+If the condition is the string (R), and there is no subpattern with the name R,
+the condition is true if a recursive call to the whole pattern or any
+subpattern has been made. If digits or a name preceded by ampersand follow the
+letter R, for example:
+.sp
+ (?(R3)...) or (?(R&name)...)
+.sp
+the condition is true if the most recent recursion is into the subpattern whose
+number or name is given. This condition does not check the entire recursion
+stack.
+.P
+At "top level", all these recursion test conditions are false. Recursive
+patterns are described below.
+.
+.SS "Defining subpatterns for use by reference only"
+.rs
+.sp
+If the condition is the string (DEFINE), and there is no subpattern with the
+name DEFINE, the condition is always false. In this case, there may be only one
+alternative in the subpattern. It is always skipped if control reaches this
+point in the pattern; the idea of DEFINE is that it can be used to define
+"subroutines" that can be referenced from elsewhere. (The use of "subroutines"
+is described below.) For example, a pattern to match an IPv4 address could be
+written like this (ignore whitespace and line breaks):
+.sp
+ (?(DEFINE) (?<byte> 2[0-4]\ed | 25[0-5] | 1\ed\ed | [1-9]?\ed) )
+ \eb (?&byte) (\e.(?&byte)){3} \eb
+.sp
+The first part of the pattern is a DEFINE group inside which another group
+named "byte" is defined. This matches an individual component of an IPv4
+address (a number less than 256). When matching takes place, this part of the
+pattern is skipped because DEFINE acts like a false condition.
+.P
+The rest of the pattern uses references to the named group to match the four
+dot-separated components of an IPv4 address, insisting on a word boundary at
+each end.
+.
+.SS "Assertion conditions"
+.rs
+.sp
+If the condition is not in any of the above formats, it must be an assertion.
+This may be a positive or negative lookahead or lookbehind assertion. Consider
+this pattern, again containing non-significant white space, and with the two
+alternatives on the second line:
+.sp
+ (?(?=[^a-z]*[a-z])
+ \ed{2}-[a-z]{3}-\ed{2} | \ed{2}-\ed{2}-\ed{2} )
+.sp
+The condition is a positive lookahead assertion that matches an optional
+sequence of non-letters followed by a letter. In other words, it tests for the
+presence of at least one letter in the subject. If a letter is found, the
+subject is matched against the first alternative; otherwise it is matched
+against the second. This pattern matches strings in one of the two forms
+dd-aaa-dd or dd-dd-dd, where aaa are letters and dd are digits.
+.
+.
+.\" HTML
+.SH COMMENTS
+.rs
+.sp
+The sequence (?# marks the start of a comment that continues up to the next
+closing parenthesis. Nested parentheses are not permitted. The characters
+that make up a comment play no part in the pattern matching at all.
+.P
+If the PCRE_EXTENDED option is set, an unescaped # character outside a
+character class introduces a comment that continues to immediately after the
+next newline in the pattern.
+.
+.
+.\" HTML
+.SH "RECURSIVE PATTERNS"
+.rs
+.sp
+Consider the problem of matching a string in parentheses, allowing for
+unlimited nested parentheses. Without the use of recursion, the best that can
+be done is to use a pattern that matches up to some fixed depth of nesting. It
+is not possible to handle an arbitrary nesting depth.
+.P
+For some time, Perl has provided a facility that allows regular expressions to
+recurse (amongst other things). It does this by interpolating Perl code in the
+expression at run time, and the code can refer to the expression itself. A Perl
+pattern using code interpolation to solve the parentheses problem can be
+created like this:
+.sp
+ $re = qr{\e( (?: (?>[^()]+) | (?p{$re}) )* \e)}x;
+.sp
+The (?p{...}) item interpolates Perl code at run time, and in this case refers
+recursively to the pattern in which it appears.
+.P
+Obviously, PCRE cannot support the interpolation of Perl code. Instead, it
+supports special syntax for recursion of the entire pattern, and also for
+individual subpattern recursion. After its introduction in PCRE and Python,
+this kind of recursion was introduced into Perl at release 5.10.
+.P
+A special item that consists of (? followed by a number greater than zero and a
+closing parenthesis is a recursive call of the subpattern of the given number,
+provided that it occurs inside that subpattern. (If not, it is a "subroutine"
+call, which is described in the next section.) The special item (?R) or (?0) is
+a recursive call of the entire regular expression.
+.P
+In PCRE (like Python, but unlike Perl), a recursive subpattern call is always
+treated as an atomic group. That is, once it has matched some of the subject
+string, it is never re-entered, even if it contains untried alternatives and
+there is a subsequent matching failure.
+.P
+This PCRE pattern solves the nested parentheses problem (assume the
+PCRE_EXTENDED option is set so that white space is ignored):
+.sp
+ \e( ( (?>[^()]+) | (?R) )* \e)
+.sp
+First it matches an opening parenthesis. Then it matches any number of
+substrings which can either be a sequence of non-parentheses, or a recursive
+match of the pattern itself (that is, a correctly parenthesized substring).
+Finally there is a closing parenthesis.
+.P
+If this were part of a larger pattern, you would not want to recurse the entire
+pattern, so instead you could use this:
+.sp
+ ( \e( ( (?>[^()]+) | (?1) )* \e) )
+.sp
+We have put the pattern into parentheses, and caused the recursion to refer to
+them instead of the whole pattern.
+.P
+In a larger pattern, keeping track of parenthesis numbers can be tricky. This
+is made easier by the use of relative references. (A Perl 5.10 feature.)
+Instead of (?1) in the pattern above you can write (?-2) to refer to the second
+most recently opened parentheses preceding the recursion. In other words, a
+negative number counts capturing parentheses leftwards from the point at which
+it is encountered.
+.P
+It is also possible to refer to subsequently opened parentheses, by writing
+references such as (?+2). However, these cannot be recursive because the
+reference is not inside the parentheses that are referenced. They are always
+"subroutine" calls, as described in the next section.
+.P
+An alternative approach is to use named parentheses instead. The Perl syntax
+for this is (?&name); PCRE's earlier syntax (?P>name) is also supported. We
+could rewrite the above example as follows:
+.sp
+ (?<pn> \e( ( (?>[^()]+) | (?&pn) )* \e) )
+.sp
+If there is more than one subpattern with the same name, the earliest one is
+used.
+.P
+This particular example pattern that we have been looking at contains nested
+unlimited repeats, and so the use of atomic grouping for matching strings of
+non-parentheses is important when applying the pattern to strings that do not
+match. For example, when this pattern is applied to
+.sp
+ (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
+.sp
+it yields "no match" quickly. However, if atomic grouping is not used,
+the match runs for a very long time indeed because there are so many different
+ways the + and * repeats can carve up the subject, and all have to be tested
+before failure can be reported.
+.P
+At the end of a match, the values set for any capturing subpatterns are those
+from the outermost level of the recursion at which the subpattern value is set.
+If you want to obtain intermediate values, a callout function can be used (see
+below and the
+.\" HREF
+\fBpcrecallout\fP
+.\"
+documentation). If the pattern above is matched against
+.sp
+ (ab(cd)ef)
+.sp
+the value for the capturing parentheses is "ef", which is the last value taken
+on at the top level. If additional parentheses are added, giving
+.sp
+ \e( ( ( (?>[^()]+) | (?R) )* ) \e)
+ ^ ^
+ ^ ^
+.sp
+the string they capture is "ab(cd)ef", the contents of the top level
+parentheses. If there are more than 15 capturing parentheses in a pattern, PCRE
+has to obtain extra memory to store data during a recursion, which it does by
+using \fBpcre_malloc\fP, freeing it via \fBpcre_free\fP afterwards. If no
+memory can be obtained, the match fails with the PCRE_ERROR_NOMEMORY error.
+.P
+Do not confuse the (?R) item with the condition (R), which tests for recursion.
+Consider this pattern, which matches text in angle brackets, allowing for
+arbitrary nesting. Only digits are allowed in nested brackets (that is, when
+recursing), whereas any characters are permitted at the outer level.
+.sp
+ < (?: (?(R) \ed++ | [^<>]*+) | (?R)) * >
+.sp
+In this pattern, (?(R) is the start of a conditional subpattern, with two
+different alternatives for the recursive and non-recursive cases. The (?R) item
+is the actual recursive call.
+.
+.
+.\" HTML
+.SH "SUBPATTERNS AS SUBROUTINES"
+.rs
+.sp
+If the syntax for a recursive subpattern reference (either by number or by
+name) is used outside the parentheses to which it refers, it operates like a
+subroutine in a programming language. The "called" subpattern may be defined
+before or after the reference. A numbered reference can be absolute or
+relative, as in these examples:
+.sp
+ (...(absolute)...)...(?2)...
+ (...(relative)...)...(?-1)...
+ (...(?+1)...(relative)...
+.sp
+An earlier example pointed out that the pattern
+.sp
+ (sens|respons)e and \e1ibility
+.sp
+matches "sense and sensibility" and "response and responsibility", but not
+"sense and responsibility". If instead the pattern
+.sp
+ (sens|respons)e and (?1)ibility
+.sp
+is used, it does match "sense and responsibility" as well as the other two
+strings. Another example is given in the discussion of DEFINE above.
+.P
+Like recursive subpatterns, a "subroutine" call is always treated as an atomic
+group. That is, once it has matched some of the subject string, it is never
+re-entered, even if it contains untried alternatives and there is a subsequent
+matching failure.
+.P
+When a subpattern is used as a subroutine, processing options such as
+case-independence are fixed when the subpattern is defined. They cannot be
+changed for different calls. For example, consider this pattern:
+.sp
+ (abc)(?i:(?-1))
+.sp
+It matches "abcabc". It does not match "abcABC" because the change of
+processing option does not affect the called subpattern.
+.
+.
+.\" HTML
+.SH "ONIGURUMA SUBROUTINE SYNTAX"
+.rs
+.sp
+For compatibility with Oniguruma, the non-Perl syntax \eg followed by a name or
+a number enclosed either in angle brackets or single quotes, is an alternative
+syntax for referencing a subpattern as a subroutine, possibly recursively. Here
+are two of the examples used above, rewritten using this syntax:
+.sp
+ (?<pn> \e( ( (?>[^()]+) | \eg<pn> )* \e) )
+ (sens|respons)e and \eg'1'ibility
+.sp
+PCRE supports an extension to Oniguruma: if a number is preceded by a
+plus or a minus sign it is taken as a relative reference. For example:
+.sp
+ (abc)(?i:\eg<-1>)
+.sp
+Note that \eg{...} (Perl syntax) and \eg<...> (Oniguruma syntax) are \fInot\fP
+synonymous. The former is a back reference; the latter is a subroutine call.
+.
+.
+.SH CALLOUTS
+.rs
+.sp
+Perl has a feature whereby using the sequence (?{...}) causes arbitrary Perl
+code to be obeyed in the middle of matching a regular expression. This makes it
+possible, amongst other things, to extract different substrings that match the
+same pair of parentheses when there is a repetition.
+.P
+PCRE provides a similar feature, but of course it cannot obey arbitrary Perl
+code. The feature is called "callout". The caller of PCRE provides an external
+function by putting its entry point in the global variable \fIpcre_callout\fP.
+By default, this variable contains NULL, which disables all calling out.
+.P
+Within a regular expression, (?C) indicates the points at which the external
+function is to be called. If you want to identify different callout points, you
+can put a number less than 256 after the letter C. The default value is zero.
+For example, this pattern has two callout points:
+.sp
+ (?C1)abc(?C2)def
+.sp
+If the PCRE_AUTO_CALLOUT flag is passed to \fBpcre_compile()\fP, callouts are
+automatically installed before each item in the pattern. They are all numbered
+255.
+.P
+During matching, when PCRE reaches a callout point (and \fIpcre_callout\fP is
+set), the external function is called. It is provided with the number of the
+callout, the position in the pattern, and, optionally, one item of data
+originally supplied by the caller of \fBpcre_exec()\fP. The callout function
+may cause matching to proceed, to backtrack, or to fail altogether. A complete
+description of the interface to the callout function is given in the
+.\" HREF
+\fBpcrecallout\fP
+.\"
+documentation.
+.
+.
+.SH "BACKTRACKING CONTROL"
+.rs
+.sp
+Perl 5.10 introduced a number of "Special Backtracking Control Verbs", which
+are described in the Perl documentation as "experimental and subject to change
+or removal in a future version of Perl". It goes on to say: "Their usage in
+production code should be noted to avoid problems during upgrades." The same
+remarks apply to the PCRE features described in this section.
+.P
+Since these verbs are specifically related to backtracking, most of them can be
+used only when the pattern is to be matched using \fBpcre_exec()\fP, which uses
+a backtracking algorithm. With the exception of (*FAIL), which behaves like a
+failing negative assertion, they cause an error if encountered by
+\fBpcre_dfa_exec()\fP.
+.P
+The new verbs make use of what was previously invalid syntax: an opening
+parenthesis followed by an asterisk. In Perl, they are generally of the form
+(*VERB:ARG) but PCRE does not support the use of arguments, so its general
+form is just (*VERB). Any number of these verbs may occur in a pattern. There
+are two kinds:
+.
+.SS "Verbs that act immediately"
+.rs
+.sp
+The following verbs act as soon as they are encountered:
+.sp
+ (*ACCEPT)
+.sp
+This verb causes the match to end successfully, skipping the remainder of the
+pattern. When inside a recursion, only the innermost pattern is ended
+immediately. PCRE differs from Perl in what happens if the (*ACCEPT) is inside
+capturing parentheses. In Perl, the data so far is captured: in PCRE no data is
+captured. For example:
+.sp
+ A(A|B(*ACCEPT)|C)D
+.sp
+This matches "AB", "AAD", or "ACD", but when it matches "AB", no data is
+captured.
+.sp
+ (*FAIL) or (*F)
+.sp
+This verb causes the match to fail, forcing backtracking to occur. It is
+equivalent to (?!) but easier to read. The Perl documentation notes that it is
+probably useful only when combined with (?{}) or (??{}). Those are, of course,
+Perl features that are not present in PCRE. The nearest equivalent is the
+callout feature, as for example in this pattern:
+.sp
+ a+(?C)(*FAIL)
+.sp
+A match with the string "aaaa" always fails, but the callout is taken before
+each backtrack happens (in this example, 10 times).
+.
+.SS "Verbs that act after backtracking"
+.rs
+.sp
+The following verbs do nothing when they are encountered. Matching continues
+with what follows, but if there is no subsequent match, a failure is forced.
+The verbs differ in exactly what kind of failure occurs.
+.sp
+ (*COMMIT)
+.sp
+This verb causes the whole match to fail outright if the rest of the pattern
+does not match. Even if the pattern is unanchored, no further attempts to find
+a match by advancing the start point take place. Once (*COMMIT) has been
+passed, \fBpcre_exec()\fP is committed to finding a match at the current
+starting point, or not at all. For example:
+.sp
+ a+(*COMMIT)b
+.sp
+This matches "xxaab" but not "aacaab". It can be thought of as a kind of
+dynamic anchor, or "I've started, so I must finish."
+.sp
+ (*PRUNE)
+.sp
+This verb causes the match to fail at the current position if the rest of the
+pattern does not match. If the pattern is unanchored, the normal "bumpalong"
+advance to the next starting character then happens. Backtracking can occur as
+usual to the left of (*PRUNE), or when matching to the right of (*PRUNE), but
+if there is no match to the right, backtracking cannot cross (*PRUNE).
+In simple cases, the use of (*PRUNE) is just an alternative to an atomic
+group or possessive quantifier, but there are some uses of (*PRUNE) that cannot
+be expressed in any other way.
+.sp
+ (*SKIP)
+.sp
+This verb is like (*PRUNE), except that if the pattern is unanchored, the
+"bumpalong" advance is not to the next character, but to the position in the
+subject where (*SKIP) was encountered. (*SKIP) signifies that whatever text
+was matched leading up to it cannot be part of a successful match. Consider:
+.sp
+ a+(*SKIP)b
+.sp
+If the subject is "aaaac...", after the first match attempt fails (starting at
+the first character in the string), the starting point skips on to start the
+next attempt at "c". Note that a possessive quantifier does not have the same
+effect in this example; although it would suppress backtracking during the
+first match attempt, the second attempt would start at the second character
+instead of skipping on to "c".
+.sp
+ (*THEN)
+.sp
+This verb causes a skip to the next alternation if the rest of the pattern does
+not match. That is, it cancels pending backtracking, but only within the
+current alternation. Its name comes from the observation that it can be used
+for a pattern-based if-then-else block:
+.sp
+ ( COND1 (*THEN) FOO | COND2 (*THEN) BAR | COND3 (*THEN) BAZ ) ...
+.sp
+If the COND1 pattern matches, FOO is tried (and possibly further items after
+the end of the group if FOO succeeds); on failure the matcher skips to the
+second alternative and tries COND2, without backtracking into COND1. If (*THEN)
+is used outside of any alternation, it acts exactly like (*PRUNE).
+.
+.
+.SH "SEE ALSO"
+.rs
+.sp
+\fBpcreapi\fP(3), \fBpcrecallout\fP(3), \fBpcrematching\fP(3), \fBpcre\fP(3).
+.
+.
+.SH AUTHOR
+.rs
+.sp
+.nf
+Philip Hazel
+University Computing Service
+Cambridge CB2 3QH, England.
+.fi
+.
+.
+.SH REVISION
+.rs
+.sp
+.nf
+Last updated: 19 April 2008
+Copyright (c) 1997-2008 University of Cambridge.
+.fi
diff --git a/src/doc/pcreperform.3 b/src/doc/pcreperform.3
new file mode 100644
index 0000000..915f7b7
--- /dev/null
+++ b/src/doc/pcreperform.3
@@ -0,0 +1,153 @@
+.TH PCREPERFORM 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH "PCRE PERFORMANCE"
+.rs
+.sp
+Two aspects of performance are discussed below: memory usage and processing
+time. The way you express your pattern as a regular expression can affect both
+of them.
+.
+.SH "MEMORY USAGE"
+.rs
+.sp
+Patterns are compiled by PCRE into a reasonably efficient byte code, so that
+most simple patterns do not use much memory. However, there is one case where
+memory usage can be unexpectedly large. When a parenthesized subpattern has a
+quantifier with a minimum greater than 1 and/or a limited maximum, the whole
+subpattern is repeated in the compiled code. For example, the pattern
+.sp
+ (abc|def){2,4}
+.sp
+is compiled as if it were
+.sp
+ (abc|def)(abc|def)((abc|def)(abc|def)?)?
+.sp
+(Technical aside: It is done this way so that backtrack points within each of
+the repetitions can be independently maintained.)
+.P
+For regular expressions whose quantifiers use only small numbers, this is not
+usually a problem. However, if the numbers are large, and particularly if such
+repetitions are nested, the memory usage can become an embarrassment. For
+example, the very simple pattern
+.sp
+ ((ab){1,1000}c){1,3}
+.sp
+uses 51K bytes when compiled. When PCRE is compiled with its default internal
+pointer size of two bytes, the size limit on a compiled pattern is 64K, and
+this is reached with the above pattern if the outer repetition is increased
+from 3 to 4. PCRE can be compiled to use larger internal pointers and thus
+handle larger compiled patterns, but it is better to try to rewrite your
+pattern to use less memory if you can.
+.P
+One way of reducing the memory usage for such patterns is to make use of PCRE's
+.\" HTML
+.\"
+"subroutine"
+.\"
+facility. Re-writing the above pattern as
+.sp
+ ((ab)(?2){0,999}c)(?1){0,2}
+.sp
+reduces the memory requirements to 18K, and indeed it remains under 20K even
+with the outer repetition increased to 100. However, this pattern is not
+exactly equivalent, because the "subroutine" calls are treated as
+.\" HTML
+.\"
+atomic groups
+.\"
+into which there can be no backtracking if there is a subsequent matching
+failure. Therefore, PCRE cannot do this kind of rewriting automatically.
+Furthermore, there is a noticeable loss of speed when executing the modified
+pattern. Nevertheless, if the atomic grouping is not a problem and the loss of
+speed is acceptable, this kind of rewriting will allow you to process patterns
+that PCRE cannot otherwise handle.
+.
+.SH "PROCESSING TIME"
+.rs
+.sp
+Certain items in regular expression patterns are processed more efficiently
+than others. It is more efficient to use a character class like [aeiou] than a
+set of single-character alternatives such as (a|e|i|o|u). In general, the
+simplest construction that provides the required behaviour is usually the most
+efficient. Jeffrey Friedl's book contains a lot of useful general discussion
+about optimizing regular expressions for efficient performance. This document
+contains a few observations about PCRE.
+.P
+Using Unicode character properties (the \ep, \eP, and \eX escapes) is slow,
+because PCRE has to scan a structure that contains data for over fifteen
+thousand characters whenever it needs a character's property. If you can find
+an alternative pattern that does not use character properties, it will probably
+be faster.
+.P
+When a pattern begins with .* not in parentheses, or in parentheses that are
+not the subject of a backreference, and the PCRE_DOTALL option is set, the
+pattern is implicitly anchored by PCRE, since it can match only at the start of
+a subject string. However, if PCRE_DOTALL is not set, PCRE cannot make this
+optimization, because the . metacharacter does not then match a newline, and if
+the subject string contains newlines, the pattern may match from the character
+immediately following one of them instead of from the very start. For example,
+the pattern
+.sp
+ .*second
+.sp
+matches the subject "first\enand second" (where \en stands for a newline
+character), with the match starting at the seventh character. In order to do
+this, PCRE has to retry the match starting after every newline in the subject.
+.P
+If you are using such a pattern with subject strings that do not contain
+newlines, the best performance is obtained by setting PCRE_DOTALL, or starting
+the pattern with ^.* or ^.*? to indicate explicit anchoring. That saves PCRE
+from having to scan along the subject looking for a newline to restart at.
+.P
+Beware of patterns that contain nested indefinite repeats. These can take a
+long time to run when applied to a string that does not match. Consider the
+pattern fragment
+.sp
+ ^(a+)*
+.sp
+This can match "aaaa" in 16 different ways, and this number increases very
+rapidly as the string gets longer. (The * repeat can match 0, 1, 2, 3, or 4
+times, and for each of those cases other than 0 or 4, the + repeats can match
+different numbers of times.) When the remainder of the pattern is such that the
+entire match is going to fail, PCRE has in principle to try every possible
+variation, and this can take an extremely long time, even for relatively short
+strings.
+.P
+An optimization catches some of the more simple cases such as
+.sp
+ (a+)*b
+.sp
+where a literal character follows. Before embarking on the standard matching
+procedure, PCRE checks that there is a "b" later in the subject string, and if
+there is not, it fails the match immediately. However, when there is no
+following literal this optimization cannot be used. You can see the difference
+by comparing the behaviour of
+.sp
+ (a+)*\ed
+.sp
+with the pattern above. The pattern that ends with "b" fails almost instantly
+when applied to a whole line of "a" characters, whereas the one that ends with
+\ed takes an appreciable time with strings longer than about 20 characters.
+.P
+In many cases, the solution to this kind of performance issue is to use an
+atomic group or a possessive quantifier.
+.
+.
+.SH AUTHOR
+.rs
+.sp
+.nf
+Philip Hazel
+University Computing Service
+Cambridge CB2 3QH, England.
+.fi
+.
+.
+.SH REVISION
+.rs
+.sp
+.nf
+Last updated: 06 March 2007
+Copyright (c) 1997-2007 University of Cambridge.
+.fi
diff --git a/src/doc/pcreposix.3 b/src/doc/pcreposix.3
new file mode 100644
index 0000000..7391e29
--- /dev/null
+++ b/src/doc/pcreposix.3
@@ -0,0 +1,238 @@
+.TH PCREPOSIX 3
+.SH NAME
+PCRE - Perl-compatible regular expressions.
+.SH "SYNOPSIS OF POSIX API"
+.rs
+.sp
+.B #include <pcreposix.h>
+.PP
+.SM
+.B int regcomp(regex_t *\fIpreg\fP, const char *\fIpattern\fP,
+.ti +5n
+.B int \fIcflags\fP);
+.PP
+.B int regexec(regex_t *\fIpreg\fP, const char *\fIstring\fP,
+.ti +5n
+.B size_t \fInmatch\fP, regmatch_t \fIpmatch\fP[], int \fIeflags\fP);
+.PP
+.B size_t regerror(int \fIerrcode\fP, const regex_t *\fIpreg\fP,
+.ti +5n
+.B char *\fIerrbuf\fP, size_t \fIerrbuf_size\fP);
+.PP
+.B void regfree(regex_t *\fIpreg\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This set of functions provides a POSIX-style API to the PCRE regular expression
+package. See the
+.\" HREF
+\fBpcreapi\fP
+.\"
+documentation for a description of PCRE's native API, which contains much
+additional functionality.
+.P
+The functions described here are just wrapper functions that ultimately call
+the PCRE native API. Their prototypes are defined in the \fBpcreposix.h\fP
+header file, and on Unix systems the library itself is called
+\fBpcreposix.a\fP, so can be accessed by adding \fB-lpcreposix\fP to the
+command for linking an application that uses them. Because the POSIX functions
+call the native ones, it is also necessary to add \fB-lpcre\fP.
+.P
+I have implemented only those option bits that can be reasonably mapped to PCRE
+native options. In addition, the option REG_EXTENDED is defined with the value
+zero. This has no effect, but since programs that are written to the POSIX
+interface often use it, this makes it easier to slot in PCRE as a replacement
+library. Other POSIX options are not even defined.
+.P
+When PCRE is called via these functions, it is only the API that is POSIX-like
+in style. The syntax and semantics of the regular expressions themselves are
+still those of Perl, subject to the setting of various PCRE options, as
+described below. "POSIX-like in style" means that the API approximates to the
+POSIX definition; it is not fully POSIX-compatible, and in multi-byte encoding
+domains it is probably even less compatible.
+.P
+The header for these functions is supplied as \fBpcreposix.h\fP to avoid any
+potential clash with other POSIX libraries. It can, of course, be renamed or
+aliased as \fBregex.h\fP, which is the "correct" name. It provides two
+structure types, \fIregex_t\fP for compiled internal forms, and
+\fIregmatch_t\fP for returning captured substrings. It also defines some
+constants whose names start with "REG_"; these are used for setting options and
+identifying error codes.
+.P
+.SH "COMPILING A PATTERN"
+.rs
+.sp
+The function \fBregcomp()\fP is called to compile a pattern into an
+internal form. The pattern is a C string terminated by a binary zero, and
+is passed in the argument \fIpattern\fP. The \fIpreg\fP argument is a pointer
+to a \fBregex_t\fP structure that is used as a base for storing information
+about the compiled regular expression.
+.P
+The argument \fIcflags\fP is either zero, or contains one or more of the bits
+defined by the following macros:
+.sp
+ REG_DOTALL
+.sp
+The PCRE_DOTALL option is set when the regular expression is passed for
+compilation to the native function. Note that REG_DOTALL is not part of the
+POSIX standard.
+.sp
+ REG_ICASE
+.sp
+The PCRE_CASELESS option is set when the regular expression is passed for
+compilation to the native function.
+.sp
+ REG_NEWLINE
+.sp
+The PCRE_MULTILINE option is set when the regular expression is passed for
+compilation to the native function. Note that this does \fInot\fP mimic the
+defined POSIX behaviour for REG_NEWLINE (see the following section).
+.sp
+ REG_NOSUB
+.sp
+The PCRE_NO_AUTO_CAPTURE option is set when the regular expression is passed
+for compilation to the native function. In addition, when a pattern that is
+compiled with this flag is passed to \fBregexec()\fP for matching, the
+\fInmatch\fP and \fIpmatch\fP arguments are ignored, and no captured strings
+are returned.
+.sp
+ REG_UTF8
+.sp
+The PCRE_UTF8 option is set when the regular expression is passed for
+compilation to the native function. This causes the pattern itself and all data
+strings used for matching it to be treated as UTF-8 strings. Note that REG_UTF8
+is not part of the POSIX standard.
+.P
+In the absence of these flags, no options are passed to the native function.
+This means that the regex is compiled with PCRE default semantics. In
+particular, the way it handles newline characters in the subject string is the
+Perl way, not the POSIX way. Note that setting PCRE_MULTILINE has only
+\fIsome\fP of the effects specified for REG_NEWLINE. It does not affect the way
+newlines are matched by . (they aren't) or by a negative class such as [^a]
+(they are).
+.P
+The yield of \fBregcomp()\fP is zero on success, and non-zero otherwise. The
+\fIpreg\fP structure is filled in on success, and one member of the structure
+is public: \fIre_nsub\fP contains the number of capturing subpatterns in
+the regular expression. Various error codes are defined in the header file.
+.
+.
+.SH "MATCHING NEWLINE CHARACTERS"
+.rs
+.sp
+This area is not simple, because POSIX and Perl take different views of things.
+It is not possible to get PCRE to obey POSIX semantics, but then PCRE was never
+intended to be a POSIX engine. The following table lists the different
+possibilities for matching newline characters in PCRE:
+.sp
+ Default Change with
+.sp
+ . matches newline no PCRE_DOTALL
+ newline matches [^a] yes not changeable
+ $ matches \en at end yes PCRE_DOLLAR_ENDONLY
+ $ matches \en in middle no PCRE_MULTILINE
+ ^ matches \en in middle no PCRE_MULTILINE
+.sp
+This is the equivalent table for POSIX:
+.sp
+ Default Change with
+.sp
+ . matches newline yes REG_NEWLINE
+ newline matches [^a] yes REG_NEWLINE
+ $ matches \en at end no REG_NEWLINE
+ $ matches \en in middle no REG_NEWLINE
+ ^ matches \en in middle no REG_NEWLINE
+.sp
+PCRE's behaviour is the same as Perl's, except that there is no equivalent for
+PCRE_DOLLAR_ENDONLY in Perl. In both PCRE and Perl, there is no way to stop
+newline from matching [^a].
+.P
+The default POSIX newline handling can be obtained by setting PCRE_DOTALL and
+PCRE_DOLLAR_ENDONLY, but there is no way to make PCRE behave exactly as for the
+REG_NEWLINE action.
+.
+.
+.SH "MATCHING A PATTERN"
+.rs
+.sp
+The function \fBregexec()\fP is called to match a compiled pattern \fIpreg\fP
+against a given \fIstring\fP, which is by default terminated by a zero byte
+(but see REG_STARTEND below), subject to the options in \fIeflags\fP. These can
+be:
+.sp
+ REG_NOTBOL
+.sp
+The PCRE_NOTBOL option is set when calling the underlying PCRE matching
+function.
+.sp
+ REG_NOTEOL
+.sp
+The PCRE_NOTEOL option is set when calling the underlying PCRE matching
+function.
+.sp
+ REG_STARTEND
+.sp
+The string is considered to start at \fIstring\fP + \fIpmatch[0].rm_so\fP and
+to have a terminating NUL located at \fIstring\fP + \fIpmatch[0].rm_eo\fP
+(there need not actually be a NUL at that location), regardless of the value of
+\fInmatch\fP. This is a BSD extension, compatible with but not specified by
+IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software
+intended to be portable to other systems. Note that a non-zero \fIrm_so\fP does
+not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not
+how it is matched.
+.P
+If the pattern was compiled with the REG_NOSUB flag, no data about any matched
+strings is returned. The \fInmatch\fP and \fIpmatch\fP arguments of
+\fBregexec()\fP are ignored.
+.P
+Otherwise, the portion of the string that was matched, and also any captured
+substrings, are returned via the \fIpmatch\fP argument, which points to an
+array of \fInmatch\fP structures of type \fIregmatch_t\fP, containing the
+members \fIrm_so\fP and \fIrm_eo\fP. These contain the offset to the first
+character of each substring and the offset to the first character after the end
+of each substring, respectively. The 0th element of the vector relates to the
+entire portion of \fIstring\fP that was matched; subsequent elements relate to
+the capturing subpatterns of the regular expression. Unused entries in the
+array have both structure members set to -1.
+.P
+A successful match yields a zero return; various error codes are defined in the
+header file, of which REG_NOMATCH is the "expected" failure code.
+.
+.
+.SH "ERROR MESSAGES"
+.rs
+.sp
+The \fBregerror()\fP function maps a non-zero errorcode from either
+\fBregcomp()\fP or \fBregexec()\fP to a printable message. If \fIpreg\fP is not
+NULL, the error should have arisen from the use of that structure. A message
+terminated by a binary zero is placed in \fIerrbuf\fP. The length of the
+message, including the zero, is limited to \fIerrbuf_size\fP. The yield of the
+function is the size of buffer needed to hold the whole message.
+.
+.
+.SH MEMORY USAGE
+.rs
+.sp
+Compiling a regular expression causes memory to be allocated and associated
+with the \fIpreg\fP structure. The function \fBregfree()\fP frees all such
+memory, after which \fIpreg\fP may no longer be used as a compiled expression.
+.
+.
+.SH AUTHOR
+.rs
+.sp
+.nf
+Philip Hazel
+University Computing Service
+Cambridge CB2 3QH, England.
+.fi
+.
+.
+.SH REVISION
+.rs
+.sp
+.nf
+Last updated: 05 April 2008
+Copyright (c) 1997-2008 University of Cambridge.
+.fi
diff --git a/src/doc/pcreprecompile.3 b/src/doc/pcreprecompile.3
new file mode 100644
index 0000000..aa52542
--- /dev/null
+++ b/src/doc/pcreprecompile.3
@@ -0,0 +1,142 @@
+.TH PCREPRECOMPILE 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH "SAVING AND RE-USING PRECOMPILED PCRE PATTERNS"
+.rs
+.sp
+If you are running an application that uses a large number of regular
+expression patterns, it may be useful to store them in a precompiled form
+instead of having to compile them every time the application is run.
+If you are not using any private character tables (see the
+.\" HREF
+\fBpcre_maketables()\fP
+.\"
+documentation), this is relatively straightforward. If you are using private
+tables, it is a little bit more complicated.
+.P
+If you save compiled patterns to a file, you can copy them to a different host
+and run them there. This works even if the new host has the opposite endianness
+to the one on which the patterns were compiled. There may be a small
+performance penalty, but it should be insignificant. However, compiling regular
+expressions with one version of PCRE for use with a different version is not
+guaranteed to work and may cause crashes.
+.
+.
+.SH "SAVING A COMPILED PATTERN"
+.rs
+.sp
+The value returned by \fBpcre_compile()\fP points to a single block of memory
+that holds the compiled pattern and associated data. You can find the length of
+this block in bytes by calling \fBpcre_fullinfo()\fP with an argument of
+PCRE_INFO_SIZE. You can then save the data in any appropriate manner. Here is
+sample code that compiles a pattern and writes it to a file. It assumes that
+the variable \fIfd\fP refers to a file that is open for output:
+.sp
+ int erroroffset, rc, size;
+ char *error;
+ pcre *re;
+.sp
+ re = pcre_compile("my pattern", 0, &error, &erroroffset, NULL);
+ if (re == NULL) { ... handle errors ... }
+ rc = pcre_fullinfo(re, NULL, PCRE_INFO_SIZE, &size);
+ if (rc < 0) { ... handle errors ... }
+ rc = fwrite(re, 1, size, fd);
+ if (rc != size) { ... handle errors ... }
+.sp
+In this example, the bytes that comprise the compiled pattern are copied
+exactly. Note that this is binary data that may contain any of the 256 possible
+byte values. On systems that make a distinction between binary and non-binary
+data, be sure that the file is opened for binary output.
+.P
+If you want to write more than one pattern to a file, you will have to devise a
+way of separating them. For binary data, preceding each pattern with its length
+is probably the most straightforward approach. Another possibility is to write
+out the data in hexadecimal instead of binary, one pattern to a line.
+.P
+Saving compiled patterns in a file is only one possible way of storing them for
+later use. They could equally well be saved in a database, or in the memory of
+some daemon process that passes them via sockets to the processes that want
+them.
+.P
+If the pattern has been studied, it is also possible to save the study data in
+a similar way to the compiled pattern itself. When studying generates
+additional information, \fBpcre_study()\fP returns a pointer to a
+\fBpcre_extra\fP data block. Its format is defined in the
+.\" HTML
+.\"
+section on matching a pattern
+.\"
+in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+documentation. The \fIstudy_data\fP field points to the binary study data, and
+this is what you must save (not the \fBpcre_extra\fP block itself). The length
+of the study data can be obtained by calling \fBpcre_fullinfo()\fP with an
+argument of PCRE_INFO_STUDYSIZE. Remember to check that \fBpcre_study()\fP did
+return a non-NULL value before trying to save the study data.
+.
+.
+.SH "RE-USING A PRECOMPILED PATTERN"
+.rs
+.sp
+Re-using a precompiled pattern is straightforward. Having reloaded it into main
+memory, you pass its pointer to \fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP in
+the usual way. This should work even on another host, and even if that host has
+the opposite endianness to the one where the pattern was compiled.
+.P
+However, if you passed a pointer to custom character tables when the pattern
+was compiled (the \fItableptr\fP argument of \fBpcre_compile()\fP), you must
+now pass a similar pointer to \fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP,
+because the value saved with the compiled pattern will obviously be nonsense. A
+field in a \fBpcre_extra\fP block is used to pass this data, as described in
+the
+.\" HTML
+.\"
+section on matching a pattern
+.\"
+in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+documentation.
+.P
+If you did not provide custom character tables when the pattern was compiled,
+the pointer in the compiled pattern is NULL, which causes \fBpcre_exec()\fP to
+use PCRE's internal tables. Thus, you do not need to take any special action at
+run time in this case.
+.P
+If you saved study data with the compiled pattern, you need to create your own
+\fBpcre_extra\fP data block and set the \fIstudy_data\fP field to point to the
+reloaded study data. You must also set the PCRE_EXTRA_STUDY_DATA bit in the
+\fIflags\fP field to indicate that study data is present. Then pass the
+\fBpcre_extra\fP block to \fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP in the
+usual way.
+.
+.
+.SH "COMPATIBILITY WITH DIFFERENT PCRE RELEASES"
+.rs
+.sp
+In general, it is safest to recompile all saved patterns when you update to a
+new PCRE release, though not all updates actually require this. Recompiling is
+definitely needed for release 7.2.
+.
+.
+.
+.SH AUTHOR
+.rs
+.sp
+.nf
+Philip Hazel
+University Computing Service
+Cambridge CB2 3QH, England.
+.fi
+.
+.
+.SH REVISION
+.rs
+.sp
+.nf
+Last updated: 13 June 2007
+Copyright (c) 1997-2007 University of Cambridge.
+.fi
diff --git a/src/doc/pcresample.3 b/src/doc/pcresample.3
new file mode 100644
index 0000000..d27690a
--- /dev/null
+++ b/src/doc/pcresample.3
@@ -0,0 +1,80 @@
+.TH PCRESAMPLE 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH "PCRE SAMPLE PROGRAM"
+.rs
+.sp
+A simple, complete demonstration program, to get you started with using PCRE,
+is supplied in the file \fIpcredemo.c\fP in the PCRE distribution.
+.P
+The program compiles the regular expression that is its first argument, and
+matches it against the subject string in its second argument. No PCRE options
+are set, and default character tables are used. If matching succeeds, the
+program outputs the portion of the subject that matched, together with the
+contents of any captured substrings.
+.P
+If the -g option is given on the command line, the program then goes on to
+check for further matches of the same regular expression in the same subject
+string. The logic is a little bit tricky because of the possibility of matching
+an empty string. Comments in the code explain what is going on.
+.P
+If PCRE is installed in the standard include and library directories for your
+system, you should be able to compile the demonstration program using this
+command:
+.sp
+ gcc -o pcredemo pcredemo.c -lpcre
+.sp
+If PCRE is installed elsewhere, you may need to add additional options to the
+command line. For example, on a Unix-like system that has PCRE installed in
+\fI/usr/local\fP, you can compile the demonstration program using a command
+like this:
+.sp
+.\" JOINSH
+ gcc -o pcredemo -I/usr/local/include pcredemo.c \e
+ -L/usr/local/lib -lpcre
+.sp
+Once you have compiled the demonstration program, you can run simple tests like
+this:
+.sp
+ ./pcredemo 'cat|dog' 'the cat sat on the mat'
+ ./pcredemo -g 'cat|dog' 'the dog sat on the cat'
+.sp
+Note that there is a much more comprehensive test program, called
+.\" HREF
+\fBpcretest\fP,
+.\"
+which supports many more facilities for testing regular expressions and the
+PCRE library. The \fBpcredemo\fP program is provided as a simple coding
+example.
+.P
+On some operating systems (e.g. Solaris), when PCRE is not installed in the
+standard library directory, you may get an error like this when you try to run
+\fBpcredemo\fP:
+.sp
+ ld.so.1: a.out: fatal: libpcre.so.0: open failed: No such file or directory
+.sp
+This is caused by the way shared library support works on those systems. You
+need to add
+.sp
+ -R/usr/local/lib
+.sp
+(for example) to the compile command to get round this problem.
+.
+.
+.SH AUTHOR
+.rs
+.sp
+.nf
+Philip Hazel
+University Computing Service
+Cambridge CB2 3QH, England.
+.fi
+.
+.
+.SH REVISION
+.rs
+.sp
+.nf
+Last updated: 23 January 2008
+Copyright (c) 1997-2008 University of Cambridge.
+.fi
diff --git a/src/doc/pcrestack.3 b/src/doc/pcrestack.3
new file mode 100644
index 0000000..7e9bfc9
--- /dev/null
+++ b/src/doc/pcrestack.3
@@ -0,0 +1,140 @@
+.TH PCRESTACK 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH "PCRE DISCUSSION OF STACK USAGE"
+.rs
+.sp
+When you call \fBpcre_exec()\fP, it makes use of an internal function called
+\fBmatch()\fP. This calls itself recursively at branch points in the pattern,
+in order to remember the state of the match so that it can back up and try a
+different alternative if the first one fails. As matching proceeds deeper and
+deeper into the tree of possibilities, the recursion depth increases.
+.P
+Not all calls of \fBmatch()\fP increase the recursion depth; for an item such
+as a* it may be called several times at the same level, after matching
+different numbers of a's. Furthermore, in a number of cases where the result of
+the recursive call would immediately be passed back as the result of the
+current call (a "tail recursion"), the function is just restarted instead.
+.P
+The \fBpcre_dfa_exec()\fP function operates in an entirely different way, and
+hardly uses recursion at all. The limit on its complexity is the amount of
+workspace it is given. The comments that follow do NOT apply to
+\fBpcre_dfa_exec()\fP; they are relevant only for \fBpcre_exec()\fP.
+.P
+You can set limits on the number of times that \fBmatch()\fP is called, both in
+total and recursively. If the limit is exceeded, an error occurs. For details,
+see the
+.\" HTML
+.\"
+section on extra data for \fBpcre_exec()\fP
+.\"
+in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+documentation.
+.P
+Each time that \fBmatch()\fP is actually called recursively, it uses memory
+from the process stack. For certain kinds of pattern and data, very large
+amounts of stack may be needed, despite the recognition of "tail recursion".
+You can often reduce the amount of recursion, and therefore the amount of stack
+used, by modifying the pattern that is being matched. Consider, for example,
+this pattern:
+.sp
+ ([^<]|<(?!inet))+
+.sp
+It matches from wherever it starts until it encounters "<inet" or the end of the
+data.
+ (?<name>...) named capturing group (Perl)
+ (?'name'...) named capturing group (Perl)
+ (?P<name>...) named capturing group (Python)
+ (?:...) non-capturing group
+ (?|...) non-capturing group; reset group numbers for
+ capturing groups in each alternative
+.
+.
+.SH "ATOMIC GROUPS"
+.rs
+.sp
+ (?>...) atomic, non-capturing group
+.
+.
+.
+.
+.SH "COMMENT"
+.rs
+.sp
+ (?#....) comment (not nestable)
+.
+.
+.SH "OPTION SETTING"
+.rs
+.sp
+ (?i) caseless
+ (?J) allow duplicate names
+ (?m) multiline
+ (?s) single line (dotall)
+ (?U) default ungreedy (lazy)
+ (?x) extended (ignore white space)
+ (?-...) unset option(s)
+.
+.
+.SH "LOOKAHEAD AND LOOKBEHIND ASSERTIONS"
+.rs
+.sp
+ (?=...) positive look ahead
+ (?!...) negative look ahead
+ (?<=...) positive look behind
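+ (?<!...) negative look behind
+.
+.
+.SH "BACKREFERENCES"
+.rs
+.sp
+ \en reference by number (can be ambiguous)
+ \egn reference by number
+ \eg{n} reference by number
+ \eg{-n} relative reference by number
+ \ek<name> reference by name (Perl)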
+ \ek'name' reference by name (Perl)
+ \eg{name} reference by name (Perl)
+ \ek{name} reference by name (.NET)
+ (?P=name) reference by name (Python)
+.
+.
+.SH "SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)"
+.rs
+.sp
+ (?R) recurse whole pattern
+ (?n) call subpattern by absolute number
+ (?+n) call subpattern by relative number
+ (?-n) call subpattern by relative number
+ (?&name) call subpattern by name (Perl)
+ (?P>name) call subpattern by name (Python)
+ \eg<name> call subpattern by name (Oniguruma)
+ \eg'name' call subpattern by name (Oniguruma)
+ \eg<n> call subpattern by absolute number (Oniguruma)
+ \eg'n' call subpattern by absolute number (Oniguruma)
+ \eg<+n> call subpattern by relative number (PCRE extension)
+ \eg'+n' call subpattern by relative number (PCRE extension)
+ \eg<-n> call subpattern by relative number (PCRE extension)
+ \eg'-n' call subpattern by relative number (PCRE extension)
+.
+.
+.SH "CONDITIONAL PATTERNS"
+.rs
+.sp
+ (?(condition)yes-pattern)
+ (?(condition)yes-pattern|no-pattern)
+.sp
+ (?(n)... absolute reference condition
+ (?(+n)... relative reference condition
+ (?(-n)... relative reference condition
+ (?(<name>)... named reference condition (Perl)
+ (?('name')... named reference condition (Perl)
+ (?(name)... named reference condition (PCRE)
+ (?(R)... overall recursion condition
+ (?(Rn)... specific group recursion condition
+ (?(R&name)... specific recursion condition
+ (?(DEFINE)... define subpattern for reference
+ (?(assert)... assertion condition
+.
+.
+.SH "BACKTRACKING CONTROL"
+.rs
+.sp
+The following act immediately they are reached:
+.sp
+ (*ACCEPT) force successful match
+ (*FAIL) force backtrack; synonym (*F)
+.sp
+The following act only when a subsequent match failure causes a backtrack to
+reach them. They all force a match failure, but they differ in what happens
+afterwards. Those that advance the start-of-match point do so only if the
+pattern is not anchored.
+.sp
+ (*COMMIT) overall failure, no advance of starting point
+ (*PRUNE) advance to next starting character
+ (*SKIP) advance start to current matching position
+ (*THEN) local failure, backtrack to next alternation
+.
+.
+.SH "NEWLINE CONVENTIONS"
+.rs
+.sp
+These are recognized only at the very start of the pattern or after a
+(*BSR_...) option.
+.sp
+ (*CR)
+ (*LF)
+ (*CRLF)
+ (*ANYCRLF)
+ (*ANY)
+.
+.
+.SH "WHAT \eR MATCHES"
+.rs
+.sp
+These are recognized only at the very start of the pattern or after a
+(*...) option that sets the newline convention.
+.sp
+ (*BSR_ANYCRLF)
+ (*BSR_UNICODE)
+.
+.
+.SH "CALLOUTS"
+.rs
+.sp
+ (?C) callout
+ (?Cn) callout with data n
+.
+.
+.SH "SEE ALSO"
+.rs
+.sp
+\fBpcrepattern\fP(3), \fBpcreapi\fP(3), \fBpcrecallout\fP(3),
+\fBpcrematching\fP(3), \fBpcre\fP(3).
+.
+.
+.SH AUTHOR
+.rs
+.sp
+.nf
+Philip Hazel
+University Computing Service
+Cambridge CB2 3QH, England.
+.fi
+.
+.
+.SH REVISION
+.rs
+.sp
+.nf
+Last updated: 09 April 2008
+Copyright (c) 1997-2008 University of Cambridge.
+.fi
diff --git a/src/doc/pcretest.1 b/src/doc/pcretest.1
new file mode 100644
index 0000000..8efdbe6
--- /dev/null
+++ b/src/doc/pcretest.1
@@ -0,0 +1,723 @@
+.TH PCRETEST 1
+.SH NAME
+pcretest - a program for testing Perl-compatible regular expressions.
+.SH SYNOPSIS
+.rs
+.sp
+.B pcretest "[options] [source] [destination]"
+.sp
+\fBpcretest\fP was written as a test program for the PCRE regular expression
+library itself, but it can also be used for experimenting with regular
+expressions. This document describes the features of the test program; for
+details of the regular expressions themselves, see the
+.\" HREF
+\fBpcrepattern\fP
+.\"
+documentation. For details of the PCRE library function calls and their
+options, see the
+.\" HREF
+\fBpcreapi\fP
+.\"
+documentation.
+.
+.
+.SH OPTIONS
+.rs
+.TP 10
+\fB-b\fP
+Behave as if each regex has the \fB/B\fP (show bytecode) modifier; the internal
+form is output after compilation.
+.TP 10
+\fB-C\fP
+Output the version number of the PCRE library, and all available information
+about the optional features that are included, and then exit.
+.TP 10
+\fB-d\fP
+Behave as if each regex has the \fB/D\fP (debug) modifier; the internal
+form and information about the compiled pattern is output after compilation;
+\fB-d\fP is equivalent to \fB-b -i\fP.
+.TP 10
+\fB-dfa\fP
+Behave as if each data line contains the \eD escape sequence; this causes the
+alternative matching function, \fBpcre_dfa_exec()\fP, to be used instead of the
+standard \fBpcre_exec()\fP function (more detail is given below).
+.TP 10
+\fB-help\fP
+Output a brief summary of these options and then exit.
+.TP 10
+\fB-i\fP
+Behave as if each regex has the \fB/I\fP modifier; information about the
+compiled pattern is given after compilation.
+.TP 10
+\fB-m\fP
+Output the size of each compiled pattern after it has been compiled. This is
+equivalent to adding \fB/M\fP to each regular expression. For compatibility
+with earlier versions of pcretest, \fB-s\fP is a synonym for \fB-m\fP.
+.TP 10
+\fB-o\fP \fIosize\fP
+Set the number of elements in the output vector that is used when calling
+\fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP to be \fIosize\fP. The default value
+is 45, which is enough for 14 capturing subexpressions for \fBpcre_exec()\fP or
+22 different matches for \fBpcre_dfa_exec()\fP. The vector size can be
+changed for individual matching calls by including \eO in the data line (see
+below).
+.TP 10
+\fB-p\fP
+Behave as if each regex has the \fB/P\fP modifier; the POSIX wrapper API is
+used to call PCRE. None of the other options has any effect when \fB-p\fP is
+set.
+.TP 10
+\fB-q\fP
+Do not output the version number of \fBpcretest\fP at the start of execution.
+.TP 10
+\fB-S\fP \fIsize\fP
+On Unix-like systems, set the size of the runtime stack to \fIsize\fP
+megabytes.
+.TP 10
+\fB-t\fP
+Run each compile, study, and match many times with a timer, and output
+resulting time per compile or match (in milliseconds). Do not set \fB-m\fP with
+\fB-t\fP, because you will then get the size output a zillion times, and the
+timing will be distorted. You can control the number of iterations that are
+used for timing by following \fB-t\fP with a number (as a separate item on the
+command line). For example, "-t 1000" would iterate 1000 times. The default is
+to iterate 500000 times.
+.TP 10
+\fB-tm\fP
+This is like \fB-t\fP except that it times only the matching phase, not the
+compile or study phases.
+.
+.
+.SH DESCRIPTION
+.rs
+.sp
+If \fBpcretest\fP is given two filename arguments, it reads from the first and
+writes to the second. If it is given only one filename argument, it reads from
+that file and writes to stdout. Otherwise, it reads from stdin and writes to
+stdout, and prompts for each line of input, using "re>" to prompt for regular
+expressions, and "data>" to prompt for data lines.
+.P
+When \fBpcretest\fP is built, a configuration option can specify that it should
+be linked with the \fBlibreadline\fP library. When this is done, if the input
+is from a terminal, it is read using the \fBreadline()\fP function. This
+provides line-editing and history facilities. The output from the \fB-help\fP
+option states whether or not \fBreadline()\fP will be used.
+.P
+The program handles any number of sets of input on a single input file. Each
+set starts with a regular expression, and continues with any number of data
+lines to be matched against the pattern.
+.P
+Each data line is matched separately and independently. If you want to do
+multi-line matches, you have to use the \en escape sequence (or \er or \er\en,
+etc., depending on the newline setting) in a single line of input to encode the
+newline sequences. There is no limit on the length of data lines; the input
+buffer is automatically extended if it is too small.
+.P
+An empty line signals the end of the data lines, at which point a new regular
+expression is read. The regular expressions are given enclosed in any
+non-alphanumeric delimiters other than backslash, for example:
+.sp
+ /(a|bc)x+yz/
+.sp
+White space before the initial delimiter is ignored. A regular expression may
+be continued over several input lines, in which case the newline characters are
+included within it. It is possible to include the delimiter within the pattern
+by escaping it, for example
+.sp
+ /abc\e/def/
+.sp
+If you do so, the escape and the delimiter form part of the pattern, but since
+delimiters are always non-alphanumeric, this does not affect its interpretation.
+If the terminating delimiter is immediately followed by a backslash, for
+example,
+.sp
+ /abc/\e
+.sp
+then a backslash is added to the end of the pattern. This is done to provide a
+way of testing the error condition that arises if a pattern finishes with a
+backslash, because
+.sp
+ /abc\e/
+.sp
+is interpreted as the first line of a pattern that starts with "abc/", causing
+pcretest to read the next line as a continuation of the regular expression.
+.
+.
+.SH "PATTERN MODIFIERS"
+.rs
+.sp
+A pattern may be followed by any number of modifiers, which are mostly single
+characters. Following Perl usage, these are referred to below as, for example,
+"the \fB/i\fP modifier", even though the delimiter of the pattern need not
+always be a slash, and no slash is used when writing modifiers. Whitespace may
+appear between the final pattern delimiter and the first modifier, and between
+the modifiers themselves.
+.P
+The \fB/i\fP, \fB/m\fP, \fB/s\fP, and \fB/x\fP modifiers set the PCRE_CASELESS,
+PCRE_MULTILINE, PCRE_DOTALL, or PCRE_EXTENDED options, respectively, when
+\fBpcre_compile()\fP is called. These four modifier letters have the same
+effect as they do in Perl. For example:
+.sp
+ /caseless/i
+.sp
+The following table shows additional modifiers for setting PCRE options that do
+not correspond to anything in Perl:
+.sp
+ \fB/A\fP PCRE_ANCHORED
+ \fB/C\fP PCRE_AUTO_CALLOUT
+ \fB/E\fP PCRE_DOLLAR_ENDONLY
+ \fB/f\fP PCRE_FIRSTLINE
+ \fB/J\fP PCRE_DUPNAMES
+ \fB/N\fP PCRE_NO_AUTO_CAPTURE
+ \fB/U\fP PCRE_UNGREEDY
+ \fB/X\fP PCRE_EXTRA
+ \fB/<JS>\fP PCRE_JAVASCRIPT_COMPAT
+ \fB/<cr>\fP PCRE_NEWLINE_CR
+ \fB/<lf>\fP PCRE_NEWLINE_LF
+ \fB/<crlf>\fP PCRE_NEWLINE_CRLF
+ \fB/<anycrlf>\fP PCRE_NEWLINE_ANYCRLF
+ \fB/<any>\fP PCRE_NEWLINE_ANY
+ \fB/<bsr_anycrlf>\fP PCRE_BSR_ANYCRLF
+ \fB/<bsr_unicode>\fP PCRE_BSR_UNICODE
+.sp
+Those specifying line ending sequences are literal strings as shown, but the
+letters can be in either case. This example sets multiline matching with CRLF
+as the line ending sequence:
+.sp
+ /^abc/m<crlf>
+.sp
+Details of the meanings of these PCRE options are given in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+documentation.
+.
+.
+.SS "Finding all matches in a string"
+.rs
+.sp
+Searching for all possible matches within each subject string can be requested
+by the \fB/g\fP or \fB/G\fP modifier. After finding a match, PCRE is called
+again to search the remainder of the subject string. The difference between
+\fB/g\fP and \fB/G\fP is that the former uses the \fIstartoffset\fP argument to
+\fBpcre_exec()\fP to start searching at a new point within the entire string
+(which is in effect what Perl does), whereas the latter passes over a shortened
+substring. This makes a difference to the matching process if the pattern
+begins with a lookbehind assertion (including \eb or \eB).
+.P
+If any call to \fBpcre_exec()\fP in a \fB/g\fP or \fB/G\fP sequence matches an
+empty string, the next call is done with the PCRE_NOTEMPTY and PCRE_ANCHORED
+flags set in order to search for another, non-empty, match at the same point.
+If this second match fails, the start offset is advanced by one, and the normal
+match is retried. This imitates the way Perl handles such cases when using the
+\fB/g\fP modifier or the \fBsplit()\fP function.
+.
+.
+.SS "Other modifiers"
+.rs
+.sp
+There are yet more modifiers for controlling the way \fBpcretest\fP
+operates.
+.P
+The \fB/+\fP modifier requests that as well as outputting the substring that
+matched the entire pattern, pcretest should in addition output the remainder of
+the subject string. This is useful for tests where the subject contains
+multiple copies of the same substring.
+.P
+The \fB/B\fP modifier is a debugging feature. It requests that \fBpcretest\fP
+output a representation of the compiled byte code after compilation. Normally
+this information contains length and offset values; however, if \fB/Z\fP is
+also present, this data is replaced by spaces. This is a special feature for
+use in the automatic test scripts; it ensures that the same output is generated
+for different internal link sizes.
+.P
+The \fB/L\fP modifier must be followed directly by the name of a locale, for
+example,
+.sp
+ /pattern/Lfr_FR
+.sp
+For this reason, it must be the last modifier. The given locale is set,
+\fBpcre_maketables()\fP is called to build a set of character tables for the
+locale, and this is then passed to \fBpcre_compile()\fP when compiling the
+regular expression. Without an \fB/L\fP modifier, NULL is passed as the tables
+pointer; that is, \fB/L\fP applies only to the expression on which it appears.
+.P
+The \fB/I\fP modifier requests that \fBpcretest\fP output information about the
+compiled pattern (whether it is anchored, has a fixed first character, and
+so on). It does this by calling \fBpcre_fullinfo()\fP after compiling a
+pattern. If the pattern is studied, the results of that are also output.
+.P
+The \fB/D\fP modifier is a PCRE debugging feature, and is equivalent to
+\fB/BI\fP, that is, both the \fB/B\fP and the \fB/I\fP modifiers.
+.P
+The \fB/F\fP modifier causes \fBpcretest\fP to flip the byte order of the
+fields in the compiled pattern that contain 2-byte and 4-byte numbers. This
+facility is for testing the feature in PCRE that allows it to execute patterns
+that were compiled on a host with a different endianness. This feature is not
+available when the POSIX interface to PCRE is being used, that is, when the
+\fB/P\fP pattern modifier is specified. See also the section about saving and
+reloading compiled patterns below.
+.P
+The \fB/S\fP modifier causes \fBpcre_study()\fP to be called after the
+expression has been compiled, and the results used when the expression is
+matched.
+.P
+The \fB/M\fP modifier causes the size of the memory block used to hold the
+compiled pattern to be output.
+.P
+The \fB/P\fP modifier causes \fBpcretest\fP to call PCRE via the POSIX wrapper
+API rather than its native API. When this is done, all other modifiers except
+\fB/i\fP, \fB/m\fP, and \fB/+\fP are ignored. REG_ICASE is set if \fB/i\fP is
+present, and REG_NEWLINE is set if \fB/m\fP is present. The wrapper functions
+force PCRE_DOLLAR_ENDONLY always, and PCRE_DOTALL unless REG_NEWLINE is set.
+.P
+The \fB/8\fP modifier causes \fBpcretest\fP to call PCRE with the PCRE_UTF8
+option set. This turns on support for UTF-8 character handling in PCRE,
+provided that it was compiled with this support enabled. This modifier also
+causes any non-printing characters in output strings to be printed using the
+\ex{hh...} notation if they are valid UTF-8 sequences.
+.P
+If the \fB/?\fP modifier is used with \fB/8\fP, it causes \fBpcretest\fP to
+call \fBpcre_compile()\fP with the PCRE_NO_UTF8_CHECK option, to suppress the
+checking of the string for UTF-8 validity.
+.
+.
+.SH "DATA LINES"
+.rs
+.sp
+Before each data line is passed to \fBpcre_exec()\fP, leading and trailing
+whitespace is removed, and it is then scanned for \e escapes. Some of these are
+pretty esoteric features, intended for checking out some of the more
+complicated features of PCRE. If you are just testing "ordinary" regular
+expressions, you probably don't need any of these. The following escapes are
+recognized:
+.sp
+ \ea alarm (BEL, \ex07)
+ \eb backspace (\ex08)
+ \ee escape (\ex1b)
+ \ef formfeed (\ex0c)
+ \en newline (\ex0a)
+.\" JOIN
+ \eqdd set the PCRE_MATCH_LIMIT limit to dd
+ (any number of digits)
+ \er carriage return (\ex0d)
+ \et tab (\ex09)
+ \ev vertical tab (\ex0b)
+ \ennn octal character (up to 3 octal digits)
+ \exhh hexadecimal character (up to 2 hex digits)
+.\" JOIN
+ \ex{hh...} hexadecimal character, any number of digits
+ in UTF-8 mode
+.\" JOIN
+ \eA pass the PCRE_ANCHORED option to \fBpcre_exec()\fP
+ or \fBpcre_dfa_exec()\fP
+.\" JOIN
+ \eB pass the PCRE_NOTBOL option to \fBpcre_exec()\fP
+ or \fBpcre_dfa_exec()\fP
+.\" JOIN
+ \eCdd call pcre_copy_substring() for substring dd
+ after a successful match (number less than 32)
+.\" JOIN
+ \eCname call pcre_copy_named_substring() for substring
+ "name" after a successful match (name termin-
+ ated by next non-alphanumeric character)
+.\" JOIN
+ \eC+ show the current captured substrings at callout
+ time
+ \eC- do not supply a callout function
+.\" JOIN
+ \eC!n return 1 instead of 0 when callout number n is
+ reached
+.\" JOIN
+ \eC!n!m return 1 instead of 0 when callout number n is
+ reached for the mth time
+.\" JOIN
+ \eC*n pass the number n (may be negative) as callout
+ data; this is used as the callout return value
+ \eD use the \fBpcre_dfa_exec()\fP match function
+ \eF only shortest match for \fBpcre_dfa_exec()\fP
+.\" JOIN
+ \eGdd call pcre_get_substring() for substring dd
+ after a successful match (number less than 32)
+.\" JOIN
+ \eGname call pcre_get_named_substring() for substring
+ "name" after a successful match (name termin-
+ ated by next non-alphanumeric character)
+.\" JOIN
+ \eL call pcre_get_substringlist() after a
+ successful match
+.\" JOIN
+ \eM discover the minimum MATCH_LIMIT and
+ MATCH_LIMIT_RECURSION settings
+.\" JOIN
+ \eN pass the PCRE_NOTEMPTY option to \fBpcre_exec()\fP
+ or \fBpcre_dfa_exec()\fP
+.\" JOIN
+ \eOdd set the size of the output vector passed to
+ \fBpcre_exec()\fP to dd (any number of digits)
+.\" JOIN
+ \eP pass the PCRE_PARTIAL option to \fBpcre_exec()\fP
+ or \fBpcre_dfa_exec()\fP
+.\" JOIN
+ \eQdd set the PCRE_MATCH_LIMIT_RECURSION limit to dd
+ (any number of digits)
+ \eR pass the PCRE_DFA_RESTART option to \fBpcre_dfa_exec()\fP
+ \eS output details of memory get/free calls during matching
+.\" JOIN
+ \eZ pass the PCRE_NOTEOL option to \fBpcre_exec()\fP
+ or \fBpcre_dfa_exec()\fP
+.\" JOIN
+ \e? pass the PCRE_NO_UTF8_CHECK option to
+ \fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP
+ \e>dd start the match at offset dd (any number of digits);
+.\" JOIN
+ this sets the \fIstartoffset\fP argument for \fBpcre_exec()\fP
+ or \fBpcre_dfa_exec()\fP
+.\" JOIN
+ \e<cr> pass the PCRE_NEWLINE_CR option to \fBpcre_exec()\fP
+ or \fBpcre_dfa_exec()\fP
+.\" JOIN
+ \e<lf> pass the PCRE_NEWLINE_LF option to \fBpcre_exec()\fP
+ or \fBpcre_dfa_exec()\fP
+.\" JOIN
+ \e<crlf> pass the PCRE_NEWLINE_CRLF option to \fBpcre_exec()\fP
+ or \fBpcre_dfa_exec()\fP
+.\" JOIN
+ \e<anycrlf> pass the PCRE_NEWLINE_ANYCRLF option to \fBpcre_exec()\fP
+ or \fBpcre_dfa_exec()\fP
+.\" JOIN
+ \e<any> pass the PCRE_NEWLINE_ANY option to \fBpcre_exec()\fP
+ or \fBpcre_dfa_exec()\fP
+.sp
+The escapes that specify line ending sequences are literal strings, exactly as
+shown. No more than one newline setting should be present in any data line.
+.P
+A backslash followed by anything else just escapes the anything else. If
+the very last character is a backslash, it is ignored. This gives a way of
+passing an empty line as data, since a real empty line terminates the data
+input.
+.P
+If \eM is present, \fBpcretest\fP calls \fBpcre_exec()\fP several times, with
+different values in the \fImatch_limit\fP and \fImatch_limit_recursion\fP
+fields of the \fBpcre_extra\fP data structure, until it finds the minimum
+numbers for each parameter that allow \fBpcre_exec()\fP to complete. The
+\fImatch_limit\fP number is a measure of the amount of backtracking that takes
+place, and checking it out can be instructive. For most simple matches, the
+number is quite small, but for patterns with very large numbers of matching
+possibilities, it can become large very quickly with increasing length of
+subject string. The \fImatch_limit_recursion\fP number is a measure of how much
+stack (or, if PCRE is compiled with NO_RECURSE, how much heap) memory is needed
+to complete the match attempt.
+.P
+When \eO is used, the value specified may be higher or lower than the size set
+by the \fB-o\fP command line option (or defaulted to 45); \eO applies only to
+the call of \fBpcre_exec()\fP for the line in which it appears.
+.P
+If the \fB/P\fP modifier was present on the pattern, causing the POSIX wrapper
+API to be used, the only option-setting sequences that have any effect are \eB
+and \eZ, causing REG_NOTBOL and REG_NOTEOL, respectively, to be passed to
+\fBregexec()\fP.
+.P
+The use of \ex{hh...} to represent UTF-8 characters is not dependent on the use
+of the \fB/8\fP modifier on the pattern. It is recognized always. There may be
+any number of hexadecimal digits inside the braces. The result is from one to
+six bytes, encoded according to the original UTF-8 rules of RFC 2279. This
+allows for values in the range 0 to 0x7FFFFFFF. Note that not all of those are
+valid Unicode code points, or indeed valid UTF-8 characters according to the
+later rules in RFC 3629.
+.
+.
+.SH "THE ALTERNATIVE MATCHING FUNCTION"
+.rs
+.sp
+By default, \fBpcretest\fP uses the standard PCRE matching function,
+\fBpcre_exec()\fP to match each data line. From release 6.0, PCRE supports an
+alternative matching function, \fBpcre_dfa_exec()\fP, which operates in a
+different way, and has some restrictions. The differences between the two
+functions are described in the
+.\" HREF
+\fBpcrematching\fP
+.\"
+documentation.
+.P
+If a data line contains the \eD escape sequence, or if the command line
+contains the \fB-dfa\fP option, the alternative matching function is called.
+This function finds all possible matches at a given point. If, however, the \eF
+escape sequence is present in the data line, it stops after the first match is
+found. This is always the shortest possible match.
+.
+.
+.SH "DEFAULT OUTPUT FROM PCRETEST"
+.rs
+.sp
+This section describes the output when the normal matching function,
+\fBpcre_exec()\fP, is being used.
+.P
+When a match succeeds, pcretest outputs the list of captured substrings that
+\fBpcre_exec()\fP returns, starting with number 0 for the string that matched
+the whole pattern. Otherwise, it outputs "No match" or "Partial match"
+when \fBpcre_exec()\fP returns PCRE_ERROR_NOMATCH or PCRE_ERROR_PARTIAL,
+respectively, and otherwise the PCRE negative error number. Here is an example
+of an interactive \fBpcretest\fP run.
+.sp
+ $ pcretest
+ PCRE version 7.0 30-Nov-2006
+.sp
+ re> /^abc(\ed+)/
+ data> abc123
+ 0: abc123
+ 1: 123
+ data> xyz
+ No match
+.sp
+Note that unset capturing substrings that are not followed by one that is set
+are not returned by \fBpcre_exec()\fP, and are not shown by \fBpcretest\fP. In
+the following example, there are two capturing substrings, but when the first
+data line is matched, the second, unset substring is not shown. An "internal"
+unset substring is shown as "", as for the second data line.
+.sp
+ re> /(a)|(b)/
+ data> a
+ 0: a
+ 1: a
+ data> b
+ 0: b
+ 1:
+ 2: b
+.sp
+If the strings contain any non-printing characters, they are output as \exhh
+escapes, or as \ex{...} escapes if the \fB/8\fP modifier was present on the
+pattern. See below for the definition of non-printing characters. If the
+pattern has the \fB/+\fP modifier, the output for substring 0 is followed by
+the rest of the subject string, identified by "0+" like this:
+.sp
+ re> /cat/+
+ data> cataract
+ 0: cat
+ 0+ aract
+.sp
+If the pattern has the \fB/g\fP or \fB/G\fP modifier, the results of successive
+matching attempts are output in sequence, like this:
+.sp
+ re> /\eBi(\ew\ew)/g
+ data> Mississippi
+ 0: iss
+ 1: ss
+ 0: iss
+ 1: ss
+ 0: ipp
+ 1: pp
+.sp
+"No match" is output only if the first match attempt fails.
+.P
+If any of the sequences \fB\eC\fP, \fB\eG\fP, or \fB\eL\fP are present in a
+data line that is successfully matched, the substrings extracted by the
+convenience functions are output with C, G, or L after the string number
+instead of a colon. This is in addition to the normal full list. The string
+length (that is, the return from the extraction function) is given in
+parentheses after each string for \fB\eC\fP and \fB\eG\fP.
+.P
+Note that whereas patterns can be continued over several lines (a plain ">"
+prompt is used for continuations), data lines may not. However, newlines can be
+included in data by means of the \en escape (or \er, \er\en, etc., depending on
+the newline sequence setting).
+.
+.
+.
+.SH "OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION"
+.rs
+.sp
+When the alternative matching function, \fBpcre_dfa_exec()\fP, is used (by
+means of the \eD escape sequence or the \fB-dfa\fP command line option), the
+output consists of a list of all the matches that start at the first point in
+the subject where there is at least one match. For example:
+.sp
+ re> /(tang|tangerine|tan)/
+ data> yellow tangerine\eD
+ 0: tangerine
+ 1: tang
+ 2: tan
+.sp
+(Using the normal matching function on this data finds only "tang".) The
+longest matching string is always given first (and numbered zero).
+.P
+If \fB/g\fP is present on the pattern, the search for further matches resumes
+at the end of the longest match. For example:
+.sp
+ re> /(tang|tangerine|tan)/g
+ data> yellow tangerine and tangy sultana\eD
+ 0: tangerine
+ 1: tang
+ 2: tan
+ 0: tang
+ 1: tan
+ 0: tan
+.sp
+Since the matching function does not support substring capture, the escape
+sequences that are concerned with captured substrings are not relevant.
+.
+.
+.SH "RESTARTING AFTER A PARTIAL MATCH"
+.rs
+.sp
+When the alternative matching function has given the PCRE_ERROR_PARTIAL return,
+indicating that the subject partially matched the pattern, you can restart the
+match with additional subject data by means of the \eR escape sequence. For
+example:
+.sp
+ re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
+ data> 23ja\eP\eD
+ Partial match: 23ja
+ data> n05\eR\eD
+ 0: n05
+.sp
+For further information about partial matching, see the
+.\" HREF
+\fBpcrepartial\fP
+.\"
+documentation.
+.
+.
+.SH CALLOUTS
+.rs
+.sp
+If the pattern contains any callout requests, \fBpcretest\fP's callout function
+is called during matching. This works with both matching functions. By default,
+the called function displays the callout number, the start and current
+positions in the text at the callout time, and the next pattern item to be
+tested. For example, the output
+.sp
+ --->pqrabcdef
+ 0 ^ ^ \ed
+.sp
+indicates that callout number 0 occurred for a match attempt starting at the
+fourth character of the subject string, when the pointer was at the seventh
+character of the data, and when the next pattern item was \ed. Just one
+circumflex is output if the start and current positions are the same.
+.P
+Callouts numbered 255 are assumed to be automatic callouts, inserted as a
+result of the \fB/C\fP pattern modifier. In this case, instead of showing the
+callout number, the offset in the pattern, preceded by a plus, is output. For
+example:
+.sp
+ re> /\ed?[A-E]\e*/C
+ data> E*
+ --->E*
+ +0 ^ \ed?
+ +3 ^ [A-E]
+ +8 ^^ \e*
+ +10 ^ ^
+ 0: E*
+.sp
+The callout function in \fBpcretest\fP returns zero (carry on matching) by
+default, but you can use a \eC item in a data line (as described above) to
+change this.
+.P
+Inserting callouts can be helpful when using \fBpcretest\fP to check
+complicated regular expressions. For further information about callouts, see
+the
+.\" HREF
+\fBpcrecallout\fP
+.\"
+documentation.
+.
+.
+.
+.SH "NON-PRINTING CHARACTERS"
+.rs
+.sp
+When \fBpcretest\fP is outputting text in the compiled version of a pattern,
+bytes other than 32-126 are always treated as non-printing characters and are
+therefore shown as hex escapes.
+.P
+When \fBpcretest\fP is outputting text that is a matched part of a subject
+string, it behaves in the same way, unless a different locale has been set for
+the pattern (using the \fB/L\fP modifier). In this case, the \fBisprint()\fP
+function is used to distinguish printing and non-printing characters.
+.
+.
+.
+.SH "SAVING AND RELOADING COMPILED PATTERNS"
+.rs
+.sp
+The facilities described in this section are not available when the POSIX
+interface to PCRE is being used, that is, when the \fB/P\fP pattern modifier is
+specified.
+.P
+When the POSIX interface is not in use, you can cause \fBpcretest\fP to write a
+compiled pattern to a file, by following the modifiers with > and a file name.
+For example:
+.sp
+ /pattern/im >/some/file
+.sp
+See the
+.\" HREF
+\fBpcreprecompile\fP
+.\"
+documentation for a discussion about saving and re-using compiled patterns.
+.P
+The data that is written is binary. The first eight bytes are the length of the
+compiled pattern data followed by the length of the optional study data, each
+written as four bytes in big-endian order (most significant byte first). If
+there is no study data (either the pattern was not studied, or studying did not
+return any data), the second length is zero. The lengths are followed by an
+exact copy of the compiled pattern. If there is additional study data, this
+follows immediately after the compiled pattern. After writing the file,
+\fBpcretest\fP expects to read a new pattern.
+.P
+A saved pattern can be reloaded into \fBpcretest\fP by specifying < and a file
+name instead of a pattern. The name of the file must not contain a < character,
+as otherwise \fBpcretest\fP will interpret the line as a pattern delimited by <
+characters.
+For example:
+.sp
+ re> " to prompt for regular expressions, and "data>" to prompt for data
+ lines.
+
+ When pcretest is built, a configuration option can specify that it
+ should be linked with the libreadline library. When this is done, if
+ the input is from a terminal, it is read using the readline() function.
+ This provides line-editing and history facilities. The output from the
+ -help option states whether or not readline() will be used.
+
+ The program handles any number of sets of input on a single input file.
+ Each set starts with a regular expression, and continues with any num-
+ ber of data lines to be matched against the pattern.
+
+ Each data line is matched separately and independently. If you want to
+ do multi-line matches, you have to use the \n escape sequence (or \r or
+ \r\n, etc., depending on the newline setting) in a single line of input
+ to encode the newline sequences. There is no limit on the length of
+ data lines; the input buffer is automatically extended if it is too
+ small.
+
+ An empty line signals the end of the data lines, at which point a new
+ regular expression is read. The regular expressions are given enclosed
+ in any non-alphanumeric delimiters other than backslash, for example:
+
+ /(a|bc)x+yz/
+
+ White space before the initial delimiter is ignored. A regular expres-
+ sion may be continued over several input lines, in which case the new-
+ line characters are included within it. It is possible to include the
+ delimiter within the pattern by escaping it, for example
+
+ /abc\/def/
+
+ If you do so, the escape and the delimiter form part of the pattern,
+ but since delimiters are always non-alphanumeric, this does not affect
+ its interpretation. If the terminating delimiter is immediately fol-
+ lowed by a backslash, for example,
+
+ /abc/\
+
+ then a backslash is added to the end of the pattern. This is done to
+ provide a way of testing the error condition that arises if a pattern
+ finishes with a backslash, because
+
+ /abc\/
+
+ is interpreted as the first line of a pattern that starts with "abc/",
+ causing pcretest to read the next line as a continuation of the regular
+ expression.
+
+
+PATTERN MODIFIERS
+
+ A pattern may be followed by any number of modifiers, which are mostly
+ single characters. Following Perl usage, these are referred to below
+ as, for example, "the /i modifier", even though the delimiter of the
+ pattern need not always be a slash, and no slash is used when writing
+ modifiers. Whitespace may appear between the final pattern delimiter
+ and the first modifier, and between the modifiers themselves.
+
+ The /i, /m, /s, and /x modifiers set the PCRE_CASELESS, PCRE_MULTILINE,
+ PCRE_DOTALL, or PCRE_EXTENDED options, respectively, when pcre_com-
+ pile() is called. These four modifier letters have the same effect as
+ they do in Perl. For example:
+
+ /caseless/i
+
+ The following table shows additional modifiers for setting PCRE options
+ that do not correspond to anything in Perl:
+
+ /A PCRE_ANCHORED
+ /C PCRE_AUTO_CALLOUT
+ /E PCRE_DOLLAR_ENDONLY
+ /f PCRE_FIRSTLINE
+ /J PCRE_DUPNAMES
+ /N PCRE_NO_AUTO_CAPTURE
+ /U PCRE_UNGREEDY
+ /X PCRE_EXTRA
+ /<JS> PCRE_JAVASCRIPT_COMPAT
+ /<cr> PCRE_NEWLINE_CR
+ /<lf> PCRE_NEWLINE_LF
+ /<crlf> PCRE_NEWLINE_CRLF
+ /<anycrlf> PCRE_NEWLINE_ANYCRLF
+ /<any> PCRE_NEWLINE_ANY
+ /<bsr_anycrlf> PCRE_BSR_ANYCRLF
+ /<bsr_unicode> PCRE_BSR_UNICODE
+
+ Those specifying line ending sequences are literal strings as shown,
+ but the letters can be in either case. This example sets multiline
+ matching with CRLF as the line ending sequence:
+
+ /^abc/m<crlf>
+
+ Details of the meanings of these PCRE options are given in the pcreapi
+ documentation.
+
+ Finding all matches in a string
+
+ Searching for all possible matches within each subject string can be
+ requested by the /g or /G modifier. After finding a match, PCRE is
+ called again to search the remainder of the subject string. The differ-
+ ence between /g and /G is that the former uses the startoffset argument
+ to pcre_exec() to start searching at a new point within the entire
+ string (which is in effect what Perl does), whereas the latter passes
+ over a shortened substring. This makes a difference to the matching
+ process if the pattern begins with a lookbehind assertion (including \b
+ or \B).
+
+ If any call to pcre_exec() in a /g or /G sequence matches an empty
+ string, the next call is done with the PCRE_NOTEMPTY and PCRE_ANCHORED
+ flags set in order to search for another, non-empty, match at the same
+ point. If this second match fails, the start offset is advanced by
+ one, and the normal match is retried. This imitates the way Perl han-
+ dles such cases when using the /g modifier or the split() function.
+
+ Other modifiers
+
+ There are yet more modifiers for controlling the way pcretest operates.
+
+ The /+ modifier requests that as well as outputting the substring that
+ matched the entire pattern, pcretest should in addition output the
+ remainder of the subject string. This is useful for tests where the
+ subject contains multiple copies of the same substring.
+
+ The /B modifier is a debugging feature. It requests that pcretest out-
+ put a representation of the compiled byte code after compilation. Nor-
+ mally this information contains length and offset values; however, if
+ /Z is also present, this data is replaced by spaces. This is a special
+ feature for use in the automatic test scripts; it ensures that the same
+ output is generated for different internal link sizes.
+
+ The /L modifier must be followed directly by the name of a locale, for
+ example,
+
+ /pattern/Lfr_FR
+
+ For this reason, it must be the last modifier. The given locale is set,
+ pcre_maketables() is called to build a set of character tables for the
+ locale, and this is then passed to pcre_compile() when compiling the
+ regular expression. Without an /L modifier, NULL is passed as the
+ tables pointer; that is, /L applies only to the expression on which it
+ appears.
+
+ The /I modifier requests that pcretest output information about the
+ compiled pattern (whether it is anchored, has a fixed first character,
+ and so on). It does this by calling pcre_fullinfo() after compiling a
+ pattern. If the pattern is studied, the results of that are also out-
+ put.
+
+ The /D modifier is a PCRE debugging feature, and is equivalent to /BI,
+ that is, both the /B and the /I modifiers.
+
+ The /F modifier causes pcretest to flip the byte order of the fields in
+ the compiled pattern that contain 2-byte and 4-byte numbers. This
+ facility is for testing the feature in PCRE that allows it to execute
+ patterns that were compiled on a host with a different endianness. This
+ feature is not available when the POSIX interface to PCRE is being
+ used, that is, when the /P pattern modifier is specified. See also the
+ section about saving and reloading compiled patterns below.
+
+ The /S modifier causes pcre_study() to be called after the expression
+ has been compiled, and the results used when the expression is matched.
+
+ The /M modifier causes the size of memory block used to hold the com-
+ piled pattern to be output.
+
+ The /P modifier causes pcretest to call PCRE via the POSIX wrapper API
+ rather than its native API. When this is done, all other modifiers
+ except /i, /m, and /+ are ignored. REG_ICASE is set if /i is present,
+ and REG_NEWLINE is set if /m is present. The wrapper functions force
+ PCRE_DOLLAR_ENDONLY always, and PCRE_DOTALL unless REG_NEWLINE is set.
+
+ The /8 modifier causes pcretest to call PCRE with the PCRE_UTF8 option
+ set. This turns on support for UTF-8 character handling in PCRE, pro-
+ vided that it was compiled with this support enabled. This modifier
+ also causes any non-printing characters in output strings to be printed
+ using the \x{hh...} notation if they are valid UTF-8 sequences.
+
+ If the /? modifier is used with /8, it causes pcretest to call
+ pcre_compile() with the PCRE_NO_UTF8_CHECK option, to suppress the
+ checking of the string for UTF-8 validity.
+
+
+DATA LINES
+
+ Before each data line is passed to pcre_exec(), leading and trailing
+ whitespace is removed, and it is then scanned for \ escapes. Some of
+ these are pretty esoteric features, intended for checking out some of
+ the more complicated features of PCRE. If you are just testing "ordi-
+ nary" regular expressions, you probably don't need any of these. The
+ following escapes are recognized:
+
+ \a alarm (BEL, \x07)
+ \b backspace (\x08)
+ \e escape (\x1b)
+ \f formfeed (\x0c)
+ \n newline (\x0a)
+ \qdd set the PCRE_MATCH_LIMIT limit to dd
+ (any number of digits)
+ \r carriage return (\x0d)
+ \t tab (\x09)
+ \v vertical tab (\x0b)
+ \nnn octal character (up to 3 octal digits)
+ \xhh hexadecimal character (up to 2 hex digits)
+ \x{hh...} hexadecimal character, any number of digits
+ in UTF-8 mode
+ \A pass the PCRE_ANCHORED option to pcre_exec()
+ or pcre_dfa_exec()
+ \B pass the PCRE_NOTBOL option to pcre_exec()
+ or pcre_dfa_exec()
+ \Cdd call pcre_copy_substring() for substring dd
+ after a successful match (number less than 32)
+ \Cname call pcre_copy_named_substring() for substring
+ "name" after a successful match (name termin-
+ ated by next non-alphanumeric character)
+ \C+ show the current captured substrings at callout
+ time
+ \C- do not supply a callout function
+ \C!n return 1 instead of 0 when callout number n is
+ reached
+ \C!n!m return 1 instead of 0 when callout number n is
+ reached for the mth time
+ \C*n pass the number n (may be negative) as callout
+ data; this is used as the callout return value
+ \D use the pcre_dfa_exec() match function
+ \F only shortest match for pcre_dfa_exec()
+ \Gdd call pcre_get_substring() for substring dd
+ after a successful match (number less than 32)
+ \Gname call pcre_get_named_substring() for substring
+ "name" after a successful match (name termin-
+ ated by next non-alphanumeric character)
+ \L call pcre_get_substringlist() after a
+ successful match
+ \M discover the minimum MATCH_LIMIT and
+ MATCH_LIMIT_RECURSION settings
+ \N pass the PCRE_NOTEMPTY option to pcre_exec()
+ or pcre_dfa_exec()
+ \Odd set the size of the output vector passed to
+ pcre_exec() to dd (any number of digits)
+ \P pass the PCRE_PARTIAL option to pcre_exec()
+ or pcre_dfa_exec()
+ \Qdd set the PCRE_MATCH_LIMIT_RECURSION limit to dd
+ (any number of digits)
+ \R pass the PCRE_DFA_RESTART option to pcre_dfa_exec()
+ \S output details of memory get/free calls during matching
+ \Z pass the PCRE_NOTEOL option to pcre_exec()
+ or pcre_dfa_exec()
+ \? pass the PCRE_NO_UTF8_CHECK option to
+ pcre_exec() or pcre_dfa_exec()
+ \>dd start the match at offset dd (any number of digits);
+ this sets the startoffset argument for pcre_exec()
+ or pcre_dfa_exec()
+ \<cr> pass the PCRE_NEWLINE_CR option to pcre_exec()
+ or pcre_dfa_exec()
+ \<lf> pass the PCRE_NEWLINE_LF option to pcre_exec()
+ or pcre_dfa_exec()
+ \<crlf> pass the PCRE_NEWLINE_CRLF option to pcre_exec()
+ or pcre_dfa_exec()
+ \<anycrlf> pass the PCRE_NEWLINE_ANYCRLF option to pcre_exec()
+ or pcre_dfa_exec()
+ \<any> pass the PCRE_NEWLINE_ANY option to pcre_exec()
+ or pcre_dfa_exec()
+
+ The escapes that specify line ending sequences are literal strings,
+ exactly as shown. No more than one newline setting should be present in
+ any data line.
+
+ A backslash followed by anything else just escapes the anything else.
+ If the very last character is a backslash, it is ignored. This gives a
+ way of passing an empty line as data, since a real empty line termi-
+ nates the data input.
+
+ If \M is present, pcretest calls pcre_exec() several times, with dif-
+ ferent values in the match_limit and match_limit_recursion fields of
+ the pcre_extra data structure, until it finds the minimum numbers for
+ each parameter that allow pcre_exec() to complete. The match_limit num-
+ ber is a measure of the amount of backtracking that takes place, and
+ checking it out can be instructive. For most simple matches, the number
+ is quite small, but for patterns with very large numbers of matching
+ possibilities, it can become large very quickly with increasing length
+ of subject string. The match_limit_recursion number is a measure of how
+ much stack (or, if PCRE is compiled with NO_RECURSE, how much heap)
+ memory is needed to complete the match attempt.
+
+ When \O is used, the value specified may be higher or lower than the
+ size set by the -O command line option (or defaulted to 45); \O applies
+ only to the call of pcre_exec() for the line in which it appears.
+
+ If the /P modifier was present on the pattern, causing the POSIX wrap-
+ per API to be used, the only option-setting sequences that have any
+ effect are \B and \Z, causing REG_NOTBOL and REG_NOTEOL, respectively,
+ to be passed to regexec().
+
+ The use of \x{hh...} to represent UTF-8 characters is not dependent on
+ the use of the /8 modifier on the pattern. It is recognized always.
+ There may be any number of hexadecimal digits inside the braces. The
+ result is from one to six bytes, encoded according to the original
+ UTF-8 rules of RFC 2279. This allows for values in the range 0 to
+ 0x7FFFFFFF. Note that not all of those are valid Unicode code points,
+ or indeed valid UTF-8 characters according to the later rules in RFC
+ 3629.
+
+
+THE ALTERNATIVE MATCHING FUNCTION
+
+ By default, pcretest uses the standard PCRE matching function,
+ pcre_exec() to match each data line. From release 6.0, PCRE supports an
+ alternative matching function, pcre_dfa_exec(), which operates in a
+ different way, and has some restrictions. The differences between the
+ two functions are described in the pcrematching documentation.
+
+ If a data line contains the \D escape sequence, or if the command line
+ contains the -dfa option, the alternative matching function is called.
+ This function finds all possible matches at a given point. If, however,
+ the \F escape sequence is present in the data line, it stops after the
+ first match is found. This is always the shortest possible match.
+
+
+DEFAULT OUTPUT FROM PCRETEST
+
+ This section describes the output when the normal matching function,
+ pcre_exec(), is being used.
+
+ When a match succeeds, pcretest outputs the list of captured substrings
+ that pcre_exec() returns, starting with number 0 for the string that
+ matched the whole pattern. Otherwise, it outputs "No match" or "Partial
+ match" when pcre_exec() returns PCRE_ERROR_NOMATCH or PCRE_ERROR_PAR-
+ TIAL, respectively, and otherwise the PCRE negative error number. Here
+ is an example of an interactive pcretest run.
+
+ $ pcretest
+ PCRE version 7.0 30-Nov-2006
+
+ re> /^abc(\d+)/
+ data> abc123
+ 0: abc123
+ 1: 123
+ data> xyz
+ No match
+
+ Note that unset capturing substrings that are not followed by one that
+ is set are not returned by pcre_exec(), and are not shown by pcretest.
+ In the following example, there are two capturing substrings, but when
+ the first data line is matched, the second, unset substring is not
+ shown. An "internal" unset substring is shown as "<unset>", as for the
+ second data line.
+
+ re> /(a)|(b)/
+ data> a
+ 0: a
+ 1: a
+ data> b
+ 0: b
+ 1: <unset>
+ 2: b
+
+ If the strings contain any non-printing characters, they are output as
+ \xhh escapes, or as \x{...} escapes if the /8 modifier was present on
+ the pattern. See below for the definition of non-printing characters.
+ If the pattern has the /+ modifier, the output for substring 0 is fol-
+ lowed by the rest of the subject string, identified by "0+" like
+ this:
+
+ re> /cat/+
+ data> cataract
+ 0: cat
+ 0+ aract
+
+ If the pattern has the /g or /G modifier, the results of successive
+ matching attempts are output in sequence, like this:
+
+ re> /\Bi(\w\w)/g
+ data> Mississippi
+ 0: iss
+ 1: ss
+ 0: iss
+ 1: ss
+ 0: ipp
+ 1: pp
+
+ "No match" is output only if the first match attempt fails.
+
+ If any of the sequences \C, \G, or \L are present in a data line that
+ is successfully matched, the substrings extracted by the convenience
+ functions are output with C, G, or L after the string number instead of
+ a colon. This is in addition to the normal full list. The string length
+ (that is, the return from the extraction function) is given in paren-
+ theses after each string for \C and \G.
+
+ Note that whereas patterns can be continued over several lines (a plain
+ ">" prompt is used for continuations), data lines may not. However new-
+ lines can be included in data by means of the \n escape (or \r, \r\n,
+ etc., depending on the newline sequence setting).
+
+
+OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
+
+ When the alternative matching function, pcre_dfa_exec(), is used (by
+ means of the \D escape sequence or the -dfa command line option), the
+ output consists of a list of all the matches that start at the first
+ point in the subject where there is at least one match. For example:
+
+ re> /(tang|tangerine|tan)/
+ data> yellow tangerine\D
+ 0: tangerine
+ 1: tang
+ 2: tan
+
+ (Using the normal matching function on this data finds only "tang".)
+ The longest matching string is always given first (and numbered zero).
+
+ If /g is present on the pattern, the search for further matches resumes
+ at the end of the longest match. For example:
+
+ re> /(tang|tangerine|tan)/g
+ data> yellow tangerine and tangy sultana\D
+ 0: tangerine
+ 1: tang
+ 2: tan
+ 0: tang
+ 1: tan
+ 0: tan
+
+ Since the matching function does not support substring capture, the
+ escape sequences that are concerned with captured substrings are not
+ relevant.
+
+
+RESTARTING AFTER A PARTIAL MATCH
+
+ When the alternative matching function has given the PCRE_ERROR_PARTIAL
+ return, indicating that the subject partially matched the pattern, you
+ can restart the match with additional subject data by means of the \R
+ escape sequence. For example:
+
+ re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
+ data> 23ja\P\D
+ Partial match: 23ja
+ data> n05\R\D
+ 0: n05
+
+ For further information about partial matching, see the pcrepartial
+ documentation.
+
+
+CALLOUTS
+
+ If the pattern contains any callout requests, pcretest's callout func-
+ tion is called during matching. This works with both matching func-
+ tions. By default, the called function displays the callout number, the
+ start and current positions in the text at the callout time, and the
+ next pattern item to be tested. For example, the output
+
+ --->pqrabcdef
+ 0 ^ ^ \d
+
+ indicates that callout number 0 occurred for a match attempt starting
+ at the fourth character of the subject string, when the pointer was at
+ the seventh character of the data, and when the next pattern item was
+ \d. Just one circumflex is output if the start and current positions
+ are the same.
+
+ Callouts numbered 255 are assumed to be automatic callouts, inserted as
+ a result of the /C pattern modifier. In this case, instead of showing
+ the callout number, the offset in the pattern, preceded by a plus, is
+ output. For example:
+
+ re> /\d?[A-E]\*/C
+ data> E*
+ --->E*
+ +0 ^ \d?
+ +3 ^ [A-E]
+ +8 ^^ \*
+ +10 ^ ^
+ 0: E*
+
+ The callout function in pcretest returns zero (carry on matching) by
+ default, but you can use a \C item in a data line (as described above)
+ to change this.
+
+ Inserting callouts can be helpful when using pcretest to check compli-
+ cated regular expressions. For further information about callouts, see
+ the pcrecallout documentation.
+
+
+NON-PRINTING CHARACTERS
+
+ When pcretest is outputting text in the compiled version of a pattern,
+ bytes other than 32-126 are always treated as non-printing characters
+ and are therefore shown as hex escapes.
+
+ When pcretest is outputting text that is a matched part of a subject
+ string, it behaves in the same way, unless a different locale has been
+ set for the pattern (using the /L modifier). In this case, the
+ isprint() function is used to distinguish printing and non-printing characters.
+
+
+SAVING AND RELOADING COMPILED PATTERNS
+
+ The facilities described in this section are not available when the
+ POSIX interface to PCRE is being used, that is, when the /P pattern mod-
+ ifier is specified.
+
+ When the POSIX interface is not in use, you can cause pcretest to write
+ a compiled pattern to a file, by following the modifiers with > and a
+ file name. For example:
+
+ /pattern/im >/some/file
+
+ See the pcreprecompile documentation for a discussion about saving and
+ re-using compiled patterns.
+
+ The data that is written is binary. The first eight bytes are the
+ length of the compiled pattern data followed by the length of the
+ optional study data, each written as four bytes in big-endian order
+ (most significant byte first). If there is no study data (either the
+ pattern was not studied, or studying did not return any data), the sec-
+ ond length is zero. The lengths are followed by an exact copy of the
+ compiled pattern. If there is additional study data, this follows imme-
+ diately after the compiled pattern. After writing the file, pcretest
+ expects to read a new pattern.
+
+ A saved pattern can be reloaded into pcretest by specifying < and a file
+ name instead of a pattern. The name of the file must not contain a <
+ character, as otherwise pcretest will interpret the line as a pattern
+ delimited by < characters. For example:
+
+ re> </some/file
+
+/* Allow for C++ users: under a C++ compiler the declarations that follow are wrapped in extern "C". */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Options. These are bit masks; the PCRE_NEWLINE_xxx values are the exception noted below. */
+
+#define PCRE_CASELESS 0x00000001
+#define PCRE_MULTILINE 0x00000002
+#define PCRE_DOTALL 0x00000004
+#define PCRE_EXTENDED 0x00000008
+#define PCRE_ANCHORED 0x00000010
+#define PCRE_DOLLAR_ENDONLY 0x00000020
+#define PCRE_EXTRA 0x00000040
+#define PCRE_NOTBOL 0x00000080
+#define PCRE_NOTEOL 0x00000100
+#define PCRE_UNGREEDY 0x00000200
+#define PCRE_NOTEMPTY 0x00000400
+#define PCRE_UTF8 0x00000800
+#define PCRE_NO_AUTO_CAPTURE 0x00001000
+#define PCRE_NO_UTF8_CHECK 0x00002000
+#define PCRE_AUTO_CALLOUT 0x00004000
+#define PCRE_PARTIAL 0x00008000
+#define PCRE_DFA_SHORTEST 0x00010000
+#define PCRE_DFA_RESTART 0x00020000
+#define PCRE_FIRSTLINE 0x00040000
+#define PCRE_DUPNAMES 0x00080000
+#define PCRE_NEWLINE_CR 0x00100000 /* The NEWLINE values share bits 20-22: */
+#define PCRE_NEWLINE_LF 0x00200000 /* they form a small code, not separate flags */
+#define PCRE_NEWLINE_CRLF 0x00300000 /* numerically equal to CR | LF */
+#define PCRE_NEWLINE_ANY 0x00400000
+#define PCRE_NEWLINE_ANYCRLF 0x00500000 /* numerically equal to ANY | CR */
+#define PCRE_BSR_ANYCRLF 0x00800000
+#define PCRE_BSR_UNICODE 0x01000000
+#define PCRE_JAVASCRIPT_COMPAT 0x02000000
+
+/* Exec-time and get/set-time error codes. All are negative; two names share the value -5. */
+
+#define PCRE_ERROR_NOMATCH (-1)
+#define PCRE_ERROR_NULL (-2)
+#define PCRE_ERROR_BADOPTION (-3)
+#define PCRE_ERROR_BADMAGIC (-4)
+#define PCRE_ERROR_UNKNOWN_OPCODE (-5)
+#define PCRE_ERROR_UNKNOWN_NODE (-5) /* For backward compatibility */
+#define PCRE_ERROR_NOMEMORY (-6)
+#define PCRE_ERROR_NOSUBSTRING (-7)
+#define PCRE_ERROR_MATCHLIMIT (-8)
+#define PCRE_ERROR_CALLOUT (-9) /* Never used by PCRE itself */
+#define PCRE_ERROR_BADUTF8 (-10)
+#define PCRE_ERROR_BADUTF8_OFFSET (-11)
+#define PCRE_ERROR_PARTIAL (-12)
+#define PCRE_ERROR_BADPARTIAL (-13)
+#define PCRE_ERROR_INTERNAL (-14)
+#define PCRE_ERROR_BADCOUNT (-15)
+#define PCRE_ERROR_DFA_UITEM (-16)
+#define PCRE_ERROR_DFA_UCOND (-17)
+#define PCRE_ERROR_DFA_UMLIMIT (-18)
+#define PCRE_ERROR_DFA_WSSIZE (-19)
+#define PCRE_ERROR_DFA_RECURSE (-20)
+#define PCRE_ERROR_RECURSIONLIMIT (-21)
+#define PCRE_ERROR_NULLWSLIMIT (-22) /* No longer actually used */
+#define PCRE_ERROR_BADNEWLINE (-23)
+
+/* Request types for pcre_fullinfo(). FIRSTBYTE and FIRSTCHAR are the same request (4). */
+
+#define PCRE_INFO_OPTIONS 0
+#define PCRE_INFO_SIZE 1
+#define PCRE_INFO_CAPTURECOUNT 2
+#define PCRE_INFO_BACKREFMAX 3
+#define PCRE_INFO_FIRSTBYTE 4
+#define PCRE_INFO_FIRSTCHAR 4 /* For backwards compatibility */
+#define PCRE_INFO_FIRSTTABLE 5
+#define PCRE_INFO_LASTLITERAL 6
+#define PCRE_INFO_NAMEENTRYSIZE 7
+#define PCRE_INFO_NAMECOUNT 8
+#define PCRE_INFO_NAMETABLE 9
+#define PCRE_INFO_STUDYSIZE 10
+#define PCRE_INFO_DEFAULT_TABLES 11
+#define PCRE_INFO_OKPARTIAL 12
+#define PCRE_INFO_JCHANGED 13
+#define PCRE_INFO_HASCRORLF 14
+
+/* Request types for pcre_config(). Do not re-arrange, in order to remain
+compatible. The values run consecutively from 0 to 8. */
+
+#define PCRE_CONFIG_UTF8 0
+#define PCRE_CONFIG_NEWLINE 1
+#define PCRE_CONFIG_LINK_SIZE 2
+#define PCRE_CONFIG_POSIX_MALLOC_THRESHOLD 3
+#define PCRE_CONFIG_MATCH_LIMIT 4
+#define PCRE_CONFIG_STACKRECURSE 5
+#define PCRE_CONFIG_UNICODE_PROPERTIES 6
+#define PCRE_CONFIG_MATCH_LIMIT_RECURSION 7
+#define PCRE_CONFIG_BSR 8
+
+/* Bit flags for the flags field of the pcre_extra structure. Do not re-arrange or
+redefine these bits, just add new ones on the end, in order to remain compatible. */
+
+#define PCRE_EXTRA_STUDY_DATA 0x0001 /* study_data field is set */
+#define PCRE_EXTRA_MATCH_LIMIT 0x0002 /* match_limit field is set */
+#define PCRE_EXTRA_CALLOUT_DATA 0x0004 /* callout_data field is set */
+#define PCRE_EXTRA_TABLES 0x0008 /* tables field is set */
+#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0x0010 /* match_limit_recursion field is set */
+
+/* Types. Only an incomplete struct type is visible to users of this header. */
+
+struct real_pcre; /* declaration; the definition is private */
+typedef struct real_pcre pcre; /* public name; pcre_compile() returns a pointer to it */
+
+/* When PCRE is compiled as a C++ library, the subject pointer type can be
+replaced with a custom type by defining PCRE_SPTR before this point. For
+conventional use, the public interface is a const char *. */
+
+#ifndef PCRE_SPTR
+#define PCRE_SPTR const char *
+#endif
+
+/* The structure for passing additional data to pcre_exec(). This is defined in
+such a way as to be extensible. Always add new fields at the end, in order to
+remain compatible. */
+
+typedef struct pcre_extra {
+ unsigned long int flags; /* Bits for which fields are set (PCRE_EXTRA_xxx) */
+ void *study_data; /* Opaque data from pcre_study() */
+ unsigned long int match_limit; /* Maximum number of calls to match() */
+ void *callout_data; /* Data passed back in callouts */
+ const unsigned char *tables; /* Pointer to character tables */
+ unsigned long int match_limit_recursion; /* Max recursive calls to match() */
+} pcre_extra;
+
+/* The structure for passing out data via the pcre_callout_function. We use a
+structure so that new fields can be added on the end in future versions,
+without changing the API of the function, thereby allowing old clients to work
+without modification. The version field identifies which layout is in use. */
+
+typedef struct pcre_callout_block {
+ int version; /* Identifies version of block */
+ /* ------------------------ Version 0 ------------------------------- */
+ int callout_number; /* Number compiled into pattern */
+ int *offset_vector; /* The offset vector */
+ PCRE_SPTR subject; /* The subject being matched */
+ int subject_length; /* The length of the subject */
+ int start_match; /* Offset to start of this match attempt */
+ int current_position; /* Where we currently are in the subject */
+ int capture_top; /* Max current capture */
+ int capture_last; /* Most recently closed capture */
+ void *callout_data; /* Data passed in with the call */
+ /* ------------------- Added for Version 1 -------------------------- */
+ int pattern_position; /* Offset to next item in the pattern */
+ int next_item_length; /* Length of next item in the pattern */
+ /* ------------------------------------------------------------------ */
+} pcre_callout_block;
+
+/* Indirection for store get and free functions. These can be set to
+alternative malloc/free functions if required. Special ones are used in the
+non-recursive case for "frames". There is also an optional callout function
+that is triggered by the (?C) regex item. For Virtual Pascal (VPCOMPAT defined),
+these are declared as ordinary functions rather than as function pointers. */
+
+#ifndef VPCOMPAT
+PCRE_EXP_DECL void *(*pcre_malloc)(size_t);
+PCRE_EXP_DECL void (*pcre_free)(void *);
+PCRE_EXP_DECL void *(*pcre_stack_malloc)(size_t);
+PCRE_EXP_DECL void (*pcre_stack_free)(void *);
+PCRE_EXP_DECL int (*pcre_callout)(pcre_callout_block *);
+#else /* VPCOMPAT */
+PCRE_EXP_DECL void *pcre_malloc(size_t);
+PCRE_EXP_DECL void pcre_free(void *);
+PCRE_EXP_DECL void *pcre_stack_malloc(size_t);
+PCRE_EXP_DECL void pcre_stack_free(void *);
+PCRE_EXP_DECL int pcre_callout(pcre_callout_block *);
+#endif /* VPCOMPAT */
+
+/* Exported PCRE functions. Parameter names are omitted; among these prototypes only pcre_exec() uses PCRE_SPTR. */
+
+PCRE_EXP_DECL pcre *pcre_compile(const char *, int, const char **, int *,
+ const unsigned char *);
+PCRE_EXP_DECL pcre *pcre_compile2(const char *, int, int *, const char **,
+ int *, const unsigned char *);
+PCRE_EXP_DECL int pcre_config(int, void *);
+PCRE_EXP_DECL int pcre_copy_named_substring(const pcre *, const char *,
+ int *, int, const char *, char *, int);
+PCRE_EXP_DECL int pcre_copy_substring(const char *, int *, int, int, char *,
+ int);
+PCRE_EXP_DECL int pcre_dfa_exec(const pcre *, const pcre_extra *,
+ const char *, int, int, int, int *, int , int *, int);
+PCRE_EXP_DECL int pcre_exec(const pcre *, const pcre_extra *, PCRE_SPTR,
+ int, int, int, int *, int);
+PCRE_EXP_DECL void pcre_free_substring(const char *);
+PCRE_EXP_DECL void pcre_free_substring_list(const char **);
+PCRE_EXP_DECL int pcre_fullinfo(const pcre *, const pcre_extra *, int,
+ void *);
+PCRE_EXP_DECL int pcre_get_named_substring(const pcre *, const char *,
+ int *, int, const char *, const char **);
+PCRE_EXP_DECL int pcre_get_stringnumber(const pcre *, const char *);
+PCRE_EXP_DECL int pcre_get_stringtable_entries(const pcre *, const char *,
+ char **, char **);
+PCRE_EXP_DECL int pcre_get_substring(const char *, int *, int, int,
+ const char **);
+PCRE_EXP_DECL int pcre_get_substring_list(const char *, int *, int,
+ const char ***);
+PCRE_EXP_DECL int pcre_info(const pcre *, int *, int *);
+PCRE_EXP_DECL const unsigned char *pcre_maketables(void);
+PCRE_EXP_DECL int pcre_refcount(pcre *, int);
+PCRE_EXP_DECL pcre_extra *pcre_study(const pcre *, int, const char **);
+PCRE_EXP_DECL const char *pcre_version(void);
+
+#ifdef __cplusplus
+} /* closes the extern "C" block opened earlier in this header */
+#endif
+
+#endif /* End of pcre.h */
diff --git a/src/pcre_compile.c b/src/pcre_compile.c
new file mode 100644
index 0000000..4b28343
--- /dev/null
+++ b/src/pcre_compile.c
@@ -0,0 +1,6379 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2008 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This module contains the external function pcre_compile(), along with
+supporting internal functions that are not used by other modules. */
+
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define NLBLOCK cd /* Block containing newline information; NOTE(review): set before pcre_internal.h is included, presumably for macros there - confirm */
+#define PSSTART start_pattern /* Field containing processed string start */
+#define PSEND end_pattern /* Field containing processed string end */
+
+#include "pcre_internal.h"
+
+
+/* When DEBUG is defined, we need the pcre_printint() function, which is also
+used by pcretest. DEBUG is not defined when building a production library. */
+
+#ifdef DEBUG
+#include "pcre_printint.src"
+#endif
+
+
+/* Macro for setting bit b in class bitmap a (8 bits per byte). Fully parenthesized so expression arguments work; b is evaluated twice, so it must have no side effects. */
+
+#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
+
+/* Maximum length value to check against when making sure that the integer that
+holds the compiled pattern length does not overflow. We make it a bit less than
+INT_MAX to allow for adding in group terminating bytes, so that we don't have
+to check them every time. */
+
+#define OFLOW_MAX (INT_MAX - 20) /* 20 below INT_MAX as headroom */
+
+
+/*************************************************
+* Code parameters and static tables *
+*************************************************/
+
+/* This value specifies the size of stack workspace that is used during the
+first pre-compile phase that determines how much memory is required. The regex
+is partly compiled into this space, but the compiled parts are discarded as
+soon as they can be, so that hopefully there will never be an overrun. The code
+does, however, check for an overrun. The largest amount I've seen used is 218,
+so this number is very generous.
+
+The same workspace is used during the second, actual compile phase for
+remembering forward references to groups so that they can be filled in at the
+end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
+is 4 there is plenty of room. */
+
+#define COMPILE_WORK_SIZE (4096) /* NOTE(review): units look like bytes (entries are LINK_SIZE bytes) - confirm */
+
+
+/* Table for handling escaped characters in the range '0'-'z'. Positive returns
+are simple data values; negative values are for special things like \d and so
+on. Zero means further processing is needed (for things like \x), or the escape
+is invalid. Exactly one of the two tables below is compiled, selected by EBCDIC. */
+
+#ifndef EBCDIC /* This is the "normal" table for ASCII systems */
+static const short int escapes[] = { /* presumably indexed by (character - '0') - confirm at the use site */
+ 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
+ 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
+ '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
+-ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
+-ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
+-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
+ '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
+-ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
+-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
+ 0, 0, -ESC_z /* x - z */
+};
+
+#else /* This is the "abnormal" table for EBCDIC systems */
+static const short int escapes[] = { /* each row is labelled with the hex code of its first entry (48 .. F8) */
+/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
+/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
+/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
+/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
+/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
+/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
+/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
+/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
+/* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
+/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p, /* NOTE(review): 'l' is a literal here but 0 in the ASCII table - confirm intended */
+/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
+/* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
+/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
+/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
+/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
+/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
+/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
+/* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
+/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
+/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
+/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
+/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
+/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
+};
+#endif
+
+
+/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
+searched linearly. Put all the names into a single string, in order to reduce
+the number of relocations when a shared library is dynamically linked. */
+
+typedef struct verbitem {
+ int len; /* length of the verb's name, excluding the terminating NUL */
+ int op; /* the OP_xxx value for the verb */
+} verbitem;
+
+static const char verbnames[] = /* NUL-separated verb names, in the same order as verbs[] */
+ "ACCEPT\0"
+ "COMMIT\0"
+ "F\0"
+ "FAIL\0"
+ "PRUNE\0"
+ "SKIP\0"
+ "THEN"; /* the last name is terminated by the literal's own NUL */
+
+static const verbitem verbs[] = { /* one { name length, opcode } pair per name in verbnames */
+ { 6, OP_ACCEPT }, /* ACCEPT */
+ { 6, OP_COMMIT }, /* COMMIT */
+ { 1, OP_FAIL }, /* F - same opcode as FAIL */
+ { 4, OP_FAIL }, /* FAIL */
+ { 5, OP_PRUNE }, /* PRUNE */
+ { 4, OP_SKIP }, /* SKIP */
+ { 4, OP_THEN } /* THEN */
+};
+
+static const int verbcount = sizeof(verbs)/sizeof(verbs[0]); /* number of entries in verbs[] */
+
+
+/* Tables of names of POSIX character classes and their lengths. The names are
+now all in a single string, to reduce the number of relocations when a shared
+library is dynamically loaded. The list of lengths is terminated by a zero
+length entry. The first three must be alpha, lower, upper, as this is assumed
+for handling case independence. */
+
+static const char posix_names[] = /* NUL-separated class names */
+ "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
+ "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
+ "word\0" "xdigit"; /* the last name is terminated by the literal's own NUL */
+
+static const uschar posix_name_lengths[] = { /* lengths of the names in posix_names, in the same order */
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; /* alpha..space are 5 each, word 4, xdigit 6, then the 0 terminator */
+
+/* Table of class bit maps for each POSIX class. Each class is formed from a
+base map, with an optional addition or removal of another map. Then, for some
+classes, there is some additional tweaking: for [:blank:] the vertical space
+characters are removed, and for [:alpha:] and [:alnum:] the underscore
+character is removed. The triples in the table consist of the base map offset,
+second map offset or -1 if no second map, and a non-negative value for map
+addition or a negative value for map subtraction (if there are two maps). The
+absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
+remove vertical space characters, 2 => remove underscore. */
+
+static const int posix_class_maps[] = {
+ cbit_word, cbit_digit, -2, /* alpha - word minus digit, underscore removed */
+ cbit_lower, -1, 0, /* lower */
+ cbit_upper, -1, 0, /* upper */
+ cbit_word, -1, 2, /* alnum - word without underscore */
+ cbit_print, cbit_cntrl, 0, /* ascii - print plus cntrl */
+ cbit_space, -1, 1, /* blank - a GNU extension; space without vertical space */
+ cbit_cntrl, -1, 0, /* cntrl */
+ cbit_digit, -1, 0, /* digit */
+ cbit_graph, -1, 0, /* graph */
+ cbit_print, -1, 0, /* print */
+ cbit_punct, -1, 0, /* punct */
+ cbit_space, -1, 0, /* space */
+ cbit_word, -1, 0, /* word - a Perl extension */
+ cbit_xdigit,-1, 0 /* xdigit */
+};
+
+
+/* Two-level stringification: STRING(a) turns its argument, exactly as written,
+into a string literal; XSTRING(s) lets s be macro-expanded first and then
+stringifies the result. */
+
+#define STRING(a)  #a
+#define XSTRING(s) STRING(s)
+
+/* The texts of compile-time error messages. These are "char *" because they
+are passed to the outside world. Do not ever re-use any error number, because
+they are documented. Always add a new error instead. Messages marked DEAD below
+are no longer used. This used to be a table of strings, but in order to reduce
+the number of relocations needed when a shared library is loaded dynamically,
+it is now one long string. We cannot use a table of offsets, because the
+lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
+simply count through to the one we want - this isn't a performance issue
+because these strings are used only when there is a compilation error. */
+
+/* Layout: the texts are concatenated, each one ended by an explicit "\0"
+except the last, which is ended by the zero that terminates the string literal
+itself. A text's error number is its position in this list, starting at 0; the
+numeric comments give the number of the text that follows them. */
+
+static const char error_texts[] =
+ "no error\0"
+ "\\ at end of pattern\0"
+ "\\c at end of pattern\0"
+ "unrecognized character follows \\\0"
+ "numbers out of order in {} quantifier\0"
+ /* 5 */
+ "number too big in {} quantifier\0"
+ "missing terminating ] for character class\0"
+ "invalid escape sequence in character class\0"
+ "range out of order in character class\0"
+ "nothing to repeat\0"
+ /* 10 */
+ "operand of unlimited repeat could match the empty string\0" /** DEAD **/
+ "internal error: unexpected repeat\0"
+ "unrecognized character after (? or (?-\0"
+ "POSIX named classes are supported only within a class\0"
+ "missing )\0"
+ /* 15 */
+ "reference to non-existent subpattern\0"
+ "erroffset passed as NULL\0"
+ "unknown option bit(s) set\0"
+ "missing ) after comment\0"
+ "parentheses nested too deeply\0" /** DEAD **/
+ /* 20 */
+ "regular expression is too large\0"
+ "failed to get memory\0"
+ "unmatched parentheses\0"
+ "internal error: code overflow\0"
+ "unrecognized character after (?<\0"
+ /* 25 */
+ "lookbehind assertion is not fixed length\0"
+ "malformed number or name after (?(\0"
+ "conditional group contains more than two branches\0"
+ "assertion expected after (?(\0"
+ "(?R or (?[+-]digits must be followed by )\0"
+ /* 30 */
+ "unknown POSIX class name\0"
+ "POSIX collating elements are not supported\0"
+ "this version of PCRE is not compiled with PCRE_UTF8 support\0"
+ "spare error\0" /** DEAD **/
+ "character value in \\x{...} sequence is too large\0"
+ /* 35 */
+ "invalid condition (?(0)\0"
+ "\\C not allowed in lookbehind assertion\0"
+ "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
+ "number after (?C is > 255\0"
+ "closing ) for (?C expected\0"
+ /* 40 */
+ "recursive call could loop indefinitely\0"
+ "unrecognized character after (?P\0"
+ "syntax error in subpattern name (missing terminator)\0"
+ "two named subpatterns have the same name\0"
+ "invalid UTF-8 string\0"
+ /* 45 */
+ "support for \\P, \\p, and \\X has not been compiled\0"
+ "malformed \\P or \\p sequence\0"
+ "unknown property name after \\P or \\p\0"
+ "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
+ "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
+ /* 50 */
+ "repeated subpattern is too long\0" /** DEAD **/
+ "octal value is greater than \\377 (not in UTF-8 mode)\0"
+ "internal error: overran compiling workspace\0"
+ "internal error: previously-checked referenced subpattern not found\0"
+ "DEFINE group contains more than one branch\0"
+ /* 55 */
+ "repeating a DEFINE group is not allowed\0"
+ "inconsistent NEWLINE options\0"
+ "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
+ "a numbered reference must not be zero\0"
+ "(*VERB) with an argument is not supported\0"
+ /* 60 */
+ "(*VERB) not recognized\0"
+ "number is too big\0"
+ "subpattern name expected\0"
+ "digit expected after (?+\0"
+ "] is an invalid data character in JavaScript compatibility mode";
+
+
+/* Table to identify digits and hex digits. This is used when compiling
+patterns. Note that the tables in chartables are dependent on the locale, and
+may mark arbitrary characters as digits - but the PCRE compiling code expects
+to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
+a private table here. It costs 256 bytes, but it is a lot faster than doing
+character value tests (at least in some simple cases I timed), and in some
+applications one wants PCRE to compile efficiently as well as match
+efficiently.
+
+For convenience, we use the same bit definitions as in chartables:
+
+ 0x04 decimal digit
+ 0x08 hexadecimal digit
+
+Then we can use ctype_digit and ctype_xdigit in the code. */
+
+#ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
+/* ASCII layout: '0'-'9' carry 0x0c (decimal digit and hex digit bits), 'A'-'F'
+and 'a'-'f' carry 0x08 (hex digit only); every other entry is zero. */
+
+static const unsigned char digitab[] =
+ {
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
+ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
+ 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
+ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
+ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
+
+#else /* This is the "abnormal" case, for EBCDIC systems */
+/* EBCDIC layout: the same bit values as the ASCII table, placed at the
+positions of the EBCDIC letters and digits shown in the row comments. The
+trailing hex number in a row comment is the index of the row's first entry. */
+
+static const unsigned char digitab[] =
+ {
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
+ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
+ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
+ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
+ 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
+
+/* Character-type flags for EBCDIC code points. check_escape() tests an entry
+against the mask 0x0E and treats a zero result as "not alphanumeric". In this
+table the letters carry 0x12 (0x1a for a-f and A-F) and the digits carry 0x1c.
+NOTE(review): the 0x01, 0x10 and 0x80 bits presumably have the same meaning as
+the corresponding ctype bits in the generated character tables - confirm
+against pcre_internal.h. */
+
+static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
+ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
+ 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
+ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
+ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
+ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
+ 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
+ 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
+ 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
+ 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
+ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
+ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
+ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
+ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
+ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
+ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
+ 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
+ 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
+ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
+ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
+ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
+ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
+ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
+ 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
+ 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
+#endif
+
+
+/* Forward declaration to allow mutual recursion */
+
+static BOOL
+ compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
+ int *, int *, branch_chain *, compile_data *, int *);
+
+
+
+/*************************************************
+* Find an error text *
+*************************************************/
+
+/* The error texts are now all in one long string, to save on relocations. As
+some of the text is of unknown length, we can't use a table of offsets.
+Instead, just count through the strings. This is not a performance issue
+because it happens only when there has been a compilation error.
+
+Argument:   the error number
+Returns:    pointer to the error string; a number that is zero or negative
+            yields the first text, and a number beyond the last text yields
+            a fixed "not found" message instead of a pointer past the table
+*/
+
+static const char *
+find_error_text(int n)
+{
+const char *s = error_texts;
+const char *end = error_texts + sizeof(error_texts);
+
+for (; n > 0; n--)
+  {
+  while (*s++ != 0);       /* Step over one text and its terminating zero */
+
+  /* Stepping over the final text leaves s one past the end of the table, so
+  there is no text with this number. Stop before anything is read there. */
+
+  if (s >= end) return "Error text not found (please report)";
+  }
+
+return s;
+}
+
+
+/*************************************************
+* Handle escapes *
+*************************************************/
+
+/* This function is called when a \ has been encountered. It either returns a
+positive value for a simple escape such as \n, or a negative value which
+encodes one of the more complicated things such as \d. A backreference to group
+n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
+UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
+ptr is pointing at the \. On exit, it is on the final character of the escape
+sequence.
+
+Arguments:
+ ptrptr points to the pattern position pointer
+ errorcodeptr points to the errorcode variable
+ bracount number of previous extracting brackets
+ options the options bits
+ isclass TRUE if inside a character class
+
+Returns: zero or positive => a data character
+ negative => a special escape sequence
+ on error, errorcodeptr is set
+*/
+
+static int
+check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
+ int options, BOOL isclass)
+{
+BOOL utf8 = (options & PCRE_UTF8) != 0;
+const uschar *ptr = *ptrptr + 1;
+int c, i;
+
+GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
+ptr--; /* Set pointer back to the last byte */
+
+/* If backslash is at the end of the pattern, it's an error. */
+
+if (c == 0) *errorcodeptr = ERR1;
+
+/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
+in a table. A non-zero result is something that can be returned immediately.
+Otherwise further processing may be required. Note that the lookup stores its
+result in i, so i is zero whenever the final "else" below is entered; the octal
+and \x digit loops further down rely on that as their starting count. */
+
+#ifndef EBCDIC /* ASCII coding */
+else if (c < '0' || c > 'z') {} /* Not alphanumeric */
+else if ((i = escapes[c - '0']) != 0) c = i;
+
+#else /* EBCDIC coding */
+else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
+else if ((i = escapes[c - 0x48]) != 0) c = i;
+#endif
+
+/* Escapes that need further processing, or are illegal. */
+
+else
+ {
+ const uschar *oldptr;
+ BOOL braced, negated;
+
+ switch (c)
+ {
+ /* A number of Perl escapes are not handled by PCRE. We give an explicit
+ error. */
+
+ case 'l':
+ case 'L':
+ case 'N':
+ case 'u':
+ case 'U':
+ *errorcodeptr = ERR37;
+ break;
+
+ /* \g must be followed by one of a number of specific things:
+
+ (1) A number, either plain or braced. If positive, it is an absolute
+ backreference. If negative, it is a relative backreference. This is a Perl
+ 5.10 feature.
+
+ (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
+ is part of Perl's movement towards a unified syntax for back references. As
+ this is synonymous with \k{name}, we fudge it up by pretending it really
+ was \k.
+
+ (3) For Oniguruma compatibility we also support \g followed by a name or a
+ number either in angle brackets or in single quotes. However, these are
+ (possibly recursive) subroutine calls, _not_ backreferences. Just return
+ the -ESC_g code (cf \k). */
+
+ case 'g':
+ if (ptr[1] == '<' || ptr[1] == '\'')
+ {
+ c = -ESC_g;
+ break;
+ }
+
+ /* Handle the Perl-compatible cases */
+
+ if (ptr[1] == '{')
+ {
+ /* Look ahead: if anything other than digits and '-' appears before the
+ closing brace, the braces hold a name and this is treated as \k. */
+ const uschar *p;
+ for (p = ptr+2; *p != 0 && *p != '}'; p++)
+ if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
+ if (*p != 0 && *p != '}')
+ {
+ c = -ESC_k;
+ break;
+ }
+ braced = TRUE;
+ ptr++;
+ }
+ else braced = FALSE;
+
+ if (ptr[1] == '-')
+ {
+ negated = TRUE;
+ ptr++;
+ }
+ else negated = FALSE;
+
+ c = 0;
+ while ((digitab[ptr[1]] & ctype_digit) != 0)
+ c = c * 10 + *(++ptr) - '0';
+
+ /* NOTE(review): overflow is detected only by the accumulated value having
+ become negative; the accumulation itself is unchecked, so a value that wraps
+ back to a non-negative number would not be caught here - confirm. */
+
+ if (c < 0) /* Integer overflow */
+ {
+ *errorcodeptr = ERR61;
+ break;
+ }
+
+ if (braced && *(++ptr) != '}')
+ {
+ *errorcodeptr = ERR57;
+ break;
+ }
+
+ if (c == 0)
+ {
+ *errorcodeptr = ERR58;
+ break;
+ }
+
+ /* A relative reference counts back from the most recently opened group:
+ -1 becomes bracount, -2 becomes bracount - 1, and so on. */
+
+ if (negated)
+ {
+ if (c > bracount)
+ {
+ *errorcodeptr = ERR15;
+ break;
+ }
+ c = bracount - (c - 1);
+ }
+
+ c = -(ESC_REF + c);
+ break;
+
+ /* The handling of escape sequences consisting of a string of digits
+ starting with one that is not zero is not straightforward. By experiment,
+ the way Perl works seems to be as follows:
+
+ Outside a character class, the digits are read as a decimal number. If the
+ number is less than 10, or if there are that many previous extracting
+ left brackets, then it is a back reference. Otherwise, up to three octal
+ digits are read to form an escaped byte. Thus \123 is likely to be octal
+ 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
+ value is greater than 377, the least significant 8 bits are taken. Inside a
+ character class, \ followed by a digit is always an octal number. */
+
+ case '1': case '2': case '3': case '4': case '5':
+ case '6': case '7': case '8': case '9':
+
+ if (!isclass)
+ {
+ oldptr = ptr;
+ c -= '0';
+ while ((digitab[ptr[1]] & ctype_digit) != 0)
+ c = c * 10 + *(++ptr) - '0';
+ /* NOTE(review): same unchecked accumulation as for \g above - confirm. */
+ if (c < 0) /* Integer overflow */
+ {
+ *errorcodeptr = ERR61;
+ break;
+ }
+ if (c < 10 || c <= bracount)
+ {
+ c = -(ESC_REF + c);
+ break;
+ }
+ ptr = oldptr; /* Put the pointer back and fall through */
+ }
+
+ /* Handle an octal number following \. If the first digit is 8 or 9, Perl
+ generates a binary zero byte and treats the digit as a following literal.
+ Thus we have to pull back the pointer by one. */
+
+ if ((c = *ptr) >= '8')
+ {
+ ptr--;
+ c = 0;
+ break;
+ }
+
+ /* \0 always starts an octal number, but we may drop through to here with a
+ larger first octal digit. The original code used just to take the least
+ significant 8 bits of octal numbers (I think this is what early Perls used
+ to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
+ than 3 octal digits. */
+
+ case '0':
+ c -= '0';
+ /* i is zero on entry (see the table lookup at the top of the function), so
+ at most two more octal digits are read after the first one. */
+ while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
+ c = c * 8 + *(++ptr) - '0';
+ if (!utf8 && c > 255) *errorcodeptr = ERR51;
+ break;
+
+ /* \x is complicated. \x{ddd} is a character number which can be greater
+ than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
+ treated as a data character. */
+
+ case 'x':
+ if (ptr[1] == '{')
+ {
+ const uschar *pt = ptr + 2;
+ int count = 0;
+
+ /* count is the number of hex digits seen, not counting leading zeroes; it
+ is checked against the permitted maximum only once '}' has been found. */
+
+ c = 0;
+ while ((digitab[*pt] & ctype_xdigit) != 0)
+ {
+ register int cc = *pt++;
+ if (c == 0 && cc == '0') continue; /* Leading zeroes */
+ count++;
+
+#ifndef EBCDIC /* ASCII coding */
+ if (cc >= 'a') cc -= 32; /* Convert to upper case */
+ c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
+#else /* EBCDIC coding */
+ if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
+ c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
+#endif
+ }
+
+ if (*pt == '}')
+ {
+ if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
+ ptr = pt;
+ break;
+ }
+
+ /* If the sequence of hex digits does not end with '}', then we don't
+ recognize this construct; fall through to the normal \x handling. */
+ }
+
+ /* Read just a single-byte hex-defined char. As in the octal case, i is still
+ zero here, so at most two hex digits are read. */
+
+ c = 0;
+ while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
+ {
+ int cc; /* Some compilers don't like ++ */
+ cc = *(++ptr); /* in initializers */
+#ifndef EBCDIC /* ASCII coding */
+ if (cc >= 'a') cc -= 32; /* Convert to upper case */
+ c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
+#else /* EBCDIC coding */
+ if (cc <= 'z') cc += 64; /* Convert to upper case */
+ c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
+#endif
+ }
+ break;
+
+ /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
+ This coding is ASCII-specific, but then the whole concept of \cx is
+ ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
+
+ case 'c':
+ c = *(++ptr);
+ if (c == 0)
+ {
+ *errorcodeptr = ERR2;
+ break;
+ }
+
+#ifndef EBCDIC /* ASCII coding */
+ if (c >= 'a' && c <= 'z') c -= 32;
+ c ^= 0x40;
+#else /* EBCDIC coding */
+ if (c >= 'a' && c <= 'z') c += 64;
+ c ^= 0xC0;
+#endif
+ break;
+
+ /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
+ other alphanumeric following \ is an error if PCRE_EXTRA was set;
+ otherwise, for Perl compatibility, it is a literal. This code looks a bit
+ odd, but there used to be some cases other than the default, and there may
+ be again in future, so I haven't "optimized" it. */
+
+ default:
+ if ((options & PCRE_EXTRA) != 0) switch(c)
+ {
+ default:
+ *errorcodeptr = ERR3;
+ break;
+ }
+ break;
+ }
+ }
+
+/* The caller's pointer is updated on every path, including the error paths. */
+
+*ptrptr = ptr;
+return c;
+}
+
+
+
+#ifdef SUPPORT_UCP
+/*************************************************
+* Handle \P and \p *
+*************************************************/
+
+/* This function is called after \P or \p has been encountered, provided that
+PCRE is compiled with support for Unicode properties. On entry, ptrptr is
+pointing at the P or p. On exit, it is pointing at the final character of the
+escape sequence.
+
+Argument:
+ ptrptr points to the pattern position pointer
+ negptr points to a boolean that is set TRUE for negation else FALSE
+ dptr points to an int that is set to the detailed property value
+ errorcodeptr points to the error code variable
+
+Returns: type value from ucp_type_table, or -1 for an invalid type
+*/
+
+static int
+get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
+{
+const uschar *ptr = *ptrptr;
+char name[32];
+int ch, len, lo, hi;
+
+ch = *(++ptr);
+if (ch == 0) goto MALFORMED;
+
+*negptr = FALSE;
+
+/* The simple form is a single character that is itself the property name. */
+
+if (ch != '{')
+  {
+  name[0] = ch;
+  name[1] = 0;
+  }
+
+/* The braced form holds a name, optionally preceded by ^ for negation. The
+name has to fit in the buffer and has to be closed by '}'. */
+
+else
+  {
+  if (ptr[1] == '^')
+    {
+    *negptr = TRUE;
+    ptr++;
+    }
+
+  len = 0;
+  for (;;)
+    {
+    if (len >= (int)sizeof(name) - 1) goto MALFORMED;   /* Name too long */
+    ch = *(++ptr);
+    if (ch == 0) goto MALFORMED;                        /* Missing '}' */
+    if (ch == '}') break;
+    name[len++] = ch;
+    }
+  name[len] = 0;
+  }
+
+/* The sequence is well formed; leave the caller's pointer on its final
+character, whether or not the name turns out to be a known one. */
+
+*ptrptr = ptr;
+
+/* Binary chop through the table of property names */
+
+lo = 0;
+hi = _pcre_utt_size;
+
+while (lo < hi)
+  {
+  int mid = (lo + hi) >> 1;
+  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);
+
+  if (cmp > 0) lo = mid + 1;
+  else if (cmp < 0) hi = mid;
+  else
+    {
+    *dptr = _pcre_utt[mid].value;
+    return _pcre_utt[mid].type;
+    }
+  }
+
+/* Well formed, but not a name that is in the table */
+
+*errorcodeptr = ERR47;
+return -1;
+
+/* Malformed sequence: report it at the point that was reached */
+
+MALFORMED:
+*errorcodeptr = ERR46;
+*ptrptr = ptr;
+return -1;
+}
+#endif
+
+
+
+
+/*************************************************
+* Check for counted repeat *
+*************************************************/
+
+/* This function is called when a '{' is encountered in a place where it might
+start a quantifier. It looks ahead to see if it really is a quantifier or not.
+It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
+where the ddds are digits.
+
+Arguments:
+ p pointer to the first char after '{'
+
+Returns: TRUE or FALSE
+*/
+
+static BOOL
+is_counted_repeat(const uschar *p)
+{
+const uschar *s = p;
+
+/* The minimum is mandatory: one or more digits */
+
+if ((digitab[*s] & ctype_digit) == 0) return FALSE;
+do s++; while ((digitab[*s] & ctype_digit) != 0);
+
+/* {ddd} ends here; otherwise only a comma may follow the minimum */
+
+if (*s == '}') return TRUE;
+if (*s != ',') return FALSE;
+s++;
+
+/* {ddd,} has no maximum; {ddd,ddd} needs one or more digits and then '}' */
+
+if (*s == '}') return TRUE;
+if ((digitab[*s] & ctype_digit) == 0) return FALSE;
+do s++; while ((digitab[*s] & ctype_digit) != 0);
+
+return (*s == '}');
+}
+
+
+
+/*************************************************
+* Read repeat counts *
+*************************************************/
+
+/* Read an item of the form {n,m} and return the values. This is called only
+after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
+so the syntax is guaranteed to be correct, but we need to check the values.
+
+Arguments:
+ p pointer to first char after '{'
+ minp pointer to int for min
+ maxp pointer to int for max
+ returned as -1 if no max
+ errorcodeptr points to error code variable
+
+Returns: pointer to '}' on success;
+ current ptr on error, with errorcodeptr set non-zero
+*/
+
+static const uschar *
+read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
+{
+int min = 0;
+int max = -1;
+
+/* Read the minimum value. Every digit is consumed, so that on error the
+returned pointer is just past the number, but accumulation stops as soon as the
+value exceeds 65535. The value therefore never goes above 655359, which means
+that a long digit string can neither overflow the int nor wrap round to a value
+that looks valid. */
+
+while ((digitab[*p] & ctype_digit) != 0)
+  {
+  int digit = *p++ - '0';
+  if (min <= 65535) min = min * 10 + digit;
+  }
+
+if (min > 65535)
+  {
+  *errorcodeptr = ERR5;
+  return p;
+  }
+
+/* Read the maximum value if there is one, and check its size in the same way.
+Also, max must not be less than min. If there is a comma but no maximum, max is
+left at -1, meaning "no upper limit". */
+
+if (*p == '}') max = min; else
+  {
+  if (*(++p) != '}')
+    {
+    max = 0;
+    while ((digitab[*p] & ctype_digit) != 0)
+      {
+      int digit = *p++ - '0';
+      if (max <= 65535) max = max * 10 + digit;
+      }
+
+    if (max > 65535)
+      {
+      *errorcodeptr = ERR5;
+      return p;
+      }
+
+    if (max < min)
+      {
+      *errorcodeptr = ERR4;
+      return p;
+      }
+    }
+  }
+
+/* Fill in the required variables, and pass back the pointer to the terminating
+'}'. */
+
+*minp = min;
+*maxp = max;
+return p;
+}
+
+
+
+/*************************************************
+* Find forward referenced subpattern *
+*************************************************/
+
+/* This function scans along a pattern's text looking for capturing
+subpatterns, and counting them. If it finds a named pattern that matches the
+name it is given, it returns its number. Alternatively, if the name is NULL, it
+returns when it reaches a given numbered subpattern. This is used for forward
+references to subpatterns. The scan never reads beyond the zero that terminates
+the pattern, whether or not the text has already been syntax checked: if the
+pattern ends in the middle of an escape, a \Q...\E sequence, a character class,
+a comment, or a subpattern name, the result is "not found".
+
+Arguments:
+  ptr          current position in the pattern
+  cd           compile background data
+  name         name to seek, or NULL if seeking a numbered subpattern
+  lorn         name length, or subpattern number if name is NULL
+  xmode        TRUE if we are in /x mode
+
+Returns:       the number of the found subpattern, or -1 if not found
+*/
+
+static int
+find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
+  BOOL xmode)
+{
+const uschar *thisname;
+int count = cd->bracount;
+
+for (; *ptr != 0; ptr++)
+  {
+  int term;
+
+  /* Skip over backslashed characters and also entire \Q...\E. Inside \Q, each
+  backslash is looked at in turn without consuming what follows it, so that
+  \\E is seen as a literal backslash followed by \E, and a backslash that is
+  the last character of the pattern does not take the scan past the end. */
+
+  if (*ptr == '\\')
+    {
+    if (*(++ptr) == 0) return -1;
+    if (*ptr == 'Q') for (;;)
+      {
+      while (*(++ptr) != 0 && *ptr != '\\');
+      if (*ptr == 0) return -1;
+      if (ptr[1] == 'E') { ptr++; break; }
+      }
+    continue;
+    }
+
+  /* Skip over character classes; this logic must be similar to the way they
+  are handled for real. If the first character is '^', skip it. Also, if the
+  first few characters (either before or after ^) are \Q\E or \E we skip them
+  too. This makes for compatibility with Perl. The characters are inspected
+  before they are consumed, so when this loop ends, ptr is on the last character
+  of the prefix (or still on the '[') and ptr[1] is the first character of the
+  class proper. */
+
+  if (*ptr == '[')
+    {
+    BOOL negate_class = FALSE;
+    for (;;)
+      {
+      if (ptr[1] == '\\')
+        {
+        if (ptr[2] == 'E') ptr += 2;
+        else if (strncmp((const char *)ptr+2, "Q\\E", 3) == 0) ptr += 4;
+        else break;
+        }
+      else if (!negate_class && ptr[1] == '^')
+        {
+        negate_class = TRUE;
+        ptr++;
+        }
+      else break;
+      }
+
+    /* If the next character is ']', it is a data character that must be
+    skipped, except in JavaScript compatibility mode. */
+
+    if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
+      ptr++;
+
+    /* Now find the terminating ']', stepping over escapes and \Q...\E */
+
+    while (*(++ptr) != ']')
+      {
+      if (*ptr == 0) return -1;
+      if (*ptr == '\\')
+        {
+        if (*(++ptr) == 0) return -1;
+        if (*ptr == 'Q') for (;;)
+          {
+          while (*(++ptr) != 0 && *ptr != '\\');
+          if (*ptr == 0) return -1;
+          if (ptr[1] == 'E') { ptr++; break; }
+          }
+        }
+      }
+    continue;
+    }
+
+  /* Skip comments in /x mode */
+
+  if (xmode && *ptr == '#')
+    {
+    while (*(++ptr) != 0 && *ptr != '\n');
+    if (*ptr == 0) return -1;
+    continue;
+    }
+
+  /* An opening parens must now be a real metacharacter */
+
+  if (*ptr != '(') continue;
+  if (ptr[1] != '?' && ptr[1] != '*')
+    {
+    count++;
+    if (name == NULL && count == lorn) return count;
+    continue;
+    }
+
+  ptr += 2;
+  if (*ptr == 'P') ptr++;                      /* Allow optional P */
+
+  /* The pattern may end straight after "(?", "(*" or "(?P". Stop here in that
+  case, because the loop increment would otherwise step over the terminating
+  zero. */
+
+  if (*ptr == 0) return -1;
+
+  /* We have to disambiguate (?<! and (?<= from (?<name> */
+
+  if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
+       *ptr != '\'')
+    continue;
+
+  count++;
+
+  if (name == NULL && count == lorn) return count;
+  term = *ptr++;
+  if (term == '<') term = '>';
+  thisname = ptr;
+
+  /* Find the end of the name, giving up if the terminator is missing */
+
+  while (*ptr != 0 && *ptr != term) ptr++;
+  if (*ptr == 0) return -1;
+
+  if (name != NULL && lorn == ptr - thisname &&
+      strncmp((const char *)name, (const char *)thisname, lorn) == 0)
+    return count;
+  }
+
+return -1;
+}
+
+
+
+/*************************************************
+* Find first significant op code *
+*************************************************/
+
+/* This is called by several functions that scan a compiled expression looking
+for a fixed first character, or an anchoring op code etc. It skips over things
+that do not influence this. For some calls, a change of option is important.
+For some calls, it makes sense to skip negative forward and all backward
+assertions, and also the \b assertion; for others it does not.
+
+Arguments:
+ code pointer to the start of the group
+ options pointer to external options
+ optbit the option bit whose changing is significant, or
+ zero if none are
+ skipassert TRUE if certain assertions are to be skipped
+
+Returns: pointer to the first significant opcode
+*/
+
+static const uschar*
+first_significant_code(const uschar *code, int *options, int optbit,
+  BOOL skipassert)
+{
+for (;;)
+  {
+  const int op = (int)*code;
+
+  /* An option setting: the new options are taken over only when a watched bit
+  was given and its value differs from the current one. The item itself is
+  always passed over. */
+
+  if (op == OP_OPT)
+    {
+    const int newoptions = (int)code[1];
+    if (optbit > 0 && (newoptions & optbit) != (*options & optbit))
+      *options = newoptions;
+    code += 2;
+    }
+
+  /* Negative lookahead and both kinds of lookbehind: either significant, or
+  passed over as a whole, which means following the chain of alternatives to
+  the closing item and then stepping past that. */
+
+  else if (op == OP_ASSERT_NOT || op == OP_ASSERTBACK ||
+           op == OP_ASSERTBACK_NOT)
+    {
+    if (!skipassert) return code;
+    do code += GET(code, 1); while (*code == OP_ALT);
+    code += _pcre_OP_lengths[*code];
+    }
+
+  /* Word boundary assertions: either significant or passed over */
+
+  else if (op == OP_WORD_BOUNDARY || op == OP_NOT_WORD_BOUNDARY)
+    {
+    if (!skipassert) return code;
+    code += _pcre_OP_lengths[op];
+    }
+
+  /* Items that are always passed over */
+
+  else if (op == OP_CALLOUT || op == OP_CREF || op == OP_RREF || op == OP_DEF)
+    {
+    code += _pcre_OP_lengths[op];
+    }
+
+  /* Anything else is the first significant opcode */
+
+  else return code;
+  }
+/* Control never reaches here */
+}
+
+
+
+
+/*************************************************
+* Find the fixed length of a pattern *
+*************************************************/
+
+/* Scan a pattern and compute the fixed length of subject that will match it,
+if the length is fixed. This is needed for dealing with backward assertions.
+In UTF8 mode, the result is in characters rather than bytes.
+
+Arguments:
+ code points to the start of the pattern (the bracket)
+ options the compiling options
+
+Returns: the fixed length, or -1 if there is no fixed length,
+ or -2 if \C was encountered
+*/
+
+static int
+find_fixedlength(uschar *code, int options)
+{
+int length = -1;                /* Common length of the branches seen so far */
+
+register int branchlength = 0;  /* Length of the branch being scanned */
+register uschar *cc = code + 1 + LINK_SIZE;
+
+/* Scan along the opcodes for this branch. If we get to the end of the
+branch, check the length against that of the other branches. */
+
+for (;;)
+  {
+  int d;
+  register int op = *cc;
+  switch (op)
+    {
+    /* A nested group: its own fixed length is found by a recursive call (for
+    a capturing group the extra two bytes after the opcode are passed over
+    first), and then the whole group is stepped over. */
+
+    case OP_CBRA:
+    case OP_BRA:
+    case OP_ONCE:
+    case OP_COND:
+    d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
+    if (d < 0) return d;
+    branchlength += d;
+    do cc += GET(cc, 1); while (*cc == OP_ALT);
+    cc += 1 + LINK_SIZE;
+    break;
+
+    /* Reached end of a branch; if it's a ket it is the end of a nested
+    call. If it's ALT it is an alternation in a nested call. If it is
+    END it's the end of the outer call. All can be handled by the same code.
+    Every branch must have the same length as the first one. */
+
+    case OP_ALT:
+    case OP_KET:
+    case OP_KETRMAX:
+    case OP_KETRMIN:
+    case OP_END:
+    if (length < 0) length = branchlength;
+      else if (length != branchlength) return -1;
+    if (*cc != OP_ALT) return length;
+    cc += 1 + LINK_SIZE;
+    branchlength = 0;
+    break;
+
+    /* Skip over assertive subpatterns */
+
+    case OP_ASSERT:
+    case OP_ASSERT_NOT:
+    case OP_ASSERTBACK:
+    case OP_ASSERTBACK_NOT:
+    do cc += GET(cc, 1); while (*cc == OP_ALT);
+    /* Fall through */
+
+    /* Skip over things that don't match chars */
+
+    case OP_REVERSE:
+    case OP_CREF:
+    case OP_RREF:
+    case OP_DEF:
+    case OP_OPT:
+    case OP_CALLOUT:
+    case OP_SOD:
+    case OP_SOM:
+    case OP_EOD:
+    case OP_EODN:
+    case OP_CIRC:
+    case OP_DOLL:
+    case OP_NOT_WORD_BOUNDARY:
+    case OP_WORD_BOUNDARY:
+    cc += _pcre_OP_lengths[*cc];
+    break;
+
+    /* Handle literal characters. In UTF-8 mode the bytes that continue a
+    multibyte character (those of the form 10xxxxxx) are passed over. */
+
+    case OP_CHAR:
+    case OP_CHARNC:
+    case OP_NOT:
+    branchlength++;
+    cc += 2;
+#ifdef SUPPORT_UTF8
+    if ((options & PCRE_UTF8) != 0)
+      {
+      while ((*cc & 0xc0) == 0x80) cc++;
+      }
+#endif
+    break;
+
+    /* Handle exact repetitions. The count is already in characters, but we
+    need to skip over a multibyte character in UTF8 mode. Only bytes of the
+    form 10xxxxxx are skipped, exactly as for OP_CHAR above, so that whatever
+    follows the character can never be taken for part of it. */
+
+    case OP_EXACT:
+    branchlength += GET2(cc,1);
+    cc += 4;
+#ifdef SUPPORT_UTF8
+    if ((options & PCRE_UTF8) != 0)
+      {
+      while ((*cc & 0xc0) == 0x80) cc++;
+      }
+#endif
+    break;
+
+    /* An exactly repeated character type; a property type is followed by two
+    more bytes. */
+
+    case OP_TYPEEXACT:
+    branchlength += GET2(cc,1);
+    if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
+    cc += 4;
+    break;
+
+    /* Handle single-char matchers */
+
+    case OP_PROP:
+    case OP_NOTPROP:
+    cc += 2;
+    /* Fall through */
+
+    case OP_NOT_DIGIT:
+    case OP_DIGIT:
+    case OP_NOT_WHITESPACE:
+    case OP_WHITESPACE:
+    case OP_NOT_WORDCHAR:
+    case OP_WORDCHAR:
+    case OP_ANY:
+    case OP_ALLANY:
+    branchlength++;
+    cc++;
+    break;
+
+    /* The single-byte matcher isn't allowed */
+
+    case OP_ANYBYTE:
+    return -2;
+
+    /* Check a class for variable quantification. For XCLASS the length is
+    held in the compiled code; it is adjusted here so that the addition of 33
+    that is shared with the bit map classes ends up just past the item. */
+
+#ifdef SUPPORT_UTF8
+    case OP_XCLASS:
+    cc += GET(cc, 1) - 33;
+    /* Fall through */
+#endif
+
+    case OP_CLASS:
+    case OP_NCLASS:
+    cc += 33;
+
+    switch (*cc)
+      {
+      case OP_CRSTAR:
+      case OP_CRMINSTAR:
+      case OP_CRQUERY:
+      case OP_CRMINQUERY:
+      return -1;
+
+      /* A range repeat is fixed length only if its two counts are equal */
+
+      case OP_CRRANGE:
+      case OP_CRMINRANGE:
+      if (GET2(cc,1) != GET2(cc,3)) return -1;
+      branchlength += GET2(cc,1);
+      cc += 5;
+      break;
+
+      /* No repeat follows: the class matches one character */
+
+      default:
+      branchlength++;
+      }
+    break;
+
+    /* Anything else is variable length */
+
+    default:
+    return -1;
+    }
+  }
+/* Control never gets here */
+}
+
+
+
+
+/*************************************************
+* Scan compiled regex for numbered bracket *
+*************************************************/
+
+/* This little function scans through a compiled pattern until it finds a
+capturing bracket with the given number.
+
+Arguments:
+ code points to start of expression
+ utf8 TRUE in UTF-8 mode
+ number the required bracket number
+
+Returns: pointer to the opcode for the bracket, or NULL if not found
+*/
+
+static const uschar *
+find_bracket(const uschar *code, BOOL utf8, int number)
+{
+register int op;
+
+while ((op = *code) != OP_END)
+  {
+  /* An XCLASS item is a class that a bit map alone cannot describe (for
+  example, a negated single high-valued character). Its entry in the length
+  table is zero; the real length is held in the compiled code. */
+
+  if (op == OP_XCLASS)
+    {
+    code += GET(code, 1);
+    continue;
+    }
+
+  /* A capturing bracket: the group number follows the link field. If it is
+  not the wanted one, it is stepped over like any other item below. */
+
+  if (op == OP_CBRA && GET2(code, 1+LINK_SIZE) == number) return code;
+
+  /* For repeated character types, \p and \P have two extra bytes of
+  parameters that the table length does not cover. The type opcode is at
+  offset 1 for plain repeats and at offset 3 when there is a count. */
+
+  switch(op)
+    {
+    case OP_TYPESTAR:
+    case OP_TYPEMINSTAR:
+    case OP_TYPEPLUS:
+    case OP_TYPEMINPLUS:
+    case OP_TYPEQUERY:
+    case OP_TYPEMINQUERY:
+    case OP_TYPEPOSSTAR:
+    case OP_TYPEPOSPLUS:
+    case OP_TYPEPOSQUERY:
+    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
+    break;
+
+    case OP_TYPEUPTO:
+    case OP_TYPEMINUPTO:
+    case OP_TYPEEXACT:
+    case OP_TYPEPOSUPTO:
+    if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
+    break;
+    }
+
+  /* Advance by the fixed length taken from the table. */
+
+  code += _pcre_OP_lengths[op];
+
+  /* In UTF-8 mode an opcode that is followed by a character may be followed
+  by a multi-byte character. The table length is a minimum, so any extra
+  bytes are skipped here; after the advance above, the byte that says how
+  many there are is at code[-1]. */
+
+#ifdef SUPPORT_UTF8
+  if (!utf8) continue;
+  switch(op)
+    {
+    case OP_CHAR:
+    case OP_CHARNC:
+    case OP_EXACT:
+    case OP_UPTO:
+    case OP_MINUPTO:
+    case OP_POSUPTO:
+    case OP_STAR:
+    case OP_MINSTAR:
+    case OP_POSSTAR:
+    case OP_PLUS:
+    case OP_MINPLUS:
+    case OP_POSPLUS:
+    case OP_QUERY:
+    case OP_MINQUERY:
+    case OP_POSQUERY:
+    if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
+    break;
+    }
+#endif
+  }
+
+return NULL;
+}
+
+
+
+/*************************************************
+* Scan compiled regex for recursion reference *
+*************************************************/
+
+/* This little function scans through a compiled pattern until it finds an
+instance of OP_RECURSE.
+
+Arguments:
+ code points to start of expression
+ utf8 TRUE in UTF-8 mode
+
+Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
+*/
+
+static const uschar *
+find_recurse(const uschar *code, BOOL utf8)
+{
+register int op;
+
+for (op = *code; op != OP_END; op = *code)
+  {
+  if (op == OP_RECURSE) return code;
+
+  /* XCLASS items (classes that need more than a bit map, including negated
+  single high-valued characters) have a zero entry in the length table. The
+  actual length is stored in the compiled code. */
+
+  if (op == OP_XCLASS)
+    {
+    code += GET(code, 1);
+    continue;
+    }
+
+  /* Everything else has a table length, but a repeated character type that
+  is \p or \P carries two extra bytes of parameters, which have to be added
+  in separately. */
+
+  switch(op)
+    {
+    case OP_TYPESTAR:
+    case OP_TYPEMINSTAR:
+    case OP_TYPEPLUS:
+    case OP_TYPEMINPLUS:
+    case OP_TYPEQUERY:
+    case OP_TYPEMINQUERY:
+    case OP_TYPEPOSSTAR:
+    case OP_TYPEPOSPLUS:
+    case OP_TYPEPOSQUERY:
+    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
+    break;
+
+    case OP_TYPEPOSUPTO:
+    case OP_TYPEUPTO:
+    case OP_TYPEMINUPTO:
+    case OP_TYPEEXACT:
+    if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
+    break;
+    }
+
+  code += _pcre_OP_lengths[op];
+
+  /* In UTF-8 mode the character after one of these opcodes may be longer
+  than the single byte the table allows for; skip whatever extra there is. */
+
+#ifdef SUPPORT_UTF8
+  if (!utf8) continue;
+  switch(op)
+    {
+    case OP_CHAR:
+    case OP_CHARNC:
+    case OP_EXACT:
+    case OP_UPTO:
+    case OP_MINUPTO:
+    case OP_POSUPTO:
+    case OP_STAR:
+    case OP_MINSTAR:
+    case OP_POSSTAR:
+    case OP_PLUS:
+    case OP_MINPLUS:
+    case OP_POSPLUS:
+    case OP_QUERY:
+    case OP_MINQUERY:
+    case OP_POSQUERY:
+    if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
+    break;
+    }
+#endif
+  }
+return NULL;
+}
+
+
+
+/*************************************************
+* Scan compiled branch for non-emptiness *
+*************************************************/
+
+/* This function scans through a branch of a compiled pattern to see whether it
+can match the empty string or not. It is called from could_be_empty()
+below and from compile_branch() when checking for an unlimited repeat of a
+group that can match nothing. Note that first_significant_code() skips over
+backward and negative forward assertions when its final argument is TRUE. If we
+hit an unclosed bracket, we return "empty" - this means we've struck an inner
+bracket whose current branch will already have been scanned.
+
+Arguments:
+ code points to start of search
+ endcode points to where to stop
+ utf8 TRUE if in UTF8 mode
+
+Returns: TRUE if what is matched could be empty
+*/
+
+static BOOL
+could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
+{
+register int c;   /* Current opcode; its table length drives the loop step */
+for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
+     code < endcode;
+     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
+  {
+  const uschar *ccode;
+
+  c = *code;
+
+  /* Skip over forward assertions; the other assertions are skipped by
+  first_significant_code() with a TRUE final argument. */
+
+  if (c == OP_ASSERT)
+    {
+    do code += GET(code, 1); while (*code == OP_ALT);
+    c = *code;      /* So that the loop steps over the item now at code */
+    continue;
+    }
+
+  /* Groups with zero repeats can of course be empty; skip them. */
+
+  if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
+    {
+    code += _pcre_OP_lengths[c];
+    do code += GET(code, 1); while (*code == OP_ALT);
+    c = *code;      /* So that the loop steps over the item now at code */
+    continue;
+    }
+
+  /* For other groups, scan the branches. */
+
+  if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
+    {
+    BOOL empty_branch;
+    if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
+
+    /* Scan a closed bracket; one possibly-empty branch is enough. */
+
+    empty_branch = FALSE;
+    do
+      {
+      if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
+        empty_branch = TRUE;
+      code += GET(code, 1);
+      }
+    while (*code == OP_ALT);
+    if (!empty_branch) return FALSE;   /* All branches are non-empty */
+    c = *code;      /* So that the loop steps over the item now at code */
+    continue;
+    }
+
+  /* Handle the other opcodes */
+
+  switch (c)
+    {
+    /* Check for quantifiers after a class. XCLASS is used for classes that
+    cannot be represented just by a bit map. This includes negated single
+    high-valued characters. The length in _pcre_OP_lengths[] is zero; the
+    actual length is stored in the compiled code, so we must update "code"
+    here. */
+
+#ifdef SUPPORT_UTF8
+    case OP_XCLASS:
+    ccode = code += GET(code, 1);
+    goto CHECK_CLASS_REPEAT;
+#endif
+
+    case OP_CLASS:
+    case OP_NCLASS:
+    ccode = code + 33;
+
+#ifdef SUPPORT_UTF8
+    CHECK_CLASS_REPEAT:
+#endif
+
+    switch (*ccode)
+      {
+      case OP_CRSTAR:            /* These could be empty; continue */
+      case OP_CRMINSTAR:
+      case OP_CRQUERY:
+      case OP_CRMINQUERY:
+      break;
+
+      default:                   /* Non-repeat => class must match */
+      case OP_CRPLUS:            /* These repeats aren't empty */
+      case OP_CRMINPLUS:
+      return FALSE;
+
+      case OP_CRRANGE:
+      case OP_CRMINRANGE:
+      if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
+      break;
+      }
+    break;
+
+    /* Opcodes that must match a character */
+
+    case OP_PROP:
+    case OP_NOTPROP:
+    case OP_EXTUNI:
+    case OP_NOT_DIGIT:
+    case OP_DIGIT:
+    case OP_NOT_WHITESPACE:
+    case OP_WHITESPACE:
+    case OP_NOT_WORDCHAR:
+    case OP_WORDCHAR:
+    case OP_ANY:
+    case OP_ALLANY:
+    case OP_ANYBYTE:
+    case OP_CHAR:
+    case OP_CHARNC:
+    case OP_NOT:
+    case OP_PLUS:
+    case OP_MINPLUS:
+    case OP_POSPLUS:
+    case OP_EXACT:
+    case OP_NOTPLUS:
+    case OP_NOTMINPLUS:
+    case OP_NOTPOSPLUS:
+    case OP_NOTEXACT:
+    case OP_TYPEPLUS:
+    case OP_TYPEMINPLUS:
+    case OP_TYPEPOSPLUS:
+    case OP_TYPEEXACT:
+    return FALSE;
+
+    /* These are going to continue, as they may be empty, but we have to
+    fudge the length for the \p and \P cases. */
+
+    case OP_TYPESTAR:
+    case OP_TYPEMINSTAR:
+    case OP_TYPEPOSSTAR:
+    case OP_TYPEQUERY:
+    case OP_TYPEMINQUERY:
+    case OP_TYPEPOSQUERY:
+    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
+    break;
+
+    /* Same for these */
+
+    case OP_TYPEUPTO:
+    case OP_TYPEMINUPTO:
+    case OP_TYPEPOSUPTO:
+    if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
+    break;
+
+    /* End of branch */
+
+    case OP_KET:
+    case OP_KETRMAX:
+    case OP_KETRMIN:
+    case OP_ALT:
+    return TRUE;
+
+    /* In UTF-8 mode these may be followed by a multibyte character. Its extra
+    bytes start at the item's table length, which is larger for the UPTOs. */
+
+#ifdef SUPPORT_UTF8
+    case OP_STAR:
+    case OP_MINSTAR:
+    case OP_POSSTAR:
+    case OP_QUERY:
+    case OP_MINQUERY:
+    case OP_POSQUERY:
+    case OP_UPTO:
+    case OP_MINUPTO:
+    case OP_POSUPTO:
+    if (utf8) while ((code[_pcre_OP_lengths[c]] & 0xc0) == 0x80) code++;
+    break;
+#endif
+    }
+  }
+
+return TRUE;
+}
+
+
+
+/*************************************************
+* Scan compiled regex for non-emptiness *
+*************************************************/
+
+/* This function is called to check for left recursive calls. We want to check
+the current branch of the current pattern to see if it could match the empty
+string. If it could, we must look outwards for branches at other levels,
+stopping when we pass beyond the bracket which is the subject of the recursion.
+
+Arguments:
+ code points to start of the recursion
+ endcode points to where to stop (current RECURSE item)
+ bcptr points to the chain of current (unclosed) branch starts
+ utf8 TRUE if in UTF-8 mode
+
+Returns: TRUE if what is matched could be empty
+*/
+
+static BOOL
+could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
+  BOOL utf8)
+{
+/* Work outwards along the chain while the branch starts are not before "code";
+a single branch that cannot be empty settles the answer as FALSE. */
+for (; bcptr != NULL && bcptr->current >= code; bcptr = bcptr->outer)
+  if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
+
+return TRUE;
+}
+
+
+
+/*************************************************
+* Check for POSIX class syntax *
+*************************************************/
+
+/* This function is called when the sequence "[:" or "[." or "[=" is
+encountered in a character class. It checks whether this is followed by a
+sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
+reach an unescaped ']' without the special preceding character, return FALSE.
+
+Originally, this function only recognized a sequence of letters between the
+terminators, but it seems that Perl recognizes any sequence of characters,
+though of course unknown POSIX names are subsequently rejected. Perl gives an
+"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
+didn't consider this to be a POSIX class. Likewise for [:1234:].
+
+The problem in trying to be exactly like Perl is in the handling of escapes. We
+have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
+class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
+below handles the special case of \], but does not try to do any other escape
+processing. This makes it different from Perl for cases such as [:l\ower:]
+where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
+"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
+I think.
+
+Arguments:
+ ptr pointer to the initial [
+ endptr where to return the end pointer
+
+Returns: TRUE or FALSE
+*/
+
+static BOOL
+check_posix_syntax(const uschar *ptr, const uschar **endptr)
+{
+int term;         /* Declared and assigned separately on purpose: the */
+term = ptr[1];    /* Solaris cc warns about a "non-constant" initializer. */
+for (ptr += 2; *ptr != 0; ptr++)
+  {
+  if (*ptr == '\\' && ptr[1] == ']')   /* Escaped ]: step over both bytes */
+    {
+    ptr++;
+    continue;
+    }
+  if (*ptr == ']') break;              /* Unescaped ] with no terminator */
+  if (*ptr != term || ptr[1] != ']') continue;
+  *endptr = ptr;                       /* Hand back the terminator's address */
+  return TRUE;
+  }
+return FALSE;
+}
+
+
+
+
+/*************************************************
+* Check POSIX class name *
+*************************************************/
+
+/* This function is called to check the name given in a POSIX-style class entry
+such as [:alnum:].
+
+Arguments:
+ ptr points to the first letter
+ len the length of the name
+
+Returns: a value representing the name, or -1 if unknown
+*/
+
+static int
+check_posix_name(const uschar *ptr, int len)
+{
+const char *name = posix_names;
+register int idx;
+/* Each name takes its length plus one byte; a zero length ends the list. */
+for (idx = 0; posix_name_lengths[idx] != 0; idx++)
+  {
+  int n = posix_name_lengths[idx];
+  if (len == n && strncmp((const char *)ptr, name, len) == 0) return idx;
+  name += n + 1;
+  }
+return -1;
+}
+
+
+/*************************************************
+* Adjust OP_RECURSE items in repeated group *
+*************************************************/
+
+/* OP_RECURSE items contain an offset from the start of the regex to the group
+that is referenced. This means that groups can be replicated for fixed
+repetition simply by copying (because the recursion is allowed to refer to
+earlier groups that are outside the current group). However, when a group is
+optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
+inserted before it, after it has been compiled. This means that any OP_RECURSE
+items within it that refer to the group itself or any contained groups have to
+have their offsets adjusted. That is one of the jobs of this function. Before it
+is called, the partially compiled regex must be temporarily terminated with
+OP_END.
+
+This function has been extended with the possibility of forward references for
+recursions and subroutine calls. It must also check the list of such references
+for the group we are dealing with. If it finds that one of the recursions in
+the current group is on this list, it adjusts the offset in the list, not the
+value in the reference (which is a group number).
+
+Arguments:
+ group points to the start of the group
+ adjust the amount by which the group is to be moved
+ utf8 TRUE in UTF-8 mode
+ cd contains pointers to tables etc.
+ save_hwm the hwm forward reference pointer at the start of the group
+
+Returns: nothing
+*/
+
+static void
+adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
+  uschar *save_hwm)
+{
+uschar *rec;
+
+/* Visit every OP_RECURSE item from the start of the group onwards. */
+
+for (rec = (uschar *)find_recurse(group, utf8);
+     rec != NULL;
+     rec = (uschar *)find_recurse(rec + 1 + LINK_SIZE, utf8))
+  {
+  uschar *fwd = save_hwm;
+  int offset;
+
+  /* Search the forward reference list, from the point it had reached at the
+  start of the group, for an entry whose offset addresses the data of this
+  item. If there is one, it is the list entry that gets adjusted. */
+
+  while (fwd < cd->hwm && cd->start_code + GET(fwd, 0) != rec + 1)
+    fwd += LINK_SIZE;
+
+  if (fwd < cd->hwm)
+    {
+    offset = GET(fwd, 0);
+    PUT(fwd, 0, offset + adjust);
+    continue;
+    }
+
+  /* Not on the list: the item itself holds an offset from the start of the
+  regex, which is adjusted only if it is not before the start of this group. */
+
+  offset = GET(rec, 1);
+  if (cd->start_code + offset >= group) PUT(rec, 1, offset + adjust);
+  }
+}
+
+
+
+/*************************************************
+* Insert an automatic callout point *
+*************************************************/
+
+/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
+callout points before each pattern item.
+
+Arguments:
+ code current code pointer
+ ptr current pattern pointer
+ cd pointers to tables etc
+
+Returns: new code pointer
+*/
+
+static uschar *
+auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
+{
+code[0] = OP_CALLOUT;
+code[1] = 255;
+PUT(code, 2, ptr - cd->start_pattern);   /* Offset of the item in the pattern */
+PUT(code, 2 + LINK_SIZE, 0);             /* Length of the item; zero for now */
+return code + 2 + 2*LINK_SIZE;
+}
+
+
+
+/*************************************************
+* Complete a callout item *
+*************************************************/
+
+/* A callout item contains the length of the next item in the pattern, which
+we can't fill in till after we have reached the relevant point. This is used
+for both automatic and manual callouts.
+
+Arguments:
+ previous_callout points to previous callout item
+ ptr current pattern pointer
+ cd pointers to tables etc
+
+Returns: nothing
+*/
+
+static void
+complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
+{
+int here = ptr - cd->start_pattern;   /* Offset now reached in the pattern */
+PUT(previous_callout, 2 + LINK_SIZE, here - GET(previous_callout, 2));
+}
+
+
+
+#ifdef SUPPORT_UCP
+/*************************************************
+* Get othercase range *
+*************************************************/
+
+/* This function is passed the start and end of a class range, in UTF-8 mode
+with UCP support. It searches up the characters, looking for internal ranges of
+characters in the "other" case. Each call returns the next one, updating the
+start address.
+
+Arguments:
+ cptr points to starting character value; updated
+ d end value
+ ocptr where to put start of othercase range
+ odptr where to put end of othercase range
+
+Yield: TRUE when range returned; FALSE when no more
+*/
+
+static BOOL
+get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
+  unsigned int *odptr)
+{
+unsigned int ch = *cptr;
+unsigned int first, expect;
+
+/* Find the first character, up to and including d, that has an other case. */
+for (;;)
+  {
+  if (ch > d) return FALSE;
+  first = _pcre_ucp_othercase(ch);
+  if (first != NOTACHAR) break;
+  ch++;
+  }
+
+/* Carry on for as long as the other cases run consecutively. */
+expect = first + 1;
+while (++ch <= d && _pcre_ucp_othercase(ch) == expect) expect++;
+
+*ocptr = first;
+*odptr = expect - 1;
+*cptr = ch;
+return TRUE;
+}
+#endif /* SUPPORT_UCP */
+
+
+
+/*************************************************
+* Check if auto-possessifying is possible *
+*************************************************/
+
+/* This function is called for unlimited repeats of certain items, to see
+whether the next thing could possibly match the repeated item. If not, it makes
+sense to automatically possessify the repeated item.
+
+Arguments:
+ op_code the repeated op code
+ item data for this item, depends on the opcode
+ utf8 TRUE in UTF-8 mode
+ utf8_char used for utf8 character bytes, NULL if not relevant
+ ptr next character in pattern
+ options options bits
+ cd contains pointers to tables etc.
+
+Returns: TRUE if possessifying is wanted
+*/
+
+static BOOL
+check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
+ const uschar *ptr, int options, compile_data *cd)
+{
+int next; /* The following pattern item: >= 0 character, < 0 minus an ESC_ value */
+
+/* Skip whitespace and comments in extended mode */
+
+if ((options & PCRE_EXTENDED) != 0)
+ {
+ for (;;)
+ {
+ while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
+ if (*ptr == '#')
+ {
+ while (*(++ptr) != 0)
+ if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
+ }
+ else break;
+ }
+ }
+
+/* If the next item is one that we can handle, get its value. A non-negative
+value is a character, a negative value is an escape value. */
+
+if (*ptr == '\\')
+ {
+ int temperrorcode = 0;
+ next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
+ if (temperrorcode != 0) return FALSE; /* Escape is in error; give up */
+ ptr++; /* Point after the escape sequence */
+ }
+
+else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
+ {
+#ifdef SUPPORT_UTF8
+ if (utf8) { GETCHARINC(next, ptr); } else
+#endif
+ next = *ptr++;
+ }
+
+else return FALSE; /* A metacharacter follows; not analysed here */
+
+/* Skip whitespace and comments in extended mode */
+
+if ((options & PCRE_EXTENDED) != 0)
+ {
+ for (;;)
+ {
+ while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
+ if (*ptr == '#')
+ {
+ while (*(++ptr) != 0)
+ if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
+ }
+ else break;
+ }
+ }
+
+/* If the next thing is itself optional, we have to give up. */
+
+if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
+ return FALSE;
+
+/* Now compare the next item with the previous opcode. If the previous is a
+positive single character match, "item" either contains the character or, if
+"item" is greater than 127 in utf8 mode, the character's bytes are in
+utf8_char. */
+
+
+/* Handle cases when the next item is a character. */
+
+if (next >= 0) switch(op_code) /* Every case in this switch returns */
+ {
+ case OP_CHAR:
+#ifdef SUPPORT_UTF8
+ if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
+#endif
+ return item != next;
+
+ /* For CHARNC (caseless character) we must check the other case. If we have
+ Unicode property support, we can use it to test the other case of
+ high-valued characters. */
+
+ case OP_CHARNC:
+#ifdef SUPPORT_UTF8
+ if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
+#endif
+ if (item == next) return FALSE;
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ unsigned int othercase;
+ if (next < 128) othercase = cd->fcc[next]; else
+#ifdef SUPPORT_UCP
+ othercase = _pcre_ucp_othercase((unsigned int)next);
+#else
+ othercase = NOTACHAR;
+#endif
+ return (unsigned int)item != othercase;
+ }
+ else
+#endif /* SUPPORT_UTF8 */
+ return (item != cd->fcc[next]); /* Non-UTF-8 mode */
+
+ /* For OP_NOT, "item" must be a single-byte character. */
+
+ case OP_NOT:
+ if (item == next) return TRUE;
+ if ((options & PCRE_CASELESS) == 0) return FALSE;
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ unsigned int othercase;
+ if (next < 128) othercase = cd->fcc[next]; else
+#ifdef SUPPORT_UCP
+ othercase = _pcre_ucp_othercase(next);
+#else
+ othercase = NOTACHAR;
+#endif
+ return (unsigned int)item == othercase;
+ }
+ else
+#endif /* SUPPORT_UTF8 */
+ return (item == cd->fcc[next]); /* Non-UTF-8 mode */
+
+ case OP_DIGIT: /* In the six cases below, the ctypes table is consulted only for next <= 127 */
+ return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
+
+ case OP_NOT_DIGIT:
+ return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
+
+ case OP_WHITESPACE:
+ return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
+
+ case OP_NOT_WHITESPACE:
+ return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
+
+ case OP_WORDCHAR:
+ return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
+
+ case OP_NOT_WORDCHAR:
+ return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
+
+ case OP_HSPACE:
+ case OP_NOT_HSPACE:
+ switch(next)
+ {
+ case 0x09:
+ case 0x20:
+ case 0xa0:
+ case 0x1680:
+ case 0x180e:
+ case 0x2000:
+ case 0x2001:
+ case 0x2002:
+ case 0x2003:
+ case 0x2004:
+ case 0x2005:
+ case 0x2006:
+ case 0x2007:
+ case 0x2008:
+ case 0x2009:
+ case 0x200A:
+ case 0x202f:
+ case 0x205f:
+ case 0x3000:
+ return op_code != OP_HSPACE; /* next is one of the listed values */
+ default:
+ return op_code == OP_HSPACE; /* next is not one of the listed values */
+ }
+
+ case OP_VSPACE:
+ case OP_NOT_VSPACE:
+ switch(next)
+ {
+ case 0x0a:
+ case 0x0b:
+ case 0x0c:
+ case 0x0d:
+ case 0x85:
+ case 0x2028:
+ case 0x2029:
+ return op_code != OP_VSPACE; /* next is one of the listed values */
+ default:
+ return op_code == OP_VSPACE; /* next is not one of the listed values */
+ }
+
+ default:
+ return FALSE;
+ }
+
+
+/* Handle the case when the next item is \d, \s, etc. */
+
+switch(op_code) /* Reached only when next < 0 */
+ {
+ case OP_CHAR:
+ case OP_CHARNC:
+#ifdef SUPPORT_UTF8
+ if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
+#endif
+ switch(-next) /* -next is compared with the ESC_ values */
+ {
+ case ESC_d:
+ return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
+
+ case ESC_D:
+ return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
+
+ case ESC_s:
+ return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
+
+ case ESC_S:
+ return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
+
+ case ESC_w:
+ return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
+
+ case ESC_W:
+ return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
+
+ case ESC_h:
+ case ESC_H:
+ switch(item)
+ {
+ case 0x09:
+ case 0x20:
+ case 0xa0:
+ case 0x1680:
+ case 0x180e:
+ case 0x2000:
+ case 0x2001:
+ case 0x2002:
+ case 0x2003:
+ case 0x2004:
+ case 0x2005:
+ case 0x2006:
+ case 0x2007:
+ case 0x2008:
+ case 0x2009:
+ case 0x200A:
+ case 0x202f:
+ case 0x205f:
+ case 0x3000:
+ return -next != ESC_h; /* item is one of the listed values */
+ default:
+ return -next == ESC_h; /* item is not one of the listed values */
+ }
+
+ case ESC_v:
+ case ESC_V:
+ switch(item)
+ {
+ case 0x0a:
+ case 0x0b:
+ case 0x0c:
+ case 0x0d:
+ case 0x85:
+ case 0x2028:
+ case 0x2029:
+ return -next != ESC_v; /* item is one of the listed values */
+ default:
+ return -next == ESC_v; /* item is not one of the listed values */
+ }
+
+ default:
+ return FALSE;
+ }
+
+ case OP_DIGIT: /* From here on, each case lists the escapes for which TRUE is returned */
+ return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
+ next == -ESC_h || next == -ESC_v;
+
+ case OP_NOT_DIGIT:
+ return next == -ESC_d;
+
+ case OP_WHITESPACE:
+ return next == -ESC_S || next == -ESC_d || next == -ESC_w;
+
+ case OP_NOT_WHITESPACE:
+ return next == -ESC_s || next == -ESC_h || next == -ESC_v;
+
+ case OP_HSPACE:
+ return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
+
+ case OP_NOT_HSPACE:
+ return next == -ESC_h;
+
+ /* Can't have \S in here because VT matches \S (Perl anomaly) */
+ case OP_VSPACE:
+ return next == -ESC_V || next == -ESC_d || next == -ESC_w;
+
+ case OP_NOT_VSPACE:
+ return next == -ESC_v;
+
+ case OP_WORDCHAR:
+ return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
+
+ case OP_NOT_WORDCHAR:
+ return next == -ESC_w || next == -ESC_d;
+
+ default:
+ return FALSE;
+ }
+
+/* Control does not reach here */
+}
+
+
+
+/*************************************************
+* Compile one branch *
+*************************************************/
+
+/* Scan the pattern, compiling it into a vector. If the options are
+changed during the branch, the pointer is used to change the external options
+bits. This function is used during the pre-compile phase when we are trying
+to find out the amount of memory needed, as well as during the real compile
+phase. The value of lengthptr distinguishes the two phases.
+
+Arguments:
+ optionsptr pointer to the option bits
+ codeptr points to the pointer to the current code point
+ ptrptr points to the current pattern pointer
+ errorcodeptr points to error code variable
+ firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
+ reqbyteptr set to the last literal character required, else < 0
+ bcptr points to current branch chain
+ cd contains pointers to tables etc.
+ lengthptr NULL during the real compile phase
+ points to length accumulator during pre-compile phase
+
+Returns: TRUE on success
+ FALSE, with *errorcodeptr set non-zero on error
+*/
+
+static BOOL
+compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
+ int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
+ compile_data *cd, int *lengthptr)
+{
+int repeat_type, op_type;
+int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
+int bravalue = 0;
+int greedy_default, greedy_non_default;
+int firstbyte, reqbyte;
+int zeroreqbyte, zerofirstbyte;
+int req_caseopt, reqvary, tempreqvary;
+int options = *optionsptr;
+int after_manual_callout = 0;
+int length_prevgroup = 0;
+register int c;
+register uschar *code = *codeptr;
+uschar *last_code = code;
+uschar *orig_code = code;
+uschar *tempcode;
+BOOL inescq = FALSE;
+BOOL groupsetfirstbyte = FALSE;
+const uschar *ptr = *ptrptr;
+const uschar *tempptr;
+uschar *previous = NULL;
+uschar *previous_callout = NULL;
+uschar *save_hwm = NULL;
+uschar classbits[32];
+
+#ifdef SUPPORT_UTF8
+BOOL class_utf8;
+BOOL utf8 = (options & PCRE_UTF8) != 0;
+uschar *class_utf8data;
+uschar *class_utf8data_base;
+uschar utf8_char[6];
+#else
+BOOL utf8 = FALSE;
+uschar *utf8_char = NULL;
+#endif
+
+#ifdef DEBUG
+if (lengthptr != NULL) DPRINTF((">> start branch\n"));
+#endif
+
+/* Set up the default and non-default settings for greediness */
+
+greedy_default = ((options & PCRE_UNGREEDY) != 0);
+greedy_non_default = greedy_default ^ 1;
+
+/* Initialize no first byte, no required byte. REQ_UNSET means "no char
+matching encountered yet". It gets changed to REQ_NONE if we hit something that
+matches a non-fixed char first char; reqbyte just remains unset if we never
+find one.
+
+When we hit a repeat whose minimum is zero, we may have to adjust these values
+to take the zero repeat into account. This is implemented by setting them to
+zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
+item types that can be repeated set these backoff variables appropriately. */
+
+firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
+
+/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
+according to the current setting of the caseless flag. REQ_CASELESS is a bit
+value > 255. It is added into the firstbyte or reqbyte variables to record the
+case status of the value. This is used only for ASCII characters. */
+
+req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
+
+/* Switch on next character until the end of the branch */
+
+for (;; ptr++)
+ {
+ BOOL negate_class;
+ BOOL should_flip_negation;
+ BOOL possessive_quantifier;
+ BOOL is_quantifier;
+ BOOL is_recurse;
+ BOOL reset_bracount;
+ int class_charcount;
+ int class_lastchar;
+ int newoptions;
+ int recno;
+ int refsign;
+ int skipbytes;
+ int subreqbyte;
+ int subfirstbyte;
+ int terminator;
+ int mclength;
+ uschar mcbuffer[8];
+
+ /* Get next byte in the pattern */
+
+ c = *ptr;
+
+ /* If we are in the pre-compile phase, accumulate the length used for the
+ previous cycle of this loop. */
+
+ if (lengthptr != NULL)
+ {
+#ifdef DEBUG
+ if (code > cd->hwm) cd->hwm = code; /* High water info */
+#endif
+ if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
+ {
+ *errorcodeptr = ERR52;
+ goto FAILED;
+ }
+
+ /* There is at least one situation where code goes backwards: this is the
+ case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
+ the class is simply eliminated. However, it is created first, so we have to
+ allow memory for it. Therefore, don't ever reduce the length at this point.
+ */
+
+ if (code < last_code) code = last_code;
+
+ /* Paranoid check for integer overflow */
+
+ if (OFLOW_MAX - *lengthptr < code - last_code)
+ {
+ *errorcodeptr = ERR20;
+ goto FAILED;
+ }
+
+ *lengthptr += code - last_code;
+ DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
+
+ /* If "previous" is set and it is not at the start of the work space, move
+ it back to there, in order to avoid filling up the work space. Otherwise,
+ if "previous" is NULL, reset the current code pointer to the start. */
+
+ if (previous != NULL)
+ {
+ if (previous > orig_code)
+ {
+ memmove(orig_code, previous, code - previous);
+ code -= previous - orig_code;
+ previous = orig_code;
+ }
+ }
+ else code = orig_code;
+
+ /* Remember where this code item starts so we can pick up the length
+ next time round. */
+
+ last_code = code;
+ }
+
+ /* In the real compile phase, just check the workspace used by the forward
+ reference list. */
+
+ else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
+ {
+ *errorcodeptr = ERR52;
+ goto FAILED;
+ }
+
+ /* If in \Q...\E, check for the end; if not, we have a literal */
+
+ if (inescq && c != 0)
+ {
+ if (c == '\\' && ptr[1] == 'E')
+ {
+ inescq = FALSE;
+ ptr++;
+ continue;
+ }
+ else
+ {
+ if (previous_callout != NULL)
+ {
+ if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
+ complete_callout(previous_callout, ptr, cd);
+ previous_callout = NULL;
+ }
+ if ((options & PCRE_AUTO_CALLOUT) != 0)
+ {
+ previous_callout = code;
+ code = auto_callout(code, ptr, cd);
+ }
+ goto NORMAL_CHAR;
+ }
+ }
+
+ /* Fill in length of a previous callout, except when the next thing is
+ a quantifier. */
+
+ is_quantifier = c == '*' || c == '+' || c == '?' ||
+ (c == '{' && is_counted_repeat(ptr+1));
+
+ if (!is_quantifier && previous_callout != NULL &&
+ after_manual_callout-- <= 0)
+ {
+ if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
+ complete_callout(previous_callout, ptr, cd);
+ previous_callout = NULL;
+ }
+
+ /* In extended mode, skip white space and comments */
+
+ if ((options & PCRE_EXTENDED) != 0)
+ {
+ if ((cd->ctypes[c] & ctype_space) != 0) continue;
+ if (c == '#')
+ {
+ while (*(++ptr) != 0)
+ {
+ if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
+ }
+ if (*ptr != 0) continue;
+
+ /* Else fall through to handle end of string */
+ c = 0;
+ }
+ }
+
+ /* No auto callout for quantifiers. */
+
+ if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
+ {
+ previous_callout = code;
+ code = auto_callout(code, ptr, cd);
+ }
+
+ switch(c)
+ {
+ /* ===================================================================*/
+ case 0: /* The branch terminates at string end */
+ case '|': /* or | or ) */
+ case ')':
+ *firstbyteptr = firstbyte;
+ *reqbyteptr = reqbyte;
+ *codeptr = code;
+ *ptrptr = ptr;
+ if (lengthptr != NULL)
+ {
+ if (OFLOW_MAX - *lengthptr < code - last_code)
+ {
+ *errorcodeptr = ERR20;
+ goto FAILED;
+ }
+ *lengthptr += code - last_code; /* To include callout length */
+ DPRINTF((">> end branch\n"));
+ }
+ return TRUE;
+
+
+ /* ===================================================================*/
+ /* Handle single-character metacharacters. In multiline mode, ^ disables
+ the setting of any following char as a first character. */
+
+ case '^':
+ if ((options & PCRE_MULTILINE) != 0)
+ {
+ if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
+ }
+ previous = NULL;
+ *code++ = OP_CIRC;
+ break;
+
+ case '$':
+ previous = NULL;
+ *code++ = OP_DOLL;
+ break;
+
+ /* There can never be a first char if '.' is first, whatever happens about
+ repeats. The value of reqbyte doesn't change either. */
+
+ case '.':
+ if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
+ zerofirstbyte = firstbyte;
+ zeroreqbyte = reqbyte;
+ previous = code;
+ *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
+ break;
+
+
+ /* ===================================================================*/
+ /* Character classes. If the included characters are all < 256, we build a
+ 32-byte bitmap of the permitted characters, except in the special case
+ where there is only one such character. For negated classes, we build the
+ map as usual, then invert it at the end. However, we use a different opcode
+ so that data characters > 255 can be handled correctly.
+
+ If the class contains characters outside the 0-255 range, a different
+ opcode is compiled. It may optionally have a bit map for characters < 256,
+ but those above are explicitly listed afterwards. A flag byte tells
+ whether the bitmap is present, and whether this is a negated class or not.
+
+ In JavaScript compatibility mode, an isolated ']' causes an error. In
+ default (Perl) mode, it is treated as a data character. */
+
+ case ']':
+ if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
+ {
+ *errorcodeptr = ERR64;
+ goto FAILED;
+ }
+ goto NORMAL_CHAR;
+
+ case '[':
+ previous = code;
+
+ /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
+ they are encountered at the top level, so we'll do that too. */
+
+ if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
+ check_posix_syntax(ptr, &tempptr))
+ {
+ *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
+ goto FAILED;
+ }
+
+ /* If the first character is '^', set the negation flag and skip it. Also,
+ if the first few characters (either before or after ^) are \Q\E or \E we
+ skip them too. This makes for compatibility with Perl. */
+
+ negate_class = FALSE;
+ for (;;)
+ {
+ c = *(++ptr);
+ if (c == '\\')
+ {
+ if (ptr[1] == 'E') ptr++;
+ else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
+ else break;
+ }
+ else if (!negate_class && c == '^')
+ negate_class = TRUE;
+ else break;
+ }
+
+ /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
+ an initial ']' is taken as a data character -- the code below handles
+ that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
+ [^] must match any character, so generate OP_ALLANY. */
+
+ if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
+ {
+ *code++ = negate_class? OP_ALLANY : OP_FAIL;
+ if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
+ zerofirstbyte = firstbyte;
+ break;
+ }
+
+ /* If a class contains a negative special such as \S, we need to flip the
+ negation flag at the end, so that support for characters > 255 works
+ correctly (they are all included in the class). */
+
+ should_flip_negation = FALSE;
+
+ /* Keep a count of chars with values < 256 so that we can optimize the case
+ of just a single character (as long as it's < 256). However, for higher
+ valued UTF-8 characters, we don't yet do any optimization. */
+
+ class_charcount = 0;
+ class_lastchar = -1;
+
+ /* Initialize the 32-char bit map to all zeros. We build the map in a
+ temporary bit of memory, in case the class contains only 1 character (less
+ than 256), because in that case the compiled code doesn't use the bit map.
+ */
+
+ memset(classbits, 0, 32 * sizeof(uschar));
+
+#ifdef SUPPORT_UTF8
+ class_utf8 = FALSE; /* No chars >= 256 */
+ class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
+ class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
+#endif
+
+ /* Process characters until ] is reached. By writing this as a "do" it
+ means that an initial ] is taken as a data character. At the start of the
+ loop, c contains the first byte of the character. */
+
+ if (c != 0) do
+ {
+ const uschar *oldptr;
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && c > 127)
+ { /* Braces are required because the */
+ GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
+ }
+
+ /* In the pre-compile phase, accumulate the length of any UTF-8 extra
+ data and reset the pointer. This is so that very large classes that
+ contain a zillion UTF-8 characters no longer overwrite the work space
+ (which is on the stack). */
+
+ if (lengthptr != NULL)
+ {
+ *lengthptr += class_utf8data - class_utf8data_base;
+ class_utf8data = class_utf8data_base;
+ }
+
+#endif
+
+ /* Inside \Q...\E everything is literal except \E */
+
+ if (inescq)
+ {
+ if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
+ {
+ inescq = FALSE; /* Reset literal state */
+ ptr++; /* Skip the 'E' */
+ continue; /* Carry on with next */
+ }
+ goto CHECK_RANGE; /* Could be range if \E follows */
+ }
+
+ /* Handle POSIX class names. Perl allows a negation extension of the
+ form [:^name:]. A square bracket that doesn't match the syntax is
+ treated as a literal. We also recognize the POSIX constructions
+ [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
+ 5.6 and 5.8 do. */
+
+ if (c == '[' &&
+ (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
+ check_posix_syntax(ptr, &tempptr))
+ {
+ BOOL local_negate = FALSE;
+ int posix_class, taboffset, tabopt;
+ register const uschar *cbits = cd->cbits;
+ uschar pbits[32];
+
+ if (ptr[1] != ':')
+ {
+ *errorcodeptr = ERR31;
+ goto FAILED;
+ }
+
+ ptr += 2;
+ if (*ptr == '^')
+ {
+ local_negate = TRUE;
+ should_flip_negation = TRUE; /* Note negative special */
+ ptr++;
+ }
+
+ posix_class = check_posix_name(ptr, tempptr - ptr);
+ if (posix_class < 0)
+ {
+ *errorcodeptr = ERR30;
+ goto FAILED;
+ }
+
+ /* If matching is caseless, upper and lower are converted to
+ alpha. This relies on the fact that the class table starts with
+ alpha, lower, upper as the first 3 entries. */
+
+ if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
+ posix_class = 0;
+
+ /* We build the bit map for the POSIX class in a chunk of local store
+ because we may be adding and subtracting from it, and we don't want to
+ subtract bits that may be in the main map already. At the end we or the
+ result into the bit map that is being built. */
+
+ posix_class *= 3;
+
+ /* Copy in the first table (always present) */
+
+ memcpy(pbits, cbits + posix_class_maps[posix_class],
+ 32 * sizeof(uschar));
+
+ /* If there is a second table, add or remove it as required. */
+
+ taboffset = posix_class_maps[posix_class + 1];
+ tabopt = posix_class_maps[posix_class + 2];
+
+ if (taboffset >= 0)
+ {
+ if (tabopt >= 0)
+ for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
+ else
+ for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
+ }
+
+ /* Now see if we need to remove any special characters. An option
+ value of 1 removes vertical space and 2 removes underscore. */
+
+ if (tabopt < 0) tabopt = -tabopt;
+ if (tabopt == 1) pbits[1] &= ~0x3c;
+ else if (tabopt == 2) pbits[11] &= 0x7f;
+
+ /* Add the POSIX table or its complement into the main table that is
+ being built and we are done. */
+
+ if (local_negate)
+ for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
+ else
+ for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
+
+ ptr = tempptr + 1;
+ class_charcount = 10; /* Set > 1; assumes more than 1 per class */
+ continue; /* End of POSIX syntax handling */
+ }
+
+ /* Backslash may introduce a single character, or it may introduce one
+ of the specials, which just set a flag. The sequence \b is a special
+ case. Inside a class (and only there) it is treated as backspace.
+ Elsewhere it marks a word boundary. Other escapes have preset maps ready
+ to 'or' into the one we are building. We assume they have more than one
+ character in them, so set class_charcount bigger than one. */
+
+ if (c == '\\')
+ {
+ c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
+ if (*errorcodeptr != 0) goto FAILED;
+
+ if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
+ else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
+ else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
+ else if (-c == ESC_Q) /* Handle start of quoted string */
+ {
+ if (ptr[1] == '\\' && ptr[2] == 'E')
+ {
+ ptr += 2; /* avoid empty string */
+ }
+ else inescq = TRUE;
+ continue;
+ }
+ else if (-c == ESC_E) continue; /* Ignore orphan \E */
+
+ if (c < 0)
+ {
+ register const uschar *cbits = cd->cbits;
+ class_charcount += 2; /* Greater than 1 is what matters */
+
+ /* Save time by not doing this in the pre-compile phase. */
+
+ if (lengthptr == NULL) switch (-c)
+ {
+ case ESC_d:
+ for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
+ continue;
+
+ case ESC_D:
+ should_flip_negation = TRUE;
+ for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
+ continue;
+
+ case ESC_w:
+ for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
+ continue;
+
+ case ESC_W:
+ should_flip_negation = TRUE;
+ for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
+ continue;
+
+ case ESC_s:
+ for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
+ classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
+ continue;
+
+ case ESC_S:
+ should_flip_negation = TRUE;
+ for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
+ classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
+ continue;
+
+ default: /* Not recognized; fall through */
+ break; /* Need "default" setting to stop compiler warning. */
+ }
+
+ /* In the pre-compile phase, just do the recognition. */
+
+ else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
+ c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
+
+ /* We need to deal with \H, \h, \V, and \v in both phases because
+ they use extra memory. */
+
+ if (-c == ESC_h)
+ {
+ SETBIT(classbits, 0x09); /* HT */
+ SETBIT(classbits, 0x20); /* SPACE */
+ SETBIT(classbits, 0xa0); /* NBSP */
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ class_utf8 = TRUE;
+ *class_utf8data++ = XCL_SINGLE;
+ class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
+ *class_utf8data++ = XCL_SINGLE;
+ class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
+ *class_utf8data++ = XCL_RANGE;
+ class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
+ class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
+ *class_utf8data++ = XCL_SINGLE;
+ class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
+ *class_utf8data++ = XCL_SINGLE;
+ class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
+ *class_utf8data++ = XCL_SINGLE;
+ class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
+ }
+#endif
+ continue;
+ }
+
+ if (-c == ESC_H)
+ {
+ for (c = 0; c < 32; c++)
+ {
+ int x = 0xff;
+ switch (c)
+ {
+ case 0x09/8: x ^= 1 << (0x09%8); break;
+ case 0x20/8: x ^= 1 << (0x20%8); break;
+ case 0xa0/8: x ^= 1 << (0xa0%8); break;
+ default: break;
+ }
+ classbits[c] |= x;
+ }
+
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ class_utf8 = TRUE;
+ *class_utf8data++ = XCL_RANGE;
+ class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
+ class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
+ *class_utf8data++ = XCL_RANGE;
+ class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
+ class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
+ *class_utf8data++ = XCL_RANGE;
+ class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
+ class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
+ *class_utf8data++ = XCL_RANGE;
+ class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
+ class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
+ *class_utf8data++ = XCL_RANGE;
+ class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
+ class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
+ *class_utf8data++ = XCL_RANGE;
+ class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
+ class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
+ *class_utf8data++ = XCL_RANGE;
+ class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
+ class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
+ }
+#endif
+ continue;
+ }
+
+ if (-c == ESC_v)
+ {
+ SETBIT(classbits, 0x0a); /* LF */
+ SETBIT(classbits, 0x0b); /* VT */
+ SETBIT(classbits, 0x0c); /* FF */
+ SETBIT(classbits, 0x0d); /* CR */
+ SETBIT(classbits, 0x85); /* NEL */
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ class_utf8 = TRUE;
+ *class_utf8data++ = XCL_RANGE;
+ class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
+ class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
+ }
+#endif
+ continue;
+ }
+
+ if (-c == ESC_V)
+ {
+ for (c = 0; c < 32; c++)
+ {
+ int x = 0xff;
+ switch (c)
+ {
+ case 0x0a/8: x ^= 1 << (0x0a%8);
+ x ^= 1 << (0x0b%8);
+ x ^= 1 << (0x0c%8);
+ x ^= 1 << (0x0d%8);
+ break;
+ case 0x85/8: x ^= 1 << (0x85%8); break;
+ default: break;
+ }
+ classbits[c] |= x;
+ }
+
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ class_utf8 = TRUE;
+ *class_utf8data++ = XCL_RANGE;
+ class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
+ class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
+ *class_utf8data++ = XCL_RANGE;
+ class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
+ class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
+ }
+#endif
+ continue;
+ }
+
+ /* We need to deal with \P and \p in both phases. */
+
+#ifdef SUPPORT_UCP
+ if (-c == ESC_p || -c == ESC_P)
+ {
+ BOOL negated;
+ int pdata;
+ int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
+ if (ptype < 0) goto FAILED;
+ class_utf8 = TRUE;
+ *class_utf8data++ = ((-c == ESC_p) != negated)?
+ XCL_PROP : XCL_NOTPROP;
+ *class_utf8data++ = ptype;
+ *class_utf8data++ = pdata;
+ class_charcount -= 2; /* Not a < 256 character */
+ continue;
+ }
+#endif
+ /* Unrecognized escapes are faulted if PCRE is running in its
+ strict mode. By default, for compatibility with Perl, they are
+ treated as literals. */
+
+ if ((options & PCRE_EXTRA) != 0)
+ {
+ *errorcodeptr = ERR7;
+ goto FAILED;
+ }
+
+ class_charcount -= 2; /* Undo the default count from above */
+ c = *ptr; /* Get the final character and fall through */
+ }
+
+ /* Fall through if we have a single character (c >= 0). This may be
+ greater than 256 in UTF-8 mode. */
+
+ } /* End of backslash handling */
+
+ /* A single character may be followed by '-' to form a range. However,
+ Perl does not permit ']' to be the end of the range. A '-' character
+ at the end is treated as a literal. Perl ignores orphaned \E sequences
+ entirely. The code for handling \Q and \E is messy. */
+
+ CHECK_RANGE:
+ while (ptr[1] == '\\' && ptr[2] == 'E')
+ {
+ inescq = FALSE;
+ ptr += 2;
+ }
+
+ oldptr = ptr;
+
+ /* Remember \r or \n */
+
+ if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
+
+ /* Check for range */
+
+ if (!inescq && ptr[1] == '-')
+ {
+ int d;
+ ptr += 2;
+ while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
+
+ /* If we hit \Q (not followed by \E) at this point, go into escaped
+ mode. */
+
+ while (*ptr == '\\' && ptr[1] == 'Q')
+ {
+ ptr += 2;
+ if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
+ inescq = TRUE;
+ break;
+ }
+
+ if (*ptr == 0 || (!inescq && *ptr == ']'))
+ {
+ ptr = oldptr;
+ goto LONE_SINGLE_CHARACTER;
+ }
+
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ { /* Braces are required because the */
+ GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
+ }
+ else
+#endif
+ d = *ptr; /* Not UTF-8 mode */
+
+ /* The second part of a range can be a single-character escape, but
+ not any of the other escapes. Perl 5.6 treats a hyphen as a literal
+ in such circumstances. */
+
+ if (!inescq && d == '\\')
+ {
+ d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
+ if (*errorcodeptr != 0) goto FAILED;
+
+ /* \b is backspace; \X is literal X; \R is literal R; any other
+ special means the '-' was literal */
+
+ if (d < 0)
+ {
+ if (d == -ESC_b) d = '\b';
+ else if (d == -ESC_X) d = 'X';
+ else if (d == -ESC_R) d = 'R'; else
+ {
+ ptr = oldptr;
+ goto LONE_SINGLE_CHARACTER; /* A few lines below */
+ }
+ }
+ }
+
+ /* Check that the two values are in the correct order. Optimize
+ one-character ranges */
+
+ if (d < c)
+ {
+ *errorcodeptr = ERR8;
+ goto FAILED;
+ }
+
+ if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
+
+ /* Remember \r or \n */
+
+ if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
+
+ /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
+ matching, we have to use an XCLASS with extra data items. Caseless
+ matching for characters > 127 is available only if UCP support is
+ available. */
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
+ {
+ class_utf8 = TRUE;
+
+ /* With UCP support, we can find the other case equivalents of
+ the relevant characters. There may be several ranges. Optimize how
+ they fit with the basic range. */
+
+#ifdef SUPPORT_UCP
+ if ((options & PCRE_CASELESS) != 0)
+ {
+ unsigned int occ, ocd;
+ unsigned int cc = c;
+ unsigned int origd = d;
+ while (get_othercase_range(&cc, origd, &occ, &ocd))
+ {
+ if (occ >= (unsigned int)c &&
+ ocd <= (unsigned int)d)
+ continue; /* Skip embedded ranges */
+
+ if (occ < (unsigned int)c &&
+ ocd >= (unsigned int)c - 1) /* Extend the basic range */
+ { /* if there is overlap, */
+ c = occ; /* noting that if occ < c */
+ continue; /* we can't have ocd > d */
+ } /* because a subrange is */
+ if (ocd > (unsigned int)d &&
+ occ <= (unsigned int)d + 1) /* always shorter than */
+ { /* the basic range. */
+ d = ocd;
+ continue;
+ }
+
+ if (occ == ocd)
+ {
+ *class_utf8data++ = XCL_SINGLE;
+ }
+ else
+ {
+ *class_utf8data++ = XCL_RANGE;
+ class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
+ }
+ class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
+ }
+ }
+#endif /* SUPPORT_UCP */
+
+ /* Now record the original range, possibly modified for UCP caseless
+ overlapping ranges. */
+
+ *class_utf8data++ = XCL_RANGE;
+ class_utf8data += _pcre_ord2utf8(c, class_utf8data);
+ class_utf8data += _pcre_ord2utf8(d, class_utf8data);
+
+ /* With UCP support, we are done. Without UCP support, there is no
+ caseless matching for UTF-8 characters > 127; we can use the bit map
+ for the smaller ones. */
+
+#ifdef SUPPORT_UCP
+ continue; /* With next character in the class */
+#else
+ if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
+
+ /* Adjust upper limit and fall through to set up the map */
+
+ d = 127;
+
+#endif /* SUPPORT_UCP */
+ }
+#endif /* SUPPORT_UTF8 */
+
+ /* We use the bit map for all cases when not in UTF-8 mode; else
+ ranges that lie entirely within 0-127 when there is UCP support; else
+ for partial ranges without UCP support. */
+
+ class_charcount += d - c + 1;
+ class_lastchar = d;
+
+ /* We can save a bit of time by skipping this in the pre-compile. */
+
+ if (lengthptr == NULL) for (; c <= d; c++)
+ {
+ classbits[c/8] |= (1 << (c&7));
+ if ((options & PCRE_CASELESS) != 0)
+ {
+ int uc = cd->fcc[c]; /* flip case */
+ classbits[uc/8] |= (1 << (uc&7));
+ }
+ }
+
+ continue; /* Go get the next char in the class */
+ }
+
+ /* Handle a lone single character - we can get here for a normal
+ non-escape char, or after \ that introduces a single character or for an
+ apparent range that isn't. */
+
+ LONE_SINGLE_CHARACTER:
+
+ /* Handle a character that cannot go in the bit map */
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
+ {
+ class_utf8 = TRUE;
+ *class_utf8data++ = XCL_SINGLE;
+ class_utf8data += _pcre_ord2utf8(c, class_utf8data);
+
+#ifdef SUPPORT_UCP
+ if ((options & PCRE_CASELESS) != 0)
+ {
+ unsigned int othercase;
+ if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
+ {
+ *class_utf8data++ = XCL_SINGLE;
+ class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
+ }
+ }
+#endif /* SUPPORT_UCP */
+
+ }
+ else
+#endif /* SUPPORT_UTF8 */
+
+ /* Handle a single-byte character */
+ {
+ classbits[c/8] |= (1 << (c&7));
+ if ((options & PCRE_CASELESS) != 0)
+ {
+ c = cd->fcc[c]; /* flip case */
+ classbits[c/8] |= (1 << (c&7));
+ }
+ class_charcount++;
+ class_lastchar = c;
+ }
+ }
+
+ /* Loop until ']' reached. This "while" is the end of the "do" above. */
+
+ while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
+
+ if (c == 0) /* Missing terminating ']' */
+ {
+ *errorcodeptr = ERR6;
+ goto FAILED;
+ }
+
+
+/* This code has been disabled because it would mean that \s counts as
+an explicit \r or \n reference, and that's not really what is wanted. Now
+we set the flag only if there is a literal "\r" or "\n" in the class. */
+
+#if 0
+ /* Remember whether \r or \n are in this class */
+
+ if (negate_class)
+ {
+ if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
+ }
+ else
+ {
+ if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
+ }
+#endif
+
+
+ /* If class_charcount is 1, we saw precisely one character whose value is
+ less than 256. As long as there were no characters >= 128 and there was no
+ use of \p or \P, in other words, no use of any XCLASS features, we can
+ optimize.
+
+ In UTF-8 mode, we can optimize the negative case only if there were no
+ characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
+ operate on single-bytes only. This is an historical hangover. Maybe one day
+ we can tidy these opcodes to handle multi-byte characters.
+
+ The optimization throws away the bit map. We turn the item into a
+ 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
+ that OP_NOT does not support multibyte characters. In the positive case, it
+ can cause firstbyte to be set. Otherwise, there can be no first char if
+ this item is first, whatever repeat count may follow. In the case of
+ reqbyte, save the previous value for reinstating. */
+
+#ifdef SUPPORT_UTF8
+ if (class_charcount == 1 && !class_utf8 &&
+ (!utf8 || !negate_class || class_lastchar < 128))
+#else
+ if (class_charcount == 1)
+#endif
+ {
+ zeroreqbyte = reqbyte;
+
+ /* The OP_NOT opcode works on one-byte characters only. */
+
+ if (negate_class)
+ {
+ if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
+ zerofirstbyte = firstbyte;
+ *code++ = OP_NOT;
+ *code++ = class_lastchar;
+ break;
+ }
+
+ /* For a single, positive character, get the value into mcbuffer, and
+ then we can handle this with the normal one-character code. */
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && class_lastchar > 127)
+ mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
+ else
+#endif
+ {
+ mcbuffer[0] = class_lastchar;
+ mclength = 1;
+ }
+ goto ONE_CHAR;
+ } /* End of 1-char optimization */
+
+ /* The general case - not the one-char optimization. If this is the first
+ thing in the branch, there can be no first char setting, whatever the
+ repeat count. Any reqbyte setting must remain unchanged after any kind of
+ repeat. */
+
+ if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
+ zerofirstbyte = firstbyte;
+ zeroreqbyte = reqbyte;
+
+ /* If there are characters with values > 255, we have to compile an
+ extended class, with its own opcode, unless there was a negated special
+ such as \S in the class, because in that case all characters > 255 are in
+ the class, so any that were explicitly given as well can be ignored. If
+ (when there are explicit characters > 255 that must be listed) there are no
+ characters < 256, we can omit the bitmap in the actual compiled code. */
+
+#ifdef SUPPORT_UTF8
+ if (class_utf8 && !should_flip_negation)
+ {
+ *class_utf8data++ = XCL_END; /* Marks the end of extra data */
+ *code++ = OP_XCLASS;
+ code += LINK_SIZE;
+ *code = negate_class? XCL_NOT : 0;
+
+ /* If the map is required, move up the extra data to make room for it;
+ otherwise just move the code pointer to the end of the extra data. */
+
+ if (class_charcount > 0)
+ {
+ *code++ |= XCL_MAP;
+ memmove(code + 32, code, class_utf8data - code);
+ memcpy(code, classbits, 32);
+ code = class_utf8data + 32;
+ }
+ else code = class_utf8data;
+
+ /* Now fill in the complete length of the item */
+
+ PUT(previous, 1, code - previous);
+ break; /* End of class handling */
+ }
+#endif
+
+ /* If there are no characters > 255, set the opcode to OP_CLASS or
+ OP_NCLASS, depending on whether the whole class was negated and whether
+ there were negative specials such as \S in the class. Then copy the 32-byte
+ map into the code vector, negating it if necessary. */
+
+ *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
+ if (negate_class)
+ {
+ if (lengthptr == NULL) /* Save time in the pre-compile phase */
+ for (c = 0; c < 32; c++) code[c] = ~classbits[c];
+ }
+ else
+ {
+ memcpy(code, classbits, 32);
+ }
+ code += 32;
+ break;
+
+
+ /* ===================================================================*/
+ /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
+ has been tested above. */
+
+ case '{':
+ if (!is_quantifier) goto NORMAL_CHAR;
+ ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
+ if (*errorcodeptr != 0) goto FAILED;
+ goto REPEAT;
+
+ case '*':
+ repeat_min = 0;
+ repeat_max = -1;
+ goto REPEAT;
+
+ case '+':
+ repeat_min = 1;
+ repeat_max = -1;
+ goto REPEAT;
+
+ case '?':
+ repeat_min = 0;
+ repeat_max = 1;
+
+ REPEAT:
+ if (previous == NULL)
+ {
+ *errorcodeptr = ERR9;
+ goto FAILED;
+ }
+
+ if (repeat_min == 0)
+ {
+ firstbyte = zerofirstbyte; /* Adjust for zero repeat */
+ reqbyte = zeroreqbyte; /* Ditto */
+ }
+
+ /* Remember whether this is a variable length repeat */
+
+ reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
+
+ op_type = 0; /* Default single-char op codes */
+ possessive_quantifier = FALSE; /* Default not possessive quantifier */
+
+ /* Save start of previous item, in case we have to move it up to make space
+ for an inserted OP_ONCE for the additional '+' extension. */
+
+ tempcode = previous;
+
+ /* If the next character is '+', we have a possessive quantifier. This
+ implies greediness, whatever the setting of the PCRE_UNGREEDY option.
+ If the next character is '?' this is a minimizing repeat, by default,
+ but if PCRE_UNGREEDY is set, it works the other way round. We change the
+ repeat type to the non-default. */
+
+ if (ptr[1] == '+')
+ {
+ repeat_type = 0; /* Force greedy */
+ possessive_quantifier = TRUE;
+ ptr++;
+ }
+ else if (ptr[1] == '?')
+ {
+ repeat_type = greedy_non_default;
+ ptr++;
+ }
+ else repeat_type = greedy_default;
+
+ /* If previous was a character match, abolish the item and generate a
+ repeat item instead. If a char item has a minimum of more than one, ensure
+ that it is set in reqbyte - it might not be if a sequence such as x{3} is
+ the first thing in a branch because the x will have gone into firstbyte
+ instead. */
+
+ if (*previous == OP_CHAR || *previous == OP_CHARNC)
+ {
+ /* Deal with UTF-8 characters that take up more than one byte. It's
+ easier to write this out separately than try to macrify it. Use c to
+ hold the length of the character in bytes, plus 0x80 to flag that it's a
+ length rather than a small character. */
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && (code[-1] & 0x80) != 0)
+ {
+ uschar *lastchar = code - 1;
+ while((*lastchar & 0xc0) == 0x80) lastchar--;
+ c = code - lastchar; /* Length of UTF-8 character */
+ memcpy(utf8_char, lastchar, c); /* Save the char */
+ c |= 0x80; /* Flag c as a length */
+ }
+ else
+#endif
+
+ /* Handle the case of a single byte - either with no UTF8 support, or
+ with UTF-8 disabled, or for a UTF-8 character < 128. */
+
+ {
+ c = code[-1];
+ if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
+ }
+
+ /* If the repetition is unlimited, it pays to see if the next thing on
+ the line is something that cannot possibly match this character. If so,
+ automatically possessifying this item gains some performance in the case
+ where the match fails. */
+
+ if (!possessive_quantifier &&
+ repeat_max < 0 &&
+ check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
+ options, cd))
+ {
+ repeat_type = 0; /* Force greedy */
+ possessive_quantifier = TRUE;
+ }
+
+ goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
+ }
+
+ /* If previous was a single negated character ([^a] or similar), we use
+ one of the special opcodes, replacing it. The code is shared with single-
+ character repeats by setting op_type to add a suitable offset into
+ repeat_type. We can also test for auto-possessification. OP_NOT is
+ currently used only for single-byte chars. */
+
+ else if (*previous == OP_NOT)
+ {
+ op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
+ c = previous[1];
+ if (!possessive_quantifier &&
+ repeat_max < 0 &&
+ check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
+ {
+ repeat_type = 0; /* Force greedy */
+ possessive_quantifier = TRUE;
+ }
+ goto OUTPUT_SINGLE_REPEAT;
+ }
+
+ /* If previous was a character type match (\d or similar), abolish it and
+ create a suitable repeat item. The code is shared with single-character
+ repeats by setting op_type to add a suitable offset into repeat_type. Note
+ that the Unicode property types will be present only when SUPPORT_UCP is
+ defined, but we don't wrap the little bits of code here because it just
+ makes it horribly messy. */
+
+ else if (*previous < OP_EODN)
+ {
+ uschar *oldcode;
+ int prop_type, prop_value;
+ op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
+ c = *previous;
+
+ if (!possessive_quantifier &&
+ repeat_max < 0 &&
+ check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
+ {
+ repeat_type = 0; /* Force greedy */
+ possessive_quantifier = TRUE;
+ }
+
+ OUTPUT_SINGLE_REPEAT:
+ if (*previous == OP_PROP || *previous == OP_NOTPROP)
+ {
+ prop_type = previous[1];
+ prop_value = previous[2];
+ }
+ else prop_type = prop_value = -1;
+
+ oldcode = code;
+ code = previous; /* Usually overwrite previous item */
+
+ /* If the maximum is zero then the minimum must also be zero; Perl allows
+ this case, so we do too - by simply omitting the item altogether. */
+
+ if (repeat_max == 0) goto END_REPEAT;
+
+ /* All real repeats make it impossible to handle partial matching (maybe
+ one day we will be able to remove this restriction). */
+
+ if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
+
+ /* Combine the op_type with the repeat_type */
+
+ repeat_type += op_type;
+
+ /* A minimum of zero is handled either as the special case * or ?, or as
+ an UPTO, with the maximum given. */
+
+ if (repeat_min == 0)
+ {
+ if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
+ else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
+ else
+ {
+ *code++ = OP_UPTO + repeat_type;
+ PUT2INC(code, 0, repeat_max);
+ }
+ }
+
+ /* A repeat minimum of 1 is optimized into some special cases. If the
+ maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
+ left in place and, if the maximum is greater than 1, we use OP_UPTO with
+ one less than the maximum. */
+
+ else if (repeat_min == 1)
+ {
+ if (repeat_max == -1)
+ *code++ = OP_PLUS + repeat_type;
+ else
+ {
+ code = oldcode; /* leave previous item in place */
+ if (repeat_max == 1) goto END_REPEAT;
+ *code++ = OP_UPTO + repeat_type;
+ PUT2INC(code, 0, repeat_max - 1);
+ }
+ }
+
+ /* The case {n,n} is just an EXACT, while the general case {n,m} is
+ handled as an EXACT followed by an UPTO. */
+
+ else
+ {
+ *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
+ PUT2INC(code, 0, repeat_min);
+
+ /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
+ we have to insert the character for the previous code. For a repeated
+ Unicode property match, there are two extra bytes that define the
+ required property. In UTF-8 mode, long characters have their length in
+ c, with the 0x80 bit as a flag. */
+
+ if (repeat_max < 0)
+ {
+#ifdef SUPPORT_UTF8
+ if (utf8 && c >= 128)
+ {
+ memcpy(code, utf8_char, c & 7);
+ code += c & 7;
+ }
+ else
+#endif
+ {
+ *code++ = c;
+ if (prop_type >= 0)
+ {
+ *code++ = prop_type;
+ *code++ = prop_value;
+ }
+ }
+ *code++ = OP_STAR + repeat_type;
+ }
+
+ /* Else insert an UPTO if the max is greater than the min, again
+ preceded by the character, for the previously inserted code. If the
+ UPTO is just for 1 instance, we can use QUERY instead. */
+
+ else if (repeat_max != repeat_min)
+ {
+#ifdef SUPPORT_UTF8
+ if (utf8 && c >= 128)
+ {
+ memcpy(code, utf8_char, c & 7);
+ code += c & 7;
+ }
+ else
+#endif
+ *code++ = c;
+ if (prop_type >= 0)
+ {
+ *code++ = prop_type;
+ *code++ = prop_value;
+ }
+ repeat_max -= repeat_min;
+
+ if (repeat_max == 1)
+ {
+ *code++ = OP_QUERY + repeat_type;
+ }
+ else
+ {
+ *code++ = OP_UPTO + repeat_type;
+ PUT2INC(code, 0, repeat_max);
+ }
+ }
+ }
+
+ /* The character or character type itself comes last in all cases. */
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && c >= 128)
+ {
+ memcpy(code, utf8_char, c & 7);
+ code += c & 7;
+ }
+ else
+#endif
+ *code++ = c;
+
+ /* For a repeated Unicode property match, there are two extra bytes that
+ define the required property. */
+
+#ifdef SUPPORT_UCP
+ if (prop_type >= 0)
+ {
+ *code++ = prop_type;
+ *code++ = prop_value;
+ }
+#endif
+ }
+
+ /* If previous was a character class or a back reference, we put the repeat
+ stuff after it, but just skip the item if the repeat was {0,0}. */
+
+ else if (*previous == OP_CLASS ||
+ *previous == OP_NCLASS ||
+#ifdef SUPPORT_UTF8
+ *previous == OP_XCLASS ||
+#endif
+ *previous == OP_REF)
+ {
+ if (repeat_max == 0)
+ {
+ code = previous;
+ goto END_REPEAT;
+ }
+
+ /* All real repeats make it impossible to handle partial matching (maybe
+ one day we will be able to remove this restriction). */
+
+ if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
+
+ if (repeat_min == 0 && repeat_max == -1)
+ *code++ = OP_CRSTAR + repeat_type;
+ else if (repeat_min == 1 && repeat_max == -1)
+ *code++ = OP_CRPLUS + repeat_type;
+ else if (repeat_min == 0 && repeat_max == 1)
+ *code++ = OP_CRQUERY + repeat_type;
+ else
+ {
+ *code++ = OP_CRRANGE + repeat_type;
+ PUT2INC(code, 0, repeat_min);
+ if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
+ PUT2INC(code, 0, repeat_max);
+ }
+ }
+
+ /* If previous was a bracket group, we may have to replicate it in certain
+ cases. */
+
+ else if (*previous == OP_BRA || *previous == OP_CBRA ||
+ *previous == OP_ONCE || *previous == OP_COND)
+ {
+ register int i;
+ int ketoffset = 0;
+ int len = code - previous;
+ uschar *bralink = NULL;
+
+ /* Repeating a DEFINE group is pointless */
+
+ if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
+ {
+ *errorcodeptr = ERR55;
+ goto FAILED;
+ }
+
+ /* If the maximum repeat count is unlimited, find the end of the bracket
+ by scanning through from the start, and compute the offset back to it
+ from the current code pointer. There may be an OP_OPT setting following
+ the final KET, so we can't find the end just by going back from the code
+ pointer. */
+
+ if (repeat_max == -1)
+ {
+ register uschar *ket = previous;
+ do ket += GET(ket, 1); while (*ket != OP_KET);
+ ketoffset = code - ket;
+ }
+
+ /* The case of a zero minimum is special because of the need to stick
+ OP_BRAZERO in front of it, and because the group appears once in the
+ data, whereas in other cases it appears the minimum number of times. For
+ this reason, it is simplest to treat this case separately, as otherwise
+ the code gets far too messy. There are several special subcases when the
+ minimum is zero. */
+
+ if (repeat_min == 0)
+ {
+ /* If the maximum is also zero, we used to just omit the group from the
+ output altogether, like this:
+
+ ** if (repeat_max == 0)
+ ** {
+ ** code = previous;
+ ** goto END_REPEAT;
+ ** }
+
+ However, that fails when a group is referenced as a subroutine from
+ elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
+ so that it is skipped on execution. As we don't have a list of which
+ groups are referenced, we cannot do this selectively.
+
+ If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
+ and do no more at this point. However, we do need to adjust any
+ OP_RECURSE calls inside the group that refer to the group itself or any
+ internal or forward referenced group, because the offset is from the
+ start of the whole regex. Temporarily terminate the pattern while doing
+ this. */
+
+ if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
+ {
+ *code = OP_END;
+ adjust_recurse(previous, 1, utf8, cd, save_hwm);
+ memmove(previous+1, previous, len);
+ code++;
+ if (repeat_max == 0)
+ {
+ *previous++ = OP_SKIPZERO;
+ goto END_REPEAT;
+ }
+ *previous++ = OP_BRAZERO + repeat_type;
+ }
+
+ /* If the maximum is greater than 1 and limited, we have to replicate
+ in a nested fashion, sticking OP_BRAZERO before each set of brackets.
+ The first one has to be handled carefully because it's the original
+ copy, which has to be moved up. The remainder can be handled by code
+ that is common with the non-zero minimum case below. We have to
+ adjust the value of repeat_max, since one less copy is required. Once
+ again, we may have to adjust any OP_RECURSE calls inside the group. */
+
+ else
+ {
+ int offset;
+ *code = OP_END;
+ adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
+ memmove(previous + 2 + LINK_SIZE, previous, len);
+ code += 2 + LINK_SIZE;
+ *previous++ = OP_BRAZERO + repeat_type;
+ *previous++ = OP_BRA;
+
+ /* We chain together the bracket offset fields that have to be
+ filled in later when the ends of the brackets are reached. */
+
+ offset = (bralink == NULL)? 0 : previous - bralink;
+ bralink = previous;
+ PUTINC(previous, 0, offset);
+ }
+
+ repeat_max--;
+ }
+
+ /* If the minimum is greater than zero, replicate the group as many
+ times as necessary, and adjust the maximum to the number of subsequent
+ copies that we need. If we set a first char from the group, and didn't
+ set a required char, copy the latter from the former. If there are any
+ forward reference subroutine calls in the group, there will be entries on
+ the workspace list; replicate these with an appropriate increment. */
+
+ else
+ {
+ if (repeat_min > 1)
+ {
+ /* In the pre-compile phase, we don't actually do the replication. We
+ just adjust the length as if we had. Do some paranoid checks for
+ potential integer overflow. */
+
+ if (lengthptr != NULL)
+ {
+ int delta = (repeat_min - 1)*length_prevgroup;
+ if ((double)(repeat_min - 1)*(double)length_prevgroup >
+ (double)INT_MAX ||
+ OFLOW_MAX - *lengthptr < delta)
+ {
+ *errorcodeptr = ERR20;
+ goto FAILED;
+ }
+ *lengthptr += delta;
+ }
+
+ /* This is compiling for real */
+
+ else
+ {
+ if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
+ for (i = 1; i < repeat_min; i++)
+ {
+ uschar *hc;
+ uschar *this_hwm = cd->hwm;
+ memcpy(code, previous, len);
+ for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
+ {
+ PUT(cd->hwm, 0, GET(hc, 0) + len);
+ cd->hwm += LINK_SIZE;
+ }
+ save_hwm = this_hwm;
+ code += len;
+ }
+ }
+ }
+
+ if (repeat_max > 0) repeat_max -= repeat_min;
+ }
+
+ /* This code is common to both the zero and non-zero minimum cases. If
+ the maximum is limited, it replicates the group in a nested fashion,
+ remembering the bracket starts on a stack. In the case of a zero minimum,
+ the first one was set up above. In all cases the repeat_max now specifies
+ the number of additional copies needed. Again, we must remember to
+ replicate entries on the forward reference list. */
+
+ if (repeat_max >= 0)
+ {
+ /* In the pre-compile phase, we don't actually do the replication. We
+ just adjust the length as if we had. For each repetition we must add 1
+ to the length for BRAZERO and for all but the last repetition we must
+ add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
+ paranoid checks to avoid integer overflow. */
+
+ if (lengthptr != NULL && repeat_max > 0)
+ {
+ int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
+ 2 - 2*LINK_SIZE; /* Last one doesn't nest */
+ if ((double)repeat_max *
+ (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
+ > (double)INT_MAX ||
+ OFLOW_MAX - *lengthptr < delta)
+ {
+ *errorcodeptr = ERR20;
+ goto FAILED;
+ }
+ *lengthptr += delta;
+ }
+
+ /* This is compiling for real */
+
+ else for (i = repeat_max - 1; i >= 0; i--)
+ {
+ uschar *hc;
+ uschar *this_hwm = cd->hwm;
+
+ *code++ = OP_BRAZERO + repeat_type;
+
+ /* All but the final copy start a new nesting, maintaining the
+ chain of brackets outstanding. */
+
+ if (i != 0)
+ {
+ int offset;
+ *code++ = OP_BRA;
+ offset = (bralink == NULL)? 0 : code - bralink;
+ bralink = code;
+ PUTINC(code, 0, offset);
+ }
+
+ memcpy(code, previous, len);
+ for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
+ {
+ PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
+ cd->hwm += LINK_SIZE;
+ }
+ save_hwm = this_hwm;
+ code += len;
+ }
+
+ /* Now chain through the pending brackets, and fill in their length
+ fields (which are holding the chain links pro tem). */
+
+ while (bralink != NULL)
+ {
+ int oldlinkoffset;
+ int offset = code - bralink + 1;
+ uschar *bra = code - offset;
+ oldlinkoffset = GET(bra, 1);
+ bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
+ *code++ = OP_KET;
+ PUTINC(code, 0, offset);
+ PUT(bra, 1, offset);
+ }
+ }
+
+ /* If the maximum is unlimited, set a repeater in the final copy. We
+ can't just offset backwards from the current code point, because we
+ don't know if there's been an options resetting after the ket. The
+ correct offset was computed above.
+
+ Then, when we are doing the actual compile phase, check to see whether
+ this group is a non-atomic one that could match an empty string. If so,
+ convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
+ that runtime checking can be done. [This check is also applied to
+ atomic groups at runtime, but in a different way.] */
+
+ else
+ {
+ uschar *ketcode = code - ketoffset;
+ uschar *bracode = ketcode - GET(ketcode, 1);
+ *ketcode = OP_KETRMAX + repeat_type;
+ if (lengthptr == NULL && *bracode != OP_ONCE)
+ {
+ uschar *scode = bracode;
+ do
+ {
+ if (could_be_empty_branch(scode, ketcode, utf8))
+ {
+ *bracode += OP_SBRA - OP_BRA;
+ break;
+ }
+ scode += GET(scode, 1);
+ }
+ while (*scode == OP_ALT);
+ }
+ }
+ }
+
+ /* If previous is OP_FAIL, it was generated by an empty class [] in
+ JavaScript mode. The other ways in which OP_FAIL can be generated, that is
+ by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
+ error above. We can just ignore the repeat in JS case. */
+
+ else if (*previous == OP_FAIL) goto END_REPEAT;
+
+ /* Else there's some kind of shambles */
+
+ else
+ {
+ *errorcodeptr = ERR11;
+ goto FAILED;
+ }
+
+ /* If the character following a repeat is '+', or if certain optimization
+ tests above succeeded, possessive_quantifier is TRUE. For some of the
+ simpler opcodes, there is a special alternative opcode for this. For
+ anything else, we wrap the entire repeated item inside OP_ONCE brackets.
+ The '+' notation is just syntactic sugar, taken from Sun's Java package,
+ but the special opcodes can optimize it a bit. The repeated item starts at
+ tempcode, not at previous, which might be the first part of a string whose
+ (former) last char we repeated.
+
+ Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
+ an 'upto' may follow. We skip over an 'exact' item, and then test the
+ length of what remains before proceeding. */
+
+ if (possessive_quantifier)
+ {
+ int len;
+ if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
+ *tempcode == OP_NOTEXACT)
+ tempcode += _pcre_OP_lengths[*tempcode] +
+ ((*tempcode == OP_TYPEEXACT &&
+ (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
+ len = code - tempcode;
+ if (len > 0) switch (*tempcode)
+ {
+ case OP_STAR: *tempcode = OP_POSSTAR; break;
+ case OP_PLUS: *tempcode = OP_POSPLUS; break;
+ case OP_QUERY: *tempcode = OP_POSQUERY; break;
+ case OP_UPTO: *tempcode = OP_POSUPTO; break;
+
+ case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
+ case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
+ case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
+ case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
+
+ case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
+ case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
+ case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
+ case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
+
+ default:
+ memmove(tempcode + 1+LINK_SIZE, tempcode, len);
+ code += 1 + LINK_SIZE;
+ len += 1 + LINK_SIZE;
+ tempcode[0] = OP_ONCE;
+ *code++ = OP_KET;
+ PUTINC(code, 0, len);
+ PUT(tempcode, 1, len);
+ break;
+ }
+ }
+
+ /* In all cases we no longer have a previous item. We also set the
+ "follows varying string" flag for subsequently encountered reqbytes if
+ it isn't already set and we have just passed a varying length item. */
+
+ END_REPEAT:
+ previous = NULL;
+ cd->req_varyopt |= reqvary;
+ break;
+
+
+ /* ===================================================================*/
+ /* Start of nested parenthesized sub-expression, or comment or lookahead or
+ lookbehind or option setting or condition or all the other extended
+ parenthesis forms. */
+
+ case '(':
+ newoptions = options;
+ skipbytes = 0;
+ bravalue = OP_CBRA;
+ save_hwm = cd->hwm;
+ reset_bracount = FALSE;
+
+ /* First deal with various "verbs" that can be introduced by '*'. */
+
+ if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
+ {
+ int i, namelen;
+ const char *vn = verbnames;
+ const uschar *name = ++ptr;
+ previous = NULL;
+ while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
+ if (*ptr == ':')
+ {
+ *errorcodeptr = ERR59; /* Not supported */
+ goto FAILED;
+ }
+ if (*ptr != ')')
+ {
+ *errorcodeptr = ERR60;
+ goto FAILED;
+ }
+ namelen = ptr - name;
+ for (i = 0; i < verbcount; i++)
+ {
+ if (namelen == verbs[i].len &&
+ strncmp((char *)name, vn, namelen) == 0)
+ {
+ *code = verbs[i].op;
+ if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
+ break;
+ }
+ vn += verbs[i].len + 1;
+ }
+ if (i < verbcount) continue;
+ *errorcodeptr = ERR60;
+ goto FAILED;
+ }
+
+ /* Deal with the extended parentheses; all are introduced by '?', and the
+ appearance of any of them means that this is not a capturing group. */
+
+ else if (*ptr == '?')
+ {
+ int i, set, unset, namelen;
+ int *optset;
+ const uschar *name;
+ uschar *slot;
+
+ switch (*(++ptr))
+ {
+ case '#': /* Comment; skip to ket */
+ ptr++;
+ while (*ptr != 0 && *ptr != ')') ptr++;
+ if (*ptr == 0)
+ {
+ *errorcodeptr = ERR18;
+ goto FAILED;
+ }
+ continue;
+
+
+ /* ------------------------------------------------------------ */
+ case '|': /* Reset capture count for each branch */
+ reset_bracount = TRUE;
+ /* Fall through */
+
+ /* ------------------------------------------------------------ */
+ case ':': /* Non-capturing bracket */
+ bravalue = OP_BRA;
+ ptr++;
+ break;
+
+
+ /* ------------------------------------------------------------ */
+ case '(':
+ bravalue = OP_COND; /* Conditional group */
+
+ /* A condition can be an assertion, a number (referring to a numbered
+ group), a name (referring to a named group), or 'R', referring to
+ recursion. R and R&name are also permitted for recursion tests.
+
+ There are several syntaxes for testing a named group: (?(name)) is used
+ by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
+
+ There are two unfortunate ambiguities, caused by history. (a) 'R' can
+ be the recursive thing or the name 'R' (and similarly for 'R' followed
+ by digits), and (b) a number could be a name that consists of digits.
+ In both cases, we look for a name first; if not found, we try the other
+ cases. */
+
+ /* For conditions that are assertions, check the syntax, and then exit
+ the switch. This will take control down to where bracketed groups,
+ including assertions, are processed. */
+
+ if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
+ break;
+
+ /* Most other conditions use OP_CREF (a couple change to OP_RREF
+ below), and all need to skip 3 bytes at the start of the group. */
+
+ code[1+LINK_SIZE] = OP_CREF;
+ skipbytes = 3;
+ refsign = -1;
+
+ /* Check for a test for recursion in a named group. */
+
+ if (ptr[1] == 'R' && ptr[2] == '&')
+ {
+ terminator = -1;
+ ptr += 2;
+ code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
+ }
+
+ /* Check for a test for a named group's having been set, using the Perl
+ syntax (?(<name>) or (?('name') */
+
+ else if (ptr[1] == '<')
+ {
+ terminator = '>';
+ ptr++;
+ }
+ else if (ptr[1] == '\'')
+ {
+ terminator = '\'';
+ ptr++;
+ }
+ else
+ {
+ terminator = 0;
+ if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
+ }
+
+ /* We now expect to read a name; anything else is an error */
+
+ if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
+ {
+ ptr += 1; /* To get the right offset */
+ *errorcodeptr = ERR28;
+ goto FAILED;
+ }
+
+ /* Read the name, but also get it as a number if it's all digits */
+
+ recno = 0;
+ name = ++ptr;
+ while ((cd->ctypes[*ptr] & ctype_word) != 0)
+ {
+ if (recno >= 0)
+ recno = ((digitab[*ptr] & ctype_digit) != 0)?
+ recno * 10 + *ptr - '0' : -1;
+ ptr++;
+ }
+ namelen = ptr - name;
+
+ if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
+ {
+ ptr--; /* Error offset */
+ *errorcodeptr = ERR26;
+ goto FAILED;
+ }
+
+ /* Do no further checking in the pre-compile phase. */
+
+ if (lengthptr != NULL) break;
+
+ /* In the real compile we do the work of looking for the actual
+ reference. If the string started with "+" or "-" we require the rest to
+ be digits, in which case recno will be set. */
+
+ if (refsign > 0)
+ {
+ if (recno <= 0)
+ {
+ *errorcodeptr = ERR58;
+ goto FAILED;
+ }
+ recno = (refsign == '-')?
+ cd->bracount - recno + 1 : recno +cd->bracount;
+ if (recno <= 0 || recno > cd->final_bracount)
+ {
+ *errorcodeptr = ERR15;
+ goto FAILED;
+ }
+ PUT2(code, 2+LINK_SIZE, recno);
+ break;
+ }
+
+ /* Otherwise (did not start with "+" or "-"), start by looking for the
+ name. */
+
+ slot = cd->name_table;
+ for (i = 0; i < cd->names_found; i++)
+ {
+ if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
+ slot += cd->name_entry_size;
+ }
+
+ /* Found a previous named subpattern */
+
+ if (i < cd->names_found)
+ {
+ recno = GET2(slot, 0);
+ PUT2(code, 2+LINK_SIZE, recno);
+ }
+
+ /* Search the pattern for a forward reference */
+
+ else if ((i = find_parens(ptr, cd, name, namelen,
+ (options & PCRE_EXTENDED) != 0)) > 0)
+ {
+ PUT2(code, 2+LINK_SIZE, i);
+ }
+
+ /* If terminator == 0 it means that the name followed directly after
+ the opening parenthesis [e.g. (?(abc)...] and in this case there are
+ some further alternatives to try. For the cases where terminator != 0
+ [things like (?(<name>)... or (?('name')... or (?(R&name)... ] we have
+ now checked all the possibilities, so give an error. */
+
+ else if (terminator != 0)
+ {
+ *errorcodeptr = ERR15;
+ goto FAILED;
+ }
+
+ /* Check for (?(R) for recursion. Allow digits after R to specify a
+ specific group number. */
+
+ else if (*name == 'R')
+ {
+ recno = 0;
+ for (i = 1; i < namelen; i++)
+ {
+ if ((digitab[name[i]] & ctype_digit) == 0)
+ {
+ *errorcodeptr = ERR15;
+ goto FAILED;
+ }
+ recno = recno * 10 + name[i] - '0';
+ }
+ if (recno == 0) recno = RREF_ANY;
+ code[1+LINK_SIZE] = OP_RREF; /* Change test type */
+ PUT2(code, 2+LINK_SIZE, recno);
+ }
+
+ /* Similarly, check for the (?(DEFINE) "condition", which is always
+ false. */
+
+ else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
+ {
+ code[1+LINK_SIZE] = OP_DEF;
+ skipbytes = 1;
+ }
+
+ /* Check for the "name" actually being a subpattern number. We are
+ in the second pass here, so final_bracount is set. */
+
+ else if (recno > 0 && recno <= cd->final_bracount)
+ {
+ PUT2(code, 2+LINK_SIZE, recno);
+ }
+
+ /* Either an unidentified subpattern, or a reference to (?(0) */
+
+ else
+ {
+ *errorcodeptr = (recno == 0)? ERR35: ERR15;
+ goto FAILED;
+ }
+ break;
+
+
+ /* ------------------------------------------------------------ */
+ case '=': /* Positive lookahead */
+ bravalue = OP_ASSERT;
+ ptr++;
+ break;
+
+
+ /* ------------------------------------------------------------ */
+ case '!': /* Negative lookahead */
+ ptr++;
+ if (*ptr == ')') /* Optimize (?!) */
+ {
+ *code++ = OP_FAIL;
+ previous = NULL;
+ continue;
+ }
+ bravalue = OP_ASSERT_NOT;
+ break;
+
+
+ /* ------------------------------------------------------------ */
+ case '<': /* Lookbehind or named define */
+ switch (ptr[1])
+ {
+ case '=': /* Positive lookbehind */
+ bravalue = OP_ASSERTBACK;
+ ptr += 2;
+ break;
+
+ case '!': /* Negative lookbehind */
+ bravalue = OP_ASSERTBACK_NOT;
+ ptr += 2;
+ break;
+
+ default: /* Could be name define, else bad */
+ if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
+ ptr++; /* Correct offset for error */
+ *errorcodeptr = ERR24;
+ goto FAILED;
+ }
+ break;
+
+
+ /* ------------------------------------------------------------ */
+ case '>': /* One-time brackets */
+ bravalue = OP_ONCE;
+ ptr++;
+ break;
+
+
+ /* ------------------------------------------------------------ */
+ case 'C': /* Callout - may be followed by digits; */
+ previous_callout = code; /* Save for later completion */
+ after_manual_callout = 1; /* Skip one item before completing */
+ *code++ = OP_CALLOUT;
+ {
+ int n = 0;
+ while ((digitab[*(++ptr)] & ctype_digit) != 0)
+ n = n * 10 + *ptr - '0';
+ if (*ptr != ')')
+ {
+ *errorcodeptr = ERR39;
+ goto FAILED;
+ }
+ if (n > 255)
+ {
+ *errorcodeptr = ERR38;
+ goto FAILED;
+ }
+ *code++ = n;
+ PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
+ PUT(code, LINK_SIZE, 0); /* Default length */
+ code += 2 * LINK_SIZE;
+ }
+ previous = NULL;
+ continue;
+
+
+ /* ------------------------------------------------------------ */
+ case 'P': /* Python-style named subpattern handling */
+ if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
+ {
+ is_recurse = *ptr == '>';
+ terminator = ')';
+ goto NAMED_REF_OR_RECURSE;
+ }
+ else if (*ptr != '<') /* Test for Python-style definition */
+ {
+ *errorcodeptr = ERR41;
+ goto FAILED;
+ }
+ /* Fall through to handle (?P< as (?< is handled */
+
+
+ /* ------------------------------------------------------------ */
+ DEFINE_NAME: /* Come here from (?< handling */
+ case '\'':
+ {
+ terminator = (*ptr == '<')? '>' : '\'';
+ name = ++ptr;
+
+ while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
+ namelen = ptr - name;
+
+ /* In the pre-compile phase, just do a syntax check. */
+
+ if (lengthptr != NULL)
+ {
+ if (*ptr != terminator)
+ {
+ *errorcodeptr = ERR42;
+ goto FAILED;
+ }
+ if (cd->names_found >= MAX_NAME_COUNT)
+ {
+ *errorcodeptr = ERR49;
+ goto FAILED;
+ }
+ if (namelen + 3 > cd->name_entry_size)
+ {
+ cd->name_entry_size = namelen + 3;
+ if (namelen > MAX_NAME_SIZE)
+ {
+ *errorcodeptr = ERR48;
+ goto FAILED;
+ }
+ }
+ }
+
+ /* In the real compile, create the entry in the table */
+
+ else
+ {
+ slot = cd->name_table;
+ for (i = 0; i < cd->names_found; i++)
+ {
+ int crc = memcmp(name, slot+2, namelen);
+ if (crc == 0)
+ {
+ if (slot[2+namelen] == 0)
+ {
+ if ((options & PCRE_DUPNAMES) == 0)
+ {
+ *errorcodeptr = ERR43;
+ goto FAILED;
+ }
+ }
+ else crc = -1; /* Current name is substring */
+ }
+ if (crc < 0)
+ {
+ memmove(slot + cd->name_entry_size, slot,
+ (cd->names_found - i) * cd->name_entry_size);
+ break;
+ }
+ slot += cd->name_entry_size;
+ }
+
+ PUT2(slot, 0, cd->bracount + 1);
+ memcpy(slot + 2, name, namelen);
+ slot[2+namelen] = 0;
+ }
+ }
+
+ /* In both cases, count the number of names we've encountered. */
+
+ ptr++; /* Move past > or ' */
+ cd->names_found++;
+ goto NUMBERED_GROUP;
+
+
+ /* ------------------------------------------------------------ */
+ case '&': /* Perl recursion/subroutine syntax */
+ terminator = ')';
+ is_recurse = TRUE;
+ /* Fall through */
+
+ /* We come here from the Python syntax above that handles both
+ references (?P=name) and recursion (?P>name), as well as falling
+ through from the Perl recursion syntax (?&name). We also come here from
+ the Perl \k<name> or \k'name' back reference syntax and the \k{name}
+ .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
+
+ NAMED_REF_OR_RECURSE:
+ name = ++ptr;
+ while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
+ namelen = ptr - name;
+
+ /* In the pre-compile phase, do a syntax check and set a dummy
+ reference number. */
+
+ if (lengthptr != NULL)
+ {
+ if (namelen == 0)
+ {
+ *errorcodeptr = ERR62;
+ goto FAILED;
+ }
+ if (*ptr != terminator)
+ {
+ *errorcodeptr = ERR42;
+ goto FAILED;
+ }
+ if (namelen > MAX_NAME_SIZE)
+ {
+ *errorcodeptr = ERR48;
+ goto FAILED;
+ }
+ recno = 0;
+ }
+
+ /* In the real compile, seek the name in the table. We check the name
+ first, and then check that we have reached the end of the name in the
+ table. That way, if the name is longer than any in the table,
+ the comparison will fail without reading beyond the table entry. */
+
+ else
+ {
+ slot = cd->name_table;
+ for (i = 0; i < cd->names_found; i++)
+ {
+ if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
+ slot[2+namelen] == 0)
+ break;
+ slot += cd->name_entry_size;
+ }
+
+ if (i < cd->names_found) /* Back reference */
+ {
+ recno = GET2(slot, 0);
+ }
+ else if ((recno = /* Forward back reference */
+ find_parens(ptr, cd, name, namelen,
+ (options & PCRE_EXTENDED) != 0)) <= 0)
+ {
+ *errorcodeptr = ERR15;
+ goto FAILED;
+ }
+ }
+
+ /* In both phases, we can now go to the code that handles numerical
+ recursion or backreferences. */
+
+ if (is_recurse) goto HANDLE_RECURSION;
+ else goto HANDLE_REFERENCE;
+
+
+ /* ------------------------------------------------------------ */
+ case 'R': /* Recursion */
+ ptr++; /* Same as (?0) */
+ /* Fall through */
+
+
+ /* ------------------------------------------------------------ */
+ case '-': case '+':
+ case '0': case '1': case '2': case '3': case '4': /* Recursion or */
+ case '5': case '6': case '7': case '8': case '9': /* subroutine */
+ {
+ const uschar *called;
+ terminator = ')';
+
+ /* Come here from the \g<...> and \g'...' code (Oniguruma
+ compatibility). However, the syntax has been checked to ensure that
+ the ... are a (signed) number, so that neither ERR63 nor ERR29 will
+ be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
+ ever be taken. */
+
+ HANDLE_NUMERICAL_RECURSION:
+
+ if ((refsign = *ptr) == '+')
+ {
+ ptr++;
+ if ((digitab[*ptr] & ctype_digit) == 0)
+ {
+ *errorcodeptr = ERR63;
+ goto FAILED;
+ }
+ }
+ else if (refsign == '-')
+ {
+ if ((digitab[ptr[1]] & ctype_digit) == 0)
+ goto OTHER_CHAR_AFTER_QUERY;
+ ptr++;
+ }
+
+ recno = 0;
+ while((digitab[*ptr] & ctype_digit) != 0)
+ recno = recno * 10 + *ptr++ - '0';
+
+ if (*ptr != terminator)
+ {
+ *errorcodeptr = ERR29;
+ goto FAILED;
+ }
+
+ if (refsign == '-')
+ {
+ if (recno == 0)
+ {
+ *errorcodeptr = ERR58;
+ goto FAILED;
+ }
+ recno = cd->bracount - recno + 1;
+ if (recno <= 0)
+ {
+ *errorcodeptr = ERR15;
+ goto FAILED;
+ }
+ }
+ else if (refsign == '+')
+ {
+ if (recno == 0)
+ {
+ *errorcodeptr = ERR58;
+ goto FAILED;
+ }
+ recno += cd->bracount;
+ }
+
+ /* Come here from code above that handles a named recursion */
+
+ HANDLE_RECURSION:
+
+ previous = code;
+ called = cd->start_code;
+
+ /* When we are actually compiling, find the bracket that is being
+ referenced. Temporarily end the regex in case it doesn't exist before
+ this point. If we end up with a forward reference, first check that
+ the bracket does occur later so we can give the error (and position)
+ now. Then remember this forward reference in the workspace so it can
+ be filled in at the end. */
+
+ if (lengthptr == NULL)
+ {
+ *code = OP_END;
+ if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
+
+ /* Forward reference */
+
+ if (called == NULL)
+ {
+ if (find_parens(ptr, cd, NULL, recno,
+ (options & PCRE_EXTENDED) != 0) < 0)
+ {
+ *errorcodeptr = ERR15;
+ goto FAILED;
+ }
+ called = cd->start_code + recno;
+ PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
+ }
+
+ /* If not a forward reference, and the subpattern is still open,
+ this is a recursive call. We check to see if this is a left
+ recursion that could loop for ever, and diagnose that case. */
+
+ else if (GET(called, 1) == 0 &&
+ could_be_empty(called, code, bcptr, utf8))
+ {
+ *errorcodeptr = ERR40;
+ goto FAILED;
+ }
+ }
+
+ /* Insert the recursion/subroutine item, automatically wrapped inside
+ "once" brackets. Set up a "previous group" length so that a
+ subsequent quantifier will work. */
+
+ *code = OP_ONCE;
+ PUT(code, 1, 2 + 2*LINK_SIZE);
+ code += 1 + LINK_SIZE;
+
+ *code = OP_RECURSE;
+ PUT(code, 1, called - cd->start_code);
+ code += 1 + LINK_SIZE;
+
+ *code = OP_KET;
+ PUT(code, 1, 2 + 2*LINK_SIZE);
+ code += 1 + LINK_SIZE;
+
+ length_prevgroup = 3 + 3*LINK_SIZE;
+ }
+
+ /* Can't determine a first byte now */
+
+ if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
+ continue;
+
+
+ /* ------------------------------------------------------------ */
+ default: /* Other characters: check option setting */
+ OTHER_CHAR_AFTER_QUERY:
+ set = unset = 0;
+ optset = &set;
+
+ while (*ptr != ')' && *ptr != ':')
+ {
+ switch (*ptr++)
+ {
+ case '-': optset = &unset; break;
+
+ case 'J': /* Record that it changed in the external options */
+ *optset |= PCRE_DUPNAMES;
+ cd->external_flags |= PCRE_JCHANGED;
+ break;
+
+ case 'i': *optset |= PCRE_CASELESS; break;
+ case 'm': *optset |= PCRE_MULTILINE; break;
+ case 's': *optset |= PCRE_DOTALL; break;
+ case 'x': *optset |= PCRE_EXTENDED; break;
+ case 'U': *optset |= PCRE_UNGREEDY; break;
+ case 'X': *optset |= PCRE_EXTRA; break;
+
+ default: *errorcodeptr = ERR12;
+ ptr--; /* Correct the offset */
+ goto FAILED;
+ }
+ }
+
+ /* Set up the changed option bits, but don't change anything yet. */
+
+ newoptions = (options | set) & (~unset);
+
+ /* If the options ended with ')' this is not the start of a nested
+ group with option changes, so the options change at this level. If this
+ item is right at the start of the pattern, the options can be
+ abstracted and made external in the pre-compile phase, and ignored in
+ the compile phase. This can be helpful when matching -- for instance in
+ caseless checking of required bytes.
+
+ If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
+ definitely *not* at the start of the pattern because something has been
+ compiled. In the pre-compile phase, however, the code pointer can have
+ that value after the start, because it gets reset as code is discarded
+ during the pre-compile. However, this can happen only at top level - if
+ we are within parentheses, the starting BRA will still be present. At
+ any parenthesis level, the length value can be used to test if anything
+ has been compiled at that level. Thus, a test for both these conditions
+ is necessary to ensure we correctly detect the start of the pattern in
+ both phases.
+
+ If we are not at the pattern start, compile code to change the ims
+ options if this setting actually changes any of them. We also pass the
+ new setting back so that it can be put at the start of any following
+ branches, and when this group ends (if we are in a group), a resetting
+ item can be compiled. */
+
+ if (*ptr == ')')
+ {
+ if (code == cd->start_code + 1 + LINK_SIZE &&
+ (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
+ {
+ cd->external_options = newoptions;
+ options = newoptions;
+ }
+ else
+ {
+ if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
+ {
+ *code++ = OP_OPT;
+ *code++ = newoptions & PCRE_IMS;
+ }
+
+ /* Change options at this level, and pass them back for use
+ in subsequent branches. Reset the greedy defaults and the case
+ value for firstbyte and reqbyte. */
+
+ *optionsptr = options = newoptions;
+ greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
+ greedy_non_default = greedy_default ^ 1;
+ req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
+ }
+
+ previous = NULL; /* This item can't be repeated */
+ continue; /* It is complete */
+ }
+
+ /* If the options ended with ':' we are heading into a nested group
+ with possible change of options. Such groups are non-capturing and are
+ not assertions of any kind. All we need to do is skip over the ':';
+ the newoptions value is handled below. */
+
+ bravalue = OP_BRA;
+ ptr++;
+ } /* End of switch for character following (? */
+ } /* End of (? handling */
+
+ /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
+ all unadorned brackets become non-capturing and behave like (?:...)
+ brackets. */
+
+ else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
+ {
+ bravalue = OP_BRA;
+ }
+
+ /* Else we have a capturing group. */
+
+ else
+ {
+ NUMBERED_GROUP:
+ cd->bracount += 1;
+ PUT2(code, 1+LINK_SIZE, cd->bracount);
+ skipbytes = 2;
+ }
+
+ /* Process nested bracketed regex. Assertions may not be repeated, but
+ other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
+ non-register variable in order to be able to pass its address because some
+ compilers complain otherwise. Pass in a new setting for the ims options if
+ they have changed. */
+
+ previous = (bravalue >= OP_ONCE)? code : NULL;
+ *code = bravalue;
+ tempcode = code;
+ tempreqvary = cd->req_varyopt; /* Save value before bracket */
+ length_prevgroup = 0; /* Initialize for pre-compile phase */
+
+ if (!compile_regex(
+ newoptions, /* The complete new option state */
+ options & PCRE_IMS, /* The previous ims option state */
+ &tempcode, /* Where to put code (updated) */
+ &ptr, /* Input pointer (updated) */
+ errorcodeptr, /* Where to put an error message */
+ (bravalue == OP_ASSERTBACK ||
+ bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
+ reset_bracount, /* True if (?| group */
+ skipbytes, /* Skip over bracket number */
+ &subfirstbyte, /* For possible first char */
+ &subreqbyte, /* For possible last char */
+ bcptr, /* Current branch chain */
+ cd, /* Tables block */
+ (lengthptr == NULL)? NULL : /* Actual compile phase */
+ &length_prevgroup /* Pre-compile phase */
+ ))
+ goto FAILED;
+
+ /* At the end of compiling, code is still pointing to the start of the
+ group, while tempcode has been updated to point past the end of the group
+ and any option resetting that may follow it. The pattern pointer (ptr)
+ is on the bracket. */
+
+ /* If this is a conditional bracket, check that there are no more than
+ two branches in the group, or just one if it's a DEFINE group. We do this
+ in the real compile phase, not in the pre-pass, where the whole group may
+ not be available. */
+
+ if (bravalue == OP_COND && lengthptr == NULL)
+ {
+ uschar *tc = code;
+ int condcount = 0;
+
+ do {
+ condcount++;
+ tc += GET(tc,1);
+ }
+ while (*tc != OP_KET);
+
+ /* A DEFINE group is never obeyed inline (the "condition" is always
+ false). It must have only one branch. */
+
+ if (code[LINK_SIZE+1] == OP_DEF)
+ {
+ if (condcount > 1)
+ {
+ *errorcodeptr = ERR54;
+ goto FAILED;
+ }
+ bravalue = OP_DEF; /* Just a flag to suppress char handling below */
+ }
+
+ /* A "normal" conditional group. If there is just one branch, we must not
+ make use of its firstbyte or reqbyte, because this is equivalent to an
+ empty second branch. */
+
+ else
+ {
+ if (condcount > 2)
+ {
+ *errorcodeptr = ERR27;
+ goto FAILED;
+ }
+ if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
+ }
+ }
+
+ /* Error if hit end of pattern */
+
+ if (*ptr != ')')
+ {
+ *errorcodeptr = ERR14;
+ goto FAILED;
+ }
+
+ /* In the pre-compile phase, update the length by the length of the group,
+ less the brackets at either end. Then reduce the compiled code to just a
+ set of non-capturing brackets so that it doesn't use much memory if it is
+ duplicated by a quantifier.*/
+
+ if (lengthptr != NULL)
+ {
+ if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
+ {
+ *errorcodeptr = ERR20;
+ goto FAILED;
+ }
+ *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
+ *code++ = OP_BRA;
+ PUTINC(code, 0, 1 + LINK_SIZE);
+ *code++ = OP_KET;
+ PUTINC(code, 0, 1 + LINK_SIZE);
+ break; /* No need to waste time with special character handling */
+ }
+
+ /* Otherwise update the main code pointer to the end of the group. */
+
+ code = tempcode;
+
+ /* For a DEFINE group, required and first character settings are not
+ relevant. */
+
+ if (bravalue == OP_DEF) break;
+
+ /* Handle updating of the required and first characters for other types of
+ group. Update for normal brackets of all kinds, and conditions with two
+ branches (see code above). If the bracket is followed by a quantifier with
+ zero repeat, we have to back off. Hence the definition of zeroreqbyte and
+ zerofirstbyte outside the main loop so that they can be accessed for the
+ back off. */
+
+ zeroreqbyte = reqbyte;
+ zerofirstbyte = firstbyte;
+ groupsetfirstbyte = FALSE;
+
+ if (bravalue >= OP_ONCE)
+ {
+ /* If we have not yet set a firstbyte in this branch, take it from the
+ subpattern, remembering that it was set here so that a repeat of more
+ than one can replicate it as reqbyte if necessary. If the subpattern has
+ no firstbyte, set "none" for the whole branch. In both cases, a zero
+ repeat forces firstbyte to "none". */
+
+ if (firstbyte == REQ_UNSET)
+ {
+ if (subfirstbyte >= 0)
+ {
+ firstbyte = subfirstbyte;
+ groupsetfirstbyte = TRUE;
+ }
+ else firstbyte = REQ_NONE;
+ zerofirstbyte = REQ_NONE;
+ }
+
+ /* If firstbyte was previously set, convert the subpattern's firstbyte
+ into reqbyte if there wasn't one, using the vary flag that was in
+ existence beforehand. */
+
+ else if (subfirstbyte >= 0 && subreqbyte < 0)
+ subreqbyte = subfirstbyte | tempreqvary;
+
+ /* If the subpattern set a required byte (or set a first byte that isn't
+ really the first byte - see above), set it. */
+
+ if (subreqbyte >= 0) reqbyte = subreqbyte;
+ }
+
+ /* For a forward assertion, we take the reqbyte, if set. This can be
+ helpful if the pattern that follows the assertion doesn't set a different
+ char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
+ for an assertion, however because it leads to incorrect effect for patterns
+ such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
+ of a firstbyte. This is overcome by a scan at the end if there's no
+ firstbyte, looking for an asserted first char. */
+
+ else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
+ break; /* End of processing '(' */
+
+
+ /* ===================================================================*/
+ /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
+ are arranged to be the negation of the corresponding OP_values. For the
+ back references, the values are ESC_REF plus the reference number. Only
+ back references and those types that consume a character may be repeated.
+ We can test for values between ESC_b and ESC_Z for the latter; this may
+ have to change if any new ones are ever created. */
+
+ case '\\':
+ tempptr = ptr;
+ c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
+ if (*errorcodeptr != 0) goto FAILED;
+
+ if (c < 0)
+ {
+ if (-c == ESC_Q) /* Handle start of quoted string */
+ {
+ if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
+ else inescq = TRUE;
+ continue;
+ }
+
+ if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
+
+ /* For metasequences that actually match a character, we disable the
+ setting of a first character if it hasn't already been set. */
+
+ if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
+ firstbyte = REQ_NONE;
+
+ /* Set values to reset to if this is followed by a zero repeat. */
+
+ zerofirstbyte = firstbyte;
+ zeroreqbyte = reqbyte;
+
+ /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
+ is a subroutine call by number (Oniguruma syntax). In fact, the value
+ -ESC_g is returned only for these cases. So we don't need to check for <
+ or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
+ -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
+ that is a synonym for a named back reference). */
+
+ if (-c == ESC_g)
+ {
+ const uschar *p;
+ save_hwm = cd->hwm; /* Normally this is set when '(' is read */
+ terminator = (*(++ptr) == '<')? '>' : '\'';
+
+ /* These two statements stop the compiler from warning about possibly
+ unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
+ fact, because we actually check for a number below, the paths that
+ would actually be in error are never taken. */
+
+ skipbytes = 0;
+ reset_bracount = FALSE;
+
+ /* Test for a name */
+
+ if (ptr[1] != '+' && ptr[1] != '-')
+ {
+ BOOL isnumber = TRUE;
+ for (p = ptr + 1; *p != 0 && *p != terminator; p++)
+ {
+ if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
+ if ((cd->ctypes[*p] & ctype_word) == 0) break;
+ }
+ if (*p != terminator)
+ {
+ *errorcodeptr = ERR57;
+ break;
+ }
+ if (isnumber)
+ {
+ ptr++;
+ goto HANDLE_NUMERICAL_RECURSION;
+ }
+ is_recurse = TRUE;
+ goto NAMED_REF_OR_RECURSE;
+ }
+
+ /* Test a signed number in angle brackets or quotes. */
+
+ p = ptr + 2;
+ while ((digitab[*p] & ctype_digit) != 0) p++;
+ if (*p != terminator)
+ {
+ *errorcodeptr = ERR57;
+ break;
+ }
+ ptr++;
+ goto HANDLE_NUMERICAL_RECURSION;
+ }
+
+ /* \k<name> or \k'name' is a back reference by name (Perl syntax).
+ We also support \k{name} (.NET syntax) */
+
+ if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
+ {
+ is_recurse = FALSE;
+ terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
+ goto NAMED_REF_OR_RECURSE;
+ }
+
+ /* Back references are handled specially; must disable firstbyte if
+ not set to cope with cases like (?=(\w+))\1: which would otherwise set
+ ':' later. */
+
+ if (-c >= ESC_REF)
+ {
+ recno = -c - ESC_REF;
+
+ HANDLE_REFERENCE: /* Come here from named backref handling */
+ if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
+ previous = code;
+ *code++ = OP_REF;
+ PUT2INC(code, 0, recno);
+ cd->backref_map |= (recno < 32)? (1 << recno) : 1;
+ if (recno > cd->top_backref) cd->top_backref = recno;
+ }
+
+ /* So are Unicode property matches, if supported. */
+
+#ifdef SUPPORT_UCP
+ else if (-c == ESC_P || -c == ESC_p)
+ {
+ BOOL negated;
+ int pdata;
+ int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
+ if (ptype < 0) goto FAILED;
+ previous = code;
+ *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
+ *code++ = ptype;
+ *code++ = pdata;
+ }
+#else
+
+ /* If Unicode properties are not supported, \X, \P, and \p are not
+ allowed. */
+
+ else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
+ {
+ *errorcodeptr = ERR45;
+ goto FAILED;
+ }
+#endif
+
+ /* For the rest (including \X when Unicode properties are supported), we
+ can obtain the OP value by negating the escape value. */
+
+ else
+ {
+ previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
+ *code++ = -c;
+ }
+ continue;
+ }
+
+ /* We have a data character whose value is in c. In UTF-8 mode it may have
+ a value > 127. We set its representation in the length/buffer, and then
+ handle it as a data character. */
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && c > 127)
+ mclength = _pcre_ord2utf8(c, mcbuffer);
+ else
+#endif
+
+ {
+ mcbuffer[0] = c;
+ mclength = 1;
+ }
+ goto ONE_CHAR;
+
+
+ /* ===================================================================*/
+ /* Handle a literal character. It is guaranteed not to be whitespace or #
+ when the extended flag is set. If we are in UTF-8 mode, it may be a
+ multi-byte literal character. */
+
+ default:
+ NORMAL_CHAR:
+ mclength = 1;
+ mcbuffer[0] = c;
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && c >= 0xc0)
+ {
+ while ((ptr[1] & 0xc0) == 0x80)
+ mcbuffer[mclength++] = *(++ptr);
+ }
+#endif
+
+ /* At this point we have the character's bytes in mcbuffer, and the length
+ in mclength. When not in UTF-8 mode, the length is always 1. */
+
+ ONE_CHAR:
+ previous = code;
+ *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
+ for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
+
+ /* Remember if \r or \n were seen */
+
+ if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
+ cd->external_flags |= PCRE_HASCRORLF;
+
+ /* Set the first and required bytes appropriately. If no previous first
+ byte, set it from this character, but revert to none on a zero repeat.
+ Otherwise, leave the firstbyte value alone, and don't change it on a zero
+ repeat. */
+
+ if (firstbyte == REQ_UNSET)
+ {
+ zerofirstbyte = REQ_NONE;
+ zeroreqbyte = reqbyte;
+
+ /* If the character is more than one byte long, we can set firstbyte
+ only if it is not to be matched caselessly. */
+
+ if (mclength == 1 || req_caseopt == 0)
+ {
+ firstbyte = mcbuffer[0] | req_caseopt;
+ if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
+ }
+ else firstbyte = reqbyte = REQ_NONE;
+ }
+
+ /* firstbyte was previously set; we can set reqbyte only if the length is
+ 1 or the matching is caseful. */
+
+ else
+ {
+ zerofirstbyte = firstbyte;
+ zeroreqbyte = reqbyte;
+ if (mclength == 1 || req_caseopt == 0)
+ reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
+ }
+
+ break; /* End of literal character handling */
+ }
+ } /* end of big loop */
+
+
+/* Control never reaches here by falling through, only by a goto for all the
+error states. Pass back the position in the pattern so that it can be displayed
+to the user for diagnosing the error. */
+
+FAILED:
+*ptrptr = ptr;
+return FALSE;
+}
+
+
+
+
+/*************************************************
+* Compile sequence of alternatives *
+*************************************************/
+
+/* Compile one bracketed group: a sequence of alternative branches separated by
+vertical bars. On entry, ptr points past the bracket character and code points
+at the byte where the BRA operator has been stored. On a successful return ptr
+is left on the character that ended the last branch (never a vertical bar). If
+the ims options are changed at the start (for a (?ims: group) or during any
+branch, an OP_OPT item is inserted at the start of every following branch so
+that they are set correctly at run time; later branches get the new options.
+
+This function is used during the pre-compile phase when we are trying to find
+out the amount of memory needed, as well as during the real compile phase. The
+value of lengthptr distinguishes the two phases.
+
+Arguments:
+ options option bits, including any changes for this subpattern
+ oldims previous settings of ims option bits
+ codeptr -> the address of the current code pointer
+ ptrptr -> the address of the current pattern pointer
+ errorcodeptr -> pointer to error code variable
+ lookbehind TRUE if this is a lookbehind assertion
+ reset_bracount TRUE to reset the count for each branch
+ skipbytes skip this many bytes at start (for brackets and OP_COND)
+ firstbyteptr place to put the first required character, or a negative number
+ reqbyteptr place to put the last required character, or a negative number
+ bcptr pointer to the chain of currently open branches
+ cd points to the data block with tables pointers etc.
+ lengthptr NULL during the real compile phase
+ points to length accumulator during pre-compile phase
+
+Returns: TRUE on success, FALSE on failure (with *ptrptr updated)
+*/
+
+static BOOL
+compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
+ int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
+ int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
+ int *lengthptr)
+{
+const uschar *ptr = *ptrptr;
+uschar *code = *codeptr;
+uschar *last_branch = code; /* Start of latest branch (BRA, later ALT) */
+uschar *start_bracket = code; /* The BRA item; gives the KET its length */
+uschar *reverse_count = NULL; /* Link field of the latest OP_REVERSE */
+int firstbyte, reqbyte;
+int branchfirstbyte, branchreqbyte;
+int length; /* Accumulator, passed back only in the pre-compile */
+int orig_bracount; /* Bracket count on entry, for (?| resets */
+int max_bracount; /* Highest bracket count seen in any branch */
+branch_chain bc; /* This group's link in the branch chain */
+
+bc.outer = bcptr;
+bc.current = code;
+
+firstbyte = reqbyte = REQ_UNSET;
+
+/* Accumulate the length for use in the pre-compile phase. Start with the
+length of the BRA and KET and any extra bytes that are required at the
+beginning. We accumulate in a local variable to save frequent testing of
+lengthptr for NULL. We cannot do this by looking at the value of code at the
+start and end of each alternative, because compiled items are discarded during
+the pre-compile phase so that the work space is not exceeded. */
+
+length = 2 + 2*LINK_SIZE + skipbytes;
+
+/* WARNING: If the above line is changed for any reason, you must also change
+the code that abstracts option settings at the start of the pattern and makes
+them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
+pre-compile phase to find out whether anything has yet been compiled or not. */
+
+/* Offset is set zero to mark that this bracket is still open */
+
+PUT(code, 1, 0);
+code += 1 + LINK_SIZE + skipbytes;
+
+/* Loop for each alternative branch */
+
+orig_bracount = max_bracount = cd->bracount;
+for (;;)
+ {
+ /* For a (?| group, reset the capturing bracket count so that each branch
+ uses the same numbers. */
+
+ if (reset_bracount) cd->bracount = orig_bracount;
+
+ /* Handle a change of ims options at the start of the branch */
+
+ if ((options & PCRE_IMS) != oldims)
+ {
+ *code++ = OP_OPT;
+ *code++ = options & PCRE_IMS;
+ length += 2;
+ }
+
+ /* Set up dummy OP_REVERSE if lookbehind assertion */
+
+ if (lookbehind)
+ {
+ *code++ = OP_REVERSE;
+ reverse_count = code;
+ PUTINC(code, 0, 0);
+ length += 1 + LINK_SIZE;
+ }
+
+ /* Now compile the branch; in the pre-compile phase its length gets added
+ into the length. */
+
+ if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
+ &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
+ {
+ *ptrptr = ptr;
+ return FALSE;
+ }
+
+ /* Keep the highest bracket count in case (?| was used and some branch
+ has fewer than the rest. */
+
+ if (cd->bracount > max_bracount) max_bracount = cd->bracount;
+
+ /* In the real compile phase, there is some post-processing to be done. */
+
+ if (lengthptr == NULL)
+ {
+ /* If this is the first branch, the firstbyte and reqbyte values for the
+ branch become the values for the regex. */
+
+ if (*last_branch != OP_ALT)
+ {
+ firstbyte = branchfirstbyte;
+ reqbyte = branchreqbyte;
+ }
+
+ /* If this is not the first branch, the first char and reqbyte have to
+ match the values from all the previous branches, except that if the
+ previous value for reqbyte didn't have REQ_VARY set, it can still match,
+ and we set REQ_VARY for the regex. */
+
+ else
+ {
+ /* If we previously had a firstbyte, but it doesn't match the new branch,
+ we have to abandon the firstbyte for the regex, but if there was
+ previously no reqbyte, it takes on the value of the old firstbyte. */
+
+ if (firstbyte >= 0 && firstbyte != branchfirstbyte)
+ {
+ if (reqbyte < 0) reqbyte = firstbyte;
+ firstbyte = REQ_NONE;
+ }
+
+ /* If we (now or from before) have no firstbyte, a firstbyte from the
+ branch becomes a reqbyte if there isn't a branch reqbyte. */
+
+ if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
+ branchreqbyte = branchfirstbyte;
+
+ /* Now ensure that the reqbytes match */
+
+ if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
+ reqbyte = REQ_NONE;
+ else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
+ }
+
+ /* If lookbehind, check that this branch matches a fixed-length string, and
+ put the length into the OP_REVERSE item. Temporarily mark the end of the
+ branch with OP_END. */
+
+ if (lookbehind)
+ {
+ int fixed_length;
+ *code = OP_END;
+ fixed_length = find_fixedlength(last_branch, options);
+ DPRINTF(("fixed length = %d\n", fixed_length));
+ if (fixed_length < 0)
+ {
+ *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
+ *ptrptr = ptr;
+ return FALSE;
+ }
+ PUT(reverse_count, 0, fixed_length);
+ }
+ }
+
+ /* Reached end of expression, either ')' or end of pattern. In the real
+ compile phase, go back through the alternative branches and reverse the chain
+ of offsets, with the field in the BRA item now becoming an offset to the
+ first alternative. If there are no alternatives, it points to the end of the
+ group. The length in the terminating ket is always the length of the whole
+ bracketed item. If any of the ims options were changed inside the group,
+ compile a resetting op-code following, except at the very end of the pattern.
+ Return leaving the pointer at the terminating char. */
+
+ if (*ptr != '|')
+ {
+ if (lengthptr == NULL)
+ {
+ int branch_length = code - last_branch;
+ do
+ {
+ int prev_length = GET(last_branch, 1); /* Backward link */
+ PUT(last_branch, 1, branch_length); /* Now a forward link */
+ branch_length = prev_length;
+ last_branch -= branch_length;
+ }
+ while (branch_length > 0); /* The open BRA held zero */
+ }
+
+ /* Fill in the ket */
+
+ *code = OP_KET;
+ PUT(code, 1, code - start_bracket);
+ code += 1 + LINK_SIZE;
+
+ /* Resetting option if needed */
+
+ if ((options & PCRE_IMS) != oldims && *ptr == ')')
+ {
+ *code++ = OP_OPT;
+ *code++ = oldims;
+ length += 2;
+ }
+
+ /* Retain the highest bracket number, in case resetting was used. */
+
+ cd->bracount = max_bracount;
+
+ /* Set values to pass back */
+
+ *codeptr = code;
+ *ptrptr = ptr;
+ *firstbyteptr = firstbyte;
+ *reqbyteptr = reqbyte;
+ if (lengthptr != NULL)
+ {
+ if (OFLOW_MAX - *lengthptr < length)
+ {
+ *errorcodeptr = ERR20;
+ return FALSE;
+ }
+ *lengthptr += length;
+ }
+ return TRUE;
+ }
+
+ /* Another branch follows. In the pre-compile phase, we can move the code
+ pointer back to where it was for the start of the first branch. (That is,
+ pretend that each branch is the only one.)
+
+ In the real compile phase, insert an ALT node. Its length field points back
+ to the previous branch while the bracket remains open. At the end the chain
+ is reversed. It's done like this so that the start of the bracket has a
+ zero offset until it is closed, making it possible to detect recursion. */
+
+ if (lengthptr != NULL)
+ {
+ code = *codeptr + 1 + LINK_SIZE + skipbytes;
+ length += 1 + LINK_SIZE;
+ }
+ else
+ {
+ *code = OP_ALT;
+ PUT(code, 1, code - last_branch);
+ bc.current = last_branch = code;
+ code += 1 + LINK_SIZE;
+ }
+
+ ptr++;
+ }
+/* Control never reaches here */
+}
+
+
+
+
+/*************************************************
+* Check for anchored expression *
+*************************************************/
+
+/* Try to find out if this is an anchored regular expression. Consider each
+alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
+all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
+it's anchored. However, if this is a multiline pattern, then only OP_SOD
+counts, since OP_CIRC can match in the middle.
+
+We can also consider a regex to be anchored if OP_SOM starts all its branches.
+This is the code for \G, which means "match at start of match position, taking
+into account the match offset".
+
+A branch is also implicitly anchored if it starts with .* and DOTALL is set,
+because that will try the rest of the pattern at all possible matching points,
+so there is no point trying again.
+
+The exception is when the .* appears inside capturing parentheses, and there is
+a subsequent back reference to those parentheses. We haven't enough information
+to catch that case precisely, so such a branch is treated as not anchored.
+
+A bitmap of capturing groups is used for this test: bit n stands for group n
+when n is less than 32, and bit 0 is shared by every higher-numbered group. The
+bits are built with an unsigned shift, because shifting a signed 1 left by 31
+places is undefined behaviour in C.
+
+Arguments:
+  code           points to start of expression (the bracket)
+  options        points to the options setting
+  bracket_map    a bitmap of which brackets we are inside while testing; this
+                   handles up to substring 31; after that we just have to take
+                   the less precise approach
+  backref_map    the back reference bitmap
+
+Returns:     TRUE if every alternative is anchored, otherwise FALSE
+*/
+
+static BOOL
+is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
+  unsigned int backref_map)
+{
+do {
+   const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
+     options, PCRE_MULTILINE, FALSE);
+   register int op = *scode;
+
+   /* Non-capturing brackets: every alternative inside must be anchored */
+
+   if (op == OP_BRA)
+     {
+     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
+     }
+
+   /* Capturing brackets: add this group's bit to the map before recursing */
+
+   else if (op == OP_CBRA)
+     {
+     int n = GET2(scode, 1+LINK_SIZE);
+     unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1u);
+     if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
+     }
+
+   /* Other brackets */
+
+   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
+     {
+     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
+     }
+
+   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
+   it isn't in brackets that are or may be referenced. */
+
+   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
+             op == OP_TYPEPOSSTAR))
+     {
+     if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
+       return FALSE;
+     }
+
+   /* Check for explicit anchoring; OP_CIRC counts only when not multiline */
+
+   else if (op != OP_SOD && op != OP_SOM &&
+            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
+     return FALSE;
+   code += GET(code, 1);
+   }
+while (*code == OP_ALT);   /* Loop for each alternative */
+return TRUE;
+}
+
+
+
+/*************************************************
+* Check for starting with ^ or .* *
+*************************************************/
+
+/* This is called to find out if every branch starts with ^ or .* so that
+"first char" processing can be done to speed things up in multiline
+matching and for non-DOTALL patterns that start with .* (which must start at
+the beginning or after \n). As in the case of is_anchored() (see above), we
+have to take account of back references to capturing brackets that contain .*
+because in that case we can't make the assumption.
+
+Arguments:
+  code           points to start of expression (the bracket)
+  bracket_map    a bitmap of which brackets we are inside while testing; bit n
+                   is group n for n < 32, and bit 0 is shared by all higher
+                   groups, for which we have to take the less precise approach
+  backref_map    the back reference bitmap
+
+Returns:     TRUE if every alternative starts with ^ or .*, otherwise FALSE
+*/
+
+static BOOL
+is_startline(const uschar *code, unsigned int bracket_map,
+  unsigned int backref_map)
+{
+do {
+   const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
+     NULL, 0, FALSE);
+   register int op = *scode;
+
+   /* Non-capturing brackets: every alternative inside must qualify */
+
+   if (op == OP_BRA)
+     {
+     if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
+     }
+
+   /* Capturing brackets: add this group's bit, using an unsigned shift */
+
+   else if (op == OP_CBRA)
+     {
+     int n = GET2(scode, 1+LINK_SIZE);
+     unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1u);
+     if (!is_startline(scode, new_map, backref_map)) return FALSE;
+     }
+
+   /* Other brackets */
+
+   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
+     { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
+
+   /* .* means "start at start or after \n" if it isn't in brackets that
+   may be referenced. */
+
+   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
+     {
+     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
+     }
+
+   /* Check for explicit circumflex */
+
+   else if (op != OP_CIRC) return FALSE;
+
+   /* Move on to the next alternative */
+
+   code += GET(code, 1);
+   }
+while (*code == OP_ALT);  /* Loop for each alternative */
+return TRUE;
+}
+
+
+
+/*************************************************
+* Check for asserted fixed first char *
+*************************************************/
+
+/* Forward assertions do not contribute a "first char" while the pattern is
+being compiled, since such a setting can clash with real literals that come
+after the assertion. When an unanchored pattern has therefore ended up with no
+first char, this scan looks for one that is asserted at the very start. Each
+alternative must begin with the same asserted character, or with a bracket
+whose own alternatives all do so (checked recursively); that character is then
+returned. In every other case the result is -1.
+
+Arguments:
+  code       points to start of expression (the bracket)
+  options    pointer to the options (used to check casing changes)
+  inassert   TRUE if in an assertion
+
+Returns:     -1 or the fixed first char (with REQ_CASELESS set if caseless)
+*/
+
+static int
+find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
+{
+int found = -1;
+for (;;)
+  {
+  const uschar *first =
+    first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
+  int op = *first;
+
+  /* A nested group of any of these kinds is scanned recursively. */
+
+  if (op == OP_BRA || op == OP_CBRA || op == OP_ASSERT || op == OP_ONCE ||
+      op == OP_COND)
+    {
+    int nested = find_firstassertedchar(first, options, op == OP_ASSERT);
+    if (nested < 0) return -1;
+    if (found < 0) found = nested;
+    else if (found != nested) return -1;
+    }
+
+  /* A literal character item counts only inside an assertion. OP_EXACT has
+  two extra bytes in front of the character, which are stepped over. */
+
+  else if (op == OP_EXACT || op == OP_CHAR || op == OP_CHARNC ||
+           op == OP_PLUS || op == OP_MINPLUS || op == OP_POSPLUS)
+    {
+    if (op == OP_EXACT) first += 2;
+    if (!inassert) return -1;
+    if (found >= 0)
+      {
+      if (found != first[1]) return -1;
+      }
+    else
+      {
+      found = first[1];
+      if ((*options & PCRE_CASELESS) != 0) found |= REQ_CASELESS;
+      }
+    }
+
+  /* Anything else cannot supply a first char. */
+  else return -1;
+
+  code += GET(code, 1);           /* On to the next alternative, if any */
+  if (*code != OP_ALT) break;
+  }
+return found;
+}
+
+
+
+/*************************************************
+* Compile a Regular Expression *
+*************************************************/
+
+/* These functions take a pattern string and return a pointer to a block of
+store holding its compiled form. pcre_compile() is the original entry point,
+which has no error code variable; it is kept for backwards compatibility and
+simply calls pcre_compile2() with a NULL errorcodeptr.
+
+Arguments:
+ pattern the regular expression
+ options various option bits
+ errorcodeptr pointer to error code variable (pcre_compile2() only);
+ can be NULL if you don't want a code value
+ errorptr pointer to pointer to error text; if NULL, NULL is returned
+ erroroffset ptr offset in pattern where error was detected
+ tables pointer to character tables or NULL
+
+Returns: pointer to compiled data block, or NULL on error,
+ with errorptr and erroroffset set
+*/
+
+PCRE_EXP_DEFN pcre *
+pcre_compile(const char *pattern, int options, const char **errorptr,
+ int *erroroffset, const unsigned char *tables)
+{
+return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
+}
+
+
+PCRE_EXP_DEFN pcre *
+pcre_compile2(const char *pattern, int options, int *errorcodeptr,
+ const char **errorptr, int *erroroffset, const unsigned char *tables)
+{
+real_pcre *re;
+int length = 1; /* For final END opcode */
+int firstbyte, reqbyte, newline;
+int errorcode = 0; /* Stays zero until an error is detected */
+int skipatstart = 0; /* Length of leading (*...) settings, skipped later */
+#ifdef SUPPORT_UTF8
+BOOL utf8;
+#endif
+size_t size;
+uschar *code;
+const uschar *codestart;
+const uschar *ptr;
+compile_data compile_block;
+compile_data *cd = &compile_block;
+
+/* This space is used for "compiling" into during the first phase, when we are
+computing the amount of memory that is needed. Compiled items are thrown away
+as soon as possible, so that a fairly large buffer should be sufficient for
+this purpose. The same space is used in the second phase for remembering where
+to fill in forward references to subpatterns. */
+
+uschar cworkspace[COMPILE_WORK_SIZE];
+
+/* Set this early so that early errors get offset 0. */
+
+ptr = (const uschar *)pattern;
+
+/* We can't pass back an error message if errorptr is NULL; I guess the best we
+can do is just return NULL, but we can set a code value if there is a code
+pointer. */
+
+if (errorptr == NULL)
+ {
+ if (errorcodeptr != NULL) *errorcodeptr = 99;
+ return NULL;
+ }
+
+*errorptr = NULL;
+if (errorcodeptr != NULL) *errorcodeptr = ERR0;
+
+/* However, we can give a message for this error */
+
+if (erroroffset == NULL)
+ {
+ errorcode = ERR16;
+ goto PCRE_EARLY_ERROR_RETURN2;
+ }
+
+*erroroffset = 0;
+
+/* Can't support UTF8 unless PCRE has been compiled to include the code. */
+
+#ifdef SUPPORT_UTF8
+utf8 = (options & PCRE_UTF8) != 0;
+if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
+ (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
+ {
+ errorcode = ERR44;
+ goto PCRE_EARLY_ERROR_RETURN2;
+ }
+#else
+if ((options & PCRE_UTF8) != 0)
+ {
+ errorcode = ERR32;
+ goto PCRE_EARLY_ERROR_RETURN;
+ }
+#endif
+
+if ((options & ~PUBLIC_OPTIONS) != 0)
+ {
+ errorcode = ERR17;
+ goto PCRE_EARLY_ERROR_RETURN;
+ }
+
+/* Set up pointers to the individual character tables */
+
+if (tables == NULL) tables = _pcre_default_tables;
+cd->lcc = tables + lcc_offset;
+cd->fcc = tables + fcc_offset;
+cd->cbits = tables + cbits_offset;
+cd->ctypes = tables + ctypes_offset;
+
+/* Check for global one-time settings at the start of the pattern, and remember
+the offset for later. */
+
+while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
+ {
+ int newnl = 0;
+ int newbsr = 0;
+
+ if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
+ { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
+ else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
+ { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
+ else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
+ { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
+ else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
+ { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
+ else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
+ { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
+
+ else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
+ { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
+ else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
+ { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
+
+ if (newnl != 0)
+ options = (options & ~PCRE_NEWLINE_BITS) | newnl;
+ else if (newbsr != 0)
+ options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
+ else break;
+ }
+
+/* Check validity of \R options. */
+
+switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
+ {
+ case 0:
+ case PCRE_BSR_ANYCRLF:
+ case PCRE_BSR_UNICODE:
+ break;
+ default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
+ }
+
+/* Handle different types of newline. The three bits give seven cases. The
+current code allows for fixed one- or two-byte sequences, plus "any" and
+"anycrlf". */
+
+switch (options & PCRE_NEWLINE_BITS)
+ {
+ case 0: newline = NEWLINE; break; /* Build-time default */
+ case PCRE_NEWLINE_CR: newline = '\r'; break;
+ case PCRE_NEWLINE_LF: newline = '\n'; break;
+ case PCRE_NEWLINE_CR+
+ PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
+ case PCRE_NEWLINE_ANY: newline = -1; break;
+ case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
+ default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
+ }
+
+if (newline == -2)
+ {
+ cd->nltype = NLTYPE_ANYCRLF;
+ }
+else if (newline < 0)
+ {
+ cd->nltype = NLTYPE_ANY;
+ }
+else
+ {
+ cd->nltype = NLTYPE_FIXED;
+ if (newline > 255)
+ {
+ cd->nllen = 2;
+ cd->nl[0] = (newline >> 8) & 255;
+ cd->nl[1] = newline & 255;
+ }
+ else
+ {
+ cd->nllen = 1;
+ cd->nl[0] = newline;
+ }
+ }
+
+/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
+references to help in deciding whether (.*) can be treated as anchored or not.
+*/
+
+cd->top_backref = 0;
+cd->backref_map = 0;
+
+/* Reflect pattern for debugging output */
+
+DPRINTF(("------------------------------------------------------------------\n"));
+DPRINTF(("%s\n", pattern));
+
+/* Pretend to compile the pattern while actually just accumulating the length
+of memory required. This behaviour is triggered by passing a non-NULL final
+argument to compile_regex(). We pass a block of workspace (cworkspace) for it
+to compile parts of the pattern into; the compiled code is discarded when it is
+no longer needed, so hopefully this workspace will never overflow, though there
+is a test for its doing so. */
+
+cd->bracount = cd->final_bracount = 0;
+cd->names_found = 0;
+cd->name_entry_size = 0;
+cd->name_table = NULL;
+cd->start_workspace = cworkspace;
+cd->start_code = cworkspace;
+cd->hwm = cworkspace;
+cd->start_pattern = (const uschar *)pattern;
+cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
+cd->req_varyopt = 0;
+cd->external_options = options;
+cd->external_flags = 0;
+
+/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
+don't need to look at the result of the function here. The initial options have
+been put into the cd block so that they can be changed if an option setting is
+found within the regex right at the beginning. Bringing initial option settings
+outside can help speed up starting point checks. */
+
+ptr += skipatstart;
+code = cworkspace;
+*code = OP_BRA;
+(void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
+ &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
+ &length);
+if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
+
+DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
+ (int)(cd->hwm - cworkspace))); /* Cast: %d needs an int, not a ptrdiff_t */
+
+if (length > MAX_PATTERN_SIZE)
+ {
+ errorcode = ERR20;
+ goto PCRE_EARLY_ERROR_RETURN;
+ }
+
+/* Compute the size of data block needed and get it, either from malloc or
+externally provided function. Integer overflow should no longer be possible
+because nowadays we limit the maximum value of cd->names_found and
+cd->name_entry_size. */
+
+size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
+re = (real_pcre *)(pcre_malloc)(size);
+
+if (re == NULL)
+ {
+ errorcode = ERR21;
+ goto PCRE_EARLY_ERROR_RETURN;
+ }
+
+/* Put in the magic number, and save the sizes, initial options, internal
+flags, and character table pointer. NULL is used for the default character
+tables. The nullpad field is at the end; it's there to help in the case when a
+regex compiled on a system with 4-byte pointers is run on another with 8-byte
+pointers. */
+
+re->magic_number = MAGIC_NUMBER;
+re->size = size;
+re->options = cd->external_options;
+re->flags = cd->external_flags;
+re->dummy1 = 0;
+re->first_byte = 0;
+re->req_byte = 0;
+re->name_table_offset = sizeof(real_pcre);
+re->name_entry_size = cd->name_entry_size;
+re->name_count = cd->names_found;
+re->ref_count = 0;
+re->tables = (tables == _pcre_default_tables)? NULL : tables;
+re->nullpad = NULL;
+
+/* The starting points of the name/number translation table and of the code are
+passed around in the compile data block. The start/end pattern and initial
+options are already set from the pre-compile phase, as is the name_entry_size
+field. Reset the bracket count and the names_found field. Also reset the hwm
+field; this time it's used for remembering forward references to subpatterns.
+*/
+
+cd->final_bracount = cd->bracount; /* Save for checking forward references */
+cd->bracount = 0;
+cd->names_found = 0;
+cd->name_table = (uschar *)re + re->name_table_offset;
+codestart = cd->name_table + re->name_entry_size * re->name_count;
+cd->start_code = codestart;
+cd->hwm = cworkspace;
+cd->req_varyopt = 0;
+cd->had_accept = FALSE;
+
+/* Set up a starting, non-extracting bracket, then compile the expression. On
+error, errorcode will be set non-zero, so we don't need to look at the result
+of the function here. */
+
+ptr = (const uschar *)pattern + skipatstart;
+code = (uschar *)codestart;
+*code = OP_BRA;
+(void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
+ &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
+re->top_bracket = cd->bracount;
+re->top_backref = cd->top_backref;
+re->flags = cd->external_flags;
+
+if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
+
+/* If not reached end of pattern on success, there's an excess bracket. */
+
+if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
+
+/* Fill in the terminating state and check for disastrous overflow, but
+if debugging, leave the test till after things are printed out. */
+
+*code++ = OP_END;
+
+#ifndef DEBUG
+if (code - codestart > length) errorcode = ERR23;
+#endif
+
+/* Fill in any forward references that are required. */
+
+while (errorcode == 0 && cd->hwm > cworkspace)
+ {
+ int offset, recno;
+ const uschar *groupptr;
+ cd->hwm -= LINK_SIZE;
+ offset = GET(cd->hwm, 0);
+ recno = GET(codestart, offset);
+ groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
+ if (groupptr == NULL) errorcode = ERR53;
+ else PUT(((uschar *)codestart), offset, groupptr - codestart);
+ }
+
+/* Give an error if there's back reference to a non-existent capturing
+subpattern. */
+
+if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
+
+/* Failed to compile, or error while post-processing */
+
+if (errorcode != 0)
+ {
+ (pcre_free)(re);
+ PCRE_EARLY_ERROR_RETURN:
+ *erroroffset = ptr - (const uschar *)pattern;
+ PCRE_EARLY_ERROR_RETURN2:
+ *errorptr = find_error_text(errorcode);
+ if (errorcodeptr != NULL) *errorcodeptr = errorcode;
+ return NULL;
+ }
+
+/* If the anchored option was not passed, set the flag if we can determine that
+the pattern is anchored by virtue of ^ characters or \A or anything else (such
+as starting with .* when DOTALL is set).
+
+Otherwise, if we know what the first byte has to be, save it, because that
+speeds up unanchored matches no end. If not, see if we can set the
+PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
+start with ^, and also when all branches start with .* for non-DOTALL matches.
+*/
+
+if ((re->options & PCRE_ANCHORED) == 0)
+ {
+ int temp_options = re->options; /* May get changed during these scans */
+ if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
+ re->options |= PCRE_ANCHORED;
+ else
+ {
+ if (firstbyte < 0)
+ firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
+ if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
+ {
+ int ch = firstbyte & 255;
+ re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
+ cd->fcc[ch] == ch)? ch : firstbyte;
+ re->flags |= PCRE_FIRSTSET;
+ }
+ else if (is_startline(codestart, 0, cd->backref_map))
+ re->flags |= PCRE_STARTLINE;
+ }
+ }
+
+/* For an anchored pattern, we use the "required byte" only if it follows a
+variable length item in the regex. Remove the caseless flag for non-caseable
+bytes. */
+
+if (reqbyte >= 0 &&
+ ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
+ {
+ int ch = reqbyte & 255;
+ re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
+ cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
+ re->flags |= PCRE_REQCHSET;
+ }
+
+/* Print out the compiled data if debugging is enabled. This is never the
+case when building a production library. */
+
+#ifdef DEBUG
+
+printf("Length = %d top_bracket = %d top_backref = %d\n",
+ length, re->top_bracket, re->top_backref);
+
+printf("Options=%08x\n", re->options);
+
+if ((re->flags & PCRE_FIRSTSET) != 0)
+ {
+ int ch = re->first_byte & 255;
+ const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
+ "" : " (caseless)";
+ if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
+ else printf("First char = \\x%02x%s\n", ch, caseless);
+ }
+
+if ((re->flags & PCRE_REQCHSET) != 0)
+ {
+ int ch = re->req_byte & 255;
+ const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
+ "" : " (caseless)";
+ if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
+ else printf("Req char = \\x%02x%s\n", ch, caseless);
+ }
+
+pcre_printint(re, stdout, TRUE);
+
+/* This check is done here in the debugging case so that the code that
+was compiled can be seen. */
+
+if (code - codestart > length)
+ {
+ (pcre_free)(re);
+ *errorptr = find_error_text(ERR23);
+ *erroroffset = ptr - (const uschar *)pattern;
+ if (errorcodeptr != NULL) *errorcodeptr = ERR23;
+ return NULL;
+ }
+#endif /* DEBUG */
+
+return (pcre *)re;
+}
+
+/* End of pcre_compile.c */
diff --git a/src/pcre_config.c b/src/pcre_config.c
new file mode 100644
index 0000000..454fed9
--- /dev/null
+++ b/src/pcre_config.c
@@ -0,0 +1,128 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2008 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This module contains the external function pcre_config(). */
+
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pcre_internal.h"
+
+
+/*************************************************
+* Return info about what features are configured *
+*************************************************/
+
+/* This function has an extensible interface so that additional items can be
+added compatibly.
+
+Arguments:
+ what what information is required
+ where where to put the information
+
+Returns: 0 if data returned, negative on error
+*/
+
+PCRE_EXP_DEFN int
+pcre_config(int what, void *where)
+{
+int rc = 0;                      /* Stays zero when "what" is recognized */
+switch (what)
+  {
+  /* Yes/no items: an int holding 1 or 0 is stored at "where". */
+  case PCRE_CONFIG_UTF8:
+#ifndef SUPPORT_UTF8
+  *((int *)where) = 0;
+#else
+  *((int *)where) = 1;
+#endif
+  break;
+
+  case PCRE_CONFIG_UNICODE_PROPERTIES:
+#ifndef SUPPORT_UCP
+  *((int *)where) = 0;
+#else
+  *((int *)where) = 1;
+#endif
+  break;
+
+  case PCRE_CONFIG_BSR:
+#ifndef BSR_ANYCRLF
+  *((int *)where) = 0;
+#else
+  *((int *)where) = 1;
+#endif
+  break;
+
+  case PCRE_CONFIG_STACKRECURSE:
+#ifndef NO_RECURSE
+  *((int *)where) = 1;
+#else
+  *((int *)where) = 0;
+#endif
+  break;
+
+  /* Numeric build-time settings stored as int. */
+  case PCRE_CONFIG_NEWLINE:
+  *((int *)where) = NEWLINE;
+  break;
+  case PCRE_CONFIG_LINK_SIZE:
+  *((int *)where) = LINK_SIZE;
+  break;
+  case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
+  *((int *)where) = POSIX_MALLOC_THRESHOLD;
+  break;
+
+  /* The two match limits are stored as unsigned int. */
+  case PCRE_CONFIG_MATCH_LIMIT:
+  *((unsigned int *)where) = MATCH_LIMIT;
+  break;
+  case PCRE_CONFIG_MATCH_LIMIT_RECURSION:
+  *((unsigned int *)where) = MATCH_LIMIT_RECURSION;
+  break;
+  default: rc = PCRE_ERROR_BADOPTION; break;    /* Unknown request */
+  }
+
+return rc;
+}
+
+/* End of pcre_config.c */
diff --git a/src/pcre_dfa_exec.c b/src/pcre_dfa_exec.c
new file mode 100644
index 0000000..01aad5d
--- /dev/null
+++ b/src/pcre_dfa_exec.c
@@ -0,0 +1,2899 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2008 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This module contains the external function pcre_dfa_exec(), which is an
+alternative matching function that uses a sort of DFA algorithm (not a true
+FSM). This is NOT Perl-compatible, but it has advantages in certain
+applications. */
+
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define NLBLOCK md /* Block containing newline information */
+#define PSSTART start_subject /* Field containing processed string start */
+#define PSEND end_subject /* Field containing processed string end */
+
+#include "pcre_internal.h"
+
+
+/* For use to indent debugging output */
+
+#define SP " "
+
+
+
+/*************************************************
+* Code parameters and static tables *
+*************************************************/
+
+/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
+into others, under special conditions. A gap of 20 between the blocks should be
+enough. The resulting opcodes don't have to be less than 256 because they are
+never stored, so we push them well clear of the normal opcodes. */
+
+#define OP_PROP_EXTRA 300
+#define OP_EXTUNI_EXTRA 320
+#define OP_ANYNL_EXTRA 340
+#define OP_HSPACE_EXTRA 360
+#define OP_VSPACE_EXTRA 380
+
+
+/* This table identifies those opcodes that are followed immediately by a
+character that is to be tested in some way. This makes it possible to
+centralize the loading of these characters. In the case of Type * etc, the
+"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
+small value. ***NOTE*** If the start of this table is modified, the two tables
+that follow must also be modified. */
+
+static const uschar coptable[] = {
+ 0, /* End */
+ 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
+ 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
+ 0, 0, 0, /* Any, AllAny, Anybyte */
+ 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
+ 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
+ 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
+ 1, /* Char */
+ 1, /* Charnc */
+ 1, /* not */
+ /* Positive single-char repeats */
+ 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
+ 3, 3, 3, /* upto, minupto, exact */
+ 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
+ /* Negative single-char repeats - only for chars < 256 */
+ 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
+ 3, 3, 3, /* NOT upto, minupto, exact */
+ 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
+ /* Positive type repeats */
+ 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
+ 3, 3, 3, /* Type upto, minupto, exact */
+ 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
+ /* Character class & ref repeats */
+ 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
+ 0, 0, /* CRRANGE, CRMINRANGE */
+ 0, /* CLASS */
+ 0, /* NCLASS */
+ 0, /* XCLASS - variable length */
+ 0, /* REF */
+ 0, /* RECURSE */
+ 0, /* CALLOUT */
+ 0, /* Alt */
+ 0, /* Ket */
+ 0, /* KetRmax */
+ 0, /* KetRmin */
+ 0, /* Assert */
+ 0, /* Assert not */
+ 0, /* Assert behind */
+ 0, /* Assert behind not */
+ 0, /* Reverse */
+ 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
+ 0, 0, 0, /* SBRA, SCBRA, SCOND */
+ 0, /* CREF */
+ 0, /* RREF */
+ 0, /* DEF */
+ 0, 0, /* BRAZERO, BRAMINZERO */
+ 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
+ 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
+};
+
+/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
+and \w */
+
+static const uschar toptable1[] = { /* Same leading layout as coptable */
+ 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
+ ctype_digit, ctype_digit, /* \D, \d */
+ ctype_space, ctype_space, /* \S, \s */
+ ctype_word, ctype_word, /* \W, \w */
+ 0, 0 /* OP_ANY, OP_ALLANY */
+};
+
+static const uschar toptable2[] = { /* Same leading layout as coptable */
+ 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
+ ctype_digit, 0, /* \D, \d */
+ ctype_space, 0, /* \S, \s */
+ ctype_word, 0, /* \W, \w */
+ 1, 1 /* OP_ANY, OP_ALLANY */
+};
+
+
+/* Structure for holding data about a particular state, which is in effect the
+current data for an active path through the match tree. It must consist
+entirely of ints because the working vector we are passed, and which we put
+these structures in, is a vector of ints. */
+
+typedef struct stateblock {
+ int offset; /* Offset to opcode; negative while the state is held off */
+ int count; /* Count for repeats */
+ int ims; /* ims flag bits */
+ int data; /* Extra data for some states, e.g. characters left to skip */
+} stateblock;
+
+#define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int)) /* Size in ints */
+
+
+#ifdef DEBUG
+/*************************************************
+* Print character string *
+*************************************************/
+
+/* Character string printing function for debugging.
+
+Arguments:
+ p points to string
+ length number of bytes
+ f where to print
+
+Returns: nothing
+*/
+
+static void
+pchars(unsigned char *p, int length, FILE *f)
+{
+int i;
+
+for (i = 0; i < length; i++)
+  {
+  int ch = p[i];
+  if (!isprint(ch)) fprintf(f, "\\x%02x", ch);   /* Non-printing: hex escape */
+  else fprintf(f, "%c", ch);
+  }
+}
+#endif
+
+
+
+/*************************************************
+* Execute a Regular Expression - DFA engine *
+*************************************************/
+
+/* This internal function applies a compiled pattern to a subject string,
+starting at a given point, using a DFA engine. This function is called from the
+external one, possibly multiple times if the pattern is not anchored. The
+function calls itself recursively for some kinds of subpattern.
+
+Arguments:
+ md the match_data block with fixed information
+ this_start_code the opening bracket of this subexpression's code
+ current_subject where we currently are in the subject string
+ start_offset start offset in the subject string
+ offsets vector to contain the matching string offsets
+ offsetcount size of same
+ workspace vector of workspace
+ wscount size of same
+ ims the current ims flags
+ rlevel function call recursion level
+ recursing regex recursive call level
+
+Returns: > 0 => number of match offset pairs placed in offsets
+ = 0 => offsets overflowed; longest matches are present
+ -1 => failed to match
+ < -1 => some kind of unexpected problem
+
+The following macros are used for adding states to the two state vectors (one
+for the current character, one for the following character). */
+
+#define ADD_ACTIVE(x,y) /* Append state (offset x, count y) to active list */ \
+ if (active_count++ < wscount) \
+ { \
+ next_active_state->offset = (x); \
+ next_active_state->count = (y); \
+ next_active_state->ims = ims; \
+ next_active_state++; \
+ DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
+ } \
+ else return PCRE_ERROR_DFA_WSSIZE /* No room for another state */
+
+#define ADD_ACTIVE_DATA(x,y,z) /* As ADD_ACTIVE, also storing z in data */ \
+ if (active_count++ < wscount) \
+ { \
+ next_active_state->offset = (x); \
+ next_active_state->count = (y); \
+ next_active_state->ims = ims; \
+ next_active_state->data = (z); \
+ next_active_state++; \
+ DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
+ } \
+ else return PCRE_ERROR_DFA_WSSIZE /* No room for another state */
+
+#define ADD_NEW(x,y) /* Append state (offset x, count y) to new list */ \
+ if (new_count++ < wscount) \
+ { \
+ next_new_state->offset = (x); \
+ next_new_state->count = (y); \
+ next_new_state->ims = ims; \
+ next_new_state++; \
+ DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
+ } \
+ else return PCRE_ERROR_DFA_WSSIZE /* No room for another state */
+
+#define ADD_NEW_DATA(x,y,z) /* As ADD_NEW, also storing z in data */ \
+ if (new_count++ < wscount) \
+ { \
+ next_new_state->offset = (x); \
+ next_new_state->count = (y); \
+ next_new_state->ims = ims; \
+ next_new_state->data = (z); \
+ next_new_state++; \
+ DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
+ } \
+ else return PCRE_ERROR_DFA_WSSIZE /* No room for another state */
+
+/* And now, here is the code */
+
+static int
+internal_dfa_exec(
+ dfa_match_data *md,
+ const uschar *this_start_code,
+ const uschar *current_subject,
+ int start_offset,
+ int *offsets,
+ int offsetcount,
+ int *workspace,
+ int wscount,
+ int ims,
+ int rlevel,
+ int recursing)
+{
+stateblock *active_states, *new_states, *temp_states;
+stateblock *next_active_state, *next_new_state;
+
+const uschar *ctypes, *lcc, *fcc;
+const uschar *ptr;
+const uschar *end_code, *first_op;
+
+int active_count, new_count, match_count;
+
+/* Some fields in the md block are frequently referenced, so we load them into
+independent variables in the hope that this will perform better. */
+
+const uschar *start_subject = md->start_subject;
+const uschar *end_subject = md->end_subject;
+const uschar *start_code = md->start_code;
+
+#ifdef SUPPORT_UTF8
+BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
+#else
+BOOL utf8 = FALSE;
+#endif
+
+rlevel++;
+offsetcount &= (-2);
+
+wscount -= 2;
+wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
+ (2 * INTS_PER_STATEBLOCK);
+
+DPRINTF(("\n%.*s---------------------\n"
+ "%.*sCall to internal_dfa_exec f=%d r=%d\n",
+ rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
+
+ctypes = md->tables + ctypes_offset;
+lcc = md->tables + lcc_offset;
+fcc = md->tables + fcc_offset;
+
+match_count = PCRE_ERROR_NOMATCH; /* A negative number */
+
+active_states = (stateblock *)(workspace + 2);
+next_new_state = new_states = active_states + wscount;
+new_count = 0;
+
+first_op = this_start_code + 1 + LINK_SIZE +
+ ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
+
+/* The first thing in any (sub) pattern is a bracket of some sort. Push all
+the alternative states onto the list, and find out where the end is. This
+makes it possible to use this function recursively, when we want to stop at a
+matching internal ket rather than at the end.
+
+If the first opcode in the first alternative is OP_REVERSE, we are dealing with
+a backward assertion. In that case, we have to find out the maximum amount to
+move back, and set up each alternative appropriately. */
+
+if (*first_op == OP_REVERSE)
+ {
+ int max_back = 0;
+ int gone_back;
+
+ end_code = this_start_code;
+ do
+ {
+ int back = GET(end_code, 2+LINK_SIZE);
+ if (back > max_back) max_back = back;
+ end_code += GET(end_code, 1);
+ }
+ while (*end_code == OP_ALT);
+
+ /* If we can't go back the amount required for the longest lookbehind
+ pattern, go back as far as we can; some alternatives may still be viable. */
+
+#ifdef SUPPORT_UTF8
+ /* In character mode we have to step back character by character */
+
+ if (utf8)
+ {
+ for (gone_back = 0; gone_back < max_back; gone_back++)
+ {
+ if (current_subject <= start_subject) break;
+ current_subject--;
+ while (current_subject > start_subject &&
+ (*current_subject & 0xc0) == 0x80)
+ current_subject--;
+ }
+ }
+ else
+#endif
+
+ /* In byte-mode we can do this quickly. */
+
+ {
+ gone_back = (current_subject - max_back < start_subject)?
+ current_subject - start_subject : max_back;
+ current_subject -= gone_back;
+ }
+
+ /* Now we can process the individual branches. */
+
+ end_code = this_start_code;
+ do
+ {
+ int back = GET(end_code, 2+LINK_SIZE);
+ if (back <= gone_back)
+ {
+ int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
+ ADD_NEW_DATA(-bstate, 0, gone_back - back);
+ }
+ end_code += GET(end_code, 1);
+ }
+ while (*end_code == OP_ALT);
+ }
+
+/* This is the code for a "normal" subpattern (not a backward assertion). The
+start of a whole pattern is always one of these. If we are at the top level,
+we may be asked to restart matching from the same point that we reached for a
+previous partial match. We still have to scan through the top-level branches to
+find the end state. */
+
+else
+ {
+ end_code = this_start_code;
+
+ /* Restarting */
+
+ if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
+ {
+ do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
+ new_count = workspace[1];
+ if (!workspace[0])
+ memcpy(new_states, active_states, new_count * sizeof(stateblock));
+ }
+
+ /* Not restarting */
+
+ else
+ {
+ int length = 1 + LINK_SIZE +
+ ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
+ do
+ {
+ ADD_NEW(end_code - start_code + length, 0);
+ end_code += GET(end_code, 1);
+ length = 1 + LINK_SIZE;
+ }
+ while (*end_code == OP_ALT);
+ }
+ }
+
+workspace[0] = 0; /* Bit indicating which vector is current */
+
+DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
+
+/* Loop for scanning the subject */
+
+ptr = current_subject;
+for (;;)
+ {
+ int i, j;
+ int clen, dlen;
+ unsigned int c, d;
+
+ /* Make the new state list into the active state list and empty the
+ new state list. */
+
+ temp_states = active_states;
+ active_states = new_states;
+ new_states = temp_states;
+ active_count = new_count;
+ new_count = 0;
+
+ workspace[0] ^= 1; /* Remember for the restarting feature */
+ workspace[1] = active_count;
+
+#ifdef DEBUG
+ printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
+ pchars((uschar *)ptr, strlen((char *)ptr), stdout);
+ printf("\"\n");
+
+ printf("%.*sActive states: ", rlevel*2-2, SP);
+ for (i = 0; i < active_count; i++)
+ printf("%d/%d ", active_states[i].offset, active_states[i].count);
+ printf("\n");
+#endif
+
+ /* Set the pointers for adding new states */
+
+ next_active_state = active_states + active_count;
+ next_new_state = new_states;
+
+ /* Load the current character from the subject outside the loop, as many
+ different states may want to look at it, and we assume that at least one
+ will. */
+
+ if (ptr < end_subject)
+ {
+ clen = 1; /* Number of bytes in the character */
+#ifdef SUPPORT_UTF8
+ if (utf8) { GETCHARLEN(c, ptr, clen); } else
+#endif /* SUPPORT_UTF8 */
+ c = *ptr;
+ }
+ else
+ {
+ clen = 0; /* This indicates the end of the subject */
+ c = NOTACHAR; /* This value should never actually be used */
+ }
+
+ /* Scan up the active states and act on each one. The result of an action
+ may be to add more states to the currently active list (e.g. on hitting a
+ parenthesis) or it may be to put states on the new list, for considering
+ when we move the character pointer on. */
+
+ for (i = 0; i < active_count; i++)
+ {
+ stateblock *current_state = active_states + i;
+ const uschar *code;
+ int state_offset = current_state->offset;
+ int count, codevalue;
+#ifdef SUPPORT_UCP
+ int chartype, script;
+#endif
+
+#ifdef DEBUG
+ printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
+ if (clen == 0) printf("EOL\n");
+ else if (c > 32 && c < 127) printf("'%c'\n", c);
+ else printf("0x%02x\n", c);
+#endif
+
+ /* This variable is referred to implicitly in the ADD_xxx macros. */
+
+ ims = current_state->ims;
+
+ /* A negative offset is a special case meaning "hold off going to this
+ (negated) state until the number of characters in the data field have
+ been skipped". */
+
+ if (state_offset < 0)
+ {
+ if (current_state->data > 0)
+ {
+ DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
+ ADD_NEW_DATA(state_offset, current_state->count,
+ current_state->data - 1);
+ continue;
+ }
+ else
+ {
+ current_state->offset = state_offset = -state_offset;
+ }
+ }
+
+ /* Check for a duplicate state with the same count, and skip if found. */
+
+ for (j = 0; j < i; j++)
+ {
+ if (active_states[j].offset == state_offset &&
+ active_states[j].count == current_state->count)
+ {
+ DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
+ goto NEXT_ACTIVE_STATE;
+ }
+ }
+
+ /* The state offset is the offset to the opcode */
+
+ code = start_code + state_offset;
+ codevalue = *code;
+
+ /* If this opcode is followed by an inline character, load it. It is
+ tempting to test for the presence of a subject character here, but that
+ is wrong, because sometimes zero repetitions of the subject are
+ permitted.
+
+ We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
+ argument that is not a data character - but is always one byte long. We
+ have to take special action to deal with \P, \p, \H, \h, \V, \v, \R and \X in
+ this case. To keep the other cases fast, convert these ones to new opcodes.
+ */
+
+ if (coptable[codevalue] > 0)
+ {
+ dlen = 1;
+#ifdef SUPPORT_UTF8
+ if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
+#endif /* SUPPORT_UTF8 */
+ d = code[coptable[codevalue]];
+ if (codevalue >= OP_TYPESTAR)
+ {
+ switch(d)
+ {
+ case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
+ case OP_NOTPROP:
+ case OP_PROP: codevalue += OP_PROP_EXTRA; break;
+ case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
+ case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
+ case OP_NOT_HSPACE:
+ case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
+ case OP_NOT_VSPACE:
+ case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
+ default: break;
+ }
+ }
+ }
+ else
+ {
+ dlen = 0; /* Not strictly necessary, but compilers moan */
+ d = NOTACHAR; /* if these variables are not set. */
+ }
+
+
+ /* Now process the individual opcodes */
+
+ switch (codevalue)
+ {
+
+/* ========================================================================== */
+ /* Reached a closing bracket. If not at the end of the pattern, carry
+ on with the next opcode. Otherwise, unless we have an empty string and
+ PCRE_NOTEMPTY is set, save the match data, shifting up all previous
+ matches so we always have the longest first. */
+
+ case OP_KET:
+ case OP_KETRMIN:
+ case OP_KETRMAX:
+ if (code != end_code)
+ {
+ ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
+ if (codevalue != OP_KET)
+ {
+ ADD_ACTIVE(state_offset - GET(code, 1), 0);
+ }
+ }
+ else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
+ {
+ if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
+ else if (match_count > 0 && ++match_count * 2 >= offsetcount)
+ match_count = 0;
+ count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
+ if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
+ if (offsetcount >= 2)
+ {
+ offsets[0] = current_subject - start_subject;
+ offsets[1] = ptr - start_subject;
+ DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
+ offsets[1] - offsets[0], current_subject));
+ }
+ if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
+ {
+ DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
+ "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
+ match_count, rlevel*2-2, SP));
+ return match_count;
+ }
+ }
+ break;
+
+/* ========================================================================== */
+ /* These opcodes add to the current list of states without looking
+ at the current character. */
+
+ /*-----------------------------------------------------------------*/
+ case OP_ALT:
+ do { code += GET(code, 1); } while (*code == OP_ALT);
+ ADD_ACTIVE(code - start_code, 0);
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_BRA:
+ case OP_SBRA:
+ do
+ {
+ ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+ code += GET(code, 1);
+ }
+ while (*code == OP_ALT);
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_CBRA:
+ case OP_SCBRA:
+ ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
+ code += GET(code, 1);
+ while (*code == OP_ALT)
+ {
+ ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+ code += GET(code, 1);
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_BRAZERO:
+ case OP_BRAMINZERO:
+ ADD_ACTIVE(state_offset + 1, 0);
+ code += 1 + GET(code, 2);
+ while (*code == OP_ALT) code += GET(code, 1);
+ ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_SKIPZERO:
+ code += 1 + GET(code, 2);
+ while (*code == OP_ALT) code += GET(code, 1);
+ ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_CIRC:
+ if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
+ ((ims & PCRE_MULTILINE) != 0 &&
+ ptr != end_subject &&
+ WAS_NEWLINE(ptr)))
+ { ADD_ACTIVE(state_offset + 1, 0); }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_EOD:
+ if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_OPT:
+ ims = code[1];
+ ADD_ACTIVE(state_offset + 2, 0);
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_SOD:
+ if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_SOM:
+ if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
+ break;
+
+
+/* ========================================================================== */
+ /* These opcodes inspect the next subject character, and sometimes
+ the previous one as well, but do not have an argument. The variable
+ clen contains the length of the current character and is zero if we are
+ at the end of the subject. */
+
+ /*-----------------------------------------------------------------*/
+ case OP_ANY:
+ if (clen > 0 && !IS_NEWLINE(ptr))
+ { ADD_NEW(state_offset + 1, 0); }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_ALLANY:
+ if (clen > 0)
+ { ADD_NEW(state_offset + 1, 0); }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_EODN:
+ if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
+ { ADD_ACTIVE(state_offset + 1, 0); }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_DOLL:
+ if ((md->moptions & PCRE_NOTEOL) == 0)
+ {
+ if (clen == 0 ||
+ (IS_NEWLINE(ptr) &&
+ ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
+ ))
+ { ADD_ACTIVE(state_offset + 1, 0); }
+ }
+ else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
+ { ADD_ACTIVE(state_offset + 1, 0); }
+ break;
+
+ /*-----------------------------------------------------------------*/
+
+ case OP_DIGIT:
+ case OP_WHITESPACE:
+ case OP_WORDCHAR:
+ if (clen > 0 && c < 256 &&
+ ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
+ { ADD_NEW(state_offset + 1, 0); }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_NOT_DIGIT:
+ case OP_NOT_WHITESPACE:
+ case OP_NOT_WORDCHAR:
+ if (clen > 0 && (c >= 256 ||
+ ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
+ { ADD_NEW(state_offset + 1, 0); }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_WORD_BOUNDARY:
+ case OP_NOT_WORD_BOUNDARY:
+ {
+ int left_word, right_word;
+
+ if (ptr > start_subject)
+ {
+ const uschar *temp = ptr - 1;
+#ifdef SUPPORT_UTF8
+ if (utf8) BACKCHAR(temp);
+#endif
+ GETCHARTEST(d, temp);
+ left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
+ }
+ else left_word = 0;
+
+ if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
+ else right_word = 0;
+
+ if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
+ { ADD_ACTIVE(state_offset + 1, 0); }
+ }
+ break;
+
+
+ /*-----------------------------------------------------------------*/
+ /* Check the next character by Unicode property. We will get here only
+ if the support is in the binary; otherwise a compile-time error occurs.
+ */
+
+#ifdef SUPPORT_UCP
+ case OP_PROP:
+ case OP_NOTPROP:
+ if (clen > 0)
+ {
+ BOOL OK;
+ int category = _pcre_ucp_findprop(c, &chartype, &script);
+ switch(code[1])
+ {
+ case PT_ANY:
+ OK = TRUE;
+ break;
+
+ case PT_LAMP:
+ OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+ break;
+
+ case PT_GC:
+ OK = category == code[2];
+ break;
+
+ case PT_PC:
+ OK = chartype == code[2];
+ break;
+
+ case PT_SC:
+ OK = script == code[2];
+ break;
+
+ /* Should never occur, but keep compilers from grumbling. */
+
+ default:
+ OK = codevalue != OP_PROP;
+ break;
+ }
+
+ if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
+ }
+ break;
+#endif
+
+
+
+/* ========================================================================== */
+ /* These opcodes likewise inspect the subject character, but have an
+ argument that is not a data character. It is one of these opcodes:
+ OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
+ OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
+
+ case OP_TYPEPLUS:
+ case OP_TYPEMINPLUS:
+ case OP_TYPEPOSPLUS:
+ count = current_state->count; /* Already matched */
+ if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
+ if (clen > 0)
+ {
+ if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
+ (c < 256 &&
+ (d != OP_ANY || !IS_NEWLINE(ptr)) &&
+ ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
+ {
+ if (count > 0 && codevalue == OP_TYPEPOSPLUS)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ count++;
+ ADD_NEW(state_offset, count);
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_TYPEQUERY:
+ case OP_TYPEMINQUERY:
+ case OP_TYPEPOSQUERY:
+ ADD_ACTIVE(state_offset + 2, 0);
+ if (clen > 0)
+ {
+ if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
+ (c < 256 &&
+ (d != OP_ANY || !IS_NEWLINE(ptr)) &&
+ ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
+ {
+ if (codevalue == OP_TYPEPOSQUERY)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ ADD_NEW(state_offset + 2, 0);
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_TYPESTAR:
+ case OP_TYPEMINSTAR:
+ case OP_TYPEPOSSTAR:
+ ADD_ACTIVE(state_offset + 2, 0);
+ if (clen > 0)
+ {
+ if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
+ (c < 256 &&
+ (d != OP_ANY || !IS_NEWLINE(ptr)) &&
+ ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
+ {
+ if (codevalue == OP_TYPEPOSSTAR)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ ADD_NEW(state_offset, 0);
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_TYPEEXACT:
+ count = current_state->count; /* Number already matched */
+ if (clen > 0)
+ {
+ if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
+ (c < 256 &&
+ (d != OP_ANY || !IS_NEWLINE(ptr)) &&
+ ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
+ {
+ if (++count >= GET2(code, 1))
+ { ADD_NEW(state_offset + 4, 0); }
+ else
+ { ADD_NEW(state_offset, count); }
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_TYPEUPTO:
+ case OP_TYPEMINUPTO:
+ case OP_TYPEPOSUPTO:
+ ADD_ACTIVE(state_offset + 4, 0);
+ count = current_state->count; /* Number already matched */
+ if (clen > 0)
+ {
+ if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
+ (c < 256 &&
+ (d != OP_ANY || !IS_NEWLINE(ptr)) &&
+ ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
+ {
+ if (codevalue == OP_TYPEPOSUPTO)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ if (++count >= GET2(code, 1))
+ { ADD_NEW(state_offset + 4, 0); }
+ else
+ { ADD_NEW(state_offset, count); }
+ }
+ }
+ break;
+
+/* ========================================================================== */
+ /* These are virtual opcodes that are used when something like
+ OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE,
+ OP_NOT_HSPACE, OP_VSPACE, or OP_NOT_VSPACE as its argument. This keeps the
+ code above fast for the other cases. The argument is in the d variable. */
+
+#ifdef SUPPORT_UCP
+ case OP_PROP_EXTRA + OP_TYPEPLUS:
+ case OP_PROP_EXTRA + OP_TYPEMINPLUS:
+ case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
+ count = current_state->count; /* Already matched */
+ if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
+ if (clen > 0)
+ {
+ BOOL OK;
+ int category = _pcre_ucp_findprop(c, &chartype, &script);
+ switch(code[2])
+ {
+ case PT_ANY:
+ OK = TRUE;
+ break;
+
+ case PT_LAMP:
+ OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+ break;
+
+ case PT_GC:
+ OK = category == code[3];
+ break;
+
+ case PT_PC:
+ OK = chartype == code[3];
+ break;
+
+ case PT_SC:
+ OK = script == code[3];
+ break;
+
+ /* Should never occur, but keep compilers from grumbling. */
+
+ default:
+ OK = codevalue != OP_PROP;
+ break;
+ }
+
+ if (OK == (d == OP_PROP))
+ {
+ if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ count++;
+ ADD_NEW(state_offset, count);
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
+ case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
+ case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
+ count = current_state->count; /* Already matched */
+ if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
+ if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
+ {
+ const uschar *nptr = ptr + clen;
+ int ncount = 0;
+ if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ while (nptr < end_subject)
+ {
+ int nd;
+ int ndlen = 1;
+ GETCHARLEN(nd, nptr, ndlen);
+ if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
+ ncount++;
+ nptr += ndlen;
+ }
+ count++;
+ ADD_NEW_DATA(-state_offset, count, ncount);
+ }
+ break;
+#endif
+
+ /*-----------------------------------------------------------------*/
+ case OP_ANYNL_EXTRA + OP_TYPEPLUS:
+ case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
+ case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
+ count = current_state->count; /* Already matched */
+ if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
+ if (clen > 0)
+ {
+ int ncount = 0;
+ switch (c)
+ {
+ case 0x000b:
+ case 0x000c:
+ case 0x0085:
+ case 0x2028:
+ case 0x2029:
+ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
+ goto ANYNL01;
+
+ case 0x000d:
+ if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
+ /* Fall through */
+
+ ANYNL01:
+ case 0x000a:
+ if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ count++;
+ ADD_NEW_DATA(-state_offset, count, ncount);
+ break;
+
+ default:
+ break;
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_VSPACE_EXTRA + OP_TYPEPLUS:
+ case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
+ case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
+ count = current_state->count; /* Already matched */
+ if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
+ if (clen > 0)
+ {
+ BOOL OK;
+ switch (c)
+ {
+ case 0x000a:
+ case 0x000b:
+ case 0x000c:
+ case 0x000d:
+ case 0x0085:
+ case 0x2028:
+ case 0x2029:
+ OK = TRUE;
+ break;
+
+ default:
+ OK = FALSE;
+ break;
+ }
+
+ if (OK == (d == OP_VSPACE))
+ {
+ if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ count++;
+ ADD_NEW_DATA(-state_offset, count, 0);
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_HSPACE_EXTRA + OP_TYPEPLUS:
+ case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
+ case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
+ count = current_state->count; /* Already matched */
+ if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
+ if (clen > 0)
+ {
+ BOOL OK;
+ switch (c)
+ {
+ case 0x09: /* HT */
+ case 0x20: /* SPACE */
+ case 0xa0: /* NBSP */
+ case 0x1680: /* OGHAM SPACE MARK */
+ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
+ case 0x2000: /* EN QUAD */
+ case 0x2001: /* EM QUAD */
+ case 0x2002: /* EN SPACE */
+ case 0x2003: /* EM SPACE */
+ case 0x2004: /* THREE-PER-EM SPACE */
+ case 0x2005: /* FOUR-PER-EM SPACE */
+ case 0x2006: /* SIX-PER-EM SPACE */
+ case 0x2007: /* FIGURE SPACE */
+ case 0x2008: /* PUNCTUATION SPACE */
+ case 0x2009: /* THIN SPACE */
+ case 0x200A: /* HAIR SPACE */
+ case 0x202f: /* NARROW NO-BREAK SPACE */
+ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
+ case 0x3000: /* IDEOGRAPHIC SPACE */
+ OK = TRUE;
+ break;
+
+ default:
+ OK = FALSE;
+ break;
+ }
+
+ if (OK == (d == OP_HSPACE))
+ {
+ if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ count++;
+ ADD_NEW_DATA(-state_offset, count, 0);
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+#ifdef SUPPORT_UCP
+ case OP_PROP_EXTRA + OP_TYPEQUERY:
+ case OP_PROP_EXTRA + OP_TYPEMINQUERY:
+ case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
+ count = 4;
+ goto QS1;
+
+ case OP_PROP_EXTRA + OP_TYPESTAR:
+ case OP_PROP_EXTRA + OP_TYPEMINSTAR:
+ case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
+ count = 0;
+
+ QS1:
+
+ ADD_ACTIVE(state_offset + 4, 0);
+ if (clen > 0)
+ {
+ BOOL OK;
+ int category = _pcre_ucp_findprop(c, &chartype, &script);
+ switch(code[2])
+ {
+ case PT_ANY:
+ OK = TRUE;
+ break;
+
+ case PT_LAMP:
+ OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+ break;
+
+ case PT_GC:
+ OK = category == code[3];
+ break;
+
+ case PT_PC:
+ OK = chartype == code[3];
+ break;
+
+ case PT_SC:
+ OK = script == code[3];
+ break;
+
+ /* Should never occur, but keep compilers from grumbling. */
+
+ default:
+ OK = codevalue != OP_PROP;
+ break;
+ }
+
+ if (OK == (d == OP_PROP))
+ {
+ if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
+ codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ ADD_NEW(state_offset + count, 0);
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
+ case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
+ case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
+ count = 2;
+ goto QS2;
+
+ case OP_EXTUNI_EXTRA + OP_TYPESTAR:
+ case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
+ case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
+ count = 0;
+
+ QS2:
+
+ ADD_ACTIVE(state_offset + 2, 0);
+ if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
+ {
+ const uschar *nptr = ptr + clen;
+ int ncount = 0;
+ if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
+ codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ while (nptr < end_subject)
+ {
+ int nd;
+ int ndlen = 1;
+ GETCHARLEN(nd, nptr, ndlen);
+ if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
+ ncount++;
+ nptr += ndlen;
+ }
+ ADD_NEW_DATA(-(state_offset + count), 0, ncount);
+ }
+ break;
+#endif
+
+ /*-----------------------------------------------------------------*/
+ case OP_ANYNL_EXTRA + OP_TYPEQUERY:
+ case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
+ case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
+ count = 2;
+ goto QS3;
+
+ case OP_ANYNL_EXTRA + OP_TYPESTAR:
+ case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
+ case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
+ count = 0;
+
+ QS3:
+ ADD_ACTIVE(state_offset + 2, 0);
+ if (clen > 0)
+ {
+ int ncount = 0;
+ switch (c)
+ {
+ case 0x000b:
+ case 0x000c:
+ case 0x0085:
+ case 0x2028:
+ case 0x2029:
+ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
+ goto ANYNL02;
+
+ case 0x000d:
+ if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
+ /* Fall through */
+
+ ANYNL02:
+ case 0x000a:
+ if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
+ codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ ADD_NEW_DATA(-(state_offset + count), 0, ncount);
+ break;
+
+ default:
+ break;
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_VSPACE_EXTRA + OP_TYPEQUERY:
+ case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
+ case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
+ count = 2;
+ goto QS4;
+
+ case OP_VSPACE_EXTRA + OP_TYPESTAR:
+ case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
+ case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
+ count = 0;
+
+ QS4:
+ ADD_ACTIVE(state_offset + 2, 0);
+ if (clen > 0)
+ {
+ BOOL OK;
+ switch (c)
+ {
+ case 0x000a:
+ case 0x000b:
+ case 0x000c:
+ case 0x000d:
+ case 0x0085:
+ case 0x2028:
+ case 0x2029:
+ OK = TRUE;
+ break;
+
+ default:
+ OK = FALSE;
+ break;
+ }
+ if (OK == (d == OP_VSPACE))
+ {
+ if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
+ codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ ADD_NEW_DATA(-(state_offset + count), 0, 0);
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_HSPACE_EXTRA + OP_TYPEQUERY:
+ case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
+ case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
+ count = 2;
+ goto QS5;
+
+ case OP_HSPACE_EXTRA + OP_TYPESTAR:
+ case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
+ case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
+ count = 0;
+
+ QS5:
+ ADD_ACTIVE(state_offset + 2, 0);
+ if (clen > 0)
+ {
+ BOOL OK;
+ switch (c)
+ {
+ case 0x09: /* HT */
+ case 0x20: /* SPACE */
+ case 0xa0: /* NBSP */
+ case 0x1680: /* OGHAM SPACE MARK */
+ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
+ case 0x2000: /* EN QUAD */
+ case 0x2001: /* EM QUAD */
+ case 0x2002: /* EN SPACE */
+ case 0x2003: /* EM SPACE */
+ case 0x2004: /* THREE-PER-EM SPACE */
+ case 0x2005: /* FOUR-PER-EM SPACE */
+ case 0x2006: /* SIX-PER-EM SPACE */
+ case 0x2007: /* FIGURE SPACE */
+ case 0x2008: /* PUNCTUATION SPACE */
+ case 0x2009: /* THIN SPACE */
+ case 0x200A: /* HAIR SPACE */
+ case 0x202f: /* NARROW NO-BREAK SPACE */
+ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
+ case 0x3000: /* IDEOGRAPHIC SPACE */
+ OK = TRUE;
+ break;
+
+ default:
+ OK = FALSE;
+ break;
+ }
+
+ if (OK == (d == OP_HSPACE))
+ {
+ if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
+ codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ ADD_NEW_DATA(-(state_offset + count), 0, 0);
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+#ifdef SUPPORT_UCP
+ case OP_PROP_EXTRA + OP_TYPEEXACT:
+ case OP_PROP_EXTRA + OP_TYPEUPTO:
+ case OP_PROP_EXTRA + OP_TYPEMINUPTO:
+ case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
+ if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
+ { ADD_ACTIVE(state_offset + 6, 0); }
+ count = current_state->count; /* Number already matched */
+ if (clen > 0)
+ {
+ BOOL OK;
+ int category = _pcre_ucp_findprop(c, &chartype, &script);
+ switch(code[4])
+ {
+ case PT_ANY:
+ OK = TRUE;
+ break;
+
+ case PT_LAMP:
+ OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+ break;
+
+ case PT_GC:
+ OK = category == code[5];
+ break;
+
+ case PT_PC:
+ OK = chartype == code[5];
+ break;
+
+ case PT_SC:
+ OK = script == code[5];
+ break;
+
+ /* Should never occur, but keep compilers from grumbling. */
+
+ default:
+ OK = codevalue != OP_PROP;
+ break;
+ }
+
+ if (OK == (d == OP_PROP))
+ {
+ if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ if (++count >= GET2(code, 1))
+ { ADD_NEW(state_offset + 6, 0); }
+ else
+ { ADD_NEW(state_offset, count); }
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
+ case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
+ case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
+ case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
+ if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
+ { ADD_ACTIVE(state_offset + 4, 0); }
+ count = current_state->count; /* Number already matched */
+ if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
+ {
+ const uschar *nptr = ptr + clen;
+ int ncount = 0;
+ if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ while (nptr < end_subject)
+ {
+ int nd;
+ int ndlen = 1;
+ GETCHARLEN(nd, nptr, ndlen);
+ if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
+ ncount++;
+ nptr += ndlen;
+ }
+ if (++count >= GET2(code, 1))
+ { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
+ else
+ { ADD_NEW_DATA(-state_offset, count, ncount); }
+ }
+ break;
+#endif
+
+ /*-----------------------------------------------------------------*/
+ case OP_ANYNL_EXTRA + OP_TYPEEXACT:
+ case OP_ANYNL_EXTRA + OP_TYPEUPTO:
+ case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
+ case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
+ if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
+ { ADD_ACTIVE(state_offset + 4, 0); }
+ count = current_state->count; /* Number already matched */
+ if (clen > 0)
+ {
+ int ncount = 0;
+ switch (c)
+ {
+ case 0x000b:
+ case 0x000c:
+ case 0x0085:
+ case 0x2028:
+ case 0x2029:
+ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
+ goto ANYNL03;
+
+ case 0x000d:
+ if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
+ /* Fall through */
+
+ ANYNL03:
+ case 0x000a:
+ if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ if (++count >= GET2(code, 1))
+ { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
+ else
+ { ADD_NEW_DATA(-state_offset, count, ncount); }
+ break;
+
+ default:
+ break;
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_VSPACE_EXTRA + OP_TYPEEXACT:
+ case OP_VSPACE_EXTRA + OP_TYPEUPTO:
+ case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
+ case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
+ if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
+ { ADD_ACTIVE(state_offset + 4, 0); }
+ count = current_state->count; /* Number already matched */
+ if (clen > 0)
+ {
+ BOOL OK;
+ switch (c)
+ {
+ case 0x000a:
+ case 0x000b:
+ case 0x000c:
+ case 0x000d:
+ case 0x0085:
+ case 0x2028:
+ case 0x2029:
+ OK = TRUE;
+ break;
+
+ default:
+ OK = FALSE;
+ }
+
+ if (OK == (d == OP_VSPACE))
+ {
+ if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ if (++count >= GET2(code, 1))
+ { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
+ else
+ { ADD_NEW_DATA(-state_offset, count, 0); }
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_HSPACE_EXTRA + OP_TYPEEXACT:
+ case OP_HSPACE_EXTRA + OP_TYPEUPTO:
+ case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
+ case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
+ if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
+ { ADD_ACTIVE(state_offset + 4, 0); }
+ count = current_state->count; /* Number already matched */
+ if (clen > 0)
+ {
+ BOOL OK;
+ switch (c)
+ {
+ case 0x09: /* HT */
+ case 0x20: /* SPACE */
+ case 0xa0: /* NBSP */
+ case 0x1680: /* OGHAM SPACE MARK */
+ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
+ case 0x2000: /* EN QUAD */
+ case 0x2001: /* EM QUAD */
+ case 0x2002: /* EN SPACE */
+ case 0x2003: /* EM SPACE */
+ case 0x2004: /* THREE-PER-EM SPACE */
+ case 0x2005: /* FOUR-PER-EM SPACE */
+ case 0x2006: /* SIX-PER-EM SPACE */
+ case 0x2007: /* FIGURE SPACE */
+ case 0x2008: /* PUNCTUATION SPACE */
+ case 0x2009: /* THIN SPACE */
+ case 0x200A: /* HAIR SPACE */
+ case 0x202f: /* NARROW NO-BREAK SPACE */
+ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
+ case 0x3000: /* IDEOGRAPHIC SPACE */
+ OK = TRUE;
+ break;
+
+ default:
+ OK = FALSE;
+ break;
+ }
+
+ if (OK == (d == OP_HSPACE))
+ {
+ if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ if (++count >= GET2(code, 1))
+ { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
+ else
+ { ADD_NEW_DATA(-state_offset, count, 0); }
+ }
+ }
+ break;
+
+/* ========================================================================== */
+ /* These opcodes are followed by a character that is usually compared
+ to the current subject character; it is loaded into d. We still get
+ here even if there is no subject character, because in some cases zero
+ repetitions are permitted. */
+
+ /*-----------------------------------------------------------------*/
+ case OP_CHAR:
+ if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_CHARNC:
+ if (clen == 0) break;
+
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
+ {
+ unsigned int othercase;
+ if (c < 128) othercase = fcc[c]; else
+
+ /* If we have Unicode property support, we can use it to test the
+ other case of the character. */
+
+#ifdef SUPPORT_UCP
+ othercase = _pcre_ucp_othercase(c);
+#else
+ othercase = NOTACHAR;
+#endif
+
+ if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
+ }
+ }
+ else
+#endif /* SUPPORT_UTF8 */
+
+ /* Non-UTF-8 mode */
+ {
+ if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
+ }
+ break;
+
+
+#ifdef SUPPORT_UCP
+ /*-----------------------------------------------------------------*/
+ /* This is a tricky one because it can match more than one character.
+ Find out how many characters to skip, and then set up a negative state
+ to wait for them to pass before continuing. */
+
+ case OP_EXTUNI:
+ if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
+ {
+ const uschar *nptr = ptr + clen;
+ int ncount = 0;
+ while (nptr < end_subject)
+ {
+ int nclen = 1;
+ GETCHARLEN(c, nptr, nclen);
+ if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
+ ncount++;
+ nptr += nclen;
+ }
+ ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
+ }
+ break;
+#endif
+
+ /*-----------------------------------------------------------------*/
+ /* This is tricky like EXTUNI because it too can match more than one
+ character (when CR is followed by LF). In this case, set up a negative
+ state to wait for one character to pass before continuing. */
+
+ case OP_ANYNL:
+ if (clen > 0) switch(c)
+ {
+ case 0x000b:
+ case 0x000c:
+ case 0x0085:
+ case 0x2028:
+ case 0x2029:
+ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
+
+ case 0x000a:
+ ADD_NEW(state_offset + 1, 0);
+ break;
+
+ case 0x000d:
+ if (ptr + 1 < end_subject && ptr[1] == 0x0a)
+ {
+ ADD_NEW_DATA(-(state_offset + 1), 0, 1);
+ }
+ else
+ {
+ ADD_NEW(state_offset + 1, 0);
+ }
+ break;
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_NOT_VSPACE:
+ if (clen > 0) switch(c)
+ {
+ case 0x000a:
+ case 0x000b:
+ case 0x000c:
+ case 0x000d:
+ case 0x0085:
+ case 0x2028:
+ case 0x2029:
+ break;
+
+ default:
+ ADD_NEW(state_offset + 1, 0);
+ break;
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_VSPACE:
+ if (clen > 0) switch(c)
+ {
+ case 0x000a:
+ case 0x000b:
+ case 0x000c:
+ case 0x000d:
+ case 0x0085:
+ case 0x2028:
+ case 0x2029:
+ ADD_NEW(state_offset + 1, 0);
+ break;
+
+ default: break;
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_NOT_HSPACE:
+ if (clen > 0) switch(c)
+ {
+ case 0x09: /* HT */
+ case 0x20: /* SPACE */
+ case 0xa0: /* NBSP */
+ case 0x1680: /* OGHAM SPACE MARK */
+ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
+ case 0x2000: /* EN QUAD */
+ case 0x2001: /* EM QUAD */
+ case 0x2002: /* EN SPACE */
+ case 0x2003: /* EM SPACE */
+ case 0x2004: /* THREE-PER-EM SPACE */
+ case 0x2005: /* FOUR-PER-EM SPACE */
+ case 0x2006: /* SIX-PER-EM SPACE */
+ case 0x2007: /* FIGURE SPACE */
+ case 0x2008: /* PUNCTUATION SPACE */
+ case 0x2009: /* THIN SPACE */
+ case 0x200A: /* HAIR SPACE */
+ case 0x202f: /* NARROW NO-BREAK SPACE */
+ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
+ case 0x3000: /* IDEOGRAPHIC SPACE */
+ break;
+
+ default:
+ ADD_NEW(state_offset + 1, 0);
+ break;
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_HSPACE:
+ if (clen > 0) switch(c)
+ {
+ case 0x09: /* HT */
+ case 0x20: /* SPACE */
+ case 0xa0: /* NBSP */
+ case 0x1680: /* OGHAM SPACE MARK */
+ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
+ case 0x2000: /* EN QUAD */
+ case 0x2001: /* EM QUAD */
+ case 0x2002: /* EN SPACE */
+ case 0x2003: /* EM SPACE */
+ case 0x2004: /* THREE-PER-EM SPACE */
+ case 0x2005: /* FOUR-PER-EM SPACE */
+ case 0x2006: /* SIX-PER-EM SPACE */
+ case 0x2007: /* FIGURE SPACE */
+ case 0x2008: /* PUNCTUATION SPACE */
+ case 0x2009: /* THIN SPACE */
+ case 0x200A: /* HAIR SPACE */
+ case 0x202f: /* NARROW NO-BREAK SPACE */
+ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
+ case 0x3000: /* IDEOGRAPHIC SPACE */
+ ADD_NEW(state_offset + 1, 0);
+ break;
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ /* Match a negated single character. This is only used for one-byte
+ characters, that is, we know that d < 256. The character we are
+ checking (c) can be multibyte. */
+
+ case OP_NOT:
+ if (clen > 0)
+ {
+ unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
+ if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_PLUS:
+ case OP_MINPLUS:
+ case OP_POSPLUS:
+ case OP_NOTPLUS:
+ case OP_NOTMINPLUS:
+ case OP_NOTPOSPLUS:
+ count = current_state->count; /* Already matched */
+ if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
+ if (clen > 0)
+ {
+ unsigned int otherd = NOTACHAR;
+ if ((ims & PCRE_CASELESS) != 0)
+ {
+#ifdef SUPPORT_UTF8
+ if (utf8 && d >= 128)
+ {
+#ifdef SUPPORT_UCP
+ otherd = _pcre_ucp_othercase(d);
+#endif /* SUPPORT_UCP */
+ }
+ else
+#endif /* SUPPORT_UTF8 */
+ otherd = fcc[d];
+ }
+ if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
+ {
+ if (count > 0 &&
+ (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ count++;
+ ADD_NEW(state_offset, count);
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_QUERY:
+ case OP_MINQUERY:
+ case OP_POSQUERY:
+ case OP_NOTQUERY:
+ case OP_NOTMINQUERY:
+ case OP_NOTPOSQUERY:
+ ADD_ACTIVE(state_offset + dlen + 1, 0);
+ if (clen > 0)
+ {
+ unsigned int otherd = NOTACHAR;
+ if ((ims & PCRE_CASELESS) != 0)
+ {
+#ifdef SUPPORT_UTF8
+ if (utf8 && d >= 128)
+ {
+#ifdef SUPPORT_UCP
+ otherd = _pcre_ucp_othercase(d);
+#endif /* SUPPORT_UCP */
+ }
+ else
+#endif /* SUPPORT_UTF8 */
+ otherd = fcc[d];
+ }
+ if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
+ {
+ if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ ADD_NEW(state_offset + dlen + 1, 0);
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_STAR:
+ case OP_MINSTAR:
+ case OP_POSSTAR:
+ case OP_NOTSTAR:
+ case OP_NOTMINSTAR:
+ case OP_NOTPOSSTAR:
+ ADD_ACTIVE(state_offset + dlen + 1, 0);
+ if (clen > 0)
+ {
+ unsigned int otherd = NOTACHAR;
+ if ((ims & PCRE_CASELESS) != 0)
+ {
+#ifdef SUPPORT_UTF8
+ if (utf8 && d >= 128)
+ {
+#ifdef SUPPORT_UCP
+ otherd = _pcre_ucp_othercase(d);
+#endif /* SUPPORT_UCP */
+ }
+ else
+#endif /* SUPPORT_UTF8 */
+ otherd = fcc[d];
+ }
+ if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
+ {
+ if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ ADD_NEW(state_offset, 0);
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_EXACT:
+ case OP_NOTEXACT:
+ count = current_state->count; /* Number already matched */
+ if (clen > 0)
+ {
+ unsigned int otherd = NOTACHAR;
+ if ((ims & PCRE_CASELESS) != 0)
+ {
+#ifdef SUPPORT_UTF8
+ if (utf8 && d >= 128)
+ {
+#ifdef SUPPORT_UCP
+ otherd = _pcre_ucp_othercase(d);
+#endif /* SUPPORT_UCP */
+ }
+ else
+#endif /* SUPPORT_UTF8 */
+ otherd = fcc[d];
+ }
+ if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
+ {
+ if (++count >= GET2(code, 1))
+ { ADD_NEW(state_offset + dlen + 3, 0); }
+ else
+ { ADD_NEW(state_offset, count); }
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_UPTO:
+ case OP_MINUPTO:
+ case OP_POSUPTO:
+ case OP_NOTUPTO:
+ case OP_NOTMINUPTO:
+ case OP_NOTPOSUPTO:
+ ADD_ACTIVE(state_offset + dlen + 3, 0);
+ count = current_state->count; /* Number already matched */
+ if (clen > 0)
+ {
+ unsigned int otherd = NOTACHAR;
+ if ((ims & PCRE_CASELESS) != 0)
+ {
+#ifdef SUPPORT_UTF8
+ if (utf8 && d >= 128)
+ {
+#ifdef SUPPORT_UCP
+ otherd = _pcre_ucp_othercase(d);
+#endif /* SUPPORT_UCP */
+ }
+ else
+#endif /* SUPPORT_UTF8 */
+ otherd = fcc[d];
+ }
+ if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
+ {
+ if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ if (++count >= GET2(code, 1))
+ { ADD_NEW(state_offset + dlen + 3, 0); }
+ else
+ { ADD_NEW(state_offset, count); }
+ }
+ }
+ break;
+
+
+/* ========================================================================== */
+ /* These are the class-handling opcodes */
+
+ case OP_CLASS:
+ case OP_NCLASS:
+ case OP_XCLASS:
+ {
+ BOOL isinclass = FALSE;
+ int next_state_offset;
+ const uschar *ecode;
+
+ /* For a simple class, there is always just a 32-byte table, and we
+ can set isinclass from it. */
+
+ if (codevalue != OP_XCLASS)
+ {
+ ecode = code + 33;
+ if (clen > 0)
+ {
+ isinclass = (c > 255)? (codevalue == OP_NCLASS) :
+ ((code[1 + c/8] & (1 << (c&7))) != 0);
+ }
+ }
+
+ /* An extended class may have a table or a list of single characters,
+ ranges, or both, and it may be positive or negative. There's a
+ function that sorts all this out. */
+
+ else
+ {
+ ecode = code + GET(code, 1);
+ if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
+ }
+
+ /* At this point, isinclass is set for all kinds of class, and ecode
+ points to the byte after the end of the class. If there is a
+ quantifier, this is where it will be. */
+
+ next_state_offset = ecode - start_code;
+
+ switch (*ecode)
+ {
+ case OP_CRSTAR:
+ case OP_CRMINSTAR:
+ ADD_ACTIVE(next_state_offset + 1, 0);
+ if (isinclass) { ADD_NEW(state_offset, 0); }
+ break;
+
+ case OP_CRPLUS:
+ case OP_CRMINPLUS:
+ count = current_state->count; /* Already matched */
+ if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
+ if (isinclass) { count++; ADD_NEW(state_offset, count); }
+ break;
+
+ case OP_CRQUERY:
+ case OP_CRMINQUERY:
+ ADD_ACTIVE(next_state_offset + 1, 0);
+ if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
+ break;
+
+ case OP_CRRANGE:
+ case OP_CRMINRANGE:
+ count = current_state->count; /* Already matched */
+ if (count >= GET2(ecode, 1))
+ { ADD_ACTIVE(next_state_offset + 5, 0); }
+ if (isinclass)
+ {
+ int max = GET2(ecode, 3);
+ if (++count >= max && max != 0) /* Max 0 => no limit */
+ { ADD_NEW(next_state_offset + 5, 0); }
+ else
+ { ADD_NEW(state_offset, count); }
+ }
+ break;
+
+ default:
+ if (isinclass) { ADD_NEW(next_state_offset, 0); }
+ break;
+ }
+ }
+ break;
+
+/* ========================================================================== */
+ /* These are the opcodes for fancy brackets of various kinds. We have
+ to use recursion in order to handle them. The "always failing" assertion
+ (?!) is optimised when compiling to OP_FAIL, so we have to support that,
+ though the other "backtracking verbs" are not supported. */
+
+ case OP_FAIL:
+ break;
+
+ case OP_ASSERT:
+ case OP_ASSERT_NOT:
+ case OP_ASSERTBACK:
+ case OP_ASSERTBACK_NOT:
+ {
+ int rc;
+ int local_offsets[2];
+ int local_workspace[1000];
+ const uschar *endasscode = code + GET(code, 1);
+
+ while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
+
+ rc = internal_dfa_exec(
+ md, /* static match data */
+ code, /* this subexpression's code */
+ ptr, /* where we currently are */
+ ptr - start_subject, /* start offset */
+ local_offsets, /* offset vector */
+ sizeof(local_offsets)/sizeof(int), /* size of same */
+ local_workspace, /* workspace vector */
+ sizeof(local_workspace)/sizeof(int), /* size of same */
+ ims, /* the current ims flags */
+ rlevel, /* function recursion level */
+ recursing); /* pass on regex recursion */
+
+ if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
+ { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_COND:
+ case OP_SCOND:
+ {
+ int local_offsets[1000];
+ int local_workspace[1000];
+ int condcode = code[LINK_SIZE+1];
+
+ /* Back reference conditions are not supported */
+
+ if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
+
+ /* The DEFINE condition is always false */
+
+ if (condcode == OP_DEF)
+ {
+ ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
+ }
+
+ /* The only supported version of OP_RREF is for the value RREF_ANY,
+ which means "test if in any recursion". We can't test for specifically
+ recursed groups. */
+
+ else if (condcode == OP_RREF)
+ {
+ int value = GET2(code, LINK_SIZE+2);
+ if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
+ if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
+ else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
+ }
+
+ /* Otherwise, the condition is an assertion */
+
+ else
+ {
+ int rc;
+ const uschar *asscode = code + LINK_SIZE + 1;
+ const uschar *endasscode = asscode + GET(asscode, 1);
+
+ while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
+
+ rc = internal_dfa_exec(
+ md, /* fixed match data */
+ asscode, /* this subexpression's code */
+ ptr, /* where we currently are */
+ ptr - start_subject, /* start offset */
+ local_offsets, /* offset vector */
+ sizeof(local_offsets)/sizeof(int), /* size of same */
+ local_workspace, /* workspace vector */
+ sizeof(local_workspace)/sizeof(int), /* size of same */
+ ims, /* the current ims flags */
+ rlevel, /* function recursion level */
+ recursing); /* pass on regex recursion */
+
+ if ((rc >= 0) ==
+ (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
+ { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
+ else
+ { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
+ }
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_RECURSE:
+ {
+ int local_offsets[1000];
+ int local_workspace[1000];
+ int rc;
+
+ DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
+ recursing + 1));
+
+ rc = internal_dfa_exec(
+ md, /* fixed match data */
+ start_code + GET(code, 1), /* this subexpression's code */
+ ptr, /* where we currently are */
+ ptr - start_subject, /* start offset */
+ local_offsets, /* offset vector */
+ sizeof(local_offsets)/sizeof(int), /* size of same */
+ local_workspace, /* workspace vector */
+ sizeof(local_workspace)/sizeof(int), /* size of same */
+ ims, /* the current ims flags */
+ rlevel, /* function recursion level */
+ recursing + 1); /* regex recurse level */
+
+ DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
+ recursing + 1, rc));
+
+ /* Ran out of internal offsets */
+
+ if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
+
+ /* For each successful matched substring, set up the next state with a
+ count of characters to skip before trying it. Note that the count is in
+ characters, not bytes. */
+
+ if (rc > 0)
+ {
+ for (rc = rc*2 - 2; rc >= 0; rc -= 2)
+ {
+ const uschar *p = start_subject + local_offsets[rc];
+ const uschar *pp = start_subject + local_offsets[rc+1];
+ int charcount = local_offsets[rc+1] - local_offsets[rc];
+ while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
+ if (charcount > 0)
+ {
+ ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
+ }
+ else
+ {
+ ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
+ }
+ }
+ }
+ else if (rc != PCRE_ERROR_NOMATCH) return rc;
+ }
+ break;
+
+ /*-----------------------------------------------------------------*/
+ case OP_ONCE:
+ {
+ int local_offsets[2];
+ int local_workspace[1000];
+
+ int rc = internal_dfa_exec(
+ md, /* fixed match data */
+ code, /* this subexpression's code */
+ ptr, /* where we currently are */
+ ptr - start_subject, /* start offset */
+ local_offsets, /* offset vector */
+ sizeof(local_offsets)/sizeof(int), /* size of same */
+ local_workspace, /* workspace vector */
+ sizeof(local_workspace)/sizeof(int), /* size of same */
+ ims, /* the current ims flags */
+ rlevel, /* function recursion level */
+ recursing); /* pass on regex recursion */
+
+ if (rc >= 0)
+ {
+ const uschar *end_subpattern = code;
+ int charcount = local_offsets[1] - local_offsets[0];
+ int next_state_offset, repeat_state_offset;
+
+ do { end_subpattern += GET(end_subpattern, 1); }
+ while (*end_subpattern == OP_ALT);
+ next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
+
+ /* If the end of this subpattern is KETRMAX or KETRMIN, we must
+ arrange for the repeat state also to be added to the relevant list.
+ Calculate the offset, or set -1 for no repeat. */
+
+ repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
+ *end_subpattern == OP_KETRMIN)?
+ end_subpattern - start_code - GET(end_subpattern, 1) : -1;
+
+ /* If we have matched an empty string, add the next state at the
+ current character pointer. This is important so that the duplicate
+ checking kicks in, which is what breaks infinite loops that match an
+ empty string. */
+
+ if (charcount == 0)
+ {
+ ADD_ACTIVE(next_state_offset, 0);
+ }
+
+ /* Optimization: if there are no more active states, and there
+ are no new states yet set up, then skip over the subject string
+ right here, to save looping. Otherwise, set up the new state to swing
+ into action when the end of the substring is reached. */
+
+ else if (i + 1 >= active_count && new_count == 0)
+ {
+ ptr += charcount;
+ clen = 0;
+ ADD_NEW(next_state_offset, 0);
+
+ /* If we are adding a repeat state at the new character position,
+ we must fudge things so that it is the only current state.
+ Otherwise, it might be a duplicate of one we processed before, and
+ that would cause it to be skipped. */
+
+ if (repeat_state_offset >= 0)
+ {
+ next_active_state = active_states;
+ active_count = 0;
+ i = -1;
+ ADD_ACTIVE(repeat_state_offset, 0);
+ }
+ }
+ else
+ {
+ const uschar *p = start_subject + local_offsets[0];
+ const uschar *pp = start_subject + local_offsets[1];
+ while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
+ ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
+ if (repeat_state_offset >= 0)
+ { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
+ }
+
+ }
+ else if (rc != PCRE_ERROR_NOMATCH) return rc;
+ }
+ break;
+
+
+/* ========================================================================== */
+ /* Handle callouts */
+
+ case OP_CALLOUT:
+ if (pcre_callout != NULL)
+ {
+ int rrc;
+ pcre_callout_block cb;
+ cb.version = 1; /* Version 1 of the callout block */
+ cb.callout_number = code[1];
+ cb.offset_vector = offsets;
+ cb.subject = (PCRE_SPTR)start_subject;
+ cb.subject_length = end_subject - start_subject;
+ cb.start_match = current_subject - start_subject;
+ cb.current_position = ptr - start_subject;
+ cb.pattern_position = GET(code, 2);
+ cb.next_item_length = GET(code, 2 + LINK_SIZE);
+ cb.capture_top = 1;
+ cb.capture_last = -1;
+ cb.callout_data = md->callout_data;
+ if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
+ if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
+ }
+ break;
+
+
+/* ========================================================================== */
+ default: /* Unsupported opcode */
+ return PCRE_ERROR_DFA_UITEM;
+ }
+
+ NEXT_ACTIVE_STATE: continue;
+
+ } /* End of loop scanning active states */
+
+ /* We have finished the processing at the current subject character. If no
+ new states have been set for the next character, we have found all the
+ matches that we are going to find. If we are at the top level and partial
+ matching has been requested, check for appropriate conditions. */
+
+ if (new_count <= 0)
+ {
+ if (match_count < 0 && /* No matches found */
+ rlevel == 1 && /* Top level match function */
+ (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
+ ptr >= end_subject && /* Reached end of subject */
+ ptr > current_subject) /* Matched non-empty string */
+ {
+ if (offsetcount >= 2)
+ {
+ offsets[0] = current_subject - start_subject;
+ offsets[1] = end_subject - start_subject;
+ }
+ match_count = PCRE_ERROR_PARTIAL;
+ }
+
+ DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
+ "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
+ rlevel*2-2, SP));
+ break; /* In effect, "return", but see the comment below */
+ }
+
+ /* One or more states are active for the next character. */
+
+ ptr += clen; /* Advance to next subject character */
+ } /* Loop to move along the subject string */
+
+/* Control gets here from "break" a few lines above. We do it this way because
+if we use "return" above, we have compiler trouble. Some compilers warn if
+there's nothing here because they think the function doesn't return a value. On
+the other hand, if we put a dummy statement here, some more clever compilers
+complain that it can't be reached. Sigh. */
+
+return match_count;
+}
+
+
+
+
+/*************************************************
+* Execute a Regular Expression - DFA engine *
+*************************************************/
+
+/* This external function applies a compiled re to a subject string using a DFA
+engine. It validates its arguments, works out a starting point, and then calls
+internal_dfa_exec(), repeatedly at successive positions if not anchored.
+
+Arguments:
+ argument_re points to the compiled expression (must not be NULL)
+ extra_data points to extra data or is NULL; match limits are rejected
+ subject points to the subject string (must not be NULL)
+ length length of subject string (may contain binary zeros)
+ start_offset where to start in the subject string
+ options option bits; only PUBLIC_DFA_EXEC_OPTIONS bits are accepted
+ offsets vector of match offsets (may be NULL only if offsetcount is 0)
+ offsetcount size of same (must not be negative)
+ workspace workspace vector (must not be NULL)
+ wscount size of same (must be at least 20)
+
+Returns: > 0 => number of match offset pairs placed in offsets
+ = 0 => offsets overflowed; longest matches are present
+ -1 => failed to match
+ < -1 => some kind of unexpected problem
+*/
+
+PCRE_EXP_DEFN int
+pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
+ const char *subject, int length, int start_offset, int options, int *offsets,
+ int offsetcount, int *workspace, int wscount)
+{
+real_pcre *re = (real_pcre *)argument_re;
+dfa_match_data match_block;
+dfa_match_data *md = &match_block;
+BOOL utf8, anchored, startline, firstline;
+const uschar *current_subject, *end_subject, *lcc;
+
+pcre_study_data internal_study;
+const pcre_study_data *study = NULL;
+real_pcre internal_re;
+
+const uschar *req_byte_ptr;
+const uschar *start_bits = NULL;
+BOOL first_byte_caseless = FALSE;
+BOOL req_byte_caseless = FALSE;
+int first_byte = -1;
+int req_byte = -1;
+int req_byte2 = -1;
+int newline;
+
+/* Plausibility checks: bad option bits, NULL pointers, and bad vector sizes */
+
+if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
+if (re == NULL || subject == NULL || workspace == NULL ||
+ (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
+if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
+if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
+
+/* We need to find the pointer to any study data before we test for byte
+flipping, so we scan the extra_data block first. This may set two fields in the
+match block, so we must initialize them beforehand. However, the other fields
+in the match block must not be set until after the byte flipping. */
+
+md->tables = re->tables;
+md->callout_data = NULL;
+
+if (extra_data != NULL)
+ {
+ unsigned int flags = extra_data->flags;
+ if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
+ study = (const pcre_study_data *)extra_data->study_data;
+ if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
+ if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
+ return PCRE_ERROR_DFA_UMLIMIT;
+ if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
+ md->callout_data = extra_data->callout_data;
+ if ((flags & PCRE_EXTRA_TABLES) != 0)
+ md->tables = extra_data->tables;
+ }
+
+/* Check that the first field in the block is the magic number. If it is not,
+test for a regex that was compiled on a host of opposite endianness. If this is
+the case, flipped values are put in internal_re and internal_study if there was
+study data too. */
+
+if (re->magic_number != MAGIC_NUMBER)
+ {
+ re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
+ if (re == NULL) return PCRE_ERROR_BADMAGIC;
+ if (study != NULL) study = &internal_study;
+ }
+
+/* Set some local values: the subject bounds, UTF-8 mode, and anchoring */
+
+current_subject = (const unsigned char *)subject + start_offset;
+end_subject = (const unsigned char *)subject + length;
+req_byte_ptr = current_subject - 1;
+
+#ifdef SUPPORT_UTF8
+utf8 = (re->options & PCRE_UTF8) != 0;
+#else
+utf8 = FALSE;
+#endif
+
+anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
+ (re->options & PCRE_ANCHORED) != 0;
+
+/* The remaining fixed data for passing around in the match block. */
+
+md->start_code = (const uschar *)argument_re +
+ re->name_table_offset + re->name_count * re->name_entry_size;
+md->start_subject = (const unsigned char *)subject;
+md->end_subject = end_subject;
+md->moptions = options;
+md->poptions = re->options;
+
+/* If the BSR option is not set at match time, copy what was set
+at compile time. */
+
+if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
+ {
+ if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
+ md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
+#ifdef BSR_ANYCRLF
+ else md->moptions |= PCRE_BSR_ANYCRLF;
+#endif
+ }
+
+/* Handle different types of newline. The three bits give eight cases. If
+nothing is set at run time, whatever was used at compile time applies. */
+
+switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
+ PCRE_NEWLINE_BITS)
+ {
+ case 0: newline = NEWLINE; break; /* Compile-time default */
+ case PCRE_NEWLINE_CR: newline = '\r'; break;
+ case PCRE_NEWLINE_LF: newline = '\n'; break;
+ case PCRE_NEWLINE_CR+
+ PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
+ case PCRE_NEWLINE_ANY: newline = -1; break;
+ case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
+ default: return PCRE_ERROR_BADNEWLINE;
+ }
+
+if (newline == -2)
+ {
+ md->nltype = NLTYPE_ANYCRLF;
+ }
+else if (newline < 0)
+ {
+ md->nltype = NLTYPE_ANY;
+ }
+else
+ {
+ md->nltype = NLTYPE_FIXED;
+ if (newline > 255)
+ {
+ md->nllen = 2;
+ md->nl[0] = (newline >> 8) & 255;
+ md->nl[1] = newline & 255;
+ }
+ else
+ {
+ md->nllen = 1;
+ md->nl[0] = newline;
+ }
+ }
+
+/* Check a UTF-8 string if required. Unfortunately there's no way of passing
+back the character offset. */
+
+#ifdef SUPPORT_UTF8
+if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
+ {
+ if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
+ return PCRE_ERROR_BADUTF8;
+ if (start_offset > 0 && start_offset < length)
+ {
+ int tb = ((uschar *)subject)[start_offset];
+ if (tb > 127)
+ {
+ tb &= 0xc0;
+ if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
+ }
+ }
+ }
+#endif
+
+/* If the exec call supplied NULL for tables, use the inbuilt ones. This
+is a feature that makes it possible to save compiled regex and re-use them
+in other programs later. */
+
+if (md->tables == NULL) md->tables = _pcre_default_tables;
+
+/* The lower casing table and the "must be at the start of a line" flag are
+used in a loop when finding where to start. */
+
+lcc = md->tables + lcc_offset;
+startline = (re->flags & PCRE_STARTLINE) != 0;
+firstline = (re->options & PCRE_FIRSTLINE) != 0;
+
+/* Set up the first character to match, if available. The first_byte value is
+never set for an anchored regular expression, but the anchoring may be forced
+at run time, so we have to test for anchoring. The first char may be unset for
+an unanchored pattern, of course. If there's no first char and the pattern was
+studied, there may be a bitmap of possible first characters. */
+
+if (!anchored)
+ {
+ if ((re->flags & PCRE_FIRSTSET) != 0)
+ {
+ first_byte = re->first_byte & 255;
+ if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
+ first_byte = lcc[first_byte];
+ }
+ else
+ {
+ if (startline && study != NULL &&
+ (study->options & PCRE_STUDY_MAPPED) != 0)
+ start_bits = study->start_bits;
+ }
+ }
+
+/* For anchored or unanchored matches, there may be a "last known required
+character" set. */
+
+if ((re->flags & PCRE_REQCHSET) != 0)
+ {
+ req_byte = re->req_byte & 255;
+ req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
+ req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
+ }
+
+/* Call the main matching function, looping for a non-anchored regex after a
+failed match. Unless restarting, optimize by moving to the first match
+character if possible, when not anchored. Then unless wanting a partial match,
+check for a required later character. */
+
+for (;;)
+ {
+ int rc;
+
+ if ((options & PCRE_DFA_RESTART) == 0)
+ {
+ const uschar *save_end_subject = end_subject;
+
+ /* Advance to a unique first char if possible. If firstline is TRUE, the
+ start of the match is constrained to the first line of a multiline string.
+ Implement this by temporarily adjusting end_subject so that we stop
+ scanning at a newline. If the match fails at the newline, later code breaks
+ this loop. */
+
+ if (firstline)
+ {
+ const uschar *t = current_subject;
+ while (t < md->end_subject && !IS_NEWLINE(t)) t++;
+ end_subject = t;
+ }
+
+ if (first_byte >= 0)
+ {
+ if (first_byte_caseless)
+ while (current_subject < end_subject &&
+ lcc[*current_subject] != first_byte)
+ current_subject++;
+ else
+ while (current_subject < end_subject && *current_subject != first_byte)
+ current_subject++;
+ }
+
+ /* Or to just after a linebreak for a multiline match if possible */
+
+ else if (startline)
+ {
+ if (current_subject > md->start_subject + start_offset)
+ {
+ while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
+ current_subject++;
+
+ /* If we have just passed a CR, the newline option is ANY or ANYCRLF, and
+ we are now at a LF, advance one more character. NOTE(review): the scan
+ above uses <=, so current_subject may be past end_subject here - confirm. */
+
+ if (current_subject[-1] == '\r' &&
+ (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
+ current_subject < end_subject &&
+ *current_subject == '\n')
+ current_subject++;
+ }
+ }
+
+ /* Or to a non-unique first char after study */
+
+ else if (start_bits != NULL)
+ {
+ while (current_subject < end_subject)
+ {
+ register unsigned int c = *current_subject;
+ if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
+ else break;
+ }
+ }
+
+ /* Restore end_subject, which firstline may have temporarily shortened */
+
+ end_subject = save_end_subject;
+ }
+
+ /* If req_byte is set, we know that that character must appear in the subject
+ for the match to succeed. If the first character is set, req_byte must be
+ later in the subject; otherwise the test starts at the match point. This
+ optimization can save a huge amount of work in patterns with nested unlimited
+ repeats that aren't going to match. Writing separate code for cased/caseless
+ versions makes it go faster, as does using an autoincrement and backing off
+ on a match.
+
+ HOWEVER: when the subject string is very, very long, searching to its end can
+ take a long time, and give bad performance on quite ordinary patterns. This
+ showed up when somebody was matching /^C/ on a 32-megabyte string... so we
+ don't do this when the string is sufficiently long.
+
+ ALSO: this processing is disabled when partial matching is requested.
+ */
+
+ if (req_byte >= 0 &&
+ end_subject - current_subject < REQ_BYTE_MAX &&
+ (options & PCRE_PARTIAL) == 0)
+ {
+ register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
+
+ /* We don't need to repeat the search if we haven't yet reached the
+ place we found it at last time. */
+
+ if (p > req_byte_ptr)
+ {
+ if (req_byte_caseless)
+ {
+ while (p < end_subject)
+ {
+ register int pp = *p++;
+ if (pp == req_byte || pp == req_byte2) { p--; break; }
+ }
+ }
+ else
+ {
+ while (p < end_subject)
+ {
+ if (*p++ == req_byte) { p--; break; }
+ }
+ }
+
+ /* If we can't find the required character, break the matching loop,
+ which will cause a return of PCRE_ERROR_NOMATCH. */
+
+ if (p >= end_subject) break;
+
+ /* If we have found the required character, save the point where we
+ found it, so that we don't search again next time round the loop if
+ the start hasn't passed this character yet. */
+
+ req_byte_ptr = p;
+ }
+ }
+
+ /* OK, now we can do the business: run the matcher from current_subject */
+
+ rc = internal_dfa_exec(
+ md, /* fixed match data */
+ md->start_code, /* this subexpression's code */
+ current_subject, /* where we currently are */
+ start_offset, /* start offset in subject */
+ offsets, /* offset vector */
+ offsetcount, /* size of same */
+ workspace, /* workspace vector */
+ wscount, /* size of same */
+ re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
+ 0, /* function recurse level */
+ 0); /* regex recurse level */
+
+ /* Anything other than "no match" means we are done, always; otherwise, carry
+ on only if not anchored. */
+
+ if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
+
+ /* Advance to the next subject character unless we are at the end of a line
+ and firstline is set. */
+
+ if (firstline && IS_NEWLINE(current_subject)) break;
+ current_subject++;
+ if (utf8)
+ {
+ while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
+ current_subject++;
+ }
+ if (current_subject > end_subject) break;
+
+ /* If we have just passed a CR and we are now at a LF, and the pattern does
+ not contain any explicit matches for \r or \n, and the newline option is CRLF
+ or ANY or ANYCRLF, advance the match position by one more character. */
+
+ if (current_subject[-1] == '\r' &&
+ current_subject < end_subject &&
+ *current_subject == '\n' &&
+ (re->flags & PCRE_HASCRORLF) == 0 &&
+ (md->nltype == NLTYPE_ANY ||
+ md->nltype == NLTYPE_ANYCRLF ||
+ md->nllen == 2))
+ current_subject++;
+
+ } /* "Bumpalong" loop */
+
+return PCRE_ERROR_NOMATCH;
+}
+
+/* End of pcre_dfa_exec.c */
diff --git a/src/pcre_exec.c b/src/pcre_exec.c
new file mode 100644
index 0000000..ed28ae7
--- /dev/null
+++ b/src/pcre_exec.c
@@ -0,0 +1,4953 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2008 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This module contains pcre_exec(), the externally visible function that does
+pattern matching using an NFA algorithm, trying to mimic Perl as closely as
+possible. There are also some static supporting functions. */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define NLBLOCK md /* Block containing newline information */
+#define PSSTART start_subject /* Field containing processed string start */
+#define PSEND end_subject /* Field containing processed string end */
+
+#include "pcre_internal.h"
+
+/* Undefine some potentially clashing cpp symbols */
+
+#undef min
+#undef max
+
+/* Bits for the "flags" argument of the match() function */
+
+#define match_condassert 0x01 /* Called to check a condition assertion */
+#define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
+
+/* Non-error returns from the match() function. Error returns are externally
+defined PCRE_ERROR_xxx codes, which are all negative. */
+
+#define MATCH_MATCH 1 /* Matched */
+#define MATCH_NOMATCH 0 /* Failed to match */
+
+/* Special internal returns from the match() function. Make them sufficiently
+negative to avoid the external error codes. Each is returned by the opcode of
+the same name (OP_COMMIT etc.) when what follows that opcode fails to match. */
+#define MATCH_COMMIT (-999)
+#define MATCH_PRUNE (-998)
+#define MATCH_SKIP (-997)
+#define MATCH_THEN (-996)
+
+/* Maximum number of ints of offset to save on the stack for recursive calls.
+If the offset vector is bigger, malloc is used. This should be a multiple of 3,
+because the offset vector is always a multiple of 3 long. */
+
+#define REC_STACK_SAVE_MAX 30
+
+/* Min and max values for the common repeats; for the maxima, 0 => infinity.
+NOTE(review): entries look like *, *?, +, +?, ?, ?? in that order - confirm. */
+static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
+static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
+
+
+
+#ifdef DEBUG
+/*************************************************
+* Debugging function to print chars *
+*************************************************/
+
+/* Debug aid: print bytes to stdout, with non-printable ones shown as \xhh
+escapes. Nothing is returned.
+
+Arguments:
+  p           points to the first byte to print
+  length      maximum number of bytes to print
+  is_subject  TRUE if p is in the subject: stop at md->end_subject
+  md          pointer to matching data block; read only if is_subject is TRUE
+*/
+
+static void
+pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
+{
+if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
+for (; length > 0; length--)
+  {
+  unsigned int ch = *p++;
+  if (isprint(ch)) printf("%c", ch); else printf("\\x%02x", ch);
+  }
+}
+#endif
+
+
+
+/*************************************************
+* Match a back-reference *
+*************************************************/
+
+/* Check that the subject at eptr repeats a captured substring. When the back
+reference is unset, the length passed exceeds what remains, so this fails.
+
+Arguments:
+  offset      index into the offset vector of the capture's start offset
+  eptr        points into the subject
+  length      number of bytes to be matched
+  md          points to match data block
+  ims         the ims flags; only PCRE_CASELESS is tested
+
+Returns:      TRUE if matched, otherwise FALSE
+*/
+
+static BOOL
+match_ref(int offset, register USPTR eptr, int length, match_data *md,
+  unsigned long int ims)
+{
+USPTR ref = md->start_subject + md->offset_vector[offset];
+
+#ifdef DEBUG
+if (eptr < md->end_subject)
+  {
+  printf("matching subject ");
+  pchars(eptr, length, TRUE, md);
+  }
+else
+  printf("matching subject ");
+printf(" against backref ");
+pchars(ref, length, FALSE, md);
+printf("\n");
+#endif
+
+/* Not enough subject left to hold the whole reference: cannot match. */
+
+if (md->end_subject - eptr < length) return FALSE;
+
+/* Two separate loops, so the caseful comparison needs no table lookups. */
+if ((ims & PCRE_CASELESS) == 0)
+  {
+  for (; length > 0; length--)
+    if (*ref++ != *eptr++) return FALSE;
+  }
+else
+  for (; length > 0; length--)
+    if (md->lcc[*ref++] != md->lcc[*eptr++]) return FALSE;
+
+return TRUE;
+}
+
+
+
+/***************************************************************************
+****************************************************************************
+ RECURSION IN THE match() FUNCTION
+
+The match() function is highly recursive, though not every recursive call
+increases the recursive depth. Nevertheless, some regular expressions can cause
+it to recurse to a great depth. I was writing for Unix, so I just let it call
+itself recursively. This uses the stack for saving everything that has to be
+saved for a recursive call. On Unix, the stack can be large, and this works
+fine.
+
+It turns out that on some non-Unix-like systems there are problems with
+programs that use a lot of stack. (This despite the fact that every last chip
+has oodles of memory these days, and techniques for extending the stack have
+been known for decades.) So....
+
+There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
+calls by keeping local variables that need to be preserved in blocks of memory
+obtained from malloc() instead of on the stack. Macros are used to
+achieve this so that the actual code doesn't look very different to what it
+always used to.
+
+The original heap-recursive code used longjmp(). However, it seems that this
+can be very slow on some operating systems. Following a suggestion from Stan
+Switzer, the use of longjmp() has been abolished, at the cost of having to
+provide a unique number for each call to RMATCH. There is no way of generating
+a sequence of numbers at compile time in C. I have given them names, to make
+them stand out more clearly.
+
+Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
+FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
+tests. Furthermore, not using longjmp() means that local dynamic variables
+don't have indeterminate values; this has meant that the frame size can be
+reduced because the result can be "passed back" by straight setting of the
+variable instead of being passed in the frame.
+****************************************************************************
+***************************************************************************/
+
+/* Numbers for RMATCH calls, passed as its "rw" argument. When this list is
+changed, the code at HEAP_RETURN below must be updated in sync. */
+
+enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
+ RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
+ RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
+ RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
+ RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
+ RM51, RM52, RM53, RM54 };
+
+/* These versions of the macros use the stack, as normal: RMATCH is a real call
+to match(), leaving the result in rrc, and RRETURN is a plain return. The DEBUG
+versions also trace each call. The "rw" argument of RMATCH is not used here. */
+
+#ifndef NO_RECURSE
+#define REGISTER register
+
+#ifdef DEBUG
+#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
+ { \
+ printf("match() called in line %d\n", __LINE__); \
+ rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
+ printf("to line %d\n", __LINE__); \
+ }
+#define RRETURN(ra) \
+ { \
+ printf("match() returned %d from line %d ", ra, __LINE__); \
+ return ra; \
+ }
+#else /* not DEBUG */
+#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
+ rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
+#define RRETURN(ra) return ra
+#endif /* DEBUG */
+
+#else
+
+
+/* These versions of the macros manage a private stack on the heap. RMATCH
+returns PCRE_ERROR_NOMEMORY if it cannot get a new frame. Its "rd" argument is
+not used here: it is the md argument of match(), which never changes. */
+
+#define REGISTER
+#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
+  {\
+  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
+  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);  /* nothing pushed yet */\
+  frame->Xwhere = rw;              /* where to resume in the calling frame */\
+  newframe->Xeptr = ra;\
+  newframe->Xecode = rb;\
+  newframe->Xmstart = mstart;\
+  newframe->Xoffset_top = rc;\
+  newframe->Xims = re;\
+  newframe->Xeptrb = rf;\
+  newframe->Xflags = rg;\
+  newframe->Xrdepth = frame->Xrdepth + 1;\
+  newframe->Xprevframe = frame;\
+  frame = newframe;\
+  DPRINTF(("restarting from line %d\n", __LINE__));\
+  goto HEAP_RECURSE;\
+  L_##rw:\
+  DPRINTF(("jumped back to line %d\n", __LINE__));\
+  }
+
+#define RRETURN(ra)\
+  {\
+  heapframe *newframe = frame;     /* the frame being discarded */\
+  frame = newframe->Xprevframe;\
+  (pcre_stack_free)(newframe);\
+  if (frame != NULL)\
+    {\
+    rrc = ra;\
+    goto HEAP_RETURN;\
+    }\
+  return ra;\
+  }
+
+
+/* One frame of the private heap "stack" used when NO_RECURSE is defined. Each
+member Xname stands in for match()'s argument or local variable "name". */
+typedef struct heapframe {
+  struct heapframe *Xprevframe;    /* Calling frame; NULL at the top level */
+
+  /* Function arguments that may change */
+
+  const uschar *Xeptr;
+  const uschar *Xecode;
+  const uschar *Xmstart;
+  int Xoffset_top;
+  unsigned long int Xims;          /* Same type as match()'s ims argument */
+  eptrblock *Xeptrb;
+  int Xflags;
+  unsigned int Xrdepth;
+
+  /* Function local variables */
+
+  const uschar *Xcallpat;
+  const uschar *Xcharptr;
+  const uschar *Xdata;
+  const uschar *Xnext;
+  const uschar *Xpp;
+  const uschar *Xprev;
+  const uschar *Xsaved_eptr;
+
+  recursion_info Xnew_recursive;
+
+  BOOL Xcur_is_word;
+  BOOL Xcondition;
+  BOOL Xprev_is_word;
+
+  unsigned long int Xoriginal_ims;
+
+#ifdef SUPPORT_UCP
+  int Xprop_type;
+  int Xprop_value;
+  int Xprop_fail_result;
+  int Xprop_category;
+  int Xprop_chartype;
+  int Xprop_script;
+  int Xoclength;
+  uschar Xocchars[8];
+#endif
+
+  int Xctype;
+  unsigned int Xfc;
+  int Xfi;
+  int Xlength;
+  int Xmax;
+  int Xmin;
+  int Xnumber;
+  int Xoffset;
+  int Xop;
+  int Xsave_capture_last;
+  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
+  int Xstacksave[REC_STACK_SAVE_MAX];
+
+  eptrblock Xnewptrb;
+
+  /* Where to jump back to: the RMn number of the pending RMATCH call */
+
+  int Xwhere;
+
+} heapframe;
+
+#endif
+
+
+/***************************************************************************
+***************************************************************************/
+
+
+
+/*************************************************
+* Match from current position *
+*************************************************/
+
+/* This function is called recursively in many circumstances. Whenever it
+returns a negative (error) response, the outer incarnation must also return the
+same response.
+
+Performance note: It might be tempting to extract commonly used fields from the
+md structure (e.g. utf8, end_subject) into individual variables to improve
+performance. Tests using gcc on a SPARC disproved this; in the first case, it
+made performance worse.
+
+Arguments:
+ eptr pointer to current character in subject
+ ecode pointer to current position in compiled code
+ mstart pointer to the current match start position (can be modified
+ by encountering \K)
+ offset_top current top pointer
+ md pointer to "static" info for the match
+ ims current /i, /m, and /s options
+ eptrb pointer to chain of blocks containing eptr at start of
+ brackets - for testing for empty matches
+ flags can contain
+ match_condassert - this is an assertion condition
+ match_cbegroup - this is the start of an unlimited repeat
+ group that can match an empty string
+ rdepth the recursion depth
+
+Returns: MATCH_MATCH if matched ) these values are >= 0
+ MATCH_NOMATCH if failed to match )
+ a negative PCRE_ERROR_xxx value if aborted by an error condition
+ (e.g. stopped by repeated call or recursion limit)
+*/
+
+static int
+match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
+ int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
+ int flags, unsigned int rdepth)
+{
+/* These variables do not need to be preserved over recursion in this function,
+so they can be ordinary variables in all cases. Mark some of them with
+"register" because they are used a lot in loops. */
+
+register int rrc; /* Returns from recursive calls */
+register int i; /* Used for loops not involving calls to RMATCH() */
+register unsigned int c; /* Character values not kept over RMATCH() calls */
+register BOOL utf8; /* Local copy of UTF-8 flag for speed */
+
+BOOL minimize, possessive; /* Quantifier options */
+
+/* When recursion is not being used, all "local" variables that have to be
+preserved over calls to RMATCH() are part of a "frame" which is obtained from
+heap storage. Set up the top-level frame here; others are obtained from the
+heap whenever RMATCH() does a "recursion". See the macro definitions above. */
+
+#ifdef NO_RECURSE
+heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
+frame->Xprevframe = NULL; /* Marks the top level */
+
+/* Copy in the original argument variables */
+
+frame->Xeptr = eptr;
+frame->Xecode = ecode;
+frame->Xmstart = mstart;
+frame->Xoffset_top = offset_top;
+frame->Xims = ims;
+frame->Xeptrb = eptrb;
+frame->Xflags = flags;
+frame->Xrdepth = rdepth;
+
+/* This is where control jumps back to in order to effect "recursion" */
+
+HEAP_RECURSE:
+
+/* Macros make the argument variables come from the current frame */
+
+#define eptr frame->Xeptr
+#define ecode frame->Xecode
+#define mstart frame->Xmstart
+#define offset_top frame->Xoffset_top
+#define ims frame->Xims
+#define eptrb frame->Xeptrb
+#define flags frame->Xflags
+#define rdepth frame->Xrdepth
+
+/* Ditto for the local variables */
+
+#ifdef SUPPORT_UTF8
+#define charptr frame->Xcharptr
+#endif
+#define callpat frame->Xcallpat
+#define data frame->Xdata
+#define next frame->Xnext
+#define pp frame->Xpp
+#define prev frame->Xprev
+#define saved_eptr frame->Xsaved_eptr
+
+#define new_recursive frame->Xnew_recursive
+
+#define cur_is_word frame->Xcur_is_word
+#define condition frame->Xcondition
+#define prev_is_word frame->Xprev_is_word
+
+#define original_ims frame->Xoriginal_ims
+
+#ifdef SUPPORT_UCP
+#define prop_type frame->Xprop_type
+#define prop_value frame->Xprop_value
+#define prop_fail_result frame->Xprop_fail_result
+#define prop_category frame->Xprop_category
+#define prop_chartype frame->Xprop_chartype
+#define prop_script frame->Xprop_script
+#define oclength frame->Xoclength
+#define occhars frame->Xocchars
+#endif
+
+#define ctype frame->Xctype
+#define fc frame->Xfc
+#define fi frame->Xfi
+#define length frame->Xlength
+#define max frame->Xmax
+#define min frame->Xmin
+#define number frame->Xnumber
+#define offset frame->Xoffset
+#define op frame->Xop
+#define save_capture_last frame->Xsave_capture_last
+#define save_offset1 frame->Xsave_offset1
+#define save_offset2 frame->Xsave_offset2
+#define save_offset3 frame->Xsave_offset3
+#define stacksave frame->Xstacksave
+
+#define newptrb frame->Xnewptrb
+
+/* When recursion is being used, local variables are allocated on the stack and
+get preserved during recursion in the normal way. In this environment, fi and
+i, and fc and c, can be the same variables. */
+
+#else /* NO_RECURSE not defined */
+#define fi i
+#define fc c
+
+
+#ifdef SUPPORT_UTF8 /* Many of these variables are used only */
+const uschar *charptr; /* in small blocks of the code. My normal */
+#endif /* style of coding would have declared */
+const uschar *callpat; /* them within each of those blocks. */
+const uschar *data; /* However, in order to accommodate the */
+const uschar *next; /* version of this code that uses an */
+USPTR pp; /* external "stack" implemented on the */
+const uschar *prev; /* heap, it is easier to declare them all */
+USPTR saved_eptr; /* here, so the declarations can be cut */
+ /* out in a block. The only declarations */
+recursion_info new_recursive; /* within blocks below are for variables */
+ /* that do not have to be preserved over */
+BOOL cur_is_word; /* a recursive call to RMATCH(). */
+BOOL condition;
+BOOL prev_is_word;
+
+unsigned long int original_ims;
+
+#ifdef SUPPORT_UCP
+int prop_type;
+int prop_value;
+int prop_fail_result;
+int prop_category;
+int prop_chartype;
+int prop_script;
+int oclength;
+uschar occhars[8];
+#endif
+
+int ctype;
+int length;
+int max;
+int min;
+int number;
+int offset;
+int op;
+int save_capture_last;
+int save_offset1, save_offset2, save_offset3;
+int stacksave[REC_STACK_SAVE_MAX];
+
+eptrblock newptrb;
+#endif /* NO_RECURSE */
+
+/* These statements are here to stop the compiler complaining about
+uninitialized variables. */
+
+#ifdef SUPPORT_UCP
+prop_value = 0;
+prop_fail_result = 0;
+#endif
+
+
+/* This label is used for tail recursion, which is used in a few cases even
+when NO_RECURSE is not defined, in order to reduce the amount of stack that is
+used. Thanks to Ian Taylor for noticing this possibility and sending the
+original patch. */
+
+TAIL_RECURSE:
+
+/* OK, now we can get on with the real code of the function. Recursive calls
+are specified by the macro RMATCH and RRETURN is used to return. When
+NO_RECURSE is *not* defined, these just turn into a recursive call to match()
+and a "return", respectively (possibly with some debugging if DEBUG is
+defined). However, RMATCH isn't like a function call because it's quite a
+complicated macro. It has to be used in one particular way. This shouldn't,
+however, impact performance when true recursion is being used. */
+
+#ifdef SUPPORT_UTF8
+utf8 = md->utf8; /* Local copy of the flag */
+#else
+utf8 = FALSE;
+#endif
+
+/* First check that we haven't called match() too many times, or that we
+haven't exceeded the recursive call limit. */
+
+if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
+if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
+
+original_ims = ims; /* Save for resetting on ')' */
+
+/* At the start of a group with an unlimited repeat that may match an empty
+string, the match_cbegroup flag is set. When this is the case, add the current
+subject pointer to the chain of such remembered pointers, to be checked when we
+hit the closing ket, in order to break infinite loops that match no characters.
+When match() is called in other circumstances, don't add to the chain. The
+match_cbegroup flag must NOT be used with tail recursion, because the memory
+block that is used is on the stack, so a new one may be required for each
+match(). */
+
+if ((flags & match_cbegroup) != 0)
+ {
+ newptrb.epb_saved_eptr = eptr;
+ newptrb.epb_prev = eptrb;
+ eptrb = &newptrb;
+ }
+
+/* Now start processing the opcodes. */
+
+for (;;)
+ {
+ minimize = possessive = FALSE;
+ op = *ecode;
+
+ /* For partial matching, remember if we ever hit the end of the subject after
+ matching at least one subject character. */
+
+ if (md->partial &&
+ eptr >= md->end_subject &&
+ eptr > mstart)
+ md->hitend = TRUE;
+
+ switch(op)
+ {
+ case OP_FAIL:
+ RRETURN(MATCH_NOMATCH);
+
+ case OP_PRUNE:
+ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
+ ims, eptrb, flags, RM51);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ RRETURN(MATCH_PRUNE);
+
+ case OP_COMMIT:
+ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
+ ims, eptrb, flags, RM52);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ RRETURN(MATCH_COMMIT);
+
+ case OP_SKIP:
+ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
+ ims, eptrb, flags, RM53);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ md->start_match_ptr = eptr; /* Pass back current position */
+ RRETURN(MATCH_SKIP);
+
+ case OP_THEN:
+ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
+ ims, eptrb, flags, RM54);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ RRETURN(MATCH_THEN);
+
+ /* Handle a capturing bracket. If there is space in the offset vector, save
+ the current subject position in the working slot at the top of the vector.
+ We mustn't change the current values of the data slot, because they may be
+ set from a previous iteration of this group, and be referred to by a
+ reference inside the group.
+
+ If the bracket fails to match, we need to restore this value and also the
+ values of the final offsets, in case they were set by a previous iteration
+ of the same bracket.
+
+ If there isn't enough space in the offset vector, treat this as if it were
+ a non-capturing bracket. Don't worry about setting the flag for the error
+ case here; that is handled in the code for KET. */
+
+ case OP_CBRA:
+ case OP_SCBRA:
+ number = GET2(ecode, 1+LINK_SIZE);
+ offset = number << 1;
+
+#ifdef DEBUG
+ printf("start bracket %d\n", number);
+ printf("subject=");
+ pchars(eptr, 16, TRUE, md);
+ printf("\n");
+#endif
+
+ if (offset < md->offset_max)
+ {
+ save_offset1 = md->offset_vector[offset];
+ save_offset2 = md->offset_vector[offset+1];
+ save_offset3 = md->offset_vector[md->offset_end - number];
+ save_capture_last = md->capture_last;
+
+ DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
+ md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
+
+ flags = (op == OP_SCBRA)? match_cbegroup : 0;
+ do
+ {
+ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
+ ims, eptrb, flags, RM1);
+ if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ md->capture_last = save_capture_last;
+ ecode += GET(ecode, 1);
+ }
+ while (*ecode == OP_ALT);
+
+ DPRINTF(("bracket %d failed\n", number));
+
+ md->offset_vector[offset] = save_offset1;
+ md->offset_vector[offset+1] = save_offset2;
+ md->offset_vector[md->offset_end - number] = save_offset3;
+
+ RRETURN(MATCH_NOMATCH);
+ }
+
+ /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
+ as a non-capturing bracket. */
+
+ /* VVVVVVVVVVVVVVVVVVVVVVVVV */
+ /* VVVVVVVVVVVVVVVVVVVVVVVVV */
+
+ DPRINTF(("insufficient capture room: treat as non-capturing\n"));
+
+ /* VVVVVVVVVVVVVVVVVVVVVVVVV */
+ /* VVVVVVVVVVVVVVVVVVVVVVVVV */
+
+ /* Non-capturing bracket. Loop for all the alternatives. When we get to the
+ final alternative within the brackets, we would return the result of a
+ recursive call to match() whatever happened. We can reduce stack usage by
+ turning this into a tail recursion, except in the case when match_cbegroup
+ is set.*/
+
+ case OP_BRA:
+ case OP_SBRA:
+ DPRINTF(("start non-capturing bracket\n"));
+ flags = (op >= OP_SBRA)? match_cbegroup : 0;
+ for (;;)
+ {
+ if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
+ {
+ if (flags == 0) /* Not a possibly empty group */
+ {
+ ecode += _pcre_OP_lengths[*ecode];
+ DPRINTF(("bracket 0 tail recursion\n"));
+ goto TAIL_RECURSE;
+ }
+
+ /* Possibly empty group; can't use tail recursion. */
+
+ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
+ eptrb, flags, RM48);
+ RRETURN(rrc);
+ }
+
+ /* For non-final alternatives, continue the loop for a NOMATCH result;
+ otherwise return. */
+
+ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
+ eptrb, flags, RM2);
+ if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ ecode += GET(ecode, 1);
+ }
+ /* Control never reaches here. */
+
+ /* Conditional group: compilation checked that there are no more than
+ two branches. If the condition is false, skipping the first branch takes us
+ past the end if there is only one branch, but that's OK because that is
+ exactly what going to the ket would do. As there is only one branch to be
+ obeyed, we can use tail recursion to avoid using another stack frame. */
+
+ case OP_COND:
+ case OP_SCOND:
+ if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
+ {
+ offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
+ condition = md->recursive != NULL &&
+ (offset == RREF_ANY || offset == md->recursive->group_num);
+ ecode += condition? 3 : GET(ecode, 1);
+ }
+
+ else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
+ {
+ offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
+ condition = offset < offset_top && md->offset_vector[offset] >= 0;
+ ecode += condition? 3 : GET(ecode, 1);
+ }
+
+ else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
+ {
+ condition = FALSE;
+ ecode += GET(ecode, 1);
+ }
+
+ /* The condition is an assertion. Call match() to evaluate it - setting
+ the final argument match_condassert causes it to stop at the end of an
+ assertion. */
+
+ else
+ {
+ RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
+ match_condassert, RM3);
+ if (rrc == MATCH_MATCH)
+ {
+ condition = TRUE;
+ ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
+ while (*ecode == OP_ALT) ecode += GET(ecode, 1);
+ }
+ else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
+ {
+ RRETURN(rrc); /* Need braces because of following else */
+ }
+ else
+ {
+ condition = FALSE;
+ ecode += GET(ecode, 1);
+ }
+ }
+
+ /* We are now at the branch that is to be obeyed. As there is only one,
+ we can use tail recursion to avoid using another stack frame, except when
+ match_cbegroup is required for an unlimited repeat of a possibly empty
+ group. If the second alternative doesn't exist, we can just plough on. */
+
+ if (condition || *ecode == OP_ALT)
+ {
+ ecode += 1 + LINK_SIZE;
+ if (op == OP_SCOND) /* Possibly empty group */
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
+ RRETURN(rrc);
+ }
+ else /* Group must match something */
+ {
+ flags = 0;
+ goto TAIL_RECURSE;
+ }
+ }
+ else /* Condition false & no 2nd alternative */
+ {
+ ecode += 1 + LINK_SIZE;
+ }
+ break;
+
+
+ /* End of the pattern, either real or forced. If we are in a top-level
+ recursion, we should restore the offsets appropriately and continue from
+ after the call. */
+
+ case OP_ACCEPT:
+ case OP_END:
+ if (md->recursive != NULL && md->recursive->group_num == 0)
+ {
+ recursion_info *rec = md->recursive;
+ DPRINTF(("End of pattern in a (?0) recursion\n"));
+ md->recursive = rec->prevrec;
+ memmove(md->offset_vector, rec->offset_save,
+ rec->saved_max * sizeof(int));
+ mstart = rec->save_start;
+ ims = original_ims;
+ ecode = rec->after_call;
+ break;
+ }
+
+ /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
+ string - backtracking will then try other alternatives, if any. */
+
+ if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
+ md->end_match_ptr = eptr; /* Record where we ended */
+ md->end_offset_top = offset_top; /* and how many extracts were taken */
+ md->start_match_ptr = mstart; /* and the start (\K can modify) */
+ RRETURN(MATCH_MATCH);
+
+ /* Change option settings */
+
+ case OP_OPT:
+ ims = ecode[1];
+ ecode += 2;
+ DPRINTF(("ims set to %02lx\n", ims));
+ break;
+
+ /* Assertion brackets. Check the alternative branches in turn - the
+ matching won't pass the KET for an assertion. If any one branch matches,
+ the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
+ start of each branch to move the current point backwards, so the code at
+ this level is identical to the lookahead case. */
+
+ case OP_ASSERT:
+ case OP_ASSERTBACK:
+ do
+ {
+ RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
+ RM4);
+ if (rrc == MATCH_MATCH) break;
+ if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ ecode += GET(ecode, 1);
+ }
+ while (*ecode == OP_ALT);
+ if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
+
+ /* If checking an assertion for a condition, return MATCH_MATCH. */
+
+ if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
+
+ /* Continue from after the assertion, updating the offsets high water
+ mark, since extracts may have been taken during the assertion. */
+
+ do ecode += GET(ecode,1); while (*ecode == OP_ALT);
+ ecode += 1 + LINK_SIZE;
+ offset_top = md->end_offset_top;
+ continue;
+
+ /* Negative assertion: all branches must fail to match */
+
+ case OP_ASSERT_NOT:
+ case OP_ASSERTBACK_NOT:
+ do
+ {
+ RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
+ RM5);
+ if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
+ if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ ecode += GET(ecode,1);
+ }
+ while (*ecode == OP_ALT);
+
+ if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
+
+ ecode += 1 + LINK_SIZE;
+ continue;
+
+ /* Move the subject pointer back. This occurs only at the start of
+ each branch of a lookbehind assertion. If we are too close to the start to
+ move back, this match function fails. When working with UTF-8 we move
+ back a number of characters, not bytes. */
+
+ case OP_REVERSE:
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ i = GET(ecode, 1);
+ while (i-- > 0)
+ {
+ eptr--;
+ if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
+ BACKCHAR(eptr);
+ }
+ }
+ else
+#endif
+
+ /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
+
+ {
+ eptr -= GET(ecode, 1);
+ if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
+ }
+
+ /* Skip to next op code */
+
+ ecode += 1 + LINK_SIZE;
+ break;
+
+ /* The callout item calls an external function, if one is provided, passing
+ details of the match so far. This is mainly for debugging, though the
+ function is able to force a failure. */
+
+ case OP_CALLOUT:
+ if (pcre_callout != NULL)
+ {
+ pcre_callout_block cb;
+ cb.version = 1; /* Version 1 of the callout block */
+ cb.callout_number = ecode[1];
+ cb.offset_vector = md->offset_vector;
+ cb.subject = (PCRE_SPTR)md->start_subject;
+ cb.subject_length = md->end_subject - md->start_subject;
+ cb.start_match = mstart - md->start_subject;
+ cb.current_position = eptr - md->start_subject;
+ cb.pattern_position = GET(ecode, 2);
+ cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
+ cb.capture_top = offset_top/2;
+ cb.capture_last = md->capture_last;
+ cb.callout_data = md->callout_data;
+ if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
+ if (rrc < 0) RRETURN(rrc);
+ }
+ ecode += 2 + 2*LINK_SIZE;
+ break;
+
+ /* Recursion either matches the current regex, or some subexpression. The
+ offset data is the offset to the starting bracket from the start of the
+ whole pattern. (This is so that it works from duplicated subpatterns.)
+
+ If there are any capturing brackets started but not finished, we have to
+ save their starting points and reinstate them after the recursion. However,
+ we don't know how many such there are (offset_top records the completed
+ total) so we just have to save all the potential data. There may be up to
+ 65535 such values, which is too large to put on the stack, but using malloc
+ for small numbers seems expensive. As a compromise, the stack is used when
+ there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
+ is used. A problem is what to do if the malloc fails ... there is no way of
+ returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
+ values on the stack, and accept that the rest may be wrong.
+
+ There are also other values that have to be saved. We use a chained
+ sequence of blocks that actually live on the stack. Thanks to Robin Houston
+ for the original version of this logic. */
+
+ case OP_RECURSE:
+ {
+ /* Locate the called group. The operand is added to the start of the
+ compiled code to find it; a target at the very start of the code is a
+ whole-pattern recursion and is recorded as group number 0. */
+
+ callpat = md->start_code + GET(ecode, 1);
+ new_recursive.group_num = (callpat == md->start_code)? 0 :
+ GET2(callpat, 1 + LINK_SIZE);
+
+ /* Add to "recursing stack" */
+
+ new_recursive.prevrec = md->recursive;
+ md->recursive = &new_recursive;
+
+ /* Find where to continue from afterwards */
+
+ ecode += 1 + LINK_SIZE;
+ new_recursive.after_call = ecode;
+
+ /* Now save the offset data. The on-stack area is used when it is big
+ enough; otherwise a block is obtained from pcre_malloc, and every exit
+ from this case that happens after this point must release it again. */
+
+ new_recursive.saved_max = md->offset_end;
+ if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
+ new_recursive.offset_save = stacksave;
+ else
+ {
+ new_recursive.offset_save =
+ (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
+ if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
+ }
+
+ memcpy(new_recursive.offset_save, md->offset_vector,
+ new_recursive.saved_max * sizeof(int));
+ new_recursive.save_start = mstart;
+ mstart = eptr;
+
+ /* OK, now we can do the recursion. For each top-level alternative we
+ restore the offset and recursion data. */
+
+ DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
+ flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
+ do
+ {
+ RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
+ md, ims, eptrb, flags, RM6);
+ if (rrc == MATCH_MATCH)
+ {
+ DPRINTF(("Recursion matched\n"));
+ md->recursive = new_recursive.prevrec;
+ if (new_recursive.offset_save != stacksave)
+ (pcre_free)(new_recursive.offset_save);
+ RRETURN(MATCH_MATCH);
+ }
+ else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
+ {
+ /* Any other result is passed straight back. A malloc'ed save area has
+ to be released first, as on the other two exits from this case;
+ returning without doing so would leak it. */
+
+ DPRINTF(("Recursion gave error %d\n", rrc));
+ if (new_recursive.offset_save != stacksave)
+ (pcre_free)(new_recursive.offset_save);
+ RRETURN(rrc);
+ }
+
+ /* This alternative failed: put back the recursion pointer and the saved
+ offsets, then move on to the next alternative of the called group. */
+
+ md->recursive = &new_recursive;
+ memcpy(md->offset_vector, new_recursive.offset_save,
+ new_recursive.saved_max * sizeof(int));
+ callpat += GET(callpat, 1);
+ }
+ while (*callpat == OP_ALT);
+
+ DPRINTF(("Recursion didn't match\n"));
+ md->recursive = new_recursive.prevrec;
+ if (new_recursive.offset_save != stacksave)
+ (pcre_free)(new_recursive.offset_save);
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never reaches here */
+
+ /* "Once" brackets are like assertion brackets except that after a match,
+ the point in the subject string is not moved back. Thus there can never be
+ a move back into the brackets. Friedl calls these "atomic" subpatterns.
+ Check the alternative branches in turn - the matching won't pass the KET
+ for this kind of subpattern. If any one branch matches, we carry on as at
+ the end of a normal bracket, leaving the subject pointer. */
+
+ case OP_ONCE:
+ prev = ecode; /* Start of the group, used if it has to be re-run */
+ saved_eptr = eptr; /* Subject position on entry to the group */
+
+ /* Try each alternative in turn, stopping at the first one that matches.
+ Any result other than MATCH_NOMATCH or MATCH_THEN is passed straight
+ back to the caller. */
+
+ do
+ {
+ RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
+ if (rrc == MATCH_MATCH) break;
+ if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ ecode += GET(ecode,1);
+ }
+ while (*ecode == OP_ALT);
+
+ /* If we hit the end of the group (which could be repeated), fail */
+
+ if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
+
+ /* Continue as from after the assertion, updating the offsets high water
+ mark, since extracts may have been taken. First step over any remaining
+ alternatives so that ecode is left at the closing ket. */
+
+ do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
+
+ offset_top = md->end_offset_top;
+ eptr = md->end_match_ptr;
+
+ /* For a non-repeating ket, just continue at this level. This also
+ happens for a repeating ket if no characters were matched in the group.
+ This is the forcible breaking of infinite loops as implemented in Perl
+ 5.005. If there is an options reset, it will get obeyed in the normal
+ course of events. */
+
+ if (*ecode == OP_KET || eptr == saved_eptr)
+ {
+ ecode += 1+LINK_SIZE;
+ break;
+ }
+
+ /* The repeating kets try the rest of the pattern or restart from the
+ preceding bracket, in the appropriate order. The second "call" of match()
+ uses tail recursion, to avoid using another stack frame. We need to reset
+ any options that changed within the bracket before re-running it, so
+ check the next opcode. The ket item is 1+LINK_SIZE bytes long, so the
+ following opcode is at ecode[1+LINK_SIZE] and its data byte is the one
+ after that. The index is written relative to LINK_SIZE because a fixed
+ index of 4 is the right byte only when LINK_SIZE is 2. */
+
+ if (ecode[1+LINK_SIZE] == OP_OPT)
+ {
+ ims = (ims & ~PCRE_IMS) | ecode[2+LINK_SIZE];
+ DPRINTF(("ims set to %02lx at group repeat\n", ims));
+ }
+
+ if (*ecode == OP_KETRMIN)
+ {
+ /* Minimizing: try what follows the group first, then re-run the group */
+ RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ ecode = prev;
+ flags = 0;
+ goto TAIL_RECURSE;
+ }
+ else /* OP_KETRMAX */
+ {
+ /* Maximizing: re-run the group first, then try what follows it */
+ RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ ecode += 1 + LINK_SIZE;
+ flags = 0;
+ goto TAIL_RECURSE;
+ }
+ /* Control never gets here */
+
+ /* An alternation is the end of a branch; scan along to find the end of the
+ bracketed group and go to there. */
+
+ case OP_ALT:
+ do ecode += GET(ecode,1); while (*ecode == OP_ALT);
+ break;
+
+ /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
+ indicating that it may occur zero times. It may repeat infinitely, or not
+ at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
+ with fixed upper repeat limits are compiled as a number of copies, with the
+ optional ones preceded by BRAZERO or BRAMINZERO. */
+
+ case OP_BRAZERO:
+ {
+ next = ecode+1;
+ RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ do next += GET(next,1); while (*next == OP_ALT);
+ ecode = next + 1 + LINK_SIZE;
+ }
+ break;
+
+ case OP_BRAMINZERO:
+ {
+ next = ecode+1;
+ do next += GET(next, 1); while (*next == OP_ALT);
+ RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ ecode++;
+ }
+ break;
+
+ case OP_SKIPZERO:
+ {
+ next = ecode+1;
+ do next += GET(next,1); while (*next == OP_ALT);
+ ecode = next + 1 + LINK_SIZE;
+ }
+ break;
+
+ /* End of a group, repeated or non-repeating. */
+
+ case OP_KET:
+ case OP_KETRMIN:
+ case OP_KETRMAX:
+ prev = ecode - GET(ecode, 1);
+
+ /* If this was a group that remembered the subject start, in order to break
+ infinite repeats of empty string matches, retrieve the subject start from
+ the chain. Otherwise, set it NULL. */
+
+ if (*prev >= OP_SBRA)
+ {
+ saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
+ eptrb = eptrb->epb_prev; /* Backup to previous group */
+ }
+ else saved_eptr = NULL;
+
+ /* If we are at the end of an assertion group, stop matching and return
+ MATCH_MATCH, but record the current high water mark for use by positive
+ assertions. Do this also for the "once" (atomic) groups. */
+
+ if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
+ *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
+ *prev == OP_ONCE)
+ {
+ md->end_match_ptr = eptr; /* For ONCE */
+ md->end_offset_top = offset_top;
+ RRETURN(MATCH_MATCH);
+ }
+
+ /* For capturing groups we have to check the group number back at the start
+ and if necessary complete handling an extraction by setting the offsets and
+ bumping the high water mark. Note that whole-pattern recursion is coded as
+ a recurse into group 0, so it won't be picked up here. Instead, we catch it
+ when the OP_END is reached. Other recursion is handled here. */
+
+ if (*prev == OP_CBRA || *prev == OP_SCBRA)
+ {
+ number = GET2(prev, 1+LINK_SIZE);
+ offset = number << 1;
+
+#ifdef DEBUG
+ printf("end bracket %d", number);
+ printf("\n");
+#endif
+
+ md->capture_last = number;
+ if (offset >= md->offset_max) md->offset_overflow = TRUE; else
+ {
+ md->offset_vector[offset] =
+ md->offset_vector[md->offset_end - number];
+ md->offset_vector[offset+1] = eptr - md->start_subject;
+ if (offset_top <= offset) offset_top = offset + 2;
+ }
+
+ /* Handle a recursively called group. Restore the offsets
+ appropriately and continue from after the call. */
+
+ if (md->recursive != NULL && md->recursive->group_num == number)
+ {
+ recursion_info *rec = md->recursive;
+ DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
+ md->recursive = rec->prevrec;
+ mstart = rec->save_start;
+ memcpy(md->offset_vector, rec->offset_save,
+ rec->saved_max * sizeof(int));
+ ecode = rec->after_call;
+ ims = original_ims;
+ break;
+ }
+ }
+
+ /* For both capturing and non-capturing groups, reset the value of the ims
+ flags, in case they got changed during the group. */
+
+ ims = original_ims;
+ DPRINTF(("ims reset to %02lx\n", ims));
+
+ /* For a non-repeating ket, just continue at this level. This also
+ happens for a repeating ket if no characters were matched in the group.
+ This is the forcible breaking of infinite loops as implemented in Perl
+ 5.005. If there is an options reset, it will get obeyed in the normal
+ course of events. */
+
+ if (*ecode == OP_KET || eptr == saved_eptr)
+ {
+ ecode += 1 + LINK_SIZE;
+ break;
+ }
+
+ /* The repeating kets try the rest of the pattern or restart from the
+ preceding bracket, in the appropriate order. In the second case, we can use
+ tail recursion to avoid using another stack frame, unless we have an
+ unlimited repeat of a group that can match an empty string. */
+
+ flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
+
+ if (*ecode == OP_KETRMIN)
+ {
+ RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (flags != 0) /* Could match an empty string */
+ {
+ RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
+ RRETURN(rrc);
+ }
+ ecode = prev;
+ goto TAIL_RECURSE;
+ }
+ else /* OP_KETRMAX */
+ {
+ RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ ecode += 1 + LINK_SIZE;
+ flags = 0;
+ goto TAIL_RECURSE;
+ }
+ /* Control never gets here */
+
+ /* Start of subject unless notbol, or after internal newline if multiline */
+
+ case OP_CIRC:
+ if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
+ if ((ims & PCRE_MULTILINE) != 0)
+ {
+ if (eptr != md->start_subject &&
+ (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
+ RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+ }
+ /* ... else fall through */
+
+ /* Start of subject assertion */
+
+ case OP_SOD:
+ if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ /* Start of match assertion */
+
+ case OP_SOM:
+ if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ /* Reset the start of match point */
+
+ case OP_SET_SOM:
+ mstart = eptr;
+ ecode++;
+ break;
+
+ /* Assert before internal newline if multiline, or before a terminating
+ newline unless endonly is set, else end of subject unless noteol is set. */
+
+ case OP_DOLL:
+ if ((ims & PCRE_MULTILINE) != 0)
+ {
+ if (eptr < md->end_subject)
+ { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
+ else
+ { if (md->noteol) RRETURN(MATCH_NOMATCH); }
+ ecode++;
+ break;
+ }
+ else
+ {
+ if (md->noteol) RRETURN(MATCH_NOMATCH);
+ if (!md->endonly)
+ {
+ if (eptr != md->end_subject &&
+ (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
+ RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+ }
+ }
+ /* ... else fall through for endonly */
+
+ /* End of subject assertion (\z) */
+
+ case OP_EOD:
+ if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ /* End of subject or ending \n assertion (\Z) */
+
+ case OP_EODN:
+ if (eptr != md->end_subject &&
+ (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
+ RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ /* Word boundary assertions */
+
+ case OP_NOT_WORD_BOUNDARY:
+ case OP_WORD_BOUNDARY:
+ {
+
+ /* Find out if the previous and current characters are "word" characters.
+ It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
+ be "non-word" characters. */
+
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ if (eptr == md->start_subject) prev_is_word = FALSE; else
+ {
+ const uschar *lastptr = eptr - 1;
+ while((*lastptr & 0xc0) == 0x80) lastptr--;
+ GETCHAR(c, lastptr);
+ prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
+ }
+ if (eptr >= md->end_subject) cur_is_word = FALSE; else
+ {
+ GETCHAR(c, eptr);
+ cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
+ }
+ }
+ else
+#endif
+
+ /* More streamlined when not in UTF-8 mode */
+
+ {
+ prev_is_word = (eptr != md->start_subject) &&
+ ((md->ctypes[eptr[-1]] & ctype_word) != 0);
+ cur_is_word = (eptr < md->end_subject) &&
+ ((md->ctypes[*eptr] & ctype_word) != 0);
+ }
+
+ /* Now see if the situation is what we want */
+
+ if ((*ecode++ == OP_WORD_BOUNDARY)?
+ cur_is_word == prev_is_word : cur_is_word != prev_is_word)
+ RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ /* Match a single character type; inline for speed */
+
+ case OP_ANY:
+ if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
+ /* Fall through */
+
+ case OP_ALLANY:
+ if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ ecode++;
+ break;
+
+ /* Match a single byte, even in UTF-8 mode. This opcode really does match
+ any byte, even newline, independent of the setting of PCRE_DOTALL. */
+
+ case OP_ANYBYTE:
+ if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ case OP_NOT_DIGIT:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ if (
+#ifdef SUPPORT_UTF8
+ c < 256 &&
+#endif
+ (md->ctypes[c] & ctype_digit) != 0
+ )
+ RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ case OP_DIGIT:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ if (
+#ifdef SUPPORT_UTF8
+ c >= 256 ||
+#endif
+ (md->ctypes[c] & ctype_digit) == 0
+ )
+ RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ case OP_NOT_WHITESPACE:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ if (
+#ifdef SUPPORT_UTF8
+ c < 256 &&
+#endif
+ (md->ctypes[c] & ctype_space) != 0
+ )
+ RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ case OP_WHITESPACE:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ if (
+#ifdef SUPPORT_UTF8
+ c >= 256 ||
+#endif
+ (md->ctypes[c] & ctype_space) == 0
+ )
+ RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ case OP_NOT_WORDCHAR:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ if (
+#ifdef SUPPORT_UTF8
+ c < 256 &&
+#endif
+ (md->ctypes[c] & ctype_word) != 0
+ )
+ RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ case OP_WORDCHAR:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ if (
+#ifdef SUPPORT_UTF8
+ c >= 256 ||
+#endif
+ (md->ctypes[c] & ctype_word) == 0
+ )
+ RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ case OP_ANYNL:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ switch(c)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x000d:
+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
+ break;
+
+ case 0x000a:
+ break;
+
+ case 0x000b:
+ case 0x000c:
+ case 0x0085:
+ case 0x2028:
+ case 0x2029:
+ if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+ break;
+ }
+ ecode++;
+ break;
+
+ case OP_NOT_HSPACE:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ switch(c)
+ {
+ default: break;
+ case 0x09: /* HT */
+ case 0x20: /* SPACE */
+ case 0xa0: /* NBSP */
+ case 0x1680: /* OGHAM SPACE MARK */
+ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
+ case 0x2000: /* EN QUAD */
+ case 0x2001: /* EM QUAD */
+ case 0x2002: /* EN SPACE */
+ case 0x2003: /* EM SPACE */
+ case 0x2004: /* THREE-PER-EM SPACE */
+ case 0x2005: /* FOUR-PER-EM SPACE */
+ case 0x2006: /* SIX-PER-EM SPACE */
+ case 0x2007: /* FIGURE SPACE */
+ case 0x2008: /* PUNCTUATION SPACE */
+ case 0x2009: /* THIN SPACE */
+ case 0x200A: /* HAIR SPACE */
+ case 0x202f: /* NARROW NO-BREAK SPACE */
+ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
+ case 0x3000: /* IDEOGRAPHIC SPACE */
+ RRETURN(MATCH_NOMATCH);
+ }
+ ecode++;
+ break;
+
+ case OP_HSPACE:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ switch(c)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x09: /* HT */
+ case 0x20: /* SPACE */
+ case 0xa0: /* NBSP */
+ case 0x1680: /* OGHAM SPACE MARK */
+ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
+ case 0x2000: /* EN QUAD */
+ case 0x2001: /* EM QUAD */
+ case 0x2002: /* EN SPACE */
+ case 0x2003: /* EM SPACE */
+ case 0x2004: /* THREE-PER-EM SPACE */
+ case 0x2005: /* FOUR-PER-EM SPACE */
+ case 0x2006: /* SIX-PER-EM SPACE */
+ case 0x2007: /* FIGURE SPACE */
+ case 0x2008: /* PUNCTUATION SPACE */
+ case 0x2009: /* THIN SPACE */
+ case 0x200A: /* HAIR SPACE */
+ case 0x202f: /* NARROW NO-BREAK SPACE */
+ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
+ case 0x3000: /* IDEOGRAPHIC SPACE */
+ break;
+ }
+ ecode++;
+ break;
+
+ case OP_NOT_VSPACE:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ switch(c)
+ {
+ default: break;
+ case 0x0a: /* LF */
+ case 0x0b: /* VT */
+ case 0x0c: /* FF */
+ case 0x0d: /* CR */
+ case 0x85: /* NEL */
+ case 0x2028: /* LINE SEPARATOR */
+ case 0x2029: /* PARAGRAPH SEPARATOR */
+ RRETURN(MATCH_NOMATCH);
+ }
+ ecode++;
+ break;
+
+ case OP_VSPACE:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ switch(c)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x0a: /* LF */
+ case 0x0b: /* VT */
+ case 0x0c: /* FF */
+ case 0x0d: /* CR */
+ case 0x85: /* NEL */
+ case 0x2028: /* LINE SEPARATOR */
+ case 0x2029: /* PARAGRAPH SEPARATOR */
+ break;
+ }
+ ecode++;
+ break;
+
+#ifdef SUPPORT_UCP
+ /* Check the next character by Unicode property. We will get here only
+ if the support is in the binary; otherwise a compile-time error occurs. */
+
+ case OP_PROP:
+ case OP_NOTPROP:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ {
+ int chartype, script;
+ int category = _pcre_ucp_findprop(c, &chartype, &script);
+
+ switch(ecode[1])
+ {
+ case PT_ANY:
+ if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case PT_LAMP:
+ if ((chartype == ucp_Lu ||
+ chartype == ucp_Ll ||
+ chartype == ucp_Lt) == (op == OP_NOTPROP))
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ case PT_GC:
+ if ((ecode[2] != category) == (op == OP_PROP))
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ case PT_PC:
+ if ((ecode[2] != chartype) == (op == OP_PROP))
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ case PT_SC:
+ if ((ecode[2] != script) == (op == OP_PROP))
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
+ }
+
+ ecode += 3;
+ }
+ break;
+
+ /* Match an extended Unicode sequence. We will get here only if the support
+ is in the binary; otherwise a compile-time error occurs. */
+
+ case OP_EXTUNI:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ {
+ int chartype, script;
+ int category = _pcre_ucp_findprop(c, &chartype, &script);
+ if (category == ucp_M) RRETURN(MATCH_NOMATCH);
+ while (eptr < md->end_subject)
+ {
+ int len = 1;
+ if (!utf8) c = *eptr; else
+ {
+ GETCHARLEN(c, eptr, len);
+ }
+ category = _pcre_ucp_findprop(c, &chartype, &script);
+ if (category != ucp_M) break;
+ eptr += len;
+ }
+ }
+ ecode++;
+ break;
+#endif
+
+
+ /* Match a back reference, possibly repeatedly. Look past the end of the
+ item to see if there is repeat information following. The code is similar
+ to that for character classes, but repeated for efficiency. Then obey
+ similar code to character type repeats - written out again for speed.
+ However, if the referenced string is the empty string, always treat
+ it as matched, any number of times (otherwise there could be infinite
+ loops). */
+
+ case OP_REF:
+ {
+ offset = GET2(ecode, 1) << 1; /* Doubled ref number */
+ ecode += 3;
+
+ /* If the reference is unset, there are two possibilities:
+
+ (a) In the default, Perl-compatible state, set the length to be longer
+ than the amount of subject left; this ensures that every attempt at a
+ match fails. We can't just fail here, because of the possibility of
+ quantifiers with zero minima.
+
+ (b) If the JavaScript compatibility flag is set, set the length to zero
+ so that the back reference matches an empty string.
+
+ Otherwise, set the length to the length of what was matched by the
+ referenced subpattern. */
+
+ if (offset >= offset_top || md->offset_vector[offset] < 0)
+ length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
+ else
+ length = md->offset_vector[offset+1] - md->offset_vector[offset];
+
+ /* Set up for repetition, or handle the non-repeated case */
+
+ switch (*ecode)
+ {
+ case OP_CRSTAR:
+ case OP_CRMINSTAR:
+ case OP_CRPLUS:
+ case OP_CRMINPLUS:
+ case OP_CRQUERY:
+ case OP_CRMINQUERY:
+ c = *ecode++ - OP_CRSTAR;
+ minimize = (c & 1) != 0;
+ min = rep_min[c]; /* Pick up values from tables; */
+ max = rep_max[c]; /* zero for max => infinity */
+ if (max == 0) max = INT_MAX;
+ break;
+
+ case OP_CRRANGE:
+ case OP_CRMINRANGE:
+ minimize = (*ecode == OP_CRMINRANGE);
+ min = GET2(ecode, 1);
+ max = GET2(ecode, 3);
+ if (max == 0) max = INT_MAX;
+ ecode += 5;
+ break;
+
+ default: /* No repeat follows */
+ if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
+ eptr += length;
+ continue; /* With the main loop */
+ }
+
+ /* If the length of the reference is zero, just continue with the
+ main loop. */
+
+ if (length == 0) continue;
+
+ /* First, ensure the minimum number of matches are present. We get back
+ the length of the reference string explicitly rather than passing the
+ address of eptr, so that eptr can be a register variable. */
+
+ for (i = 1; i <= min; i++)
+ {
+ if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
+ eptr += length;
+ }
+
+ /* If min = max, continue at the same level without recursion.
+ They are not both allowed to be zero. */
+
+ if (min == max) continue;
+
+ /* If minimizing, keep trying and advancing the pointer */
+
+ if (minimize)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || !match_ref(offset, eptr, length, md, ims))
+ RRETURN(MATCH_NOMATCH);
+ eptr += length;
+ }
+ /* Control never gets here */
+ }
+
+ /* If maximizing, find the longest string and work backwards */
+
+ else
+ {
+ pp = eptr;
+ for (i = min; i < max; i++)
+ {
+ if (!match_ref(offset, eptr, length, md, ims)) break;
+ eptr += length;
+ }
+ while (eptr >= pp)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ eptr -= length;
+ }
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ /* Control never gets here */
+
+
+
+ /* Match a bit-mapped character class, possibly repeatedly. This op code is
+ used when all the characters in the class have values in the range 0-255,
+ and either the matching is caseful, or the characters are in the range
+ 0-127 when UTF-8 processing is enabled. The only difference between
+ OP_CLASS and OP_NCLASS occurs when a data character outside the range is
+ encountered.
+
+ First, look past the end of the item to see if there is repeat information
+ following. Then obey similar code to character type repeats - written out
+ again for speed. */
+
+ case OP_NCLASS:
+ case OP_CLASS:
+ {
+ data = ecode + 1; /* Save for matching */
+ ecode += 33; /* Advance past the item */
+
+ switch (*ecode)
+ {
+ case OP_CRSTAR:
+ case OP_CRMINSTAR:
+ case OP_CRPLUS:
+ case OP_CRMINPLUS:
+ case OP_CRQUERY:
+ case OP_CRMINQUERY:
+ c = *ecode++ - OP_CRSTAR;
+ minimize = (c & 1) != 0;
+ min = rep_min[c]; /* Pick up values from tables; */
+ max = rep_max[c]; /* zero for max => infinity */
+ if (max == 0) max = INT_MAX;
+ break;
+
+ case OP_CRRANGE:
+ case OP_CRMINRANGE:
+ minimize = (*ecode == OP_CRMINRANGE);
+ min = GET2(ecode, 1);
+ max = GET2(ecode, 3);
+ if (max == 0) max = INT_MAX;
+ ecode += 5;
+ break;
+
+ default: /* No repeat follows */
+ min = max = 1;
+ break;
+ }
+
+ /* First, ensure the minimum number of matches are present. */
+
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (utf8)
+ {
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ if (c > 255)
+ {
+ if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
+ }
+ else
+ {
+ if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ }
+ }
+ }
+ else
+#endif
+ /* Not UTF-8 mode */
+ {
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ c = *eptr++;
+ if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ }
+ }
+
+ /* If max == min we can continue with the main loop without the
+ need to recurse. */
+
+ if (min == max) continue;
+
+ /* If minimizing, keep testing the rest of the expression and advancing
+ the pointer while it matches the class. */
+
+ if (minimize)
+ {
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (utf8)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ if (c > 255)
+ {
+ if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
+ }
+ else
+ {
+ if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ }
+ }
+ }
+ else
+#endif
+ /* Not UTF-8 mode */
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ c = *eptr++;
+ if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ }
+ }
+ /* Control never gets here */
+ }
+
+ /* If maximizing, find the longest possible run, then work backwards. */
+
+ else
+ {
+ pp = eptr;
+
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (utf8)
+ {
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ if (c > 255)
+ {
+ if (op == OP_CLASS) break;
+ }
+ else
+ {
+ if ((data[c/8] & (1 << (c&7))) == 0) break;
+ }
+ eptr += len;
+ }
+ for (;;)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
+ BACKCHAR(eptr);
+ }
+ }
+ else
+#endif
+ /* Not UTF-8 mode */
+ {
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject) break;
+ c = *eptr;
+ if ((data[c/8] & (1 << (c&7))) == 0) break;
+ eptr++;
+ }
+ while (eptr >= pp)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ eptr--;
+ }
+ }
+
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ /* Control never gets here */
+
+
+ /* Match an extended character class. This opcode is encountered only
+ in UTF-8 mode, because that's the only time it is compiled. */
+
+#ifdef SUPPORT_UTF8
+ case OP_XCLASS:
+ {
+ data = ecode + 1 + LINK_SIZE; /* Save for matching */
+ ecode += GET(ecode, 1); /* Advance past the item */
+
+ switch (*ecode)
+ {
+ case OP_CRSTAR:
+ case OP_CRMINSTAR:
+ case OP_CRPLUS:
+ case OP_CRMINPLUS:
+ case OP_CRQUERY:
+ case OP_CRMINQUERY:
+ c = *ecode++ - OP_CRSTAR;
+ minimize = (c & 1) != 0;
+ min = rep_min[c]; /* Pick up values from tables; */
+ max = rep_max[c]; /* zero for max => infinity */
+ if (max == 0) max = INT_MAX;
+ break;
+
+ case OP_CRRANGE:
+ case OP_CRMINRANGE:
+ minimize = (*ecode == OP_CRMINRANGE);
+ min = GET2(ecode, 1);
+ max = GET2(ecode, 3);
+ if (max == 0) max = INT_MAX;
+ ecode += 5;
+ break;
+
+ default: /* No repeat follows */
+ min = max = 1;
+ break;
+ }
+
+ /* First, ensure the minimum number of matches are present. */
+
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
+ }
+
+ /* If max == min we can continue with the main loop without the
+ need to recurse. */
+
+ if (min == max) continue;
+
+ /* If minimizing, keep testing the rest of the expression and advancing
+ the pointer while it matches the class. */
+
+ if (minimize)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+ }
+
+ /* If maximizing, find the longest possible run, then work backwards. */
+
+ else
+ {
+ pp = eptr;
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ if (!_pcre_xclass(c, data)) break;
+ eptr += len;
+ }
+ for(;;)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
+ if (utf8) BACKCHAR(eptr);
+ }
+ RRETURN(MATCH_NOMATCH);
+ }
+
+ /* Control never gets here */
+ }
+#endif /* End of XCLASS */
+
+ /* Match a single character, casefully */
+
+ case OP_CHAR:
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ length = 1;
+ ecode++;
+ GETCHARLEN(fc, ecode, length);
+ if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
+ while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
+ }
+ else
+#endif
+
+ /* Non-UTF-8 mode */
+ {
+ if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
+ if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
+ ecode += 2;
+ }
+ break;
+
+ /* Match a single character, caselessly */
+
+ case OP_CHARNC:
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ length = 1;
+ ecode++;
+ GETCHARLEN(fc, ecode, length);
+
+ if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
+
+ /* If the pattern character's value is < 128, we have only one byte, and
+ can use the fast lookup table. */
+
+ if (fc < 128)
+ {
+ if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ }
+
+ /* Otherwise we must pick up the subject character */
+
+ else
+ {
+ unsigned int dc;
+ GETCHARINC(dc, eptr);
+ ecode += length;
+
+ /* If we have Unicode property support, we can use it to test the other
+ case of the character, if there is one. */
+
+ if (fc != dc)
+ {
+#ifdef SUPPORT_UCP
+ if (dc != _pcre_ucp_othercase(fc))
+#endif
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ }
+ else
+#endif /* SUPPORT_UTF8 */
+
+ /* Non-UTF-8 mode */
+ {
+ if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
+ if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ ecode += 2;
+ }
+ break;
+
+ /* Match a single character repeatedly. */
+
+ case OP_EXACT:
+ min = max = GET2(ecode, 1);
+ ecode += 3;
+ goto REPEATCHAR;
+
+ case OP_POSUPTO:
+ possessive = TRUE;
+ /* Fall through */
+
+ case OP_UPTO:
+ case OP_MINUPTO:
+ min = 0;
+ max = GET2(ecode, 1);
+ minimize = *ecode == OP_MINUPTO;
+ ecode += 3;
+ goto REPEATCHAR;
+
+ case OP_POSSTAR:
+ possessive = TRUE;
+ min = 0;
+ max = INT_MAX;
+ ecode++;
+ goto REPEATCHAR;
+
+ case OP_POSPLUS:
+ possessive = TRUE;
+ min = 1;
+ max = INT_MAX;
+ ecode++;
+ goto REPEATCHAR;
+
+ case OP_POSQUERY:
+ possessive = TRUE;
+ min = 0;
+ max = 1;
+ ecode++;
+ goto REPEATCHAR;
+
+ case OP_STAR:
+ case OP_MINSTAR:
+ case OP_PLUS:
+ case OP_MINPLUS:
+ case OP_QUERY:
+ case OP_MINQUERY:
+ c = *ecode++ - OP_STAR;
+ minimize = (c & 1) != 0;
+ min = rep_min[c]; /* Pick up values from tables; */
+ max = rep_max[c]; /* zero for max => infinity */
+ if (max == 0) max = INT_MAX;
+
+ /* Common code for all repeated single-character matches. We can give
+ up quickly if there are fewer than the minimum number of characters left in
+ the subject. */
+
+ REPEATCHAR:
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ length = 1;
+ charptr = ecode;
+ GETCHARLEN(fc, ecode, length);
+ if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
+ ecode += length;
+
+ /* Handle multibyte character matching specially here. There is
+ support for caseless matching if UCP support is present. */
+
+ if (length > 1)
+ {
+#ifdef SUPPORT_UCP
+ unsigned int othercase;
+ if ((ims & PCRE_CASELESS) != 0 &&
+ (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
+ oclength = _pcre_ord2utf8(othercase, occhars);
+ else oclength = 0;
+#endif /* SUPPORT_UCP */
+
+ for (i = 1; i <= min; i++)
+ {
+ if (memcmp(eptr, charptr, length) == 0) eptr += length;
+#ifdef SUPPORT_UCP
+ /* Need braces because of following else */
+ else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
+ else
+ {
+ if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
+ eptr += oclength;
+ }
+#else /* without SUPPORT_UCP */
+ else { RRETURN(MATCH_NOMATCH); }
+#endif /* SUPPORT_UCP */
+ }
+
+ if (min == max) continue;
+
+ if (minimize)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ if (memcmp(eptr, charptr, length) == 0) eptr += length;
+#ifdef SUPPORT_UCP
+ /* Need braces because of following else */
+ else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
+ else
+ {
+ if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
+ eptr += oclength;
+ }
+#else /* without SUPPORT_UCP */
+ else { RRETURN (MATCH_NOMATCH); }
+#endif /* SUPPORT_UCP */
+ }
+ /* Control never gets here */
+ }
+
+ else /* Maximize */
+ {
+ pp = eptr;
+ for (i = min; i < max; i++)
+ {
+ if (eptr > md->end_subject - length) break;
+ if (memcmp(eptr, charptr, length) == 0) eptr += length;
+#ifdef SUPPORT_UCP
+ else if (oclength == 0) break;
+ else
+ {
+ if (memcmp(eptr, occhars, oclength) != 0) break;
+ eptr += oclength;
+ }
+#else /* without SUPPORT_UCP */
+ else break;
+#endif /* SUPPORT_UCP */
+ }
+
+ if (possessive) continue;
+ for(;;)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (eptr == pp) RRETURN(MATCH_NOMATCH);
+#ifdef SUPPORT_UCP
+ eptr--;
+ BACKCHAR(eptr);
+#else /* without SUPPORT_UCP */
+ eptr -= length;
+#endif /* SUPPORT_UCP */
+ }
+ }
+ /* Control never gets here */
+ }
+
+ /* If the length of a UTF-8 character is 1, we fall through here, and
+ obey the code as for non-UTF-8 characters below, though in this case the
+ value of fc will always be < 128. */
+ }
+ else
+#endif /* SUPPORT_UTF8 */
+
+ /* When not in UTF-8 mode, load a single-byte character. */
+ {
+ if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
+ fc = *ecode++;
+ }
+
+ /* The value of fc at this point is always less than 256, though we may or
+ may not be in UTF-8 mode. The code is duplicated for the caseless and
+ caseful cases, for speed, since matching characters is likely to be quite
+ common. First, ensure the minimum number of matches are present. If min =
+ max, continue at the same level without recursing. Otherwise, if
+ minimizing, keep trying the rest of the expression and advancing one
+ matching character if failing, up to the maximum. Alternatively, if
+ maximizing, find the maximum number of characters and work backwards. */
+
+ DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
+ max, eptr));
+
+ if ((ims & PCRE_CASELESS) != 0)
+ {
+ fc = md->lcc[fc];
+ for (i = 1; i <= min; i++)
+ if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ if (min == max) continue;
+ if (minimize)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject ||
+ fc != md->lcc[*eptr++])
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+ }
+ else /* Maximize */
+ {
+ pp = eptr;
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
+ eptr++;
+ }
+ if (possessive) continue;
+ while (eptr >= pp)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
+ eptr--;
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ }
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+ }
+
+ /* Caseful comparisons (fc is a single byte; multibyte handled above) */
+
+ else
+ {
+ for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
+ if (min == max) continue;
+ if (minimize)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+ }
+ else /* Maximize */
+ {
+ pp = eptr;
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || fc != *eptr) break;
+ eptr++;
+ }
+ if (possessive) continue;
+ while (eptr >= pp)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
+ eptr--;
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ }
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ /* Control never gets here */
+
+ /* Match a negated single one-byte character. The character we are
+ checking can be multibyte. */
+
+ case OP_NOT:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ ecode++;
+ GETCHARINCTEST(c, eptr);
+ if ((ims & PCRE_CASELESS) != 0)
+ {
+#ifdef SUPPORT_UTF8
+ if (c < 256)
+#endif
+ c = md->lcc[c];
+ if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
+ }
+ else
+ {
+ if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ /* Match a negated single one-byte character repeatedly. This is almost a
+ repeat of the code for a repeated single character, but I haven't found a
+ nice way of commoning these up that doesn't require a test of the
+ positive/negative option for each character match. Maybe that wouldn't add
+ very much to the time taken, but character matching *is* what this is all
+ about... */
+
+ case OP_NOTEXACT:
+ min = max = GET2(ecode, 1);
+ ecode += 3;
+ goto REPEATNOTCHAR;
+
+ case OP_NOTUPTO:
+ case OP_NOTMINUPTO:
+ min = 0;
+ max = GET2(ecode, 1);
+ minimize = *ecode == OP_NOTMINUPTO;
+ ecode += 3;
+ goto REPEATNOTCHAR;
+
+ case OP_NOTPOSSTAR:
+ possessive = TRUE;
+ min = 0;
+ max = INT_MAX;
+ ecode++;
+ goto REPEATNOTCHAR;
+
+ case OP_NOTPOSPLUS:
+ possessive = TRUE;
+ min = 1;
+ max = INT_MAX;
+ ecode++;
+ goto REPEATNOTCHAR;
+
+ case OP_NOTPOSQUERY:
+ possessive = TRUE;
+ min = 0;
+ max = 1;
+ ecode++;
+ goto REPEATNOTCHAR;
+
+ case OP_NOTPOSUPTO:
+ possessive = TRUE;
+ min = 0;
+ max = GET2(ecode, 1);
+ ecode += 3;
+ goto REPEATNOTCHAR;
+
+ case OP_NOTSTAR:
+ case OP_NOTMINSTAR:
+ case OP_NOTPLUS:
+ case OP_NOTMINPLUS:
+ case OP_NOTQUERY:
+ case OP_NOTMINQUERY:
+ c = *ecode++ - OP_NOTSTAR;
+ minimize = (c & 1) != 0;
+ min = rep_min[c]; /* Pick up values from tables; */
+ max = rep_max[c]; /* zero for max => infinity */
+ if (max == 0) max = INT_MAX;
+
+ /* Common code for all repeated single-byte matches. We can give up quickly
+ if there are fewer than the minimum number of bytes left in the
+ subject. */
+
+ REPEATNOTCHAR:
+ if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
+ fc = *ecode++;
+
+ /* The code is duplicated for the caseless and caseful cases, for speed,
+ since matching characters is likely to be quite common. First, ensure the
+ minimum number of matches are present. If min = max, continue at the same
+ level without recursing. Otherwise, if minimizing, keep trying the rest of
+ the expression and advancing one matching character if failing, up to the
+ maximum. Alternatively, if maximizing, find the maximum number of
+ characters and work backwards. */
+
+ DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
+ max, eptr));
+
+ if ((ims & PCRE_CASELESS) != 0)
+ {
+ fc = md->lcc[fc];
+
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (utf8)
+ {
+ register unsigned int d;
+ for (i = 1; i <= min; i++)
+ {
+ GETCHARINC(d, eptr);
+ if (d < 256) d = md->lcc[d];
+ if (fc == d) RRETURN(MATCH_NOMATCH);
+ }
+ }
+ else
+#endif
+
+ /* Not UTF-8 mode */
+ {
+ for (i = 1; i <= min; i++)
+ if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ }
+
+ if (min == max) continue;
+
+ if (minimize)
+ {
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (utf8)
+ {
+ register unsigned int d;
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ GETCHARINC(d, eptr);
+ if (d < 256) d = md->lcc[d];
+ if (fi >= max || eptr >= md->end_subject || fc == d)
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ else
+#endif
+ /* Not UTF-8 mode */
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ /* Control never gets here */
+ }
+
+ /* Maximize case */
+
+ else
+ {
+ pp = eptr;
+
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (utf8)
+ {
+ register unsigned int d;
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(d, eptr, len);
+ if (d < 256) d = md->lcc[d];
+ if (fc == d) break;
+ eptr += len;
+ }
+ if (possessive) continue;
+ for(;;)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
+ BACKCHAR(eptr);
+ }
+ }
+ else
+#endif
+ /* Not UTF-8 mode */
+ {
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
+ eptr++;
+ }
+ if (possessive) continue;
+ while (eptr >= pp)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ eptr--;
+ }
+ }
+
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+ }
+
+ /* Caseful comparisons */
+
+ else
+ {
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (utf8)
+ {
+ register unsigned int d;
+ for (i = 1; i <= min; i++)
+ {
+ GETCHARINC(d, eptr);
+ if (fc == d) RRETURN(MATCH_NOMATCH);
+ }
+ }
+ else
+#endif
+ /* Not UTF-8 mode */
+ {
+ for (i = 1; i <= min; i++)
+ if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
+ }
+
+ if (min == max) continue;
+
+ if (minimize)
+ {
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (utf8)
+ {
+ register unsigned int d;
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ GETCHARINC(d, eptr);
+ if (fi >= max || eptr >= md->end_subject || fc == d)
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ else
+#endif
+ /* Not UTF-8 mode */
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ /* Control never gets here */
+ }
+
+ /* Maximize case */
+
+ else
+ {
+ pp = eptr;
+
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (utf8)
+ {
+ register unsigned int d;
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(d, eptr, len);
+ if (fc == d) break;
+ eptr += len;
+ }
+ if (possessive) continue;
+ for(;;)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
+ BACKCHAR(eptr);
+ }
+ }
+ else
+#endif
+ /* Not UTF-8 mode */
+ {
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || fc == *eptr) break;
+ eptr++;
+ }
+ if (possessive) continue;
+ while (eptr >= pp)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ eptr--;
+ }
+ }
+
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ /* Control never gets here */
+
+ /* Match a single character type repeatedly; several different opcodes
+ share code. This is very similar to the code for single characters, but we
+ repeat it in the interests of efficiency. */
+
+ case OP_TYPEEXACT:
+ min = max = GET2(ecode, 1);
+ minimize = TRUE;
+ ecode += 3;
+ goto REPEATTYPE;
+
+ case OP_TYPEUPTO:
+ case OP_TYPEMINUPTO:
+ min = 0;
+ max = GET2(ecode, 1);
+ minimize = *ecode == OP_TYPEMINUPTO;
+ ecode += 3;
+ goto REPEATTYPE;
+
+ case OP_TYPEPOSSTAR:
+ possessive = TRUE;
+ min = 0;
+ max = INT_MAX;
+ ecode++;
+ goto REPEATTYPE;
+
+ case OP_TYPEPOSPLUS:
+ possessive = TRUE;
+ min = 1;
+ max = INT_MAX;
+ ecode++;
+ goto REPEATTYPE;
+
+ case OP_TYPEPOSQUERY:
+ possessive = TRUE;
+ min = 0;
+ max = 1;
+ ecode++;
+ goto REPEATTYPE;
+
+ case OP_TYPEPOSUPTO:
+ possessive = TRUE;
+ min = 0;
+ max = GET2(ecode, 1);
+ ecode += 3;
+ goto REPEATTYPE;
+
+ case OP_TYPESTAR:
+ case OP_TYPEMINSTAR:
+ case OP_TYPEPLUS:
+ case OP_TYPEMINPLUS:
+ case OP_TYPEQUERY:
+ case OP_TYPEMINQUERY:
+ c = *ecode++ - OP_TYPESTAR;
+ minimize = (c & 1) != 0;
+ min = rep_min[c]; /* Pick up values from tables; */
+ max = rep_max[c]; /* zero for max => infinity */
+ if (max == 0) max = INT_MAX;
+
+ /* Common code for all repeated single character type matches. Note that
+ in UTF-8 mode, '.' matches a character of any length, but for the other
+ character types, the valid characters are all one-byte long. */
+
+ REPEATTYPE:
+ ctype = *ecode++; /* Code for the character type */
+
+#ifdef SUPPORT_UCP
+ if (ctype == OP_PROP || ctype == OP_NOTPROP)
+ {
+ prop_fail_result = ctype == OP_NOTPROP;
+ prop_type = *ecode++;
+ prop_value = *ecode++;
+ }
+ else prop_type = -1;
+#endif
+
+ /* First, ensure the minimum number of matches are present. Use inline
+ code for maximizing the speed, and do the type test once at the start
+ (i.e. keep it out of the loop). Also we can test that there are at least
+ the minimum number of bytes before we start. This isn't as effective in
+ UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
+ is tidier. Also separate the UCP code, which can be the same for both UTF-8
+ and single-bytes. */
+
+ if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
+ if (min > 0)
+ {
+#ifdef SUPPORT_UCP
+ if (prop_type >= 0)
+ {
+ switch(prop_type)
+ {
+ case PT_ANY:
+ if (prop_fail_result) RRETURN(MATCH_NOMATCH);
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ }
+ break;
+
+ case PT_LAMP:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ if ((prop_chartype == ucp_Lu ||
+ prop_chartype == ucp_Ll ||
+ prop_chartype == ucp_Lt) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case PT_GC:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ if ((prop_category == prop_value) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case PT_PC:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ if ((prop_chartype == prop_value) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case PT_SC:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ if ((prop_script == prop_value) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
+ }
+ }
+
+ /* Match extended Unicode sequences. We will get here only if the
+ support is in the binary; otherwise a compile-time error occurs. */
+
+ else if (ctype == OP_EXTUNI)
+ {
+ for (i = 1; i <= min; i++)
+ {
+ GETCHARINCTEST(c, eptr);
+ prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
+ while (eptr < md->end_subject)
+ {
+ int len = 1;
+ if (!utf8) c = *eptr; else
+ {
+ GETCHARLEN(c, eptr, len);
+ }
+ prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ if (prop_category != ucp_M) break;
+ eptr += len;
+ }
+ }
+ }
+
+ else
+#endif /* SUPPORT_UCP */
+
+/* Handle all other cases when the coding is UTF-8 */
+
+#ifdef SUPPORT_UTF8
+ if (utf8) switch(ctype)
+ {
+ case OP_ANY:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr))
+ RRETURN(MATCH_NOMATCH);
+ eptr++;
+ while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ }
+ break;
+
+ case OP_ALLANY:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ eptr++;
+ while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ }
+ break;
+
+ case OP_ANYBYTE:
+ eptr += min;
+ break;
+
+ case OP_ANYNL:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ switch(c)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x000d:
+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
+ break;
+
+ case 0x000a:
+ break;
+
+ case 0x000b:
+ case 0x000c:
+ case 0x0085:
+ case 0x2028:
+ case 0x2029:
+ if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+ break;
+ }
+ }
+ break;
+
+ case OP_NOT_HSPACE:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ switch(c)
+ {
+ default: break;
+ case 0x09: /* HT */
+ case 0x20: /* SPACE */
+ case 0xa0: /* NBSP */
+ case 0x1680: /* OGHAM SPACE MARK */
+ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
+ case 0x2000: /* EN QUAD */
+ case 0x2001: /* EM QUAD */
+ case 0x2002: /* EN SPACE */
+ case 0x2003: /* EM SPACE */
+ case 0x2004: /* THREE-PER-EM SPACE */
+ case 0x2005: /* FOUR-PER-EM SPACE */
+ case 0x2006: /* SIX-PER-EM SPACE */
+ case 0x2007: /* FIGURE SPACE */
+ case 0x2008: /* PUNCTUATION SPACE */
+ case 0x2009: /* THIN SPACE */
+ case 0x200A: /* HAIR SPACE */
+ case 0x202f: /* NARROW NO-BREAK SPACE */
+ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
+ case 0x3000: /* IDEOGRAPHIC SPACE */
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ break;
+
+ case OP_HSPACE:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ switch(c)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x09: /* HT */
+ case 0x20: /* SPACE */
+ case 0xa0: /* NBSP */
+ case 0x1680: /* OGHAM SPACE MARK */
+ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
+ case 0x2000: /* EN QUAD */
+ case 0x2001: /* EM QUAD */
+ case 0x2002: /* EN SPACE */
+ case 0x2003: /* EM SPACE */
+ case 0x2004: /* THREE-PER-EM SPACE */
+ case 0x2005: /* FOUR-PER-EM SPACE */
+ case 0x2006: /* SIX-PER-EM SPACE */
+ case 0x2007: /* FIGURE SPACE */
+ case 0x2008: /* PUNCTUATION SPACE */
+ case 0x2009: /* THIN SPACE */
+ case 0x200A: /* HAIR SPACE */
+ case 0x202f: /* NARROW NO-BREAK SPACE */
+ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
+ case 0x3000: /* IDEOGRAPHIC SPACE */
+ break;
+ }
+ }
+ break;
+
+ case OP_NOT_VSPACE:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ switch(c)
+ {
+ default: break;
+ case 0x0a: /* LF */
+ case 0x0b: /* VT */
+ case 0x0c: /* FF */
+ case 0x0d: /* CR */
+ case 0x85: /* NEL */
+ case 0x2028: /* LINE SEPARATOR */
+ case 0x2029: /* PARAGRAPH SEPARATOR */
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ break;
+
+ case OP_VSPACE:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ switch(c)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x0a: /* LF */
+ case 0x0b: /* VT */
+ case 0x0c: /* FF */
+ case 0x0d: /* CR */
+ case 0x85: /* NEL */
+ case 0x2028: /* LINE SEPARATOR */
+ case 0x2029: /* PARAGRAPH SEPARATOR */
+ break;
+ }
+ }
+ break;
+
+ case OP_NOT_DIGIT:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
+ RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case OP_DIGIT:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject ||
+ *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
+ RRETURN(MATCH_NOMATCH);
+ /* No need to skip more bytes - we know it's a 1-byte character */
+ }
+ break;
+
+ case OP_NOT_WHITESPACE:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject ||
+ (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
+ RRETURN(MATCH_NOMATCH);
+ while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
+ }
+ break;
+
+ case OP_WHITESPACE:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject ||
+ *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
+ RRETURN(MATCH_NOMATCH);
+ /* No need to skip more bytes - we know it's a 1-byte character */
+ }
+ break;
+
+ case OP_NOT_WORDCHAR:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject ||
+ (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
+ RRETURN(MATCH_NOMATCH);
+ while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
+ }
+ break;
+
+ case OP_WORDCHAR:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject ||
+ *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
+ RRETURN(MATCH_NOMATCH);
+ /* No need to skip more bytes - we know it's a 1-byte character */
+ }
+ break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
+ } /* End switch(ctype) */
+
+ else
+#endif /* SUPPORT_UTF8 */
+
+ /* Code for the non-UTF-8 case for minimum matching of operators other
+ than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
+ number of bytes present, as this was tested above. */
+
+ switch(ctype)
+ {
+ case OP_ANY:
+ for (i = 1; i <= min; i++)
+ {
+ if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
+ eptr++;
+ }
+ break;
+
+ case OP_ALLANY:
+ eptr += min;
+ break;
+
+ case OP_ANYBYTE:
+ eptr += min;
+ break;
+
+ /* Because of the CRLF case, we can't assume the minimum number of
+ bytes are present in this case. */
+
+ case OP_ANYNL:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ switch(*eptr++)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x000d:
+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
+ break;
+ case 0x000a:
+ break;
+
+ case 0x000b:
+ case 0x000c:
+ case 0x0085:
+ if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+ break;
+ }
+ }
+ break;
+
+ case OP_NOT_HSPACE:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ switch(*eptr++)
+ {
+ default: break;
+ case 0x09: /* HT */
+ case 0x20: /* SPACE */
+ case 0xa0: /* NBSP */
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ break;
+
+ case OP_HSPACE:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ switch(*eptr++)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x09: /* HT */
+ case 0x20: /* SPACE */
+ case 0xa0: /* NBSP */
+ break;
+ }
+ }
+ break;
+
+ case OP_NOT_VSPACE:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ switch(*eptr++)
+ {
+ default: break;
+ case 0x0a: /* LF */
+ case 0x0b: /* VT */
+ case 0x0c: /* FF */
+ case 0x0d: /* CR */
+ case 0x85: /* NEL */
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ break;
+
+ case OP_VSPACE:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ switch(*eptr++)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x0a: /* LF */
+ case 0x0b: /* VT */
+ case 0x0c: /* FF */
+ case 0x0d: /* CR */
+ case 0x85: /* NEL */
+ break;
+ }
+ }
+ break;
+
+ case OP_NOT_DIGIT:
+ for (i = 1; i <= min; i++)
+ if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_DIGIT:
+ for (i = 1; i <= min; i++)
+ if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_NOT_WHITESPACE:
+ for (i = 1; i <= min; i++)
+ if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_WHITESPACE:
+ for (i = 1; i <= min; i++)
+ if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_NOT_WORDCHAR:
+ for (i = 1; i <= min; i++)
+ if ((md->ctypes[*eptr++] & ctype_word) != 0)
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_WORDCHAR:
+ for (i = 1; i <= min; i++)
+ if ((md->ctypes[*eptr++] & ctype_word) == 0)
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
+ }
+ }
+
+ /* If min = max, continue at the same level without recursing */
+
+ if (min == max) continue;
+
+ /* If minimizing, we have to test the rest of the pattern before each
+ subsequent match. Again, separate the UTF-8 case for speed, and also
+ separate the UCP cases. */
+
+ if (minimize)
+ {
+#ifdef SUPPORT_UCP
+ if (prop_type >= 0)
+ {
+ switch(prop_type)
+ {
+ case PT_ANY:
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ if (prop_fail_result) RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
+ case PT_LAMP:
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ if ((prop_chartype == ucp_Lu ||
+ prop_chartype == ucp_Ll ||
+ prop_chartype == ucp_Lt) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
+ case PT_GC:
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ if ((prop_category == prop_value) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
+ case PT_PC:
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ if ((prop_chartype == prop_value) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
+ case PT_SC:
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ if ((prop_script == prop_value) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
+ }
+ }
+
+ /* Match extended Unicode sequences. We will get here only if the
+ support is in the binary; otherwise a compile-time error occurs. */
+
+ else if (ctype == OP_EXTUNI)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
+ while (eptr < md->end_subject)
+ {
+ int len = 1;
+ if (!utf8) c = *eptr; else
+ {
+ GETCHARLEN(c, eptr, len);
+ }
+ prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ if (prop_category != ucp_M) break;
+ eptr += len;
+ }
+ }
+ }
+
+ else
+#endif /* SUPPORT_UCP */
+
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (utf8)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject ||
+ (ctype == OP_ANY && IS_NEWLINE(eptr)))
+ RRETURN(MATCH_NOMATCH);
+
+ GETCHARINC(c, eptr);
+ switch(ctype)
+ {
+ case OP_ANY: /* This is the non-NL case */
+ case OP_ALLANY:
+ case OP_ANYBYTE:
+ break;
+
+ case OP_ANYNL:
+ switch(c)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x000d:
+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
+ break;
+ case 0x000a:
+ break;
+
+ case 0x000b:
+ case 0x000c:
+ case 0x0085:
+ case 0x2028:
+ case 0x2029:
+ if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+ break;
+ }
+ break;
+
+ case OP_NOT_HSPACE:
+ switch(c)
+ {
+ default: break;
+ case 0x09: /* HT */
+ case 0x20: /* SPACE */
+ case 0xa0: /* NBSP */
+ case 0x1680: /* OGHAM SPACE MARK */
+ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
+ case 0x2000: /* EN QUAD */
+ case 0x2001: /* EM QUAD */
+ case 0x2002: /* EN SPACE */
+ case 0x2003: /* EM SPACE */
+ case 0x2004: /* THREE-PER-EM SPACE */
+ case 0x2005: /* FOUR-PER-EM SPACE */
+ case 0x2006: /* SIX-PER-EM SPACE */
+ case 0x2007: /* FIGURE SPACE */
+ case 0x2008: /* PUNCTUATION SPACE */
+ case 0x2009: /* THIN SPACE */
+ case 0x200A: /* HAIR SPACE */
+ case 0x202f: /* NARROW NO-BREAK SPACE */
+ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
+ case 0x3000: /* IDEOGRAPHIC SPACE */
+ RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case OP_HSPACE:
+ switch(c)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x09: /* HT */
+ case 0x20: /* SPACE */
+ case 0xa0: /* NBSP */
+ case 0x1680: /* OGHAM SPACE MARK */
+ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
+ case 0x2000: /* EN QUAD */
+ case 0x2001: /* EM QUAD */
+ case 0x2002: /* EN SPACE */
+ case 0x2003: /* EM SPACE */
+ case 0x2004: /* THREE-PER-EM SPACE */
+ case 0x2005: /* FOUR-PER-EM SPACE */
+ case 0x2006: /* SIX-PER-EM SPACE */
+ case 0x2007: /* FIGURE SPACE */
+ case 0x2008: /* PUNCTUATION SPACE */
+ case 0x2009: /* THIN SPACE */
+ case 0x200A: /* HAIR SPACE */
+ case 0x202f: /* NARROW NO-BREAK SPACE */
+ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
+ case 0x3000: /* IDEOGRAPHIC SPACE */
+ break;
+ }
+ break;
+
+ case OP_NOT_VSPACE:
+ switch(c)
+ {
+ default: break;
+ case 0x0a: /* LF */
+ case 0x0b: /* VT */
+ case 0x0c: /* FF */
+ case 0x0d: /* CR */
+ case 0x85: /* NEL */
+ case 0x2028: /* LINE SEPARATOR */
+ case 0x2029: /* PARAGRAPH SEPARATOR */
+ RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case OP_VSPACE:
+ switch(c)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x0a: /* LF */
+ case 0x0b: /* VT */
+ case 0x0c: /* FF */
+ case 0x0d: /* CR */
+ case 0x85: /* NEL */
+ case 0x2028: /* LINE SEPARATOR */
+ case 0x2029: /* PARAGRAPH SEPARATOR */
+ break;
+ }
+ break;
+
+ case OP_NOT_DIGIT:
+ if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_DIGIT:
+ if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_NOT_WHITESPACE:
+ if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_WHITESPACE:
+ if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_NOT_WORDCHAR:
+ if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_WORDCHAR:
+ if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
+ }
+ }
+ }
+ else
+#endif
+ /* Not UTF-8 mode */
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject ||
+ (ctype == OP_ANY && IS_NEWLINE(eptr)))
+ RRETURN(MATCH_NOMATCH);
+
+ c = *eptr++;
+ switch(ctype)
+ {
+ case OP_ANY: /* This is the non-NL case */
+ case OP_ALLANY:
+ case OP_ANYBYTE:
+ break;
+
+ case OP_ANYNL:
+ switch(c)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x000d:
+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
+ break;
+
+ case 0x000a:
+ break;
+
+ case 0x000b:
+ case 0x000c:
+ case 0x0085:
+ if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+ break;
+ }
+ break;
+
+ case OP_NOT_HSPACE:
+ switch(c)
+ {
+ default: break;
+ case 0x09: /* HT */
+ case 0x20: /* SPACE */
+ case 0xa0: /* NBSP */
+ RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case OP_HSPACE:
+ switch(c)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x09: /* HT */
+ case 0x20: /* SPACE */
+ case 0xa0: /* NBSP */
+ break;
+ }
+ break;
+
+ case OP_NOT_VSPACE:
+ switch(c)
+ {
+ default: break;
+ case 0x0a: /* LF */
+ case 0x0b: /* VT */
+ case 0x0c: /* FF */
+ case 0x0d: /* CR */
+ case 0x85: /* NEL */
+ RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case OP_VSPACE:
+ switch(c)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x0a: /* LF */
+ case 0x0b: /* VT */
+ case 0x0c: /* FF */
+ case 0x0d: /* CR */
+ case 0x85: /* NEL */
+ break;
+ }
+ break;
+
+ case OP_NOT_DIGIT:
+ if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_DIGIT:
+ if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_NOT_WHITESPACE:
+ if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_WHITESPACE:
+ if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_NOT_WORDCHAR:
+ if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_WORDCHAR:
+ if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
+ }
+ }
+ }
+ /* Control never gets here */
+ }
+
+ /* If maximizing, it is worth using inline code for speed, doing the type
+ test once at the start (i.e. keep it out of the loop). Again, keep the
+ UTF-8 and UCP stuff separate. */
+
+ else
+ {
+ pp = eptr; /* Remember where we started */
+
+#ifdef SUPPORT_UCP
+ if (prop_type >= 0)
+ {
+ switch(prop_type)
+ {
+ case PT_ANY:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ if (prop_fail_result) break;
+ eptr+= len;
+ }
+ break;
+
+ case PT_LAMP:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ if ((prop_chartype == ucp_Lu ||
+ prop_chartype == ucp_Ll ||
+ prop_chartype == ucp_Lt) == prop_fail_result)
+ break;
+ eptr+= len;
+ }
+ break;
+
+ case PT_GC:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ if ((prop_category == prop_value) == prop_fail_result)
+ break;
+ eptr+= len;
+ }
+ break;
+
+ case PT_PC:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ if ((prop_chartype == prop_value) == prop_fail_result)
+ break;
+ eptr+= len;
+ }
+ break;
+
+ case PT_SC:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ if ((prop_script == prop_value) == prop_fail_result)
+ break;
+ eptr+= len;
+ }
+ break;
+ }
+
+ /* eptr is now past the end of the maximum run */
+
+ if (possessive) continue;
+ for(;;)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
+ if (utf8) BACKCHAR(eptr);
+ }
+ }
+
+ /* Match extended Unicode sequences. We will get here only if the
+ support is in the binary; otherwise a compile-time error occurs. */
+
+ else if (ctype == OP_EXTUNI)
+ {
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject) break;
+ GETCHARINCTEST(c, eptr);
+ prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ if (prop_category == ucp_M) break;
+ while (eptr < md->end_subject)
+ {
+ int len = 1;
+ if (!utf8) c = *eptr; else
+ {
+ GETCHARLEN(c, eptr, len);
+ }
+ prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ if (prop_category != ucp_M) break;
+ eptr += len;
+ }
+ }
+
+ /* eptr is now past the end of the maximum run */
+
+ if (possessive) continue;
+ for(;;)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
+ for (;;) /* Move back over one extended */
+ {
+ int len = 1;
+ if (!utf8) c = *eptr; else
+ {
+ BACKCHAR(eptr);
+ GETCHARLEN(c, eptr, len);
+ }
+ prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ if (prop_category != ucp_M) break;
+ eptr--;
+ }
+ }
+ }
+
+ else
+#endif /* SUPPORT_UCP */
+
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+
+ if (utf8)
+ {
+ switch(ctype)
+ {
+ case OP_ANY:
+ if (max < INT_MAX)
+ {
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
+ eptr++;
+ while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ }
+ }
+
+ /* Handle unlimited UTF-8 repeat */
+
+ else
+ {
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
+ eptr++;
+ while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ }
+ }
+ break;
+
+ case OP_ALLANY:
+ if (max < INT_MAX)
+ {
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject) break;
+ eptr++;
+ while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ }
+ }
+ else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
+ break;
+
+ /* The byte case is the same as non-UTF8 */
+
+ case OP_ANYBYTE:
+ c = max - min;
+ if (c > (unsigned int)(md->end_subject - eptr))
+ c = md->end_subject - eptr;
+ eptr += c;
+ break;
+
+ case OP_ANYNL:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ if (c == 0x000d)
+ {
+ if (++eptr >= md->end_subject) break;
+ if (*eptr == 0x000a) eptr++;
+ }
+ else
+ {
+ if (c != 0x000a &&
+ (md->bsr_anycrlf ||
+ (c != 0x000b && c != 0x000c &&
+ c != 0x0085 && c != 0x2028 && c != 0x2029)))
+ break;
+ eptr += len;
+ }
+ }
+ break;
+
+ case OP_NOT_HSPACE:
+ case OP_HSPACE:
+ for (i = min; i < max; i++)
+ {
+ BOOL gotspace;
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ switch(c)
+ {
+ default: gotspace = FALSE; break;
+ case 0x09: /* HT */
+ case 0x20: /* SPACE */
+ case 0xa0: /* NBSP */
+ case 0x1680: /* OGHAM SPACE MARK */
+ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
+ case 0x2000: /* EN QUAD */
+ case 0x2001: /* EM QUAD */
+ case 0x2002: /* EN SPACE */
+ case 0x2003: /* EM SPACE */
+ case 0x2004: /* THREE-PER-EM SPACE */
+ case 0x2005: /* FOUR-PER-EM SPACE */
+ case 0x2006: /* SIX-PER-EM SPACE */
+ case 0x2007: /* FIGURE SPACE */
+ case 0x2008: /* PUNCTUATION SPACE */
+ case 0x2009: /* THIN SPACE */
+ case 0x200A: /* HAIR SPACE */
+ case 0x202f: /* NARROW NO-BREAK SPACE */
+ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
+ case 0x3000: /* IDEOGRAPHIC SPACE */
+ gotspace = TRUE;
+ break;
+ }
+ if (gotspace == (ctype == OP_NOT_HSPACE)) break;
+ eptr += len;
+ }
+ break;
+
+ case OP_NOT_VSPACE:
+ case OP_VSPACE:
+ for (i = min; i < max; i++)
+ {
+ BOOL gotspace;
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ switch(c)
+ {
+ default: gotspace = FALSE; break;
+ case 0x0a: /* LF */
+ case 0x0b: /* VT */
+ case 0x0c: /* FF */
+ case 0x0d: /* CR */
+ case 0x85: /* NEL */
+ case 0x2028: /* LINE SEPARATOR */
+ case 0x2029: /* PARAGRAPH SEPARATOR */
+ gotspace = TRUE;
+ break;
+ }
+ if (gotspace == (ctype == OP_NOT_VSPACE)) break;
+ eptr += len;
+ }
+ break;
+
+ case OP_NOT_DIGIT:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
+ eptr+= len;
+ }
+ break;
+
+ case OP_DIGIT:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
+ eptr+= len;
+ }
+ break;
+
+ case OP_NOT_WHITESPACE:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
+ eptr+= len;
+ }
+ break;
+
+ case OP_WHITESPACE:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
+ eptr+= len;
+ }
+ break;
+
+ case OP_NOT_WORDCHAR:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
+ eptr+= len;
+ }
+ break;
+
+ case OP_WORDCHAR:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
+ eptr+= len;
+ }
+ break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
+ }
+
+ /* eptr is now past the end of the maximum run */
+
+ if (possessive) continue;
+ for(;;)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
+ BACKCHAR(eptr);
+ }
+ }
+ else
+#endif /* SUPPORT_UTF8 */
+
+ /* Not UTF-8 mode */
+ {
+ switch(ctype)
+ {
+ case OP_ANY:
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
+ eptr++;
+ }
+ break;
+
+ case OP_ALLANY:
+ case OP_ANYBYTE:
+ c = max - min;
+ if (c > (unsigned int)(md->end_subject - eptr))
+ c = md->end_subject - eptr;
+ eptr += c;
+ break;
+
+ case OP_ANYNL:
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject) break;
+ c = *eptr;
+ if (c == 0x000d)
+ {
+ if (++eptr >= md->end_subject) break;
+ if (*eptr == 0x000a) eptr++;
+ }
+ else
+ {
+ if (c != 0x000a &&
+ (md->bsr_anycrlf ||
+ (c != 0x000b && c != 0x000c && c != 0x0085)))
+ break;
+ eptr++;
+ }
+ }
+ break;
+
+ case OP_NOT_HSPACE:
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject) break;
+ c = *eptr;
+ if (c == 0x09 || c == 0x20 || c == 0xa0) break;
+ eptr++;
+ }
+ break;
+
+ case OP_HSPACE:
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject) break;
+ c = *eptr;
+ if (c != 0x09 && c != 0x20 && c != 0xa0) break;
+ eptr++;
+ }
+ break;
+
+ case OP_NOT_VSPACE:
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject) break;
+ c = *eptr;
+ if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
+ break;
+ eptr++;
+ }
+ break;
+
+ case OP_VSPACE:
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject) break;
+ c = *eptr;
+ if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
+ break;
+ eptr++;
+ }
+ break;
+
+ case OP_NOT_DIGIT:
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
+ break;
+ eptr++;
+ }
+ break;
+
+ case OP_DIGIT:
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
+ break;
+ eptr++;
+ }
+ break;
+
+ case OP_NOT_WHITESPACE:
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
+ break;
+ eptr++;
+ }
+ break;
+
+ case OP_WHITESPACE:
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
+ break;
+ eptr++;
+ }
+ break;
+
+ case OP_NOT_WORDCHAR:
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
+ break;
+ eptr++;
+ }
+ break;
+
+ case OP_WORDCHAR:
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
+ break;
+ eptr++;
+ }
+ break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
+ }
+
+ /* eptr is now past the end of the maximum run */
+
+ if (possessive) continue;
+ while (eptr >= pp)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
+ eptr--;
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ }
+ }
+
+ /* Get here if we can't make it match with any permitted repetitions */
+
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
+ /* There's been some horrible disaster. Arrival here can only mean there is
+ something seriously wrong in the code above or the OP_xxx definitions. */
+
+ default:
+ DPRINTF(("Unknown opcode %d\n", *ecode));
+ RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
+ }
+
+ /* Do not stick any code in here without much thought; it is assumed
+ that "continue" in the code above comes out to here to repeat the main
+ loop. */
+
+ } /* End of main loop */
+/* Control never reaches here */
+
+
+/* When compiling to use the heap rather than the stack for recursive calls to
+match(), the RRETURN() macro jumps here. The number that is saved in
+frame->Xwhere indicates which label we actually want to return to. */
+
+#ifdef NO_RECURSE
+#define LBL(val) case val: goto L_RM##val;
+HEAP_RETURN:
+switch (frame->Xwhere)
+ {
+ LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
+ LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
+ LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
+ LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
+ LBL(53) LBL(54)
+#ifdef SUPPORT_UTF8
+ LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
+ LBL(32) LBL(34) LBL(42) LBL(46)
+#ifdef SUPPORT_UCP
+ LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
+#endif /* SUPPORT_UCP */
+#endif /* SUPPORT_UTF8 */
+ default:
+ DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
+ return PCRE_ERROR_INTERNAL;
+ }
+#undef LBL
+#endif /* NO_RECURSE */
+}
+
+
+/***************************************************************************
+****************************************************************************
+ RECURSION IN THE match() FUNCTION
+
+Undefine all the macros that were defined above to handle this. */
+
+#ifdef NO_RECURSE
+#undef eptr
+#undef ecode
+#undef mstart
+#undef offset_top
+#undef ims
+#undef eptrb
+#undef flags
+
+#undef callpat
+#undef charptr
+#undef data
+#undef next
+#undef pp
+#undef prev
+#undef saved_eptr
+
+#undef new_recursive
+
+#undef cur_is_word
+#undef condition
+#undef prev_is_word
+
+#undef original_ims
+
+#undef ctype
+#undef length
+#undef max
+#undef min
+#undef number
+#undef offset
+#undef op
+#undef save_capture_last
+#undef save_offset1
+#undef save_offset2
+#undef save_offset3
+#undef stacksave
+
+#undef newptrb
+
+#endif /* NO_RECURSE */
+
+/* These two are defined as macros whether or not NO_RECURSE is defined */
+
+#undef fc
+#undef fi
+
+/***************************************************************************
+***************************************************************************/
+
+
+
+/*************************************************
+* Execute a Regular Expression *
+*************************************************/
+
+/* This function applies a compiled re to a subject string and picks out
+portions of the string if it matches. Two elements in the vector are set for
+each substring: the offsets to the start and end of the substring.
+
+Arguments:
+ argument_re points to the compiled expression
+ extra_data points to extra data or is NULL
+ subject points to the subject string
+ length length of subject string (may contain binary zeros)
+ start_offset where to start in the subject string
+ options option bits
+ offsets points to a vector of ints to be filled in with offsets
+ offsetcount the number of elements in the vector
+
+Returns: > 0 => success; value is the number of captured strings set
+ = 0 => success, but offsets is not big enough (or has < 2 elements)
+ -1 => failed to match (PCRE_ERROR_NOMATCH)
+ < -1 => some other PCRE_ERROR_xxx code (bad argument, error, partial)
+*/
+
+PCRE_EXP_DEFN int
+pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
+ PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
+ int offsetcount)
+{
+int rc, resetcount, ocount;
+int first_byte = -1; /* Negative => no first byte is known */
+int req_byte = -1; /* Negative => no required byte is known */
+int req_byte2 = -1; /* req_byte with its case flipped */
+int newline;
+unsigned long int ims;
+BOOL using_temporary_offsets = FALSE; /* TRUE if offset_vector was malloc'd */
+BOOL anchored;
+BOOL startline;
+BOOL firstline;
+BOOL first_byte_caseless = FALSE;
+BOOL req_byte_caseless = FALSE;
+BOOL utf8;
+match_data match_block;
+match_data *md = &match_block;
+const uschar *tables;
+const uschar *start_bits = NULL;
+USPTR start_match = (USPTR)subject + start_offset;
+USPTR end_subject;
+USPTR req_byte_ptr = start_match - 1; /* Where req_byte was last found */
+
+pcre_study_data internal_study;
+const pcre_study_data *study;
+
+real_pcre internal_re;
+const real_pcre *external_re = (const real_pcre *)argument_re;
+const real_pcre *re = external_re;
+
+/* Plausibility checks */
+
+if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
+if (re == NULL || subject == NULL ||
+ (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
+if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
+
+/* Fish out the optional data from the extra_data structure, first setting
+the default values. */
+
+study = NULL;
+md->match_limit = MATCH_LIMIT;
+md->match_limit_recursion = MATCH_LIMIT_RECURSION;
+md->callout_data = NULL;
+
+/* The table pointer is always in native byte order. */
+
+tables = external_re->tables;
+
+if (extra_data != NULL)
+ {
+ register unsigned int flags = extra_data->flags;
+ if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
+ study = (const pcre_study_data *)extra_data->study_data;
+ if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
+ md->match_limit = extra_data->match_limit;
+ if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
+ md->match_limit_recursion = extra_data->match_limit_recursion;
+ if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
+ md->callout_data = extra_data->callout_data;
+ if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
+ }
+
+/* If neither the compiled pattern nor extra_data supplied a tables pointer,
+use the inbuilt ones. This is a feature that makes it possible to save compiled
+regexes and re-use them in other programs later. */
+
+if (tables == NULL) tables = _pcre_default_tables;
+
+/* Check that the first field in the block is the magic number. If it is not,
+test for a regex that was compiled on a host of opposite endianness. If this is
+the case, flipped values are put in internal_re and internal_study if there was
+study data too. */
+
+if (re->magic_number != MAGIC_NUMBER)
+ {
+ re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
+ if (re == NULL) return PCRE_ERROR_BADMAGIC;
+ if (study != NULL) study = &internal_study;
+ }
+
+/* Set up other data */
+
+anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
+startline = (re->flags & PCRE_STARTLINE) != 0;
+firstline = (re->options & PCRE_FIRSTLINE) != 0;
+
+/* The code starts after the real_pcre block and the capture name table. */
+
+md->start_code = (const uschar *)external_re + re->name_table_offset +
+ re->name_count * re->name_entry_size;
+
+md->start_subject = (USPTR)subject;
+md->start_offset = start_offset;
+md->end_subject = md->start_subject + length;
+end_subject = md->end_subject;
+
+md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
+utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
+md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
+
+md->notbol = (options & PCRE_NOTBOL) != 0;
+md->noteol = (options & PCRE_NOTEOL) != 0;
+md->notempty = (options & PCRE_NOTEMPTY) != 0;
+md->partial = (options & PCRE_PARTIAL) != 0;
+md->hitend = FALSE;
+
+md->recursive = NULL; /* No recursion at top level */
+
+md->lcc = tables + lcc_offset;
+md->ctypes = tables + ctypes_offset;
+
+/* Handle different \R options. Run-time bits, if any, override the setting
+   stored in the compiled pattern; setting both bits at once is an error. */
+
+switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
+ {
+ case 0:
+ if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
+ md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
+ else
+#ifdef BSR_ANYCRLF
+ md->bsr_anycrlf = TRUE;
+#else
+ md->bsr_anycrlf = FALSE;
+#endif
+ break;
+
+ case PCRE_BSR_ANYCRLF:
+ md->bsr_anycrlf = TRUE;
+ break;
+
+ case PCRE_BSR_UNICODE:
+ md->bsr_anycrlf = FALSE;
+ break;
+
+ default: return PCRE_ERROR_BADNEWLINE;
+ }
+
+/* Handle different types of newline. The three bits give eight cases. If
+nothing is set at run time, whatever was used at compile time applies. */
+
+switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
+ (pcre_uint32)options) & PCRE_NEWLINE_BITS)
+ {
+ case 0: newline = NEWLINE; break; /* Compile-time default */
+ case PCRE_NEWLINE_CR: newline = '\r'; break;
+ case PCRE_NEWLINE_LF: newline = '\n'; break;
+ case PCRE_NEWLINE_CR+
+ PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
+ case PCRE_NEWLINE_ANY: newline = -1; break;
+ case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
+ default: return PCRE_ERROR_BADNEWLINE;
+ }
+
+if (newline == -2)
+ {
+ md->nltype = NLTYPE_ANYCRLF;
+ }
+else if (newline < 0)
+ {
+ md->nltype = NLTYPE_ANY;
+ }
+else
+ {
+ md->nltype = NLTYPE_FIXED;
+ if (newline > 255) /* Two-byte newline: high byte first */
+ {
+ md->nllen = 2;
+ md->nl[0] = (newline >> 8) & 255;
+ md->nl[1] = newline & 255;
+ }
+ else
+ {
+ md->nllen = 1;
+ md->nl[0] = newline;
+ }
+ }
+
+/* Partial matching is supported only for a restricted set of regexes at the
+moment. */
+
+if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
+ return PCRE_ERROR_BADPARTIAL;
+
+/* Check a UTF-8 string if required. Unfortunately there's no way of passing
+back the character offset. */
+
+#ifdef SUPPORT_UTF8
+if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
+ {
+ if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
+ return PCRE_ERROR_BADUTF8;
+ if (start_offset > 0 && start_offset < length)
+ {
+ int tb = ((uschar *)subject)[start_offset];
+ if (tb > 127)
+ {
+ tb &= 0xc0; /* Keep the top two bits; 0x80 here is rejected */
+ if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
+ }
+ }
+ }
+#endif
+
+/* The ims options can vary during the matching as a result of the presence
+of (?ims) items in the pattern. They are kept in a local variable so that
+restoring at the exit of a group is easy. */
+
+ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
+
+/* If the expression has got more back references than the offsets supplied can
+hold, we get a temporary chunk of working store to use during the matching.
+Otherwise, we can use the vector supplied, rounding down its size to a multiple
+of 3. */
+
+ocount = offsetcount - (offsetcount % 3);
+
+if (re->top_backref > 0 && re->top_backref >= ocount/3)
+ {
+ ocount = re->top_backref * 3 + 3;
+ md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
+ if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
+ using_temporary_offsets = TRUE;
+ DPRINTF(("Got memory to hold back references\n"));
+ }
+else md->offset_vector = offsets;
+
+md->offset_end = ocount;
+md->offset_max = (2*ocount)/3;
+md->offset_overflow = FALSE;
+md->capture_last = -1;
+
+/* Compute the minimum number of offsets that we need to reset each time. Doing
+this makes a huge difference to execution time when there aren't many brackets
+in the pattern. */
+
+resetcount = 2 + re->top_bracket * 2;
+if (resetcount > offsetcount) resetcount = ocount; /* Use size of vector in use */
+
+/* Reset the working variable associated with each extraction. These should
+never be used unless previously set, but they get saved and restored, and so we
+initialize them to avoid reading uninitialized locations. */
+
+if (md->offset_vector != NULL)
+ {
+ register int *iptr = md->offset_vector + ocount;
+ register int *iend = iptr - resetcount/2 + 1;
+ while (--iptr >= iend) *iptr = -1;
+ }
+
+/* Set up the first character to match, if available. The first_byte value is
+never set for an anchored regular expression, but the anchoring may be forced
+at run time, so we have to test for anchoring. The first char may be unset for
+an unanchored pattern, of course. If there's no first char and the pattern was
+studied, there may be a bitmap of possible first characters. */
+
+if (!anchored)
+ {
+ if ((re->flags & PCRE_FIRSTSET) != 0)
+ {
+ first_byte = re->first_byte & 255;
+ if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
+ first_byte = md->lcc[first_byte];
+ }
+ else
+ if (!startline && study != NULL &&
+ (study->options & PCRE_STUDY_MAPPED) != 0)
+ start_bits = study->start_bits;
+ }
+
+/* For anchored or unanchored matches, there may be a "last known required
+character" set. */
+
+if ((re->flags & PCRE_REQCHSET) != 0)
+ {
+ req_byte = re->req_byte & 255;
+ req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
+ req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
+ }
+
+
+/* ==========================================================================*/
+
+/* Loop for handling unanchored repeated matching attempts; for anchored
+regexes the loop runs just once. */
+
+for(;;)
+ {
+ USPTR save_end_subject = end_subject;
+ USPTR new_start_match;
+
+ /* Reset the maximum number of extractions we might see. */
+
+ if (md->offset_vector != NULL)
+ {
+ register int *iptr = md->offset_vector;
+ register int *iend = iptr + resetcount;
+ while (iptr < iend) *iptr++ = -1;
+ }
+
+ /* Advance to a unique first char if possible. If firstline is TRUE, the
+ start of the match is constrained to the first line of a multiline string.
+ That is, the match must be before or at the first newline. Implement this by
+ temporarily adjusting end_subject so that we stop scanning at a newline. If
+ the match fails at the newline, later code breaks this loop. */
+
+ if (firstline)
+ {
+ USPTR t = start_match;
+ while (t < md->end_subject && !IS_NEWLINE(t)) t++;
+ end_subject = t;
+ }
+
+ /* Now test for a unique first byte */
+
+ if (first_byte >= 0)
+ {
+ if (first_byte_caseless)
+ while (start_match < end_subject &&
+ md->lcc[*start_match] != first_byte)
+ { NEXTCHAR(start_match); }
+ else
+ while (start_match < end_subject && *start_match != first_byte)
+ { NEXTCHAR(start_match); }
+ }
+
+ /* Or to just after a linebreak for a multiline match if possible */
+
+ else if (startline)
+ {
+ if (start_match > md->start_subject + start_offset)
+ {
+ while (start_match <= end_subject && !WAS_NEWLINE(start_match))
+ { NEXTCHAR(start_match); }
+
+ /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
+ and we are now at a LF, advance the match position by one more character.
+ */
+
+ if (start_match[-1] == '\r' &&
+ (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
+ start_match < end_subject &&
+ *start_match == '\n')
+ start_match++;
+ }
+ }
+
+ /* Or to a non-unique first char after study */
+
+ else if (start_bits != NULL)
+ {
+ while (start_match < end_subject)
+ {
+ register unsigned int c = *start_match;
+ if ((start_bits[c/8] & (1 << (c&7))) == 0)
+ { NEXTCHAR(start_match); }
+ else break;
+ }
+ }
+
+ /* Restore fudged end_subject */
+
+ end_subject = save_end_subject;
+
+#ifdef DEBUG /* Sigh. Some compilers never learn. */
+ printf(">>>> Match against: ");
+ pchars(start_match, end_subject - start_match, TRUE, md);
+ printf("\n");
+#endif
+
+ /* If req_byte is set, we know that that character must appear in the subject
+ for the match to succeed. If the first character is set, req_byte must be
+ later in the subject; otherwise the test starts at the match point. This
+ optimization can save a huge amount of backtracking in patterns with nested
+ unlimited repeats that aren't going to match. Writing separate code for
+ cased/caseless versions makes it go faster, as does using an autoincrement
+ and backing off on a match.
+
+ HOWEVER: when the subject string is very, very long, searching to its end can
+ take a long time, and give bad performance on quite ordinary patterns. This
+ showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
+ string... so we don't do this when the string is sufficiently long.
+
+ ALSO: this processing is disabled when partial matching is requested.
+ */
+
+ if (req_byte >= 0 &&
+ end_subject - start_match < REQ_BYTE_MAX &&
+ !md->partial)
+ {
+ register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
+
+ /* We don't need to repeat the search if we haven't yet reached the
+ place we found it at last time. */
+
+ if (p > req_byte_ptr)
+ {
+ if (req_byte_caseless)
+ {
+ while (p < end_subject)
+ {
+ register int pp = *p++;
+ if (pp == req_byte || pp == req_byte2) { p--; break; }
+ }
+ }
+ else
+ {
+ while (p < end_subject)
+ {
+ if (*p++ == req_byte) { p--; break; }
+ }
+ }
+
+ /* If we can't find the required character, break the matching loop,
+ forcing a match failure. */
+
+ if (p >= end_subject)
+ {
+ rc = MATCH_NOMATCH;
+ break;
+ }
+
+ /* If we have found the required character, save the point where we
+ found it, so that we don't search again next time round the loop if
+ the start hasn't passed this character yet. */
+
+ req_byte_ptr = p;
+ }
+ }
+
+ /* OK, we can now run the match. */
+
+ md->start_match_ptr = start_match;
+ md->match_call_count = 0;
+ rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
+
+ switch(rc)
+ {
+ /* NOMATCH and PRUNE advance by one character. THEN at this level acts
+ exactly like PRUNE. */
+
+ case MATCH_NOMATCH:
+ case MATCH_PRUNE:
+ case MATCH_THEN:
+ new_start_match = start_match + 1;
+#ifdef SUPPORT_UTF8
+ if (utf8) /* Also skip any following bytes of the form 10xxxxxx */
+ while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
+ new_start_match++;
+#endif
+ break;
+
+ /* SKIP passes back the next starting point explicitly. */
+
+ case MATCH_SKIP:
+ new_start_match = md->start_match_ptr;
+ break;
+
+ /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
+
+ case MATCH_COMMIT:
+ rc = MATCH_NOMATCH;
+ goto ENDLOOP;
+
+ /* Any other return is a successful match or some kind of error. */
+
+ default:
+ goto ENDLOOP;
+ }
+
+ /* Control reaches here for the various types of "no match at this point"
+ result. Reset the code to MATCH_NOMATCH for subsequent checking. */
+
+ rc = MATCH_NOMATCH;
+
+ /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
+ newline in the subject (though it may continue over the newline). Therefore,
+ if we have just failed to match, starting at a newline, do not continue. */
+
+ if (firstline && IS_NEWLINE(start_match)) break;
+
+ /* Advance to new matching position */
+
+ start_match = new_start_match;
+
+ /* Break the loop if the pattern is anchored or if we have passed the end of
+ the subject. */
+
+ if (anchored || start_match > end_subject) break;
+
+ /* If we have just passed a CR and we are now at a LF, and the pattern does
+ not contain any explicit matches for \r or \n, and the newline option is CRLF
+ or ANY or ANYCRLF, advance the match position by one more character. */
+
+ if (start_match[-1] == '\r' &&
+ start_match < end_subject &&
+ *start_match == '\n' &&
+ (re->flags & PCRE_HASCRORLF) == 0 &&
+ (md->nltype == NLTYPE_ANY ||
+ md->nltype == NLTYPE_ANYCRLF ||
+ md->nllen == 2))
+ start_match++;
+
+ } /* End of for(;;) "bumpalong" loop */
+
+/* ==========================================================================*/
+
+/* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
+conditions is true:
+
+(1) The pattern is anchored or the match was failed by (*COMMIT);
+
+(2) We are past the end of the subject;
+
+(3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
+ this option requests that a match occur at or before the first newline in
+ the subject.
+
+When we have a match and the offset vector is big enough to deal with any
+backreferences, captured substring offsets will already be set up. In the case
+where we had to get some local store to hold offsets for backreference
+processing, copy those that we can. In this case there need not be overflow if
+certain parts of the pattern were not used, even though there are more
+capturing parentheses than vector slots. */
+
+ENDLOOP:
+
+if (rc == MATCH_MATCH)
+ {
+ if (using_temporary_offsets)
+ {
+ if (offsetcount >= 4)
+ {
+ memcpy(offsets + 2, md->offset_vector + 2,
+ (offsetcount - 2) * sizeof(int));
+ DPRINTF(("Copied offsets from temporary memory\n"));
+ }
+ if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
+ DPRINTF(("Freeing temporary memory\n"));
+ (pcre_free)(md->offset_vector);
+ }
+
+ /* Set the return code to the number of captured strings, or 0 if there are
+ too many to fit into the vector. */
+
+ rc = md->offset_overflow? 0 : md->end_offset_top/2;
+
+ /* If there is space, set up the whole thing as substring 0. The value of
+ md->start_match_ptr might be modified if \K was encountered on the
+ successful matching path. */
+
+ if (offsetcount < 2) rc = 0; else
+ {
+ offsets[0] = md->start_match_ptr - md->start_subject;
+ offsets[1] = md->end_match_ptr - md->start_subject;
+ }
+
+ DPRINTF((">>>> returning %d\n", rc));
+ return rc;
+ }
+
+/* Control gets here if there has been an error, or if the overall match
+attempt has failed at all permitted starting positions. */
+
+if (using_temporary_offsets)
+ {
+ DPRINTF(("Freeing temporary memory\n"));
+ (pcre_free)(md->offset_vector);
+ }
+
+if (rc != MATCH_NOMATCH)
+ {
+ DPRINTF((">>>> error: returning %d\n", rc));
+ return rc;
+ }
+else if (md->partial && md->hitend)
+ {
+ DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
+ return PCRE_ERROR_PARTIAL;
+ }
+else
+ {
+ DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
+ return PCRE_ERROR_NOMATCH;
+ }
+}
+
+/* End of pcre_exec.c */
diff --git a/src/pcre_fullinfo.c b/src/pcre_fullinfo.c
new file mode 100644
index 0000000..7b001c6
--- /dev/null
+++ b/src/pcre_fullinfo.c
@@ -0,0 +1,165 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2008 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This module contains the external function pcre_fullinfo(), which returns
+information about a compiled pattern. */
+
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pcre_internal.h"
+
+
+/*************************************************
+* Return info about compiled pattern *
+*************************************************/
+
+/* This is a newer "info" function which has an extensible interface so
+that additional items can be added compatibly.
+
+Arguments:
+ argument_re points to compiled code
+ extra_data points to extra data, or NULL
+ what what information is required
+ where where to put the information
+
+Returns: 0 if data returned, negative on error
+*/
+
+PCRE_EXP_DEFN int
+pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
+ void *where)
+{
+real_pcre internal_re;
+pcre_study_data internal_study;
+const real_pcre *re = (const real_pcre *)argument_re;
+const pcre_study_data *study = NULL;
+
+if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
+/* Study data is only consulted when the extra block flags it as present. */
+if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
+ study = (const pcre_study_data *)extra_data->study_data;
+/* On a magic-number mismatch, retry with flipped copies held in the locals. */
+if (re->magic_number != MAGIC_NUMBER)
+ {
+ re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
+ if (re == NULL) return PCRE_ERROR_BADMAGIC;
+ if (study != NULL) study = &internal_study;
+ }
+/* The requested item is stored through where, cast to the type it implies. */
+switch (what)
+ {
+ case PCRE_INFO_OPTIONS:
+ *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
+ break;
+
+ case PCRE_INFO_SIZE:
+ *((size_t *)where) = re->size;
+ break;
+
+ case PCRE_INFO_STUDYSIZE:
+ *((size_t *)where) = (study == NULL)? 0 : study->size;
+ break;
+
+ case PCRE_INFO_CAPTURECOUNT:
+ *((int *)where) = re->top_bracket;
+ break;
+
+ case PCRE_INFO_BACKREFMAX:
+ *((int *)where) = re->top_backref;
+ break;
+
+ case PCRE_INFO_FIRSTBYTE:
+ *((int *)where) =
+ ((re->flags & PCRE_FIRSTSET) != 0)? re->first_byte :
+ ((re->flags & PCRE_STARTLINE) != 0)? -1 : -2;
+ break;
+
+ /* Make sure we pass back the pointer to the bit vector in the external
+ block, not the internal copy (with flipped integer fields). */
+
+ case PCRE_INFO_FIRSTTABLE:
+ *((const uschar **)where) =
+ (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
+ ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
+ break;
+
+ case PCRE_INFO_LASTLITERAL:
+ *((int *)where) =
+ ((re->flags & PCRE_REQCHSET) != 0)? re->req_byte : -1;
+ break;
+
+ case PCRE_INFO_NAMEENTRYSIZE:
+ *((int *)where) = re->name_entry_size;
+ break;
+
+ case PCRE_INFO_NAMECOUNT:
+ *((int *)where) = re->name_count;
+ break;
+ /* Offset from the external block: re may be the flipped internal copy. */
+ case PCRE_INFO_NAMETABLE:
+ *((const uschar **)where) = (const uschar *)argument_re + re->name_table_offset;
+ break;
+
+ case PCRE_INFO_DEFAULT_TABLES:
+ *((const uschar **)where) = (const uschar *)(_pcre_default_tables);
+ break;
+
+ case PCRE_INFO_OKPARTIAL:
+ *((int *)where) = (re->flags & PCRE_NOPARTIAL) == 0;
+ break;
+
+ case PCRE_INFO_JCHANGED:
+ *((int *)where) = (re->flags & PCRE_JCHANGED) != 0;
+ break;
+
+ case PCRE_INFO_HASCRORLF:
+ *((int *)where) = (re->flags & PCRE_HASCRORLF) != 0;
+ break;
+
+ default: return PCRE_ERROR_BADOPTION;
+ }
+
+return 0;
+}
+
+/* End of pcre_fullinfo.c */
diff --git a/src/pcre_get.c b/src/pcre_get.c
new file mode 100644
index 0000000..68b8de4
--- /dev/null
+++ b/src/pcre_get.c
@@ -0,0 +1,465 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2008 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This module contains some convenience functions for extracting substrings
+from the subject string after a regex match has succeeded. The original idea
+for these functions came from Scott Wimer. */
+
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pcre_internal.h"
+
+
+/*************************************************
+* Find number for named string *
+*************************************************/
+
+/* This function is used by the get_first_set() function below, as well
+as being generally available. It assumes that names are unique.
+
+Arguments:
+ code the compiled regex
+ stringname the name whose number is required
+
+Returns: the number of the named parentheses, or a negative number
+ (PCRE_ERROR_NOSUBSTRING) if not found
+*/
+
+int
+pcre_get_stringnumber(const pcre *code, const char *stringname)
+{
+uschar *table;
+int status, width, count;
+int lo = 0, hi;
+
+/* Fetch the entry count first; an empty table cannot contain the name. */
+status = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &count);
+if (status != 0) return status;
+if (count <= 0) return PCRE_ERROR_NOSUBSTRING;
+
+status = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &width);
+if (status != 0) return status;
+status = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &table);
+if (status != 0) return status;
+
+/* Binary chop over [lo, hi); each entry is a 2-byte number then the name. */
+for (hi = count; lo < hi; )
+ {
+ int mid = (lo + hi) / 2;
+ uschar *entry = table + width*mid;
+ int cmp = strcmp(stringname, (char *)(entry + 2));
+ if (cmp == 0) return (entry[0] << 8) + entry[1];
+ if (cmp > 0) lo = mid + 1; else hi = mid;
+ }
+
+return PCRE_ERROR_NOSUBSTRING;
+}
+
+
+
+/*************************************************
+* Find (multiple) entries for named string *
+*************************************************/
+
+/* This is used by the get_first_set() function below, as well as being
+generally available. It is used when duplicated names are permitted.
+
+Arguments:
+ code the compiled regex
+ stringname the name whose entries are required
+ firstptr where to put the pointer to the first entry
+ lastptr where to put the pointer to the last entry
+
+Returns: the length of each entry, or a negative number
+ (PCRE_ERROR_NOSUBSTRING) if not found
+*/
+
+int
+pcre_get_stringtable_entries(const pcre *code, const char *stringname,
+ char **firstptr, char **lastptr)
+{
+uschar *table, *table_end;
+int status, width, count;
+int lo = 0, hi;
+
+/* With no named groups at all there is nothing to search. */
+status = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &count);
+if (status != 0) return status;
+if (count <= 0) return PCRE_ERROR_NOSUBSTRING;
+
+/* Entry width and table address are needed to step between entries. */
+status = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &width);
+if (status != 0) return status;
+status = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &table);
+if (status != 0) return status;
+
+/* table_end addresses the final entry itself, not one past it. */
+table_end = table + width * (count - 1);
+
+/* Binary chop over the half-open index range [lo, hi). */
+for (hi = count; lo < hi; )
+ {
+ int mid = (lo + hi) / 2;
+ uschar *hit = table + width*mid;
+ int cmp = strcmp(stringname, (char *)(hit + 2));
+
+ if (cmp > 0) { lo = mid + 1; continue; }
+ if (cmp < 0) { hi = mid; continue; }
+
+ /* Matched one entry; widen both ways over neighbours with the same name. */
+ {
+ uschar *first = hit;
+ uschar *last = hit;
+ for (; first > table; first -= width)
+ if (strcmp(stringname, (char *)(first - width + 2)) != 0) break;
+ for (; last < table_end; last += width)
+ if (strcmp(stringname, (char *)(last + width + 2)) != 0) break;
+ *firstptr = (char *)first;
+ *lastptr = (char *)last;
+ return width;
+ }
+ }
+
+return PCRE_ERROR_NOSUBSTRING;
+}
+
+
+
+/*************************************************
+* Find first set of multiple named strings *
+*************************************************/
+
+/* This function allows for duplicate names in the table of named substrings.
+It returns the number of the first one that was set in a pattern match.
+
+Arguments:
+ code the compiled regex
+ stringname the name of the capturing substring
+ ovector the vector of matched substrings
+
+Returns: the number of the first that is set,
+ or the number of the first one if none are set,
+ or a negative number on error
+*/
+
+static int
+get_first_set(const pcre *code, const char *stringname, int *ovector)
+{
+const real_pcre *re = (const real_pcre *)code;
+int entrysize;
+char *first, *last;
+uschar *entry;
+if ((re->options & PCRE_DUPNAMES) == 0 && (re->flags & PCRE_JCHANGED) == 0)
+ return pcre_get_stringnumber(code, stringname); /* single-entry lookup */
+entrysize = pcre_get_stringtable_entries(code, stringname, &first, &last);
+if (entrysize <= 0) return entrysize; /* error code from the table lookup */
+for (entry = (uschar *)first; entry <= (uschar *)last; entry += entrysize)
+ {
+ int n = (entry[0] << 8) + entry[1]; /* group number, high byte first */
+ if (ovector[n*2] >= 0) return n; /* non-negative start offset: it is set */
+ } /* falls through when no entry is set; the first entry's number is used */
+return (((uschar *)first)[0] << 8) + ((uschar *)first)[1]; /* unsigned bytes */
+}
+
+
+
+
+/*************************************************
+* Copy captured string to given buffer *
+*************************************************/
+
+/* This function copies a single captured substring into a given buffer.
+Note that we use memcpy() rather than strncpy() in case there are binary zeros
+in the string.
+
+Arguments:
+ subject the subject string that was matched
+ ovector pointer to the offsets table
+ stringcount the number of substrings that were captured
+ (i.e. the yield of the pcre_exec call, unless
+ that was zero, in which case it should be 1/3
+ of the offset table size)
+ stringnumber the number of the required substring
+ buffer where to put the substring
+ size the size of the buffer
+
+Returns: if successful:
+ the length of the copied string, not including the zero
+ that is put on the end; can be zero
+ if not successful:
+ PCRE_ERROR_NOMEMORY (-6) buffer too small
+ PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
+*/
+
+int
+pcre_copy_substring(const char *subject, int *ovector, int stringcount,
+ int stringnumber, char *buffer, int size)
+{
+int start, length;
+if (stringnumber < 0 || stringnumber >= stringcount)
+ return PCRE_ERROR_NOSUBSTRING;
+start = ovector[2*stringnumber];
+length = ovector[2*stringnumber + 1] - start;
+if (length + 1 > size) return PCRE_ERROR_NOMEMORY; /* room for the zero too */
+memcpy(buffer, subject + start, length);
+buffer[length] = 0;
+return length;
+}
+
+
+
+/*************************************************
+* Copy named captured string to given buffer *
+*************************************************/
+
+/* This function copies a single captured substring into a given buffer,
+identifying it by name. If the regex permits duplicate names, the first
+substring that is set is chosen.
+
+Arguments:
+ code the compiled regex
+ subject the subject string that was matched
+ ovector pointer to the offsets table
+ stringcount the number of substrings that were captured
+ (i.e. the yield of the pcre_exec call, unless
+ that was zero, in which case it should be 1/3
+ of the offset table size)
+ stringname the name of the required substring
+ buffer where to put the substring
+ size the size of the buffer
+
+Returns: if successful:
+ the length of the copied string, not including the zero
+ that is put on the end; can be zero
+ if not successful:
+ PCRE_ERROR_NOMEMORY (-6) buffer too small
+ PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
+*/
+
+int
+pcre_copy_named_substring(const pcre *code, const char *subject, int *ovector,
+ int stringcount, const char *stringname, char *buffer, int size)
+{
+int group = get_first_set(code, stringname, ovector);
+return (group <= 0)? group : /* a non-positive result is returned as-is */
+ pcre_copy_substring(subject, ovector, stringcount, group, buffer, size);
+}
+
+
+
+/*************************************************
+* Copy all captured strings to new store *
+*************************************************/
+
+/* This function gets one chunk of store and builds a list of pointers and all
+of the captured substrings in it. A NULL pointer is put on the end of the list.
+
+Arguments:
+ subject the subject string that was matched
+ ovector pointer to the offsets table
+ stringcount the number of substrings that were captured
+ (i.e. the yield of the pcre_exec call, unless
+ that was zero, in which case it should be 1/3
+ of the offset table size)
+ listptr set to point to the list of pointers
+
+Returns: if successful: 0
+ if not successful:
+ PCRE_ERROR_NOMEMORY (-6) failed to get store
+*/
+
+int
+pcre_get_substring_list(const char *subject, int *ovector, int stringcount,
+ const char ***listptr)
+{
+int k;
+int size = sizeof(char *); /* starts with room for the terminating NULL */
+char **slot, *text;
+
+/* Each substring costs one pointer, its bytes, and a terminating zero. */
+for (k = 0; k < stringcount; k++)
+ size += sizeof(char *) + ovector[2*k + 1] - ovector[2*k] + 1;
+
+slot = (char **)(pcre_malloc)(size);
+if (slot == NULL) return PCRE_ERROR_NOMEMORY;
+
+/* Pointer array first (stringcount + 1 slots), string data right after it. */
+*listptr = (const char **)slot;
+text = (char *)(slot + stringcount + 1);
+
+for (k = 0; k < stringcount; k++, slot++)
+ {
+ int len = ovector[2*k + 1] - ovector[2*k];
+ memcpy(text, subject + ovector[2*k], len);
+ *slot = text;
+ text += len;
+ *text++ = 0;
+ }
+
+*slot = NULL;
+return 0;
+}
+
+
+
+/*************************************************
+* Free store obtained by get_substring_list *
+*************************************************/
+
+/* This function exists for the benefit of people calling PCRE from non-C
+programs that can call its functions, but not free() or (pcre_free)() directly.
+
+Argument: the result of a previous pcre_get_substring_list()
+Returns: nothing
+*/
+
+void
+pcre_free_substring_list(const char **pointer)
+{
+(pcre_free)((void *)pointer); /* the cast drops const for the void * parameter */
+}
+
+
+
+/*************************************************
+* Copy captured string to new store *
+*************************************************/
+
+/* This function copies a single captured substring into a piece of new
+store.
+
+Arguments:
+ subject the subject string that was matched
+ ovector pointer to the offsets table
+ stringcount the number of substrings that were captured
+ (i.e. the yield of the pcre_exec call, unless
+ that was zero, in which case it should be 1/3
+ of the offset table size)
+ stringnumber the number of the required substring
+ stringptr where to put a pointer to the substring
+
+Returns: if successful:
+ the length of the string, not including the zero that
+ is put on the end; can be zero
+ if not successful:
+ PCRE_ERROR_NOMEMORY (-6) failed to get store
+ PCRE_ERROR_NOSUBSTRING (-7) substring not present
+*/
+
+int
+pcre_get_substring(const char *subject, int *ovector, int stringcount,
+ int stringnumber, const char **stringptr)
+{
+int start, length;
+char *copy;
+if (stringnumber < 0 || stringnumber >= stringcount)
+ return PCRE_ERROR_NOSUBSTRING;
+start = ovector[2*stringnumber];
+length = ovector[2*stringnumber + 1] - start;
+copy = (char *)(pcre_malloc)(length + 1); /* one extra byte for the zero */
+if (copy == NULL) return PCRE_ERROR_NOMEMORY;
+memcpy(copy, subject + start, length);
+copy[length] = 0;
+*stringptr = copy;
+return length;
+}
+
+
+
+/*************************************************
+* Copy named captured string to new store *
+*************************************************/
+
+/* This function copies a single captured substring, identified by name, into
+new store. If the regex permits duplicate names, the first substring that is
+set is chosen.
+
+Arguments:
+ code the compiled regex
+ subject the subject string that was matched
+ ovector pointer to the offsets table
+ stringcount the number of substrings that were captured
+ (i.e. the yield of the pcre_exec call, unless
+ that was zero, in which case it should be 1/3
+ of the offset table size)
+ stringname the name of the required substring
+ stringptr where to put the pointer
+
+Returns: if successful:
+ the length of the copied string, not including the zero
+ that is put on the end; can be zero
+ if not successful:
+ PCRE_ERROR_NOMEMORY (-6) couldn't get memory
+ PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
+*/
+
+int
+pcre_get_named_substring(const pcre *code, const char *subject, int *ovector,
+ int stringcount, const char *stringname, const char **stringptr)
+{
+int group = get_first_set(code, stringname, ovector);
+return (group <= 0)? group : /* a non-positive result is returned as-is */
+ pcre_get_substring(subject, ovector, stringcount, group, stringptr);
+}
+
+
+
+
+/*************************************************
+* Free store obtained by get_substring *
+*************************************************/
+
+/* This function exists for the benefit of people calling PCRE from non-C
+programs that can call its functions, but not free() or (pcre_free)() directly.
+
+Argument: the result of a previous pcre_get_substring()
+Returns: nothing
+*/
+
+void
+pcre_free_substring(const char *pointer)
+{
+(pcre_free)((void *)pointer); /* the cast drops const for the void * parameter */
+}
+
+/* End of pcre_get.c */
diff --git a/src/pcre_globals.c b/src/pcre_globals.c
new file mode 100644
index 0000000..24ed03d
--- /dev/null
+++ b/src/pcre_globals.c
@@ -0,0 +1,63 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2008 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This module contains global variables that are exported by the PCRE library.
+PCRE is thread-clean and doesn't use any global variables in the normal sense.
+However, it calls memory allocation and freeing functions via the four
+indirections below, and it can optionally do callouts, using the fifth
+indirection. These values can be changed by the caller, but are shared between
+all threads. However, when compiling for Virtual Pascal, things are done
+differently, and global variables are not used (see pcre.in). */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pcre_internal.h"
+
+#ifndef VPCOMPAT /* these globals are only defined when VPCOMPAT is not set */
+PCRE_EXP_DATA_DEFN void *(*pcre_malloc)(size_t) = malloc; /* starts as malloc */
+PCRE_EXP_DATA_DEFN void (*pcre_free)(void *) = free; /* starts as free */
+PCRE_EXP_DATA_DEFN void *(*pcre_stack_malloc)(size_t) = malloc; /* second pair, */
+PCRE_EXP_DATA_DEFN void (*pcre_stack_free)(void *) = free; /* same defaults */
+PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL; /* unset */
+#endif /* VPCOMPAT */
+
+/* End of pcre_globals.c */
diff --git a/src/pcre_info.c b/src/pcre_info.c
new file mode 100644
index 0000000..638a475
--- /dev/null
+++ b/src/pcre_info.c
@@ -0,0 +1,93 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2008 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This module contains the external function pcre_info(), which gives some
+information about a compiled pattern. However, use of this function is now
+deprecated, as it has been superseded by pcre_fullinfo(). */
+
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pcre_internal.h"
+
+
+/*************************************************
+* (Obsolete) Return info about compiled pattern *
+*************************************************/
+
+/* This is the original "info" function. It picks potentially useful data out
+of the private structure, but its interface was too rigid. It remains for
+backwards compatibility. The public options are passed back in an int - though
+the re->options field has been expanded to a long int, all the public options
+are at the low end of it, and so even on 16-bit systems this will still be OK.
+Therefore, I haven't changed the API for pcre_info().
+
+Arguments:
+ argument_re points to compiled code
+ optptr where to pass back the options
+ first_byte where to pass back the first character,
+ or -1 if multiline and all branches start ^,
+ or -2 otherwise
+
+Returns: number of capturing subpatterns
+ or negative values on error
+*/
+
+PCRE_EXP_DEFN int
+pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
+{
+real_pcre flipped;
+const real_pcre *re = (const real_pcre *)argument_re;
+if (re == NULL) return PCRE_ERROR_NULL;
+/* On a magic-number mismatch, retry with a flipped copy held in a local. */
+if (re->magic_number != MAGIC_NUMBER &&
+ (re = _pcre_try_flipped(re, &flipped, NULL, NULL)) == NULL)
+ return PCRE_ERROR_BADMAGIC;
+if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
+if (first_byte != NULL && (re->flags & PCRE_FIRSTSET) != 0)
+ *first_byte = re->first_byte;
+else if (first_byte != NULL)
+ *first_byte = ((re->flags & PCRE_STARTLINE) != 0)? -1 : -2;
+return re->top_bracket;
+}
+
+/* End of pcre_info.c */
diff --git a/src/pcre_internal.h b/src/pcre_internal.h
new file mode 100644
index 0000000..102e2d0
--- /dev/null
+++ b/src/pcre_internal.h
@@ -0,0 +1,1138 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2008 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* This header contains definitions that are shared between the different
+modules, but which are not relevant to the exported API. This includes some
+functions whose names all begin with "_pcre_". */
+
+#ifndef PCRE_INTERNAL_H
+#define PCRE_INTERNAL_H
+
+/* Define DEBUG to get debugging output on stdout. */
+
+#if 0
+#define DEBUG
+#endif
+
+/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
+inline, and there are *still* stupid compilers about that don't like indented
+pre-processor statements, or at least there were when I first wrote this. After
+all, it had only been about 10 years then...
+
+It turns out that the Mac Debugging.h header also defines the macro DPRINTF, so
+be absolutely sure we get our version. */
+
+#undef DPRINTF
+#ifdef DEBUG
+#define DPRINTF(p) printf p /* p must be a parenthesized printf() argument list */
+#else
+#define DPRINTF(p) /* Nothing */
+#endif /* DEBUG */
+
+
+/* Standard C headers plus the external interface definition. The only time
+setjmp and stdarg are used is when NO_RECURSE is set. */
+
+#include <ctype.h>
+#include <limits.h>   /* USHRT_MAX, UINT_MAX, ULONG_MAX for the integer typedefs */
+#include <setjmp.h>   /* needed only when NO_RECURSE is set */
+#include <stdarg.h>   /* needed only when NO_RECURSE is set */
+#include <stddef.h>   /* size_t, offsetof */
+#include <stdio.h>    /* printf, used by DPRINTF */
+#include <stdlib.h>
+#include <string.h>   /* memmove and the other mem/str functions */
+
+/* When compiling a DLL for Windows, the exported symbols have to be declared
+using some MS magic. I found some useful information on this web page:
+http://msdn2.microsoft.com/en-us/library/y4h7bcy6(VS.80).aspx. According to the
+information there, using __declspec(dllexport) without "extern" we have a
+definition; with "extern" we have a declaration. The settings here override the
+setting in pcre.h (which is included below); it defines only PCRE_EXP_DECL,
+which is all that is needed for applications (they just import the symbols). We
+use:
+
+ PCRE_EXP_DECL for declarations
+ PCRE_EXP_DEFN for definitions of exported functions
+ PCRE_EXP_DATA_DEFN for definitions of exported variables
+
+The reason for the two DEFN macros is that in non-Windows environments, one
+does not want to have "extern" before variable definitions because it leads to
+compiler warnings. So we distinguish between functions and variables. In
+Windows, the two should always be the same.
+
+The reason for wrapping this in #ifndef PCRE_EXP_DECL is so that pcretest,
+which is an application, but needs to import this file in order to "peek" at
+internals, can #include pcre.h first to get an application's-eye view.
+
+In principle, people compiling for non-Windows, non-Unix-like (i.e. uncommon,
+special-purpose environments) might want to stick other stuff in front of
+exported symbols. That's why, in the non-Windows case, we set PCRE_EXP_DEFN and
+PCRE_EXP_DATA_DEFN only if they are not already set. */
+
+#ifndef PCRE_EXP_DECL
+# ifdef _WIN32
+# ifndef PCRE_STATIC
+# define PCRE_EXP_DECL extern __declspec(dllexport)
+# define PCRE_EXP_DEFN __declspec(dllexport)
+# define PCRE_EXP_DATA_DEFN __declspec(dllexport)
+# else /* PCRE_STATIC: same macros, but without __declspec(dllexport) */
+# define PCRE_EXP_DECL extern
+# define PCRE_EXP_DEFN
+# define PCRE_EXP_DATA_DEFN
+# endif /* PCRE_STATIC */
+# else /* not _WIN32 */
+# ifdef __cplusplus
+# define PCRE_EXP_DECL extern "C"
+# else
+# define PCRE_EXP_DECL extern
+# endif /* __cplusplus */
+# ifndef PCRE_EXP_DEFN /* kept as is if it was set before this point */
+# define PCRE_EXP_DEFN PCRE_EXP_DECL
+# endif
+# ifndef PCRE_EXP_DATA_DEFN /* kept as is if it was set before this point */
+# define PCRE_EXP_DATA_DEFN
+# endif
+# endif /* _WIN32 */
+#endif /* PCRE_EXP_DECL */
+
+/* We need to have types that specify unsigned 16-bit and 32-bit integers. We
+cannot determine these outside the compilation (e.g. by running a program as
+part of "configure") because PCRE is often cross-compiled for use on other
+systems. Instead we make use of the maximum sizes that are available at
+preprocessor time in standard C environments. */
+
+#if USHRT_MAX == 65535 /* unsigned short tops out at 2^16 - 1 */
+ typedef unsigned short pcre_uint16;
+#elif UINT_MAX == 65535 /* otherwise try unsigned int */
+ typedef unsigned int pcre_uint16;
+#else
+ #error Cannot determine a type for 16-bit unsigned integers
+#endif /* pcre_uint16 */
+
+#if UINT_MAX == 4294967295 /* unsigned int tops out at 2^32 - 1 */
+ typedef unsigned int pcre_uint32;
+#elif ULONG_MAX == 4294967295 /* otherwise try unsigned long */
+ typedef unsigned long int pcre_uint32;
+#else
+ #error Cannot determine a type for 32-bit unsigned integers
+#endif /* pcre_uint32 */
+
+/* All character handling must be done as unsigned characters. Otherwise there
+are problems with top-bit-set characters and functions such as isspace().
+However, we leave the interface to the outside world as char *, because that
+should make things easier for callers. We define a short type for unsigned char
+to save lots of typing. I tried "uchar", but it causes problems on Digital
+Unix, where it is defined in sys/types, so use "uschar" instead. */
+
+typedef unsigned char uschar;
+
+/* This is an unsigned int value that no character can ever have. UTF-8
+characters only go up to 0x7fffffff (though Unicode doesn't go beyond
+0x0010ffff). */
+
+#define NOTACHAR 0xffffffff
+
+/* PCRE is able to support several different kinds of newline (CR, LF, CRLF,
+"any" and "anycrlf" at present). The following macros are used to package up
+testing for newlines. NLBLOCK, PSSTART, and PSEND are defined in the various
+modules to indicate in which datablock the parameters exist, and what the
+start/end of string field names are. */
+
+#define NLTYPE_FIXED 0 /* Newline is a fixed length string */
+#define NLTYPE_ANY 1 /* Newline is any Unicode line ending */
+#define NLTYPE_ANYCRLF 2 /* Newline is CR, LF, or CRLF */
+
+/* This macro checks for a newline at the given position */
+
+#define IS_NEWLINE(p) \
+  ((NLBLOCK->nltype == NLTYPE_FIXED)? \
+    /* Fixed newline: compare the one or two bytes held in nl[] */ \
+    ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \
+     (p)[0] == NLBLOCK->nl[0] && \
+     (NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1])) \
+    : \
+    /* Any other newline type: let _pcre_is_newline() decide */ \
+    ((p) < NLBLOCK->PSEND && \
+     _pcre_is_newline((p), NLBLOCK->nltype, NLBLOCK->PSEND, \
+       &(NLBLOCK->nllen), utf8)))
+
+/* This macro checks for a newline immediately preceding the given position */
+
+#define WAS_NEWLINE(p) \
+  ((NLBLOCK->nltype == NLTYPE_FIXED)? \
+    /* Fixed newline: compare the nllen bytes that end just before p */ \
+    ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \
+     (p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \
+     (NLBLOCK->nllen == 1 || (p)[-NLBLOCK->nllen+1] == NLBLOCK->nl[1])) \
+    : \
+    /* Any other newline type: let _pcre_was_newline() decide */ \
+    ((p) > NLBLOCK->PSSTART && \
+     _pcre_was_newline((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \
+       &(NLBLOCK->nllen), utf8)))
+
+/* When PCRE is compiled as a C++ library, the subject pointer can be replaced
+with a custom type. This makes it possible, for example, to allow pcre_exec()
+to process subject strings that are discontinuous by using a smart pointer
+class. It must always be possible to inspect all of the subject string in
+pcre_exec() because of the way it backtracks. Two macros are required in the
+normal case, for sign-unspecified and unsigned char pointers. The former is
+used for the external interface and appears in pcre.h, which is why its name
+must begin with PCRE_. */
+
+#ifdef CUSTOM_SUBJECT_PTR
+#define PCRE_SPTR CUSTOM_SUBJECT_PTR
+#define USPTR CUSTOM_SUBJECT_PTR
+#else /* no custom type: ordinary const character pointers */
+#define PCRE_SPTR const char *
+#define USPTR const unsigned char *
+#endif /* CUSTOM_SUBJECT_PTR */
+
+
+
+/* Include the public PCRE header and the definitions of UCP character property
+values. */
+
+#include "pcre.h"
+#include "ucp.h"
+
+/* When compiling for use with the Virtual Pascal compiler, these functions
+need to have their names changed. PCRE must be compiled with the -DVPCOMPAT
+option on the command line. */
+
+#ifdef VPCOMPAT
+#define strlen(s) _strlen(s)
+#define strncmp(s1,s2,m) _strncmp(s1,s2,m)
+#define memcmp(s,c,n) _memcmp(s,c,n)
+#define memcpy(d,s,n) _memcpy(d,s,n)
+#define memmove(d,s,n) _memmove(d,s,n)
+#define memset(s,c,n) _memset(s,c,n)
+#else /* VPCOMPAT */
+
+/* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
+define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
+is set. Otherwise, include an emulating function for those systems that have
+neither (there are some non-Unix environments where this is the case). */
+
+#ifndef HAVE_MEMMOVE
+#undef memmove /* some systems may have a macro */
+#ifdef HAVE_BCOPY
+#define memmove(a, b, c) bcopy(b, a, c)
+#else /* HAVE_BCOPY */
+static void *
+pcre_memmove(void *d, const void *s, size_t n)
+{
+/* Stand-in for memmove(). The copy direction depends on which of the two
+pointers is higher, so that overlapping regions are copied correctly. The
+value returned is the destination pointer. */
+size_t k;
+unsigned char *to = (unsigned char *)d;
+const unsigned char *from = (const unsigned char *)s;
+if (to > from)
+  {
+  for (k = n; k > 0; k--) to[k-1] = from[k-1];   /* last byte first */
+  }
+else
+  {
+  for (k = 0; k < n; k++) to[k] = from[k];       /* first byte first */
+  }
+return d;
+}
+#define memmove(a, b, c) pcre_memmove(a, b, c)
+#endif /* not HAVE_BCOPY */
+#endif /* not HAVE_MEMMOVE */
+#endif /* not VPCOMPAT */
+
+
+/* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
+in big-endian order) by default. These are used, for example, to link from the
+start of a subpattern to its alternatives and its end. The use of 2 bytes per
+offset limits the size of the compiled regex to around 64K, which is big enough
+for almost everybody. However, I received a request for an even bigger limit.
+For this reason, and also to make the code easier to maintain, the storing and
+loading of offsets from the byte string is now handled by the macros that are
+defined here.
+
+The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
+the config.h file, but can be overridden by using -D on the command line. This
+is automated on Unix systems via the "configure" command. */
+
+#if LINK_SIZE == 2
+/* PUT stores d at a[n] onwards, most significant byte first; GET reads it. */
+#define PUT(a,n,d) \
+  (((a)[n] = (d) >> 8), \
+   ((a)[(n)+1] = (d) & 255))
+
+#define GET(a,n) \
+  (((a)[n] << 8) | (a)[(n)+1])
+
+#define MAX_PATTERN_SIZE (1 << 16)
+
+
+#elif LINK_SIZE == 3
+/* Same scheme with three bytes per offset. */
+#define PUT(a,n,d) \
+  (((a)[n] = (d) >> 16), \
+   ((a)[(n)+1] = (d) >> 8), \
+   ((a)[(n)+2] = (d) & 255))
+
+#define GET(a,n) \
+  (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])
+
+#define MAX_PATTERN_SIZE (1 << 24)
+
+
+#elif LINK_SIZE == 4
+/* Same scheme with four bytes per offset. */
+#define PUT(a,n,d) \
+  (((a)[n] = (d) >> 24), \
+   ((a)[(n)+1] = (d) >> 16), \
+   ((a)[(n)+2] = (d) >> 8), \
+   ((a)[(n)+3] = (d) & 255))
+
+#define GET(a,n) \
+  (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])
+
+#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
+
+
+#else
+#error LINK_SIZE must be either 2, 3, or 4
+#endif /* LINK_SIZE */
+
+
+/* Convenience macro defined in terms of the others */
+
+#define PUTINC(a,n,d) (PUT(a,n,d), (a) += LINK_SIZE) /* store, then step a on */
+
+
+/* PCRE uses some other 2-byte quantities that do not change when the size of
+offsets changes. These are used for repeat counts and for other things such as
+capturing parenthesis numbers in back references. */
+
+#define PUT2(a,n,d) \
+  (((a)[n] = (d) >> 8), \
+   ((a)[(n)+1] = (d) & 255))
+/* PUT2 is one comma expression, so it is safe as an unbraced if/else body. */
+#define GET2(a,n) \
+  (((a)[n] << 8) | (a)[(n)+1])
+/* PUT2INC stores as PUT2 does and then advances a past the two bytes. */
+#define PUT2INC(a,n,d) (PUT2(a,n,d), (a) += 2)
+
+
+/* When UTF-8 encoding is being used, a character is no longer just a single
+byte. The macros for character handling generate simple sequences when used in
+byte-mode, and more complicated ones for UTF-8 characters. BACKCHAR should
+never be called in byte mode. To make sure it can never even appear when UTF-8
+support is omitted, we don't even define it. */
+
+#ifndef SUPPORT_UTF8
+#define NEXTCHAR(p) p++;
+#define GETCHAR(c, eptr) c = *eptr;
+#define GETCHARTEST(c, eptr) c = *eptr;
+#define GETCHARINC(c, eptr) c = *eptr++;
+#define GETCHARINCTEST(c, eptr) c = *eptr++;
+#define GETCHARLEN(c, eptr, len) c = *eptr;
+/* #define BACKCHAR(eptr) */
+/* Note that each byte-mode expansion above carries its own final semicolon. */
+#else /* SUPPORT_UTF8 */
+
+/* Advance a character pointer one byte in non-UTF-8 mode and by one character
+in UTF-8 mode. */
+
+#define NEXTCHAR(p) \
+ p++; \
+ if (utf8) { while((*p & 0xc0) == 0x80) p++; } /* skip 10xxxxxx bytes */
+
+/* Get the next UTF-8 character, not advancing the pointer. This is called when
+we know we are in UTF-8 mode. */
+
+#define GETCHAR(c, eptr) \
+ c = *eptr; \
+ if (c >= 0xc0) /* first byte says that more bytes follow */ \
+ { \
+ int gcii; \
+ int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
+ int gcss = 6*gcaa; /* Shift for the bits kept from the first byte */ \
+ c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
+ for (gcii = 1; gcii <= gcaa; gcii++) \
+ { \
+ gcss -= 6; \
+ c |= (eptr[gcii] & 0x3f) << gcss; /* Low 6 bits of each further byte */ \
+ } \
+ }
+
+/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
+pointer. */
+
+#define GETCHARTEST(c, eptr) \
+ c = *eptr; \
+ if (utf8 && c >= 0xc0) /* relies on a variable called utf8 being in scope */ \
+ { \
+ int gcii; \
+ int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
+ int gcss = 6*gcaa; /* Shift for the bits kept from the first byte */ \
+ c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
+ for (gcii = 1; gcii <= gcaa; gcii++) \
+ { \
+ gcss -= 6; \
+ c |= (eptr[gcii] & 0x3f) << gcss; /* Low 6 bits of each further byte */ \
+ } \
+ }
+
+/* Get the next UTF-8 character, advancing the pointer. This is called when we
+know we are in UTF-8 mode. */
+
+#define GETCHARINC(c, eptr) \
+ c = *eptr++; \
+ if (c >= 0xc0) /* first byte says that more bytes follow */ \
+ { \
+ int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
+ int gcss = 6*gcaa; /* Shift for the bits kept from the first byte */ \
+ c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
+ while (gcaa-- > 0) \
+ { \
+ gcss -= 6; \
+ c |= (*eptr++ & 0x3f) << gcss; /* eptr moves past each byte it reads */ \
+ } \
+ }
+
+/* Get the next character, testing for UTF-8 mode, and advancing the pointer */
+
+#define GETCHARINCTEST(c, eptr) \
+ c = *eptr++; \
+ if (utf8 && c >= 0xc0) /* relies on a variable called utf8 being in scope */ \
+ { \
+ int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
+ int gcss = 6*gcaa; /* Shift for the bits kept from the first byte */ \
+ c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
+ while (gcaa-- > 0) \
+ { \
+ gcss -= 6; \
+ c |= (*eptr++ & 0x3f) << gcss; /* eptr moves past each byte it reads */ \
+ } \
+ }
+
+/* Get the next UTF-8 character, not advancing the pointer, incrementing length
+if there are extra bytes. This is called when we know we are in UTF-8 mode. */
+
+#define GETCHARLEN(c, eptr, len) \
+ c = *eptr; \
+ if (c >= 0xc0) /* first byte says that more bytes follow */ \
+ { \
+ int gcii; \
+ int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
+ int gcss = 6*gcaa; /* Shift for the bits kept from the first byte */ \
+ c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
+ for (gcii = 1; gcii <= gcaa; gcii++) \
+ { \
+ gcss -= 6; \
+ c |= (eptr[gcii] & 0x3f) << gcss; /* Low 6 bits of each further byte */ \
+ } \
+ len += gcaa; /* len grows by the number of additional bytes */ \
+ }
+
+/* If the pointer is not at the start of a character, move it back until
+it is. This is called only in UTF-8 mode - we don't put a test within the macro
+because almost all calls are already within a block of UTF-8 only code. */
+
+#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr-- /* no final ; */
+
+#endif /* SUPPORT_UTF8 */
+
+
+/* In case there is no definition of offsetof() provided - though any proper
+Standard C system should have one. */
+
+#ifndef offsetof
+#define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field)) /* address of field in an object at address 0 */
+#endif /* offsetof */
+
+
+/* These are the public options that can change during matching. */
+
+#define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
+
+/* Private flags containing information about the compiled regex. They used to
+live at the top end of the options word, but that got almost full, so now they
+are in a 16-bit flags word. */
+
+#define PCRE_NOPARTIAL 0x0001 /* can't use partial with this regex */
+#define PCRE_FIRSTSET 0x0002 /* first_byte is set */
+#define PCRE_REQCHSET 0x0004 /* req_byte is set */
+#define PCRE_STARTLINE 0x0008 /* start after \n for multiline */
+#define PCRE_JCHANGED 0x0010 /* j option used in regex */
+#define PCRE_HASCRORLF 0x0020 /* explicit \r or \n in pattern */
+
+/* Options for the "extra" block produced by pcre_study(). */
+
+#define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */
+
+/* Masks for identifying the public options that are permitted at compile
+time, run time, or study time, respectively. */
+
+#define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY| \
+ PCRE_NEWLINE_ANYCRLF)
+
+#define PUBLIC_OPTIONS \
+ (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
+ PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
+ PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
+ PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
+ PCRE_JAVASCRIPT_COMPAT)
+
+#define PUBLIC_EXEC_OPTIONS \
+ (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
+ PCRE_PARTIAL|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)
+
+#define PUBLIC_DFA_EXEC_OPTIONS \
+ (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
+ PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS| \
+ PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)
+
+#define PUBLIC_STUDY_OPTIONS 0 /* None defined */
+
+/* Magic number to provide a small check against being handed junk. Also used
+to detect whether a pattern was compiled on a host of different endianness. */
+
+#define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */
+
+/* Negative values for the firstchar and reqchar variables */
+
+#define REQ_UNSET (-2)
+#define REQ_NONE (-1)
+
+/* The maximum remaining length of subject we are prepared to search for a
+req_byte match. */
+
+#define REQ_BYTE_MAX 1000
+
+/* Flags added to firstbyte or reqbyte; a "non-literal" item is either a
+variable-length repeat, or anything other than literal characters. */
+
+#define REQ_CASELESS 0x0100 /* indicates caselessness */
+#define REQ_VARY 0x0200 /* reqbyte followed non-literal item */
+
+/* Miscellaneous definitions */
+
+typedef int BOOL;
+
+#define FALSE 0
+#define TRUE 1
+
+/* Escape items that are just an encoding of a particular data value. */
+
+#ifndef ESC_e
+#define ESC_e 27
+#endif
+
+#ifndef ESC_f
+#define ESC_f '\f'
+#endif
+
+#ifndef ESC_n
+#define ESC_n '\n'
+#endif
+
+#ifndef ESC_r
+#define ESC_r '\r'
+#endif
+
+/* We can't officially use ESC_t because it is a POSIX reserved identifier
+(presumably because of all the others like size_t). */
+
+#ifndef ESC_tee
+#define ESC_tee '\t'
+#endif
+
+/* Codes for different types of Unicode property */
+
+#define PT_ANY 0 /* Any property - matches all chars */
+#define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */
+#define PT_GC 2 /* General characteristic (e.g. L) */
+#define PT_PC 3 /* Particular characteristic (e.g. Lu) */
+#define PT_SC 4 /* Script (e.g. Han) */
+
+/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
+contain UTF-8 characters with values greater than 255. */
+
+#define XCL_NOT 0x01 /* Flag: this is a negative class */
+#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
+
+#define XCL_END 0 /* Marks end of individual items */
+#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
+#define XCL_RANGE 2 /* A range (two multibyte chars) follows */
+#define XCL_PROP 3 /* Unicode property (2-byte property code follows) */
+#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */
+
+/* These are escaped items that aren't just an encoding of a particular data
+value such as \n. They must have non-zero values, as check_escape() returns
+their negation. Also, they must appear in the same order as in the opcode
+definitions below, up to ESC_z. There's a dummy for OP_ANY because it
+corresponds to "." rather than an escape sequence, and another for OP_ALLANY
+(which is used for [^] in JavaScript compatibility mode).
+
+The final escape must be ESC_REF as subsequent values are used for
+backreferences (\1, \2, \3, etc). There are two tests in the code for an escape
+greater than ESC_b and less than ESC_Z to detect the types that may be
+repeated. These are the types that consume characters. If any new escapes are
+put in between that don't consume a character, that code will have to change.
+*/
+
+enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
+ ESC_W, ESC_w, ESC_dum1, ESC_dum2, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, /* dum1, dum2 line up with OP_ANY, OP_ALLANY */
+ ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_g, ESC_k,
+ ESC_REF }; /* ESC_REF must stay last: larger values are back references */
+
+
+/* Opcode table: Starting from 1 (i.e. after OP_END), the values up to
+OP_EOD must correspond in order to the list of escapes immediately above.
+
+*** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions
+that follow must also be updated to match. There is also a table called
+"coptable" in pcre_dfa_exec.c that must be updated. */
+
+enum {
+ OP_END, /* 0 End of pattern */
+
+ /* Values corresponding to backslashed metacharacters */
+
+ OP_SOD, /* 1 Start of data: \A */
+ OP_SOM, /* 2 Start of match (subject + offset): \G */
+ OP_SET_SOM, /* 3 Set start of match (\K) */
+ OP_NOT_WORD_BOUNDARY, /* 4 \B */
+ OP_WORD_BOUNDARY, /* 5 \b */
+ OP_NOT_DIGIT, /* 6 \D */
+ OP_DIGIT, /* 7 \d */
+ OP_NOT_WHITESPACE, /* 8 \S */
+ OP_WHITESPACE, /* 9 \s */
+ OP_NOT_WORDCHAR, /* 10 \W */
+ OP_WORDCHAR, /* 11 \w */
+ OP_ANY, /* 12 Match any character (subject to DOTALL) */
+ OP_ALLANY, /* 13 Match any character (not subject to DOTALL) */
+ OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
+ OP_NOTPROP, /* 15 \P (not Unicode property) */
+ OP_PROP, /* 16 \p (Unicode property) */
+ OP_ANYNL, /* 17 \R (any newline sequence) */
+ OP_NOT_HSPACE, /* 18 \H (not horizontal whitespace) */
+ OP_HSPACE, /* 19 \h (horizontal whitespace) */
+ OP_NOT_VSPACE, /* 20 \V (not vertical whitespace) */
+ OP_VSPACE, /* 21 \v (vertical whitespace) */
+ OP_EXTUNI, /* 22 \X (extended Unicode sequence) */
+ OP_EODN, /* 23 End of data or \n at end of data: \Z. */
+ OP_EOD, /* 24 End of data: \z */
+
+ OP_OPT, /* 25 Set runtime options */
+ OP_CIRC, /* 26 Start of line - varies with multiline switch */
+ OP_DOLL, /* 27 End of line - varies with multiline switch */
+ OP_CHAR, /* 28 Match one character, casefully */
+ OP_CHARNC, /* 29 Match one character, caselessly */
+ OP_NOT, /* 30 Match one character, not the following one */
+
+ OP_STAR, /* 31 The maximizing and minimizing versions of */
+ OP_MINSTAR, /* 32 these six opcodes must come in pairs, with */
+ OP_PLUS, /* 33 the minimizing one second. */
+ OP_MINPLUS, /* 34 This first set applies to single characters.*/
+ OP_QUERY, /* 35 */
+ OP_MINQUERY, /* 36 */
+
+ OP_UPTO, /* 37 From 0 to n matches */
+ OP_MINUPTO, /* 38 */
+ OP_EXACT, /* 39 Exactly n matches */
+
+ OP_POSSTAR, /* 40 Possessified star */
+ OP_POSPLUS, /* 41 Possessified plus */
+ OP_POSQUERY, /* 42 Possessified query */
+ OP_POSUPTO, /* 43 Possessified upto */
+
+ OP_NOTSTAR, /* 44 The maximizing and minimizing versions of */
+ OP_NOTMINSTAR, /* 45 these six opcodes must come in pairs, with */
+ OP_NOTPLUS, /* 46 the minimizing one second. They must be in */
+ OP_NOTMINPLUS, /* 47 exactly the same order as those above. */
+ OP_NOTQUERY, /* 48 This set applies to "not" single characters. */
+ OP_NOTMINQUERY, /* 49 */
+
+ OP_NOTUPTO, /* 50 From 0 to n matches */
+ OP_NOTMINUPTO, /* 51 */
+ OP_NOTEXACT, /* 52 Exactly n matches */
+
+ OP_NOTPOSSTAR, /* 53 Possessified versions */
+ OP_NOTPOSPLUS, /* 54 */
+ OP_NOTPOSQUERY, /* 55 */
+ OP_NOTPOSUPTO, /* 56 */
+
+ OP_TYPESTAR, /* 57 The maximizing and minimizing versions of */
+ OP_TYPEMINSTAR, /* 58 these six opcodes must come in pairs, with */
+ OP_TYPEPLUS, /* 59 the minimizing one second. These codes must */
+ OP_TYPEMINPLUS, /* 60 be in exactly the same order as those above. */
+ OP_TYPEQUERY, /* 61 This set applies to character types such as \d */
+ OP_TYPEMINQUERY, /* 62 */
+
+ OP_TYPEUPTO, /* 63 From 0 to n matches */
+ OP_TYPEMINUPTO, /* 64 */
+ OP_TYPEEXACT, /* 65 Exactly n matches */
+
+ OP_TYPEPOSSTAR, /* 66 Possessified versions */
+ OP_TYPEPOSPLUS, /* 67 */
+ OP_TYPEPOSQUERY, /* 68 */
+ OP_TYPEPOSUPTO, /* 69 */
+
+ OP_CRSTAR, /* 70 The maximizing and minimizing versions of */
+ OP_CRMINSTAR, /* 71 all these opcodes must come in pairs, with */
+ OP_CRPLUS, /* 72 the minimizing one second. These codes must */
+ OP_CRMINPLUS, /* 73 be in exactly the same order as those above. */
+ OP_CRQUERY, /* 74 These are for character classes and back refs */
+ OP_CRMINQUERY, /* 75 */
+ OP_CRRANGE, /* 76 These are different to the three sets above. */
+ OP_CRMINRANGE, /* 77 */
+
+ OP_CLASS, /* 78 Match a character class, chars < 256 only */
+ OP_NCLASS, /* 79 Same, but the bitmap was created from a negative
+ class - the difference is relevant only when a UTF-8
+ character > 255 is encountered. */
+
+ OP_XCLASS, /* 80 Extended class for handling UTF-8 chars within the
+ class. This does both positive and negative. */
+
+ OP_REF, /* 81 Match a back reference */
+ OP_RECURSE, /* 82 Match a numbered subpattern (possibly recursive) */
+ OP_CALLOUT, /* 83 Call out to external function if provided */
+
+ OP_ALT, /* 84 Start of alternation */
+ OP_KET, /* 85 End of group that doesn't have an unbounded repeat */
+ OP_KETRMAX, /* 86 These two must remain together and in this */
+ OP_KETRMIN, /* 87 order. They are for groups that repeat for ever. */
+
+ /* The assertions must come before BRA, CBRA, ONCE, and COND.*/
+
+ OP_ASSERT, /* 88 Positive lookahead */
+ OP_ASSERT_NOT, /* 89 Negative lookahead */
+ OP_ASSERTBACK, /* 90 Positive lookbehind */
+ OP_ASSERTBACK_NOT, /* 91 Negative lookbehind */
+ OP_REVERSE, /* 92 Move pointer back - used in lookbehind assertions */
+
+ /* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first,
+ as there's a test for >= ONCE for a subpattern that isn't an assertion. */
+
+ OP_ONCE, /* 93 Atomic group */
+ OP_BRA, /* 94 Start of non-capturing bracket */
+ OP_CBRA, /* 95 Start of capturing bracket */
+ OP_COND, /* 96 Conditional group */
+
+ /* These three must follow the previous three, in the same order. There's a
+ check for >= SBRA to distinguish the two sets. */
+
+ OP_SBRA, /* 97 Start of non-capturing bracket, check empty */
+ OP_SCBRA, /* 98 Start of capturing bracket, check empty */
+ OP_SCOND, /* 99 Conditional group, check empty */
+
+ OP_CREF, /* 100 Used to hold a capture number as condition */
+ OP_RREF, /* 101 Used to hold a recursion number as condition */
+ OP_DEF, /* 102 The DEFINE condition */
+
+ OP_BRAZERO, /* 103 These two must remain together and in this */
+ OP_BRAMINZERO, /* 104 order. */
+
+ /* These are backtracking control verbs */
+
+ OP_PRUNE, /* 105 */
+ OP_SKIP, /* 106 */
+ OP_THEN, /* 107 */
+ OP_COMMIT, /* 108 */
+
+ /* These are forced failure and success verbs */
+
+ OP_FAIL, /* 109 */
+ OP_ACCEPT, /* 110 */
+
+ /* This is used to skip a subpattern with a {0} quantifier */
+
+ OP_SKIPZERO /* 111 */
+};
+
+
+/* This macro defines textual names for all the opcodes. These are used only
+for debugging. The macro is referenced only in pcre_printint.c. */
+
+#define OP_NAME_LIST \
+ "End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", /* 0-7 */ \
+ "\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", /* 8-14 */ \
+ "notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", /* 15-21 */ \
+ "extuni", "\\Z", "\\z", /* 22-24 */ \
+ "Opt", "^", "$", "char", "charnc", "not", /* 25-30 */ \
+ "*", "*?", "+", "+?", "?", "??", "{", "{", "{", /* 31-39 */ \
+ "*+","++", "?+", "{", /* 40-43 */ \
+ "*", "*?", "+", "+?", "?", "??", "{", "{", "{", /* 44-52 */ \
+ "*+","++", "?+", "{", /* 53-56 */ \
+ "*", "*?", "+", "+?", "?", "??", "{", "{", "{", /* 57-65 */ \
+ "*+","++", "?+", "{", /* 66-69 */ \
+ "*", "*?", "+", "+?", "?", "??", "{", "{", /* 70-77 */ \
+ "class", "nclass", "xclass", "Ref", "Recurse", "Callout", /* 78-83 */ \
+ "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", /* 84-89 */ \
+ "AssertB", "AssertB not", "Reverse", /* 90-92 */ \
+ "Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", /* 93-99 */ \
+ "Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero", /* 100-104 */ \
+ "*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", /* 105-110 */ \
+ "Skip zero" /* 111 */
+
+
+/* This macro defines the length of fixed length operations in the compiled
+regex. The lengths are used when searching for specific things, and also in the
+debugging printing of a compiled regex. We use a macro so that it can be
+defined close to the definitions of the opcodes themselves.
+
+As things have been extended, some of these are no longer fixed lengths, but are
+minima instead. For example, the length of a single-character repeat may vary
+in UTF-8 mode. The code that uses this table must know about such things. */
+
+#define OP_LENGTHS \
+ 1, /* End */ \
+ 1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \
+ 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \
+ 1, 1, 1, /* Any, AllAny, Anybyte */ \
+ 3, 3, 1, /* NOTPROP, PROP, \R */ \
+ 1, 1, 1, 1, 1, /* \H, \h, \V, \v, EXTUNI */ \
+ 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
+ 2, /* Char - the minimum length */ \
+ 2, /* Charnc - the minimum length */ \
+ 2, /* not */ \
+ /* Positive single-char repeats ** These are */ \
+ 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
+ 4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \
+ 2, 2, 2, 4, /* *+, ++, ?+, upto+ */ \
+ /* Negative single-char repeats - only for chars < 256 */ \
+ 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
+ 4, 4, 4, /* NOT upto, minupto, exact */ \
+ 2, 2, 2, 4, /* Possessive *, +, ?, upto */ \
+ /* Positive type repeats */ \
+ 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
+ 4, 4, 4, /* Type upto, minupto, exact */ \
+ 2, 2, 2, 4, /* Possessive *+, ++, ?+, upto+ */ \
+ /* Character class & ref repeats */ \
+ 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \
+ 5, 5, /* CRRANGE, CRMINRANGE */ \
+ 33, /* CLASS */ \
+ 33, /* NCLASS */ \
+ 0, /* XCLASS - variable length */ \
+ 3, /* REF */ \
+ 1+LINK_SIZE, /* RECURSE */ \
+ 2+2*LINK_SIZE, /* CALLOUT */ \
+ 1+LINK_SIZE, /* Alt */ \
+ 1+LINK_SIZE, /* Ket */ \
+ 1+LINK_SIZE, /* KetRmax */ \
+ 1+LINK_SIZE, /* KetRmin */ \
+ 1+LINK_SIZE, /* Assert */ \
+ 1+LINK_SIZE, /* Assert not */ \
+ 1+LINK_SIZE, /* Assert behind */ \
+ 1+LINK_SIZE, /* Assert behind not */ \
+ 1+LINK_SIZE, /* Reverse */ \
+ 1+LINK_SIZE, /* ONCE */ \
+ 1+LINK_SIZE, /* BRA */ \
+ 3+LINK_SIZE, /* CBRA */ \
+ 1+LINK_SIZE, /* COND */ \
+ 1+LINK_SIZE, /* SBRA */ \
+ 3+LINK_SIZE, /* SCBRA */ \
+ 1+LINK_SIZE, /* SCOND */ \
+ 3, /* CREF */ \
+ 3, /* RREF */ \
+ 1, /* DEF */ \
+ 1, 1, /* BRAZERO, BRAMINZERO */ \
+ 1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \
+ 1, 1, 1 /* FAIL, ACCEPT, SKIPZERO */
+
+
+/* A magic value for OP_RREF to indicate the "any recursion" condition. */
+
+#define RREF_ANY 0xffff
+
+/* Error code numbers. They are given names so that they can more easily be
+tracked. */
+
+enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
+ ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19,
+ ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29,
+ ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
+ ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
+ ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
+ ERR60, ERR61, ERR62, ERR63, ERR64 };
+
+/* The real format of the start of the pcre block; the index of names and the
+code vector run on as long as necessary after the end. We store an explicit
+offset to the name table so that if a regex is compiled on one host, saved, and
+then run on another where the size of pointers is different, all might still
+be well. For the case of compiled-on-4 and run-on-8, we include an extra
+pointer that is always NULL. For future-proofing, a few dummy fields were
+originally included - even though you can never get this planning right - but
+there is only one left now.
+
+NOTE NOTE NOTE:
+Because people can now save and re-use compiled patterns, any additions to this
+structure should be made at the end, and something earlier (e.g. a new
+flag in the options or one of the dummy fields) should indicate that the new
+fields are present. Currently PCRE always sets the dummy fields to zero.
+NOTE NOTE NOTE:
+*/
+
+typedef struct real_pcre {
+ pcre_uint32 magic_number;
+ pcre_uint32 size; /* Total that was malloced */
+ pcre_uint32 options; /* Public options */
+ pcre_uint16 flags; /* Private flags */
+ pcre_uint16 dummy1; /* For future use; currently always set to zero */
+ pcre_uint16 top_bracket;
+ pcre_uint16 top_backref;
+ pcre_uint16 first_byte;
+ pcre_uint16 req_byte;
+ pcre_uint16 name_table_offset; /* Offset to name table that follows */
+ pcre_uint16 name_entry_size; /* Size of any name items */
+ pcre_uint16 name_count; /* Number of name items */
+ pcre_uint16 ref_count; /* Reference count, maintained by pcre_refcount() */
+
+ const unsigned char *tables; /* Pointer to tables or NULL for std */
+ const unsigned char *nullpad; /* Always NULL; for compiled-on-4, run-on-8 */
+} real_pcre;
+
+/* The format of the block used to store data from pcre_study(). The same
+remark (see NOTE above) about extending this structure applies. */
+
+typedef struct pcre_study_data {
+ pcre_uint32 size; /* Total that was malloced */
+ pcre_uint32 options;
+ uschar start_bits[32];
+} pcre_study_data;
+
+/* Structure for passing "static" information around between the functions
+doing the compiling, so that they are thread-safe. */
+
+typedef struct compile_data {
+ const uschar *lcc; /* Points to lower casing table */
+ const uschar *fcc; /* Points to case-flipping table */
+ const uschar *cbits; /* Points to class bitmap tables (see cbit_xxx) */
+ const uschar *ctypes; /* Points to character type table (see ctype_xxx) */
+ const uschar *start_workspace;/* The start of working space */
+ const uschar *start_code; /* The start of the compiled code */
+ const uschar *start_pattern; /* The start of the pattern */
+ const uschar *end_pattern; /* The end of the pattern */
+ uschar *hwm; /* High watermark of workspace */
+ uschar *name_table; /* The name/number table */
+ int names_found; /* Number of entries so far */
+ int name_entry_size; /* Size of each entry */
+ int bracount; /* Count of capturing parens as we compile */
+ int final_bracount; /* Saved value after first pass */
+ int top_backref; /* Maximum back reference */
+ unsigned int backref_map; /* Bitmap of low back refs */
+ int external_options; /* External (initial) options */
+ int external_flags; /* External flag bits to be set */
+ int req_varyopt; /* "After variable item" flag for reqbyte */
+ BOOL had_accept; /* (*ACCEPT) encountered */
+ int nltype; /* Newline type */
+ int nllen; /* Newline string length */
+ uschar nl[4]; /* Newline string when fixed length */
+} compile_data;
+
+/* Structure for maintaining a chain of pointers to the currently incomplete
+branches, for testing for left recursion. */
+
+typedef struct branch_chain {
+ struct branch_chain *outer; /* Next link in the chain (presumably the enclosing level - confirm) */
+ uschar *current; /* Presumably the incomplete branch at this level - confirm */
+} branch_chain;
+
+/* Structure for items in a linked list that represents an explicit recursive
+call within the pattern. */
+
+typedef struct recursion_info {
+ struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
+ int group_num; /* Number of group that was called */
+ const uschar *after_call; /* "Return value": points after the call in the expr */
+ USPTR save_start; /* Old value of mstart */
+ int *offset_save; /* Pointer to start of saved offsets */
+ int saved_max; /* Number of saved offsets */
+} recursion_info;
+
+/* Structure for building a chain of data for holding the values of the subject
+pointer at the start of each subpattern, so as to detect when an empty string
+has been matched by a subpattern - to break infinite loops. */
+
+typedef struct eptrblock {
+ struct eptrblock *epb_prev; /* Previous block in the chain */
+ USPTR epb_saved_eptr; /* Subject pointer saved at the start of a subpattern */
+} eptrblock;
+
+
+/* Structure for passing "static" information around between the functions
+doing traditional NFA matching, so that they are thread-safe. */
+
+typedef struct match_data {
+ unsigned long int match_call_count; /* As it says */
+ unsigned long int match_limit; /* As it says */
+ unsigned long int match_limit_recursion; /* As it says */
+ int *offset_vector; /* Offset vector */
+ int offset_end; /* One past the end */
+ int offset_max; /* The maximum usable for return data */
+ int nltype; /* Newline type */
+ int nllen; /* Newline string length */
+ uschar nl[4]; /* Newline string when fixed */
+ const uschar *lcc; /* Points to lower casing table */
+ const uschar *ctypes; /* Points to table of type maps */
+ BOOL offset_overflow; /* Set if too many extractions */
+ BOOL notbol; /* NOTBOL flag */
+ BOOL noteol; /* NOTEOL flag */
+ BOOL utf8; /* UTF8 flag */
+ BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */
+ BOOL endonly; /* Dollar not before final \n */
+ BOOL notempty; /* Empty string match not wanted */
+ BOOL partial; /* PARTIAL flag */
+ BOOL hitend; /* Hit the end of the subject at some point */
+ BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */
+ const uschar *start_code; /* For use when recursing */
+ USPTR start_subject; /* Start of the subject string */
+ USPTR end_subject; /* End of the subject string */
+ USPTR start_match_ptr; /* Start of matched string */
+ USPTR end_match_ptr; /* Subject position at end match */
+ int end_offset_top; /* Highwater mark at end of match */
+ int capture_last; /* Most recent capture number */
+ int start_offset; /* The start offset value */
+ eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */
+ int eptrn; /* Next free eptrblock */
+ recursion_info *recursive; /* Linked list of recursion data */
+ void *callout_data; /* To pass back to callouts */
+} match_data;
+
+/* A similar structure is used for the same purpose by the DFA matching
+functions. */
+
+typedef struct dfa_match_data {
+ const uschar *start_code; /* Start of the compiled pattern */
+ const uschar *start_subject; /* Start of the subject string */
+ const uschar *end_subject; /* End of subject string */
+ const uschar *tables; /* Character tables */
+ int moptions; /* Match options */
+ int poptions; /* Pattern options */
+ int nltype; /* Newline type */
+ int nllen; /* Newline string length */
+ uschar nl[4]; /* Newline string when fixed */
+ void *callout_data; /* To pass back to callouts */
+} dfa_match_data;
+
+/* Bit definitions for entries in the pcre_ctypes table. */
+
+#define ctype_space 0x01
+#define ctype_letter 0x02
+#define ctype_digit 0x04
+#define ctype_xdigit 0x08
+#define ctype_word 0x10 /* alphanumeric or '_' */
+#define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */
+
+/* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
+of bits for a class map. Some classes are built by combining these tables. */
+
+#define cbit_space 0 /* [:space:] or \s */
+#define cbit_xdigit 32 /* [:xdigit:] */
+#define cbit_digit 64 /* [:digit:] or \d */
+#define cbit_upper 96 /* [:upper:] */
+#define cbit_lower 128 /* [:lower:] */
+#define cbit_word 160 /* [:word:] or \w */
+#define cbit_graph 192 /* [:graph:] */
+#define cbit_print 224 /* [:print:] */
+#define cbit_punct 256 /* [:punct:] */
+#define cbit_cntrl 288 /* [:cntrl:] */
+#define cbit_length 320 /* Length of the cbits table */
+
+/* Offsets of the various tables from the base tables pointer, and
+total length. */
+
+#define lcc_offset 0
+#define fcc_offset 256
+#define cbits_offset 512
+#define ctypes_offset (cbits_offset + cbit_length)
+#define tables_length (ctypes_offset + 256)
+
+/* Layout of the UCP type table that translates property names into types and
+codes. Each entry used to point directly to a name, but to reduce the number of
+relocations in shared libraries, it now has an offset into a single string
+instead. */
+
+typedef struct {
+ pcre_uint16 name_offset; /* Offset of the property name in the single names string */
+ pcre_uint16 type; /* Type that the property name translates to */
+ pcre_uint16 value; /* Presumably the code that goes with the type - confirm */
+} ucp_type_table;
+
+
+/* Internal shared data tables. These are tables that are used by more than one
+of the exported public functions. They have to be "external" in the C sense,
+but are not part of the PCRE public API. The data for these tables is in the
+pcre_tables.c module. */
+
+extern const int _pcre_utf8_table1[];
+extern const int _pcre_utf8_table2[];
+extern const int _pcre_utf8_table3[];
+extern const uschar _pcre_utf8_table4[];
+
+extern const int _pcre_utf8_table1_size;
+
+extern const char _pcre_utt_names[];
+extern const ucp_type_table _pcre_utt[];
+extern const int _pcre_utt_size;
+
+extern const uschar _pcre_default_tables[];
+
+extern const uschar _pcre_OP_lengths[];
+
+
+/* Internal shared functions. These are functions that are used by more than
+one of the exported public functions. They have to be "external" in the C
+sense, but are not part of the PCRE public API. */
+
+extern BOOL _pcre_is_newline(const uschar *, int, const uschar *,
+ int *, BOOL);
+extern int _pcre_ord2utf8(int, uschar *);
+extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
+ const pcre_study_data *, pcre_study_data *);
+extern int _pcre_ucp_findprop(const unsigned int, int *, int *);
+extern unsigned int _pcre_ucp_othercase(const unsigned int);
+extern int _pcre_valid_utf8(const uschar *, int);
+extern BOOL _pcre_was_newline(const uschar *, int, const uschar *,
+ int *, BOOL);
+extern BOOL _pcre_xclass(int, const uschar *);
+
+#endif
+
+/* End of pcre_internal.h */
diff --git a/src/pcre_maketables.c b/src/pcre_maketables.c
new file mode 100644
index 0000000..219973e
--- /dev/null
+++ b/src/pcre_maketables.c
@@ -0,0 +1,143 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2008 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This module contains the external function pcre_maketables(), which builds
+character tables for PCRE in the current locale. The file is compiled on its
+own as part of the PCRE library. However, it is also included in the
+compilation of dftables.c, in which case the macro DFTABLES is defined. */
+
+
+#ifndef DFTABLES
+# ifdef HAVE_CONFIG_H
+# include "config.h"
+# endif
+# include "pcre_internal.h"
+#endif
+
+
+/*************************************************
+* Create PCRE character tables *
+*************************************************/
+
+/* This function builds a set of character tables for use by PCRE and returns
+a pointer to them. They are built using the ctype functions, and consequently
+their contents will depend upon the current locale setting. When compiled as
+part of the library, the store is obtained via pcre_malloc(), but when compiled
+inside dftables, use malloc().
+
+Arguments: none
+Returns: pointer to the contiguous block of data
+*/
+
+const unsigned char *
+pcre_maketables(void)
+{
+unsigned char *tables;
+unsigned char *lower, *flip, *classbits, *types;
+int c;
+
+/* Get one block big enough for all four tables: from pcre_malloc() when built
+as part of the library, from plain malloc() when built inside dftables. */
+#ifdef DFTABLES
+tables = (unsigned char*)malloc(tables_length);
+#else
+tables = (unsigned char*)(pcre_malloc)(tables_length);
+#endif
+
+if (tables == NULL) return NULL;
+
+/* The four tables are laid out back to back within the block. */
+lower = tables + lcc_offset;
+flip = tables + fcc_offset;
+classbits = tables + cbits_offset;
+types = tables + ctypes_offset;
+
+/* Lower casing table, followed by the case-flipping table. */
+for (c = 0; c < 256; c++) lower[c] = tolower(c);
+for (c = 0; c < 256; c++) flip[c] = islower(c)? toupper(c) : tolower(c);
+
+/* Character class bitmaps, one 32-byte map per class. Every class is tested
+separately, since classes that are exclusive in the C locale need not be so
+elsewhere. The "space" map takes everything isspace() reports (VT included in
+the default locale), as POSIX [:space:] requires. The "word" map tests
+isalnum() directly, because a character can be alnum without being upper or
+lower, e.g. the "male and female ordinals" (\xAA and \xBA) in fr_FR. */
+memset(classbits, 0, cbit_length);
+for (c = 0; c < 256; c++)
+  {
+  unsigned char *slot = classbits + c/8;   /* byte that holds this character */
+  const int bit = 1 << (c&7);              /* its bit within that byte */
+
+  if (isspace(c))  slot[cbit_space] |= bit;
+  if (isxdigit(c)) slot[cbit_xdigit] |= bit;
+  if (isdigit(c))  slot[cbit_digit] |= bit;
+  if (isupper(c))  slot[cbit_upper] |= bit;
+  if (islower(c))  slot[cbit_lower] |= bit;
+  if (isalnum(c) || c == '_') slot[cbit_word] |= bit;
+  if (isgraph(c))  slot[cbit_graph] |= bit;
+  if (isprint(c))  slot[cbit_print] |= bit;
+  if (ispunct(c))  slot[cbit_punct] |= bit;
+  if (iscntrl(c))  slot[cbit_cntrl] |= bit;
+  }
+
+/* Character type table. Unlike the "space" bitmap above, ctype_space leaves
+out VT, because Perl does not treat it as white space for \s or for comments
+within regexes. */
+for (c = 0; c < 256; c++)
+  {
+  int flags = 0;
+  if (c != 0x0b && isspace(c)) flags |= ctype_space;
+  if (isalpha(c)) flags |= ctype_letter;
+  if (isdigit(c)) flags |= ctype_digit;
+  if (isxdigit(c)) flags |= ctype_xdigit;
+  if (isalnum(c) || c == '_') flags |= ctype_word;
+
+  /* strchr() also matches the terminating zero of its string, so binary zero
+  gets flagged as a meta-character as well. That is wanted: "meta" here means
+  any character that terminates a run of data characters. */
+
+  if (strchr("\\*+?{^.$|()[", c) != NULL) flags |= ctype_meta;
+  types[c] = flags;
+  }
+
+return tables;
+}
+
+/* End of pcre_maketables.c */
diff --git a/src/pcre_newline.c b/src/pcre_newline.c
new file mode 100644
index 0000000..5888576
--- /dev/null
+++ b/src/pcre_newline.c
@@ -0,0 +1,164 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2008 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This module contains internal functions for testing newlines when more than
+one kind of newline is to be recognized. When a newline is found, its length is
+returned. In principle, we could implement several newline "types", each
+referring to a different set of newline characters. At present, PCRE supports
+only NLTYPE_FIXED, which gets handled without these functions, NLTYPE_ANYCRLF,
+and NLTYPE_ANY. The full list of Unicode newline characters is taken from
+http://unicode.org/unicode/reports/tr18/. */
+
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pcre_internal.h"
+
+
+
+/*************************************************
+* Check for newline at given position *
+*************************************************/
+
+/* It is guaranteed that the initial value of ptr is less than the end of the
+string that is being processed.
+
+Arguments:
+ ptr pointer to possible newline
+ type the newline type
+ endptr pointer to the end of the string
+ lenptr where to return the length
+ utf8 TRUE if in utf8 mode
+
+Returns: TRUE or FALSE
+*/
+
+BOOL
+_pcre_is_newline(const uschar *ptr, int type, const uschar *endptr,
+  int *lenptr, BOOL utf8)
+{
+int ch;
+if (utf8) { GETCHAR(ch, ptr); } else ch = *ptr;
+
+/* LF and CR are handled identically for both newline types, and under
+NLTYPE_ANYCRLF nothing else is a newline: weed out the rest up front so that
+one switch can serve both types. */
+
+if (type == NLTYPE_ANYCRLF && ch != 0x000a && ch != 0x000d) return FALSE;
+
+switch(ch)
+  {
+  case 0x000a:   /* LF */
+  case 0x000b:   /* VT */
+  case 0x000c:   /* FF */
+  *lenptr = 1; return TRUE;
+  case 0x000d:   /* CR: length 2 when an LF follows within the string */
+  *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1;
+  return TRUE;
+  case 0x0085:   /* NEL */
+  *lenptr = utf8? 2 : 1; return TRUE;
+  case 0x2028:   /* LS */
+  case 0x2029:   /* PS */
+  *lenptr = 3; return TRUE;
+  default: return FALSE;
+  }
+}
+
+
+
+/*************************************************
+* Check for newline at previous position *
+*************************************************/
+
+/* It is guaranteed that the initial value of ptr is greater than the start of
+the string that is being processed.
+
+Arguments:
+ ptr pointer to possible newline
+ type the newline type
+ startptr pointer to the start of the string
+ lenptr where to return the length
+ utf8 TRUE if in utf8 mode
+
+Returns: TRUE or FALSE
+*/
+
+BOOL
+_pcre_was_newline(const uschar *ptr, int type, const uschar *startptr,
+  int *lenptr, BOOL utf8)
+{
+int ch;
+ptr--;
+#ifdef SUPPORT_UTF8
+if (utf8)
+  {
+  BACKCHAR(ptr);
+  GETCHAR(ch, ptr);
+  }
+else ch = *ptr;
+#else   /* no UTF-8 support */
+ch = *ptr;
+#endif  /* SUPPORT_UTF8 */
+
+/* Only LF and CR count under NLTYPE_ANYCRLF, and both are handled exactly as
+for NLTYPE_ANY, so filter out everything else here and share one switch. */
+
+if (type == NLTYPE_ANYCRLF && ch != 0x000a && ch != 0x000d) return FALSE;
+switch(ch)
+  {
+  case 0x000a:   /* LF: length 2 when a CR precedes it within the string */
+  *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1;
+  return TRUE;
+  case 0x000b:   /* VT */
+  case 0x000c:   /* FF */
+  case 0x000d:   /* CR */
+  *lenptr = 1; return TRUE;
+  case 0x0085:   /* NEL */
+  *lenptr = utf8? 2 : 1; return TRUE;
+  case 0x2028:   /* LS */
+  case 0x2029:   /* PS */
+  *lenptr = 3; return TRUE;
+  default: return FALSE;
+  }
+}
+
+/* End of pcre_newline.c */
diff --git a/src/pcre_ord2utf8.c b/src/pcre_ord2utf8.c
new file mode 100644
index 0000000..0fdc512
--- /dev/null
+++ b/src/pcre_ord2utf8.c
@@ -0,0 +1,85 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2008 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This file contains a private PCRE function that converts an ordinal
+character value into a UTF8 string. */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pcre_internal.h"
+
+
+/*************************************************
+* Convert character value to UTF-8 *
+*************************************************/
+
+/* This function takes an integer value in the range 0 - 0x7fffffff
+and encodes it as a UTF-8 character in 0 to 6 bytes.
+
+Arguments:
+ cvalue the character value
+ buffer pointer to buffer for result - at least 6 bytes long
+
+Returns: number of bytes placed in the buffer
+*/
+
+int
+_pcre_ord2utf8(int cvalue, uschar *buffer)
+{
+#ifdef SUPPORT_UTF8
+register int extra = 0;   /* number of bytes that follow the leading byte */
+register int pos;
+/* Find the first table entry that is large enough for the value. */
+while (extra < _pcre_utf8_table1_size && cvalue > _pcre_utf8_table1[extra])
+  extra++;
+/* Fill the trailing bytes from the last one backwards, six bits at a time. */
+for (pos = extra; pos > 0; pos--, cvalue >>= 6)
+  buffer[pos] = 0x80 | (cvalue & 0x3f);
+/* What is left of the value is merged into the leading byte. */
+buffer[0] = _pcre_utf8_table2[extra] | cvalue;
+return extra + 1;
+#else
+return 0;   /* Keep compiler happy; this function won't ever be */
+#endif      /* called when SUPPORT_UTF8 is not defined. */
+}
+
+/* End of pcre_ord2utf8.c */
diff --git a/src/pcre_refcount.c b/src/pcre_refcount.c
new file mode 100644
index 0000000..eeb2897
--- /dev/null
+++ b/src/pcre_refcount.c
@@ -0,0 +1,82 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2008 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This module contains the external function pcre_refcount(), which is an
+auxiliary function that can be used to maintain a reference count in a compiled
+pattern data block. This might be helpful in applications where the block is
+shared by different users. */
+
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pcre_internal.h"
+
+
+/*************************************************
+* Maintain reference count *
+*************************************************/
+
+/* The reference count is a 16-bit field, initialized to zero. It is not
+possible to transfer a non-zero count from one host to a different host that
+has a different byte order - though I can't see why anyone in their right mind
+would ever want to do that!
+
+Arguments:
+ argument_re points to compiled code
+ adjust value to add to the count
+
+Returns: the (possibly updated) count value (a non-negative number), or
+ a negative error number
+*/
+
+PCRE_EXP_DEFN int
+pcre_refcount(pcre *argument_re, int adjust)
+{
+real_pcre *re = (real_pcre *)argument_re;
+if (re == NULL) return PCRE_ERROR_NULL;
+/* Compare adjust with the headroom; never negate or add first (no overflow) */
+re->ref_count = (adjust < -(int)re->ref_count)? 0 :
+  (adjust > 65535 - (int)re->ref_count)? 65535 : re->ref_count + adjust;
+return re->ref_count;
+}
+
+/* End of pcre_refcount.c */
diff --git a/src/pcre_scanner.cc b/src/pcre_scanner.cc
new file mode 100644
index 0000000..a817a68
--- /dev/null
+++ b/src/pcre_scanner.cc
@@ -0,0 +1,199 @@
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: Sanjay Ghemawat
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <vector>
+#include <assert.h>
+
+#include "pcrecpp_internal.h"
+#include "pcre_scanner.h"
+
+using std::vector;
+
+namespace pcrecpp {
+
+Scanner::Scanner()
+ : data_(), // no text to scan
+ input_(data_), // initialized from the (empty) data_
+ skip_(NULL), // no skip pattern yet
+ should_skip_(false),
+ skip_repeat_(false),
+ save_comments_(false),
+ comments_(NULL), // created on demand in ConsumeSkip()
+ comments_offset_(0) {
+}
+
+Scanner::Scanner(const string& in)
+ : data_(in), // the text to scan, taken from the caller's string
+ input_(data_), // initialized from data_, not from the caller's string
+ skip_(NULL), // no skip pattern yet
+ should_skip_(false),
+ skip_repeat_(false),
+ save_comments_(false),
+ comments_(NULL), // created on demand in ConsumeSkip()
+ comments_offset_(0) {
+}
+
+Scanner::~Scanner() {
+ delete skip_; // the pattern allocated by SetSkipExpression() or Skip(), if any
+ delete comments_; // the vector allocated by ConsumeSkip(), if any
+}
+
+void Scanner::SetSkipExpression(const char* re) {
+  delete skip_;  // the previously installed pattern, if any, is discarded
+  if (re == NULL) {
+    // No pattern: skipping is switched off altogether.
+    skip_ = NULL;
+    should_skip_ = skip_repeat_ = false;
+    return;
+  }
+  // Install the pattern for repeated skipping and apply it straight away.
+  skip_ = new RE(re);
+  should_skip_ = skip_repeat_ = true;
+  ConsumeSkip();
+}
+
+void Scanner::Skip(const char* re) {
+  delete skip_;  // the previously installed pattern, if any, is discarded
+  if (re == NULL) {
+    skip_ = NULL;
+    should_skip_ = skip_repeat_ = false;
+    return;
+  }
+  // Install the pattern for a single (non-repeating) skip and apply it now.
+  skip_ = new RE(re);
+  should_skip_ = true;
+  skip_repeat_ = false;
+  ConsumeSkip();
+}
+
+void Scanner::DisableSkip() {
+ assert(skip_ != NULL); // a skip pattern must have been set beforehand
+ should_skip_ = false; // Consume() then stops calling ConsumeSkip()
+}
+
+void Scanner::EnableSkip() {
+ assert(skip_ != NULL); // a skip pattern must have been set beforehand
+ should_skip_ = true; // Consume() calls ConsumeSkip() again after a match
+ ConsumeSkip(); // and skip right away at the current position
+}
+
+int Scanner::LineNumber() const {
+  // One-based line of the current position: 1 plus the number of '\n'
+  // characters in the part of data_ that lies before input_.
+  // TODO: Remember where the previous count stopped and only scan the text
+  // after it. (std::count would do the scan, but not all systems have it.)
+  const char* const stop = input_.data();
+  int line = 1;
+  for (const char* cur = data_.data(); cur < stop; ++cur) line += (*cur == '\n');
+  return line;
+}
+
+int Scanner::Offset() const {
+ return input_.data() - data_.c_str(); // distance of input_ from the start of data_
+}
+
+bool Scanner::LookingAt(const RE& re) const {
+ int consumed; // passed to DoMatch() below; not read afterwards
+ return re.DoMatch(input_, RE::ANCHOR_START, &consumed, 0, 0); // input_ is passed by value/reference, not by pointer as in Consume()
+}
+
+
+bool Scanner::Consume(const RE& re, const Arg& arg0, const Arg& arg1,
+                      const Arg& arg2) {
+  // Nothing more to do when re fails to consume from input_.
+  if (!re.Consume(&input_, arg0, arg1, arg2)) return false;
+  // It succeeded: also consume skippable text that follows, if skipping is on.
+  if (should_skip_) ConsumeSkip();
+  return true;
+}
+
+// Helper: consume text matched by *skip_ from input_ (once, or repeatedly when
+// skip_repeat_ is set) and, if save_comments_ is set, record what was skipped.
+void Scanner::ConsumeSkip() {
+  const char* start_data = input_.data();
+  while (skip_->Consume(&input_)) {
+    if (!skip_repeat_) break;  // only one skip allowed
+  }
+  if (save_comments_) {
+    if (comments_ == NULL) {
+      // Created lazily; the element type is StringPiece, as pushed below.
+      comments_ = new vector<StringPiece>;
+    }
+    // input_ already points one past the skipped text, so no need to +1
+    int length = input_.data() - start_data;
+    if (length > 0) {
+      // Everything skipped by this call is stored as one piece.
+      comments_->push_back(StringPiece(start_data, length));
+    }
+  }
+}
+
+
+// Appends to *ranges every saved comment that lies entirely within the byte
+// offsets [start, end) of data_. Nothing is appended if no comments were saved.
+void Scanner::GetComments(int start, int end, vector<StringPiece> *ranges) {
+  // short circuit out if we've not yet initialized comments_
+  // (e.g., when save_comments_ is false)
+  if (!comments_) return;
+  // TODO: if we guarantee that comments_ will contain StringPieces
+  // that are ordered by their start, then we can do a binary search
+  // for the first StringPiece at or past start and then scan for the
+  // ones contained in the range, quit early (use equal_range or
+  // lower_bound)
+  for (vector<StringPiece>::const_iterator it = comments_->begin();
+       it != comments_->end(); ++it) {
+    if ((it->data() >= data_.c_str() + start &&
+         it->data() + it->size() <= data_.c_str() + end)) {
+      ranges->push_back(*it);
+    }
+  }
+}
+
+
+// Appends to *ranges the saved comments not yet handed out by an earlier call,
+// and advances comments_offset_ past them.
+void Scanner::GetNextComments(vector<StringPiece> *ranges) {
+  // short circuit out if we've not yet initialized comments_
+  // (e.g., when save_comments_ is false)
+  if (!comments_) return;
+  for (vector<StringPiece>::const_iterator it =
+           comments_->begin() + comments_offset_;
+       it != comments_->end(); ++it) {
+    ranges->push_back(*it);
+    ++comments_offset_;
+  }
+}
+
+} // namespace pcrecpp
diff --git a/src/pcre_scanner.h b/src/pcre_scanner.h
new file mode 100644
index 0000000..c3a96e0
--- /dev/null
+++ b/src/pcre_scanner.h
@@ -0,0 +1,172 @@
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: Sanjay Ghemawat
+//
+// Regular-expression based scanner for parsing an input stream.
+//
+// Example 1: parse a sequence of "var = number" entries from input:
+//
+// Scanner scanner(input);
+// string var;
+// int number;
+// scanner.SetSkipExpression("\\s+"); // Skip any white space we encounter
+// while (scanner.Consume("(\\w+) = (\\d+)", &var, &number)) {
+// ...;
+// }
+
+#ifndef _PCRE_SCANNER_H
+#define _PCRE_SCANNER_H
+
+#include
+#include
+#include
+
+#include
+#include
+
+namespace pcrecpp {
+
+class PCRECPP_EXP_DEFN Scanner {
+ public:
+  Scanner();
+  explicit Scanner(const std::string& input);
+  ~Scanner();
+
+  // Return current line number. The returned line-number is
+  // one-based. I.e. it returns 1 + the number of consumed newlines.
+  //
+  // Note: this method may be slow. It may take time proportional to
+  // the size of the input.
+  int LineNumber() const;
+
+  // Return the byte offset in the input data at which the scanner
+  // is currently looking.
+  int Offset() const;
+
+  // Return true iff the start of the remaining input matches "re"
+  bool LookingAt(const RE& re) const;
+
+  // Return true iff all of the following are true
+  //    a. the start of the remaining input matches "re",
+  //    b. if any arguments are supplied, matched sub-patterns can be
+  //       parsed and stored into the arguments.
+  // If it returns true, it skips over the matched input and any
+  // following input that matches the "skip" regular expression.
+  bool Consume(const RE& re,
+               const Arg& arg0 = RE::no_arg,
+               const Arg& arg1 = RE::no_arg,
+               const Arg& arg2 = RE::no_arg
+               // TODO: Allow more arguments?
+               );
+
+  // Set the "skip" regular expression. If after consuming some data,
+  // a prefix of the input matches this RE, it is automatically
+  // skipped. For example, a programming language scanner would use
+  // a skip RE that matches white space and comments.
+  //
+  //    scanner.SetSkipExpression("\\s+|//.*|/[*](.|\n)*?[*]/");
+  //
+  // Skipping repeats as long as it succeeds. We used to let people do
+  // this by writing "(...)*" in the regular expression, but that added
+  // up to lots of recursive calls within the pcre library, so now we
+  // control repetition explicitly via the function call API.
+  //
+  // You can pass NULL for "re" if you do not want any data to be skipped.
+  void Skip(const char* re);   // DEPRECATED; does *not* repeat
+  void SetSkipExpression(const char* re);
+
+  // Temporarily pause "skip"ing. This
+  //   Skip("Foo"); code ; DisableSkip(); code; EnableSkip()
+  // is similar to
+  //   Skip("Foo"); code ; Skip(NULL); code ; Skip("Foo");
+  // but avoids creating/deleting new RE objects.
+  void DisableSkip();
+
+  // Reenable previously paused skipping. Any prefix of the input
+  // that matches the skip pattern is immediately dropped.
+  void EnableSkip();
+
+  /***** Special wrappers around SetSkipExpression() for common idioms *****/
+
+  // Arranges to skip whitespace, C comments, C++ comments.
+  // The overall RE is a disjunction of the following REs:
+  //    \\s                  whitespace
+  //    //.*\n               C++ comment
+  //    /[*](?:\n|.)*?[*]/   C comment (x*? means minimal repetitions of x)
+  // We get repetition via the semantics of SetSkipExpression, not by using *
+  void SkipCXXComments() {
+    SetSkipExpression("\\s|//.*\n|/[*](?:\n|.)*?[*]/");
+  }
+
+  void set_save_comments(bool comments) {
+    save_comments_ = comments;
+  }
+
+  bool save_comments() const {
+    return save_comments_;
+  }
+
+  // Append to vector ranges the comments found in the byte range
+  // [start,end) of the input data (start inclusive, end exclusive).
+  // Only comments that were extracted entirely within that
+  // range are returned: no range splitting of atomically-extracted
+  // comments is performed.
+  void GetComments(int start, int end, std::vector<StringPiece> *ranges);
+
+  // Append to vector ranges the comments added
+  // since the last time this was called. This
+  // functionality is provided for efficiency when
+  // interleaving scanning with parsing.
+  void GetNextComments(std::vector<StringPiece> *ranges);
+
+ private:
+  std::string data_;          // All the input data
+  StringPiece input_;         // Unprocessed input
+  RE* skip_;                  // If non-NULL, RE for skipping input
+  bool should_skip_;          // If true, use skip_
+  bool skip_repeat_;          // If true, repeat skip_ as long as it works
+  bool save_comments_;        // If true, record the text skipped by skip_
+
+  // the skipped comments (allocated lazily, so this may be NULL)
+  // TODO: later consider requiring that the StringPieces be added
+  // in order by their start position
+  std::vector<StringPiece> *comments_;
+
+  // the offset into comments_ that has been returned by GetNextComments
+  int comments_offset_;
+
+  // helper function to consume *skip_ and honour
+  // save_comments_
+  void ConsumeSkip();
+};
+
+} // namespace pcrecpp
+
+#endif /* _PCRE_SCANNER_H */
diff --git a/src/pcre_scanner_unittest.cc b/src/pcre_scanner_unittest.cc
new file mode 100644
index 0000000..284c8ea
--- /dev/null
+++ b/src/pcre_scanner_unittest.cc
@@ -0,0 +1,158 @@
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: Greg J. Badros
+//
+// Unittest for scanner, especially GetNextComments and GetComments()
+// functionality.
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include
+#include
+#include
+
+#include "pcrecpp.h"
+#include "pcre_stringpiece.h"
+#include "pcre_scanner.h"
+
+#define FLAGS_unittest_stack_size 49152
+
+// If (a) != (b): print file:line and both expression texts to stderr, then exit(1).
+#define CHECK_EQ(a, b) do { \
+ if ( (a) != (b) ) { \
+ fprintf(stderr, "%s:%d: Check failed because %s != %s\n", \
+ __FILE__, __LINE__, #a, #b); \
+ exit(1); \
+ } \
+} while (0)
+
+using std::vector;
+using pcrecpp::StringPiece;
+using pcrecpp::Scanner;
+
+static void TestScanner() {
+  const char input[] = "\n"
+                       "alpha = 1; // this sets alpha\n"
+                       "bravo = 2; // bravo is set here\n"
+                       "gamma = 33; /* and here is gamma */\n";
+
+  const char *re = "(\\w+) = (\\d+);";
+
+  Scanner s(input);
+  string var;
+  int number = 0;  // defined value even if a Consume() below fails
+  s.SkipCXXComments();
+  s.set_save_comments(true);
+  vector<StringPiece> comments;
+
+  s.Consume(re, &var, &number);
+  CHECK_EQ(var, "alpha");
+  CHECK_EQ(number, 1);
+  CHECK_EQ(s.LineNumber(), 3);
+  s.GetNextComments(&comments);
+  CHECK_EQ(comments.size(), 1);
+  CHECK_EQ(comments[0].as_string(), " // this sets alpha\n");
+  comments.resize(0);
+
+  s.Consume(re, &var, &number);
+  CHECK_EQ(var, "bravo");
+  CHECK_EQ(number, 2);
+  s.GetNextComments(&comments);
+  CHECK_EQ(comments.size(), 1);
+  CHECK_EQ(comments[0].as_string(), " // bravo is set here\n");
+  comments.resize(0);
+
+  s.Consume(re, &var, &number);
+  CHECK_EQ(var, "gamma");
+  CHECK_EQ(number, 33);
+  s.GetNextComments(&comments);
+  CHECK_EQ(comments.size(), 1);
+  CHECK_EQ(comments[0].as_string(), " /* and here is gamma */\n");
+  comments.resize(0);
+
+  s.GetComments(0, sizeof(input), &comments);
+  CHECK_EQ(comments.size(), 3);
+  CHECK_EQ(comments[0].as_string(), " // this sets alpha\n");
+  CHECK_EQ(comments[1].as_string(), " // bravo is set here\n");
+  CHECK_EQ(comments[2].as_string(), " /* and here is gamma */\n");
+  comments.resize(0);
+
+  s.GetComments(0, strchr(input, '/') - input, &comments);
+  CHECK_EQ(comments.size(), 0);
+  comments.resize(0);
+
+  s.GetComments(strchr(input, '/') - input - 1, sizeof(input),
+                &comments);
+  CHECK_EQ(comments.size(), 3);
+  CHECK_EQ(comments[0].as_string(), " // this sets alpha\n");
+  CHECK_EQ(comments[1].as_string(), " // bravo is set here\n");
+  CHECK_EQ(comments[2].as_string(), " /* and here is gamma */\n");
+  comments.resize(0);
+
+  s.GetComments(strchr(input, '/') - input - 1,
+                strchr(input + 1, '\n') - input + 1, &comments);
+  CHECK_EQ(comments.size(), 1);
+  CHECK_EQ(comments[0].as_string(), " // this sets alpha\n");
+  comments.resize(0);
+}
+
+static void TestBigComment() {
+  // Build 1024 comment lines followed by a single assignment.
+  string input;
+  char line[1024];  // definitely big enough
+  for (int n = 0; n < 1024; ++n) {
+    sprintf(line, " # Comment %d\n", n);
+    input.append(line);
+  }
+  input.append("name = value;\n");
+
+  Scanner scanner(input.c_str());
+  scanner.SetSkipExpression("\\s+|#.*\n");
+
+  string name, value;
+  scanner.Consume("(\\w+) = (\\w+);", &name, &value);
+  CHECK_EQ(name, "name");
+  CHECK_EQ(value, "value");
+}
+
+// TODO: also test scanner and big-comment in a thread with a
+// small stack size
+
+int main(int argc, char** argv) {
+ TestScanner();
+ TestBigComment();
+
+ // Reached only when every CHECK_EQ passed (a failed check calls exit(1)).
+ printf("OK\n");
+
+ return 0;
+}
diff --git a/src/pcre_stringpiece.cc b/src/pcre_stringpiece.cc
new file mode 100644
index 0000000..67c0f1f
--- /dev/null
+++ b/src/pcre_stringpiece.cc
@@ -0,0 +1,43 @@
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wilsonh@google.com (Wilson Hsieh)
+//
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include
+#include "pcrecpp_internal.h"
+#include "pcre_stringpiece.h"
+
+std::ostream& operator<<(std::ostream& o, const pcrecpp::StringPiece& piece) {
+ return (o << piece.as_string()); // writes piece.as_string(); result returned for chaining
+}
diff --git a/src/pcre_stringpiece_unittest.cc b/src/pcre_stringpiece_unittest.cc
new file mode 100644
index 0000000..1e821ab
--- /dev/null
+++ b/src/pcre_stringpiece_unittest.cc
@@ -0,0 +1,151 @@
+// Copyright 2003 and onwards Google Inc.
+// Author: Sanjay Ghemawat
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include
+#include