src/corelib/tools/qregexp.cpp

   1 /****************************************************************************
   2 **
   3 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
   4 ** Contact: http://www.qt-project.org/
   5 **
   6 ** This file is part of the QtCore module of the Qt Toolkit.
   7 **
   8 ** $QT_BEGIN_LICENSE:LGPL$
   9 ** GNU Lesser General Public License Usage
  10 ** This file may be used under the terms of the GNU Lesser General Public
  11 ** License version 2.1 as published by the Free Software Foundation and
  12 ** appearing in the file LICENSE.LGPL included in the packaging of this
  13 ** file. Please review the following information to ensure the GNU Lesser
  14 ** General Public License version 2.1 requirements will be met:
  15 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
  16 **
  17 ** In addition, as a special exception, Nokia gives you certain additional
  18 ** rights. These rights are described in the Nokia Qt LGPL Exception
  19 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
  20 **
  21 ** GNU General Public License Usage
  22 ** Alternatively, this file may be used under the terms of the GNU General
  23 ** Public License version 3.0 as published by the Free Software Foundation
  24 ** and appearing in the file LICENSE.GPL included in the packaging of this
  25 ** file. Please review the following information to ensure the GNU General
  26 ** Public License version 3.0 requirements will be met:
  27 ** http://www.gnu.org/copyleft/gpl.html.
  28 **
  29 ** Other Usage
  30 ** Alternatively, this file may be used in accordance with the terms and
  31 ** conditions contained in a signed written agreement between you and Nokia.
  32 **
  33 **
  34 **
  35 **
  36 **
  37 **
  38 ** $QT_END_LICENSE$
  39 **
  40 ****************************************************************************/
  41
  42 #include "qregexp.h"
  43
  44 #include "qalgorithms.h"
  45 #include "qbitarray.h"
  46 #include "qcache.h"
  47 #include "qdatastream.h"
  48 #include "qdebug.h"
  49 #include "qlist.h"
  50 #include "qmap.h"
  51 #include "qmutex.h"
  52 #include "qstring.h"
  53 #include "qstringlist.h"
  54 #include "qstringmatcher.h"
  55 #include "qvector.h"
  56 #include "private/qfunctions_p.h"
  57
  58 #include <limits.h>
  59
  60 QT_BEGIN_NAMESPACE
  61
  62 int qFindString(const QChar *haystack, int haystackLen, int from,
  63     const QChar *needle, int needleLen, Qt::CaseSensitivity cs);
  64
  65 // error strings for the regexp parser
  66 #define RXERR_OK         QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
  67 #define RXERR_DISABLED   QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
  68 #define RXERR_CHARCLASS  QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
  69 #define RXERR_LOOKAHEAD  QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
  70 #define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371")
  71 #define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
  72 #define RXERR_OCTAL      QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
  73 #define RXERR_LEFTDELIM  QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
  74 #define RXERR_END        QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
  75 #define RXERR_LIMIT      QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
  76 #define RXERR_INTERVAL   QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
  77 #define RXERR_CATEGORY   QT_TRANSLATE_NOOP("QRegExp", "invalid category")
  78
  79 /*!
  80     \class QRegExp
  81     \reentrant
  82     \brief The QRegExp class provides pattern matching using regular expressions.
  83
  84     \ingroup tools
  85     \ingroup shared
  86
  87     \keyword regular expression
  88
  89     A regular expression, or "regexp", is a pattern for matching
  90     substrings in a text. This is useful in many contexts, e.g.,
  91
  92     \table
  93     \row \li Validation
  94          \li A regexp can test whether a substring meets some criteria,
  95          e.g. is an integer or contains no whitespace.
  96     \row \li Searching
  97          \li A regexp provides more powerful pattern matching than
  98          simple substring matching, e.g., match one of the words
  99          \e{mail}, \e{letter} or \e{correspondence}, but none of the
 100          words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
 101      \row \li Search and Replace
 102          \li A regexp can replace all occurrences of a substring with a
 103          different substring, e.g., replace all occurrences of \e{&}
 104          with \e{\&amp;} except where the \e{&} is already followed by
 105          an \e{amp;}.
 106     \row \li String Splitting
 107          \li A regexp can be used to identify where a string should be
 108          split apart, e.g. splitting tab-delimited strings.
 109     \endtable
 110
 111     A brief introduction to regexps is presented, a description of
 112     Qt's regexp language, some examples, and the function
 113     documentation itself. QRegExp is modeled on Perl's regexp
 114     language. It fully supports Unicode. QRegExp can also be used in a
 115     simpler, \e{wildcard mode} that is similar to the functionality
 116     found in command shells. The syntax rules used by QRegExp can be
 117     changed with setPatternSyntax(). In particular, the pattern syntax
 118     can be set to QRegExp::FixedString, which means the pattern to be
 119     matched is interpreted as a plain string, i.e., special characters
 120     (e.g., backslash) are not escaped.
 121
 122     A good text on regexps is \e {Mastering Regular Expressions}
 123     (Third Edition) by Jeffrey E. F.  Friedl, ISBN 0-596-52812-4.
 124
 125     \tableofcontents
 126
 127     \section1 Introduction
 128
 129     Regexps are built up from expressions, quantifiers, and
 130     assertions. The simplest expression is a character, e.g. \b{x}
 131     or \b{5}. An expression can also be a set of characters
 132     enclosed in square brackets. \b{[ABCD]} will match an \b{A}
 133     or a \b{B} or a \b{C} or a \b{D}. We can write this same
 134     expression as \b{[A-D]}, and an experession to match any
 135     captital letter in the English alphabet is written as
 136     \b{[A-Z]}.
 137
 138     A quantifier specifies the number of occurrences of an expression
 139     that must be matched. \b{x{1,1}} means match one and only one
 140     \b{x}. \b{x{1,5}} means match a sequence of \b{x}
 141     characters that contains at least one \b{x} but no more than
 142     five.
 143
 144     Note that in general regexps cannot be used to check for balanced
 145     brackets or tags. For example, a regexp can be written to match an
 146     opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
 147     are not nested, but if the \c{<b>} tags are nested, that same
 148     regexp will match an opening \c{<b>} tag with the wrong closing
 149     \c{</b>}.  For the fragment \c{<b>bold <b>bolder</b></b>}, the
 150     first \c{<b>} would be matched with the first \c{</b>}, which is
 151     not correct. However, it is possible to write a regexp that will
 152     match nested brackets or tags correctly, but only if the number of
 153     nesting levels is fixed and known. If the number of nesting levels
 154     is not fixed and known, it is impossible to write a regexp that
 155     will not fail.
 156
 157     Suppose we want a regexp to match integers in the range 0 to 99.
 158     At least one digit is required, so we start with the expression
 159     \b{[0-9]{1,1}}, which matches a single digit exactly once. This
 160     regexp matches integers in the range 0 to 9. To match integers up
 161     to 99, increase the maximum number of occurrences to 2, so the
 162     regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the
 163     original requirement to match integers from 0 to 99, but it will
 164     also match integers that occur in the middle of strings. If we
 165     want the matched integer to be the whole string, we must use the
 166     anchor assertions, \b{^} (caret) and \b{$} (dollar). When
 167     \b{^} is the first character in a regexp, it means the regexp
 168     must match from the beginning of the string. When \b{$} is the
 169     last character of the regexp, it means the regexp must match to
 170     the end of the string. The regexp becomes \b{^[0-9]{1,2}$}.
 171     Note that assertions, e.g. \b{^} and \b{$}, do not match
 172     characters but locations in the string.
 173
 174     If you have seen regexps described elsewhere, they may have looked
 175     different from the ones shown here. This is because some sets of
 176     characters and some quantifiers are so common that they have been
 177     given special symbols to represent them. \b{[0-9]} can be
 178     replaced with the symbol \b{\\d}. The quantifier to match
 179     exactly one occurrence, \b{{1,1}}, can be replaced with the
 180     expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So
 181     our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can
 182     also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of
 183     the string, match a digit, followed immediately by 0 or 1 digits}.
 184     In practice, it would be written as \b{^\\d\\d?$}. The \b{?}
 185     is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1
 186     occurrences. \b{?} makes an expression optional. The regexp
 187     \b{^\\d\\d?$} means \e{From the beginning of the string, match
 188     one digit, followed immediately by 0 or 1 more digit, followed
 189     immediately by end of string}.
 190
 191     To write a regexp that matches one of the words 'mail' \e or
 192     'letter' \e or 'correspondence' but does not match words that
 193     contain these words, e.g., 'email', 'mailman', 'mailer', and
 194     'letterbox', start with a regexp that matches 'mail'. Expressed
 195     fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because
 196     a character expression is automatically quantified by
 197     \b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an
 198     'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now
 199     we can use the vertical bar \b{|}, which means \b{or}, to
 200     include the other two words, so our regexp for matching any of the
 201     three words becomes \b{mail|letter|correspondence}. Match
 202     'mail' \b{or} 'letter' \b{or} 'correspondence'. While this
 203     regexp will match one of the three words we want to match, it will
 204     also match words we don't want to match, e.g., 'email'.  To
 205     prevent the regexp from matching unwanted words, we must tell it
 206     to begin and end the match at word boundaries. First we enclose
 207     our regexp in parentheses, \b{(mail|letter|correspondence)}.
 208     Parentheses group expressions together, and they identify a part
 209     of the regexp that we wish to \l{capturing text}{capture}.
 210     Enclosing the expression in parentheses allows us to use it as a
 211     component in more complex regexps. It also allows us to examine
 212     which of the three words was actually matched. To force the match
 213     to begin and end on word boundaries, we enclose the regexp in
 214     \b{\\b} \e{word boundary} assertions:
 215     \b{\\b(mail|letter|correspondence)\\b}.  Now the regexp means:
 216     \e{Match a word boundary, followed by the regexp in parentheses,
 217     followed by a word boundary}. The \b{\\b} assertion matches a
 218     \e position in the regexp, not a \e character. A word boundary is
 219     any non-word character, e.g., a space, newline, or the beginning
 220     or ending of a string.
 221
 222     If we want to replace ampersand characters with the HTML entity
 223     \b{\&amp;}, the regexp to match is simply \b{\&}. But this
 224     regexp will also match ampersands that have already been converted
 225     to HTML entities. We want to replace only ampersands that are not
 226     already followed by \b{amp;}. For this, we need the negative
 227     lookahead assertion, \b{(?!}__\b{)}. The regexp can then be
 228     written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is}
 229     \b{not} \e{followed by} \b{amp;}.
 230
 231     If we want to count all the occurrences of 'Eric' and 'Eirik' in a
 232     string, two valid solutions are \b{\\b(Eric|Eirik)\\b} and
 233     \b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is
 234     required to avoid matching words that contain either name,
 235     e.g. 'Ericsson'. Note that the second regexp matches more
 236     spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.
 237
 238     Some of the examples discussed above are implemented in the
 239     \link #code-examples code examples \endlink section.
 240
 241     \target characters-and-abbreviations-for-sets-of-characters
 242     \section1 Characters and Abbreviations for Sets of Characters
 243
 244     \table
 245     \header \li Element \li Meaning
 246     \row \li \b{c}
 247          \li A character represents itself unless it has a special
 248          regexp meaning. e.g. \b{c} matches the character \e c.
 249     \row \li \b{\\c}
 250          \li A character that follows a backslash matches the character
 251          itself, except as specified below. e.g., To match a literal
 252          caret at the beginning of a string, write \b{\\^}.
 253     \row \li \b{\\a}
 254          \li Matches the ASCII bell (BEL, 0x07).
 255     \row \li \b{\\f}
 256          \li Matches the ASCII form feed (FF, 0x0C).
 257     \row \li \b{\\n}
 258          \li Matches the ASCII line feed (LF, 0x0A, Unix newline).
 259     \row \li \b{\\r}
 260          \li Matches the ASCII carriage return (CR, 0x0D).
 261     \row \li \b{\\t}
 262          \li Matches the ASCII horizontal tab (HT, 0x09).
 263     \row \li \b{\\v}
 264          \li Matches the ASCII vertical tab (VT, 0x0B).
 265     \row \li \b{\\x\e{hhhh}}
 266          \li Matches the Unicode character corresponding to the
 267          hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).
 268     \row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo})
 269          \li matches the ASCII/Latin1 character for the octal number
 270          \e{ooo} (between 0 and 0377).
 271     \row \li \b{. (dot)}
 272          \li Matches any character (including newline).
 273     \row \li \b{\\d}
 274          \li Matches a digit (QChar::isDigit()).
 275     \row \li \b{\\D}
 276          \li Matches a non-digit.
 277     \row \li \b{\\s}
 278          \li Matches a whitespace character (QChar::isSpace()).
 279     \row \li \b{\\S}
 280          \li Matches a non-whitespace character.
 281     \row \li \b{\\w}
 282          \li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').
 283     \row \li \b{\\W}
 284          \li Matches a non-word character.
 285     \row \li \b{\\\e{n}}
 286          \li The \e{n}-th \l backreference, e.g. \\1, \\2, etc.
 287     \endtable
 288
 289     \b{Note:} The C++ compiler transforms backslashes in strings.
 290     To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}.
 291     To match the backslash character itself, enter it four times, i.e.
 292     \c{\\\\}.
 293
 294     \target sets-of-characters
 295     \section1 Sets of Characters
 296
 297     Square brackets mean match any character contained in the square
 298     brackets. The character set abbreviations described above can
 299     appear in a character set in square brackets. Except for the
 300     character set abbreviations and the following two exceptions,
 301     characters do not have special meanings in square brackets.
 302
 303     \table
 304     \row \li \b{^}
 305
 306          \li The caret negates the character set if it occurs as the
 307          first character (i.e. immediately after the opening square
 308          bracket). \b{[abc]} matches 'a' or 'b' or 'c', but
 309          \b{[^abc]} matches anything \e but 'a' or 'b' or 'c'.
 310
 311     \row \li \b{-}
 312
 313          \li The dash indicates a range of characters. \b{[W-Z]}
 314          matches 'W' or 'X' or 'Y' or 'Z'.
 315
 316     \endtable
 317
 318     Using the predefined character set abbreviations is more portable
 319     than using character ranges across platforms and languages. For
 320     example, \b{[0-9]} matches a digit in Western alphabets but
 321     \b{\\d} matches a digit in \e any alphabet.
 322
 323     Note: In other regexp documentation, sets of characters are often
 324     called "character classes".
 325
 326     \target quantifiers
 327     \section1 Quantifiers
 328
 329     By default, an expression is automatically quantified by
 330     \b{{1,1}}, i.e. it should occur exactly once. In the following
 331     list, \b{\e {E}} stands for expression. An expression is a
 332     character, or an abbreviation for a set of characters, or a set of
 333     characters in square brackets, or an expression in parentheses.
 334
 335     \table
 336     \row \li \b{\e {E}?}
 337
 338          \li Matches zero or one occurrences of \e E. This quantifier
 339          means \e{The previous expression is optional}, because it
 340          will match whether or not the expression is found. \b{\e
 341          {E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?}
 342          matches 'dent' or 'dents'.
 343
 344     \row \li \b{\e {E}+}
 345
 346          \li Matches one or more occurrences of \e E. \b{\e {E}+} is
 347          the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0',
 348          '00', '000', etc.
 349
 350     \row \li \b{\e {E}*}
 351
 352          \li Matches zero or more occurrences of \e E. It is the same
 353          as \b{\e {E}{0,}}. The \b{*} quantifier is often used
 354          in error where \b{+} should be used. For example, if
 355          \b{\\s*$} is used in an expression to match strings that
 356          end in whitespace, it will match every string because
 357          \b{\\s*$} means \e{Match zero or more whitespaces followed
 358          by end of string}. The correct regexp to match strings that
 359          have at least one trailing whitespace character is
 360          \b{\\s+$}.
 361
 362     \row \li \b{\e {E}{n}}
 363
 364          \li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}}
 365          is the same as repeating \e E \e n times. For example,
 366          \b{x{5}} is the same as \b{xxxxx}. It is also the same
 367          as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}.
 368
 369     \row \li \b{\e {E}{n,}}
 370          \li Matches at least \e n occurrences of \e E.
 371
 372     \row \li \b{\e {E}{,m}}
 373          \li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}}
 374          is the same as \b{\e {E}{0,m}}.
 375
 376     \row \li \b{\e {E}{n,m}}
 377          \li Matches at least \e n and at most \e m occurrences of \e E.
 378     \endtable
 379
 380     To apply a quantifier to more than just the preceding character,
 381     use parentheses to group characters together in an expression. For
 382     example, \b{tag+} matches a 't' followed by an 'a' followed by
 383     at least one 'g', whereas \b{(tag)+} matches at least one
 384     occurrence of 'tag'.
 385
 386     Note: Quantifiers are normally "greedy". They always match as much
 387     text as they can. For example, \b{0+} matches the first zero it
 388     finds and all the consecutive zeros after the first zero. Applied
 389     to '20005', it matches'2\underline{000}5'. Quantifiers can be made
 390     non-greedy, see setMinimal().
 391
 392     \target capturing parentheses
 393     \target backreferences
 394     \section1 Capturing Text
 395
 396     Parentheses allow us to group elements together so that we can
 397     quantify and capture them. For example if we have the expression
 398     \b{mail|letter|correspondence} that matches a string we know
 399     that \e one of the words matched but not which one. Using
 400     parentheses allows us to "capture" whatever is matched within
 401     their bounds, so if we used \b{(mail|letter|correspondence)}
 402     and matched this regexp against the string "I sent you some email"
 403     we can use the cap() or capturedTexts() functions to extract the
 404     matched characters, in this case 'mail'.
 405
 406     We can use captured text within the regexp itself. To refer to the
 407     captured text we use \e backreferences which are indexed from 1,
 408     the same as for cap(). For example we could search for duplicate
 409     words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a
 410     word boundary followed by one or more word characters followed by
 411     one or more non-word characters followed by the same text as the
 412     first parenthesized expression followed by a word boundary.
 413
 414     If we want to use parentheses purely for grouping and not for
 415     capturing we can use the non-capturing syntax, e.g.
 416     \b{(?:green|blue)}. Non-capturing parentheses begin '(?:' and
 417     end ')'. In this example we match either 'green' or 'blue' but we
 418     do not capture the match so we only know whether or not we matched
 419     but not which color we actually found. Using non-capturing
 420     parentheses is more efficient than using capturing parentheses
 421     since the regexp engine has to do less book-keeping.
 422
 423     Both capturing and non-capturing parentheses may be nested.
 424
 425     \target greedy quantifiers
 426
 427     For historical reasons, quantifiers (e.g. \b{*}) that apply to
 428     capturing parentheses are more "greedy" than other quantifiers.
 429     For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa".
 430     This behavior is different from what other regexp engines do
 431     (notably, Perl). To obtain a more intuitive capturing behavior,
 432     specify QRegExp::RegExp2 to the QRegExp constructor or call
 433     setPatternSyntax(QRegExp::RegExp2).
 434
 435     \target cap_in_a_loop
 436
 437     When the number of matches cannot be determined in advance, a
 438     common idiom is to use cap() in a loop. For example:
 439
 440     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 0
 441
 442     \target assertions
 443     \section1 Assertions
 444
 445     Assertions make some statement about the text at the point where
 446     they occur in the regexp but they do not match any characters. In
 447     the following list \b{\e {E}} stands for any expression.
 448
 449     \table
 450     \row \li \b{^}
 451          \li The caret signifies the beginning of the string. If you
 452          wish to match a literal \c{^} you must escape it by
 453          writing \c{\\^}. For example, \b{^#include} will only
 454          match strings which \e begin with the characters '#include'.
 455          (When the caret is the first character of a character set it
 456          has a special meaning, see \link #sets-of-characters Sets of
 457          Characters \endlink.)
 458
 459     \row \li \b{$}
 460          \li The dollar signifies the end of the string. For example
 461          \b{\\d\\s*$} will match strings which end with a digit
 462          optionally followed by whitespace. If you wish to match a
 463          literal \c{$} you must escape it by writing
 464          \c{\\$}.
 465
 466     \row \li \b{\\b}
 467          \li A word boundary. For example the regexp
 468          \b{\\bOK\\b} means match immediately after a word
 469          boundary (e.g. start of string or whitespace) the letter 'O'
 470          then the letter 'K' immediately before another word boundary
 471          (e.g. end of string or whitespace). But note that the
 472          assertion does not actually match any whitespace so if we
 473          write \b{(\\bOK\\b)} and we have a match it will only
 474          contain 'OK' even if the string is "It's \underline{OK} now".
 475
 476     \row \li \b{\\B}
 477          \li A non-word boundary. This assertion is true wherever
 478          \b{\\b} is false. For example if we searched for
 479          \b{\\Bon\\B} in "Left on" the match would fail (space
 480          and end of string aren't non-word boundaries), but it would
 481          match in "t\underline{on}ne".
 482
 483     \row \li \b{(?=\e E)}
 484          \li Positive lookahead. This assertion is true if the
 485          expression matches at this point in the regexp. For example,
 486          \b{const(?=\\s+char)} matches 'const' whenever it is
 487          followed by 'char', as in 'static \underline{const} char *'.
 488          (Compare with \b{const\\s+char}, which matches 'static
 489          \underline{const char} *'.)
 490
 491     \row \li \b{(?!\e E)}
 492          \li Negative lookahead. This assertion is true if the
 493          expression does not match at this point in the regexp. For
 494          example, \b{const(?!\\s+char)} matches 'const' \e except
 495          when it is followed by 'char'.
 496     \endtable
 497
 498     \keyword QRegExp wildcard matching
 499     \section1 Wildcard Matching
 500
 501     Most command shells such as \e bash or \e cmd.exe support "file
 502     globbing", the ability to identify a group of files by using
 503     wildcards. The setPatternSyntax() function is used to switch
 504     between regexp and wildcard mode. Wildcard matching is much
 505     simpler than full regexps and has only four features:
 506
 507     \table
 508     \row \li \b{c}
 509          \li Any character represents itself apart from those mentioned
 510          below. Thus \b{c} matches the character \e c.
 511     \row \li \b{?}
 512          \li Matches any single character. It is the same as
 513          \b{.} in full regexps.
 514     \row \li \b{*}
 515          \li Matches zero or more of any characters. It is the
 516          same as \b{.*} in full regexps.
 517     \row \li \b{[...]}
 518          \li Sets of characters can be represented in square brackets,
 519          similar to full regexps. Within the character class, like
 520          outside, backslash has no special meaning.
 521     \endtable
 522
 523     In the mode Wildcard, the wildcard characters cannot be
 524     escaped. In the mode WildcardUnix, the character '\\' escapes the
 525     wildcard.
 526
 527     For example if we are in wildcard mode and have strings which
 528     contain filenames we could identify HTML files with \b{*.html}.
 529     This will match zero or more characters followed by a dot followed
 530     by 'h', 't', 'm' and 'l'.
 531
 532     To test a string against a wildcard expression, use exactMatch().
 533     For example:
 534
 535     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 1
 536
 537     \target perl-users
 538     \section1 Notes for Perl Users
 539
 540     Most of the character class abbreviations supported by Perl are
 541     supported by QRegExp, see \link
 542     #characters-and-abbreviations-for-sets-of-characters characters
 543     and abbreviations for sets of characters \endlink.
 544
 545     In QRegExp, apart from within character classes, \c{^} always
 546     signifies the start of the string, so carets must always be
 547     escaped unless used for that purpose. In Perl the meaning of caret
 548     varies automagically depending on where it occurs so escaping it
 549     is rarely necessary. The same applies to \c{$} which in
 550     QRegExp always signifies the end of the string.
 551
 552     QRegExp's quantifiers are the same as Perl's greedy quantifiers
 553     (but see the \l{greedy quantifiers}{note above}). Non-greedy
 554     matching cannot be applied to individual quantifiers, but can be
 555     applied to all the quantifiers in the pattern. For example, to
 556     match the Perl regexp \b{ro+?m} requires:
 557
 558     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 2
 559
 560     The equivalent of Perl's \c{/i} option is
 561     setCaseSensitivity(Qt::CaseInsensitive).
 562
 563     Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.
 564
 565     In QRegExp \b{.} matches any character, therefore all QRegExp
 566     regexps have the equivalent of Perl's \c{/s} option. QRegExp
 567     does not have an equivalent to Perl's \c{/m} option, but this
 568     can be emulated in various ways for example by splitting the input
 569     into lines or by looping with a regexp that searches for newlines.
 570
 571     Because QRegExp is string oriented, there are no \\A, \\Z, or \\z
 572     assertions. The \\G assertion is not supported but can be emulated
 573     in a loop.
 574
 575     Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
 576     equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
 577     ... correspond to cap(1) or capturedTexts()[1], cap(2) or
 578     capturedTexts()[2], etc.
 579
 580     To substitute a pattern use QString::replace().
 581
 582     Perl's extended \c{/x} syntax is not supported, nor are
 583     directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
 584     the other hand, C++'s rules for literal strings can be used to
 585     achieve the same:
 586
 587     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 3
 588
 589     Both zero-width positive and zero-width negative lookahead
 590     assertions (?=pattern) and (?!pattern) are supported with the same
 591     syntax as Perl. Perl's lookbehind assertions, "independent"
 592     subexpressions and conditional expressions are not supported.
 593
 594     Non-capturing parentheses are also supported, with the same
 595     (?:pattern) syntax.
 596
 597     See QString::split() and QStringList::join() for equivalents
 598     to Perl's split and join functions.
 599
 600     Note: because C++ transforms \\'s they must be written \e twice in
 601     code, e.g. \b{\\b} must be written \b{\\\\b}.
 602
 603     \target code-examples
 604     \section1 Code Examples
 605
 606     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 4
 607
 608     The third string matches '\underline{6}'. This is a simple validation
 609     regexp for integers in the range 0 to 99.
 610
 611     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 5
 612
 613     The second string matches '\underline{This_is-OK}'. We've used the
 614     character set abbreviation '\\S' (non-whitespace) and the anchors
 615     to match strings which contain no whitespace.
 616
 617     In the following example we match strings containing 'mail' or
 618     'letter' or 'correspondence' but only match whole words i.e. not
 619     'email'
 620
 621     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 6
 622
 623     The second string matches "Please write the \underline{letter}". The
 624     word 'letter' is also captured (because of the parentheses). We
 625     can see what text we've captured like this:
 626
 627     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 7
 628
 629     This will capture the text from the first set of capturing
 630     parentheses (counting capturing left parentheses from left to
 631     right). The parentheses are counted from 1 since cap(0) is the
 632     whole matched regexp (equivalent to '&' in most regexp engines).
 633
 634     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 8
 635
 636     Here we've passed the QRegExp to QString's replace() function to
 637     replace the matched text with new text.
 638
 639     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 9
 640
 641     We've used the indexIn() function to repeatedly match the regexp in
 642     the string. Note that instead of moving forward by one character
 643     at a time \c pos++ we could have written \c {pos +=
 644     rx.matchedLength()} to skip over the already matched string. The
 645     count will equal 3, matching 'One \underline{Eric} another
 646     \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it
 647     doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
 648     by non-word boundaries.
 649
 650     One common use of regexps is to split lines of delimited data into
 651     their component fields.
 652
 653     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 10
 654
 655     In this example our input lines have the format company name, web
 656     address and country. Unfortunately the regexp is rather long and
 657     not very versatile -- the code will break if we add any more
 658     fields. A simpler and better solution is to look for the
 659     separator, '\\t' in this case, and take the surrounding text. The
 660     QString::split() function can take a separator string or regexp
 661     as an argument and split a string accordingly.
 662
 663     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 11
 664
 665     Here field[0] is the company, field[1] the web address and so on.
 666
 667     To imitate the matching of a shell we can use wildcard mode.
 668
 669     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 12
 670
 671     Wildcard matching can be convenient because of its simplicity, but
 672     any wildcard regexp can be defined using full regexps, e.g.
 673     \b{.*\\.html$}. Notice that we can't match both \c .html and \c
 674     .htm files with a wildcard unless we use \b{*.htm*} which will
 675     also match 'test.html.bak'. A full regexp gives us the precision
 676     we need, \b{.*\\.html?$}.
 677
 678     QRegExp can match case insensitively using setCaseSensitivity(),
 679     and can use non-greedy matching, see setMinimal(). By
 680     default QRegExp uses full regexps but this can be changed with
 681     setWildcard(). Searching can be forward with indexIn() or backward
 682     with lastIndexIn(). Captured text can be accessed using
 683     capturedTexts() which returns a string list of all captured
 684     strings, or using cap() which returns the captured string for the
 685     given index. The pos() function takes a match index and returns
 686     the position in the string where the match was made (or -1 if
 687     there was no match).
 688
 689     \sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel,
 690         {tools/regexp}{Regular Expression Example}
 691 */
 692
 693 #if defined(Q_OS_VXWORKS) && defined(EOS)
 694 #  undef EOS
 695 #endif
 696
 697 const int NumBadChars = 64;
 698 #define BadChar(ch) ((ch).unicode() % NumBadChars)
 699
 700 const int NoOccurrence = INT_MAX;
 701 const int EmptyCapture = INT_MAX;
 702 const int InftyLen = INT_MAX;
 703 const int InftyRep = 1025;
 704 const int EOS = -1;
 705
 706 static bool isWord(QChar ch)
 707 {
 708     return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_');
 709 }
 710
 711 /*
 712   Merges two vectors of ints and puts the result into the first
 713   one.
 714 */
 715 static void mergeInto(QVector<int> *a, const QVector<int> &b)
 716 {
 717     int asize = a->size();
 718     int bsize = b.size();
 719     if (asize == 0) {
 720         *a = b;
 721 #ifndef QT_NO_REGEXP_OPTIM
 722     } else if (bsize == 1 && a->at(asize - 1) < b.at(0)) {
 723         a->resize(asize + 1);
 724         (*a)[asize] = b.at(0);
 725 #endif
 726     } else if (bsize >= 1) {
 727         int csize = asize + bsize;
 728         QVector<int> c(csize);
 729         int i = 0, j = 0, k = 0;
 730         while (i < asize) {
 731             if (j < bsize) {
 732                 if (a->at(i) == b.at(j)) {
 733                     ++i;
 734                     --csize;
 735                 } else if (a->at(i) < b.at(j)) {
 736                     c[k++] = a->at(i++);
 737                 } else {
 738                     c[k++] = b.at(j++);
 739                 }
 740             } else {
 741                 memcpy(c.data() + k, a->constData() + i, (asize - i) * sizeof(int));
 742                 break;
 743             }
 744         }
 745         c.resize(csize);
 746         if (j < bsize)
 747             memcpy(c.data() + k, b.constData() + j, (bsize - j) * sizeof(int));
 748         *a = c;
 749     }
 750 }
 751
 752 #ifndef QT_NO_REGEXP_WILDCARD
 753 /*
 754   Translates a wildcard pattern to an equivalent regular expression
 755   pattern (e.g., *.cpp to .*\.cpp).
 756
 757   If enableEscaping is true, it is possible to escape the wildcard
 758   characters with \
 759 */
 760 static QString wc2rx(const QString &wc_str, const bool enableEscaping)
 761 {
 762     const int wclen = wc_str.length();
 763     QString rx;
 764     int i = 0;
 765     bool isEscaping = false; // the previous character is '\'
 766     const QChar *wc = wc_str.unicode();
 767
 768     while (i < wclen) {
 769         const QChar c = wc[i++];
 770         switch (c.unicode()) {
 771         case '\\':
 772             if (enableEscaping) {
 773                 if (isEscaping) {
 774                     rx += QLatin1String("\\\\");
 775                 } // we insert the \\ later if necessary
 776                 if (i == wclen) { // the end
 777                     rx += QLatin1String("\\\\");
 778                 }
 779             } else {
 780                 rx += QLatin1String("\\\\");
 781             }
 782             isEscaping = true;
 783             break;
 784         case '*':
 785             if (isEscaping) {
 786                 rx += QLatin1String("\\*");
 787                 isEscaping = false;
 788             } else {
 789                 rx += QLatin1String(".*");
 790             }
 791             break;
 792         case '?':
 793             if (isEscaping) {
 794                 rx += QLatin1String("\\?");
 795                 isEscaping = false;
 796             } else {
 797                 rx += QLatin1Char('.');
 798             }
 799
 800             break;
 801         case '$':
 802         case '(':
 803         case ')':
 804         case '+':
 805         case '.':
 806         case '^':
 807         case '{':
 808         case '|':
 809         case '}':
 810             if (isEscaping) {
 811                 isEscaping = false;
 812                 rx += QLatin1String("\\\\");
 813             }
 814             rx += QLatin1Char('\\');
 815             rx += c;
 816             break;
 817          case '[':
 818             if (isEscaping) {
 819                 isEscaping = false;
 820                 rx += QLatin1String("\\[");
 821             } else {
 822                 rx += c;
 823                 if (wc[i] == QLatin1Char('^'))
 824                     rx += wc[i++];
 825                 if (i < wclen) {
 826                     if (rx[i] == QLatin1Char(']'))
 827                         rx += wc[i++];
 828                     while (i < wclen && wc[i] != QLatin1Char(']')) {
 829                         if (wc[i] == QLatin1Char('\\'))
 830                             rx += QLatin1Char('\\');
 831                         rx += wc[i++];
 832                     }
 833                 }
 834             }
 835              break;
 836
 837         case ']':
 838             if(isEscaping){
 839                 isEscaping = false;
 840                 rx += QLatin1String("\\");
 841             }
 842             rx += c;
 843             break;
 844
 845         default:
 846             if(isEscaping){
 847                 isEscaping = false;
 848                 rx += QLatin1String("\\\\");
 849             }
 850             rx += c;
 851         }
 852     }
 853     return rx;
 854 }
 855 #endif
 856
 857 static int caretIndex(int offset, QRegExp::CaretMode caretMode)
 858 {
 859     if (caretMode == QRegExp::CaretAtZero) {
 860         return 0;
 861     } else if (caretMode == QRegExp::CaretAtOffset) {
 862         return offset;
 863     } else { // QRegExp::CaretWontMatch
 864         return -1;
 865     }
 866 }
 867
 868 /*
 869     The QRegExpEngineKey struct uniquely identifies an engine.
 870 */
 871 struct QRegExpEngineKey
 872 {
 873     QString pattern;
 874     QRegExp::PatternSyntax patternSyntax;
 875     Qt::CaseSensitivity cs;
 876
 877     inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,
 878                             Qt::CaseSensitivity cs)
 879         : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {}
 880
 881     inline void clear() {
 882         pattern.clear();
 883         patternSyntax = QRegExp::RegExp;
 884         cs = Qt::CaseSensitive;
 885     }
 886 };
 887
 888 Q_STATIC_GLOBAL_OPERATOR bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)
 889 {
 890     return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax
 891            && key1.cs == key2.cs;
 892 }
 893
 894 class QRegExpEngine;
 895
 896 //Q_DECLARE_TYPEINFO(QVector<int>, Q_MOVABLE_TYPE);
 897
 898 /*
 899   This is the engine state during matching.
 900 */
 901 struct QRegExpMatchState
 902 {
 903     const QChar *in; // a pointer to the input string data
 904     int pos; // the current position in the string
 905     int caretPos;
 906     int len; // the length of the input string
 907     bool minimal; // minimal matching?
 908     int *bigArray; // big array holding the data for the next pointers
 909     int *inNextStack; // is state is nextStack?
 910     int *curStack; // stack of current states
 911     int *nextStack; // stack of next states
 912     int *curCapBegin; // start of current states' captures
 913     int *nextCapBegin; // start of next states' captures
 914     int *curCapEnd; // end of current states' captures
 915     int *nextCapEnd; // end of next states' captures
 916     int *tempCapBegin; // start of temporary captures
 917     int *tempCapEnd; // end of temporary captures
 918     int *capBegin; // start of captures for a next state
 919     int *capEnd; // end of captures for a next state
 920     int *slideTab; // bump-along slide table for bad-character heuristic
 921     int *captured; // what match() returned last
 922     int slideTabSize; // size of slide table
 923     int capturedSize;
 924 #ifndef QT_NO_REGEXP_BACKREF
 925     QList<QVector<int> > sleeping; // list of back-reference sleepers
 926 #endif
 927     int matchLen; // length of match
 928     int oneTestMatchedLen; // length of partial match
 929
 930     const QRegExpEngine *eng;
 931
 932     inline QRegExpMatchState() : bigArray(0), captured(0) {}
 933     inline ~QRegExpMatchState() { free(bigArray); }
 934
 935     void drain() { free(bigArray); bigArray = 0; captured = 0; } // to save memory
 936     void prepareForMatch(QRegExpEngine *eng);
 937     void match(const QChar *str, int len, int pos, bool minimal,
 938         bool oneTest, int caretIndex);
 939     bool matchHere();
 940     bool testAnchor(int i, int a, const int *capBegin);
 941 };
 942
 943 /*
 944   The struct QRegExpAutomatonState represents one state in a modified NFA. The
 945   input characters matched are stored in the state instead of on
 946   the transitions, something possible for an automaton
 947   constructed from a regular expression.
 948 */
 949 struct QRegExpAutomatonState
 950 {
 951 #ifndef QT_NO_REGEXP_CAPTURE
 952     int atom; // which atom does this state belong to?
 953 #endif
 954     int match; // what does it match? (see CharClassBit and BackRefBit)
 955     QVector<int> outs; // out-transitions
 956     QMap<int, int> reenter; // atoms reentered when transiting out
 957     QMap<int, int> anchors; // anchors met when transiting out
 958
 959     inline QRegExpAutomatonState() { }
 960 #ifndef QT_NO_REGEXP_CAPTURE
 961     inline QRegExpAutomatonState(int a, int m)
 962         : atom(a), match(m) { }
 963 #else
 964     inline QRegExpAutomatonState(int m)
 965         : match(m) { }
 966 #endif
 967 };
 968
 969 Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE);
 970
 971 /*
 972   The struct QRegExpCharClassRange represents a range of characters (e.g.,
 973   [0-9] denotes range 48 to 57).
 974 */
 975 struct QRegExpCharClassRange
 976 {
 977     ushort from; // 48
 978     ushort len; // 10
 979 };
 980
 981 Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
 982
 983 #ifndef QT_NO_REGEXP_CAPTURE
 984 /*
 985   The struct QRegExpAtom represents one node in the hierarchy of regular
 986   expression atoms.
 987 */
 988 struct QRegExpAtom
 989 {
 990     enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 };
 991
 992     int parent; // index of parent in array of atoms
 993     int capture; // index of capture, from 1 to ncap - 1
 994 };
 995
 996 Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
 997 #endif
 998
 999 struct QRegExpLookahead;
1000
1001 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1002 /*
1003   The struct QRegExpAnchorAlternation represents a pair of anchors with
1004   OR semantics.
1005 */
1006 struct QRegExpAnchorAlternation
1007 {
1008     int a; // this anchor...
1009     int b; // ...or this one
1010 };
1011
1012 Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
1013 #endif
1014
1015 #ifndef QT_NO_REGEXP_CCLASS
1016
1017 #define FLAG(x) (1 << (x))
1018 /*
1019   The class QRegExpCharClass represents a set of characters, such as can
1020   be found in regular expressions (e.g., [a-z] denotes the set
1021   {a, b, ..., z}).
1022 */
1023 class QRegExpCharClass
1024 {
1025 public:
1026     QRegExpCharClass();
1027     inline QRegExpCharClass(const QRegExpCharClass &cc) { operator=(cc); }
1028
1029     QRegExpCharClass &operator=(const QRegExpCharClass &cc);
1030
1031     void clear();
1032     bool negative() const { return n; }
1033     void setNegative(bool negative);
1034     void addCategories(uint cats);
1035     void addRange(ushort from, ushort to);
1036     void addSingleton(ushort ch) { addRange(ch, ch); }
1037
1038     bool in(QChar ch) const;
1039 #ifndef QT_NO_REGEXP_OPTIM
1040     const QVector<int> &firstOccurrence() const { return occ1; }
1041 #endif
1042
1043 #if defined(QT_DEBUG)
1044     void dump() const;
1045 #endif
1046
1047 private:
1048     uint c; // character classes
1049     QVector<QRegExpCharClassRange> r; // character ranges
1050     bool n; // negative?
1051 #ifndef QT_NO_REGEXP_OPTIM
1052     QVector<int> occ1; // first-occurrence array
1053 #endif
1054 };
1055 #else
1056 struct QRegExpCharClass
1057 {
1058     int dummy;
1059
1060 #ifndef QT_NO_REGEXP_OPTIM
1061     QRegExpCharClass() { occ1.fill(0, NumBadChars); }
1062
1063     const QVector<int> &firstOccurrence() const { return occ1; }
1064     QVector<int> occ1;
1065 #endif
1066 };
1067 #endif
1068
1069 Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE);
1070
1071 /*
1072   The QRegExpEngine class encapsulates a modified nondeterministic
1073   finite automaton (NFA).
1074 */
1075 class QRegExpEngine
1076 {
1077 public:
1078     QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)
1079         : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }
1080
1081     QRegExpEngine(const QRegExpEngineKey &key);
1082     ~QRegExpEngine();
1083
1084     bool isValid() const { return valid; }
1085     const QString &errorString() const { return yyError; }
1086     int captureCount() const { return officialncap; }
1087
1088     int createState(QChar ch);
1089     int createState(const QRegExpCharClass &cc);
1090 #ifndef QT_NO_REGEXP_BACKREF
1091     int createState(int bref);
1092 #endif
1093
1094     void addCatTransitions(const QVector<int> &from, const QVector<int> &to);
1095 #ifndef QT_NO_REGEXP_CAPTURE
1096     void addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom);
1097 #endif
1098
1099 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1100     int anchorAlternation(int a, int b);
1101     int anchorConcatenation(int a, int b);
1102 #else
1103     int anchorAlternation(int a, int b) { return a & b; }
1104     int anchorConcatenation(int a, int b) { return a | b; }
1105 #endif
1106     void addAnchors(int from, int to, int a);
1107
1108 #ifndef QT_NO_REGEXP_OPTIM
1109     void heuristicallyChooseHeuristic();
1110 #endif
1111
1112 #if defined(QT_DEBUG)
1113     void dump() const;
1114 #endif
1115
1116     QAtomicInt ref;
1117
1118 private:
1119     enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
1120     enum { InitialState = 0, FinalState = 1 };
1121
1122     void setup();
1123     int setupState(int match);
1124
1125     /*
1126       Let's hope that 13 lookaheads and 14 back-references are
1127       enough.
1128      */
1129     enum { MaxLookaheads = 13, MaxBackRefs = 14 };
1130     enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,
1131            Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,
1132            Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1133            Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
1134            Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,
1135
1136            Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^
1137                    ((Anchor_FirstLookahead << MaxLookaheads) - 1) };
1138 #ifndef QT_NO_REGEXP_CAPTURE
1139     int startAtom(bool officialCapture);
1140     void finishAtom(int atom, bool needCapture);
1141 #endif
1142
1143 #ifndef QT_NO_REGEXP_LOOKAHEAD
1144     int addLookahead(QRegExpEngine *eng, bool negative);
1145 #endif
1146
1147 #ifndef QT_NO_REGEXP_OPTIM
1148     bool goodStringMatch(QRegExpMatchState &matchState) const;
1149     bool badCharMatch(QRegExpMatchState &matchState) const;
1150 #else
1151     bool bruteMatch(QRegExpMatchState &matchState) const;
1152 #endif
1153
1154     QVector<QRegExpAutomatonState> s; // array of states
1155 #ifndef QT_NO_REGEXP_CAPTURE
1156     QVector<QRegExpAtom> f; // atom hierarchy
1157     int nf; // number of atoms
1158     int cf; // current atom
1159     QVector<int> captureForOfficialCapture;
1160 #endif
1161     int officialncap; // number of captures, seen from the outside
1162     int ncap; // number of captures, seen from the inside
1163 #ifndef QT_NO_REGEXP_CCLASS
1164     QVector<QRegExpCharClass> cl; // array of character classes
1165 #endif
1166 #ifndef QT_NO_REGEXP_LOOKAHEAD
1167     QVector<QRegExpLookahead *> ahead; // array of lookaheads
1168 #endif
1169 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1170     QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
1171 #endif
1172 #ifndef QT_NO_REGEXP_OPTIM
1173     bool caretAnchored; // does the regexp start with ^?
1174     bool trivial; // is the good-string all that needs to match?
1175 #endif
1176     bool valid; // is the regular expression valid?
1177     Qt::CaseSensitivity cs; // case sensitive?
1178     bool greedyQuantifiers; // RegExp2?
1179     bool xmlSchemaExtensions;
1180 #ifndef QT_NO_REGEXP_BACKREF
1181     int nbrefs; // number of back-references
1182 #endif
1183
1184 #ifndef QT_NO_REGEXP_OPTIM
1185     bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1186
1187     int goodEarlyStart; // the index where goodStr can first occur in a match
1188     int goodLateStart; // the index where goodStr can last occur in a match
1189     QString goodStr; // the string that any match has to contain
1190
1191     int minl; // the minimum length of a match
1192     QVector<int> occ1; // first-occurrence array
1193 #endif
1194
1195     /*
1196       The class Box is an abstraction for a regular expression
1197       fragment. It can also be seen as one node in the syntax tree of
1198       a regular expression with synthetized attributes.
1199
1200       Its interface is ugly for performance reasons.
1201     */
1202     class Box
1203     {
1204     public:
1205         Box(QRegExpEngine *engine);
1206         Box(const Box &b) { operator=(b); }
1207
1208         Box &operator=(const Box &b);
1209
1210         void clear() { operator=(Box(eng)); }
1211         void set(QChar ch);
1212         void set(const QRegExpCharClass &cc);
1213 #ifndef QT_NO_REGEXP_BACKREF
1214         void set(int bref);
1215 #endif
1216
1217         void cat(const Box &b);
1218         void orx(const Box &b);
1219         void plus(int atom);
1220         void opt();
1221         void catAnchor(int a);
1222 #ifndef QT_NO_REGEXP_OPTIM
1223         void setupHeuristics();
1224 #endif
1225
1226 #if defined(QT_DEBUG)
1227         void dump() const;
1228 #endif
1229
1230     private:
1231         void addAnchorsToEngine(const Box &to) const;
1232
1233         QRegExpEngine *eng; // the automaton under construction
1234         QVector<int> ls; // the left states (firstpos)
1235         QVector<int> rs; // the right states (lastpos)
1236         QMap<int, int> lanchors; // the left anchors
1237         QMap<int, int> ranchors; // the right anchors
1238         int skipanchors; // the anchors to match if the box is skipped
1239
1240 #ifndef QT_NO_REGEXP_OPTIM
1241         int earlyStart; // the index where str can first occur
1242         int lateStart; // the index where str can last occur
1243         QString str; // a string that has to occur in any match
1244         QString leftStr; // a string occurring at the left of this box
1245         QString rightStr; // a string occurring at the right of this box
1246         int maxl; // the maximum length of this box (possibly InftyLen)
1247 #endif
1248
1249         int minl; // the minimum length of this box
1250 #ifndef QT_NO_REGEXP_OPTIM
1251         QVector<int> occ1; // first-occurrence array
1252 #endif
1253     };
1254
1255     friend class Box;
1256
1257     /*
1258       This is the lexical analyzer for regular expressions.
1259     */
1260     enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
1261            Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
1262            Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
1263     int getChar();
1264     int getEscape();
1265 #ifndef QT_NO_REGEXP_INTERVAL
1266     int getRep(int def);
1267 #endif
1268 #ifndef QT_NO_REGEXP_LOOKAHEAD
1269     void skipChars(int n);
1270 #endif
1271     void error(const char *msg);
1272     void startTokenizer(const QChar *rx, int len);
1273     int getToken();
1274
1275     const QChar *yyIn; // a pointer to the input regular expression pattern
1276     int yyPos0; // the position of yyTok in the input pattern
1277     int yyPos; // the position of the next character to read
1278     int yyLen; // the length of yyIn
1279     int yyCh; // the last character read
1280     QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens
1281     int yyMinRep; // attribute for Tok_Quantifier
1282     int yyMaxRep; // ditto
1283     QString yyError; // syntax error or overflow during parsing?
1284
1285     /*
1286       This is the syntactic analyzer for regular expressions.
1287     */
1288     int parse(const QChar *rx, int len);
1289     void parseAtom(Box *box);
1290     void parseFactor(Box *box);
1291     void parseTerm(Box *box);
1292     void parseExpression(Box *box);
1293
1294     int yyTok; // the last token read
1295     bool yyMayCapture; // set this to false to disable capturing
1296
1297     friend struct QRegExpMatchState;
1298 };
1299
1300 #ifndef QT_NO_REGEXP_LOOKAHEAD
1301 /*
1302   The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
1303   (?=foo) and (?!bar)).
1304 */
1305 struct QRegExpLookahead
1306 {
1307     QRegExpEngine *eng; // NFA representing the embedded regular expression
1308     bool neg; // negative lookahead?
1309
1310     inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)
1311         : eng(eng0), neg(neg0) { }
1312     inline ~QRegExpLookahead() { delete eng; }
1313 };
1314 #endif
1315
1316 /*! \internal
1317     convert the pattern string to the RegExp syntax.
1318
1319     This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan
1320  */
1321 Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax)
1322 {
1323     switch (patternSyntax) {
1324 #ifndef QT_NO_REGEXP_WILDCARD
1325     case QRegExp::Wildcard:
1326         return wc2rx(pattern, false);
1327         break;
1328     case QRegExp::WildcardUnix:
1329         return wc2rx(pattern, true);
1330         break;
1331 #endif
1332     case QRegExp::FixedString:
1333         return QRegExp::escape(pattern);
1334         break;
1335     case QRegExp::W3CXmlSchema11:
1336     default:
1337         return pattern;
1338     }
1339 }
1340
1341 QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
1342     : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),
1343       xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)
1344 {
1345     setup();
1346
1347     QString rx = qt_regexp_toCanonical(key.pattern, key.patternSyntax);
1348
1349     valid = (parse(rx.unicode(), rx.length()) == rx.length());
1350     if (!valid) {
1351 #ifndef QT_NO_REGEXP_OPTIM
1352         trivial = false;
1353 #endif
1354         error(RXERR_LEFTDELIM);
1355     }
1356 }
1357
1358 QRegExpEngine::~QRegExpEngine()
1359 {
1360 #ifndef QT_NO_REGEXP_LOOKAHEAD
1361     qDeleteAll(ahead);
1362 #endif
1363 }
1364
1365 void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
1366 {
1367     /*
1368       We use one QVector<int> for all the big data used a lot in
1369       matchHere() and friends.
1370     */
1371     int ns = eng->s.size(); // number of states
1372     int ncap = eng->ncap;
1373 #ifndef QT_NO_REGEXP_OPTIM
1374     int newSlideTabSize = qMax(eng->minl + 1, 16);
1375 #else
1376     int newSlideTabSize = 0;
1377 #endif
1378     int numCaptures = eng->captureCount();
1379     int newCapturedSize = 2 + 2 * numCaptures;
1380     bigArray = q_check_ptr((int *)realloc(bigArray, ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));
1381
1382     // set all internal variables only _after_ bigArray is realloc'ed
1383     // to prevent a broken regexp in oom case
1384
1385     slideTabSize = newSlideTabSize;
1386     capturedSize = newCapturedSize;
1387     inNextStack = bigArray;
1388     memset(inNextStack, -1, ns * sizeof(int));
1389     curStack = inNextStack + ns;
1390     nextStack = inNextStack + 2 * ns;
1391
1392     curCapBegin = inNextStack + 3 * ns;
1393     nextCapBegin = curCapBegin + ncap * ns;
1394     curCapEnd = curCapBegin + 2 * ncap * ns;
1395     nextCapEnd = curCapBegin + 3 * ncap * ns;
1396
1397     tempCapBegin = curCapBegin + 4 * ncap * ns;
1398     tempCapEnd = tempCapBegin + ncap;
1399     capBegin = tempCapBegin + 2 * ncap;
1400     capEnd = tempCapBegin + 3 * ncap;
1401
1402     slideTab = tempCapBegin + 4 * ncap;
1403     captured = slideTab + slideTabSize;
1404     memset(captured, -1, capturedSize*sizeof(int));
1405     this->eng = eng;
1406 }
1407
1408 /*
1409   Tries to match in str and returns an array of (begin, length) pairs
1410   for captured text. If there is no match, all pairs are (-1, -1).
1411 */
1412 void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,
1413     bool minimal0, bool oneTest, int caretIndex)
1414 {
1415     bool matched = false;
1416     QChar char_null;
1417
1418 #ifndef QT_NO_REGEXP_OPTIM
1419     if (eng->trivial && !oneTest) {
1420         pos = qFindString(str0, len0, pos0, eng->goodStr.unicode(), eng->goodStr.length(), eng->cs);
1421         matchLen = eng->goodStr.length();
1422         matched = (pos != -1);
1423     } else
1424 #endif
1425     {
1426         in = str0;
1427         if (in == 0)
1428             in = &char_null;
1429         pos = pos0;
1430         caretPos = caretIndex;
1431         len = len0;
1432         minimal = minimal0;
1433         matchLen = 0;
1434         oneTestMatchedLen = 0;
1435
1436         if (eng->valid && pos >= 0 && pos <= len) {
1437 #ifndef QT_NO_REGEXP_OPTIM
1438             if (oneTest) {
1439                 matched = matchHere();
1440             } else {
1441                 if (pos <= len - eng->minl) {
1442                     if (eng->caretAnchored) {
1443                         matched = matchHere();
1444                     } else if (eng->useGoodStringHeuristic) {
1445                         matched = eng->goodStringMatch(*this);
1446                     } else {
1447                         matched = eng->badCharMatch(*this);
1448                     }
1449                 }
1450             }
1451 #else
1452             matched = oneTest ? matchHere() : eng->bruteMatch(*this);
1453 #endif
1454         }
1455     }
1456
1457     if (matched) {
1458         int *c = captured;
1459         *c++ = pos;
1460         *c++ = matchLen;
1461
1462         int numCaptures = (capturedSize - 2) >> 1;
1463 #ifndef QT_NO_REGEXP_CAPTURE
1464         for (int i = 0; i < numCaptures; ++i) {
1465             int j = eng->captureForOfficialCapture.at(i);
1466             if (capBegin[j] != EmptyCapture) {
1467                 int len = capEnd[j] - capBegin[j];
1468                 *c++ = (len > 0) ? pos + capBegin[j] : 0;
1469                 *c++ = len;
1470             } else {
1471                 *c++ = -1;
1472                 *c++ = -1;
1473             }
1474         }
1475 #endif
1476     } else {
1477         // we rely on 2's complement here
1478         memset(captured, -1, capturedSize * sizeof(int));
1479     }
1480 }
1481
1482 /*
1483   The three following functions add one state to the automaton and
1484   return the number of the state.
1485 */
1486
1487 int QRegExpEngine::createState(QChar ch)
1488 {
1489     return setupState(ch.unicode());
1490 }
1491
1492 int QRegExpEngine::createState(const QRegExpCharClass &cc)
1493 {
1494 #ifndef QT_NO_REGEXP_CCLASS
1495     int n = cl.size();
1496     cl += QRegExpCharClass(cc);
1497     return setupState(CharClassBit | n);
1498 #else
1499     Q_UNUSED(cc);
1500     return setupState(CharClassBit);
1501 #endif
1502 }
1503
1504 #ifndef QT_NO_REGEXP_BACKREF
1505 int QRegExpEngine::createState(int bref)
1506 {
1507     if (bref > nbrefs) {
1508         nbrefs = bref;
1509         if (nbrefs > MaxBackRefs) {
1510             error(RXERR_LIMIT);
1511             return 0;
1512         }
1513     }
1514     return setupState(BackRefBit | bref);
1515 }
1516 #endif
1517
1518 /*
1519   The two following functions add a transition between all pairs of
1520   states (i, j) where i is found in from, and j is found in to.
1521
1522   Cat-transitions are distinguished from plus-transitions for
1523   capturing.
1524 */
1525
1526 void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to)
1527 {
1528     for (int i = 0; i < from.size(); i++)
1529         mergeInto(&s[from.at(i)].outs, to);
1530 }
1531
1532 #ifndef QT_NO_REGEXP_CAPTURE
1533 void QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom)
1534 {
1535     for (int i = 0; i < from.size(); i++) {
1536         QRegExpAutomatonState &st = s[from.at(i)];
1537         const QVector<int> oldOuts = st.outs;
1538         mergeInto(&st.outs, to);
1539         if (f.at(atom).capture != QRegExpAtom::NoCapture) {
1540             for (int j = 0; j < to.size(); j++) {
1541                 // ### st.reenter.contains(to.at(j)) check looks suspicious
1542                 if (!st.reenter.contains(to.at(j)) &&
1543                      qBinaryFind(oldOuts.constBegin(), oldOuts.constEnd(), to.at(j)) == oldOuts.end())
1544                     st.reenter.insert(to.at(j), atom);
1545             }
1546         }
1547     }
1548 }
1549 #endif
1550
1551 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1552 /*
1553   Returns an anchor that means a OR b.
1554 */
1555 int QRegExpEngine::anchorAlternation(int a, int b)
1556 {
1557     if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0)
1558         return a & b;
1559
1560     int n = aa.size();
1561 #ifndef QT_NO_REGEXP_OPTIM
1562     if (n > 0 && aa.at(n - 1).a == a && aa.at(n - 1).b == b)
1563         return Anchor_Alternation | (n - 1);
1564 #endif
1565
1566     QRegExpAnchorAlternation element = {a, b};
1567     aa.append(element);
1568     return Anchor_Alternation | n;
1569 }
1570
1571 /*
1572   Returns an anchor that means a AND b.
1573 */
1574 int QRegExpEngine::anchorConcatenation(int a, int b)
1575 {
1576     if (((a | b) & Anchor_Alternation) == 0)
1577         return a | b;
1578     if ((b & Anchor_Alternation) != 0)
1579         qSwap(a, b);
1580
1581     int aprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).a, b);
1582     int bprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).b, b);
1583     return anchorAlternation(aprime, bprime);
1584 }
1585 #endif
1586
1587 /*
1588   Adds anchor a on a transition caracterised by its from state and
1589   its to state.
1590 */
1591 void QRegExpEngine::addAnchors(int from, int to, int a)
1592 {
1593     QRegExpAutomatonState &st = s[from];
1594     if (st.anchors.contains(to))
1595         a = anchorAlternation(st.anchors.value(to), a);
1596     st.anchors.insert(to, a);
1597 }
1598
1599 #ifndef QT_NO_REGEXP_OPTIM
1600 /*
1601   This function chooses between the good-string and the bad-character
1602   heuristics. It computes two scores and chooses the heuristic with
1603   the highest score.
1604
1605   Here are some common-sense constraints on the scores that should be
1606   respected if the formulas are ever modified: (1) If goodStr is
1607   empty, the good-string heuristic scores 0. (2) If the regular
1608   expression is trivial, the good-string heuristic should be used.
1609   (3) If the search is case insensitive, the good-string heuristic
1610   should be used, unless it scores 0. (Case insensitivity turns all
1611   entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1612   big, the good-string heuristic should score less.
1613 */
1614 void QRegExpEngine::heuristicallyChooseHeuristic()
1615 {
1616     if (minl == 0) {
1617         useGoodStringHeuristic = false;
1618     } else if (trivial) {
1619         useGoodStringHeuristic = true;
1620     } else {
1621         /*
1622           Magic formula: The good string has to constitute a good
1623           proportion of the minimum-length string, and appear at a
1624           more-or-less known index.
1625         */
1626         int goodStringScore = (64 * goodStr.length() / minl) -
1627                               (goodLateStart - goodEarlyStart);
1628         /*
1629           Less magic formula: We pick some characters at random, and
1630           check whether they are good or bad.
1631         */
1632         int badCharScore = 0;
1633         int step = qMax(1, NumBadChars / 32);
1634         for (int i = 1; i < NumBadChars; i += step) {
1635             if (occ1.at(i) == NoOccurrence)
1636                 badCharScore += minl;
1637             else
1638                 badCharScore += occ1.at(i);
1639         }
1640         badCharScore /= minl;
1641         useGoodStringHeuristic = (goodStringScore > badCharScore);
1642     }
1643 }
1644 #endif
1645
1646 #if defined(QT_DEBUG)
1647 void QRegExpEngine::dump() const
1648 {
1649     int i, j;
1650     qDebug("Case %ssensitive engine", cs ? "" : "in");
1651     qDebug("  States");
1652     for (i = 0; i < s.size(); i++) {
1653         qDebug("  %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
1654 #ifndef QT_NO_REGEXP_CAPTURE
1655         if (nf > 0)
1656             qDebug("    in atom %d", s[i].atom);
1657 #endif
1658         int m = s[i].match;
1659         if ((m & CharClassBit) != 0) {
1660             qDebug("    match character class %d", m ^ CharClassBit);
1661 #ifndef QT_NO_REGEXP_CCLASS
1662             cl[m ^ CharClassBit].dump();
1663 #else
1664             qDebug("    negative character class");
1665 #endif
1666         } else if ((m & BackRefBit) != 0) {
1667             qDebug("    match back-reference %d", m ^ BackRefBit);
1668         } else if (m >= 0x20 && m <= 0x7e) {
1669             qDebug("    match 0x%.4x (%c)", m, m);
1670         } else {
1671             qDebug("    match 0x%.4x", m);
1672         }
1673         for (j = 0; j < s[i].outs.size(); j++) {
1674             int next = s[i].outs[j];
1675             qDebug("    -> %d", next);
1676             if (s[i].reenter.contains(next))
1677                 qDebug("       [reenter %d]", s[i].reenter[next]);
1678             if (s[i].anchors.value(next) != 0)
1679                 qDebug("       [anchors 0x%.8x]", s[i].anchors[next]);
1680         }
1681     }
1682 #ifndef QT_NO_REGEXP_CAPTURE
1683     if (nf > 0) {
1684         qDebug("  Atom    Parent  Capture");
1685         for (i = 0; i < nf; i++) {
1686             if (f[i].capture == QRegExpAtom::NoCapture) {
1687                 qDebug("  %6d  %6d     nil", i, f[i].parent);
1688             } else {
1689                 int cap = f[i].capture;
1690                 bool official = captureForOfficialCapture.contains(cap);
1691                 qDebug("  %6d  %6d  %6d  %s", i, f[i].parent, f[i].capture,
1692                        official ? "official" : "");
1693             }
1694         }
1695     }
1696 #endif
1697 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1698     for (i = 0; i < aa.size(); i++)
1699         qDebug("  Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b);
1700 #endif
1701 }
1702 #endif
1703
1704 void QRegExpEngine::setup()
1705 {
1706     ref.store(1);
1707 #ifndef QT_NO_REGEXP_CAPTURE
1708     f.resize(32);
1709     nf = 0;
1710     cf = -1;
1711 #endif
1712     officialncap = 0;
1713     ncap = 0;
1714 #ifndef QT_NO_REGEXP_OPTIM
1715     caretAnchored = true;
1716     trivial = true;
1717 #endif
1718     valid = false;
1719 #ifndef QT_NO_REGEXP_BACKREF
1720     nbrefs = 0;
1721 #endif
1722 #ifndef QT_NO_REGEXP_OPTIM
1723     useGoodStringHeuristic = true;
1724     minl = 0;
1725     occ1.fill(0, NumBadChars);
1726 #endif
1727 }
1728
1729 int QRegExpEngine::setupState(int match)
1730 {
1731 #ifndef QT_NO_REGEXP_CAPTURE
1732     s += QRegExpAutomatonState(cf, match);
1733 #else
1734     s += QRegExpAutomatonState(match);
1735 #endif
1736     return s.size() - 1;
1737 }
1738
1739 #ifndef QT_NO_REGEXP_CAPTURE
1740 /*
1741   Functions startAtom() and finishAtom() should be called to delimit
1742   atoms. When a state is created, it is assigned to the current atom.
1743   The information is later used for capturing.
1744 */
1745 int QRegExpEngine::startAtom(bool officialCapture)
1746 {
1747     if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size())
1748         f.resize((nf + 1) << 1);
1749     f[nf].parent = cf;
1750     cf = nf++;
1751     f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;
1752     return cf;
1753 }
1754
1755 void QRegExpEngine::finishAtom(int atom, bool needCapture)
1756 {
1757     if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture)
1758         f[atom].capture = QRegExpAtom::UnofficialCapture;
1759     cf = f.at(atom).parent;
1760 }
1761 #endif
1762
1763 #ifndef QT_NO_REGEXP_LOOKAHEAD
1764 /*
1765   Creates a lookahead anchor.
1766 */
1767 int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative)
1768 {
1769     int n = ahead.size();
1770     if (n == MaxLookaheads) {
1771         error(RXERR_LIMIT);
1772         return 0;
1773     }
1774     ahead += new QRegExpLookahead(eng, negative);
1775     return Anchor_FirstLookahead << n;
1776 }
1777 #endif
1778
1779 #ifndef QT_NO_REGEXP_CAPTURE
1780 /*
1781   We want the longest leftmost captures.
1782 */
1783 static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
1784                             const int *end2)
1785 {
1786     for (int i = 0; i < ncap; i++) {
1787         int delta = begin2[i] - begin1[i]; // it has to start early...
1788         if (delta == 0)
1789             delta = end1[i] - end2[i]; // ...and end late
1790
1791         if (delta != 0)
1792             return delta > 0;
1793     }
1794     return false;
1795 }
1796 #endif
1797
1798 /*
1799   Returns true if anchor a matches at position pos + i in the input
1800   string, otherwise false.
1801 */
1802 bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)
1803 {
1804     int j;
1805
1806 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1807     if ((a & QRegExpEngine::Anchor_Alternation) != 0)
1808         return testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)
1809                || testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);
1810 #endif
1811
1812     if ((a & QRegExpEngine::Anchor_Caret) != 0) {
1813         if (pos + i != caretPos)
1814             return false;
1815     }
1816     if ((a & QRegExpEngine::Anchor_Dollar) != 0) {
1817         if (pos + i != len)
1818             return false;
1819     }
1820 #ifndef QT_NO_REGEXP_ESCAPE
1821     if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) {
1822         bool before = false;
1823         bool after = false;
1824         if (pos + i != 0)
1825             before = isWord(in[pos + i - 1]);
1826         if (pos + i != len)
1827             after = isWord(in[pos + i]);
1828         if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after))
1829             return false;
1830         if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after))
1831             return false;
1832     }
1833 #endif
1834 #ifndef QT_NO_REGEXP_LOOKAHEAD
1835     if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) {
1836         const QVector<QRegExpLookahead *> &ahead = eng->ahead;
1837         for (j = 0; j < ahead.size(); j++) {
1838             if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) {
1839                 QRegExpMatchState matchState;
1840                 matchState.prepareForMatch(ahead[j]->eng);
1841                 matchState.match(in + pos + i, len - pos - i, 0,
1842                     true, true, matchState.caretPos - matchState.pos - i);
1843                 if ((matchState.captured[0] == 0) == ahead[j]->neg)
1844                     return false;
1845             }
1846         }
1847     }
1848 #endif
1849 #ifndef QT_NO_REGEXP_CAPTURE
1850 #ifndef QT_NO_REGEXP_BACKREF
1851     for (j = 0; j < eng->nbrefs; j++) {
1852         if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) {
1853             int i = eng->captureForOfficialCapture.at(j);
1854             if (capBegin[i] != EmptyCapture)
1855                 return false;
1856         }
1857     }
1858 #endif
1859 #endif
1860     return true;
1861 }
1862
1863 #ifndef QT_NO_REGEXP_OPTIM
1864 /*
1865   The three following functions are what Jeffrey Friedl would call
1866   transmissions (or bump-alongs). Using one or the other should make
1867   no difference except in performance.
1868 */
1869
1870 bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
1871 {
1872     int k = matchState.pos + goodEarlyStart;
1873     QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs);
1874     while ((k = matcher.indexIn(matchState.in, matchState.len, k)) != -1) {
1875         int from = k - goodLateStart;
1876         int to = k - goodEarlyStart;
1877         if (from > matchState.pos)
1878             matchState.pos = from;
1879
1880         while (matchState.pos <= to) {
1881             if (matchState.matchHere())
1882                 return true;
1883             ++matchState.pos;
1884         }
1885         ++k;
1886     }
1887     return false;
1888 }
1889
1890 bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
1891 {
1892     int slideHead = 0;
1893     int slideNext = 0;
1894     int i;
1895     int lastPos = matchState.len - minl;
1896     memset(matchState.slideTab, 0, matchState.slideTabSize * sizeof(int));
1897
1898     /*
1899       Set up the slide table, used for the bad-character heuristic,
1900       using the table of first occurrence of each character.
1901     */
1902     for (i = 0; i < minl; i++) {
1903         int sk = occ1[BadChar(matchState.in[matchState.pos + i])];
1904         if (sk == NoOccurrence)
1905             sk = i + 1;
1906         if (sk > 0) {
1907             int k = i + 1 - sk;
1908             if (k < 0) {
1909                 sk = i + 1;
1910                 k = 0;
1911             }
1912             if (sk > matchState.slideTab[k])
1913                 matchState.slideTab[k] = sk;
1914         }
1915     }
1916
1917     if (matchState.pos > lastPos)
1918         return false;
1919
1920     for (;;) {
1921         if (++slideNext >= matchState.slideTabSize)
1922             slideNext = 0;
1923         if (matchState.slideTab[slideHead] > 0) {
1924             if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext])
1925                 matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1;
1926             matchState.slideTab[slideHead] = 0;
1927         } else {
1928             if (matchState.matchHere())
1929                 return true;
1930         }
1931
1932         if (matchState.pos == lastPos)
1933             break;
1934
1935         /*
1936           Update the slide table. This code has much in common with
1937           the initialization code.
1938         */
1939         int sk = occ1[BadChar(matchState.in[matchState.pos + minl])];
1940         if (sk == NoOccurrence) {
1941             matchState.slideTab[slideNext] = minl;
1942         } else if (sk > 0) {
1943             int k = slideNext + minl - sk;
1944             if (k >= matchState.slideTabSize)
1945                 k -= matchState.slideTabSize;
1946             if (sk > matchState.slideTab[k])
1947                 matchState.slideTab[k] = sk;
1948         }
1949         slideHead = slideNext;
1950         ++matchState.pos;
1951     }
1952     return false;
1953 }
1954 #else
1955 bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
1956 {
1957     while (matchState.pos <= matchState.len) {
1958         if (matchState.matchHere())
1959             return true;
1960         ++matchState.pos;
1961     }
1962     return false;
1963 }
1964 #endif
1965
1966 /*
1967   Here's the core of the engine. It tries to do a match here and now.
1968 */
1969 bool QRegExpMatchState::matchHere()
1970 {
1971     int ncur = 1, nnext = 0;
1972     int i = 0, j, k, m;
1973     bool stop = false;
1974
1975     matchLen = -1;
1976     oneTestMatchedLen = -1;
1977     curStack[0] = QRegExpEngine::InitialState;
1978
1979     int ncap = eng->ncap;
1980 #ifndef QT_NO_REGEXP_CAPTURE
1981     if (ncap > 0) {
1982         for (j = 0; j < ncap; j++) {
1983             curCapBegin[j] = EmptyCapture;
1984             curCapEnd[j] = EmptyCapture;
1985         }
1986     }
1987 #endif
1988
1989 #ifndef QT_NO_REGEXP_BACKREF
1990     while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop)
1991 #else
1992     while (ncur > 0 && i <= len - pos && !stop)
1993 #endif
1994     {
1995         int ch = (i < len - pos) ? in[pos + i].unicode() : 0;
1996         for (j = 0; j < ncur; j++) {
1997             int cur = curStack[j];
1998             const QRegExpAutomatonState &scur = eng->s.at(cur);
1999             const QVector<int> &outs = scur.outs;
2000             for (k = 0; k < outs.size(); k++) {
2001                 int next = outs.at(k);
2002                 const QRegExpAutomatonState &snext = eng->s.at(next);
2003                 bool inside = true;
2004 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2005                 int needSomeSleep = 0;
2006 #endif
2007
2008                 /*
2009                   First, check if the anchors are anchored properly.
2010                 */
2011                 int a = scur.anchors.value(next);
2012                 if (a != 0 && !testAnchor(i, a, curCapBegin + j * ncap))
2013                     inside = false;
2014
2015                 /*
2016                   If indeed they are, check if the input character is
2017                   correct for this transition.
2018                 */
2019                 if (inside) {
2020                     m = snext.match;
2021                     if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) {
2022                         if (eng->cs)
2023                             inside = (m == ch);
2024                         else
2025                             inside = (QChar(m).toLower() == QChar(ch).toLower());
2026                     } else if (next == QRegExpEngine::FinalState) {
2027                         matchLen = i;
2028                         stop = minimal;
2029                         inside = true;
2030                     } else if ((m & QRegExpEngine::CharClassBit) != 0) {
2031 #ifndef QT_NO_REGEXP_CCLASS
2032                         const QRegExpCharClass &cc = eng->cl.at(m ^ QRegExpEngine::CharClassBit);
2033                         if (eng->cs)
2034                             inside = cc.in(ch);
2035                         else if (cc.negative())
2036                             inside = cc.in(QChar(ch).toLower()) &&
2037                                      cc.in(QChar(ch).toUpper());
2038                         else
2039                             inside = cc.in(QChar(ch).toLower()) ||
2040                                      cc.in(QChar(ch).toUpper());
2041 #endif
2042 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2043                     } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */
2044                         int bref = m ^ QRegExpEngine::BackRefBit;
2045                         int ell = j * ncap + eng->captureForOfficialCapture.at(bref - 1);
2046
2047                         inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
2048                         if (inside) {
2049                             if (eng->cs)
2050                                 inside = (in[pos + curCapBegin[ell]] == QChar(ch));
2051                             else
2052                                 inside = (in[pos + curCapBegin[ell]].toLower()
2053                                        == QChar(ch).toLower());
2054                         }
2055
2056                         if (inside) {
2057                             int delta;
2058                             if (curCapEnd[ell] == EmptyCapture)
2059                                 delta = i - curCapBegin[ell];
2060                             else
2061                                 delta = curCapEnd[ell] - curCapBegin[ell];
2062
2063                             inside = (delta <= len - (pos + i));
2064                             if (inside && delta > 1) {
2065                                 int n = 1;
2066                                 if (eng->cs) {
2067                                     while (n < delta) {
2068                                         if (in[pos + curCapBegin[ell] + n]
2069                                             != in[pos + i + n])
2070                                             break;
2071                                         ++n;
2072                                     }
2073                                 } else {
2074                                     while (n < delta) {
2075                                         QChar a = in[pos + curCapBegin[ell] + n];
2076                                         QChar b = in[pos + i + n];
2077                                         if (a.toLower() != b.toLower())
2078                                             break;
2079                                         ++n;
2080                                     }
2081                                 }
2082                                 inside = (n == delta);
2083                                 if (inside)
2084                                     needSomeSleep = delta - 1;
2085                             }
2086                         }
2087 #endif
2088                     }
2089                 }
2090
2091                 /*
2092                   We must now update our data structures.
2093                 */
2094                 if (inside) {
2095 #ifndef QT_NO_REGEXP_CAPTURE
2096                     int *capBegin, *capEnd;
2097 #endif
2098                     /*
2099                       If the next state was not encountered yet, all
2100                       is fine.
2101                     */
2102                     if ((m = inNextStack[next]) == -1) {
2103                         m = nnext++;
2104                         nextStack[m] = next;
2105                         inNextStack[next] = m;
2106 #ifndef QT_NO_REGEXP_CAPTURE
2107                         capBegin = nextCapBegin + m * ncap;
2108                         capEnd = nextCapEnd + m * ncap;
2109
2110                     /*
2111                       Otherwise, we'll first maintain captures in
2112                       temporary arrays, and decide at the end whether
2113                       it's best to keep the previous capture zones or
2114                       the new ones.
2115                     */
2116                     } else {
2117                         capBegin = tempCapBegin;
2118                         capEnd = tempCapEnd;
2119 #endif
2120                     }
2121
2122 #ifndef QT_NO_REGEXP_CAPTURE
2123                     /*
2124                       Updating the capture zones is much of a task.
2125                     */
2126                     if (ncap > 0) {
2127                         memcpy(capBegin, curCapBegin + j * ncap, ncap * sizeof(int));
2128                         memcpy(capEnd, curCapEnd + j * ncap, ncap * sizeof(int));
2129                         int c = scur.atom, n = snext.atom;
2130                         int p = -1, q = -1;
2131                         int cap;
2132
2133                         /*
2134                           Lemma 1. For any x in the range [0..nf), we
2135                           have f[x].parent < x.
2136
2137                           Proof. By looking at startAtom(), it is
2138                           clear that cf < nf holds all the time, and
2139                           thus that f[nf].parent < nf.
2140                         */
2141
2142                         /*
2143                           If we are reentering an atom, we empty all
2144                           capture zones inside it.
2145                         */
2146                         if ((q = scur.reenter.value(next)) != 0) {
2147                             QBitArray b(eng->nf, false);
2148                             b.setBit(q, true);
2149                             for (int ell = q + 1; ell < eng->nf; ell++) {
2150                                 if (b.testBit(eng->f.at(ell).parent)) {
2151                                     b.setBit(ell, true);
2152                                     cap = eng->f.at(ell).capture;
2153                                     if (cap >= 0) {
2154                                         capBegin[cap] = EmptyCapture;
2155                                         capEnd[cap] = EmptyCapture;
2156                                     }
2157                                 }
2158                             }
2159                             p = eng->f.at(q).parent;
2160
2161                         /*
2162                           Otherwise, close the capture zones we are
2163                           leaving. We are leaving f[c].capture,
2164                           f[f[c].parent].capture,
2165                           f[f[f[c].parent].parent].capture, ...,
2166                           until f[x].capture, with x such that
2167                           f[x].parent is the youngest common ancestor
2168                           for c and n.
2169
2170                           We go up along c's and n's ancestry until
2171                           we find x.
2172                         */
2173                         } else {
2174                             p = c;
2175                             q = n;
2176                             while (p != q) {
2177                                 if (p > q) {
2178                                     cap = eng->f.at(p).capture;
2179                                     if (cap >= 0) {
2180                                         if (capBegin[cap] == i) {
2181                                             capBegin[cap] = EmptyCapture;
2182                                             capEnd[cap] = EmptyCapture;
2183                                         } else {
2184                                             capEnd[cap] = i;
2185                                         }
2186                                     }
2187                                     p = eng->f.at(p).parent;
2188                                 } else {
2189                                     q = eng->f.at(q).parent;
2190                                 }
2191                             }
2192                         }
2193
2194                         /*
2195                           In any case, we now open the capture zones
2196                           we are entering. We work upwards from n
2197                           until we reach p (the parent of the atom we
2198                           reenter or the youngest common ancestor).
2199                         */
2200                         while (n > p) {
2201                             cap = eng->f.at(n).capture;
2202                             if (cap >= 0) {
2203                                 capBegin[cap] = i;
2204                                 capEnd[cap] = EmptyCapture;
2205                             }
2206                             n = eng->f.at(n).parent;
2207                         }
2208                         /*
2209                           If the next state was already in
2210                           nextStack, we must choose carefully which
2211                           capture zones we want to keep.
2212                         */
2213                         if (capBegin == tempCapBegin &&
2214                                 isBetterCapture(ncap, capBegin, capEnd, nextCapBegin + m * ncap,
2215                                                 nextCapEnd + m * ncap)) {
2216                             memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2217                             memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2218                         }
2219                     }
2220 #ifndef QT_NO_REGEXP_BACKREF
2221                     /*
2222                       We are done with updating the capture zones.
2223                       It's now time to put the next state to sleep,
2224                       if it needs to, and to remove it from
2225                       nextStack.
2226                     */
2227                     if (needSomeSleep > 0) {
2228                         QVector<int> zzZ(2 + 2 * ncap);
2229                         zzZ[0] = i + needSomeSleep;
2230                         zzZ[1] = next;
2231                         if (ncap > 0) {
2232                             memcpy(zzZ.data() + 2, capBegin, ncap * sizeof(int));
2233                             memcpy(zzZ.data() + 2 + ncap, capEnd, ncap * sizeof(int));
2234                         }
2235                         inNextStack[nextStack[--nnext]] = -1;
2236                         sleeping.append(zzZ);
2237                     }
2238 #endif
2239 #endif
2240                 }
2241             }
2242         }
2243 #ifndef QT_NO_REGEXP_CAPTURE
2244         /*
2245           If we reached the final state, hurray! Copy the captured
2246           zone.
2247         */
2248         if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {
2249             memcpy(capBegin, nextCapBegin + m * ncap, ncap * sizeof(int));
2250             memcpy(capEnd, nextCapEnd + m * ncap, ncap * sizeof(int));
2251         }
2252 #ifndef QT_NO_REGEXP_BACKREF
2253         /*
2254           It's time to wake up the sleepers.
2255         */
2256         j = 0;
2257         while (j < sleeping.count()) {
2258             if (sleeping.at(j)[0] == i) {
2259                 const QVector<int> &zzZ = sleeping.at(j);
2260                 int next = zzZ[1];
2261                 const int *capBegin = zzZ.data() + 2;
2262                 const int *capEnd = zzZ.data() + 2 + ncap;
2263                 bool copyOver = true;
2264
2265                 if ((m = inNextStack[next]) == -1) {
2266                     m = nnext++;
2267                     nextStack[m] = next;
2268                     inNextStack[next] = m;
2269                 } else {
2270                     copyOver = isBetterCapture(ncap, nextCapBegin + m * ncap, nextCapEnd + m * ncap,
2271                                                capBegin, capEnd);
2272                 }
2273                 if (copyOver) {
2274                     memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2275                     memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2276                 }
2277
2278                 sleeping.removeAt(j);
2279             } else {
2280                 ++j;
2281             }
2282         }
2283 #endif
2284 #endif
2285         for (j = 0; j < nnext; j++)
2286             inNextStack[nextStack[j]] = -1;
2287
2288         // avoid needless iteration that confuses oneTestMatchedLen
2289         if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState
2290 #ifndef QT_NO_REGEXP_BACKREF
2291              && sleeping.isEmpty()
2292 #endif
2293            )
2294             stop = true;
2295
2296         qSwap(curStack, nextStack);
2297 #ifndef QT_NO_REGEXP_CAPTURE
2298         qSwap(curCapBegin, nextCapBegin);
2299         qSwap(curCapEnd, nextCapEnd);
2300 #endif
2301         ncur = nnext;
2302         nnext = 0;
2303         ++i;
2304     }
2305
2306 #ifndef QT_NO_REGEXP_BACKREF
2307     /*
2308       If minimal matching is enabled, we might have some sleepers
2309       left.
2310     */
2311     if (!sleeping.isEmpty())
2312         sleeping.clear();
2313 #endif
2314
2315     oneTestMatchedLen = i - 1;
2316     return (matchLen >= 0);
2317 }
2318
2319 #ifndef QT_NO_REGEXP_CCLASS
2320
2321 QRegExpCharClass::QRegExpCharClass()
2322     : c(0), n(false)
2323 {
2324 #ifndef QT_NO_REGEXP_OPTIM
2325     occ1.fill(NoOccurrence, NumBadChars);
2326 #endif
2327 }
2328
2329 QRegExpCharClass &QRegExpCharClass::operator=(const QRegExpCharClass &cc)
2330 {
2331     c = cc.c;
2332     r = cc.r;
2333     n = cc.n;
2334 #ifndef QT_NO_REGEXP_OPTIM
2335     occ1 = cc.occ1;
2336 #endif
2337     return *this;
2338 }
2339
2340 void QRegExpCharClass::clear()
2341 {
2342     c = 0;
2343     r.resize(0);
2344     n = false;
2345 }
2346
2347 void QRegExpCharClass::setNegative(bool negative)
2348 {
2349     n = negative;
2350 #ifndef QT_NO_REGEXP_OPTIM
2351     occ1.fill(0, NumBadChars);
2352 #endif
2353 }
2354
2355 void QRegExpCharClass::addCategories(uint cats)
2356 {
2357     static const int all_cats = FLAG(QChar::Mark_NonSpacing) |
2358                                 FLAG(QChar::Mark_SpacingCombining) |
2359                                 FLAG(QChar::Mark_Enclosing) |
2360                                 FLAG(QChar::Number_DecimalDigit) |
2361                                 FLAG(QChar::Number_Letter) |
2362                                 FLAG(QChar::Number_Other) |
2363                                 FLAG(QChar::Separator_Space) |
2364                                 FLAG(QChar::Separator_Line) |
2365                                 FLAG(QChar::Separator_Paragraph) |
2366                                 FLAG(QChar::Other_Control) |
2367                                 FLAG(QChar::Other_Format) |
2368                                 FLAG(QChar::Other_Surrogate) |
2369                                 FLAG(QChar::Other_PrivateUse) |
2370                                 FLAG(QChar::Other_NotAssigned) |
2371                                 FLAG(QChar::Letter_Uppercase) |
2372                                 FLAG(QChar::Letter_Lowercase) |
2373                                 FLAG(QChar::Letter_Titlecase) |
2374                                 FLAG(QChar::Letter_Modifier) |
2375                                 FLAG(QChar::Letter_Other) |
2376                                 FLAG(QChar::Punctuation_Connector) |
2377                                 FLAG(QChar::Punctuation_Dash) |
2378                                 FLAG(QChar::Punctuation_Open) |
2379                                 FLAG(QChar::Punctuation_Close) |
2380                                 FLAG(QChar::Punctuation_InitialQuote) |
2381                                 FLAG(QChar::Punctuation_FinalQuote) |
2382                                 FLAG(QChar::Punctuation_Other) |
2383                                 FLAG(QChar::Symbol_Math) |
2384                                 FLAG(QChar::Symbol_Currency) |
2385                                 FLAG(QChar::Symbol_Modifier) |
2386                                 FLAG(QChar::Symbol_Other);
2387     c |= (all_cats & cats);
2388 #ifndef QT_NO_REGEXP_OPTIM
2389     occ1.fill(0, NumBadChars);
2390 #endif
2391 }
2392
2393 void QRegExpCharClass::addRange(ushort from, ushort to)
2394 {
2395     if (from > to)
2396         qSwap(from, to);
2397     int m = r.size();
2398     r.resize(m + 1);
2399     r[m].from = from;
2400     r[m].len = to - from + 1;
2401
2402 #ifndef QT_NO_REGEXP_OPTIM
2403     int i;
2404
2405     if (to - from < NumBadChars) {
2406         if (from % NumBadChars <= to % NumBadChars) {
2407             for (i = from % NumBadChars; i <= to % NumBadChars; i++)
2408                 occ1[i] = 0;
2409         } else {
2410             for (i = 0; i <= to % NumBadChars; i++)
2411                 occ1[i] = 0;
2412             for (i = from % NumBadChars; i < NumBadChars; i++)
2413                 occ1[i] = 0;
2414         }
2415     } else {
2416         occ1.fill(0, NumBadChars);
2417     }
2418 #endif
2419 }
2420
2421 bool QRegExpCharClass::in(QChar ch) const
2422 {
2423 #ifndef QT_NO_REGEXP_OPTIM
2424     if (occ1.at(BadChar(ch)) == NoOccurrence)
2425         return n;
2426 #endif
2427
2428     if (c != 0 && (c & FLAG(ch.category())) != 0)
2429         return !n;
2430
2431     const int uc = ch.unicode();
2432     int size = r.size();
2433
2434     for (int i = 0; i < size; ++i) {
2435         const QRegExpCharClassRange &range = r.at(i);
2436         if (uint(uc - range.from) < uint(r.at(i).len))
2437             return !n;
2438     }
2439     return n;
2440 }
2441
2442 #if defined(QT_DEBUG)
2443 void QRegExpCharClass::dump() const
2444 {
2445     int i;
2446     qDebug("    %stive character class", n ? "nega" : "posi");
2447 #ifndef QT_NO_REGEXP_CCLASS
2448     if (c != 0)
2449         qDebug("      categories 0x%.8x", c);
2450 #endif
2451     for (i = 0; i < r.size(); i++)
2452         qDebug("      0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);
2453 }
2454 #endif
2455 #endif
2456
2457 QRegExpEngine::Box::Box(QRegExpEngine *engine)
2458     : eng(engine), skipanchors(0)
2459 #ifndef QT_NO_REGEXP_OPTIM
2460       , earlyStart(0), lateStart(0), maxl(0)
2461 #endif
2462 {
2463 #ifndef QT_NO_REGEXP_OPTIM
2464     occ1.fill(NoOccurrence, NumBadChars);
2465 #endif
2466     minl = 0;
2467 }
2468
2469 QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
2470 {
2471     eng = b.eng;
2472     ls = b.ls;
2473     rs = b.rs;
2474     lanchors = b.lanchors;
2475     ranchors = b.ranchors;
2476     skipanchors = b.skipanchors;
2477 #ifndef QT_NO_REGEXP_OPTIM
2478     earlyStart = b.earlyStart;
2479     lateStart = b.lateStart;
2480     str = b.str;
2481     leftStr = b.leftStr;
2482     rightStr = b.rightStr;
2483     maxl = b.maxl;
2484     occ1 = b.occ1;
2485 #endif
2486     minl = b.minl;
2487     return *this;
2488 }
2489
2490 void QRegExpEngine::Box::set(QChar ch)
2491 {
2492     ls.resize(1);
2493     ls[0] = eng->createState(ch);
2494     rs = ls;
2495 #ifndef QT_NO_REGEXP_OPTIM
2496     str = ch;
2497     leftStr = ch;
2498     rightStr = ch;
2499     maxl = 1;
2500     occ1[BadChar(ch)] = 0;
2501 #endif
2502     minl = 1;
2503 }
2504
2505 void QRegExpEngine::Box::set(const QRegExpCharClass &cc)
2506 {
2507     ls.resize(1);
2508     ls[0] = eng->createState(cc);
2509     rs = ls;
2510 #ifndef QT_NO_REGEXP_OPTIM
2511     maxl = 1;
2512     occ1 = cc.firstOccurrence();
2513 #endif
2514     minl = 1;
2515 }
2516
2517 #ifndef QT_NO_REGEXP_BACKREF
2518 void QRegExpEngine::Box::set(int bref)
2519 {
2520     ls.resize(1);
2521     ls[0] = eng->createState(bref);
2522     rs = ls;
2523     if (bref >= 1 && bref <= MaxBackRefs)
2524         skipanchors = Anchor_BackRef0Empty << bref;
2525 #ifndef QT_NO_REGEXP_OPTIM
2526     maxl = InftyLen;
2527 #endif
2528     minl = 0;
2529 }
2530 #endif
2531
2532 void QRegExpEngine::Box::cat(const Box &b)
2533 {
2534     eng->addCatTransitions(rs, b.ls);
2535     addAnchorsToEngine(b);
2536     if (minl == 0) {
2537         lanchors.unite(b.lanchors);
2538         if (skipanchors != 0) {
2539             for (int i = 0; i < b.ls.size(); i++) {
2540                 int a = eng->anchorConcatenation(lanchors.value(b.ls.at(i), 0), skipanchors);
2541                 lanchors.insert(b.ls.at(i), a);
2542             }
2543         }
2544         mergeInto(&ls, b.ls);
2545     }
2546     if (b.minl == 0) {
2547         ranchors.unite(b.ranchors);
2548         if (b.skipanchors != 0) {
2549             for (int i = 0; i < rs.size(); i++) {
2550                 int a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), b.skipanchors);
2551                 ranchors.insert(rs.at(i), a);
2552             }
2553         }
2554         mergeInto(&rs, b.rs);
2555     } else {
2556         ranchors = b.ranchors;
2557         rs = b.rs;
2558     }
2559
2560 #ifndef QT_NO_REGEXP_OPTIM
2561     if (maxl != InftyLen) {
2562         if (rightStr.length() + b.leftStr.length() >
2563              qMax(str.length(), b.str.length())) {
2564             earlyStart = minl - rightStr.length();
2565             lateStart = maxl - rightStr.length();
2566             str = rightStr + b.leftStr;
2567         } else if (b.str.length() > str.length()) {
2568             earlyStart = minl + b.earlyStart;
2569             lateStart = maxl + b.lateStart;
2570             str = b.str;
2571         }
2572     }
2573
2574     if (leftStr.length() == maxl)
2575         leftStr += b.leftStr;
2576
2577     if (b.rightStr.length() == b.maxl) {
2578         rightStr += b.rightStr;
2579     } else {
2580         rightStr = b.rightStr;
2581     }
2582
2583     if (maxl == InftyLen || b.maxl == InftyLen) {
2584         maxl = InftyLen;
2585     } else {
2586         maxl += b.maxl;
2587     }
2588
2589     for (int i = 0; i < NumBadChars; i++) {
2590         if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))
2591             occ1[i] = minl + b.occ1.at(i);
2592     }
2593 #endif
2594
2595     minl += b.minl;
2596     if (minl == 0)
2597         skipanchors = eng->anchorConcatenation(skipanchors, b.skipanchors);
2598     else
2599         skipanchors = 0;
2600 }
2601
2602 void QRegExpEngine::Box::orx(const Box &b)
2603 {
2604     mergeInto(&ls, b.ls);
2605     lanchors.unite(b.lanchors);
2606     mergeInto(&rs, b.rs);
2607     ranchors.unite(b.ranchors);
2608
2609     if (b.minl == 0) {
2610         if (minl == 0)
2611             skipanchors = eng->anchorAlternation(skipanchors, b.skipanchors);
2612         else
2613             skipanchors = b.skipanchors;
2614     }
2615
2616 #ifndef QT_NO_REGEXP_OPTIM
2617     for (int i = 0; i < NumBadChars; i++) {
2618         if (occ1.at(i) > b.occ1.at(i))
2619             occ1[i] = b.occ1.at(i);
2620     }
2621     earlyStart = 0;
2622     lateStart = 0;
2623     str = QString();
2624     leftStr = QString();
2625     rightStr = QString();
2626     if (b.maxl > maxl)
2627         maxl = b.maxl;
2628 #endif
2629     if (b.minl < minl)
2630         minl = b.minl;
2631 }
2632
2633 void QRegExpEngine::Box::plus(int atom)
2634 {
2635 #ifndef QT_NO_REGEXP_CAPTURE
2636     eng->addPlusTransitions(rs, ls, atom);
2637 #else
2638     Q_UNUSED(atom);
2639     eng->addCatTransitions(rs, ls);
2640 #endif
2641     addAnchorsToEngine(*this);
2642 #ifndef QT_NO_REGEXP_OPTIM
2643     maxl = InftyLen;
2644 #endif
2645 }
2646
2647 void QRegExpEngine::Box::opt()
2648 {
2649 #ifndef QT_NO_REGEXP_OPTIM
2650     earlyStart = 0;
2651     lateStart = 0;
2652     str = QString();
2653     leftStr = QString();
2654     rightStr = QString();
2655 #endif
2656     skipanchors = 0;
2657     minl = 0;
2658 }
2659
2660 void QRegExpEngine::Box::catAnchor(int a)
2661 {
2662     if (a != 0) {
2663         for (int i = 0; i < rs.size(); i++) {
2664             a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), a);
2665             ranchors.insert(rs.at(i), a);
2666         }
2667         if (minl == 0)
2668             skipanchors = eng->anchorConcatenation(skipanchors, a);
2669     }
2670 }
2671
2672 #ifndef QT_NO_REGEXP_OPTIM
2673 void QRegExpEngine::Box::setupHeuristics()
2674 {
2675     eng->goodEarlyStart = earlyStart;
2676     eng->goodLateStart = lateStart;
2677     eng->goodStr = eng->cs ? str : str.toLower();
2678
2679     eng->minl = minl;
2680     if (eng->cs) {
2681         /*
2682           A regular expression such as 112|1 has occ1['2'] = 2 and minl =
2683           1 at this point. An entry of occ1 has to be at most minl or
2684           infinity for the rest of the algorithm to go well.
2685
2686           We waited until here before normalizing these cases (instead of
2687           doing it in Box::orx()) because sometimes things improve by
2688           themselves. Consider for example (112|1)34.
2689         */
2690         for (int i = 0; i < NumBadChars; i++) {
2691             if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)
2692                 occ1[i] = minl;
2693         }
2694         eng->occ1 = occ1;
2695     } else {
2696         eng->occ1.fill(0, NumBadChars);
2697     }
2698
2699     eng->heuristicallyChooseHeuristic();
2700 }
2701 #endif
2702
2703 #if defined(QT_DEBUG)
2704 void QRegExpEngine::Box::dump() const
2705 {
2706     int i;
2707     qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s");
2708     qDebug("  Left states:");
2709     for (i = 0; i < ls.size(); i++) {
2710         if (lanchors.value(ls[i], 0) == 0)
2711             qDebug("    %d", ls[i]);
2712         else
2713             qDebug("    %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);
2714     }
2715     qDebug("  Right states:");
2716     for (i = 0; i < rs.size(); i++) {
2717         if (ranchors.value(rs[i], 0) == 0)
2718             qDebug("    %d", rs[i]);
2719         else
2720             qDebug("    %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);
2721     }
2722     qDebug("  Skip anchors: 0x%.8x", skipanchors);
2723 }
2724 #endif
2725
2726 void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const
2727 {
2728     for (int i = 0; i < to.ls.size(); i++) {
2729         for (int j = 0; j < rs.size(); j++) {
2730             int a = eng->anchorConcatenation(ranchors.value(rs.at(j), 0),
2731                                              to.lanchors.value(to.ls.at(i), 0));
2732             eng->addAnchors(rs[j], to.ls[i], a);
2733         }
2734     }
2735 }
2736
2737 #ifndef QT_NO_REGEXP_CCLASS
2738 // fast lookup hash for xml schema extensions
2739 // sorted by name for b-search
2740 static const struct CategoriesRangeMapEntry {
2741     const char name[40];
2742     uint first, second;
2743 } categoriesRangeMap[] = {
2744     { "AegeanNumbers",                        0x10100, 0x1013F },
2745     { "AlphabeticPresentationForms",          0xFB00, 0xFB4F },
2746     { "AncientGreekMusicalNotation",          0x1D200, 0x1D24F },
2747     { "AncientGreekNumbers",                  0x10140, 0x1018F },
2748     { "Arabic",                               0x0600, 0x06FF },
2749     { "ArabicPresentationForms-A",            0xFB50, 0xFDFF },
2750     { "ArabicPresentationForms-B",            0xFE70, 0xFEFF },
2751     { "ArabicSupplement",                     0x0750, 0x077F },
2752     { "Armenian",                             0x0530, 0x058F },
2753     { "Arrows",                               0x2190, 0x21FF },
2754     { "BasicLatin",                           0x0000, 0x007F },
2755     { "Bengali",                              0x0980, 0x09FF },
2756     { "BlockElements",                        0x2580, 0x259F },
2757     { "Bopomofo",                             0x3100, 0x312F },
2758     { "BopomofoExtended",                     0x31A0, 0x31BF },
2759     { "BoxDrawing",                           0x2500, 0x257F },
2760     { "BraillePatterns",                      0x2800, 0x28FF },
2761     { "Buginese",                             0x1A00, 0x1A1F },
2762     { "Buhid",                                0x1740, 0x175F },
2763     { "ByzantineMusicalSymbols",              0x1D000, 0x1D0FF },
2764     { "CJKCompatibility",                     0x3300, 0x33FF },
2765     { "CJKCompatibilityForms",                0xFE30, 0xFE4F },
2766     { "CJKCompatibilityIdeographs",           0xF900, 0xFAFF },
2767     { "CJKCompatibilityIdeographsSupplement", 0x2F800, 0x2FA1F },
2768     { "CJKRadicalsSupplement",                0x2E80, 0x2EFF },
2769     { "CJKStrokes",                           0x31C0, 0x31EF },
2770     { "CJKSymbolsandPunctuation",             0x3000, 0x303F },
2771     { "CJKUnifiedIdeographs",                 0x4E00, 0x9FFF },
2772     { "CJKUnifiedIdeographsExtensionA",       0x3400, 0x4DB5 },
2773     { "CJKUnifiedIdeographsExtensionB",       0x20000, 0x2A6DF },
2774     { "Cherokee",                             0x13A0, 0x13FF },
2775     { "CombiningDiacriticalMarks",            0x0300, 0x036F },
2776     { "CombiningDiacriticalMarksSupplement",  0x1DC0, 0x1DFF },
2777     { "CombiningHalfMarks",                   0xFE20, 0xFE2F },
2778     { "CombiningMarksforSymbols",             0x20D0, 0x20FF },
2779     { "ControlPictures",                      0x2400, 0x243F },
2780     { "Coptic",                               0x2C80, 0x2CFF },
2781     { "CurrencySymbols",                      0x20A0, 0x20CF },
2782     { "CypriotSyllabary",                     0x10800, 0x1083F },
2783     { "Cyrillic",                             0x0400, 0x04FF },
2784     { "CyrillicSupplement",                   0x0500, 0x052F },
2785     { "Deseret",                              0x10400, 0x1044F },
2786     { "Devanagari",                           0x0900, 0x097F },
2787     { "Dingbats",                             0x2700, 0x27BF },
2788     { "EnclosedAlphanumerics",                0x2460, 0x24FF },
2789     { "EnclosedCJKLettersandMonths",          0x3200, 0x32FF },
2790     { "Ethiopic",                             0x1200, 0x137F },
2791     { "EthiopicExtended",                     0x2D80, 0x2DDF },
2792     { "EthiopicSupplement",                   0x1380, 0x139F },
2793     { "GeneralPunctuation",                   0x2000, 0x206F },
2794     { "GeometricShapes",                      0x25A0, 0x25FF },
2795     { "Georgian",                             0x10A0, 0x10FF },
2796     { "GeorgianSupplement",                   0x2D00, 0x2D2F },
2797     { "Glagolitic",                           0x2C00, 0x2C5F },
2798     { "Gothic",                               0x10330, 0x1034F },
2799     { "Greek",                                0x0370, 0x03FF },
2800     { "GreekExtended",                        0x1F00, 0x1FFF },
2801     { "Gujarati",                             0x0A80, 0x0AFF },
2802     { "Gurmukhi",                             0x0A00, 0x0A7F },
2803     { "HalfwidthandFullwidthForms",           0xFF00, 0xFFEF },
2804     { "HangulCompatibilityJamo",              0x3130, 0x318F },
2805     { "HangulJamo",                           0x1100, 0x11FF },
2806     { "HangulSyllables",                      0xAC00, 0xD7A3 },
2807     { "Hanunoo",                              0x1720, 0x173F },
2808     { "Hebrew",                               0x0590, 0x05FF },
2809     { "Hiragana",                             0x3040, 0x309F },
2810     { "IPAExtensions",                        0x0250, 0x02AF },
2811     { "IdeographicDescriptionCharacters",     0x2FF0, 0x2FFF },
2812     { "Kanbun",                               0x3190, 0x319F },
2813     { "KangxiRadicals",                       0x2F00, 0x2FDF },
2814     { "Kannada",                              0x0C80, 0x0CFF },
2815     { "Katakana",                             0x30A0, 0x30FF },
2816     { "KatakanaPhoneticExtensions",           0x31F0, 0x31FF },
2817     { "Kharoshthi",                           0x10A00, 0x10A5F },
2818     { "Khmer",                                0x1780, 0x17FF },
2819     { "KhmerSymbols",                         0x19E0, 0x19FF },
2820     { "Lao",                                  0x0E80, 0x0EFF },
2821     { "Latin-1Supplement",                    0x0080, 0x00FF },
2822     { "LatinExtended-A",                      0x0100, 0x017F },
2823     { "LatinExtended-B",                      0x0180, 0x024F },
2824     { "LatinExtendedAdditional",              0x1E00, 0x1EFF },
2825     { "LetterlikeSymbols",                    0x2100, 0x214F },
2826     { "Limbu",                                0x1900, 0x194F },
2827     { "LinearBIdeograms",                     0x10080, 0x100FF },
2828     { "LinearBSyllabary",                     0x10000, 0x1007F },
2829     { "Malayalam",                            0x0D00, 0x0D7F },
2830     { "MathematicalAlphanumericSymbols",      0x1D400, 0x1D7FF },
2831     { "MathematicalOperators",                0x2200, 0x22FF },
2832     { "MiscellaneousMathematicalSymbols-A",   0x27C0, 0x27EF },
2833     { "MiscellaneousMathematicalSymbols-B",   0x2980, 0x29FF },
2834     { "MiscellaneousSymbols",                 0x2600, 0x26FF },
2835     { "MiscellaneousSymbolsandArrows",        0x2B00, 0x2BFF },
2836     { "MiscellaneousTechnical",               0x2300, 0x23FF },
2837     { "ModifierToneLetters",                  0xA700, 0xA71F },
2838     { "Mongolian",                            0x1800, 0x18AF },
2839     { "MusicalSymbols",                       0x1D100, 0x1D1FF },
2840     { "Myanmar",                              0x1000, 0x109F },
2841     { "NewTaiLue",                            0x1980, 0x19DF },
2842     { "NumberForms",                          0x2150, 0x218F },
2843     { "Ogham",                                0x1680, 0x169F },
2844     { "OldItalic",                            0x10300, 0x1032F },
2845     { "OldPersian",                           0x103A0, 0x103DF },
2846     { "OpticalCharacterRecognition",          0x2440, 0x245F },
2847     { "Oriya",                                0x0B00, 0x0B7F },
2848     { "Osmanya",                              0x10480, 0x104AF },
2849     { "PhoneticExtensions",                   0x1D00, 0x1D7F },
2850     { "PhoneticExtensionsSupplement",         0x1D80, 0x1DBF },
2851     { "PrivateUse",                           0xE000, 0xF8FF },
2852     { "Runic",                                0x16A0, 0x16FF },
2853     { "Shavian",                              0x10450, 0x1047F },
2854     { "Sinhala",                              0x0D80, 0x0DFF },
2855     { "SmallFormVariants",                    0xFE50, 0xFE6F },
2856     { "SpacingModifierLetters",               0x02B0, 0x02FF },
2857     { "Specials",                             0xFFF0, 0xFFFF },
2858     { "SuperscriptsandSubscripts",            0x2070, 0x209F },
2859     { "SupplementalArrows-A",                 0x27F0, 0x27FF },
2860     { "SupplementalArrows-B",                 0x2900, 0x297F },
2861     { "SupplementalMathematicalOperators",    0x2A00, 0x2AFF },
2862     { "SupplementalPunctuation",              0x2E00, 0x2E7F },
2863     { "SupplementaryPrivateUseArea-A",        0xF0000, 0xFFFFF },
2864     { "SupplementaryPrivateUseArea-B",        0x100000, 0x10FFFF },
2865     { "SylotiNagri",                          0xA800, 0xA82F },
2866     { "Syriac",                               0x0700, 0x074F },
2867     { "Tagalog",                              0x1700, 0x171F },
2868     { "Tagbanwa",                             0x1760, 0x177F },
2869     { "Tags",                                 0xE0000, 0xE007F },
2870     { "TaiLe",                                0x1950, 0x197F },
2871     { "TaiXuanJingSymbols",                   0x1D300, 0x1D35F },
2872     { "Tamil",                                0x0B80, 0x0BFF },
2873     { "Telugu",                               0x0C00, 0x0C7F },
2874     { "Thaana",                               0x0780, 0x07BF },
2875     { "Thai",                                 0x0E00, 0x0E7F },
2876     { "Tibetan",                              0x0F00, 0x0FFF },
2877     { "Tifinagh",                             0x2D30, 0x2D7F },
2878     { "Ugaritic",                             0x10380, 0x1039F },
2879     { "UnifiedCanadianAboriginalSyllabics",   0x1400, 0x167F },
2880     { "VariationSelectors",                   0xFE00, 0xFE0F },
2881     { "VariationSelectorsSupplement",         0xE0100, 0xE01EF },
2882     { "VerticalForms",                        0xFE10, 0xFE1F },
2883     { "YiRadicals",                           0xA490, 0xA4CF },
2884     { "YiSyllables",                          0xA000, 0xA48F },
2885     { "YijingHexagramSymbols",                0x4DC0, 0x4DFF }
2886 };
2887
2888 inline bool operator<(const char *name, const CategoriesRangeMapEntry &entry)
2889 { return qstrcmp(name, entry.name) < 0; }
2890 inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name)
2891 { return qstrcmp(entry.name, name) < 0; }
2892 #endif // QT_NO_REGEXP_CCLASS
2893
2894 int QRegExpEngine::getChar()
2895 {
2896     return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();
2897 }
2898
2899 int QRegExpEngine::getEscape()
2900 {
2901 #ifndef QT_NO_REGEXP_ESCAPE
2902     const char tab[] = "afnrtv"; // no b, as \b means word boundary
2903     const char backTab[] = "\a\f\n\r\t\v";
2904     ushort low;
2905     int i;
2906 #endif
2907     ushort val;
2908     int prevCh = yyCh;
2909
2910     if (prevCh == EOS) {
2911         error(RXERR_END);
2912         return Tok_Char | '\\';
2913     }
2914     yyCh = getChar();
2915 #ifndef QT_NO_REGEXP_ESCAPE
2916     if ((prevCh & ~0xff) == 0) {
2917         const char *p = strchr(tab, prevCh);
2918         if (p != 0)
2919             return Tok_Char | backTab[p - tab];
2920     }
2921 #endif
2922
2923     switch (prevCh) {
2924 #ifndef QT_NO_REGEXP_ESCAPE
2925     case '0':
2926         val = 0;
2927         for (i = 0; i < 3; i++) {
2928             if (yyCh >= '0' && yyCh <= '7')
2929                 val = (val << 3) | (yyCh - '0');
2930             else
2931                 break;
2932             yyCh = getChar();
2933         }
2934         if ((val & ~0377) != 0)
2935             error(RXERR_OCTAL);
2936         return Tok_Char | val;
2937 #endif
2938 #ifndef QT_NO_REGEXP_ESCAPE
2939     case 'B':
2940         return Tok_NonWord;
2941 #endif
2942 #ifndef QT_NO_REGEXP_CCLASS
2943     case 'D':
2944         // see QChar::isDigit()
2945         yyCharClass->addCategories(uint(-1) ^ FLAG(QChar::Number_DecimalDigit));
2946         return Tok_CharClass;
2947     case 'S':
2948         // see QChar::isSpace()
2949         yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Separator_Space) |
2950                                                FLAG(QChar::Separator_Line) |
2951                                                FLAG(QChar::Separator_Paragraph) |
2952                                                FLAG(QChar::Other_Control)));
2953         yyCharClass->addRange(0x0000, 0x0008);
2954         yyCharClass->addRange(0x000e, 0x001f);
2955         yyCharClass->addRange(0x007f, 0x0084);
2956         yyCharClass->addRange(0x0086, 0x009f);
2957         return Tok_CharClass;
2958     case 'W':
2959         // see QChar::isLetterOrNumber() and QChar::isMark()
2960         yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Mark_NonSpacing) |
2961                                                FLAG(QChar::Mark_SpacingCombining) |
2962                                                FLAG(QChar::Mark_Enclosing) |
2963                                                FLAG(QChar::Number_DecimalDigit) |
2964                                                FLAG(QChar::Number_Letter) |
2965                                                FLAG(QChar::Number_Other) |
2966                                                FLAG(QChar::Letter_Uppercase) |
2967                                                FLAG(QChar::Letter_Lowercase) |
2968                                                FLAG(QChar::Letter_Titlecase) |
2969                                                FLAG(QChar::Letter_Modifier) |
2970                                                FLAG(QChar::Letter_Other) |
2971                                                FLAG(QChar::Punctuation_Connector)));
2972         yyCharClass->addRange(0x203f, 0x2040);
2973         yyCharClass->addSingleton(0x2040);
2974         yyCharClass->addSingleton(0x2054);
2975         yyCharClass->addSingleton(0x30fb);
2976         yyCharClass->addRange(0xfe33, 0xfe34);
2977         yyCharClass->addRange(0xfe4d, 0xfe4f);
2978         yyCharClass->addSingleton(0xff3f);
2979         yyCharClass->addSingleton(0xff65);
2980         return Tok_CharClass;
2981 #endif
2982 #ifndef QT_NO_REGEXP_ESCAPE
2983     case 'b':
2984         return Tok_Word;
2985 #endif
2986 #ifndef QT_NO_REGEXP_CCLASS
2987     case 'd':
2988         // see QChar::isDigit()
2989         yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit));
2990         return Tok_CharClass;
2991     case 's':
2992         // see QChar::isSpace()
2993         yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
2994                                    FLAG(QChar::Separator_Line) |
2995                                    FLAG(QChar::Separator_Paragraph));
2996         yyCharClass->addRange(0x0009, 0x000d);
2997         yyCharClass->addSingleton(0x0085);
2998         return Tok_CharClass;
2999     case 'w':
3000         // see QChar::isLetterOrNumber() and QChar::isMark()
3001         yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3002                                    FLAG(QChar::Mark_SpacingCombining) |
3003                                    FLAG(QChar::Mark_Enclosing) |
3004                                    FLAG(QChar::Number_DecimalDigit) |
3005                                    FLAG(QChar::Number_Letter) |
3006                                    FLAG(QChar::Number_Other) |
3007                                    FLAG(QChar::Letter_Uppercase) |
3008                                    FLAG(QChar::Letter_Lowercase) |
3009                                    FLAG(QChar::Letter_Titlecase) |
3010                                    FLAG(QChar::Letter_Modifier) |
3011                                    FLAG(QChar::Letter_Other));
3012         yyCharClass->addSingleton(0x005f); // '_'
3013         return Tok_CharClass;
3014     case 'I':
3015         if (xmlSchemaExtensions) {
3016             yyCharClass->setNegative(!yyCharClass->negative());
3017             // fall through
3018         } else {
3019             break;
3020         }
3021     case 'i':
3022         if (xmlSchemaExtensions) {
3023             yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3024                                        FLAG(QChar::Mark_SpacingCombining) |
3025                                        FLAG(QChar::Mark_Enclosing) |
3026                                        FLAG(QChar::Number_DecimalDigit) |
3027                                        FLAG(QChar::Number_Letter) |
3028                                        FLAG(QChar::Number_Other) |
3029                                        FLAG(QChar::Letter_Uppercase) |
3030                                        FLAG(QChar::Letter_Lowercase) |
3031                                        FLAG(QChar::Letter_Titlecase) |
3032                                        FLAG(QChar::Letter_Modifier) |
3033                                        FLAG(QChar::Letter_Other));
3034             yyCharClass->addSingleton(0x003a); // ':'
3035             yyCharClass->addSingleton(0x005f); // '_'
3036             yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
3037             yyCharClass->addRange(0x0061, 0x007a); // [a-z]
3038             yyCharClass->addRange(0xc0, 0xd6);
3039             yyCharClass->addRange(0xd8, 0xf6);
3040             yyCharClass->addRange(0xf8, 0x2ff);
3041             yyCharClass->addRange(0x370, 0x37d);
3042             yyCharClass->addRange(0x37f, 0x1fff);
3043             yyCharClass->addRange(0x200c, 0x200d);
3044             yyCharClass->addRange(0x2070, 0x218f);
3045             yyCharClass->addRange(0x2c00, 0x2fef);
3046             yyCharClass->addRange(0x3001, 0xd7ff);
3047             yyCharClass->addRange(0xf900, 0xfdcf);
3048             yyCharClass->addRange(0xfdf0, 0xfffd);
3049             yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
3050             return Tok_CharClass;
3051         } else {
3052             break;
3053         }
3054     case 'C':
3055         if (xmlSchemaExtensions) {
3056             yyCharClass->setNegative(!yyCharClass->negative());
3057             // fall through
3058         } else {
3059             break;
3060         }
3061     case 'c':
3062         if (xmlSchemaExtensions) {
3063             yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3064                                        FLAG(QChar::Mark_SpacingCombining) |
3065                                        FLAG(QChar::Mark_Enclosing) |
3066                                        FLAG(QChar::Number_DecimalDigit) |
3067                                        FLAG(QChar::Number_Letter) |
3068                                        FLAG(QChar::Number_Other) |
3069                                        FLAG(QChar::Letter_Uppercase) |
3070                                        FLAG(QChar::Letter_Lowercase) |
3071                                        FLAG(QChar::Letter_Titlecase) |
3072                                        FLAG(QChar::Letter_Modifier) |
3073                                        FLAG(QChar::Letter_Other));
3074             yyCharClass->addSingleton(0x002d); // '-'
3075             yyCharClass->addSingleton(0x002e); // '.'
3076             yyCharClass->addSingleton(0x003a); // ':'
3077             yyCharClass->addSingleton(0x005f); // '_'
3078             yyCharClass->addSingleton(0xb7);
3079             yyCharClass->addRange(0x0030, 0x0039); // [0-9]
3080             yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
3081             yyCharClass->addRange(0x0061, 0x007a); // [a-z]
3082             yyCharClass->addRange(0xc0, 0xd6);
3083             yyCharClass->addRange(0xd8, 0xf6);
3084             yyCharClass->addRange(0xf8, 0x2ff);
3085             yyCharClass->addRange(0x370, 0x37d);
3086             yyCharClass->addRange(0x37f, 0x1fff);
3087             yyCharClass->addRange(0x200c, 0x200d);
3088             yyCharClass->addRange(0x2070, 0x218f);
3089             yyCharClass->addRange(0x2c00, 0x2fef);
3090             yyCharClass->addRange(0x3001, 0xd7ff);
3091             yyCharClass->addRange(0xf900, 0xfdcf);
3092             yyCharClass->addRange(0xfdf0, 0xfffd);
3093             yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
3094             yyCharClass->addRange(0x0300, 0x036f);
3095             yyCharClass->addRange(0x203f, 0x2040);
3096             return Tok_CharClass;
3097         } else {
3098             break;
3099         }
3100     case 'P':
3101         if (xmlSchemaExtensions) {
3102             yyCharClass->setNegative(!yyCharClass->negative());
3103             // fall through
3104         } else {
3105             break;
3106         }
3107     case 'p':
3108         if (xmlSchemaExtensions) {
3109             if (yyCh != '{') {
3110                 error(RXERR_CHARCLASS);
3111                 return Tok_CharClass;
3112             }
3113
3114             QByteArray category;
3115             yyCh = getChar();
3116             while (yyCh != '}') {
3117                 if (yyCh == EOS) {
3118                     error(RXERR_END);
3119                     return Tok_CharClass;
3120                 }
3121                 category.append(yyCh);
3122                 yyCh = getChar();
3123             }
3124             yyCh = getChar(); // skip closing '}'
3125
3126             int catlen = category.length();
3127             if (catlen == 1 || catlen == 2) {
3128                 switch (category.at(0)) {
3129                 case 'M':
3130                     if (catlen == 1) {
3131                         yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3132                                                    FLAG(QChar::Mark_SpacingCombining) |
3133                                                    FLAG(QChar::Mark_Enclosing));
3134                     } else {
3135                         switch (category.at(1)) {
3136                         case 'n': yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn
3137                         case 'c': yyCharClass->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc
3138                         case 'e': yyCharClass->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me
3139                         default: error(RXERR_CATEGORY); break;
3140                         }
3141                     }
3142                     break;
3143                 case 'N':
3144                     if (catlen == 1) {
3145                         yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit) |
3146                                                    FLAG(QChar::Number_Letter) |
3147                                                    FLAG(QChar::Number_Other));
3148                     } else {
3149                         switch (category.at(1)) {
3150                         case 'd': yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd
3151                         case 'l': yyCharClass->addCategories(FLAG(QChar::Number_Letter)); break; // Hl
3152                         case 'o': yyCharClass->addCategories(FLAG(QChar::Number_Other)); break; // No
3153                         default: error(RXERR_CATEGORY); break;
3154                         }
3155                     }
3156                     break;
3157                 case 'Z':
3158                     if (catlen == 1) {
3159                         yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
3160                                                    FLAG(QChar::Separator_Line) |
3161                                                    FLAG(QChar::Separator_Paragraph));
3162                     } else {
3163                         switch (category.at(1)) {
3164                         case 's': yyCharClass->addCategories(FLAG(QChar::Separator_Space)); break; // Zs
3165                         case 'l': yyCharClass->addCategories(FLAG(QChar::Separator_Line)); break; // Zl
3166                         case 'p': yyCharClass->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp
3167                         default: error(RXERR_CATEGORY); break;
3168                         }
3169                     }
3170                     break;
3171                 case 'C':
3172                     if (catlen == 1) {
3173                         yyCharClass->addCategories(FLAG(QChar::Other_Control) |
3174                                                    FLAG(QChar::Other_Format) |
3175                                                    FLAG(QChar::Other_Surrogate) |
3176                                                    FLAG(QChar::Other_PrivateUse) |
3177                                                    FLAG(QChar::Other_NotAssigned));
3178                     } else {
3179                         switch (category.at(1)) {
3180                         case 'c': yyCharClass->addCategories(FLAG(QChar::Other_Control)); break; // Cc
3181                         case 'f': yyCharClass->addCategories(FLAG(QChar::Other_Format)); break; // Cf
3182                         case 's': yyCharClass->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs
3183                         case 'o': yyCharClass->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co
3184                         case 'n': yyCharClass->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn
3185                         default: error(RXERR_CATEGORY); break;
3186                         }
3187                     }
3188                     break;
3189                 case 'L':
3190                     if (catlen == 1) {
3191                         yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase) |
3192                                                    FLAG(QChar::Letter_Lowercase) |
3193                                                    FLAG(QChar::Letter_Titlecase) |
3194                                                    FLAG(QChar::Letter_Modifier) |
3195                                                    FLAG(QChar::Letter_Other));
3196                     } else {
3197                         switch (category.at(1)) {
3198                         case 'u': yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu
3199                         case 'l': yyCharClass->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll
3200                         case 't': yyCharClass->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt
3201                         case 'm': yyCharClass->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm
3202                         case 'o': yyCharClass->addCategories(FLAG(QChar::Letter_Other)); break; // Lo
3203                         default: error(RXERR_CATEGORY); break;
3204                         }
3205                     }
3206                     break;
3207                 case 'P':
3208                     if (catlen == 1) {
3209                         yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector) |
3210                                                    FLAG(QChar::Punctuation_Dash) |
3211                                                    FLAG(QChar::Punctuation_Open) |
3212                                                    FLAG(QChar::Punctuation_Close) |
3213                                                    FLAG(QChar::Punctuation_InitialQuote) |
3214                                                    FLAG(QChar::Punctuation_FinalQuote) |
3215                                                    FLAG(QChar::Punctuation_Other));
3216                     } else {
3217                         switch (category.at(1)) {
3218                         case 'c': yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc
3219                         case 'd': yyCharClass->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd
3220                         case 's': yyCharClass->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps
3221                         case 'e': yyCharClass->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe
3222                         case 'i': yyCharClass->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi
3223                         case 'f': yyCharClass->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf
3224                         case 'o': yyCharClass->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po
3225                         default: error(RXERR_CATEGORY); break;
3226                         }
3227                     }
3228                     break;
3229                 case 'S':
3230                     if (catlen == 1) {
3231                         yyCharClass->addCategories(FLAG(QChar::Symbol_Math) |
3232                                                    FLAG(QChar::Symbol_Currency) |
3233                                                    FLAG(QChar::Symbol_Modifier) |
3234                                                    FLAG(QChar::Symbol_Other));
3235                     } else {
3236                         switch (category.at(1)) {
3237                         case 'm': yyCharClass->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm
3238                         case 'c': yyCharClass->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc
3239                         case 'k': yyCharClass->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk
3240                         case 'o': yyCharClass->addCategories(FLAG(QChar::Symbol_Other)); break; // So
3241                         default: error(RXERR_CATEGORY); break;
3242                         }
3243                     }
3244                     break;
3245                 default:
3246                     error(RXERR_CATEGORY);
3247                     break;
3248                 }
3249             } else if (catlen > 2 && category.at(0) == 'I' && category.at(1) == 's') {
3250                 static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[0]);
3251                 const CategoriesRangeMapEntry *r = qBinaryFind(categoriesRangeMap, categoriesRangeMap + N, category.constData() + 2);
3252                 if (r != categoriesRangeMap + N)
3253                     yyCharClass->addRange(r->first, r->second);
3254                 else
3255                     error(RXERR_CATEGORY);
3256             } else {
3257                 error(RXERR_CATEGORY);
3258             }
3259             return Tok_CharClass;
3260         } else {
3261             break;
3262         }
3263 #endif
3264 #ifndef QT_NO_REGEXP_ESCAPE
3265     case 'x':
3266         val = 0;
3267         for (i = 0; i < 4; i++) {
3268             low = QChar(yyCh).toLower().unicode();
3269             if (low >= '0' && low <= '9')
3270                 val = (val << 4) | (low - '0');
3271             else if (low >= 'a' && low <= 'f')
3272                 val = (val << 4) | (low - 'a' + 10);
3273             else
3274                 break;
3275             yyCh = getChar();
3276         }
3277         return Tok_Char | val;
3278 #endif
3279     default:
3280         break;
3281     }
3282     if (prevCh >= '1' && prevCh <= '9') {
3283 #ifndef QT_NO_REGEXP_BACKREF
3284         val = prevCh - '0';
3285         while (yyCh >= '0' && yyCh <= '9') {
3286             val = (val * 10) + (yyCh - '0');
3287             yyCh = getChar();
3288         }
3289         return Tok_BackRef | val;
3290 #else
3291         error(RXERR_DISABLED);
3292 #endif
3293     }
3294     return Tok_Char | prevCh;
3295 }
3296
3297 #ifndef QT_NO_REGEXP_INTERVAL
3298 int QRegExpEngine::getRep(int def)
3299 {
3300     if (yyCh >= '0' && yyCh <= '9') {
3301         int rep = 0;
3302         do {
3303             rep = 10 * rep + yyCh - '0';
3304             if (rep >= InftyRep) {
3305                 error(RXERR_REPETITION);
3306                 rep = def;
3307             }
3308             yyCh = getChar();
3309         } while (yyCh >= '0' && yyCh <= '9');
3310         return rep;
3311     } else {
3312         return def;
3313     }
3314 }
3315 #endif
3316
3317 #ifndef QT_NO_REGEXP_LOOKAHEAD
3318 void QRegExpEngine::skipChars(int n)
3319 {
3320     if (n > 0) {
3321         yyPos += n - 1;
3322         yyCh = getChar();
3323     }
3324 }
3325 #endif
3326
3327 void QRegExpEngine::error(const char *msg)
3328 {
3329     if (yyError.isEmpty())
3330         yyError = QLatin1String(msg);
3331 }
3332
3333 void QRegExpEngine::startTokenizer(const QChar *rx, int len)
3334 {
3335     yyIn = rx;
3336     yyPos0 = 0;
3337     yyPos = 0;
3338     yyLen = len;
3339     yyCh = getChar();
3340     yyCharClass.reset(new QRegExpCharClass);
3341     yyMinRep = 0;
3342     yyMaxRep = 0;
3343     yyError = QString();
3344 }
3345
3346 int QRegExpEngine::getToken()
3347 {
3348 #ifndef QT_NO_REGEXP_CCLASS
3349     ushort pendingCh = 0;
3350     bool charPending;
3351     bool rangePending;
3352     int tok;
3353 #endif
3354     int prevCh = yyCh;
3355
3356     yyPos0 = yyPos - 1;
3357 #ifndef QT_NO_REGEXP_CCLASS
3358     yyCharClass->clear();
3359 #endif
3360     yyMinRep = 0;
3361     yyMaxRep = 0;
3362     yyCh = getChar();
3363
3364     switch (prevCh) {
3365     case EOS:
3366         yyPos0 = yyPos;
3367         return Tok_Eos;
3368     case '$':
3369         return Tok_Dollar;
3370     case '(':
3371         if (yyCh == '?') {
3372             prevCh = getChar();
3373             yyCh = getChar();
3374             switch (prevCh) {
3375 #ifndef QT_NO_REGEXP_LOOKAHEAD
3376             case '!':
3377                 return Tok_NegLookahead;
3378             case '=':
3379                 return Tok_PosLookahead;
3380 #endif
3381             case ':':
3382                 return Tok_MagicLeftParen;
3383             case '<':
3384                 error(RXERR_LOOKBEHIND);
3385                 return Tok_MagicLeftParen;
3386             default:
3387                 error(RXERR_LOOKAHEAD);
3388                 return Tok_MagicLeftParen;
3389             }
3390         } else {
3391             return Tok_LeftParen;
3392         }
3393     case ')':
3394         return Tok_RightParen;
3395     case '*':
3396         yyMinRep = 0;
3397         yyMaxRep = InftyRep;
3398         return Tok_Quantifier;
3399     case '+':
3400         yyMinRep = 1;
3401         yyMaxRep = InftyRep;
3402         return Tok_Quantifier;
3403     case '.':
3404 #ifndef QT_NO_REGEXP_CCLASS
3405         yyCharClass->setNegative(true);
3406 #endif
3407         return Tok_CharClass;
3408     case '?':
3409         yyMinRep = 0;
3410         yyMaxRep = 1;
3411         return Tok_Quantifier;
3412     case '[':
3413 #ifndef QT_NO_REGEXP_CCLASS
3414         if (yyCh == '^') {
3415             yyCharClass->setNegative(true);
3416             yyCh = getChar();
3417         }
3418         charPending = false;
3419         rangePending = false;
3420         do {
3421             if (yyCh == '-' && charPending && !rangePending) {
3422                 rangePending = true;
3423                 yyCh = getChar();
3424             } else {
3425                 if (charPending && !rangePending) {
3426                     yyCharClass->addSingleton(pendingCh);
3427                     charPending = false;
3428                 }
3429                 if (yyCh == '\\') {
3430                     yyCh = getChar();
3431                     tok = getEscape();
3432                     if (tok == Tok_Word)
3433                         tok = '\b';
3434                 } else {
3435                     tok = Tok_Char | yyCh;
3436                     yyCh = getChar();
3437                 }
3438                 if (tok == Tok_CharClass) {
3439                     if (rangePending) {
3440                         yyCharClass->addSingleton('-');
3441                         yyCharClass->addSingleton(pendingCh);
3442                         charPending = false;
3443                         rangePending = false;
3444                     }
3445                 } else if ((tok & Tok_Char) != 0) {
3446                     if (rangePending) {
3447                         yyCharClass->addRange(pendingCh, tok ^ Tok_Char);
3448                         charPending = false;
3449                         rangePending = false;
3450                     } else {
3451                         pendingCh = tok ^ Tok_Char;
3452                         charPending = true;
3453                     }
3454                 } else {
3455                     error(RXERR_CHARCLASS);
3456                 }
3457             }
3458         }  while (yyCh != ']' && yyCh != EOS);
3459         if (rangePending)
3460             yyCharClass->addSingleton('-');
3461         if (charPending)
3462             yyCharClass->addSingleton(pendingCh);
3463         if (yyCh == EOS)
3464             error(RXERR_END);
3465         else
3466             yyCh = getChar();
3467         return Tok_CharClass;
3468 #else
3469         error(RXERR_END);
3470         return Tok_Char | '[';
3471 #endif
3472     case '\\':
3473         return getEscape();
3474     case ']':
3475         error(RXERR_LEFTDELIM);
3476         return Tok_Char | ']';
3477     case '^':
3478         return Tok_Caret;
3479     case '{':
3480 #ifndef QT_NO_REGEXP_INTERVAL
3481         yyMinRep = getRep(0);
3482         yyMaxRep = yyMinRep;
3483         if (yyCh == ',') {
3484             yyCh = getChar();
3485             yyMaxRep = getRep(InftyRep);
3486         }
3487         if (yyMaxRep < yyMinRep)
3488             error(RXERR_INTERVAL);
3489         if (yyCh != '}')
3490             error(RXERR_REPETITION);
3491         yyCh = getChar();
3492         return Tok_Quantifier;
3493 #else
3494         error(RXERR_DISABLED);
3495         return Tok_Char | '{';
3496 #endif
3497     case '|':
3498         return Tok_Bar;
3499     case '}':
3500         error(RXERR_LEFTDELIM);
3501         return Tok_Char | '}';
3502     default:
3503         return Tok_Char | prevCh;
3504     }
3505 }
3506
3507 int QRegExpEngine::parse(const QChar *pattern, int len)
3508 {
3509     valid = true;
3510     startTokenizer(pattern, len);
3511     yyTok = getToken();
3512 #ifndef QT_NO_REGEXP_CAPTURE
3513     yyMayCapture = true;
3514 #else
3515     yyMayCapture = false;
3516 #endif
3517
3518 #ifndef QT_NO_REGEXP_CAPTURE
3519     int atom = startAtom(false);
3520 #endif
3521     QRegExpCharClass anything;
3522     Box box(this); // create InitialState
3523     box.set(anything);
3524     Box rightBox(this); // create FinalState
3525     rightBox.set(anything);
3526
3527     Box middleBox(this);
3528     parseExpression(&middleBox);
3529 #ifndef QT_NO_REGEXP_CAPTURE
3530     finishAtom(atom, false);
3531 #endif
3532 #ifndef QT_NO_REGEXP_OPTIM
3533     middleBox.setupHeuristics();
3534 #endif
3535     box.cat(middleBox);
3536     box.cat(rightBox);
3537     yyCharClass.reset(0);
3538
3539 #ifndef QT_NO_REGEXP_CAPTURE
3540     for (int i = 0; i < nf; ++i) {
3541         switch (f[i].capture) {
3542         case QRegExpAtom::NoCapture:
3543             break;
3544         case QRegExpAtom::OfficialCapture:
3545             f[i].capture = ncap;
3546             captureForOfficialCapture.append(ncap);
3547             ++ncap;
3548             ++officialncap;
3549             break;
3550         case QRegExpAtom::UnofficialCapture:
3551             f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;
3552         }
3553     }
3554
3555 #ifndef QT_NO_REGEXP_BACKREF
3556 #ifndef QT_NO_REGEXP_OPTIM
3557     if (officialncap == 0 && nbrefs == 0) {
3558         ncap = nf = 0;
3559         f.clear();
3560     }
3561 #endif
3562     // handle the case where there's a \5 with no corresponding capture
3563     // (captureForOfficialCapture.size() != officialncap)
3564     for (int i = 0; i < nbrefs - officialncap; ++i) {
3565         captureForOfficialCapture.append(ncap);
3566         ++ncap;
3567     }
3568 #endif
3569 #endif
3570
3571     if (!yyError.isEmpty())
3572         return -1;
3573
3574 #ifndef QT_NO_REGEXP_OPTIM
3575     const QRegExpAutomatonState &sinit = s.at(InitialState);
3576     caretAnchored = !sinit.anchors.isEmpty();
3577     if (caretAnchored) {
3578         const QMap<int, int> &anchors = sinit.anchors;
3579         QMap<int, int>::const_iterator a;
3580         for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {
3581             if (
3582 #ifndef QT_NO_REGEXP_ANCHOR_ALT
3583                 (*a & Anchor_Alternation) != 0 ||
3584 #endif
3585                 (*a & Anchor_Caret) == 0)
3586             {
3587                 caretAnchored = false;
3588                 break;
3589             }
3590         }
3591     }
3592 #endif
3593
3594     // cleanup anchors
3595     int numStates = s.count();
3596     for (int i = 0; i < numStates; ++i) {
3597         QRegExpAutomatonState &state = s[i];
3598         if (!state.anchors.isEmpty()) {
3599             QMap<int, int>::iterator a = state.anchors.begin();
3600             while (a != state.anchors.end()) {
3601                 if (a.value() == 0)
3602                     a = state.anchors.erase(a);
3603                 else
3604                     ++a;
3605             }
3606         }
3607     }
3608
3609     return yyPos0;
3610 }
3611
3612 void QRegExpEngine::parseAtom(Box *box)
3613 {
3614 #ifndef QT_NO_REGEXP_LOOKAHEAD
3615     QRegExpEngine *eng = 0;
3616     bool neg;
3617     int len;
3618 #endif
3619
3620     if ((yyTok & Tok_Char) != 0) {
3621         box->set(QChar(yyTok ^ Tok_Char));
3622     } else {
3623 #ifndef QT_NO_REGEXP_OPTIM
3624         trivial = false;
3625 #endif
3626         switch (yyTok) {
3627         case Tok_Dollar:
3628             box->catAnchor(Anchor_Dollar);
3629             break;
3630         case Tok_Caret:
3631             box->catAnchor(Anchor_Caret);
3632             break;
3633 #ifndef QT_NO_REGEXP_LOOKAHEAD
3634         case Tok_PosLookahead:
3635         case Tok_NegLookahead:
3636             neg = (yyTok == Tok_NegLookahead);
3637             eng = new QRegExpEngine(cs, greedyQuantifiers);
3638             len = eng->parse(yyIn + yyPos - 1, yyLen - yyPos + 1);
3639             if (len >= 0)
3640                 skipChars(len);
3641             else
3642                 error(RXERR_LOOKAHEAD);
3643             box->catAnchor(addLookahead(eng, neg));
3644             yyTok = getToken();
3645             if (yyTok != Tok_RightParen)
3646                 error(RXERR_LOOKAHEAD);
3647             break;
3648 #endif
3649 #ifndef QT_NO_REGEXP_ESCAPE
3650         case Tok_Word:
3651             box->catAnchor(Anchor_Word);
3652             break;
3653         case Tok_NonWord:
3654             box->catAnchor(Anchor_NonWord);
3655             break;
3656 #endif
3657         case Tok_LeftParen:
3658         case Tok_MagicLeftParen:
3659             yyTok = getToken();
3660             parseExpression(box);
3661             if (yyTok != Tok_RightParen)
3662                 error(RXERR_END);
3663             break;
3664         case Tok_CharClass:
3665             box->set(*yyCharClass);
3666             break;
3667         case Tok_Quantifier:
3668             error(RXERR_REPETITION);
3669             break;
3670         default:
3671 #ifndef QT_NO_REGEXP_BACKREF
3672             if ((yyTok & Tok_BackRef) != 0)
3673                 box->set(yyTok ^ Tok_BackRef);
3674             else
3675 #endif
3676                 error(RXERR_DISABLED);
3677         }
3678     }
3679     yyTok = getToken();
3680 }
3681
3682 void QRegExpEngine::parseFactor(Box *box)
3683 {
3684 #ifndef QT_NO_REGEXP_CAPTURE
3685     int outerAtom = greedyQuantifiers ? startAtom(false) : -1;
3686     int innerAtom = startAtom(yyMayCapture && yyTok == Tok_LeftParen);
3687     bool magicLeftParen = (yyTok == Tok_MagicLeftParen);
3688 #else
3689     const int innerAtom = -1;
3690 #endif
3691
3692 #ifndef QT_NO_REGEXP_INTERVAL
3693 #define YYREDO() \
3694         yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3695         *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3696
3697     const QChar *in = yyIn;
3698     int pos0 = yyPos0;
3699     int pos = yyPos;
3700     int len = yyLen;
3701     int ch = yyCh;
3702     QRegExpCharClass charClass;
3703     if (yyTok == Tok_CharClass)
3704         charClass = *yyCharClass;
3705     int tok = yyTok;
3706     bool mayCapture = yyMayCapture;
3707 #endif
3708
3709     parseAtom(box);
3710 #ifndef QT_NO_REGEXP_CAPTURE
3711     finishAtom(innerAtom, magicLeftParen);
3712 #endif
3713
3714     bool hasQuantifier = (yyTok == Tok_Quantifier);
3715     if (hasQuantifier) {
3716 #ifndef QT_NO_REGEXP_OPTIM
3717         trivial = false;
3718 #endif
3719         if (yyMaxRep == InftyRep) {
3720             box->plus(innerAtom);
3721 #ifndef QT_NO_REGEXP_INTERVAL
3722         } else if (yyMaxRep == 0) {
3723             box->clear();
3724 #endif
3725         }
3726         if (yyMinRep == 0)
3727             box->opt();
3728
3729 #ifndef QT_NO_REGEXP_INTERVAL
3730         yyMayCapture = false;
3731         int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1;
3732         int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1);
3733
3734         Box rightBox(this);
3735         int i;
3736
3737         for (i = 0; i < beta; i++) {
3738             YYREDO();
3739             Box leftBox(this);
3740             parseAtom(&leftBox);
3741             leftBox.cat(rightBox);
3742             leftBox.opt();
3743             rightBox = leftBox;
3744         }
3745         for (i = 0; i < alpha; i++) {
3746             YYREDO();
3747             Box leftBox(this);
3748             parseAtom(&leftBox);
3749             leftBox.cat(rightBox);
3750             rightBox = leftBox;
3751         }
3752         rightBox.cat(*box);
3753         *box = rightBox;
3754 #endif
3755         yyTok = getToken();
3756 #ifndef QT_NO_REGEXP_INTERVAL
3757         yyMayCapture = mayCapture;
3758 #endif
3759     }
3760 #undef YYREDO
3761 #ifndef QT_NO_REGEXP_CAPTURE
3762     if (greedyQuantifiers)
3763         finishAtom(outerAtom, hasQuantifier);
3764 #endif
3765 }
3766
3767 void QRegExpEngine::parseTerm(Box *box)
3768 {
3769 #ifndef QT_NO_REGEXP_OPTIM
3770     if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)
3771         parseFactor(box);
3772 #endif
3773     while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {
3774         Box rightBox(this);
3775         parseFactor(&rightBox);
3776         box->cat(rightBox);
3777     }
3778 }
3779
3780 void QRegExpEngine::parseExpression(Box *box)
3781 {
3782     parseTerm(box);
3783     while (yyTok == Tok_Bar) {
3784 #ifndef QT_NO_REGEXP_OPTIM
3785         trivial = false;
3786 #endif
3787         Box rightBox(this);
3788         yyTok = getToken();
3789         parseTerm(&rightBox);
3790         box->orx(rightBox);
3791     }
3792 }
3793
3794 /*
3795   The struct QRegExpPrivate contains the private data of a regular
3796   expression other than the automaton. It makes it possible for many
3797   QRegExp objects to use the same QRegExpEngine object with different
3798   QRegExpPrivate objects.
3799 */
3800 struct QRegExpPrivate
3801 {
3802     QRegExpEngine *eng;
3803     QRegExpEngineKey engineKey;
3804     bool minimal;
3805 #ifndef QT_NO_REGEXP_CAPTURE
3806     QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()
3807     QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3808 #endif
3809     QRegExpMatchState matchState;
3810
3811     inline QRegExpPrivate()
3812         : eng(0), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }
3813     inline QRegExpPrivate(const QRegExpEngineKey &key)
3814         : eng(0), engineKey(key), minimal(false) {}
3815 };
3816
3817 #if !defined(QT_NO_REGEXP_OPTIM)
3818 uint qHash(const QRegExpEngineKey &key, uint seed)
3819 {
3820     return qHash(key.pattern, seed);
3821 }
3822
3823 typedef QCache<QRegExpEngineKey, QRegExpEngine> EngineCache;
3824 Q_GLOBAL_STATIC(EngineCache, globalEngineCache)
3825 static QBasicMutex globalEngineCacheMutex;
3826 #endif // QT_NO_REGEXP_OPTIM
3827
3828 static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key)
3829 {
3830     if (!eng->ref.deref()) {
3831 #if !defined(QT_NO_REGEXP_OPTIM)
3832         if (globalEngineCache()) {
3833             QMutexLocker locker(&globalEngineCacheMutex);
3834             QT_TRY {
3835                 globalEngineCache()->insert(key, eng, 4 + key.pattern.length() / 4);
3836             } QT_CATCH(const std::bad_alloc &) {
3837                 // in case of an exception (e.g. oom), just delete the engine
3838                 delete eng;
3839             }
3840         } else {
3841             delete eng;
3842         }
3843 #else
3844         Q_UNUSED(key);
3845         delete eng;
3846 #endif
3847     }
3848 }
3849
3850 static void prepareEngine_helper(QRegExpPrivate *priv)
3851 {
3852     bool initMatchState = !priv->eng;
3853 #if !defined(QT_NO_REGEXP_OPTIM)
3854     if (!priv->eng && globalEngineCache()) {
3855         QMutexLocker locker(&globalEngineCacheMutex);
3856         priv->eng = globalEngineCache()->take(priv->engineKey);
3857         if (priv->eng != 0)
3858             priv->eng->ref.ref();
3859     }
3860 #endif // QT_NO_REGEXP_OPTIM
3861
3862     if (!priv->eng)
3863         priv->eng = new QRegExpEngine(priv->engineKey);
3864
3865     if (initMatchState)
3866         priv->matchState.prepareForMatch(priv->eng);
3867 }
3868
3869 inline static void prepareEngine(QRegExpPrivate *priv)
3870 {
3871     if (priv->eng)
3872         return;
3873     prepareEngine_helper(priv);
3874 }
3875
3876 static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str)
3877 {
3878     prepareEngine(priv);
3879     priv->matchState.prepareForMatch(priv->eng);
3880 #ifndef QT_NO_REGEXP_CAPTURE
3881     priv->t = str;
3882     priv->capturedCache.clear();
3883 #else
3884     Q_UNUSED(str);
3885 #endif
3886 }
3887
3888 static void invalidateEngine(QRegExpPrivate *priv)
3889 {
3890     if (priv->eng != 0) {
3891         derefEngine(priv->eng, priv->engineKey);
3892         priv->eng = 0;
3893         priv->matchState.drain();
3894     }
3895 }
3896
3897 /*!
3898     \enum QRegExp::CaretMode
3899
3900     The CaretMode enum defines the different meanings of the caret
3901     (\b{^}) in a regular expression. The possible values are:
3902
3903     \value CaretAtZero
3904            The caret corresponds to index 0 in the searched string.
3905
3906     \value CaretAtOffset
3907            The caret corresponds to the start offset of the search.
3908
3909     \value CaretWontMatch
3910            The caret never matches.
3911 */
3912
3913 /*!
3914     \enum QRegExp::PatternSyntax
3915
3916     The syntax used to interpret the meaning of the pattern.
3917
3918     \value RegExp A rich Perl-like pattern matching syntax. This is
3919     the default.
3920
3921     \value RegExp2 Like RegExp, but with \l{greedy quantifiers}.
3922     (Introduced in Qt 4.2.)
3923
3924     \value Wildcard This provides a simple pattern matching syntax
3925     similar to that used by shells (command interpreters) for "file
3926     globbing". See \l{Wildcard Matching}.
3927
3928     \value WildcardUnix This is similar to Wildcard but with the
3929     behavior of a Unix shell. The wildcard characters can be escaped
3930     with the character "\\".
3931
3932     \value FixedString The pattern is a fixed string. This is
3933     equivalent to using the RegExp pattern on a string in
3934     which all metacharacters are escaped using escape().
3935
3936     \value W3CXmlSchema11 The pattern is a regular expression as
3937     defined by the W3C XML Schema 1.1 specification.
3938
3939     \sa setPatternSyntax()
3940 */
3941
3942 /*!
3943     Constructs an empty regexp.
3944
3945     \sa isValid(), errorString()
3946 */
3947 QRegExp::QRegExp()
3948 {
3949     priv = new QRegExpPrivate;
3950     prepareEngine(priv);
3951 }
3952
3953 /*!
3954     Constructs a regular expression object for the given \a pattern
3955     string. The pattern must be given using wildcard notation if \a
3956     syntax is \l Wildcard; the default is \l RegExp. The pattern is
3957     case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
3958     greedy (maximal), but can be changed by calling
3959     setMinimal().
3960
3961     \sa setPattern(), setCaseSensitivity(), setPatternSyntax()
3962 */
3963 QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
3964 {
3965     priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs));
3966     prepareEngine(priv);
3967 }
3968
3969 /*!
3970     Constructs a regular expression as a copy of \a rx.
3971
3972     \sa operator=()
3973 */
3974 QRegExp::QRegExp(const QRegExp &rx)
3975 {
3976     priv = new QRegExpPrivate;
3977     operator=(rx);
3978 }
3979
3980 /*!
3981     Destroys the regular expression and cleans up its internal data.
3982 */
3983 QRegExp::~QRegExp()
3984 {
3985     invalidateEngine(priv);
3986     delete priv;
3987 }
3988
3989 /*!
3990     Copies the regular expression \a rx and returns a reference to the
3991     copy. The case sensitivity, wildcard, and minimal matching options
3992     are also copied.
3993 */
3994 QRegExp &QRegExp::operator=(const QRegExp &rx)
3995 {
3996     prepareEngine(rx.priv); // to allow sharing
3997     QRegExpEngine *otherEng = rx.priv->eng;
3998     if (otherEng)
3999         otherEng->ref.ref();
4000     invalidateEngine(priv);
4001     priv->eng = otherEng;
4002     priv->engineKey = rx.priv->engineKey;
4003     priv->minimal = rx.priv->minimal;
4004 #ifndef QT_NO_REGEXP_CAPTURE
4005     priv->t = rx.priv->t;
4006     priv->capturedCache = rx.priv->capturedCache;
4007 #endif
4008     if (priv->eng)
4009         priv->matchState.prepareForMatch(priv->eng);
4010     priv->matchState.captured = rx.priv->matchState.captured;
4011     return *this;
4012 }
4013
4014 /*!
4015     \fn void QRegExp::swap(QRegExp &other)
4016     \since 4.8
4017
4018     Swaps regular expression \a other with this regular
4019     expression. This operation is very fast and never fails.
4020 */
4021
4022 /*!
4023     Returns true if this regular expression is equal to \a rx;
4024     otherwise returns false.
4025
4026     Two QRegExp objects are equal if they have the same pattern
4027     strings and the same settings for case sensitivity, wildcard and
4028     minimal matching.
4029 */
4030 bool QRegExp::operator==(const QRegExp &rx) const
4031 {
4032     return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;
4033 }
4034
4035 /*!
4036     \fn bool QRegExp::operator!=(const QRegExp &rx) const
4037
4038     Returns true if this regular expression is not equal to \a rx;
4039     otherwise returns false.
4040
4041     \sa operator==()
4042 */
4043
4044 /*!
4045     Returns true if the pattern string is empty; otherwise returns
4046     false.
4047
4048     If you call exactMatch() with an empty pattern on an empty string
4049     it will return true; otherwise it returns false since it operates
4050     over the whole string. If you call indexIn() with an empty pattern
4051     on \e any string it will return the start offset (0 by default)
4052     because the empty pattern matches the 'emptiness' at the start of
4053     the string. In this case the length of the match returned by
4054     matchedLength() will be 0.
4055
4056     See QString::isEmpty().
4057 */
4058
4059 bool QRegExp::isEmpty() const
4060 {
4061     return priv->engineKey.pattern.isEmpty();
4062 }
4063
4064 /*!
4065     Returns true if the regular expression is valid; otherwise returns
4066     false. An invalid regular expression never matches.
4067
4068     The pattern \b{[a-z} is an example of an invalid pattern, since
4069     it lacks a closing square bracket.
4070
4071     Note that the validity of a regexp may also depend on the setting
4072     of the wildcard flag, for example \b{*.html} is a valid
4073     wildcard regexp but an invalid full regexp.
4074
4075     \sa errorString()
4076 */
4077 bool QRegExp::isValid() const
4078 {
4079     if (priv->engineKey.pattern.isEmpty()) {
4080         return true;
4081     } else {
4082         prepareEngine(priv);
4083         return priv->eng->isValid();
4084     }
4085 }
4086
4087 /*!
4088     Returns the pattern string of the regular expression. The pattern
4089     has either regular expression syntax or wildcard syntax, depending
4090     on patternSyntax().
4091
4092     \sa patternSyntax(), caseSensitivity()
4093 */
4094 QString QRegExp::pattern() const
4095 {
4096     return priv->engineKey.pattern;
4097 }
4098
4099 /*!
4100     Sets the pattern string to \a pattern. The case sensitivity,
4101     wildcard, and minimal matching options are not changed.
4102
4103     \sa setPatternSyntax(), setCaseSensitivity()
4104 */
4105 void QRegExp::setPattern(const QString &pattern)
4106 {
4107     if (priv->engineKey.pattern != pattern) {
4108         invalidateEngine(priv);
4109         priv->engineKey.pattern = pattern;
4110     }
4111 }
4112
4113 /*!
4114     Returns Qt::CaseSensitive if the regexp is matched case
4115     sensitively; otherwise returns Qt::CaseInsensitive.
4116
4117     \sa patternSyntax(), pattern(), isMinimal()
4118 */
4119 Qt::CaseSensitivity QRegExp::caseSensitivity() const
4120 {
4121     return priv->engineKey.cs;
4122 }
4123
4124 /*!
4125     Sets case sensitive matching to \a cs.
4126
4127     If \a cs is Qt::CaseSensitive, \b{\\.txt$} matches
4128     \c{readme.txt} but not \c{README.TXT}.
4129
4130     \sa setPatternSyntax(), setPattern(), setMinimal()
4131 */
4132 void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)
4133 {
4134     if ((bool)cs != (bool)priv->engineKey.cs) {
4135         invalidateEngine(priv);
4136         priv->engineKey.cs = cs;
4137     }
4138 }
4139
4140 /*!
4141     Returns the syntax used by the regular expression. The default is
4142     QRegExp::RegExp.
4143
4144     \sa pattern(), caseSensitivity()
4145 */
4146 QRegExp::PatternSyntax QRegExp::patternSyntax() const
4147 {
4148     return priv->engineKey.patternSyntax;
4149 }
4150
4151 /*!
4152     Sets the syntax mode for the regular expression. The default is
4153     QRegExp::RegExp.
4154
4155     Setting \a syntax to QRegExp::Wildcard enables simple shell-like
4156     \l{wildcard matching}. For example, \b{r*.txt} matches the
4157     string \c{readme.txt} in wildcard mode, but does not match
4158     \c{readme}.
4159
4160     Setting \a syntax to QRegExp::FixedString means that the pattern
4161     is interpreted as a plain string. Special characters (e.g.,
4162     backslash) don't need to be escaped then.
4163
4164     \sa setPattern(), setCaseSensitivity(), escape()
4165 */
4166 void QRegExp::setPatternSyntax(PatternSyntax syntax)
4167 {
4168     if (syntax != priv->engineKey.patternSyntax) {
4169         invalidateEngine(priv);
4170         priv->engineKey.patternSyntax = syntax;
4171     }
4172 }
4173
4174 /*!
4175     Returns true if minimal (non-greedy) matching is enabled;
4176     otherwise returns false.
4177
4178     \sa caseSensitivity(), setMinimal()
4179 */
4180 bool QRegExp::isMinimal() const
4181 {
4182     return priv->minimal;
4183 }
4184
4185 /*!
4186     Enables or disables minimal matching. If \a minimal is false,
4187     matching is greedy (maximal) which is the default.
4188
4189     For example, suppose we have the input string "We must be
4190     <b>bold</b>, very <b>bold</b>!" and the pattern
4191     \b{<b>.*</b>}. With the default greedy (maximal) matching,
4192     the match is "We must be \underline{<b>bold</b>, very
4193     <b>bold</b>}!". But with minimal (non-greedy) matching, the
4194     first match is: "We must be \underline{<b>bold</b>}, very
4195     <b>bold</b>!" and the second match is "We must be <b>bold</b>,
4196     very \underline{<b>bold</b>}!". In practice we might use the pattern
4197     \b{<b>[^<]*\</b>} instead, although this will still fail for
4198     nested tags.
4199
4200     \sa setCaseSensitivity()
4201 */
4202 void QRegExp::setMinimal(bool minimal)
4203 {
4204     priv->minimal = minimal;
4205 }
4206
4207 // ### Qt 5: make non-const
4208 /*!
4209     Returns true if \a str is matched exactly by this regular
4210     expression; otherwise returns false. You can determine how much of
4211     the string was matched by calling matchedLength().
4212
4213     For a given regexp string R, exactMatch("R") is the equivalent of
4214     indexIn("^R$") since exactMatch() effectively encloses the regexp
4215     in the start of string and end of string anchors, except that it
4216     sets matchedLength() differently.
4217
4218     For example, if the regular expression is \b{blue}, then
4219     exactMatch() returns true only for input \c blue. For inputs \c
4220     bluebell, \c blutak and \c lightblue, exactMatch() returns false
4221     and matchedLength() will return 4, 3 and 0 respectively.
4222
4223     Although const, this function sets matchedLength(),
4224     capturedTexts(), and pos().
4225
4226     \sa indexIn(), lastIndexIn()
4227 */
4228 bool QRegExp::exactMatch(const QString &str) const
4229 {
4230     prepareEngineForMatch(priv, str);
4231     priv->matchState.match(str.unicode(), str.length(), 0, priv->minimal, true, 0);
4232     if (priv->matchState.captured[1] == str.length()) {
4233         return true;
4234     } else {
4235         priv->matchState.captured[0] = 0;
4236         priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen;
4237         return false;
4238     }
4239 }
4240
4241 // ### Qt 5: make non-const
4242 /*!
4243     Attempts to find a match in \a str from position \a offset (0 by
4244     default). If \a offset is -1, the search starts at the last
4245     character; if -2, at the next to last character; etc.
4246
4247     Returns the position of the first match, or -1 if there was no
4248     match.
4249
4250     The \a caretMode parameter can be used to instruct whether \b{^}
4251     should match at index 0 or at \a offset.
4252
4253     You might prefer to use QString::indexOf(), QString::contains(),
4254     or even QStringList::filter(). To replace matches use
4255     QString::replace().
4256
4257     Example:
4258     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 13
4259
4260     Although const, this function sets matchedLength(),
4261     capturedTexts() and pos().
4262
4263     If the QRegExp is a wildcard expression (see setPatternSyntax())
4264     and want to test a string against the whole wildcard expression,
4265     use exactMatch() instead of this function.
4266
4267     \sa lastIndexIn(), exactMatch()
4268 */
4269
4270 int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const
4271 {
4272     prepareEngineForMatch(priv, str);
4273     if (offset < 0)
4274         offset += str.length();
4275     priv->matchState.match(str.unicode(), str.length(), offset,
4276         priv->minimal, false, caretIndex(offset, caretMode));
4277     return priv->matchState.captured[0];
4278 }
4279
4280 // ### Qt 5: make non-const
4281 /*!
4282     Attempts to find a match backwards in \a str from position \a
4283     offset. If \a offset is -1 (the default), the search starts at the
4284     last character; if -2, at the next to last character; etc.
4285
4286     Returns the position of the first match, or -1 if there was no
4287     match.
4288
4289     The \a caretMode parameter can be used to instruct whether \b{^}
4290     should match at index 0 or at \a offset.
4291
4292     Although const, this function sets matchedLength(),
4293     capturedTexts() and pos().
4294
4295     \warning Searching backwards is much slower than searching
4296     forwards.
4297
4298     \sa indexIn(), exactMatch()
4299 */
4300
4301 int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const
4302 {
4303     prepareEngineForMatch(priv, str);
4304     if (offset < 0)
4305         offset += str.length();
4306     if (offset < 0 || offset > str.length()) {
4307         memset(priv->matchState.captured, -1, priv->matchState.capturedSize*sizeof(int));
4308         return -1;
4309     }
4310
4311     while (offset >= 0) {
4312         priv->matchState.match(str.unicode(), str.length(), offset,
4313             priv->minimal, true, caretIndex(offset, caretMode));
4314         if (priv->matchState.captured[0] == offset)
4315             return offset;
4316         --offset;
4317     }
4318     return -1;
4319 }
4320
4321 /*!
4322     Returns the length of the last matched string, or -1 if there was
4323     no match.
4324
4325     \sa exactMatch(), indexIn(), lastIndexIn()
4326 */
4327 int QRegExp::matchedLength() const
4328 {
4329     return priv->matchState.captured[1];
4330 }
4331
4332 #ifndef QT_NO_REGEXP_CAPTURE
4333
4334 /*!
4335   \fn int QRegExp::numCaptures() const
4336   \obsolete
4337   Returns the number of captures contained in the regular expression.
4338
4339   \sa captureCount()
4340  */
4341
4342 /*!
4343   \since 4.6
4344   Returns the number of captures contained in the regular expression.
4345  */
4346 int QRegExp::captureCount() const
4347 {
4348     prepareEngine(priv);
4349     return priv->eng->captureCount();
4350 }
4351
4352 /*!
4353     Returns a list of the captured text strings.
4354
4355     The first string in the list is the entire matched string. Each
4356     subsequent list element contains a string that matched a
4357     (capturing) subexpression of the regexp.
4358
4359     For example:
4360     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 14
4361
4362     The above example also captures elements that may be present but
4363     which we have no interest in. This problem can be solved by using
4364     non-capturing parentheses:
4365
4366     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 15
4367
4368     Note that if you want to iterate over the list, you should iterate
4369     over a copy, e.g.
4370     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 16
4371
4372     Some regexps can match an indeterminate number of times. For
4373     example if the input string is "Offsets: 12 14 99 231 7" and the
4374     regexp, \c{rx}, is \b{(\\d+)+}, we would hope to get a list of
4375     all the numbers matched. However, after calling
4376     \c{rx.indexIn(str)}, capturedTexts() will return the list ("12",
4377     "12"), i.e. the entire match was "12" and the first subexpression
4378     matched was "12". The correct approach is to use cap() in a
4379     \l{QRegExp#cap_in_a_loop}{loop}.
4380
4381     The order of elements in the string list is as follows. The first
4382     element is the entire matching string. Each subsequent element
4383     corresponds to the next capturing open left parentheses. Thus
4384     capturedTexts()[1] is the text of the first capturing parentheses,
4385     capturedTexts()[2] is the text of the second and so on
4386     (corresponding to $1, $2, etc., in some other regexp languages).
4387
4388     \sa cap(), pos()
4389 */
4390 QStringList QRegExp::capturedTexts() const
4391 {
4392     if (priv->capturedCache.isEmpty()) {
4393         prepareEngine(priv);
4394         const int *captured = priv->matchState.captured;
4395         int n = priv->matchState.capturedSize;
4396
4397         for (int i = 0; i < n; i += 2) {
4398             QString m;
4399             if (captured[i + 1] == 0)
4400                 m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty
4401             else if (captured[i] >= 0)
4402                 m = priv->t.mid(captured[i], captured[i + 1]);
4403             priv->capturedCache.append(m);
4404         }
4405         priv->t.clear();
4406     }
4407     return priv->capturedCache;
4408 }
4409
4410 /*!
4411     \internal
4412 */
4413 QStringList QRegExp::capturedTexts()
4414 {
4415     return const_cast<const QRegExp *>(this)->capturedTexts();
4416 }
4417
4418 /*!
4419     Returns the text captured by the \a nth subexpression. The entire
4420     match has index 0 and the parenthesized subexpressions have
4421     indexes starting from 1 (excluding non-capturing parentheses).
4422
4423     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 17
4424
4425     The order of elements matched by cap() is as follows. The first
4426     element, cap(0), is the entire matching string. Each subsequent
4427     element corresponds to the next capturing open left parentheses.
4428     Thus cap(1) is the text of the first capturing parentheses, cap(2)
4429     is the text of the second, and so on.
4430
4431     \sa capturedTexts(), pos()
4432 */
4433 QString QRegExp::cap(int nth) const
4434 {
4435     return capturedTexts().value(nth);
4436 }
4437
4438 /*!
4439     \internal
4440 */
4441 QString QRegExp::cap(int nth)
4442 {
4443     return const_cast<const QRegExp *>(this)->cap(nth);
4444 }
4445
4446 /*!
4447     Returns the position of the \a nth captured text in the searched
4448     string. If \a nth is 0 (the default), pos() returns the position
4449     of the whole match.
4450
4451     Example:
4452     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 18
4453
4454     For zero-length matches, pos() always returns -1. (For example, if
4455     cap(4) would return an empty string, pos(4) returns -1.) This is
4456     a feature of the implementation.
4457
4458     \sa cap(), capturedTexts()
4459 */
4460 int QRegExp::pos(int nth) const
4461 {
4462     if (nth < 0 || nth >= priv->matchState.capturedSize / 2)
4463         return -1;
4464     else
4465         return priv->matchState.captured[2 * nth];
4466 }
4467
4468 /*!
4469     \internal
4470 */
4471 int QRegExp::pos(int nth)
4472 {
4473     return const_cast<const QRegExp *>(this)->pos(nth);
4474 }
4475
4476 /*!
4477   Returns a text string that explains why a regexp pattern is
4478   invalid the case being; otherwise returns "no error occurred".
4479
4480   \sa isValid()
4481 */
4482 QString QRegExp::errorString() const
4483 {
4484     if (isValid()) {
4485         return QString::fromLatin1(RXERR_OK);
4486     } else {
4487         return priv->eng->errorString();
4488     }
4489 }
4490
4491 /*!
4492     \internal
4493 */
4494 QString QRegExp::errorString()
4495 {
4496     return const_cast<const QRegExp *>(this)->errorString();
4497 }
4498 #endif
4499
4500 /*!
4501     Returns the string \a str with every regexp special character
4502     escaped with a backslash. The special characters are $, (,), *, +,
4503     ., ?, [, \,], ^, {, | and }.
4504
4505     Example:
4506
4507     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 19
4508
4509     This function is useful to construct regexp patterns dynamically:
4510
4511     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 20
4512
4513     \sa setPatternSyntax()
4514 */
4515 QString QRegExp::escape(const QString &str)
4516 {
4517     QString quoted;
4518     const int count = str.count();
4519     quoted.reserve(count * 2);
4520     const QLatin1Char backslash('\\');
4521     for (int i = 0; i < count; i++) {
4522         switch (str.at(i).toLatin1()) {
4523         case '$':
4524         case '(':
4525         case ')':
4526         case '*':
4527         case '+':
4528         case '.':
4529         case '?':
4530         case '[':
4531         case '\\':
4532         case ']':
4533         case '^':
4534         case '{':
4535         case '|':
4536         case '}':
4537             quoted.append(backslash);
4538         }
4539         quoted.append(str.at(i));
4540     }
4541     return quoted;
4542 }
4543
4544
4545 #ifndef QT_NO_DATASTREAM
4546 /*!
4547     \relates QRegExp
4548
4549     Writes the regular expression \a regExp to stream \a out.
4550
4551     \sa {Serializing Qt Data Types}
4552 */
4553 QDataStream &operator<<(QDataStream &out, const QRegExp &regExp)
4554 {
4555     return out << regExp.pattern() << (quint8)regExp.caseSensitivity()
4556                << (quint8)regExp.patternSyntax()
4557                << (quint8)!!regExp.isMinimal();
4558 }
4559
4560 /*!
4561     \relates QRegExp
4562
4563     Reads a regular expression from stream \a in into \a regExp.
4564
4565     \sa {Serializing Qt Data Types}
4566 */
4567 QDataStream &operator>>(QDataStream &in, QRegExp &regExp)
4568 {
4569     QString pattern;
4570     quint8 cs;
4571     quint8 patternSyntax;
4572     quint8 isMinimal;
4573
4574     in >> pattern >> cs >> patternSyntax >> isMinimal;
4575
4576     QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),
4577                       QRegExp::PatternSyntax(patternSyntax));
4578
4579     newRegExp.setMinimal(isMinimal);
4580     regExp = newRegExp;
4581     return in;
4582 }
4583 #endif // QT_NO_DATASTREAM
4584
4585 #ifndef QT_NO_DEBUG_STREAM
4586 QDebug operator<<(QDebug dbg, const QRegExp &r)
4587 {
4588     dbg.nospace() << "QRegExp(patternSyntax=" << r.patternSyntax()
4589                   << ", pattern='"<< r.pattern() << "')";
4590     return dbg.space();
4591 }
4592 #endif
4593
4594 QT_END_NAMESPACE