src/corelib/tools/qregexp.cpp

   1 /****************************************************************************
   2 **
   3 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
   4 ** Contact: http://www.qt-project.org/
   5 **
   6 ** This file is part of the QtCore module of the Qt Toolkit.
   7 **
   8 ** $QT_BEGIN_LICENSE:LGPL$
   9 ** GNU Lesser General Public License Usage
  10 ** This file may be used under the terms of the GNU Lesser General Public
  11 ** License version 2.1 as published by the Free Software Foundation and
  12 ** appearing in the file LICENSE.LGPL included in the packaging of this
  13 ** file. Please review the following information to ensure the GNU Lesser
  14 ** General Public License version 2.1 requirements will be met:
  15 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
  16 **
  17 ** In addition, as a special exception, Nokia gives you certain additional
  18 ** rights. These rights are described in the Nokia Qt LGPL Exception
  19 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
  20 **
  21 ** GNU General Public License Usage
  22 ** Alternatively, this file may be used under the terms of the GNU General
  23 ** Public License version 3.0 as published by the Free Software Foundation
  24 ** and appearing in the file LICENSE.GPL included in the packaging of this
  25 ** file. Please review the following information to ensure the GNU General
  26 ** Public License version 3.0 requirements will be met:
  27 ** http://www.gnu.org/copyleft/gpl.html.
  28 **
  29 ** Other Usage
  30 ** Alternatively, this file may be used in accordance with the terms and
  31 ** conditions contained in a signed written agreement between you and Nokia.
  32 **
  33 **
  34 **
  35 **
  36 **
  37 **
  38 ** $QT_END_LICENSE$
  39 **
  40 ****************************************************************************/
  41
  42 #include "qregexp.h"
  43
  44 #include "qalgorithms.h"
  45 #include "qbitarray.h"
  46 #include "qcache.h"
  47 #include "qdatastream.h"
  48 #include "qdebug.h"
  49 #include "qlist.h"
  50 #include "qmap.h"
  51 #include "qmutex.h"
  52 #include "qstring.h"
  53 #include "qstringlist.h"
  54 #include "qstringmatcher.h"
  55 #include "qvector.h"
  56 #include "private/qfunctions_p.h"
  57
  58 #include <limits.h>
  59
  60 QT_BEGIN_NAMESPACE
  61
  62 int qFindString(const QChar *haystack, int haystackLen, int from,
  63     const QChar *needle, int needleLen, Qt::CaseSensitivity cs);
  64
  65 // error strings for the regexp parser
  66 #define RXERR_OK         QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
  67 #define RXERR_DISABLED   QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
  68 #define RXERR_CHARCLASS  QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
  69 #define RXERR_LOOKAHEAD  QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
  70 #define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371")
  71 #define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
  72 #define RXERR_OCTAL      QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
  73 #define RXERR_LEFTDELIM  QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
  74 #define RXERR_END        QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
  75 #define RXERR_LIMIT      QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
  76 #define RXERR_INTERVAL   QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
  77 #define RXERR_CATEGORY   QT_TRANSLATE_NOOP("QRegExp", "invalid category")
  78
  79 /*!
  80     \class QRegExp
  81     \reentrant
  82     \brief The QRegExp class provides pattern matching using regular expressions.
  83
  84     \ingroup tools
  85     \ingroup shared
  86
  87     \keyword regular expression
  88
  89     A regular expression, or "regexp", is a pattern for matching
  90     substrings in a text. This is useful in many contexts, e.g.,
  91
  92     \table
  93     \row \i Validation
  94          \i A regexp can test whether a substring meets some criteria,
  95          e.g. is an integer or contains no whitespace.
  96     \row \i Searching
  97          \i A regexp provides more powerful pattern matching than
  98          simple substring matching, e.g., match one of the words
  99          \e{mail}, \e{letter} or \e{correspondence}, but none of the
 100          words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
 101      \row \i Search and Replace
 102          \i A regexp can replace all occurrences of a substring with a
 103          different substring, e.g., replace all occurrences of \e{&}
 104          with \e{\&amp;} except where the \e{&} is already followed by
 105          an \e{amp;}.
 106     \row \i String Splitting
 107          \i A regexp can be used to identify where a string should be
 108          split apart, e.g. splitting tab-delimited strings.
 109     \endtable
 110
 111     A brief introduction to regexps is presented, a description of
 112     Qt's regexp language, some examples, and the function
 113     documentation itself. QRegExp is modeled on Perl's regexp
 114     language. It fully supports Unicode. QRegExp can also be used in a
 115     simpler, \e{wildcard mode} that is similar to the functionality
 116     found in command shells. The syntax rules used by QRegExp can be
 117     changed with setPatternSyntax(). In particular, the pattern syntax
 118     can be set to QRegExp::FixedString, which means the pattern to be
 119     matched is interpreted as a plain string, i.e., special characters
 120     (e.g., backslash) are not escaped.
 121
 122     A good text on regexps is \e {Mastering Regular Expressions}
 123     (Third Edition) by Jeffrey E. F.  Friedl, ISBN 0-596-52812-4.
 124
 125     \tableofcontents
 126
 127     \section1 Introduction
 128
 129     Regexps are built up from expressions, quantifiers, and
 130     assertions. The simplest expression is a character, e.g. \bold{x}
 131     or \bold{5}. An expression can also be a set of characters
 132     enclosed in square brackets. \bold{[ABCD]} will match an \bold{A}
 133     or a \bold{B} or a \bold{C} or a \bold{D}. We can write this same
 134     expression as \bold{[A-D]}, and an experession to match any
 135     captital letter in the English alphabet is written as
 136     \bold{[A-Z]}.
 137
 138     A quantifier specifies the number of occurrences of an expression
 139     that must be matched. \bold{x{1,1}} means match one and only one
 140     \bold{x}. \bold{x{1,5}} means match a sequence of \bold{x}
 141     characters that contains at least one \bold{x} but no more than
 142     five.
 143
 144     Note that in general regexps cannot be used to check for balanced
 145     brackets or tags. For example, a regexp can be written to match an
 146     opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
 147     are not nested, but if the \c{<b>} tags are nested, that same
 148     regexp will match an opening \c{<b>} tag with the wrong closing
 149     \c{</b>}.  For the fragment \c{<b>bold <b>bolder</b></b>}, the
 150     first \c{<b>} would be matched with the first \c{</b>}, which is
 151     not correct. However, it is possible to write a regexp that will
 152     match nested brackets or tags correctly, but only if the number of
 153     nesting levels is fixed and known. If the number of nesting levels
 154     is not fixed and known, it is impossible to write a regexp that
 155     will not fail.
 156
 157     Suppose we want a regexp to match integers in the range 0 to 99.
 158     At least one digit is required, so we start with the expression
 159     \bold{[0-9]{1,1}}, which matches a single digit exactly once. This
 160     regexp matches integers in the range 0 to 9. To match integers up
 161     to 99, increase the maximum number of occurrences to 2, so the
 162     regexp becomes \bold{[0-9]{1,2}}. This regexp satisfies the
 163     original requirement to match integers from 0 to 99, but it will
 164     also match integers that occur in the middle of strings. If we
 165     want the matched integer to be the whole string, we must use the
 166     anchor assertions, \bold{^} (caret) and \bold{$} (dollar). When
 167     \bold{^} is the first character in a regexp, it means the regexp
 168     must match from the beginning of the string. When \bold{$} is the
 169     last character of the regexp, it means the regexp must match to
 170     the end of the string. The regexp becomes \bold{^[0-9]{1,2}$}.
 171     Note that assertions, e.g. \bold{^} and \bold{$}, do not match
 172     characters but locations in the string.
 173
 174     If you have seen regexps described elsewhere, they may have looked
 175     different from the ones shown here. This is because some sets of
 176     characters and some quantifiers are so common that they have been
 177     given special symbols to represent them. \bold{[0-9]} can be
 178     replaced with the symbol \bold{\\d}. The quantifier to match
 179     exactly one occurrence, \bold{{1,1}}, can be replaced with the
 180     expression itself, i.e. \bold{x{1,1}} is the same as \bold{x}. So
 181     our 0 to 99 matcher could be written as \bold{^\\d{1,2}$}. It can
 182     also be written \bold{^\\d\\d{0,1}$}, i.e. \e{From the start of
 183     the string, match a digit, followed immediately by 0 or 1 digits}.
 184     In practice, it would be written as \bold{^\\d\\d?$}. The \bold{?}
 185     is shorthand for the quantifier \bold{{0,1}}, i.e. 0 or 1
 186     occurrences. \bold{?} makes an expression optional. The regexp
 187     \bold{^\\d\\d?$} means \e{From the beginning of the string, match
 188     one digit, followed immediately by 0 or 1 more digit, followed
 189     immediately by end of string}.
 190
 191     To write a regexp that matches one of the words 'mail' \e or
 192     'letter' \e or 'correspondence' but does not match words that
 193     contain these words, e.g., 'email', 'mailman', 'mailer', and
 194     'letterbox', start with a regexp that matches 'mail'. Expressed
 195     fully, the regexp is \bold{m{1,1}a{1,1}i{1,1}l{1,1}}, but because
 196     a character expression is automatically quantified by
 197     \bold{{1,1}}, we can simplify the regexp to \bold{mail}, i.e., an
 198     'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now
 199     we can use the vertical bar \bold{|}, which means \bold{or}, to
 200     include the other two words, so our regexp for matching any of the
 201     three words becomes \bold{mail|letter|correspondence}. Match
 202     'mail' \bold{or} 'letter' \bold{or} 'correspondence'. While this
 203     regexp will match one of the three words we want to match, it will
 204     also match words we don't want to match, e.g., 'email'.  To
 205     prevent the regexp from matching unwanted words, we must tell it
 206     to begin and end the match at word boundaries. First we enclose
 207     our regexp in parentheses, \bold{(mail|letter|correspondence)}.
 208     Parentheses group expressions together, and they identify a part
 209     of the regexp that we wish to \l{capturing text}{capture}.
 210     Enclosing the expression in parentheses allows us to use it as a
 211     component in more complex regexps. It also allows us to examine
 212     which of the three words was actually matched. To force the match
 213     to begin and end on word boundaries, we enclose the regexp in
 214     \bold{\\b} \e{word boundary} assertions:
 215     \bold{\\b(mail|letter|correspondence)\\b}.  Now the regexp means:
 216     \e{Match a word boundary, followed by the regexp in parentheses,
 217     followed by a word boundary}. The \bold{\\b} assertion matches a
 218     \e position in the regexp, not a \e character. A word boundary is
 219     any non-word character, e.g., a space, newline, or the beginning
 220     or ending of a string.
 221
 222     If we want to replace ampersand characters with the HTML entity
 223     \bold{\&amp;}, the regexp to match is simply \bold{\&}. But this
 224     regexp will also match ampersands that have already been converted
 225     to HTML entities. We want to replace only ampersands that are not
 226     already followed by \bold{amp;}. For this, we need the negative
 227     lookahead assertion, \bold{(?!}__\bold{)}. The regexp can then be
 228     written as \bold{\&(?!amp;)}, i.e. \e{Match an ampersand that is}
 229     \bold{not} \e{followed by} \bold{amp;}.
 230
 231     If we want to count all the occurrences of 'Eric' and 'Eirik' in a
 232     string, two valid solutions are \bold{\\b(Eric|Eirik)\\b} and
 233     \bold{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is
 234     required to avoid matching words that contain either name,
 235     e.g. 'Ericsson'. Note that the second regexp matches more
 236     spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.
 237
 238     Some of the examples discussed above are implemented in the
 239     \link #code-examples code examples \endlink section.
 240
 241     \target characters-and-abbreviations-for-sets-of-characters
 242     \section1 Characters and Abbreviations for Sets of Characters
 243
 244     \table
 245     \header \i Element \i Meaning
 246     \row \i \bold{c}
 247          \i A character represents itself unless it has a special
 248          regexp meaning. e.g. \bold{c} matches the character \e c.
 249     \row \i \bold{\\c}
 250          \i A character that follows a backslash matches the character
 251          itself, except as specified below. e.g., To match a literal
 252          caret at the beginning of a string, write \bold{\\^}.
 253     \row \i \bold{\\a}
 254          \i Matches the ASCII bell (BEL, 0x07).
 255     \row \i \bold{\\f}
 256          \i Matches the ASCII form feed (FF, 0x0C).
 257     \row \i \bold{\\n}
 258          \i Matches the ASCII line feed (LF, 0x0A, Unix newline).
 259     \row \i \bold{\\r}
 260          \i Matches the ASCII carriage return (CR, 0x0D).
 261     \row \i \bold{\\t}
 262          \i Matches the ASCII horizontal tab (HT, 0x09).
 263     \row \i \bold{\\v}
 264          \i Matches the ASCII vertical tab (VT, 0x0B).
 265     \row \i \bold{\\x\e{hhhh}}
 266          \i Matches the Unicode character corresponding to the
 267          hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).
 268     \row \i \bold{\\0\e{ooo}} (i.e., \\zero \e{ooo})
 269          \i matches the ASCII/Latin1 character for the octal number
 270          \e{ooo} (between 0 and 0377).
 271     \row \i \bold{. (dot)}
 272          \i Matches any character (including newline).
 273     \row \i \bold{\\d}
 274          \i Matches a digit (QChar::isDigit()).
 275     \row \i \bold{\\D}
 276          \i Matches a non-digit.
 277     \row \i \bold{\\s}
 278          \i Matches a whitespace character (QChar::isSpace()).
 279     \row \i \bold{\\S}
 280          \i Matches a non-whitespace character.
 281     \row \i \bold{\\w}
 282          \i Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').
 283     \row \i \bold{\\W}
 284          \i Matches a non-word character.
 285     \row \i \bold{\\\e{n}}
 286          \i The \e{n}-th \l backreference, e.g. \\1, \\2, etc.
 287     \endtable
 288
 289     \bold{Note:} The C++ compiler transforms backslashes in strings.
 290     To include a \bold{\\} in a regexp, enter it twice, i.e. \c{\\}.
 291     To match the backslash character itself, enter it four times, i.e.
 292     \c{\\\\}.
 293
 294     \target sets-of-characters
 295     \section1 Sets of Characters
 296
 297     Square brackets mean match any character contained in the square
 298     brackets. The character set abbreviations described above can
 299     appear in a character set in square brackets. Except for the
 300     character set abbreviations and the following two exceptions,
 301     characters do not have special meanings in square brackets.
 302
 303     \table
 304     \row \i \bold{^}
 305
 306          \i The caret negates the character set if it occurs as the
 307          first character (i.e. immediately after the opening square
 308          bracket). \bold{[abc]} matches 'a' or 'b' or 'c', but
 309          \bold{[^abc]} matches anything \e but 'a' or 'b' or 'c'.
 310
 311     \row \i \bold{-}
 312
 313          \i The dash indicates a range of characters. \bold{[W-Z]}
 314          matches 'W' or 'X' or 'Y' or 'Z'.
 315
 316     \endtable
 317
 318     Using the predefined character set abbreviations is more portable
 319     than using character ranges across platforms and languages. For
 320     example, \bold{[0-9]} matches a digit in Western alphabets but
 321     \bold{\\d} matches a digit in \e any alphabet.
 322
 323     Note: In other regexp documentation, sets of characters are often
 324     called "character classes".
 325
 326     \target quantifiers
 327     \section1 Quantifiers
 328
 329     By default, an expression is automatically quantified by
 330     \bold{{1,1}}, i.e. it should occur exactly once. In the following
 331     list, \bold{\e {E}} stands for expression. An expression is a
 332     character, or an abbreviation for a set of characters, or a set of
 333     characters in square brackets, or an expression in parentheses.
 334
 335     \table
 336     \row \i \bold{\e {E}?}
 337
 338          \i Matches zero or one occurrences of \e E. This quantifier
 339          means \e{The previous expression is optional}, because it
 340          will match whether or not the expression is found. \bold{\e
 341          {E}?} is the same as \bold{\e {E}{0,1}}. e.g., \bold{dents?}
 342          matches 'dent' or 'dents'.
 343
 344     \row \i \bold{\e {E}+}
 345
 346          \i Matches one or more occurrences of \e E. \bold{\e {E}+} is
 347          the same as \bold{\e {E}{1,}}. e.g., \bold{0+} matches '0',
 348          '00', '000', etc.
 349
 350     \row \i \bold{\e {E}*}
 351
 352          \i Matches zero or more occurrences of \e E. It is the same
 353          as \bold{\e {E}{0,}}. The \bold{*} quantifier is often used
 354          in error where \bold{+} should be used. For example, if
 355          \bold{\\s*$} is used in an expression to match strings that
 356          end in whitespace, it will match every string because
 357          \bold{\\s*$} means \e{Match zero or more whitespaces followed
 358          by end of string}. The correct regexp to match strings that
 359          have at least one trailing whitespace character is
 360          \bold{\\s+$}.
 361
 362     \row \i \bold{\e {E}{n}}
 363
 364          \i Matches exactly \e n occurrences of \e E. \bold{\e {E}{n}}
 365          is the same as repeating \e E \e n times. For example,
 366          \bold{x{5}} is the same as \bold{xxxxx}. It is also the same
 367          as \bold{\e {E}{n,n}}, e.g. \bold{x{5,5}}.
 368
 369     \row \i \bold{\e {E}{n,}}
 370          \i Matches at least \e n occurrences of \e E.
 371
 372     \row \i \bold{\e {E}{,m}}
 373          \i Matches at most \e m occurrences of \e E. \bold{\e {E}{,m}}
 374          is the same as \bold{\e {E}{0,m}}.
 375
 376     \row \i \bold{\e {E}{n,m}}
 377          \i Matches at least \e n and at most \e m occurrences of \e E.
 378     \endtable
 379
 380     To apply a quantifier to more than just the preceding character,
 381     use parentheses to group characters together in an expression. For
 382     example, \bold{tag+} matches a 't' followed by an 'a' followed by
 383     at least one 'g', whereas \bold{(tag)+} matches at least one
 384     occurrence of 'tag'.
 385
 386     Note: Quantifiers are normally "greedy". They always match as much
 387     text as they can. For example, \bold{0+} matches the first zero it
 388     finds and all the consecutive zeros after the first zero. Applied
 389     to '20005', it matches'2\underline{000}5'. Quantifiers can be made
 390     non-greedy, see setMinimal().
 391
 392     \target capturing parentheses
 393     \target backreferences
 394     \section1 Capturing Text
 395
 396     Parentheses allow us to group elements together so that we can
 397     quantify and capture them. For example if we have the expression
 398     \bold{mail|letter|correspondence} that matches a string we know
 399     that \e one of the words matched but not which one. Using
 400     parentheses allows us to "capture" whatever is matched within
 401     their bounds, so if we used \bold{(mail|letter|correspondence)}
 402     and matched this regexp against the string "I sent you some email"
 403     we can use the cap() or capturedTexts() functions to extract the
 404     matched characters, in this case 'mail'.
 405
 406     We can use captured text within the regexp itself. To refer to the
 407     captured text we use \e backreferences which are indexed from 1,
 408     the same as for cap(). For example we could search for duplicate
 409     words in a string using \bold{\\b(\\w+)\\W+\\1\\b} which means match a
 410     word boundary followed by one or more word characters followed by
 411     one or more non-word characters followed by the same text as the
 412     first parenthesized expression followed by a word boundary.
 413
 414     If we want to use parentheses purely for grouping and not for
 415     capturing we can use the non-capturing syntax, e.g.
 416     \bold{(?:green|blue)}. Non-capturing parentheses begin '(?:' and
 417     end ')'. In this example we match either 'green' or 'blue' but we
 418     do not capture the match so we only know whether or not we matched
 419     but not which color we actually found. Using non-capturing
 420     parentheses is more efficient than using capturing parentheses
 421     since the regexp engine has to do less book-keeping.
 422
 423     Both capturing and non-capturing parentheses may be nested.
 424
 425     \target greedy quantifiers
 426
 427     For historical reasons, quantifiers (e.g. \bold{*}) that apply to
 428     capturing parentheses are more "greedy" than other quantifiers.
 429     For example, \bold{a*(a*)} will match "aaa" with cap(1) == "aaa".
 430     This behavior is different from what other regexp engines do
 431     (notably, Perl). To obtain a more intuitive capturing behavior,
 432     specify QRegExp::RegExp2 to the QRegExp constructor or call
 433     setPatternSyntax(QRegExp::RegExp2).
 434
 435     \target cap_in_a_loop
 436
 437     When the number of matches cannot be determined in advance, a
 438     common idiom is to use cap() in a loop. For example:
 439
 440     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 0
 441
 442     \target assertions
 443     \section1 Assertions
 444
 445     Assertions make some statement about the text at the point where
 446     they occur in the regexp but they do not match any characters. In
 447     the following list \bold{\e {E}} stands for any expression.
 448
 449     \table
 450     \row \i \bold{^}
 451          \i The caret signifies the beginning of the string. If you
 452          wish to match a literal \c{^} you must escape it by
 453          writing \c{\\^}. For example, \bold{^#include} will only
 454          match strings which \e begin with the characters '#include'.
 455          (When the caret is the first character of a character set it
 456          has a special meaning, see \link #sets-of-characters Sets of
 457          Characters \endlink.)
 458
 459     \row \i \bold{$}
 460          \i The dollar signifies the end of the string. For example
 461          \bold{\\d\\s*$} will match strings which end with a digit
 462          optionally followed by whitespace. If you wish to match a
 463          literal \c{$} you must escape it by writing
 464          \c{\\$}.
 465
 466     \row \i \bold{\\b}
 467          \i A word boundary. For example the regexp
 468          \bold{\\bOK\\b} means match immediately after a word
 469          boundary (e.g. start of string or whitespace) the letter 'O'
 470          then the letter 'K' immediately before another word boundary
 471          (e.g. end of string or whitespace). But note that the
 472          assertion does not actually match any whitespace so if we
 473          write \bold{(\\bOK\\b)} and we have a match it will only
 474          contain 'OK' even if the string is "It's \underline{OK} now".
 475
 476     \row \i \bold{\\B}
 477          \i A non-word boundary. This assertion is true wherever
 478          \bold{\\b} is false. For example if we searched for
 479          \bold{\\Bon\\B} in "Left on" the match would fail (space
 480          and end of string aren't non-word boundaries), but it would
 481          match in "t\underline{on}ne".
 482
 483     \row \i \bold{(?=\e E)}
 484          \i Positive lookahead. This assertion is true if the
 485          expression matches at this point in the regexp. For example,
 486          \bold{const(?=\\s+char)} matches 'const' whenever it is
 487          followed by 'char', as in 'static \underline{const} char *'.
 488          (Compare with \bold{const\\s+char}, which matches 'static
 489          \underline{const char} *'.)
 490
 491     \row \i \bold{(?!\e E)}
 492          \i Negative lookahead. This assertion is true if the
 493          expression does not match at this point in the regexp. For
 494          example, \bold{const(?!\\s+char)} matches 'const' \e except
 495          when it is followed by 'char'.
 496     \endtable
 497
 498     \keyword QRegExp wildcard matching
 499     \section1 Wildcard Matching
 500
 501     Most command shells such as \e bash or \e cmd.exe support "file
 502     globbing", the ability to identify a group of files by using
 503     wildcards. The setPatternSyntax() function is used to switch
 504     between regexp and wildcard mode. Wildcard matching is much
 505     simpler than full regexps and has only four features:
 506
 507     \table
 508     \row \i \bold{c}
 509          \i Any character represents itself apart from those mentioned
 510          below. Thus \bold{c} matches the character \e c.
 511     \row \i \bold{?}
 512          \i Matches any single character. It is the same as
 513          \bold{.} in full regexps.
 514     \row \i \bold{*}
 515          \i Matches zero or more of any characters. It is the
 516          same as \bold{.*} in full regexps.
 517     \row \i \bold{[...]}
 518          \i Sets of characters can be represented in square brackets,
 519          similar to full regexps. Within the character class, like
 520          outside, backslash has no special meaning.
 521     \endtable
 522
 523     In the mode Wildcard, the wildcard characters cannot be
 524     escaped. In the mode WildcardUnix, the character '\\' escapes the
 525     wildcard.
 526
 527     For example if we are in wildcard mode and have strings which
 528     contain filenames we could identify HTML files with \bold{*.html}.
 529     This will match zero or more characters followed by a dot followed
 530     by 'h', 't', 'm' and 'l'.
 531
 532     To test a string against a wildcard expression, use exactMatch().
 533     For example:
 534
 535     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 1
 536
 537     \target perl-users
 538     \section1 Notes for Perl Users
 539
 540     Most of the character class abbreviations supported by Perl are
 541     supported by QRegExp, see \link
 542     #characters-and-abbreviations-for-sets-of-characters characters
 543     and abbreviations for sets of characters \endlink.
 544
 545     In QRegExp, apart from within character classes, \c{^} always
 546     signifies the start of the string, so carets must always be
 547     escaped unless used for that purpose. In Perl the meaning of caret
 548     varies automagically depending on where it occurs so escaping it
 549     is rarely necessary. The same applies to \c{$} which in
 550     QRegExp always signifies the end of the string.
 551
 552     QRegExp's quantifiers are the same as Perl's greedy quantifiers
 553     (but see the \l{greedy quantifiers}{note above}). Non-greedy
 554     matching cannot be applied to individual quantifiers, but can be
 555     applied to all the quantifiers in the pattern. For example, to
 556     match the Perl regexp \bold{ro+?m} requires:
 557
 558     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 2
 559
 560     The equivalent of Perl's \c{/i} option is
 561     setCaseSensitivity(Qt::CaseInsensitive).
 562
 563     Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.
 564
 565     In QRegExp \bold{.} matches any character, therefore all QRegExp
 566     regexps have the equivalent of Perl's \c{/s} option. QRegExp
 567     does not have an equivalent to Perl's \c{/m} option, but this
 568     can be emulated in various ways for example by splitting the input
 569     into lines or by looping with a regexp that searches for newlines.
 570
 571     Because QRegExp is string oriented, there are no \\A, \\Z, or \\z
 572     assertions. The \\G assertion is not supported but can be emulated
 573     in a loop.
 574
 575     Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
 576     equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
 577     ... correspond to cap(1) or capturedTexts()[1], cap(2) or
 578     capturedTexts()[2], etc.
 579
 580     To substitute a pattern use QString::replace().
 581
 582     Perl's extended \c{/x} syntax is not supported, nor are
 583     directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
 584     the other hand, C++'s rules for literal strings can be used to
 585     achieve the same:
 586
 587     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 3
 588
 589     Both zero-width positive and zero-width negative lookahead
 590     assertions (?=pattern) and (?!pattern) are supported with the same
 591     syntax as Perl. Perl's lookbehind assertions, "independent"
 592     subexpressions and conditional expressions are not supported.
 593
 594     Non-capturing parentheses are also supported, with the same
 595     (?:pattern) syntax.
 596
 597     See QString::split() and QStringList::join() for equivalents
 598     to Perl's split and join functions.
 599
 600     Note: because C++ transforms \\'s they must be written \e twice in
 601     code, e.g. \bold{\\b} must be written \bold{\\\\b}.
 602
 603     \target code-examples
 604     \section1 Code Examples
 605
 606     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 4
 607
 608     The third string matches '\underline{6}'. This is a simple validation
 609     regexp for integers in the range 0 to 99.
 610
 611     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 5
 612
 613     The second string matches '\underline{This_is-OK}'. We've used the
 614     character set abbreviation '\\S' (non-whitespace) and the anchors
 615     to match strings which contain no whitespace.
 616
 617     In the following example we match strings containing 'mail' or
 618     'letter' or 'correspondence' but only match whole words i.e. not
 619     'email'
 620
 621     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 6
 622
 623     The second string matches "Please write the \underline{letter}". The
 624     word 'letter' is also captured (because of the parentheses). We
 625     can see what text we've captured like this:
 626
 627     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 7
 628
 629     This will capture the text from the first set of capturing
 630     parentheses (counting capturing left parentheses from left to
 631     right). The parentheses are counted from 1 since cap(0) is the
 632     whole matched regexp (equivalent to '&' in most regexp engines).
 633
 634     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 8
 635
 636     Here we've passed the QRegExp to QString's replace() function to
 637     replace the matched text with new text.
 638
 639     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 9
 640
 641     We've used the indexIn() function to repeatedly match the regexp in
 642     the string. Note that instead of moving forward by one character
 643     at a time \c pos++ we could have written \c {pos +=
 644     rx.matchedLength()} to skip over the already matched string. The
 645     count will equal 3, matching 'One \underline{Eric} another
 646     \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it
 647     doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
 648     by non-word boundaries.
 649
 650     One common use of regexps is to split lines of delimited data into
 651     their component fields.
 652
 653     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 10
 654
 655     In this example our input lines have the format company name, web
 656     address and country. Unfortunately the regexp is rather long and
 657     not very versatile -- the code will break if we add any more
 658     fields. A simpler and better solution is to look for the
 659     separator, '\\t' in this case, and take the surrounding text. The
 660     QString::split() function can take a separator string or regexp
 661     as an argument and split a string accordingly.
 662
 663     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 11
 664
 665     Here field[0] is the company, field[1] the web address and so on.
 666
 667     To imitate the matching of a shell we can use wildcard mode.
 668
 669     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 12
 670
 671     Wildcard matching can be convenient because of its simplicity, but
 672     any wildcard regexp can be defined using full regexps, e.g.
 673     \bold{.*\\.html$}. Notice that we can't match both \c .html and \c
 674     .htm files with a wildcard unless we use \bold{*.htm*} which will
 675     also match 'test.html.bak'. A full regexp gives us the precision
 676     we need, \bold{.*\\.html?$}.
 677
 678     QRegExp can match case insensitively using setCaseSensitivity(),
 679     and can use non-greedy matching, see setMinimal(). By
 680     default QRegExp uses full regexps but this can be changed with
 681     setWildcard(). Searching can be forward with indexIn() or backward
 682     with lastIndexIn(). Captured text can be accessed using
 683     capturedTexts() which returns a string list of all captured
 684     strings, or using cap() which returns the captured string for the
 685     given index. The pos() function takes a match index and returns
 686     the position in the string where the match was made (or -1 if
 687     there was no match).
 688
 689     \sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel,
 690         {tools/regexp}{Regular Expression Example}
 691 */
 692
 693 #if defined(Q_OS_VXWORKS) && defined(EOS)
 694 #  undef EOS
 695 #endif
 696
 697 const int NumBadChars = 64;
 698 #define BadChar(ch) ((ch).unicode() % NumBadChars)
 699
 700 const int NoOccurrence = INT_MAX;
 701 const int EmptyCapture = INT_MAX;
 702 const int InftyLen = INT_MAX;
 703 const int InftyRep = 1025;
 704 const int EOS = -1;
 705
 706 static bool isWord(QChar ch)
 707 {
 708     return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_');
 709 }
 710
 711 /*
 712   Merges two vectors of ints and puts the result into the first
 713   one.
 714 */
 715 static void mergeInto(QVector<int> *a, const QVector<int> &b)
 716 {
 717     int asize = a->size();
 718     int bsize = b.size();
 719     if (asize == 0) {
 720         *a = b;
 721 #ifndef QT_NO_REGEXP_OPTIM
 722     } else if (bsize == 1 && a->at(asize - 1) < b.at(0)) {
 723         a->resize(asize + 1);
 724         (*a)[asize] = b.at(0);
 725 #endif
 726     } else if (bsize >= 1) {
 727         int csize = asize + bsize;
 728         QVector<int> c(csize);
 729         int i = 0, j = 0, k = 0;
 730         while (i < asize) {
 731             if (j < bsize) {
 732                 if (a->at(i) == b.at(j)) {
 733                     ++i;
 734                     --csize;
 735                 } else if (a->at(i) < b.at(j)) {
 736                     c[k++] = a->at(i++);
 737                 } else {
 738                     c[k++] = b.at(j++);
 739                 }
 740             } else {
 741                 memcpy(c.data() + k, a->constData() + i, (asize - i) * sizeof(int));
 742                 break;
 743             }
 744         }
 745         c.resize(csize);
 746         if (j < bsize)
 747             memcpy(c.data() + k, b.constData() + j, (bsize - j) * sizeof(int));
 748         *a = c;
 749     }
 750 }
 751
 752 #ifndef QT_NO_REGEXP_WILDCARD
 753 /*
 754   Translates a wildcard pattern to an equivalent regular expression
 755   pattern (e.g., *.cpp to .*\.cpp).
 756
 757   If enableEscaping is true, it is possible to escape the wildcard
 758   characters with \
 759 */
 760 static QString wc2rx(const QString &wc_str, const bool enableEscaping)
 761 {
 762     const int wclen = wc_str.length();
 763     QString rx;
 764     int i = 0;
 765     bool isEscaping = false; // the previous character is '\'
 766     const QChar *wc = wc_str.unicode();
 767
 768     while (i < wclen) {
 769         const QChar c = wc[i++];
 770         switch (c.unicode()) {
 771         case '\\':
 772             if (enableEscaping) {
 773                 if (isEscaping) {
 774                     rx += QLatin1String("\\\\");
 775                 } // we insert the \\ later if necessary
 776                 if (i == wclen) { // the end
 777                     rx += QLatin1String("\\\\");
 778                 }
 779             } else {
 780                 rx += QLatin1String("\\\\");
 781             }
 782             isEscaping = true;
 783             break;
 784         case '*':
 785             if (isEscaping) {
 786                 rx += QLatin1String("\\*");
 787                 isEscaping = false;
 788             } else {
 789                 rx += QLatin1String(".*");
 790             }
 791             break;
 792         case '?':
 793             if (isEscaping) {
 794                 rx += QLatin1String("\\?");
 795                 isEscaping = false;
 796             } else {
 797                 rx += QLatin1Char('.');
 798             }
 799
 800             break;
 801         case '$':
 802         case '(':
 803         case ')':
 804         case '+':
 805         case '.':
 806         case '^':
 807         case '{':
 808         case '|':
 809         case '}':
 810             if (isEscaping) {
 811                 isEscaping = false;
 812                 rx += QLatin1String("\\\\");
 813             }
 814             rx += QLatin1Char('\\');
 815             rx += c;
 816             break;
 817          case '[':
 818             if (isEscaping) {
 819                 isEscaping = false;
 820                 rx += QLatin1String("\\[");
 821             } else {
 822                 rx += c;
 823                 if (wc[i] == QLatin1Char('^'))
 824                     rx += wc[i++];
 825                 if (i < wclen) {
 826                     if (rx[i] == QLatin1Char(']'))
 827                         rx += wc[i++];
 828                     while (i < wclen && wc[i] != QLatin1Char(']')) {
 829                         if (wc[i] == QLatin1Char('\\'))
 830                             rx += QLatin1Char('\\');
 831                         rx += wc[i++];
 832                     }
 833                 }
 834             }
 835              break;
 836
 837         case ']':
 838             if(isEscaping){
 839                 isEscaping = false;
 840                 rx += QLatin1String("\\");
 841             }
 842             rx += c;
 843             break;
 844
 845         default:
 846             if(isEscaping){
 847                 isEscaping = false;
 848                 rx += QLatin1String("\\\\");
 849             }
 850             rx += c;
 851         }
 852     }
 853     return rx;
 854 }
 855 #endif
 856
 857 static int caretIndex(int offset, QRegExp::CaretMode caretMode)
 858 {
 859     if (caretMode == QRegExp::CaretAtZero) {
 860         return 0;
 861     } else if (caretMode == QRegExp::CaretAtOffset) {
 862         return offset;
 863     } else { // QRegExp::CaretWontMatch
 864         return -1;
 865     }
 866 }
 867
 868 /*
 869     The QRegExpEngineKey struct uniquely identifies an engine.
 870 */
 871 struct QRegExpEngineKey
 872 {
 873     QString pattern;
 874     QRegExp::PatternSyntax patternSyntax;
 875     Qt::CaseSensitivity cs;
 876
 877     inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,
 878                             Qt::CaseSensitivity cs)
 879         : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {}
 880
 881     inline void clear() {
 882         pattern.clear();
 883         patternSyntax = QRegExp::RegExp;
 884         cs = Qt::CaseSensitive;
 885     }
 886 };
 887
 888 Q_STATIC_GLOBAL_OPERATOR bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)
 889 {
 890     return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax
 891            && key1.cs == key2.cs;
 892 }
 893
 894 class QRegExpEngine;
 895
 896 //Q_DECLARE_TYPEINFO(QVector<int>, Q_MOVABLE_TYPE);
 897
 898 /*
 899   This is the engine state during matching.
 900 */
 901 struct QRegExpMatchState
 902 {
 903     const QChar *in; // a pointer to the input string data
 904     int pos; // the current position in the string
 905     int caretPos;
 906     int len; // the length of the input string
 907     bool minimal; // minimal matching?
 908     int *bigArray; // big array holding the data for the next pointers
 909     int *inNextStack; // is state is nextStack?
 910     int *curStack; // stack of current states
 911     int *nextStack; // stack of next states
 912     int *curCapBegin; // start of current states' captures
 913     int *nextCapBegin; // start of next states' captures
 914     int *curCapEnd; // end of current states' captures
 915     int *nextCapEnd; // end of next states' captures
 916     int *tempCapBegin; // start of temporary captures
 917     int *tempCapEnd; // end of temporary captures
 918     int *capBegin; // start of captures for a next state
 919     int *capEnd; // end of captures for a next state
 920     int *slideTab; // bump-along slide table for bad-character heuristic
 921     int *captured; // what match() returned last
 922     int slideTabSize; // size of slide table
 923     int capturedSize;
 924 #ifndef QT_NO_REGEXP_BACKREF
 925     QList<QVector<int> > sleeping; // list of back-reference sleepers
 926 #endif
 927     int matchLen; // length of match
 928     int oneTestMatchedLen; // length of partial match
 929
 930     const QRegExpEngine *eng;
 931
 932     inline QRegExpMatchState() : bigArray(0), captured(0) {}
 933     inline ~QRegExpMatchState() { free(bigArray); }
 934
 935     void drain() { free(bigArray); bigArray = 0; captured = 0; } // to save memory
 936     void prepareForMatch(QRegExpEngine *eng);
 937     void match(const QChar *str, int len, int pos, bool minimal,
 938         bool oneTest, int caretIndex);
 939     bool matchHere();
 940     bool testAnchor(int i, int a, const int *capBegin);
 941 };
 942
 943 /*
 944   The struct QRegExpAutomatonState represents one state in a modified NFA. The
 945   input characters matched are stored in the state instead of on
 946   the transitions, something possible for an automaton
 947   constructed from a regular expression.
 948 */
 949 struct QRegExpAutomatonState
 950 {
 951 #ifndef QT_NO_REGEXP_CAPTURE
 952     int atom; // which atom does this state belong to?
 953 #endif
 954     int match; // what does it match? (see CharClassBit and BackRefBit)
 955     QVector<int> outs; // out-transitions
 956     QMap<int, int> reenter; // atoms reentered when transiting out
 957     QMap<int, int> anchors; // anchors met when transiting out
 958
 959     inline QRegExpAutomatonState() { }
 960 #ifndef QT_NO_REGEXP_CAPTURE
 961     inline QRegExpAutomatonState(int a, int m)
 962         : atom(a), match(m) { }
 963 #else
 964     inline QRegExpAutomatonState(int m)
 965         : match(m) { }
 966 #endif
 967 };
 968
 969 Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE);
 970
 971 /*
 972   The struct QRegExpCharClassRange represents a range of characters (e.g.,
 973   [0-9] denotes range 48 to 57).
 974 */
 975 struct QRegExpCharClassRange
 976 {
 977     ushort from; // 48
 978     ushort len; // 10
 979 };
 980
 981 Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
 982
 983 #ifndef QT_NO_REGEXP_CAPTURE
 984 /*
 985   The struct QRegExpAtom represents one node in the hierarchy of regular
 986   expression atoms.
 987 */
 988 struct QRegExpAtom
 989 {
 990     enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 };
 991
 992     int parent; // index of parent in array of atoms
 993     int capture; // index of capture, from 1 to ncap - 1
 994 };
 995
 996 Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
 997 #endif
 998
 999 struct QRegExpLookahead;
1000
1001 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1002 /*
1003   The struct QRegExpAnchorAlternation represents a pair of anchors with
1004   OR semantics.
1005 */
1006 struct QRegExpAnchorAlternation
1007 {
1008     int a; // this anchor...
1009     int b; // ...or this one
1010 };
1011
1012 Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
1013 #endif
1014
1015 #ifndef QT_NO_REGEXP_CCLASS
1016
1017 #define FLAG(x) (1 << (x))
1018 /*
1019   The class QRegExpCharClass represents a set of characters, such as can
1020   be found in regular expressions (e.g., [a-z] denotes the set
1021   {a, b, ..., z}).
1022 */
1023 class QRegExpCharClass
1024 {
1025 public:
1026     QRegExpCharClass();
1027     inline QRegExpCharClass(const QRegExpCharClass &cc) { operator=(cc); }
1028
1029     QRegExpCharClass &operator=(const QRegExpCharClass &cc);
1030
1031     void clear();
1032     bool negative() const { return n; }
1033     void setNegative(bool negative);
1034     void addCategories(uint cats);
1035     void addRange(ushort from, ushort to);
1036     void addSingleton(ushort ch) { addRange(ch, ch); }
1037
1038     bool in(QChar ch) const;
1039 #ifndef QT_NO_REGEXP_OPTIM
1040     const QVector<int> &firstOccurrence() const { return occ1; }
1041 #endif
1042
1043 #if defined(QT_DEBUG)
1044     void dump() const;
1045 #endif
1046
1047 private:
1048     uint c; // character classes
1049     QVector<QRegExpCharClassRange> r; // character ranges
1050     bool n; // negative?
1051 #ifndef QT_NO_REGEXP_OPTIM
1052     QVector<int> occ1; // first-occurrence array
1053 #endif
1054 };
1055 #else
1056 struct QRegExpCharClass
1057 {
1058     int dummy;
1059
1060 #ifndef QT_NO_REGEXP_OPTIM
1061     QRegExpCharClass() { occ1.fill(0, NumBadChars); }
1062
1063     const QVector<int> &firstOccurrence() const { return occ1; }
1064     QVector<int> occ1;
1065 #endif
1066 };
1067 #endif
1068
1069 Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE);
1070
1071 /*
1072   The QRegExpEngine class encapsulates a modified nondeterministic
1073   finite automaton (NFA).
1074 */
1075 class QRegExpEngine
1076 {
1077 public:
1078     QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)
1079         : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }
1080
1081     QRegExpEngine(const QRegExpEngineKey &key);
1082     ~QRegExpEngine();
1083
1084     bool isValid() const { return valid; }
1085     const QString &errorString() const { return yyError; }
1086     int captureCount() const { return officialncap; }
1087
1088     int createState(QChar ch);
1089     int createState(const QRegExpCharClass &cc);
1090 #ifndef QT_NO_REGEXP_BACKREF
1091     int createState(int bref);
1092 #endif
1093
1094     void addCatTransitions(const QVector<int> &from, const QVector<int> &to);
1095 #ifndef QT_NO_REGEXP_CAPTURE
1096     void addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom);
1097 #endif
1098
1099 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1100     int anchorAlternation(int a, int b);
1101     int anchorConcatenation(int a, int b);
1102 #else
1103     int anchorAlternation(int a, int b) { return a & b; }
1104     int anchorConcatenation(int a, int b) { return a | b; }
1105 #endif
1106     void addAnchors(int from, int to, int a);
1107
1108 #ifndef QT_NO_REGEXP_OPTIM
1109     void heuristicallyChooseHeuristic();
1110 #endif
1111
1112 #if defined(QT_DEBUG)
1113     void dump() const;
1114 #endif
1115
1116     QAtomicInt ref;
1117
1118 private:
1119     enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
1120     enum { InitialState = 0, FinalState = 1 };
1121
1122     void setup();
1123     int setupState(int match);
1124
1125     /*
1126       Let's hope that 13 lookaheads and 14 back-references are
1127       enough.
1128      */
1129     enum { MaxLookaheads = 13, MaxBackRefs = 14 };
1130     enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,
1131            Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,
1132            Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1133            Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
1134            Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,
1135
1136            Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^
1137                    ((Anchor_FirstLookahead << MaxLookaheads) - 1) };
1138 #ifndef QT_NO_REGEXP_CAPTURE
1139     int startAtom(bool officialCapture);
1140     void finishAtom(int atom, bool needCapture);
1141 #endif
1142
1143 #ifndef QT_NO_REGEXP_LOOKAHEAD
1144     int addLookahead(QRegExpEngine *eng, bool negative);
1145 #endif
1146
1147 #ifndef QT_NO_REGEXP_OPTIM
1148     bool goodStringMatch(QRegExpMatchState &matchState) const;
1149     bool badCharMatch(QRegExpMatchState &matchState) const;
1150 #else
1151     bool bruteMatch(QRegExpMatchState &matchState) const;
1152 #endif
1153
1154     QVector<QRegExpAutomatonState> s; // array of states
1155 #ifndef QT_NO_REGEXP_CAPTURE
1156     QVector<QRegExpAtom> f; // atom hierarchy
1157     int nf; // number of atoms
1158     int cf; // current atom
1159     QVector<int> captureForOfficialCapture;
1160 #endif
1161     int officialncap; // number of captures, seen from the outside
1162     int ncap; // number of captures, seen from the inside
1163 #ifndef QT_NO_REGEXP_CCLASS
1164     QVector<QRegExpCharClass> cl; // array of character classes
1165 #endif
1166 #ifndef QT_NO_REGEXP_LOOKAHEAD
1167     QVector<QRegExpLookahead *> ahead; // array of lookaheads
1168 #endif
1169 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1170     QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
1171 #endif
1172 #ifndef QT_NO_REGEXP_OPTIM
1173     bool caretAnchored; // does the regexp start with ^?
1174     bool trivial; // is the good-string all that needs to match?
1175 #endif
1176     bool valid; // is the regular expression valid?
1177     Qt::CaseSensitivity cs; // case sensitive?
1178     bool greedyQuantifiers; // RegExp2?
1179     bool xmlSchemaExtensions;
1180 #ifndef QT_NO_REGEXP_BACKREF
1181     int nbrefs; // number of back-references
1182 #endif
1183
1184 #ifndef QT_NO_REGEXP_OPTIM
1185     bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1186
1187     int goodEarlyStart; // the index where goodStr can first occur in a match
1188     int goodLateStart; // the index where goodStr can last occur in a match
1189     QString goodStr; // the string that any match has to contain
1190
1191     int minl; // the minimum length of a match
1192     QVector<int> occ1; // first-occurrence array
1193 #endif
1194
1195     /*
1196       The class Box is an abstraction for a regular expression
1197       fragment. It can also be seen as one node in the syntax tree of
1198       a regular expression with synthetized attributes.
1199
1200       Its interface is ugly for performance reasons.
1201     */
1202     class Box
1203     {
1204     public:
1205         Box(QRegExpEngine *engine);
1206         Box(const Box &b) { operator=(b); }
1207
1208         Box &operator=(const Box &b);
1209
1210         void clear() { operator=(Box(eng)); }
1211         void set(QChar ch);
1212         void set(const QRegExpCharClass &cc);
1213 #ifndef QT_NO_REGEXP_BACKREF
1214         void set(int bref);
1215 #endif
1216
1217         void cat(const Box &b);
1218         void orx(const Box &b);
1219         void plus(int atom);
1220         void opt();
1221         void catAnchor(int a);
1222 #ifndef QT_NO_REGEXP_OPTIM
1223         void setupHeuristics();
1224 #endif
1225
1226 #if defined(QT_DEBUG)
1227         void dump() const;
1228 #endif
1229
1230     private:
1231         void addAnchorsToEngine(const Box &to) const;
1232
1233         QRegExpEngine *eng; // the automaton under construction
1234         QVector<int> ls; // the left states (firstpos)
1235         QVector<int> rs; // the right states (lastpos)
1236         QMap<int, int> lanchors; // the left anchors
1237         QMap<int, int> ranchors; // the right anchors
1238         int skipanchors; // the anchors to match if the box is skipped
1239
1240 #ifndef QT_NO_REGEXP_OPTIM
1241         int earlyStart; // the index where str can first occur
1242         int lateStart; // the index where str can last occur
1243         QString str; // a string that has to occur in any match
1244         QString leftStr; // a string occurring at the left of this box
1245         QString rightStr; // a string occurring at the right of this box
1246         int maxl; // the maximum length of this box (possibly InftyLen)
1247 #endif
1248
1249         int minl; // the minimum length of this box
1250 #ifndef QT_NO_REGEXP_OPTIM
1251         QVector<int> occ1; // first-occurrence array
1252 #endif
1253     };
1254
1255     friend class Box;
1256
1257     /*
1258       This is the lexical analyzer for regular expressions.
1259     */
1260     enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
1261            Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
1262            Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
1263     int getChar();
1264     int getEscape();
1265 #ifndef QT_NO_REGEXP_INTERVAL
1266     int getRep(int def);
1267 #endif
1268 #ifndef QT_NO_REGEXP_LOOKAHEAD
1269     void skipChars(int n);
1270 #endif
1271     void error(const char *msg);
1272     void startTokenizer(const QChar *rx, int len);
1273     int getToken();
1274
1275     const QChar *yyIn; // a pointer to the input regular expression pattern
1276     int yyPos0; // the position of yyTok in the input pattern
1277     int yyPos; // the position of the next character to read
1278     int yyLen; // the length of yyIn
1279     int yyCh; // the last character read
1280     QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens
1281     int yyMinRep; // attribute for Tok_Quantifier
1282     int yyMaxRep; // ditto
1283     QString yyError; // syntax error or overflow during parsing?
1284
1285     /*
1286       This is the syntactic analyzer for regular expressions.
1287     */
1288     int parse(const QChar *rx, int len);
1289     void parseAtom(Box *box);
1290     void parseFactor(Box *box);
1291     void parseTerm(Box *box);
1292     void parseExpression(Box *box);
1293
1294     int yyTok; // the last token read
1295     bool yyMayCapture; // set this to false to disable capturing
1296
1297     friend struct QRegExpMatchState;
1298 };
1299
1300 #ifndef QT_NO_REGEXP_LOOKAHEAD
1301 /*
1302   The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
1303   (?=foo) and (?!bar)).
1304 */
1305 struct QRegExpLookahead
1306 {
1307     QRegExpEngine *eng; // NFA representing the embedded regular expression
1308     bool neg; // negative lookahead?
1309
1310     inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)
1311         : eng(eng0), neg(neg0) { }
1312     inline ~QRegExpLookahead() { delete eng; }
1313 };
1314 #endif
1315
1316 /*! \internal
1317     convert the pattern string to the RegExp syntax.
1318
1319     This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan
1320  */
1321 Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax)
1322 {
1323     switch (patternSyntax) {
1324 #ifndef QT_NO_REGEXP_WILDCARD
1325     case QRegExp::Wildcard:
1326         return wc2rx(pattern, false);
1327         break;
1328     case QRegExp::WildcardUnix:
1329         return wc2rx(pattern, true);
1330         break;
1331 #endif
1332     case QRegExp::FixedString:
1333         return QRegExp::escape(pattern);
1334         break;
1335     case QRegExp::W3CXmlSchema11:
1336     default:
1337         return pattern;
1338     }
1339 }
1340
1341 QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
1342     : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),
1343       xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)
1344 {
1345     setup();
1346
1347     QString rx = qt_regexp_toCanonical(key.pattern, key.patternSyntax);
1348
1349     valid = (parse(rx.unicode(), rx.length()) == rx.length());
1350     if (!valid) {
1351 #ifndef QT_NO_REGEXP_OPTIM
1352         trivial = false;
1353 #endif
1354         error(RXERR_LEFTDELIM);
1355     }
1356 }
1357
1358 QRegExpEngine::~QRegExpEngine()
1359 {
1360 #ifndef QT_NO_REGEXP_LOOKAHEAD
1361     qDeleteAll(ahead);
1362 #endif
1363 }
1364
1365 void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
1366 {
1367     /*
1368       We use one QVector<int> for all the big data used a lot in
1369       matchHere() and friends.
1370     */
1371     int ns = eng->s.size(); // number of states
1372     int ncap = eng->ncap;
1373 #ifndef QT_NO_REGEXP_OPTIM
1374     int newSlideTabSize = qMax(eng->minl + 1, 16);
1375 #else
1376     int newSlideTabSize = 0;
1377 #endif
1378     int numCaptures = eng->captureCount();
1379     int newCapturedSize = 2 + 2 * numCaptures;
1380     bigArray = q_check_ptr((int *)realloc(bigArray, ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));
1381
1382     // set all internal variables only _after_ bigArray is realloc'ed
1383     // to prevent a broken regexp in oom case
1384
1385     slideTabSize = newSlideTabSize;
1386     capturedSize = newCapturedSize;
1387     inNextStack = bigArray;
1388     memset(inNextStack, -1, ns * sizeof(int));
1389     curStack = inNextStack + ns;
1390     nextStack = inNextStack + 2 * ns;
1391
1392     curCapBegin = inNextStack + 3 * ns;
1393     nextCapBegin = curCapBegin + ncap * ns;
1394     curCapEnd = curCapBegin + 2 * ncap * ns;
1395     nextCapEnd = curCapBegin + 3 * ncap * ns;
1396
1397     tempCapBegin = curCapBegin + 4 * ncap * ns;
1398     tempCapEnd = tempCapBegin + ncap;
1399     capBegin = tempCapBegin + 2 * ncap;
1400     capEnd = tempCapBegin + 3 * ncap;
1401
1402     slideTab = tempCapBegin + 4 * ncap;
1403     captured = slideTab + slideTabSize;
1404     memset(captured, -1, capturedSize*sizeof(int));
1405     this->eng = eng;
1406 }
1407
1408 /*
1409   Tries to match in str and returns an array of (begin, length) pairs
1410   for captured text. If there is no match, all pairs are (-1, -1).
1411 */
1412 void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,
1413     bool minimal0, bool oneTest, int caretIndex)
1414 {
1415     bool matched = false;
1416     QChar char_null;
1417
1418 #ifndef QT_NO_REGEXP_OPTIM
1419     if (eng->trivial && !oneTest) {
1420         pos = qFindString(str0, len0, pos0, eng->goodStr.unicode(), eng->goodStr.length(), eng->cs);
1421         matchLen = eng->goodStr.length();
1422         matched = (pos != -1);
1423     } else
1424 #endif
1425     {
1426         in = str0;
1427         if (in == 0)
1428             in = &char_null;
1429         pos = pos0;
1430         caretPos = caretIndex;
1431         len = len0;
1432         minimal = minimal0;
1433         matchLen = 0;
1434         oneTestMatchedLen = 0;
1435
1436         if (eng->valid && pos >= 0 && pos <= len) {
1437 #ifndef QT_NO_REGEXP_OPTIM
1438             if (oneTest) {
1439                 matched = matchHere();
1440             } else {
1441                 if (pos <= len - eng->minl) {
1442                     if (eng->caretAnchored) {
1443                         matched = matchHere();
1444                     } else if (eng->useGoodStringHeuristic) {
1445                         matched = eng->goodStringMatch(*this);
1446                     } else {
1447                         matched = eng->badCharMatch(*this);
1448                     }
1449                 }
1450             }
1451 #else
1452             matched = oneTest ? matchHere() : eng->bruteMatch(*this);
1453 #endif
1454         }
1455     }
1456
1457     if (matched) {
1458         int *c = captured;
1459         *c++ = pos;
1460         *c++ = matchLen;
1461
1462         int numCaptures = (capturedSize - 2) >> 1;
1463 #ifndef QT_NO_REGEXP_CAPTURE
1464         for (int i = 0; i < numCaptures; ++i) {
1465             int j = eng->captureForOfficialCapture.at(i);
1466             if (capBegin[j] != EmptyCapture) {
1467                 int len = capEnd[j] - capBegin[j];
1468                 *c++ = (len > 0) ? pos + capBegin[j] : 0;
1469                 *c++ = len;
1470             } else {
1471                 *c++ = -1;
1472                 *c++ = -1;
1473             }
1474         }
1475 #endif
1476     } else {
1477         // we rely on 2's complement here
1478         memset(captured, -1, capturedSize * sizeof(int));
1479     }
1480 }
1481
1482 /*
1483   The three following functions add one state to the automaton and
1484   return the number of the state.
1485 */
1486
1487 int QRegExpEngine::createState(QChar ch)
1488 {
1489     return setupState(ch.unicode());
1490 }
1491
1492 int QRegExpEngine::createState(const QRegExpCharClass &cc)
1493 {
1494 #ifndef QT_NO_REGEXP_CCLASS
1495     int n = cl.size();
1496     cl += QRegExpCharClass(cc);
1497     return setupState(CharClassBit | n);
1498 #else
1499     Q_UNUSED(cc);
1500     return setupState(CharClassBit);
1501 #endif
1502 }
1503
1504 #ifndef QT_NO_REGEXP_BACKREF
1505 int QRegExpEngine::createState(int bref)
1506 {
1507     if (bref > nbrefs) {
1508         nbrefs = bref;
1509         if (nbrefs > MaxBackRefs) {
1510             error(RXERR_LIMIT);
1511             return 0;
1512         }
1513     }
1514     return setupState(BackRefBit | bref);
1515 }
1516 #endif
1517
1518 /*
1519   The two following functions add a transition between all pairs of
1520   states (i, j) where i is found in from, and j is found in to.
1521
1522   Cat-transitions are distinguished from plus-transitions for
1523   capturing.
1524 */
1525
1526 void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to)
1527 {
1528     for (int i = 0; i < from.size(); i++)
1529         mergeInto(&s[from.at(i)].outs, to);
1530 }
1531
1532 #ifndef QT_NO_REGEXP_CAPTURE
1533 void QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom)
1534 {
1535     for (int i = 0; i < from.size(); i++) {
1536         QRegExpAutomatonState &st = s[from.at(i)];
1537         const QVector<int> oldOuts = st.outs;
1538         mergeInto(&st.outs, to);
1539         if (f.at(atom).capture != QRegExpAtom::NoCapture) {
1540             for (int j = 0; j < to.size(); j++) {
1541                 // ### st.reenter.contains(to.at(j)) check looks suspicious
1542                 if (!st.reenter.contains(to.at(j)) &&
1543                      qBinaryFind(oldOuts.constBegin(), oldOuts.constEnd(), to.at(j)) == oldOuts.end())
1544                     st.reenter.insert(to.at(j), atom);
1545             }
1546         }
1547     }
1548 }
1549 #endif
1550
1551 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1552 /*
1553   Returns an anchor that means a OR b.
1554 */
1555 int QRegExpEngine::anchorAlternation(int a, int b)
1556 {
1557     if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0)
1558         return a & b;
1559
1560     int n = aa.size();
1561 #ifndef QT_NO_REGEXP_OPTIM
1562     if (n > 0 && aa.at(n - 1).a == a && aa.at(n - 1).b == b)
1563         return Anchor_Alternation | (n - 1);
1564 #endif
1565
1566     QRegExpAnchorAlternation element = {a, b};
1567     aa.append(element);
1568     return Anchor_Alternation | n;
1569 }
1570
1571 /*
1572   Returns an anchor that means a AND b.
1573 */
1574 int QRegExpEngine::anchorConcatenation(int a, int b)
1575 {
1576     if (((a | b) & Anchor_Alternation) == 0)
1577         return a | b;
1578     if ((b & Anchor_Alternation) != 0)
1579         qSwap(a, b);
1580
1581     int aprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).a, b);
1582     int bprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).b, b);
1583     return anchorAlternation(aprime, bprime);
1584 }
1585 #endif
1586
1587 /*
1588   Adds anchor a on a transition caracterised by its from state and
1589   its to state.
1590 */
1591 void QRegExpEngine::addAnchors(int from, int to, int a)
1592 {
1593     QRegExpAutomatonState &st = s[from];
1594     if (st.anchors.contains(to))
1595         a = anchorAlternation(st.anchors.value(to), a);
1596     st.anchors.insert(to, a);
1597 }
1598
1599 #ifndef QT_NO_REGEXP_OPTIM
1600 /*
1601   This function chooses between the good-string and the bad-character
1602   heuristics. It computes two scores and chooses the heuristic with
1603   the highest score.
1604
1605   Here are some common-sense constraints on the scores that should be
1606   respected if the formulas are ever modified: (1) If goodStr is
1607   empty, the good-string heuristic scores 0. (2) If the regular
1608   expression is trivial, the good-string heuristic should be used.
1609   (3) If the search is case insensitive, the good-string heuristic
1610   should be used, unless it scores 0. (Case insensitivity turns all
1611   entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1612   big, the good-string heuristic should score less.
1613 */
1614 void QRegExpEngine::heuristicallyChooseHeuristic()
1615 {
1616     if (minl == 0) {
1617         useGoodStringHeuristic = false;
1618     } else if (trivial) {
1619         useGoodStringHeuristic = true;
1620     } else {
1621         /*
1622           Magic formula: The good string has to constitute a good
1623           proportion of the minimum-length string, and appear at a
1624           more-or-less known index.
1625         */
1626         int goodStringScore = (64 * goodStr.length() / minl) -
1627                               (goodLateStart - goodEarlyStart);
1628         /*
1629           Less magic formula: We pick some characters at random, and
1630           check whether they are good or bad.
1631         */
1632         int badCharScore = 0;
1633         int step = qMax(1, NumBadChars / 32);
1634         for (int i = 1; i < NumBadChars; i += step) {
1635             if (occ1.at(i) == NoOccurrence)
1636                 badCharScore += minl;
1637             else
1638                 badCharScore += occ1.at(i);
1639         }
1640         badCharScore /= minl;
1641         useGoodStringHeuristic = (goodStringScore > badCharScore);
1642     }
1643 }
1644 #endif
1645
1646 #if defined(QT_DEBUG)
1647 void QRegExpEngine::dump() const
1648 {
1649     int i, j;
1650     qDebug("Case %ssensitive engine", cs ? "" : "in");
1651     qDebug("  States");
1652     for (i = 0; i < s.size(); i++) {
1653         qDebug("  %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
1654 #ifndef QT_NO_REGEXP_CAPTURE
1655         if (nf > 0)
1656             qDebug("    in atom %d", s[i].atom);
1657 #endif
1658         int m = s[i].match;
1659         if ((m & CharClassBit) != 0) {
1660             qDebug("    match character class %d", m ^ CharClassBit);
1661 #ifndef QT_NO_REGEXP_CCLASS
1662             cl[m ^ CharClassBit].dump();
1663 #else
1664             qDebug("    negative character class");
1665 #endif
1666         } else if ((m & BackRefBit) != 0) {
1667             qDebug("    match back-reference %d", m ^ BackRefBit);
1668         } else if (m >= 0x20 && m <= 0x7e) {
1669             qDebug("    match 0x%.4x (%c)", m, m);
1670         } else {
1671             qDebug("    match 0x%.4x", m);
1672         }
1673         for (j = 0; j < s[i].outs.size(); j++) {
1674             int next = s[i].outs[j];
1675             qDebug("    -> %d", next);
1676             if (s[i].reenter.contains(next))
1677                 qDebug("       [reenter %d]", s[i].reenter[next]);
1678             if (s[i].anchors.value(next) != 0)
1679                 qDebug("       [anchors 0x%.8x]", s[i].anchors[next]);
1680         }
1681     }
1682 #ifndef QT_NO_REGEXP_CAPTURE
1683     if (nf > 0) {
1684         qDebug("  Atom    Parent  Capture");
1685         for (i = 0; i < nf; i++) {
1686             if (f[i].capture == QRegExpAtom::NoCapture) {
1687                 qDebug("  %6d  %6d     nil", i, f[i].parent);
1688             } else {
1689                 int cap = f[i].capture;
1690                 bool official = captureForOfficialCapture.contains(cap);
1691                 qDebug("  %6d  %6d  %6d  %s", i, f[i].parent, f[i].capture,
1692                        official ? "official" : "");
1693             }
1694         }
1695     }
1696 #endif
1697 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1698     for (i = 0; i < aa.size(); i++)
1699         qDebug("  Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b);
1700 #endif
1701 }
1702 #endif
1703
1704 void QRegExpEngine::setup()
1705 {
1706     ref.store(1);
1707 #ifndef QT_NO_REGEXP_CAPTURE
1708     f.resize(32);
1709     nf = 0;
1710     cf = -1;
1711 #endif
1712     officialncap = 0;
1713     ncap = 0;
1714 #ifndef QT_NO_REGEXP_OPTIM
1715     caretAnchored = true;
1716     trivial = true;
1717 #endif
1718     valid = false;
1719 #ifndef QT_NO_REGEXP_BACKREF
1720     nbrefs = 0;
1721 #endif
1722 #ifndef QT_NO_REGEXP_OPTIM
1723     useGoodStringHeuristic = true;
1724     minl = 0;
1725     occ1.fill(0, NumBadChars);
1726 #endif
1727 }
1728
1729 int QRegExpEngine::setupState(int match)
1730 {
1731 #ifndef QT_NO_REGEXP_CAPTURE
1732     s += QRegExpAutomatonState(cf, match);
1733 #else
1734     s += QRegExpAutomatonState(match);
1735 #endif
1736     return s.size() - 1;
1737 }
1738
1739 #ifndef QT_NO_REGEXP_CAPTURE
1740 /*
1741   Functions startAtom() and finishAtom() should be called to delimit
1742   atoms. When a state is created, it is assigned to the current atom.
1743   The information is later used for capturing.
1744 */
1745 int QRegExpEngine::startAtom(bool officialCapture)
1746 {
1747     if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size())
1748         f.resize((nf + 1) << 1);
1749     f[nf].parent = cf;
1750     cf = nf++;
1751     f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;
1752     return cf;
1753 }
1754
1755 void QRegExpEngine::finishAtom(int atom, bool needCapture)
1756 {
1757     if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture)
1758         f[atom].capture = QRegExpAtom::UnofficialCapture;
1759     cf = f.at(atom).parent;
1760 }
1761 #endif
1762
1763 #ifndef QT_NO_REGEXP_LOOKAHEAD
1764 /*
1765   Creates a lookahead anchor.
1766 */
1767 int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative)
1768 {
1769     int n = ahead.size();
1770     if (n == MaxLookaheads) {
1771         error(RXERR_LIMIT);
1772         return 0;
1773     }
1774     ahead += new QRegExpLookahead(eng, negative);
1775     return Anchor_FirstLookahead << n;
1776 }
1777 #endif
1778
1779 #ifndef QT_NO_REGEXP_CAPTURE
1780 /*
1781   We want the longest leftmost captures.
1782 */
1783 static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
1784                             const int *end2)
1785 {
1786     for (int i = 0; i < ncap; i++) {
1787         int delta = begin2[i] - begin1[i]; // it has to start early...
1788         if (delta == 0)
1789             delta = end1[i] - end2[i]; // ...and end late
1790
1791         if (delta != 0)
1792             return delta > 0;
1793     }
1794     return false;
1795 }
1796 #endif
1797
1798 /*
1799   Returns true if anchor a matches at position pos + i in the input
1800   string, otherwise false.
1801 */
1802 bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)
1803 {
1804     int j;
1805
1806 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1807     if ((a & QRegExpEngine::Anchor_Alternation) != 0)
1808         return testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)
1809                || testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);
1810 #endif
1811
1812     if ((a & QRegExpEngine::Anchor_Caret) != 0) {
1813         if (pos + i != caretPos)
1814             return false;
1815     }
1816     if ((a & QRegExpEngine::Anchor_Dollar) != 0) {
1817         if (pos + i != len)
1818             return false;
1819     }
1820 #ifndef QT_NO_REGEXP_ESCAPE
1821     if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) {
1822         bool before = false;
1823         bool after = false;
1824         if (pos + i != 0)
1825             before = isWord(in[pos + i - 1]);
1826         if (pos + i != len)
1827             after = isWord(in[pos + i]);
1828         if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after))
1829             return false;
1830         if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after))
1831             return false;
1832     }
1833 #endif
1834 #ifndef QT_NO_REGEXP_LOOKAHEAD
1835     if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) {
1836         const QVector<QRegExpLookahead *> &ahead = eng->ahead;
1837         for (j = 0; j < ahead.size(); j++) {
1838             if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) {
1839                 QRegExpMatchState matchState;
1840                 matchState.prepareForMatch(ahead[j]->eng);
1841                 matchState.match(in + pos + i, len - pos - i, 0,
1842                     true, true, matchState.caretPos - matchState.pos - i);
1843                 if ((matchState.captured[0] == 0) == ahead[j]->neg)
1844                     return false;
1845             }
1846         }
1847     }
1848 #endif
1849 #ifndef QT_NO_REGEXP_CAPTURE
1850 #ifndef QT_NO_REGEXP_BACKREF
1851     for (j = 0; j < eng->nbrefs; j++) {
1852         if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) {
1853             int i = eng->captureForOfficialCapture.at(j);
1854             if (capBegin[i] != EmptyCapture)
1855                 return false;
1856         }
1857     }
1858 #endif
1859 #endif
1860     return true;
1861 }
1862
1863 #ifndef QT_NO_REGEXP_OPTIM
1864 /*
1865   The three following functions are what Jeffrey Friedl would call
1866   transmissions (or bump-alongs). Using one or the other should make
1867   no difference except in performance.
1868 */
1869
1870 bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
1871 {
1872     int k = matchState.pos + goodEarlyStart;
1873     QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs);
1874     while ((k = matcher.indexIn(matchState.in, matchState.len, k)) != -1) {
1875         int from = k - goodLateStart;
1876         int to = k - goodEarlyStart;
1877         if (from > matchState.pos)
1878             matchState.pos = from;
1879
1880         while (matchState.pos <= to) {
1881             if (matchState.matchHere())
1882                 return true;
1883             ++matchState.pos;
1884         }
1885         ++k;
1886     }
1887     return false;
1888 }
1889
1890 bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
1891 {
1892     int slideHead = 0;
1893     int slideNext = 0;
1894     int i;
1895     int lastPos = matchState.len - minl;
1896     memset(matchState.slideTab, 0, matchState.slideTabSize * sizeof(int));
1897
1898     /*
1899       Set up the slide table, used for the bad-character heuristic,
1900       using the table of first occurrence of each character.
1901     */
1902     for (i = 0; i < minl; i++) {
1903         int sk = occ1[BadChar(matchState.in[matchState.pos + i])];
1904         if (sk == NoOccurrence)
1905             sk = i + 1;
1906         if (sk > 0) {
1907             int k = i + 1 - sk;
1908             if (k < 0) {
1909                 sk = i + 1;
1910                 k = 0;
1911             }
1912             if (sk > matchState.slideTab[k])
1913                 matchState.slideTab[k] = sk;
1914         }
1915     }
1916
1917     if (matchState.pos > lastPos)
1918         return false;
1919
1920     for (;;) {
1921         if (++slideNext >= matchState.slideTabSize)
1922             slideNext = 0;
1923         if (matchState.slideTab[slideHead] > 0) {
1924             if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext])
1925                 matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1;
1926             matchState.slideTab[slideHead] = 0;
1927         } else {
1928             if (matchState.matchHere())
1929                 return true;
1930         }
1931
1932         if (matchState.pos == lastPos)
1933             break;
1934
1935         /*
1936           Update the slide table. This code has much in common with
1937           the initialization code.
1938         */
1939         int sk = occ1[BadChar(matchState.in[matchState.pos + minl])];
1940         if (sk == NoOccurrence) {
1941             matchState.slideTab[slideNext] = minl;
1942         } else if (sk > 0) {
1943             int k = slideNext + minl - sk;
1944             if (k >= matchState.slideTabSize)
1945                 k -= matchState.slideTabSize;
1946             if (sk > matchState.slideTab[k])
1947                 matchState.slideTab[k] = sk;
1948         }
1949         slideHead = slideNext;
1950         ++matchState.pos;
1951     }
1952     return false;
1953 }
1954 #else
1955 bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
1956 {
1957     while (matchState.pos <= matchState.len) {
1958         if (matchState.matchHere())
1959             return true;
1960         ++matchState.pos;
1961     }
1962     return false;
1963 }
1964 #endif
1965
1966 /*
1967   Here's the core of the engine. It tries to do a match here and now.
1968 */
1969 bool QRegExpMatchState::matchHere()
1970 {
1971     int ncur = 1, nnext = 0;
1972     int i = 0, j, k, m;
1973     bool stop = false;
1974
1975     matchLen = -1;
1976     oneTestMatchedLen = -1;
1977     curStack[0] = QRegExpEngine::InitialState;
1978
1979     int ncap = eng->ncap;
1980 #ifndef QT_NO_REGEXP_CAPTURE
1981     if (ncap > 0) {
1982         for (j = 0; j < ncap; j++) {
1983             curCapBegin[j] = EmptyCapture;
1984             curCapEnd[j] = EmptyCapture;
1985         }
1986     }
1987 #endif
1988
1989 #ifndef QT_NO_REGEXP_BACKREF
1990     while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop)
1991 #else
1992     while (ncur > 0 && i <= len - pos && !stop)
1993 #endif
1994     {
1995         int ch = (i < len - pos) ? in[pos + i].unicode() : 0;
1996         for (j = 0; j < ncur; j++) {
1997             int cur = curStack[j];
1998             const QRegExpAutomatonState &scur = eng->s.at(cur);
1999             const QVector<int> &outs = scur.outs;
2000             for (k = 0; k < outs.size(); k++) {
2001                 int next = outs.at(k);
2002                 const QRegExpAutomatonState &snext = eng->s.at(next);
2003                 bool inside = true;
2004 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2005                 int needSomeSleep = 0;
2006 #endif
2007
2008                 /*
2009                   First, check if the anchors are anchored properly.
2010                 */
2011                 int a = scur.anchors.value(next);
2012                 if (a != 0 && !testAnchor(i, a, curCapBegin + j * ncap))
2013                     inside = false;
2014
2015                 /*
2016                   If indeed they are, check if the input character is
2017                   correct for this transition.
2018                 */
2019                 if (inside) {
2020                     m = snext.match;
2021                     if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) {
2022                         if (eng->cs)
2023                             inside = (m == ch);
2024                         else
2025                             inside = (QChar(m).toLower() == QChar(ch).toLower());
2026                     } else if (next == QRegExpEngine::FinalState) {
2027                         matchLen = i;
2028                         stop = minimal;
2029                         inside = true;
2030                     } else if ((m & QRegExpEngine::CharClassBit) != 0) {
2031 #ifndef QT_NO_REGEXP_CCLASS
2032                         const QRegExpCharClass &cc = eng->cl.at(m ^ QRegExpEngine::CharClassBit);
2033                         if (eng->cs)
2034                             inside = cc.in(ch);
2035                         else if (cc.negative())
2036                             inside = cc.in(QChar(ch).toLower()) &&
2037                                      cc.in(QChar(ch).toUpper());
2038                         else
2039                             inside = cc.in(QChar(ch).toLower()) ||
2040                                      cc.in(QChar(ch).toUpper());
2041 #endif
2042 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2043                     } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */
2044                         int bref = m ^ QRegExpEngine::BackRefBit;
2045                         int ell = j * ncap + eng->captureForOfficialCapture.at(bref - 1);
2046
2047                         inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
2048                         if (inside) {
2049                             if (eng->cs)
2050                                 inside = (in[pos + curCapBegin[ell]] == QChar(ch));
2051                             else
2052                                 inside = (in[pos + curCapBegin[ell]].toLower()
2053                                        == QChar(ch).toLower());
2054                         }
2055
2056                         if (inside) {
2057                             int delta;
2058                             if (curCapEnd[ell] == EmptyCapture)
2059                                 delta = i - curCapBegin[ell];
2060                             else
2061                                 delta = curCapEnd[ell] - curCapBegin[ell];
2062
2063                             inside = (delta <= len - (pos + i));
2064                             if (inside && delta > 1) {
2065                                 int n = 1;
2066                                 if (eng->cs) {
2067                                     while (n < delta) {
2068                                         if (in[pos + curCapBegin[ell] + n]
2069                                             != in[pos + i + n])
2070                                             break;
2071                                         ++n;
2072                                     }
2073                                 } else {
2074                                     while (n < delta) {
2075                                         QChar a = in[pos + curCapBegin[ell] + n];
2076                                         QChar b = in[pos + i + n];
2077                                         if (a.toLower() != b.toLower())
2078                                             break;
2079                                         ++n;
2080                                     }
2081                                 }
2082                                 inside = (n == delta);
2083                                 if (inside)
2084                                     needSomeSleep = delta - 1;
2085                             }
2086                         }
2087 #endif
2088                     }
2089                 }
2090
2091                 /*
2092                   We must now update our data structures.
2093                 */
2094                 if (inside) {
2095 #ifndef QT_NO_REGEXP_CAPTURE
2096                     int *capBegin, *capEnd;
2097 #endif
2098                     /*
2099                       If the next state was not encountered yet, all
2100                       is fine.
2101                     */
2102                     if ((m = inNextStack[next]) == -1) {
2103                         m = nnext++;
2104                         nextStack[m] = next;
2105                         inNextStack[next] = m;
2106 #ifndef QT_NO_REGEXP_CAPTURE
2107                         capBegin = nextCapBegin + m * ncap;
2108                         capEnd = nextCapEnd + m * ncap;
2109
2110                     /*
2111                       Otherwise, we'll first maintain captures in
2112                       temporary arrays, and decide at the end whether
2113                       it's best to keep the previous capture zones or
2114                       the new ones.
2115                     */
2116                     } else {
2117                         capBegin = tempCapBegin;
2118                         capEnd = tempCapEnd;
2119 #endif
2120                     }
2121
2122 #ifndef QT_NO_REGEXP_CAPTURE
2123                     /*
2124                       Updating the capture zones is much of a task.
2125                     */
2126                     if (ncap > 0) {
2127                         memcpy(capBegin, curCapBegin + j * ncap, ncap * sizeof(int));
2128                         memcpy(capEnd, curCapEnd + j * ncap, ncap * sizeof(int));
2129                         int c = scur.atom, n = snext.atom;
2130                         int p = -1, q = -1;
2131                         int cap;
2132
2133                         /*
2134                           Lemma 1. For any x in the range [0..nf), we
2135                           have f[x].parent < x.
2136
2137                           Proof. By looking at startAtom(), it is
2138                           clear that cf < nf holds all the time, and
2139                           thus that f[nf].parent < nf.
2140                         */
2141
2142                         /*
2143                           If we are reentering an atom, we empty all
2144                           capture zones inside it.
2145                         */
2146                         if ((q = scur.reenter.value(next)) != 0) {
2147                             QBitArray b(eng->nf, false);
2148                             b.setBit(q, true);
2149                             for (int ell = q + 1; ell < eng->nf; ell++) {
2150                                 if (b.testBit(eng->f.at(ell).parent)) {
2151                                     b.setBit(ell, true);
2152                                     cap = eng->f.at(ell).capture;
2153                                     if (cap >= 0) {
2154                                         capBegin[cap] = EmptyCapture;
2155                                         capEnd[cap] = EmptyCapture;
2156                                     }
2157                                 }
2158                             }
2159                             p = eng->f.at(q).parent;
2160
2161                         /*
2162                           Otherwise, close the capture zones we are
2163                           leaving. We are leaving f[c].capture,
2164                           f[f[c].parent].capture,
2165                           f[f[f[c].parent].parent].capture, ...,
2166                           until f[x].capture, with x such that
2167                           f[x].parent is the youngest common ancestor
2168                           for c and n.
2169
2170                           We go up along c's and n's ancestry until
2171                           we find x.
2172                         */
2173                         } else {
2174                             p = c;
2175                             q = n;
2176                             while (p != q) {
2177                                 if (p > q) {
2178                                     cap = eng->f.at(p).capture;
2179                                     if (cap >= 0) {
2180                                         if (capBegin[cap] == i) {
2181                                             capBegin[cap] = EmptyCapture;
2182                                             capEnd[cap] = EmptyCapture;
2183                                         } else {
2184                                             capEnd[cap] = i;
2185                                         }
2186                                     }
2187                                     p = eng->f.at(p).parent;
2188                                 } else {
2189                                     q = eng->f.at(q).parent;
2190                                 }
2191                             }
2192                         }
2193
2194                         /*
2195                           In any case, we now open the capture zones
2196                           we are entering. We work upwards from n
2197                           until we reach p (the parent of the atom we
2198                           reenter or the youngest common ancestor).
2199                         */
2200                         while (n > p) {
2201                             cap = eng->f.at(n).capture;
2202                             if (cap >= 0) {
2203                                 capBegin[cap] = i;
2204                                 capEnd[cap] = EmptyCapture;
2205                             }
2206                             n = eng->f.at(n).parent;
2207                         }
2208                         /*
2209                           If the next state was already in
2210                           nextStack, we must choose carefully which
2211                           capture zones we want to keep.
2212                         */
2213                         if (capBegin == tempCapBegin &&
2214                                 isBetterCapture(ncap, capBegin, capEnd, nextCapBegin + m * ncap,
2215                                                 nextCapEnd + m * ncap)) {
2216                             memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2217                             memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2218                         }
2219                     }
2220 #ifndef QT_NO_REGEXP_BACKREF
2221                     /*
2222                       We are done with updating the capture zones.
2223                       It's now time to put the next state to sleep,
2224                       if it needs to, and to remove it from
2225                       nextStack.
2226                     */
2227                     if (needSomeSleep > 0) {
2228                         QVector<int> zzZ(2 + 2 * ncap);
2229                         zzZ[0] = i + needSomeSleep;
2230                         zzZ[1] = next;
2231                         if (ncap > 0) {
2232                             memcpy(zzZ.data() + 2, capBegin, ncap * sizeof(int));
2233                             memcpy(zzZ.data() + 2 + ncap, capEnd, ncap * sizeof(int));
2234                         }
2235                         inNextStack[nextStack[--nnext]] = -1;
2236                         sleeping.append(zzZ);
2237                     }
2238 #endif
2239 #endif
2240                 }
2241             }
2242         }
2243 #ifndef QT_NO_REGEXP_CAPTURE
2244         /*
2245           If we reached the final state, hurray! Copy the captured
2246           zone.
2247         */
2248         if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {
2249             memcpy(capBegin, nextCapBegin + m * ncap, ncap * sizeof(int));
2250             memcpy(capEnd, nextCapEnd + m * ncap, ncap * sizeof(int));
2251         }
2252 #ifndef QT_NO_REGEXP_BACKREF
2253         /*
2254           It's time to wake up the sleepers.
2255         */
2256         j = 0;
2257         while (j < sleeping.count()) {
2258             if (sleeping.at(j)[0] == i) {
2259                 const QVector<int> &zzZ = sleeping.at(j);
2260                 int next = zzZ[1];
2261                 const int *capBegin = zzZ.data() + 2;
2262                 const int *capEnd = zzZ.data() + 2 + ncap;
2263                 bool copyOver = true;
2264
2265                 if ((m = inNextStack[next]) == -1) {
2266                     m = nnext++;
2267                     nextStack[m] = next;
2268                     inNextStack[next] = m;
2269                 } else {
2270                     copyOver = isBetterCapture(ncap, nextCapBegin + m * ncap, nextCapEnd + m * ncap,
2271                                                capBegin, capEnd);
2272                 }
2273                 if (copyOver) {
2274                     memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2275                     memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2276                 }
2277
2278                 sleeping.removeAt(j);
2279             } else {
2280                 ++j;
2281             }
2282         }
2283 #endif
2284 #endif
2285         for (j = 0; j < nnext; j++)
2286             inNextStack[nextStack[j]] = -1;
2287
2288         // avoid needless iteration that confuses oneTestMatchedLen
2289         if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState
2290 #ifndef QT_NO_REGEXP_BACKREF
2291              && sleeping.isEmpty()
2292 #endif
2293            )
2294             stop = true;
2295
2296         qSwap(curStack, nextStack);
2297 #ifndef QT_NO_REGEXP_CAPTURE
2298         qSwap(curCapBegin, nextCapBegin);
2299         qSwap(curCapEnd, nextCapEnd);
2300 #endif
2301         ncur = nnext;
2302         nnext = 0;
2303         ++i;
2304     }
2305
2306 #ifndef QT_NO_REGEXP_BACKREF
2307     /*
2308       If minimal matching is enabled, we might have some sleepers
2309       left.
2310     */
2311     if (!sleeping.isEmpty())
2312         sleeping.clear();
2313 #endif
2314
2315     oneTestMatchedLen = i - 1;
2316     return (matchLen >= 0);
2317 }
2318
2319 #ifndef QT_NO_REGEXP_CCLASS
2320
2321 QRegExpCharClass::QRegExpCharClass()
2322     : c(0), n(false)
2323 {
2324 #ifndef QT_NO_REGEXP_OPTIM
2325     occ1.fill(NoOccurrence, NumBadChars);
2326 #endif
2327 }
2328
2329 QRegExpCharClass &QRegExpCharClass::operator=(const QRegExpCharClass &cc)
2330 {
2331     c = cc.c;
2332     r = cc.r;
2333     n = cc.n;
2334 #ifndef QT_NO_REGEXP_OPTIM
2335     occ1 = cc.occ1;
2336 #endif
2337     return *this;
2338 }
2339
2340 void QRegExpCharClass::clear()
2341 {
2342     c = 0;
2343     r.resize(0);
2344     n = false;
2345 }
2346
2347 void QRegExpCharClass::setNegative(bool negative)
2348 {
2349     n = negative;
2350 #ifndef QT_NO_REGEXP_OPTIM
2351     occ1.fill(0, NumBadChars);
2352 #endif
2353 }
2354
2355 void QRegExpCharClass::addCategories(uint cats)
2356 {
2357     static const int all_cats = FLAG(QChar::Mark_NonSpacing) |
2358                                 FLAG(QChar::Mark_SpacingCombining) |
2359                                 FLAG(QChar::Mark_Enclosing) |
2360                                 FLAG(QChar::Number_DecimalDigit) |
2361                                 FLAG(QChar::Number_Letter) |
2362                                 FLAG(QChar::Number_Other) |
2363                                 FLAG(QChar::Separator_Space) |
2364                                 FLAG(QChar::Separator_Line) |
2365                                 FLAG(QChar::Separator_Paragraph) |
2366                                 FLAG(QChar::Other_Control) |
2367                                 FLAG(QChar::Other_Format) |
2368                                 FLAG(QChar::Other_Surrogate) |
2369                                 FLAG(QChar::Other_PrivateUse) |
2370                                 FLAG(QChar::Other_NotAssigned) |
2371                                 FLAG(QChar::Letter_Uppercase) |
2372                                 FLAG(QChar::Letter_Lowercase) |
2373                                 FLAG(QChar::Letter_Titlecase) |
2374                                 FLAG(QChar::Letter_Modifier) |
2375                                 FLAG(QChar::Letter_Other) |
2376                                 FLAG(QChar::Punctuation_Connector) |
2377                                 FLAG(QChar::Punctuation_Dash) |
2378                                 FLAG(QChar::Punctuation_Open) |
2379                                 FLAG(QChar::Punctuation_Close) |
2380                                 FLAG(QChar::Punctuation_InitialQuote) |
2381                                 FLAG(QChar::Punctuation_FinalQuote) |
2382                                 FLAG(QChar::Punctuation_Other) |
2383                                 FLAG(QChar::Symbol_Math) |
2384                                 FLAG(QChar::Symbol_Currency) |
2385                                 FLAG(QChar::Symbol_Modifier) |
2386                                 FLAG(QChar::Symbol_Other);
2387     c |= (all_cats & cats);
2388 #ifndef QT_NO_REGEXP_OPTIM
2389     occ1.fill(0, NumBadChars);
2390 #endif
2391 }
2392
2393 void QRegExpCharClass::addRange(ushort from, ushort to)
2394 {
2395     if (from > to)
2396         qSwap(from, to);
2397     int m = r.size();
2398     r.resize(m + 1);
2399     r[m].from = from;
2400     r[m].len = to - from + 1;
2401
2402 #ifndef QT_NO_REGEXP_OPTIM
2403     int i;
2404
2405     if (to - from < NumBadChars) {
2406         if (from % NumBadChars <= to % NumBadChars) {
2407             for (i = from % NumBadChars; i <= to % NumBadChars; i++)
2408                 occ1[i] = 0;
2409         } else {
2410             for (i = 0; i <= to % NumBadChars; i++)
2411                 occ1[i] = 0;
2412             for (i = from % NumBadChars; i < NumBadChars; i++)
2413                 occ1[i] = 0;
2414         }
2415     } else {
2416         occ1.fill(0, NumBadChars);
2417     }
2418 #endif
2419 }
2420
2421 bool QRegExpCharClass::in(QChar ch) const
2422 {
2423 #ifndef QT_NO_REGEXP_OPTIM
2424     if (occ1.at(BadChar(ch)) == NoOccurrence)
2425         return n;
2426 #endif
2427
2428     if (c != 0 && (c & FLAG(ch.category())) != 0)
2429         return !n;
2430
2431     const int uc = ch.unicode();
2432     int size = r.size();
2433
2434     for (int i = 0; i < size; ++i) {
2435         const QRegExpCharClassRange &range = r.at(i);
2436         if (uint(uc - range.from) < uint(r.at(i).len))
2437             return !n;
2438     }
2439     return n;
2440 }
2441
2442 #if defined(QT_DEBUG)
2443 void QRegExpCharClass::dump() const
2444 {
2445     int i;
2446     qDebug("    %stive character class", n ? "nega" : "posi");
2447 #ifndef QT_NO_REGEXP_CCLASS
2448     if (c != 0)
2449         qDebug("      categories 0x%.8x", c);
2450 #endif
2451     for (i = 0; i < r.size(); i++)
2452         qDebug("      0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);
2453 }
2454 #endif
2455 #endif
2456
2457 QRegExpEngine::Box::Box(QRegExpEngine *engine)
2458     : eng(engine), skipanchors(0)
2459 #ifndef QT_NO_REGEXP_OPTIM
2460       , earlyStart(0), lateStart(0), maxl(0)
2461 #endif
2462 {
2463 #ifndef QT_NO_REGEXP_OPTIM
2464     occ1.fill(NoOccurrence, NumBadChars);
2465 #endif
2466     minl = 0;
2467 }
2468
2469 QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
2470 {
2471     eng = b.eng;
2472     ls = b.ls;
2473     rs = b.rs;
2474     lanchors = b.lanchors;
2475     ranchors = b.ranchors;
2476     skipanchors = b.skipanchors;
2477 #ifndef QT_NO_REGEXP_OPTIM
2478     earlyStart = b.earlyStart;
2479     lateStart = b.lateStart;
2480     str = b.str;
2481     leftStr = b.leftStr;
2482     rightStr = b.rightStr;
2483     maxl = b.maxl;
2484     occ1 = b.occ1;
2485 #endif
2486     minl = b.minl;
2487     return *this;
2488 }
2489
2490 void QRegExpEngine::Box::set(QChar ch)
2491 {
2492     ls.resize(1);
2493     ls[0] = eng->createState(ch);
2494     rs = ls;
2495 #ifndef QT_NO_REGEXP_OPTIM
2496     str = ch;
2497     leftStr = ch;
2498     rightStr = ch;
2499     maxl = 1;
2500     occ1[BadChar(ch)] = 0;
2501 #endif
2502     minl = 1;
2503 }
2504
2505 void QRegExpEngine::Box::set(const QRegExpCharClass &cc)
2506 {
2507     ls.resize(1);
2508     ls[0] = eng->createState(cc);
2509     rs = ls;
2510 #ifndef QT_NO_REGEXP_OPTIM
2511     maxl = 1;
2512     occ1 = cc.firstOccurrence();
2513 #endif
2514     minl = 1;
2515 }
2516
2517 #ifndef QT_NO_REGEXP_BACKREF
2518 void QRegExpEngine::Box::set(int bref)
2519 {
2520     ls.resize(1);
2521     ls[0] = eng->createState(bref);
2522     rs = ls;
2523     if (bref >= 1 && bref <= MaxBackRefs)
2524         skipanchors = Anchor_BackRef0Empty << bref;
2525 #ifndef QT_NO_REGEXP_OPTIM
2526     maxl = InftyLen;
2527 #endif
2528     minl = 0;
2529 }
2530 #endif
2531
2532 void QRegExpEngine::Box::cat(const Box &b)
2533 {
2534     eng->addCatTransitions(rs, b.ls);
2535     addAnchorsToEngine(b);
2536     if (minl == 0) {
2537         lanchors.unite(b.lanchors);
2538         if (skipanchors != 0) {
2539             for (int i = 0; i < b.ls.size(); i++) {
2540                 int a = eng->anchorConcatenation(lanchors.value(b.ls.at(i), 0), skipanchors);
2541                 lanchors.insert(b.ls.at(i), a);
2542             }
2543         }
2544         mergeInto(&ls, b.ls);
2545     }
2546     if (b.minl == 0) {
2547         ranchors.unite(b.ranchors);
2548         if (b.skipanchors != 0) {
2549             for (int i = 0; i < rs.size(); i++) {
2550                 int a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), b.skipanchors);
2551                 ranchors.insert(rs.at(i), a);
2552             }
2553         }
2554         mergeInto(&rs, b.rs);
2555     } else {
2556         ranchors = b.ranchors;
2557         rs = b.rs;
2558     }
2559
2560 #ifndef QT_NO_REGEXP_OPTIM
2561     if (maxl != InftyLen) {
2562         if (rightStr.length() + b.leftStr.length() >
2563              qMax(str.length(), b.str.length())) {
2564             earlyStart = minl - rightStr.length();
2565             lateStart = maxl - rightStr.length();
2566             str = rightStr + b.leftStr;
2567         } else if (b.str.length() > str.length()) {
2568             earlyStart = minl + b.earlyStart;
2569             lateStart = maxl + b.lateStart;
2570             str = b.str;
2571         }
2572     }
2573
2574     if (leftStr.length() == maxl)
2575         leftStr += b.leftStr;
2576
2577     if (b.rightStr.length() == b.maxl) {
2578         rightStr += b.rightStr;
2579     } else {
2580         rightStr = b.rightStr;
2581     }
2582
2583     if (maxl == InftyLen || b.maxl == InftyLen) {
2584         maxl = InftyLen;
2585     } else {
2586         maxl += b.maxl;
2587     }
2588
2589     for (int i = 0; i < NumBadChars; i++) {
2590         if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))
2591             occ1[i] = minl + b.occ1.at(i);
2592     }
2593 #endif
2594
2595     minl += b.minl;
2596     if (minl == 0)
2597         skipanchors = eng->anchorConcatenation(skipanchors, b.skipanchors);
2598     else
2599         skipanchors = 0;
2600 }
2601
2602 void QRegExpEngine::Box::orx(const Box &b)
2603 {
2604     mergeInto(&ls, b.ls);
2605     lanchors.unite(b.lanchors);
2606     mergeInto(&rs, b.rs);
2607     ranchors.unite(b.ranchors);
2608
2609     if (b.minl == 0) {
2610         if (minl == 0)
2611             skipanchors = eng->anchorAlternation(skipanchors, b.skipanchors);
2612         else
2613             skipanchors = b.skipanchors;
2614     }
2615
2616 #ifndef QT_NO_REGEXP_OPTIM
2617     for (int i = 0; i < NumBadChars; i++) {
2618         if (occ1.at(i) > b.occ1.at(i))
2619             occ1[i] = b.occ1.at(i);
2620     }
2621     earlyStart = 0;
2622     lateStart = 0;
2623     str = QString();
2624     leftStr = QString();
2625     rightStr = QString();
2626     if (b.maxl > maxl)
2627         maxl = b.maxl;
2628 #endif
2629     if (b.minl < minl)
2630         minl = b.minl;
2631 }
2632
2633 void QRegExpEngine::Box::plus(int atom)
2634 {
2635 #ifndef QT_NO_REGEXP_CAPTURE
2636     eng->addPlusTransitions(rs, ls, atom);
2637 #else
2638     Q_UNUSED(atom);
2639     eng->addCatTransitions(rs, ls);
2640 #endif
2641     addAnchorsToEngine(*this);
2642 #ifndef QT_NO_REGEXP_OPTIM
2643     maxl = InftyLen;
2644 #endif
2645 }
2646
2647 void QRegExpEngine::Box::opt()
2648 {
2649 #ifndef QT_NO_REGEXP_OPTIM
2650     earlyStart = 0;
2651     lateStart = 0;
2652     str = QString();
2653     leftStr = QString();
2654     rightStr = QString();
2655 #endif
2656     skipanchors = 0;
2657     minl = 0;
2658 }
2659
2660 void QRegExpEngine::Box::catAnchor(int a)
2661 {
2662     if (a != 0) {
2663         for (int i = 0; i < rs.size(); i++) {
2664             a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), a);
2665             ranchors.insert(rs.at(i), a);
2666         }
2667         if (minl == 0)
2668             skipanchors = eng->anchorConcatenation(skipanchors, a);
2669     }
2670 }
2671
2672 #ifndef QT_NO_REGEXP_OPTIM
2673 void QRegExpEngine::Box::setupHeuristics()
2674 {
2675     eng->goodEarlyStart = earlyStart;
2676     eng->goodLateStart = lateStart;
2677     eng->goodStr = eng->cs ? str : str.toLower();
2678
2679     eng->minl = minl;
2680     if (eng->cs) {
2681         /*
2682           A regular expression such as 112|1 has occ1['2'] = 2 and minl =
2683           1 at this point. An entry of occ1 has to be at most minl or
2684           infinity for the rest of the algorithm to go well.
2685
2686           We waited until here before normalizing these cases (instead of
2687           doing it in Box::orx()) because sometimes things improve by
2688           themselves. Consider for example (112|1)34.
2689         */
2690         for (int i = 0; i < NumBadChars; i++) {
2691             if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)
2692                 occ1[i] = minl;
2693         }
2694         eng->occ1 = occ1;
2695     } else {
2696         eng->occ1.fill(0, NumBadChars);
2697     }
2698
2699     eng->heuristicallyChooseHeuristic();
2700 }
2701 #endif
2702
2703 #if defined(QT_DEBUG)
2704 void QRegExpEngine::Box::dump() const
2705 {
2706     int i;
2707     qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s");
2708     qDebug("  Left states:");
2709     for (i = 0; i < ls.size(); i++) {
2710         if (lanchors.value(ls[i], 0) == 0)
2711             qDebug("    %d", ls[i]);
2712         else
2713             qDebug("    %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);
2714     }
2715     qDebug("  Right states:");
2716     for (i = 0; i < rs.size(); i++) {
2717         if (ranchors.value(rs[i], 0) == 0)
2718             qDebug("    %d", rs[i]);
2719         else
2720             qDebug("    %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);
2721     }
2722     qDebug("  Skip anchors: 0x%.8x", skipanchors);
2723 }
2724 #endif
2725
2726 void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const
2727 {
2728     for (int i = 0; i < to.ls.size(); i++) {
2729         for (int j = 0; j < rs.size(); j++) {
2730             int a = eng->anchorConcatenation(ranchors.value(rs.at(j), 0),
2731                                              to.lanchors.value(to.ls.at(i), 0));
2732             eng->addAnchors(rs[j], to.ls[i], a);
2733         }
2734     }
2735 }
2736
2737 #ifndef QT_NO_REGEXP_CCLASS
2738 // fast lookup hash for xml schema extensions
2739 // sorted by name for b-search
2740 static const struct CategoriesRangeMapEntry {
2741     const char name[40];
2742     uint first, second;
2743 } categoriesRangeMap[] = {
2744     { "AegeanNumbers",                        0x10100, 0x1013F },
2745     { "AlphabeticPresentationForms",          0xFB00, 0xFB4F },
2746     { "AncientGreekMusicalNotation",          0x1D200, 0x1D24F },
2747     { "AncientGreekNumbers",                  0x10140, 0x1018F },
2748     { "Arabic",                               0x0600, 0x06FF },
2749     { "ArabicPresentationForms-A",            0xFB50, 0xFDFF },
2750     { "ArabicPresentationForms-B",            0xFE70, 0xFEFF },
2751     { "ArabicSupplement",                     0x0750, 0x077F },
2752     { "Armenian",                             0x0530, 0x058F },
2753     { "Arrows",                               0x2190, 0x21FF },
2754     { "BasicLatin",                           0x0000, 0x007F },
2755     { "Bengali",                              0x0980, 0x09FF },
2756     { "BlockElements",                        0x2580, 0x259F },
2757     { "Bopomofo",                             0x3100, 0x312F },
2758     { "BopomofoExtended",                     0x31A0, 0x31BF },
2759     { "BoxDrawing",                           0x2500, 0x257F },
2760     { "BraillePatterns",                      0x2800, 0x28FF },
2761     { "Buginese",                             0x1A00, 0x1A1F },
2762     { "Buhid",                                0x1740, 0x175F },
2763     { "ByzantineMusicalSymbols",              0x1D000, 0x1D0FF },
2764     { "CJKCompatibility",                     0x3300, 0x33FF },
2765     { "CJKCompatibilityForms",                0xFE30, 0xFE4F },
2766     { "CJKCompatibilityIdeographs",           0xF900, 0xFAFF },
2767     { "CJKCompatibilityIdeographsSupplement", 0x2F800, 0x2FA1F },
2768     { "CJKRadicalsSupplement",                0x2E80, 0x2EFF },
2769     { "CJKStrokes",                           0x31C0, 0x31EF },
2770     { "CJKSymbolsandPunctuation",             0x3000, 0x303F },
2771     { "CJKUnifiedIdeographs",                 0x4E00, 0x9FFF },
2772     { "CJKUnifiedIdeographsExtensionA",       0x3400, 0x4DB5 },
2773     { "CJKUnifiedIdeographsExtensionB",       0x20000, 0x2A6DF },
2774     { "Cherokee",                             0x13A0, 0x13FF },
2775     { "CombiningDiacriticalMarks",            0x0300, 0x036F },
2776     { "CombiningDiacriticalMarksSupplement",  0x1DC0, 0x1DFF },
2777     { "CombiningHalfMarks",                   0xFE20, 0xFE2F },
2778     { "CombiningMarksforSymbols",             0x20D0, 0x20FF },
2779     { "ControlPictures",                      0x2400, 0x243F },
2780     { "Coptic",                               0x2C80, 0x2CFF },
2781     { "CurrencySymbols",                      0x20A0, 0x20CF },
2782     { "CypriotSyllabary",                     0x10800, 0x1083F },
2783     { "Cyrillic",                             0x0400, 0x04FF },
2784     { "CyrillicSupplement",                   0x0500, 0x052F },
2785     { "Deseret",                              0x10400, 0x1044F },
2786     { "Devanagari",                           0x0900, 0x097F },
2787     { "Dingbats",                             0x2700, 0x27BF },
2788     { "EnclosedAlphanumerics",                0x2460, 0x24FF },
2789     { "EnclosedCJKLettersandMonths",          0x3200, 0x32FF },
2790     { "Ethiopic",                             0x1200, 0x137F },
2791     { "EthiopicExtended",                     0x2D80, 0x2DDF },
2792     { "EthiopicSupplement",                   0x1380, 0x139F },
2793     { "GeneralPunctuation",                   0x2000, 0x206F },
2794     { "GeometricShapes",                      0x25A0, 0x25FF },
2795     { "Georgian",                             0x10A0, 0x10FF },
2796     { "GeorgianSupplement",                   0x2D00, 0x2D2F },
2797     { "Glagolitic",                           0x2C00, 0x2C5F },
2798     { "Gothic",                               0x10330, 0x1034F },
2799     { "Greek",                                0x0370, 0x03FF },
2800     { "GreekExtended",                        0x1F00, 0x1FFF },
2801     { "Gujarati",                             0x0A80, 0x0AFF },
2802     { "Gurmukhi",                             0x0A00, 0x0A7F },
2803     { "HalfwidthandFullwidthForms",           0xFF00, 0xFFEF },
2804     { "HangulCompatibilityJamo",              0x3130, 0x318F },
2805     { "HangulJamo",                           0x1100, 0x11FF },
2806     { "HangulSyllables",                      0xAC00, 0xD7A3 },
2807     { "Hanunoo",                              0x1720, 0x173F },
2808     { "Hebrew",                               0x0590, 0x05FF },
2809     { "Hiragana",                             0x3040, 0x309F },
2810     { "IPAExtensions",                        0x0250, 0x02AF },
2811     { "IdeographicDescriptionCharacters",     0x2FF0, 0x2FFF },
2812     { "Kanbun",                               0x3190, 0x319F },
2813     { "KangxiRadicals",                       0x2F00, 0x2FDF },
2814     { "Kannada",                              0x0C80, 0x0CFF },
2815     { "Katakana",                             0x30A0, 0x30FF },
2816     { "KatakanaPhoneticExtensions",           0x31F0, 0x31FF },
2817     { "Kharoshthi",                           0x10A00, 0x10A5F },
2818     { "Khmer",                                0x1780, 0x17FF },
2819     { "KhmerSymbols",                         0x19E0, 0x19FF },
2820     { "Lao",                                  0x0E80, 0x0EFF },
2821     { "Latin-1Supplement",                    0x0080, 0x00FF },
2822     { "LatinExtended-A",                      0x0100, 0x017F },
2823     { "LatinExtended-B",                      0x0180, 0x024F },
2824     { "LatinExtendedAdditional",              0x1E00, 0x1EFF },
2825     { "LetterlikeSymbols",                    0x2100, 0x214F },
2826     { "Limbu",                                0x1900, 0x194F },
2827     { "LinearBIdeograms",                     0x10080, 0x100FF },
2828     { "LinearBSyllabary",                     0x10000, 0x1007F },
2829     { "Malayalam",                            0x0D00, 0x0D7F },
2830     { "MathematicalAlphanumericSymbols",      0x1D400, 0x1D7FF },
2831     { "MathematicalOperators",                0x2200, 0x22FF },
2832     { "MiscellaneousMathematicalSymbols-A",   0x27C0, 0x27EF },
2833     { "MiscellaneousMathematicalSymbols-B",   0x2980, 0x29FF },
2834     { "MiscellaneousSymbols",                 0x2600, 0x26FF },
2835     { "MiscellaneousSymbolsandArrows",        0x2B00, 0x2BFF },
2836     { "MiscellaneousTechnical",               0x2300, 0x23FF },
2837     { "ModifierToneLetters",                  0xA700, 0xA71F },
2838     { "Mongolian",                            0x1800, 0x18AF },
2839     { "MusicalSymbols",                       0x1D100, 0x1D1FF },
2840     { "Myanmar",                              0x1000, 0x109F },
2841     { "NewTaiLue",                            0x1980, 0x19DF },
2842     { "NumberForms",                          0x2150, 0x218F },
2843     { "Ogham",                                0x1680, 0x169F },
2844     { "OldItalic",                            0x10300, 0x1032F },
2845     { "OldPersian",                           0x103A0, 0x103DF },
2846     { "OpticalCharacterRecognition",          0x2440, 0x245F },
2847     { "Oriya",                                0x0B00, 0x0B7F },
2848     { "Osmanya",                              0x10480, 0x104AF },
2849     { "PhoneticExtensions",                   0x1D00, 0x1D7F },
2850     { "PhoneticExtensionsSupplement",         0x1D80, 0x1DBF },
2851     { "PrivateUse",                           0xE000, 0xF8FF },
2852     { "Runic",                                0x16A0, 0x16FF },
2853     { "Shavian",                              0x10450, 0x1047F },
2854     { "Sinhala",                              0x0D80, 0x0DFF },
2855     { "SmallFormVariants",                    0xFE50, 0xFE6F },
2856     { "SpacingModifierLetters",               0x02B0, 0x02FF },
2857     { "Specials",                             0xFFF0, 0xFFFF },
2858     { "SuperscriptsandSubscripts",            0x2070, 0x209F },
2859     { "SupplementalArrows-A",                 0x27F0, 0x27FF },
2860     { "SupplementalArrows-B",                 0x2900, 0x297F },
2861     { "SupplementalMathematicalOperators",    0x2A00, 0x2AFF },
2862     { "SupplementalPunctuation",              0x2E00, 0x2E7F },
2863     { "SupplementaryPrivateUseArea-A",        0xF0000, 0xFFFFF },
2864     { "SupplementaryPrivateUseArea-B",        0x100000, 0x10FFFF },
2865     { "SylotiNagri",                          0xA800, 0xA82F },
2866     { "Syriac",                               0x0700, 0x074F },
2867     { "Tagalog",                              0x1700, 0x171F },
2868     { "Tagbanwa",                             0x1760, 0x177F },
2869     { "Tags",                                 0xE0000, 0xE007F },
2870     { "TaiLe",                                0x1950, 0x197F },
2871     { "TaiXuanJingSymbols",                   0x1D300, 0x1D35F },
2872     { "Tamil",                                0x0B80, 0x0BFF },
2873     { "Telugu",                               0x0C00, 0x0C7F },
2874     { "Thaana",                               0x0780, 0x07BF },
2875     { "Thai",                                 0x0E00, 0x0E7F },
2876     { "Tibetan",                              0x0F00, 0x0FFF },
2877     { "Tifinagh",                             0x2D30, 0x2D7F },
2878     { "Ugaritic",                             0x10380, 0x1039F },
2879     { "UnifiedCanadianAboriginalSyllabics",   0x1400, 0x167F },
2880     { "VariationSelectors",                   0xFE00, 0xFE0F },
2881     { "VariationSelectorsSupplement",         0xE0100, 0xE01EF },
2882     { "VerticalForms",                        0xFE10, 0xFE1F },
2883     { "YiRadicals",                           0xA490, 0xA4CF },
2884     { "YiSyllables",                          0xA000, 0xA48F },
2885     { "YijingHexagramSymbols",                0x4DC0, 0x4DFF }
2886 };
2887
2888 inline bool operator<(const char *name, const CategoriesRangeMapEntry &entry)
2889 { return qstrcmp(name, entry.name) < 0; }
2890 inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name)
2891 { return qstrcmp(entry.name, name) < 0; }
2892 #endif // QT_NO_REGEXP_CCLASS
2893
2894 int QRegExpEngine::getChar()
2895 {
2896     return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();
2897 }
2898
2899 int QRegExpEngine::getEscape()
2900 {
2901 #ifndef QT_NO_REGEXP_ESCAPE
2902     const char tab[] = "afnrtv"; // no b, as \b means word boundary
2903     const char backTab[] = "\a\f\n\r\t\v";
2904     ushort low;
2905     int i;
2906 #endif
2907     ushort val;
2908     int prevCh = yyCh;
2909
2910     if (prevCh == EOS) {
2911         error(RXERR_END);
2912         return Tok_Char | '\\';
2913     }
2914     yyCh = getChar();
2915 #ifndef QT_NO_REGEXP_ESCAPE
2916     if ((prevCh & ~0xff) == 0) {
2917         const char *p = strchr(tab, prevCh);
2918         if (p != 0)
2919             return Tok_Char | backTab[p - tab];
2920     }
2921 #endif
2922
2923     switch (prevCh) {
2924 #ifndef QT_NO_REGEXP_ESCAPE
2925     case '0':
2926         val = 0;
2927         for (i = 0; i < 3; i++) {
2928             if (yyCh >= '0' && yyCh <= '7')
2929                 val = (val << 3) | (yyCh - '0');
2930             else
2931                 break;
2932             yyCh = getChar();
2933         }
2934         if ((val & ~0377) != 0)
2935             error(RXERR_OCTAL);
2936         return Tok_Char | val;
2937 #endif
2938 #ifndef QT_NO_REGEXP_ESCAPE
2939     case 'B':
2940         return Tok_NonWord;
2941 #endif
2942 #ifndef QT_NO_REGEXP_CCLASS
2943     case 'D':
2944         // see QChar::isDigit()
2945         yyCharClass->addCategories(uint(-1) ^ FLAG(QChar::Number_DecimalDigit));
2946         return Tok_CharClass;
2947     case 'S':
2948         // see QChar::isSpace()
2949         yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Separator_Space) |
2950                                                FLAG(QChar::Separator_Line) |
2951                                                FLAG(QChar::Separator_Paragraph) |
2952                                                FLAG(QChar::Other_Control)));
2953         yyCharClass->addRange(0x0000, 0x0008);
2954         yyCharClass->addRange(0x000e, 0x001f);
2955         yyCharClass->addRange(0x007f, 0x0084);
2956         yyCharClass->addRange(0x0086, 0x009f);
2957         return Tok_CharClass;
2958     case 'W':
2959         // see QChar::isLetterOrNumber() and QChar::isMark()
2960         yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Mark_NonSpacing) |
2961                                                FLAG(QChar::Mark_SpacingCombining) |
2962                                                FLAG(QChar::Mark_Enclosing) |
2963                                                FLAG(QChar::Number_DecimalDigit) |
2964                                                FLAG(QChar::Number_Letter) |
2965                                                FLAG(QChar::Number_Other) |
2966                                                FLAG(QChar::Letter_Uppercase) |
2967                                                FLAG(QChar::Letter_Lowercase) |
2968                                                FLAG(QChar::Letter_Titlecase) |
2969                                                FLAG(QChar::Letter_Modifier) |
2970                                                FLAG(QChar::Letter_Other) |
2971                                                FLAG(QChar::Punctuation_Connector)));
2972         yyCharClass->addRange(0x203f, 0x2040);
2973         yyCharClass->addSingleton(0x2040);
2974         yyCharClass->addSingleton(0x2054);
2975         yyCharClass->addSingleton(0x30fb);
2976         yyCharClass->addRange(0xfe33, 0xfe34);
2977         yyCharClass->addRange(0xfe4d, 0xfe4f);
2978         yyCharClass->addSingleton(0xff3f);
2979         yyCharClass->addSingleton(0xff65);
2980         return Tok_CharClass;
2981 #endif
2982 #ifndef QT_NO_REGEXP_ESCAPE
2983     case 'b':
2984         return Tok_Word;
2985 #endif
2986 #ifndef QT_NO_REGEXP_CCLASS
2987     case 'd':
2988         // see QChar::isDigit()
2989         yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit));
2990         return Tok_CharClass;
2991     case 's':
2992         // see QChar::isSpace()
2993         yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
2994                                    FLAG(QChar::Separator_Line) |
2995                                    FLAG(QChar::Separator_Paragraph));
2996         yyCharClass->addRange(0x0009, 0x000d);
2997         yyCharClass->addSingleton(0x0085);
2998         return Tok_CharClass;
2999     case 'w':
3000         // see QChar::isLetterOrNumber() and QChar::isMark()
3001         yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3002                                    FLAG(QChar::Mark_SpacingCombining) |
3003                                    FLAG(QChar::Mark_Enclosing) |
3004                                    FLAG(QChar::Number_DecimalDigit) |
3005                                    FLAG(QChar::Number_Letter) |
3006                                    FLAG(QChar::Number_Other) |
3007                                    FLAG(QChar::Letter_Uppercase) |
3008                                    FLAG(QChar::Letter_Lowercase) |
3009                                    FLAG(QChar::Letter_Titlecase) |
3010                                    FLAG(QChar::Letter_Modifier) |
3011                                    FLAG(QChar::Letter_Other));
3012         yyCharClass->addSingleton(0x005f); // '_'
3013         return Tok_CharClass;
3014     case 'I':
3015         if (xmlSchemaExtensions) {
3016             yyCharClass->setNegative(!yyCharClass->negative());
3017             // fall through
3018         }
3019     case 'i':
3020         if (xmlSchemaExtensions) {
3021             yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3022                                        FLAG(QChar::Mark_SpacingCombining) |
3023                                        FLAG(QChar::Mark_Enclosing) |
3024                                        FLAG(QChar::Number_DecimalDigit) |
3025                                        FLAG(QChar::Number_Letter) |
3026                                        FLAG(QChar::Number_Other) |
3027                                        FLAG(QChar::Letter_Uppercase) |
3028                                        FLAG(QChar::Letter_Lowercase) |
3029                                        FLAG(QChar::Letter_Titlecase) |
3030                                        FLAG(QChar::Letter_Modifier) |
3031                                        FLAG(QChar::Letter_Other));
3032             yyCharClass->addSingleton(0x003a); // ':'
3033             yyCharClass->addSingleton(0x005f); // '_'
3034             yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
3035             yyCharClass->addRange(0x0061, 0x007a); // [a-z]
3036             yyCharClass->addRange(0xc0, 0xd6);
3037             yyCharClass->addRange(0xd8, 0xf6);
3038             yyCharClass->addRange(0xf8, 0x2ff);
3039             yyCharClass->addRange(0x370, 0x37d);
3040             yyCharClass->addRange(0x37f, 0x1fff);
3041             yyCharClass->addRange(0x200c, 0x200d);
3042             yyCharClass->addRange(0x2070, 0x218f);
3043             yyCharClass->addRange(0x2c00, 0x2fef);
3044             yyCharClass->addRange(0x3001, 0xd7ff);
3045             yyCharClass->addRange(0xf900, 0xfdcf);
3046             yyCharClass->addRange(0xfdf0, 0xfffd);
3047             yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
3048         }
3049         return Tok_CharClass;
3050     case 'C':
3051         if (xmlSchemaExtensions) {
3052             yyCharClass->setNegative(!yyCharClass->negative());
3053             // fall through
3054         }
3055     case 'c':
3056         if (xmlSchemaExtensions) {
3057             yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3058                                        FLAG(QChar::Mark_SpacingCombining) |
3059                                        FLAG(QChar::Mark_Enclosing) |
3060                                        FLAG(QChar::Number_DecimalDigit) |
3061                                        FLAG(QChar::Number_Letter) |
3062                                        FLAG(QChar::Number_Other) |
3063                                        FLAG(QChar::Letter_Uppercase) |
3064                                        FLAG(QChar::Letter_Lowercase) |
3065                                        FLAG(QChar::Letter_Titlecase) |
3066                                        FLAG(QChar::Letter_Modifier) |
3067                                        FLAG(QChar::Letter_Other));
3068             yyCharClass->addSingleton(0x002d); // '-'
3069             yyCharClass->addSingleton(0x002e); // '.'
3070             yyCharClass->addSingleton(0x003a); // ':'
3071             yyCharClass->addSingleton(0x005f); // '_'
3072             yyCharClass->addSingleton(0xb7);
3073             yyCharClass->addRange(0x0030, 0x0039); // [0-9]
3074             yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
3075             yyCharClass->addRange(0x0061, 0x007a); // [a-z]
3076             yyCharClass->addRange(0xc0, 0xd6);
3077             yyCharClass->addRange(0xd8, 0xf6);
3078             yyCharClass->addRange(0xf8, 0x2ff);
3079             yyCharClass->addRange(0x370, 0x37d);
3080             yyCharClass->addRange(0x37f, 0x1fff);
3081             yyCharClass->addRange(0x200c, 0x200d);
3082             yyCharClass->addRange(0x2070, 0x218f);
3083             yyCharClass->addRange(0x2c00, 0x2fef);
3084             yyCharClass->addRange(0x3001, 0xd7ff);
3085             yyCharClass->addRange(0xf900, 0xfdcf);
3086             yyCharClass->addRange(0xfdf0, 0xfffd);
3087             yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
3088             yyCharClass->addRange(0x0300, 0x036f);
3089             yyCharClass->addRange(0x203f, 0x2040);
3090         }
3091         return Tok_CharClass;
3092     case 'P':
3093         if (xmlSchemaExtensions) {
3094             yyCharClass->setNegative(!yyCharClass->negative());
3095             // fall through
3096         }
3097     case 'p':
3098         if (xmlSchemaExtensions) {
3099             if (yyCh != '{') {
3100                 error(RXERR_CHARCLASS);
3101                 return Tok_CharClass;
3102             }
3103
3104             QByteArray category;
3105             yyCh = getChar();
3106             while (yyCh != '}') {
3107                 if (yyCh == EOS) {
3108                     error(RXERR_END);
3109                     return Tok_CharClass;
3110                 }
3111                 category.append(yyCh);
3112                 yyCh = getChar();
3113             }
3114             yyCh = getChar(); // skip closing '}'
3115
3116             int catlen = category.length();
3117             if (catlen == 1 || catlen == 2) {
3118                 switch (category.at(0)) {
3119                 case 'M':
3120                     if (catlen == 1) {
3121                         yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3122                                                    FLAG(QChar::Mark_SpacingCombining) |
3123                                                    FLAG(QChar::Mark_Enclosing));
3124                     } else {
3125                         switch (category.at(1)) {
3126                         case 'n': yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn
3127                         case 'c': yyCharClass->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc
3128                         case 'e': yyCharClass->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me
3129                         default: error(RXERR_CATEGORY); break;
3130                         }
3131                     }
3132                     break;
3133                 case 'N':
3134                     if (catlen == 1) {
3135                         yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit) |
3136                                                    FLAG(QChar::Number_Letter) |
3137                                                    FLAG(QChar::Number_Other));
3138                     } else {
3139                         switch (category.at(1)) {
3140                         case 'd': yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd
3141                         case 'l': yyCharClass->addCategories(FLAG(QChar::Number_Letter)); break; // Hl
3142                         case 'o': yyCharClass->addCategories(FLAG(QChar::Number_Other)); break; // No
3143                         default: error(RXERR_CATEGORY); break;
3144                         }
3145                     }
3146                     break;
3147                 case 'Z':
3148                     if (catlen == 1) {
3149                         yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
3150                                                    FLAG(QChar::Separator_Line) |
3151                                                    FLAG(QChar::Separator_Paragraph));
3152                     } else {
3153                         switch (category.at(1)) {
3154                         case 's': yyCharClass->addCategories(FLAG(QChar::Separator_Space)); break; // Zs
3155                         case 'l': yyCharClass->addCategories(FLAG(QChar::Separator_Line)); break; // Zl
3156                         case 'p': yyCharClass->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp
3157                         default: error(RXERR_CATEGORY); break;
3158                         }
3159                     }
3160                     break;
3161                 case 'C':
3162                     if (catlen == 1) {
3163                         yyCharClass->addCategories(FLAG(QChar::Other_Control) |
3164                                                    FLAG(QChar::Other_Format) |
3165                                                    FLAG(QChar::Other_Surrogate) |
3166                                                    FLAG(QChar::Other_PrivateUse) |
3167                                                    FLAG(QChar::Other_NotAssigned));
3168                     } else {
3169                         switch (category.at(1)) {
3170                         case 'c': yyCharClass->addCategories(FLAG(QChar::Other_Control)); break; // Cc
3171                         case 'f': yyCharClass->addCategories(FLAG(QChar::Other_Format)); break; // Cf
3172                         case 's': yyCharClass->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs
3173                         case 'o': yyCharClass->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co
3174                         case 'n': yyCharClass->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn
3175                         default: error(RXERR_CATEGORY); break;
3176                         }
3177                     }
3178                     break;
3179                 case 'L':
3180                     if (catlen == 1) {
3181                         yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase) |
3182                                                    FLAG(QChar::Letter_Lowercase) |
3183                                                    FLAG(QChar::Letter_Titlecase) |
3184                                                    FLAG(QChar::Letter_Modifier) |
3185                                                    FLAG(QChar::Letter_Other));
3186                     } else {
3187                         switch (category.at(1)) {
3188                         case 'u': yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu
3189                         case 'l': yyCharClass->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll
3190                         case 't': yyCharClass->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt
3191                         case 'm': yyCharClass->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm
3192                         case 'o': yyCharClass->addCategories(FLAG(QChar::Letter_Other)); break; // Lo
3193                         default: error(RXERR_CATEGORY); break;
3194                         }
3195                     }
3196                     break;
3197                 case 'P':
3198                     if (catlen == 1) {
3199                         yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector) |
3200                                                    FLAG(QChar::Punctuation_Dash) |
3201                                                    FLAG(QChar::Punctuation_Open) |
3202                                                    FLAG(QChar::Punctuation_Close) |
3203                                                    FLAG(QChar::Punctuation_InitialQuote) |
3204                                                    FLAG(QChar::Punctuation_FinalQuote) |
3205                                                    FLAG(QChar::Punctuation_Other));
3206                     } else {
3207                         switch (category.at(1)) {
3208                         case 'c': yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc
3209                         case 'd': yyCharClass->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd
3210                         case 's': yyCharClass->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps
3211                         case 'e': yyCharClass->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe
3212                         case 'i': yyCharClass->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi
3213                         case 'f': yyCharClass->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf
3214                         case 'o': yyCharClass->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po
3215                         default: error(RXERR_CATEGORY); break;
3216                         }
3217                     }
3218                     break;
3219                 case 'S':
3220                     if (catlen == 1) {
3221                         yyCharClass->addCategories(FLAG(QChar::Symbol_Math) |
3222                                                    FLAG(QChar::Symbol_Currency) |
3223                                                    FLAG(QChar::Symbol_Modifier) |
3224                                                    FLAG(QChar::Symbol_Other));
3225                     } else {
3226                         switch (category.at(1)) {
3227                         case 'm': yyCharClass->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm
3228                         case 'c': yyCharClass->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc
3229                         case 'k': yyCharClass->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk
3230                         case 'o': yyCharClass->addCategories(FLAG(QChar::Symbol_Other)); break; // So
3231                         default: error(RXERR_CATEGORY); break;
3232                         }
3233                     }
3234                     break;
3235                 default:
3236                     error(RXERR_CATEGORY);
3237                     break;
3238                 }
3239             } else if (catlen > 2 && category.at(0) == 'I' && category.at(1) == 's') {
3240                 static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[0]);
3241                 const CategoriesRangeMapEntry *r = qBinaryFind(categoriesRangeMap, categoriesRangeMap + N, category.constData() + 2);
3242                 if (r != categoriesRangeMap + N)
3243                     yyCharClass->addRange(r->first, r->second);
3244                 else
3245                     error(RXERR_CATEGORY);
3246             } else {
3247                 error(RXERR_CATEGORY);
3248             }
3249         }
3250         return Tok_CharClass;
3251 #endif
3252 #ifndef QT_NO_REGEXP_ESCAPE
3253     case 'x':
3254         val = 0;
3255         for (i = 0; i < 4; i++) {
3256             low = QChar(yyCh).toLower().unicode();
3257             if (low >= '0' && low <= '9')
3258                 val = (val << 4) | (low - '0');
3259             else if (low >= 'a' && low <= 'f')
3260                 val = (val << 4) | (low - 'a' + 10);
3261             else
3262                 break;
3263             yyCh = getChar();
3264         }
3265         return Tok_Char | val;
3266 #endif
3267     default:
3268         if (prevCh >= '1' && prevCh <= '9') {
3269 #ifndef QT_NO_REGEXP_BACKREF
3270             val = prevCh - '0';
3271             while (yyCh >= '0' && yyCh <= '9') {
3272                 val = (val * 10) + (yyCh - '0');
3273                 yyCh = getChar();
3274             }
3275             return Tok_BackRef | val;
3276 #else
3277             error(RXERR_DISABLED);
3278 #endif
3279         }
3280         return Tok_Char | prevCh;
3281     }
3282 }
3283
3284 #ifndef QT_NO_REGEXP_INTERVAL
3285 int QRegExpEngine::getRep(int def)
3286 {
3287     if (yyCh >= '0' && yyCh <= '9') {
3288         int rep = 0;
3289         do {
3290             rep = 10 * rep + yyCh - '0';
3291             if (rep >= InftyRep) {
3292                 error(RXERR_REPETITION);
3293                 rep = def;
3294             }
3295             yyCh = getChar();
3296         } while (yyCh >= '0' && yyCh <= '9');
3297         return rep;
3298     } else {
3299         return def;
3300     }
3301 }
3302 #endif
3303
3304 #ifndef QT_NO_REGEXP_LOOKAHEAD
3305 void QRegExpEngine::skipChars(int n)
3306 {
3307     if (n > 0) {
3308         yyPos += n - 1;
3309         yyCh = getChar();
3310     }
3311 }
3312 #endif
3313
3314 void QRegExpEngine::error(const char *msg)
3315 {
3316     if (yyError.isEmpty())
3317         yyError = QLatin1String(msg);
3318 }
3319
3320 void QRegExpEngine::startTokenizer(const QChar *rx, int len)
3321 {
3322     yyIn = rx;
3323     yyPos0 = 0;
3324     yyPos = 0;
3325     yyLen = len;
3326     yyCh = getChar();
3327     yyCharClass.reset(new QRegExpCharClass);
3328     yyMinRep = 0;
3329     yyMaxRep = 0;
3330     yyError = QString();
3331 }
3332
3333 int QRegExpEngine::getToken()
3334 {
3335 #ifndef QT_NO_REGEXP_CCLASS
3336     ushort pendingCh = 0;
3337     bool charPending;
3338     bool rangePending;
3339     int tok;
3340 #endif
3341     int prevCh = yyCh;
3342
3343     yyPos0 = yyPos - 1;
3344 #ifndef QT_NO_REGEXP_CCLASS
3345     yyCharClass->clear();
3346 #endif
3347     yyMinRep = 0;
3348     yyMaxRep = 0;
3349     yyCh = getChar();
3350
3351     switch (prevCh) {
3352     case EOS:
3353         yyPos0 = yyPos;
3354         return Tok_Eos;
3355     case '$':
3356         return Tok_Dollar;
3357     case '(':
3358         if (yyCh == '?') {
3359             prevCh = getChar();
3360             yyCh = getChar();
3361             switch (prevCh) {
3362 #ifndef QT_NO_REGEXP_LOOKAHEAD
3363             case '!':
3364                 return Tok_NegLookahead;
3365             case '=':
3366                 return Tok_PosLookahead;
3367 #endif
3368             case ':':
3369                 return Tok_MagicLeftParen;
3370             case '<':
3371                 error(RXERR_LOOKBEHIND);
3372                 return Tok_MagicLeftParen;
3373             default:
3374                 error(RXERR_LOOKAHEAD);
3375                 return Tok_MagicLeftParen;
3376             }
3377         } else {
3378             return Tok_LeftParen;
3379         }
3380     case ')':
3381         return Tok_RightParen;
3382     case '*':
3383         yyMinRep = 0;
3384         yyMaxRep = InftyRep;
3385         return Tok_Quantifier;
3386     case '+':
3387         yyMinRep = 1;
3388         yyMaxRep = InftyRep;
3389         return Tok_Quantifier;
3390     case '.':
3391 #ifndef QT_NO_REGEXP_CCLASS
3392         yyCharClass->setNegative(true);
3393 #endif
3394         return Tok_CharClass;
3395     case '?':
3396         yyMinRep = 0;
3397         yyMaxRep = 1;
3398         return Tok_Quantifier;
3399     case '[':
3400 #ifndef QT_NO_REGEXP_CCLASS
3401         if (yyCh == '^') {
3402             yyCharClass->setNegative(true);
3403             yyCh = getChar();
3404         }
3405         charPending = false;
3406         rangePending = false;
3407         do {
3408             if (yyCh == '-' && charPending && !rangePending) {
3409                 rangePending = true;
3410                 yyCh = getChar();
3411             } else {
3412                 if (charPending && !rangePending) {
3413                     yyCharClass->addSingleton(pendingCh);
3414                     charPending = false;
3415                 }
3416                 if (yyCh == '\\') {
3417                     yyCh = getChar();
3418                     tok = getEscape();
3419                     if (tok == Tok_Word)
3420                         tok = '\b';
3421                 } else {
3422                     tok = Tok_Char | yyCh;
3423                     yyCh = getChar();
3424                 }
3425                 if (tok == Tok_CharClass) {
3426                     if (rangePending) {
3427                         yyCharClass->addSingleton('-');
3428                         yyCharClass->addSingleton(pendingCh);
3429                         charPending = false;
3430                         rangePending = false;
3431                     }
3432                 } else if ((tok & Tok_Char) != 0) {
3433                     if (rangePending) {
3434                         yyCharClass->addRange(pendingCh, tok ^ Tok_Char);
3435                         charPending = false;
3436                         rangePending = false;
3437                     } else {
3438                         pendingCh = tok ^ Tok_Char;
3439                         charPending = true;
3440                     }
3441                 } else {
3442                     error(RXERR_CHARCLASS);
3443                 }
3444             }
3445         }  while (yyCh != ']' && yyCh != EOS);
3446         if (rangePending)
3447             yyCharClass->addSingleton('-');
3448         if (charPending)
3449             yyCharClass->addSingleton(pendingCh);
3450         if (yyCh == EOS)
3451             error(RXERR_END);
3452         else
3453             yyCh = getChar();
3454         return Tok_CharClass;
3455 #else
3456         error(RXERR_END);
3457         return Tok_Char | '[';
3458 #endif
3459     case '\\':
3460         return getEscape();
3461     case ']':
3462         error(RXERR_LEFTDELIM);
3463         return Tok_Char | ']';
3464     case '^':
3465         return Tok_Caret;
3466     case '{':
3467 #ifndef QT_NO_REGEXP_INTERVAL
3468         yyMinRep = getRep(0);
3469         yyMaxRep = yyMinRep;
3470         if (yyCh == ',') {
3471             yyCh = getChar();
3472             yyMaxRep = getRep(InftyRep);
3473         }
3474         if (yyMaxRep < yyMinRep)
3475             error(RXERR_INTERVAL);
3476         if (yyCh != '}')
3477             error(RXERR_REPETITION);
3478         yyCh = getChar();
3479         return Tok_Quantifier;
3480 #else
3481         error(RXERR_DISABLED);
3482         return Tok_Char | '{';
3483 #endif
3484     case '|':
3485         return Tok_Bar;
3486     case '}':
3487         error(RXERR_LEFTDELIM);
3488         return Tok_Char | '}';
3489     default:
3490         return Tok_Char | prevCh;
3491     }
3492 }
3493
3494 int QRegExpEngine::parse(const QChar *pattern, int len)
3495 {
3496     valid = true;
3497     startTokenizer(pattern, len);
3498     yyTok = getToken();
3499 #ifndef QT_NO_REGEXP_CAPTURE
3500     yyMayCapture = true;
3501 #else
3502     yyMayCapture = false;
3503 #endif
3504
3505 #ifndef QT_NO_REGEXP_CAPTURE
3506     int atom = startAtom(false);
3507 #endif
3508     QRegExpCharClass anything;
3509     Box box(this); // create InitialState
3510     box.set(anything);
3511     Box rightBox(this); // create FinalState
3512     rightBox.set(anything);
3513
3514     Box middleBox(this);
3515     parseExpression(&middleBox);
3516 #ifndef QT_NO_REGEXP_CAPTURE
3517     finishAtom(atom, false);
3518 #endif
3519 #ifndef QT_NO_REGEXP_OPTIM
3520     middleBox.setupHeuristics();
3521 #endif
3522     box.cat(middleBox);
3523     box.cat(rightBox);
3524     yyCharClass.reset(0);
3525
3526 #ifndef QT_NO_REGEXP_CAPTURE
3527     for (int i = 0; i < nf; ++i) {
3528         switch (f[i].capture) {
3529         case QRegExpAtom::NoCapture:
3530             break;
3531         case QRegExpAtom::OfficialCapture:
3532             f[i].capture = ncap;
3533             captureForOfficialCapture.append(ncap);
3534             ++ncap;
3535             ++officialncap;
3536             break;
3537         case QRegExpAtom::UnofficialCapture:
3538             f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;
3539         }
3540     }
3541
3542 #ifndef QT_NO_REGEXP_BACKREF
3543 #ifndef QT_NO_REGEXP_OPTIM
3544     if (officialncap == 0 && nbrefs == 0) {
3545         ncap = nf = 0;
3546         f.clear();
3547     }
3548 #endif
3549     // handle the case where there's a \5 with no corresponding capture
3550     // (captureForOfficialCapture.size() != officialncap)
3551     for (int i = 0; i < nbrefs - officialncap; ++i) {
3552         captureForOfficialCapture.append(ncap);
3553         ++ncap;
3554     }
3555 #endif
3556 #endif
3557
3558     if (!yyError.isEmpty())
3559         return -1;
3560
3561 #ifndef QT_NO_REGEXP_OPTIM
3562     const QRegExpAutomatonState &sinit = s.at(InitialState);
3563     caretAnchored = !sinit.anchors.isEmpty();
3564     if (caretAnchored) {
3565         const QMap<int, int> &anchors = sinit.anchors;
3566         QMap<int, int>::const_iterator a;
3567         for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {
3568             if (
3569 #ifndef QT_NO_REGEXP_ANCHOR_ALT
3570                 (*a & Anchor_Alternation) != 0 ||
3571 #endif
3572                 (*a & Anchor_Caret) == 0)
3573             {
3574                 caretAnchored = false;
3575                 break;
3576             }
3577         }
3578     }
3579 #endif
3580
3581     // cleanup anchors
3582     int numStates = s.count();
3583     for (int i = 0; i < numStates; ++i) {
3584         QRegExpAutomatonState &state = s[i];
3585         if (!state.anchors.isEmpty()) {
3586             QMap<int, int>::iterator a = state.anchors.begin();
3587             while (a != state.anchors.end()) {
3588                 if (a.value() == 0)
3589                     a = state.anchors.erase(a);
3590                 else
3591                     ++a;
3592             }
3593         }
3594     }
3595
3596     return yyPos0;
3597 }
3598
3599 void QRegExpEngine::parseAtom(Box *box)
3600 {
3601 #ifndef QT_NO_REGEXP_LOOKAHEAD
3602     QRegExpEngine *eng = 0;
3603     bool neg;
3604     int len;
3605 #endif
3606
3607     if ((yyTok & Tok_Char) != 0) {
3608         box->set(QChar(yyTok ^ Tok_Char));
3609     } else {
3610 #ifndef QT_NO_REGEXP_OPTIM
3611         trivial = false;
3612 #endif
3613         switch (yyTok) {
3614         case Tok_Dollar:
3615             box->catAnchor(Anchor_Dollar);
3616             break;
3617         case Tok_Caret:
3618             box->catAnchor(Anchor_Caret);
3619             break;
3620 #ifndef QT_NO_REGEXP_LOOKAHEAD
3621         case Tok_PosLookahead:
3622         case Tok_NegLookahead:
3623             neg = (yyTok == Tok_NegLookahead);
3624             eng = new QRegExpEngine(cs, greedyQuantifiers);
3625             len = eng->parse(yyIn + yyPos - 1, yyLen - yyPos + 1);
3626             if (len >= 0)
3627                 skipChars(len);
3628             else
3629                 error(RXERR_LOOKAHEAD);
3630             box->catAnchor(addLookahead(eng, neg));
3631             yyTok = getToken();
3632             if (yyTok != Tok_RightParen)
3633                 error(RXERR_LOOKAHEAD);
3634             break;
3635 #endif
3636 #ifndef QT_NO_REGEXP_ESCAPE
3637         case Tok_Word:
3638             box->catAnchor(Anchor_Word);
3639             break;
3640         case Tok_NonWord:
3641             box->catAnchor(Anchor_NonWord);
3642             break;
3643 #endif
3644         case Tok_LeftParen:
3645         case Tok_MagicLeftParen:
3646             yyTok = getToken();
3647             parseExpression(box);
3648             if (yyTok != Tok_RightParen)
3649                 error(RXERR_END);
3650             break;
3651         case Tok_CharClass:
3652             box->set(*yyCharClass);
3653             break;
3654         case Tok_Quantifier:
3655             error(RXERR_REPETITION);
3656             break;
3657         default:
3658 #ifndef QT_NO_REGEXP_BACKREF
3659             if ((yyTok & Tok_BackRef) != 0)
3660                 box->set(yyTok ^ Tok_BackRef);
3661             else
3662 #endif
3663                 error(RXERR_DISABLED);
3664         }
3665     }
3666     yyTok = getToken();
3667 }
3668
3669 void QRegExpEngine::parseFactor(Box *box)
3670 {
3671 #ifndef QT_NO_REGEXP_CAPTURE
3672     int outerAtom = greedyQuantifiers ? startAtom(false) : -1;
3673     int innerAtom = startAtom(yyMayCapture && yyTok == Tok_LeftParen);
3674     bool magicLeftParen = (yyTok == Tok_MagicLeftParen);
3675 #else
3676     const int innerAtom = -1;
3677 #endif
3678
3679 #ifndef QT_NO_REGEXP_INTERVAL
3680 #define YYREDO() \
3681         yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3682         *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3683
3684     const QChar *in = yyIn;
3685     int pos0 = yyPos0;
3686     int pos = yyPos;
3687     int len = yyLen;
3688     int ch = yyCh;
3689     QRegExpCharClass charClass;
3690     if (yyTok == Tok_CharClass)
3691         charClass = *yyCharClass;
3692     int tok = yyTok;
3693     bool mayCapture = yyMayCapture;
3694 #endif
3695
3696     parseAtom(box);
3697 #ifndef QT_NO_REGEXP_CAPTURE
3698     finishAtom(innerAtom, magicLeftParen);
3699 #endif
3700
3701     bool hasQuantifier = (yyTok == Tok_Quantifier);
3702     if (hasQuantifier) {
3703 #ifndef QT_NO_REGEXP_OPTIM
3704         trivial = false;
3705 #endif
3706         if (yyMaxRep == InftyRep) {
3707             box->plus(innerAtom);
3708 #ifndef QT_NO_REGEXP_INTERVAL
3709         } else if (yyMaxRep == 0) {
3710             box->clear();
3711 #endif
3712         }
3713         if (yyMinRep == 0)
3714             box->opt();
3715
3716 #ifndef QT_NO_REGEXP_INTERVAL
3717         yyMayCapture = false;
3718         int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1;
3719         int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1);
3720
3721         Box rightBox(this);
3722         int i;
3723
3724         for (i = 0; i < beta; i++) {
3725             YYREDO();
3726             Box leftBox(this);
3727             parseAtom(&leftBox);
3728             leftBox.cat(rightBox);
3729             leftBox.opt();
3730             rightBox = leftBox;
3731         }
3732         for (i = 0; i < alpha; i++) {
3733             YYREDO();
3734             Box leftBox(this);
3735             parseAtom(&leftBox);
3736             leftBox.cat(rightBox);
3737             rightBox = leftBox;
3738         }
3739         rightBox.cat(*box);
3740         *box = rightBox;
3741 #endif
3742         yyTok = getToken();
3743 #ifndef QT_NO_REGEXP_INTERVAL
3744         yyMayCapture = mayCapture;
3745 #endif
3746     }
3747 #undef YYREDO
3748 #ifndef QT_NO_REGEXP_CAPTURE
3749     if (greedyQuantifiers)
3750         finishAtom(outerAtom, hasQuantifier);
3751 #endif
3752 }
3753
3754 void QRegExpEngine::parseTerm(Box *box)
3755 {
3756 #ifndef QT_NO_REGEXP_OPTIM
3757     if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)
3758         parseFactor(box);
3759 #endif
3760     while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {
3761         Box rightBox(this);
3762         parseFactor(&rightBox);
3763         box->cat(rightBox);
3764     }
3765 }
3766
3767 void QRegExpEngine::parseExpression(Box *box)
3768 {
3769     parseTerm(box);
3770     while (yyTok == Tok_Bar) {
3771 #ifndef QT_NO_REGEXP_OPTIM
3772         trivial = false;
3773 #endif
3774         Box rightBox(this);
3775         yyTok = getToken();
3776         parseTerm(&rightBox);
3777         box->orx(rightBox);
3778     }
3779 }
3780
3781 /*
3782   The struct QRegExpPrivate contains the private data of a regular
3783   expression other than the automaton. It makes it possible for many
3784   QRegExp objects to use the same QRegExpEngine object with different
3785   QRegExpPrivate objects.
3786 */
3787 struct QRegExpPrivate
3788 {
3789     QRegExpEngine *eng;
3790     QRegExpEngineKey engineKey;
3791     bool minimal;
3792 #ifndef QT_NO_REGEXP_CAPTURE
3793     QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()
3794     QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3795 #endif
3796     QRegExpMatchState matchState;
3797
3798     inline QRegExpPrivate()
3799         : eng(0), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }
3800     inline QRegExpPrivate(const QRegExpEngineKey &key)
3801         : eng(0), engineKey(key), minimal(false) {}
3802 };
3803
3804 #if !defined(QT_NO_REGEXP_OPTIM)
3805 uint qHash(const QRegExpEngineKey &key)
3806 {
3807     return qHash(key.pattern);
3808 }
3809
3810 typedef QCache<QRegExpEngineKey, QRegExpEngine> EngineCache;
3811 Q_GLOBAL_STATIC(EngineCache, globalEngineCache)
3812 static QBasicMutex globalEngineCacheMutex;
3813 #endif // QT_NO_REGEXP_OPTIM
3814
3815 static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key)
3816 {
3817     if (!eng->ref.deref()) {
3818 #if !defined(QT_NO_REGEXP_OPTIM)
3819         if (globalEngineCache()) {
3820             QMutexLocker locker(&globalEngineCacheMutex);
3821             QT_TRY {
3822                 globalEngineCache()->insert(key, eng, 4 + key.pattern.length() / 4);
3823             } QT_CATCH(const std::bad_alloc &) {
3824                 // in case of an exception (e.g. oom), just delete the engine
3825                 delete eng;
3826             }
3827         } else {
3828             delete eng;
3829         }
3830 #else
3831         Q_UNUSED(key);
3832         delete eng;
3833 #endif
3834     }
3835 }
3836
3837 static void prepareEngine_helper(QRegExpPrivate *priv)
3838 {
3839     bool initMatchState = !priv->eng;
3840 #if !defined(QT_NO_REGEXP_OPTIM)
3841     if (!priv->eng && globalEngineCache()) {
3842         QMutexLocker locker(&globalEngineCacheMutex);
3843         priv->eng = globalEngineCache()->take(priv->engineKey);
3844         if (priv->eng != 0)
3845             priv->eng->ref.ref();
3846     }
3847 #endif // QT_NO_REGEXP_OPTIM
3848
3849     if (!priv->eng)
3850         priv->eng = new QRegExpEngine(priv->engineKey);
3851
3852     if (initMatchState)
3853         priv->matchState.prepareForMatch(priv->eng);
3854 }
3855
3856 inline static void prepareEngine(QRegExpPrivate *priv)
3857 {
3858     if (priv->eng)
3859         return;
3860     prepareEngine_helper(priv);
3861 }
3862
3863 static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str)
3864 {
3865     prepareEngine(priv);
3866     priv->matchState.prepareForMatch(priv->eng);
3867 #ifndef QT_NO_REGEXP_CAPTURE
3868     priv->t = str;
3869     priv->capturedCache.clear();
3870 #else
3871     Q_UNUSED(str);
3872 #endif
3873 }
3874
3875 static void invalidateEngine(QRegExpPrivate *priv)
3876 {
3877     if (priv->eng != 0) {
3878         derefEngine(priv->eng, priv->engineKey);
3879         priv->eng = 0;
3880         priv->matchState.drain();
3881     }
3882 }
3883
3884 /*!
3885     \enum QRegExp::CaretMode
3886
3887     The CaretMode enum defines the different meanings of the caret
3888     (\bold{^}) in a regular expression. The possible values are:
3889
3890     \value CaretAtZero
3891            The caret corresponds to index 0 in the searched string.
3892
3893     \value CaretAtOffset
3894            The caret corresponds to the start offset of the search.
3895
3896     \value CaretWontMatch
3897            The caret never matches.
3898 */
3899
3900 /*!
3901     \enum QRegExp::PatternSyntax
3902
3903     The syntax used to interpret the meaning of the pattern.
3904
3905     \value RegExp A rich Perl-like pattern matching syntax. This is
3906     the default.
3907
3908     \value RegExp2 Like RegExp, but with \l{greedy quantifiers}. This
3909     will be the default in Qt 5. (Introduced in Qt 4.2.)
3910
3911     \value Wildcard This provides a simple pattern matching syntax
3912     similar to that used by shells (command interpreters) for "file
3913     globbing". See \l{Wildcard Matching}.
3914
3915     \value WildcardUnix This is similar to Wildcard but with the
3916     behavior of a Unix shell. The wildcard characters can be escaped
3917     with the character "\\".
3918
3919     \value FixedString The pattern is a fixed string. This is
3920     equivalent to using the RegExp pattern on a string in
3921     which all metacharacters are escaped using escape().
3922
3923     \value W3CXmlSchema11 The pattern is a regular expression as
3924     defined by the W3C XML Schema 1.1 specification.
3925
3926     \sa setPatternSyntax()
3927 */
3928
3929 /*!
3930     Constructs an empty regexp.
3931
3932     \sa isValid(), errorString()
3933 */
3934 QRegExp::QRegExp()
3935 {
3936     priv = new QRegExpPrivate;
3937     prepareEngine(priv);
3938 }
3939
3940 /*!
3941     Constructs a regular expression object for the given \a pattern
3942     string. The pattern must be given using wildcard notation if \a
3943     syntax is \l Wildcard; the default is \l RegExp. The pattern is
3944     case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
3945     greedy (maximal), but can be changed by calling
3946     setMinimal().
3947
3948     \sa setPattern(), setCaseSensitivity(), setPatternSyntax()
3949 */
3950 QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
3951 {
3952     priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs));
3953     prepareEngine(priv);
3954 }
3955
3956 /*!
3957     Constructs a regular expression as a copy of \a rx.
3958
3959     \sa operator=()
3960 */
3961 QRegExp::QRegExp(const QRegExp &rx)
3962 {
3963     priv = new QRegExpPrivate;
3964     operator=(rx);
3965 }
3966
3967 /*!
3968     Destroys the regular expression and cleans up its internal data.
3969 */
3970 QRegExp::~QRegExp()
3971 {
3972     invalidateEngine(priv);
3973     delete priv;
3974 }
3975
3976 /*!
3977     Copies the regular expression \a rx and returns a reference to the
3978     copy. The case sensitivity, wildcard, and minimal matching options
3979     are also copied.
3980 */
3981 QRegExp &QRegExp::operator=(const QRegExp &rx)
3982 {
3983     prepareEngine(rx.priv); // to allow sharing
3984     QRegExpEngine *otherEng = rx.priv->eng;
3985     if (otherEng)
3986         otherEng->ref.ref();
3987     invalidateEngine(priv);
3988     priv->eng = otherEng;
3989     priv->engineKey = rx.priv->engineKey;
3990     priv->minimal = rx.priv->minimal;
3991 #ifndef QT_NO_REGEXP_CAPTURE
3992     priv->t = rx.priv->t;
3993     priv->capturedCache = rx.priv->capturedCache;
3994 #endif
3995     if (priv->eng)
3996         priv->matchState.prepareForMatch(priv->eng);
3997     priv->matchState.captured = rx.priv->matchState.captured;
3998     return *this;
3999 }
4000
4001 /*!
4002     \fn void QRegExp::swap(QRegExp &other)
4003     \since 4.8
4004
4005     Swaps regular expression \a other with this regular
4006     expression. This operation is very fast and never fails.
4007 */
4008
4009 /*!
4010     Returns true if this regular expression is equal to \a rx;
4011     otherwise returns false.
4012
4013     Two QRegExp objects are equal if they have the same pattern
4014     strings and the same settings for case sensitivity, wildcard and
4015     minimal matching.
4016 */
4017 bool QRegExp::operator==(const QRegExp &rx) const
4018 {
4019     return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;
4020 }
4021
4022 /*!
4023     \fn bool QRegExp::operator!=(const QRegExp &rx) const
4024
4025     Returns true if this regular expression is not equal to \a rx;
4026     otherwise returns false.
4027
4028     \sa operator==()
4029 */
4030
4031 /*!
4032     Returns true if the pattern string is empty; otherwise returns
4033     false.
4034
4035     If you call exactMatch() with an empty pattern on an empty string
4036     it will return true; otherwise it returns false since it operates
4037     over the whole string. If you call indexIn() with an empty pattern
4038     on \e any string it will return the start offset (0 by default)
4039     because the empty pattern matches the 'emptiness' at the start of
4040     the string. In this case the length of the match returned by
4041     matchedLength() will be 0.
4042
4043     See QString::isEmpty().
4044 */
4045
4046 bool QRegExp::isEmpty() const
4047 {
4048     return priv->engineKey.pattern.isEmpty();
4049 }
4050
4051 /*!
4052     Returns true if the regular expression is valid; otherwise returns
4053     false. An invalid regular expression never matches.
4054
4055     The pattern \bold{[a-z} is an example of an invalid pattern, since
4056     it lacks a closing square bracket.
4057
4058     Note that the validity of a regexp may also depend on the setting
4059     of the wildcard flag, for example \bold{*.html} is a valid
4060     wildcard regexp but an invalid full regexp.
4061
4062     \sa errorString()
4063 */
4064 bool QRegExp::isValid() const
4065 {
4066     if (priv->engineKey.pattern.isEmpty()) {
4067         return true;
4068     } else {
4069         prepareEngine(priv);
4070         return priv->eng->isValid();
4071     }
4072 }
4073
4074 /*!
4075     Returns the pattern string of the regular expression. The pattern
4076     has either regular expression syntax or wildcard syntax, depending
4077     on patternSyntax().
4078
4079     \sa patternSyntax(), caseSensitivity()
4080 */
4081 QString QRegExp::pattern() const
4082 {
4083     return priv->engineKey.pattern;
4084 }
4085
4086 /*!
4087     Sets the pattern string to \a pattern. The case sensitivity,
4088     wildcard, and minimal matching options are not changed.
4089
4090     \sa setPatternSyntax(), setCaseSensitivity()
4091 */
4092 void QRegExp::setPattern(const QString &pattern)
4093 {
4094     if (priv->engineKey.pattern != pattern) {
4095         invalidateEngine(priv);
4096         priv->engineKey.pattern = pattern;
4097     }
4098 }
4099
4100 /*!
4101     Returns Qt::CaseSensitive if the regexp is matched case
4102     sensitively; otherwise returns Qt::CaseInsensitive.
4103
4104     \sa patternSyntax(), pattern(), isMinimal()
4105 */
4106 Qt::CaseSensitivity QRegExp::caseSensitivity() const
4107 {
4108     return priv->engineKey.cs;
4109 }
4110
4111 /*!
4112     Sets case sensitive matching to \a cs.
4113
4114     If \a cs is Qt::CaseSensitive, \bold{\\.txt$} matches
4115     \c{readme.txt} but not \c{README.TXT}.
4116
4117     \sa setPatternSyntax(), setPattern(), setMinimal()
4118 */
4119 void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)
4120 {
4121     if ((bool)cs != (bool)priv->engineKey.cs) {
4122         invalidateEngine(priv);
4123         priv->engineKey.cs = cs;
4124     }
4125 }
4126
4127 /*!
4128     Returns the syntax used by the regular expression. The default is
4129     QRegExp::RegExp.
4130
4131     \sa pattern(), caseSensitivity()
4132 */
4133 QRegExp::PatternSyntax QRegExp::patternSyntax() const
4134 {
4135     return priv->engineKey.patternSyntax;
4136 }
4137
4138 /*!
4139     Sets the syntax mode for the regular expression. The default is
4140     QRegExp::RegExp.
4141
4142     Setting \a syntax to QRegExp::Wildcard enables simple shell-like
4143     \l{wildcard matching}. For example, \bold{r*.txt} matches the
4144     string \c{readme.txt} in wildcard mode, but does not match
4145     \c{readme}.
4146
4147     Setting \a syntax to QRegExp::FixedString means that the pattern
4148     is interpreted as a plain string. Special characters (e.g.,
4149     backslash) don't need to be escaped then.
4150
4151     \sa setPattern(), setCaseSensitivity(), escape()
4152 */
4153 void QRegExp::setPatternSyntax(PatternSyntax syntax)
4154 {
4155     if (syntax != priv->engineKey.patternSyntax) {
4156         invalidateEngine(priv);
4157         priv->engineKey.patternSyntax = syntax;
4158     }
4159 }
4160
4161 /*!
4162     Returns true if minimal (non-greedy) matching is enabled;
4163     otherwise returns false.
4164
4165     \sa caseSensitivity(), setMinimal()
4166 */
4167 bool QRegExp::isMinimal() const
4168 {
4169     return priv->minimal;
4170 }
4171
4172 /*!
4173     Enables or disables minimal matching. If \a minimal is false,
4174     matching is greedy (maximal) which is the default.
4175
4176     For example, suppose we have the input string "We must be
4177     <b>bold</b>, very <b>bold</b>!" and the pattern
4178     \bold{<b>.*</b>}. With the default greedy (maximal) matching,
4179     the match is "We must be \underline{<b>bold</b>, very
4180     <b>bold</b>}!". But with minimal (non-greedy) matching, the
4181     first match is: "We must be \underline{<b>bold</b>}, very
4182     <b>bold</b>!" and the second match is "We must be <b>bold</b>,
4183     very \underline{<b>bold</b>}!". In practice we might use the pattern
4184     \bold{<b>[^<]*\</b>} instead, although this will still fail for
4185     nested tags.
4186
4187     \sa setCaseSensitivity()
4188 */
4189 void QRegExp::setMinimal(bool minimal)
4190 {
4191     priv->minimal = minimal;
4192 }
4193
4194 // ### Qt 5: make non-const
4195 /*!
4196     Returns true if \a str is matched exactly by this regular
4197     expression; otherwise returns false. You can determine how much of
4198     the string was matched by calling matchedLength().
4199
4200     For a given regexp string R, exactMatch("R") is the equivalent of
4201     indexIn("^R$") since exactMatch() effectively encloses the regexp
4202     in the start of string and end of string anchors, except that it
4203     sets matchedLength() differently.
4204
4205     For example, if the regular expression is \bold{blue}, then
4206     exactMatch() returns true only for input \c blue. For inputs \c
4207     bluebell, \c blutak and \c lightblue, exactMatch() returns false
4208     and matchedLength() will return 4, 3 and 0 respectively.
4209
4210     Although const, this function sets matchedLength(),
4211     capturedTexts(), and pos().
4212
4213     \sa indexIn(), lastIndexIn()
4214 */
4215 bool QRegExp::exactMatch(const QString &str) const
4216 {
4217     prepareEngineForMatch(priv, str);
4218     priv->matchState.match(str.unicode(), str.length(), 0, priv->minimal, true, 0);
4219     if (priv->matchState.captured[1] == str.length()) {
4220         return true;
4221     } else {
4222         priv->matchState.captured[0] = 0;
4223         priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen;
4224         return false;
4225     }
4226 }
4227
4228 // ### Qt 5: make non-const
4229 /*!
4230     Attempts to find a match in \a str from position \a offset (0 by
4231     default). If \a offset is -1, the search starts at the last
4232     character; if -2, at the next to last character; etc.
4233
4234     Returns the position of the first match, or -1 if there was no
4235     match.
4236
4237     The \a caretMode parameter can be used to instruct whether \bold{^}
4238     should match at index 0 or at \a offset.
4239
4240     You might prefer to use QString::indexOf(), QString::contains(),
4241     or even QStringList::filter(). To replace matches use
4242     QString::replace().
4243
4244     Example:
4245     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 13
4246
4247     Although const, this function sets matchedLength(),
4248     capturedTexts() and pos().
4249
4250     If the QRegExp is a wildcard expression (see setPatternSyntax())
4251     and want to test a string against the whole wildcard expression,
4252     use exactMatch() instead of this function.
4253
4254     \sa lastIndexIn(), exactMatch()
4255 */
4256
4257 int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const
4258 {
4259     prepareEngineForMatch(priv, str);
4260     if (offset < 0)
4261         offset += str.length();
4262     priv->matchState.match(str.unicode(), str.length(), offset,
4263         priv->minimal, false, caretIndex(offset, caretMode));
4264     return priv->matchState.captured[0];
4265 }
4266
4267 // ### Qt 5: make non-const
4268 /*!
4269     Attempts to find a match backwards in \a str from position \a
4270     offset. If \a offset is -1 (the default), the search starts at the
4271     last character; if -2, at the next to last character; etc.
4272
4273     Returns the position of the first match, or -1 if there was no
4274     match.
4275
4276     The \a caretMode parameter can be used to instruct whether \bold{^}
4277     should match at index 0 or at \a offset.
4278
4279     Although const, this function sets matchedLength(),
4280     capturedTexts() and pos().
4281
4282     \warning Searching backwards is much slower than searching
4283     forwards.
4284
4285     \sa indexIn(), exactMatch()
4286 */
4287
4288 int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const
4289 {
4290     prepareEngineForMatch(priv, str);
4291     if (offset < 0)
4292         offset += str.length();
4293     if (offset < 0 || offset > str.length()) {
4294         memset(priv->matchState.captured, -1, priv->matchState.capturedSize*sizeof(int));
4295         return -1;
4296     }
4297
4298     while (offset >= 0) {
4299         priv->matchState.match(str.unicode(), str.length(), offset,
4300             priv->minimal, true, caretIndex(offset, caretMode));
4301         if (priv->matchState.captured[0] == offset)
4302             return offset;
4303         --offset;
4304     }
4305     return -1;
4306 }
4307
4308 /*!
4309     Returns the length of the last matched string, or -1 if there was
4310     no match.
4311
4312     \sa exactMatch(), indexIn(), lastIndexIn()
4313 */
4314 int QRegExp::matchedLength() const
4315 {
4316     return priv->matchState.captured[1];
4317 }
4318
4319 #ifndef QT_NO_REGEXP_CAPTURE
4320
4321 /*!
4322   \fn int QRegExp::numCaptures() const
4323   \obsolete
4324   Returns the number of captures contained in the regular expression.
4325
4326   \sa captureCount()
4327  */
4328
4329 /*!
4330   \since 4.6
4331   Returns the number of captures contained in the regular expression.
4332  */
4333 int QRegExp::captureCount() const
4334 {
4335     prepareEngine(priv);
4336     return priv->eng->captureCount();
4337 }
4338
4339 /*!
4340     Returns a list of the captured text strings.
4341
4342     The first string in the list is the entire matched string. Each
4343     subsequent list element contains a string that matched a
4344     (capturing) subexpression of the regexp.
4345
4346     For example:
4347     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 14
4348
4349     The above example also captures elements that may be present but
4350     which we have no interest in. This problem can be solved by using
4351     non-capturing parentheses:
4352
4353     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 15
4354
4355     Note that if you want to iterate over the list, you should iterate
4356     over a copy, e.g.
4357     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 16
4358
4359     Some regexps can match an indeterminate number of times. For
4360     example if the input string is "Offsets: 12 14 99 231 7" and the
4361     regexp, \c{rx}, is \bold{(\\d+)+}, we would hope to get a list of
4362     all the numbers matched. However, after calling
4363     \c{rx.indexIn(str)}, capturedTexts() will return the list ("12",
4364     "12"), i.e. the entire match was "12" and the first subexpression
4365     matched was "12". The correct approach is to use cap() in a
4366     \l{QRegExp#cap_in_a_loop}{loop}.
4367
4368     The order of elements in the string list is as follows. The first
4369     element is the entire matching string. Each subsequent element
4370     corresponds to the next capturing open left parentheses. Thus
4371     capturedTexts()[1] is the text of the first capturing parentheses,
4372     capturedTexts()[2] is the text of the second and so on
4373     (corresponding to $1, $2, etc., in some other regexp languages).
4374
4375     \sa cap(), pos()
4376 */
4377 QStringList QRegExp::capturedTexts() const
4378 {
4379     if (priv->capturedCache.isEmpty()) {
4380         prepareEngine(priv);
4381         const int *captured = priv->matchState.captured;
4382         int n = priv->matchState.capturedSize;
4383
4384         for (int i = 0; i < n; i += 2) {
4385             QString m;
4386             if (captured[i + 1] == 0)
4387                 m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty
4388             else if (captured[i] >= 0)
4389                 m = priv->t.mid(captured[i], captured[i + 1]);
4390             priv->capturedCache.append(m);
4391         }
4392         priv->t.clear();
4393     }
4394     return priv->capturedCache;
4395 }
4396
4397 /*!
4398     \internal
4399 */
4400 QStringList QRegExp::capturedTexts()
4401 {
4402     return const_cast<const QRegExp *>(this)->capturedTexts();
4403 }
4404
4405 /*!
4406     Returns the text captured by the \a nth subexpression. The entire
4407     match has index 0 and the parenthesized subexpressions have
4408     indexes starting from 1 (excluding non-capturing parentheses).
4409
4410     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 17
4411
4412     The order of elements matched by cap() is as follows. The first
4413     element, cap(0), is the entire matching string. Each subsequent
4414     element corresponds to the next capturing open left parentheses.
4415     Thus cap(1) is the text of the first capturing parentheses, cap(2)
4416     is the text of the second, and so on.
4417
4418     \sa capturedTexts(), pos()
4419 */
4420 QString QRegExp::cap(int nth) const
4421 {
4422     return capturedTexts().value(nth);
4423 }
4424
4425 /*!
4426     \internal
4427 */
4428 QString QRegExp::cap(int nth)
4429 {
4430     return const_cast<const QRegExp *>(this)->cap(nth);
4431 }
4432
4433 /*!
4434     Returns the position of the \a nth captured text in the searched
4435     string. If \a nth is 0 (the default), pos() returns the position
4436     of the whole match.
4437
4438     Example:
4439     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 18
4440
4441     For zero-length matches, pos() always returns -1. (For example, if
4442     cap(4) would return an empty string, pos(4) returns -1.) This is
4443     a feature of the implementation.
4444
4445     \sa cap(), capturedTexts()
4446 */
4447 int QRegExp::pos(int nth) const
4448 {
4449     if (nth < 0 || nth >= priv->matchState.capturedSize / 2)
4450         return -1;
4451     else
4452         return priv->matchState.captured[2 * nth];
4453 }
4454
4455 /*!
4456     \internal
4457 */
4458 int QRegExp::pos(int nth)
4459 {
4460     return const_cast<const QRegExp *>(this)->pos(nth);
4461 }
4462
4463 /*!
4464   Returns a text string that explains why a regexp pattern is
4465   invalid the case being; otherwise returns "no error occurred".
4466
4467   \sa isValid()
4468 */
4469 QString QRegExp::errorString() const
4470 {
4471     if (isValid()) {
4472         return QString::fromLatin1(RXERR_OK);
4473     } else {
4474         return priv->eng->errorString();
4475     }
4476 }
4477
4478 /*!
4479     \internal
4480 */
4481 QString QRegExp::errorString()
4482 {
4483     return const_cast<const QRegExp *>(this)->errorString();
4484 }
4485 #endif
4486
4487 /*!
4488     Returns the string \a str with every regexp special character
4489     escaped with a backslash. The special characters are $, (,), *, +,
4490     ., ?, [, \,], ^, {, | and }.
4491
4492     Example:
4493
4494     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 19
4495
4496     This function is useful to construct regexp patterns dynamically:
4497
4498     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 20
4499
4500     \sa setPatternSyntax()
4501 */
4502 QString QRegExp::escape(const QString &str)
4503 {
4504     QString quoted;
4505     const int count = str.count();
4506     quoted.reserve(count * 2);
4507     const QLatin1Char backslash('\\');
4508     for (int i = 0; i < count; i++) {
4509         switch (str.at(i).toLatin1()) {
4510         case '$':
4511         case '(':
4512         case ')':
4513         case '*':
4514         case '+':
4515         case '.':
4516         case '?':
4517         case '[':
4518         case '\\':
4519         case ']':
4520         case '^':
4521         case '{':
4522         case '|':
4523         case '}':
4524             quoted.append(backslash);
4525         }
4526         quoted.append(str.at(i));
4527     }
4528     return quoted;
4529 }
4530
4531
4532 #ifndef QT_NO_DATASTREAM
4533 /*!
4534     \relates QRegExp
4535
4536     Writes the regular expression \a regExp to stream \a out.
4537
4538     \sa {Serializing Qt Data Types}
4539 */
4540 QDataStream &operator<<(QDataStream &out, const QRegExp &regExp)
4541 {
4542     return out << regExp.pattern() << (quint8)regExp.caseSensitivity()
4543                << (quint8)regExp.patternSyntax()
4544                << (quint8)!!regExp.isMinimal();
4545 }
4546
4547 /*!
4548     \relates QRegExp
4549
4550     Reads a regular expression from stream \a in into \a regExp.
4551
4552     \sa {Serializing Qt Data Types}
4553 */
4554 QDataStream &operator>>(QDataStream &in, QRegExp &regExp)
4555 {
4556     QString pattern;
4557     quint8 cs;
4558     quint8 patternSyntax;
4559     quint8 isMinimal;
4560
4561     in >> pattern >> cs >> patternSyntax >> isMinimal;
4562
4563     QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),
4564                       QRegExp::PatternSyntax(patternSyntax));
4565
4566     newRegExp.setMinimal(isMinimal);
4567     regExp = newRegExp;
4568     return in;
4569 }
4570 #endif // QT_NO_DATASTREAM
4571
4572 #ifndef QT_NO_DEBUG_STREAM
4573 QDebug operator<<(QDebug dbg, const QRegExp &r)
4574 {
4575     dbg.nospace() << "QRegExp(patternSyntax=" << r.patternSyntax()
4576                   << ", pattern='"<< r.pattern() << "')";
4577     return dbg.space();
4578 }
4579 #endif
4580
4581 QT_END_NAMESPACE