src/corelib/tools/qregexp.cpp

   1 /****************************************************************************
   2 **
   3 ** Copyright (C) 2012 Digia Plc and/or its subsidiary(-ies).
   4 ** Contact: http://www.qt-project.org/legal
   5 **
   6 ** This file is part of the QtCore module of the Qt Toolkit.
   7 **
   8 ** $QT_BEGIN_LICENSE:LGPL$
   9 ** Commercial License Usage
  10 ** Licensees holding valid commercial Qt licenses may use this file in
  11 ** accordance with the commercial license agreement provided with the
  12 ** Software or, alternatively, in accordance with the terms contained in
  13 ** a written agreement between you and Digia.  For licensing terms and
  14 ** conditions see http://qt.digia.com/licensing.  For further information
  15 ** use the contact form at http://qt.digia.com/contact-us.
  16 **
  17 ** GNU Lesser General Public License Usage
  18 ** Alternatively, this file may be used under the terms of the GNU Lesser
  19 ** General Public License version 2.1 as published by the Free Software
  20 ** Foundation and appearing in the file LICENSE.LGPL included in the
  21 ** packaging of this file.  Please review the following information to
  22 ** ensure the GNU Lesser General Public License version 2.1 requirements
  23 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
  24 **
  25 ** In addition, as a special exception, Digia gives you certain additional
  26 ** rights.  These rights are described in the Digia Qt LGPL Exception
  27 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
  28 **
  29 ** GNU General Public License Usage
  30 ** Alternatively, this file may be used under the terms of the GNU
  31 ** General Public License version 3.0 as published by the Free Software
  32 ** Foundation and appearing in the file LICENSE.GPL included in the
  33 ** packaging of this file.  Please review the following information to
  34 ** ensure the GNU General Public License version 3.0 requirements will be
  35 ** met: http://www.gnu.org/copyleft/gpl.html.
  36 **
  37 **
  38 ** $QT_END_LICENSE$
  39 **
  40 ****************************************************************************/
  41
  42 #include "qregexp.h"
  43
  44 #include "qalgorithms.h"
  45 #include "qbitarray.h"
  46 #include "qcache.h"
  47 #include "qdatastream.h"
  48 #include "qdebug.h"
  49 #include "qlist.h"
  50 #include "qmap.h"
  51 #include "qmutex.h"
  52 #include "qstring.h"
  53 #include "qstringlist.h"
  54 #include "qstringmatcher.h"
  55 #include "qvector.h"
  56 #include "private/qfunctions_p.h"
  57
  58 #include <limits.h>
  59
  60 QT_BEGIN_NAMESPACE
  61
  62 int qFindString(const QChar *haystack, int haystackLen, int from,
  63     const QChar *needle, int needleLen, Qt::CaseSensitivity cs);
  64
  65 // error strings for the regexp parser
  66 #define RXERR_OK         QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
  67 #define RXERR_DISABLED   QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
  68 #define RXERR_CHARCLASS  QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
  69 #define RXERR_LOOKAHEAD  QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
  70 #define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371")
  71 #define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
  72 #define RXERR_OCTAL      QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
  73 #define RXERR_LEFTDELIM  QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
  74 #define RXERR_END        QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
  75 #define RXERR_LIMIT      QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
  76 #define RXERR_INTERVAL   QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
  77 #define RXERR_CATEGORY   QT_TRANSLATE_NOOP("QRegExp", "invalid category")
  78
  79 /*!
  80     \class QRegExp
  81     \inmodule QtCore
  82     \reentrant
  83     \brief The QRegExp class provides pattern matching using regular expressions.
  84
  85     \ingroup tools
  86     \ingroup shared
  87
  88     \keyword regular expression
  89
  90     A regular expression, or "regexp", is a pattern for matching
  91     substrings in a text. This is useful in many contexts, e.g.,
  92
  93     \table
  94     \row \li Validation
  95          \li A regexp can test whether a substring meets some criteria,
  96          e.g. is an integer or contains no whitespace.
  97     \row \li Searching
  98          \li A regexp provides more powerful pattern matching than
  99          simple substring matching, e.g., match one of the words
 100          \e{mail}, \e{letter} or \e{correspondence}, but none of the
 101          words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
 102      \row \li Search and Replace
 103          \li A regexp can replace all occurrences of a substring with a
 104          different substring, e.g., replace all occurrences of \e{&}
 105          with \e{\&amp;} except where the \e{&} is already followed by
 106          an \e{amp;}.
 107     \row \li String Splitting
 108          \li A regexp can be used to identify where a string should be
 109          split apart, e.g. splitting tab-delimited strings.
 110     \endtable
 111
 112     A brief introduction to regexps is presented, a description of
 113     Qt's regexp language, some examples, and the function
 114     documentation itself. QRegExp is modeled on Perl's regexp
 115     language. It fully supports Unicode. QRegExp can also be used in a
 116     simpler, \e{wildcard mode} that is similar to the functionality
 117     found in command shells. The syntax rules used by QRegExp can be
 118     changed with setPatternSyntax(). In particular, the pattern syntax
 119     can be set to QRegExp::FixedString, which means the pattern to be
 120     matched is interpreted as a plain string, i.e., special characters
 121     (e.g., backslash) are not escaped.
 122
 123     A good text on regexps is \e {Mastering Regular Expressions}
 124     (Third Edition) by Jeffrey E. F.  Friedl, ISBN 0-596-52812-4.
 125
 126     \tableofcontents
 127
 128     \section1 Introduction
 129
 130     Regexps are built up from expressions, quantifiers, and
 131     assertions. The simplest expression is a character, e.g. \b{x}
 132     or \b{5}. An expression can also be a set of characters
 133     enclosed in square brackets. \b{[ABCD]} will match an \b{A}
 134     or a \b{B} or a \b{C} or a \b{D}. We can write this same
 135     expression as \b{[A-D]}, and an experession to match any
 136     captital letter in the English alphabet is written as
 137     \b{[A-Z]}.
 138
 139     A quantifier specifies the number of occurrences of an expression
 140     that must be matched. \b{x{1,1}} means match one and only one
 141     \b{x}. \b{x{1,5}} means match a sequence of \b{x}
 142     characters that contains at least one \b{x} but no more than
 143     five.
 144
 145     Note that in general regexps cannot be used to check for balanced
 146     brackets or tags. For example, a regexp can be written to match an
 147     opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
 148     are not nested, but if the \c{<b>} tags are nested, that same
 149     regexp will match an opening \c{<b>} tag with the wrong closing
 150     \c{</b>}.  For the fragment \c{<b>bold <b>bolder</b></b>}, the
 151     first \c{<b>} would be matched with the first \c{</b>}, which is
 152     not correct. However, it is possible to write a regexp that will
 153     match nested brackets or tags correctly, but only if the number of
 154     nesting levels is fixed and known. If the number of nesting levels
 155     is not fixed and known, it is impossible to write a regexp that
 156     will not fail.
 157
 158     Suppose we want a regexp to match integers in the range 0 to 99.
 159     At least one digit is required, so we start with the expression
 160     \b{[0-9]{1,1}}, which matches a single digit exactly once. This
 161     regexp matches integers in the range 0 to 9. To match integers up
 162     to 99, increase the maximum number of occurrences to 2, so the
 163     regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the
 164     original requirement to match integers from 0 to 99, but it will
 165     also match integers that occur in the middle of strings. If we
 166     want the matched integer to be the whole string, we must use the
 167     anchor assertions, \b{^} (caret) and \b{$} (dollar). When
 168     \b{^} is the first character in a regexp, it means the regexp
 169     must match from the beginning of the string. When \b{$} is the
 170     last character of the regexp, it means the regexp must match to
 171     the end of the string. The regexp becomes \b{^[0-9]{1,2}$}.
 172     Note that assertions, e.g. \b{^} and \b{$}, do not match
 173     characters but locations in the string.
 174
 175     If you have seen regexps described elsewhere, they may have looked
 176     different from the ones shown here. This is because some sets of
 177     characters and some quantifiers are so common that they have been
 178     given special symbols to represent them. \b{[0-9]} can be
 179     replaced with the symbol \b{\\d}. The quantifier to match
 180     exactly one occurrence, \b{{1,1}}, can be replaced with the
 181     expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So
 182     our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can
 183     also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of
 184     the string, match a digit, followed immediately by 0 or 1 digits}.
 185     In practice, it would be written as \b{^\\d\\d?$}. The \b{?}
 186     is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1
 187     occurrences. \b{?} makes an expression optional. The regexp
 188     \b{^\\d\\d?$} means \e{From the beginning of the string, match
 189     one digit, followed immediately by 0 or 1 more digit, followed
 190     immediately by end of string}.
 191
 192     To write a regexp that matches one of the words 'mail' \e or
 193     'letter' \e or 'correspondence' but does not match words that
 194     contain these words, e.g., 'email', 'mailman', 'mailer', and
 195     'letterbox', start with a regexp that matches 'mail'. Expressed
 196     fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because
 197     a character expression is automatically quantified by
 198     \b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an
 199     'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now
 200     we can use the vertical bar \b{|}, which means \b{or}, to
 201     include the other two words, so our regexp for matching any of the
 202     three words becomes \b{mail|letter|correspondence}. Match
 203     'mail' \b{or} 'letter' \b{or} 'correspondence'. While this
 204     regexp will match one of the three words we want to match, it will
 205     also match words we don't want to match, e.g., 'email'.  To
 206     prevent the regexp from matching unwanted words, we must tell it
 207     to begin and end the match at word boundaries. First we enclose
 208     our regexp in parentheses, \b{(mail|letter|correspondence)}.
 209     Parentheses group expressions together, and they identify a part
 210     of the regexp that we wish to \l{capturing text}{capture}.
 211     Enclosing the expression in parentheses allows us to use it as a
 212     component in more complex regexps. It also allows us to examine
 213     which of the three words was actually matched. To force the match
 214     to begin and end on word boundaries, we enclose the regexp in
 215     \b{\\b} \e{word boundary} assertions:
 216     \b{\\b(mail|letter|correspondence)\\b}.  Now the regexp means:
 217     \e{Match a word boundary, followed by the regexp in parentheses,
 218     followed by a word boundary}. The \b{\\b} assertion matches a
 219     \e position in the regexp, not a \e character. A word boundary is
 220     any non-word character, e.g., a space, newline, or the beginning
 221     or ending of a string.
 222
 223     If we want to replace ampersand characters with the HTML entity
 224     \b{\&amp;}, the regexp to match is simply \b{\&}. But this
 225     regexp will also match ampersands that have already been converted
 226     to HTML entities. We want to replace only ampersands that are not
 227     already followed by \b{amp;}. For this, we need the negative
 228     lookahead assertion, \b{(?!}__\b{)}. The regexp can then be
 229     written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is}
 230     \b{not} \e{followed by} \b{amp;}.
 231
 232     If we want to count all the occurrences of 'Eric' and 'Eirik' in a
 233     string, two valid solutions are \b{\\b(Eric|Eirik)\\b} and
 234     \b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is
 235     required to avoid matching words that contain either name,
 236     e.g. 'Ericsson'. Note that the second regexp matches more
 237     spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.
 238
 239     Some of the examples discussed above are implemented in the
 240     \l{#code-examples}{code examples} section.
 241
 242     \target characters-and-abbreviations-for-sets-of-characters
 243     \section1 Characters and Abbreviations for Sets of Characters
 244
 245     \table
 246     \header \li Element \li Meaning
 247     \row \li \b{c}
 248          \li A character represents itself unless it has a special
 249          regexp meaning. e.g. \b{c} matches the character \e c.
 250     \row \li \b{\\c}
 251          \li A character that follows a backslash matches the character
 252          itself, except as specified below. e.g., To match a literal
 253          caret at the beginning of a string, write \b{\\^}.
 254     \row \li \b{\\a}
 255          \li Matches the ASCII bell (BEL, 0x07).
 256     \row \li \b{\\f}
 257          \li Matches the ASCII form feed (FF, 0x0C).
 258     \row \li \b{\\n}
 259          \li Matches the ASCII line feed (LF, 0x0A, Unix newline).
 260     \row \li \b{\\r}
 261          \li Matches the ASCII carriage return (CR, 0x0D).
 262     \row \li \b{\\t}
 263          \li Matches the ASCII horizontal tab (HT, 0x09).
 264     \row \li \b{\\v}
 265          \li Matches the ASCII vertical tab (VT, 0x0B).
 266     \row \li \b{\\x\e{hhhh}}
 267          \li Matches the Unicode character corresponding to the
 268          hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).
 269     \row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo})
 270          \li matches the ASCII/Latin1 character for the octal number
 271          \e{ooo} (between 0 and 0377).
 272     \row \li \b{. (dot)}
 273          \li Matches any character (including newline).
 274     \row \li \b{\\d}
 275          \li Matches a digit (QChar::isDigit()).
 276     \row \li \b{\\D}
 277          \li Matches a non-digit.
 278     \row \li \b{\\s}
 279          \li Matches a whitespace character (QChar::isSpace()).
 280     \row \li \b{\\S}
 281          \li Matches a non-whitespace character.
 282     \row \li \b{\\w}
 283          \li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').
 284     \row \li \b{\\W}
 285          \li Matches a non-word character.
 286     \row \li \b{\\\e{n}}
 287          \li The \e{n}-th \l backreference, e.g. \\1, \\2, etc.
 288     \endtable
 289
 290     \b{Note:} The C++ compiler transforms backslashes in strings.
 291     To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}.
 292     To match the backslash character itself, enter it four times, i.e.
 293     \c{\\\\}.
 294
 295     \target sets-of-characters
 296     \section1 Sets of Characters
 297
 298     Square brackets mean match any character contained in the square
 299     brackets. The character set abbreviations described above can
 300     appear in a character set in square brackets. Except for the
 301     character set abbreviations and the following two exceptions,
 302     characters do not have special meanings in square brackets.
 303
 304     \table
 305     \row \li \b{^}
 306
 307          \li The caret negates the character set if it occurs as the
 308          first character (i.e. immediately after the opening square
 309          bracket). \b{[abc]} matches 'a' or 'b' or 'c', but
 310          \b{[^abc]} matches anything \e but 'a' or 'b' or 'c'.
 311
 312     \row \li \b{-}
 313
 314          \li The dash indicates a range of characters. \b{[W-Z]}
 315          matches 'W' or 'X' or 'Y' or 'Z'.
 316
 317     \endtable
 318
 319     Using the predefined character set abbreviations is more portable
 320     than using character ranges across platforms and languages. For
 321     example, \b{[0-9]} matches a digit in Western alphabets but
 322     \b{\\d} matches a digit in \e any alphabet.
 323
 324     Note: In other regexp documentation, sets of characters are often
 325     called "character classes".
 326
 327     \target quantifiers
 328     \section1 Quantifiers
 329
 330     By default, an expression is automatically quantified by
 331     \b{{1,1}}, i.e. it should occur exactly once. In the following
 332     list, \b{\e {E}} stands for expression. An expression is a
 333     character, or an abbreviation for a set of characters, or a set of
 334     characters in square brackets, or an expression in parentheses.
 335
 336     \table
 337     \row \li \b{\e {E}?}
 338
 339          \li Matches zero or one occurrences of \e E. This quantifier
 340          means \e{The previous expression is optional}, because it
 341          will match whether or not the expression is found. \b{\e
 342          {E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?}
 343          matches 'dent' or 'dents'.
 344
 345     \row \li \b{\e {E}+}
 346
 347          \li Matches one or more occurrences of \e E. \b{\e {E}+} is
 348          the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0',
 349          '00', '000', etc.
 350
 351     \row \li \b{\e {E}*}
 352
 353          \li Matches zero or more occurrences of \e E. It is the same
 354          as \b{\e {E}{0,}}. The \b{*} quantifier is often used
 355          in error where \b{+} should be used. For example, if
 356          \b{\\s*$} is used in an expression to match strings that
 357          end in whitespace, it will match every string because
 358          \b{\\s*$} means \e{Match zero or more whitespaces followed
 359          by end of string}. The correct regexp to match strings that
 360          have at least one trailing whitespace character is
 361          \b{\\s+$}.
 362
 363     \row \li \b{\e {E}{n}}
 364
 365          \li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}}
 366          is the same as repeating \e E \e n times. For example,
 367          \b{x{5}} is the same as \b{xxxxx}. It is also the same
 368          as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}.
 369
 370     \row \li \b{\e {E}{n,}}
 371          \li Matches at least \e n occurrences of \e E.
 372
 373     \row \li \b{\e {E}{,m}}
 374          \li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}}
 375          is the same as \b{\e {E}{0,m}}.
 376
 377     \row \li \b{\e {E}{n,m}}
 378          \li Matches at least \e n and at most \e m occurrences of \e E.
 379     \endtable
 380
 381     To apply a quantifier to more than just the preceding character,
 382     use parentheses to group characters together in an expression. For
 383     example, \b{tag+} matches a 't' followed by an 'a' followed by
 384     at least one 'g', whereas \b{(tag)+} matches at least one
 385     occurrence of 'tag'.
 386
 387     Note: Quantifiers are normally "greedy". They always match as much
 388     text as they can. For example, \b{0+} matches the first zero it
 389     finds and all the consecutive zeros after the first zero. Applied
 390     to '20005', it matches'2\underline{000}5'. Quantifiers can be made
 391     non-greedy, see setMinimal().
 392
 393     \target capturing parentheses
 394     \target backreferences
 395     \section1 Capturing Text
 396
 397     Parentheses allow us to group elements together so that we can
 398     quantify and capture them. For example if we have the expression
 399     \b{mail|letter|correspondence} that matches a string we know
 400     that \e one of the words matched but not which one. Using
 401     parentheses allows us to "capture" whatever is matched within
 402     their bounds, so if we used \b{(mail|letter|correspondence)}
 403     and matched this regexp against the string "I sent you some email"
 404     we can use the cap() or capturedTexts() functions to extract the
 405     matched characters, in this case 'mail'.
 406
 407     We can use captured text within the regexp itself. To refer to the
 408     captured text we use \e backreferences which are indexed from 1,
 409     the same as for cap(). For example we could search for duplicate
 410     words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a
 411     word boundary followed by one or more word characters followed by
 412     one or more non-word characters followed by the same text as the
 413     first parenthesized expression followed by a word boundary.
 414
 415     If we want to use parentheses purely for grouping and not for
 416     capturing we can use the non-capturing syntax, e.g.
 417     \b{(?:green|blue)}. Non-capturing parentheses begin '(?:' and
 418     end ')'. In this example we match either 'green' or 'blue' but we
 419     do not capture the match so we only know whether or not we matched
 420     but not which color we actually found. Using non-capturing
 421     parentheses is more efficient than using capturing parentheses
 422     since the regexp engine has to do less book-keeping.
 423
 424     Both capturing and non-capturing parentheses may be nested.
 425
 426     \target greedy quantifiers
 427
 428     For historical reasons, quantifiers (e.g. \b{*}) that apply to
 429     capturing parentheses are more "greedy" than other quantifiers.
 430     For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa".
 431     This behavior is different from what other regexp engines do
 432     (notably, Perl). To obtain a more intuitive capturing behavior,
 433     specify QRegExp::RegExp2 to the QRegExp constructor or call
 434     setPatternSyntax(QRegExp::RegExp2).
 435
 436     \target cap_in_a_loop
 437
 438     When the number of matches cannot be determined in advance, a
 439     common idiom is to use cap() in a loop. For example:
 440
 441     \snippet code/src_corelib_tools_qregexp.cpp 0
 442
 443     \target assertions
 444     \section1 Assertions
 445
 446     Assertions make some statement about the text at the point where
 447     they occur in the regexp but they do not match any characters. In
 448     the following list \b{\e {E}} stands for any expression.
 449
 450     \table
 451     \row \li \b{^}
 452          \li The caret signifies the beginning of the string. If you
 453          wish to match a literal \c{^} you must escape it by
 454          writing \c{\\^}. For example, \b{^#include} will only
 455          match strings which \e begin with the characters '#include'.
 456          (When the caret is the first character of a character set it
 457          has a special meaning, see \l{#sets-of-characters}{Sets of Characters}.)
 458
 459     \row \li \b{$}
 460          \li The dollar signifies the end of the string. For example
 461          \b{\\d\\s*$} will match strings which end with a digit
 462          optionally followed by whitespace. If you wish to match a
 463          literal \c{$} you must escape it by writing
 464          \c{\\$}.
 465
 466     \row \li \b{\\b}
 467          \li A word boundary. For example the regexp
 468          \b{\\bOK\\b} means match immediately after a word
 469          boundary (e.g. start of string or whitespace) the letter 'O'
 470          then the letter 'K' immediately before another word boundary
 471          (e.g. end of string or whitespace). But note that the
 472          assertion does not actually match any whitespace so if we
 473          write \b{(\\bOK\\b)} and we have a match it will only
 474          contain 'OK' even if the string is "It's \underline{OK} now".
 475
 476     \row \li \b{\\B}
 477          \li A non-word boundary. This assertion is true wherever
 478          \b{\\b} is false. For example if we searched for
 479          \b{\\Bon\\B} in "Left on" the match would fail (space
 480          and end of string aren't non-word boundaries), but it would
 481          match in "t\underline{on}ne".
 482
 483     \row \li \b{(?=\e E)}
 484          \li Positive lookahead. This assertion is true if the
 485          expression matches at this point in the regexp. For example,
 486          \b{const(?=\\s+char)} matches 'const' whenever it is
 487          followed by 'char', as in 'static \underline{const} char *'.
 488          (Compare with \b{const\\s+char}, which matches 'static
 489          \underline{const char} *'.)
 490
 491     \row \li \b{(?!\e E)}
 492          \li Negative lookahead. This assertion is true if the
 493          expression does not match at this point in the regexp. For
 494          example, \b{const(?!\\s+char)} matches 'const' \e except
 495          when it is followed by 'char'.
 496     \endtable
 497
 498     \keyword QRegExp wildcard matching
 499     \section1 Wildcard Matching
 500
 501     Most command shells such as \e bash or \e cmd.exe support "file
 502     globbing", the ability to identify a group of files by using
 503     wildcards. The setPatternSyntax() function is used to switch
 504     between regexp and wildcard mode. Wildcard matching is much
 505     simpler than full regexps and has only four features:
 506
 507     \table
 508     \row \li \b{c}
 509          \li Any character represents itself apart from those mentioned
 510          below. Thus \b{c} matches the character \e c.
 511     \row \li \b{?}
 512          \li Matches any single character. It is the same as
 513          \b{.} in full regexps.
 514     \row \li \b{*}
 515          \li Matches zero or more of any characters. It is the
 516          same as \b{.*} in full regexps.
 517     \row \li \b{[...]}
 518          \li Sets of characters can be represented in square brackets,
 519          similar to full regexps. Within the character class, like
 520          outside, backslash has no special meaning.
 521     \endtable
 522
 523     In the mode Wildcard, the wildcard characters cannot be
 524     escaped. In the mode WildcardUnix, the character '\\' escapes the
 525     wildcard.
 526
 527     For example if we are in wildcard mode and have strings which
 528     contain filenames we could identify HTML files with \b{*.html}.
 529     This will match zero or more characters followed by a dot followed
 530     by 'h', 't', 'm' and 'l'.
 531
 532     To test a string against a wildcard expression, use exactMatch().
 533     For example:
 534
 535     \snippet code/src_corelib_tools_qregexp.cpp 1
 536
 537     \target perl-users
 538     \section1 Notes for Perl Users
 539
 540     Most of the character class abbreviations supported by Perl are
 541     supported by QRegExp, see \l{#characters-and-abbreviations-for-sets-of-characters}
 542     {characters and abbreviations for sets of characters}.
 543
 544     In QRegExp, apart from within character classes, \c{^} always
 545     signifies the start of the string, so carets must always be
 546     escaped unless used for that purpose. In Perl the meaning of caret
 547     varies automagically depending on where it occurs so escaping it
 548     is rarely necessary. The same applies to \c{$} which in
 549     QRegExp always signifies the end of the string.
 550
 551     QRegExp's quantifiers are the same as Perl's greedy quantifiers
 552     (but see the \l{greedy quantifiers}{note above}). Non-greedy
 553     matching cannot be applied to individual quantifiers, but can be
 554     applied to all the quantifiers in the pattern. For example, to
 555     match the Perl regexp \b{ro+?m} requires:
 556
 557     \snippet code/src_corelib_tools_qregexp.cpp 2
 558
 559     The equivalent of Perl's \c{/i} option is
 560     setCaseSensitivity(Qt::CaseInsensitive).
 561
 562     Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.
 563
 564     In QRegExp \b{.} matches any character, therefore all QRegExp
 565     regexps have the equivalent of Perl's \c{/s} option. QRegExp
 566     does not have an equivalent to Perl's \c{/m} option, but this
 567     can be emulated in various ways for example by splitting the input
 568     into lines or by looping with a regexp that searches for newlines.
 569
 570     Because QRegExp is string oriented, there are no \\A, \\Z, or \\z
 571     assertions. The \\G assertion is not supported but can be emulated
 572     in a loop.
 573
 574     Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
 575     equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
 576     ... correspond to cap(1) or capturedTexts()[1], cap(2) or
 577     capturedTexts()[2], etc.
 578
 579     To substitute a pattern use QString::replace().
 580
 581     Perl's extended \c{/x} syntax is not supported, nor are
 582     directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
 583     the other hand, C++'s rules for literal strings can be used to
 584     achieve the same:
 585
 586     \snippet code/src_corelib_tools_qregexp.cpp 3
 587
 588     Both zero-width positive and zero-width negative lookahead
 589     assertions (?=pattern) and (?!pattern) are supported with the same
 590     syntax as Perl. Perl's lookbehind assertions, "independent"
 591     subexpressions and conditional expressions are not supported.
 592
 593     Non-capturing parentheses are also supported, with the same
 594     (?:pattern) syntax.
 595
 596     See QString::split() and QStringList::join() for equivalents
 597     to Perl's split and join functions.
 598
 599     Note: because C++ transforms \\'s they must be written \e twice in
 600     code, e.g. \b{\\b} must be written \b{\\\\b}.
 601
 602     \target code-examples
 603     \section1 Code Examples
 604
 605     \snippet code/src_corelib_tools_qregexp.cpp 4
 606
 607     The third string matches '\underline{6}'. This is a simple validation
 608     regexp for integers in the range 0 to 99.
 609
 610     \snippet code/src_corelib_tools_qregexp.cpp 5
 611
 612     The second string matches '\underline{This_is-OK}'. We've used the
 613     character set abbreviation '\\S' (non-whitespace) and the anchors
 614     to match strings which contain no whitespace.
 615
 616     In the following example we match strings containing 'mail' or
 617     'letter' or 'correspondence' but only match whole words i.e. not
 618     'email'
 619
 620     \snippet code/src_corelib_tools_qregexp.cpp 6
 621
 622     The second string matches "Please write the \underline{letter}". The
 623     word 'letter' is also captured (because of the parentheses). We
 624     can see what text we've captured like this:
 625
 626     \snippet code/src_corelib_tools_qregexp.cpp 7
 627
 628     This will capture the text from the first set of capturing
 629     parentheses (counting capturing left parentheses from left to
 630     right). The parentheses are counted from 1 since cap(0) is the
 631     whole matched regexp (equivalent to '&' in most regexp engines).
 632
 633     \snippet code/src_corelib_tools_qregexp.cpp 8
 634
 635     Here we've passed the QRegExp to QString's replace() function to
 636     replace the matched text with new text.
 637
 638     \snippet code/src_corelib_tools_qregexp.cpp 9
 639
 640     We've used the indexIn() function to repeatedly match the regexp in
 641     the string. Note that instead of moving forward by one character
 642     at a time \c pos++ we could have written \c {pos +=
 643     rx.matchedLength()} to skip over the already matched string. The
 644     count will equal 3, matching 'One \underline{Eric} another
 645     \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it
 646     doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
 647     by non-word boundaries.
 648
 649     One common use of regexps is to split lines of delimited data into
 650     their component fields.
 651
 652     \snippet code/src_corelib_tools_qregexp.cpp 10
 653
 654     In this example our input lines have the format company name, web
 655     address and country. Unfortunately the regexp is rather long and
 656     not very versatile -- the code will break if we add any more
 657     fields. A simpler and better solution is to look for the
 658     separator, '\\t' in this case, and take the surrounding text. The
 659     QString::split() function can take a separator string or regexp
 660     as an argument and split a string accordingly.
 661
 662     \snippet code/src_corelib_tools_qregexp.cpp 11
 663
 664     Here field[0] is the company, field[1] the web address and so on.
 665
 666     To imitate the matching of a shell we can use wildcard mode.
 667
 668     \snippet code/src_corelib_tools_qregexp.cpp 12
 669
 670     Wildcard matching can be convenient because of its simplicity, but
 671     any wildcard regexp can be defined using full regexps, e.g.
 672     \b{.*\\.html$}. Notice that we can't match both \c .html and \c
 673     .htm files with a wildcard unless we use \b{*.htm*} which will
 674     also match 'test.html.bak'. A full regexp gives us the precision
 675     we need, \b{.*\\.html?$}.
 676
 677     QRegExp can match case insensitively using setCaseSensitivity(),
 678     and can use non-greedy matching, see setMinimal(). By
 679     default QRegExp uses full regexps but this can be changed with
 680     setWildcard(). Searching can be forward with indexIn() or backward
 681     with lastIndexIn(). Captured text can be accessed using
 682     capturedTexts() which returns a string list of all captured
 683     strings, or using cap() which returns the captured string for the
 684     given index. The pos() function takes a match index and returns
 685     the position in the string where the match was made (or -1 if
 686     there was no match).
 687
 688     \sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel,
 689         {tools/regexp}{Regular Expression Example}
 690 */
 691
 692 #if defined(Q_OS_VXWORKS) && defined(EOS)
 693 #  undef EOS
 694 #endif
 695
 696 const int NumBadChars = 64;
 697 #define BadChar(ch) ((ch).unicode() % NumBadChars)
 698
 699 const int NoOccurrence = INT_MAX;
 700 const int EmptyCapture = INT_MAX;
 701 const int InftyLen = INT_MAX;
 702 const int InftyRep = 1025;
 703 const int EOS = -1;
 704
 705 static bool isWord(QChar ch)
 706 {
 707     return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_');
 708 }
 709
 710 /*
 711   Merges two vectors of ints and puts the result into the first
 712   one.
 713 */
 714 static void mergeInto(QVector<int> *a, const QVector<int> &b)
 715 {
 716     int asize = a->size();
 717     int bsize = b.size();
 718     if (asize == 0) {
 719         *a = b;
 720 #ifndef QT_NO_REGEXP_OPTIM
 721     } else if (bsize == 1 && a->at(asize - 1) < b.at(0)) {
 722         a->resize(asize + 1);
 723         (*a)[asize] = b.at(0);
 724 #endif
 725     } else if (bsize >= 1) {
 726         int csize = asize + bsize;
 727         QVector<int> c(csize);
 728         int i = 0, j = 0, k = 0;
 729         while (i < asize) {
 730             if (j < bsize) {
 731                 if (a->at(i) == b.at(j)) {
 732                     ++i;
 733                     --csize;
 734                 } else if (a->at(i) < b.at(j)) {
 735                     c[k++] = a->at(i++);
 736                 } else {
 737                     c[k++] = b.at(j++);
 738                 }
 739             } else {
 740                 memcpy(c.data() + k, a->constData() + i, (asize - i) * sizeof(int));
 741                 break;
 742             }
 743         }
 744         c.resize(csize);
 745         if (j < bsize)
 746             memcpy(c.data() + k, b.constData() + j, (bsize - j) * sizeof(int));
 747         *a = c;
 748     }
 749 }
 750
 751 #ifndef QT_NO_REGEXP_WILDCARD
 752 /*
 753   Translates a wildcard pattern to an equivalent regular expression
 754   pattern (e.g., *.cpp to .*\.cpp).
 755
 756   If enableEscaping is true, it is possible to escape the wildcard
 757   characters with \
 758 */
 759 static QString wc2rx(const QString &wc_str, const bool enableEscaping)
 760 {
 761     const int wclen = wc_str.length();
 762     QString rx;
 763     int i = 0;
 764     bool isEscaping = false; // the previous character is '\'
 765     const QChar *wc = wc_str.unicode();
 766
 767     while (i < wclen) {
 768         const QChar c = wc[i++];
 769         switch (c.unicode()) {
 770         case '\\':
 771             if (enableEscaping) {
 772                 if (isEscaping) {
 773                     rx += QLatin1String("\\\\");
 774                 } // we insert the \\ later if necessary
 775                 if (i == wclen) { // the end
 776                     rx += QLatin1String("\\\\");
 777                 }
 778             } else {
 779                 rx += QLatin1String("\\\\");
 780             }
 781             isEscaping = true;
 782             break;
 783         case '*':
 784             if (isEscaping) {
 785                 rx += QLatin1String("\\*");
 786                 isEscaping = false;
 787             } else {
 788                 rx += QLatin1String(".*");
 789             }
 790             break;
 791         case '?':
 792             if (isEscaping) {
 793                 rx += QLatin1String("\\?");
 794                 isEscaping = false;
 795             } else {
 796                 rx += QLatin1Char('.');
 797             }
 798
 799             break;
 800         case '$':
 801         case '(':
 802         case ')':
 803         case '+':
 804         case '.':
 805         case '^':
 806         case '{':
 807         case '|':
 808         case '}':
 809             if (isEscaping) {
 810                 isEscaping = false;
 811                 rx += QLatin1String("\\\\");
 812             }
 813             rx += QLatin1Char('\\');
 814             rx += c;
 815             break;
 816          case '[':
 817             if (isEscaping) {
 818                 isEscaping = false;
 819                 rx += QLatin1String("\\[");
 820             } else {
 821                 rx += c;
 822                 if (wc[i] == QLatin1Char('^'))
 823                     rx += wc[i++];
 824                 if (i < wclen) {
 825                     if (rx[i] == QLatin1Char(']'))
 826                         rx += wc[i++];
 827                     while (i < wclen && wc[i] != QLatin1Char(']')) {
 828                         if (wc[i] == QLatin1Char('\\'))
 829                             rx += QLatin1Char('\\');
 830                         rx += wc[i++];
 831                     }
 832                 }
 833             }
 834              break;
 835
 836         case ']':
 837             if(isEscaping){
 838                 isEscaping = false;
 839                 rx += QLatin1String("\\");
 840             }
 841             rx += c;
 842             break;
 843
 844         default:
 845             if(isEscaping){
 846                 isEscaping = false;
 847                 rx += QLatin1String("\\\\");
 848             }
 849             rx += c;
 850         }
 851     }
 852     return rx;
 853 }
 854 #endif
 855
 856 static int caretIndex(int offset, QRegExp::CaretMode caretMode)
 857 {
 858     if (caretMode == QRegExp::CaretAtZero) {
 859         return 0;
 860     } else if (caretMode == QRegExp::CaretAtOffset) {
 861         return offset;
 862     } else { // QRegExp::CaretWontMatch
 863         return -1;
 864     }
 865 }
 866
 867 /*
 868     The QRegExpEngineKey struct uniquely identifies an engine.
 869 */
 870 struct QRegExpEngineKey
 871 {
 872     QString pattern;
 873     QRegExp::PatternSyntax patternSyntax;
 874     Qt::CaseSensitivity cs;
 875
 876     inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,
 877                             Qt::CaseSensitivity cs)
 878         : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {}
 879
 880     inline void clear() {
 881         pattern.clear();
 882         patternSyntax = QRegExp::RegExp;
 883         cs = Qt::CaseSensitive;
 884     }
 885 };
 886
 887 Q_STATIC_GLOBAL_OPERATOR bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)
 888 {
 889     return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax
 890            && key1.cs == key2.cs;
 891 }
 892
 893 class QRegExpEngine;
 894
 895 //Q_DECLARE_TYPEINFO(QVector<int>, Q_MOVABLE_TYPE);
 896
 897 /*
 898   This is the engine state during matching.
 899 */
 900 struct QRegExpMatchState
 901 {
 902     const QChar *in; // a pointer to the input string data
 903     int pos; // the current position in the string
 904     int caretPos;
 905     int len; // the length of the input string
 906     bool minimal; // minimal matching?
 907     int *bigArray; // big array holding the data for the next pointers
 908     int *inNextStack; // is state is nextStack?
 909     int *curStack; // stack of current states
 910     int *nextStack; // stack of next states
 911     int *curCapBegin; // start of current states' captures
 912     int *nextCapBegin; // start of next states' captures
 913     int *curCapEnd; // end of current states' captures
 914     int *nextCapEnd; // end of next states' captures
 915     int *tempCapBegin; // start of temporary captures
 916     int *tempCapEnd; // end of temporary captures
 917     int *capBegin; // start of captures for a next state
 918     int *capEnd; // end of captures for a next state
 919     int *slideTab; // bump-along slide table for bad-character heuristic
 920     int *captured; // what match() returned last
 921     int slideTabSize; // size of slide table
 922     int capturedSize;
 923 #ifndef QT_NO_REGEXP_BACKREF
 924     QList<QVector<int> > sleeping; // list of back-reference sleepers
 925 #endif
 926     int matchLen; // length of match
 927     int oneTestMatchedLen; // length of partial match
 928
 929     const QRegExpEngine *eng;
 930
 931     inline QRegExpMatchState() : bigArray(0), captured(0) {}
 932     inline ~QRegExpMatchState() { free(bigArray); }
 933
 934     void drain() { free(bigArray); bigArray = 0; captured = 0; } // to save memory
 935     void prepareForMatch(QRegExpEngine *eng);
 936     void match(const QChar *str, int len, int pos, bool minimal,
 937         bool oneTest, int caretIndex);
 938     bool matchHere();
 939     bool testAnchor(int i, int a, const int *capBegin);
 940 };
 941
 942 /*
 943   The struct QRegExpAutomatonState represents one state in a modified NFA. The
 944   input characters matched are stored in the state instead of on
 945   the transitions, something possible for an automaton
 946   constructed from a regular expression.
 947 */
 948 struct QRegExpAutomatonState
 949 {
 950 #ifndef QT_NO_REGEXP_CAPTURE
 951     int atom; // which atom does this state belong to?
 952 #endif
 953     int match; // what does it match? (see CharClassBit and BackRefBit)
 954     QVector<int> outs; // out-transitions
 955     QMap<int, int> reenter; // atoms reentered when transiting out
 956     QMap<int, int> anchors; // anchors met when transiting out
 957
 958     inline QRegExpAutomatonState() { }
 959 #ifndef QT_NO_REGEXP_CAPTURE
 960     inline QRegExpAutomatonState(int a, int m)
 961         : atom(a), match(m) { }
 962 #else
 963     inline QRegExpAutomatonState(int m)
 964         : match(m) { }
 965 #endif
 966 };
 967
 968 Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE);
 969
 970 /*
 971   The struct QRegExpCharClassRange represents a range of characters (e.g.,
 972   [0-9] denotes range 48 to 57).
 973 */
 974 struct QRegExpCharClassRange
 975 {
 976     ushort from; // 48
 977     ushort len; // 10
 978 };
 979
 980 Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
 981
 982 #ifndef QT_NO_REGEXP_CAPTURE
 983 /*
 984   The struct QRegExpAtom represents one node in the hierarchy of regular
 985   expression atoms.
 986 */
 987 struct QRegExpAtom
 988 {
 989     enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 };
 990
 991     int parent; // index of parent in array of atoms
 992     int capture; // index of capture, from 1 to ncap - 1
 993 };
 994
 995 Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
 996 #endif
 997
 998 struct QRegExpLookahead;
 999
1000 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1001 /*
1002   The struct QRegExpAnchorAlternation represents a pair of anchors with
1003   OR semantics.
1004 */
1005 struct QRegExpAnchorAlternation
1006 {
1007     int a; // this anchor...
1008     int b; // ...or this one
1009 };
1010
1011 Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
1012 #endif
1013
1014 #ifndef QT_NO_REGEXP_CCLASS
1015
1016 #define FLAG(x) (1 << (x))
1017 /*
1018   The class QRegExpCharClass represents a set of characters, such as can
1019   be found in regular expressions (e.g., [a-z] denotes the set
1020   {a, b, ..., z}).
1021 */
1022 class QRegExpCharClass
1023 {
1024 public:
1025     QRegExpCharClass();
1026     inline QRegExpCharClass(const QRegExpCharClass &cc) { operator=(cc); }
1027
1028     QRegExpCharClass &operator=(const QRegExpCharClass &cc);
1029
1030     void clear();
1031     bool negative() const { return n; }
1032     void setNegative(bool negative);
1033     void addCategories(uint cats);
1034     void addRange(ushort from, ushort to);
1035     void addSingleton(ushort ch) { addRange(ch, ch); }
1036
1037     bool in(QChar ch) const;
1038 #ifndef QT_NO_REGEXP_OPTIM
1039     const QVector<int> &firstOccurrence() const { return occ1; }
1040 #endif
1041
1042 #if defined(QT_DEBUG)
1043     void dump() const;
1044 #endif
1045
1046 private:
1047     uint c; // character classes
1048     QVector<QRegExpCharClassRange> r; // character ranges
1049     bool n; // negative?
1050 #ifndef QT_NO_REGEXP_OPTIM
1051     QVector<int> occ1; // first-occurrence array
1052 #endif
1053 };
1054 #else
1055 struct QRegExpCharClass
1056 {
1057     int dummy;
1058
1059 #ifndef QT_NO_REGEXP_OPTIM
1060     QRegExpCharClass() { occ1.fill(0, NumBadChars); }
1061
1062     const QVector<int> &firstOccurrence() const { return occ1; }
1063     QVector<int> occ1;
1064 #endif
1065 };
1066 #endif
1067
1068 Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE);
1069
1070 /*
1071   The QRegExpEngine class encapsulates a modified nondeterministic
1072   finite automaton (NFA).
1073 */
1074 class QRegExpEngine
1075 {
1076 public:
1077     QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)
1078         : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }
1079
1080     QRegExpEngine(const QRegExpEngineKey &key);
1081     ~QRegExpEngine();
1082
1083     bool isValid() const { return valid; }
1084     const QString &errorString() const { return yyError; }
1085     int captureCount() const { return officialncap; }
1086
1087     int createState(QChar ch);
1088     int createState(const QRegExpCharClass &cc);
1089 #ifndef QT_NO_REGEXP_BACKREF
1090     int createState(int bref);
1091 #endif
1092
1093     void addCatTransitions(const QVector<int> &from, const QVector<int> &to);
1094 #ifndef QT_NO_REGEXP_CAPTURE
1095     void addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom);
1096 #endif
1097
1098 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1099     int anchorAlternation(int a, int b);
1100     int anchorConcatenation(int a, int b);
1101 #else
1102     int anchorAlternation(int a, int b) { return a & b; }
1103     int anchorConcatenation(int a, int b) { return a | b; }
1104 #endif
1105     void addAnchors(int from, int to, int a);
1106
1107 #ifndef QT_NO_REGEXP_OPTIM
1108     void heuristicallyChooseHeuristic();
1109 #endif
1110
1111 #if defined(QT_DEBUG)
1112     void dump() const;
1113 #endif
1114
1115     QAtomicInt ref;
1116
1117 private:
1118     enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
1119     enum { InitialState = 0, FinalState = 1 };
1120
1121     void setup();
1122     int setupState(int match);
1123
1124     /*
1125       Let's hope that 13 lookaheads and 14 back-references are
1126       enough.
1127      */
1128     enum { MaxLookaheads = 13, MaxBackRefs = 14 };
1129     enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,
1130            Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,
1131            Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1132            Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
1133            Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,
1134
1135            Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^
1136                    ((Anchor_FirstLookahead << MaxLookaheads) - 1) };
1137 #ifndef QT_NO_REGEXP_CAPTURE
1138     int startAtom(bool officialCapture);
1139     void finishAtom(int atom, bool needCapture);
1140 #endif
1141
1142 #ifndef QT_NO_REGEXP_LOOKAHEAD
1143     int addLookahead(QRegExpEngine *eng, bool negative);
1144 #endif
1145
1146 #ifndef QT_NO_REGEXP_OPTIM
1147     bool goodStringMatch(QRegExpMatchState &matchState) const;
1148     bool badCharMatch(QRegExpMatchState &matchState) const;
1149 #else
1150     bool bruteMatch(QRegExpMatchState &matchState) const;
1151 #endif
1152
1153     QVector<QRegExpAutomatonState> s; // array of states
1154 #ifndef QT_NO_REGEXP_CAPTURE
1155     QVector<QRegExpAtom> f; // atom hierarchy
1156     int nf; // number of atoms
1157     int cf; // current atom
1158     QVector<int> captureForOfficialCapture;
1159 #endif
1160     int officialncap; // number of captures, seen from the outside
1161     int ncap; // number of captures, seen from the inside
1162 #ifndef QT_NO_REGEXP_CCLASS
1163     QVector<QRegExpCharClass> cl; // array of character classes
1164 #endif
1165 #ifndef QT_NO_REGEXP_LOOKAHEAD
1166     QVector<QRegExpLookahead *> ahead; // array of lookaheads
1167 #endif
1168 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1169     QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
1170 #endif
1171 #ifndef QT_NO_REGEXP_OPTIM
1172     bool caretAnchored; // does the regexp start with ^?
1173     bool trivial; // is the good-string all that needs to match?
1174 #endif
1175     bool valid; // is the regular expression valid?
1176     Qt::CaseSensitivity cs; // case sensitive?
1177     bool greedyQuantifiers; // RegExp2?
1178     bool xmlSchemaExtensions;
1179 #ifndef QT_NO_REGEXP_BACKREF
1180     int nbrefs; // number of back-references
1181 #endif
1182
1183 #ifndef QT_NO_REGEXP_OPTIM
1184     bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1185
1186     int goodEarlyStart; // the index where goodStr can first occur in a match
1187     int goodLateStart; // the index where goodStr can last occur in a match
1188     QString goodStr; // the string that any match has to contain
1189
1190     int minl; // the minimum length of a match
1191     QVector<int> occ1; // first-occurrence array
1192 #endif
1193
1194     /*
1195       The class Box is an abstraction for a regular expression
1196       fragment. It can also be seen as one node in the syntax tree of
1197       a regular expression with synthetized attributes.
1198
1199       Its interface is ugly for performance reasons.
1200     */
1201     class Box
1202     {
1203     public:
1204         Box(QRegExpEngine *engine);
1205         Box(const Box &b) { operator=(b); }
1206
1207         Box &operator=(const Box &b);
1208
1209         void clear() { operator=(Box(eng)); }
1210         void set(QChar ch);
1211         void set(const QRegExpCharClass &cc);
1212 #ifndef QT_NO_REGEXP_BACKREF
1213         void set(int bref);
1214 #endif
1215
1216         void cat(const Box &b);
1217         void orx(const Box &b);
1218         void plus(int atom);
1219         void opt();
1220         void catAnchor(int a);
1221 #ifndef QT_NO_REGEXP_OPTIM
1222         void setupHeuristics();
1223 #endif
1224
1225 #if defined(QT_DEBUG)
1226         void dump() const;
1227 #endif
1228
1229     private:
1230         void addAnchorsToEngine(const Box &to) const;
1231
1232         QRegExpEngine *eng; // the automaton under construction
1233         QVector<int> ls; // the left states (firstpos)
1234         QVector<int> rs; // the right states (lastpos)
1235         QMap<int, int> lanchors; // the left anchors
1236         QMap<int, int> ranchors; // the right anchors
1237         int skipanchors; // the anchors to match if the box is skipped
1238
1239 #ifndef QT_NO_REGEXP_OPTIM
1240         int earlyStart; // the index where str can first occur
1241         int lateStart; // the index where str can last occur
1242         QString str; // a string that has to occur in any match
1243         QString leftStr; // a string occurring at the left of this box
1244         QString rightStr; // a string occurring at the right of this box
1245         int maxl; // the maximum length of this box (possibly InftyLen)
1246 #endif
1247
1248         int minl; // the minimum length of this box
1249 #ifndef QT_NO_REGEXP_OPTIM
1250         QVector<int> occ1; // first-occurrence array
1251 #endif
1252     };
1253
1254     friend class Box;
1255
1256     /*
1257       This is the lexical analyzer for regular expressions.
1258     */
1259     enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
1260            Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
1261            Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
1262     int getChar();
1263     int getEscape();
1264 #ifndef QT_NO_REGEXP_INTERVAL
1265     int getRep(int def);
1266 #endif
1267 #ifndef QT_NO_REGEXP_LOOKAHEAD
1268     void skipChars(int n);
1269 #endif
1270     void error(const char *msg);
1271     void startTokenizer(const QChar *rx, int len);
1272     int getToken();
1273
1274     const QChar *yyIn; // a pointer to the input regular expression pattern
1275     int yyPos0; // the position of yyTok in the input pattern
1276     int yyPos; // the position of the next character to read
1277     int yyLen; // the length of yyIn
1278     int yyCh; // the last character read
1279     QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens
1280     int yyMinRep; // attribute for Tok_Quantifier
1281     int yyMaxRep; // ditto
1282     QString yyError; // syntax error or overflow during parsing?
1283
1284     /*
1285       This is the syntactic analyzer for regular expressions.
1286     */
1287     int parse(const QChar *rx, int len);
1288     void parseAtom(Box *box);
1289     void parseFactor(Box *box);
1290     void parseTerm(Box *box);
1291     void parseExpression(Box *box);
1292
1293     int yyTok; // the last token read
1294     bool yyMayCapture; // set this to false to disable capturing
1295
1296     friend struct QRegExpMatchState;
1297 };
1298
1299 #ifndef QT_NO_REGEXP_LOOKAHEAD
1300 /*
1301   The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
1302   (?=foo) and (?!bar)).
1303 */
1304 struct QRegExpLookahead
1305 {
1306     QRegExpEngine *eng; // NFA representing the embedded regular expression
1307     bool neg; // negative lookahead?
1308
1309     inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)
1310         : eng(eng0), neg(neg0) { }
1311     inline ~QRegExpLookahead() { delete eng; }
1312 };
1313 #endif
1314
1315 /*!
1316     \internal
1317     convert the pattern string to the RegExp syntax.
1318
1319     This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan
1320  */
1321 Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax)
1322 {
1323     switch (patternSyntax) {
1324 #ifndef QT_NO_REGEXP_WILDCARD
1325     case QRegExp::Wildcard:
1326         return wc2rx(pattern, false);
1327     case QRegExp::WildcardUnix:
1328         return wc2rx(pattern, true);
1329 #endif
1330     case QRegExp::FixedString:
1331         return QRegExp::escape(pattern);
1332     case QRegExp::W3CXmlSchema11:
1333     default:
1334         return pattern;
1335     }
1336 }
1337
1338 QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
1339     : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),
1340       xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)
1341 {
1342     setup();
1343
1344     QString rx = qt_regexp_toCanonical(key.pattern, key.patternSyntax);
1345
1346     valid = (parse(rx.unicode(), rx.length()) == rx.length());
1347     if (!valid) {
1348 #ifndef QT_NO_REGEXP_OPTIM
1349         trivial = false;
1350 #endif
1351         error(RXERR_LEFTDELIM);
1352     }
1353 }
1354
1355 QRegExpEngine::~QRegExpEngine()
1356 {
1357 #ifndef QT_NO_REGEXP_LOOKAHEAD
1358     qDeleteAll(ahead);
1359 #endif
1360 }
1361
1362 void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
1363 {
1364     /*
1365       We use one QVector<int> for all the big data used a lot in
1366       matchHere() and friends.
1367     */
1368     int ns = eng->s.size(); // number of states
1369     int ncap = eng->ncap;
1370 #ifndef QT_NO_REGEXP_OPTIM
1371     int newSlideTabSize = qMax(eng->minl + 1, 16);
1372 #else
1373     int newSlideTabSize = 0;
1374 #endif
1375     int numCaptures = eng->captureCount();
1376     int newCapturedSize = 2 + 2 * numCaptures;
1377     bigArray = q_check_ptr((int *)realloc(bigArray, ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));
1378
1379     // set all internal variables only _after_ bigArray is realloc'ed
1380     // to prevent a broken regexp in oom case
1381
1382     slideTabSize = newSlideTabSize;
1383     capturedSize = newCapturedSize;
1384     inNextStack = bigArray;
1385     memset(inNextStack, -1, ns * sizeof(int));
1386     curStack = inNextStack + ns;
1387     nextStack = inNextStack + 2 * ns;
1388
1389     curCapBegin = inNextStack + 3 * ns;
1390     nextCapBegin = curCapBegin + ncap * ns;
1391     curCapEnd = curCapBegin + 2 * ncap * ns;
1392     nextCapEnd = curCapBegin + 3 * ncap * ns;
1393
1394     tempCapBegin = curCapBegin + 4 * ncap * ns;
1395     tempCapEnd = tempCapBegin + ncap;
1396     capBegin = tempCapBegin + 2 * ncap;
1397     capEnd = tempCapBegin + 3 * ncap;
1398
1399     slideTab = tempCapBegin + 4 * ncap;
1400     captured = slideTab + slideTabSize;
1401     memset(captured, -1, capturedSize*sizeof(int));
1402     this->eng = eng;
1403 }
1404
1405 /*
1406   Tries to match in str and returns an array of (begin, length) pairs
1407   for captured text. If there is no match, all pairs are (-1, -1).
1408 */
1409 void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,
1410     bool minimal0, bool oneTest, int caretIndex)
1411 {
1412     bool matched = false;
1413     QChar char_null;
1414
1415 #ifndef QT_NO_REGEXP_OPTIM
1416     if (eng->trivial && !oneTest) {
1417         pos = qFindString(str0, len0, pos0, eng->goodStr.unicode(), eng->goodStr.length(), eng->cs);
1418         matchLen = eng->goodStr.length();
1419         matched = (pos != -1);
1420     } else
1421 #endif
1422     {
1423         in = str0;
1424         if (in == 0)
1425             in = &char_null;
1426         pos = pos0;
1427         caretPos = caretIndex;
1428         len = len0;
1429         minimal = minimal0;
1430         matchLen = 0;
1431         oneTestMatchedLen = 0;
1432
1433         if (eng->valid && pos >= 0 && pos <= len) {
1434 #ifndef QT_NO_REGEXP_OPTIM
1435             if (oneTest) {
1436                 matched = matchHere();
1437             } else {
1438                 if (pos <= len - eng->minl) {
1439                     if (eng->caretAnchored) {
1440                         matched = matchHere();
1441                     } else if (eng->useGoodStringHeuristic) {
1442                         matched = eng->goodStringMatch(*this);
1443                     } else {
1444                         matched = eng->badCharMatch(*this);
1445                     }
1446                 }
1447             }
1448 #else
1449             matched = oneTest ? matchHere() : eng->bruteMatch(*this);
1450 #endif
1451         }
1452     }
1453
1454     if (matched) {
1455         int *c = captured;
1456         *c++ = pos;
1457         *c++ = matchLen;
1458
1459         int numCaptures = (capturedSize - 2) >> 1;
1460 #ifndef QT_NO_REGEXP_CAPTURE
1461         for (int i = 0; i < numCaptures; ++i) {
1462             int j = eng->captureForOfficialCapture.at(i);
1463             if (capBegin[j] != EmptyCapture) {
1464                 int len = capEnd[j] - capBegin[j];
1465                 *c++ = (len > 0) ? pos + capBegin[j] : 0;
1466                 *c++ = len;
1467             } else {
1468                 *c++ = -1;
1469                 *c++ = -1;
1470             }
1471         }
1472 #endif
1473     } else {
1474         // we rely on 2's complement here
1475         memset(captured, -1, capturedSize * sizeof(int));
1476     }
1477 }
1478
1479 /*
1480   The three following functions add one state to the automaton and
1481   return the number of the state.
1482 */
1483
1484 int QRegExpEngine::createState(QChar ch)
1485 {
1486     return setupState(ch.unicode());
1487 }
1488
1489 int QRegExpEngine::createState(const QRegExpCharClass &cc)
1490 {
1491 #ifndef QT_NO_REGEXP_CCLASS
1492     int n = cl.size();
1493     cl += QRegExpCharClass(cc);
1494     return setupState(CharClassBit | n);
1495 #else
1496     Q_UNUSED(cc);
1497     return setupState(CharClassBit);
1498 #endif
1499 }
1500
1501 #ifndef QT_NO_REGEXP_BACKREF
1502 int QRegExpEngine::createState(int bref)
1503 {
1504     if (bref > nbrefs) {
1505         nbrefs = bref;
1506         if (nbrefs > MaxBackRefs) {
1507             error(RXERR_LIMIT);
1508             return 0;
1509         }
1510     }
1511     return setupState(BackRefBit | bref);
1512 }
1513 #endif
1514
1515 /*
1516   The two following functions add a transition between all pairs of
1517   states (i, j) where i is found in from, and j is found in to.
1518
1519   Cat-transitions are distinguished from plus-transitions for
1520   capturing.
1521 */
1522
1523 void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to)
1524 {
1525     for (int i = 0; i < from.size(); i++)
1526         mergeInto(&s[from.at(i)].outs, to);
1527 }
1528
1529 #ifndef QT_NO_REGEXP_CAPTURE
1530 void QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom)
1531 {
1532     for (int i = 0; i < from.size(); i++) {
1533         QRegExpAutomatonState &st = s[from.at(i)];
1534         const QVector<int> oldOuts = st.outs;
1535         mergeInto(&st.outs, to);
1536         if (f.at(atom).capture != QRegExpAtom::NoCapture) {
1537             for (int j = 0; j < to.size(); j++) {
1538                 // ### st.reenter.contains(to.at(j)) check looks suspicious
1539                 if (!st.reenter.contains(to.at(j)) &&
1540                      qBinaryFind(oldOuts.constBegin(), oldOuts.constEnd(), to.at(j)) == oldOuts.end())
1541                     st.reenter.insert(to.at(j), atom);
1542             }
1543         }
1544     }
1545 }
1546 #endif
1547
1548 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1549 /*
1550   Returns an anchor that means a OR b.
1551 */
1552 int QRegExpEngine::anchorAlternation(int a, int b)
1553 {
1554     if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0)
1555         return a & b;
1556
1557     int n = aa.size();
1558 #ifndef QT_NO_REGEXP_OPTIM
1559     if (n > 0 && aa.at(n - 1).a == a && aa.at(n - 1).b == b)
1560         return Anchor_Alternation | (n - 1);
1561 #endif
1562
1563     QRegExpAnchorAlternation element = {a, b};
1564     aa.append(element);
1565     return Anchor_Alternation | n;
1566 }
1567
1568 /*
1569   Returns an anchor that means a AND b.
1570 */
1571 int QRegExpEngine::anchorConcatenation(int a, int b)
1572 {
1573     if (((a | b) & Anchor_Alternation) == 0)
1574         return a | b;
1575     if ((b & Anchor_Alternation) != 0)
1576         qSwap(a, b);
1577
1578     int aprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).a, b);
1579     int bprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).b, b);
1580     return anchorAlternation(aprime, bprime);
1581 }
1582 #endif
1583
1584 /*
1585   Adds anchor a on a transition caracterised by its from state and
1586   its to state.
1587 */
1588 void QRegExpEngine::addAnchors(int from, int to, int a)
1589 {
1590     QRegExpAutomatonState &st = s[from];
1591     if (st.anchors.contains(to))
1592         a = anchorAlternation(st.anchors.value(to), a);
1593     st.anchors.insert(to, a);
1594 }
1595
1596 #ifndef QT_NO_REGEXP_OPTIM
1597 /*
1598   This function chooses between the good-string and the bad-character
1599   heuristics. It computes two scores and chooses the heuristic with
1600   the highest score.
1601
1602   Here are some common-sense constraints on the scores that should be
1603   respected if the formulas are ever modified: (1) If goodStr is
1604   empty, the good-string heuristic scores 0. (2) If the regular
1605   expression is trivial, the good-string heuristic should be used.
1606   (3) If the search is case insensitive, the good-string heuristic
1607   should be used, unless it scores 0. (Case insensitivity turns all
1608   entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1609   big, the good-string heuristic should score less.
1610 */
1611 void QRegExpEngine::heuristicallyChooseHeuristic()
1612 {
1613     if (minl == 0) {
1614         useGoodStringHeuristic = false;
1615     } else if (trivial) {
1616         useGoodStringHeuristic = true;
1617     } else {
1618         /*
1619           Magic formula: The good string has to constitute a good
1620           proportion of the minimum-length string, and appear at a
1621           more-or-less known index.
1622         */
1623         int goodStringScore = (64 * goodStr.length() / minl) -
1624                               (goodLateStart - goodEarlyStart);
1625         /*
1626           Less magic formula: We pick some characters at random, and
1627           check whether they are good or bad.
1628         */
1629         int badCharScore = 0;
1630         int step = qMax(1, NumBadChars / 32);
1631         for (int i = 1; i < NumBadChars; i += step) {
1632             if (occ1.at(i) == NoOccurrence)
1633                 badCharScore += minl;
1634             else
1635                 badCharScore += occ1.at(i);
1636         }
1637         badCharScore /= minl;
1638         useGoodStringHeuristic = (goodStringScore > badCharScore);
1639     }
1640 }
1641 #endif
1642
1643 #if defined(QT_DEBUG)
1644 void QRegExpEngine::dump() const
1645 {
1646     int i, j;
1647     qDebug("Case %ssensitive engine", cs ? "" : "in");
1648     qDebug("  States");
1649     for (i = 0; i < s.size(); i++) {
1650         qDebug("  %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
1651 #ifndef QT_NO_REGEXP_CAPTURE
1652         if (nf > 0)
1653             qDebug("    in atom %d", s[i].atom);
1654 #endif
1655         int m = s[i].match;
1656         if ((m & CharClassBit) != 0) {
1657             qDebug("    match character class %d", m ^ CharClassBit);
1658 #ifndef QT_NO_REGEXP_CCLASS
1659             cl[m ^ CharClassBit].dump();
1660 #else
1661             qDebug("    negative character class");
1662 #endif
1663         } else if ((m & BackRefBit) != 0) {
1664             qDebug("    match back-reference %d", m ^ BackRefBit);
1665         } else if (m >= 0x20 && m <= 0x7e) {
1666             qDebug("    match 0x%.4x (%c)", m, m);
1667         } else {
1668             qDebug("    match 0x%.4x", m);
1669         }
1670         for (j = 0; j < s[i].outs.size(); j++) {
1671             int next = s[i].outs[j];
1672             qDebug("    -> %d", next);
1673             if (s[i].reenter.contains(next))
1674                 qDebug("       [reenter %d]", s[i].reenter[next]);
1675             if (s[i].anchors.value(next) != 0)
1676                 qDebug("       [anchors 0x%.8x]", s[i].anchors[next]);
1677         }
1678     }
1679 #ifndef QT_NO_REGEXP_CAPTURE
1680     if (nf > 0) {
1681         qDebug("  Atom    Parent  Capture");
1682         for (i = 0; i < nf; i++) {
1683             if (f[i].capture == QRegExpAtom::NoCapture) {
1684                 qDebug("  %6d  %6d     nil", i, f[i].parent);
1685             } else {
1686                 int cap = f[i].capture;
1687                 bool official = captureForOfficialCapture.contains(cap);
1688                 qDebug("  %6d  %6d  %6d  %s", i, f[i].parent, f[i].capture,
1689                        official ? "official" : "");
1690             }
1691         }
1692     }
1693 #endif
1694 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1695     for (i = 0; i < aa.size(); i++)
1696         qDebug("  Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b);
1697 #endif
1698 }
1699 #endif
1700
1701 void QRegExpEngine::setup()
1702 {
1703     ref.store(1);
1704 #ifndef QT_NO_REGEXP_CAPTURE
1705     f.resize(32);
1706     nf = 0;
1707     cf = -1;
1708 #endif
1709     officialncap = 0;
1710     ncap = 0;
1711 #ifndef QT_NO_REGEXP_OPTIM
1712     caretAnchored = true;
1713     trivial = true;
1714 #endif
1715     valid = false;
1716 #ifndef QT_NO_REGEXP_BACKREF
1717     nbrefs = 0;
1718 #endif
1719 #ifndef QT_NO_REGEXP_OPTIM
1720     useGoodStringHeuristic = true;
1721     minl = 0;
1722     occ1.fill(0, NumBadChars);
1723 #endif
1724 }
1725
1726 int QRegExpEngine::setupState(int match)
1727 {
1728 #ifndef QT_NO_REGEXP_CAPTURE
1729     s += QRegExpAutomatonState(cf, match);
1730 #else
1731     s += QRegExpAutomatonState(match);
1732 #endif
1733     return s.size() - 1;
1734 }
1735
1736 #ifndef QT_NO_REGEXP_CAPTURE
1737 /*
1738   Functions startAtom() and finishAtom() should be called to delimit
1739   atoms. When a state is created, it is assigned to the current atom.
1740   The information is later used for capturing.
1741 */
1742 int QRegExpEngine::startAtom(bool officialCapture)
1743 {
1744     if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size())
1745         f.resize((nf + 1) << 1);
1746     f[nf].parent = cf;
1747     cf = nf++;
1748     f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;
1749     return cf;
1750 }
1751
1752 void QRegExpEngine::finishAtom(int atom, bool needCapture)
1753 {
1754     if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture)
1755         f[atom].capture = QRegExpAtom::UnofficialCapture;
1756     cf = f.at(atom).parent;
1757 }
1758 #endif
1759
1760 #ifndef QT_NO_REGEXP_LOOKAHEAD
1761 /*
1762   Creates a lookahead anchor.
1763 */
1764 int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative)
1765 {
1766     int n = ahead.size();
1767     if (n == MaxLookaheads) {
1768         error(RXERR_LIMIT);
1769         return 0;
1770     }
1771     ahead += new QRegExpLookahead(eng, negative);
1772     return Anchor_FirstLookahead << n;
1773 }
1774 #endif
1775
1776 #ifndef QT_NO_REGEXP_CAPTURE
1777 /*
1778   We want the longest leftmost captures.
1779 */
1780 static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
1781                             const int *end2)
1782 {
1783     for (int i = 0; i < ncap; i++) {
1784         int delta = begin2[i] - begin1[i]; // it has to start early...
1785         if (delta == 0)
1786             delta = end1[i] - end2[i]; // ...and end late
1787
1788         if (delta != 0)
1789             return delta > 0;
1790     }
1791     return false;
1792 }
1793 #endif
1794
1795 /*
1796   Returns true if anchor a matches at position pos + i in the input
1797   string, otherwise false.
1798 */
1799 bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)
1800 {
1801     int j;
1802
1803 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1804     if ((a & QRegExpEngine::Anchor_Alternation) != 0)
1805         return testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)
1806                || testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);
1807 #endif
1808
1809     if ((a & QRegExpEngine::Anchor_Caret) != 0) {
1810         if (pos + i != caretPos)
1811             return false;
1812     }
1813     if ((a & QRegExpEngine::Anchor_Dollar) != 0) {
1814         if (pos + i != len)
1815             return false;
1816     }
1817 #ifndef QT_NO_REGEXP_ESCAPE
1818     if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) {
1819         bool before = false;
1820         bool after = false;
1821         if (pos + i != 0)
1822             before = isWord(in[pos + i - 1]);
1823         if (pos + i != len)
1824             after = isWord(in[pos + i]);
1825         if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after))
1826             return false;
1827         if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after))
1828             return false;
1829     }
1830 #endif
1831 #ifndef QT_NO_REGEXP_LOOKAHEAD
1832     if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) {
1833         const QVector<QRegExpLookahead *> &ahead = eng->ahead;
1834         for (j = 0; j < ahead.size(); j++) {
1835             if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) {
1836                 QRegExpMatchState matchState;
1837                 matchState.prepareForMatch(ahead[j]->eng);
1838                 matchState.match(in + pos + i, len - pos - i, 0,
1839                     true, true, caretPos - pos - i);
1840                 if ((matchState.captured[0] == 0) == ahead[j]->neg)
1841                     return false;
1842             }
1843         }
1844     }
1845 #endif
1846 #ifndef QT_NO_REGEXP_CAPTURE
1847 #ifndef QT_NO_REGEXP_BACKREF
1848     for (j = 0; j < eng->nbrefs; j++) {
1849         if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) {
1850             int i = eng->captureForOfficialCapture.at(j);
1851             if (capBegin[i] != EmptyCapture)
1852                 return false;
1853         }
1854     }
1855 #endif
1856 #endif
1857     return true;
1858 }
1859
1860 #ifndef QT_NO_REGEXP_OPTIM
1861 /*
1862   The three following functions are what Jeffrey Friedl would call
1863   transmissions (or bump-alongs). Using one or the other should make
1864   no difference except in performance.
1865 */
1866
1867 bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
1868 {
1869     int k = matchState.pos + goodEarlyStart;
1870     QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs);
1871     while ((k = matcher.indexIn(matchState.in, matchState.len, k)) != -1) {
1872         int from = k - goodLateStart;
1873         int to = k - goodEarlyStart;
1874         if (from > matchState.pos)
1875             matchState.pos = from;
1876
1877         while (matchState.pos <= to) {
1878             if (matchState.matchHere())
1879                 return true;
1880             ++matchState.pos;
1881         }
1882         ++k;
1883     }
1884     return false;
1885 }
1886
1887 bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
1888 {
1889     int slideHead = 0;
1890     int slideNext = 0;
1891     int i;
1892     int lastPos = matchState.len - minl;
1893     memset(matchState.slideTab, 0, matchState.slideTabSize * sizeof(int));
1894
1895     /*
1896       Set up the slide table, used for the bad-character heuristic,
1897       using the table of first occurrence of each character.
1898     */
1899     for (i = 0; i < minl; i++) {
1900         int sk = occ1[BadChar(matchState.in[matchState.pos + i])];
1901         if (sk == NoOccurrence)
1902             sk = i + 1;
1903         if (sk > 0) {
1904             int k = i + 1 - sk;
1905             if (k < 0) {
1906                 sk = i + 1;
1907                 k = 0;
1908             }
1909             if (sk > matchState.slideTab[k])
1910                 matchState.slideTab[k] = sk;
1911         }
1912     }
1913
1914     if (matchState.pos > lastPos)
1915         return false;
1916
1917     for (;;) {
1918         if (++slideNext >= matchState.slideTabSize)
1919             slideNext = 0;
1920         if (matchState.slideTab[slideHead] > 0) {
1921             if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext])
1922                 matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1;
1923             matchState.slideTab[slideHead] = 0;
1924         } else {
1925             if (matchState.matchHere())
1926                 return true;
1927         }
1928
1929         if (matchState.pos == lastPos)
1930             break;
1931
1932         /*
1933           Update the slide table. This code has much in common with
1934           the initialization code.
1935         */
1936         int sk = occ1[BadChar(matchState.in[matchState.pos + minl])];
1937         if (sk == NoOccurrence) {
1938             matchState.slideTab[slideNext] = minl;
1939         } else if (sk > 0) {
1940             int k = slideNext + minl - sk;
1941             if (k >= matchState.slideTabSize)
1942                 k -= matchState.slideTabSize;
1943             if (sk > matchState.slideTab[k])
1944                 matchState.slideTab[k] = sk;
1945         }
1946         slideHead = slideNext;
1947         ++matchState.pos;
1948     }
1949     return false;
1950 }
1951 #else
1952 bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
1953 {
1954     while (matchState.pos <= matchState.len) {
1955         if (matchState.matchHere())
1956             return true;
1957         ++matchState.pos;
1958     }
1959     return false;
1960 }
1961 #endif
1962
1963 /*
1964   Here's the core of the engine. It tries to do a match here and now.
1965 */
1966 bool QRegExpMatchState::matchHere()
1967 {
1968     int ncur = 1, nnext = 0;
1969     int i = 0, j, k, m;
1970     bool stop = false;
1971
1972     matchLen = -1;
1973     oneTestMatchedLen = -1;
1974     curStack[0] = QRegExpEngine::InitialState;
1975
1976     int ncap = eng->ncap;
1977 #ifndef QT_NO_REGEXP_CAPTURE
1978     if (ncap > 0) {
1979         for (j = 0; j < ncap; j++) {
1980             curCapBegin[j] = EmptyCapture;
1981             curCapEnd[j] = EmptyCapture;
1982         }
1983     }
1984 #endif
1985
1986 #ifndef QT_NO_REGEXP_BACKREF
1987     while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop)
1988 #else
1989     while (ncur > 0 && i <= len - pos && !stop)
1990 #endif
1991     {
1992         int ch = (i < len - pos) ? in[pos + i].unicode() : 0;
1993         for (j = 0; j < ncur; j++) {
1994             int cur = curStack[j];
1995             const QRegExpAutomatonState &scur = eng->s.at(cur);
1996             const QVector<int> &outs = scur.outs;
1997             for (k = 0; k < outs.size(); k++) {
1998                 int next = outs.at(k);
1999                 const QRegExpAutomatonState &snext = eng->s.at(next);
2000                 bool inside = true;
2001 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2002                 int needSomeSleep = 0;
2003 #endif
2004
2005                 /*
2006                   First, check if the anchors are anchored properly.
2007                 */
2008                 int a = scur.anchors.value(next);
2009                 if (a != 0 && !testAnchor(i, a, curCapBegin + j * ncap))
2010                     inside = false;
2011
2012                 /*
2013                   If indeed they are, check if the input character is
2014                   correct for this transition.
2015                 */
2016                 if (inside) {
2017                     m = snext.match;
2018                     if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) {
2019                         if (eng->cs)
2020                             inside = (m == ch);
2021                         else
2022                             inside = (QChar(m).toLower() == QChar(ch).toLower());
2023                     } else if (next == QRegExpEngine::FinalState) {
2024                         matchLen = i;
2025                         stop = minimal;
2026                         inside = true;
2027                     } else if ((m & QRegExpEngine::CharClassBit) != 0) {
2028 #ifndef QT_NO_REGEXP_CCLASS
2029                         const QRegExpCharClass &cc = eng->cl.at(m ^ QRegExpEngine::CharClassBit);
2030                         if (eng->cs)
2031                             inside = cc.in(ch);
2032                         else if (cc.negative())
2033                             inside = cc.in(QChar(ch).toLower()) &&
2034                                      cc.in(QChar(ch).toUpper());
2035                         else
2036                             inside = cc.in(QChar(ch).toLower()) ||
2037                                      cc.in(QChar(ch).toUpper());
2038 #endif
2039 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2040                     } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */
2041                         int bref = m ^ QRegExpEngine::BackRefBit;
2042                         int ell = j * ncap + eng->captureForOfficialCapture.at(bref - 1);
2043
2044                         inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
2045                         if (inside) {
2046                             if (eng->cs)
2047                                 inside = (in[pos + curCapBegin[ell]] == QChar(ch));
2048                             else
2049                                 inside = (in[pos + curCapBegin[ell]].toLower()
2050                                        == QChar(ch).toLower());
2051                         }
2052
2053                         if (inside) {
2054                             int delta;
2055                             if (curCapEnd[ell] == EmptyCapture)
2056                                 delta = i - curCapBegin[ell];
2057                             else
2058                                 delta = curCapEnd[ell] - curCapBegin[ell];
2059
2060                             inside = (delta <= len - (pos + i));
2061                             if (inside && delta > 1) {
2062                                 int n = 1;
2063                                 if (eng->cs) {
2064                                     while (n < delta) {
2065                                         if (in[pos + curCapBegin[ell] + n]
2066                                             != in[pos + i + n])
2067                                             break;
2068                                         ++n;
2069                                     }
2070                                 } else {
2071                                     while (n < delta) {
2072                                         QChar a = in[pos + curCapBegin[ell] + n];
2073                                         QChar b = in[pos + i + n];
2074                                         if (a.toLower() != b.toLower())
2075                                             break;
2076                                         ++n;
2077                                     }
2078                                 }
2079                                 inside = (n == delta);
2080                                 if (inside)
2081                                     needSomeSleep = delta - 1;
2082                             }
2083                         }
2084 #endif
2085                     }
2086                 }
2087
2088                 /*
2089                   We must now update our data structures.
2090                 */
2091                 if (inside) {
2092 #ifndef QT_NO_REGEXP_CAPTURE
2093                     int *capBegin, *capEnd;
2094 #endif
2095                     /*
2096                       If the next state was not encountered yet, all
2097                       is fine.
2098                     */
2099                     if ((m = inNextStack[next]) == -1) {
2100                         m = nnext++;
2101                         nextStack[m] = next;
2102                         inNextStack[next] = m;
2103 #ifndef QT_NO_REGEXP_CAPTURE
2104                         capBegin = nextCapBegin + m * ncap;
2105                         capEnd = nextCapEnd + m * ncap;
2106
2107                     /*
2108                       Otherwise, we'll first maintain captures in
2109                       temporary arrays, and decide at the end whether
2110                       it's best to keep the previous capture zones or
2111                       the new ones.
2112                     */
2113                     } else {
2114                         capBegin = tempCapBegin;
2115                         capEnd = tempCapEnd;
2116 #endif
2117                     }
2118
2119 #ifndef QT_NO_REGEXP_CAPTURE
2120                     /*
2121                       Updating the capture zones is much of a task.
2122                     */
2123                     if (ncap > 0) {
2124                         memcpy(capBegin, curCapBegin + j * ncap, ncap * sizeof(int));
2125                         memcpy(capEnd, curCapEnd + j * ncap, ncap * sizeof(int));
2126                         int c = scur.atom, n = snext.atom;
2127                         int p = -1, q = -1;
2128                         int cap;
2129
2130                         /*
2131                           Lemma 1. For any x in the range [0..nf), we
2132                           have f[x].parent < x.
2133
2134                           Proof. By looking at startAtom(), it is
2135                           clear that cf < nf holds all the time, and
2136                           thus that f[nf].parent < nf.
2137                         */
2138
2139                         /*
2140                           If we are reentering an atom, we empty all
2141                           capture zones inside it.
2142                         */
2143                         if ((q = scur.reenter.value(next)) != 0) {
2144                             QBitArray b(eng->nf, false);
2145                             b.setBit(q, true);
2146                             for (int ell = q + 1; ell < eng->nf; ell++) {
2147                                 if (b.testBit(eng->f.at(ell).parent)) {
2148                                     b.setBit(ell, true);
2149                                     cap = eng->f.at(ell).capture;
2150                                     if (cap >= 0) {
2151                                         capBegin[cap] = EmptyCapture;
2152                                         capEnd[cap] = EmptyCapture;
2153                                     }
2154                                 }
2155                             }
2156                             p = eng->f.at(q).parent;
2157
2158                         /*
2159                           Otherwise, close the capture zones we are
2160                           leaving. We are leaving f[c].capture,
2161                           f[f[c].parent].capture,
2162                           f[f[f[c].parent].parent].capture, ...,
2163                           until f[x].capture, with x such that
2164                           f[x].parent is the youngest common ancestor
2165                           for c and n.
2166
2167                           We go up along c's and n's ancestry until
2168                           we find x.
2169                         */
2170                         } else {
2171                             p = c;
2172                             q = n;
2173                             while (p != q) {
2174                                 if (p > q) {
2175                                     cap = eng->f.at(p).capture;
2176                                     if (cap >= 0) {
2177                                         if (capBegin[cap] == i) {
2178                                             capBegin[cap] = EmptyCapture;
2179                                             capEnd[cap] = EmptyCapture;
2180                                         } else {
2181                                             capEnd[cap] = i;
2182                                         }
2183                                     }
2184                                     p = eng->f.at(p).parent;
2185                                 } else {
2186                                     q = eng->f.at(q).parent;
2187                                 }
2188                             }
2189                         }
2190
2191                         /*
2192                           In any case, we now open the capture zones
2193                           we are entering. We work upwards from n
2194                           until we reach p (the parent of the atom we
2195                           reenter or the youngest common ancestor).
2196                         */
2197                         while (n > p) {
2198                             cap = eng->f.at(n).capture;
2199                             if (cap >= 0) {
2200                                 capBegin[cap] = i;
2201                                 capEnd[cap] = EmptyCapture;
2202                             }
2203                             n = eng->f.at(n).parent;
2204                         }
2205                         /*
2206                           If the next state was already in
2207                           nextStack, we must choose carefully which
2208                           capture zones we want to keep.
2209                         */
2210                         if (capBegin == tempCapBegin &&
2211                                 isBetterCapture(ncap, capBegin, capEnd, nextCapBegin + m * ncap,
2212                                                 nextCapEnd + m * ncap)) {
2213                             memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2214                             memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2215                         }
2216                     }
2217 #ifndef QT_NO_REGEXP_BACKREF
2218                     /*
2219                       We are done with updating the capture zones.
2220                       It's now time to put the next state to sleep,
2221                       if it needs to, and to remove it from
2222                       nextStack.
2223                     */
2224                     if (needSomeSleep > 0) {
2225                         QVector<int> zzZ(2 + 2 * ncap);
2226                         zzZ[0] = i + needSomeSleep;
2227                         zzZ[1] = next;
2228                         if (ncap > 0) {
2229                             memcpy(zzZ.data() + 2, capBegin, ncap * sizeof(int));
2230                             memcpy(zzZ.data() + 2 + ncap, capEnd, ncap * sizeof(int));
2231                         }
2232                         inNextStack[nextStack[--nnext]] = -1;
2233                         sleeping.append(zzZ);
2234                     }
2235 #endif
2236 #endif
2237                 }
2238             }
2239         }
2240 #ifndef QT_NO_REGEXP_CAPTURE
2241         /*
2242           If we reached the final state, hurray! Copy the captured
2243           zone.
2244         */
2245         if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {
2246             memcpy(capBegin, nextCapBegin + m * ncap, ncap * sizeof(int));
2247             memcpy(capEnd, nextCapEnd + m * ncap, ncap * sizeof(int));
2248         }
2249 #ifndef QT_NO_REGEXP_BACKREF
2250         /*
2251           It's time to wake up the sleepers.
2252         */
2253         j = 0;
2254         while (j < sleeping.count()) {
2255             if (sleeping.at(j)[0] == i) {
2256                 const QVector<int> &zzZ = sleeping.at(j);
2257                 int next = zzZ[1];
2258                 const int *capBegin = zzZ.data() + 2;
2259                 const int *capEnd = zzZ.data() + 2 + ncap;
2260                 bool copyOver = true;
2261
2262                 if ((m = inNextStack[next]) == -1) {
2263                     m = nnext++;
2264                     nextStack[m] = next;
2265                     inNextStack[next] = m;
2266                 } else {
2267                     copyOver = isBetterCapture(ncap, nextCapBegin + m * ncap, nextCapEnd + m * ncap,
2268                                                capBegin, capEnd);
2269                 }
2270                 if (copyOver) {
2271                     memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2272                     memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2273                 }
2274
2275                 sleeping.removeAt(j);
2276             } else {
2277                 ++j;
2278             }
2279         }
2280 #endif
2281 #endif
2282         for (j = 0; j < nnext; j++)
2283             inNextStack[nextStack[j]] = -1;
2284
2285         // avoid needless iteration that confuses oneTestMatchedLen
2286         if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState
2287 #ifndef QT_NO_REGEXP_BACKREF
2288              && sleeping.isEmpty()
2289 #endif
2290            )
2291             stop = true;
2292
2293         qSwap(curStack, nextStack);
2294 #ifndef QT_NO_REGEXP_CAPTURE
2295         qSwap(curCapBegin, nextCapBegin);
2296         qSwap(curCapEnd, nextCapEnd);
2297 #endif
2298         ncur = nnext;
2299         nnext = 0;
2300         ++i;
2301     }
2302
2303 #ifndef QT_NO_REGEXP_BACKREF
2304     /*
2305       If minimal matching is enabled, we might have some sleepers
2306       left.
2307     */
2308     if (!sleeping.isEmpty())
2309         sleeping.clear();
2310 #endif
2311
2312     oneTestMatchedLen = i - 1;
2313     return (matchLen >= 0);
2314 }
2315
2316 #ifndef QT_NO_REGEXP_CCLASS
2317
2318 QRegExpCharClass::QRegExpCharClass()
2319     : c(0), n(false)
2320 {
2321 #ifndef QT_NO_REGEXP_OPTIM
2322     occ1.fill(NoOccurrence, NumBadChars);
2323 #endif
2324 }
2325
2326 QRegExpCharClass &QRegExpCharClass::operator=(const QRegExpCharClass &cc)
2327 {
2328     c = cc.c;
2329     r = cc.r;
2330     n = cc.n;
2331 #ifndef QT_NO_REGEXP_OPTIM
2332     occ1 = cc.occ1;
2333 #endif
2334     return *this;
2335 }
2336
2337 void QRegExpCharClass::clear()
2338 {
2339     c = 0;
2340     r.resize(0);
2341     n = false;
2342 }
2343
2344 void QRegExpCharClass::setNegative(bool negative)
2345 {
2346     n = negative;
2347 #ifndef QT_NO_REGEXP_OPTIM
2348     occ1.fill(0, NumBadChars);
2349 #endif
2350 }
2351
2352 void QRegExpCharClass::addCategories(uint cats)
2353 {
2354     static const int all_cats = FLAG(QChar::Mark_NonSpacing) |
2355                                 FLAG(QChar::Mark_SpacingCombining) |
2356                                 FLAG(QChar::Mark_Enclosing) |
2357                                 FLAG(QChar::Number_DecimalDigit) |
2358                                 FLAG(QChar::Number_Letter) |
2359                                 FLAG(QChar::Number_Other) |
2360                                 FLAG(QChar::Separator_Space) |
2361                                 FLAG(QChar::Separator_Line) |
2362                                 FLAG(QChar::Separator_Paragraph) |
2363                                 FLAG(QChar::Other_Control) |
2364                                 FLAG(QChar::Other_Format) |
2365                                 FLAG(QChar::Other_Surrogate) |
2366                                 FLAG(QChar::Other_PrivateUse) |
2367                                 FLAG(QChar::Other_NotAssigned) |
2368                                 FLAG(QChar::Letter_Uppercase) |
2369                                 FLAG(QChar::Letter_Lowercase) |
2370                                 FLAG(QChar::Letter_Titlecase) |
2371                                 FLAG(QChar::Letter_Modifier) |
2372                                 FLAG(QChar::Letter_Other) |
2373                                 FLAG(QChar::Punctuation_Connector) |
2374                                 FLAG(QChar::Punctuation_Dash) |
2375                                 FLAG(QChar::Punctuation_Open) |
2376                                 FLAG(QChar::Punctuation_Close) |
2377                                 FLAG(QChar::Punctuation_InitialQuote) |
2378                                 FLAG(QChar::Punctuation_FinalQuote) |
2379                                 FLAG(QChar::Punctuation_Other) |
2380                                 FLAG(QChar::Symbol_Math) |
2381                                 FLAG(QChar::Symbol_Currency) |
2382                                 FLAG(QChar::Symbol_Modifier) |
2383                                 FLAG(QChar::Symbol_Other);
2384     c |= (all_cats & cats);
2385 #ifndef QT_NO_REGEXP_OPTIM
2386     occ1.fill(0, NumBadChars);
2387 #endif
2388 }
2389
2390 void QRegExpCharClass::addRange(ushort from, ushort to)
2391 {
2392     if (from > to)
2393         qSwap(from, to);
2394     int m = r.size();
2395     r.resize(m + 1);
2396     r[m].from = from;
2397     r[m].len = to - from + 1;
2398
2399 #ifndef QT_NO_REGEXP_OPTIM
2400     int i;
2401
2402     if (to - from < NumBadChars) {
2403         if (from % NumBadChars <= to % NumBadChars) {
2404             for (i = from % NumBadChars; i <= to % NumBadChars; i++)
2405                 occ1[i] = 0;
2406         } else {
2407             for (i = 0; i <= to % NumBadChars; i++)
2408                 occ1[i] = 0;
2409             for (i = from % NumBadChars; i < NumBadChars; i++)
2410                 occ1[i] = 0;
2411         }
2412     } else {
2413         occ1.fill(0, NumBadChars);
2414     }
2415 #endif
2416 }
2417
2418 bool QRegExpCharClass::in(QChar ch) const
2419 {
2420 #ifndef QT_NO_REGEXP_OPTIM
2421     if (occ1.at(BadChar(ch)) == NoOccurrence)
2422         return n;
2423 #endif
2424
2425     if (c != 0 && (c & FLAG(ch.category())) != 0)
2426         return !n;
2427
2428     const int uc = ch.unicode();
2429     int size = r.size();
2430
2431     for (int i = 0; i < size; ++i) {
2432         const QRegExpCharClassRange &range = r.at(i);
2433         if (uint(uc - range.from) < uint(r.at(i).len))
2434             return !n;
2435     }
2436     return n;
2437 }
2438
2439 #if defined(QT_DEBUG)
2440 void QRegExpCharClass::dump() const
2441 {
2442     int i;
2443     qDebug("    %stive character class", n ? "nega" : "posi");
2444 #ifndef QT_NO_REGEXP_CCLASS
2445     if (c != 0)
2446         qDebug("      categories 0x%.8x", c);
2447 #endif
2448     for (i = 0; i < r.size(); i++)
2449         qDebug("      0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);
2450 }
2451 #endif
2452 #endif
2453
2454 QRegExpEngine::Box::Box(QRegExpEngine *engine)
2455     : eng(engine), skipanchors(0)
2456 #ifndef QT_NO_REGEXP_OPTIM
2457       , earlyStart(0), lateStart(0), maxl(0)
2458 #endif
2459 {
2460 #ifndef QT_NO_REGEXP_OPTIM
2461     occ1.fill(NoOccurrence, NumBadChars);
2462 #endif
2463     minl = 0;
2464 }
2465
2466 QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
2467 {
2468     eng = b.eng;
2469     ls = b.ls;
2470     rs = b.rs;
2471     lanchors = b.lanchors;
2472     ranchors = b.ranchors;
2473     skipanchors = b.skipanchors;
2474 #ifndef QT_NO_REGEXP_OPTIM
2475     earlyStart = b.earlyStart;
2476     lateStart = b.lateStart;
2477     str = b.str;
2478     leftStr = b.leftStr;
2479     rightStr = b.rightStr;
2480     maxl = b.maxl;
2481     occ1 = b.occ1;
2482 #endif
2483     minl = b.minl;
2484     return *this;
2485 }
2486
2487 void QRegExpEngine::Box::set(QChar ch)
2488 {
2489     ls.resize(1);
2490     ls[0] = eng->createState(ch);
2491     rs = ls;
2492 #ifndef QT_NO_REGEXP_OPTIM
2493     str = ch;
2494     leftStr = ch;
2495     rightStr = ch;
2496     maxl = 1;
2497     occ1[BadChar(ch)] = 0;
2498 #endif
2499     minl = 1;
2500 }
2501
2502 void QRegExpEngine::Box::set(const QRegExpCharClass &cc)
2503 {
2504     ls.resize(1);
2505     ls[0] = eng->createState(cc);
2506     rs = ls;
2507 #ifndef QT_NO_REGEXP_OPTIM
2508     maxl = 1;
2509     occ1 = cc.firstOccurrence();
2510 #endif
2511     minl = 1;
2512 }
2513
2514 #ifndef QT_NO_REGEXP_BACKREF
2515 void QRegExpEngine::Box::set(int bref)
2516 {
2517     ls.resize(1);
2518     ls[0] = eng->createState(bref);
2519     rs = ls;
2520     if (bref >= 1 && bref <= MaxBackRefs)
2521         skipanchors = Anchor_BackRef0Empty << bref;
2522 #ifndef QT_NO_REGEXP_OPTIM
2523     maxl = InftyLen;
2524 #endif
2525     minl = 0;
2526 }
2527 #endif
2528
2529 void QRegExpEngine::Box::cat(const Box &b)
2530 {
2531     eng->addCatTransitions(rs, b.ls);
2532     addAnchorsToEngine(b);
2533     if (minl == 0) {
2534         lanchors.unite(b.lanchors);
2535         if (skipanchors != 0) {
2536             for (int i = 0; i < b.ls.size(); i++) {
2537                 int a = eng->anchorConcatenation(lanchors.value(b.ls.at(i), 0), skipanchors);
2538                 lanchors.insert(b.ls.at(i), a);
2539             }
2540         }
2541         mergeInto(&ls, b.ls);
2542     }
2543     if (b.minl == 0) {
2544         ranchors.unite(b.ranchors);
2545         if (b.skipanchors != 0) {
2546             for (int i = 0; i < rs.size(); i++) {
2547                 int a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), b.skipanchors);
2548                 ranchors.insert(rs.at(i), a);
2549             }
2550         }
2551         mergeInto(&rs, b.rs);
2552     } else {
2553         ranchors = b.ranchors;
2554         rs = b.rs;
2555     }
2556
2557 #ifndef QT_NO_REGEXP_OPTIM
2558     if (maxl != InftyLen) {
2559         if (rightStr.length() + b.leftStr.length() >
2560              qMax(str.length(), b.str.length())) {
2561             earlyStart = minl - rightStr.length();
2562             lateStart = maxl - rightStr.length();
2563             str = rightStr + b.leftStr;
2564         } else if (b.str.length() > str.length()) {
2565             earlyStart = minl + b.earlyStart;
2566             lateStart = maxl + b.lateStart;
2567             str = b.str;
2568         }
2569     }
2570
2571     if (leftStr.length() == maxl)
2572         leftStr += b.leftStr;
2573
2574     if (b.rightStr.length() == b.maxl) {
2575         rightStr += b.rightStr;
2576     } else {
2577         rightStr = b.rightStr;
2578     }
2579
2580     if (maxl == InftyLen || b.maxl == InftyLen) {
2581         maxl = InftyLen;
2582     } else {
2583         maxl += b.maxl;
2584     }
2585
2586     for (int i = 0; i < NumBadChars; i++) {
2587         if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))
2588             occ1[i] = minl + b.occ1.at(i);
2589     }
2590 #endif
2591
2592     minl += b.minl;
2593     if (minl == 0)
2594         skipanchors = eng->anchorConcatenation(skipanchors, b.skipanchors);
2595     else
2596         skipanchors = 0;
2597 }
2598
2599 void QRegExpEngine::Box::orx(const Box &b)
2600 {
2601     mergeInto(&ls, b.ls);
2602     lanchors.unite(b.lanchors);
2603     mergeInto(&rs, b.rs);
2604     ranchors.unite(b.ranchors);
2605
2606     if (b.minl == 0) {
2607         if (minl == 0)
2608             skipanchors = eng->anchorAlternation(skipanchors, b.skipanchors);
2609         else
2610             skipanchors = b.skipanchors;
2611     }
2612
2613 #ifndef QT_NO_REGEXP_OPTIM
2614     for (int i = 0; i < NumBadChars; i++) {
2615         if (occ1.at(i) > b.occ1.at(i))
2616             occ1[i] = b.occ1.at(i);
2617     }
2618     earlyStart = 0;
2619     lateStart = 0;
2620     str = QString();
2621     leftStr = QString();
2622     rightStr = QString();
2623     if (b.maxl > maxl)
2624         maxl = b.maxl;
2625 #endif
2626     if (b.minl < minl)
2627         minl = b.minl;
2628 }
2629
2630 void QRegExpEngine::Box::plus(int atom)
2631 {
2632 #ifndef QT_NO_REGEXP_CAPTURE
2633     eng->addPlusTransitions(rs, ls, atom);
2634 #else
2635     Q_UNUSED(atom);
2636     eng->addCatTransitions(rs, ls);
2637 #endif
2638     addAnchorsToEngine(*this);
2639 #ifndef QT_NO_REGEXP_OPTIM
2640     maxl = InftyLen;
2641 #endif
2642 }
2643
2644 void QRegExpEngine::Box::opt()
2645 {
2646 #ifndef QT_NO_REGEXP_OPTIM
2647     earlyStart = 0;
2648     lateStart = 0;
2649     str = QString();
2650     leftStr = QString();
2651     rightStr = QString();
2652 #endif
2653     skipanchors = 0;
2654     minl = 0;
2655 }
2656
2657 void QRegExpEngine::Box::catAnchor(int a)
2658 {
2659     if (a != 0) {
2660         for (int i = 0; i < rs.size(); i++) {
2661             a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), a);
2662             ranchors.insert(rs.at(i), a);
2663         }
2664         if (minl == 0)
2665             skipanchors = eng->anchorConcatenation(skipanchors, a);
2666     }
2667 }
2668
2669 #ifndef QT_NO_REGEXP_OPTIM
2670 void QRegExpEngine::Box::setupHeuristics()
2671 {
2672     eng->goodEarlyStart = earlyStart;
2673     eng->goodLateStart = lateStart;
2674     eng->goodStr = eng->cs ? str : str.toLower();
2675
2676     eng->minl = minl;
2677     if (eng->cs) {
2678         /*
2679           A regular expression such as 112|1 has occ1['2'] = 2 and minl =
2680           1 at this point. An entry of occ1 has to be at most minl or
2681           infinity for the rest of the algorithm to go well.
2682
2683           We waited until here before normalizing these cases (instead of
2684           doing it in Box::orx()) because sometimes things improve by
2685           themselves. Consider for example (112|1)34.
2686         */
2687         for (int i = 0; i < NumBadChars; i++) {
2688             if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)
2689                 occ1[i] = minl;
2690         }
2691         eng->occ1 = occ1;
2692     } else {
2693         eng->occ1.fill(0, NumBadChars);
2694     }
2695
2696     eng->heuristicallyChooseHeuristic();
2697 }
2698 #endif
2699
2700 #if defined(QT_DEBUG)
2701 void QRegExpEngine::Box::dump() const
2702 {
2703     int i;
2704     qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s");
2705     qDebug("  Left states:");
2706     for (i = 0; i < ls.size(); i++) {
2707         if (lanchors.value(ls[i], 0) == 0)
2708             qDebug("    %d", ls[i]);
2709         else
2710             qDebug("    %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);
2711     }
2712     qDebug("  Right states:");
2713     for (i = 0; i < rs.size(); i++) {
2714         if (ranchors.value(rs[i], 0) == 0)
2715             qDebug("    %d", rs[i]);
2716         else
2717             qDebug("    %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);
2718     }
2719     qDebug("  Skip anchors: 0x%.8x", skipanchors);
2720 }
2721 #endif
2722
2723 void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const
2724 {
2725     for (int i = 0; i < to.ls.size(); i++) {
2726         for (int j = 0; j < rs.size(); j++) {
2727             int a = eng->anchorConcatenation(ranchors.value(rs.at(j), 0),
2728                                              to.lanchors.value(to.ls.at(i), 0));
2729             eng->addAnchors(rs[j], to.ls[i], a);
2730         }
2731     }
2732 }
2733
2734 #ifndef QT_NO_REGEXP_CCLASS
2735 // fast lookup hash for xml schema extensions
2736 // sorted by name for b-search
2737 static const struct CategoriesRangeMapEntry {
2738     const char name[40];
2739     uint first, second;
2740 } categoriesRangeMap[] = {
2741     { "AegeanNumbers",                        0x10100, 0x1013F },
2742     { "AlphabeticPresentationForms",          0xFB00, 0xFB4F },
2743     { "AncientGreekMusicalNotation",          0x1D200, 0x1D24F },
2744     { "AncientGreekNumbers",                  0x10140, 0x1018F },
2745     { "Arabic",                               0x0600, 0x06FF },
2746     { "ArabicPresentationForms-A",            0xFB50, 0xFDFF },
2747     { "ArabicPresentationForms-B",            0xFE70, 0xFEFF },
2748     { "ArabicSupplement",                     0x0750, 0x077F },
2749     { "Armenian",                             0x0530, 0x058F },
2750     { "Arrows",                               0x2190, 0x21FF },
2751     { "BasicLatin",                           0x0000, 0x007F },
2752     { "Bengali",                              0x0980, 0x09FF },
2753     { "BlockElements",                        0x2580, 0x259F },
2754     { "Bopomofo",                             0x3100, 0x312F },
2755     { "BopomofoExtended",                     0x31A0, 0x31BF },
2756     { "BoxDrawing",                           0x2500, 0x257F },
2757     { "BraillePatterns",                      0x2800, 0x28FF },
2758     { "Buginese",                             0x1A00, 0x1A1F },
2759     { "Buhid",                                0x1740, 0x175F },
2760     { "ByzantineMusicalSymbols",              0x1D000, 0x1D0FF },
2761     { "CJKCompatibility",                     0x3300, 0x33FF },
2762     { "CJKCompatibilityForms",                0xFE30, 0xFE4F },
2763     { "CJKCompatibilityIdeographs",           0xF900, 0xFAFF },
2764     { "CJKCompatibilityIdeographsSupplement", 0x2F800, 0x2FA1F },
2765     { "CJKRadicalsSupplement",                0x2E80, 0x2EFF },
2766     { "CJKStrokes",                           0x31C0, 0x31EF },
2767     { "CJKSymbolsandPunctuation",             0x3000, 0x303F },
2768     { "CJKUnifiedIdeographs",                 0x4E00, 0x9FFF },
2769     { "CJKUnifiedIdeographsExtensionA",       0x3400, 0x4DB5 },
2770     { "CJKUnifiedIdeographsExtensionB",       0x20000, 0x2A6DF },
2771     { "Cherokee",                             0x13A0, 0x13FF },
2772     { "CombiningDiacriticalMarks",            0x0300, 0x036F },
2773     { "CombiningDiacriticalMarksSupplement",  0x1DC0, 0x1DFF },
2774     { "CombiningHalfMarks",                   0xFE20, 0xFE2F },
2775     { "CombiningMarksforSymbols",             0x20D0, 0x20FF },
2776     { "ControlPictures",                      0x2400, 0x243F },
2777     { "Coptic",                               0x2C80, 0x2CFF },
2778     { "CurrencySymbols",                      0x20A0, 0x20CF },
2779     { "CypriotSyllabary",                     0x10800, 0x1083F },
2780     { "Cyrillic",                             0x0400, 0x04FF },
2781     { "CyrillicSupplement",                   0x0500, 0x052F },
2782     { "Deseret",                              0x10400, 0x1044F },
2783     { "Devanagari",                           0x0900, 0x097F },
2784     { "Dingbats",                             0x2700, 0x27BF },
2785     { "EnclosedAlphanumerics",                0x2460, 0x24FF },
2786     { "EnclosedCJKLettersandMonths",          0x3200, 0x32FF },
2787     { "Ethiopic",                             0x1200, 0x137F },
2788     { "EthiopicExtended",                     0x2D80, 0x2DDF },
2789     { "EthiopicSupplement",                   0x1380, 0x139F },
2790     { "GeneralPunctuation",                   0x2000, 0x206F },
2791     { "GeometricShapes",                      0x25A0, 0x25FF },
2792     { "Georgian",                             0x10A0, 0x10FF },
2793     { "GeorgianSupplement",                   0x2D00, 0x2D2F },
2794     { "Glagolitic",                           0x2C00, 0x2C5F },
2795     { "Gothic",                               0x10330, 0x1034F },
2796     { "Greek",                                0x0370, 0x03FF },
2797     { "GreekExtended",                        0x1F00, 0x1FFF },
2798     { "Gujarati",                             0x0A80, 0x0AFF },
2799     { "Gurmukhi",                             0x0A00, 0x0A7F },
2800     { "HalfwidthandFullwidthForms",           0xFF00, 0xFFEF },
2801     { "HangulCompatibilityJamo",              0x3130, 0x318F },
2802     { "HangulJamo",                           0x1100, 0x11FF },
2803     { "HangulSyllables",                      0xAC00, 0xD7A3 },
2804     { "Hanunoo",                              0x1720, 0x173F },
2805     { "Hebrew",                               0x0590, 0x05FF },
2806     { "Hiragana",                             0x3040, 0x309F },
2807     { "IPAExtensions",                        0x0250, 0x02AF },
2808     { "IdeographicDescriptionCharacters",     0x2FF0, 0x2FFF },
2809     { "Kanbun",                               0x3190, 0x319F },
2810     { "KangxiRadicals",                       0x2F00, 0x2FDF },
2811     { "Kannada",                              0x0C80, 0x0CFF },
2812     { "Katakana",                             0x30A0, 0x30FF },
2813     { "KatakanaPhoneticExtensions",           0x31F0, 0x31FF },
2814     { "Kharoshthi",                           0x10A00, 0x10A5F },
2815     { "Khmer",                                0x1780, 0x17FF },
2816     { "KhmerSymbols",                         0x19E0, 0x19FF },
2817     { "Lao",                                  0x0E80, 0x0EFF },
2818     { "Latin-1Supplement",                    0x0080, 0x00FF },
2819     { "LatinExtended-A",                      0x0100, 0x017F },
2820     { "LatinExtended-B",                      0x0180, 0x024F },
2821     { "LatinExtendedAdditional",              0x1E00, 0x1EFF },
2822     { "LetterlikeSymbols",                    0x2100, 0x214F },
2823     { "Limbu",                                0x1900, 0x194F },
2824     { "LinearBIdeograms",                     0x10080, 0x100FF },
2825     { "LinearBSyllabary",                     0x10000, 0x1007F },
2826     { "Malayalam",                            0x0D00, 0x0D7F },
2827     { "MathematicalAlphanumericSymbols",      0x1D400, 0x1D7FF },
2828     { "MathematicalOperators",                0x2200, 0x22FF },
2829     { "MiscellaneousMathematicalSymbols-A",   0x27C0, 0x27EF },
2830     { "MiscellaneousMathematicalSymbols-B",   0x2980, 0x29FF },
2831     { "MiscellaneousSymbols",                 0x2600, 0x26FF },
2832     { "MiscellaneousSymbolsandArrows",        0x2B00, 0x2BFF },
2833     { "MiscellaneousTechnical",               0x2300, 0x23FF },
2834     { "ModifierToneLetters",                  0xA700, 0xA71F },
2835     { "Mongolian",                            0x1800, 0x18AF },
2836     { "MusicalSymbols",                       0x1D100, 0x1D1FF },
2837     { "Myanmar",                              0x1000, 0x109F },
2838     { "NewTaiLue",                            0x1980, 0x19DF },
2839     { "NumberForms",                          0x2150, 0x218F },
2840     { "Ogham",                                0x1680, 0x169F },
2841     { "OldItalic",                            0x10300, 0x1032F },
2842     { "OldPersian",                           0x103A0, 0x103DF },
2843     { "OpticalCharacterRecognition",          0x2440, 0x245F },
2844     { "Oriya",                                0x0B00, 0x0B7F },
2845     { "Osmanya",                              0x10480, 0x104AF },
2846     { "PhoneticExtensions",                   0x1D00, 0x1D7F },
2847     { "PhoneticExtensionsSupplement",         0x1D80, 0x1DBF },
2848     { "PrivateUse",                           0xE000, 0xF8FF },
2849     { "Runic",                                0x16A0, 0x16FF },
2850     { "Shavian",                              0x10450, 0x1047F },
2851     { "Sinhala",                              0x0D80, 0x0DFF },
2852     { "SmallFormVariants",                    0xFE50, 0xFE6F },
2853     { "SpacingModifierLetters",               0x02B0, 0x02FF },
2854     { "Specials",                             0xFFF0, 0xFFFF },
2855     { "SuperscriptsandSubscripts",            0x2070, 0x209F },
2856     { "SupplementalArrows-A",                 0x27F0, 0x27FF },
2857     { "SupplementalArrows-B",                 0x2900, 0x297F },
2858     { "SupplementalMathematicalOperators",    0x2A00, 0x2AFF },
2859     { "SupplementalPunctuation",              0x2E00, 0x2E7F },
2860     { "SupplementaryPrivateUseArea-A",        0xF0000, 0xFFFFF },
2861     { "SupplementaryPrivateUseArea-B",        0x100000, 0x10FFFF },
2862     { "SylotiNagri",                          0xA800, 0xA82F },
2863     { "Syriac",                               0x0700, 0x074F },
2864     { "Tagalog",                              0x1700, 0x171F },
2865     { "Tagbanwa",                             0x1760, 0x177F },
2866     { "Tags",                                 0xE0000, 0xE007F },
2867     { "TaiLe",                                0x1950, 0x197F },
2868     { "TaiXuanJingSymbols",                   0x1D300, 0x1D35F },
2869     { "Tamil",                                0x0B80, 0x0BFF },
2870     { "Telugu",                               0x0C00, 0x0C7F },
2871     { "Thaana",                               0x0780, 0x07BF },
2872     { "Thai",                                 0x0E00, 0x0E7F },
2873     { "Tibetan",                              0x0F00, 0x0FFF },
2874     { "Tifinagh",                             0x2D30, 0x2D7F },
2875     { "Ugaritic",                             0x10380, 0x1039F },
2876     { "UnifiedCanadianAboriginalSyllabics",   0x1400, 0x167F },
2877     { "VariationSelectors",                   0xFE00, 0xFE0F },
2878     { "VariationSelectorsSupplement",         0xE0100, 0xE01EF },
2879     { "VerticalForms",                        0xFE10, 0xFE1F },
2880     { "YiRadicals",                           0xA490, 0xA4CF },
2881     { "YiSyllables",                          0xA000, 0xA48F },
2882     { "YijingHexagramSymbols",                0x4DC0, 0x4DFF }
2883 };
2884
2885 inline bool operator<(const char *name, const CategoriesRangeMapEntry &entry)
2886 { return qstrcmp(name, entry.name) < 0; }
2887 inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name)
2888 { return qstrcmp(entry.name, name) < 0; }
2889 #endif // QT_NO_REGEXP_CCLASS
2890
2891 int QRegExpEngine::getChar()
2892 {
2893     return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();
2894 }
2895
2896 int QRegExpEngine::getEscape()
2897 {
2898 #ifndef QT_NO_REGEXP_ESCAPE
2899     const char tab[] = "afnrtv"; // no b, as \b means word boundary
2900     const char backTab[] = "\a\f\n\r\t\v";
2901     ushort low;
2902     int i;
2903 #endif
2904     ushort val;
2905     int prevCh = yyCh;
2906
2907     if (prevCh == EOS) {
2908         error(RXERR_END);
2909         return Tok_Char | '\\';
2910     }
2911     yyCh = getChar();
2912 #ifndef QT_NO_REGEXP_ESCAPE
2913     if ((prevCh & ~0xff) == 0) {
2914         const char *p = strchr(tab, prevCh);
2915         if (p != 0)
2916             return Tok_Char | backTab[p - tab];
2917     }
2918 #endif
2919
2920     switch (prevCh) {
2921 #ifndef QT_NO_REGEXP_ESCAPE
2922     case '0':
2923         val = 0;
2924         for (i = 0; i < 3; i++) {
2925             if (yyCh >= '0' && yyCh <= '7')
2926                 val = (val << 3) | (yyCh - '0');
2927             else
2928                 break;
2929             yyCh = getChar();
2930         }
2931         if ((val & ~0377) != 0)
2932             error(RXERR_OCTAL);
2933         return Tok_Char | val;
2934 #endif
2935 #ifndef QT_NO_REGEXP_ESCAPE
2936     case 'B':
2937         return Tok_NonWord;
2938 #endif
2939 #ifndef QT_NO_REGEXP_CCLASS
2940     case 'D':
2941         // see QChar::isDigit()
2942         yyCharClass->addCategories(uint(-1) ^ FLAG(QChar::Number_DecimalDigit));
2943         return Tok_CharClass;
2944     case 'S':
2945         // see QChar::isSpace()
2946         yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Separator_Space) |
2947                                                FLAG(QChar::Separator_Line) |
2948                                                FLAG(QChar::Separator_Paragraph) |
2949                                                FLAG(QChar::Other_Control)));
2950         yyCharClass->addRange(0x0000, 0x0008);
2951         yyCharClass->addRange(0x000e, 0x001f);
2952         yyCharClass->addRange(0x007f, 0x0084);
2953         yyCharClass->addRange(0x0086, 0x009f);
2954         return Tok_CharClass;
2955     case 'W':
2956         // see QChar::isLetterOrNumber() and QChar::isMark()
2957         yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Mark_NonSpacing) |
2958                                                FLAG(QChar::Mark_SpacingCombining) |
2959                                                FLAG(QChar::Mark_Enclosing) |
2960                                                FLAG(QChar::Number_DecimalDigit) |
2961                                                FLAG(QChar::Number_Letter) |
2962                                                FLAG(QChar::Number_Other) |
2963                                                FLAG(QChar::Letter_Uppercase) |
2964                                                FLAG(QChar::Letter_Lowercase) |
2965                                                FLAG(QChar::Letter_Titlecase) |
2966                                                FLAG(QChar::Letter_Modifier) |
2967                                                FLAG(QChar::Letter_Other) |
2968                                                FLAG(QChar::Punctuation_Connector)));
2969         yyCharClass->addRange(0x203f, 0x2040);
2970         yyCharClass->addSingleton(0x2040);
2971         yyCharClass->addSingleton(0x2054);
2972         yyCharClass->addSingleton(0x30fb);
2973         yyCharClass->addRange(0xfe33, 0xfe34);
2974         yyCharClass->addRange(0xfe4d, 0xfe4f);
2975         yyCharClass->addSingleton(0xff3f);
2976         yyCharClass->addSingleton(0xff65);
2977         return Tok_CharClass;
2978 #endif
2979 #ifndef QT_NO_REGEXP_ESCAPE
2980     case 'b':
2981         return Tok_Word;
2982 #endif
2983 #ifndef QT_NO_REGEXP_CCLASS
2984     case 'd':
2985         // see QChar::isDigit()
2986         yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit));
2987         return Tok_CharClass;
2988     case 's':
2989         // see QChar::isSpace()
2990         yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
2991                                    FLAG(QChar::Separator_Line) |
2992                                    FLAG(QChar::Separator_Paragraph));
2993         yyCharClass->addRange(0x0009, 0x000d);
2994         yyCharClass->addSingleton(0x0085);
2995         return Tok_CharClass;
2996     case 'w':
2997         // see QChar::isLetterOrNumber() and QChar::isMark()
2998         yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
2999                                    FLAG(QChar::Mark_SpacingCombining) |
3000                                    FLAG(QChar::Mark_Enclosing) |
3001                                    FLAG(QChar::Number_DecimalDigit) |
3002                                    FLAG(QChar::Number_Letter) |
3003                                    FLAG(QChar::Number_Other) |
3004                                    FLAG(QChar::Letter_Uppercase) |
3005                                    FLAG(QChar::Letter_Lowercase) |
3006                                    FLAG(QChar::Letter_Titlecase) |
3007                                    FLAG(QChar::Letter_Modifier) |
3008                                    FLAG(QChar::Letter_Other));
3009         yyCharClass->addSingleton(0x005f); // '_'
3010         return Tok_CharClass;
3011     case 'I':
3012         if (xmlSchemaExtensions) {
3013             yyCharClass->setNegative(!yyCharClass->negative());
3014             // fall through
3015         } else {
3016             break;
3017         }
3018     case 'i':
3019         if (xmlSchemaExtensions) {
3020             yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3021                                        FLAG(QChar::Mark_SpacingCombining) |
3022                                        FLAG(QChar::Mark_Enclosing) |
3023                                        FLAG(QChar::Number_DecimalDigit) |
3024                                        FLAG(QChar::Number_Letter) |
3025                                        FLAG(QChar::Number_Other) |
3026                                        FLAG(QChar::Letter_Uppercase) |
3027                                        FLAG(QChar::Letter_Lowercase) |
3028                                        FLAG(QChar::Letter_Titlecase) |
3029                                        FLAG(QChar::Letter_Modifier) |
3030                                        FLAG(QChar::Letter_Other));
3031             yyCharClass->addSingleton(0x003a); // ':'
3032             yyCharClass->addSingleton(0x005f); // '_'
3033             yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
3034             yyCharClass->addRange(0x0061, 0x007a); // [a-z]
3035             yyCharClass->addRange(0xc0, 0xd6);
3036             yyCharClass->addRange(0xd8, 0xf6);
3037             yyCharClass->addRange(0xf8, 0x2ff);
3038             yyCharClass->addRange(0x370, 0x37d);
3039             yyCharClass->addRange(0x37f, 0x1fff);
3040             yyCharClass->addRange(0x200c, 0x200d);
3041             yyCharClass->addRange(0x2070, 0x218f);
3042             yyCharClass->addRange(0x2c00, 0x2fef);
3043             yyCharClass->addRange(0x3001, 0xd7ff);
3044             yyCharClass->addRange(0xf900, 0xfdcf);
3045             yyCharClass->addRange(0xfdf0, 0xfffd);
3046             yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
3047             return Tok_CharClass;
3048         } else {
3049             break;
3050         }
3051     case 'C':
3052         if (xmlSchemaExtensions) {
3053             yyCharClass->setNegative(!yyCharClass->negative());
3054             // fall through
3055         } else {
3056             break;
3057         }
3058     case 'c':
3059         if (xmlSchemaExtensions) {
3060             yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3061                                        FLAG(QChar::Mark_SpacingCombining) |
3062                                        FLAG(QChar::Mark_Enclosing) |
3063                                        FLAG(QChar::Number_DecimalDigit) |
3064                                        FLAG(QChar::Number_Letter) |
3065                                        FLAG(QChar::Number_Other) |
3066                                        FLAG(QChar::Letter_Uppercase) |
3067                                        FLAG(QChar::Letter_Lowercase) |
3068                                        FLAG(QChar::Letter_Titlecase) |
3069                                        FLAG(QChar::Letter_Modifier) |
3070                                        FLAG(QChar::Letter_Other));
3071             yyCharClass->addSingleton(0x002d); // '-'
3072             yyCharClass->addSingleton(0x002e); // '.'
3073             yyCharClass->addSingleton(0x003a); // ':'
3074             yyCharClass->addSingleton(0x005f); // '_'
3075             yyCharClass->addSingleton(0xb7);
3076             yyCharClass->addRange(0x0030, 0x0039); // [0-9]
3077             yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
3078             yyCharClass->addRange(0x0061, 0x007a); // [a-z]
3079             yyCharClass->addRange(0xc0, 0xd6);
3080             yyCharClass->addRange(0xd8, 0xf6);
3081             yyCharClass->addRange(0xf8, 0x2ff);
3082             yyCharClass->addRange(0x370, 0x37d);
3083             yyCharClass->addRange(0x37f, 0x1fff);
3084             yyCharClass->addRange(0x200c, 0x200d);
3085             yyCharClass->addRange(0x2070, 0x218f);
3086             yyCharClass->addRange(0x2c00, 0x2fef);
3087             yyCharClass->addRange(0x3001, 0xd7ff);
3088             yyCharClass->addRange(0xf900, 0xfdcf);
3089             yyCharClass->addRange(0xfdf0, 0xfffd);
3090             yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
3091             yyCharClass->addRange(0x0300, 0x036f);
3092             yyCharClass->addRange(0x203f, 0x2040);
3093             return Tok_CharClass;
3094         } else {
3095             break;
3096         }
3097     case 'P':
3098         if (xmlSchemaExtensions) {
3099             yyCharClass->setNegative(!yyCharClass->negative());
3100             // fall through
3101         } else {
3102             break;
3103         }
3104     case 'p':
3105         if (xmlSchemaExtensions) {
3106             if (yyCh != '{') {
3107                 error(RXERR_CHARCLASS);
3108                 return Tok_CharClass;
3109             }
3110
3111             QByteArray category;
3112             yyCh = getChar();
3113             while (yyCh != '}') {
3114                 if (yyCh == EOS) {
3115                     error(RXERR_END);
3116                     return Tok_CharClass;
3117                 }
3118                 category.append(yyCh);
3119                 yyCh = getChar();
3120             }
3121             yyCh = getChar(); // skip closing '}'
3122
3123             int catlen = category.length();
3124             if (catlen == 1 || catlen == 2) {
3125                 switch (category.at(0)) {
3126                 case 'M':
3127                     if (catlen == 1) {
3128                         yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3129                                                    FLAG(QChar::Mark_SpacingCombining) |
3130                                                    FLAG(QChar::Mark_Enclosing));
3131                     } else {
3132                         switch (category.at(1)) {
3133                         case 'n': yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn
3134                         case 'c': yyCharClass->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc
3135                         case 'e': yyCharClass->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me
3136                         default: error(RXERR_CATEGORY); break;
3137                         }
3138                     }
3139                     break;
3140                 case 'N':
3141                     if (catlen == 1) {
3142                         yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit) |
3143                                                    FLAG(QChar::Number_Letter) |
3144                                                    FLAG(QChar::Number_Other));
3145                     } else {
3146                         switch (category.at(1)) {
3147                         case 'd': yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd
3148                         case 'l': yyCharClass->addCategories(FLAG(QChar::Number_Letter)); break; // Hl
3149                         case 'o': yyCharClass->addCategories(FLAG(QChar::Number_Other)); break; // No
3150                         default: error(RXERR_CATEGORY); break;
3151                         }
3152                     }
3153                     break;
3154                 case 'Z':
3155                     if (catlen == 1) {
3156                         yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
3157                                                    FLAG(QChar::Separator_Line) |
3158                                                    FLAG(QChar::Separator_Paragraph));
3159                     } else {
3160                         switch (category.at(1)) {
3161                         case 's': yyCharClass->addCategories(FLAG(QChar::Separator_Space)); break; // Zs
3162                         case 'l': yyCharClass->addCategories(FLAG(QChar::Separator_Line)); break; // Zl
3163                         case 'p': yyCharClass->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp
3164                         default: error(RXERR_CATEGORY); break;
3165                         }
3166                     }
3167                     break;
3168                 case 'C':
3169                     if (catlen == 1) {
3170                         yyCharClass->addCategories(FLAG(QChar::Other_Control) |
3171                                                    FLAG(QChar::Other_Format) |
3172                                                    FLAG(QChar::Other_Surrogate) |
3173                                                    FLAG(QChar::Other_PrivateUse) |
3174                                                    FLAG(QChar::Other_NotAssigned));
3175                     } else {
3176                         switch (category.at(1)) {
3177                         case 'c': yyCharClass->addCategories(FLAG(QChar::Other_Control)); break; // Cc
3178                         case 'f': yyCharClass->addCategories(FLAG(QChar::Other_Format)); break; // Cf
3179                         case 's': yyCharClass->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs
3180                         case 'o': yyCharClass->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co
3181                         case 'n': yyCharClass->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn
3182                         default: error(RXERR_CATEGORY); break;
3183                         }
3184                     }
3185                     break;
3186                 case 'L':
3187                     if (catlen == 1) {
3188                         yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase) |
3189                                                    FLAG(QChar::Letter_Lowercase) |
3190                                                    FLAG(QChar::Letter_Titlecase) |
3191                                                    FLAG(QChar::Letter_Modifier) |
3192                                                    FLAG(QChar::Letter_Other));
3193                     } else {
3194                         switch (category.at(1)) {
3195                         case 'u': yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu
3196                         case 'l': yyCharClass->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll
3197                         case 't': yyCharClass->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt
3198                         case 'm': yyCharClass->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm
3199                         case 'o': yyCharClass->addCategories(FLAG(QChar::Letter_Other)); break; // Lo
3200                         default: error(RXERR_CATEGORY); break;
3201                         }
3202                     }
3203                     break;
3204                 case 'P':
3205                     if (catlen == 1) {
3206                         yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector) |
3207                                                    FLAG(QChar::Punctuation_Dash) |
3208                                                    FLAG(QChar::Punctuation_Open) |
3209                                                    FLAG(QChar::Punctuation_Close) |
3210                                                    FLAG(QChar::Punctuation_InitialQuote) |
3211                                                    FLAG(QChar::Punctuation_FinalQuote) |
3212                                                    FLAG(QChar::Punctuation_Other));
3213                     } else {
3214                         switch (category.at(1)) {
3215                         case 'c': yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc
3216                         case 'd': yyCharClass->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd
3217                         case 's': yyCharClass->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps
3218                         case 'e': yyCharClass->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe
3219                         case 'i': yyCharClass->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi
3220                         case 'f': yyCharClass->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf
3221                         case 'o': yyCharClass->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po
3222                         default: error(RXERR_CATEGORY); break;
3223                         }
3224                     }
3225                     break;
3226                 case 'S':
3227                     if (catlen == 1) {
3228                         yyCharClass->addCategories(FLAG(QChar::Symbol_Math) |
3229                                                    FLAG(QChar::Symbol_Currency) |
3230                                                    FLAG(QChar::Symbol_Modifier) |
3231                                                    FLAG(QChar::Symbol_Other));
3232                     } else {
3233                         switch (category.at(1)) {
3234                         case 'm': yyCharClass->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm
3235                         case 'c': yyCharClass->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc
3236                         case 'k': yyCharClass->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk
3237                         case 'o': yyCharClass->addCategories(FLAG(QChar::Symbol_Other)); break; // So
3238                         default: error(RXERR_CATEGORY); break;
3239                         }
3240                     }
3241                     break;
3242                 default:
3243                     error(RXERR_CATEGORY);
3244                     break;
3245                 }
3246             } else if (catlen > 2 && category.at(0) == 'I' && category.at(1) == 's') {
3247                 static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[0]);
3248                 const CategoriesRangeMapEntry *r = qBinaryFind(categoriesRangeMap, categoriesRangeMap + N, category.constData() + 2);
3249                 if (r != categoriesRangeMap + N)
3250                     yyCharClass->addRange(r->first, r->second);
3251                 else
3252                     error(RXERR_CATEGORY);
3253             } else {
3254                 error(RXERR_CATEGORY);
3255             }
3256             return Tok_CharClass;
3257         } else {
3258             break;
3259         }
3260 #endif
3261 #ifndef QT_NO_REGEXP_ESCAPE
3262     case 'x':
3263         val = 0;
3264         for (i = 0; i < 4; i++) {
3265             low = QChar(yyCh).toLower().unicode();
3266             if (low >= '0' && low <= '9')
3267                 val = (val << 4) | (low - '0');
3268             else if (low >= 'a' && low <= 'f')
3269                 val = (val << 4) | (low - 'a' + 10);
3270             else
3271                 break;
3272             yyCh = getChar();
3273         }
3274         return Tok_Char | val;
3275 #endif
3276     default:
3277         break;
3278     }
3279     if (prevCh >= '1' && prevCh <= '9') {
3280 #ifndef QT_NO_REGEXP_BACKREF
3281         val = prevCh - '0';
3282         while (yyCh >= '0' && yyCh <= '9') {
3283             val = (val * 10) + (yyCh - '0');
3284             yyCh = getChar();
3285         }
3286         return Tok_BackRef | val;
3287 #else
3288         error(RXERR_DISABLED);
3289 #endif
3290     }
3291     return Tok_Char | prevCh;
3292 }
3293
3294 #ifndef QT_NO_REGEXP_INTERVAL
3295 int QRegExpEngine::getRep(int def)
3296 {
3297     if (yyCh >= '0' && yyCh <= '9') {
3298         int rep = 0;
3299         do {
3300             rep = 10 * rep + yyCh - '0';
3301             if (rep >= InftyRep) {
3302                 error(RXERR_REPETITION);
3303                 rep = def;
3304             }
3305             yyCh = getChar();
3306         } while (yyCh >= '0' && yyCh <= '9');
3307         return rep;
3308     } else {
3309         return def;
3310     }
3311 }
3312 #endif
3313
3314 #ifndef QT_NO_REGEXP_LOOKAHEAD
3315 void QRegExpEngine::skipChars(int n)
3316 {
3317     if (n > 0) {
3318         yyPos += n - 1;
3319         yyCh = getChar();
3320     }
3321 }
3322 #endif
3323
3324 void QRegExpEngine::error(const char *msg)
3325 {
3326     if (yyError.isEmpty())
3327         yyError = QLatin1String(msg);
3328 }
3329
3330 void QRegExpEngine::startTokenizer(const QChar *rx, int len)
3331 {
3332     yyIn = rx;
3333     yyPos0 = 0;
3334     yyPos = 0;
3335     yyLen = len;
3336     yyCh = getChar();
3337     yyCharClass.reset(new QRegExpCharClass);
3338     yyMinRep = 0;
3339     yyMaxRep = 0;
3340     yyError = QString();
3341 }
3342
3343 int QRegExpEngine::getToken()
3344 {
3345 #ifndef QT_NO_REGEXP_CCLASS
3346     ushort pendingCh = 0;
3347     bool charPending;
3348     bool rangePending;
3349     int tok;
3350 #endif
3351     int prevCh = yyCh;
3352
3353     yyPos0 = yyPos - 1;
3354 #ifndef QT_NO_REGEXP_CCLASS
3355     yyCharClass->clear();
3356 #endif
3357     yyMinRep = 0;
3358     yyMaxRep = 0;
3359     yyCh = getChar();
3360
3361     switch (prevCh) {
3362     case EOS:
3363         yyPos0 = yyPos;
3364         return Tok_Eos;
3365     case '$':
3366         return Tok_Dollar;
3367     case '(':
3368         if (yyCh == '?') {
3369             prevCh = getChar();
3370             yyCh = getChar();
3371             switch (prevCh) {
3372 #ifndef QT_NO_REGEXP_LOOKAHEAD
3373             case '!':
3374                 return Tok_NegLookahead;
3375             case '=':
3376                 return Tok_PosLookahead;
3377 #endif
3378             case ':':
3379                 return Tok_MagicLeftParen;
3380             case '<':
3381                 error(RXERR_LOOKBEHIND);
3382                 return Tok_MagicLeftParen;
3383             default:
3384                 error(RXERR_LOOKAHEAD);
3385                 return Tok_MagicLeftParen;
3386             }
3387         } else {
3388             return Tok_LeftParen;
3389         }
3390     case ')':
3391         return Tok_RightParen;
3392     case '*':
3393         yyMinRep = 0;
3394         yyMaxRep = InftyRep;
3395         return Tok_Quantifier;
3396     case '+':
3397         yyMinRep = 1;
3398         yyMaxRep = InftyRep;
3399         return Tok_Quantifier;
3400     case '.':
3401 #ifndef QT_NO_REGEXP_CCLASS
3402         yyCharClass->setNegative(true);
3403 #endif
3404         return Tok_CharClass;
3405     case '?':
3406         yyMinRep = 0;
3407         yyMaxRep = 1;
3408         return Tok_Quantifier;
3409     case '[':
3410 #ifndef QT_NO_REGEXP_CCLASS
3411         if (yyCh == '^') {
3412             yyCharClass->setNegative(true);
3413             yyCh = getChar();
3414         }
3415         charPending = false;
3416         rangePending = false;
3417         do {
3418             if (yyCh == '-' && charPending && !rangePending) {
3419                 rangePending = true;
3420                 yyCh = getChar();
3421             } else {
3422                 if (charPending && !rangePending) {
3423                     yyCharClass->addSingleton(pendingCh);
3424                     charPending = false;
3425                 }
3426                 if (yyCh == '\\') {
3427                     yyCh = getChar();
3428                     tok = getEscape();
3429                     if (tok == Tok_Word)
3430                         tok = '\b';
3431                 } else {
3432                     tok = Tok_Char | yyCh;
3433                     yyCh = getChar();
3434                 }
3435                 if (tok == Tok_CharClass) {
3436                     if (rangePending) {
3437                         yyCharClass->addSingleton('-');
3438                         yyCharClass->addSingleton(pendingCh);
3439                         charPending = false;
3440                         rangePending = false;
3441                     }
3442                 } else if ((tok & Tok_Char) != 0) {
3443                     if (rangePending) {
3444                         yyCharClass->addRange(pendingCh, tok ^ Tok_Char);
3445                         charPending = false;
3446                         rangePending = false;
3447                     } else {
3448                         pendingCh = tok ^ Tok_Char;
3449                         charPending = true;
3450                     }
3451                 } else {
3452                     error(RXERR_CHARCLASS);
3453                 }
3454             }
3455         }  while (yyCh != ']' && yyCh != EOS);
3456         if (rangePending)
3457             yyCharClass->addSingleton('-');
3458         if (charPending)
3459             yyCharClass->addSingleton(pendingCh);
3460         if (yyCh == EOS)
3461             error(RXERR_END);
3462         else
3463             yyCh = getChar();
3464         return Tok_CharClass;
3465 #else
3466         error(RXERR_END);
3467         return Tok_Char | '[';
3468 #endif
3469     case '\\':
3470         return getEscape();
3471     case ']':
3472         error(RXERR_LEFTDELIM);
3473         return Tok_Char | ']';
3474     case '^':
3475         return Tok_Caret;
3476     case '{':
3477 #ifndef QT_NO_REGEXP_INTERVAL
3478         yyMinRep = getRep(0);
3479         yyMaxRep = yyMinRep;
3480         if (yyCh == ',') {
3481             yyCh = getChar();
3482             yyMaxRep = getRep(InftyRep);
3483         }
3484         if (yyMaxRep < yyMinRep)
3485             error(RXERR_INTERVAL);
3486         if (yyCh != '}')
3487             error(RXERR_REPETITION);
3488         yyCh = getChar();
3489         return Tok_Quantifier;
3490 #else
3491         error(RXERR_DISABLED);
3492         return Tok_Char | '{';
3493 #endif
3494     case '|':
3495         return Tok_Bar;
3496     case '}':
3497         error(RXERR_LEFTDELIM);
3498         return Tok_Char | '}';
3499     default:
3500         return Tok_Char | prevCh;
3501     }
3502 }
3503
3504 int QRegExpEngine::parse(const QChar *pattern, int len)
3505 {
3506     valid = true;
3507     startTokenizer(pattern, len);
3508     yyTok = getToken();
3509 #ifndef QT_NO_REGEXP_CAPTURE
3510     yyMayCapture = true;
3511 #else
3512     yyMayCapture = false;
3513 #endif
3514
3515 #ifndef QT_NO_REGEXP_CAPTURE
3516     int atom = startAtom(false);
3517 #endif
3518     QRegExpCharClass anything;
3519     Box box(this); // create InitialState
3520     box.set(anything);
3521     Box rightBox(this); // create FinalState
3522     rightBox.set(anything);
3523
3524     Box middleBox(this);
3525     parseExpression(&middleBox);
3526 #ifndef QT_NO_REGEXP_CAPTURE
3527     finishAtom(atom, false);
3528 #endif
3529 #ifndef QT_NO_REGEXP_OPTIM
3530     middleBox.setupHeuristics();
3531 #endif
3532     box.cat(middleBox);
3533     box.cat(rightBox);
3534     yyCharClass.reset(0);
3535
3536 #ifndef QT_NO_REGEXP_CAPTURE
3537     for (int i = 0; i < nf; ++i) {
3538         switch (f[i].capture) {
3539         case QRegExpAtom::NoCapture:
3540             break;
3541         case QRegExpAtom::OfficialCapture:
3542             f[i].capture = ncap;
3543             captureForOfficialCapture.append(ncap);
3544             ++ncap;
3545             ++officialncap;
3546             break;
3547         case QRegExpAtom::UnofficialCapture:
3548             f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;
3549         }
3550     }
3551
3552 #ifndef QT_NO_REGEXP_BACKREF
3553 #ifndef QT_NO_REGEXP_OPTIM
3554     if (officialncap == 0 && nbrefs == 0) {
3555         ncap = nf = 0;
3556         f.clear();
3557     }
3558 #endif
3559     // handle the case where there's a \5 with no corresponding capture
3560     // (captureForOfficialCapture.size() != officialncap)
3561     for (int i = 0; i < nbrefs - officialncap; ++i) {
3562         captureForOfficialCapture.append(ncap);
3563         ++ncap;
3564     }
3565 #endif
3566 #endif
3567
3568     if (!yyError.isEmpty())
3569         return -1;
3570
3571 #ifndef QT_NO_REGEXP_OPTIM
3572     const QRegExpAutomatonState &sinit = s.at(InitialState);
3573     caretAnchored = !sinit.anchors.isEmpty();
3574     if (caretAnchored) {
3575         const QMap<int, int> &anchors = sinit.anchors;
3576         QMap<int, int>::const_iterator a;
3577         for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {
3578             if (
3579 #ifndef QT_NO_REGEXP_ANCHOR_ALT
3580                 (*a & Anchor_Alternation) != 0 ||
3581 #endif
3582                 (*a & Anchor_Caret) == 0)
3583             {
3584                 caretAnchored = false;
3585                 break;
3586             }
3587         }
3588     }
3589 #endif
3590
3591     // cleanup anchors
3592     int numStates = s.count();
3593     for (int i = 0; i < numStates; ++i) {
3594         QRegExpAutomatonState &state = s[i];
3595         if (!state.anchors.isEmpty()) {
3596             QMap<int, int>::iterator a = state.anchors.begin();
3597             while (a != state.anchors.end()) {
3598                 if (a.value() == 0)
3599                     a = state.anchors.erase(a);
3600                 else
3601                     ++a;
3602             }
3603         }
3604     }
3605
3606     return yyPos0;
3607 }
3608
3609 void QRegExpEngine::parseAtom(Box *box)
3610 {
3611 #ifndef QT_NO_REGEXP_LOOKAHEAD
3612     QRegExpEngine *eng = 0;
3613     bool neg;
3614     int len;
3615 #endif
3616
3617     if ((yyTok & Tok_Char) != 0) {
3618         box->set(QChar(yyTok ^ Tok_Char));
3619     } else {
3620 #ifndef QT_NO_REGEXP_OPTIM
3621         trivial = false;
3622 #endif
3623         switch (yyTok) {
3624         case Tok_Dollar:
3625             box->catAnchor(Anchor_Dollar);
3626             break;
3627         case Tok_Caret:
3628             box->catAnchor(Anchor_Caret);
3629             break;
3630 #ifndef QT_NO_REGEXP_LOOKAHEAD
3631         case Tok_PosLookahead:
3632         case Tok_NegLookahead:
3633             neg = (yyTok == Tok_NegLookahead);
3634             eng = new QRegExpEngine(cs, greedyQuantifiers);
3635             len = eng->parse(yyIn + yyPos - 1, yyLen - yyPos + 1);
3636             if (len >= 0)
3637                 skipChars(len);
3638             else
3639                 error(RXERR_LOOKAHEAD);
3640             box->catAnchor(addLookahead(eng, neg));
3641             yyTok = getToken();
3642             if (yyTok != Tok_RightParen)
3643                 error(RXERR_LOOKAHEAD);
3644             break;
3645 #endif
3646 #ifndef QT_NO_REGEXP_ESCAPE
3647         case Tok_Word:
3648             box->catAnchor(Anchor_Word);
3649             break;
3650         case Tok_NonWord:
3651             box->catAnchor(Anchor_NonWord);
3652             break;
3653 #endif
3654         case Tok_LeftParen:
3655         case Tok_MagicLeftParen:
3656             yyTok = getToken();
3657             parseExpression(box);
3658             if (yyTok != Tok_RightParen)
3659                 error(RXERR_END);
3660             break;
3661         case Tok_CharClass:
3662             box->set(*yyCharClass);
3663             break;
3664         case Tok_Quantifier:
3665             error(RXERR_REPETITION);
3666             break;
3667         default:
3668 #ifndef QT_NO_REGEXP_BACKREF
3669             if ((yyTok & Tok_BackRef) != 0)
3670                 box->set(yyTok ^ Tok_BackRef);
3671             else
3672 #endif
3673                 error(RXERR_DISABLED);
3674         }
3675     }
3676     yyTok = getToken();
3677 }
3678
3679 void QRegExpEngine::parseFactor(Box *box)
3680 {
3681 #ifndef QT_NO_REGEXP_CAPTURE
3682     int outerAtom = greedyQuantifiers ? startAtom(false) : -1;
3683     int innerAtom = startAtom(yyMayCapture && yyTok == Tok_LeftParen);
3684     bool magicLeftParen = (yyTok == Tok_MagicLeftParen);
3685 #else
3686     const int innerAtom = -1;
3687 #endif
3688
3689 #ifndef QT_NO_REGEXP_INTERVAL
3690 #define YYREDO() \
3691         yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3692         *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3693
3694     const QChar *in = yyIn;
3695     int pos0 = yyPos0;
3696     int pos = yyPos;
3697     int len = yyLen;
3698     int ch = yyCh;
3699     QRegExpCharClass charClass;
3700     if (yyTok == Tok_CharClass)
3701         charClass = *yyCharClass;
3702     int tok = yyTok;
3703     bool mayCapture = yyMayCapture;
3704 #endif
3705
3706     parseAtom(box);
3707 #ifndef QT_NO_REGEXP_CAPTURE
3708     finishAtom(innerAtom, magicLeftParen);
3709 #endif
3710
3711     bool hasQuantifier = (yyTok == Tok_Quantifier);
3712     if (hasQuantifier) {
3713 #ifndef QT_NO_REGEXP_OPTIM
3714         trivial = false;
3715 #endif
3716         if (yyMaxRep == InftyRep) {
3717             box->plus(innerAtom);
3718 #ifndef QT_NO_REGEXP_INTERVAL
3719         } else if (yyMaxRep == 0) {
3720             box->clear();
3721 #endif
3722         }
3723         if (yyMinRep == 0)
3724             box->opt();
3725
3726 #ifndef QT_NO_REGEXP_INTERVAL
3727         yyMayCapture = false;
3728         int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1;
3729         int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1);
3730
3731         Box rightBox(this);
3732         int i;
3733
3734         for (i = 0; i < beta; i++) {
3735             YYREDO();
3736             Box leftBox(this);
3737             parseAtom(&leftBox);
3738             leftBox.cat(rightBox);
3739             leftBox.opt();
3740             rightBox = leftBox;
3741         }
3742         for (i = 0; i < alpha; i++) {
3743             YYREDO();
3744             Box leftBox(this);
3745             parseAtom(&leftBox);
3746             leftBox.cat(rightBox);
3747             rightBox = leftBox;
3748         }
3749         rightBox.cat(*box);
3750         *box = rightBox;
3751 #endif
3752         yyTok = getToken();
3753 #ifndef QT_NO_REGEXP_INTERVAL
3754         yyMayCapture = mayCapture;
3755 #endif
3756     }
3757 #undef YYREDO
3758 #ifndef QT_NO_REGEXP_CAPTURE
3759     if (greedyQuantifiers)
3760         finishAtom(outerAtom, hasQuantifier);
3761 #endif
3762 }
3763
3764 void QRegExpEngine::parseTerm(Box *box)
3765 {
3766 #ifndef QT_NO_REGEXP_OPTIM
3767     if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)
3768         parseFactor(box);
3769 #endif
3770     while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {
3771         Box rightBox(this);
3772         parseFactor(&rightBox);
3773         box->cat(rightBox);
3774     }
3775 }
3776
3777 void QRegExpEngine::parseExpression(Box *box)
3778 {
3779     parseTerm(box);
3780     while (yyTok == Tok_Bar) {
3781 #ifndef QT_NO_REGEXP_OPTIM
3782         trivial = false;
3783 #endif
3784         Box rightBox(this);
3785         yyTok = getToken();
3786         parseTerm(&rightBox);
3787         box->orx(rightBox);
3788     }
3789 }
3790
3791 /*
3792   The struct QRegExpPrivate contains the private data of a regular
3793   expression other than the automaton. It makes it possible for many
3794   QRegExp objects to use the same QRegExpEngine object with different
3795   QRegExpPrivate objects.
3796 */
3797 struct QRegExpPrivate
3798 {
3799     QRegExpEngine *eng;
3800     QRegExpEngineKey engineKey;
3801     bool minimal;
3802 #ifndef QT_NO_REGEXP_CAPTURE
3803     QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()
3804     QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3805 #endif
3806     QRegExpMatchState matchState;
3807
3808     inline QRegExpPrivate()
3809         : eng(0), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }
3810     inline QRegExpPrivate(const QRegExpEngineKey &key)
3811         : eng(0), engineKey(key), minimal(false) {}
3812 };
3813
3814 #if !defined(QT_NO_REGEXP_OPTIM)
3815 uint qHash(const QRegExpEngineKey &key, uint seed = 0) Q_DECL_NOTHROW
3816 {
3817     return qHash(key.pattern, seed);
3818 }
3819
3820 typedef QCache<QRegExpEngineKey, QRegExpEngine> EngineCache;
3821 Q_GLOBAL_STATIC(EngineCache, globalEngineCache)
3822 static QBasicMutex globalEngineCacheMutex;
3823 #endif // QT_NO_REGEXP_OPTIM
3824
3825 static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key)
3826 {
3827     if (!eng->ref.deref()) {
3828 #if !defined(QT_NO_REGEXP_OPTIM)
3829         if (globalEngineCache()) {
3830             QMutexLocker locker(&globalEngineCacheMutex);
3831             QT_TRY {
3832                 globalEngineCache()->insert(key, eng, 4 + key.pattern.length() / 4);
3833             } QT_CATCH(const std::bad_alloc &) {
3834                 // in case of an exception (e.g. oom), just delete the engine
3835                 delete eng;
3836             }
3837         } else {
3838             delete eng;
3839         }
3840 #else
3841         Q_UNUSED(key);
3842         delete eng;
3843 #endif
3844     }
3845 }
3846
3847 static void prepareEngine_helper(QRegExpPrivate *priv)
3848 {
3849     bool initMatchState = !priv->eng;
3850 #if !defined(QT_NO_REGEXP_OPTIM)
3851     if (!priv->eng && globalEngineCache()) {
3852         QMutexLocker locker(&globalEngineCacheMutex);
3853         priv->eng = globalEngineCache()->take(priv->engineKey);
3854         if (priv->eng != 0)
3855             priv->eng->ref.ref();
3856     }
3857 #endif // QT_NO_REGEXP_OPTIM
3858
3859     if (!priv->eng)
3860         priv->eng = new QRegExpEngine(priv->engineKey);
3861
3862     if (initMatchState)
3863         priv->matchState.prepareForMatch(priv->eng);
3864 }
3865
3866 inline static void prepareEngine(QRegExpPrivate *priv)
3867 {
3868     if (priv->eng)
3869         return;
3870     prepareEngine_helper(priv);
3871 }
3872
3873 static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str)
3874 {
3875     prepareEngine(priv);
3876     priv->matchState.prepareForMatch(priv->eng);
3877 #ifndef QT_NO_REGEXP_CAPTURE
3878     priv->t = str;
3879     priv->capturedCache.clear();
3880 #else
3881     Q_UNUSED(str);
3882 #endif
3883 }
3884
3885 static void invalidateEngine(QRegExpPrivate *priv)
3886 {
3887     if (priv->eng != 0) {
3888         derefEngine(priv->eng, priv->engineKey);
3889         priv->eng = 0;
3890         priv->matchState.drain();
3891     }
3892 }
3893
3894 /*!
3895     \enum QRegExp::CaretMode
3896
3897     The CaretMode enum defines the different meanings of the caret
3898     (\b{^}) in a regular expression. The possible values are:
3899
3900     \value CaretAtZero
3901            The caret corresponds to index 0 in the searched string.
3902
3903     \value CaretAtOffset
3904            The caret corresponds to the start offset of the search.
3905
3906     \value CaretWontMatch
3907            The caret never matches.
3908 */
3909
3910 /*!
3911     \enum QRegExp::PatternSyntax
3912
3913     The syntax used to interpret the meaning of the pattern.
3914
3915     \value RegExp A rich Perl-like pattern matching syntax. This is
3916     the default.
3917
3918     \value RegExp2 Like RegExp, but with \l{greedy quantifiers}.
3919     (Introduced in Qt 4.2.)
3920
3921     \value Wildcard This provides a simple pattern matching syntax
3922     similar to that used by shells (command interpreters) for "file
3923     globbing". See \l{Wildcard Matching}.
3924
3925     \value WildcardUnix This is similar to Wildcard but with the
3926     behavior of a Unix shell. The wildcard characters can be escaped
3927     with the character "\\".
3928
3929     \value FixedString The pattern is a fixed string. This is
3930     equivalent to using the RegExp pattern on a string in
3931     which all metacharacters are escaped using escape().
3932
3933     \value W3CXmlSchema11 The pattern is a regular expression as
3934     defined by the W3C XML Schema 1.1 specification.
3935
3936     \sa setPatternSyntax()
3937 */
3938
3939 /*!
3940     Constructs an empty regexp.
3941
3942     \sa isValid(), errorString()
3943 */
3944 QRegExp::QRegExp()
3945 {
3946     priv = new QRegExpPrivate;
3947     prepareEngine(priv);
3948 }
3949
3950 /*!
3951     Constructs a regular expression object for the given \a pattern
3952     string. The pattern must be given using wildcard notation if \a
3953     syntax is \l Wildcard; the default is \l RegExp. The pattern is
3954     case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
3955     greedy (maximal), but can be changed by calling
3956     setMinimal().
3957
3958     \sa setPattern(), setCaseSensitivity(), setPatternSyntax()
3959 */
3960 QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
3961 {
3962     priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs));
3963     prepareEngine(priv);
3964 }
3965
3966 /*!
3967     Constructs a regular expression as a copy of \a rx.
3968
3969     \sa operator=()
3970 */
3971 QRegExp::QRegExp(const QRegExp &rx)
3972 {
3973     priv = new QRegExpPrivate;
3974     operator=(rx);
3975 }
3976
3977 /*!
3978     Destroys the regular expression and cleans up its internal data.
3979 */
3980 QRegExp::~QRegExp()
3981 {
3982     invalidateEngine(priv);
3983     delete priv;
3984 }
3985
3986 /*!
3987     Copies the regular expression \a rx and returns a reference to the
3988     copy. The case sensitivity, wildcard, and minimal matching options
3989     are also copied.
3990 */
3991 QRegExp &QRegExp::operator=(const QRegExp &rx)
3992 {
3993     prepareEngine(rx.priv); // to allow sharing
3994     QRegExpEngine *otherEng = rx.priv->eng;
3995     if (otherEng)
3996         otherEng->ref.ref();
3997     invalidateEngine(priv);
3998     priv->eng = otherEng;
3999     priv->engineKey = rx.priv->engineKey;
4000     priv->minimal = rx.priv->minimal;
4001 #ifndef QT_NO_REGEXP_CAPTURE
4002     priv->t = rx.priv->t;
4003     priv->capturedCache = rx.priv->capturedCache;
4004 #endif
4005     if (priv->eng)
4006         priv->matchState.prepareForMatch(priv->eng);
4007     priv->matchState.captured = rx.priv->matchState.captured;
4008     return *this;
4009 }
4010
4011 /*!
4012     \fn void QRegExp::swap(QRegExp &other)
4013     \since 4.8
4014
4015     Swaps regular expression \a other with this regular
4016     expression. This operation is very fast and never fails.
4017 */
4018
4019 /*!
4020     Returns true if this regular expression is equal to \a rx;
4021     otherwise returns false.
4022
4023     Two QRegExp objects are equal if they have the same pattern
4024     strings and the same settings for case sensitivity, wildcard and
4025     minimal matching.
4026 */
4027 bool QRegExp::operator==(const QRegExp &rx) const
4028 {
4029     return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;
4030 }
4031
4032 /*!
4033     \fn bool QRegExp::operator!=(const QRegExp &rx) const
4034
4035     Returns true if this regular expression is not equal to \a rx;
4036     otherwise returns false.
4037
4038     \sa operator==()
4039 */
4040
4041 /*!
4042     Returns true if the pattern string is empty; otherwise returns
4043     false.
4044
4045     If you call exactMatch() with an empty pattern on an empty string
4046     it will return true; otherwise it returns false since it operates
4047     over the whole string. If you call indexIn() with an empty pattern
4048     on \e any string it will return the start offset (0 by default)
4049     because the empty pattern matches the 'emptiness' at the start of
4050     the string. In this case the length of the match returned by
4051     matchedLength() will be 0.
4052
4053     See QString::isEmpty().
4054 */
4055
4056 bool QRegExp::isEmpty() const
4057 {
4058     return priv->engineKey.pattern.isEmpty();
4059 }
4060
4061 /*!
4062     Returns true if the regular expression is valid; otherwise returns
4063     false. An invalid regular expression never matches.
4064
4065     The pattern \b{[a-z} is an example of an invalid pattern, since
4066     it lacks a closing square bracket.
4067
4068     Note that the validity of a regexp may also depend on the setting
4069     of the wildcard flag, for example \b{*.html} is a valid
4070     wildcard regexp but an invalid full regexp.
4071
4072     \sa errorString()
4073 */
4074 bool QRegExp::isValid() const
4075 {
4076     if (priv->engineKey.pattern.isEmpty()) {
4077         return true;
4078     } else {
4079         prepareEngine(priv);
4080         return priv->eng->isValid();
4081     }
4082 }
4083
4084 /*!
4085     Returns the pattern string of the regular expression. The pattern
4086     has either regular expression syntax or wildcard syntax, depending
4087     on patternSyntax().
4088
4089     \sa patternSyntax(), caseSensitivity()
4090 */
4091 QString QRegExp::pattern() const
4092 {
4093     return priv->engineKey.pattern;
4094 }
4095
4096 /*!
4097     Sets the pattern string to \a pattern. The case sensitivity,
4098     wildcard, and minimal matching options are not changed.
4099
4100     \sa setPatternSyntax(), setCaseSensitivity()
4101 */
4102 void QRegExp::setPattern(const QString &pattern)
4103 {
4104     if (priv->engineKey.pattern != pattern) {
4105         invalidateEngine(priv);
4106         priv->engineKey.pattern = pattern;
4107     }
4108 }
4109
4110 /*!
4111     Returns Qt::CaseSensitive if the regexp is matched case
4112     sensitively; otherwise returns Qt::CaseInsensitive.
4113
4114     \sa patternSyntax(), pattern(), isMinimal()
4115 */
4116 Qt::CaseSensitivity QRegExp::caseSensitivity() const
4117 {
4118     return priv->engineKey.cs;
4119 }
4120
4121 /*!
4122     Sets case sensitive matching to \a cs.
4123
4124     If \a cs is Qt::CaseSensitive, \b{\\.txt$} matches
4125     \c{readme.txt} but not \c{README.TXT}.
4126
4127     \sa setPatternSyntax(), setPattern(), setMinimal()
4128 */
4129 void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)
4130 {
4131     if ((bool)cs != (bool)priv->engineKey.cs) {
4132         invalidateEngine(priv);
4133         priv->engineKey.cs = cs;
4134     }
4135 }
4136
4137 /*!
4138     Returns the syntax used by the regular expression. The default is
4139     QRegExp::RegExp.
4140
4141     \sa pattern(), caseSensitivity()
4142 */
4143 QRegExp::PatternSyntax QRegExp::patternSyntax() const
4144 {
4145     return priv->engineKey.patternSyntax;
4146 }
4147
4148 /*!
4149     Sets the syntax mode for the regular expression. The default is
4150     QRegExp::RegExp.
4151
4152     Setting \a syntax to QRegExp::Wildcard enables simple shell-like
4153     \l{wildcard matching}. For example, \b{r*.txt} matches the
4154     string \c{readme.txt} in wildcard mode, but does not match
4155     \c{readme}.
4156
4157     Setting \a syntax to QRegExp::FixedString means that the pattern
4158     is interpreted as a plain string. Special characters (e.g.,
4159     backslash) don't need to be escaped then.
4160
4161     \sa setPattern(), setCaseSensitivity(), escape()
4162 */
4163 void QRegExp::setPatternSyntax(PatternSyntax syntax)
4164 {
4165     if (syntax != priv->engineKey.patternSyntax) {
4166         invalidateEngine(priv);
4167         priv->engineKey.patternSyntax = syntax;
4168     }
4169 }
4170
4171 /*!
4172     Returns true if minimal (non-greedy) matching is enabled;
4173     otherwise returns false.
4174
4175     \sa caseSensitivity(), setMinimal()
4176 */
4177 bool QRegExp::isMinimal() const
4178 {
4179     return priv->minimal;
4180 }
4181
4182 /*!
4183     Enables or disables minimal matching. If \a minimal is false,
4184     matching is greedy (maximal) which is the default.
4185
4186     For example, suppose we have the input string "We must be
4187     <b>bold</b>, very <b>bold</b>!" and the pattern
4188     \b{<b>.*</b>}. With the default greedy (maximal) matching,
4189     the match is "We must be \underline{<b>bold</b>, very
4190     <b>bold</b>}!". But with minimal (non-greedy) matching, the
4191     first match is: "We must be \underline{<b>bold</b>}, very
4192     <b>bold</b>!" and the second match is "We must be <b>bold</b>,
4193     very \underline{<b>bold</b>}!". In practice we might use the pattern
4194     \b{<b>[^<]*\</b>} instead, although this will still fail for
4195     nested tags.
4196
4197     \sa setCaseSensitivity()
4198 */
4199 void QRegExp::setMinimal(bool minimal)
4200 {
4201     priv->minimal = minimal;
4202 }
4203
4204 // ### Qt 5: make non-const
4205 /*!
4206     Returns true if \a str is matched exactly by this regular
4207     expression; otherwise returns false. You can determine how much of
4208     the string was matched by calling matchedLength().
4209
4210     For a given regexp string R, exactMatch("R") is the equivalent of
4211     indexIn("^R$") since exactMatch() effectively encloses the regexp
4212     in the start of string and end of string anchors, except that it
4213     sets matchedLength() differently.
4214
4215     For example, if the regular expression is \b{blue}, then
4216     exactMatch() returns true only for input \c blue. For inputs \c
4217     bluebell, \c blutak and \c lightblue, exactMatch() returns false
4218     and matchedLength() will return 4, 3 and 0 respectively.
4219
4220     Although const, this function sets matchedLength(),
4221     capturedTexts(), and pos().
4222
4223     \sa indexIn(), lastIndexIn()
4224 */
4225 bool QRegExp::exactMatch(const QString &str) const
4226 {
4227     prepareEngineForMatch(priv, str);
4228     priv->matchState.match(str.unicode(), str.length(), 0, priv->minimal, true, 0);
4229     if (priv->matchState.captured[1] == str.length()) {
4230         return true;
4231     } else {
4232         priv->matchState.captured[0] = 0;
4233         priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen;
4234         return false;
4235     }
4236 }
4237
4238 // ### Qt 5: make non-const
4239 /*!
4240     Attempts to find a match in \a str from position \a offset (0 by
4241     default). If \a offset is -1, the search starts at the last
4242     character; if -2, at the next to last character; etc.
4243
4244     Returns the position of the first match, or -1 if there was no
4245     match.
4246
4247     The \a caretMode parameter can be used to instruct whether \b{^}
4248     should match at index 0 or at \a offset.
4249
4250     You might prefer to use QString::indexOf(), QString::contains(),
4251     or even QStringList::filter(). To replace matches use
4252     QString::replace().
4253
4254     Example:
4255     \snippet code/src_corelib_tools_qregexp.cpp 13
4256
4257     Although const, this function sets matchedLength(),
4258     capturedTexts() and pos().
4259
4260     If the QRegExp is a wildcard expression (see setPatternSyntax())
4261     and want to test a string against the whole wildcard expression,
4262     use exactMatch() instead of this function.
4263
4264     \sa lastIndexIn(), exactMatch()
4265 */
4266
4267 int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const
4268 {
4269     prepareEngineForMatch(priv, str);
4270     if (offset < 0)
4271         offset += str.length();
4272     priv->matchState.match(str.unicode(), str.length(), offset,
4273         priv->minimal, false, caretIndex(offset, caretMode));
4274     return priv->matchState.captured[0];
4275 }
4276
4277 // ### Qt 5: make non-const
4278 /*!
4279     Attempts to find a match backwards in \a str from position \a
4280     offset. If \a offset is -1 (the default), the search starts at the
4281     last character; if -2, at the next to last character; etc.
4282
4283     Returns the position of the first match, or -1 if there was no
4284     match.
4285
4286     The \a caretMode parameter can be used to instruct whether \b{^}
4287     should match at index 0 or at \a offset.
4288
4289     Although const, this function sets matchedLength(),
4290     capturedTexts() and pos().
4291
4292     \warning Searching backwards is much slower than searching
4293     forwards.
4294
4295     \sa indexIn(), exactMatch()
4296 */
4297
4298 int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const
4299 {
4300     prepareEngineForMatch(priv, str);
4301     if (offset < 0)
4302         offset += str.length();
4303     if (offset < 0 || offset > str.length()) {
4304         memset(priv->matchState.captured, -1, priv->matchState.capturedSize*sizeof(int));
4305         return -1;
4306     }
4307
4308     while (offset >= 0) {
4309         priv->matchState.match(str.unicode(), str.length(), offset,
4310             priv->minimal, true, caretIndex(offset, caretMode));
4311         if (priv->matchState.captured[0] == offset)
4312             return offset;
4313         --offset;
4314     }
4315     return -1;
4316 }
4317
4318 /*!
4319     Returns the length of the last matched string, or -1 if there was
4320     no match.
4321
4322     \sa exactMatch(), indexIn(), lastIndexIn()
4323 */
4324 int QRegExp::matchedLength() const
4325 {
4326     return priv->matchState.captured[1];
4327 }
4328
4329 #ifndef QT_NO_REGEXP_CAPTURE
4330
4331 /*!
4332   \since 4.6
4333   Returns the number of captures contained in the regular expression.
4334  */
4335 int QRegExp::captureCount() const
4336 {
4337     prepareEngine(priv);
4338     return priv->eng->captureCount();
4339 }
4340
4341 /*!
4342     Returns a list of the captured text strings.
4343
4344     The first string in the list is the entire matched string. Each
4345     subsequent list element contains a string that matched a
4346     (capturing) subexpression of the regexp.
4347
4348     For example:
4349     \snippet code/src_corelib_tools_qregexp.cpp 14
4350
4351     The above example also captures elements that may be present but
4352     which we have no interest in. This problem can be solved by using
4353     non-capturing parentheses:
4354
4355     \snippet code/src_corelib_tools_qregexp.cpp 15
4356
4357     Note that if you want to iterate over the list, you should iterate
4358     over a copy, e.g.
4359     \snippet code/src_corelib_tools_qregexp.cpp 16
4360
4361     Some regexps can match an indeterminate number of times. For
4362     example if the input string is "Offsets: 12 14 99 231 7" and the
4363     regexp, \c{rx}, is \b{(\\d+)+}, we would hope to get a list of
4364     all the numbers matched. However, after calling
4365     \c{rx.indexIn(str)}, capturedTexts() will return the list ("12",
4366     "12"), i.e. the entire match was "12" and the first subexpression
4367     matched was "12". The correct approach is to use cap() in a
4368     \l{QRegExp#cap_in_a_loop}{loop}.
4369
4370     The order of elements in the string list is as follows. The first
4371     element is the entire matching string. Each subsequent element
4372     corresponds to the next capturing open left parentheses. Thus
4373     capturedTexts()[1] is the text of the first capturing parentheses,
4374     capturedTexts()[2] is the text of the second and so on
4375     (corresponding to $1, $2, etc., in some other regexp languages).
4376
4377     \sa cap(), pos()
4378 */
4379 QStringList QRegExp::capturedTexts() const
4380 {
4381     if (priv->capturedCache.isEmpty()) {
4382         prepareEngine(priv);
4383         const int *captured = priv->matchState.captured;
4384         int n = priv->matchState.capturedSize;
4385
4386         for (int i = 0; i < n; i += 2) {
4387             QString m;
4388             if (captured[i + 1] == 0)
4389                 m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty
4390             else if (captured[i] >= 0)
4391                 m = priv->t.mid(captured[i], captured[i + 1]);
4392             priv->capturedCache.append(m);
4393         }
4394         priv->t.clear();
4395     }
4396     return priv->capturedCache;
4397 }
4398
4399 /*!
4400     \internal
4401 */
4402 QStringList QRegExp::capturedTexts()
4403 {
4404     return const_cast<const QRegExp *>(this)->capturedTexts();
4405 }
4406
4407 /*!
4408     Returns the text captured by the \a nth subexpression. The entire
4409     match has index 0 and the parenthesized subexpressions have
4410     indexes starting from 1 (excluding non-capturing parentheses).
4411
4412     \snippet code/src_corelib_tools_qregexp.cpp 17
4413
4414     The order of elements matched by cap() is as follows. The first
4415     element, cap(0), is the entire matching string. Each subsequent
4416     element corresponds to the next capturing open left parentheses.
4417     Thus cap(1) is the text of the first capturing parentheses, cap(2)
4418     is the text of the second, and so on.
4419
4420     \sa capturedTexts(), pos()
4421 */
4422 QString QRegExp::cap(int nth) const
4423 {
4424     return capturedTexts().value(nth);
4425 }
4426
4427 /*!
4428     \internal
4429 */
4430 QString QRegExp::cap(int nth)
4431 {
4432     return const_cast<const QRegExp *>(this)->cap(nth);
4433 }
4434
4435 /*!
4436     Returns the position of the \a nth captured text in the searched
4437     string. If \a nth is 0 (the default), pos() returns the position
4438     of the whole match.
4439
4440     Example:
4441     \snippet code/src_corelib_tools_qregexp.cpp 18
4442
4443     For zero-length matches, pos() always returns -1. (For example, if
4444     cap(4) would return an empty string, pos(4) returns -1.) This is
4445     a feature of the implementation.
4446
4447     \sa cap(), capturedTexts()
4448 */
4449 int QRegExp::pos(int nth) const
4450 {
4451     if (nth < 0 || nth >= priv->matchState.capturedSize / 2)
4452         return -1;
4453     else
4454         return priv->matchState.captured[2 * nth];
4455 }
4456
4457 /*!
4458     \internal
4459 */
4460 int QRegExp::pos(int nth)
4461 {
4462     return const_cast<const QRegExp *>(this)->pos(nth);
4463 }
4464
4465 /*!
4466   Returns a text string that explains why a regexp pattern is
4467   invalid the case being; otherwise returns "no error occurred".
4468
4469   \sa isValid()
4470 */
4471 QString QRegExp::errorString() const
4472 {
4473     if (isValid()) {
4474         return QString::fromLatin1(RXERR_OK);
4475     } else {
4476         return priv->eng->errorString();
4477     }
4478 }
4479
4480 /*!
4481     \internal
4482 */
4483 QString QRegExp::errorString()
4484 {
4485     return const_cast<const QRegExp *>(this)->errorString();
4486 }
4487 #endif
4488
4489 /*!
4490     Returns the string \a str with every regexp special character
4491     escaped with a backslash. The special characters are $, (,), *, +,
4492     ., ?, [, \,], ^, {, | and }.
4493
4494     Example:
4495
4496     \snippet code/src_corelib_tools_qregexp.cpp 19
4497
4498     This function is useful to construct regexp patterns dynamically:
4499
4500     \snippet code/src_corelib_tools_qregexp.cpp 20
4501
4502     \sa setPatternSyntax()
4503 */
4504 QString QRegExp::escape(const QString &str)
4505 {
4506     QString quoted;
4507     const int count = str.count();
4508     quoted.reserve(count * 2);
4509     const QLatin1Char backslash('\\');
4510     for (int i = 0; i < count; i++) {
4511         switch (str.at(i).toLatin1()) {
4512         case '$':
4513         case '(':
4514         case ')':
4515         case '*':
4516         case '+':
4517         case '.':
4518         case '?':
4519         case '[':
4520         case '\\':
4521         case ']':
4522         case '^':
4523         case '{':
4524         case '|':
4525         case '}':
4526             quoted.append(backslash);
4527         }
4528         quoted.append(str.at(i));
4529     }
4530     return quoted;
4531 }
4532
4533
4534 #ifndef QT_NO_DATASTREAM
4535 /*!
4536     \relates QRegExp
4537
4538     Writes the regular expression \a regExp to stream \a out.
4539
4540     \sa {Serializing Qt Data Types}
4541 */
4542 QDataStream &operator<<(QDataStream &out, const QRegExp &regExp)
4543 {
4544     return out << regExp.pattern() << (quint8)regExp.caseSensitivity()
4545                << (quint8)regExp.patternSyntax()
4546                << (quint8)!!regExp.isMinimal();
4547 }
4548
4549 /*!
4550     \relates QRegExp
4551
4552     Reads a regular expression from stream \a in into \a regExp.
4553
4554     \sa {Serializing Qt Data Types}
4555 */
4556 QDataStream &operator>>(QDataStream &in, QRegExp &regExp)
4557 {
4558     QString pattern;
4559     quint8 cs;
4560     quint8 patternSyntax;
4561     quint8 isMinimal;
4562
4563     in >> pattern >> cs >> patternSyntax >> isMinimal;
4564
4565     QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),
4566                       QRegExp::PatternSyntax(patternSyntax));
4567
4568     newRegExp.setMinimal(isMinimal);
4569     regExp = newRegExp;
4570     return in;
4571 }
4572 #endif // QT_NO_DATASTREAM
4573
4574 #ifndef QT_NO_DEBUG_STREAM
4575 QDebug operator<<(QDebug dbg, const QRegExp &r)
4576 {
4577     dbg.nospace() << "QRegExp(patternSyntax=" << r.patternSyntax()
4578                   << ", pattern='"<< r.pattern() << "')";
4579     return dbg.space();
4580 }
4581 #endif
4582
4583 QT_END_NAMESPACE