1 /****************************************************************************
3 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
4 ** Contact: http://www.qt-project.org/
6 ** This file is part of the QtCore module of the Qt Toolkit.
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** GNU Lesser General Public License Usage
10 ** This file may be used under the terms of the GNU Lesser General Public
11 ** License version 2.1 as published by the Free Software Foundation and
12 ** appearing in the file LICENSE.LGPL included in the packaging of this
13 ** file. Please review the following information to ensure the GNU Lesser
14 ** General Public License version 2.1 requirements will be met:
15 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
17 ** In addition, as a special exception, Nokia gives you certain additional
18 ** rights. These rights are described in the Nokia Qt LGPL Exception
19 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
21 ** GNU General Public License Usage
22 ** Alternatively, this file may be used under the terms of the GNU General
23 ** Public License version 3.0 as published by the Free Software Foundation
24 ** and appearing in the file LICENSE.GPL included in the packaging of this
25 ** file. Please review the following information to ensure the GNU General
26 ** Public License version 3.0 requirements will be met:
27 ** http://www.gnu.org/copyleft/gpl.html.
30 ** Alternatively, this file may be used in accordance with the terms and
31 ** conditions contained in a signed written agreement between you and Nokia.
40 ****************************************************************************/
44 #include "qalgorithms.h"
45 #include "qbitarray.h"
47 #include "qdatastream.h"
53 #include "qstringlist.h"
54 #include "qstringmatcher.h"
56 #include "private/qfunctions_p.h"
62 int qFindString(const QChar *haystack, int haystackLen, int from,
63 const QChar *needle, int needleLen, Qt::CaseSensitivity cs);
65 // error strings for the regexp parser
66 #define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
67 #define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
68 #define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
69 #define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
70 #define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371")
71 #define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
72 #define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
73 #define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
74 #define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
75 #define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
76 #define RXERR_INTERVAL QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
77 #define RXERR_CATEGORY QT_TRANSLATE_NOOP("QRegExp", "invalid category")
82 \brief The QRegExp class provides pattern matching using regular expressions.
87 \keyword regular expression
89 A regular expression, or "regexp", is a pattern for matching
90 substrings in a text. This is useful in many contexts, e.g.,
94 \li A regexp can test whether a substring meets some criteria,
95 e.g. is an integer or contains no whitespace.
97 \li A regexp provides more powerful pattern matching than
98 simple substring matching, e.g., match one of the words
99 \e{mail}, \e{letter} or \e{correspondence}, but none of the
100 words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
101 \row \li Search and Replace
102 \li A regexp can replace all occurrences of a substring with a
103 different substring, e.g., replace all occurrences of \e{&}
104 with \e{\&} except where the \e{&} is already followed by
106 \row \li String Splitting
107 \li A regexp can be used to identify where a string should be
108 split apart, e.g. splitting tab-delimited strings.
111 A brief introduction to regexps is presented, a description of
112 Qt's regexp language, some examples, and the function
113 documentation itself. QRegExp is modeled on Perl's regexp
114 language. It fully supports Unicode. QRegExp can also be used in a
115 simpler, \e{wildcard mode} that is similar to the functionality
116 found in command shells. The syntax rules used by QRegExp can be
117 changed with setPatternSyntax(). In particular, the pattern syntax
118 can be set to QRegExp::FixedString, which means the pattern to be
119 matched is interpreted as a plain string, i.e., special characters
120 (e.g., backslash) are not escaped.
122 A good text on regexps is \e {Mastering Regular Expressions}
123 (Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4.
127 \section1 Introduction
129 Regexps are built up from expressions, quantifiers, and
130 assertions. The simplest expression is a character, e.g. \b{x}
131 or \b{5}. An expression can also be a set of characters
132 enclosed in square brackets. \b{[ABCD]} will match an \b{A}
133 or a \b{B} or a \b{C} or a \b{D}. We can write this same
134 expression as \b{[A-D]}, and an experession to match any
135 captital letter in the English alphabet is written as
138 A quantifier specifies the number of occurrences of an expression
139 that must be matched. \b{x{1,1}} means match one and only one
140 \b{x}. \b{x{1,5}} means match a sequence of \b{x}
141 characters that contains at least one \b{x} but no more than
144 Note that in general regexps cannot be used to check for balanced
145 brackets or tags. For example, a regexp can be written to match an
146 opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
147 are not nested, but if the \c{<b>} tags are nested, that same
148 regexp will match an opening \c{<b>} tag with the wrong closing
149 \c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the
150 first \c{<b>} would be matched with the first \c{</b>}, which is
151 not correct. However, it is possible to write a regexp that will
152 match nested brackets or tags correctly, but only if the number of
153 nesting levels is fixed and known. If the number of nesting levels
154 is not fixed and known, it is impossible to write a regexp that
157 Suppose we want a regexp to match integers in the range 0 to 99.
158 At least one digit is required, so we start with the expression
159 \b{[0-9]{1,1}}, which matches a single digit exactly once. This
160 regexp matches integers in the range 0 to 9. To match integers up
161 to 99, increase the maximum number of occurrences to 2, so the
162 regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the
163 original requirement to match integers from 0 to 99, but it will
164 also match integers that occur in the middle of strings. If we
165 want the matched integer to be the whole string, we must use the
166 anchor assertions, \b{^} (caret) and \b{$} (dollar). When
167 \b{^} is the first character in a regexp, it means the regexp
168 must match from the beginning of the string. When \b{$} is the
169 last character of the regexp, it means the regexp must match to
170 the end of the string. The regexp becomes \b{^[0-9]{1,2}$}.
171 Note that assertions, e.g. \b{^} and \b{$}, do not match
172 characters but locations in the string.
174 If you have seen regexps described elsewhere, they may have looked
175 different from the ones shown here. This is because some sets of
176 characters and some quantifiers are so common that they have been
177 given special symbols to represent them. \b{[0-9]} can be
178 replaced with the symbol \b{\\d}. The quantifier to match
179 exactly one occurrence, \b{{1,1}}, can be replaced with the
180 expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So
181 our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can
182 also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of
183 the string, match a digit, followed immediately by 0 or 1 digits}.
184 In practice, it would be written as \b{^\\d\\d?$}. The \b{?}
185 is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1
186 occurrences. \b{?} makes an expression optional. The regexp
187 \b{^\\d\\d?$} means \e{From the beginning of the string, match
188 one digit, followed immediately by 0 or 1 more digit, followed
189 immediately by end of string}.
191 To write a regexp that matches one of the words 'mail' \e or
192 'letter' \e or 'correspondence' but does not match words that
193 contain these words, e.g., 'email', 'mailman', 'mailer', and
194 'letterbox', start with a regexp that matches 'mail'. Expressed
195 fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because
196 a character expression is automatically quantified by
197 \b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an
198 'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now
199 we can use the vertical bar \b{|}, which means \b{or}, to
200 include the other two words, so our regexp for matching any of the
201 three words becomes \b{mail|letter|correspondence}. Match
202 'mail' \b{or} 'letter' \b{or} 'correspondence'. While this
203 regexp will match one of the three words we want to match, it will
204 also match words we don't want to match, e.g., 'email'. To
205 prevent the regexp from matching unwanted words, we must tell it
206 to begin and end the match at word boundaries. First we enclose
207 our regexp in parentheses, \b{(mail|letter|correspondence)}.
208 Parentheses group expressions together, and they identify a part
209 of the regexp that we wish to \l{capturing text}{capture}.
210 Enclosing the expression in parentheses allows us to use it as a
211 component in more complex regexps. It also allows us to examine
212 which of the three words was actually matched. To force the match
213 to begin and end on word boundaries, we enclose the regexp in
214 \b{\\b} \e{word boundary} assertions:
215 \b{\\b(mail|letter|correspondence)\\b}. Now the regexp means:
216 \e{Match a word boundary, followed by the regexp in parentheses,
217 followed by a word boundary}. The \b{\\b} assertion matches a
218 \e position in the regexp, not a \e character. A word boundary is
219 any non-word character, e.g., a space, newline, or the beginning
220 or ending of a string.
222 If we want to replace ampersand characters with the HTML entity
223 \b{\&}, the regexp to match is simply \b{\&}. But this
224 regexp will also match ampersands that have already been converted
225 to HTML entities. We want to replace only ampersands that are not
226 already followed by \b{amp;}. For this, we need the negative
227 lookahead assertion, \b{(?!}__\b{)}. The regexp can then be
228 written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is}
229 \b{not} \e{followed by} \b{amp;}.
231 If we want to count all the occurrences of 'Eric' and 'Eirik' in a
232 string, two valid solutions are \b{\\b(Eric|Eirik)\\b} and
233 \b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is
234 required to avoid matching words that contain either name,
235 e.g. 'Ericsson'. Note that the second regexp matches more
236 spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.
238 Some of the examples discussed above are implemented in the
239 \link #code-examples code examples \endlink section.
241 \target characters-and-abbreviations-for-sets-of-characters
242 \section1 Characters and Abbreviations for Sets of Characters
245 \header \li Element \li Meaning
247 \li A character represents itself unless it has a special
248 regexp meaning. e.g. \b{c} matches the character \e c.
250 \li A character that follows a backslash matches the character
251 itself, except as specified below. e.g., To match a literal
252 caret at the beginning of a string, write \b{\\^}.
254 \li Matches the ASCII bell (BEL, 0x07).
256 \li Matches the ASCII form feed (FF, 0x0C).
258 \li Matches the ASCII line feed (LF, 0x0A, Unix newline).
260 \li Matches the ASCII carriage return (CR, 0x0D).
262 \li Matches the ASCII horizontal tab (HT, 0x09).
264 \li Matches the ASCII vertical tab (VT, 0x0B).
265 \row \li \b{\\x\e{hhhh}}
266 \li Matches the Unicode character corresponding to the
267 hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).
268 \row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo})
269 \li matches the ASCII/Latin1 character for the octal number
270 \e{ooo} (between 0 and 0377).
272 \li Matches any character (including newline).
274 \li Matches a digit (QChar::isDigit()).
276 \li Matches a non-digit.
278 \li Matches a whitespace character (QChar::isSpace()).
280 \li Matches a non-whitespace character.
282 \li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').
284 \li Matches a non-word character.
286 \li The \e{n}-th \l backreference, e.g. \\1, \\2, etc.
289 \b{Note:} The C++ compiler transforms backslashes in strings.
290 To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}.
291 To match the backslash character itself, enter it four times, i.e.
294 \target sets-of-characters
295 \section1 Sets of Characters
297 Square brackets mean match any character contained in the square
298 brackets. The character set abbreviations described above can
299 appear in a character set in square brackets. Except for the
300 character set abbreviations and the following two exceptions,
301 characters do not have special meanings in square brackets.
306 \li The caret negates the character set if it occurs as the
307 first character (i.e. immediately after the opening square
308 bracket). \b{[abc]} matches 'a' or 'b' or 'c', but
309 \b{[^abc]} matches anything \e but 'a' or 'b' or 'c'.
313 \li The dash indicates a range of characters. \b{[W-Z]}
314 matches 'W' or 'X' or 'Y' or 'Z'.
318 Using the predefined character set abbreviations is more portable
319 than using character ranges across platforms and languages. For
320 example, \b{[0-9]} matches a digit in Western alphabets but
321 \b{\\d} matches a digit in \e any alphabet.
323 Note: In other regexp documentation, sets of characters are often
324 called "character classes".
327 \section1 Quantifiers
329 By default, an expression is automatically quantified by
330 \b{{1,1}}, i.e. it should occur exactly once. In the following
331 list, \b{\e {E}} stands for expression. An expression is a
332 character, or an abbreviation for a set of characters, or a set of
333 characters in square brackets, or an expression in parentheses.
338 \li Matches zero or one occurrences of \e E. This quantifier
339 means \e{The previous expression is optional}, because it
340 will match whether or not the expression is found. \b{\e
341 {E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?}
342 matches 'dent' or 'dents'.
346 \li Matches one or more occurrences of \e E. \b{\e {E}+} is
347 the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0',
352 \li Matches zero or more occurrences of \e E. It is the same
353 as \b{\e {E}{0,}}. The \b{*} quantifier is often used
354 in error where \b{+} should be used. For example, if
355 \b{\\s*$} is used in an expression to match strings that
356 end in whitespace, it will match every string because
357 \b{\\s*$} means \e{Match zero or more whitespaces followed
358 by end of string}. The correct regexp to match strings that
359 have at least one trailing whitespace character is
362 \row \li \b{\e {E}{n}}
364 \li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}}
365 is the same as repeating \e E \e n times. For example,
366 \b{x{5}} is the same as \b{xxxxx}. It is also the same
367 as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}.
369 \row \li \b{\e {E}{n,}}
370 \li Matches at least \e n occurrences of \e E.
372 \row \li \b{\e {E}{,m}}
373 \li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}}
374 is the same as \b{\e {E}{0,m}}.
376 \row \li \b{\e {E}{n,m}}
377 \li Matches at least \e n and at most \e m occurrences of \e E.
380 To apply a quantifier to more than just the preceding character,
381 use parentheses to group characters together in an expression. For
382 example, \b{tag+} matches a 't' followed by an 'a' followed by
383 at least one 'g', whereas \b{(tag)+} matches at least one
386 Note: Quantifiers are normally "greedy". They always match as much
387 text as they can. For example, \b{0+} matches the first zero it
388 finds and all the consecutive zeros after the first zero. Applied
389 to '20005', it matches'2\underline{000}5'. Quantifiers can be made
390 non-greedy, see setMinimal().
392 \target capturing parentheses
393 \target backreferences
394 \section1 Capturing Text
396 Parentheses allow us to group elements together so that we can
397 quantify and capture them. For example if we have the expression
398 \b{mail|letter|correspondence} that matches a string we know
399 that \e one of the words matched but not which one. Using
400 parentheses allows us to "capture" whatever is matched within
401 their bounds, so if we used \b{(mail|letter|correspondence)}
402 and matched this regexp against the string "I sent you some email"
403 we can use the cap() or capturedTexts() functions to extract the
404 matched characters, in this case 'mail'.
406 We can use captured text within the regexp itself. To refer to the
407 captured text we use \e backreferences which are indexed from 1,
408 the same as for cap(). For example we could search for duplicate
409 words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a
410 word boundary followed by one or more word characters followed by
411 one or more non-word characters followed by the same text as the
412 first parenthesized expression followed by a word boundary.
414 If we want to use parentheses purely for grouping and not for
415 capturing we can use the non-capturing syntax, e.g.
416 \b{(?:green|blue)}. Non-capturing parentheses begin '(?:' and
417 end ')'. In this example we match either 'green' or 'blue' but we
418 do not capture the match so we only know whether or not we matched
419 but not which color we actually found. Using non-capturing
420 parentheses is more efficient than using capturing parentheses
421 since the regexp engine has to do less book-keeping.
423 Both capturing and non-capturing parentheses may be nested.
425 \target greedy quantifiers
427 For historical reasons, quantifiers (e.g. \b{*}) that apply to
428 capturing parentheses are more "greedy" than other quantifiers.
429 For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa".
430 This behavior is different from what other regexp engines do
431 (notably, Perl). To obtain a more intuitive capturing behavior,
432 specify QRegExp::RegExp2 to the QRegExp constructor or call
433 setPatternSyntax(QRegExp::RegExp2).
435 \target cap_in_a_loop
437 When the number of matches cannot be determined in advance, a
438 common idiom is to use cap() in a loop. For example:
440 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 0
445 Assertions make some statement about the text at the point where
446 they occur in the regexp but they do not match any characters. In
447 the following list \b{\e {E}} stands for any expression.
451 \li The caret signifies the beginning of the string. If you
452 wish to match a literal \c{^} you must escape it by
453 writing \c{\\^}. For example, \b{^#include} will only
454 match strings which \e begin with the characters '#include'.
455 (When the caret is the first character of a character set it
456 has a special meaning, see \link #sets-of-characters Sets of
457 Characters \endlink.)
460 \li The dollar signifies the end of the string. For example
461 \b{\\d\\s*$} will match strings which end with a digit
462 optionally followed by whitespace. If you wish to match a
463 literal \c{$} you must escape it by writing
467 \li A word boundary. For example the regexp
468 \b{\\bOK\\b} means match immediately after a word
469 boundary (e.g. start of string or whitespace) the letter 'O'
470 then the letter 'K' immediately before another word boundary
471 (e.g. end of string or whitespace). But note that the
472 assertion does not actually match any whitespace so if we
473 write \b{(\\bOK\\b)} and we have a match it will only
474 contain 'OK' even if the string is "It's \underline{OK} now".
477 \li A non-word boundary. This assertion is true wherever
478 \b{\\b} is false. For example if we searched for
479 \b{\\Bon\\B} in "Left on" the match would fail (space
480 and end of string aren't non-word boundaries), but it would
481 match in "t\underline{on}ne".
483 \row \li \b{(?=\e E)}
484 \li Positive lookahead. This assertion is true if the
485 expression matches at this point in the regexp. For example,
486 \b{const(?=\\s+char)} matches 'const' whenever it is
487 followed by 'char', as in 'static \underline{const} char *'.
488 (Compare with \b{const\\s+char}, which matches 'static
489 \underline{const char} *'.)
491 \row \li \b{(?!\e E)}
492 \li Negative lookahead. This assertion is true if the
493 expression does not match at this point in the regexp. For
494 example, \b{const(?!\\s+char)} matches 'const' \e except
495 when it is followed by 'char'.
498 \keyword QRegExp wildcard matching
499 \section1 Wildcard Matching
501 Most command shells such as \e bash or \e cmd.exe support "file
502 globbing", the ability to identify a group of files by using
503 wildcards. The setPatternSyntax() function is used to switch
504 between regexp and wildcard mode. Wildcard matching is much
505 simpler than full regexps and has only four features:
509 \li Any character represents itself apart from those mentioned
510 below. Thus \b{c} matches the character \e c.
512 \li Matches any single character. It is the same as
513 \b{.} in full regexps.
515 \li Matches zero or more of any characters. It is the
516 same as \b{.*} in full regexps.
518 \li Sets of characters can be represented in square brackets,
519 similar to full regexps. Within the character class, like
520 outside, backslash has no special meaning.
523 In the mode Wildcard, the wildcard characters cannot be
524 escaped. In the mode WildcardUnix, the character '\\' escapes the
527 For example if we are in wildcard mode and have strings which
528 contain filenames we could identify HTML files with \b{*.html}.
529 This will match zero or more characters followed by a dot followed
530 by 'h', 't', 'm' and 'l'.
532 To test a string against a wildcard expression, use exactMatch().
535 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 1
538 \section1 Notes for Perl Users
540 Most of the character class abbreviations supported by Perl are
541 supported by QRegExp, see \link
542 #characters-and-abbreviations-for-sets-of-characters characters
543 and abbreviations for sets of characters \endlink.
545 In QRegExp, apart from within character classes, \c{^} always
546 signifies the start of the string, so carets must always be
547 escaped unless used for that purpose. In Perl the meaning of caret
548 varies automagically depending on where it occurs so escaping it
549 is rarely necessary. The same applies to \c{$} which in
550 QRegExp always signifies the end of the string.
552 QRegExp's quantifiers are the same as Perl's greedy quantifiers
553 (but see the \l{greedy quantifiers}{note above}). Non-greedy
554 matching cannot be applied to individual quantifiers, but can be
555 applied to all the quantifiers in the pattern. For example, to
556 match the Perl regexp \b{ro+?m} requires:
558 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 2
560 The equivalent of Perl's \c{/i} option is
561 setCaseSensitivity(Qt::CaseInsensitive).
563 Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.
565 In QRegExp \b{.} matches any character, therefore all QRegExp
566 regexps have the equivalent of Perl's \c{/s} option. QRegExp
567 does not have an equivalent to Perl's \c{/m} option, but this
568 can be emulated in various ways for example by splitting the input
569 into lines or by looping with a regexp that searches for newlines.
571 Because QRegExp is string oriented, there are no \\A, \\Z, or \\z
572 assertions. The \\G assertion is not supported but can be emulated
575 Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
576 equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
577 ... correspond to cap(1) or capturedTexts()[1], cap(2) or
578 capturedTexts()[2], etc.
580 To substitute a pattern use QString::replace().
582 Perl's extended \c{/x} syntax is not supported, nor are
583 directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
584 the other hand, C++'s rules for literal strings can be used to
587 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 3
589 Both zero-width positive and zero-width negative lookahead
590 assertions (?=pattern) and (?!pattern) are supported with the same
591 syntax as Perl. Perl's lookbehind assertions, "independent"
592 subexpressions and conditional expressions are not supported.
594 Non-capturing parentheses are also supported, with the same
597 See QString::split() and QStringList::join() for equivalents
598 to Perl's split and join functions.
600 Note: because C++ transforms \\'s they must be written \e twice in
601 code, e.g. \b{\\b} must be written \b{\\\\b}.
603 \target code-examples
604 \section1 Code Examples
606 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 4
608 The third string matches '\underline{6}'. This is a simple validation
609 regexp for integers in the range 0 to 99.
611 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 5
613 The second string matches '\underline{This_is-OK}'. We've used the
614 character set abbreviation '\\S' (non-whitespace) and the anchors
615 to match strings which contain no whitespace.
617 In the following example we match strings containing 'mail' or
618 'letter' or 'correspondence' but only match whole words i.e. not
621 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 6
623 The second string matches "Please write the \underline{letter}". The
624 word 'letter' is also captured (because of the parentheses). We
625 can see what text we've captured like this:
627 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 7
629 This will capture the text from the first set of capturing
630 parentheses (counting capturing left parentheses from left to
631 right). The parentheses are counted from 1 since cap(0) is the
632 whole matched regexp (equivalent to '&' in most regexp engines).
634 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 8
636 Here we've passed the QRegExp to QString's replace() function to
637 replace the matched text with new text.
639 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 9
641 We've used the indexIn() function to repeatedly match the regexp in
642 the string. Note that instead of moving forward by one character
643 at a time \c pos++ we could have written \c {pos +=
644 rx.matchedLength()} to skip over the already matched string. The
645 count will equal 3, matching 'One \underline{Eric} another
646 \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it
647 doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
648 by non-word boundaries.
650 One common use of regexps is to split lines of delimited data into
651 their component fields.
653 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 10
655 In this example our input lines have the format company name, web
656 address and country. Unfortunately the regexp is rather long and
657 not very versatile -- the code will break if we add any more
658 fields. A simpler and better solution is to look for the
659 separator, '\\t' in this case, and take the surrounding text. The
660 QString::split() function can take a separator string or regexp
661 as an argument and split a string accordingly.
663 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 11
665 Here field[0] is the company, field[1] the web address and so on.
667 To imitate the matching of a shell we can use wildcard mode.
669 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 12
671 Wildcard matching can be convenient because of its simplicity, but
672 any wildcard regexp can be defined using full regexps, e.g.
673 \b{.*\\.html$}. Notice that we can't match both \c .html and \c
674 .htm files with a wildcard unless we use \b{*.htm*} which will
675 also match 'test.html.bak'. A full regexp gives us the precision
676 we need, \b{.*\\.html?$}.
678 QRegExp can match case insensitively using setCaseSensitivity(),
679 and can use non-greedy matching, see setMinimal(). By
680 default QRegExp uses full regexps but this can be changed with
681 setWildcard(). Searching can be forward with indexIn() or backward
682 with lastIndexIn(). Captured text can be accessed using
683 capturedTexts() which returns a string list of all captured
684 strings, or using cap() which returns the captured string for the
685 given index. The pos() function takes a match index and returns
686 the position in the string where the match was made (or -1 if
689 \sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel,
690 {tools/regexp}{Regular Expression Example}
693 #if defined(Q_OS_VXWORKS) && defined(EOS)
697 const int NumBadChars = 64;
698 #define BadChar(ch) ((ch).unicode() % NumBadChars)
700 const int NoOccurrence = INT_MAX;
701 const int EmptyCapture = INT_MAX;
702 const int InftyLen = INT_MAX;
703 const int InftyRep = 1025;
706 static bool isWord(QChar ch)
708 return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_');
712 Merges two vectors of ints and puts the result into the first
715 static void mergeInto(QVector<int> *a, const QVector<int> &b)
717 int asize = a->size();
718 int bsize = b.size();
721 #ifndef QT_NO_REGEXP_OPTIM
722 } else if (bsize == 1 && a->at(asize - 1) < b.at(0)) {
723 a->resize(asize + 1);
724 (*a)[asize] = b.at(0);
726 } else if (bsize >= 1) {
727 int csize = asize + bsize;
728 QVector<int> c(csize);
729 int i = 0, j = 0, k = 0;
732 if (a->at(i) == b.at(j)) {
735 } else if (a->at(i) < b.at(j)) {
741 memcpy(c.data() + k, a->constData() + i, (asize - i) * sizeof(int));
747 memcpy(c.data() + k, b.constData() + j, (bsize - j) * sizeof(int));
752 #ifndef QT_NO_REGEXP_WILDCARD
754 Translates a wildcard pattern to an equivalent regular expression
755 pattern (e.g., *.cpp to .*\.cpp).
757 If enableEscaping is true, it is possible to escape the wildcard
760 static QString wc2rx(const QString &wc_str, const bool enableEscaping)
762 const int wclen = wc_str.length();
765 bool isEscaping = false; // the previous character is '\'
766 const QChar *wc = wc_str.unicode();
769 const QChar c = wc[i++];
770 switch (c.unicode()) {
772 if (enableEscaping) {
774 rx += QLatin1String("\\\\");
775 } // we insert the \\ later if necessary
776 if (i == wclen) { // the end
777 rx += QLatin1String("\\\\");
780 rx += QLatin1String("\\\\");
786 rx += QLatin1String("\\*");
789 rx += QLatin1String(".*");
794 rx += QLatin1String("\\?");
797 rx += QLatin1Char('.');
812 rx += QLatin1String("\\\\");
814 rx += QLatin1Char('\\');
820 rx += QLatin1String("\\[");
823 if (wc[i] == QLatin1Char('^'))
826 if (rx[i] == QLatin1Char(']'))
828 while (i < wclen && wc[i] != QLatin1Char(']')) {
829 if (wc[i] == QLatin1Char('\\'))
830 rx += QLatin1Char('\\');
840 rx += QLatin1String("\\");
848 rx += QLatin1String("\\\\");
857 static int caretIndex(int offset, QRegExp::CaretMode caretMode)
859 if (caretMode == QRegExp::CaretAtZero) {
861 } else if (caretMode == QRegExp::CaretAtOffset) {
863 } else { // QRegExp::CaretWontMatch
869 The QRegExpEngineKey struct uniquely identifies an engine.
871 struct QRegExpEngineKey
874 QRegExp::PatternSyntax patternSyntax;
875 Qt::CaseSensitivity cs;
877 inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,
878 Qt::CaseSensitivity cs)
879 : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {}
881 inline void clear() {
883 patternSyntax = QRegExp::RegExp;
884 cs = Qt::CaseSensitive;
888 Q_STATIC_GLOBAL_OPERATOR bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)
890 return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax
891 && key1.cs == key2.cs;
896 //Q_DECLARE_TYPEINFO(QVector<int>, Q_MOVABLE_TYPE);
899 This is the engine state during matching.
901 struct QRegExpMatchState
903 const QChar *in; // a pointer to the input string data
904 int pos; // the current position in the string
906 int len; // the length of the input string
907 bool minimal; // minimal matching?
908 int *bigArray; // big array holding the data for the next pointers
909 int *inNextStack; // is state is nextStack?
910 int *curStack; // stack of current states
911 int *nextStack; // stack of next states
912 int *curCapBegin; // start of current states' captures
913 int *nextCapBegin; // start of next states' captures
914 int *curCapEnd; // end of current states' captures
915 int *nextCapEnd; // end of next states' captures
916 int *tempCapBegin; // start of temporary captures
917 int *tempCapEnd; // end of temporary captures
918 int *capBegin; // start of captures for a next state
919 int *capEnd; // end of captures for a next state
920 int *slideTab; // bump-along slide table for bad-character heuristic
921 int *captured; // what match() returned last
922 int slideTabSize; // size of slide table
924 #ifndef QT_NO_REGEXP_BACKREF
925 QList<QVector<int> > sleeping; // list of back-reference sleepers
927 int matchLen; // length of match
928 int oneTestMatchedLen; // length of partial match
930 const QRegExpEngine *eng;
932 inline QRegExpMatchState() : bigArray(0), captured(0) {}
933 inline ~QRegExpMatchState() { free(bigArray); }
935 void drain() { free(bigArray); bigArray = 0; captured = 0; } // to save memory
936 void prepareForMatch(QRegExpEngine *eng);
937 void match(const QChar *str, int len, int pos, bool minimal,
938 bool oneTest, int caretIndex);
940 bool testAnchor(int i, int a, const int *capBegin);
944 The struct QRegExpAutomatonState represents one state in a modified NFA. The
945 input characters matched are stored in the state instead of on
946 the transitions, something possible for an automaton
947 constructed from a regular expression.
949 struct QRegExpAutomatonState
951 #ifndef QT_NO_REGEXP_CAPTURE
952 int atom; // which atom does this state belong to?
954 int match; // what does it match? (see CharClassBit and BackRefBit)
955 QVector<int> outs; // out-transitions
956 QMap<int, int> reenter; // atoms reentered when transiting out
957 QMap<int, int> anchors; // anchors met when transiting out
959 inline QRegExpAutomatonState() { }
960 #ifndef QT_NO_REGEXP_CAPTURE
961 inline QRegExpAutomatonState(int a, int m)
962 : atom(a), match(m) { }
964 inline QRegExpAutomatonState(int m)
969 Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE);
972 The struct QRegExpCharClassRange represents a range of characters (e.g.,
973 [0-9] denotes range 48 to 57).
975 struct QRegExpCharClassRange
981 Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
983 #ifndef QT_NO_REGEXP_CAPTURE
985 The struct QRegExpAtom represents one node in the hierarchy of regular
990 enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 };
992 int parent; // index of parent in array of atoms
993 int capture; // index of capture, from 1 to ncap - 1
996 Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
999 struct QRegExpLookahead;
1001 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1003 The struct QRegExpAnchorAlternation represents a pair of anchors with
1006 struct QRegExpAnchorAlternation
1008 int a; // this anchor...
1009 int b; // ...or this one
1012 Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
1015 #ifndef QT_NO_REGEXP_CCLASS
1017 #define FLAG(x) (1 << (x))
1019 The class QRegExpCharClass represents a set of characters, such as can
1020 be found in regular expressions (e.g., [a-z] denotes the set
1023 class QRegExpCharClass
1027 inline QRegExpCharClass(const QRegExpCharClass &cc) { operator=(cc); }
1029 QRegExpCharClass &operator=(const QRegExpCharClass &cc);
1032 bool negative() const { return n; }
1033 void setNegative(bool negative);
1034 void addCategories(uint cats);
1035 void addRange(ushort from, ushort to);
1036 void addSingleton(ushort ch) { addRange(ch, ch); }
1038 bool in(QChar ch) const;
1039 #ifndef QT_NO_REGEXP_OPTIM
1040 const QVector<int> &firstOccurrence() const { return occ1; }
1043 #if defined(QT_DEBUG)
1048 uint c; // character classes
1049 QVector<QRegExpCharClassRange> r; // character ranges
1050 bool n; // negative?
1051 #ifndef QT_NO_REGEXP_OPTIM
1052 QVector<int> occ1; // first-occurrence array
1056 struct QRegExpCharClass
1060 #ifndef QT_NO_REGEXP_OPTIM
1061 QRegExpCharClass() { occ1.fill(0, NumBadChars); }
1063 const QVector<int> &firstOccurrence() const { return occ1; }
1069 Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE);
1072 The QRegExpEngine class encapsulates a modified nondeterministic
1073 finite automaton (NFA).
1078 QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)
1079 : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }
1081 QRegExpEngine(const QRegExpEngineKey &key);
1084 bool isValid() const { return valid; }
1085 const QString &errorString() const { return yyError; }
1086 int captureCount() const { return officialncap; }
1088 int createState(QChar ch);
1089 int createState(const QRegExpCharClass &cc);
1090 #ifndef QT_NO_REGEXP_BACKREF
1091 int createState(int bref);
1094 void addCatTransitions(const QVector<int> &from, const QVector<int> &to);
1095 #ifndef QT_NO_REGEXP_CAPTURE
1096 void addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom);
1099 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1100 int anchorAlternation(int a, int b);
1101 int anchorConcatenation(int a, int b);
1103 int anchorAlternation(int a, int b) { return a & b; }
1104 int anchorConcatenation(int a, int b) { return a | b; }
1106 void addAnchors(int from, int to, int a);
1108 #ifndef QT_NO_REGEXP_OPTIM
1109 void heuristicallyChooseHeuristic();
1112 #if defined(QT_DEBUG)
1119 enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
1120 enum { InitialState = 0, FinalState = 1 };
1123 int setupState(int match);
1126 Let's hope that 13 lookaheads and 14 back-references are
1129 enum { MaxLookaheads = 13, MaxBackRefs = 14 };
1130 enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,
1131 Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,
1132 Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1133 Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
1134 Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,
1136 Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^
1137 ((Anchor_FirstLookahead << MaxLookaheads) - 1) };
1138 #ifndef QT_NO_REGEXP_CAPTURE
1139 int startAtom(bool officialCapture);
1140 void finishAtom(int atom, bool needCapture);
1143 #ifndef QT_NO_REGEXP_LOOKAHEAD
1144 int addLookahead(QRegExpEngine *eng, bool negative);
1147 #ifndef QT_NO_REGEXP_OPTIM
1148 bool goodStringMatch(QRegExpMatchState &matchState) const;
1149 bool badCharMatch(QRegExpMatchState &matchState) const;
1151 bool bruteMatch(QRegExpMatchState &matchState) const;
1154 QVector<QRegExpAutomatonState> s; // array of states
1155 #ifndef QT_NO_REGEXP_CAPTURE
1156 QVector<QRegExpAtom> f; // atom hierarchy
1157 int nf; // number of atoms
1158 int cf; // current atom
1159 QVector<int> captureForOfficialCapture;
1161 int officialncap; // number of captures, seen from the outside
1162 int ncap; // number of captures, seen from the inside
1163 #ifndef QT_NO_REGEXP_CCLASS
1164 QVector<QRegExpCharClass> cl; // array of character classes
1166 #ifndef QT_NO_REGEXP_LOOKAHEAD
1167 QVector<QRegExpLookahead *> ahead; // array of lookaheads
1169 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1170 QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
1172 #ifndef QT_NO_REGEXP_OPTIM
1173 bool caretAnchored; // does the regexp start with ^?
1174 bool trivial; // is the good-string all that needs to match?
1176 bool valid; // is the regular expression valid?
1177 Qt::CaseSensitivity cs; // case sensitive?
1178 bool greedyQuantifiers; // RegExp2?
1179 bool xmlSchemaExtensions;
1180 #ifndef QT_NO_REGEXP_BACKREF
1181 int nbrefs; // number of back-references
1184 #ifndef QT_NO_REGEXP_OPTIM
1185 bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1187 int goodEarlyStart; // the index where goodStr can first occur in a match
1188 int goodLateStart; // the index where goodStr can last occur in a match
1189 QString goodStr; // the string that any match has to contain
1191 int minl; // the minimum length of a match
1192 QVector<int> occ1; // first-occurrence array
1196 The class Box is an abstraction for a regular expression
1197 fragment. It can also be seen as one node in the syntax tree of
1198 a regular expression with synthetized attributes.
1200 Its interface is ugly for performance reasons.
1205 Box(QRegExpEngine *engine);
1206 Box(const Box &b) { operator=(b); }
1208 Box &operator=(const Box &b);
1210 void clear() { operator=(Box(eng)); }
1212 void set(const QRegExpCharClass &cc);
1213 #ifndef QT_NO_REGEXP_BACKREF
1217 void cat(const Box &b);
1218 void orx(const Box &b);
1219 void plus(int atom);
1221 void catAnchor(int a);
1222 #ifndef QT_NO_REGEXP_OPTIM
1223 void setupHeuristics();
1226 #if defined(QT_DEBUG)
1231 void addAnchorsToEngine(const Box &to) const;
1233 QRegExpEngine *eng; // the automaton under construction
1234 QVector<int> ls; // the left states (firstpos)
1235 QVector<int> rs; // the right states (lastpos)
1236 QMap<int, int> lanchors; // the left anchors
1237 QMap<int, int> ranchors; // the right anchors
1238 int skipanchors; // the anchors to match if the box is skipped
1240 #ifndef QT_NO_REGEXP_OPTIM
1241 int earlyStart; // the index where str can first occur
1242 int lateStart; // the index where str can last occur
1243 QString str; // a string that has to occur in any match
1244 QString leftStr; // a string occurring at the left of this box
1245 QString rightStr; // a string occurring at the right of this box
1246 int maxl; // the maximum length of this box (possibly InftyLen)
1249 int minl; // the minimum length of this box
1250 #ifndef QT_NO_REGEXP_OPTIM
1251 QVector<int> occ1; // first-occurrence array
1258 This is the lexical analyzer for regular expressions.
1260 enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
1261 Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
1262 Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
1265 #ifndef QT_NO_REGEXP_INTERVAL
1266 int getRep(int def);
1268 #ifndef QT_NO_REGEXP_LOOKAHEAD
1269 void skipChars(int n);
1271 void error(const char *msg);
1272 void startTokenizer(const QChar *rx, int len);
1275 const QChar *yyIn; // a pointer to the input regular expression pattern
1276 int yyPos0; // the position of yyTok in the input pattern
1277 int yyPos; // the position of the next character to read
1278 int yyLen; // the length of yyIn
1279 int yyCh; // the last character read
1280 QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens
1281 int yyMinRep; // attribute for Tok_Quantifier
1282 int yyMaxRep; // ditto
1283 QString yyError; // syntax error or overflow during parsing?
1286 This is the syntactic analyzer for regular expressions.
1288 int parse(const QChar *rx, int len);
1289 void parseAtom(Box *box);
1290 void parseFactor(Box *box);
1291 void parseTerm(Box *box);
1292 void parseExpression(Box *box);
1294 int yyTok; // the last token read
1295 bool yyMayCapture; // set this to false to disable capturing
1297 friend struct QRegExpMatchState;
1300 #ifndef QT_NO_REGEXP_LOOKAHEAD
1302 The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
1303 (?=foo) and (?!bar)).
1305 struct QRegExpLookahead
1307 QRegExpEngine *eng; // NFA representing the embedded regular expression
1308 bool neg; // negative lookahead?
1310 inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)
1311 : eng(eng0), neg(neg0) { }
1312 inline ~QRegExpLookahead() { delete eng; }
1317 convert the pattern string to the RegExp syntax.
1319 This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan
1321 Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax)
1323 switch (patternSyntax) {
1324 #ifndef QT_NO_REGEXP_WILDCARD
1325 case QRegExp::Wildcard:
1326 return wc2rx(pattern, false);
1328 case QRegExp::WildcardUnix:
1329 return wc2rx(pattern, true);
1332 case QRegExp::FixedString:
1333 return QRegExp::escape(pattern);
1335 case QRegExp::W3CXmlSchema11:
1341 QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
1342 : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),
1343 xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)
1347 QString rx = qt_regexp_toCanonical(key.pattern, key.patternSyntax);
1349 valid = (parse(rx.unicode(), rx.length()) == rx.length());
1351 #ifndef QT_NO_REGEXP_OPTIM
1354 error(RXERR_LEFTDELIM);
1358 QRegExpEngine::~QRegExpEngine()
1360 #ifndef QT_NO_REGEXP_LOOKAHEAD
1365 void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
1368 We use one QVector<int> for all the big data used a lot in
1369 matchHere() and friends.
1371 int ns = eng->s.size(); // number of states
1372 int ncap = eng->ncap;
1373 #ifndef QT_NO_REGEXP_OPTIM
1374 int newSlideTabSize = qMax(eng->minl + 1, 16);
1376 int newSlideTabSize = 0;
1378 int numCaptures = eng->captureCount();
1379 int newCapturedSize = 2 + 2 * numCaptures;
1380 bigArray = q_check_ptr((int *)realloc(bigArray, ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));
1382 // set all internal variables only _after_ bigArray is realloc'ed
1383 // to prevent a broken regexp in oom case
1385 slideTabSize = newSlideTabSize;
1386 capturedSize = newCapturedSize;
1387 inNextStack = bigArray;
1388 memset(inNextStack, -1, ns * sizeof(int));
1389 curStack = inNextStack + ns;
1390 nextStack = inNextStack + 2 * ns;
1392 curCapBegin = inNextStack + 3 * ns;
1393 nextCapBegin = curCapBegin + ncap * ns;
1394 curCapEnd = curCapBegin + 2 * ncap * ns;
1395 nextCapEnd = curCapBegin + 3 * ncap * ns;
1397 tempCapBegin = curCapBegin + 4 * ncap * ns;
1398 tempCapEnd = tempCapBegin + ncap;
1399 capBegin = tempCapBegin + 2 * ncap;
1400 capEnd = tempCapBegin + 3 * ncap;
1402 slideTab = tempCapBegin + 4 * ncap;
1403 captured = slideTab + slideTabSize;
1404 memset(captured, -1, capturedSize*sizeof(int));
1409 Tries to match in str and returns an array of (begin, length) pairs
1410 for captured text. If there is no match, all pairs are (-1, -1).
1412 void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,
1413 bool minimal0, bool oneTest, int caretIndex)
1415 bool matched = false;
1418 #ifndef QT_NO_REGEXP_OPTIM
1419 if (eng->trivial && !oneTest) {
1420 pos = qFindString(str0, len0, pos0, eng->goodStr.unicode(), eng->goodStr.length(), eng->cs);
1421 matchLen = eng->goodStr.length();
1422 matched = (pos != -1);
1430 caretPos = caretIndex;
1434 oneTestMatchedLen = 0;
1436 if (eng->valid && pos >= 0 && pos <= len) {
1437 #ifndef QT_NO_REGEXP_OPTIM
1439 matched = matchHere();
1441 if (pos <= len - eng->minl) {
1442 if (eng->caretAnchored) {
1443 matched = matchHere();
1444 } else if (eng->useGoodStringHeuristic) {
1445 matched = eng->goodStringMatch(*this);
1447 matched = eng->badCharMatch(*this);
1452 matched = oneTest ? matchHere() : eng->bruteMatch(*this);
1462 int numCaptures = (capturedSize - 2) >> 1;
1463 #ifndef QT_NO_REGEXP_CAPTURE
1464 for (int i = 0; i < numCaptures; ++i) {
1465 int j = eng->captureForOfficialCapture.at(i);
1466 if (capBegin[j] != EmptyCapture) {
1467 int len = capEnd[j] - capBegin[j];
1468 *c++ = (len > 0) ? pos + capBegin[j] : 0;
1477 // we rely on 2's complement here
1478 memset(captured, -1, capturedSize * sizeof(int));
1483 The three following functions add one state to the automaton and
1484 return the number of the state.
1487 int QRegExpEngine::createState(QChar ch)
1489 return setupState(ch.unicode());
1492 int QRegExpEngine::createState(const QRegExpCharClass &cc)
1494 #ifndef QT_NO_REGEXP_CCLASS
1496 cl += QRegExpCharClass(cc);
1497 return setupState(CharClassBit | n);
1500 return setupState(CharClassBit);
1504 #ifndef QT_NO_REGEXP_BACKREF
1505 int QRegExpEngine::createState(int bref)
1507 if (bref > nbrefs) {
1509 if (nbrefs > MaxBackRefs) {
1514 return setupState(BackRefBit | bref);
1519 The two following functions add a transition between all pairs of
1520 states (i, j) where i is found in from, and j is found in to.
1522 Cat-transitions are distinguished from plus-transitions for
1526 void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to)
1528 for (int i = 0; i < from.size(); i++)
1529 mergeInto(&s[from.at(i)].outs, to);
1532 #ifndef QT_NO_REGEXP_CAPTURE
1533 void QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom)
1535 for (int i = 0; i < from.size(); i++) {
1536 QRegExpAutomatonState &st = s[from.at(i)];
1537 const QVector<int> oldOuts = st.outs;
1538 mergeInto(&st.outs, to);
1539 if (f.at(atom).capture != QRegExpAtom::NoCapture) {
1540 for (int j = 0; j < to.size(); j++) {
1541 // ### st.reenter.contains(to.at(j)) check looks suspicious
1542 if (!st.reenter.contains(to.at(j)) &&
1543 qBinaryFind(oldOuts.constBegin(), oldOuts.constEnd(), to.at(j)) == oldOuts.end())
1544 st.reenter.insert(to.at(j), atom);
1551 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1553 Returns an anchor that means a OR b.
1555 int QRegExpEngine::anchorAlternation(int a, int b)
1557 if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0)
1561 #ifndef QT_NO_REGEXP_OPTIM
1562 if (n > 0 && aa.at(n - 1).a == a && aa.at(n - 1).b == b)
1563 return Anchor_Alternation | (n - 1);
1566 QRegExpAnchorAlternation element = {a, b};
1568 return Anchor_Alternation | n;
1572 Returns an anchor that means a AND b.
1574 int QRegExpEngine::anchorConcatenation(int a, int b)
1576 if (((a | b) & Anchor_Alternation) == 0)
1578 if ((b & Anchor_Alternation) != 0)
1581 int aprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).a, b);
1582 int bprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).b, b);
1583 return anchorAlternation(aprime, bprime);
1588 Adds anchor a on a transition caracterised by its from state and
1591 void QRegExpEngine::addAnchors(int from, int to, int a)
1593 QRegExpAutomatonState &st = s[from];
1594 if (st.anchors.contains(to))
1595 a = anchorAlternation(st.anchors.value(to), a);
1596 st.anchors.insert(to, a);
1599 #ifndef QT_NO_REGEXP_OPTIM
1601 This function chooses between the good-string and the bad-character
1602 heuristics. It computes two scores and chooses the heuristic with
1605 Here are some common-sense constraints on the scores that should be
1606 respected if the formulas are ever modified: (1) If goodStr is
1607 empty, the good-string heuristic scores 0. (2) If the regular
1608 expression is trivial, the good-string heuristic should be used.
1609 (3) If the search is case insensitive, the good-string heuristic
1610 should be used, unless it scores 0. (Case insensitivity turns all
1611 entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1612 big, the good-string heuristic should score less.
1614 void QRegExpEngine::heuristicallyChooseHeuristic()
1617 useGoodStringHeuristic = false;
1618 } else if (trivial) {
1619 useGoodStringHeuristic = true;
1622 Magic formula: The good string has to constitute a good
1623 proportion of the minimum-length string, and appear at a
1624 more-or-less known index.
1626 int goodStringScore = (64 * goodStr.length() / minl) -
1627 (goodLateStart - goodEarlyStart);
1629 Less magic formula: We pick some characters at random, and
1630 check whether they are good or bad.
1632 int badCharScore = 0;
1633 int step = qMax(1, NumBadChars / 32);
1634 for (int i = 1; i < NumBadChars; i += step) {
1635 if (occ1.at(i) == NoOccurrence)
1636 badCharScore += minl;
1638 badCharScore += occ1.at(i);
1640 badCharScore /= minl;
1641 useGoodStringHeuristic = (goodStringScore > badCharScore);
1646 #if defined(QT_DEBUG)
1647 void QRegExpEngine::dump() const
1650 qDebug("Case %ssensitive engine", cs ? "" : "in");
1652 for (i = 0; i < s.size(); i++) {
1653 qDebug(" %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
1654 #ifndef QT_NO_REGEXP_CAPTURE
1656 qDebug(" in atom %d", s[i].atom);
1659 if ((m & CharClassBit) != 0) {
1660 qDebug(" match character class %d", m ^ CharClassBit);
1661 #ifndef QT_NO_REGEXP_CCLASS
1662 cl[m ^ CharClassBit].dump();
1664 qDebug(" negative character class");
1666 } else if ((m & BackRefBit) != 0) {
1667 qDebug(" match back-reference %d", m ^ BackRefBit);
1668 } else if (m >= 0x20 && m <= 0x7e) {
1669 qDebug(" match 0x%.4x (%c)", m, m);
1671 qDebug(" match 0x%.4x", m);
1673 for (j = 0; j < s[i].outs.size(); j++) {
1674 int next = s[i].outs[j];
1675 qDebug(" -> %d", next);
1676 if (s[i].reenter.contains(next))
1677 qDebug(" [reenter %d]", s[i].reenter[next]);
1678 if (s[i].anchors.value(next) != 0)
1679 qDebug(" [anchors 0x%.8x]", s[i].anchors[next]);
1682 #ifndef QT_NO_REGEXP_CAPTURE
1684 qDebug(" Atom Parent Capture");
1685 for (i = 0; i < nf; i++) {
1686 if (f[i].capture == QRegExpAtom::NoCapture) {
1687 qDebug(" %6d %6d nil", i, f[i].parent);
1689 int cap = f[i].capture;
1690 bool official = captureForOfficialCapture.contains(cap);
1691 qDebug(" %6d %6d %6d %s", i, f[i].parent, f[i].capture,
1692 official ? "official" : "");
1697 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1698 for (i = 0; i < aa.size(); i++)
1699 qDebug(" Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b);
1704 void QRegExpEngine::setup()
1707 #ifndef QT_NO_REGEXP_CAPTURE
1714 #ifndef QT_NO_REGEXP_OPTIM
1715 caretAnchored = true;
1719 #ifndef QT_NO_REGEXP_BACKREF
1722 #ifndef QT_NO_REGEXP_OPTIM
1723 useGoodStringHeuristic = true;
1725 occ1.fill(0, NumBadChars);
1729 int QRegExpEngine::setupState(int match)
1731 #ifndef QT_NO_REGEXP_CAPTURE
1732 s += QRegExpAutomatonState(cf, match);
1734 s += QRegExpAutomatonState(match);
1736 return s.size() - 1;
1739 #ifndef QT_NO_REGEXP_CAPTURE
1741 Functions startAtom() and finishAtom() should be called to delimit
1742 atoms. When a state is created, it is assigned to the current atom.
1743 The information is later used for capturing.
1745 int QRegExpEngine::startAtom(bool officialCapture)
1747 if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size())
1748 f.resize((nf + 1) << 1);
1751 f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;
1755 void QRegExpEngine::finishAtom(int atom, bool needCapture)
1757 if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture)
1758 f[atom].capture = QRegExpAtom::UnofficialCapture;
1759 cf = f.at(atom).parent;
1763 #ifndef QT_NO_REGEXP_LOOKAHEAD
1765 Creates a lookahead anchor.
1767 int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative)
1769 int n = ahead.size();
1770 if (n == MaxLookaheads) {
1774 ahead += new QRegExpLookahead(eng, negative);
1775 return Anchor_FirstLookahead << n;
1779 #ifndef QT_NO_REGEXP_CAPTURE
1781 We want the longest leftmost captures.
1783 static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
1786 for (int i = 0; i < ncap; i++) {
1787 int delta = begin2[i] - begin1[i]; // it has to start early...
1789 delta = end1[i] - end2[i]; // ...and end late
1799 Returns true if anchor a matches at position pos + i in the input
1800 string, otherwise false.
1802 bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)
1806 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1807 if ((a & QRegExpEngine::Anchor_Alternation) != 0)
1808 return testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)
1809 || testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);
1812 if ((a & QRegExpEngine::Anchor_Caret) != 0) {
1813 if (pos + i != caretPos)
1816 if ((a & QRegExpEngine::Anchor_Dollar) != 0) {
1820 #ifndef QT_NO_REGEXP_ESCAPE
1821 if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) {
1822 bool before = false;
1825 before = isWord(in[pos + i - 1]);
1827 after = isWord(in[pos + i]);
1828 if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after))
1830 if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after))
1834 #ifndef QT_NO_REGEXP_LOOKAHEAD
1835 if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) {
1836 const QVector<QRegExpLookahead *> &ahead = eng->ahead;
1837 for (j = 0; j < ahead.size(); j++) {
1838 if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) {
1839 QRegExpMatchState matchState;
1840 matchState.prepareForMatch(ahead[j]->eng);
1841 matchState.match(in + pos + i, len - pos - i, 0,
1842 true, true, matchState.caretPos - matchState.pos - i);
1843 if ((matchState.captured[0] == 0) == ahead[j]->neg)
1849 #ifndef QT_NO_REGEXP_CAPTURE
1850 #ifndef QT_NO_REGEXP_BACKREF
1851 for (j = 0; j < eng->nbrefs; j++) {
1852 if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) {
1853 int i = eng->captureForOfficialCapture.at(j);
1854 if (capBegin[i] != EmptyCapture)
1863 #ifndef QT_NO_REGEXP_OPTIM
1865 The three following functions are what Jeffrey Friedl would call
1866 transmissions (or bump-alongs). Using one or the other should make
1867 no difference except in performance.
1870 bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
1872 int k = matchState.pos + goodEarlyStart;
1873 QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs);
1874 while ((k = matcher.indexIn(matchState.in, matchState.len, k)) != -1) {
1875 int from = k - goodLateStart;
1876 int to = k - goodEarlyStart;
1877 if (from > matchState.pos)
1878 matchState.pos = from;
1880 while (matchState.pos <= to) {
1881 if (matchState.matchHere())
1890 bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
1895 int lastPos = matchState.len - minl;
1896 memset(matchState.slideTab, 0, matchState.slideTabSize * sizeof(int));
1899 Set up the slide table, used for the bad-character heuristic,
1900 using the table of first occurrence of each character.
1902 for (i = 0; i < minl; i++) {
1903 int sk = occ1[BadChar(matchState.in[matchState.pos + i])];
1904 if (sk == NoOccurrence)
1912 if (sk > matchState.slideTab[k])
1913 matchState.slideTab[k] = sk;
1917 if (matchState.pos > lastPos)
1921 if (++slideNext >= matchState.slideTabSize)
1923 if (matchState.slideTab[slideHead] > 0) {
1924 if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext])
1925 matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1;
1926 matchState.slideTab[slideHead] = 0;
1928 if (matchState.matchHere())
1932 if (matchState.pos == lastPos)
1936 Update the slide table. This code has much in common with
1937 the initialization code.
1939 int sk = occ1[BadChar(matchState.in[matchState.pos + minl])];
1940 if (sk == NoOccurrence) {
1941 matchState.slideTab[slideNext] = minl;
1942 } else if (sk > 0) {
1943 int k = slideNext + minl - sk;
1944 if (k >= matchState.slideTabSize)
1945 k -= matchState.slideTabSize;
1946 if (sk > matchState.slideTab[k])
1947 matchState.slideTab[k] = sk;
1949 slideHead = slideNext;
1955 bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
1957 while (matchState.pos <= matchState.len) {
1958 if (matchState.matchHere())
1967 Here's the core of the engine. It tries to do a match here and now.
1969 bool QRegExpMatchState::matchHere()
1971 int ncur = 1, nnext = 0;
1976 oneTestMatchedLen = -1;
1977 curStack[0] = QRegExpEngine::InitialState;
1979 int ncap = eng->ncap;
1980 #ifndef QT_NO_REGEXP_CAPTURE
1982 for (j = 0; j < ncap; j++) {
1983 curCapBegin[j] = EmptyCapture;
1984 curCapEnd[j] = EmptyCapture;
1989 #ifndef QT_NO_REGEXP_BACKREF
1990 while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop)
1992 while (ncur > 0 && i <= len - pos && !stop)
1995 int ch = (i < len - pos) ? in[pos + i].unicode() : 0;
1996 for (j = 0; j < ncur; j++) {
1997 int cur = curStack[j];
1998 const QRegExpAutomatonState &scur = eng->s.at(cur);
1999 const QVector<int> &outs = scur.outs;
2000 for (k = 0; k < outs.size(); k++) {
2001 int next = outs.at(k);
2002 const QRegExpAutomatonState &snext = eng->s.at(next);
2004 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2005 int needSomeSleep = 0;
2009 First, check if the anchors are anchored properly.
2011 int a = scur.anchors.value(next);
2012 if (a != 0 && !testAnchor(i, a, curCapBegin + j * ncap))
2016 If indeed they are, check if the input character is
2017 correct for this transition.
2021 if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) {
2025 inside = (QChar(m).toLower() == QChar(ch).toLower());
2026 } else if (next == QRegExpEngine::FinalState) {
2030 } else if ((m & QRegExpEngine::CharClassBit) != 0) {
2031 #ifndef QT_NO_REGEXP_CCLASS
2032 const QRegExpCharClass &cc = eng->cl.at(m ^ QRegExpEngine::CharClassBit);
2035 else if (cc.negative())
2036 inside = cc.in(QChar(ch).toLower()) &&
2037 cc.in(QChar(ch).toUpper());
2039 inside = cc.in(QChar(ch).toLower()) ||
2040 cc.in(QChar(ch).toUpper());
2042 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2043 } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */
2044 int bref = m ^ QRegExpEngine::BackRefBit;
2045 int ell = j * ncap + eng->captureForOfficialCapture.at(bref - 1);
2047 inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
2050 inside = (in[pos + curCapBegin[ell]] == QChar(ch));
2052 inside = (in[pos + curCapBegin[ell]].toLower()
2053 == QChar(ch).toLower());
2058 if (curCapEnd[ell] == EmptyCapture)
2059 delta = i - curCapBegin[ell];
2061 delta = curCapEnd[ell] - curCapBegin[ell];
2063 inside = (delta <= len - (pos + i));
2064 if (inside && delta > 1) {
2068 if (in[pos + curCapBegin[ell] + n]
2075 QChar a = in[pos + curCapBegin[ell] + n];
2076 QChar b = in[pos + i + n];
2077 if (a.toLower() != b.toLower())
2082 inside = (n == delta);
2084 needSomeSleep = delta - 1;
2092 We must now update our data structures.
2095 #ifndef QT_NO_REGEXP_CAPTURE
2096 int *capBegin, *capEnd;
2099 If the next state was not encountered yet, all
2102 if ((m = inNextStack[next]) == -1) {
2104 nextStack[m] = next;
2105 inNextStack[next] = m;
2106 #ifndef QT_NO_REGEXP_CAPTURE
2107 capBegin = nextCapBegin + m * ncap;
2108 capEnd = nextCapEnd + m * ncap;
2111 Otherwise, we'll first maintain captures in
2112 temporary arrays, and decide at the end whether
2113 it's best to keep the previous capture zones or
2117 capBegin = tempCapBegin;
2118 capEnd = tempCapEnd;
2122 #ifndef QT_NO_REGEXP_CAPTURE
2124 Updating the capture zones is much of a task.
2127 memcpy(capBegin, curCapBegin + j * ncap, ncap * sizeof(int));
2128 memcpy(capEnd, curCapEnd + j * ncap, ncap * sizeof(int));
2129 int c = scur.atom, n = snext.atom;
2134 Lemma 1. For any x in the range [0..nf), we
2135 have f[x].parent < x.
2137 Proof. By looking at startAtom(), it is
2138 clear that cf < nf holds all the time, and
2139 thus that f[nf].parent < nf.
2143 If we are reentering an atom, we empty all
2144 capture zones inside it.
2146 if ((q = scur.reenter.value(next)) != 0) {
2147 QBitArray b(eng->nf, false);
2149 for (int ell = q + 1; ell < eng->nf; ell++) {
2150 if (b.testBit(eng->f.at(ell).parent)) {
2151 b.setBit(ell, true);
2152 cap = eng->f.at(ell).capture;
2154 capBegin[cap] = EmptyCapture;
2155 capEnd[cap] = EmptyCapture;
2159 p = eng->f.at(q).parent;
2162 Otherwise, close the capture zones we are
2163 leaving. We are leaving f[c].capture,
2164 f[f[c].parent].capture,
2165 f[f[f[c].parent].parent].capture, ...,
2166 until f[x].capture, with x such that
2167 f[x].parent is the youngest common ancestor
2170 We go up along c's and n's ancestry until
2178 cap = eng->f.at(p).capture;
2180 if (capBegin[cap] == i) {
2181 capBegin[cap] = EmptyCapture;
2182 capEnd[cap] = EmptyCapture;
2187 p = eng->f.at(p).parent;
2189 q = eng->f.at(q).parent;
2195 In any case, we now open the capture zones
2196 we are entering. We work upwards from n
2197 until we reach p (the parent of the atom we
2198 reenter or the youngest common ancestor).
2201 cap = eng->f.at(n).capture;
2204 capEnd[cap] = EmptyCapture;
2206 n = eng->f.at(n).parent;
2209 If the next state was already in
2210 nextStack, we must choose carefully which
2211 capture zones we want to keep.
2213 if (capBegin == tempCapBegin &&
2214 isBetterCapture(ncap, capBegin, capEnd, nextCapBegin + m * ncap,
2215 nextCapEnd + m * ncap)) {
2216 memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2217 memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2220 #ifndef QT_NO_REGEXP_BACKREF
2222 We are done with updating the capture zones.
2223 It's now time to put the next state to sleep,
2224 if it needs to, and to remove it from
2227 if (needSomeSleep > 0) {
2228 QVector<int> zzZ(2 + 2 * ncap);
2229 zzZ[0] = i + needSomeSleep;
2232 memcpy(zzZ.data() + 2, capBegin, ncap * sizeof(int));
2233 memcpy(zzZ.data() + 2 + ncap, capEnd, ncap * sizeof(int));
2235 inNextStack[nextStack[--nnext]] = -1;
2236 sleeping.append(zzZ);
2243 #ifndef QT_NO_REGEXP_CAPTURE
2245 If we reached the final state, hurray! Copy the captured
2248 if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {
2249 memcpy(capBegin, nextCapBegin + m * ncap, ncap * sizeof(int));
2250 memcpy(capEnd, nextCapEnd + m * ncap, ncap * sizeof(int));
2252 #ifndef QT_NO_REGEXP_BACKREF
2254 It's time to wake up the sleepers.
2257 while (j < sleeping.count()) {
2258 if (sleeping.at(j)[0] == i) {
2259 const QVector<int> &zzZ = sleeping.at(j);
2261 const int *capBegin = zzZ.data() + 2;
2262 const int *capEnd = zzZ.data() + 2 + ncap;
2263 bool copyOver = true;
2265 if ((m = inNextStack[next]) == -1) {
2267 nextStack[m] = next;
2268 inNextStack[next] = m;
2270 copyOver = isBetterCapture(ncap, nextCapBegin + m * ncap, nextCapEnd + m * ncap,
2274 memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2275 memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2278 sleeping.removeAt(j);
2285 for (j = 0; j < nnext; j++)
2286 inNextStack[nextStack[j]] = -1;
2288 // avoid needless iteration that confuses oneTestMatchedLen
2289 if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState
2290 #ifndef QT_NO_REGEXP_BACKREF
2291 && sleeping.isEmpty()
2296 qSwap(curStack, nextStack);
2297 #ifndef QT_NO_REGEXP_CAPTURE
2298 qSwap(curCapBegin, nextCapBegin);
2299 qSwap(curCapEnd, nextCapEnd);
2306 #ifndef QT_NO_REGEXP_BACKREF
2308 If minimal matching is enabled, we might have some sleepers
2311 if (!sleeping.isEmpty())
2315 oneTestMatchedLen = i - 1;
2316 return (matchLen >= 0);
2319 #ifndef QT_NO_REGEXP_CCLASS
2321 QRegExpCharClass::QRegExpCharClass()
2324 #ifndef QT_NO_REGEXP_OPTIM
2325 occ1.fill(NoOccurrence, NumBadChars);
2329 QRegExpCharClass &QRegExpCharClass::operator=(const QRegExpCharClass &cc)
2334 #ifndef QT_NO_REGEXP_OPTIM
2340 void QRegExpCharClass::clear()
2347 void QRegExpCharClass::setNegative(bool negative)
2350 #ifndef QT_NO_REGEXP_OPTIM
2351 occ1.fill(0, NumBadChars);
2355 void QRegExpCharClass::addCategories(uint cats)
2357 static const int all_cats = FLAG(QChar::Mark_NonSpacing) |
2358 FLAG(QChar::Mark_SpacingCombining) |
2359 FLAG(QChar::Mark_Enclosing) |
2360 FLAG(QChar::Number_DecimalDigit) |
2361 FLAG(QChar::Number_Letter) |
2362 FLAG(QChar::Number_Other) |
2363 FLAG(QChar::Separator_Space) |
2364 FLAG(QChar::Separator_Line) |
2365 FLAG(QChar::Separator_Paragraph) |
2366 FLAG(QChar::Other_Control) |
2367 FLAG(QChar::Other_Format) |
2368 FLAG(QChar::Other_Surrogate) |
2369 FLAG(QChar::Other_PrivateUse) |
2370 FLAG(QChar::Other_NotAssigned) |
2371 FLAG(QChar::Letter_Uppercase) |
2372 FLAG(QChar::Letter_Lowercase) |
2373 FLAG(QChar::Letter_Titlecase) |
2374 FLAG(QChar::Letter_Modifier) |
2375 FLAG(QChar::Letter_Other) |
2376 FLAG(QChar::Punctuation_Connector) |
2377 FLAG(QChar::Punctuation_Dash) |
2378 FLAG(QChar::Punctuation_Open) |
2379 FLAG(QChar::Punctuation_Close) |
2380 FLAG(QChar::Punctuation_InitialQuote) |
2381 FLAG(QChar::Punctuation_FinalQuote) |
2382 FLAG(QChar::Punctuation_Other) |
2383 FLAG(QChar::Symbol_Math) |
2384 FLAG(QChar::Symbol_Currency) |
2385 FLAG(QChar::Symbol_Modifier) |
2386 FLAG(QChar::Symbol_Other);
2387 c |= (all_cats & cats);
2388 #ifndef QT_NO_REGEXP_OPTIM
2389 occ1.fill(0, NumBadChars);
2393 void QRegExpCharClass::addRange(ushort from, ushort to)
2400 r[m].len = to - from + 1;
2402 #ifndef QT_NO_REGEXP_OPTIM
2405 if (to - from < NumBadChars) {
2406 if (from % NumBadChars <= to % NumBadChars) {
2407 for (i = from % NumBadChars; i <= to % NumBadChars; i++)
2410 for (i = 0; i <= to % NumBadChars; i++)
2412 for (i = from % NumBadChars; i < NumBadChars; i++)
2416 occ1.fill(0, NumBadChars);
2421 bool QRegExpCharClass::in(QChar ch) const
2423 #ifndef QT_NO_REGEXP_OPTIM
2424 if (occ1.at(BadChar(ch)) == NoOccurrence)
2428 if (c != 0 && (c & FLAG(ch.category())) != 0)
2431 const int uc = ch.unicode();
2432 int size = r.size();
2434 for (int i = 0; i < size; ++i) {
2435 const QRegExpCharClassRange &range = r.at(i);
2436 if (uint(uc - range.from) < uint(r.at(i).len))
2442 #if defined(QT_DEBUG)
2443 void QRegExpCharClass::dump() const
2446 qDebug(" %stive character class", n ? "nega" : "posi");
2447 #ifndef QT_NO_REGEXP_CCLASS
2449 qDebug(" categories 0x%.8x", c);
2451 for (i = 0; i < r.size(); i++)
2452 qDebug(" 0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);
2457 QRegExpEngine::Box::Box(QRegExpEngine *engine)
2458 : eng(engine), skipanchors(0)
2459 #ifndef QT_NO_REGEXP_OPTIM
2460 , earlyStart(0), lateStart(0), maxl(0)
2463 #ifndef QT_NO_REGEXP_OPTIM
2464 occ1.fill(NoOccurrence, NumBadChars);
2469 QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
2474 lanchors = b.lanchors;
2475 ranchors = b.ranchors;
2476 skipanchors = b.skipanchors;
2477 #ifndef QT_NO_REGEXP_OPTIM
2478 earlyStart = b.earlyStart;
2479 lateStart = b.lateStart;
2481 leftStr = b.leftStr;
2482 rightStr = b.rightStr;
2490 void QRegExpEngine::Box::set(QChar ch)
2493 ls[0] = eng->createState(ch);
2495 #ifndef QT_NO_REGEXP_OPTIM
2500 occ1[BadChar(ch)] = 0;
2505 void QRegExpEngine::Box::set(const QRegExpCharClass &cc)
2508 ls[0] = eng->createState(cc);
2510 #ifndef QT_NO_REGEXP_OPTIM
2512 occ1 = cc.firstOccurrence();
2517 #ifndef QT_NO_REGEXP_BACKREF
2518 void QRegExpEngine::Box::set(int bref)
2521 ls[0] = eng->createState(bref);
2523 if (bref >= 1 && bref <= MaxBackRefs)
2524 skipanchors = Anchor_BackRef0Empty << bref;
2525 #ifndef QT_NO_REGEXP_OPTIM
2532 void QRegExpEngine::Box::cat(const Box &b)
2534 eng->addCatTransitions(rs, b.ls);
2535 addAnchorsToEngine(b);
2537 lanchors.unite(b.lanchors);
2538 if (skipanchors != 0) {
2539 for (int i = 0; i < b.ls.size(); i++) {
2540 int a = eng->anchorConcatenation(lanchors.value(b.ls.at(i), 0), skipanchors);
2541 lanchors.insert(b.ls.at(i), a);
2544 mergeInto(&ls, b.ls);
2547 ranchors.unite(b.ranchors);
2548 if (b.skipanchors != 0) {
2549 for (int i = 0; i < rs.size(); i++) {
2550 int a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), b.skipanchors);
2551 ranchors.insert(rs.at(i), a);
2554 mergeInto(&rs, b.rs);
2556 ranchors = b.ranchors;
2560 #ifndef QT_NO_REGEXP_OPTIM
2561 if (maxl != InftyLen) {
2562 if (rightStr.length() + b.leftStr.length() >
2563 qMax(str.length(), b.str.length())) {
2564 earlyStart = minl - rightStr.length();
2565 lateStart = maxl - rightStr.length();
2566 str = rightStr + b.leftStr;
2567 } else if (b.str.length() > str.length()) {
2568 earlyStart = minl + b.earlyStart;
2569 lateStart = maxl + b.lateStart;
2574 if (leftStr.length() == maxl)
2575 leftStr += b.leftStr;
2577 if (b.rightStr.length() == b.maxl) {
2578 rightStr += b.rightStr;
2580 rightStr = b.rightStr;
2583 if (maxl == InftyLen || b.maxl == InftyLen) {
2589 for (int i = 0; i < NumBadChars; i++) {
2590 if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))
2591 occ1[i] = minl + b.occ1.at(i);
2597 skipanchors = eng->anchorConcatenation(skipanchors, b.skipanchors);
2602 void QRegExpEngine::Box::orx(const Box &b)
2604 mergeInto(&ls, b.ls);
2605 lanchors.unite(b.lanchors);
2606 mergeInto(&rs, b.rs);
2607 ranchors.unite(b.ranchors);
2611 skipanchors = eng->anchorAlternation(skipanchors, b.skipanchors);
2613 skipanchors = b.skipanchors;
2616 #ifndef QT_NO_REGEXP_OPTIM
2617 for (int i = 0; i < NumBadChars; i++) {
2618 if (occ1.at(i) > b.occ1.at(i))
2619 occ1[i] = b.occ1.at(i);
2624 leftStr = QString();
2625 rightStr = QString();
2633 void QRegExpEngine::Box::plus(int atom)
2635 #ifndef QT_NO_REGEXP_CAPTURE
2636 eng->addPlusTransitions(rs, ls, atom);
2639 eng->addCatTransitions(rs, ls);
2641 addAnchorsToEngine(*this);
2642 #ifndef QT_NO_REGEXP_OPTIM
2647 void QRegExpEngine::Box::opt()
2649 #ifndef QT_NO_REGEXP_OPTIM
2653 leftStr = QString();
2654 rightStr = QString();
2660 void QRegExpEngine::Box::catAnchor(int a)
2663 for (int i = 0; i < rs.size(); i++) {
2664 a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), a);
2665 ranchors.insert(rs.at(i), a);
2668 skipanchors = eng->anchorConcatenation(skipanchors, a);
2672 #ifndef QT_NO_REGEXP_OPTIM
2673 void QRegExpEngine::Box::setupHeuristics()
2675 eng->goodEarlyStart = earlyStart;
2676 eng->goodLateStart = lateStart;
2677 eng->goodStr = eng->cs ? str : str.toLower();
2682 A regular expression such as 112|1 has occ1['2'] = 2 and minl =
2683 1 at this point. An entry of occ1 has to be at most minl or
2684 infinity for the rest of the algorithm to go well.
2686 We waited until here before normalizing these cases (instead of
2687 doing it in Box::orx()) because sometimes things improve by
2688 themselves. Consider for example (112|1)34.
2690 for (int i = 0; i < NumBadChars; i++) {
2691 if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)
2696 eng->occ1.fill(0, NumBadChars);
2699 eng->heuristicallyChooseHeuristic();
2703 #if defined(QT_DEBUG)
2704 void QRegExpEngine::Box::dump() const
2707 qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s");
2708 qDebug(" Left states:");
2709 for (i = 0; i < ls.size(); i++) {
2710 if (lanchors.value(ls[i], 0) == 0)
2711 qDebug(" %d", ls[i]);
2713 qDebug(" %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);
2715 qDebug(" Right states:");
2716 for (i = 0; i < rs.size(); i++) {
2717 if (ranchors.value(rs[i], 0) == 0)
2718 qDebug(" %d", rs[i]);
2720 qDebug(" %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);
2722 qDebug(" Skip anchors: 0x%.8x", skipanchors);
2726 void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const
2728 for (int i = 0; i < to.ls.size(); i++) {
2729 for (int j = 0; j < rs.size(); j++) {
2730 int a = eng->anchorConcatenation(ranchors.value(rs.at(j), 0),
2731 to.lanchors.value(to.ls.at(i), 0));
2732 eng->addAnchors(rs[j], to.ls[i], a);
2737 #ifndef QT_NO_REGEXP_CCLASS
2738 // fast lookup hash for xml schema extensions
2739 // sorted by name for b-search
2740 static const struct CategoriesRangeMapEntry {
2741 const char name[40];
2743 } categoriesRangeMap[] = {
2744 { "AegeanNumbers", 0x10100, 0x1013F },
2745 { "AlphabeticPresentationForms", 0xFB00, 0xFB4F },
2746 { "AncientGreekMusicalNotation", 0x1D200, 0x1D24F },
2747 { "AncientGreekNumbers", 0x10140, 0x1018F },
2748 { "Arabic", 0x0600, 0x06FF },
2749 { "ArabicPresentationForms-A", 0xFB50, 0xFDFF },
2750 { "ArabicPresentationForms-B", 0xFE70, 0xFEFF },
2751 { "ArabicSupplement", 0x0750, 0x077F },
2752 { "Armenian", 0x0530, 0x058F },
2753 { "Arrows", 0x2190, 0x21FF },
2754 { "BasicLatin", 0x0000, 0x007F },
2755 { "Bengali", 0x0980, 0x09FF },
2756 { "BlockElements", 0x2580, 0x259F },
2757 { "Bopomofo", 0x3100, 0x312F },
2758 { "BopomofoExtended", 0x31A0, 0x31BF },
2759 { "BoxDrawing", 0x2500, 0x257F },
2760 { "BraillePatterns", 0x2800, 0x28FF },
2761 { "Buginese", 0x1A00, 0x1A1F },
2762 { "Buhid", 0x1740, 0x175F },
2763 { "ByzantineMusicalSymbols", 0x1D000, 0x1D0FF },
2764 { "CJKCompatibility", 0x3300, 0x33FF },
2765 { "CJKCompatibilityForms", 0xFE30, 0xFE4F },
2766 { "CJKCompatibilityIdeographs", 0xF900, 0xFAFF },
2767 { "CJKCompatibilityIdeographsSupplement", 0x2F800, 0x2FA1F },
2768 { "CJKRadicalsSupplement", 0x2E80, 0x2EFF },
2769 { "CJKStrokes", 0x31C0, 0x31EF },
2770 { "CJKSymbolsandPunctuation", 0x3000, 0x303F },
2771 { "CJKUnifiedIdeographs", 0x4E00, 0x9FFF },
2772 { "CJKUnifiedIdeographsExtensionA", 0x3400, 0x4DB5 },
2773 { "CJKUnifiedIdeographsExtensionB", 0x20000, 0x2A6DF },
2774 { "Cherokee", 0x13A0, 0x13FF },
2775 { "CombiningDiacriticalMarks", 0x0300, 0x036F },
2776 { "CombiningDiacriticalMarksSupplement", 0x1DC0, 0x1DFF },
2777 { "CombiningHalfMarks", 0xFE20, 0xFE2F },
2778 { "CombiningMarksforSymbols", 0x20D0, 0x20FF },
2779 { "ControlPictures", 0x2400, 0x243F },
2780 { "Coptic", 0x2C80, 0x2CFF },
2781 { "CurrencySymbols", 0x20A0, 0x20CF },
2782 { "CypriotSyllabary", 0x10800, 0x1083F },
2783 { "Cyrillic", 0x0400, 0x04FF },
2784 { "CyrillicSupplement", 0x0500, 0x052F },
2785 { "Deseret", 0x10400, 0x1044F },
2786 { "Devanagari", 0x0900, 0x097F },
2787 { "Dingbats", 0x2700, 0x27BF },
2788 { "EnclosedAlphanumerics", 0x2460, 0x24FF },
2789 { "EnclosedCJKLettersandMonths", 0x3200, 0x32FF },
2790 { "Ethiopic", 0x1200, 0x137F },
2791 { "EthiopicExtended", 0x2D80, 0x2DDF },
2792 { "EthiopicSupplement", 0x1380, 0x139F },
2793 { "GeneralPunctuation", 0x2000, 0x206F },
2794 { "GeometricShapes", 0x25A0, 0x25FF },
2795 { "Georgian", 0x10A0, 0x10FF },
2796 { "GeorgianSupplement", 0x2D00, 0x2D2F },
2797 { "Glagolitic", 0x2C00, 0x2C5F },
2798 { "Gothic", 0x10330, 0x1034F },
2799 { "Greek", 0x0370, 0x03FF },
2800 { "GreekExtended", 0x1F00, 0x1FFF },
2801 { "Gujarati", 0x0A80, 0x0AFF },
2802 { "Gurmukhi", 0x0A00, 0x0A7F },
2803 { "HalfwidthandFullwidthForms", 0xFF00, 0xFFEF },
2804 { "HangulCompatibilityJamo", 0x3130, 0x318F },
2805 { "HangulJamo", 0x1100, 0x11FF },
2806 { "HangulSyllables", 0xAC00, 0xD7A3 },
2807 { "Hanunoo", 0x1720, 0x173F },
2808 { "Hebrew", 0x0590, 0x05FF },
2809 { "Hiragana", 0x3040, 0x309F },
2810 { "IPAExtensions", 0x0250, 0x02AF },
2811 { "IdeographicDescriptionCharacters", 0x2FF0, 0x2FFF },
2812 { "Kanbun", 0x3190, 0x319F },
2813 { "KangxiRadicals", 0x2F00, 0x2FDF },
2814 { "Kannada", 0x0C80, 0x0CFF },
2815 { "Katakana", 0x30A0, 0x30FF },
2816 { "KatakanaPhoneticExtensions", 0x31F0, 0x31FF },
2817 { "Kharoshthi", 0x10A00, 0x10A5F },
2818 { "Khmer", 0x1780, 0x17FF },
2819 { "KhmerSymbols", 0x19E0, 0x19FF },
2820 { "Lao", 0x0E80, 0x0EFF },
2821 { "Latin-1Supplement", 0x0080, 0x00FF },
2822 { "LatinExtended-A", 0x0100, 0x017F },
2823 { "LatinExtended-B", 0x0180, 0x024F },
2824 { "LatinExtendedAdditional", 0x1E00, 0x1EFF },
2825 { "LetterlikeSymbols", 0x2100, 0x214F },
2826 { "Limbu", 0x1900, 0x194F },
2827 { "LinearBIdeograms", 0x10080, 0x100FF },
2828 { "LinearBSyllabary", 0x10000, 0x1007F },
2829 { "Malayalam", 0x0D00, 0x0D7F },
2830 { "MathematicalAlphanumericSymbols", 0x1D400, 0x1D7FF },
2831 { "MathematicalOperators", 0x2200, 0x22FF },
2832 { "MiscellaneousMathematicalSymbols-A", 0x27C0, 0x27EF },
2833 { "MiscellaneousMathematicalSymbols-B", 0x2980, 0x29FF },
2834 { "MiscellaneousSymbols", 0x2600, 0x26FF },
2835 { "MiscellaneousSymbolsandArrows", 0x2B00, 0x2BFF },
2836 { "MiscellaneousTechnical", 0x2300, 0x23FF },
2837 { "ModifierToneLetters", 0xA700, 0xA71F },
2838 { "Mongolian", 0x1800, 0x18AF },
2839 { "MusicalSymbols", 0x1D100, 0x1D1FF },
2840 { "Myanmar", 0x1000, 0x109F },
2841 { "NewTaiLue", 0x1980, 0x19DF },
2842 { "NumberForms", 0x2150, 0x218F },
2843 { "Ogham", 0x1680, 0x169F },
2844 { "OldItalic", 0x10300, 0x1032F },
2845 { "OldPersian", 0x103A0, 0x103DF },
2846 { "OpticalCharacterRecognition", 0x2440, 0x245F },
2847 { "Oriya", 0x0B00, 0x0B7F },
2848 { "Osmanya", 0x10480, 0x104AF },
2849 { "PhoneticExtensions", 0x1D00, 0x1D7F },
2850 { "PhoneticExtensionsSupplement", 0x1D80, 0x1DBF },
2851 { "PrivateUse", 0xE000, 0xF8FF },
2852 { "Runic", 0x16A0, 0x16FF },
2853 { "Shavian", 0x10450, 0x1047F },
2854 { "Sinhala", 0x0D80, 0x0DFF },
2855 { "SmallFormVariants", 0xFE50, 0xFE6F },
2856 { "SpacingModifierLetters", 0x02B0, 0x02FF },
2857 { "Specials", 0xFFF0, 0xFFFF },
2858 { "SuperscriptsandSubscripts", 0x2070, 0x209F },
2859 { "SupplementalArrows-A", 0x27F0, 0x27FF },
2860 { "SupplementalArrows-B", 0x2900, 0x297F },
2861 { "SupplementalMathematicalOperators", 0x2A00, 0x2AFF },
2862 { "SupplementalPunctuation", 0x2E00, 0x2E7F },
2863 { "SupplementaryPrivateUseArea-A", 0xF0000, 0xFFFFF },
2864 { "SupplementaryPrivateUseArea-B", 0x100000, 0x10FFFF },
2865 { "SylotiNagri", 0xA800, 0xA82F },
2866 { "Syriac", 0x0700, 0x074F },
2867 { "Tagalog", 0x1700, 0x171F },
2868 { "Tagbanwa", 0x1760, 0x177F },
2869 { "Tags", 0xE0000, 0xE007F },
2870 { "TaiLe", 0x1950, 0x197F },
2871 { "TaiXuanJingSymbols", 0x1D300, 0x1D35F },
2872 { "Tamil", 0x0B80, 0x0BFF },
2873 { "Telugu", 0x0C00, 0x0C7F },
2874 { "Thaana", 0x0780, 0x07BF },
2875 { "Thai", 0x0E00, 0x0E7F },
2876 { "Tibetan", 0x0F00, 0x0FFF },
2877 { "Tifinagh", 0x2D30, 0x2D7F },
2878 { "Ugaritic", 0x10380, 0x1039F },
2879 { "UnifiedCanadianAboriginalSyllabics", 0x1400, 0x167F },
2880 { "VariationSelectors", 0xFE00, 0xFE0F },
2881 { "VariationSelectorsSupplement", 0xE0100, 0xE01EF },
2882 { "VerticalForms", 0xFE10, 0xFE1F },
2883 { "YiRadicals", 0xA490, 0xA4CF },
2884 { "YiSyllables", 0xA000, 0xA48F },
2885 { "YijingHexagramSymbols", 0x4DC0, 0x4DFF }
2888 inline bool operator<(const char *name, const CategoriesRangeMapEntry &entry)
2889 { return qstrcmp(name, entry.name) < 0; }
2890 inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name)
2891 { return qstrcmp(entry.name, name) < 0; }
2892 #endif // QT_NO_REGEXP_CCLASS
2894 int QRegExpEngine::getChar()
2896 return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();
2899 int QRegExpEngine::getEscape()
2901 #ifndef QT_NO_REGEXP_ESCAPE
2902 const char tab[] = "afnrtv"; // no b, as \b means word boundary
2903 const char backTab[] = "\a\f\n\r\t\v";
2910 if (prevCh == EOS) {
2912 return Tok_Char | '\\';
2915 #ifndef QT_NO_REGEXP_ESCAPE
2916 if ((prevCh & ~0xff) == 0) {
2917 const char *p = strchr(tab, prevCh);
2919 return Tok_Char | backTab[p - tab];
2924 #ifndef QT_NO_REGEXP_ESCAPE
2927 for (i = 0; i < 3; i++) {
2928 if (yyCh >= '0' && yyCh <= '7')
2929 val = (val << 3) | (yyCh - '0');
2934 if ((val & ~0377) != 0)
2936 return Tok_Char | val;
2938 #ifndef QT_NO_REGEXP_ESCAPE
2942 #ifndef QT_NO_REGEXP_CCLASS
2944 // see QChar::isDigit()
2945 yyCharClass->addCategories(uint(-1) ^ FLAG(QChar::Number_DecimalDigit));
2946 return Tok_CharClass;
2948 // see QChar::isSpace()
2949 yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Separator_Space) |
2950 FLAG(QChar::Separator_Line) |
2951 FLAG(QChar::Separator_Paragraph) |
2952 FLAG(QChar::Other_Control)));
2953 yyCharClass->addRange(0x0000, 0x0008);
2954 yyCharClass->addRange(0x000e, 0x001f);
2955 yyCharClass->addRange(0x007f, 0x0084);
2956 yyCharClass->addRange(0x0086, 0x009f);
2957 return Tok_CharClass;
2959 // see QChar::isLetterOrNumber() and QChar::isMark()
2960 yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Mark_NonSpacing) |
2961 FLAG(QChar::Mark_SpacingCombining) |
2962 FLAG(QChar::Mark_Enclosing) |
2963 FLAG(QChar::Number_DecimalDigit) |
2964 FLAG(QChar::Number_Letter) |
2965 FLAG(QChar::Number_Other) |
2966 FLAG(QChar::Letter_Uppercase) |
2967 FLAG(QChar::Letter_Lowercase) |
2968 FLAG(QChar::Letter_Titlecase) |
2969 FLAG(QChar::Letter_Modifier) |
2970 FLAG(QChar::Letter_Other) |
2971 FLAG(QChar::Punctuation_Connector)));
2972 yyCharClass->addRange(0x203f, 0x2040);
2973 yyCharClass->addSingleton(0x2040);
2974 yyCharClass->addSingleton(0x2054);
2975 yyCharClass->addSingleton(0x30fb);
2976 yyCharClass->addRange(0xfe33, 0xfe34);
2977 yyCharClass->addRange(0xfe4d, 0xfe4f);
2978 yyCharClass->addSingleton(0xff3f);
2979 yyCharClass->addSingleton(0xff65);
2980 return Tok_CharClass;
2982 #ifndef QT_NO_REGEXP_ESCAPE
2986 #ifndef QT_NO_REGEXP_CCLASS
2988 // see QChar::isDigit()
2989 yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit));
2990 return Tok_CharClass;
2992 // see QChar::isSpace()
2993 yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
2994 FLAG(QChar::Separator_Line) |
2995 FLAG(QChar::Separator_Paragraph));
2996 yyCharClass->addRange(0x0009, 0x000d);
2997 yyCharClass->addSingleton(0x0085);
2998 return Tok_CharClass;
3000 // see QChar::isLetterOrNumber() and QChar::isMark()
3001 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3002 FLAG(QChar::Mark_SpacingCombining) |
3003 FLAG(QChar::Mark_Enclosing) |
3004 FLAG(QChar::Number_DecimalDigit) |
3005 FLAG(QChar::Number_Letter) |
3006 FLAG(QChar::Number_Other) |
3007 FLAG(QChar::Letter_Uppercase) |
3008 FLAG(QChar::Letter_Lowercase) |
3009 FLAG(QChar::Letter_Titlecase) |
3010 FLAG(QChar::Letter_Modifier) |
3011 FLAG(QChar::Letter_Other));
3012 yyCharClass->addSingleton(0x005f); // '_'
3013 return Tok_CharClass;
3015 if (xmlSchemaExtensions) {
3016 yyCharClass->setNegative(!yyCharClass->negative());
3022 if (xmlSchemaExtensions) {
3023 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3024 FLAG(QChar::Mark_SpacingCombining) |
3025 FLAG(QChar::Mark_Enclosing) |
3026 FLAG(QChar::Number_DecimalDigit) |
3027 FLAG(QChar::Number_Letter) |
3028 FLAG(QChar::Number_Other) |
3029 FLAG(QChar::Letter_Uppercase) |
3030 FLAG(QChar::Letter_Lowercase) |
3031 FLAG(QChar::Letter_Titlecase) |
3032 FLAG(QChar::Letter_Modifier) |
3033 FLAG(QChar::Letter_Other));
3034 yyCharClass->addSingleton(0x003a); // ':'
3035 yyCharClass->addSingleton(0x005f); // '_'
3036 yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
3037 yyCharClass->addRange(0x0061, 0x007a); // [a-z]
3038 yyCharClass->addRange(0xc0, 0xd6);
3039 yyCharClass->addRange(0xd8, 0xf6);
3040 yyCharClass->addRange(0xf8, 0x2ff);
3041 yyCharClass->addRange(0x370, 0x37d);
3042 yyCharClass->addRange(0x37f, 0x1fff);
3043 yyCharClass->addRange(0x200c, 0x200d);
3044 yyCharClass->addRange(0x2070, 0x218f);
3045 yyCharClass->addRange(0x2c00, 0x2fef);
3046 yyCharClass->addRange(0x3001, 0xd7ff);
3047 yyCharClass->addRange(0xf900, 0xfdcf);
3048 yyCharClass->addRange(0xfdf0, 0xfffd);
3049 yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
3050 return Tok_CharClass;
3055 if (xmlSchemaExtensions) {
3056 yyCharClass->setNegative(!yyCharClass->negative());
3062 if (xmlSchemaExtensions) {
3063 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3064 FLAG(QChar::Mark_SpacingCombining) |
3065 FLAG(QChar::Mark_Enclosing) |
3066 FLAG(QChar::Number_DecimalDigit) |
3067 FLAG(QChar::Number_Letter) |
3068 FLAG(QChar::Number_Other) |
3069 FLAG(QChar::Letter_Uppercase) |
3070 FLAG(QChar::Letter_Lowercase) |
3071 FLAG(QChar::Letter_Titlecase) |
3072 FLAG(QChar::Letter_Modifier) |
3073 FLAG(QChar::Letter_Other));
3074 yyCharClass->addSingleton(0x002d); // '-'
3075 yyCharClass->addSingleton(0x002e); // '.'
3076 yyCharClass->addSingleton(0x003a); // ':'
3077 yyCharClass->addSingleton(0x005f); // '_'
3078 yyCharClass->addSingleton(0xb7);
3079 yyCharClass->addRange(0x0030, 0x0039); // [0-9]
3080 yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
3081 yyCharClass->addRange(0x0061, 0x007a); // [a-z]
3082 yyCharClass->addRange(0xc0, 0xd6);
3083 yyCharClass->addRange(0xd8, 0xf6);
3084 yyCharClass->addRange(0xf8, 0x2ff);
3085 yyCharClass->addRange(0x370, 0x37d);
3086 yyCharClass->addRange(0x37f, 0x1fff);
3087 yyCharClass->addRange(0x200c, 0x200d);
3088 yyCharClass->addRange(0x2070, 0x218f);
3089 yyCharClass->addRange(0x2c00, 0x2fef);
3090 yyCharClass->addRange(0x3001, 0xd7ff);
3091 yyCharClass->addRange(0xf900, 0xfdcf);
3092 yyCharClass->addRange(0xfdf0, 0xfffd);
3093 yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
3094 yyCharClass->addRange(0x0300, 0x036f);
3095 yyCharClass->addRange(0x203f, 0x2040);
3096 return Tok_CharClass;
3101 if (xmlSchemaExtensions) {
3102 yyCharClass->setNegative(!yyCharClass->negative());
3108 if (xmlSchemaExtensions) {
3110 error(RXERR_CHARCLASS);
3111 return Tok_CharClass;
3114 QByteArray category;
3116 while (yyCh != '}') {
3119 return Tok_CharClass;
3121 category.append(yyCh);
3124 yyCh = getChar(); // skip closing '}'
3126 int catlen = category.length();
3127 if (catlen == 1 || catlen == 2) {
3128 switch (category.at(0)) {
3131 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3132 FLAG(QChar::Mark_SpacingCombining) |
3133 FLAG(QChar::Mark_Enclosing));
3135 switch (category.at(1)) {
3136 case 'n': yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn
3137 case 'c': yyCharClass->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc
3138 case 'e': yyCharClass->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me
3139 default: error(RXERR_CATEGORY); break;
3145 yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit) |
3146 FLAG(QChar::Number_Letter) |
3147 FLAG(QChar::Number_Other));
3149 switch (category.at(1)) {
3150 case 'd': yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd
3151 case 'l': yyCharClass->addCategories(FLAG(QChar::Number_Letter)); break; // Hl
3152 case 'o': yyCharClass->addCategories(FLAG(QChar::Number_Other)); break; // No
3153 default: error(RXERR_CATEGORY); break;
3159 yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
3160 FLAG(QChar::Separator_Line) |
3161 FLAG(QChar::Separator_Paragraph));
3163 switch (category.at(1)) {
3164 case 's': yyCharClass->addCategories(FLAG(QChar::Separator_Space)); break; // Zs
3165 case 'l': yyCharClass->addCategories(FLAG(QChar::Separator_Line)); break; // Zl
3166 case 'p': yyCharClass->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp
3167 default: error(RXERR_CATEGORY); break;
3173 yyCharClass->addCategories(FLAG(QChar::Other_Control) |
3174 FLAG(QChar::Other_Format) |
3175 FLAG(QChar::Other_Surrogate) |
3176 FLAG(QChar::Other_PrivateUse) |
3177 FLAG(QChar::Other_NotAssigned));
3179 switch (category.at(1)) {
3180 case 'c': yyCharClass->addCategories(FLAG(QChar::Other_Control)); break; // Cc
3181 case 'f': yyCharClass->addCategories(FLAG(QChar::Other_Format)); break; // Cf
3182 case 's': yyCharClass->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs
3183 case 'o': yyCharClass->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co
3184 case 'n': yyCharClass->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn
3185 default: error(RXERR_CATEGORY); break;
3191 yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase) |
3192 FLAG(QChar::Letter_Lowercase) |
3193 FLAG(QChar::Letter_Titlecase) |
3194 FLAG(QChar::Letter_Modifier) |
3195 FLAG(QChar::Letter_Other));
3197 switch (category.at(1)) {
3198 case 'u': yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu
3199 case 'l': yyCharClass->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll
3200 case 't': yyCharClass->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt
3201 case 'm': yyCharClass->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm
3202 case 'o': yyCharClass->addCategories(FLAG(QChar::Letter_Other)); break; // Lo
3203 default: error(RXERR_CATEGORY); break;
3209 yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector) |
3210 FLAG(QChar::Punctuation_Dash) |
3211 FLAG(QChar::Punctuation_Open) |
3212 FLAG(QChar::Punctuation_Close) |
3213 FLAG(QChar::Punctuation_InitialQuote) |
3214 FLAG(QChar::Punctuation_FinalQuote) |
3215 FLAG(QChar::Punctuation_Other));
3217 switch (category.at(1)) {
3218 case 'c': yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc
3219 case 'd': yyCharClass->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd
3220 case 's': yyCharClass->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps
3221 case 'e': yyCharClass->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe
3222 case 'i': yyCharClass->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi
3223 case 'f': yyCharClass->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf
3224 case 'o': yyCharClass->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po
3225 default: error(RXERR_CATEGORY); break;
3231 yyCharClass->addCategories(FLAG(QChar::Symbol_Math) |
3232 FLAG(QChar::Symbol_Currency) |
3233 FLAG(QChar::Symbol_Modifier) |
3234 FLAG(QChar::Symbol_Other));
3236 switch (category.at(1)) {
3237 case 'm': yyCharClass->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm
3238 case 'c': yyCharClass->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc
3239 case 'k': yyCharClass->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk
3240 case 'o': yyCharClass->addCategories(FLAG(QChar::Symbol_Other)); break; // So
3241 default: error(RXERR_CATEGORY); break;
3246 error(RXERR_CATEGORY);
3249 } else if (catlen > 2 && category.at(0) == 'I' && category.at(1) == 's') {
3250 static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[0]);
3251 const CategoriesRangeMapEntry *r = qBinaryFind(categoriesRangeMap, categoriesRangeMap + N, category.constData() + 2);
3252 if (r != categoriesRangeMap + N)
3253 yyCharClass->addRange(r->first, r->second);
3255 error(RXERR_CATEGORY);
3257 error(RXERR_CATEGORY);
3259 return Tok_CharClass;
3264 #ifndef QT_NO_REGEXP_ESCAPE
3267 for (i = 0; i < 4; i++) {
3268 low = QChar(yyCh).toLower().unicode();
3269 if (low >= '0' && low <= '9')
3270 val = (val << 4) | (low - '0');
3271 else if (low >= 'a' && low <= 'f')
3272 val = (val << 4) | (low - 'a' + 10);
3277 return Tok_Char | val;
3282 if (prevCh >= '1' && prevCh <= '9') {
3283 #ifndef QT_NO_REGEXP_BACKREF
3285 while (yyCh >= '0' && yyCh <= '9') {
3286 val = (val * 10) + (yyCh - '0');
3289 return Tok_BackRef | val;
3291 error(RXERR_DISABLED);
3294 return Tok_Char | prevCh;
3297 #ifndef QT_NO_REGEXP_INTERVAL
3298 int QRegExpEngine::getRep(int def)
3300 if (yyCh >= '0' && yyCh <= '9') {
3303 rep = 10 * rep + yyCh - '0';
3304 if (rep >= InftyRep) {
3305 error(RXERR_REPETITION);
3309 } while (yyCh >= '0' && yyCh <= '9');
3317 #ifndef QT_NO_REGEXP_LOOKAHEAD
3318 void QRegExpEngine::skipChars(int n)
3327 void QRegExpEngine::error(const char *msg)
3329 if (yyError.isEmpty())
3330 yyError = QLatin1String(msg);
3333 void QRegExpEngine::startTokenizer(const QChar *rx, int len)
3340 yyCharClass.reset(new QRegExpCharClass);
3343 yyError = QString();
3346 int QRegExpEngine::getToken()
3348 #ifndef QT_NO_REGEXP_CCLASS
3349 ushort pendingCh = 0;
3357 #ifndef QT_NO_REGEXP_CCLASS
3358 yyCharClass->clear();
3375 #ifndef QT_NO_REGEXP_LOOKAHEAD
3377 return Tok_NegLookahead;
3379 return Tok_PosLookahead;
3382 return Tok_MagicLeftParen;
3384 error(RXERR_LOOKBEHIND);
3385 return Tok_MagicLeftParen;
3387 error(RXERR_LOOKAHEAD);
3388 return Tok_MagicLeftParen;
3391 return Tok_LeftParen;
3394 return Tok_RightParen;
3397 yyMaxRep = InftyRep;
3398 return Tok_Quantifier;
3401 yyMaxRep = InftyRep;
3402 return Tok_Quantifier;
3404 #ifndef QT_NO_REGEXP_CCLASS
3405 yyCharClass->setNegative(true);
3407 return Tok_CharClass;
3411 return Tok_Quantifier;
3413 #ifndef QT_NO_REGEXP_CCLASS
3415 yyCharClass->setNegative(true);
3418 charPending = false;
3419 rangePending = false;
3421 if (yyCh == '-' && charPending && !rangePending) {
3422 rangePending = true;
3425 if (charPending && !rangePending) {
3426 yyCharClass->addSingleton(pendingCh);
3427 charPending = false;
3432 if (tok == Tok_Word)
3435 tok = Tok_Char | yyCh;
3438 if (tok == Tok_CharClass) {
3440 yyCharClass->addSingleton('-');
3441 yyCharClass->addSingleton(pendingCh);
3442 charPending = false;
3443 rangePending = false;
3445 } else if ((tok & Tok_Char) != 0) {
3447 yyCharClass->addRange(pendingCh, tok ^ Tok_Char);
3448 charPending = false;
3449 rangePending = false;
3451 pendingCh = tok ^ Tok_Char;
3455 error(RXERR_CHARCLASS);
3458 } while (yyCh != ']' && yyCh != EOS);
3460 yyCharClass->addSingleton('-');
3462 yyCharClass->addSingleton(pendingCh);
3467 return Tok_CharClass;
3470 return Tok_Char | '[';
3475 error(RXERR_LEFTDELIM);
3476 return Tok_Char | ']';
3480 #ifndef QT_NO_REGEXP_INTERVAL
3481 yyMinRep = getRep(0);
3482 yyMaxRep = yyMinRep;
3485 yyMaxRep = getRep(InftyRep);
3487 if (yyMaxRep < yyMinRep)
3488 error(RXERR_INTERVAL);
3490 error(RXERR_REPETITION);
3492 return Tok_Quantifier;
3494 error(RXERR_DISABLED);
3495 return Tok_Char | '{';
3500 error(RXERR_LEFTDELIM);
3501 return Tok_Char | '}';
3503 return Tok_Char | prevCh;
3507 int QRegExpEngine::parse(const QChar *pattern, int len)
3510 startTokenizer(pattern, len);
3512 #ifndef QT_NO_REGEXP_CAPTURE
3513 yyMayCapture = true;
3515 yyMayCapture = false;
3518 #ifndef QT_NO_REGEXP_CAPTURE
3519 int atom = startAtom(false);
3521 QRegExpCharClass anything;
3522 Box box(this); // create InitialState
3524 Box rightBox(this); // create FinalState
3525 rightBox.set(anything);
3527 Box middleBox(this);
3528 parseExpression(&middleBox);
3529 #ifndef QT_NO_REGEXP_CAPTURE
3530 finishAtom(atom, false);
3532 #ifndef QT_NO_REGEXP_OPTIM
3533 middleBox.setupHeuristics();
3537 yyCharClass.reset(0);
3539 #ifndef QT_NO_REGEXP_CAPTURE
3540 for (int i = 0; i < nf; ++i) {
3541 switch (f[i].capture) {
3542 case QRegExpAtom::NoCapture:
3544 case QRegExpAtom::OfficialCapture:
3545 f[i].capture = ncap;
3546 captureForOfficialCapture.append(ncap);
3550 case QRegExpAtom::UnofficialCapture:
3551 f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;
3555 #ifndef QT_NO_REGEXP_BACKREF
3556 #ifndef QT_NO_REGEXP_OPTIM
3557 if (officialncap == 0 && nbrefs == 0) {
3562 // handle the case where there's a \5 with no corresponding capture
3563 // (captureForOfficialCapture.size() != officialncap)
3564 for (int i = 0; i < nbrefs - officialncap; ++i) {
3565 captureForOfficialCapture.append(ncap);
3571 if (!yyError.isEmpty())
3574 #ifndef QT_NO_REGEXP_OPTIM
3575 const QRegExpAutomatonState &sinit = s.at(InitialState);
3576 caretAnchored = !sinit.anchors.isEmpty();
3577 if (caretAnchored) {
3578 const QMap<int, int> &anchors = sinit.anchors;
3579 QMap<int, int>::const_iterator a;
3580 for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {
3582 #ifndef QT_NO_REGEXP_ANCHOR_ALT
3583 (*a & Anchor_Alternation) != 0 ||
3585 (*a & Anchor_Caret) == 0)
3587 caretAnchored = false;
3595 int numStates = s.count();
3596 for (int i = 0; i < numStates; ++i) {
3597 QRegExpAutomatonState &state = s[i];
3598 if (!state.anchors.isEmpty()) {
3599 QMap<int, int>::iterator a = state.anchors.begin();
3600 while (a != state.anchors.end()) {
3602 a = state.anchors.erase(a);
3612 void QRegExpEngine::parseAtom(Box *box)
3614 #ifndef QT_NO_REGEXP_LOOKAHEAD
3615 QRegExpEngine *eng = 0;
3620 if ((yyTok & Tok_Char) != 0) {
3621 box->set(QChar(yyTok ^ Tok_Char));
3623 #ifndef QT_NO_REGEXP_OPTIM
3628 box->catAnchor(Anchor_Dollar);
3631 box->catAnchor(Anchor_Caret);
3633 #ifndef QT_NO_REGEXP_LOOKAHEAD
3634 case Tok_PosLookahead:
3635 case Tok_NegLookahead:
3636 neg = (yyTok == Tok_NegLookahead);
3637 eng = new QRegExpEngine(cs, greedyQuantifiers);
3638 len = eng->parse(yyIn + yyPos - 1, yyLen - yyPos + 1);
3642 error(RXERR_LOOKAHEAD);
3643 box->catAnchor(addLookahead(eng, neg));
3645 if (yyTok != Tok_RightParen)
3646 error(RXERR_LOOKAHEAD);
3649 #ifndef QT_NO_REGEXP_ESCAPE
3651 box->catAnchor(Anchor_Word);
3654 box->catAnchor(Anchor_NonWord);
3658 case Tok_MagicLeftParen:
3660 parseExpression(box);
3661 if (yyTok != Tok_RightParen)
3665 box->set(*yyCharClass);
3667 case Tok_Quantifier:
3668 error(RXERR_REPETITION);
3671 #ifndef QT_NO_REGEXP_BACKREF
3672 if ((yyTok & Tok_BackRef) != 0)
3673 box->set(yyTok ^ Tok_BackRef);
3676 error(RXERR_DISABLED);
3682 void QRegExpEngine::parseFactor(Box *box)
3684 #ifndef QT_NO_REGEXP_CAPTURE
3685 int outerAtom = greedyQuantifiers ? startAtom(false) : -1;
3686 int innerAtom = startAtom(yyMayCapture && yyTok == Tok_LeftParen);
3687 bool magicLeftParen = (yyTok == Tok_MagicLeftParen);
3689 const int innerAtom = -1;
3692 #ifndef QT_NO_REGEXP_INTERVAL
3694 yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3695 *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3697 const QChar *in = yyIn;
3702 QRegExpCharClass charClass;
3703 if (yyTok == Tok_CharClass)
3704 charClass = *yyCharClass;
3706 bool mayCapture = yyMayCapture;
3710 #ifndef QT_NO_REGEXP_CAPTURE
3711 finishAtom(innerAtom, magicLeftParen);
3714 bool hasQuantifier = (yyTok == Tok_Quantifier);
3715 if (hasQuantifier) {
3716 #ifndef QT_NO_REGEXP_OPTIM
3719 if (yyMaxRep == InftyRep) {
3720 box->plus(innerAtom);
3721 #ifndef QT_NO_REGEXP_INTERVAL
3722 } else if (yyMaxRep == 0) {
3729 #ifndef QT_NO_REGEXP_INTERVAL
3730 yyMayCapture = false;
3731 int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1;
3732 int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1);
3737 for (i = 0; i < beta; i++) {
3740 parseAtom(&leftBox);
3741 leftBox.cat(rightBox);
3745 for (i = 0; i < alpha; i++) {
3748 parseAtom(&leftBox);
3749 leftBox.cat(rightBox);
3756 #ifndef QT_NO_REGEXP_INTERVAL
3757 yyMayCapture = mayCapture;
3761 #ifndef QT_NO_REGEXP_CAPTURE
3762 if (greedyQuantifiers)
3763 finishAtom(outerAtom, hasQuantifier);
3767 void QRegExpEngine::parseTerm(Box *box)
3769 #ifndef QT_NO_REGEXP_OPTIM
3770 if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)
3773 while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {
3775 parseFactor(&rightBox);
3780 void QRegExpEngine::parseExpression(Box *box)
3783 while (yyTok == Tok_Bar) {
3784 #ifndef QT_NO_REGEXP_OPTIM
3789 parseTerm(&rightBox);
3795 The struct QRegExpPrivate contains the private data of a regular
3796 expression other than the automaton. It makes it possible for many
3797 QRegExp objects to use the same QRegExpEngine object with different
3798 QRegExpPrivate objects.
3800 struct QRegExpPrivate
3803 QRegExpEngineKey engineKey;
3805 #ifndef QT_NO_REGEXP_CAPTURE
3806 QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()
3807 QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3809 QRegExpMatchState matchState;
3811 inline QRegExpPrivate()
3812 : eng(0), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }
3813 inline QRegExpPrivate(const QRegExpEngineKey &key)
3814 : eng(0), engineKey(key), minimal(false) {}
3817 #if !defined(QT_NO_REGEXP_OPTIM)
3818 uint qHash(const QRegExpEngineKey &key, uint seed)
3820 return qHash(key.pattern, seed);
3823 typedef QCache<QRegExpEngineKey, QRegExpEngine> EngineCache;
3824 Q_GLOBAL_STATIC(EngineCache, globalEngineCache)
3825 static QBasicMutex globalEngineCacheMutex;
3826 #endif // QT_NO_REGEXP_OPTIM
3828 static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key)
3830 if (!eng->ref.deref()) {
3831 #if !defined(QT_NO_REGEXP_OPTIM)
3832 if (globalEngineCache()) {
3833 QMutexLocker locker(&globalEngineCacheMutex);
3835 globalEngineCache()->insert(key, eng, 4 + key.pattern.length() / 4);
3836 } QT_CATCH(const std::bad_alloc &) {
3837 // in case of an exception (e.g. oom), just delete the engine
3850 static void prepareEngine_helper(QRegExpPrivate *priv)
3852 bool initMatchState = !priv->eng;
3853 #if !defined(QT_NO_REGEXP_OPTIM)
3854 if (!priv->eng && globalEngineCache()) {
3855 QMutexLocker locker(&globalEngineCacheMutex);
3856 priv->eng = globalEngineCache()->take(priv->engineKey);
3858 priv->eng->ref.ref();
3860 #endif // QT_NO_REGEXP_OPTIM
3863 priv->eng = new QRegExpEngine(priv->engineKey);
3866 priv->matchState.prepareForMatch(priv->eng);
3869 inline static void prepareEngine(QRegExpPrivate *priv)
3873 prepareEngine_helper(priv);
3876 static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str)
3878 prepareEngine(priv);
3879 priv->matchState.prepareForMatch(priv->eng);
3880 #ifndef QT_NO_REGEXP_CAPTURE
3882 priv->capturedCache.clear();
3888 static void invalidateEngine(QRegExpPrivate *priv)
3890 if (priv->eng != 0) {
3891 derefEngine(priv->eng, priv->engineKey);
3893 priv->matchState.drain();
3898 \enum QRegExp::CaretMode
3900 The CaretMode enum defines the different meanings of the caret
3901 (\b{^}) in a regular expression. The possible values are:
3904 The caret corresponds to index 0 in the searched string.
3906 \value CaretAtOffset
3907 The caret corresponds to the start offset of the search.
3909 \value CaretWontMatch
3910 The caret never matches.
3914 \enum QRegExp::PatternSyntax
3916 The syntax used to interpret the meaning of the pattern.
3918 \value RegExp A rich Perl-like pattern matching syntax. This is
3921 \value RegExp2 Like RegExp, but with \l{greedy quantifiers}.
3922 (Introduced in Qt 4.2.)
3924 \value Wildcard This provides a simple pattern matching syntax
3925 similar to that used by shells (command interpreters) for "file
3926 globbing". See \l{Wildcard Matching}.
3928 \value WildcardUnix This is similar to Wildcard but with the
3929 behavior of a Unix shell. The wildcard characters can be escaped
3930 with the character "\\".
3932 \value FixedString The pattern is a fixed string. This is
3933 equivalent to using the RegExp pattern on a string in
3934 which all metacharacters are escaped using escape().
3936 \value W3CXmlSchema11 The pattern is a regular expression as
3937 defined by the W3C XML Schema 1.1 specification.
3939 \sa setPatternSyntax()
3943 Constructs an empty regexp.
3945 \sa isValid(), errorString()
3949 priv = new QRegExpPrivate;
3950 prepareEngine(priv);
3954 Constructs a regular expression object for the given \a pattern
3955 string. The pattern must be given using wildcard notation if \a
3956 syntax is \l Wildcard; the default is \l RegExp. The pattern is
3957 case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
3958 greedy (maximal), but can be changed by calling
3961 \sa setPattern(), setCaseSensitivity(), setPatternSyntax()
3963 QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
3965 priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs));
3966 prepareEngine(priv);
3970 Constructs a regular expression as a copy of \a rx.
3974 QRegExp::QRegExp(const QRegExp &rx)
3976 priv = new QRegExpPrivate;
3981 Destroys the regular expression and cleans up its internal data.
3985 invalidateEngine(priv);
3990 Copies the regular expression \a rx and returns a reference to the
3991 copy. The case sensitivity, wildcard, and minimal matching options
3994 QRegExp &QRegExp::operator=(const QRegExp &rx)
3996 prepareEngine(rx.priv); // to allow sharing
3997 QRegExpEngine *otherEng = rx.priv->eng;
3999 otherEng->ref.ref();
4000 invalidateEngine(priv);
4001 priv->eng = otherEng;
4002 priv->engineKey = rx.priv->engineKey;
4003 priv->minimal = rx.priv->minimal;
4004 #ifndef QT_NO_REGEXP_CAPTURE
4005 priv->t = rx.priv->t;
4006 priv->capturedCache = rx.priv->capturedCache;
4009 priv->matchState.prepareForMatch(priv->eng);
4010 priv->matchState.captured = rx.priv->matchState.captured;
4015 \fn void QRegExp::swap(QRegExp &other)
4018 Swaps regular expression \a other with this regular
4019 expression. This operation is very fast and never fails.
4023 Returns true if this regular expression is equal to \a rx;
4024 otherwise returns false.
4026 Two QRegExp objects are equal if they have the same pattern
4027 strings and the same settings for case sensitivity, wildcard and
4030 bool QRegExp::operator==(const QRegExp &rx) const
4032 return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;
4036 \fn bool QRegExp::operator!=(const QRegExp &rx) const
4038 Returns true if this regular expression is not equal to \a rx;
4039 otherwise returns false.
4045 Returns true if the pattern string is empty; otherwise returns
4048 If you call exactMatch() with an empty pattern on an empty string
4049 it will return true; otherwise it returns false since it operates
4050 over the whole string. If you call indexIn() with an empty pattern
4051 on \e any string it will return the start offset (0 by default)
4052 because the empty pattern matches the 'emptiness' at the start of
4053 the string. In this case the length of the match returned by
4054 matchedLength() will be 0.
4056 See QString::isEmpty().
4059 bool QRegExp::isEmpty() const
4061 return priv->engineKey.pattern.isEmpty();
4065 Returns true if the regular expression is valid; otherwise returns
4066 false. An invalid regular expression never matches.
4068 The pattern \b{[a-z} is an example of an invalid pattern, since
4069 it lacks a closing square bracket.
4071 Note that the validity of a regexp may also depend on the setting
4072 of the wildcard flag, for example \b{*.html} is a valid
4073 wildcard regexp but an invalid full regexp.
4077 bool QRegExp::isValid() const
4079 if (priv->engineKey.pattern.isEmpty()) {
4082 prepareEngine(priv);
4083 return priv->eng->isValid();
4088 Returns the pattern string of the regular expression. The pattern
4089 has either regular expression syntax or wildcard syntax, depending
4092 \sa patternSyntax(), caseSensitivity()
4094 QString QRegExp::pattern() const
4096 return priv->engineKey.pattern;
4100 Sets the pattern string to \a pattern. The case sensitivity,
4101 wildcard, and minimal matching options are not changed.
4103 \sa setPatternSyntax(), setCaseSensitivity()
4105 void QRegExp::setPattern(const QString &pattern)
4107 if (priv->engineKey.pattern != pattern) {
4108 invalidateEngine(priv);
4109 priv->engineKey.pattern = pattern;
4114 Returns Qt::CaseSensitive if the regexp is matched case
4115 sensitively; otherwise returns Qt::CaseInsensitive.
4117 \sa patternSyntax(), pattern(), isMinimal()
4119 Qt::CaseSensitivity QRegExp::caseSensitivity() const
4121 return priv->engineKey.cs;
4125 Sets case sensitive matching to \a cs.
4127 If \a cs is Qt::CaseSensitive, \b{\\.txt$} matches
4128 \c{readme.txt} but not \c{README.TXT}.
4130 \sa setPatternSyntax(), setPattern(), setMinimal()
4132 void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)
4134 if ((bool)cs != (bool)priv->engineKey.cs) {
4135 invalidateEngine(priv);
4136 priv->engineKey.cs = cs;
4141 Returns the syntax used by the regular expression. The default is
4144 \sa pattern(), caseSensitivity()
4146 QRegExp::PatternSyntax QRegExp::patternSyntax() const
4148 return priv->engineKey.patternSyntax;
4152 Sets the syntax mode for the regular expression. The default is
4155 Setting \a syntax to QRegExp::Wildcard enables simple shell-like
4156 \l{wildcard matching}. For example, \b{r*.txt} matches the
4157 string \c{readme.txt} in wildcard mode, but does not match
4160 Setting \a syntax to QRegExp::FixedString means that the pattern
4161 is interpreted as a plain string. Special characters (e.g.,
4162 backslash) don't need to be escaped then.
4164 \sa setPattern(), setCaseSensitivity(), escape()
4166 void QRegExp::setPatternSyntax(PatternSyntax syntax)
4168 if (syntax != priv->engineKey.patternSyntax) {
4169 invalidateEngine(priv);
4170 priv->engineKey.patternSyntax = syntax;
4175 Returns true if minimal (non-greedy) matching is enabled;
4176 otherwise returns false.
4178 \sa caseSensitivity(), setMinimal()
4180 bool QRegExp::isMinimal() const
4182 return priv->minimal;
4186 Enables or disables minimal matching. If \a minimal is false,
4187 matching is greedy (maximal) which is the default.
4189 For example, suppose we have the input string "We must be
4190 <b>bold</b>, very <b>bold</b>!" and the pattern
4191 \b{<b>.*</b>}. With the default greedy (maximal) matching,
4192 the match is "We must be \underline{<b>bold</b>, very
4193 <b>bold</b>}!". But with minimal (non-greedy) matching, the
4194 first match is: "We must be \underline{<b>bold</b>}, very
4195 <b>bold</b>!" and the second match is "We must be <b>bold</b>,
4196 very \underline{<b>bold</b>}!". In practice we might use the pattern
4197 \b{<b>[^<]*\</b>} instead, although this will still fail for
4200 \sa setCaseSensitivity()
4202 void QRegExp::setMinimal(bool minimal)
4204 priv->minimal = minimal;
4207 // ### Qt 5: make non-const
4209 Returns true if \a str is matched exactly by this regular
4210 expression; otherwise returns false. You can determine how much of
4211 the string was matched by calling matchedLength().
4213 For a given regexp string R, exactMatch("R") is the equivalent of
4214 indexIn("^R$") since exactMatch() effectively encloses the regexp
4215 in the start of string and end of string anchors, except that it
4216 sets matchedLength() differently.
4218 For example, if the regular expression is \b{blue}, then
4219 exactMatch() returns true only for input \c blue. For inputs \c
4220 bluebell, \c blutak and \c lightblue, exactMatch() returns false
4221 and matchedLength() will return 4, 3 and 0 respectively.
4223 Although const, this function sets matchedLength(),
4224 capturedTexts(), and pos().
4226 \sa indexIn(), lastIndexIn()
4228 bool QRegExp::exactMatch(const QString &str) const
4230 prepareEngineForMatch(priv, str);
4231 priv->matchState.match(str.unicode(), str.length(), 0, priv->minimal, true, 0);
4232 if (priv->matchState.captured[1] == str.length()) {
4235 priv->matchState.captured[0] = 0;
4236 priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen;
4241 // ### Qt 5: make non-const
4243 Attempts to find a match in \a str from position \a offset (0 by
4244 default). If \a offset is -1, the search starts at the last
4245 character; if -2, at the next to last character; etc.
4247 Returns the position of the first match, or -1 if there was no
4250 The \a caretMode parameter can be used to instruct whether \b{^}
4251 should match at index 0 or at \a offset.
4253 You might prefer to use QString::indexOf(), QString::contains(),
4254 or even QStringList::filter(). To replace matches use
4258 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 13
4260 Although const, this function sets matchedLength(),
4261 capturedTexts() and pos().
4263 If the QRegExp is a wildcard expression (see setPatternSyntax())
4264 and want to test a string against the whole wildcard expression,
4265 use exactMatch() instead of this function.
4267 \sa lastIndexIn(), exactMatch()
4270 int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const
4272 prepareEngineForMatch(priv, str);
4274 offset += str.length();
4275 priv->matchState.match(str.unicode(), str.length(), offset,
4276 priv->minimal, false, caretIndex(offset, caretMode));
4277 return priv->matchState.captured[0];
4280 // ### Qt 5: make non-const
4282 Attempts to find a match backwards in \a str from position \a
4283 offset. If \a offset is -1 (the default), the search starts at the
4284 last character; if -2, at the next to last character; etc.
4286 Returns the position of the first match, or -1 if there was no
4289 The \a caretMode parameter can be used to instruct whether \b{^}
4290 should match at index 0 or at \a offset.
4292 Although const, this function sets matchedLength(),
4293 capturedTexts() and pos().
4295 \warning Searching backwards is much slower than searching
4298 \sa indexIn(), exactMatch()
4301 int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const
4303 prepareEngineForMatch(priv, str);
4305 offset += str.length();
4306 if (offset < 0 || offset > str.length()) {
4307 memset(priv->matchState.captured, -1, priv->matchState.capturedSize*sizeof(int));
4311 while (offset >= 0) {
4312 priv->matchState.match(str.unicode(), str.length(), offset,
4313 priv->minimal, true, caretIndex(offset, caretMode));
4314 if (priv->matchState.captured[0] == offset)
4322 Returns the length of the last matched string, or -1 if there was
4325 \sa exactMatch(), indexIn(), lastIndexIn()
4327 int QRegExp::matchedLength() const
4329 return priv->matchState.captured[1];
4332 #ifndef QT_NO_REGEXP_CAPTURE
4335 \fn int QRegExp::numCaptures() const
4337 Returns the number of captures contained in the regular expression.
4344 Returns the number of captures contained in the regular expression.
4346 int QRegExp::captureCount() const
4348 prepareEngine(priv);
4349 return priv->eng->captureCount();
4353 Returns a list of the captured text strings.
4355 The first string in the list is the entire matched string. Each
4356 subsequent list element contains a string that matched a
4357 (capturing) subexpression of the regexp.
4360 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 14
4362 The above example also captures elements that may be present but
4363 which we have no interest in. This problem can be solved by using
4364 non-capturing parentheses:
4366 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 15
4368 Note that if you want to iterate over the list, you should iterate
4370 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 16
4372 Some regexps can match an indeterminate number of times. For
4373 example if the input string is "Offsets: 12 14 99 231 7" and the
4374 regexp, \c{rx}, is \b{(\\d+)+}, we would hope to get a list of
4375 all the numbers matched. However, after calling
4376 \c{rx.indexIn(str)}, capturedTexts() will return the list ("12",
4377 "12"), i.e. the entire match was "12" and the first subexpression
4378 matched was "12". The correct approach is to use cap() in a
4379 \l{QRegExp#cap_in_a_loop}{loop}.
4381 The order of elements in the string list is as follows. The first
4382 element is the entire matching string. Each subsequent element
4383 corresponds to the next capturing open left parentheses. Thus
4384 capturedTexts()[1] is the text of the first capturing parentheses,
4385 capturedTexts()[2] is the text of the second and so on
4386 (corresponding to $1, $2, etc., in some other regexp languages).
4390 QStringList QRegExp::capturedTexts() const
4392 if (priv->capturedCache.isEmpty()) {
4393 prepareEngine(priv);
4394 const int *captured = priv->matchState.captured;
4395 int n = priv->matchState.capturedSize;
4397 for (int i = 0; i < n; i += 2) {
4399 if (captured[i + 1] == 0)
4400 m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty
4401 else if (captured[i] >= 0)
4402 m = priv->t.mid(captured[i], captured[i + 1]);
4403 priv->capturedCache.append(m);
4407 return priv->capturedCache;
4413 QStringList QRegExp::capturedTexts()
4415 return const_cast<const QRegExp *>(this)->capturedTexts();
4419 Returns the text captured by the \a nth subexpression. The entire
4420 match has index 0 and the parenthesized subexpressions have
4421 indexes starting from 1 (excluding non-capturing parentheses).
4423 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 17
4425 The order of elements matched by cap() is as follows. The first
4426 element, cap(0), is the entire matching string. Each subsequent
4427 element corresponds to the next capturing open left parentheses.
4428 Thus cap(1) is the text of the first capturing parentheses, cap(2)
4429 is the text of the second, and so on.
4431 \sa capturedTexts(), pos()
4433 QString QRegExp::cap(int nth) const
4435 return capturedTexts().value(nth);
4441 QString QRegExp::cap(int nth)
4443 return const_cast<const QRegExp *>(this)->cap(nth);
4447 Returns the position of the \a nth captured text in the searched
4448 string. If \a nth is 0 (the default), pos() returns the position
4452 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 18
4454 For zero-length matches, pos() always returns -1. (For example, if
4455 cap(4) would return an empty string, pos(4) returns -1.) This is
4456 a feature of the implementation.
4458 \sa cap(), capturedTexts()
4460 int QRegExp::pos(int nth) const
4462 if (nth < 0 || nth >= priv->matchState.capturedSize / 2)
4465 return priv->matchState.captured[2 * nth];
4471 int QRegExp::pos(int nth)
4473 return const_cast<const QRegExp *>(this)->pos(nth);
4477 Returns a text string that explains why a regexp pattern is
4478 invalid the case being; otherwise returns "no error occurred".
4482 QString QRegExp::errorString() const
4485 return QString::fromLatin1(RXERR_OK);
4487 return priv->eng->errorString();
4494 QString QRegExp::errorString()
4496 return const_cast<const QRegExp *>(this)->errorString();
4501 Returns the string \a str with every regexp special character
4502 escaped with a backslash. The special characters are $, (,), *, +,
4503 ., ?, [, \,], ^, {, | and }.
4507 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 19
4509 This function is useful to construct regexp patterns dynamically:
4511 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 20
4513 \sa setPatternSyntax()
4515 QString QRegExp::escape(const QString &str)
4518 const int count = str.count();
4519 quoted.reserve(count * 2);
4520 const QLatin1Char backslash('\\');
4521 for (int i = 0; i < count; i++) {
4522 switch (str.at(i).toLatin1()) {
4537 quoted.append(backslash);
4539 quoted.append(str.at(i));
4545 #ifndef QT_NO_DATASTREAM
4549 Writes the regular expression \a regExp to stream \a out.
4551 \sa {Serializing Qt Data Types}
4553 QDataStream &operator<<(QDataStream &out, const QRegExp ®Exp)
4555 return out << regExp.pattern() << (quint8)regExp.caseSensitivity()
4556 << (quint8)regExp.patternSyntax()
4557 << (quint8)!!regExp.isMinimal();
4563 Reads a regular expression from stream \a in into \a regExp.
4565 \sa {Serializing Qt Data Types}
4567 QDataStream &operator>>(QDataStream &in, QRegExp ®Exp)
4571 quint8 patternSyntax;
4574 in >> pattern >> cs >> patternSyntax >> isMinimal;
4576 QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),
4577 QRegExp::PatternSyntax(patternSyntax));
4579 newRegExp.setMinimal(isMinimal);
4583 #endif // QT_NO_DATASTREAM
4585 #ifndef QT_NO_DEBUG_STREAM
4586 QDebug operator<<(QDebug dbg, const QRegExp &r)
4588 dbg.nospace() << "QRegExp(patternSyntax=" << r.patternSyntax()
4589 << ", pattern='"<< r.pattern() << "')";