1 /****************************************************************************
3 ** Copyright (C) 2012 Digia Plc and/or its subsidiary(-ies).
4 ** Contact: http://www.qt-project.org/legal
6 ** This file is part of the QtCore module of the Qt Toolkit.
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and Digia. For licensing terms and
14 ** conditions see http://qt.digia.com/licensing. For further information
15 ** use the contact form at http://qt.digia.com/contact-us.
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 2.1 as published by the Free Software
20 ** Foundation and appearing in the file LICENSE.LGPL included in the
21 ** packaging of this file. Please review the following information to
22 ** ensure the GNU Lesser General Public License version 2.1 requirements
23 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
25 ** In addition, as a special exception, Digia gives you certain additional
26 ** rights. These rights are described in the Digia Qt LGPL Exception
27 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
29 ** GNU General Public License Usage
30 ** Alternatively, this file may be used under the terms of the GNU
31 ** General Public License version 3.0 as published by the Free Software
32 ** Foundation and appearing in the file LICENSE.GPL included in the
33 ** packaging of this file. Please review the following information to
34 ** ensure the GNU General Public License version 3.0 requirements will be
35 ** met: http://www.gnu.org/copyleft/gpl.html.
40 ****************************************************************************/
44 #include "qalgorithms.h"
45 #include "qbitarray.h"
47 #include "qdatastream.h"
53 #include "qstringlist.h"
54 #include "qstringmatcher.h"
56 #include "private/qfunctions_p.h"
62 int qFindString(const QChar *haystack, int haystackLen, int from,
63 const QChar *needle, int needleLen, Qt::CaseSensitivity cs);
65 // error strings for the regexp parser
66 #define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
67 #define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
68 #define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
69 #define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
70 #define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371")
71 #define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
72 #define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
73 #define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
74 #define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
75 #define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
76 #define RXERR_INTERVAL QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
77 #define RXERR_CATEGORY QT_TRANSLATE_NOOP("QRegExp", "invalid category")
83 \brief The QRegExp class provides pattern matching using regular expressions.
88 \keyword regular expression
90 A regular expression, or "regexp", is a pattern for matching
91 substrings in a text. This is useful in many contexts, e.g.,
95 \li A regexp can test whether a substring meets some criteria,
96 e.g. is an integer or contains no whitespace.
98 \li A regexp provides more powerful pattern matching than
99 simple substring matching, e.g., match one of the words
100 \e{mail}, \e{letter} or \e{correspondence}, but none of the
101 words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
102 \row \li Search and Replace
103 \li A regexp can replace all occurrences of a substring with a
104 different substring, e.g., replace all occurrences of \e{&}
105 with \e{\&} except where the \e{&} is already followed by
107 \row \li String Splitting
108 \li A regexp can be used to identify where a string should be
109 split apart, e.g. splitting tab-delimited strings.
112 A brief introduction to regexps is presented, a description of
113 Qt's regexp language, some examples, and the function
114 documentation itself. QRegExp is modeled on Perl's regexp
115 language. It fully supports Unicode. QRegExp can also be used in a
116 simpler, \e{wildcard mode} that is similar to the functionality
117 found in command shells. The syntax rules used by QRegExp can be
118 changed with setPatternSyntax(). In particular, the pattern syntax
119 can be set to QRegExp::FixedString, which means the pattern to be
120 matched is interpreted as a plain string, i.e., special characters
121 (e.g., backslash) are not escaped.
123 A good text on regexps is \e {Mastering Regular Expressions}
124 (Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4.
128 \section1 Introduction
130 Regexps are built up from expressions, quantifiers, and
131 assertions. The simplest expression is a character, e.g. \b{x}
132 or \b{5}. An expression can also be a set of characters
133 enclosed in square brackets. \b{[ABCD]} will match an \b{A}
134 or a \b{B} or a \b{C} or a \b{D}. We can write this same
135 expression as \b{[A-D]}, and an experession to match any
136 captital letter in the English alphabet is written as
139 A quantifier specifies the number of occurrences of an expression
140 that must be matched. \b{x{1,1}} means match one and only one
141 \b{x}. \b{x{1,5}} means match a sequence of \b{x}
142 characters that contains at least one \b{x} but no more than
145 Note that in general regexps cannot be used to check for balanced
146 brackets or tags. For example, a regexp can be written to match an
147 opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
148 are not nested, but if the \c{<b>} tags are nested, that same
149 regexp will match an opening \c{<b>} tag with the wrong closing
150 \c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the
151 first \c{<b>} would be matched with the first \c{</b>}, which is
152 not correct. However, it is possible to write a regexp that will
153 match nested brackets or tags correctly, but only if the number of
154 nesting levels is fixed and known. If the number of nesting levels
155 is not fixed and known, it is impossible to write a regexp that
158 Suppose we want a regexp to match integers in the range 0 to 99.
159 At least one digit is required, so we start with the expression
160 \b{[0-9]{1,1}}, which matches a single digit exactly once. This
161 regexp matches integers in the range 0 to 9. To match integers up
162 to 99, increase the maximum number of occurrences to 2, so the
163 regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the
164 original requirement to match integers from 0 to 99, but it will
165 also match integers that occur in the middle of strings. If we
166 want the matched integer to be the whole string, we must use the
167 anchor assertions, \b{^} (caret) and \b{$} (dollar). When
168 \b{^} is the first character in a regexp, it means the regexp
169 must match from the beginning of the string. When \b{$} is the
170 last character of the regexp, it means the regexp must match to
171 the end of the string. The regexp becomes \b{^[0-9]{1,2}$}.
172 Note that assertions, e.g. \b{^} and \b{$}, do not match
173 characters but locations in the string.
175 If you have seen regexps described elsewhere, they may have looked
176 different from the ones shown here. This is because some sets of
177 characters and some quantifiers are so common that they have been
178 given special symbols to represent them. \b{[0-9]} can be
179 replaced with the symbol \b{\\d}. The quantifier to match
180 exactly one occurrence, \b{{1,1}}, can be replaced with the
181 expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So
182 our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can
183 also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of
184 the string, match a digit, followed immediately by 0 or 1 digits}.
185 In practice, it would be written as \b{^\\d\\d?$}. The \b{?}
186 is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1
187 occurrences. \b{?} makes an expression optional. The regexp
188 \b{^\\d\\d?$} means \e{From the beginning of the string, match
189 one digit, followed immediately by 0 or 1 more digit, followed
190 immediately by end of string}.
192 To write a regexp that matches one of the words 'mail' \e or
193 'letter' \e or 'correspondence' but does not match words that
194 contain these words, e.g., 'email', 'mailman', 'mailer', and
195 'letterbox', start with a regexp that matches 'mail'. Expressed
196 fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because
197 a character expression is automatically quantified by
198 \b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an
199 'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now
200 we can use the vertical bar \b{|}, which means \b{or}, to
201 include the other two words, so our regexp for matching any of the
202 three words becomes \b{mail|letter|correspondence}. Match
203 'mail' \b{or} 'letter' \b{or} 'correspondence'. While this
204 regexp will match one of the three words we want to match, it will
205 also match words we don't want to match, e.g., 'email'. To
206 prevent the regexp from matching unwanted words, we must tell it
207 to begin and end the match at word boundaries. First we enclose
208 our regexp in parentheses, \b{(mail|letter|correspondence)}.
209 Parentheses group expressions together, and they identify a part
210 of the regexp that we wish to \l{capturing text}{capture}.
211 Enclosing the expression in parentheses allows us to use it as a
212 component in more complex regexps. It also allows us to examine
213 which of the three words was actually matched. To force the match
214 to begin and end on word boundaries, we enclose the regexp in
215 \b{\\b} \e{word boundary} assertions:
216 \b{\\b(mail|letter|correspondence)\\b}. Now the regexp means:
217 \e{Match a word boundary, followed by the regexp in parentheses,
218 followed by a word boundary}. The \b{\\b} assertion matches a
219 \e position in the regexp, not a \e character. A word boundary is
220 any non-word character, e.g., a space, newline, or the beginning
221 or ending of a string.
223 If we want to replace ampersand characters with the HTML entity
224 \b{\&}, the regexp to match is simply \b{\&}. But this
225 regexp will also match ampersands that have already been converted
226 to HTML entities. We want to replace only ampersands that are not
227 already followed by \b{amp;}. For this, we need the negative
228 lookahead assertion, \b{(?!}__\b{)}. The regexp can then be
229 written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is}
230 \b{not} \e{followed by} \b{amp;}.
232 If we want to count all the occurrences of 'Eric' and 'Eirik' in a
233 string, two valid solutions are \b{\\b(Eric|Eirik)\\b} and
234 \b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is
235 required to avoid matching words that contain either name,
236 e.g. 'Ericsson'. Note that the second regexp matches more
237 spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.
239 Some of the examples discussed above are implemented in the
240 \l{#code-examples}{code examples} section.
242 \target characters-and-abbreviations-for-sets-of-characters
243 \section1 Characters and Abbreviations for Sets of Characters
246 \header \li Element \li Meaning
248 \li A character represents itself unless it has a special
249 regexp meaning. e.g. \b{c} matches the character \e c.
251 \li A character that follows a backslash matches the character
252 itself, except as specified below. e.g., To match a literal
253 caret at the beginning of a string, write \b{\\^}.
255 \li Matches the ASCII bell (BEL, 0x07).
257 \li Matches the ASCII form feed (FF, 0x0C).
259 \li Matches the ASCII line feed (LF, 0x0A, Unix newline).
261 \li Matches the ASCII carriage return (CR, 0x0D).
263 \li Matches the ASCII horizontal tab (HT, 0x09).
265 \li Matches the ASCII vertical tab (VT, 0x0B).
266 \row \li \b{\\x\e{hhhh}}
267 \li Matches the Unicode character corresponding to the
268 hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).
269 \row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo})
270 \li matches the ASCII/Latin1 character for the octal number
271 \e{ooo} (between 0 and 0377).
273 \li Matches any character (including newline).
275 \li Matches a digit (QChar::isDigit()).
277 \li Matches a non-digit.
279 \li Matches a whitespace character (QChar::isSpace()).
281 \li Matches a non-whitespace character.
283 \li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').
285 \li Matches a non-word character.
287 \li The \e{n}-th \l backreference, e.g. \\1, \\2, etc.
290 \b{Note:} The C++ compiler transforms backslashes in strings.
291 To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}.
292 To match the backslash character itself, enter it four times, i.e.
295 \target sets-of-characters
296 \section1 Sets of Characters
298 Square brackets mean match any character contained in the square
299 brackets. The character set abbreviations described above can
300 appear in a character set in square brackets. Except for the
301 character set abbreviations and the following two exceptions,
302 characters do not have special meanings in square brackets.
307 \li The caret negates the character set if it occurs as the
308 first character (i.e. immediately after the opening square
309 bracket). \b{[abc]} matches 'a' or 'b' or 'c', but
310 \b{[^abc]} matches anything \e but 'a' or 'b' or 'c'.
314 \li The dash indicates a range of characters. \b{[W-Z]}
315 matches 'W' or 'X' or 'Y' or 'Z'.
319 Using the predefined character set abbreviations is more portable
320 than using character ranges across platforms and languages. For
321 example, \b{[0-9]} matches a digit in Western alphabets but
322 \b{\\d} matches a digit in \e any alphabet.
324 Note: In other regexp documentation, sets of characters are often
325 called "character classes".
328 \section1 Quantifiers
330 By default, an expression is automatically quantified by
331 \b{{1,1}}, i.e. it should occur exactly once. In the following
332 list, \b{\e {E}} stands for expression. An expression is a
333 character, or an abbreviation for a set of characters, or a set of
334 characters in square brackets, or an expression in parentheses.
339 \li Matches zero or one occurrences of \e E. This quantifier
340 means \e{The previous expression is optional}, because it
341 will match whether or not the expression is found. \b{\e
342 {E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?}
343 matches 'dent' or 'dents'.
347 \li Matches one or more occurrences of \e E. \b{\e {E}+} is
348 the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0',
353 \li Matches zero or more occurrences of \e E. It is the same
354 as \b{\e {E}{0,}}. The \b{*} quantifier is often used
355 in error where \b{+} should be used. For example, if
356 \b{\\s*$} is used in an expression to match strings that
357 end in whitespace, it will match every string because
358 \b{\\s*$} means \e{Match zero or more whitespaces followed
359 by end of string}. The correct regexp to match strings that
360 have at least one trailing whitespace character is
363 \row \li \b{\e {E}{n}}
365 \li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}}
366 is the same as repeating \e E \e n times. For example,
367 \b{x{5}} is the same as \b{xxxxx}. It is also the same
368 as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}.
370 \row \li \b{\e {E}{n,}}
371 \li Matches at least \e n occurrences of \e E.
373 \row \li \b{\e {E}{,m}}
374 \li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}}
375 is the same as \b{\e {E}{0,m}}.
377 \row \li \b{\e {E}{n,m}}
378 \li Matches at least \e n and at most \e m occurrences of \e E.
381 To apply a quantifier to more than just the preceding character,
382 use parentheses to group characters together in an expression. For
383 example, \b{tag+} matches a 't' followed by an 'a' followed by
384 at least one 'g', whereas \b{(tag)+} matches at least one
387 Note: Quantifiers are normally "greedy". They always match as much
388 text as they can. For example, \b{0+} matches the first zero it
389 finds and all the consecutive zeros after the first zero. Applied
390 to '20005', it matches'2\underline{000}5'. Quantifiers can be made
391 non-greedy, see setMinimal().
393 \target capturing parentheses
394 \target backreferences
395 \section1 Capturing Text
397 Parentheses allow us to group elements together so that we can
398 quantify and capture them. For example if we have the expression
399 \b{mail|letter|correspondence} that matches a string we know
400 that \e one of the words matched but not which one. Using
401 parentheses allows us to "capture" whatever is matched within
402 their bounds, so if we used \b{(mail|letter|correspondence)}
403 and matched this regexp against the string "I sent you some email"
404 we can use the cap() or capturedTexts() functions to extract the
405 matched characters, in this case 'mail'.
407 We can use captured text within the regexp itself. To refer to the
408 captured text we use \e backreferences which are indexed from 1,
409 the same as for cap(). For example we could search for duplicate
410 words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a
411 word boundary followed by one or more word characters followed by
412 one or more non-word characters followed by the same text as the
413 first parenthesized expression followed by a word boundary.
415 If we want to use parentheses purely for grouping and not for
416 capturing we can use the non-capturing syntax, e.g.
417 \b{(?:green|blue)}. Non-capturing parentheses begin '(?:' and
418 end ')'. In this example we match either 'green' or 'blue' but we
419 do not capture the match so we only know whether or not we matched
420 but not which color we actually found. Using non-capturing
421 parentheses is more efficient than using capturing parentheses
422 since the regexp engine has to do less book-keeping.
424 Both capturing and non-capturing parentheses may be nested.
426 \target greedy quantifiers
428 For historical reasons, quantifiers (e.g. \b{*}) that apply to
429 capturing parentheses are more "greedy" than other quantifiers.
430 For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa".
431 This behavior is different from what other regexp engines do
432 (notably, Perl). To obtain a more intuitive capturing behavior,
433 specify QRegExp::RegExp2 to the QRegExp constructor or call
434 setPatternSyntax(QRegExp::RegExp2).
436 \target cap_in_a_loop
438 When the number of matches cannot be determined in advance, a
439 common idiom is to use cap() in a loop. For example:
441 \snippet code/src_corelib_tools_qregexp.cpp 0
446 Assertions make some statement about the text at the point where
447 they occur in the regexp but they do not match any characters. In
448 the following list \b{\e {E}} stands for any expression.
452 \li The caret signifies the beginning of the string. If you
453 wish to match a literal \c{^} you must escape it by
454 writing \c{\\^}. For example, \b{^#include} will only
455 match strings which \e begin with the characters '#include'.
456 (When the caret is the first character of a character set it
457 has a special meaning, see \l{#sets-of-characters}{Sets of Characters}.)
460 \li The dollar signifies the end of the string. For example
461 \b{\\d\\s*$} will match strings which end with a digit
462 optionally followed by whitespace. If you wish to match a
463 literal \c{$} you must escape it by writing
467 \li A word boundary. For example the regexp
468 \b{\\bOK\\b} means match immediately after a word
469 boundary (e.g. start of string or whitespace) the letter 'O'
470 then the letter 'K' immediately before another word boundary
471 (e.g. end of string or whitespace). But note that the
472 assertion does not actually match any whitespace so if we
473 write \b{(\\bOK\\b)} and we have a match it will only
474 contain 'OK' even if the string is "It's \underline{OK} now".
477 \li A non-word boundary. This assertion is true wherever
478 \b{\\b} is false. For example if we searched for
479 \b{\\Bon\\B} in "Left on" the match would fail (space
480 and end of string aren't non-word boundaries), but it would
481 match in "t\underline{on}ne".
483 \row \li \b{(?=\e E)}
484 \li Positive lookahead. This assertion is true if the
485 expression matches at this point in the regexp. For example,
486 \b{const(?=\\s+char)} matches 'const' whenever it is
487 followed by 'char', as in 'static \underline{const} char *'.
488 (Compare with \b{const\\s+char}, which matches 'static
489 \underline{const char} *'.)
491 \row \li \b{(?!\e E)}
492 \li Negative lookahead. This assertion is true if the
493 expression does not match at this point in the regexp. For
494 example, \b{const(?!\\s+char)} matches 'const' \e except
495 when it is followed by 'char'.
498 \keyword QRegExp wildcard matching
499 \section1 Wildcard Matching
501 Most command shells such as \e bash or \e cmd.exe support "file
502 globbing", the ability to identify a group of files by using
503 wildcards. The setPatternSyntax() function is used to switch
504 between regexp and wildcard mode. Wildcard matching is much
505 simpler than full regexps and has only four features:
509 \li Any character represents itself apart from those mentioned
510 below. Thus \b{c} matches the character \e c.
512 \li Matches any single character. It is the same as
513 \b{.} in full regexps.
515 \li Matches zero or more of any characters. It is the
516 same as \b{.*} in full regexps.
518 \li Sets of characters can be represented in square brackets,
519 similar to full regexps. Within the character class, like
520 outside, backslash has no special meaning.
523 In the mode Wildcard, the wildcard characters cannot be
524 escaped. In the mode WildcardUnix, the character '\\' escapes the
527 For example if we are in wildcard mode and have strings which
528 contain filenames we could identify HTML files with \b{*.html}.
529 This will match zero or more characters followed by a dot followed
530 by 'h', 't', 'm' and 'l'.
532 To test a string against a wildcard expression, use exactMatch().
535 \snippet code/src_corelib_tools_qregexp.cpp 1
538 \section1 Notes for Perl Users
540 Most of the character class abbreviations supported by Perl are
541 supported by QRegExp, see \l{#characters-and-abbreviations-for-sets-of-characters}
542 {characters and abbreviations for sets of characters}.
544 In QRegExp, apart from within character classes, \c{^} always
545 signifies the start of the string, so carets must always be
546 escaped unless used for that purpose. In Perl the meaning of caret
547 varies automagically depending on where it occurs so escaping it
548 is rarely necessary. The same applies to \c{$} which in
549 QRegExp always signifies the end of the string.
551 QRegExp's quantifiers are the same as Perl's greedy quantifiers
552 (but see the \l{greedy quantifiers}{note above}). Non-greedy
553 matching cannot be applied to individual quantifiers, but can be
554 applied to all the quantifiers in the pattern. For example, to
555 match the Perl regexp \b{ro+?m} requires:
557 \snippet code/src_corelib_tools_qregexp.cpp 2
559 The equivalent of Perl's \c{/i} option is
560 setCaseSensitivity(Qt::CaseInsensitive).
562 Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.
564 In QRegExp \b{.} matches any character, therefore all QRegExp
565 regexps have the equivalent of Perl's \c{/s} option. QRegExp
566 does not have an equivalent to Perl's \c{/m} option, but this
567 can be emulated in various ways for example by splitting the input
568 into lines or by looping with a regexp that searches for newlines.
570 Because QRegExp is string oriented, there are no \\A, \\Z, or \\z
571 assertions. The \\G assertion is not supported but can be emulated
574 Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
575 equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
576 ... correspond to cap(1) or capturedTexts()[1], cap(2) or
577 capturedTexts()[2], etc.
579 To substitute a pattern use QString::replace().
581 Perl's extended \c{/x} syntax is not supported, nor are
582 directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
583 the other hand, C++'s rules for literal strings can be used to
586 \snippet code/src_corelib_tools_qregexp.cpp 3
588 Both zero-width positive and zero-width negative lookahead
589 assertions (?=pattern) and (?!pattern) are supported with the same
590 syntax as Perl. Perl's lookbehind assertions, "independent"
591 subexpressions and conditional expressions are not supported.
593 Non-capturing parentheses are also supported, with the same
596 See QString::split() and QStringList::join() for equivalents
597 to Perl's split and join functions.
599 Note: because C++ transforms \\'s they must be written \e twice in
600 code, e.g. \b{\\b} must be written \b{\\\\b}.
602 \target code-examples
603 \section1 Code Examples
605 \snippet code/src_corelib_tools_qregexp.cpp 4
607 The third string matches '\underline{6}'. This is a simple validation
608 regexp for integers in the range 0 to 99.
610 \snippet code/src_corelib_tools_qregexp.cpp 5
612 The second string matches '\underline{This_is-OK}'. We've used the
613 character set abbreviation '\\S' (non-whitespace) and the anchors
614 to match strings which contain no whitespace.
616 In the following example we match strings containing 'mail' or
617 'letter' or 'correspondence' but only match whole words i.e. not
620 \snippet code/src_corelib_tools_qregexp.cpp 6
622 The second string matches "Please write the \underline{letter}". The
623 word 'letter' is also captured (because of the parentheses). We
624 can see what text we've captured like this:
626 \snippet code/src_corelib_tools_qregexp.cpp 7
628 This will capture the text from the first set of capturing
629 parentheses (counting capturing left parentheses from left to
630 right). The parentheses are counted from 1 since cap(0) is the
631 whole matched regexp (equivalent to '&' in most regexp engines).
633 \snippet code/src_corelib_tools_qregexp.cpp 8
635 Here we've passed the QRegExp to QString's replace() function to
636 replace the matched text with new text.
638 \snippet code/src_corelib_tools_qregexp.cpp 9
640 We've used the indexIn() function to repeatedly match the regexp in
641 the string. Note that instead of moving forward by one character
642 at a time \c pos++ we could have written \c {pos +=
643 rx.matchedLength()} to skip over the already matched string. The
644 count will equal 3, matching 'One \underline{Eric} another
645 \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it
646 doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
647 by non-word boundaries.
649 One common use of regexps is to split lines of delimited data into
650 their component fields.
652 \snippet code/src_corelib_tools_qregexp.cpp 10
654 In this example our input lines have the format company name, web
655 address and country. Unfortunately the regexp is rather long and
656 not very versatile -- the code will break if we add any more
657 fields. A simpler and better solution is to look for the
658 separator, '\\t' in this case, and take the surrounding text. The
659 QString::split() function can take a separator string or regexp
660 as an argument and split a string accordingly.
662 \snippet code/src_corelib_tools_qregexp.cpp 11
664 Here field[0] is the company, field[1] the web address and so on.
666 To imitate the matching of a shell we can use wildcard mode.
668 \snippet code/src_corelib_tools_qregexp.cpp 12
670 Wildcard matching can be convenient because of its simplicity, but
671 any wildcard regexp can be defined using full regexps, e.g.
672 \b{.*\\.html$}. Notice that we can't match both \c .html and \c
673 .htm files with a wildcard unless we use \b{*.htm*} which will
674 also match 'test.html.bak'. A full regexp gives us the precision
675 we need, \b{.*\\.html?$}.
677 QRegExp can match case insensitively using setCaseSensitivity(),
678 and can use non-greedy matching, see setMinimal(). By
679 default QRegExp uses full regexps but this can be changed with
680 setWildcard(). Searching can be forward with indexIn() or backward
681 with lastIndexIn(). Captured text can be accessed using
682 capturedTexts() which returns a string list of all captured
683 strings, or using cap() which returns the captured string for the
684 given index. The pos() function takes a match index and returns
685 the position in the string where the match was made (or -1 if
688 \sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel,
689 {tools/regexp}{Regular Expression Example}
692 #if defined(Q_OS_VXWORKS) && defined(EOS)
696 const int NumBadChars = 64;
697 #define BadChar(ch) ((ch).unicode() % NumBadChars)
699 const int NoOccurrence = INT_MAX;
700 const int EmptyCapture = INT_MAX;
701 const int InftyLen = INT_MAX;
702 const int InftyRep = 1025;
705 static bool isWord(QChar ch)
707 return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_');
711 Merges two vectors of ints and puts the result into the first
714 static void mergeInto(QVector<int> *a, const QVector<int> &b)
716 int asize = a->size();
717 int bsize = b.size();
720 #ifndef QT_NO_REGEXP_OPTIM
721 } else if (bsize == 1 && a->at(asize - 1) < b.at(0)) {
722 a->resize(asize + 1);
723 (*a)[asize] = b.at(0);
725 } else if (bsize >= 1) {
726 int csize = asize + bsize;
727 QVector<int> c(csize);
728 int i = 0, j = 0, k = 0;
731 if (a->at(i) == b.at(j)) {
734 } else if (a->at(i) < b.at(j)) {
740 memcpy(c.data() + k, a->constData() + i, (asize - i) * sizeof(int));
746 memcpy(c.data() + k, b.constData() + j, (bsize - j) * sizeof(int));
751 #ifndef QT_NO_REGEXP_WILDCARD
753 Translates a wildcard pattern to an equivalent regular expression
754 pattern (e.g., *.cpp to .*\.cpp).
756 If enableEscaping is true, it is possible to escape the wildcard
759 static QString wc2rx(const QString &wc_str, const bool enableEscaping)
761 const int wclen = wc_str.length();
764 bool isEscaping = false; // the previous character is '\'
765 const QChar *wc = wc_str.unicode();
768 const QChar c = wc[i++];
769 switch (c.unicode()) {
771 if (enableEscaping) {
773 rx += QLatin1String("\\\\");
774 } // we insert the \\ later if necessary
775 if (i == wclen) { // the end
776 rx += QLatin1String("\\\\");
779 rx += QLatin1String("\\\\");
785 rx += QLatin1String("\\*");
788 rx += QLatin1String(".*");
793 rx += QLatin1String("\\?");
796 rx += QLatin1Char('.');
811 rx += QLatin1String("\\\\");
813 rx += QLatin1Char('\\');
819 rx += QLatin1String("\\[");
822 if (wc[i] == QLatin1Char('^'))
825 if (rx[i] == QLatin1Char(']'))
827 while (i < wclen && wc[i] != QLatin1Char(']')) {
828 if (wc[i] == QLatin1Char('\\'))
829 rx += QLatin1Char('\\');
839 rx += QLatin1String("\\");
847 rx += QLatin1String("\\\\");
856 static int caretIndex(int offset, QRegExp::CaretMode caretMode)
858 if (caretMode == QRegExp::CaretAtZero) {
860 } else if (caretMode == QRegExp::CaretAtOffset) {
862 } else { // QRegExp::CaretWontMatch
868 The QRegExpEngineKey struct uniquely identifies an engine.
870 struct QRegExpEngineKey
873 QRegExp::PatternSyntax patternSyntax;
874 Qt::CaseSensitivity cs;
876 inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,
877 Qt::CaseSensitivity cs)
878 : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {}
880 inline void clear() {
882 patternSyntax = QRegExp::RegExp;
883 cs = Qt::CaseSensitive;
887 Q_STATIC_GLOBAL_OPERATOR bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)
889 return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax
890 && key1.cs == key2.cs;
895 //Q_DECLARE_TYPEINFO(QVector<int>, Q_MOVABLE_TYPE);
898 This is the engine state during matching.
900 struct QRegExpMatchState
902 const QChar *in; // a pointer to the input string data
903 int pos; // the current position in the string
905 int len; // the length of the input string
906 bool minimal; // minimal matching?
907 int *bigArray; // big array holding the data for the next pointers
908 int *inNextStack; // is state is nextStack?
909 int *curStack; // stack of current states
910 int *nextStack; // stack of next states
911 int *curCapBegin; // start of current states' captures
912 int *nextCapBegin; // start of next states' captures
913 int *curCapEnd; // end of current states' captures
914 int *nextCapEnd; // end of next states' captures
915 int *tempCapBegin; // start of temporary captures
916 int *tempCapEnd; // end of temporary captures
917 int *capBegin; // start of captures for a next state
918 int *capEnd; // end of captures for a next state
919 int *slideTab; // bump-along slide table for bad-character heuristic
920 int *captured; // what match() returned last
921 int slideTabSize; // size of slide table
923 #ifndef QT_NO_REGEXP_BACKREF
924 QList<QVector<int> > sleeping; // list of back-reference sleepers
926 int matchLen; // length of match
927 int oneTestMatchedLen; // length of partial match
929 const QRegExpEngine *eng;
931 inline QRegExpMatchState() : bigArray(0), captured(0) {}
932 inline ~QRegExpMatchState() { free(bigArray); }
934 void drain() { free(bigArray); bigArray = 0; captured = 0; } // to save memory
935 void prepareForMatch(QRegExpEngine *eng);
936 void match(const QChar *str, int len, int pos, bool minimal,
937 bool oneTest, int caretIndex);
939 bool testAnchor(int i, int a, const int *capBegin);
943 The struct QRegExpAutomatonState represents one state in a modified NFA. The
944 input characters matched are stored in the state instead of on
945 the transitions, something possible for an automaton
946 constructed from a regular expression.
948 struct QRegExpAutomatonState
950 #ifndef QT_NO_REGEXP_CAPTURE
951 int atom; // which atom does this state belong to?
953 int match; // what does it match? (see CharClassBit and BackRefBit)
954 QVector<int> outs; // out-transitions
955 QMap<int, int> reenter; // atoms reentered when transiting out
956 QMap<int, int> anchors; // anchors met when transiting out
958 inline QRegExpAutomatonState() { }
959 #ifndef QT_NO_REGEXP_CAPTURE
960 inline QRegExpAutomatonState(int a, int m)
961 : atom(a), match(m) { }
963 inline QRegExpAutomatonState(int m)
968 Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE);
971 The struct QRegExpCharClassRange represents a range of characters (e.g.,
972 [0-9] denotes range 48 to 57).
974 struct QRegExpCharClassRange
980 Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
982 #ifndef QT_NO_REGEXP_CAPTURE
984 The struct QRegExpAtom represents one node in the hierarchy of regular
989 enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 };
991 int parent; // index of parent in array of atoms
992 int capture; // index of capture, from 1 to ncap - 1
995 Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
998 struct QRegExpLookahead;
1000 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1002 The struct QRegExpAnchorAlternation represents a pair of anchors with
1005 struct QRegExpAnchorAlternation
1007 int a; // this anchor...
1008 int b; // ...or this one
1011 Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
1014 #ifndef QT_NO_REGEXP_CCLASS
1016 #define FLAG(x) (1 << (x))
1018 The class QRegExpCharClass represents a set of characters, such as can
1019 be found in regular expressions (e.g., [a-z] denotes the set
1022 class QRegExpCharClass
1026 inline QRegExpCharClass(const QRegExpCharClass &cc) { operator=(cc); }
1028 QRegExpCharClass &operator=(const QRegExpCharClass &cc);
1031 bool negative() const { return n; }
1032 void setNegative(bool negative);
1033 void addCategories(uint cats);
1034 void addRange(ushort from, ushort to);
1035 void addSingleton(ushort ch) { addRange(ch, ch); }
1037 bool in(QChar ch) const;
1038 #ifndef QT_NO_REGEXP_OPTIM
1039 const QVector<int> &firstOccurrence() const { return occ1; }
1042 #if defined(QT_DEBUG)
1047 uint c; // character classes
1048 QVector<QRegExpCharClassRange> r; // character ranges
1049 bool n; // negative?
1050 #ifndef QT_NO_REGEXP_OPTIM
1051 QVector<int> occ1; // first-occurrence array
1055 struct QRegExpCharClass
1059 #ifndef QT_NO_REGEXP_OPTIM
1060 QRegExpCharClass() { occ1.fill(0, NumBadChars); }
1062 const QVector<int> &firstOccurrence() const { return occ1; }
1068 Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE);
1071 The QRegExpEngine class encapsulates a modified nondeterministic
1072 finite automaton (NFA).
1077 QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)
1078 : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }
1080 QRegExpEngine(const QRegExpEngineKey &key);
1083 bool isValid() const { return valid; }
1084 const QString &errorString() const { return yyError; }
1085 int captureCount() const { return officialncap; }
1087 int createState(QChar ch);
1088 int createState(const QRegExpCharClass &cc);
1089 #ifndef QT_NO_REGEXP_BACKREF
1090 int createState(int bref);
1093 void addCatTransitions(const QVector<int> &from, const QVector<int> &to);
1094 #ifndef QT_NO_REGEXP_CAPTURE
1095 void addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom);
1098 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1099 int anchorAlternation(int a, int b);
1100 int anchorConcatenation(int a, int b);
1102 int anchorAlternation(int a, int b) { return a & b; }
1103 int anchorConcatenation(int a, int b) { return a | b; }
1105 void addAnchors(int from, int to, int a);
1107 #ifndef QT_NO_REGEXP_OPTIM
1108 void heuristicallyChooseHeuristic();
1111 #if defined(QT_DEBUG)
1118 enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
1119 enum { InitialState = 0, FinalState = 1 };
1122 int setupState(int match);
1125 Let's hope that 13 lookaheads and 14 back-references are
1128 enum { MaxLookaheads = 13, MaxBackRefs = 14 };
1129 enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,
1130 Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,
1131 Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1132 Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
1133 Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,
1135 Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^
1136 ((Anchor_FirstLookahead << MaxLookaheads) - 1) };
1137 #ifndef QT_NO_REGEXP_CAPTURE
1138 int startAtom(bool officialCapture);
1139 void finishAtom(int atom, bool needCapture);
1142 #ifndef QT_NO_REGEXP_LOOKAHEAD
1143 int addLookahead(QRegExpEngine *eng, bool negative);
1146 #ifndef QT_NO_REGEXP_OPTIM
1147 bool goodStringMatch(QRegExpMatchState &matchState) const;
1148 bool badCharMatch(QRegExpMatchState &matchState) const;
1150 bool bruteMatch(QRegExpMatchState &matchState) const;
1153 QVector<QRegExpAutomatonState> s; // array of states
1154 #ifndef QT_NO_REGEXP_CAPTURE
1155 QVector<QRegExpAtom> f; // atom hierarchy
1156 int nf; // number of atoms
1157 int cf; // current atom
1158 QVector<int> captureForOfficialCapture;
1160 int officialncap; // number of captures, seen from the outside
1161 int ncap; // number of captures, seen from the inside
1162 #ifndef QT_NO_REGEXP_CCLASS
1163 QVector<QRegExpCharClass> cl; // array of character classes
1165 #ifndef QT_NO_REGEXP_LOOKAHEAD
1166 QVector<QRegExpLookahead *> ahead; // array of lookaheads
1168 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1169 QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
1171 #ifndef QT_NO_REGEXP_OPTIM
1172 bool caretAnchored; // does the regexp start with ^?
1173 bool trivial; // is the good-string all that needs to match?
1175 bool valid; // is the regular expression valid?
1176 Qt::CaseSensitivity cs; // case sensitive?
1177 bool greedyQuantifiers; // RegExp2?
1178 bool xmlSchemaExtensions;
1179 #ifndef QT_NO_REGEXP_BACKREF
1180 int nbrefs; // number of back-references
1183 #ifndef QT_NO_REGEXP_OPTIM
1184 bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1186 int goodEarlyStart; // the index where goodStr can first occur in a match
1187 int goodLateStart; // the index where goodStr can last occur in a match
1188 QString goodStr; // the string that any match has to contain
1190 int minl; // the minimum length of a match
1191 QVector<int> occ1; // first-occurrence array
1195 The class Box is an abstraction for a regular expression
1196 fragment. It can also be seen as one node in the syntax tree of
1197 a regular expression with synthetized attributes.
1199 Its interface is ugly for performance reasons.
1204 Box(QRegExpEngine *engine);
1205 Box(const Box &b) { operator=(b); }
1207 Box &operator=(const Box &b);
1209 void clear() { operator=(Box(eng)); }
1211 void set(const QRegExpCharClass &cc);
1212 #ifndef QT_NO_REGEXP_BACKREF
1216 void cat(const Box &b);
1217 void orx(const Box &b);
1218 void plus(int atom);
1220 void catAnchor(int a);
1221 #ifndef QT_NO_REGEXP_OPTIM
1222 void setupHeuristics();
1225 #if defined(QT_DEBUG)
1230 void addAnchorsToEngine(const Box &to) const;
1232 QRegExpEngine *eng; // the automaton under construction
1233 QVector<int> ls; // the left states (firstpos)
1234 QVector<int> rs; // the right states (lastpos)
1235 QMap<int, int> lanchors; // the left anchors
1236 QMap<int, int> ranchors; // the right anchors
1237 int skipanchors; // the anchors to match if the box is skipped
1239 #ifndef QT_NO_REGEXP_OPTIM
1240 int earlyStart; // the index where str can first occur
1241 int lateStart; // the index where str can last occur
1242 QString str; // a string that has to occur in any match
1243 QString leftStr; // a string occurring at the left of this box
1244 QString rightStr; // a string occurring at the right of this box
1245 int maxl; // the maximum length of this box (possibly InftyLen)
1248 int minl; // the minimum length of this box
1249 #ifndef QT_NO_REGEXP_OPTIM
1250 QVector<int> occ1; // first-occurrence array
1257 This is the lexical analyzer for regular expressions.
1259 enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
1260 Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
1261 Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
1264 #ifndef QT_NO_REGEXP_INTERVAL
1265 int getRep(int def);
1267 #ifndef QT_NO_REGEXP_LOOKAHEAD
1268 void skipChars(int n);
1270 void error(const char *msg);
1271 void startTokenizer(const QChar *rx, int len);
1274 const QChar *yyIn; // a pointer to the input regular expression pattern
1275 int yyPos0; // the position of yyTok in the input pattern
1276 int yyPos; // the position of the next character to read
1277 int yyLen; // the length of yyIn
1278 int yyCh; // the last character read
1279 QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens
1280 int yyMinRep; // attribute for Tok_Quantifier
1281 int yyMaxRep; // ditto
1282 QString yyError; // syntax error or overflow during parsing?
1285 This is the syntactic analyzer for regular expressions.
1287 int parse(const QChar *rx, int len);
1288 void parseAtom(Box *box);
1289 void parseFactor(Box *box);
1290 void parseTerm(Box *box);
1291 void parseExpression(Box *box);
1293 int yyTok; // the last token read
1294 bool yyMayCapture; // set this to false to disable capturing
1296 friend struct QRegExpMatchState;
1299 #ifndef QT_NO_REGEXP_LOOKAHEAD
1301 The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
1302 (?=foo) and (?!bar)).
1304 struct QRegExpLookahead
1306 QRegExpEngine *eng; // NFA representing the embedded regular expression
1307 bool neg; // negative lookahead?
1309 inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)
1310 : eng(eng0), neg(neg0) { }
1311 inline ~QRegExpLookahead() { delete eng; }
1317 convert the pattern string to the RegExp syntax.
1319 This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan
1321 Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax)
1323 switch (patternSyntax) {
1324 #ifndef QT_NO_REGEXP_WILDCARD
1325 case QRegExp::Wildcard:
1326 return wc2rx(pattern, false);
1327 case QRegExp::WildcardUnix:
1328 return wc2rx(pattern, true);
1330 case QRegExp::FixedString:
1331 return QRegExp::escape(pattern);
1332 case QRegExp::W3CXmlSchema11:
1338 QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
1339 : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),
1340 xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)
1344 QString rx = qt_regexp_toCanonical(key.pattern, key.patternSyntax);
1346 valid = (parse(rx.unicode(), rx.length()) == rx.length());
1348 #ifndef QT_NO_REGEXP_OPTIM
1351 error(RXERR_LEFTDELIM);
1355 QRegExpEngine::~QRegExpEngine()
1357 #ifndef QT_NO_REGEXP_LOOKAHEAD
1362 void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
1365 We use one QVector<int> for all the big data used a lot in
1366 matchHere() and friends.
1368 int ns = eng->s.size(); // number of states
1369 int ncap = eng->ncap;
1370 #ifndef QT_NO_REGEXP_OPTIM
1371 int newSlideTabSize = qMax(eng->minl + 1, 16);
1373 int newSlideTabSize = 0;
1375 int numCaptures = eng->captureCount();
1376 int newCapturedSize = 2 + 2 * numCaptures;
1377 bigArray = q_check_ptr((int *)realloc(bigArray, ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));
1379 // set all internal variables only _after_ bigArray is realloc'ed
1380 // to prevent a broken regexp in oom case
1382 slideTabSize = newSlideTabSize;
1383 capturedSize = newCapturedSize;
1384 inNextStack = bigArray;
1385 memset(inNextStack, -1, ns * sizeof(int));
1386 curStack = inNextStack + ns;
1387 nextStack = inNextStack + 2 * ns;
1389 curCapBegin = inNextStack + 3 * ns;
1390 nextCapBegin = curCapBegin + ncap * ns;
1391 curCapEnd = curCapBegin + 2 * ncap * ns;
1392 nextCapEnd = curCapBegin + 3 * ncap * ns;
1394 tempCapBegin = curCapBegin + 4 * ncap * ns;
1395 tempCapEnd = tempCapBegin + ncap;
1396 capBegin = tempCapBegin + 2 * ncap;
1397 capEnd = tempCapBegin + 3 * ncap;
1399 slideTab = tempCapBegin + 4 * ncap;
1400 captured = slideTab + slideTabSize;
1401 memset(captured, -1, capturedSize*sizeof(int));
1406 Tries to match in str and returns an array of (begin, length) pairs
1407 for captured text. If there is no match, all pairs are (-1, -1).
1409 void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,
1410 bool minimal0, bool oneTest, int caretIndex)
1412 bool matched = false;
1415 #ifndef QT_NO_REGEXP_OPTIM
1416 if (eng->trivial && !oneTest) {
1417 pos = qFindString(str0, len0, pos0, eng->goodStr.unicode(), eng->goodStr.length(), eng->cs);
1418 matchLen = eng->goodStr.length();
1419 matched = (pos != -1);
1427 caretPos = caretIndex;
1431 oneTestMatchedLen = 0;
1433 if (eng->valid && pos >= 0 && pos <= len) {
1434 #ifndef QT_NO_REGEXP_OPTIM
1436 matched = matchHere();
1438 if (pos <= len - eng->minl) {
1439 if (eng->caretAnchored) {
1440 matched = matchHere();
1441 } else if (eng->useGoodStringHeuristic) {
1442 matched = eng->goodStringMatch(*this);
1444 matched = eng->badCharMatch(*this);
1449 matched = oneTest ? matchHere() : eng->bruteMatch(*this);
1459 int numCaptures = (capturedSize - 2) >> 1;
1460 #ifndef QT_NO_REGEXP_CAPTURE
1461 for (int i = 0; i < numCaptures; ++i) {
1462 int j = eng->captureForOfficialCapture.at(i);
1463 if (capBegin[j] != EmptyCapture) {
1464 int len = capEnd[j] - capBegin[j];
1465 *c++ = (len > 0) ? pos + capBegin[j] : 0;
1474 // we rely on 2's complement here
1475 memset(captured, -1, capturedSize * sizeof(int));
1480 The three following functions add one state to the automaton and
1481 return the number of the state.
1484 int QRegExpEngine::createState(QChar ch)
1486 return setupState(ch.unicode());
1489 int QRegExpEngine::createState(const QRegExpCharClass &cc)
1491 #ifndef QT_NO_REGEXP_CCLASS
1493 cl += QRegExpCharClass(cc);
1494 return setupState(CharClassBit | n);
1497 return setupState(CharClassBit);
1501 #ifndef QT_NO_REGEXP_BACKREF
1502 int QRegExpEngine::createState(int bref)
1504 if (bref > nbrefs) {
1506 if (nbrefs > MaxBackRefs) {
1511 return setupState(BackRefBit | bref);
1516 The two following functions add a transition between all pairs of
1517 states (i, j) where i is found in from, and j is found in to.
1519 Cat-transitions are distinguished from plus-transitions for
1523 void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to)
1525 for (int i = 0; i < from.size(); i++)
1526 mergeInto(&s[from.at(i)].outs, to);
1529 #ifndef QT_NO_REGEXP_CAPTURE
1530 void QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom)
1532 for (int i = 0; i < from.size(); i++) {
1533 QRegExpAutomatonState &st = s[from.at(i)];
1534 const QVector<int> oldOuts = st.outs;
1535 mergeInto(&st.outs, to);
1536 if (f.at(atom).capture != QRegExpAtom::NoCapture) {
1537 for (int j = 0; j < to.size(); j++) {
1538 // ### st.reenter.contains(to.at(j)) check looks suspicious
1539 if (!st.reenter.contains(to.at(j)) &&
1540 qBinaryFind(oldOuts.constBegin(), oldOuts.constEnd(), to.at(j)) == oldOuts.end())
1541 st.reenter.insert(to.at(j), atom);
1548 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1550 Returns an anchor that means a OR b.
1552 int QRegExpEngine::anchorAlternation(int a, int b)
1554 if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0)
1558 #ifndef QT_NO_REGEXP_OPTIM
1559 if (n > 0 && aa.at(n - 1).a == a && aa.at(n - 1).b == b)
1560 return Anchor_Alternation | (n - 1);
1563 QRegExpAnchorAlternation element = {a, b};
1565 return Anchor_Alternation | n;
1569 Returns an anchor that means a AND b.
1571 int QRegExpEngine::anchorConcatenation(int a, int b)
1573 if (((a | b) & Anchor_Alternation) == 0)
1575 if ((b & Anchor_Alternation) != 0)
1578 int aprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).a, b);
1579 int bprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).b, b);
1580 return anchorAlternation(aprime, bprime);
1585 Adds anchor a on a transition caracterised by its from state and
1588 void QRegExpEngine::addAnchors(int from, int to, int a)
1590 QRegExpAutomatonState &st = s[from];
1591 if (st.anchors.contains(to))
1592 a = anchorAlternation(st.anchors.value(to), a);
1593 st.anchors.insert(to, a);
1596 #ifndef QT_NO_REGEXP_OPTIM
1598 This function chooses between the good-string and the bad-character
1599 heuristics. It computes two scores and chooses the heuristic with
1602 Here are some common-sense constraints on the scores that should be
1603 respected if the formulas are ever modified: (1) If goodStr is
1604 empty, the good-string heuristic scores 0. (2) If the regular
1605 expression is trivial, the good-string heuristic should be used.
1606 (3) If the search is case insensitive, the good-string heuristic
1607 should be used, unless it scores 0. (Case insensitivity turns all
1608 entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1609 big, the good-string heuristic should score less.
1611 void QRegExpEngine::heuristicallyChooseHeuristic()
1614 useGoodStringHeuristic = false;
1615 } else if (trivial) {
1616 useGoodStringHeuristic = true;
1619 Magic formula: The good string has to constitute a good
1620 proportion of the minimum-length string, and appear at a
1621 more-or-less known index.
1623 int goodStringScore = (64 * goodStr.length() / minl) -
1624 (goodLateStart - goodEarlyStart);
1626 Less magic formula: We pick some characters at random, and
1627 check whether they are good or bad.
1629 int badCharScore = 0;
1630 int step = qMax(1, NumBadChars / 32);
1631 for (int i = 1; i < NumBadChars; i += step) {
1632 if (occ1.at(i) == NoOccurrence)
1633 badCharScore += minl;
1635 badCharScore += occ1.at(i);
1637 badCharScore /= minl;
1638 useGoodStringHeuristic = (goodStringScore > badCharScore);
1643 #if defined(QT_DEBUG)
1644 void QRegExpEngine::dump() const
1647 qDebug("Case %ssensitive engine", cs ? "" : "in");
1649 for (i = 0; i < s.size(); i++) {
1650 qDebug(" %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
1651 #ifndef QT_NO_REGEXP_CAPTURE
1653 qDebug(" in atom %d", s[i].atom);
1656 if ((m & CharClassBit) != 0) {
1657 qDebug(" match character class %d", m ^ CharClassBit);
1658 #ifndef QT_NO_REGEXP_CCLASS
1659 cl[m ^ CharClassBit].dump();
1661 qDebug(" negative character class");
1663 } else if ((m & BackRefBit) != 0) {
1664 qDebug(" match back-reference %d", m ^ BackRefBit);
1665 } else if (m >= 0x20 && m <= 0x7e) {
1666 qDebug(" match 0x%.4x (%c)", m, m);
1668 qDebug(" match 0x%.4x", m);
1670 for (j = 0; j < s[i].outs.size(); j++) {
1671 int next = s[i].outs[j];
1672 qDebug(" -> %d", next);
1673 if (s[i].reenter.contains(next))
1674 qDebug(" [reenter %d]", s[i].reenter[next]);
1675 if (s[i].anchors.value(next) != 0)
1676 qDebug(" [anchors 0x%.8x]", s[i].anchors[next]);
1679 #ifndef QT_NO_REGEXP_CAPTURE
1681 qDebug(" Atom Parent Capture");
1682 for (i = 0; i < nf; i++) {
1683 if (f[i].capture == QRegExpAtom::NoCapture) {
1684 qDebug(" %6d %6d nil", i, f[i].parent);
1686 int cap = f[i].capture;
1687 bool official = captureForOfficialCapture.contains(cap);
1688 qDebug(" %6d %6d %6d %s", i, f[i].parent, f[i].capture,
1689 official ? "official" : "");
1694 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1695 for (i = 0; i < aa.size(); i++)
1696 qDebug(" Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b);
1701 void QRegExpEngine::setup()
1704 #ifndef QT_NO_REGEXP_CAPTURE
1711 #ifndef QT_NO_REGEXP_OPTIM
1712 caretAnchored = true;
1716 #ifndef QT_NO_REGEXP_BACKREF
1719 #ifndef QT_NO_REGEXP_OPTIM
1720 useGoodStringHeuristic = true;
1722 occ1.fill(0, NumBadChars);
1726 int QRegExpEngine::setupState(int match)
1728 #ifndef QT_NO_REGEXP_CAPTURE
1729 s += QRegExpAutomatonState(cf, match);
1731 s += QRegExpAutomatonState(match);
1733 return s.size() - 1;
1736 #ifndef QT_NO_REGEXP_CAPTURE
1738 Functions startAtom() and finishAtom() should be called to delimit
1739 atoms. When a state is created, it is assigned to the current atom.
1740 The information is later used for capturing.
1742 int QRegExpEngine::startAtom(bool officialCapture)
1744 if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size())
1745 f.resize((nf + 1) << 1);
1748 f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;
1752 void QRegExpEngine::finishAtom(int atom, bool needCapture)
1754 if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture)
1755 f[atom].capture = QRegExpAtom::UnofficialCapture;
1756 cf = f.at(atom).parent;
1760 #ifndef QT_NO_REGEXP_LOOKAHEAD
1762 Creates a lookahead anchor.
1764 int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative)
1766 int n = ahead.size();
1767 if (n == MaxLookaheads) {
1771 ahead += new QRegExpLookahead(eng, negative);
1772 return Anchor_FirstLookahead << n;
1776 #ifndef QT_NO_REGEXP_CAPTURE
1778 We want the longest leftmost captures.
1780 static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
1783 for (int i = 0; i < ncap; i++) {
1784 int delta = begin2[i] - begin1[i]; // it has to start early...
1786 delta = end1[i] - end2[i]; // ...and end late
1796 Returns true if anchor a matches at position pos + i in the input
1797 string, otherwise false.
1799 bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)
1803 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1804 if ((a & QRegExpEngine::Anchor_Alternation) != 0)
1805 return testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)
1806 || testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);
1809 if ((a & QRegExpEngine::Anchor_Caret) != 0) {
1810 if (pos + i != caretPos)
1813 if ((a & QRegExpEngine::Anchor_Dollar) != 0) {
1817 #ifndef QT_NO_REGEXP_ESCAPE
1818 if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) {
1819 bool before = false;
1822 before = isWord(in[pos + i - 1]);
1824 after = isWord(in[pos + i]);
1825 if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after))
1827 if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after))
1831 #ifndef QT_NO_REGEXP_LOOKAHEAD
1832 if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) {
1833 const QVector<QRegExpLookahead *> &ahead = eng->ahead;
1834 for (j = 0; j < ahead.size(); j++) {
1835 if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) {
1836 QRegExpMatchState matchState;
1837 matchState.prepareForMatch(ahead[j]->eng);
1838 matchState.match(in + pos + i, len - pos - i, 0,
1839 true, true, caretPos - pos - i);
1840 if ((matchState.captured[0] == 0) == ahead[j]->neg)
1846 #ifndef QT_NO_REGEXP_CAPTURE
1847 #ifndef QT_NO_REGEXP_BACKREF
1848 for (j = 0; j < eng->nbrefs; j++) {
1849 if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) {
1850 int i = eng->captureForOfficialCapture.at(j);
1851 if (capBegin[i] != EmptyCapture)
1860 #ifndef QT_NO_REGEXP_OPTIM
1862 The three following functions are what Jeffrey Friedl would call
1863 transmissions (or bump-alongs). Using one or the other should make
1864 no difference except in performance.
1867 bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
1869 int k = matchState.pos + goodEarlyStart;
1870 QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs);
1871 while ((k = matcher.indexIn(matchState.in, matchState.len, k)) != -1) {
1872 int from = k - goodLateStart;
1873 int to = k - goodEarlyStart;
1874 if (from > matchState.pos)
1875 matchState.pos = from;
1877 while (matchState.pos <= to) {
1878 if (matchState.matchHere())
1887 bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
1892 int lastPos = matchState.len - minl;
1893 memset(matchState.slideTab, 0, matchState.slideTabSize * sizeof(int));
1896 Set up the slide table, used for the bad-character heuristic,
1897 using the table of first occurrence of each character.
1899 for (i = 0; i < minl; i++) {
1900 int sk = occ1[BadChar(matchState.in[matchState.pos + i])];
1901 if (sk == NoOccurrence)
1909 if (sk > matchState.slideTab[k])
1910 matchState.slideTab[k] = sk;
1914 if (matchState.pos > lastPos)
1918 if (++slideNext >= matchState.slideTabSize)
1920 if (matchState.slideTab[slideHead] > 0) {
1921 if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext])
1922 matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1;
1923 matchState.slideTab[slideHead] = 0;
1925 if (matchState.matchHere())
1929 if (matchState.pos == lastPos)
1933 Update the slide table. This code has much in common with
1934 the initialization code.
1936 int sk = occ1[BadChar(matchState.in[matchState.pos + minl])];
1937 if (sk == NoOccurrence) {
1938 matchState.slideTab[slideNext] = minl;
1939 } else if (sk > 0) {
1940 int k = slideNext + minl - sk;
1941 if (k >= matchState.slideTabSize)
1942 k -= matchState.slideTabSize;
1943 if (sk > matchState.slideTab[k])
1944 matchState.slideTab[k] = sk;
1946 slideHead = slideNext;
1952 bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
1954 while (matchState.pos <= matchState.len) {
1955 if (matchState.matchHere())
1964 Here's the core of the engine. It tries to do a match here and now.
1966 bool QRegExpMatchState::matchHere()
1968 int ncur = 1, nnext = 0;
1973 oneTestMatchedLen = -1;
1974 curStack[0] = QRegExpEngine::InitialState;
1976 int ncap = eng->ncap;
1977 #ifndef QT_NO_REGEXP_CAPTURE
1979 for (j = 0; j < ncap; j++) {
1980 curCapBegin[j] = EmptyCapture;
1981 curCapEnd[j] = EmptyCapture;
1986 #ifndef QT_NO_REGEXP_BACKREF
1987 while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop)
1989 while (ncur > 0 && i <= len - pos && !stop)
1992 int ch = (i < len - pos) ? in[pos + i].unicode() : 0;
1993 for (j = 0; j < ncur; j++) {
1994 int cur = curStack[j];
1995 const QRegExpAutomatonState &scur = eng->s.at(cur);
1996 const QVector<int> &outs = scur.outs;
1997 for (k = 0; k < outs.size(); k++) {
1998 int next = outs.at(k);
1999 const QRegExpAutomatonState &snext = eng->s.at(next);
2001 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2002 int needSomeSleep = 0;
2006 First, check if the anchors are anchored properly.
2008 int a = scur.anchors.value(next);
2009 if (a != 0 && !testAnchor(i, a, curCapBegin + j * ncap))
2013 If indeed they are, check if the input character is
2014 correct for this transition.
2018 if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) {
2022 inside = (QChar(m).toLower() == QChar(ch).toLower());
2023 } else if (next == QRegExpEngine::FinalState) {
2027 } else if ((m & QRegExpEngine::CharClassBit) != 0) {
2028 #ifndef QT_NO_REGEXP_CCLASS
2029 const QRegExpCharClass &cc = eng->cl.at(m ^ QRegExpEngine::CharClassBit);
2032 else if (cc.negative())
2033 inside = cc.in(QChar(ch).toLower()) &&
2034 cc.in(QChar(ch).toUpper());
2036 inside = cc.in(QChar(ch).toLower()) ||
2037 cc.in(QChar(ch).toUpper());
2039 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2040 } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */
2041 int bref = m ^ QRegExpEngine::BackRefBit;
2042 int ell = j * ncap + eng->captureForOfficialCapture.at(bref - 1);
2044 inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
2047 inside = (in[pos + curCapBegin[ell]] == QChar(ch));
2049 inside = (in[pos + curCapBegin[ell]].toLower()
2050 == QChar(ch).toLower());
2055 if (curCapEnd[ell] == EmptyCapture)
2056 delta = i - curCapBegin[ell];
2058 delta = curCapEnd[ell] - curCapBegin[ell];
2060 inside = (delta <= len - (pos + i));
2061 if (inside && delta > 1) {
2065 if (in[pos + curCapBegin[ell] + n]
2072 QChar a = in[pos + curCapBegin[ell] + n];
2073 QChar b = in[pos + i + n];
2074 if (a.toLower() != b.toLower())
2079 inside = (n == delta);
2081 needSomeSleep = delta - 1;
2089 We must now update our data structures.
2092 #ifndef QT_NO_REGEXP_CAPTURE
2093 int *capBegin, *capEnd;
2096 If the next state was not encountered yet, all
2099 if ((m = inNextStack[next]) == -1) {
2101 nextStack[m] = next;
2102 inNextStack[next] = m;
2103 #ifndef QT_NO_REGEXP_CAPTURE
2104 capBegin = nextCapBegin + m * ncap;
2105 capEnd = nextCapEnd + m * ncap;
2108 Otherwise, we'll first maintain captures in
2109 temporary arrays, and decide at the end whether
2110 it's best to keep the previous capture zones or
2114 capBegin = tempCapBegin;
2115 capEnd = tempCapEnd;
2119 #ifndef QT_NO_REGEXP_CAPTURE
2121 Updating the capture zones is much of a task.
2124 memcpy(capBegin, curCapBegin + j * ncap, ncap * sizeof(int));
2125 memcpy(capEnd, curCapEnd + j * ncap, ncap * sizeof(int));
2126 int c = scur.atom, n = snext.atom;
2131 Lemma 1. For any x in the range [0..nf), we
2132 have f[x].parent < x.
2134 Proof. By looking at startAtom(), it is
2135 clear that cf < nf holds all the time, and
2136 thus that f[nf].parent < nf.
2140 If we are reentering an atom, we empty all
2141 capture zones inside it.
2143 if ((q = scur.reenter.value(next)) != 0) {
2144 QBitArray b(eng->nf, false);
2146 for (int ell = q + 1; ell < eng->nf; ell++) {
2147 if (b.testBit(eng->f.at(ell).parent)) {
2148 b.setBit(ell, true);
2149 cap = eng->f.at(ell).capture;
2151 capBegin[cap] = EmptyCapture;
2152 capEnd[cap] = EmptyCapture;
2156 p = eng->f.at(q).parent;
2159 Otherwise, close the capture zones we are
2160 leaving. We are leaving f[c].capture,
2161 f[f[c].parent].capture,
2162 f[f[f[c].parent].parent].capture, ...,
2163 until f[x].capture, with x such that
2164 f[x].parent is the youngest common ancestor
2167 We go up along c's and n's ancestry until
2175 cap = eng->f.at(p).capture;
2177 if (capBegin[cap] == i) {
2178 capBegin[cap] = EmptyCapture;
2179 capEnd[cap] = EmptyCapture;
2184 p = eng->f.at(p).parent;
2186 q = eng->f.at(q).parent;
2192 In any case, we now open the capture zones
2193 we are entering. We work upwards from n
2194 until we reach p (the parent of the atom we
2195 reenter or the youngest common ancestor).
2198 cap = eng->f.at(n).capture;
2201 capEnd[cap] = EmptyCapture;
2203 n = eng->f.at(n).parent;
2206 If the next state was already in
2207 nextStack, we must choose carefully which
2208 capture zones we want to keep.
2210 if (capBegin == tempCapBegin &&
2211 isBetterCapture(ncap, capBegin, capEnd, nextCapBegin + m * ncap,
2212 nextCapEnd + m * ncap)) {
2213 memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2214 memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2217 #ifndef QT_NO_REGEXP_BACKREF
2219 We are done with updating the capture zones.
2220 It's now time to put the next state to sleep,
2221 if it needs to, and to remove it from
2224 if (needSomeSleep > 0) {
2225 QVector<int> zzZ(2 + 2 * ncap);
2226 zzZ[0] = i + needSomeSleep;
2229 memcpy(zzZ.data() + 2, capBegin, ncap * sizeof(int));
2230 memcpy(zzZ.data() + 2 + ncap, capEnd, ncap * sizeof(int));
2232 inNextStack[nextStack[--nnext]] = -1;
2233 sleeping.append(zzZ);
2240 #ifndef QT_NO_REGEXP_CAPTURE
2242 If we reached the final state, hurray! Copy the captured
2245 if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {
2246 memcpy(capBegin, nextCapBegin + m * ncap, ncap * sizeof(int));
2247 memcpy(capEnd, nextCapEnd + m * ncap, ncap * sizeof(int));
2249 #ifndef QT_NO_REGEXP_BACKREF
2251 It's time to wake up the sleepers.
2254 while (j < sleeping.count()) {
2255 if (sleeping.at(j)[0] == i) {
2256 const QVector<int> &zzZ = sleeping.at(j);
2258 const int *capBegin = zzZ.data() + 2;
2259 const int *capEnd = zzZ.data() + 2 + ncap;
2260 bool copyOver = true;
2262 if ((m = inNextStack[next]) == -1) {
2264 nextStack[m] = next;
2265 inNextStack[next] = m;
2267 copyOver = isBetterCapture(ncap, nextCapBegin + m * ncap, nextCapEnd + m * ncap,
2271 memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2272 memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2275 sleeping.removeAt(j);
2282 for (j = 0; j < nnext; j++)
2283 inNextStack[nextStack[j]] = -1;
2285 // avoid needless iteration that confuses oneTestMatchedLen
2286 if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState
2287 #ifndef QT_NO_REGEXP_BACKREF
2288 && sleeping.isEmpty()
2293 qSwap(curStack, nextStack);
2294 #ifndef QT_NO_REGEXP_CAPTURE
2295 qSwap(curCapBegin, nextCapBegin);
2296 qSwap(curCapEnd, nextCapEnd);
2303 #ifndef QT_NO_REGEXP_BACKREF
2305 If minimal matching is enabled, we might have some sleepers
2308 if (!sleeping.isEmpty())
2312 oneTestMatchedLen = i - 1;
2313 return (matchLen >= 0);
2316 #ifndef QT_NO_REGEXP_CCLASS
2318 QRegExpCharClass::QRegExpCharClass()
2321 #ifndef QT_NO_REGEXP_OPTIM
2322 occ1.fill(NoOccurrence, NumBadChars);
2326 QRegExpCharClass &QRegExpCharClass::operator=(const QRegExpCharClass &cc)
2331 #ifndef QT_NO_REGEXP_OPTIM
2337 void QRegExpCharClass::clear()
2344 void QRegExpCharClass::setNegative(bool negative)
2347 #ifndef QT_NO_REGEXP_OPTIM
2348 occ1.fill(0, NumBadChars);
2352 void QRegExpCharClass::addCategories(uint cats)
2354 static const int all_cats = FLAG(QChar::Mark_NonSpacing) |
2355 FLAG(QChar::Mark_SpacingCombining) |
2356 FLAG(QChar::Mark_Enclosing) |
2357 FLAG(QChar::Number_DecimalDigit) |
2358 FLAG(QChar::Number_Letter) |
2359 FLAG(QChar::Number_Other) |
2360 FLAG(QChar::Separator_Space) |
2361 FLAG(QChar::Separator_Line) |
2362 FLAG(QChar::Separator_Paragraph) |
2363 FLAG(QChar::Other_Control) |
2364 FLAG(QChar::Other_Format) |
2365 FLAG(QChar::Other_Surrogate) |
2366 FLAG(QChar::Other_PrivateUse) |
2367 FLAG(QChar::Other_NotAssigned) |
2368 FLAG(QChar::Letter_Uppercase) |
2369 FLAG(QChar::Letter_Lowercase) |
2370 FLAG(QChar::Letter_Titlecase) |
2371 FLAG(QChar::Letter_Modifier) |
2372 FLAG(QChar::Letter_Other) |
2373 FLAG(QChar::Punctuation_Connector) |
2374 FLAG(QChar::Punctuation_Dash) |
2375 FLAG(QChar::Punctuation_Open) |
2376 FLAG(QChar::Punctuation_Close) |
2377 FLAG(QChar::Punctuation_InitialQuote) |
2378 FLAG(QChar::Punctuation_FinalQuote) |
2379 FLAG(QChar::Punctuation_Other) |
2380 FLAG(QChar::Symbol_Math) |
2381 FLAG(QChar::Symbol_Currency) |
2382 FLAG(QChar::Symbol_Modifier) |
2383 FLAG(QChar::Symbol_Other);
2384 c |= (all_cats & cats);
2385 #ifndef QT_NO_REGEXP_OPTIM
2386 occ1.fill(0, NumBadChars);
2390 void QRegExpCharClass::addRange(ushort from, ushort to)
2397 r[m].len = to - from + 1;
2399 #ifndef QT_NO_REGEXP_OPTIM
2402 if (to - from < NumBadChars) {
2403 if (from % NumBadChars <= to % NumBadChars) {
2404 for (i = from % NumBadChars; i <= to % NumBadChars; i++)
2407 for (i = 0; i <= to % NumBadChars; i++)
2409 for (i = from % NumBadChars; i < NumBadChars; i++)
2413 occ1.fill(0, NumBadChars);
2418 bool QRegExpCharClass::in(QChar ch) const
2420 #ifndef QT_NO_REGEXP_OPTIM
2421 if (occ1.at(BadChar(ch)) == NoOccurrence)
2425 if (c != 0 && (c & FLAG(ch.category())) != 0)
2428 const int uc = ch.unicode();
2429 int size = r.size();
2431 for (int i = 0; i < size; ++i) {
2432 const QRegExpCharClassRange &range = r.at(i);
2433 if (uint(uc - range.from) < uint(r.at(i).len))
2439 #if defined(QT_DEBUG)
2440 void QRegExpCharClass::dump() const
2443 qDebug(" %stive character class", n ? "nega" : "posi");
2444 #ifndef QT_NO_REGEXP_CCLASS
2446 qDebug(" categories 0x%.8x", c);
2448 for (i = 0; i < r.size(); i++)
2449 qDebug(" 0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);
2454 QRegExpEngine::Box::Box(QRegExpEngine *engine)
2455 : eng(engine), skipanchors(0)
2456 #ifndef QT_NO_REGEXP_OPTIM
2457 , earlyStart(0), lateStart(0), maxl(0)
2460 #ifndef QT_NO_REGEXP_OPTIM
2461 occ1.fill(NoOccurrence, NumBadChars);
2466 QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
2471 lanchors = b.lanchors;
2472 ranchors = b.ranchors;
2473 skipanchors = b.skipanchors;
2474 #ifndef QT_NO_REGEXP_OPTIM
2475 earlyStart = b.earlyStart;
2476 lateStart = b.lateStart;
2478 leftStr = b.leftStr;
2479 rightStr = b.rightStr;
2487 void QRegExpEngine::Box::set(QChar ch)
2490 ls[0] = eng->createState(ch);
2492 #ifndef QT_NO_REGEXP_OPTIM
2497 occ1[BadChar(ch)] = 0;
2502 void QRegExpEngine::Box::set(const QRegExpCharClass &cc)
2505 ls[0] = eng->createState(cc);
2507 #ifndef QT_NO_REGEXP_OPTIM
2509 occ1 = cc.firstOccurrence();
2514 #ifndef QT_NO_REGEXP_BACKREF
2515 void QRegExpEngine::Box::set(int bref)
2518 ls[0] = eng->createState(bref);
2520 if (bref >= 1 && bref <= MaxBackRefs)
2521 skipanchors = Anchor_BackRef0Empty << bref;
2522 #ifndef QT_NO_REGEXP_OPTIM
2529 void QRegExpEngine::Box::cat(const Box &b)
2531 eng->addCatTransitions(rs, b.ls);
2532 addAnchorsToEngine(b);
2534 lanchors.unite(b.lanchors);
2535 if (skipanchors != 0) {
2536 for (int i = 0; i < b.ls.size(); i++) {
2537 int a = eng->anchorConcatenation(lanchors.value(b.ls.at(i), 0), skipanchors);
2538 lanchors.insert(b.ls.at(i), a);
2541 mergeInto(&ls, b.ls);
2544 ranchors.unite(b.ranchors);
2545 if (b.skipanchors != 0) {
2546 for (int i = 0; i < rs.size(); i++) {
2547 int a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), b.skipanchors);
2548 ranchors.insert(rs.at(i), a);
2551 mergeInto(&rs, b.rs);
2553 ranchors = b.ranchors;
2557 #ifndef QT_NO_REGEXP_OPTIM
2558 if (maxl != InftyLen) {
2559 if (rightStr.length() + b.leftStr.length() >
2560 qMax(str.length(), b.str.length())) {
2561 earlyStart = minl - rightStr.length();
2562 lateStart = maxl - rightStr.length();
2563 str = rightStr + b.leftStr;
2564 } else if (b.str.length() > str.length()) {
2565 earlyStart = minl + b.earlyStart;
2566 lateStart = maxl + b.lateStart;
2571 if (leftStr.length() == maxl)
2572 leftStr += b.leftStr;
2574 if (b.rightStr.length() == b.maxl) {
2575 rightStr += b.rightStr;
2577 rightStr = b.rightStr;
2580 if (maxl == InftyLen || b.maxl == InftyLen) {
2586 for (int i = 0; i < NumBadChars; i++) {
2587 if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))
2588 occ1[i] = minl + b.occ1.at(i);
2594 skipanchors = eng->anchorConcatenation(skipanchors, b.skipanchors);
2599 void QRegExpEngine::Box::orx(const Box &b)
2601 mergeInto(&ls, b.ls);
2602 lanchors.unite(b.lanchors);
2603 mergeInto(&rs, b.rs);
2604 ranchors.unite(b.ranchors);
2608 skipanchors = eng->anchorAlternation(skipanchors, b.skipanchors);
2610 skipanchors = b.skipanchors;
2613 #ifndef QT_NO_REGEXP_OPTIM
2614 for (int i = 0; i < NumBadChars; i++) {
2615 if (occ1.at(i) > b.occ1.at(i))
2616 occ1[i] = b.occ1.at(i);
2621 leftStr = QString();
2622 rightStr = QString();
2630 void QRegExpEngine::Box::plus(int atom)
2632 #ifndef QT_NO_REGEXP_CAPTURE
2633 eng->addPlusTransitions(rs, ls, atom);
2636 eng->addCatTransitions(rs, ls);
2638 addAnchorsToEngine(*this);
2639 #ifndef QT_NO_REGEXP_OPTIM
2644 void QRegExpEngine::Box::opt()
2646 #ifndef QT_NO_REGEXP_OPTIM
2650 leftStr = QString();
2651 rightStr = QString();
2657 void QRegExpEngine::Box::catAnchor(int a)
2660 for (int i = 0; i < rs.size(); i++) {
2661 a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), a);
2662 ranchors.insert(rs.at(i), a);
2665 skipanchors = eng->anchorConcatenation(skipanchors, a);
2669 #ifndef QT_NO_REGEXP_OPTIM
2670 void QRegExpEngine::Box::setupHeuristics()
2672 eng->goodEarlyStart = earlyStart;
2673 eng->goodLateStart = lateStart;
2674 eng->goodStr = eng->cs ? str : str.toLower();
2679 A regular expression such as 112|1 has occ1['2'] = 2 and minl =
2680 1 at this point. An entry of occ1 has to be at most minl or
2681 infinity for the rest of the algorithm to go well.
2683 We waited until here before normalizing these cases (instead of
2684 doing it in Box::orx()) because sometimes things improve by
2685 themselves. Consider for example (112|1)34.
2687 for (int i = 0; i < NumBadChars; i++) {
2688 if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)
2693 eng->occ1.fill(0, NumBadChars);
2696 eng->heuristicallyChooseHeuristic();
2700 #if defined(QT_DEBUG)
2701 void QRegExpEngine::Box::dump() const
2704 qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s");
2705 qDebug(" Left states:");
2706 for (i = 0; i < ls.size(); i++) {
2707 if (lanchors.value(ls[i], 0) == 0)
2708 qDebug(" %d", ls[i]);
2710 qDebug(" %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);
2712 qDebug(" Right states:");
2713 for (i = 0; i < rs.size(); i++) {
2714 if (ranchors.value(rs[i], 0) == 0)
2715 qDebug(" %d", rs[i]);
2717 qDebug(" %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);
2719 qDebug(" Skip anchors: 0x%.8x", skipanchors);
2723 void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const
2725 for (int i = 0; i < to.ls.size(); i++) {
2726 for (int j = 0; j < rs.size(); j++) {
2727 int a = eng->anchorConcatenation(ranchors.value(rs.at(j), 0),
2728 to.lanchors.value(to.ls.at(i), 0));
2729 eng->addAnchors(rs[j], to.ls[i], a);
2734 #ifndef QT_NO_REGEXP_CCLASS
2735 // fast lookup hash for xml schema extensions
2736 // sorted by name for b-search
2737 static const struct CategoriesRangeMapEntry {
2738 const char name[40];
2740 } categoriesRangeMap[] = {
2741 { "AegeanNumbers", 0x10100, 0x1013F },
2742 { "AlphabeticPresentationForms", 0xFB00, 0xFB4F },
2743 { "AncientGreekMusicalNotation", 0x1D200, 0x1D24F },
2744 { "AncientGreekNumbers", 0x10140, 0x1018F },
2745 { "Arabic", 0x0600, 0x06FF },
2746 { "ArabicPresentationForms-A", 0xFB50, 0xFDFF },
2747 { "ArabicPresentationForms-B", 0xFE70, 0xFEFF },
2748 { "ArabicSupplement", 0x0750, 0x077F },
2749 { "Armenian", 0x0530, 0x058F },
2750 { "Arrows", 0x2190, 0x21FF },
2751 { "BasicLatin", 0x0000, 0x007F },
2752 { "Bengali", 0x0980, 0x09FF },
2753 { "BlockElements", 0x2580, 0x259F },
2754 { "Bopomofo", 0x3100, 0x312F },
2755 { "BopomofoExtended", 0x31A0, 0x31BF },
2756 { "BoxDrawing", 0x2500, 0x257F },
2757 { "BraillePatterns", 0x2800, 0x28FF },
2758 { "Buginese", 0x1A00, 0x1A1F },
2759 { "Buhid", 0x1740, 0x175F },
2760 { "ByzantineMusicalSymbols", 0x1D000, 0x1D0FF },
2761 { "CJKCompatibility", 0x3300, 0x33FF },
2762 { "CJKCompatibilityForms", 0xFE30, 0xFE4F },
2763 { "CJKCompatibilityIdeographs", 0xF900, 0xFAFF },
2764 { "CJKCompatibilityIdeographsSupplement", 0x2F800, 0x2FA1F },
2765 { "CJKRadicalsSupplement", 0x2E80, 0x2EFF },
2766 { "CJKStrokes", 0x31C0, 0x31EF },
2767 { "CJKSymbolsandPunctuation", 0x3000, 0x303F },
2768 { "CJKUnifiedIdeographs", 0x4E00, 0x9FFF },
2769 { "CJKUnifiedIdeographsExtensionA", 0x3400, 0x4DB5 },
2770 { "CJKUnifiedIdeographsExtensionB", 0x20000, 0x2A6DF },
2771 { "Cherokee", 0x13A0, 0x13FF },
2772 { "CombiningDiacriticalMarks", 0x0300, 0x036F },
2773 { "CombiningDiacriticalMarksSupplement", 0x1DC0, 0x1DFF },
2774 { "CombiningHalfMarks", 0xFE20, 0xFE2F },
2775 { "CombiningMarksforSymbols", 0x20D0, 0x20FF },
2776 { "ControlPictures", 0x2400, 0x243F },
2777 { "Coptic", 0x2C80, 0x2CFF },
2778 { "CurrencySymbols", 0x20A0, 0x20CF },
2779 { "CypriotSyllabary", 0x10800, 0x1083F },
2780 { "Cyrillic", 0x0400, 0x04FF },
2781 { "CyrillicSupplement", 0x0500, 0x052F },
2782 { "Deseret", 0x10400, 0x1044F },
2783 { "Devanagari", 0x0900, 0x097F },
2784 { "Dingbats", 0x2700, 0x27BF },
2785 { "EnclosedAlphanumerics", 0x2460, 0x24FF },
2786 { "EnclosedCJKLettersandMonths", 0x3200, 0x32FF },
2787 { "Ethiopic", 0x1200, 0x137F },
2788 { "EthiopicExtended", 0x2D80, 0x2DDF },
2789 { "EthiopicSupplement", 0x1380, 0x139F },
2790 { "GeneralPunctuation", 0x2000, 0x206F },
2791 { "GeometricShapes", 0x25A0, 0x25FF },
2792 { "Georgian", 0x10A0, 0x10FF },
2793 { "GeorgianSupplement", 0x2D00, 0x2D2F },
2794 { "Glagolitic", 0x2C00, 0x2C5F },
2795 { "Gothic", 0x10330, 0x1034F },
2796 { "Greek", 0x0370, 0x03FF },
2797 { "GreekExtended", 0x1F00, 0x1FFF },
2798 { "Gujarati", 0x0A80, 0x0AFF },
2799 { "Gurmukhi", 0x0A00, 0x0A7F },
2800 { "HalfwidthandFullwidthForms", 0xFF00, 0xFFEF },
2801 { "HangulCompatibilityJamo", 0x3130, 0x318F },
2802 { "HangulJamo", 0x1100, 0x11FF },
2803 { "HangulSyllables", 0xAC00, 0xD7A3 },
2804 { "Hanunoo", 0x1720, 0x173F },
2805 { "Hebrew", 0x0590, 0x05FF },
2806 { "Hiragana", 0x3040, 0x309F },
2807 { "IPAExtensions", 0x0250, 0x02AF },
2808 { "IdeographicDescriptionCharacters", 0x2FF0, 0x2FFF },
2809 { "Kanbun", 0x3190, 0x319F },
2810 { "KangxiRadicals", 0x2F00, 0x2FDF },
2811 { "Kannada", 0x0C80, 0x0CFF },
2812 { "Katakana", 0x30A0, 0x30FF },
2813 { "KatakanaPhoneticExtensions", 0x31F0, 0x31FF },
2814 { "Kharoshthi", 0x10A00, 0x10A5F },
2815 { "Khmer", 0x1780, 0x17FF },
2816 { "KhmerSymbols", 0x19E0, 0x19FF },
2817 { "Lao", 0x0E80, 0x0EFF },
2818 { "Latin-1Supplement", 0x0080, 0x00FF },
2819 { "LatinExtended-A", 0x0100, 0x017F },
2820 { "LatinExtended-B", 0x0180, 0x024F },
2821 { "LatinExtendedAdditional", 0x1E00, 0x1EFF },
2822 { "LetterlikeSymbols", 0x2100, 0x214F },
2823 { "Limbu", 0x1900, 0x194F },
2824 { "LinearBIdeograms", 0x10080, 0x100FF },
2825 { "LinearBSyllabary", 0x10000, 0x1007F },
2826 { "Malayalam", 0x0D00, 0x0D7F },
2827 { "MathematicalAlphanumericSymbols", 0x1D400, 0x1D7FF },
2828 { "MathematicalOperators", 0x2200, 0x22FF },
2829 { "MiscellaneousMathematicalSymbols-A", 0x27C0, 0x27EF },
2830 { "MiscellaneousMathematicalSymbols-B", 0x2980, 0x29FF },
2831 { "MiscellaneousSymbols", 0x2600, 0x26FF },
2832 { "MiscellaneousSymbolsandArrows", 0x2B00, 0x2BFF },
2833 { "MiscellaneousTechnical", 0x2300, 0x23FF },
2834 { "ModifierToneLetters", 0xA700, 0xA71F },
2835 { "Mongolian", 0x1800, 0x18AF },
2836 { "MusicalSymbols", 0x1D100, 0x1D1FF },
2837 { "Myanmar", 0x1000, 0x109F },
2838 { "NewTaiLue", 0x1980, 0x19DF },
2839 { "NumberForms", 0x2150, 0x218F },
2840 { "Ogham", 0x1680, 0x169F },
2841 { "OldItalic", 0x10300, 0x1032F },
2842 { "OldPersian", 0x103A0, 0x103DF },
2843 { "OpticalCharacterRecognition", 0x2440, 0x245F },
2844 { "Oriya", 0x0B00, 0x0B7F },
2845 { "Osmanya", 0x10480, 0x104AF },
2846 { "PhoneticExtensions", 0x1D00, 0x1D7F },
2847 { "PhoneticExtensionsSupplement", 0x1D80, 0x1DBF },
2848 { "PrivateUse", 0xE000, 0xF8FF },
2849 { "Runic", 0x16A0, 0x16FF },
2850 { "Shavian", 0x10450, 0x1047F },
2851 { "Sinhala", 0x0D80, 0x0DFF },
2852 { "SmallFormVariants", 0xFE50, 0xFE6F },
2853 { "SpacingModifierLetters", 0x02B0, 0x02FF },
2854 { "Specials", 0xFFF0, 0xFFFF },
2855 { "SuperscriptsandSubscripts", 0x2070, 0x209F },
2856 { "SupplementalArrows-A", 0x27F0, 0x27FF },
2857 { "SupplementalArrows-B", 0x2900, 0x297F },
2858 { "SupplementalMathematicalOperators", 0x2A00, 0x2AFF },
2859 { "SupplementalPunctuation", 0x2E00, 0x2E7F },
2860 { "SupplementaryPrivateUseArea-A", 0xF0000, 0xFFFFF },
2861 { "SupplementaryPrivateUseArea-B", 0x100000, 0x10FFFF },
2862 { "SylotiNagri", 0xA800, 0xA82F },
2863 { "Syriac", 0x0700, 0x074F },
2864 { "Tagalog", 0x1700, 0x171F },
2865 { "Tagbanwa", 0x1760, 0x177F },
2866 { "Tags", 0xE0000, 0xE007F },
2867 { "TaiLe", 0x1950, 0x197F },
2868 { "TaiXuanJingSymbols", 0x1D300, 0x1D35F },
2869 { "Tamil", 0x0B80, 0x0BFF },
2870 { "Telugu", 0x0C00, 0x0C7F },
2871 { "Thaana", 0x0780, 0x07BF },
2872 { "Thai", 0x0E00, 0x0E7F },
2873 { "Tibetan", 0x0F00, 0x0FFF },
2874 { "Tifinagh", 0x2D30, 0x2D7F },
2875 { "Ugaritic", 0x10380, 0x1039F },
2876 { "UnifiedCanadianAboriginalSyllabics", 0x1400, 0x167F },
2877 { "VariationSelectors", 0xFE00, 0xFE0F },
2878 { "VariationSelectorsSupplement", 0xE0100, 0xE01EF },
2879 { "VerticalForms", 0xFE10, 0xFE1F },
2880 { "YiRadicals", 0xA490, 0xA4CF },
2881 { "YiSyllables", 0xA000, 0xA48F },
2882 { "YijingHexagramSymbols", 0x4DC0, 0x4DFF }
2885 inline bool operator<(const char *name, const CategoriesRangeMapEntry &entry)
2886 { return qstrcmp(name, entry.name) < 0; }
2887 inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name)
2888 { return qstrcmp(entry.name, name) < 0; }
2889 #endif // QT_NO_REGEXP_CCLASS
2891 int QRegExpEngine::getChar()
2893 return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();
2896 int QRegExpEngine::getEscape()
2898 #ifndef QT_NO_REGEXP_ESCAPE
2899 const char tab[] = "afnrtv"; // no b, as \b means word boundary
2900 const char backTab[] = "\a\f\n\r\t\v";
2907 if (prevCh == EOS) {
2909 return Tok_Char | '\\';
2912 #ifndef QT_NO_REGEXP_ESCAPE
2913 if ((prevCh & ~0xff) == 0) {
2914 const char *p = strchr(tab, prevCh);
2916 return Tok_Char | backTab[p - tab];
2921 #ifndef QT_NO_REGEXP_ESCAPE
2924 for (i = 0; i < 3; i++) {
2925 if (yyCh >= '0' && yyCh <= '7')
2926 val = (val << 3) | (yyCh - '0');
2931 if ((val & ~0377) != 0)
2933 return Tok_Char | val;
2935 #ifndef QT_NO_REGEXP_ESCAPE
2939 #ifndef QT_NO_REGEXP_CCLASS
2941 // see QChar::isDigit()
2942 yyCharClass->addCategories(uint(-1) ^ FLAG(QChar::Number_DecimalDigit));
2943 return Tok_CharClass;
2945 // see QChar::isSpace()
2946 yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Separator_Space) |
2947 FLAG(QChar::Separator_Line) |
2948 FLAG(QChar::Separator_Paragraph) |
2949 FLAG(QChar::Other_Control)));
2950 yyCharClass->addRange(0x0000, 0x0008);
2951 yyCharClass->addRange(0x000e, 0x001f);
2952 yyCharClass->addRange(0x007f, 0x0084);
2953 yyCharClass->addRange(0x0086, 0x009f);
2954 return Tok_CharClass;
2956 // see QChar::isLetterOrNumber() and QChar::isMark()
2957 yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Mark_NonSpacing) |
2958 FLAG(QChar::Mark_SpacingCombining) |
2959 FLAG(QChar::Mark_Enclosing) |
2960 FLAG(QChar::Number_DecimalDigit) |
2961 FLAG(QChar::Number_Letter) |
2962 FLAG(QChar::Number_Other) |
2963 FLAG(QChar::Letter_Uppercase) |
2964 FLAG(QChar::Letter_Lowercase) |
2965 FLAG(QChar::Letter_Titlecase) |
2966 FLAG(QChar::Letter_Modifier) |
2967 FLAG(QChar::Letter_Other) |
2968 FLAG(QChar::Punctuation_Connector)));
2969 yyCharClass->addRange(0x203f, 0x2040);
2970 yyCharClass->addSingleton(0x2040);
2971 yyCharClass->addSingleton(0x2054);
2972 yyCharClass->addSingleton(0x30fb);
2973 yyCharClass->addRange(0xfe33, 0xfe34);
2974 yyCharClass->addRange(0xfe4d, 0xfe4f);
2975 yyCharClass->addSingleton(0xff3f);
2976 yyCharClass->addSingleton(0xff65);
2977 return Tok_CharClass;
2979 #ifndef QT_NO_REGEXP_ESCAPE
2983 #ifndef QT_NO_REGEXP_CCLASS
2985 // see QChar::isDigit()
2986 yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit));
2987 return Tok_CharClass;
2989 // see QChar::isSpace()
2990 yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
2991 FLAG(QChar::Separator_Line) |
2992 FLAG(QChar::Separator_Paragraph));
2993 yyCharClass->addRange(0x0009, 0x000d);
2994 yyCharClass->addSingleton(0x0085);
2995 return Tok_CharClass;
2997 // see QChar::isLetterOrNumber() and QChar::isMark()
2998 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
2999 FLAG(QChar::Mark_SpacingCombining) |
3000 FLAG(QChar::Mark_Enclosing) |
3001 FLAG(QChar::Number_DecimalDigit) |
3002 FLAG(QChar::Number_Letter) |
3003 FLAG(QChar::Number_Other) |
3004 FLAG(QChar::Letter_Uppercase) |
3005 FLAG(QChar::Letter_Lowercase) |
3006 FLAG(QChar::Letter_Titlecase) |
3007 FLAG(QChar::Letter_Modifier) |
3008 FLAG(QChar::Letter_Other));
3009 yyCharClass->addSingleton(0x005f); // '_'
3010 return Tok_CharClass;
3012 if (xmlSchemaExtensions) {
3013 yyCharClass->setNegative(!yyCharClass->negative());
3019 if (xmlSchemaExtensions) {
3020 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3021 FLAG(QChar::Mark_SpacingCombining) |
3022 FLAG(QChar::Mark_Enclosing) |
3023 FLAG(QChar::Number_DecimalDigit) |
3024 FLAG(QChar::Number_Letter) |
3025 FLAG(QChar::Number_Other) |
3026 FLAG(QChar::Letter_Uppercase) |
3027 FLAG(QChar::Letter_Lowercase) |
3028 FLAG(QChar::Letter_Titlecase) |
3029 FLAG(QChar::Letter_Modifier) |
3030 FLAG(QChar::Letter_Other));
3031 yyCharClass->addSingleton(0x003a); // ':'
3032 yyCharClass->addSingleton(0x005f); // '_'
3033 yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
3034 yyCharClass->addRange(0x0061, 0x007a); // [a-z]
3035 yyCharClass->addRange(0xc0, 0xd6);
3036 yyCharClass->addRange(0xd8, 0xf6);
3037 yyCharClass->addRange(0xf8, 0x2ff);
3038 yyCharClass->addRange(0x370, 0x37d);
3039 yyCharClass->addRange(0x37f, 0x1fff);
3040 yyCharClass->addRange(0x200c, 0x200d);
3041 yyCharClass->addRange(0x2070, 0x218f);
3042 yyCharClass->addRange(0x2c00, 0x2fef);
3043 yyCharClass->addRange(0x3001, 0xd7ff);
3044 yyCharClass->addRange(0xf900, 0xfdcf);
3045 yyCharClass->addRange(0xfdf0, 0xfffd);
3046 yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
3047 return Tok_CharClass;
3052 if (xmlSchemaExtensions) {
3053 yyCharClass->setNegative(!yyCharClass->negative());
3059 if (xmlSchemaExtensions) {
3060 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3061 FLAG(QChar::Mark_SpacingCombining) |
3062 FLAG(QChar::Mark_Enclosing) |
3063 FLAG(QChar::Number_DecimalDigit) |
3064 FLAG(QChar::Number_Letter) |
3065 FLAG(QChar::Number_Other) |
3066 FLAG(QChar::Letter_Uppercase) |
3067 FLAG(QChar::Letter_Lowercase) |
3068 FLAG(QChar::Letter_Titlecase) |
3069 FLAG(QChar::Letter_Modifier) |
3070 FLAG(QChar::Letter_Other));
3071 yyCharClass->addSingleton(0x002d); // '-'
3072 yyCharClass->addSingleton(0x002e); // '.'
3073 yyCharClass->addSingleton(0x003a); // ':'
3074 yyCharClass->addSingleton(0x005f); // '_'
3075 yyCharClass->addSingleton(0xb7);
3076 yyCharClass->addRange(0x0030, 0x0039); // [0-9]
3077 yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
3078 yyCharClass->addRange(0x0061, 0x007a); // [a-z]
3079 yyCharClass->addRange(0xc0, 0xd6);
3080 yyCharClass->addRange(0xd8, 0xf6);
3081 yyCharClass->addRange(0xf8, 0x2ff);
3082 yyCharClass->addRange(0x370, 0x37d);
3083 yyCharClass->addRange(0x37f, 0x1fff);
3084 yyCharClass->addRange(0x200c, 0x200d);
3085 yyCharClass->addRange(0x2070, 0x218f);
3086 yyCharClass->addRange(0x2c00, 0x2fef);
3087 yyCharClass->addRange(0x3001, 0xd7ff);
3088 yyCharClass->addRange(0xf900, 0xfdcf);
3089 yyCharClass->addRange(0xfdf0, 0xfffd);
3090 yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
3091 yyCharClass->addRange(0x0300, 0x036f);
3092 yyCharClass->addRange(0x203f, 0x2040);
3093 return Tok_CharClass;
3098 if (xmlSchemaExtensions) {
3099 yyCharClass->setNegative(!yyCharClass->negative());
3105 if (xmlSchemaExtensions) {
3107 error(RXERR_CHARCLASS);
3108 return Tok_CharClass;
3111 QByteArray category;
3113 while (yyCh != '}') {
3116 return Tok_CharClass;
3118 category.append(yyCh);
3121 yyCh = getChar(); // skip closing '}'
3123 int catlen = category.length();
3124 if (catlen == 1 || catlen == 2) {
3125 switch (category.at(0)) {
3128 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3129 FLAG(QChar::Mark_SpacingCombining) |
3130 FLAG(QChar::Mark_Enclosing));
3132 switch (category.at(1)) {
3133 case 'n': yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn
3134 case 'c': yyCharClass->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc
3135 case 'e': yyCharClass->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me
3136 default: error(RXERR_CATEGORY); break;
3142 yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit) |
3143 FLAG(QChar::Number_Letter) |
3144 FLAG(QChar::Number_Other));
3146 switch (category.at(1)) {
3147 case 'd': yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd
3148 case 'l': yyCharClass->addCategories(FLAG(QChar::Number_Letter)); break; // Hl
3149 case 'o': yyCharClass->addCategories(FLAG(QChar::Number_Other)); break; // No
3150 default: error(RXERR_CATEGORY); break;
3156 yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
3157 FLAG(QChar::Separator_Line) |
3158 FLAG(QChar::Separator_Paragraph));
3160 switch (category.at(1)) {
3161 case 's': yyCharClass->addCategories(FLAG(QChar::Separator_Space)); break; // Zs
3162 case 'l': yyCharClass->addCategories(FLAG(QChar::Separator_Line)); break; // Zl
3163 case 'p': yyCharClass->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp
3164 default: error(RXERR_CATEGORY); break;
3170 yyCharClass->addCategories(FLAG(QChar::Other_Control) |
3171 FLAG(QChar::Other_Format) |
3172 FLAG(QChar::Other_Surrogate) |
3173 FLAG(QChar::Other_PrivateUse) |
3174 FLAG(QChar::Other_NotAssigned));
3176 switch (category.at(1)) {
3177 case 'c': yyCharClass->addCategories(FLAG(QChar::Other_Control)); break; // Cc
3178 case 'f': yyCharClass->addCategories(FLAG(QChar::Other_Format)); break; // Cf
3179 case 's': yyCharClass->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs
3180 case 'o': yyCharClass->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co
3181 case 'n': yyCharClass->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn
3182 default: error(RXERR_CATEGORY); break;
3188 yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase) |
3189 FLAG(QChar::Letter_Lowercase) |
3190 FLAG(QChar::Letter_Titlecase) |
3191 FLAG(QChar::Letter_Modifier) |
3192 FLAG(QChar::Letter_Other));
3194 switch (category.at(1)) {
3195 case 'u': yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu
3196 case 'l': yyCharClass->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll
3197 case 't': yyCharClass->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt
3198 case 'm': yyCharClass->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm
3199 case 'o': yyCharClass->addCategories(FLAG(QChar::Letter_Other)); break; // Lo
3200 default: error(RXERR_CATEGORY); break;
3206 yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector) |
3207 FLAG(QChar::Punctuation_Dash) |
3208 FLAG(QChar::Punctuation_Open) |
3209 FLAG(QChar::Punctuation_Close) |
3210 FLAG(QChar::Punctuation_InitialQuote) |
3211 FLAG(QChar::Punctuation_FinalQuote) |
3212 FLAG(QChar::Punctuation_Other));
3214 switch (category.at(1)) {
3215 case 'c': yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc
3216 case 'd': yyCharClass->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd
3217 case 's': yyCharClass->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps
3218 case 'e': yyCharClass->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe
3219 case 'i': yyCharClass->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi
3220 case 'f': yyCharClass->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf
3221 case 'o': yyCharClass->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po
3222 default: error(RXERR_CATEGORY); break;
3228 yyCharClass->addCategories(FLAG(QChar::Symbol_Math) |
3229 FLAG(QChar::Symbol_Currency) |
3230 FLAG(QChar::Symbol_Modifier) |
3231 FLAG(QChar::Symbol_Other));
3233 switch (category.at(1)) {
3234 case 'm': yyCharClass->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm
3235 case 'c': yyCharClass->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc
3236 case 'k': yyCharClass->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk
3237 case 'o': yyCharClass->addCategories(FLAG(QChar::Symbol_Other)); break; // So
3238 default: error(RXERR_CATEGORY); break;
3243 error(RXERR_CATEGORY);
3246 } else if (catlen > 2 && category.at(0) == 'I' && category.at(1) == 's') {
3247 static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[0]);
3248 const CategoriesRangeMapEntry *r = qBinaryFind(categoriesRangeMap, categoriesRangeMap + N, category.constData() + 2);
3249 if (r != categoriesRangeMap + N)
3250 yyCharClass->addRange(r->first, r->second);
3252 error(RXERR_CATEGORY);
3254 error(RXERR_CATEGORY);
3256 return Tok_CharClass;
3261 #ifndef QT_NO_REGEXP_ESCAPE
3264 for (i = 0; i < 4; i++) {
3265 low = QChar(yyCh).toLower().unicode();
3266 if (low >= '0' && low <= '9')
3267 val = (val << 4) | (low - '0');
3268 else if (low >= 'a' && low <= 'f')
3269 val = (val << 4) | (low - 'a' + 10);
3274 return Tok_Char | val;
3279 if (prevCh >= '1' && prevCh <= '9') {
3280 #ifndef QT_NO_REGEXP_BACKREF
3282 while (yyCh >= '0' && yyCh <= '9') {
3283 val = (val * 10) + (yyCh - '0');
3286 return Tok_BackRef | val;
3288 error(RXERR_DISABLED);
3291 return Tok_Char | prevCh;
3294 #ifndef QT_NO_REGEXP_INTERVAL
3295 int QRegExpEngine::getRep(int def)
3297 if (yyCh >= '0' && yyCh <= '9') {
3300 rep = 10 * rep + yyCh - '0';
3301 if (rep >= InftyRep) {
3302 error(RXERR_REPETITION);
3306 } while (yyCh >= '0' && yyCh <= '9');
3314 #ifndef QT_NO_REGEXP_LOOKAHEAD
3315 void QRegExpEngine::skipChars(int n)
3324 void QRegExpEngine::error(const char *msg)
3326 if (yyError.isEmpty())
3327 yyError = QLatin1String(msg);
3330 void QRegExpEngine::startTokenizer(const QChar *rx, int len)
3337 yyCharClass.reset(new QRegExpCharClass);
3340 yyError = QString();
3343 int QRegExpEngine::getToken()
3345 #ifndef QT_NO_REGEXP_CCLASS
3346 ushort pendingCh = 0;
3354 #ifndef QT_NO_REGEXP_CCLASS
3355 yyCharClass->clear();
3372 #ifndef QT_NO_REGEXP_LOOKAHEAD
3374 return Tok_NegLookahead;
3376 return Tok_PosLookahead;
3379 return Tok_MagicLeftParen;
3381 error(RXERR_LOOKBEHIND);
3382 return Tok_MagicLeftParen;
3384 error(RXERR_LOOKAHEAD);
3385 return Tok_MagicLeftParen;
3388 return Tok_LeftParen;
3391 return Tok_RightParen;
3394 yyMaxRep = InftyRep;
3395 return Tok_Quantifier;
3398 yyMaxRep = InftyRep;
3399 return Tok_Quantifier;
3401 #ifndef QT_NO_REGEXP_CCLASS
3402 yyCharClass->setNegative(true);
3404 return Tok_CharClass;
3408 return Tok_Quantifier;
3410 #ifndef QT_NO_REGEXP_CCLASS
3412 yyCharClass->setNegative(true);
3415 charPending = false;
3416 rangePending = false;
3418 if (yyCh == '-' && charPending && !rangePending) {
3419 rangePending = true;
3422 if (charPending && !rangePending) {
3423 yyCharClass->addSingleton(pendingCh);
3424 charPending = false;
3429 if (tok == Tok_Word)
3432 tok = Tok_Char | yyCh;
3435 if (tok == Tok_CharClass) {
3437 yyCharClass->addSingleton('-');
3438 yyCharClass->addSingleton(pendingCh);
3439 charPending = false;
3440 rangePending = false;
3442 } else if ((tok & Tok_Char) != 0) {
3444 yyCharClass->addRange(pendingCh, tok ^ Tok_Char);
3445 charPending = false;
3446 rangePending = false;
3448 pendingCh = tok ^ Tok_Char;
3452 error(RXERR_CHARCLASS);
3455 } while (yyCh != ']' && yyCh != EOS);
3457 yyCharClass->addSingleton('-');
3459 yyCharClass->addSingleton(pendingCh);
3464 return Tok_CharClass;
3467 return Tok_Char | '[';
3472 error(RXERR_LEFTDELIM);
3473 return Tok_Char | ']';
3477 #ifndef QT_NO_REGEXP_INTERVAL
3478 yyMinRep = getRep(0);
3479 yyMaxRep = yyMinRep;
3482 yyMaxRep = getRep(InftyRep);
3484 if (yyMaxRep < yyMinRep)
3485 error(RXERR_INTERVAL);
3487 error(RXERR_REPETITION);
3489 return Tok_Quantifier;
3491 error(RXERR_DISABLED);
3492 return Tok_Char | '{';
3497 error(RXERR_LEFTDELIM);
3498 return Tok_Char | '}';
3500 return Tok_Char | prevCh;
3504 int QRegExpEngine::parse(const QChar *pattern, int len)
3507 startTokenizer(pattern, len);
3509 #ifndef QT_NO_REGEXP_CAPTURE
3510 yyMayCapture = true;
3512 yyMayCapture = false;
3515 #ifndef QT_NO_REGEXP_CAPTURE
3516 int atom = startAtom(false);
3518 QRegExpCharClass anything;
3519 Box box(this); // create InitialState
3521 Box rightBox(this); // create FinalState
3522 rightBox.set(anything);
3524 Box middleBox(this);
3525 parseExpression(&middleBox);
3526 #ifndef QT_NO_REGEXP_CAPTURE
3527 finishAtom(atom, false);
3529 #ifndef QT_NO_REGEXP_OPTIM
3530 middleBox.setupHeuristics();
3534 yyCharClass.reset(0);
3536 #ifndef QT_NO_REGEXP_CAPTURE
3537 for (int i = 0; i < nf; ++i) {
3538 switch (f[i].capture) {
3539 case QRegExpAtom::NoCapture:
3541 case QRegExpAtom::OfficialCapture:
3542 f[i].capture = ncap;
3543 captureForOfficialCapture.append(ncap);
3547 case QRegExpAtom::UnofficialCapture:
3548 f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;
3552 #ifndef QT_NO_REGEXP_BACKREF
3553 #ifndef QT_NO_REGEXP_OPTIM
3554 if (officialncap == 0 && nbrefs == 0) {
3559 // handle the case where there's a \5 with no corresponding capture
3560 // (captureForOfficialCapture.size() != officialncap)
3561 for (int i = 0; i < nbrefs - officialncap; ++i) {
3562 captureForOfficialCapture.append(ncap);
3568 if (!yyError.isEmpty())
3571 #ifndef QT_NO_REGEXP_OPTIM
3572 const QRegExpAutomatonState &sinit = s.at(InitialState);
3573 caretAnchored = !sinit.anchors.isEmpty();
3574 if (caretAnchored) {
3575 const QMap<int, int> &anchors = sinit.anchors;
3576 QMap<int, int>::const_iterator a;
3577 for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {
3579 #ifndef QT_NO_REGEXP_ANCHOR_ALT
3580 (*a & Anchor_Alternation) != 0 ||
3582 (*a & Anchor_Caret) == 0)
3584 caretAnchored = false;
3592 int numStates = s.count();
3593 for (int i = 0; i < numStates; ++i) {
3594 QRegExpAutomatonState &state = s[i];
3595 if (!state.anchors.isEmpty()) {
3596 QMap<int, int>::iterator a = state.anchors.begin();
3597 while (a != state.anchors.end()) {
3599 a = state.anchors.erase(a);
3609 void QRegExpEngine::parseAtom(Box *box)
3611 #ifndef QT_NO_REGEXP_LOOKAHEAD
3612 QRegExpEngine *eng = 0;
3617 if ((yyTok & Tok_Char) != 0) {
3618 box->set(QChar(yyTok ^ Tok_Char));
3620 #ifndef QT_NO_REGEXP_OPTIM
3625 box->catAnchor(Anchor_Dollar);
3628 box->catAnchor(Anchor_Caret);
3630 #ifndef QT_NO_REGEXP_LOOKAHEAD
3631 case Tok_PosLookahead:
3632 case Tok_NegLookahead:
3633 neg = (yyTok == Tok_NegLookahead);
3634 eng = new QRegExpEngine(cs, greedyQuantifiers);
3635 len = eng->parse(yyIn + yyPos - 1, yyLen - yyPos + 1);
3639 error(RXERR_LOOKAHEAD);
3640 box->catAnchor(addLookahead(eng, neg));
3642 if (yyTok != Tok_RightParen)
3643 error(RXERR_LOOKAHEAD);
3646 #ifndef QT_NO_REGEXP_ESCAPE
3648 box->catAnchor(Anchor_Word);
3651 box->catAnchor(Anchor_NonWord);
3655 case Tok_MagicLeftParen:
3657 parseExpression(box);
3658 if (yyTok != Tok_RightParen)
3662 box->set(*yyCharClass);
3664 case Tok_Quantifier:
3665 error(RXERR_REPETITION);
3668 #ifndef QT_NO_REGEXP_BACKREF
3669 if ((yyTok & Tok_BackRef) != 0)
3670 box->set(yyTok ^ Tok_BackRef);
3673 error(RXERR_DISABLED);
3679 void QRegExpEngine::parseFactor(Box *box)
3681 #ifndef QT_NO_REGEXP_CAPTURE
3682 int outerAtom = greedyQuantifiers ? startAtom(false) : -1;
3683 int innerAtom = startAtom(yyMayCapture && yyTok == Tok_LeftParen);
3684 bool magicLeftParen = (yyTok == Tok_MagicLeftParen);
3686 const int innerAtom = -1;
3689 #ifndef QT_NO_REGEXP_INTERVAL
3691 yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3692 *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3694 const QChar *in = yyIn;
3699 QRegExpCharClass charClass;
3700 if (yyTok == Tok_CharClass)
3701 charClass = *yyCharClass;
3703 bool mayCapture = yyMayCapture;
3707 #ifndef QT_NO_REGEXP_CAPTURE
3708 finishAtom(innerAtom, magicLeftParen);
3711 bool hasQuantifier = (yyTok == Tok_Quantifier);
3712 if (hasQuantifier) {
3713 #ifndef QT_NO_REGEXP_OPTIM
3716 if (yyMaxRep == InftyRep) {
3717 box->plus(innerAtom);
3718 #ifndef QT_NO_REGEXP_INTERVAL
3719 } else if (yyMaxRep == 0) {
3726 #ifndef QT_NO_REGEXP_INTERVAL
3727 yyMayCapture = false;
3728 int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1;
3729 int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1);
3734 for (i = 0; i < beta; i++) {
3737 parseAtom(&leftBox);
3738 leftBox.cat(rightBox);
3742 for (i = 0; i < alpha; i++) {
3745 parseAtom(&leftBox);
3746 leftBox.cat(rightBox);
3753 #ifndef QT_NO_REGEXP_INTERVAL
3754 yyMayCapture = mayCapture;
3758 #ifndef QT_NO_REGEXP_CAPTURE
3759 if (greedyQuantifiers)
3760 finishAtom(outerAtom, hasQuantifier);
3764 void QRegExpEngine::parseTerm(Box *box)
3766 #ifndef QT_NO_REGEXP_OPTIM
3767 if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)
3770 while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {
3772 parseFactor(&rightBox);
3777 void QRegExpEngine::parseExpression(Box *box)
3780 while (yyTok == Tok_Bar) {
3781 #ifndef QT_NO_REGEXP_OPTIM
3786 parseTerm(&rightBox);
3792 The struct QRegExpPrivate contains the private data of a regular
3793 expression other than the automaton. It makes it possible for many
3794 QRegExp objects to use the same QRegExpEngine object with different
3795 QRegExpPrivate objects.
3797 struct QRegExpPrivate
3800 QRegExpEngineKey engineKey;
3802 #ifndef QT_NO_REGEXP_CAPTURE
3803 QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()
3804 QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3806 QRegExpMatchState matchState;
3808 inline QRegExpPrivate()
3809 : eng(0), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }
3810 inline QRegExpPrivate(const QRegExpEngineKey &key)
3811 : eng(0), engineKey(key), minimal(false) {}
3814 #if !defined(QT_NO_REGEXP_OPTIM)
3815 uint qHash(const QRegExpEngineKey &key, uint seed = 0) Q_DECL_NOTHROW
3817 return qHash(key.pattern, seed);
3820 typedef QCache<QRegExpEngineKey, QRegExpEngine> EngineCache;
3821 Q_GLOBAL_STATIC(EngineCache, globalEngineCache)
3822 static QBasicMutex globalEngineCacheMutex;
3823 #endif // QT_NO_REGEXP_OPTIM
3825 static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key)
3827 if (!eng->ref.deref()) {
3828 #if !defined(QT_NO_REGEXP_OPTIM)
3829 if (globalEngineCache()) {
3830 QMutexLocker locker(&globalEngineCacheMutex);
3832 globalEngineCache()->insert(key, eng, 4 + key.pattern.length() / 4);
3833 } QT_CATCH(const std::bad_alloc &) {
3834 // in case of an exception (e.g. oom), just delete the engine
3847 static void prepareEngine_helper(QRegExpPrivate *priv)
3849 bool initMatchState = !priv->eng;
3850 #if !defined(QT_NO_REGEXP_OPTIM)
3851 if (!priv->eng && globalEngineCache()) {
3852 QMutexLocker locker(&globalEngineCacheMutex);
3853 priv->eng = globalEngineCache()->take(priv->engineKey);
3855 priv->eng->ref.ref();
3857 #endif // QT_NO_REGEXP_OPTIM
3860 priv->eng = new QRegExpEngine(priv->engineKey);
3863 priv->matchState.prepareForMatch(priv->eng);
3866 inline static void prepareEngine(QRegExpPrivate *priv)
3870 prepareEngine_helper(priv);
3873 static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str)
3875 prepareEngine(priv);
3876 priv->matchState.prepareForMatch(priv->eng);
3877 #ifndef QT_NO_REGEXP_CAPTURE
3879 priv->capturedCache.clear();
3885 static void invalidateEngine(QRegExpPrivate *priv)
3887 if (priv->eng != 0) {
3888 derefEngine(priv->eng, priv->engineKey);
3890 priv->matchState.drain();
3895 \enum QRegExp::CaretMode
3897 The CaretMode enum defines the different meanings of the caret
3898 (\b{^}) in a regular expression. The possible values are:
3901 The caret corresponds to index 0 in the searched string.
3903 \value CaretAtOffset
3904 The caret corresponds to the start offset of the search.
3906 \value CaretWontMatch
3907 The caret never matches.
3911 \enum QRegExp::PatternSyntax
3913 The syntax used to interpret the meaning of the pattern.
3915 \value RegExp A rich Perl-like pattern matching syntax. This is
3918 \value RegExp2 Like RegExp, but with \l{greedy quantifiers}.
3919 (Introduced in Qt 4.2.)
3921 \value Wildcard This provides a simple pattern matching syntax
3922 similar to that used by shells (command interpreters) for "file
3923 globbing". See \l{Wildcard Matching}.
3925 \value WildcardUnix This is similar to Wildcard but with the
3926 behavior of a Unix shell. The wildcard characters can be escaped
3927 with the character "\\".
3929 \value FixedString The pattern is a fixed string. This is
3930 equivalent to using the RegExp pattern on a string in
3931 which all metacharacters are escaped using escape().
3933 \value W3CXmlSchema11 The pattern is a regular expression as
3934 defined by the W3C XML Schema 1.1 specification.
3936 \sa setPatternSyntax()
3940 Constructs an empty regexp.
3942 \sa isValid(), errorString()
3946 priv = new QRegExpPrivate;
3947 prepareEngine(priv);
3951 Constructs a regular expression object for the given \a pattern
3952 string. The pattern must be given using wildcard notation if \a
3953 syntax is \l Wildcard; the default is \l RegExp. The pattern is
3954 case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
3955 greedy (maximal), but can be changed by calling
3958 \sa setPattern(), setCaseSensitivity(), setPatternSyntax()
3960 QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
3962 priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs));
3963 prepareEngine(priv);
3967 Constructs a regular expression as a copy of \a rx.
3971 QRegExp::QRegExp(const QRegExp &rx)
3973 priv = new QRegExpPrivate;
3978 Destroys the regular expression and cleans up its internal data.
3982 invalidateEngine(priv);
3987 Copies the regular expression \a rx and returns a reference to the
3988 copy. The case sensitivity, wildcard, and minimal matching options
3991 QRegExp &QRegExp::operator=(const QRegExp &rx)
3993 prepareEngine(rx.priv); // to allow sharing
3994 QRegExpEngine *otherEng = rx.priv->eng;
3996 otherEng->ref.ref();
3997 invalidateEngine(priv);
3998 priv->eng = otherEng;
3999 priv->engineKey = rx.priv->engineKey;
4000 priv->minimal = rx.priv->minimal;
4001 #ifndef QT_NO_REGEXP_CAPTURE
4002 priv->t = rx.priv->t;
4003 priv->capturedCache = rx.priv->capturedCache;
4006 priv->matchState.prepareForMatch(priv->eng);
4007 priv->matchState.captured = rx.priv->matchState.captured;
4012 \fn void QRegExp::swap(QRegExp &other)
4015 Swaps regular expression \a other with this regular
4016 expression. This operation is very fast and never fails.
4020 Returns true if this regular expression is equal to \a rx;
4021 otherwise returns false.
4023 Two QRegExp objects are equal if they have the same pattern
4024 strings and the same settings for case sensitivity, wildcard and
4027 bool QRegExp::operator==(const QRegExp &rx) const
4029 return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;
4033 \fn bool QRegExp::operator!=(const QRegExp &rx) const
4035 Returns true if this regular expression is not equal to \a rx;
4036 otherwise returns false.
4042 Returns true if the pattern string is empty; otherwise returns
4045 If you call exactMatch() with an empty pattern on an empty string
4046 it will return true; otherwise it returns false since it operates
4047 over the whole string. If you call indexIn() with an empty pattern
4048 on \e any string it will return the start offset (0 by default)
4049 because the empty pattern matches the 'emptiness' at the start of
4050 the string. In this case the length of the match returned by
4051 matchedLength() will be 0.
4053 See QString::isEmpty().
4056 bool QRegExp::isEmpty() const
4058 return priv->engineKey.pattern.isEmpty();
4062 Returns true if the regular expression is valid; otherwise returns
4063 false. An invalid regular expression never matches.
4065 The pattern \b{[a-z} is an example of an invalid pattern, since
4066 it lacks a closing square bracket.
4068 Note that the validity of a regexp may also depend on the setting
4069 of the wildcard flag, for example \b{*.html} is a valid
4070 wildcard regexp but an invalid full regexp.
4074 bool QRegExp::isValid() const
4076 if (priv->engineKey.pattern.isEmpty()) {
4079 prepareEngine(priv);
4080 return priv->eng->isValid();
4085 Returns the pattern string of the regular expression. The pattern
4086 has either regular expression syntax or wildcard syntax, depending
4089 \sa patternSyntax(), caseSensitivity()
4091 QString QRegExp::pattern() const
4093 return priv->engineKey.pattern;
4097 Sets the pattern string to \a pattern. The case sensitivity,
4098 wildcard, and minimal matching options are not changed.
4100 \sa setPatternSyntax(), setCaseSensitivity()
4102 void QRegExp::setPattern(const QString &pattern)
4104 if (priv->engineKey.pattern != pattern) {
4105 invalidateEngine(priv);
4106 priv->engineKey.pattern = pattern;
4111 Returns Qt::CaseSensitive if the regexp is matched case
4112 sensitively; otherwise returns Qt::CaseInsensitive.
4114 \sa patternSyntax(), pattern(), isMinimal()
4116 Qt::CaseSensitivity QRegExp::caseSensitivity() const
4118 return priv->engineKey.cs;
4122 Sets case sensitive matching to \a cs.
4124 If \a cs is Qt::CaseSensitive, \b{\\.txt$} matches
4125 \c{readme.txt} but not \c{README.TXT}.
4127 \sa setPatternSyntax(), setPattern(), setMinimal()
4129 void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)
4131 if ((bool)cs != (bool)priv->engineKey.cs) {
4132 invalidateEngine(priv);
4133 priv->engineKey.cs = cs;
4138 Returns the syntax used by the regular expression. The default is
4141 \sa pattern(), caseSensitivity()
4143 QRegExp::PatternSyntax QRegExp::patternSyntax() const
4145 return priv->engineKey.patternSyntax;
4149 Sets the syntax mode for the regular expression. The default is
4152 Setting \a syntax to QRegExp::Wildcard enables simple shell-like
4153 \l{wildcard matching}. For example, \b{r*.txt} matches the
4154 string \c{readme.txt} in wildcard mode, but does not match
4157 Setting \a syntax to QRegExp::FixedString means that the pattern
4158 is interpreted as a plain string. Special characters (e.g.,
4159 backslash) don't need to be escaped then.
4161 \sa setPattern(), setCaseSensitivity(), escape()
4163 void QRegExp::setPatternSyntax(PatternSyntax syntax)
4165 if (syntax != priv->engineKey.patternSyntax) {
4166 invalidateEngine(priv);
4167 priv->engineKey.patternSyntax = syntax;
4172 Returns true if minimal (non-greedy) matching is enabled;
4173 otherwise returns false.
4175 \sa caseSensitivity(), setMinimal()
4177 bool QRegExp::isMinimal() const
4179 return priv->minimal;
4183 Enables or disables minimal matching. If \a minimal is false,
4184 matching is greedy (maximal) which is the default.
4186 For example, suppose we have the input string "We must be
4187 <b>bold</b>, very <b>bold</b>!" and the pattern
4188 \b{<b>.*</b>}. With the default greedy (maximal) matching,
4189 the match is "We must be \underline{<b>bold</b>, very
4190 <b>bold</b>}!". But with minimal (non-greedy) matching, the
4191 first match is: "We must be \underline{<b>bold</b>}, very
4192 <b>bold</b>!" and the second match is "We must be <b>bold</b>,
4193 very \underline{<b>bold</b>}!". In practice we might use the pattern
4194 \b{<b>[^<]*\</b>} instead, although this will still fail for
4197 \sa setCaseSensitivity()
4199 void QRegExp::setMinimal(bool minimal)
4201 priv->minimal = minimal;
4204 // ### Qt 5: make non-const
4206 Returns true if \a str is matched exactly by this regular
4207 expression; otherwise returns false. You can determine how much of
4208 the string was matched by calling matchedLength().
4210 For a given regexp string R, exactMatch("R") is the equivalent of
4211 indexIn("^R$") since exactMatch() effectively encloses the regexp
4212 in the start of string and end of string anchors, except that it
4213 sets matchedLength() differently.
4215 For example, if the regular expression is \b{blue}, then
4216 exactMatch() returns true only for input \c blue. For inputs \c
4217 bluebell, \c blutak and \c lightblue, exactMatch() returns false
4218 and matchedLength() will return 4, 3 and 0 respectively.
4220 Although const, this function sets matchedLength(),
4221 capturedTexts(), and pos().
4223 \sa indexIn(), lastIndexIn()
4225 bool QRegExp::exactMatch(const QString &str) const
4227 prepareEngineForMatch(priv, str);
4228 priv->matchState.match(str.unicode(), str.length(), 0, priv->minimal, true, 0);
4229 if (priv->matchState.captured[1] == str.length()) {
4232 priv->matchState.captured[0] = 0;
4233 priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen;
4238 // ### Qt 5: make non-const
4240 Attempts to find a match in \a str from position \a offset (0 by
4241 default). If \a offset is -1, the search starts at the last
4242 character; if -2, at the next to last character; etc.
4244 Returns the position of the first match, or -1 if there was no
4247 The \a caretMode parameter can be used to instruct whether \b{^}
4248 should match at index 0 or at \a offset.
4250 You might prefer to use QString::indexOf(), QString::contains(),
4251 or even QStringList::filter(). To replace matches use
4255 \snippet code/src_corelib_tools_qregexp.cpp 13
4257 Although const, this function sets matchedLength(),
4258 capturedTexts() and pos().
4260 If the QRegExp is a wildcard expression (see setPatternSyntax())
4261 and want to test a string against the whole wildcard expression,
4262 use exactMatch() instead of this function.
4264 \sa lastIndexIn(), exactMatch()
4267 int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const
4269 prepareEngineForMatch(priv, str);
4271 offset += str.length();
4272 priv->matchState.match(str.unicode(), str.length(), offset,
4273 priv->minimal, false, caretIndex(offset, caretMode));
4274 return priv->matchState.captured[0];
4277 // ### Qt 5: make non-const
4279 Attempts to find a match backwards in \a str from position \a
4280 offset. If \a offset is -1 (the default), the search starts at the
4281 last character; if -2, at the next to last character; etc.
4283 Returns the position of the first match, or -1 if there was no
4286 The \a caretMode parameter can be used to instruct whether \b{^}
4287 should match at index 0 or at \a offset.
4289 Although const, this function sets matchedLength(),
4290 capturedTexts() and pos().
4292 \warning Searching backwards is much slower than searching
4295 \sa indexIn(), exactMatch()
4298 int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const
4300 prepareEngineForMatch(priv, str);
4302 offset += str.length();
4303 if (offset < 0 || offset > str.length()) {
4304 memset(priv->matchState.captured, -1, priv->matchState.capturedSize*sizeof(int));
4308 while (offset >= 0) {
4309 priv->matchState.match(str.unicode(), str.length(), offset,
4310 priv->minimal, true, caretIndex(offset, caretMode));
4311 if (priv->matchState.captured[0] == offset)
4319 Returns the length of the last matched string, or -1 if there was
4322 \sa exactMatch(), indexIn(), lastIndexIn()
4324 int QRegExp::matchedLength() const
4326 return priv->matchState.captured[1];
4329 #ifndef QT_NO_REGEXP_CAPTURE
4333 Returns the number of captures contained in the regular expression.
4335 int QRegExp::captureCount() const
4337 prepareEngine(priv);
4338 return priv->eng->captureCount();
4342 Returns a list of the captured text strings.
4344 The first string in the list is the entire matched string. Each
4345 subsequent list element contains a string that matched a
4346 (capturing) subexpression of the regexp.
4349 \snippet code/src_corelib_tools_qregexp.cpp 14
4351 The above example also captures elements that may be present but
4352 which we have no interest in. This problem can be solved by using
4353 non-capturing parentheses:
4355 \snippet code/src_corelib_tools_qregexp.cpp 15
4357 Note that if you want to iterate over the list, you should iterate
4359 \snippet code/src_corelib_tools_qregexp.cpp 16
4361 Some regexps can match an indeterminate number of times. For
4362 example if the input string is "Offsets: 12 14 99 231 7" and the
4363 regexp, \c{rx}, is \b{(\\d+)+}, we would hope to get a list of
4364 all the numbers matched. However, after calling
4365 \c{rx.indexIn(str)}, capturedTexts() will return the list ("12",
4366 "12"), i.e. the entire match was "12" and the first subexpression
4367 matched was "12". The correct approach is to use cap() in a
4368 \l{QRegExp#cap_in_a_loop}{loop}.
4370 The order of elements in the string list is as follows. The first
4371 element is the entire matching string. Each subsequent element
4372 corresponds to the next capturing open left parentheses. Thus
4373 capturedTexts()[1] is the text of the first capturing parentheses,
4374 capturedTexts()[2] is the text of the second and so on
4375 (corresponding to $1, $2, etc., in some other regexp languages).
4379 QStringList QRegExp::capturedTexts() const
4381 if (priv->capturedCache.isEmpty()) {
4382 prepareEngine(priv);
4383 const int *captured = priv->matchState.captured;
4384 int n = priv->matchState.capturedSize;
4386 for (int i = 0; i < n; i += 2) {
4388 if (captured[i + 1] == 0)
4389 m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty
4390 else if (captured[i] >= 0)
4391 m = priv->t.mid(captured[i], captured[i + 1]);
4392 priv->capturedCache.append(m);
4396 return priv->capturedCache;
4402 QStringList QRegExp::capturedTexts()
4404 return const_cast<const QRegExp *>(this)->capturedTexts();
4408 Returns the text captured by the \a nth subexpression. The entire
4409 match has index 0 and the parenthesized subexpressions have
4410 indexes starting from 1 (excluding non-capturing parentheses).
4412 \snippet code/src_corelib_tools_qregexp.cpp 17
4414 The order of elements matched by cap() is as follows. The first
4415 element, cap(0), is the entire matching string. Each subsequent
4416 element corresponds to the next capturing open left parentheses.
4417 Thus cap(1) is the text of the first capturing parentheses, cap(2)
4418 is the text of the second, and so on.
4420 \sa capturedTexts(), pos()
4422 QString QRegExp::cap(int nth) const
4424 return capturedTexts().value(nth);
4430 QString QRegExp::cap(int nth)
4432 return const_cast<const QRegExp *>(this)->cap(nth);
4436 Returns the position of the \a nth captured text in the searched
4437 string. If \a nth is 0 (the default), pos() returns the position
4441 \snippet code/src_corelib_tools_qregexp.cpp 18
4443 For zero-length matches, pos() always returns -1. (For example, if
4444 cap(4) would return an empty string, pos(4) returns -1.) This is
4445 a feature of the implementation.
4447 \sa cap(), capturedTexts()
4449 int QRegExp::pos(int nth) const
4451 if (nth < 0 || nth >= priv->matchState.capturedSize / 2)
4454 return priv->matchState.captured[2 * nth];
4460 int QRegExp::pos(int nth)
4462 return const_cast<const QRegExp *>(this)->pos(nth);
4466 Returns a text string that explains why a regexp pattern is
4467 invalid the case being; otherwise returns "no error occurred".
4471 QString QRegExp::errorString() const
4474 return QString::fromLatin1(RXERR_OK);
4476 return priv->eng->errorString();
4483 QString QRegExp::errorString()
4485 return const_cast<const QRegExp *>(this)->errorString();
4490 Returns the string \a str with every regexp special character
4491 escaped with a backslash. The special characters are $, (,), *, +,
4492 ., ?, [, \,], ^, {, | and }.
4496 \snippet code/src_corelib_tools_qregexp.cpp 19
4498 This function is useful to construct regexp patterns dynamically:
4500 \snippet code/src_corelib_tools_qregexp.cpp 20
4502 \sa setPatternSyntax()
4504 QString QRegExp::escape(const QString &str)
4507 const int count = str.count();
4508 quoted.reserve(count * 2);
4509 const QLatin1Char backslash('\\');
4510 for (int i = 0; i < count; i++) {
4511 switch (str.at(i).toLatin1()) {
4526 quoted.append(backslash);
4528 quoted.append(str.at(i));
4534 #ifndef QT_NO_DATASTREAM
4538 Writes the regular expression \a regExp to stream \a out.
4540 \sa {Serializing Qt Data Types}
4542 QDataStream &operator<<(QDataStream &out, const QRegExp ®Exp)
4544 return out << regExp.pattern() << (quint8)regExp.caseSensitivity()
4545 << (quint8)regExp.patternSyntax()
4546 << (quint8)!!regExp.isMinimal();
4552 Reads a regular expression from stream \a in into \a regExp.
4554 \sa {Serializing Qt Data Types}
4556 QDataStream &operator>>(QDataStream &in, QRegExp ®Exp)
4560 quint8 patternSyntax;
4563 in >> pattern >> cs >> patternSyntax >> isMinimal;
4565 QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),
4566 QRegExp::PatternSyntax(patternSyntax));
4568 newRegExp.setMinimal(isMinimal);
4572 #endif // QT_NO_DATASTREAM
4574 #ifndef QT_NO_DEBUG_STREAM
4575 QDebug operator<<(QDebug dbg, const QRegExp &r)
4577 dbg.nospace() << "QRegExp(patternSyntax=" << r.patternSyntax()
4578 << ", pattern='"<< r.pattern() << "')";