QRegularExpression: optimize global match
authorGiuseppe D'Angelo <dangelog@gmail.com>
Sat, 7 Apr 2012 00:08:58 +0000 (01:08 +0100)
committerQt by Nokia <qt-info@nokia.com>
Thu, 12 Apr 2012 13:12:23 +0000 (15:12 +0200)
PCRE doesn't like illegal Unicode sequences (it's explicitely
documented in pcreunicode(3) that they trigger undefined behaviour,
and the program may crash). Therefore, we always let PCRE check
the validity of both the pattern and the subject string.

However, when performing global matching, the subject string
can be checked only once: subsequent matches can safely skip the check
and avoid a huge performance hit of scanning the whole subject
string for each match (!).

This patch implements that behaviour internally -- it's still
not possible for the user to skip the sanity check. On large
subject strings, this gives a terrific performance benefit.

Change-Id: Ia44cf18782e07966c9cd6ec4ccfef081ed131763
Reviewed-by: Robin Burchell <robin+qt@viroteck.net>
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
src/corelib/tools/qregularexpression.cpp

index 27264f7..a65ac0c 100644 (file)
@@ -796,6 +796,7 @@ struct QRegularExpressionPrivate : QSharedData
                                             int offset,
                                             QRegularExpression::MatchType matchType,
                                             QRegularExpression::MatchOptions matchOptions,
+                                            bool checkSubjectString = true,
                                             const QRegularExpressionMatchPrivate *previous = 0) const;
 
     int captureIndexForName(const QString &name) const;
@@ -1165,7 +1166,8 @@ static int pcre16SafeExec(const pcre16 *code, const pcre16_extra *extra,
     Performs a match of type \a matchType on the given \a subject string with
     options \a matchOptions and returns the QRegularExpressionMatchPrivate of
     the result. It also advances a match if a previous result is given as \a
-    previous.
+    previous. The \a subject string goes a Unicode validity check if
+    \a checkSubjectString is true (PCRE doesn't like illegal UTF-16 sequences).
 
     Advancing a match is a tricky algorithm. If the previous match matched a
     non-empty string, we just do an ordinary match at the offset position.
@@ -1182,6 +1184,7 @@ QRegularExpressionMatchPrivate *QRegularExpressionPrivate::doMatch(const QString
                                                                    int offset,
                                                                    QRegularExpression::MatchType matchType,
                                                                    QRegularExpression::MatchOptions matchOptions,
+                                                                   bool checkSubjectString,
                                                                    const QRegularExpressionMatchPrivate *previous) const
 {
     if (offset < 0)
@@ -1211,6 +1214,9 @@ QRegularExpressionMatchPrivate *QRegularExpressionPrivate::doMatch(const QString
     else if (matchType == QRegularExpression::PartialPreferFirstMatch)
         pcreOptions |= PCRE_PARTIAL_HARD;
 
+    if (!checkSubjectString)
+        pcreOptions |= PCRE_NO_UTF16_CHECK;
+
     bool previousMatchWasEmpty = false;
     if (previous && previous->hasMatch &&
             (previous->capturedOffsets.at(0) == previous->capturedOffsets.at(1))) {
@@ -1318,10 +1324,15 @@ QRegularExpressionMatch QRegularExpressionMatchPrivate::nextMatch() const
     Q_ASSERT(isValid);
     Q_ASSERT(hasMatch || hasPartialMatch);
 
+    // Note the "false" passed for the check of the subject string:
+    // if we're advancing a match on the same subject,
+    // then that subject was already checked at least once (when this object
+    // was created, or when the object that created this one was created, etc.)
     QRegularExpressionMatchPrivate *nextPrivate = regularExpression.d->doMatch(subject,
                                                                                capturedOffsets.at(1),
                                                                                matchType,
                                                                                matchOptions,
+                                                                               false,
                                                                                this);
     return QRegularExpressionMatch(*nextPrivate);
 }