QRegularExpression: fix matching over QStringRefs
author Giuseppe D'Angelo <giuseppe.dangelo@kdab.com>
Mon, 17 Aug 2015 09:28:16 +0000 (11:28 +0200)
committer Giuseppe D'Angelo <giuseppe.dangelo@kdab.com>
Tue, 18 Aug 2015 14:18:48 +0000 (14:18 +0000)
Playing with the offset argument of pcre_exec is not equivalent to
adjusting the pointer to the subject string. In particular, PCRE
can inspect characters before the offset to evaluate lookbehinds or
"transition" metacharacters (\b, \B, etc.).

As a result, the code that deals with QStringRefs did not behave the same
as the corresponding code dealing with QStrings. For instance,

   QString subject("Miss");
   QRegularExpression re("(?<=M)iss");
   re.match(subject.mid(1));           // doesn't match
   re.match(subject.midRef(1));        // matches!!!

Instead, actually adjust the pointer to the subject string so that
the behavior is identical. A broken test that relied on the two
approaches being equivalent is also removed.

Change-Id: If96333241ef59621d7f5a6a170ebd0a186844874
Reviewed-by: Volker Krause <volker.krause@kdab.com>
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
src/corelib/tools/qregularexpression.cpp
tests/auto/corelib/tools/qregularexpression/tst_qregularexpression.cpp
tests/auto/corelib/tools/qregularexpression/tst_qregularexpression.h

index 2e3c2ca79f3f93e0f43d7ee984ea8a4d0bfeb6dc..070e68154fdedbfae9e923b2ecd84143149d47b6 100644 (file)
@@ -1,7 +1,7 @@
 /****************************************************************************
 **
-** Copyright (C) 2012 Giuseppe D'Angelo <dangelog@gmail.com>.
-** Copyright (C) 2012 Klarälvdalens Datakonsult AB, a KDAB Group company, info@kdab.com, author Giuseppe D'Angelo <giuseppe.dangelo@kdab.com>
+** Copyright (C) 2015 Giuseppe D'Angelo <dangelog@gmail.com>.
+** Copyright (C) 2015 Klarälvdalens Datakonsult AB, a KDAB Group company, info@kdab.com, author Giuseppe D'Angelo <giuseppe.dangelo@kdab.com>
 ** Copyright (C) 2015 The Qt Company Ltd.
 ** Contact: http://www.qt.io/licensing/
 **
@@ -1325,48 +1325,45 @@ QRegularExpressionMatchPrivate *QRegularExpressionPrivate::doMatch(const QString
     int * const captureOffsets = priv->capturedOffsets.data();
     const int captureOffsetsCount = priv->capturedOffsets.size();
 
-    int realOffset = offset + subjectStart;
-    const int realSubjectLength = subjectLength + subjectStart;
-
-    const unsigned short * const subjectUtf16 = subject.utf16();
+    const unsigned short * const subjectUtf16 = subject.utf16() + subjectStart;
 
     int result;
 
     if (!previousMatchWasEmpty) {
         result = pcre16SafeExec(compiledPattern, currentStudyData,
-                                subjectUtf16, realSubjectLength,
-                                realOffset, pcreOptions,
+                                subjectUtf16, subjectLength,
+                                offset, pcreOptions,
                                 captureOffsets, captureOffsetsCount);
     } else {
         result = pcre16SafeExec(compiledPattern, currentStudyData,
-                                subjectUtf16, realSubjectLength,
-                                realOffset, pcreOptions | PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED,
+                                subjectUtf16, subjectLength,
+                                offset, pcreOptions | PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED,
                                 captureOffsets, captureOffsetsCount);
 
         if (result == PCRE_ERROR_NOMATCH) {
-            ++realOffset;
+            ++offset;
 
             if (usingCrLfNewlines
-                    && realOffset < realSubjectLength
-                    && subjectUtf16[realOffset - 1] == QLatin1Char('\r')
-                    && subjectUtf16[realOffset] == QLatin1Char('\n')) {
-                ++realOffset;
-            } else if (realOffset < realSubjectLength
-                       && QChar::isLowSurrogate(subjectUtf16[realOffset])) {
-                ++realOffset;
+                    && offset < subjectLength
+                    && subjectUtf16[offset - 1] == QLatin1Char('\r')
+                    && subjectUtf16[offset] == QLatin1Char('\n')) {
+                ++offset;
+            } else if (offset < subjectLength
+                       && QChar::isLowSurrogate(subjectUtf16[offset])) {
+                ++offset;
             }
 
             result = pcre16SafeExec(compiledPattern, currentStudyData,
-                                    subjectUtf16, realSubjectLength,
-                                    realOffset, pcreOptions,
+                                    subjectUtf16, subjectLength,
+                                    offset, pcreOptions,
                                     captureOffsets, captureOffsetsCount);
         }
     }
 
 #ifdef QREGULAREXPRESSION_DEBUG
     qDebug() << "Matching" <<  pattern << "against" << subject
-             << "starting at" << subjectStart << "len" << subjectLength << "real len" << realSubjectLength
-             << "offset" << offset << "real offset" << realOffset
+             << "starting at" << subjectStart << "len" << subjectLength
+             << "offset" << offset
              << matchType << matchOptions << previousMatchWasEmpty
              << "result" << result;
 #endif
@@ -2041,7 +2038,7 @@ QString QRegularExpressionMatch::captured(int nth) const
     if (start == -1) // didn't capture
         return QString();
 
-    return d->subject.mid(start, capturedLength(nth));
+    return d->subject.mid(start + d->subjectStart, capturedLength(nth));
 }
 
 /*!
@@ -2062,7 +2059,7 @@ QStringRef QRegularExpressionMatch::capturedRef(int nth) const
     if (start == -1) // didn't capture
         return QStringRef();
 
-    return d->subject.midRef(start, capturedLength(nth));
+    return d->subject.midRef(start + d->subjectStart, capturedLength(nth));
 }
 
 /*!
index 99f6a312672ec96b53ef72d14f3021511881bd5f..d185e6425101f72cda4c84267283d9febde39ba9 100644 (file)
@@ -1,7 +1,7 @@
 /****************************************************************************
 **
-** Copyright (C) 2012 Giuseppe D'Angelo <dangelog@gmail.com>.
-** Copyright (C) 2013 Klarälvdalens Datakonsult AB, a KDAB Group company, info@kdab.com, author Giuseppe D'Angelo <giuseppe.dangelo@kdab.com>
+** Copyright (C) 2015 Giuseppe D'Angelo <dangelog@gmail.com>.
+** Copyright (C) 2015 Klarälvdalens Datakonsult AB, a KDAB Group company, info@kdab.com, author Giuseppe D'Angelo <giuseppe.dangelo@kdab.com>
 ** Contact: http://www.qt.io/licensing/
 **
 ** This file is part of the test suite of the Qt Toolkit.
@@ -306,19 +306,6 @@ static void testMatch(const QRegularExpression &regexp,
                             matchType,
                             matchOptions,
                             result);
-
-    // offset <= 0 tested above; now also test stringrefs not spanning over
-    // the entire subject. Note that the offset can be negative, hence the above
-    // tests can't be merged into this one
-    for (int i = 1; i <= offset; ++i) {
-        testMatchImpl<QREMatch>(regexp,
-                                matchingMethodForStringRef,
-                                QStringRef(&subject, i, subject.length() - i),
-                                offset - i,
-                                matchType,
-                                matchOptions,
-                                result);
-    }
 }
 
 typedef QRegularExpressionMatch (QRegularExpression::*QREMatchStringPMF)(const QString &, int, QRegularExpression::MatchType, QRegularExpression::MatchOptions) const;
@@ -736,6 +723,18 @@ void tst_QRegularExpression::normalMatch_data()
 
     // ***
 
+    m.clear();
+    m.isValid = true; m.hasMatch = true;
+    m.captured << "bcd";
+    QTest::newRow("match12")
+            << QRegularExpression("\\Bbcd\\B")
+            << "abcde"
+            << 1
+            << QRegularExpression::MatchOptions(QRegularExpression::NoMatchOption)
+            << m;
+
+    // ***
+
     m.clear();
     m.isValid = true;
     QTest::newRow("nomatch01") << QRegularExpression("\\d+")
@@ -1705,3 +1704,360 @@ void tst_QRegularExpression::JOptionUsage()
         re.optimize();
     QCOMPARE(re.isValid(), isValid);
 }
+
+void tst_QRegularExpression::QStringAndQStringRefEquivalence()
+{
+    const QString subject = QStringLiteral("Mississippi");
+    {
+        const QRegularExpression re("\\Biss\\B");
+        QVERIFY(re.isValid());
+        {
+            const QRegularExpressionMatch match = re.match(subject);
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(match.hasMatch());
+            QCOMPARE(match.captured(), QStringLiteral("iss"));
+            QCOMPARE(match.capturedStart(), 1);
+            QCOMPARE(match.capturedEnd(), 4);
+        }
+        {
+            const QRegularExpressionMatch match = re.match(QStringRef(&subject));
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(match.hasMatch());
+            QCOMPARE(match.captured(), QStringLiteral("iss"));
+            QCOMPARE(match.capturedStart(), 1);
+            QCOMPARE(match.capturedEnd(), 4);
+        }
+        {
+            const QRegularExpressionMatch match = re.match(subject, 1);
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(match.hasMatch());
+            QCOMPARE(match.captured(), QStringLiteral("iss"));
+            QCOMPARE(match.capturedStart(), 1);
+            QCOMPARE(match.capturedEnd(), 4);
+        }
+        {
+            const QRegularExpressionMatch match = re.match(QStringRef(&subject), 1);
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(match.hasMatch());
+            QCOMPARE(match.captured(), QStringLiteral("iss"));
+            QCOMPARE(match.capturedStart(), 1);
+            QCOMPARE(match.capturedEnd(), 4);
+        }
+        {
+            const QRegularExpressionMatch match = re.match(subject.mid(1));
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(match.hasMatch());
+            QCOMPARE(match.captured(), QStringLiteral("iss"));
+            QCOMPARE(match.capturedStart(), 3);
+            QCOMPARE(match.capturedEnd(), 6);
+        }
+        {
+            const QRegularExpressionMatch match = re.match(subject.midRef(1));
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(match.hasMatch());
+            QCOMPARE(match.captured(), QStringLiteral("iss"));
+            QCOMPARE(match.capturedStart(), 3);
+            QCOMPARE(match.capturedEnd(), 6);
+        }
+        {
+            const QRegularExpressionMatch match = re.match(subject.mid(1), 1);
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(match.hasMatch());
+            QCOMPARE(match.captured(), QStringLiteral("iss"));
+            QCOMPARE(match.capturedStart(), 3);
+            QCOMPARE(match.capturedEnd(), 6);
+        }
+        {
+            const QRegularExpressionMatch match = re.match(subject.midRef(1), 1);
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(match.hasMatch());
+            QCOMPARE(match.captured(), QStringLiteral("iss"));
+            QCOMPARE(match.capturedStart(), 3);
+            QCOMPARE(match.capturedEnd(), 6);
+        }
+        {
+            const QRegularExpressionMatch match = re.match(subject, 4);
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(match.hasMatch());
+            QCOMPARE(match.captured(), QStringLiteral("iss"));
+            QCOMPARE(match.capturedStart(), 4);
+            QCOMPARE(match.capturedEnd(), 7);
+        }
+        {
+            const QRegularExpressionMatch match = re.match(QStringRef(&subject), 4);
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(match.hasMatch());
+            QCOMPARE(match.captured(), QStringLiteral("iss"));
+            QCOMPARE(match.capturedStart(), 4);
+            QCOMPARE(match.capturedEnd(), 7);
+        }
+        {
+            const QRegularExpressionMatch match = re.match(subject.mid(4));
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(!match.hasMatch());
+        }
+        {
+            const QRegularExpressionMatch match = re.match(subject.midRef(4));
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(!match.hasMatch());
+        }
+
+        {
+            QRegularExpressionMatchIterator i = re.globalMatch(subject);
+            QVERIFY(i.isValid());
+
+            consistencyCheck(i);
+            QVERIFY(i.hasNext());
+            const QRegularExpressionMatch match1 = i.next();
+            consistencyCheck(match1);
+            QVERIFY(match1.isValid());
+            QVERIFY(match1.hasMatch());
+            QCOMPARE(match1.captured(), QStringLiteral("iss"));
+            QCOMPARE(match1.capturedStart(), 1);
+            QCOMPARE(match1.capturedEnd(), 4);
+
+            consistencyCheck(i);
+            QVERIFY(i.hasNext());
+            const QRegularExpressionMatch match2 = i.next();
+            consistencyCheck(match2);
+            QVERIFY(match2.isValid());
+            QVERIFY(match2.hasMatch());
+            QCOMPARE(match2.captured(), QStringLiteral("iss"));
+            QCOMPARE(match2.capturedStart(), 4);
+            QCOMPARE(match2.capturedEnd(), 7);
+
+            QVERIFY(!i.hasNext());
+        }
+        {
+            QRegularExpressionMatchIterator i = re.globalMatch(QStringRef(&subject));
+            QVERIFY(i.isValid());
+
+            consistencyCheck(i);
+            QVERIFY(i.hasNext());
+            const QRegularExpressionMatch match1 = i.next();
+            consistencyCheck(match1);
+            QVERIFY(match1.isValid());
+            QVERIFY(match1.hasMatch());
+            QCOMPARE(match1.captured(), QStringLiteral("iss"));
+            QCOMPARE(match1.capturedStart(), 1);
+            QCOMPARE(match1.capturedEnd(), 4);
+
+            consistencyCheck(i);
+            QVERIFY(i.hasNext());
+            const QRegularExpressionMatch match2 = i.next();
+            consistencyCheck(match2);
+            QVERIFY(match2.isValid());
+            QVERIFY(match2.hasMatch());
+            QCOMPARE(match2.captured(), QStringLiteral("iss"));
+            QCOMPARE(match2.capturedStart(), 4);
+            QCOMPARE(match2.capturedEnd(), 7);
+
+            QVERIFY(!i.hasNext());
+        }
+        {
+            QRegularExpressionMatchIterator i = re.globalMatch(subject, 1);
+            QVERIFY(i.isValid());
+
+            consistencyCheck(i);
+            QVERIFY(i.hasNext());
+            const QRegularExpressionMatch match1 = i.next();
+            consistencyCheck(match1);
+            QVERIFY(match1.isValid());
+            QVERIFY(match1.hasMatch());
+            QCOMPARE(match1.captured(), QStringLiteral("iss"));
+            QCOMPARE(match1.capturedStart(), 1);
+            QCOMPARE(match1.capturedEnd(), 4);
+
+            consistencyCheck(i);
+            QVERIFY(i.hasNext());
+            const QRegularExpressionMatch match2 = i.next();
+            consistencyCheck(match2);
+            QVERIFY(match2.isValid());
+            QVERIFY(match2.hasMatch());
+            QCOMPARE(match2.captured(), QStringLiteral("iss"));
+            QCOMPARE(match2.capturedStart(), 4);
+            QCOMPARE(match2.capturedEnd(), 7);
+
+            QVERIFY(!i.hasNext());
+        }
+        {
+            QRegularExpressionMatchIterator i = re.globalMatch(QStringRef(&subject), 1);
+            QVERIFY(i.isValid());
+
+            consistencyCheck(i);
+            QVERIFY(i.hasNext());
+            const QRegularExpressionMatch match1 = i.next();
+            consistencyCheck(match1);
+            QVERIFY(match1.isValid());
+            QVERIFY(match1.hasMatch());
+            QCOMPARE(match1.captured(), QStringLiteral("iss"));
+            QCOMPARE(match1.capturedStart(), 1);
+            QCOMPARE(match1.capturedEnd(), 4);
+
+            consistencyCheck(i);
+            QVERIFY(i.hasNext());
+            const QRegularExpressionMatch match2 = i.next();
+            consistencyCheck(match2);
+            QVERIFY(match2.isValid());
+            QVERIFY(match2.hasMatch());
+            QCOMPARE(match2.captured(), QStringLiteral("iss"));
+            QCOMPARE(match2.capturedStart(), 4);
+            QCOMPARE(match2.capturedEnd(), 7);
+
+            QVERIFY(!i.hasNext());
+        }
+        {
+            QRegularExpressionMatchIterator i = re.globalMatch(subject.mid(1));
+            QVERIFY(i.isValid());
+
+            consistencyCheck(i);
+            QVERIFY(i.hasNext());
+            const QRegularExpressionMatch match = i.next();
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(match.hasMatch());
+            QCOMPARE(match.captured(), QStringLiteral("iss"));
+            QCOMPARE(match.capturedStart(), 3);
+            QCOMPARE(match.capturedEnd(), 6);
+
+            QVERIFY(!i.hasNext());
+        }
+        {
+            QRegularExpressionMatchIterator i = re.globalMatch(subject.midRef(1));
+            QVERIFY(i.isValid());
+
+            consistencyCheck(i);
+            QVERIFY(i.hasNext());
+            const QRegularExpressionMatch match = i.next();
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(match.hasMatch());
+            QCOMPARE(match.captured(), QStringLiteral("iss"));
+            QCOMPARE(match.capturedStart(), 3);
+            QCOMPARE(match.capturedEnd(), 6);
+
+            QVERIFY(!i.hasNext());
+        }
+        {
+            QRegularExpressionMatchIterator i = re.globalMatch(subject.mid(1), 1);
+            QVERIFY(i.isValid());
+
+            consistencyCheck(i);
+            QVERIFY(i.hasNext());
+            const QRegularExpressionMatch match = i.next();
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(match.hasMatch());
+            QCOMPARE(match.captured(), QStringLiteral("iss"));
+            QCOMPARE(match.capturedStart(), 3);
+            QCOMPARE(match.capturedEnd(), 6);
+
+            QVERIFY(!i.hasNext());
+        }
+        {
+            QRegularExpressionMatchIterator i = re.globalMatch(subject.midRef(1), 1);
+            QVERIFY(i.isValid());
+
+            consistencyCheck(i);
+            QVERIFY(i.hasNext());
+            const QRegularExpressionMatch match = i.next();
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(match.hasMatch());
+            QCOMPARE(match.captured(), QStringLiteral("iss"));
+            QCOMPARE(match.capturedStart(), 3);
+            QCOMPARE(match.capturedEnd(), 6);
+
+            QVERIFY(!i.hasNext());
+        }
+        {
+            QRegularExpressionMatchIterator i = re.globalMatch(subject.mid(1), 1);
+            QVERIFY(i.isValid());
+
+            consistencyCheck(i);
+            QVERIFY(i.hasNext());
+            const QRegularExpressionMatch match = i.next();
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(match.hasMatch());
+            QCOMPARE(match.captured(), QStringLiteral("iss"));
+            QCOMPARE(match.capturedStart(), 3);
+            QCOMPARE(match.capturedEnd(), 6);
+
+            QVERIFY(!i.hasNext());
+        }
+        {
+            QRegularExpressionMatchIterator i = re.globalMatch(subject.midRef(1), 1);
+            QVERIFY(i.isValid());
+
+            consistencyCheck(i);
+            QVERIFY(i.hasNext());
+            const QRegularExpressionMatch match = i.next();
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(match.hasMatch());
+            QCOMPARE(match.captured(), QStringLiteral("iss"));
+            QCOMPARE(match.capturedStart(), 3);
+            QCOMPARE(match.capturedEnd(), 6);
+
+            QVERIFY(!i.hasNext());
+        }
+
+        {
+            QRegularExpressionMatchIterator i = re.globalMatch(subject, 4);
+            QVERIFY(i.isValid());
+
+            consistencyCheck(i);
+            QVERIFY(i.hasNext());
+            const QRegularExpressionMatch match = i.next();
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(match.hasMatch());
+            QCOMPARE(match.captured(), QStringLiteral("iss"));
+            QCOMPARE(match.capturedStart(), 4);
+            QCOMPARE(match.capturedEnd(), 7);
+
+            QVERIFY(!i.hasNext());
+        }
+        {
+            QRegularExpressionMatchIterator i = re.globalMatch(QStringRef(&subject), 4);
+            QVERIFY(i.isValid());
+
+            consistencyCheck(i);
+            QVERIFY(i.hasNext());
+            const QRegularExpressionMatch match = i.next();
+            consistencyCheck(match);
+            QVERIFY(match.isValid());
+            QVERIFY(match.hasMatch());
+            QCOMPARE(match.captured(), QStringLiteral("iss"));
+            QCOMPARE(match.capturedStart(), 4);
+            QCOMPARE(match.capturedEnd(), 7);
+
+            QVERIFY(!i.hasNext());
+        }
+        {
+            QRegularExpressionMatchIterator i = re.globalMatch(subject.mid(4));
+            consistencyCheck(i);
+            QVERIFY(i.isValid());
+            QVERIFY(!i.hasNext());
+        }
+        {
+            QRegularExpressionMatchIterator i = re.globalMatch(subject.midRef(4));
+            consistencyCheck(i);
+            QVERIFY(i.isValid());
+            QVERIFY(!i.hasNext());
+        }
+    }
+}
index 578a4129eca4c4662ec803a5ce69a126b982cffb..aed262d15d6f63c6dbcc9b276b1275b58d79213d 100644 (file)
@@ -72,6 +72,7 @@ private slots:
     void regularExpressionMatch();
     void JOptionUsage_data();
     void JOptionUsage();
+    void QStringAndQStringRefEquivalence();
 
 private:
     void provideRegularExpressions();