Only the interpreter is enabled right now; the JIT needs more compile fixes.
This is the initial port. It is functionally equivalent to the
QRegularExpression-based implementation, except that some tests now pass
thanks to the actual JS-compatible regex implementation.
Change-Id: Ieb7e66e9b38071ea1d32effe045c70023b17fabd
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
*.pro.user
*.bc
*.ll
+RegExpJitTables.h
#include <wtf/Platform.h>
#ifdef __cplusplus
#include <wtf/Vector.h>
+#include <wtf/FastAllocBase.h>
+#include <wtf/RefPtr.h>
#include <cmath>
#else
#include <math.h>
--- /dev/null
+# Copyright (C) 2010 Apple Inc. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+
+types = {
+ "wordchar": { "UseTable" : True, "data": ['_', ('0','9'), ('A', 'Z'), ('a','z')]},
+ "nonwordchar": { "UseTable" : True, "Inverse": "wordchar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0xffff)]},
+ "newline": { "UseTable" : False, "data": ['\n', '\r', 0x2028, 0x2029]},
+ "spaces": { "UseTable" : True, "data": [' ', ('\t', '\r'), 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000, (0x2000, 0x200a), 0xfeff]},
+ "nonspaces": { "UseTable" : True, "Inverse": "spaces", "data": [(0, ord('\t') - 1), (ord('\r') + 1, ord(' ') - 1), (ord(' ') + 1, 0x009f), (0x00a1, 0x167f), (0x1681, 0x180d), (0x180f, 0x1fff), (0x200b, 0x2027), (0x202a, 0x202e), (0x2030, 0x205e), (0x2060, 0x2fff), (0x3001, 0xfefe), (0xff00, 0xffff)]},
+ "digits": { "UseTable" : False, "data": [('0', '9')]},
+ "nondigits": { "UseTable" : False, "Inverse": "digits", "data": [(0, ord('0') - 1), (ord('9') + 1, 0xffff)] }
+}
+entriesPerLine = 50
+arrays = "";
+functions = "";
+emitTables = (len(sys.argv) < 2 or sys.argv[1] != "--no-tables")
+
+for name, classes in types.items():
+ ranges = [];
+ size = 0;
+ for _class in classes["data"]:
+ if type(_class) == str:
+ ranges.append((ord(_class), ord(_class)))
+ elif type(_class) == int:
+ ranges.append((_class, _class))
+ else:
+ (min, max) = _class;
+ if type(min) == str:
+ min = ord(min)
+ if type(max) == str:
+ max = ord(max)
+ if max > 0x7f and min <= 0x7f:
+ ranges.append((min, 0x7f))
+ min = 0x80
+ ranges.append((min,max))
+ ranges.sort();
+
+ if emitTables and classes["UseTable"] and (not "Inverse" in classes):
+ array = ("static const char _%sData[65536] = {\n" % name);
+ i = 0
+ for (min,max) in ranges:
+ while i < min:
+ i = i + 1
+ array += ('0,')
+ if (i % entriesPerLine == 0) and (i != 0):
+ array += ('\n')
+ while i <= max:
+ i = i + 1
+ if (i == 65536):
+ array += ("1")
+ else:
+ array += ('1,')
+ if (i % entriesPerLine == 0) and (i != 0):
+ array += ('\n')
+ while i < 0xffff:
+ array += ("0,")
+ i = i + 1;
+ if (i % entriesPerLine == 0) and (i != 0):
+ array += ('\n')
+ if i == 0xffff:
+ array += ("0")
+ array += ("\n};\n\n");
+ arrays += array
+
+ # Generate createFunction:
+ function = "";
+ function += ("CharacterClass* %sCreate()\n" % name)
+ function += ("{\n")
+ if emitTables and classes["UseTable"]:
+ if "Inverse" in classes:
+ function += (" CharacterClass* characterClass = new CharacterClass(CharacterClassTable::create(_%sData, true));\n" % (classes["Inverse"]))
+ else:
+ function += (" CharacterClass* characterClass = new CharacterClass(CharacterClassTable::create(_%sData, false));\n" % (name))
+ else:
+ function += (" CharacterClass* characterClass = new CharacterClass(0);\n")
+ for (min, max) in ranges:
+ if (min == max):
+ if (min > 127):
+ function += (" characterClass->m_matchesUnicode.append(0x%04x);\n" % min)
+ else:
+ function += (" characterClass->m_matches.append(0x%02x);\n" % min)
+ continue
+ if (min > 127) or (max > 127):
+ function += (" characterClass->m_rangesUnicode.append(CharacterRange(0x%04x, 0x%04x));\n" % (min, max))
+ else:
+ function += (" characterClass->m_ranges.append(CharacterRange(0x%02x, 0x%02x));\n" % (min, max))
+ function += (" return characterClass;\n")
+ function += ("}\n\n")
+ functions += function
+
+if (len(sys.argv) > 1):
+ f = open(sys.argv[-1], "w")
+ f.write(arrays)
+ f.write(functions)
+ f.close()
+else:
+ print(arrays)
+ print(functions)
+
SOURCES += $$PWD/stubs/WTFStubs.cpp
HEADERS += $$PWD/stubs/WTFStubs.h
-DEFINES += WTF_EXPORT_PRIVATE=""
+DEFINES += WTF_EXPORT_PRIVATE="" JS_EXPORT_PRIVATE=""
DEFINES += ENABLE_LLINT=0
DEFINES += ENABLE_DFG_JIT=0
INCLUDEPATH += $$PWD/jit
INCLUDEPATH += $$PWD/assembler
+INCLUDEPATH += $$PWD/runtime
INCLUDEPATH += $$PWD/wtf
INCLUDEPATH += $$PWD/stubs
INCLUDEPATH += $$PWD/stubs/wtf
SOURCES += $$PWD/disassembler/udis86/udis86_syn.c
SOURCES += $$PWD/disassembler/udis86/udis86_syn-intel.c
+DEFINES += ENABLE_YARR_JIT=0
+SOURCES += \
+ $$PWD/yarr/YarrCanonicalizeUCS2.cpp \
+ $$PWD/yarr/YarrInterpreter.cpp \
+ $$PWD/yarr/YarrPattern.cpp \
+ $$PWD/yarr/YarrSyntaxChecker.cpp
+
+HEADERS += $$PWD/yarr/*.h
+
+# Extra compiler that generates RegExpJitTables.h by running the
+# create_regex_tables script with python and capturing its standard output.
+retgen.output = RegExpJitTables.h
+retgen.script = $$PWD/create_regex_tables
+# The script is its own (only) input file for this compiler.
+retgen.input = retgen.script
+# The generated header is not an object file, so keep it out of the link step.
+retgen.CONFIG += no_link
+retgen.commands = python $$retgen.script > ${QMAKE_FILE_OUT}
+QMAKE_EXTRA_COMPILERS += retgen
ITAB = $$PWD/disassembler/udis86/optable.xml
udis86.output = udis86_itab.h
--- /dev/null
+/*
+ * Copyright (C) 2012 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef MatchResult_h
+#define MatchResult_h
+
+typedef uint64_t EncodedMatchResult;
+
+// Result of a match attempt: a pair of positions, `start` and `end`.
+// A failed match is represented by start == WTF::notFound (see failed()).
+struct MatchResult {
+    // Stores the two positions as given.
+    ALWAYS_INLINE MatchResult(size_t start, size_t end)
+        : start(start)
+        , end(end)
+    {
+    }
+
+    // Unpacks a result that was packed into a single 64-bit integer.
+    //
+    // NOTE(review): the union overlays one uint64_t with two size_t fields,
+    // which only round-trips when size_t is 32 bits wide. With a 64-bit
+    // size_t, `end` is read from bytes that `encoded` never wrote - confirm
+    // this constructor is only used on 32-bit targets.
+    explicit ALWAYS_INLINE MatchResult(EncodedMatchResult encoded)
+    {
+        union u {
+            uint64_t encoded;
+            struct s {
+                size_t start;
+                size_t end;
+            } split;
+        } value;
+        value.encoded = encoded;
+        start = value.split.start;
+        end = value.split.end;
+    }
+
+    // Returns the value that represents "no match".
+    ALWAYS_INLINE static MatchResult failed()
+    {
+        return MatchResult(WTF::notFound, 0);
+    }
+
+    // True unless this is the failed() value. Const so that it can also be
+    // used on const results.
+    ALWAYS_INLINE operator bool() const
+    {
+        return start != WTF::notFound;
+    }
+
+    // True when start and end coincide. Const for the same reason as above.
+    ALWAYS_INLINE bool empty() const
+    {
+        return start == end;
+    }
+
+    size_t start;
+    size_t end;
+};
+
+#endif
#include <qscopedpointer.h>
-#define OwnPtr QScopedPointer
+template <typename T> class PassOwnPtr;
+template <typename PtrType> PassOwnPtr<PtrType> adoptPtr(PtrType*);
+
+// Stand-in for WTF's OwnPtr, implemented on top of QScopedPointer.
+template <typename T>
+struct OwnPtr : public QScopedPointer<T>
+{
+ // Creates an OwnPtr using QScopedPointer's default constructor.
+ OwnPtr() {}
+ // Takes over the pointer carried by ptr (obtained through leakRef()).
+ OwnPtr(const PassOwnPtr<T> &ptr)
+ : QScopedPointer<T>(ptr.leakRef())
+ {}
+
+ // "Copy" assignment that actually moves: the pointer is taken out of
+ // `other` (hence the const_cast) and handed to reset() on this object.
+ // `other` no longer holds the pointer afterwards.
+ OwnPtr& operator=(const OwnPtr<T>& other)
+ {
+ this->reset(const_cast<OwnPtr<T> &>(other).take());
+ return *this;
+ }
+
+ // Returns the raw pointer; forwards to QScopedPointer::data().
+ T* get() const { return this->data(); }
+
+ // Gives up the pointer (via take()) and wraps it with adoptPtr().
+ PassOwnPtr<T> release()
+ {
+ return adoptPtr(this->take());
+ }
+};
template <typename T>
class PassOwnPtr {
#ifndef REFCOUNTED_H
#define REFCOUNTED_H
+#include "PassRefPtr.h"
+
template <typename Base>
class RefCounted {
public:
#include <vector>
#include <wtf/Assertions.h>
+#include <wtf/NotFound.h>
+#include <qalgorithms.h>
namespace WTF {
inline void append(const T& value)
{ this->push_back(value); }
+ // Appends a copy of every element of `vector` to the end of this vector.
+ inline void append(const Vector<T>& vector)
+ {
+ this->insert(this->end(), vector.begin(), vector.end());
+ }
+
+ using std::vector<T>::insert;
+
+ // Index-based insert: places `value` before the element at `position`
+ // by forwarding to the iterator-based std::vector insert.
+ inline void insert(size_t position, T value)
+ { this->insert(this->begin() + position, value); }
+
inline void grow(size_t size)
{ this->resize(size); }
+
+ // Drops every element from index `size` to the end, leaving `size`
+ // elements. NOTE(review): no bounds check here - presumably callers
+ // never pass a size larger than the current one; confirm.
+ inline void shrink(size_t size)
+ { this->erase(this->begin() + size, this->end()); }
+
+ // Erases the single element at index `position`.
+ inline void remove(size_t position)
+ { this->erase(this->begin() + position); }
+
+ // True when the vector holds no elements.
+ inline bool isEmpty() const { return this->size() == 0; }
+
+ // Reference to the final element; the vector must not be empty.
+ inline T &last() { return this->back(); }
};
+// Deletes every pointer stored in `vector` through qDeleteAll. The vector
+// itself is not modified, so it still holds the (now dangling) pointers.
+template <typename T, int capacity>
+void deleteAllValues(const Vector<T, capacity> &vector)
+{
+ qDeleteAll(vector.begin(), vector.end());
+}
+
}
using WTF::Vector;
+using WTF::deleteAllValues;
#endif // VECTOR_H
--- /dev/null
+/****************************************************************************
+**
+** Copyright (C) 2012 Digia Plc and/or its subsidiary(-ies).
+** Contact: http://www.qt-project.org/legal
+**
+** This file is part of the V4VM module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and Digia. For licensing terms and
+** conditions see http://qt.digia.com/licensing. For further information
+** use the contact form at http://qt.digia.com/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Digia gives you certain additional
+** rights. These rights are described in the Digia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3.0 as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU General Public License version 3.0 requirements will be
+** met: http://www.gnu.org/copyleft/gpl.html.
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+#ifndef CSTRING_H
+#define CSTRING_H
+
+#endif // CSTRING_H
--- /dev/null
+/****************************************************************************
+**
+** Copyright (C) 2012 Digia Plc and/or its subsidiary(-ies).
+** Contact: http://www.qt-project.org/legal
+**
+** This file is part of the V4VM module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and Digia. For licensing terms and
+** conditions see http://qt.digia.com/licensing. For further information
+** use the contact form at http://qt.digia.com/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Digia gives you certain additional
+** rights. These rights are described in the Digia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3.0 as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU General Public License version 3.0 requirements will be
+** met: http://www.gnu.org/copyleft/gpl.html.
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+#ifndef WTFSTRING_H
+#define WTFSTRING_H
+
+#include <QString>
+#include <wtf/ASCIICType.h>
+#include <wtf/unicode/Unicode.h>
+
+namespace WTF {
+
+// Stand-in for WTF::String backed by QString. It always reports itself as a
+// 16-bit string; there is no 8-bit representation.
+class String : public QString
+{
+public:
+ // Implicit conversion from QString (copies the QString).
+ String(const QString& s) : QString(s) {}
+ // Always false: see characters8().
+ bool is8Bit() const { return false; }
+ // Always null, consistent with is8Bit() returning false.
+ const unsigned char *characters8() const { return 0; }
+ // QString's character data reinterpreted as UChar.
+ const UChar *characters16() const { return reinterpret_cast<const UChar*>(constData()); }
+
+ // Character-type dispatch; only the two specializations below are defined.
+ template <typename T>
+ const T* getCharacters() const;
+
+};
+
+// 8-bit flavour: forwards to characters8(), i.e. yields null.
+template <>
+inline const unsigned char* String::getCharacters<unsigned char>() const { return characters8(); }
+// 16-bit flavour: forwards to characters16().
+template <>
+inline const UChar* String::getCharacters<UChar>() const { return characters16(); }
+
+}
+
+// Don't import WTF::String into the global namespace to avoid conflicts with QQmlJS::VM::String
+namespace JSC {
+ using WTF::String;
+}
+
+#endif // WTFSTRING_H
--- /dev/null
+/****************************************************************************
+**
+** Copyright (C) 2012 Digia Plc and/or its subsidiary(-ies).
+** Contact: http://www.qt-project.org/legal
+**
+** This file is part of the V4VM module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and Digia. For licensing terms and
+** conditions see http://qt.digia.com/licensing. For further information
+** use the contact form at http://qt.digia.com/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Digia gives you certain additional
+** rights. These rights are described in the Digia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3.0 as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU General Public License version 3.0 requirements will be
+** met: http://www.gnu.org/copyleft/gpl.html.
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+#ifndef UNICODE_H
+#define UNICODE_H
+
+#include <QChar>
+
+typedef unsigned char LChar;
+typedef uint16_t UChar;
+
+// Case-conversion helpers for single UChar values, delegating to QChar.
+namespace Unicode {
+    // Lower-case form of ch as computed by QChar::toLower.
+    inline UChar toLower(UChar ch)
+    {
+        const UChar lowered = QChar::toLower(ch);
+        return lowered;
+    }
+
+    // Upper-case form of ch as computed by QChar::toUpper.
+    inline UChar toUpper(UChar ch)
+    {
+        const UChar raised = QChar::toUpper(ch);
+        return raised;
+    }
+}
+
+#endif // UNICODE_H
--- /dev/null
+/*
+ * Copyright (C) 2007, 2008, 2009, 2011 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef WTF_ASCIICType_h
+#define WTF_ASCIICType_h
+
+#include <wtf/Assertions.h>
+
+// The behavior of many of the functions in the <ctype.h> header is dependent
+// on the current locale. But in the WebKit project, all uses of those functions
+// are in code processing something that's not locale-specific. These equivalents
+// for some of the <ctype.h> functions are named more explicitly, not dependent
+// on the C library locale, and we should also optimize them as needed.
+
+// All functions return false or leave the character unchanged if passed a character
+// that is outside the range 0-7F. So they can be used on Unicode strings or
+// characters if the intent is to do processing only if the character is ASCII.
+
+namespace WTF {
+
+// True when no bit above the low seven is set, i.e. c is in 0x00-0x7F.
+template<typename CharType> inline bool isASCII(CharType c)
+{
+ return !(c & ~0x7F);
+}
+
+// True for 'A'-'Z' and 'a'-'z'. OR-ing in 0x20 maps upper-case letters onto
+// lower-case ones so a single range test covers both.
+template<typename CharType> inline bool isASCIIAlpha(CharType c)
+{
+ return (c | 0x20) >= 'a' && (c | 0x20) <= 'z';
+}
+
+// True for '0'-'9'.
+template<typename CharType> inline bool isASCIIDigit(CharType c)
+{
+ return c >= '0' && c <= '9';
+}
+
+// True for ASCII digits and ASCII letters.
+template<typename CharType> inline bool isASCIIAlphanumeric(CharType c)
+{
+ return isASCIIDigit(c) || isASCIIAlpha(c);
+}
+
+// True for '0'-'9', 'A'-'F' and 'a'-'f' (same 0x20 case folding as above).
+template<typename CharType> inline bool isASCIIHexDigit(CharType c)
+{
+ return isASCIIDigit(c) || ((c | 0x20) >= 'a' && (c | 0x20) <= 'f');
+}
+
+// True for 'a'-'z'.
+template<typename CharType> inline bool isASCIILower(CharType c)
+{
+ return c >= 'a' && c <= 'z';
+}
+
+// True for '0'-'7'. Uses bitwise & rather than &&, so both comparisons are
+// always evaluated (no short-circuit).
+template<typename CharType> inline bool isASCIIOctalDigit(CharType c)
+{
+ return (c >= '0') & (c <= '7');
+}
+
+// True for the range ' ' (0x20) through '~' (0x7E).
+template<typename CharType> inline bool isASCIIPrintable(CharType c)
+{
+ return c >= ' ' && c <= '~';
+}
+
+/*
+ Statistics from a run of Apple's page load test for callers of isASCIISpace:
+
+ character count
+ --------- -----
+ non-spaces 689383
+ 20 space 294720
+ 0A \n 89059
+ 09 \t 28320
+ 0D \r 0
+ 0C \f 0
+ 0B \v 0
+ */
+// True for ' ' and for 0x09-0x0D (\t, \n, \v, \f, \r). The leading
+// `c <= ' '` test rejects every character above the space in one comparison;
+// presumably ordered this way because of the statistics above.
+template<typename CharType> inline bool isASCIISpace(CharType c)
+{
+ return c <= ' ' && (c == ' ' || (c <= 0xD && c >= 0x9));
+}
+
+// True for 'A'-'Z'.
+template<typename CharType> inline bool isASCIIUpper(CharType c)
+{
+ return c >= 'A' && c <= 'Z';
+}
+
+// Branch-free lower-casing: the comparison yields 1 for 'A'-'Z' and 0
+// otherwise; shifted left by 5 that is 0x20 or 0, which is OR-ed into c.
+template<typename CharType> inline CharType toASCIILower(CharType c)
+{
+ return c | ((c >= 'A' && c <= 'Z') << 5);
+}
+
+// Sets bit 0x20 unconditionally, without checking that the input is a letter.
+template<typename CharType> inline CharType toASCIILowerUnchecked(CharType character)
+{
+ // This function can be used for comparing any input character
+ // to a lowercase English character. The isASCIIAlphaCaselessEqual
+ // below should be used for regular comparison of ASCII alpha
+ // characters, but switch statements in CSS tokenizer require
+ // direct use of this function.
+ return character | 0x20;
+}
+
+// Branch-free upper-casing: clears bit 0x20 only when c is in 'a'-'z'.
+template<typename CharType> inline CharType toASCIIUpper(CharType c)
+{
+ return c & ~((c >= 'a' && c <= 'z') << 5);
+}
+
+// Numeric value (0-15) of a hex digit. Digits sort below 'A' and map through
+// c - '0'; for letters the & 0xF folds upper and lower case onto 10-15.
+template<typename CharType> inline int toASCIIHexValue(CharType c)
+{
+ ASSERT(isASCIIHexDigit(c));
+ return c < 'A' ? c - '0' : (c - 'A' + 10) & 0xF;
+}
+
+// Value (0-255) of a two-digit hex number: upperValue supplies the high
+// four bits, lowerValue the low four bits.
+template<typename CharType> inline int toASCIIHexValue(CharType upperValue, CharType lowerValue)
+{
+ ASSERT(isASCIIHexDigit(upperValue) && isASCIIHexDigit(lowerValue));
+ return ((toASCIIHexValue(upperValue) << 4) & 0xF0) | toASCIIHexValue(lowerValue);
+}
+
+// Hex digit ('0'-'9', 'A'-'F') for the low four bits of c.
+inline char lowerNibbleToASCIIHexDigit(char c)
+{
+ char nibble = c & 0xF;
+ return nibble < 10 ? '0' + nibble : 'A' + nibble - 10;
+}
+
+// Hex digit ('0'-'9', 'A'-'F') for bits 4-7 of c.
+inline char upperNibbleToASCIIHexDigit(char c)
+{
+ char nibble = (c >> 4) & 0xF;
+ return nibble < 10 ? '0' + nibble : 'A' + nibble - 10;
+}
+
+// Case-insensitive comparison of cssCharacter against the lower-case letter
+// `character`.
+template<typename CharType> inline bool isASCIIAlphaCaselessEqual(CharType cssCharacter, char character)
+{
+ // This function compares a (preferably) constant ASCII
+ // lowercase letter to any input character.
+ ASSERT(character >= 'a' && character <= 'z');
+ return LIKELY(toASCIILowerUnchecked(cssCharacter) == character);
+}
+
+}
+
+using WTF::isASCII;
+using WTF::isASCIIAlpha;
+using WTF::isASCIIAlphanumeric;
+using WTF::isASCIIDigit;
+using WTF::isASCIIHexDigit;
+using WTF::isASCIILower;
+using WTF::isASCIIOctalDigit;
+using WTF::isASCIIPrintable;
+using WTF::isASCIISpace;
+using WTF::isASCIIUpper;
+using WTF::toASCIIHexValue;
+using WTF::toASCIILower;
+using WTF::toASCIILowerUnchecked;
+using WTF::toASCIIUpper;
+using WTF::lowerNibbleToASCIIHexDigit;
+using WTF::upperNibbleToASCIIHexDigit;
+using WTF::isASCIIAlphaCaselessEqual;
+
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2010 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef BumpPointerAllocator_h
+#define BumpPointerAllocator_h
+
+#include <algorithm>
+#include <wtf/PageAllocation.h>
+#include <wtf/PageBlock.h>
+
+namespace WTF {
+
+#define MINIMUM_BUMP_POOL_SIZE 0x1000
+
+// One link in a doubly-linked chain of page-backed bump-allocation pools.
+//
+// The pool object itself is placed in the last sizeof(BumpPointerPool) bytes
+// of its page allocation (see operator new below), so `this` doubles as the
+// end of the usable region: the pool can hand out bytes in [m_start, this).
+class BumpPointerPool {
+public:
+    // ensureCapacity will check whether the current pool has capacity to
+    // allocate 'size' bytes of memory. If it does not, it will attempt to
+    // allocate a new pool (which will be added to this one in a chain).
+    //
+    // If allocation fails (out of memory) this method will return null.
+    // If the return value is non-null, then callers should update any
+    // references they have to this current (possibly full) BumpPointerPool
+    // to instead point to the newly returned BumpPointerPool.
+    BumpPointerPool* ensureCapacity(size_t size)
+    {
+        void* allocationEnd = static_cast<char*>(m_current) + size;
+        ASSERT(allocationEnd > m_current); // check for overflow
+        if (allocationEnd <= static_cast<void*>(this))
+            return this;
+        return ensureCapacityCrossPool(this, size);
+    }
+
+    // alloc should only be called after calling ensureCapacity; as such
+    // alloc will never fail.
+    void* alloc(size_t size)
+    {
+        void* current = m_current;
+        void* allocationEnd = static_cast<char*>(current) + size;
+        ASSERT(allocationEnd > current); // check for overflow
+        ASSERT(allocationEnd <= static_cast<void*>(this));
+        m_current = allocationEnd;
+        return current;
+    }
+
+    // The dealloc method releases memory allocated using alloc. Memory
+    // must be released in a LIFO fashion, e.g. if the client calls alloc
+    // four times, returning pointer A, B, C, D, then the only valid order
+    // in which these may be deallocated is D, C, B, A.
+    //
+    // The client may optionally skip some deallocations. In the example
+    // above, it would be valid to only explicitly dealloc C, A (D being
+    // dealloced along with C, B along with A).
+    //
+    // If pointer was not allocated from this pool (or pools) then dealloc
+    // will CRASH(). Callers should update any references they have to
+    // this current BumpPointerPool to instead point to the returned
+    // BumpPointerPool.
+    BumpPointerPool* dealloc(void* position)
+    {
+        if ((position >= m_start) && (position <= static_cast<void*>(this))) {
+            ASSERT(position <= m_current);
+            m_current = position;
+            return this;
+        }
+        return deallocCrossPool(this, position);
+    }
+
+private:
+    // Placement operator new, returns the last 'size' bytes of allocation for use as this.
+    void* operator new(size_t size, const PageAllocation& allocation)
+    {
+        ASSERT(size < allocation.size());
+        return reinterpret_cast<char*>(reinterpret_cast<intptr_t>(allocation.base()) + allocation.size()) - size;
+    }
+
+    // Starts with an empty pool: both the bump pointer and the start marker
+    // sit at the base of the allocation, and the pool is not linked to any
+    // neighbour yet.
+    BumpPointerPool(const PageAllocation& allocation)
+        : m_current(allocation.base())
+        , m_start(allocation.base())
+        , m_next(0)
+        , m_previous(0)
+        , m_allocation(allocation)
+    {
+    }
+
+    // Allocates pages for a pool able to hold at least minimumCapacity bytes
+    // in addition to the pool object itself. Returns null when the size
+    // computation overflows or the page allocation fails.
+    static BumpPointerPool* create(size_t minimumCapacity = 0)
+    {
+        // Add size of BumpPointerPool object, check for overflow.
+        minimumCapacity += sizeof(BumpPointerPool);
+        if (minimumCapacity < sizeof(BumpPointerPool))
+            return 0;
+
+        size_t poolSize = std::max(static_cast<size_t>(MINIMUM_BUMP_POOL_SIZE), WTF::pageSize());
+        while (poolSize < minimumCapacity) {
+            poolSize <<= 1;
+            // The following if check relies on MINIMUM_BUMP_POOL_SIZE being a power of 2!
+            ASSERT(!(MINIMUM_BUMP_POOL_SIZE & (MINIMUM_BUMP_POOL_SIZE - 1)));
+            if (!poolSize)
+                return 0;
+        }
+
+        PageAllocation allocation = PageAllocation::allocate(poolSize);
+        if (!!allocation)
+            return new (allocation) BumpPointerPool(allocation);
+        return 0;
+    }
+
+    // Rewinds this pool to empty and destroys every pool chained after it.
+    // Must only be called on the first pool of a chain.
+    void shrink()
+    {
+        ASSERT(!m_previous);
+        m_current = m_start;
+        while (m_next) {
+            // Read the successor's link before its memory is released.
+            BumpPointerPool* nextNext = m_next->m_next;
+            m_next->destroy();
+            m_next = nextNext;
+        }
+    }
+
+    // Releases the page allocation backing this pool.
+    void destroy()
+    {
+        m_allocation.deallocate();
+    }
+
+    // Slow path of ensureCapacity: finds a pool after previousPool that has
+    // room for 'size' bytes, appending a freshly created pool to the chain
+    // if none does. Returns null if a new pool is needed but cannot be
+    // created.
+    static BumpPointerPool* ensureCapacityCrossPool(BumpPointerPool* previousPool, size_t size)
+    {
+        // The pool passed should not have capacity, so we'll start with the next one.
+        ASSERT(previousPool);
+        ASSERT((static_cast<char*>(previousPool->m_current) + size) > previousPool->m_current); // check for overflow
+        ASSERT((static_cast<char*>(previousPool->m_current) + size) > static_cast<void*>(previousPool));
+        BumpPointerPool* pool = previousPool->m_next;
+
+        while (true) {
+            if (!pool) {
+                // We've run to the end; allocate a new pool.
+                pool = BumpPointerPool::create(size);
+                // Out of memory: report it to the caller, as ensureCapacity
+                // documents, rather than dereferencing a null pool.
+                if (!pool)
+                    return 0;
+                previousPool->m_next = pool;
+                pool->m_previous = previousPool;
+                return pool;
+            }
+
+            // An existing pool further down the chain; use it if it has room.
+            void* current = pool->m_current;
+            void* allocationEnd = static_cast<char*>(current) + size;
+            ASSERT(allocationEnd > current); // check for overflow
+            if (allocationEnd <= static_cast<void*>(pool))
+                return pool;
+
+            // Too small: move on to the next pool. Without this step the
+            // loop would test the same pool forever.
+            previousPool = pool;
+            pool = pool->m_next;
+        }
+    }
+
+    // Slow path of dealloc: walks back through the chain, rewinding each
+    // pool passed on the way, until the pool containing 'position' is found.
+    static BumpPointerPool* deallocCrossPool(BumpPointerPool* pool, void* position)
+    {
+        // Should only be called if position is not in the current pool.
+        ASSERT((position < pool->m_start) || (position > static_cast<void*>(pool)));
+
+        while (true) {
+            // Unwind the current pool to the start, move back in the chain to the previous pool.
+            pool->m_current = pool->m_start;
+            pool = pool->m_previous;
+
+            // position was nowhere in the chain!
+            if (!pool)
+                CRASH();
+
+            if ((position >= pool->m_start) && (position <= static_cast<void*>(pool))) {
+                ASSERT(position <= pool->m_current);
+                pool->m_current = position;
+                return pool;
+            }
+        }
+    }
+
+    void* m_current; // Bump pointer: where the next alloc() will start.
+    void* m_start; // Base of the page allocation; value of m_current when empty.
+    BumpPointerPool* m_next; // Following pool in the chain, or null.
+    BumpPointerPool* m_previous; // Preceding pool in the chain, or null.
+    PageAllocation m_allocation; // Pages backing this pool (and this object).
+
+    friend class BumpPointerAllocator;
+};
+
+// A BumpPointerAllocator manages a set of BumpPointerPool objects, which
+// can be used for LIFO (stack like) allocation.
+//
+// To begin allocating using this class call startAllocator(). The result
+// of this method will be null if the initial pool allocation fails, or a
+// pointer to a BumpPointerPool object that can be used to perform
+// allocations. Whilst running no memory will be released until
+// stopAllocator() is called. At this point all allocations made through
+// this allocator will be reaped, and underlying memory may be freed.
+//
+// (In practice we will still hold on to the initial pool to allow allocation
+// to be quickly restarted, but additional pools will be freed).
+//
+// This allocator is non-reentrant; it is incumbent on the clients to ensure
+// startAllocator() is not called again until stopAllocator() has been called.
+// Owns the first pool of a BumpPointerPool chain, creating it on first use.
+class BumpPointerAllocator {
+public:
+    // No pool exists until startAllocator() is first called.
+    BumpPointerAllocator()
+        : m_head(0)
+    {
+    }
+
+    // Destroys the head pool, if one was ever created.
+    ~BumpPointerAllocator()
+    {
+        if (!m_head)
+            return;
+        m_head->destroy();
+    }
+
+    // Returns the head pool, creating it on demand. The result is null if
+    // the pool had to be created and creation failed.
+    BumpPointerPool* startAllocator()
+    {
+        if (m_head)
+            return m_head;
+        m_head = BumpPointerPool::create();
+        return m_head;
+    }
+
+    // Shrinks the head pool, if there is one.
+    void stopAllocator()
+    {
+        if (!m_head)
+            return;
+        m_head->shrink();
+    }
+
+private:
+    BumpPointerPool* m_head; // First pool of the chain, or null.
+};
+
+}
+
+using WTF::BumpPointerAllocator;
+
+#endif // BumpPointerAllocator_h
--- /dev/null
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef Yarr_h
+#define Yarr_h
+
+#include "YarrInterpreter.h"
+#include "YarrPattern.h"
+
+namespace JSC { namespace Yarr {
+
+// Amount of back-track bookkeeping space reserved per kind of pattern term.
+// NOTE(review): presumably counted in stack slots by the pattern compiler and
+// consumed by the interpreter/JIT - confirm at the use sites.
+#define YarrStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers.
+#define YarrStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers.
+#define YarrStackSpaceForBackTrackInfoBackReference 2
+#define YarrStackSpaceForBackTrackInfoAlternative 1 // One per alternative.
+#define YarrStackSpaceForBackTrackInfoParentheticalAssertion 1
+#define YarrStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers.
+#define YarrStackSpaceForBackTrackInfoParenthesesTerminal 1
+#define YarrStackSpaceForBackTrackInfoParentheses 2
+
+// Both sentinels below are the largest unsigned value. Going by their names they
+// stand for "no upper bound" on a quantifier and "no match" for an offset.
+static const unsigned quantifyInfinite = UINT_MAX;
+static const unsigned offsetNoMatch = (unsigned)-1;
+
+// The below limit restricts the number of "recursive" match calls in order to
+// avoid spending exponential time on complex regular expressions.
+static const unsigned matchLimit = 1000000;
+
+// Result codes: 1 for a match, 0 for no match, and negative values for the
+// error conditions named by the enumerators.
+// NOTE(review): JSRegExpErrorHitLimit presumably corresponds to matchLimit
+// being exceeded - confirm in the interpreter.
+enum JSRegExpResult {
+ JSRegExpMatch = 1,
+ JSRegExpNoMatch = 0,
+ JSRegExpErrorNoMatch = -1,
+ JSRegExpErrorHitLimit = -2,
+ JSRegExpErrorNoMemory = -3,
+ JSRegExpErrorInternal = -4
+};
+
+// Character-width selector.
+// NOTE(review): presumably Char8 means 8-bit and Char16 means 16-bit subject
+// characters - confirm at the use sites.
+enum YarrCharSize {
+ Char8,
+ Char16
+};
+
+} } // namespace JSC::Yarr
+
+#endif // Yarr_h
+
--- /dev/null
+/*
+ * Copyright (C) 2012 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js
+
+#include "config.h"
+#include "YarrCanonicalizeUCS2.h"
+
+namespace JSC { namespace Yarr {
+
+#include <stdint.h>
+
+uint16_t ucs2CharacterSet0[] = { 0x01c4u, 0x01c5u, 0x01c6u, 0 };
+uint16_t ucs2CharacterSet1[] = { 0x01c7u, 0x01c8u, 0x01c9u, 0 };
+uint16_t ucs2CharacterSet2[] = { 0x01cau, 0x01cbu, 0x01ccu, 0 };
+uint16_t ucs2CharacterSet3[] = { 0x01f1u, 0x01f2u, 0x01f3u, 0 };
+uint16_t ucs2CharacterSet4[] = { 0x0392u, 0x03b2u, 0x03d0u, 0 };
+uint16_t ucs2CharacterSet5[] = { 0x0395u, 0x03b5u, 0x03f5u, 0 };
+uint16_t ucs2CharacterSet6[] = { 0x0398u, 0x03b8u, 0x03d1u, 0 };
+uint16_t ucs2CharacterSet7[] = { 0x0345u, 0x0399u, 0x03b9u, 0x1fbeu, 0 };
+uint16_t ucs2CharacterSet8[] = { 0x039au, 0x03bau, 0x03f0u, 0 };
+uint16_t ucs2CharacterSet9[] = { 0x00b5u, 0x039cu, 0x03bcu, 0 };
+uint16_t ucs2CharacterSet10[] = { 0x03a0u, 0x03c0u, 0x03d6u, 0 };
+uint16_t ucs2CharacterSet11[] = { 0x03a1u, 0x03c1u, 0x03f1u, 0 };
+uint16_t ucs2CharacterSet12[] = { 0x03a3u, 0x03c2u, 0x03c3u, 0 };
+uint16_t ucs2CharacterSet13[] = { 0x03a6u, 0x03c6u, 0x03d5u, 0 };
+uint16_t ucs2CharacterSet14[] = { 0x1e60u, 0x1e61u, 0x1e9bu, 0 };
+
+static const size_t UCS2_CANONICALIZATION_SETS = 15;
+uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {
+ ucs2CharacterSet0,
+ ucs2CharacterSet1,
+ ucs2CharacterSet2,
+ ucs2CharacterSet3,
+ ucs2CharacterSet4,
+ ucs2CharacterSet5,
+ ucs2CharacterSet6,
+ ucs2CharacterSet7,
+ ucs2CharacterSet8,
+ ucs2CharacterSet9,
+ ucs2CharacterSet10,
+ ucs2CharacterSet11,
+ ucs2CharacterSet12,
+ ucs2CharacterSet13,
+ ucs2CharacterSet14,
+};
+
+const size_t UCS2_CANONICALIZATION_RANGES = 364;
+UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {
+ { 0x0000u, 0x0040u, 0x0000u, CanonicalizeUnique },
+ { 0x0041u, 0x005au, 0x0020u, CanonicalizeRangeLo },
+ { 0x005bu, 0x0060u, 0x0000u, CanonicalizeUnique },
+ { 0x0061u, 0x007au, 0x0020u, CanonicalizeRangeHi },
+ { 0x007bu, 0x00b4u, 0x0000u, CanonicalizeUnique },
+ { 0x00b5u, 0x00b5u, 0x0009u, CanonicalizeSet },
+ { 0x00b6u, 0x00bfu, 0x0000u, CanonicalizeUnique },
+ { 0x00c0u, 0x00d6u, 0x0020u, CanonicalizeRangeLo },
+ { 0x00d7u, 0x00d7u, 0x0000u, CanonicalizeUnique },
+ { 0x00d8u, 0x00deu, 0x0020u, CanonicalizeRangeLo },
+ { 0x00dfu, 0x00dfu, 0x0000u, CanonicalizeUnique },
+ { 0x00e0u, 0x00f6u, 0x0020u, CanonicalizeRangeHi },
+ { 0x00f7u, 0x00f7u, 0x0000u, CanonicalizeUnique },
+ { 0x00f8u, 0x00feu, 0x0020u, CanonicalizeRangeHi },
+ { 0x00ffu, 0x00ffu, 0x0079u, CanonicalizeRangeLo },
+ { 0x0100u, 0x012fu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0130u, 0x0131u, 0x0000u, CanonicalizeUnique },
+ { 0x0132u, 0x0137u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0138u, 0x0138u, 0x0000u, CanonicalizeUnique },
+ { 0x0139u, 0x0148u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x0149u, 0x0149u, 0x0000u, CanonicalizeUnique },
+ { 0x014au, 0x0177u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0178u, 0x0178u, 0x0079u, CanonicalizeRangeHi },
+ { 0x0179u, 0x017eu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x017fu, 0x017fu, 0x0000u, CanonicalizeUnique },
+ { 0x0180u, 0x0180u, 0x00c3u, CanonicalizeRangeLo },
+ { 0x0181u, 0x0181u, 0x00d2u, CanonicalizeRangeLo },
+ { 0x0182u, 0x0185u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0186u, 0x0186u, 0x00ceu, CanonicalizeRangeLo },
+ { 0x0187u, 0x0188u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x0189u, 0x018au, 0x00cdu, CanonicalizeRangeLo },
+ { 0x018bu, 0x018cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x018du, 0x018du, 0x0000u, CanonicalizeUnique },
+ { 0x018eu, 0x018eu, 0x004fu, CanonicalizeRangeLo },
+ { 0x018fu, 0x018fu, 0x00cau, CanonicalizeRangeLo },
+ { 0x0190u, 0x0190u, 0x00cbu, CanonicalizeRangeLo },
+ { 0x0191u, 0x0192u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x0193u, 0x0193u, 0x00cdu, CanonicalizeRangeLo },
+ { 0x0194u, 0x0194u, 0x00cfu, CanonicalizeRangeLo },
+ { 0x0195u, 0x0195u, 0x0061u, CanonicalizeRangeLo },
+ { 0x0196u, 0x0196u, 0x00d3u, CanonicalizeRangeLo },
+ { 0x0197u, 0x0197u, 0x00d1u, CanonicalizeRangeLo },
+ { 0x0198u, 0x0199u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x019au, 0x019au, 0x00a3u, CanonicalizeRangeLo },
+ { 0x019bu, 0x019bu, 0x0000u, CanonicalizeUnique },
+ { 0x019cu, 0x019cu, 0x00d3u, CanonicalizeRangeLo },
+ { 0x019du, 0x019du, 0x00d5u, CanonicalizeRangeLo },
+ { 0x019eu, 0x019eu, 0x0082u, CanonicalizeRangeLo },
+ { 0x019fu, 0x019fu, 0x00d6u, CanonicalizeRangeLo },
+ { 0x01a0u, 0x01a5u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x01a6u, 0x01a6u, 0x00dau, CanonicalizeRangeLo },
+ { 0x01a7u, 0x01a8u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x01a9u, 0x01a9u, 0x00dau, CanonicalizeRangeLo },
+ { 0x01aau, 0x01abu, 0x0000u, CanonicalizeUnique },
+ { 0x01acu, 0x01adu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x01aeu, 0x01aeu, 0x00dau, CanonicalizeRangeLo },
+ { 0x01afu, 0x01b0u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x01b1u, 0x01b2u, 0x00d9u, CanonicalizeRangeLo },
+ { 0x01b3u, 0x01b6u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x01b7u, 0x01b7u, 0x00dbu, CanonicalizeRangeLo },
+ { 0x01b8u, 0x01b9u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x01bau, 0x01bbu, 0x0000u, CanonicalizeUnique },
+ { 0x01bcu, 0x01bdu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x01beu, 0x01beu, 0x0000u, CanonicalizeUnique },
+ { 0x01bfu, 0x01bfu, 0x0038u, CanonicalizeRangeLo },
+ { 0x01c0u, 0x01c3u, 0x0000u, CanonicalizeUnique },
+ { 0x01c4u, 0x01c6u, 0x0000u, CanonicalizeSet },
+ { 0x01c7u, 0x01c9u, 0x0001u, CanonicalizeSet },
+ { 0x01cau, 0x01ccu, 0x0002u, CanonicalizeSet },
+ { 0x01cdu, 0x01dcu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x01ddu, 0x01ddu, 0x004fu, CanonicalizeRangeHi },
+ { 0x01deu, 0x01efu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x01f0u, 0x01f0u, 0x0000u, CanonicalizeUnique },
+ { 0x01f1u, 0x01f3u, 0x0003u, CanonicalizeSet },
+ { 0x01f4u, 0x01f5u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x01f6u, 0x01f6u, 0x0061u, CanonicalizeRangeHi },
+ { 0x01f7u, 0x01f7u, 0x0038u, CanonicalizeRangeHi },
+ { 0x01f8u, 0x021fu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0220u, 0x0220u, 0x0082u, CanonicalizeRangeHi },
+ { 0x0221u, 0x0221u, 0x0000u, CanonicalizeUnique },
+ { 0x0222u, 0x0233u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0234u, 0x0239u, 0x0000u, CanonicalizeUnique },
+ { 0x023au, 0x023au, 0x2a2bu, CanonicalizeRangeLo },
+ { 0x023bu, 0x023cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x023du, 0x023du, 0x00a3u, CanonicalizeRangeHi },
+ { 0x023eu, 0x023eu, 0x2a28u, CanonicalizeRangeLo },
+ { 0x023fu, 0x0240u, 0x2a3fu, CanonicalizeRangeLo },
+ { 0x0241u, 0x0242u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x0243u, 0x0243u, 0x00c3u, CanonicalizeRangeHi },
+ { 0x0244u, 0x0244u, 0x0045u, CanonicalizeRangeLo },
+ { 0x0245u, 0x0245u, 0x0047u, CanonicalizeRangeLo },
+ { 0x0246u, 0x024fu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0250u, 0x0250u, 0x2a1fu, CanonicalizeRangeLo },
+ { 0x0251u, 0x0251u, 0x2a1cu, CanonicalizeRangeLo },
+ { 0x0252u, 0x0252u, 0x2a1eu, CanonicalizeRangeLo },
+ { 0x0253u, 0x0253u, 0x00d2u, CanonicalizeRangeHi },
+ { 0x0254u, 0x0254u, 0x00ceu, CanonicalizeRangeHi },
+ { 0x0255u, 0x0255u, 0x0000u, CanonicalizeUnique },
+ { 0x0256u, 0x0257u, 0x00cdu, CanonicalizeRangeHi },
+ { 0x0258u, 0x0258u, 0x0000u, CanonicalizeUnique },
+ { 0x0259u, 0x0259u, 0x00cau, CanonicalizeRangeHi },
+ { 0x025au, 0x025au, 0x0000u, CanonicalizeUnique },
+ { 0x025bu, 0x025bu, 0x00cbu, CanonicalizeRangeHi },
+ { 0x025cu, 0x025fu, 0x0000u, CanonicalizeUnique },
+ { 0x0260u, 0x0260u, 0x00cdu, CanonicalizeRangeHi },
+ { 0x0261u, 0x0262u, 0x0000u, CanonicalizeUnique },
+ { 0x0263u, 0x0263u, 0x00cfu, CanonicalizeRangeHi },
+ { 0x0264u, 0x0264u, 0x0000u, CanonicalizeUnique },
+ { 0x0265u, 0x0265u, 0xa528u, CanonicalizeRangeLo },
+ { 0x0266u, 0x0267u, 0x0000u, CanonicalizeUnique },
+ { 0x0268u, 0x0268u, 0x00d1u, CanonicalizeRangeHi },
+ { 0x0269u, 0x0269u, 0x00d3u, CanonicalizeRangeHi },
+ { 0x026au, 0x026au, 0x0000u, CanonicalizeUnique },
+ { 0x026bu, 0x026bu, 0x29f7u, CanonicalizeRangeLo },
+ { 0x026cu, 0x026eu, 0x0000u, CanonicalizeUnique },
+ { 0x026fu, 0x026fu, 0x00d3u, CanonicalizeRangeHi },
+ { 0x0270u, 0x0270u, 0x0000u, CanonicalizeUnique },
+ { 0x0271u, 0x0271u, 0x29fdu, CanonicalizeRangeLo },
+ { 0x0272u, 0x0272u, 0x00d5u, CanonicalizeRangeHi },
+ { 0x0273u, 0x0274u, 0x0000u, CanonicalizeUnique },
+ { 0x0275u, 0x0275u, 0x00d6u, CanonicalizeRangeHi },
+ { 0x0276u, 0x027cu, 0x0000u, CanonicalizeUnique },
+ { 0x027du, 0x027du, 0x29e7u, CanonicalizeRangeLo },
+ { 0x027eu, 0x027fu, 0x0000u, CanonicalizeUnique },
+ { 0x0280u, 0x0280u, 0x00dau, CanonicalizeRangeHi },
+ { 0x0281u, 0x0282u, 0x0000u, CanonicalizeUnique },
+ { 0x0283u, 0x0283u, 0x00dau, CanonicalizeRangeHi },
+ { 0x0284u, 0x0287u, 0x0000u, CanonicalizeUnique },
+ { 0x0288u, 0x0288u, 0x00dau, CanonicalizeRangeHi },
+ { 0x0289u, 0x0289u, 0x0045u, CanonicalizeRangeHi },
+ { 0x028au, 0x028bu, 0x00d9u, CanonicalizeRangeHi },
+ { 0x028cu, 0x028cu, 0x0047u, CanonicalizeRangeHi },
+ { 0x028du, 0x0291u, 0x0000u, CanonicalizeUnique },
+ { 0x0292u, 0x0292u, 0x00dbu, CanonicalizeRangeHi },
+ { 0x0293u, 0x0344u, 0x0000u, CanonicalizeUnique },
+ { 0x0345u, 0x0345u, 0x0007u, CanonicalizeSet },
+ { 0x0346u, 0x036fu, 0x0000u, CanonicalizeUnique },
+ { 0x0370u, 0x0373u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0374u, 0x0375u, 0x0000u, CanonicalizeUnique },
+ { 0x0376u, 0x0377u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0378u, 0x037au, 0x0000u, CanonicalizeUnique },
+ { 0x037bu, 0x037du, 0x0082u, CanonicalizeRangeLo },
+ { 0x037eu, 0x0385u, 0x0000u, CanonicalizeUnique },
+ { 0x0386u, 0x0386u, 0x0026u, CanonicalizeRangeLo },
+ { 0x0387u, 0x0387u, 0x0000u, CanonicalizeUnique },
+ { 0x0388u, 0x038au, 0x0025u, CanonicalizeRangeLo },
+ { 0x038bu, 0x038bu, 0x0000u, CanonicalizeUnique },
+ { 0x038cu, 0x038cu, 0x0040u, CanonicalizeRangeLo },
+ { 0x038du, 0x038du, 0x0000u, CanonicalizeUnique },
+ { 0x038eu, 0x038fu, 0x003fu, CanonicalizeRangeLo },
+ { 0x0390u, 0x0390u, 0x0000u, CanonicalizeUnique },
+ { 0x0391u, 0x0391u, 0x0020u, CanonicalizeRangeLo },
+ { 0x0392u, 0x0392u, 0x0004u, CanonicalizeSet },
+ { 0x0393u, 0x0394u, 0x0020u, CanonicalizeRangeLo },
+ { 0x0395u, 0x0395u, 0x0005u, CanonicalizeSet },
+ { 0x0396u, 0x0397u, 0x0020u, CanonicalizeRangeLo },
+ { 0x0398u, 0x0398u, 0x0006u, CanonicalizeSet },
+ { 0x0399u, 0x0399u, 0x0007u, CanonicalizeSet },
+ { 0x039au, 0x039au, 0x0008u, CanonicalizeSet },
+ { 0x039bu, 0x039bu, 0x0020u, CanonicalizeRangeLo },
+ { 0x039cu, 0x039cu, 0x0009u, CanonicalizeSet },
+ { 0x039du, 0x039fu, 0x0020u, CanonicalizeRangeLo },
+ { 0x03a0u, 0x03a0u, 0x000au, CanonicalizeSet },
+ { 0x03a1u, 0x03a1u, 0x000bu, CanonicalizeSet },
+ { 0x03a2u, 0x03a2u, 0x0000u, CanonicalizeUnique },
+ { 0x03a3u, 0x03a3u, 0x000cu, CanonicalizeSet },
+ { 0x03a4u, 0x03a5u, 0x0020u, CanonicalizeRangeLo },
+ { 0x03a6u, 0x03a6u, 0x000du, CanonicalizeSet },
+ { 0x03a7u, 0x03abu, 0x0020u, CanonicalizeRangeLo },
+ { 0x03acu, 0x03acu, 0x0026u, CanonicalizeRangeHi },
+ { 0x03adu, 0x03afu, 0x0025u, CanonicalizeRangeHi },
+ { 0x03b0u, 0x03b0u, 0x0000u, CanonicalizeUnique },
+ { 0x03b1u, 0x03b1u, 0x0020u, CanonicalizeRangeHi },
+ { 0x03b2u, 0x03b2u, 0x0004u, CanonicalizeSet },
+ { 0x03b3u, 0x03b4u, 0x0020u, CanonicalizeRangeHi },
+ { 0x03b5u, 0x03b5u, 0x0005u, CanonicalizeSet },
+ { 0x03b6u, 0x03b7u, 0x0020u, CanonicalizeRangeHi },
+ { 0x03b8u, 0x03b8u, 0x0006u, CanonicalizeSet },
+ { 0x03b9u, 0x03b9u, 0x0007u, CanonicalizeSet },
+ { 0x03bau, 0x03bau, 0x0008u, CanonicalizeSet },
+ { 0x03bbu, 0x03bbu, 0x0020u, CanonicalizeRangeHi },
+ { 0x03bcu, 0x03bcu, 0x0009u, CanonicalizeSet },
+ { 0x03bdu, 0x03bfu, 0x0020u, CanonicalizeRangeHi },
+ { 0x03c0u, 0x03c0u, 0x000au, CanonicalizeSet },
+ { 0x03c1u, 0x03c1u, 0x000bu, CanonicalizeSet },
+ { 0x03c2u, 0x03c3u, 0x000cu, CanonicalizeSet },
+ { 0x03c4u, 0x03c5u, 0x0020u, CanonicalizeRangeHi },
+ { 0x03c6u, 0x03c6u, 0x000du, CanonicalizeSet },
+ { 0x03c7u, 0x03cbu, 0x0020u, CanonicalizeRangeHi },
+ { 0x03ccu, 0x03ccu, 0x0040u, CanonicalizeRangeHi },
+ { 0x03cdu, 0x03ceu, 0x003fu, CanonicalizeRangeHi },
+ { 0x03cfu, 0x03cfu, 0x0008u, CanonicalizeRangeLo },
+ { 0x03d0u, 0x03d0u, 0x0004u, CanonicalizeSet },
+ { 0x03d1u, 0x03d1u, 0x0006u, CanonicalizeSet },
+ { 0x03d2u, 0x03d4u, 0x0000u, CanonicalizeUnique },
+ { 0x03d5u, 0x03d5u, 0x000du, CanonicalizeSet },
+ { 0x03d6u, 0x03d6u, 0x000au, CanonicalizeSet },
+ { 0x03d7u, 0x03d7u, 0x0008u, CanonicalizeRangeHi },
+ { 0x03d8u, 0x03efu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x03f0u, 0x03f0u, 0x0008u, CanonicalizeSet },
+ { 0x03f1u, 0x03f1u, 0x000bu, CanonicalizeSet },
+ { 0x03f2u, 0x03f2u, 0x0007u, CanonicalizeRangeLo },
+ { 0x03f3u, 0x03f4u, 0x0000u, CanonicalizeUnique },
+ { 0x03f5u, 0x03f5u, 0x0005u, CanonicalizeSet },
+ { 0x03f6u, 0x03f6u, 0x0000u, CanonicalizeUnique },
+ { 0x03f7u, 0x03f8u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x03f9u, 0x03f9u, 0x0007u, CanonicalizeRangeHi },
+ { 0x03fau, 0x03fbu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x03fcu, 0x03fcu, 0x0000u, CanonicalizeUnique },
+ { 0x03fdu, 0x03ffu, 0x0082u, CanonicalizeRangeHi },
+ { 0x0400u, 0x040fu, 0x0050u, CanonicalizeRangeLo },
+ { 0x0410u, 0x042fu, 0x0020u, CanonicalizeRangeLo },
+ { 0x0430u, 0x044fu, 0x0020u, CanonicalizeRangeHi },
+ { 0x0450u, 0x045fu, 0x0050u, CanonicalizeRangeHi },
+ { 0x0460u, 0x0481u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0482u, 0x0489u, 0x0000u, CanonicalizeUnique },
+ { 0x048au, 0x04bfu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x04c0u, 0x04c0u, 0x000fu, CanonicalizeRangeLo },
+ { 0x04c1u, 0x04ceu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x04cfu, 0x04cfu, 0x000fu, CanonicalizeRangeHi },
+ { 0x04d0u, 0x0527u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0528u, 0x0530u, 0x0000u, CanonicalizeUnique },
+ { 0x0531u, 0x0556u, 0x0030u, CanonicalizeRangeLo },
+ { 0x0557u, 0x0560u, 0x0000u, CanonicalizeUnique },
+ { 0x0561u, 0x0586u, 0x0030u, CanonicalizeRangeHi },
+ { 0x0587u, 0x109fu, 0x0000u, CanonicalizeUnique },
+ { 0x10a0u, 0x10c5u, 0x1c60u, CanonicalizeRangeLo },
+ { 0x10c6u, 0x1d78u, 0x0000u, CanonicalizeUnique },
+ { 0x1d79u, 0x1d79u, 0x8a04u, CanonicalizeRangeLo },
+ { 0x1d7au, 0x1d7cu, 0x0000u, CanonicalizeUnique },
+ { 0x1d7du, 0x1d7du, 0x0ee6u, CanonicalizeRangeLo },
+ { 0x1d7eu, 0x1dffu, 0x0000u, CanonicalizeUnique },
+ { 0x1e00u, 0x1e5fu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x1e60u, 0x1e61u, 0x000eu, CanonicalizeSet },
+ { 0x1e62u, 0x1e95u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x1e96u, 0x1e9au, 0x0000u, CanonicalizeUnique },
+ { 0x1e9bu, 0x1e9bu, 0x000eu, CanonicalizeSet },
+ { 0x1e9cu, 0x1e9fu, 0x0000u, CanonicalizeUnique },
+ { 0x1ea0u, 0x1effu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x1f00u, 0x1f07u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f08u, 0x1f0fu, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f10u, 0x1f15u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f16u, 0x1f17u, 0x0000u, CanonicalizeUnique },
+ { 0x1f18u, 0x1f1du, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f1eu, 0x1f1fu, 0x0000u, CanonicalizeUnique },
+ { 0x1f20u, 0x1f27u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f28u, 0x1f2fu, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f30u, 0x1f37u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f38u, 0x1f3fu, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f40u, 0x1f45u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f46u, 0x1f47u, 0x0000u, CanonicalizeUnique },
+ { 0x1f48u, 0x1f4du, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f4eu, 0x1f50u, 0x0000u, CanonicalizeUnique },
+ { 0x1f51u, 0x1f51u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f52u, 0x1f52u, 0x0000u, CanonicalizeUnique },
+ { 0x1f53u, 0x1f53u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f54u, 0x1f54u, 0x0000u, CanonicalizeUnique },
+ { 0x1f55u, 0x1f55u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f56u, 0x1f56u, 0x0000u, CanonicalizeUnique },
+ { 0x1f57u, 0x1f57u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f58u, 0x1f58u, 0x0000u, CanonicalizeUnique },
+ { 0x1f59u, 0x1f59u, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f5au, 0x1f5au, 0x0000u, CanonicalizeUnique },
+ { 0x1f5bu, 0x1f5bu, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f5cu, 0x1f5cu, 0x0000u, CanonicalizeUnique },
+ { 0x1f5du, 0x1f5du, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f5eu, 0x1f5eu, 0x0000u, CanonicalizeUnique },
+ { 0x1f5fu, 0x1f5fu, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f60u, 0x1f67u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f68u, 0x1f6fu, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f70u, 0x1f71u, 0x004au, CanonicalizeRangeLo },
+ { 0x1f72u, 0x1f75u, 0x0056u, CanonicalizeRangeLo },
+ { 0x1f76u, 0x1f77u, 0x0064u, CanonicalizeRangeLo },
+ { 0x1f78u, 0x1f79u, 0x0080u, CanonicalizeRangeLo },
+ { 0x1f7au, 0x1f7bu, 0x0070u, CanonicalizeRangeLo },
+ { 0x1f7cu, 0x1f7du, 0x007eu, CanonicalizeRangeLo },
+ { 0x1f7eu, 0x1fafu, 0x0000u, CanonicalizeUnique },
+ { 0x1fb0u, 0x1fb1u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1fb2u, 0x1fb7u, 0x0000u, CanonicalizeUnique },
+ { 0x1fb8u, 0x1fb9u, 0x0008u, CanonicalizeRangeHi },
+ { 0x1fbau, 0x1fbbu, 0x004au, CanonicalizeRangeHi },
+ { 0x1fbcu, 0x1fbdu, 0x0000u, CanonicalizeUnique },
+ { 0x1fbeu, 0x1fbeu, 0x0007u, CanonicalizeSet },
+ { 0x1fbfu, 0x1fc7u, 0x0000u, CanonicalizeUnique },
+ { 0x1fc8u, 0x1fcbu, 0x0056u, CanonicalizeRangeHi },
+ { 0x1fccu, 0x1fcfu, 0x0000u, CanonicalizeUnique },
+ { 0x1fd0u, 0x1fd1u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1fd2u, 0x1fd7u, 0x0000u, CanonicalizeUnique },
+ { 0x1fd8u, 0x1fd9u, 0x0008u, CanonicalizeRangeHi },
+ { 0x1fdau, 0x1fdbu, 0x0064u, CanonicalizeRangeHi },
+ { 0x1fdcu, 0x1fdfu, 0x0000u, CanonicalizeUnique },
+ { 0x1fe0u, 0x1fe1u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1fe2u, 0x1fe4u, 0x0000u, CanonicalizeUnique },
+ { 0x1fe5u, 0x1fe5u, 0x0007u, CanonicalizeRangeLo },
+ { 0x1fe6u, 0x1fe7u, 0x0000u, CanonicalizeUnique },
+ { 0x1fe8u, 0x1fe9u, 0x0008u, CanonicalizeRangeHi },
+ { 0x1feau, 0x1febu, 0x0070u, CanonicalizeRangeHi },
+ { 0x1fecu, 0x1fecu, 0x0007u, CanonicalizeRangeHi },
+ { 0x1fedu, 0x1ff7u, 0x0000u, CanonicalizeUnique },
+ { 0x1ff8u, 0x1ff9u, 0x0080u, CanonicalizeRangeHi },
+ { 0x1ffau, 0x1ffbu, 0x007eu, CanonicalizeRangeHi },
+ { 0x1ffcu, 0x2131u, 0x0000u, CanonicalizeUnique },
+ { 0x2132u, 0x2132u, 0x001cu, CanonicalizeRangeLo },
+ { 0x2133u, 0x214du, 0x0000u, CanonicalizeUnique },
+ { 0x214eu, 0x214eu, 0x001cu, CanonicalizeRangeHi },
+ { 0x214fu, 0x215fu, 0x0000u, CanonicalizeUnique },
+ { 0x2160u, 0x216fu, 0x0010u, CanonicalizeRangeLo },
+ { 0x2170u, 0x217fu, 0x0010u, CanonicalizeRangeHi },
+ { 0x2180u, 0x2182u, 0x0000u, CanonicalizeUnique },
+ { 0x2183u, 0x2184u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x2185u, 0x24b5u, 0x0000u, CanonicalizeUnique },
+ { 0x24b6u, 0x24cfu, 0x001au, CanonicalizeRangeLo },
+ { 0x24d0u, 0x24e9u, 0x001au, CanonicalizeRangeHi },
+ { 0x24eau, 0x2bffu, 0x0000u, CanonicalizeUnique },
+ { 0x2c00u, 0x2c2eu, 0x0030u, CanonicalizeRangeLo },
+ { 0x2c2fu, 0x2c2fu, 0x0000u, CanonicalizeUnique },
+ { 0x2c30u, 0x2c5eu, 0x0030u, CanonicalizeRangeHi },
+ { 0x2c5fu, 0x2c5fu, 0x0000u, CanonicalizeUnique },
+ { 0x2c60u, 0x2c61u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x2c62u, 0x2c62u, 0x29f7u, CanonicalizeRangeHi },
+ { 0x2c63u, 0x2c63u, 0x0ee6u, CanonicalizeRangeHi },
+ { 0x2c64u, 0x2c64u, 0x29e7u, CanonicalizeRangeHi },
+ { 0x2c65u, 0x2c65u, 0x2a2bu, CanonicalizeRangeHi },
+ { 0x2c66u, 0x2c66u, 0x2a28u, CanonicalizeRangeHi },
+ { 0x2c67u, 0x2c6cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x2c6du, 0x2c6du, 0x2a1cu, CanonicalizeRangeHi },
+ { 0x2c6eu, 0x2c6eu, 0x29fdu, CanonicalizeRangeHi },
+ { 0x2c6fu, 0x2c6fu, 0x2a1fu, CanonicalizeRangeHi },
+ { 0x2c70u, 0x2c70u, 0x2a1eu, CanonicalizeRangeHi },
+ { 0x2c71u, 0x2c71u, 0x0000u, CanonicalizeUnique },
+ { 0x2c72u, 0x2c73u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x2c74u, 0x2c74u, 0x0000u, CanonicalizeUnique },
+ { 0x2c75u, 0x2c76u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x2c77u, 0x2c7du, 0x0000u, CanonicalizeUnique },
+ { 0x2c7eu, 0x2c7fu, 0x2a3fu, CanonicalizeRangeHi },
+ { 0x2c80u, 0x2ce3u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x2ce4u, 0x2ceau, 0x0000u, CanonicalizeUnique },
+ { 0x2cebu, 0x2ceeu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x2cefu, 0x2cffu, 0x0000u, CanonicalizeUnique },
+ { 0x2d00u, 0x2d25u, 0x1c60u, CanonicalizeRangeHi },
+ { 0x2d26u, 0xa63fu, 0x0000u, CanonicalizeUnique },
+ { 0xa640u, 0xa66du, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa66eu, 0xa67fu, 0x0000u, CanonicalizeUnique },
+ { 0xa680u, 0xa697u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa698u, 0xa721u, 0x0000u, CanonicalizeUnique },
+ { 0xa722u, 0xa72fu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa730u, 0xa731u, 0x0000u, CanonicalizeUnique },
+ { 0xa732u, 0xa76fu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa770u, 0xa778u, 0x0000u, CanonicalizeUnique },
+ { 0xa779u, 0xa77cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0xa77du, 0xa77du, 0x8a04u, CanonicalizeRangeHi },
+ { 0xa77eu, 0xa787u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa788u, 0xa78au, 0x0000u, CanonicalizeUnique },
+ { 0xa78bu, 0xa78cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0xa78du, 0xa78du, 0xa528u, CanonicalizeRangeHi },
+ { 0xa78eu, 0xa78fu, 0x0000u, CanonicalizeUnique },
+ { 0xa790u, 0xa791u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa792u, 0xa79fu, 0x0000u, CanonicalizeUnique },
+ { 0xa7a0u, 0xa7a9u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa7aau, 0xff20u, 0x0000u, CanonicalizeUnique },
+ { 0xff21u, 0xff3au, 0x0020u, CanonicalizeRangeLo },
+ { 0xff3bu, 0xff40u, 0x0000u, CanonicalizeUnique },
+ { 0xff41u, 0xff5au, 0x0020u, CanonicalizeRangeHi },
+ { 0xff5bu, 0xffffu, 0x0000u, CanonicalizeUnique },
+};
+
+const size_t LATIN_CANONICALIZATION_RANGES = 20;
+LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = {
+ { 0x0000u, 0x0040u, 0x0000u, CanonicalizeLatinSelf },
+ { 0x0041u, 0x005au, 0x0000u, CanonicalizeLatinMask0x20 },
+ { 0x005bu, 0x0060u, 0x0000u, CanonicalizeLatinSelf },
+ { 0x0061u, 0x007au, 0x0000u, CanonicalizeLatinMask0x20 },
+ { 0x007bu, 0x00bfu, 0x0000u, CanonicalizeLatinSelf },
+ { 0x00c0u, 0x00d6u, 0x0000u, CanonicalizeLatinMask0x20 },
+ { 0x00d7u, 0x00d7u, 0x0000u, CanonicalizeLatinSelf },
+ { 0x00d8u, 0x00deu, 0x0000u, CanonicalizeLatinMask0x20 },
+ { 0x00dfu, 0x00dfu, 0x0000u, CanonicalizeLatinSelf },
+ { 0x00e0u, 0x00f6u, 0x0000u, CanonicalizeLatinMask0x20 },
+ { 0x00f7u, 0x00f7u, 0x0000u, CanonicalizeLatinSelf },
+ { 0x00f8u, 0x00feu, 0x0000u, CanonicalizeLatinMask0x20 },
+ { 0x00ffu, 0x00ffu, 0x0000u, CanonicalizeLatinSelf },
+ { 0x0100u, 0x0177u, 0x0000u, CanonicalizeLatinInvalid },
+ { 0x0178u, 0x0178u, 0x00ffu, CanonicalizeLatinOther },
+ { 0x0179u, 0x039bu, 0x0000u, CanonicalizeLatinInvalid },
+ { 0x039cu, 0x039cu, 0x00b5u, CanonicalizeLatinOther },
+ { 0x039du, 0x03bbu, 0x0000u, CanonicalizeLatinInvalid },
+ { 0x03bcu, 0x03bcu, 0x00b5u, CanonicalizeLatinOther },
+ { 0x03bdu, 0xffffu, 0x0000u, CanonicalizeLatinInvalid },
+};
+
+} } // JSC::Yarr
+
--- /dev/null
+/*
+ * Copyright (C) 2012 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef YarrCanonicalizeUCS2_H
+#define YarrCanonicalizeUCS2_H
+
+#include <stdint.h>
+#include <wtf/unicode/Unicode.h>
+
+namespace JSC { namespace Yarr {
+
+// This set of data (autogenerated using YarrCanonicalizeUCS2.js into YarrCanonicalizeUCS2.cpp)
+// provides information for each UCS2 code point as to the set of code points that it should
+// match under the ES5.1 case insensitive RegExp matching rules, specified in 15.10.2.8.
+//
+// The type recorded for a range determines how that range's 'value' field is interpreted.
+enum UCS2CanonicalizationType {
+ CanonicalizeUnique, // No canonically equal values, e.g. 0x0.
+ CanonicalizeSet, // Value indicates a set in characterSetInfo.
+ CanonicalizeRangeLo, // Value is positive delta to pair (added), e.g. 0x41 has value 0x20, -> 0x61.
+ CanonicalizeRangeHi, // Value is positive delta to pair (subtracted), e.g. 0x61 has value 0x20, -> 0x41.
+ CanonicalizeAlternatingAligned, // Aligned consecutive pair, e.g. 0x1f4,0x1f5.
+ CanonicalizeAlternatingUnaligned, // Unaligned consecutive pair, e.g. 0x241,0x242.
+};
+// One run of code points [begin, end] (both inclusive) sharing a canonicalization rule.
+// 'type' holds a UCS2CanonicalizationType; 'value' is interpreted according to that type.
+struct UCS2CanonicalizationRange { uint16_t begin, end, value, type; };
+// Number of entries in rangeInfo.
+extern const size_t UCS2_CANONICALIZATION_RANGES;
+// Zero-terminated lists of code points, indexed by the 'value' of a CanonicalizeSet range.
+extern uint16_t* characterSetInfo[];
+// Table of ranges, ordered by code point so that it can be binary searched.
+extern UCS2CanonicalizationRange rangeInfo[];
+
+// This table is similar to the full rangeInfo table, however this maps from UCS2 codepoints to
+// the set of Latin1 codepoints that could match.
+// For CanonicalizeLatinOther entries the range's 'value' field carries the Latin1 code point
+// (e.g. 0x0178 has value 0x00ff); the other entries in the generated table use value 0.
+enum LatinCanonicalizationType {
+ CanonicalizeLatinSelf, // This character is in the Latin1 range, but has no canonical equivalent in the range.
+ CanonicalizeLatinMask0x20, // One of a pair of characters, under the mask 0x20.
+ CanonicalizeLatinOther, // This character is not in the Latin1 range, but canonicalizes to another that is.
+ CanonicalizeLatinInvalid, // Cannot match against Latin1 input.
+};
+// One run of code points [begin, end] for the Latin1 table; 'type' holds a
+// LatinCanonicalizationType and 'value' is only meaningful for some types.
+struct LatinCanonicalizationRange { uint16_t begin, end, value, type; };
+// Number of entries in latinRangeInfo.
+extern const size_t LATIN_CANONICALIZATION_RANGES;
+// The Latin1 range table itself (defined in the generated .cpp file).
+extern LatinCanonicalizationRange latinRangeInfo[];
+
+// Binary search of rangeInfo for the entry whose [begin, end] interval contains ch.
+// With ~364 entries this typically takes about 8 probes.
+// There is no "not found" exit: the loop relies on some entry containing ch.
+inline UCS2CanonicalizationRange* rangeInfoFor(UChar ch)
+{
+    UCS2CanonicalizationRange* base = rangeInfo;
+    size_t remaining = UCS2_CANONICALIZATION_RANGES;
+
+    for (;;) {
+        size_t half = remaining >> 1;
+        UCS2CanonicalizationRange* probe = base + half;
+
+        if (ch >= probe->begin && ch <= probe->end)
+            return probe;
+
+        if (ch < probe->begin) {
+            // Target lies below the probe: keep only the lower part of the window.
+            remaining = half;
+            continue;
+        }
+
+        // Target lies above the probe: drop the probe and everything below it.
+        base = probe + 1;
+        remaining -= (half + 1);
+    }
+}
+
+// Returns the single other code point that ch is canonically equal to.
+// Only valid for range types that describe a pair (RangeLo, RangeHi and the two
+// Alternating types); any other type trips ASSERT_NOT_REACHED and yields 0.
+inline UChar getCanonicalPair(UCS2CanonicalizationRange* info, UChar ch)
+{
+    ASSERT(info->begin <= ch && ch <= info->end);
+
+    const uint16_t kind = info->type;
+    if (kind == CanonicalizeRangeLo)
+        return ch + info->value;
+    if (kind == CanonicalizeRangeHi)
+        return ch - info->value;
+    if (kind == CanonicalizeAlternatingAligned)
+        return ch ^ 1; // Flip the low bit: even <-> odd partner.
+    if (kind == CanonicalizeAlternatingUnaligned)
+        return ((ch - 1) ^ 1) + 1; // Same flip, with the pair starting on an odd value.
+
+    ASSERT_NOT_REACHED();
+    return 0;
+}
+
+// True when ch falls in a CanonicalizeUnique range, i.e. no other UCS2
+// code point can match it.
+inline bool isCanonicallyUnique(UChar ch)
+{
+    UCS2CanonicalizationRange* info = rangeInfoFor(ch);
+    return info->type == CanonicalizeUnique;
+}
+
+// Decides whether a and b match each other under the canonicalization rules,
+// using the range entry that a falls into.
+inline bool areCanonicallyEquivalent(UChar a, UChar b)
+{
+    const UCS2CanonicalizationRange* range = rangeInfoFor(a);
+    const uint16_t kind = range->type;
+
+    if (kind == CanonicalizeUnique)
+        return a == b;
+
+    if (kind == CanonicalizeSet) {
+        // Walk the zero-terminated set that a belongs to, looking for b.
+        const uint16_t* member = characterSetInfo[range->value];
+        while (*member) {
+            if (*member == b)
+                return true;
+            ++member;
+        }
+        return false;
+    }
+
+    if (kind == CanonicalizeRangeLo)
+        return (a == b) || (a + range->value == b);
+    if (kind == CanonicalizeRangeHi)
+        return (a == b) || (a - range->value == b);
+    if (kind == CanonicalizeAlternatingAligned)
+        return (a | 1) == (b | 1);
+    if (kind == CanonicalizeAlternatingUnaligned)
+        return ((a - 1) | 1) == ((b - 1) | 1);
+
+    ASSERT_NOT_REACHED();
+    return false;
+}
+
+} } // JSC::Yarr
+
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2012 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// See ES 5.1, 15.10.2.8
+function canonicalize(ch)
+{
+ var u = String.fromCharCode(ch).toUpperCase();
+ if (u.length > 1)
+ return ch;
+ var cu = u.charCodeAt(0);
+ if (ch >= 128 && cu < 128)
+ return ch;
+ return cu;
+}
+
+// Largest UCS2 code unit and largest Latin-1 code unit.
+var MAX_UCS2 = 0xFFFF;
+var MAX_LATIN = 0xFF;
+
+var groupedCanonically = [];
+// Pass 1: populate groupedCanonically - this is a mapping from canonicalized
+// values back to the set of character codes that canonicalize to them.
+// Codes are visited in ascending order, so each group is built in ascending order.
+for (var i = 0; i <= MAX_UCS2; ++i) {
+ var ch = canonicalize(i);
+ if (!groupedCanonically[ch])
+ groupedCanonically[ch] = [];
+ groupedCanonically[ch].push(i);
+}
+
+var typeInfo = [];
+var latinTypeInfo = [];
+var characterSetInfo = [];
+// Pass 2: populate typeInfo & characterSetInfo. For every character calculate
+// a typeInfo value, described by the types above, and a value payload.
+for (cu in groupedCanonically) {
+ // The set of characters that canonicalize to cu
+ var characters = groupedCanonically[cu];
+
+ // If there is only one, it is unique.
+ if (characters.length == 1) {
+ typeInfo[characters[0]] = "CanonicalizeUnique:0";
+ latinTypeInfo[characters[0]] = characters[0] <= MAX_LATIN ? "CanonicalizeLatinSelf:0" : "CanonicalizeLatinInvalid:0";
+ continue;
+ }
+
+ // Sort the array.
+ characters.sort(function(x,y){return x-y;});
+
+ // If there are more than two characters, create an entry in characterSetInfo.
+ if (characters.length > 2) {
+ for (i in characters)
+ typeInfo[characters[i]] = "CanonicalizeSet:" + characterSetInfo.length;
+ characterSetInfo.push(characters);
+
+ if (characters[1] <= MAX_LATIN)
+ throw new Error("sets with more than one latin character not supported!");
+ if (characters[0] <= MAX_LATIN) {
+ for (i in characters)
+ latinTypeInfo[characters[i]] = "CanonicalizeLatinOther:" + characters[0];
+ latinTypeInfo[characters[0]] = "CanonicalizeLatinSelf:0";
+ } else {
+ for (i in characters)
+ latinTypeInfo[characters[i]] = "CanonicalizeLatinInvalid:0";
+ }
+
+ continue;
+ }
+
+ // We have a pair, mark alternating ranges, otherwise track whether this is the low or high partner.
+ var lo = characters[0];
+ var hi = characters[1];
+ var delta = hi - lo;
+ if (delta == 1) {
+ var type = lo & 1 ? "CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0";
+ typeInfo[lo] = type;
+ typeInfo[hi] = type;
+ } else {
+ typeInfo[lo] = "CanonicalizeRangeLo:" + delta;
+ typeInfo[hi] = "CanonicalizeRangeHi:" + delta;
+ }
+
+ if (lo > MAX_LATIN) {
+ latinTypeInfo[lo] = "CanonicalizeLatinInvalid:0";
+ latinTypeInfo[hi] = "CanonicalizeLatinInvalid:0";
+ } else if (hi > MAX_LATIN) {
+ latinTypeInfo[lo] = "CanonicalizeLatinSelf:0";
+ latinTypeInfo[hi] = "CanonicalizeLatinOther:" + lo;
+ } else {
+ if (delta != 0x20 || lo & 0x20)
+ throw new Error("pairs of latin characters that don't mask with 0x20 not supported!");
+ latinTypeInfo[lo] = "CanonicalizeLatinMask0x20:0";
+ latinTypeInfo[hi] = "CanonicalizeLatinMask0x20:0";
+ }
+}
+
+var rangeInfo = [];
+// Pass 3: coallesce types into ranges.
+for (var end = 0; end <= MAX_UCS2; ++end) {
+ var begin = end;
+ var type = typeInfo[end];
+ while (end < MAX_UCS2 && typeInfo[end + 1] == type)
+ ++end;
+ rangeInfo.push({begin:begin, end:end, type:type});
+}
+
+var latinRangeInfo = [];
+// Pass 4: coallesce latin-1 types into ranges.
+for (var end = 0; end <= MAX_UCS2; ++end) {
+ var begin = end;
+ var type = latinTypeInfo[end];
+ while (end < MAX_UCS2 && latinTypeInfo[end + 1] == type)
+ ++end;
+ latinRangeInfo.push({begin:begin, end:end, type:type});
+}
+
+
+// Helper function to convert a number to a fixed width hex representation of a C uint16_t.
+function hex(x)
+{
+ var s = Number(x).toString(16);
+ while (s.length < 4)
+ s = 0 + s;
+ return "0x" + s + "u";
+}
+
+var copyright = (
+ "/*" + "\n" +
+ " * Copyright (C) 2012 Apple Inc. All rights reserved." + "\n" +
+ " *" + "\n" +
+ " * Redistribution and use in source and binary forms, with or without" + "\n" +
+ " * modification, are permitted provided that the following conditions" + "\n" +
+ " * are met:" + "\n" +
+ " * 1. Redistributions of source code must retain the above copyright" + "\n" +
+ " * notice, this list of conditions and the following disclaimer." + "\n" +
+ " * 2. Redistributions in binary form must reproduce the above copyright" + "\n" +
+ " * notice, this list of conditions and the following disclaimer in the" + "\n" +
+ " * documentation and/or other materials provided with the distribution." + "\n" +
+ " *" + "\n" +
+ " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY" + "\n" +
+ " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE" + "\n" +
+ " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR" + "\n" +
+ " * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR" + "\n" +
+ " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL," + "\n" +
+ " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO," + "\n" +
+ " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR" + "\n" +
+ " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY" + "\n" +
+ " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT" + "\n" +
+ " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE" + "\n" +
+ " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " + "\n" +
+ " */");
+
+print(copyright);
+print();
+print("// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js");
+print();
+print('#include "config.h"');
+print('#include "YarrCanonicalizeUCS2.h"');
+print();
+print("namespace JSC { namespace Yarr {");
+print();
+print("#include <stdint.h>");
+print();
+
+for (i in characterSetInfo) {
+ var characters = ""
+ var set = characterSetInfo[i];
+ for (var j in set)
+ characters += hex(set[j]) + ", ";
+ print("uint16_t ucs2CharacterSet" + i + "[] = { " + characters + "0 };");
+}
+print();
+print("static const size_t UCS2_CANONICALIZATION_SETS = " + characterSetInfo.length + ";");
+print("uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {");
+for (i in characterSetInfo)
+print(" ucs2CharacterSet" + i + ",");
+print("};");
+print();
+print("const size_t UCS2_CANONICALIZATION_RANGES = " + rangeInfo.length + ";");
+print("UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {");
+for (i in rangeInfo) {
+ var info = rangeInfo[i];
+ var typeAndValue = info.type.split(':');
+ print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },");
+}
+print("};");
+print();
+print("const size_t LATIN_CANONICALIZATION_RANGES = " + latinRangeInfo.length + ";");
+print("LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = {");
+for (i in latinRangeInfo) {
+ var info = latinRangeInfo[i];
+ var typeAndValue = info.type.split(':');
+ print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },");
+}
+print("};");
+print();
+print("} } // JSC::Yarr");
+print();
+
--- /dev/null
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "YarrInterpreter.h"
+
+#include "Yarr.h"
+#include "YarrCanonicalizeUCS2.h"
+#include <wtf/BumpPointerAllocator.h>
+#include <wtf/DataLog.h>
+#include <wtf/text/CString.h>
+#include <wtf/text/WTFString.h>
+
+#ifndef NDEBUG
+#include <stdio.h>
+#endif
+
+using namespace WTF;
+
+namespace JSC { namespace Yarr {
+
+template<typename CharType>
+class Interpreter {
+public:
+ struct ParenthesesDisjunctionContext;
+
+ // Per-term backtracking records. The match/backtrack routines obtain one by
+ // reinterpret_cast-ing context->frame + term.frameLocation to the record
+ // type for the term being processed.
+ struct BackTrackInfoPatternCharacter {
+ uintptr_t matchAmount;
+ };
+ struct BackTrackInfoCharacterClass {
+ uintptr_t matchAmount;
+ };
+ struct BackTrackInfoBackReference {
+ uintptr_t begin; // Not really needed for greedy quantifiers.
+ uintptr_t matchAmount; // Not really needed for fixed quantifiers.
+ };
+ struct BackTrackInfoAlternative {
+ uintptr_t offset;
+ };
+ struct BackTrackInfoParentheticalAssertion {
+ uintptr_t begin;
+ };
+ struct BackTrackInfoParenthesesOnce {
+ uintptr_t begin;
+ };
+ struct BackTrackInfoParenthesesTerminal {
+ uintptr_t begin;
+ };
+ struct BackTrackInfoParentheses {
+ uintptr_t matchAmount; // Number of contexts currently on the lastContext list.
+ ParenthesesDisjunctionContext* lastContext; // Head of a list linked through ParenthesesDisjunctionContext::next.
+ };
+
+ // Push context onto the front of backTrack's context list and count it.
+ static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
+ {
+     ParenthesesDisjunctionContext* previousHead = backTrack->lastContext;
+     context->next = previousHead;
+     backTrack->lastContext = context;
+     backTrack->matchAmount += 1;
+ }
+
+ // Unlink the most recently appended context from backTrack's list and
+ // decrement the count. The context itself is not freed here.
+ static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
+ {
+     ASSERT(backTrack->matchAmount);
+     ASSERT(backTrack->lastContext);
+     ParenthesesDisjunctionContext* head = backTrack->lastContext;
+     backTrack->lastContext = head->next;
+     backTrack->matchAmount -= 1;
+ }
+
+ // Matching state for one disjunction. Instances are built with placement new
+ // in storage sized by the allocating code, so frame[] extends past its
+ // declared single element.
+ struct DisjunctionContext
+ {
+ // Only term is initialized; the remaining fields are left unset.
+ DisjunctionContext()
+ : term(0)
+ {
+ }
+
+ // Placement new: construct in caller-provided storage.
+ void* operator new(size_t, void* where)
+ {
+ return where;
+ }
+
+ int term; // Term position; adjusted by parenthesesWidth when a group is skipped or re-entered.
+ unsigned matchBegin;
+ unsigned matchEnd;
+ uintptr_t frame[1]; // Backtrack records, addressed as frame + term.frameLocation.
+ };
+
+ // Allocate and construct a DisjunctionContext from allocatorPool, with room
+ // for disjunction->m_frameSize frame slots. Crashes if the pool cannot grow.
+ DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
+ {
+     // The struct already declares one frame slot, so subtract it before
+     // adding the real frame size.
+     size_t headerBytes = sizeof(DisjunctionContext) - sizeof(uintptr_t);
+     size_t size = headerBytes + disjunction->m_frameSize * sizeof(uintptr_t);
+
+     allocatorPool = allocatorPool->ensureCapacity(size);
+     if (!allocatorPool)
+         CRASH();
+
+     void* storage = allocatorPool->alloc(size);
+     return new (storage) DisjunctionContext();
+ }
+
+ // Return context's storage to allocatorPool; dealloc() yields the pool to
+ // use from now on.
+ void freeDisjunctionContext(DisjunctionContext* context)
+ {
+ allocatorPool = allocatorPool->dealloc(context);
+ }
+
+ // State for one iteration of a parenthesized subpattern. The variable-length
+ // layout is: next pointer, a backup of 2 * m_numSubpatterns output slots,
+ // then an embedded DisjunctionContext (see getDisjunctionContext).
+ struct ParenthesesDisjunctionContext
+ {
+ // Saves the nested subpatterns' output slots, resets them to
+ // offsetNoMatch, and constructs the embedded DisjunctionContext.
+ ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term)
+ : next(0)
+ {
+ unsigned firstSubpatternId = term.atom.subpatternId;
+ unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
+
+ // Each subpattern owns two output slots (begin and end), hence << 1.
+ for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
+ subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
+ output[(firstSubpatternId << 1) + i] = offsetNoMatch;
+ }
+
+ new (getDisjunctionContext(term)) DisjunctionContext();
+ }
+
+ // Placement new: construct in caller-provided storage.
+ void* operator new(size_t, void* where)
+ {
+ return where;
+ }
+
+ // Copy the saved output slots back into output.
+ void restoreOutput(unsigned* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
+ {
+ for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
+ output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
+ }
+
+ // The embedded DisjunctionContext starts right after the backup slots.
+ DisjunctionContext* getDisjunctionContext(ByteTerm& term)
+ {
+ return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
+ }
+
+ ParenthesesDisjunctionContext* next;
+ unsigned subpatternBackup[1]; // Actually 2 * m_numSubpatterns entries, followed by the DisjunctionContext.
+ };
+
+ // Allocate and construct a ParenthesesDisjunctionContext from allocatorPool,
+ // sized for the subpattern backup slots plus the embedded DisjunctionContext
+ // and its frame. Crashes if the pool cannot grow.
+ ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term)
+ {
+     // Both structs declare a one-element trailing array; subtract that
+     // element before adding the real array sizes.
+     size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned);
+     size += (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(unsigned);
+     size += sizeof(DisjunctionContext) - sizeof(uintptr_t);
+     size += disjunction->m_frameSize * sizeof(uintptr_t);
+
+     allocatorPool = allocatorPool->ensureCapacity(size);
+     if (!allocatorPool)
+         CRASH();
+
+     void* storage = allocatorPool->alloc(size);
+     return new (storage) ParenthesesDisjunctionContext(output, term);
+ }
+
+ // Return context's storage to allocatorPool; dealloc() yields the pool to
+ // use from now on.
+ void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
+ {
+ allocatorPool = allocatorPool->dealloc(context);
+ }
+
+ // Cursor over the subject string: a character buffer, its length, and the
+ // current position. Several accessors take a "negative offset", i.e. a
+ // distance to subtract from the current position.
+ class InputStream {
+ public:
+     InputStream(const CharType* input, unsigned start, unsigned length)
+         : input(input)
+         , pos(start)
+         , length(length)
+     {
+     }
+
+     // Advance by one character.
+     void next()
+     {
+         pos += 1;
+     }
+
+     // Move back by amount characters.
+     void rewind(unsigned amount)
+     {
+         ASSERT(pos >= amount);
+         pos = pos - amount;
+     }
+
+     // Character at the current position, or -1 when at or past the end.
+     int read()
+     {
+         ASSERT(pos < length);
+         if (pos >= length)
+             return -1;
+         return input[pos];
+     }
+
+     // Current character in the low 16 bits, the following one shifted above it.
+     int readPair()
+     {
+         ASSERT(pos + 1 < length);
+         return input[pos] | (input[pos + 1] << 16);
+     }
+
+     // Character negativePositionOffset before the current position.
+     // Crashes rather than reading before the start of the buffer.
+     int readChecked(unsigned negativePositionOffset)
+     {
+         if (negativePositionOffset > pos)
+             CRASH();
+         unsigned index = pos - negativePositionOffset;
+         ASSERT(index < length);
+         return input[index];
+     }
+
+     // Character at absolute index from.
+     int reread(unsigned from)
+     {
+         ASSERT(from < length);
+         return input[from];
+     }
+
+     // Character just before the current position, or -1 if there is none.
+     int prev()
+     {
+         ASSERT(pos <= length);
+         if (!pos || !length)
+             return -1;
+         return input[pos - 1];
+     }
+
+     unsigned getPos()
+     {
+         return pos;
+     }
+
+     void setPos(unsigned p)
+     {
+         pos = p;
+     }
+
+     bool atStart()
+     {
+         return !pos;
+     }
+
+     bool atEnd()
+     {
+         return length == pos;
+     }
+
+     unsigned end()
+     {
+         return length;
+     }
+
+     // If count more characters are available, advance past them and return
+     // true; otherwise leave the position unchanged and return false.
+     bool checkInput(unsigned count)
+     {
+         unsigned advanced = pos + count;
+         // advanced < pos means the addition wrapped around.
+         if (advanced > length || advanced < pos)
+             return false;
+         pos = advanced;
+         return true;
+     }
+
+     // Undo a checkInput(count). Crashes rather than moving before the start.
+     void uncheckInput(unsigned count)
+     {
+         if (count > pos)
+             CRASH();
+         pos -= count;
+     }
+
+     // True if the position negativePositionOffset back is the start.
+     bool atStart(unsigned negativePositionOffset)
+     {
+         return negativePositionOffset == pos;
+     }
+
+     // True if the position negativePositionOffset back is the end.
+     // Crashes if that position would lie before the start.
+     bool atEnd(unsigned negativePositionOffset)
+     {
+         if (negativePositionOffset > pos)
+             CRASH();
+         return length == (pos - negativePositionOffset);
+     }
+
+     // True if offset more characters are available; does not move.
+     bool isAvailableInput(unsigned offset)
+     {
+         unsigned target = pos + offset;
+         return (target <= length) && (target >= pos);
+     }
+
+ private:
+     const CharType* input;
+     unsigned pos;
+     unsigned length;
+ };
+
+ // Returns true if ch is a member of characterClass. Characters with any bit
+ // of 0xFF80 set are looked up in the Unicode tables, all others in the
+ // plain tables; each table pair is a list of single characters plus a list
+ // of inclusive ranges.
+ bool testCharacterClass(CharacterClass* characterClass, int ch)
+ {
+     if (!(ch & 0xFF80)) {
+         for (unsigned index = 0; index < characterClass->m_matches.size(); ++index) {
+             if (characterClass->m_matches[index] == ch)
+                 return true;
+         }
+         for (unsigned index = 0; index < characterClass->m_ranges.size(); ++index) {
+             if ((ch >= characterClass->m_ranges[index].begin) && (ch <= characterClass->m_ranges[index].end))
+                 return true;
+         }
+         return false;
+     }
+
+     for (unsigned index = 0; index < characterClass->m_matchesUnicode.size(); ++index) {
+         if (characterClass->m_matchesUnicode[index] == ch)
+             return true;
+     }
+     for (unsigned index = 0; index < characterClass->m_rangesUnicode.size(); ++index) {
+         if ((ch >= characterClass->m_rangesUnicode[index].begin) && (ch <= characterClass->m_rangesUnicode[index].end))
+             return true;
+     }
+     return false;
+ }
+
+ // True if the input character negativeInputOffset before the current
+ // position equals testChar.
+ bool checkCharacter(int testChar, unsigned negativeInputOffset)
+ {
+     int actual = input.readChecked(negativeInputOffset);
+     return actual == testChar;
+ }
+
+ // True if the input character negativeInputOffset before the current
+ // position equals either case variant, loChar or hiChar.
+ bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset)
+ {
+     int actual = input.readChecked(negativeInputOffset);
+     if (actual == loChar)
+         return true;
+     return actual == hiChar;
+ }
+
+ // Tests the input character negativeInputOffset before the current position
+ // against characterClass; the result is negated when invert is set.
+ bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset)
+ {
+     int ch = input.readChecked(negativeInputOffset);
+     bool isMember = testCharacterClass(characterClass, ch);
+     if (invert)
+         return !isMember;
+     return isMember;
+ }
+
+ // Try to match the upcoming input against the previously captured text
+ // [matchBegin, matchEnd). On success the input stays advanced by the capture
+ // length; on failure the position is restored and false is returned.
+ bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset)
+ {
+ unsigned matchSize = (unsigned)(matchEnd - matchBegin);
+
+ // Advance past matchSize characters up front; fails if too little input remains.
+ if (!input.checkInput(matchSize))
+ return false;
+
+ if (pattern->m_ignoreCase) {
+ for (unsigned i = 0; i < matchSize; ++i) {
+ // oldCh comes from the capture, ch from the text being matched now.
+ int oldCh = input.reread(matchBegin + i);
+ int ch = input.readChecked(negativeInputOffset + matchSize - i);
+
+ if (oldCh == ch)
+ continue;
+
+ // The definition for canonicalize (see ES 5.1, 15.10.2.8) means that
+ // unicode values are never allowed to match against ascii ones.
+ if (isASCII(oldCh) || isASCII(ch)) {
+ if (toASCIIUpper(oldCh) == toASCIIUpper(ch))
+ continue;
+ } else if (areCanonicallyEquivalent(oldCh, ch))
+ continue;
+
+ // Mismatch: give back the characters reserved above.
+ input.uncheckInput(matchSize);
+ return false;
+ }
+ } else {
+ for (unsigned i = 0; i < matchSize; ++i) {
+ if (!checkCharacter(input.reread(matchBegin + i), negativeInputOffset + matchSize - i)) {
+ // Mismatch: give back the characters reserved above.
+ input.uncheckInput(matchSize);
+ return false;
+ }
+ }
+ }
+
+ return true;
+ }
+
+ // Beginning-of-line assertion: holds at the start of the input, or, for
+ // multiline patterns, when the preceding character is a newline.
+ bool matchAssertionBOL(ByteTerm& term)
+ {
+     if (input.atStart(term.inputPosition))
+         return true;
+     if (!pattern->m_multiline)
+         return false;
+     return testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1));
+ }
+
+ // End-of-line assertion: holds at the end of the input, or, for multiline
+ // patterns, when the character at the asserted position is a newline.
+ bool matchAssertionEOL(ByteTerm& term)
+ {
+     if (term.inputPosition) {
+         // Asserted position lies inputPosition before the current one.
+         if (input.atEnd(term.inputPosition))
+             return true;
+         if (!pattern->m_multiline)
+             return false;
+         return testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition));
+     }
+
+     if (input.atEnd())
+         return true;
+     if (!pattern->m_multiline)
+         return false;
+     return testCharacterClass(pattern->newlineCharacterClass, input.read());
+ }
+
+ // Word-boundary assertion: holds when exactly one of the characters on
+ // either side of the asserted position is a word character. The result is
+ // negated when the term is inverted.
+ bool matchAssertionWordBoundary(ByteTerm& term)
+ {
+     // Character before the asserted position (none at the start of input).
+     bool wordBefore = false;
+     if (!input.atStart(term.inputPosition))
+         wordBefore = testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition + 1));
+
+     // Character at the asserted position (none at the end of input).
+     bool wordAfter = false;
+     if (term.inputPosition) {
+         if (!input.atEnd(term.inputPosition))
+             wordAfter = testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
+     } else {
+         if (!input.atEnd())
+             wordAfter = testCharacterClass(pattern->wordcharCharacterClass, input.read());
+     }
+
+     bool atBoundary = wordBefore != wordAfter;
+     return term.invert() ? !atBoundary : atBoundary;
+ }
+
+ // Backtrack into a quantified single-character term. Returns true when the
+ // term could be re-matched with a different repeat count, false when it has
+ // no alternatives left.
+ bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
+ {
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount:
+ break;
+
+ case QuantifierGreedy:
+ // Give back one of the characters matched so far, if any.
+ if (backTrack->matchAmount) {
+ --backTrack->matchAmount;
+ input.uncheckInput(1);
+ return true;
+ }
+ break;
+
+ case QuantifierNonGreedy:
+ // Try to take one more character, up to quantityCount.
+ if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
+ ++backTrack->matchAmount;
+ if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1))
+ return true;
+ }
+ // Could not extend: release everything this term had consumed.
+ input.uncheckInput(backTrack->matchAmount);
+ break;
+ }
+
+ return false;
+ }
+
+ // Backtrack into a quantified case-insensitive single-character term.
+ // Returns true when the term could be re-matched with a different repeat
+ // count, false when it has no alternatives left.
+ bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
+ {
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount:
+ break;
+
+ case QuantifierGreedy:
+ // Give back one of the characters matched so far, if any.
+ if (backTrack->matchAmount) {
+ --backTrack->matchAmount;
+ input.uncheckInput(1);
+ return true;
+ }
+ break;
+
+ case QuantifierNonGreedy:
+ // Try to take one more character, up to quantityCount.
+ if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
+ ++backTrack->matchAmount;
+ if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1))
+ return true;
+ }
+ // Could not extend: release everything this term had consumed.
+ input.uncheckInput(backTrack->matchAmount);
+ break;
+ }
+
+ return false;
+ }
+
+ // Match a quantified character-class term. Returns true on a match and
+ // records the repeat count in the term's backtrack record for the greedy
+ // and non-greedy cases.
+ bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
+ {
+     ASSERT(term.type == ByteTerm::TypeCharacterClass);
+     // Character-class terms use their own backtrack record type; it has the
+     // same single-field layout as BackTrackInfoPatternCharacter.
+     BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation);
+
+     switch (term.atom.quantityType) {
+     case QuantifierFixedCount: {
+         // All quantityCount characters must match; they are read at
+         // decreasing negative offsets without moving the input position
+         // (presumably the input was checked by an earlier term - TODO confirm).
+         for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
+             if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount))
+                 return false;
+         }
+         return true;
+     }
+
+     case QuantifierGreedy: {
+         // Consume as many matching characters as possible, up to quantityCount.
+         unsigned matchAmount = 0;
+         while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
+             if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
+                 // The character just taken does not match: put it back.
+                 input.uncheckInput(1);
+                 break;
+             }
+             ++matchAmount;
+         }
+         backTrack->matchAmount = matchAmount;
+
+         return true;
+     }
+
+     case QuantifierNonGreedy:
+         // Start with zero repeats; backtracking extends the match.
+         backTrack->matchAmount = 0;
+         return true;
+     }
+
+     ASSERT_NOT_REACHED();
+     return false;
+ }
+
+ // Backtrack into a quantified character-class term. Returns true when the
+ // term could be re-matched with a different repeat count, false when it has
+ // no alternatives left.
+ bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
+ {
+     ASSERT(term.type == ByteTerm::TypeCharacterClass);
+     // Character-class terms use their own backtrack record type; it has the
+     // same single-field layout as BackTrackInfoPatternCharacter.
+     BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation);
+
+     switch (term.atom.quantityType) {
+     case QuantifierFixedCount:
+         break;
+
+     case QuantifierGreedy:
+         // Give back one of the characters matched so far, if any.
+         if (backTrack->matchAmount) {
+             --backTrack->matchAmount;
+             input.uncheckInput(1);
+             return true;
+         }
+         break;
+
+     case QuantifierNonGreedy:
+         // Try to take one more character, up to quantityCount.
+         if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
+             ++backTrack->matchAmount;
+             if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1))
+                 return true;
+         }
+         // Could not extend: release everything this term had consumed.
+         input.uncheckInput(backTrack->matchAmount);
+         break;
+     }
+
+     return false;
+ }
+
+ // Match a quantified back reference to subpattern term.atom.subpatternId.
+ // An unset or empty capture always matches, consuming nothing.
+ bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeBackReference);
+ BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
+
+ unsigned matchBegin = output[(term.atom.subpatternId << 1)];
+ unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
+
+ // If the end position of the referenced match has not been set yet, the
+ // back reference sits inside the very parentheses it refers to. In that
+ // case it matches the empty string, just as a reference to parentheses
+ // with a zero-width match would.
+ // Eg.: /(a\1)/
+ if (matchEnd == offsetNoMatch)
+ return true;
+
+ if (matchBegin == offsetNoMatch)
+ return true;
+
+ ASSERT(matchBegin <= matchEnd);
+
+ if (matchBegin == matchEnd)
+ return true;
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount: {
+ // All quantityCount repeats must match; otherwise restore the position.
+ backTrack->begin = input.getPos();
+ for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
+ if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
+ input.setPos(backTrack->begin);
+ return false;
+ }
+ }
+ return true;
+ }
+
+ case QuantifierGreedy: {
+ // Consume as many repeats as possible, up to quantityCount.
+ unsigned matchAmount = 0;
+ while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
+ ++matchAmount;
+ backTrack->matchAmount = matchAmount;
+ return true;
+ }
+
+ case QuantifierNonGreedy:
+ // Start with zero repeats; backtracking extends the match.
+ backTrack->begin = input.getPos();
+ backTrack->matchAmount = 0;
+ return true;
+ }
+
+ ASSERT_NOT_REACHED();
+ return false;
+ }
+
+ // Backtrack into a quantified back reference. Returns true when the term
+ // could be re-matched with a different repeat count, false otherwise.
+ // Unset or empty captures never offer an alternative.
+ bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeBackReference);
+ BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
+
+ unsigned matchBegin = output[(term.atom.subpatternId << 1)];
+ unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
+
+ if (matchBegin == offsetNoMatch)
+ return false;
+
+ ASSERT(matchBegin <= matchEnd);
+
+ if (matchBegin == matchEnd)
+ return false;
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount:
+ // for quantityCount == 1, could rewind.
+ input.setPos(backTrack->begin);
+ break;
+
+ case QuantifierGreedy:
+ // Give back one repeat of the captured text, if any were matched.
+ if (backTrack->matchAmount) {
+ --backTrack->matchAmount;
+ input.rewind(matchEnd - matchBegin);
+ return true;
+ }
+ break;
+
+ case QuantifierNonGreedy:
+ // Try to take one more repeat, up to quantityCount.
+ if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
+ ++backTrack->matchAmount;
+ return true;
+ }
+ // Could not extend: return to where this term started.
+ input.setPos(backTrack->begin);
+ break;
+ }
+
+ return false;
+ }
+
+ // For capturing parentheses, store the nested disjunction's match bounds
+ // (each offset by term.inputPosition) into the subpattern's output slots.
+ void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
+ {
+     if (!term.capture())
+         return;
+
+     // Slot pair for this subpattern: begin at 2 * id, end at 2 * id + 1.
+     unsigned beginSlot = term.atom.subpatternId << 1;
+     DisjunctionContext* nested = context->getDisjunctionContext(term);
+     output[beginSlot] = nested->matchBegin + term.inputPosition;
+     output[beginSlot + 1] = nested->matchEnd + term.inputPosition;
+ }
+ // Restore the output slots of this group's subpatterns from the backup
+ // held in context.
+ void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
+ {
+     unsigned nestedCount = term.atom.parenthesesDisjunction->m_numSubpatterns;
+     context->restoreOutput(output, term.atom.subpatternId, nestedCount);
+ }
+ // Backtrack through the stored iterations of a parenthesized group, most
+ // recent first. Returns JSRegExpMatch as soon as one iteration re-matches.
+ // Iterations that fail are undone (captures restored, context popped and
+ // freed); any result other than match / no-match is returned to the caller.
+ JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
+ {
+ while (backTrack->matchAmount) {
+ ParenthesesDisjunctionContext* context = backTrack->lastContext;
+
+ // Final argument true: resume the nested disjunction in backtracking mode.
+ JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true);
+ if (result == JSRegExpMatch)
+ return JSRegExpMatch;
+
+ resetMatches(term, context);
+ popParenthesesDisjunctionContext(backTrack);
+ freeParenthesesDisjunctionContext(context);
+
+ if (result != JSRegExpNoMatch)
+ return result;
+ }
+
+ return JSRegExpNoMatch;
+ }
+
+ // Enter a parenthesized group that is matched at most once. Always returns
+ // true; for non-greedy groups the group is skipped on the first attempt.
+ bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+ ASSERT(term.atom.quantityCount == 1);
+
+ BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
+
+ switch (term.atom.quantityType) {
+ case QuantifierGreedy: {
+ // set this speculatively; if we get to the parens end this will be true.
+ backTrack->begin = input.getPos();
+ break;
+ }
+ case QuantifierNonGreedy: {
+ // Mark the group as not entered and jump past it.
+ backTrack->begin = notFound;
+ context->term += term.atom.parenthesesWidth;
+ return true;
+ }
+ case QuantifierFixedCount:
+ break;
+ }
+
+ // Record where the capture starts.
+ if (term.capture()) {
+ unsigned subpatternId = term.atom.subpatternId;
+ output[(subpatternId << 1)] = input.getPos() - term.inputPosition;
+ }
+
+ return true;
+ }
+
+ // Leave a parenthesized group that is matched at most once. Records the
+ // capture end; for greedy and non-greedy groups the match only succeeds if
+ // the position differs from the recorded begin.
+ bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
+ ASSERT(term.atom.quantityCount == 1);
+
+ if (term.capture()) {
+ unsigned subpatternId = term.atom.subpatternId;
+ output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
+ }
+
+ if (term.atom.quantityType == QuantifierFixedCount)
+ return true;
+
+ BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
+ return backTrack->begin != input.getPos();
+ }
+
+ // Backtrack to the start of a once-matched group. Clears the capture; a
+ // greedy group then gets one more chance by matching nothing.
+ bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+ ASSERT(term.atom.quantityCount == 1);
+
+ BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
+
+ if (term.capture()) {
+ unsigned subpatternId = term.atom.subpatternId;
+ output[(subpatternId << 1)] = offsetNoMatch;
+ output[(subpatternId << 1) + 1] = offsetNoMatch;
+ }
+
+ switch (term.atom.quantityType) {
+ case QuantifierGreedy:
+ // if we backtrack to this point, there is another chance - try matching nothing.
+ ASSERT(backTrack->begin != notFound);
+ backTrack->begin = notFound;
+ context->term += term.atom.parenthesesWidth;
+ return true;
+ case QuantifierNonGreedy:
+ ASSERT(backTrack->begin != notFound);
+ // Falls through to the break below.
+ case QuantifierFixedCount:
+ break;
+ }
+
+ return false;
+ }
+
+ // Backtrack to the end of a once-matched group. If the group was skipped
+ // (begin == notFound), a greedy group fails back to its begin term, while a
+ // non-greedy group is now entered at the current position.
+ bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
+ ASSERT(term.atom.quantityCount == 1);
+
+ BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
+
+ switch (term.atom.quantityType) {
+ case QuantifierGreedy:
+ if (backTrack->begin == notFound) {
+ context->term -= term.atom.parenthesesWidth;
+ return false;
+ }
+ // Falls through: begin is set here, so the check below is skipped.
+ case QuantifierNonGreedy:
+ if (backTrack->begin == notFound) {
+ backTrack->begin = input.getPos();
+ if (term.capture()) {
+ // Technically this access to inputPosition should be accessing the begin term's
+ // inputPosition, but for repeats other than fixed these values should be
+ // the same anyway! (We don't pre-check for greedy or non-greedy matches.)
+ ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+ ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
+ unsigned subpatternId = term.atom.subpatternId;
+ output[subpatternId << 1] = input.getPos() + term.inputPosition;
+ }
+ context->term -= term.atom.parenthesesWidth;
+ return true;
+ }
+ // Falls through to the break below.
+ case QuantifierFixedCount:
+ break;
+ }
+
+ return false;
+ }
+
+ // Enter an iteration of 'terminal' parentheses (greedy, unbounded,
+ // non-capturing): remember where the iteration starts. Always returns true.
+ bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
+ ASSERT(term.atom.quantityType == QuantifierGreedy);
+ ASSERT(term.atom.quantityCount == quantifyInfinite);
+ ASSERT(!term.capture());
+
+ BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
+ backTrack->begin = input.getPos();
+ return true;
+ }
+
+ // End of one iteration of 'terminal' parentheses. An iteration that
+ // consumed nothing counts as a failure; otherwise step the term position
+ // back so the group is attempted again.
+ bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context)
+ {
+     ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd);
+
+     BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
+
+     bool consumedInput = backTrack->begin != input.getPos();
+     if (consumedInput) {
+         // Successful iteration: loop around and try to match more.
+         context->term -= (term.atom.parenthesesWidth + 1);
+         return true;
+     }
+
+     // Empty match is a failed match.
+     return false;
+ }
+
+ // Backtrack to the start of 'terminal' parentheses: the current iteration
+ // failed, so skip past the group and report success.
+ bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
+ ASSERT(term.atom.quantityType == QuantifierGreedy);
+ ASSERT(term.atom.quantityCount == quantifyInfinite);
+ ASSERT(!term.capture());
+
+ // If we backtrack to this point, we have failed to match this iteration of the parens.
+ // Since this is greedy / zero minimum, a failed iteration is also accepted as a match!
+ context->term += term.atom.parenthesesWidth;
+ return true;
+ }
+
+ bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
+ {
+ // 'Terminal' parentheses are at the end of the regex, and as such a match past the end
+ // should always be returned as a successful match - we should never backtrack to here.
+ ASSERT_NOT_REACHED();
+ return false; // only reached if the assertion above does not stop execution
+ }
+
+ bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+     ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
+     ASSERT(term.atom.quantityCount == 1);
+     // Save the current input position in this term's frame slot; the End-term
+     // handlers read a `begin` value back from their frame slot to rewind the input.
+     BackTrackInfoParentheticalAssertion* saved = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
+     saved->begin = input.getPos();
+     return true;
+ }
+
+ bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
+ {
+     ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
+     ASSERT(term.atom.quantityCount == 1);
+     // The body of the assertion matched. Rewind the input to the saved position first.
+     BackTrackInfoParentheticalAssertion* saved = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
+     input.setPos(saved->begin);
+
+     // For a non-inverted assertion, reaching the end of the parens is success.
+     if (!term.invert())
+         return true;
+
+     // For an inverted assertion it is failure: step back to the Begin term so
+     // that the caller's BACKTRACK() continues from the term in front of it.
+     context->term -= term.atom.parenthesesWidth;
+     return false;
+ }
+
+ bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+     ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
+     ASSERT(term.atom.quantityCount == 1);
+     // Getting here means the parens failed to match.
+     if (!term.invert())
+         return false; // a non-inverted assertion fails along with its body
+
+     // An inverted assertion wins instead: advance by the group's width (to its
+     // End term) so the caller's MATCH_NEXT() continues after the assertion.
+     context->term += term.atom.parenthesesWidth;
+     return true;
+ }
+
+ bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
+ {
+     ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
+     ASSERT(term.atom.quantityCount == 1);
+     // Backtracking into an assertion from its End term: nothing inside it is
+     // retried. Rewind the input to the position stored in the frame slot.
+     BackTrackInfoParentheticalAssertion* saved = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
+     input.setPos(saved->begin);
+     // Step back to the Begin term and report failure so the caller keeps backtracking.
+     context->term -= term.atom.parenthesesWidth;
+     return false;
+ }
+
+ JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
+ // Forward-matches a quantified subpattern by running its nested disjunction once per iteration.
+ BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
+ ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
+ // Begin with zero completed iterations and no stored iteration context.
+ backTrack->matchAmount = 0;
+ backTrack->lastContext = 0;
+ // What happens next depends on the kind of quantifier.
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount: {
+ // While we haven't yet reached our fixed limit,
+ while (backTrack->matchAmount < term.atom.quantityCount) {
+ // Try to do a match, and if it succeeds, add it to the list.
+ ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); // NOTE: shadows the DisjunctionContext* parameter
+ JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+ if (result == JSRegExpMatch)
+ appendParenthesesDisjunctionContext(backTrack, context);
+ else {
+ // The match failed; try to find an alternate point to carry on from.
+ resetMatches(term, context);
+ freeParenthesesDisjunctionContext(context);
+ // Anything other than a plain no-match (e.g. an error code) is returned unchanged.
+ if (result != JSRegExpNoMatch)
+ return result;
+ JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
+ if (backtrackResult != JSRegExpMatch)
+ return backtrackResult;
+ }
+ }
+ // Every required iteration matched; record the result using the most recent iteration's context.
+ ASSERT(backTrack->matchAmount == term.atom.quantityCount);
+ ParenthesesDisjunctionContext* context = backTrack->lastContext;
+ recordParenthesesMatch(term, context);
+ return JSRegExpMatch;
+ }
+
+ case QuantifierGreedy: {
+ while (backTrack->matchAmount < term.atom.quantityCount) { // take as many non-empty iterations as the quantifier allows
+ ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
+ JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+ if (result == JSRegExpMatch)
+ appendParenthesesDisjunctionContext(backTrack, context);
+ else {
+ resetMatches(term, context);
+ freeParenthesesDisjunctionContext(context);
+ // Anything other than a plain no-match is returned unchanged.
+ if (result != JSRegExpNoMatch)
+ return result;
+ // A plain no-match simply ends the greedy run.
+ break;
+ }
+ }
+ // Zero iterations is still a match; only record a result if at least one iteration succeeded.
+ if (backTrack->matchAmount) {
+ ParenthesesDisjunctionContext* context = backTrack->lastContext;
+ recordParenthesesMatch(term, context);
+ }
+ return JSRegExpMatch;
+ }
+
+ case QuantifierNonGreedy:
+ return JSRegExpMatch; // match zero iterations for now; backtrackParentheses() tries to add more
+ }
+
+ ASSERT_NOT_REACHED();
+ return JSRegExpErrorNoMatch;
+ }
+
+ // Rules for backtracking differ depending on whether this is greedy or non-greedy.
+ //
+ // Greedy matches never should try just adding more - you should already have done
+ // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
+ // you backtrack an item off the list needs checking, since we'll never have matched
+ // the one less case. Tracking forwards, still add as much as possible.
+ //
+ // Non-greedy, we've already done the one less case, so don't match on popping.
+ // We haven't done the one more case, so always try to add that.
+ //
+ JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
+ // Uses the same frame slot and nested disjunction that matchParentheses() set up for this term.
+ BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
+ ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount: {
+ ASSERT(backTrack->matchAmount == term.atom.quantityCount);
+
+ ParenthesesDisjunctionContext* context = 0; // NOTE: shadows the DisjunctionContext* parameter
+ JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
+ // If no alternative could be found (or an error occurred), report that to the caller.
+ if (result != JSRegExpMatch)
+ return result;
+
+ // While we haven't yet reached our fixed limit,
+ while (backTrack->matchAmount < term.atom.quantityCount) {
+ // Try to do a match, and if it succeeds, add it to the list.
+ context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
+ result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+
+ if (result == JSRegExpMatch)
+ appendParenthesesDisjunctionContext(backTrack, context);
+ else {
+ // The match failed; try to find an alternate point to carry on from.
+ resetMatches(term, context);
+ freeParenthesesDisjunctionContext(context);
+ // Anything other than a plain no-match is returned unchanged.
+ if (result != JSRegExpNoMatch)
+ return result;
+ JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
+ if (backtrackResult != JSRegExpMatch)
+ return backtrackResult;
+ }
+ }
+ // Back at the full iteration count; record the result using the most recent iteration's context.
+ ASSERT(backTrack->matchAmount == term.atom.quantityCount);
+ context = backTrack->lastContext;
+ recordParenthesesMatch(term, context);
+ return JSRegExpMatch;
+ }
+
+ case QuantifierGreedy: {
+ if (!backTrack->matchAmount)
+ return JSRegExpNoMatch; // no iteration left to backtrack into
+ // First try to find a different (non-empty) match for the most recent iteration.
+ ParenthesesDisjunctionContext* context = backTrack->lastContext;
+ JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
+ if (result == JSRegExpMatch) {
+ while (backTrack->matchAmount < term.atom.quantityCount) { // tracking forwards again: add as many iterations as possible
+ ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
+ JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+ if (parenthesesResult == JSRegExpMatch)
+ appendParenthesesDisjunctionContext(backTrack, context);
+ else {
+ resetMatches(term, context);
+ freeParenthesesDisjunctionContext(context);
+ // Anything other than a plain no-match is returned unchanged.
+ if (parenthesesResult != JSRegExpNoMatch)
+ return parenthesesResult;
+ // A plain no-match simply ends the greedy run.
+ break;
+ }
+ }
+ } else {
+ resetMatches(term, context);
+ popParenthesesDisjunctionContext(backTrack);
+ freeParenthesesDisjunctionContext(context);
+ // The last iteration was dropped; a plain no-match falls through and is reported as a match with one iteration fewer.
+ if (result != JSRegExpNoMatch)
+ return result;
+ }
+ // Only record a result if at least one iteration remains.
+ if (backTrack->matchAmount) {
+ ParenthesesDisjunctionContext* context = backTrack->lastContext;
+ recordParenthesesMatch(term, context);
+ }
+ return JSRegExpMatch;
+ }
+
+ case QuantifierNonGreedy: {
+ // If we've not reached the limit, try to add one more match.
+ if (backTrack->matchAmount < term.atom.quantityCount) {
+ ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
+ JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+ if (result == JSRegExpMatch) {
+ appendParenthesesDisjunctionContext(backTrack, context);
+ recordParenthesesMatch(term, context);
+ return JSRegExpMatch;
+ }
+ // The extra iteration did not match; discard its context.
+ resetMatches(term, context);
+ freeParenthesesDisjunctionContext(context);
+ // Anything other than a plain no-match is returned unchanged.
+ if (result != JSRegExpNoMatch)
+ return result;
+ }
+
+ // Nope - okay backtrack looking for an alternative.
+ while (backTrack->matchAmount) {
+ ParenthesesDisjunctionContext* context = backTrack->lastContext;
+ JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
+ if (result == JSRegExpMatch) {
+ // successful backtrack! we're back in the game!
+ if (backTrack->matchAmount) {
+ context = backTrack->lastContext;
+ recordParenthesesMatch(term, context);
+ }
+ return JSRegExpMatch;
+ }
+
+ // pop a match off the stack
+ resetMatches(term, context);
+ popParenthesesDisjunctionContext(backTrack);
+ freeParenthesesDisjunctionContext(context);
+ // Anything other than a plain no-match is returned unchanged.
+ if (result != JSRegExpNoMatch)
+ return result;
+ }
+ // Every iteration was popped without finding an alternative.
+ return JSRegExpNoMatch;
+ }
+ }
+
+ ASSERT_NOT_REACHED();
+ return JSRegExpErrorNoMatch;
+ }
+
+ bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context)
+ {
+     // Widens the match recorded in `context` so that it spans from just after the previous newline (or the start of the input) to the next newline (or the end of the input).
+     unsigned matchBegin = context->matchBegin;
+     // Scan backwards from the character in front of the match for a newline.
+     if (matchBegin) {
+         for (matchBegin--; true; matchBegin--) {
+             if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) {
+                 ++matchBegin;
+                 break;
+             }
+             // Reached index 0 without seeing a newline: the match begins at the start of the input.
+             if (!matchBegin)
+                 break;
+         }
+     }
+     // Scan forwards from the current input position for a newline or the end of the input.
+     unsigned matchEnd = input.getPos();
+
+     for (; (matchEnd != input.end())
+         && (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { }
+     // Outside multiline mode, a BOL-anchored enclosure must start at index 0 and an EOL-anchored one must stop at the end of the input.
+     if (((matchBegin && term.anchors.m_bol)
+         || ((matchEnd != input.end()) && term.anchors.m_eol))
+         && !pattern->m_multiline)
+         return false;
+     // Publish the widened range.
+     context->matchBegin = matchBegin;
+     context->matchEnd = matchEnd;
+     return true;
+ }
+
+#define MATCH_NEXT() { ++context->term; goto matchAgain; }
+#define BACKTRACK() { --context->term; goto backtrack; }
+#define currentTerm() (disjunction->terms[context->term])
+ JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
+ {
+ if (!--remainingMatchCount) // every call spends one unit of the match budget
+ return JSRegExpErrorHitLimit;
+ // With btrack set, resume an earlier attempt: step back one term from the saved context->term and enter the backtracking dispatcher.
+ if (btrack)
+ BACKTRACK();
+ // Fresh attempt: the match begins at the current input position, at the first term.
+ context->matchBegin = input.getPos();
+ context->term = 0;
+ // Forward dispatcher: run the current term, then MATCH_NEXT() to advance or BACKTRACK() to retreat.
+ matchAgain:
+ ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
+
+ switch (currentTerm().type) {
+ case ByteTerm::TypeSubpatternBegin:
+ MATCH_NEXT();
+ case ByteTerm::TypeSubpatternEnd:
+ context->matchEnd = input.getPos();
+ return JSRegExpMatch;
+
+ case ByteTerm::TypeBodyAlternativeBegin:
+ MATCH_NEXT();
+ case ByteTerm::TypeBodyAlternativeDisjunction:
+ case ByteTerm::TypeBodyAlternativeEnd:
+ context->matchEnd = input.getPos(); // reaching the end of a body alternative completes the match
+ return JSRegExpMatch;
+
+ case ByteTerm::TypeAlternativeBegin:
+ MATCH_NEXT();
+ case ByteTerm::TypeAlternativeDisjunction:
+ case ByteTerm::TypeAlternativeEnd: {
+ int offset = currentTerm().alternative.end;
+ BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
+ backTrack->offset = offset; // remembered so the AlternativeEnd backtrack case can return to this term
+ context->term += offset;
+ MATCH_NEXT();
+ }
+
+ case ByteTerm::TypeAssertionBOL:
+ if (matchAssertionBOL(currentTerm()))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeAssertionEOL:
+ if (matchAssertionEOL(currentTerm()))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeAssertionWordBoundary:
+ if (matchAssertionWordBoundary(currentTerm()))
+ MATCH_NEXT();
+ BACKTRACK();
+
+ case ByteTerm::TypePatternCharacterOnce:
+ case ByteTerm::TypePatternCharacterFixed: {
+ for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
+ if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount))
+ BACKTRACK();
+ }
+ MATCH_NEXT();
+ }
+ case ByteTerm::TypePatternCharacterGreedy: {
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
+ unsigned matchAmount = 0;
+ while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
+ if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) {
+ input.uncheckInput(1); // give back the character that did not match
+ break;
+ }
+ ++matchAmount;
+ }
+ backTrack->matchAmount = matchAmount;
+
+ MATCH_NEXT();
+ }
+ case ByteTerm::TypePatternCharacterNonGreedy: {
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
+ backTrack->matchAmount = 0; // non-greedy: consume nothing on the way forward
+ MATCH_NEXT();
+ }
+
+ case ByteTerm::TypePatternCasedCharacterOnce:
+ case ByteTerm::TypePatternCasedCharacterFixed: {
+ for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
+ if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount))
+ BACKTRACK();
+ }
+ MATCH_NEXT();
+ }
+ case ByteTerm::TypePatternCasedCharacterGreedy: {
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
+ unsigned matchAmount = 0;
+ while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
+ if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) {
+ input.uncheckInput(1); // give back the character that did not match
+ break;
+ }
+ ++matchAmount;
+ }
+ backTrack->matchAmount = matchAmount;
+
+ MATCH_NEXT();
+ }
+ case ByteTerm::TypePatternCasedCharacterNonGreedy: {
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
+ backTrack->matchAmount = 0; // non-greedy: consume nothing on the way forward
+ MATCH_NEXT();
+ }
+
+ case ByteTerm::TypeCharacterClass:
+ if (matchCharacterClass(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeBackReference:
+ if (matchBackReference(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpattern: {
+ JSRegExpResult result = matchParentheses(currentTerm(), context);
+ // Match advances, a plain no-match backtracks, anything else is returned to the caller.
+ if (result == JSRegExpMatch) {
+ MATCH_NEXT();
+ } else if (result != JSRegExpNoMatch)
+ return result;
+
+ BACKTRACK();
+ }
+ case ByteTerm::TypeParenthesesSubpatternOnceBegin:
+ if (matchParenthesesOnceBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternOnceEnd:
+ if (matchParenthesesOnceEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
+ if (matchParenthesesTerminalBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
+ if (matchParenthesesTerminalEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParentheticalAssertionBegin:
+ if (matchParentheticalAssertionBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParentheticalAssertionEnd:
+ if (matchParentheticalAssertionEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+
+ case ByteTerm::TypeCheckInput:
+ if (input.checkInput(currentTerm().checkInputCount))
+ MATCH_NEXT();
+ BACKTRACK();
+
+ case ByteTerm::TypeUncheckInput:
+ input.uncheckInput(currentTerm().checkInputCount);
+ MATCH_NEXT();
+
+ case ByteTerm::TypeDotStarEnclosure:
+ if (matchDotStarEnclosure(currentTerm(), context))
+ return JSRegExpMatch;
+ BACKTRACK();
+ }
+
+ // We should never fall-through to here.
+ ASSERT_NOT_REACHED();
+ // Backtracking dispatcher: undo or retry the current term, then MATCH_NEXT() to go forward again or BACKTRACK() to retreat further.
+ backtrack:
+ ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
+
+ switch (currentTerm().type) {
+ case ByteTerm::TypeSubpatternBegin:
+ return JSRegExpNoMatch; // backtracked past the start of a nested subpattern
+ case ByteTerm::TypeSubpatternEnd:
+ ASSERT_NOT_REACHED(); // NOTE(review): if this macro compiles to nothing, control falls through into the next case - confirm
+
+ case ByteTerm::TypeBodyAlternativeBegin:
+ case ByteTerm::TypeBodyAlternativeDisjunction: {
+ int offset = currentTerm().alternative.next;
+ context->term += offset;
+ if (offset > 0) // a positive offset leads forward to another body alternative to try
+ MATCH_NEXT();
+ // Otherwise the offset led back to the first alternative: retry from the next input position, if there is one.
+ if (input.atEnd())
+ return JSRegExpNoMatch;
+
+ input.next();
+
+ context->matchBegin = input.getPos();
+
+ if (currentTerm().alternative.onceThrough)
+ context->term += currentTerm().alternative.next;
+
+ MATCH_NEXT();
+ }
+ case ByteTerm::TypeBodyAlternativeEnd:
+ ASSERT_NOT_REACHED(); // NOTE(review): if this macro compiles to nothing, control falls through into the next case - confirm
+
+ case ByteTerm::TypeAlternativeBegin:
+ case ByteTerm::TypeAlternativeDisjunction: {
+ int offset = currentTerm().alternative.next;
+ context->term += offset;
+ if (offset > 0) // a positive offset leads forward to another alternative to try
+ MATCH_NEXT();
+ BACKTRACK();
+ }
+ case ByteTerm::TypeAlternativeEnd: {
+ // We should never backtrack back into an alternative of the main body of the regex.
+ BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
+ unsigned offset = backTrack->offset; // offset stored by the forward pass when it jumped here
+ context->term -= offset;
+ BACKTRACK();
+ }
+
+ case ByteTerm::TypeAssertionBOL:
+ case ByteTerm::TypeAssertionEOL:
+ case ByteTerm::TypeAssertionWordBoundary:
+ BACKTRACK();
+
+ case ByteTerm::TypePatternCharacterOnce:
+ case ByteTerm::TypePatternCharacterFixed:
+ case ByteTerm::TypePatternCharacterGreedy:
+ case ByteTerm::TypePatternCharacterNonGreedy:
+ if (backtrackPatternCharacter(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypePatternCasedCharacterOnce:
+ case ByteTerm::TypePatternCasedCharacterFixed:
+ case ByteTerm::TypePatternCasedCharacterGreedy:
+ case ByteTerm::TypePatternCasedCharacterNonGreedy:
+ if (backtrackPatternCasedCharacter(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeCharacterClass:
+ if (backtrackCharacterClass(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeBackReference:
+ if (backtrackBackReference(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpattern: {
+ JSRegExpResult result = backtrackParentheses(currentTerm(), context);
+ // Match advances, a plain no-match backtracks, anything else is returned to the caller.
+ if (result == JSRegExpMatch) {
+ MATCH_NEXT();
+ } else if (result != JSRegExpNoMatch)
+ return result;
+
+ BACKTRACK();
+ }
+ case ByteTerm::TypeParenthesesSubpatternOnceBegin:
+ if (backtrackParenthesesOnceBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternOnceEnd:
+ if (backtrackParenthesesOnceEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
+ if (backtrackParenthesesTerminalBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
+ if (backtrackParenthesesTerminalEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParentheticalAssertionBegin:
+ if (backtrackParentheticalAssertionBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParentheticalAssertionEnd:
+ if (backtrackParentheticalAssertionEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+
+ case ByteTerm::TypeCheckInput:
+ input.uncheckInput(currentTerm().checkInputCount); // undo the forward pass's checkInput
+ BACKTRACK();
+
+ case ByteTerm::TypeUncheckInput:
+ input.checkInput(currentTerm().checkInputCount); // undo the forward pass's uncheckInput
+ BACKTRACK();
+
+ case ByteTerm::TypeDotStarEnclosure:
+ ASSERT_NOT_REACHED();
+ }
+
+ ASSERT_NOT_REACHED();
+ return JSRegExpErrorNoMatch;
+ }
+
+ JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
+ {
+     // Same as matchDisjunction(), except that a match whose begin and end
+     // positions coincide is not accepted. Such a match is backtracked into
+     // again and again until a match with a non-empty range turns up, or the
+     // disjunction reports anything other than a match.
+     JSRegExpResult outcome = matchDisjunction(disjunction, context, btrack);
+
+     // The range is only inspected while the latest result is a match.
+     while (outcome == JSRegExpMatch && context->matchBegin == context->matchEnd)
+         outcome = matchDisjunction(disjunction, context, true);
+
+     // Either a non-empty match, or the first non-match result encountered.
+     return outcome;
+ }
+
+ unsigned interpret()
+ {
+     // Give up immediately if the input stream reports that no input is available at the start position.
+     if (!input.isAvailableInput(0))
+         return offsetNoMatch;
+
+     // Mark the begin offset of the overall match and of every subpattern as "no match".
+     ByteDisjunction* body = pattern->m_body.get();
+     for (unsigned group = 0; group < body->m_numSubpatterns + 1; ++group)
+         output[group * 2] = offsetNoMatch;
+
+     // The pool must be available before any context is allocated.
+     allocatorPool = pattern->m_allocator->startAllocator();
+     if (!allocatorPool)
+         CRASH();
+     DisjunctionContext* bodyContext = allocDisjunctionContext(body);
+     const JSRegExpResult outcome = matchDisjunction(body, bodyContext, false);
+     if (outcome == JSRegExpMatch) {
+         output[0] = bodyContext->matchBegin;
+         output[1] = bodyContext->matchEnd;
+     }
+     freeDisjunctionContext(bodyContext);
+     pattern->m_allocator->stopAllocator();
+     // output[0] holds the match start on success and is expected to still be offsetNoMatch otherwise.
+     ASSERT((outcome == JSRegExpMatch) == (output[0] != offsetNoMatch));
+     return output[0];
+ }
+
+ Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start) // sets up one matching run; nothing is executed until interpret() is called
+ : pattern(pattern)
+ , output(output)
+ , input(input, start, length) // note the argument order: (input, start, length)
+ , allocatorPool(0) // assigned in interpret() from pattern->m_allocator
+ , remainingMatchCount(matchLimit) // budget that matchDisjunction() counts down
+ {
+ }
+
+private:
+ BytecodePattern* pattern; // compiled bytecode being executed; provides the body disjunction and the allocator
+ unsigned* output; // caller-supplied offsets: [0]/[1] receive the overall match, entry (id << 1) the begin offset of subpattern id
+ InputStream input; // the subject text together with the current read position
+ BumpPointerPool* allocatorPool; // pool returned by pattern->m_allocator->startAllocator(); 0 until interpret() runs
+ unsigned remainingMatchCount; // decremented by every matchDisjunction() call; reaching zero yields JSRegExpErrorHitLimit
+};
+
+
+
+class ByteCompiler {
+ struct ParenthesesStackEntry {
+     unsigned beginTerm; // index of the group's opening term in the term list
+     unsigned savedAlternativeIndex; // m_currentAlternativeIndex to restore once the group is closed
+     ParenthesesStackEntry(unsigned openingTermIndex, unsigned enclosingAlternativeIndex)
+         : beginTerm(openingTermIndex)
+         , savedAlternativeIndex(enclosingAlternativeIndex)
+     {
+     }
+ };
+
+public:
+ ByteCompiler(YarrPattern& pattern) // binds the compiler to the parsed pattern it will translate in compile()
+ : m_pattern(pattern)
+ {
+ m_currentAlternativeIndex = 0; // no alternative is open yet
+ }
+
+ PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator)
+ {
+ regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough()); // the once-through flag is taken from the first body alternative
+ emitDisjunction(m_pattern.m_body); // emit the terms for the whole pattern body
+ regexEnd();
+ // Hand the finished body disjunction, the nested parentheses disjunctions and the allocator over to a new BytecodePattern.
+ return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator));
+ }
+
+ void checkInput(unsigned count)
+ {
+ m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count)); // emit a CheckInput term for `count`; the interpreter runs input.checkInput(count) for it
+ }
+
+ void uncheckInput(unsigned count)
+ {
+ m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count)); // emit an UncheckInput term for `count`; the interpreter runs input.uncheckInput(count) for it
+ }
+
+ void assertionBOL(unsigned inputPosition)
+ {
+ m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition)); // emit a beginning-of-line assertion term for this input position
+ }
+
+ void assertionEOL(unsigned inputPosition)
+ {
+ m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition)); // emit an end-of-line assertion term for this input position
+ }
+
+ void assertionWordBoundary(bool invert, unsigned inputPosition)
+ {
+ m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); // emit a word-boundary assertion term; `invert` is passed through to the term
+ }
+
+ void atomPatternCharacter(UChar ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ {
+     // When the pattern ignores case, a character whose lower- and upper-case
+     // forms differ is emitted as a term carrying both forms.
+     if (m_pattern.m_ignoreCase) {
+         const UChar lowerForm = Unicode::toLower(ch);
+         const UChar upperForm = Unicode::toUpper(ch);
+         if (lowerForm != upperForm) {
+             m_bodyDisjunction->terms.append(ByteTerm(lowerForm, upperForm, inputPosition, frameLocation, quantityCount, quantityType));
+             return;
+         }
+     }
+     m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); // everything else: a plain single-character term
+ }
+
+ void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ {
+     m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
+     ByteTerm& appended = m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1]; // quantifier and frame slot are filled in on the term just appended
+     appended.atom.quantityCount = quantityCount.unsafeGet();
+     appended.atom.quantityType = quantityType;
+     appended.frameLocation = frameLocation;
+ }
+
+ void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ {
+     ASSERT(subpatternId);
+     m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
+     // Quantifier and frame slot are filled in on the term just appended.
+     ByteTerm& appended = m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1];
+     appended.atom.quantityCount = quantityCount.unsafeGet();
+     appended.atom.quantityType = quantityType;
+     appended.frameLocation = frameLocation;
+ }
+
+ void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
+ {
+     const int groupStart = m_bodyDisjunction->terms.size();
+     // Emit the group's opening term, immediately followed by the marker that opens its first alternative.
+     m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
+     m_bodyDisjunction->terms[groupStart].frameLocation = frameLocation;
+     m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
+     m_bodyDisjunction->terms[groupStart + 1].frameLocation = alternativeFrameLocation;
+     // Save the enclosing alternative index; popParenthesesStack() restores it when the group is closed.
+     m_parenthesesStack.append(ParenthesesStackEntry(groupStart, m_currentAlternativeIndex));
+     m_currentAlternativeIndex = groupStart + 1;
+ }
+
+ void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
+ {
+     const int groupStart = m_bodyDisjunction->terms.size();
+     // Emit the terminal group's opening term, immediately followed by the marker that opens its first alternative.
+     m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition));
+     m_bodyDisjunction->terms[groupStart].frameLocation = frameLocation;
+     m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
+     m_bodyDisjunction->terms[groupStart + 1].frameLocation = alternativeFrameLocation;
+     // Save the enclosing alternative index; popParenthesesStack() restores it when the group is closed.
+     m_parenthesesStack.append(ParenthesesStackEntry(groupStart, m_currentAlternativeIndex));
+     m_currentAlternativeIndex = groupStart + 1;
+ }
+
+ void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
+ {
+     // Slightly crazy: the group is first generated as a TypeParenthesesSubpatternOnceBegin
+     // and only fixed up once its end is reached. Simplifying this would make it much clearer.
+     // https://bugs.webkit.org/show_bug.cgi?id=50136
+
+     const int groupStart = m_bodyDisjunction->terms.size();
+     // Provisional opening term, immediately followed by the marker that opens the first alternative.
+     m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
+     m_bodyDisjunction->terms[groupStart].frameLocation = frameLocation;
+     m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
+     m_bodyDisjunction->terms[groupStart + 1].frameLocation = alternativeFrameLocation;
+     // Save the enclosing alternative index; popParenthesesStack() restores it when the group is closed.
+     m_parenthesesStack.append(ParenthesesStackEntry(groupStart, m_currentAlternativeIndex));
+     m_currentAlternativeIndex = groupStart + 1;
+ }
+
+ void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
+ {
+     const int groupStart = m_bodyDisjunction->terms.size();
+     // Emit the assertion's opening term (never capturing, input position 0), then the marker that opens its first alternative.
+     m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0));
+     m_bodyDisjunction->terms[groupStart].frameLocation = frameLocation;
+     m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
+     m_bodyDisjunction->terms[groupStart + 1].frameLocation = alternativeFrameLocation;
+     // Save the enclosing alternative index; popParenthesesStack() restores it when the assertion is closed.
+     m_parenthesesStack.append(ParenthesesStackEntry(groupStart, m_currentAlternativeIndex));
+     m_currentAlternativeIndex = groupStart + 1;
+ }
+
+ void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ {
+     const unsigned beginTerm = popParenthesesStack();
+     closeAlternative(beginTerm + 1);
+     const unsigned endTerm = m_bodyDisjunction->terms.size();
+     ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin);
+     // The End term repeats the Begin term's invert flag and subpattern id.
+     const bool invert = m_bodyDisjunction->terms[beginTerm].invert();
+     const unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
+     m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition));
+     // Both ends get the distance between them and the quantifier; only the End term gets the frame slot here.
+     ByteTerm& opening = m_bodyDisjunction->terms[beginTerm];
+     ByteTerm& closing = m_bodyDisjunction->terms[endTerm];
+     opening.atom.parenthesesWidth = endTerm - beginTerm;
+     closing.atom.parenthesesWidth = endTerm - beginTerm;
+     closing.frameLocation = frameLocation;
+     opening.atom.quantityCount = quantityCount.unsafeGet();
+     opening.atom.quantityType = quantityType;
+     closing.atom.quantityCount = quantityCount.unsafeGet();
+     closing.atom.quantityType = quantityType;
+ }
+
+ void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored)
+ {
+ m_bodyDisjunction->terms.append(ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored)); // emit a DotStarEnclosure term carrying the two anchor flags
+ }
+
+ unsigned popParenthesesStack()
+ {
+     ASSERT(m_parenthesesStack.size());
+     const int topIndex = m_parenthesesStack.size() - 1;
+     const ParenthesesStackEntry& top = m_parenthesesStack[topIndex]; // only read before the shrink() below
+     const unsigned beginTerm = top.beginTerm;
+     m_currentAlternativeIndex = top.savedAlternativeIndex;
+     m_parenthesesStack.shrink(topIndex);
+     // Both indices taken from the entry must refer to terms that exist.
+     ASSERT(beginTerm < m_bodyDisjunction->terms.size());
+     ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
+     return beginTerm;
+ }
+
+#ifndef NDEBUG
+ void dumpDisjunction(ByteDisjunction* disjunction)
+ {
+     dataLogF("ByteDisjunction(%p):\n\t", disjunction); // header line with the disjunction's address
+     for (unsigned index = 0; index != disjunction->terms.size(); ++index)
+         dataLogF("{ %d } ", disjunction->terms[index].type); // numeric type of each term, in order
+     dataLogF("\n");
+ }
+#endif
+
+ void closeAlternative(int beginTerm)
+ {
+ int origBeginTerm = beginTerm; // beginTerm is the index of the AlternativeBegin term that opened the group's alternatives
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
+ int endIndex = m_bodyDisjunction->terms.size(); // index the AlternativeEnd term will occupy if one is appended
+ // Every term of the chain shares the frame location of the AlternativeBegin term.
+ unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
+ // A zero `next` means no further alternative follows: the AlternativeBegin marker is simply dropped.
+ if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
+ m_bodyDisjunction->terms.remove(beginTerm);
+ else {
+ while (m_bodyDisjunction->terms[beginTerm].alternative.next) { // follow the chain of relative `next` offsets
+ beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
+ m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; // relative offset from this term to the end
+ m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
+ }
+ // The last alternative in the chain links back (non-positive offset) to the AlternativeBegin term.
+ m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
+ // Terminate the group's alternatives with an AlternativeEnd term using the same frame location.
+ m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
+ m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
+ }
+ }
+
+ void closeBodyAlternative()
+ {
+ int beginTerm = 0; // the body's alternatives always start at term 0
+ int origBeginTerm = 0;
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
+ int endIndex = m_bodyDisjunction->terms.size(); // index the BodyAlternativeEnd term will occupy
+ // Every term of the chain shares the frame location of the BodyAlternativeBegin term.
+ unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
+ // Follow the chain of relative `next` offsets through the BodyAlternativeDisjunction terms.
+ while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
+ beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
+ m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; // relative offset from this term to the end
+ m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
+ }
+ // The last alternative links back to term 0; with a single alternative this stores 0.
+ m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
+ // Terminate the body with a BodyAlternativeEnd term using the same frame location.
+ m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
+ m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
+ }
+
+ void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
+ {
+ unsigned beginTerm = popParenthesesStack();
+ closeAlternative(beginTerm + 1);
+ unsigned endTerm = m_bodyDisjunction->terms.size(); // one past the last term emitted for the group
+ // atomParenthesesSubpatternBegin() emitted the opening term provisionally as a "once" begin.
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+
+ ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; // only read before the term list is shrunk below
+
+ bool capture = parenthesesBegin.capture();
+ unsigned subpatternId = parenthesesBegin.atom.subpatternId;
+ // The nested disjunction covers subpattern ids subpatternId..lastSubpatternId.
+ unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
+ ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
+ // Copy the group's inner terms into the new disjunction, bracketed by SubpatternBegin/SubpatternEnd.
+ parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
+ for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
+ parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
+ parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
+ // Drop the provisional begin term and everything after it from the enclosing term list.
+ m_bodyDisjunction->terms.shrink(beginTerm);
+ // Register the nested disjunction and replace the whole group with one TypeParenthesesSubpattern term that points at it.
+ m_allParenthesesInfo.append(parenthesesDisjunction);
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, inputPosition));
+ // After the shrink and append, index beginTerm holds the new subpattern term.
+ m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
+ m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
+ m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
+ }
+
+ // Closes a "once" parenthesised group that stays inline in the body
+ // disjunction: appends the matching OnceEnd term and stamps both the begin and
+ // end terms with the distance between them and with the quantifier. The end
+ // term additionally receives frameLocation.
+ void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ {
+ const unsigned openIndex = popParenthesesStack();
+ // The group's alternative chain starts on the term right after the begin term.
+ closeAlternative(openIndex + 1);
+ const unsigned closeIndex = m_bodyDisjunction->terms.size();
+
+ ASSERT(m_bodyDisjunction->terms[openIndex].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+
+ const bool isCapturing = m_bodyDisjunction->terms[openIndex].capture();
+ const unsigned groupId = m_bodyDisjunction->terms[openIndex].atom.subpatternId;
+
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, groupId, isCapturing, false, inputPosition));
+
+ // References are taken only after the append, so they cannot be invalidated by it.
+ ByteTerm& openTerm = m_bodyDisjunction->terms[openIndex];
+ ByteTerm& closeTerm = m_bodyDisjunction->terms[closeIndex];
+ const unsigned width = closeIndex - openIndex;
+ const unsigned count = quantityCount.unsafeGet();
+
+ openTerm.atom.parenthesesWidth = width;
+ openTerm.atom.quantityCount = count;
+ openTerm.atom.quantityType = quantityType;
+
+ closeTerm.atom.parenthesesWidth = width;
+ closeTerm.frameLocation = frameLocation;
+ closeTerm.atom.quantityCount = count;
+ closeTerm.atom.quantityType = quantityType;
+ }
+
+ // Closes a "terminal" parenthesised group that stays inline in the body
+ // disjunction: appends the matching TerminalEnd term and stamps both the begin
+ // and end terms with the distance between them and with the quantifier. The
+ // end term additionally receives frameLocation.
+ void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ {
+ const unsigned openIndex = popParenthesesStack();
+ // The group's alternative chain starts on the term right after the begin term.
+ closeAlternative(openIndex + 1);
+ const unsigned closeIndex = m_bodyDisjunction->terms.size();
+
+ ASSERT(m_bodyDisjunction->terms[openIndex].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
+
+ const bool isCapturing = m_bodyDisjunction->terms[openIndex].capture();
+ const unsigned groupId = m_bodyDisjunction->terms[openIndex].atom.subpatternId;
+
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, groupId, isCapturing, false, inputPosition));
+
+ // References are taken only after the append, so they cannot be invalidated by it.
+ ByteTerm& openTerm = m_bodyDisjunction->terms[openIndex];
+ ByteTerm& closeTerm = m_bodyDisjunction->terms[closeIndex];
+ const unsigned width = closeIndex - openIndex;
+ const unsigned count = quantityCount.unsafeGet();
+
+ openTerm.atom.parenthesesWidth = width;
+ openTerm.atom.quantityCount = count;
+ openTerm.atom.quantityType = quantityType;
+
+ closeTerm.atom.parenthesesWidth = width;
+ closeTerm.frameLocation = frameLocation;
+ closeTerm.atom.quantityCount = count;
+ closeTerm.atom.quantityType = quantityType;
+ }
+
+ // Starts a fresh body disjunction whose first term (index 0) is the
+ // BodyAlternativeBegin marker with frame location 0, and makes that term the
+ // current alternative.
+ void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
+ {
+ ByteTerm firstAlternative = ByteTerm::BodyAlternativeBegin(onceThrough);
+ firstAlternative.frameLocation = 0;
+
+ m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize));
+ m_bodyDisjunction->terms.append(firstAlternative);
+ m_currentAlternativeIndex = 0;
+ }
+
+ // Finishes emission of the whole pattern by closing the chain of body alternatives.
+ void regexEnd()
+ {
+ closeBodyAlternative();
+ }
+
+ // Begins the next top-level alternative: links the current alternative marker
+ // to the new one through a relative 'next' offset, appends a
+ // BodyAlternativeDisjunction term and makes it the current alternative.
+ void alternativeBodyDisjunction(bool onceThrough)
+ {
+ const int nextIndex = m_bodyDisjunction->terms.size();
+ const int offsetToNext = nextIndex - m_currentAlternativeIndex;
+
+ m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = offsetToNext;
+ m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough));
+
+ m_currentAlternativeIndex = nextIndex;
+ }
+
+ // Begins the next alternative of a nested disjunction: links the current
+ // alternative marker to the new one through a relative 'next' offset, appends
+ // an AlternativeDisjunction term and makes it the current alternative.
+ void alternativeDisjunction()
+ {
+ const int nextIndex = m_bodyDisjunction->terms.size();
+
+ // Link the alternative currently being emitted to the marker about to be added.
+ ByteTerm& previousMarker = m_bodyDisjunction->terms[m_currentAlternativeIndex];
+ previousMarker.alternative.next = nextIndex - m_currentAlternativeIndex;
+
+ // previousMarker is not used past this append, which may relocate the vector's storage.
+ m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
+ m_currentAlternativeIndex = nextIndex;
+ }
+
+ // Recursively lowers a PatternDisjunction into byte terms appended to
+ // m_bodyDisjunction.
+ //
+ // disjunction: the pattern node to emit. It is compared against
+ // m_pattern.m_body to choose between body-level and nested alternative markers.
+ // inputCountAlreadyChecked: amount of input already covered by enclosing input
+ // checks; term input offsets are computed relative to it.
+ // parenthesesInputCountAlreadyChecked: portion of each alternative's minimum
+ // size that the caller has already checked on behalf of this disjunction.
+ void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0)
+ {
+ for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
+ unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
+
+ PatternAlternative* alternative = disjunction->m_alternatives[alt];
+
+ // Every alternative after the first is introduced by a disjunction marker.
+ if (alt) {
+ if (disjunction == m_pattern.m_body)
+ alternativeBodyDisjunction(alternative->onceThrough());
+ else
+ alternativeDisjunction();
+ }
+
+ // Check the input this alternative needs beyond what the caller already checked.
+ unsigned minimumSize = alternative->m_minimumSize;
+ ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
+ unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
+
+ if (countToCheck) {
+ checkInput(countToCheck);
+ currentCountAlreadyChecked += countToCheck;
+ }
+
+ for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
+ PatternTerm& term = alternative->m_terms[i];
+
+ switch (term.type) {
+ case PatternTerm::TypeAssertionBOL:
+ assertionBOL(currentCountAlreadyChecked - term.inputPosition);
+ break;
+
+ case PatternTerm::TypeAssertionEOL:
+ assertionEOL(currentCountAlreadyChecked - term.inputPosition);
+ break;
+
+ case PatternTerm::TypeAssertionWordBoundary:
+ assertionWordBoundary(term.invert(), currentCountAlreadyChecked - term.inputPosition);
+ break;
+
+ case PatternTerm::TypePatternCharacter:
+ atomPatternCharacter(term.patternCharacter, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
+ break;
+
+ case PatternTerm::TypeCharacterClass:
+ atomCharacterClass(term.characterClass, term.invert(), currentCountAlreadyChecked- term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
+ break;
+
+ case PatternTerm::TypeBackReference:
+ atomBackReference(term.backReferenceSubpatternId, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
+ break;
+
+ // No byte term is emitted for a forward reference.
+ case PatternTerm::TypeForwardReference:
+ break;
+
+ // Three encodings: inline "once" group (count of 1 and not a copy), inline
+ // "terminal" group, or a nested ByteDisjunction for everything else.
+ case PatternTerm::TypeParenthesesSubpattern: {
+ unsigned disjunctionAlreadyCheckedCount = 0;
+ if (term.quantityCount == 1 && !term.parentheses.isCopy) {
+ unsigned alternativeFrameLocation = term.frameLocation;
+ // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame.
+ if (term.quantityType == QuantifierFixedCount)
+ disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
+ else
+ alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
+ unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
+ atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, alternativeFrameLocation);
+ emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
+ atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
+ } else if (term.parentheses.isTerminal) {
+ unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
+ atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce);
+ emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
+ atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
+ } else {
+ unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
+ atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, 0);
+ emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
+ atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
+ }
+ break;
+ }
+
+ case PatternTerm::TypeParentheticalAssertion: {
+ unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion;
+
+ ASSERT(currentCountAlreadyChecked >= static_cast<unsigned>(term.inputPosition));
+ unsigned positiveInputOffset = currentCountAlreadyChecked - static_cast<unsigned>(term.inputPosition);
+ unsigned uncheckAmount = 0;
+ // If more input has been checked than the assertion body needs at minimum,
+ // uncheck the surplus for the duration of the assertion.
+ if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) {
+ uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize;
+ uncheckInput(uncheckAmount);
+ currentCountAlreadyChecked -= uncheckAmount;
+ }
+
+ atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation);
+ emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount);
+ atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType);
+ // Restore the input that was unchecked above.
+ if (uncheckAmount) {
+ checkInput(uncheckAmount);
+ currentCountAlreadyChecked += uncheckAmount;
+ }
+ break;
+ }
+
+ case PatternTerm::TypeDotStarEnclosure:
+ assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor);
+ break;
+ }
+ }
+ }
+ }
+
+private:
+ YarrPattern& m_pattern;
+ OwnPtr<ByteDisjunction> m_bodyDisjunction;
+ unsigned m_currentAlternativeIndex;
+ Vector<ParenthesesStackEntry> m_parenthesesStack;
+ Vector<ByteDisjunction*> m_allParenthesesInfo;
+};
+
+// Compiles a parsed YarrPattern into interpreter bytecode by running a
+// temporary ByteCompiler over it; allocator is forwarded to compile().
+PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator)
+{
+ return ByteCompiler(pattern).compile(allocator);
+}
+
+// Runs the bytecode interpreter over a String, choosing the 8-bit (LChar) or
+// 16-bit (UChar) interpreter instantiation from the string's storage width,
+// and returns that interpreter's result.
+unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned start, unsigned* output)
+{
+ if (!input.is8Bit())
+ return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret();
+
+ return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret();
+}
+
+// Runs the 8-bit (LChar) interpreter over `length` characters at `input`,
+// beginning at `start`, and returns the interpreter's result.
+unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output)
+{
+ return Interpreter<LChar>(bytecode, output, input, length, start).interpret();
+}
+
+// Runs the 16-bit (UChar) interpreter over `length` characters at `input`,
+// beginning at `start`, and returns the interpreter's result.
+unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output)
+{
+ return Interpreter<UChar>(bytecode, output, input, length, start).interpret();
+}
+
+// Compile-time checks that each backtrack-info struct occupies exactly the
+// number of uintptr_t-sized slots its YarrStackSpaceForBackTrackInfo* constant
+// claims. Only the UChar instantiation is checked; the sizes should be the
+// same for both UChar & LChar.
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
+
+
+} }
--- /dev/null
+/*
+ * Copyright (C) 2009, 2010 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef YarrInterpreter_h
+#define YarrInterpreter_h
+
+#include "YarrPattern.h"
+#include <wtf/PassOwnPtr.h>
+#include <wtf/unicode/Unicode.h>
+
+namespace WTF {
+class BumpPointerAllocator;
+}
+using WTF::BumpPointerAllocator;
+
+namespace JSC { namespace Yarr {
+
+class ByteDisjunction;
+
+// One instruction of the Yarr interpreter bytecode. `type` selects which
+// member of the anonymous union is meaningful: `atom` for character, class,
+// back-reference and parentheses terms, `alternative` for the alternative
+// begin/disjunction/end markers, `anchors` for the dot-star enclosure and
+// `checkInputCount` for the check/uncheck-input terms.
+//
+// Every constructor gives frameLocation and inputPosition a defined value
+// (zero unless supplied), so copying a term never reads an indeterminate
+// member; the byte compiler overwrites them where a real value is needed.
+struct ByteTerm {
+ enum Type {
+ TypeBodyAlternativeBegin,
+ TypeBodyAlternativeDisjunction,
+ TypeBodyAlternativeEnd,
+ TypeAlternativeBegin,
+ TypeAlternativeDisjunction,
+ TypeAlternativeEnd,
+ TypeSubpatternBegin,
+ TypeSubpatternEnd,
+ TypeAssertionBOL,
+ TypeAssertionEOL,
+ TypeAssertionWordBoundary,
+ TypePatternCharacterOnce,
+ TypePatternCharacterFixed,
+ TypePatternCharacterGreedy,
+ TypePatternCharacterNonGreedy,
+ TypePatternCasedCharacterOnce,
+ TypePatternCasedCharacterFixed,
+ TypePatternCasedCharacterGreedy,
+ TypePatternCasedCharacterNonGreedy,
+ TypeCharacterClass,
+ TypeBackReference,
+ TypeParenthesesSubpattern,
+ TypeParenthesesSubpatternOnceBegin,
+ TypeParenthesesSubpatternOnceEnd,
+ TypeParenthesesSubpatternTerminalBegin,
+ TypeParenthesesSubpatternTerminalEnd,
+ TypeParentheticalAssertionBegin,
+ TypeParentheticalAssertionEnd,
+ TypeCheckInput,
+ TypeUncheckInput,
+ TypeDotStarEnclosure,
+ } type;
+ union {
+ // Payload of atom-like terms.
+ struct {
+ union {
+ UChar patternCharacter;
+ struct {
+ UChar lo;
+ UChar hi;
+ } casedCharacter;
+ CharacterClass* characterClass;
+ unsigned subpatternId;
+ };
+ union {
+ // Nested disjunction of a TypeParenthesesSubpattern term.
+ ByteDisjunction* parenthesesDisjunction;
+ // Distance in terms between matching inline begin/end parentheses terms.
+ unsigned parenthesesWidth;
+ };
+ QuantifierType quantityType;
+ unsigned quantityCount;
+ } atom;
+ // Payload of alternative markers; next and end are offsets relative to the term's own index.
+ struct {
+ int next;
+ int end;
+ bool onceThrough;
+ } alternative;
+ // Payload of TypeDotStarEnclosure.
+ struct {
+ bool m_bol : 1;
+ bool m_eol : 1;
+ } anchors;
+ // Payload of TypeCheckInput / TypeUncheckInput.
+ unsigned checkInputCount;
+ };
+ unsigned frameLocation;
+ bool m_capture : 1;
+ bool m_invert : 1;
+ unsigned inputPosition;
+
+ // Single pattern character; the term type is derived from the quantifier
+ // (a fixed count of exactly 1 becomes the "Once" variant).
+ ByteTerm(UChar ch, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ : frameLocation(frameLocation)
+ , m_capture(false)
+ , m_invert(false)
+ {
+ switch (quantityType) {
+ case QuantifierFixedCount:
+ type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed;
+ break;
+ case QuantifierGreedy:
+ type = ByteTerm::TypePatternCharacterGreedy;
+ break;
+ case QuantifierNonGreedy:
+ type = ByteTerm::TypePatternCharacterNonGreedy;
+ break;
+ }
+
+ atom.patternCharacter = ch;
+ atom.quantityType = quantityType;
+ atom.quantityCount = quantityCount.unsafeGet();
+ inputPosition = inputPos;
+ }
+
+ // Pattern character stored as a lo/hi pair (the "cased" character variants);
+ // the term type is derived from the quantifier as above.
+ ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ : frameLocation(frameLocation)
+ , m_capture(false)
+ , m_invert(false)
+ {
+ switch (quantityType) {
+ case QuantifierFixedCount:
+ type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed;
+ break;
+ case QuantifierGreedy:
+ type = ByteTerm::TypePatternCasedCharacterGreedy;
+ break;
+ case QuantifierNonGreedy:
+ type = ByteTerm::TypePatternCasedCharacterNonGreedy;
+ break;
+ }
+
+ atom.casedCharacter.lo = lo;
+ atom.casedCharacter.hi = hi;
+ atom.quantityType = quantityType;
+ atom.quantityCount = quantityCount.unsafeGet();
+ inputPosition = inputPos;
+ }
+
+ // Character class term; starts as a fixed count of 1 with frameLocation 0.
+ ByteTerm(CharacterClass* characterClass, bool invert, int inputPos)
+ : type(ByteTerm::TypeCharacterClass)
+ , frameLocation(0)
+ , m_capture(false)
+ , m_invert(invert)
+ {
+ atom.characterClass = characterClass;
+ atom.quantityType = QuantifierFixedCount;
+ atom.quantityCount = 1;
+ inputPosition = inputPos;
+ }
+
+ // Parentheses term that refers to a nested ByteDisjunction.
+ ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, int inputPos)
+ : type(type)
+ , frameLocation(0)
+ , m_capture(capture)
+ , m_invert(false)
+ {
+ atom.subpatternId = subpatternId;
+ atom.parenthesesDisjunction = parenthesesInfo;
+ atom.quantityType = QuantifierFixedCount;
+ atom.quantityCount = 1;
+ inputPosition = inputPos;
+ }
+
+ // Bare term of the given type; used by the static factories below, which
+ // then fill in the type-specific payload.
+ ByteTerm(Type type, bool invert = false)
+ : type(type)
+ , frameLocation(0)
+ , m_capture(false)
+ , m_invert(invert)
+ , inputPosition(0)
+ {
+ atom.quantityType = QuantifierFixedCount;
+ atom.quantityCount = 1;
+ }
+
+ // Term identified by a subpattern id (inline parentheses begin/end, back references).
+ ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, int inputPos)
+ : type(type)
+ , frameLocation(0)
+ , m_capture(capture)
+ , m_invert(invert)
+ {
+ atom.subpatternId = subpatternId;
+ atom.quantityType = QuantifierFixedCount;
+ atom.quantityCount = 1;
+ inputPosition = inputPos;
+ }
+
+ // Beginning-of-line assertion at the given input position.
+ static ByteTerm BOL(int inputPos)
+ {
+ ByteTerm term(TypeAssertionBOL);
+ term.inputPosition = inputPos;
+ return term;
+ }
+
+ // Check-input term carrying `count`.
+ static ByteTerm CheckInput(Checked<unsigned> count)
+ {
+ ByteTerm term(TypeCheckInput);
+ term.checkInputCount = count.unsafeGet();
+ return term;
+ }
+
+ // Uncheck-input term carrying `count`.
+ static ByteTerm UncheckInput(Checked<unsigned> count)
+ {
+ ByteTerm term(TypeUncheckInput);
+ term.checkInputCount = count.unsafeGet();
+ return term;
+ }
+
+ // End-of-line assertion at the given input position.
+ static ByteTerm EOL(int inputPos)
+ {
+ ByteTerm term(TypeAssertionEOL);
+ term.inputPosition = inputPos;
+ return term;
+ }
+
+ // Word-boundary assertion, optionally inverted, at the given input position.
+ static ByteTerm WordBoundary(bool invert, int inputPos)
+ {
+ ByteTerm term(TypeAssertionWordBoundary, invert);
+ term.inputPosition = inputPos;
+ return term;
+ }
+
+ // Back reference to the given subpattern.
+ static ByteTerm BackReference(unsigned subpatternId, int inputPos)
+ {
+ return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos);
+ }
+
+ // The alternative-marker factories below all start with next == end == 0;
+ // the byte compiler patches the offsets when the alternative chain is closed.
+ static ByteTerm BodyAlternativeBegin(bool onceThrough)
+ {
+ ByteTerm term(TypeBodyAlternativeBegin);
+ term.alternative.next = 0;
+ term.alternative.end = 0;
+ term.alternative.onceThrough = onceThrough;
+ return term;
+ }
+
+ static ByteTerm BodyAlternativeDisjunction(bool onceThrough)
+ {
+ ByteTerm term(TypeBodyAlternativeDisjunction);
+ term.alternative.next = 0;
+ term.alternative.end = 0;
+ term.alternative.onceThrough = onceThrough;
+ return term;
+ }
+
+ static ByteTerm BodyAlternativeEnd()
+ {
+ ByteTerm term(TypeBodyAlternativeEnd);
+ term.alternative.next = 0;
+ term.alternative.end = 0;
+ term.alternative.onceThrough = false;
+ return term;
+ }
+
+ static ByteTerm AlternativeBegin()
+ {
+ ByteTerm term(TypeAlternativeBegin);
+ term.alternative.next = 0;
+ term.alternative.end = 0;
+ term.alternative.onceThrough = false;
+ return term;
+ }
+
+ static ByteTerm AlternativeDisjunction()
+ {
+ ByteTerm term(TypeAlternativeDisjunction);
+ term.alternative.next = 0;
+ term.alternative.end = 0;
+ term.alternative.onceThrough = false;
+ return term;
+ }
+
+ static ByteTerm AlternativeEnd()
+ {
+ ByteTerm term(TypeAlternativeEnd);
+ term.alternative.next = 0;
+ term.alternative.end = 0;
+ term.alternative.onceThrough = false;
+ return term;
+ }
+
+ // Marker placed at the start of a nested disjunction's term list.
+ static ByteTerm SubpatternBegin()
+ {
+ return ByteTerm(TypeSubpatternBegin);
+ }
+
+ // Marker placed at the end of a nested disjunction's term list.
+ static ByteTerm SubpatternEnd()
+ {
+ return ByteTerm(TypeSubpatternEnd);
+ }
+
+ // Dot-star enclosure term recording whether it is anchored at BOL and/or EOL.
+ static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor)
+ {
+ ByteTerm term(TypeDotStarEnclosure);
+ term.anchors.m_bol = bolAnchor;
+ term.anchors.m_eol = eolAnchor;
+ return term;
+ }
+
+ // Returns the invert flag; const so it can be queried on const terms.
+ bool invert() const
+ {
+ return m_invert;
+ }
+
+ // Returns the capture flag; const so it can be queried on const terms.
+ bool capture() const
+ {
+ return m_capture;
+ }
+};
+
+// A flat sequence of ByteTerms together with the number of subpatterns and the
+// frame size supplied at construction. Used both for the whole pattern body
+// and for nested parenthesised groups.
+class ByteDisjunction {
+ WTF_MAKE_FAST_ALLOCATED;
+public:
+ // Creates an empty disjunction; terms are appended by the byte compiler.
+ ByteDisjunction(unsigned numSubpatterns, unsigned frameSize)
+ : m_numSubpatterns(numSubpatterns)
+ , m_frameSize(frameSize)
+ {
+ }
+
+ // The bytecode itself, in execution order.
+ Vector<ByteTerm> terms;
+ unsigned m_numSubpatterns;
+ unsigned m_frameSize;
+};
+
+// The compiled form of one regular expression: the body disjunction, the
+// nested parentheses disjunctions and the user character classes taken over
+// from the YarrPattern, plus the flags and character classes the interpreter
+// needs. The destructor deletes the nested disjunctions and the user
+// character classes.
+struct BytecodePattern {
+ WTF_MAKE_FAST_ALLOCATED;
+public:
+ // body: the top-level disjunction (ownership is transferred).
+ // allParenthesesInfo: nested disjunctions to be deleted with this object.
+ // pattern: source of the flags and character classes; its
+ // m_userCharacterClasses vector is emptied by this constructor.
+ // allocator: stored as-is, not owned.
+ BytecodePattern(PassOwnPtr<ByteDisjunction> body, Vector<ByteDisjunction*> allParenthesesInfo, YarrPattern& pattern, BumpPointerAllocator* allocator)
+ : m_body(body)
+ , m_ignoreCase(pattern.m_ignoreCase)
+ , m_multiline(pattern.m_multiline)
+ , m_allocator(allocator)
+ {
+ newlineCharacterClass = pattern.newlineCharacterClass();
+ wordcharCharacterClass = pattern.wordcharCharacterClass();
+
+ m_allParenthesesInfo.append(allParenthesesInfo);
+ m_userCharacterClasses.append(pattern.m_userCharacterClasses);
+ // 'Steal' the YarrPattern's CharacterClasses! We clear its
+ // array, so that it won't delete them on destruction. We'll
+ // take responsibility for that.
+ pattern.m_userCharacterClasses.clear();
+ }
+
+ // Frees every nested disjunction and every user character class taken over
+ // in the constructor.
+ ~BytecodePattern()
+ {
+ deleteAllValues(m_allParenthesesInfo);
+ deleteAllValues(m_userCharacterClasses);
+ }
+
+ OwnPtr<ByteDisjunction> m_body;
+ bool m_ignoreCase;
+ bool m_multiline;
+ // Each BytecodePattern is associated with a RegExp, each RegExp is associated
+ // with a JSGlobalData. Cache a pointer to our JSGlobalData's m_regExpAllocator.
+ BumpPointerAllocator* m_allocator;
+
+ // Obtained from the YarrPattern; not deleted by this class's destructor.
+ CharacterClass* newlineCharacterClass;
+ CharacterClass* wordcharCharacterClass;
+
+private:
+ // Owned raw pointers, released in the destructor.
+ Vector<ByteDisjunction*> m_allParenthesesInfo;
+ Vector<CharacterClass*> m_userCharacterClasses;
+};
+
+// Compiles a YarrPattern into a BytecodePattern for the interpreter.
+JS_EXPORT_PRIVATE PassOwnPtr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*);
+// Run the interpreter over the input beginning at `start`, passing `output`
+// through to it. One overload takes a String, the others raw 8-bit or 16-bit
+// character buffers.
+// NOTE(review): the meaning of the return value and the layout of `output`
+// are defined by Interpreter::interpret() - confirm there.
+JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const String& input, unsigned start, unsigned* output);
+unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output);
+unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output);
+
+} } // namespace JSC::Yarr
+
+#endif // YarrInterpreter_h
--- /dev/null
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "YarrJIT.h"
+
+#include <wtf/ASCIICType.h>
+#include "LinkBuffer.h"
+#include "Options.h"
+#include "Yarr.h"
+#include "YarrCanonicalizeUCS2.h"
+
+#if ENABLE(YARR_JIT)
+
+using namespace WTF;
+
+namespace JSC { namespace Yarr {
+
+template<YarrJITCompileMode compileMode>
+class YarrGenerator : private MacroAssembler {
+ friend void jitCompile(JSGlobalData*, YarrCodeBlock& jitObject, const String& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline);
+
+#if CPU(ARM)
+ static const RegisterID input = ARMRegisters::r0;
+ static const RegisterID index = ARMRegisters::r1;
+ static const RegisterID length = ARMRegisters::r2;
+ static const RegisterID output = ARMRegisters::r4;
+
+ static const RegisterID regT0 = ARMRegisters::r5;
+ static const RegisterID regT1 = ARMRegisters::r6;
+
+ static const RegisterID returnRegister = ARMRegisters::r0;
+ static const RegisterID returnRegister2 = ARMRegisters::r1;
+#elif CPU(MIPS)
+ static const RegisterID input = MIPSRegisters::a0;
+ static const RegisterID index = MIPSRegisters::a1;
+ static const RegisterID length = MIPSRegisters::a2;
+ static const RegisterID output = MIPSRegisters::a3;
+
+ static const RegisterID regT0 = MIPSRegisters::t4;
+ static const RegisterID regT1 = MIPSRegisters::t5;
+
+ static const RegisterID returnRegister = MIPSRegisters::v0;
+ static const RegisterID returnRegister2 = MIPSRegisters::v1;
+#elif CPU(SH4)
+ static const RegisterID input = SH4Registers::r4;
+ static const RegisterID index = SH4Registers::r5;
+ static const RegisterID length = SH4Registers::r6;
+ static const RegisterID output = SH4Registers::r7;
+
+ static const RegisterID regT0 = SH4Registers::r0;
+ static const RegisterID regT1 = SH4Registers::r1;
+
+ static const RegisterID returnRegister = SH4Registers::r0;
+ static const RegisterID returnRegister2 = SH4Registers::r1;
+#elif CPU(X86)
+ static const RegisterID input = X86Registers::eax;
+ static const RegisterID index = X86Registers::edx;
+ static const RegisterID length = X86Registers::ecx;
+ static const RegisterID output = X86Registers::edi;
+
+ static const RegisterID regT0 = X86Registers::ebx;
+ static const RegisterID regT1 = X86Registers::esi;
+
+ static const RegisterID returnRegister = X86Registers::eax;
+ static const RegisterID returnRegister2 = X86Registers::edx;
+#elif CPU(X86_64)
+ static const RegisterID input = X86Registers::edi;
+ static const RegisterID index = X86Registers::esi;
+ static const RegisterID length = X86Registers::edx;
+ static const RegisterID output = X86Registers::ecx;
+
+ static const RegisterID regT0 = X86Registers::eax;
+ static const RegisterID regT1 = X86Registers::ebx;
+
+ static const RegisterID returnRegister = X86Registers::eax;
+ static const RegisterID returnRegister2 = X86Registers::edx;
+#endif
+
+ // Reorders the terms of an alternative in a single left-to-right pass:
+ // whenever a fixed-count character class is immediately followed by a
+ // fixed-count pattern character, the two terms are swapped so that the
+ // single character comes first.
+ void optimizeAlternative(PatternAlternative* alternative)
+ {
+ const unsigned termCount = alternative->m_terms.size();
+ if (!termCount)
+ return;
+
+ for (unsigned following = 1; following < termCount; ++following) {
+ PatternTerm& first = alternative->m_terms[following - 1];
+ PatternTerm& second = alternative->m_terms[following];
+
+ const bool firstIsFixedClass = first.type == PatternTerm::TypeCharacterClass && first.quantityType == QuantifierFixedCount;
+ const bool secondIsFixedCharacter = second.type == PatternTerm::TypePatternCharacter && second.quantityType == QuantifierFixedCount;
+ if (!firstIsFixedClass || !secondIsFixedCharacter)
+ continue;
+
+ // Exchange the pair through a temporary copy of the class term.
+ PatternTerm savedClassTerm = first;
+ alternative->m_terms[following - 1] = second;
+ alternative->m_terms[following] = savedClassTerm;
+ }
+ }
+
+ // Emits a comparison tree testing `character` against `count` entries of
+ // `ranges`, interleaved with the single-character `matches` that are consumed
+ // through *matchIndex. Jumps taken on a successful match are appended to
+ // matchDest; jumps for values found to lie below the range being tested are
+ // appended to failures. A value above every range falls through to the code
+ // emitted after this call.
+ // NOTE(review): lo/hi are held in plain `char`, so this presumably expects
+ // only ASCII ranges, with ranges and matches in ascending order - confirm
+ // against the code that builds the CharacterClass.
+ void matchCharacterClassRange(RegisterID character, JumpList& failures, JumpList& matchDest, const CharacterRange* ranges, unsigned count, unsigned* matchIndex, const UChar* matches, unsigned matchCount)
+ {
+ do {
+ // pick which range we're going to generate
+ int which = count >> 1;
+ char lo = ranges[which].begin;
+ char hi = ranges[which].end;
+
+ // check if there are any ranges or matches below lo. If not, just jl to failure -
+ // if there is anything else to check, check that first, if it falls through jmp to failure.
+ if ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) {
+ Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo));
+
+ // generate code for all ranges before this one
+ if (which)
+ matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount);
+
+ // then test the remaining single characters that sort below lo
+ while ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) {
+ matchDest.append(branch32(Equal, character, Imm32((unsigned short)matches[*matchIndex])));
+ ++*matchIndex;
+ }
+ failures.append(jump());
+
+ loOrAbove.link(this);
+ } else if (which) {
+ // only ranges (no single characters) lie below lo
+ Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo));
+
+ matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount);
+ failures.append(jump());
+
+ loOrAbove.link(this);
+ } else
+ failures.append(branch32(LessThan, character, Imm32((unsigned short)lo)));
+
+ // single characters that fall inside [lo, hi] need no test of their own
+ while ((*matchIndex < matchCount) && (matches[*matchIndex] <= hi))
+ ++*matchIndex;
+
+ matchDest.append(branch32(LessThanOrEqual, character, Imm32((unsigned short)hi)));
+ // fall through to here, the value is above hi.
+
+ // shuffle along & loop around if there are any more matches to handle.
+ unsigned next = which + 1;
+ ranges += next;
+ count -= next;
+ } while (count);
+ }
+
+ // Emits code that appends to matchDest a jump taken when `character` belongs
+ // to charClass; execution falls through when it does not.
+ //
+ // With a precomputed table the test is a single byte lookup. Otherwise
+ // characters above 0x7f are tested against the Unicode matches/ranges and
+ // skip the ASCII tests, while characters up to 0x7f are tested against the
+ // ASCII ranges and matches.
+ // Note: the ignore-case path for lower-case ASCII letters ORs 32 into the
+ // `character` register, so the register is modified on that path.
+ void matchCharacterClass(RegisterID character, JumpList& matchDest, const CharacterClass* charClass)
+ {
+ if (charClass->m_table) {
+ ExtendedAddress tableEntry(character, reinterpret_cast<intptr_t>(charClass->m_table->m_table));
+ matchDest.append(branchTest8(charClass->m_table->m_inverted ? Zero : NonZero, tableEntry));
+ return;
+ }
+ Jump unicodeFail;
+ if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) {
+ Jump isAscii = branch32(LessThanOrEqual, character, TrustedImm32(0x7f));
+
+ if (charClass->m_matchesUnicode.size()) {
+ for (unsigned i = 0; i < charClass->m_matchesUnicode.size(); ++i) {
+ UChar ch = charClass->m_matchesUnicode[i];
+ matchDest.append(branch32(Equal, character, Imm32(ch)));
+ }
+ }
+
+ if (charClass->m_rangesUnicode.size()) {
+ for (unsigned i = 0; i < charClass->m_rangesUnicode.size(); ++i) {
+ UChar lo = charClass->m_rangesUnicode[i].begin;
+ UChar hi = charClass->m_rangesUnicode[i].end;
+
+ Jump below = branch32(LessThan, character, Imm32(lo));
+ matchDest.append(branch32(LessThanOrEqual, character, Imm32(hi)));
+ below.link(this);
+ }
+ }
+
+ // A non-ASCII character that matched nothing above skips the ASCII tests.
+ unicodeFail = jump();
+ isAscii.link(this);
+ }
+
+ if (charClass->m_ranges.size()) {
+ unsigned matchIndex = 0;
+ JumpList failures;
+ matchCharacterClassRange(character, failures, matchDest, charClass->m_ranges.begin(), charClass->m_ranges.size(), &matchIndex, charClass->m_matches.begin(), charClass->m_matches.size());
+ // Single characters not consumed by the range tree are tested individually.
+ while (matchIndex < charClass->m_matches.size())
+ matchDest.append(branch32(Equal, character, Imm32((unsigned short)charClass->m_matches[matchIndex++])));
+
+ failures.link(this);
+ } else if (charClass->m_matches.size()) {
+ // optimization: gather 'a','A' etc back together, can mask & test once.
+ Vector<char> matchesAZaz;
+
+ for (unsigned i = 0; i < charClass->m_matches.size(); ++i) {
+ char ch = charClass->m_matches[i];
+ if (m_pattern.m_ignoreCase) {
+ if (isASCIILower(ch)) {
+ matchesAZaz.append(ch);
+ continue;
+ }
+ // Upper-case letters emit no test of their own when ignoring case.
+ if (isASCIIUpper(ch))
+ continue;
+ }
+ matchDest.append(branch32(Equal, character, Imm32((unsigned short)ch)));
+ }
+
+ if (unsigned countAZaz = matchesAZaz.size()) {
+ // Fold the input to lower case (sets bit 0x20), then compare with each lower-case letter.
+ or32(TrustedImm32(32), character);
+ for (unsigned i = 0; i < countAZaz; ++i)
+ matchDest.append(branch32(Equal, character, TrustedImm32(matchesAZaz[i])));
+ }
+ }
+
+ if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size())
+ unicodeFail.link(this);
+ }
+
+ // Optionally advances |index| by |countToCheck|, then returns a jump taken
+ // when |index| is Above |length|. The add is emitted before the check, so on
+ // the taken path |index| will have (incorrectly) been incremented already!
+ Jump jumpIfNoAvailableInput(unsigned countToCheck = 0)
+ {
+     const bool needsAdvance = countToCheck != 0;
+     if (needsAdvance)
+         add32(Imm32(countToCheck), index);
+
+     Jump inputExhausted = branch32(Above, index, length);
+     return inputExhausted;
+ }
+
+ // Unconditionally advances |index| by |countToCheck| and returns a jump
+ // taken when the advanced |index| is BelowOrEqual |length|.
+ Jump jumpIfAvailableInput(unsigned countToCheck)
+ {
+     add32(Imm32(countToCheck), index);
+
+     Jump inputAvailable = branch32(BelowOrEqual, index, length);
+     return inputAvailable;
+ }
+
+ // Returns a jump taken when |index| is BelowOrEqual |length|; |index| is
+ // not modified.
+ Jump checkInput()
+ {
+     Jump withinInput = branch32(BelowOrEqual, index, length);
+     return withinInput;
+ }
+
+ // Returns a jump taken when |index| equals |length|.
+ Jump atEndOfInput()
+ {
+     Jump reachedEnd = branch32(Equal, index, length);
+     return reachedEnd;
+ }
+
+ // Returns a jump taken when |index| differs from |length|.
+ Jump notAtEndOfInput()
+ {
+     Jump notAtEnd = branch32(NotEqual, index, length);
+     return notAtEnd;
+ }
+
+ // Loads the character at |inputPosition| (relative to |index|) into
+ // |character| and returns a jump taken when it differs from |ch|. When the
+ // pattern ignores case and |ch| is an ASCII letter, both sides are folded by
+ // OR-ing in 0x20 first, which also modifies |character|.
+ Jump jumpIfCharNotEquals(UChar ch, int inputPosition, RegisterID character)
+ {
+     readCharacter(inputPosition, character);
+
+     // For case-insensitive compares, non-ASCII characters that have different
+     // upper & lower case representations are converted to a character class.
+     ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch));
+
+     UChar expected = ch;
+     const bool foldAsciiCase = m_pattern.m_ignoreCase && isASCIIAlpha(ch);
+     if (foldAsciiCase) {
+         or32(TrustedImm32(0x20), character);
+         expected |= 0x20;
+     }
+
+     return branch32(NotEqual, character, Imm32(expected));
+ }
+
+ // Loads one character from |input| into |reg|. The address is |index| plus
+ // |inputPosition| characters; an 8-bit or 16-bit load is chosen according to
+ // m_charSize.
+ void readCharacter(int inputPosition, RegisterID reg)
+ {
+     if (m_charSize != Char8) {
+         load16(BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), reg);
+         return;
+     }
+
+     load8(BaseIndex(input, index, TimesOne, inputPosition * sizeof(char)), reg);
+ }
+
+ // Pokes the value of |reg| into frame slot |frameLocation|.
+ void storeToFrame(RegisterID reg, unsigned frameLocation)
+ {
+     const unsigned slot = frameLocation;
+     poke(reg, slot);
+ }
+
+ // Pokes the immediate |imm| into frame slot |frameLocation|.
+ void storeToFrame(TrustedImm32 imm, unsigned frameLocation)
+ {
+     const unsigned slot = frameLocation;
+     poke(imm, slot);
+ }
+
+ // Emits a patchable pointer store (initially null) to frame slot
+ // |frameLocation| and returns the label identifying the value to patch.
+ DataLabelPtr storeToFrameWithPatch(unsigned frameLocation)
+ {
+     Address slot(stackPointerRegister, frameLocation * sizeof(void*));
+     return storePtrWithPatch(TrustedImmPtr(0), slot);
+ }
+
+ // Peeks frame slot |frameLocation| into |reg|.
+ void loadFromFrame(unsigned frameLocation, RegisterID reg)
+ {
+     const unsigned slot = frameLocation;
+     peek(reg, slot);
+ }
+
+ // Emits an indirect jump through the pointer held in frame slot
+ // |frameLocation|.
+ void loadFromFrameAndJump(unsigned frameLocation)
+ {
+     Address slot(stackPointerRegister, frameLocation * sizeof(void*));
+     jump(slot);
+ }
+
+ // Reserves the pattern body's call frame by subtracting
+ // m_callFrameSize pointer-sized slots from the stack pointer. Emits nothing
+ // when the pattern needs no frame.
+ void initCallFrame()
+ {
+     const unsigned frameSlots = m_pattern.m_body->m_callFrameSize;
+     if (!frameSlots)
+         return;
+
+     subPtr(Imm32(frameSlots * sizeof(void*)), stackPointerRegister);
+ }
+ // Releases the call frame reserved by initCallFrame() by adding the same
+ // number of bytes back to the stack pointer. Emits nothing when the pattern
+ // needs no frame.
+ void removeCallFrame()
+ {
+     const unsigned frameSlots = m_pattern.m_body->m_callFrameSize;
+     if (!frameSlots)
+         return;
+
+     addPtr(Imm32(frameSlots * sizeof(void*)), stackPointerRegister);
+ }
+
+ // Used to record subpatterns; should only be called if compileMode is
+ // IncludeSubpatterns. Stores |reg| into the start slot of |subpattern| in
+ // the output vector (int index 2 * subpattern).
+ void setSubpatternStart(RegisterID reg, unsigned subpattern)
+ {
+     ASSERT(subpattern);
+     // FIXME: should be able to ASSERT(compileMode == IncludeSubpatterns), but then this function is conditionally NORETURN. :-(
+     const unsigned startSlot = subpattern << 1;
+     store32(reg, Address(output, startSlot * sizeof(int)));
+ }
+ // Stores |reg| into the end slot of |subpattern| in the output vector
+ // (int index 2 * subpattern + 1).
+ void setSubpatternEnd(RegisterID reg, unsigned subpattern)
+ {
+     ASSERT(subpattern);
+     // FIXME: should be able to ASSERT(compileMode == IncludeSubpatterns), but then this function is conditionally NORETURN. :-(
+     const unsigned endSlot = (subpattern << 1) + 1;
+     store32(reg, Address(output, endSlot * sizeof(int)));
+ }
+ // Writes -1 into the start slot of |subpattern| in the output vector
+ // (int index 2 * subpattern).
+ void clearSubpatternStart(unsigned subpattern)
+ {
+     ASSERT(subpattern);
+     // FIXME: should be able to ASSERT(compileMode == IncludeSubpatterns), but then this function is conditionally NORETURN. :-(
+     const unsigned startSlot = subpattern << 1;
+     store32(TrustedImm32(-1), Address(output, startSlot * sizeof(int)));
+ }
+
+ // We use one of three different strategies to track the start of the current match,
+ // while matching.
+ // 1) If the pattern has a fixed size, do nothing! - we calculate the value lazily
+ // at the end of matching. This is irrespective of compileMode, and in this case
+ // these methods should never be called.
+ // 2) If we're compiling IncludeSubpatterns, 'output' contains a pointer to an output
+ // vector, store the match start in the output vector.
+ // 3) If we're compiling MatchOnly, 'output' is unused, store the match start directly
+ // in this register.
+ // Records the match start held in |reg|. When compiling IncludeSubpatterns
+ // the value is stored through |output|; otherwise it is moved into the
+ // |output| register itself. Asserts that the pattern is not fixed-size.
+ void setMatchStart(RegisterID reg)
+ {
+     ASSERT(!m_pattern.m_body->m_hasFixedSize);
+
+     if (compileMode != IncludeSubpatterns) {
+         move(reg, output);
+         return;
+     }
+
+     store32(reg, output);
+ }
+ // Fetches the recorded match start into |reg|: loaded through |output| when
+ // compiling IncludeSubpatterns, otherwise copied from the |output| register.
+ // Asserts that the pattern is not fixed-size.
+ void getMatchStart(RegisterID reg)
+ {
+     ASSERT(!m_pattern.m_body->m_hasFixedSize);
+
+     if (compileMode != IncludeSubpatterns) {
+         move(output, reg);
+         return;
+     }
+
+     load32(output, reg);
+ }
+
+ enum YarrOpCode { // Kind of node recorded in YarrOp::m_op.
+ // These nodes wrap body alternatives - those in the main disjunction,
+ // rather than subpatterns or assertions. These are chained together in
+ // a doubly linked list, with a 'begin' node for the first alternative,
+ // a 'next' node for each subsequent alternative, and an 'end' node at
+ // the end. In the case of repeating alternatives, the 'end' node also
+ // has a reference back to 'begin'.
+ OpBodyAlternativeBegin,
+ OpBodyAlternativeNext,
+ OpBodyAlternativeEnd,
+ // Similar to the body alternatives, but used for subpatterns with two
+ // or more alternatives.
+ OpNestedAlternativeBegin,
+ OpNestedAlternativeNext,
+ OpNestedAlternativeEnd,
+ // Used for alternatives in subpatterns where there is only a single
+ // alternative (backtracking is easier in these cases), or for alternatives
+ // which never need to be backtracked (those in parenthetical assertions,
+ // terminal subpatterns).
+ OpSimpleNestedAlternativeBegin,
+ OpSimpleNestedAlternativeNext,
+ OpSimpleNestedAlternativeEnd,
+ // Used to wrap 'Once' subpattern matches (quantityCount == 1).
+ OpParenthesesSubpatternOnceBegin,
+ OpParenthesesSubpatternOnceEnd,
+ // Used to wrap 'Terminal' subpattern matches (at the end of the regexp).
+ OpParenthesesSubpatternTerminalBegin,
+ OpParenthesesSubpatternTerminalEnd,
+ // Used to wrap parenthetical assertions.
+ OpParentheticalAssertionBegin,
+ OpParentheticalAssertionEnd,
+ // Wraps all simple terms (pattern characters, character classes).
+ OpTerm,
+ // Where an expression contains only 'once through' body alternatives
+ // and no repeating ones, this op is used to return match failure.
+ OpMatchFailed
+ };
+
+ // This structure holds the compiled opcode information for one node:
+ // a reference back to the original PatternTerm/PatternAlternative,
+ // plus the JIT compilation data (labels and jumps) recorded for that node.
+ struct YarrOp {
+ explicit YarrOp(PatternTerm* term) // Builds an OpTerm node wrapping |term|.
+ : m_op(OpTerm)
+ , m_term(term)
+ , m_isDeadCode(false)
+ {
+ }
+
+ explicit YarrOp(YarrOpCode op) // Builds a node of kind |op|; m_term is not initialized here.
+ : m_op(op)
+ , m_isDeadCode(false)
+ {
+ }
+
+ // The operation, as a YarrOpCode, and also a reference to the PatternTerm.
+ YarrOpCode m_op;
+ PatternTerm* m_term;
+
+ // For alternatives, this holds the PatternAlternative and doubly linked
+ // references to this alternative's siblings. In the case of the
+ // OpBodyAlternativeEnd node at the end of a section of repeating nodes,
+ // m_nextOp will reference the OpBodyAlternativeBegin node of the first
+ // repeating alternative. (Neither constructor initializes these fields.)
+ PatternAlternative* m_alternative;
+ size_t m_previousOp;
+ size_t m_nextOp;
+
+ // Used to record a set of Jumps out of the generated code, typically
+ // used for jumps out to backtracking code, and a single reentry back
+ // into the code for a node (likely where a backtrack will trigger
+ // rematching).
+ Label m_reentry;
+ JumpList m_jumps;
+
+ // Used for backtracking when the prior alternative did not consume any
+ // characters but matched.
+ Jump m_zeroLengthMatch;
+
+ // This flag is used to null out the second (and any later) pattern
+ // character when several are fused to be matched together.
+ bool m_isDeadCode;
+
+ // Currently used in the case of some of the more complex management of
+ // 'm_checked', to cache the offset used in this alternative, to avoid
+ // recalculating it. (Not initialized by either constructor.)
+ int m_checkAdjust;
+
+ // Used by OpNestedAlternativeNext/End to hold the pointer to the
+ // value that will be pushed into the pattern's frame to return to,
+ // upon backtracking back into the disjunction.
+ DataLabelPtr m_returnAddress;
+ };
+
+ // BacktrackingState
+ // Tracks, while backtracking code is being generated, every piece of control
+ // flow that still needs somewhere to backtrack to. Backtracking code is
+ // generated backwards through the m_ops Vector, so on entry to a node this
+ // state describes the outputs of the node generated just before it (the
+ // subsequent operation in the regular expression). The pending control flow
+ // takes three forms:
+ // - A jump list of jumps, to be linked to code that will backtrack them
+ //   further.
+ // - A set of DataLabelPtr values, to be populated with values to be
+ //   treated effectively as return addresses backtracking into complex
+ //   subpatterns.
+ // - A flag indicating that the current sequence of generated code up to
+ //   this point requires backtracking.
+ class BacktrackingState {
+ public:
+     BacktrackingState()
+         : m_pendingFallthrough(false)
+     {
+     }
+
+     // Register a single failure jump that needs a backtrack target.
+     void append(const Jump& jump)
+     {
+         m_laterFailures.append(jump);
+     }
+
+     // Register a whole list of failure jumps.
+     void append(JumpList& jumpList)
+     {
+         m_laterFailures.append(jumpList);
+     }
+
+     // Register a patchable return address awaiting its backtrack location.
+     void append(const DataLabelPtr& returnAddress)
+     {
+         m_pendingReturns.append(returnAddress);
+     }
+
+     // Flag that the current 'fallthrough' control flow requires backtracking.
+     void fallthrough()
+     {
+         ASSERT(!m_pendingFallthrough);
+         m_pendingFallthrough = true;
+     }
+
+     // The three methods below clear the backtracking state. All of them may
+     // need to generate code, and so are passed a pointer to the assembler.
+
+     // Resolve everything pending to the current location.
+     void link(MacroAssembler* assembler)
+     {
+         if (m_pendingReturns.size()) {
+             Label here(assembler);
+             recordPendingReturns(here);
+         }
+
+         m_laterFailures.link(assembler);
+         resetFailuresAndFallthrough();
+     }
+
+     // Resolve everything pending to |label|; a pending fallthrough becomes
+     // an explicit jump to it.
+     void linkTo(Label label, MacroAssembler* assembler)
+     {
+         if (m_pendingReturns.size())
+             recordPendingReturns(label);
+
+         if (m_pendingFallthrough)
+             assembler->jump(label);
+
+         m_laterFailures.linkTo(label, assembler);
+         resetFailuresAndFallthrough();
+     }
+
+     // Move everything pending into |jumpList|. Pending return addresses are
+     // bound to the current location, from which a jump is then added to
+     // |jumpList| (as is done for a pending fallthrough).
+     void takeBacktracksToJumpList(JumpList& jumpList, MacroAssembler* assembler)
+     {
+         if (m_pendingReturns.size()) {
+             Label here(assembler);
+             recordPendingReturns(here);
+             m_pendingFallthrough = true;
+         }
+
+         if (m_pendingFallthrough)
+             jumpList.append(assembler->jump());
+
+         jumpList.append(m_laterFailures);
+         resetFailuresAndFallthrough();
+     }
+
+     // True when no jumps, return addresses or fallthrough are pending.
+     bool isEmpty()
+     {
+         return m_laterFailures.empty() && m_pendingReturns.isEmpty() && !m_pendingFallthrough;
+     }
+
+     // Called at the end of code generation to link all return addresses.
+     void linkDataLabels(LinkBuffer& linkBuffer)
+     {
+         ASSERT(isEmpty());
+
+         const unsigned recordCount = m_backtrackRecords.size();
+         for (unsigned n = 0; n < recordCount; ++n) {
+             ReturnAddressRecord& record = m_backtrackRecords[n];
+             linkBuffer.patch(record.m_dataLabel, linkBuffer.locationOf(record.m_backtrackLocation));
+         }
+     }
+
+ private:
+     // Pairs a patchable return address with the label it must be set to.
+     struct ReturnAddressRecord {
+         ReturnAddressRecord(DataLabelPtr dataLabel, Label backtrackLocation)
+             : m_dataLabel(dataLabel)
+             , m_backtrackLocation(backtrackLocation)
+         {
+         }
+
+         DataLabelPtr m_dataLabel;
+         Label m_backtrackLocation;
+     };
+
+     // Binds every pending return address to |target| and empties the
+     // pending list.
+     void recordPendingReturns(Label target)
+     {
+         for (unsigned n = 0; n < m_pendingReturns.size(); ++n)
+             m_backtrackRecords.append(ReturnAddressRecord(m_pendingReturns[n], target));
+         m_pendingReturns.clear();
+     }
+
+     // Common tail of the clearing methods.
+     void resetFailuresAndFallthrough()
+     {
+         m_laterFailures.clear();
+         m_pendingFallthrough = false;
+     }
+
+     JumpList m_laterFailures;
+     bool m_pendingFallthrough;
+     Vector<DataLabelPtr, 4> m_pendingReturns;
+     Vector<ReturnAddressRecord, 4> m_backtrackRecords;
+ };
+
+ // Generation methods:
+ // ===================
+
+ // Default backtracking shared by many terms. Such terms leave the forwards
+ // matching path by jumping out on any failed condition and collect those
+ // jumps in the op's m_jumps list; when nothing more is needed, backtracking
+ // is just handing that list over to the backtracking state.
+ void backtrackTermDefault(size_t opIndex)
+ {
+     m_backtrackingState.append(m_ops[opIndex].m_jumps);
+ }
+
+ // Generates the matching code for a beginning-of-line assertion. Failures
+ // are appended to the op's m_jumps.
+ void generateAssertionBOL(size_t opIndex)
+ {
+     YarrOp& op = m_ops[opIndex];
+     PatternTerm* term = op.m_term;
+
+     if (!m_pattern.m_multiline) {
+         // Erk, really should poison out these alternatives early. :-/
+         if (!term->inputPosition)
+             op.m_jumps.append(branch32(NotEqual, index, Imm32(m_checked)));
+         else
+             op.m_jumps.append(jump());
+         return;
+     }
+
+     // Multiline: succeed when index equals m_checked (only tested when the
+     // term's inputPosition is zero) or when the preceding character is in
+     // the newline class.
+     const RegisterID previousChar = regT0;
+     JumpList assertionHolds;
+
+     if (!term->inputPosition)
+         assertionHolds.append(branch32(Equal, index, Imm32(m_checked)));
+
+     readCharacter((term->inputPosition - m_checked) - 1, previousChar);
+     matchCharacterClass(previousChar, assertionHolds, m_pattern.newlineCharacterClass());
+     op.m_jumps.append(jump());
+
+     assertionHolds.link(this);
+ }
+ // Backtracking for a beginning-of-line assertion: hands the op's m_jumps
+ // to the backtracking state.
+ void backtrackAssertionBOL(size_t opIndex)
+ {
+     m_backtrackingState.append(m_ops[opIndex].m_jumps);
+ }
+
+ // Generates the matching code for an end-of-line assertion. Failures are
+ // appended to the op's m_jumps.
+ void generateAssertionEOL(size_t opIndex)
+ {
+     YarrOp& op = m_ops[opIndex];
+     PatternTerm* term = op.m_term;
+
+     const bool termIsAtCheckedPosition = term->inputPosition == m_checked;
+
+     if (!m_pattern.m_multiline) {
+         // Erk, really should poison out these alternatives early. :-/
+         if (!termIsAtCheckedPosition)
+             op.m_jumps.append(jump());
+         else
+             op.m_jumps.append(notAtEndOfInput());
+         return;
+     }
+
+     // Multiline: succeed at the end of input (only tested when the term sits
+     // at m_checked) or when the character at the term's position is in the
+     // newline class.
+     const RegisterID nextChar = regT0;
+     JumpList assertionHolds;
+
+     if (termIsAtCheckedPosition)
+         assertionHolds.append(atEndOfInput());
+
+     readCharacter(term->inputPosition - m_checked, nextChar);
+     matchCharacterClass(nextChar, assertionHolds, m_pattern.newlineCharacterClass());
+     op.m_jumps.append(jump());
+
+     assertionHolds.link(this);
+ }
+ // Backtracking for an end-of-line assertion: hands the op's m_jumps to the
+ // backtracking state.
+ void backtrackAssertionEOL(size_t opIndex)
+ {
+     m_backtrackingState.append(m_ops[opIndex].m_jumps);
+ }
+
+ // Tests whether the character at the term's position is a word character.
+ // Jumps to |nextIsWordChar| when it is; jumps to |nextIsNotWordChar| at the
+ // end of input (only tested when the term sits at m_checked). Also falls
+ // through when the character is not a word character.
+ void matchAssertionWordchar(size_t opIndex, JumpList& nextIsWordChar, JumpList& nextIsNotWordChar)
+ {
+     PatternTerm* term = m_ops[opIndex].m_term;
+     const RegisterID nextChar = regT0;
+
+     const bool termIsAtCheckedPosition = term->inputPosition == m_checked;
+     if (termIsAtCheckedPosition)
+         nextIsNotWordChar.append(atEndOfInput());
+
+     readCharacter((term->inputPosition - m_checked), nextChar);
+     matchCharacterClass(nextChar, nextIsWordChar, m_pattern.wordcharCharacterClass());
+ }
+
+ // Generates the matching code for a word-boundary assertion, or for its
+ // inverse when term->invert() is set. The previous character is classified
+ // first, then the next character is classified on each of the two resulting
+ // paths. Failures are appended to the op's m_jumps; successes end up at the
+ // end of the emitted code.
+ void generateAssertionWordBoundary(size_t opIndex)
+ {
+     YarrOp& op = m_ops[opIndex];
+     PatternTerm* term = op.m_term;
+
+     const RegisterID scratchChar = regT0;
+     const bool mayBeAtStart = !term->inputPosition;
+
+     // Classify the previous character. When index equals m_checked there is
+     // no previous character; that case joins the 'not a wordchar' path.
+     Jump atBegin;
+     JumpList previousIsWordChar;
+     if (mayBeAtStart)
+         atBegin = branch32(Equal, index, Imm32(m_checked));
+     readCharacter((term->inputPosition - m_checked) - 1, scratchChar);
+     matchCharacterClass(scratchChar, previousIsWordChar, m_pattern.wordcharCharacterClass());
+     if (mayBeAtStart)
+         atBegin.link(this);
+
+     // We fall through to here if the last character was not a wordchar.
+     JumpList afterNonWordPass;
+     JumpList afterNonWordFail;
+     if (term->invert()) {
+         // Inverted: a following wordchar fails, anything else passes.
+         matchAssertionWordchar(opIndex, afterNonWordFail, afterNonWordPass);
+         afterNonWordPass.append(jump());
+     } else {
+         // A following wordchar passes, anything else fails.
+         matchAssertionWordchar(opIndex, afterNonWordPass, afterNonWordFail);
+         afterNonWordFail.append(jump());
+     }
+     op.m_jumps.append(afterNonWordFail);
+
+     // We jump here if the last character was a wordchar.
+     previousIsWordChar.link(this);
+     JumpList afterWordPass;
+     JumpList afterWordFail;
+     if (term->invert()) {
+         // Inverted: a following wordchar passes, anything else fails.
+         matchAssertionWordchar(opIndex, afterWordPass, afterWordFail);
+         afterWordFail.append(jump());
+     } else {
+         // A following wordchar fails; a non-wordchar passes.
+         matchAssertionWordchar(opIndex, afterWordFail, afterWordPass);
+         // This can fall through! The fall-through case is a pass.
+     }
+
+     op.m_jumps.append(afterWordFail);
+
+     afterNonWordPass.link(this);
+     afterWordPass.link(this);
+ }
+ // Backtracking for a word-boundary assertion: hands the op's m_jumps to
+ // the backtracking state.
+ void backtrackAssertionWordBoundary(size_t opIndex)
+ {
+     m_backtrackingState.append(m_ops[opIndex].m_jumps);
+ }
+
+ // Generates the match for a pattern character that must occur exactly
+ // once. Immediately following single-occurrence pattern characters at
+ // consecutive input positions are fused into the same compare (up to 4
+ // characters for 8-bit strings, 2 for 16-bit strings); the fused ops are
+ // flagged m_isDeadCode so that they generate nothing themselves. Failures
+ // are appended to this op's m_jumps.
+ void generatePatternCharacterOnce(size_t opIndex)
+ {
+     YarrOp& op = m_ops[opIndex];
+
+     // Already matched as part of an earlier fused compare.
+     if (op.m_isDeadCode)
+         return;
+
+     // m_ops always ends with a OpBodyAlternativeEnd or OpMatchFailed
+     // node, so there must always be at least one more node.
+     ASSERT(opIndex + 1 < m_ops.size());
+     YarrOp* followingOp = &m_ops[opIndex + 1];
+
+     PatternTerm* term = op.m_term;
+     UChar ch = term->patternCharacter;
+
+     if ((ch > 0xff) && (m_charSize == Char8)) {
+         // Have a 16 bit pattern character and an 8 bit string - short circuit
+         op.m_jumps.append(jump());
+         return;
+     }
+
+     const RegisterID character = regT0;
+     const int fuseLimit = m_charSize == Char8 ? 4 : 2;
+     const int bitsPerCharacter = m_charSize == Char8 ? 8 : 16;
+     unsigned caseFoldMask = 0;
+     int packedCharacters = ch;
+     int fusedCount;
+     int firstPosition = term->inputPosition;
+
+     // For case-insensitive compares, non-ASCII characters that have different
+     // upper & lower case representations are converted to a character class.
+     ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch));
+
+     if (m_pattern.m_ignoreCase && isASCIIAlpha(ch))
+         caseFoldMask |= 32;
+
+     // Pack as many eligible following characters as fit into one compare.
+     fusedCount = 1;
+     while (fusedCount < fuseLimit && followingOp->m_op == OpTerm) {
+         PatternTerm* followingTerm = followingOp->m_term;
+
+         const bool fusable = followingTerm->type == PatternTerm::TypePatternCharacter
+             && followingTerm->quantityType == QuantifierFixedCount
+             && followingTerm->quantityCount == 1
+             && followingTerm->inputPosition == (firstPosition + fusedCount);
+         if (!fusable)
+             break;
+
+         followingOp->m_isDeadCode = true;
+
+         int shiftAmount = bitsPerCharacter * fusedCount;
+
+         UChar followingCharacter = followingTerm->patternCharacter;
+
+         if ((followingCharacter > 0xff) && (m_charSize == Char8)) {
+             // Have a 16 bit pattern character and an 8 bit string - short circuit
+             op.m_jumps.append(jump());
+             return;
+         }
+
+         // For case-insensitive compares, non-ASCII characters that have different
+         // upper & lower case representations are converted to a character class.
+         ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(followingCharacter) || isCanonicallyUnique(followingCharacter));
+
+         packedCharacters |= (followingCharacter << shiftAmount);
+
+         if ((m_pattern.m_ignoreCase) && (isASCIIAlpha(followingCharacter)))
+             caseFoldMask |= 32 << shiftAmount;
+
+         ++fusedCount;
+         followingOp = &m_ops[opIndex + fusedCount];
+     }
+
+     if (m_charSize == Char8) {
+         switch (fusedCount) {
+         case 1:
+             op.m_jumps.append(jumpIfCharNotEquals(ch, firstPosition - m_checked, character));
+             return;
+         case 2: {
+             BaseIndex pairAddress(input, index, TimesOne, (firstPosition - m_checked) * sizeof(LChar));
+             load16Unaligned(pairAddress, character);
+             break;
+         }
+         case 3: {
+             // Compare the first two characters as a 16-bit unit, then the
+             // third one on its own.
+             BaseIndex pairAddress(input, index, TimesOne, (firstPosition - m_checked) * sizeof(LChar));
+             load16Unaligned(pairAddress, character);
+             if (caseFoldMask)
+                 or32(Imm32(caseFoldMask), character);
+             op.m_jumps.append(branch32(NotEqual, character, Imm32((packedCharacters & 0xffff) | caseFoldMask)));
+             op.m_jumps.append(jumpIfCharNotEquals(packedCharacters >> 16, firstPosition + 2 - m_checked, character));
+             return;
+         }
+         case 4: {
+             BaseIndex quadAddress(input, index, TimesOne, (firstPosition - m_checked) * sizeof(LChar));
+             load32WithUnalignedHalfWords(quadAddress, character);
+             break;
+         }
+         }
+     } else {
+         if (fusedCount == 1) {
+             op.m_jumps.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character));
+             return;
+         }
+         if (fusedCount == 2) {
+             BaseIndex pairAddress(input, index, TimesTwo, (term->inputPosition - m_checked) * sizeof(UChar));
+             load32WithUnalignedHalfWords(pairAddress, character);
+         }
+     }
+
+     // Shared tail: fold case where needed, then compare all packed
+     // characters at once.
+     if (caseFoldMask)
+         or32(Imm32(caseFoldMask), character);
+     op.m_jumps.append(branch32(NotEqual, character, Imm32(packedCharacters | caseFoldMask)));
+ }
+ // Backtracking for a once-only pattern character: hands the op's m_jumps
+ // to the backtracking state.
+ void backtrackPatternCharacterOnce(size_t opIndex)
+ {
+     m_backtrackingState.append(m_ops[opIndex].m_jumps);
+ }
+
+ // Generates a counted loop matching a pattern character repeated a fixed
+ // number of times. The counter starts at index - quantityCount and the loop
+ // runs until it equals index; a mismatch jumps out through the op's
+ // m_jumps.
+ void generatePatternCharacterFixed(size_t opIndex)
+ {
+     YarrOp& op = m_ops[opIndex];
+     PatternTerm* term = op.m_term;
+     UChar expected = term->patternCharacter;
+
+     const RegisterID character = regT0;
+     const RegisterID position = regT1;
+
+     move(index, position);
+     sub32(Imm32(term->quantityCount.unsafeGet()), position);
+
+     Label loopTop(this);
+
+     // Byte displacement that compensates for the counter having started
+     // quantityCount characters before index (overflow-checked).
+     const int bytesPerCharacter = static_cast<int>(m_charSize == Char8 ? sizeof(char) : sizeof(UChar));
+     BaseIndex address(input, position, m_charScale, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * bytesPerCharacter).unsafeGet());
+
+     if (m_charSize != Char8)
+         load16(address, character);
+     else
+         load8(address, character);
+
+     // For case-insensitive compares, non-ASCII characters that have different
+     // upper & lower case representations are converted to a character class.
+     ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(expected) || isCanonicallyUnique(expected));
+     const bool foldAsciiCase = m_pattern.m_ignoreCase && isASCIIAlpha(expected);
+     if (foldAsciiCase) {
+         or32(TrustedImm32(0x20), character);
+         expected |= 0x20;
+     }
+
+     op.m_jumps.append(branch32(NotEqual, character, Imm32(expected)));
+     add32(TrustedImm32(1), position);
+     branch32(NotEqual, position, index).linkTo(loopTop, this);
+ }
+ // Backtracking for a fixed-count pattern character: hands the op's m_jumps
+ // to the backtracking state.
+ void backtrackPatternCharacterFixed(size_t opIndex)
+ {
+     m_backtrackingState.append(m_ops[opIndex].m_jumps);
+ }
+
+ // Generates a greedy match of a pattern character: consumes matching
+ // characters (advancing index) until a mismatch, the end of input, or the
+ // quantifier's maximum is reached, then stores the match count in the
+ // term's frame slot. m_reentry marks the store, where backtracking rejoins.
+ void generatePatternCharacterGreedy(size_t opIndex)
+ {
+     YarrOp& op = m_ops[opIndex];
+     PatternTerm* term = op.m_term;
+     UChar ch = term->patternCharacter;
+
+     const RegisterID character = regT0;
+     const RegisterID matchCount = regT1;
+
+     move(TrustedImm32(0), matchCount);
+
+     // A 16 bit pattern character can never match in an 8 bit string, so in
+     // that case no loop is emitted and the count stays zero.
+     const bool canNeverMatch = (ch > 0xff) && (m_charSize == Char8);
+     if (!canNeverMatch) {
+         JumpList stopMatching;
+         Label loopTop(this);
+         stopMatching.append(atEndOfInput());
+         stopMatching.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character));
+
+         add32(TrustedImm32(1), matchCount);
+         add32(TrustedImm32(1), index);
+         if (term->quantityCount != quantifyInfinite)
+             branch32(NotEqual, matchCount, Imm32(term->quantityCount.unsafeGet())).linkTo(loopTop, this);
+         else
+             jump(loopTop);
+
+         stopMatching.link(this);
+     }
+     op.m_reentry = label();
+
+     storeToFrame(matchCount, term->frameLocation);
+ }
+ // Backtracking for a greedy pattern character: reloads the match count
+ // from the frame; when it is zero the failure propagates further back,
+ // otherwise one character is given back (count and index decremented) and
+ // matching resumes at the op's m_reentry.
+ void backtrackPatternCharacterGreedy(size_t opIndex)
+ {
+     YarrOp& op = m_ops[opIndex];
+     PatternTerm* term = op.m_term;
+
+     const RegisterID matchCount = regT1;
+
+     m_backtrackingState.link(this);
+
+     loadFromFrame(term->frameLocation, matchCount);
+
+     Jump nothingToGiveBack = branchTest32(Zero, matchCount);
+     m_backtrackingState.append(nothingToGiveBack);
+
+     sub32(TrustedImm32(1), matchCount);
+     sub32(TrustedImm32(1), index);
+     jump(op.m_reentry);
+ }
+
+ // Generates a non-greedy match of a pattern character: initially matches
+ // nothing, storing a zero count in the term's frame slot. m_reentry marks
+ // the store, where the backtracking code rejoins after matching one more.
+ void generatePatternCharacterNonGreedy(size_t opIndex)
+ {
+     YarrOp& op = m_ops[opIndex];
+     const RegisterID matchCount = regT1;
+
+     move(TrustedImm32(0), matchCount);
+     op.m_reentry = label();
+     storeToFrame(matchCount, op.m_term->frameLocation);
+ }
+ // Backtracking for a non-greedy pattern character: tries to match one more
+ // occurrence and, on success, resumes at the op's m_reentry. When no
+ // further occurrence can be matched, index is rewound by the count matched
+ // so far and the failure falls through to earlier backtracking code.
+ void backtrackPatternCharacterNonGreedy(size_t opIndex)
+ {
+     YarrOp& op = m_ops[opIndex];
+     PatternTerm* term = op.m_term;
+     UChar ch = term->patternCharacter;
+
+     const RegisterID character = regT0;
+     const RegisterID matchCount = regT1;
+
+     m_backtrackingState.link(this);
+
+     loadFromFrame(term->frameLocation, matchCount);
+
+     // A 16 bit pattern character can never match in an 8 bit string, so in
+     // that case no attempt to extend the match is emitted.
+     const bool canNeverMatch = (ch > 0xff) && (m_charSize == Char8);
+     if (!canNeverMatch) {
+         JumpList cannotExtend;
+         cannotExtend.append(atEndOfInput());
+         if (term->quantityCount != quantifyInfinite)
+             cannotExtend.append(branch32(Equal, matchCount, Imm32(term->quantityCount.unsafeGet())));
+         cannotExtend.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character));
+
+         add32(TrustedImm32(1), matchCount);
+         add32(TrustedImm32(1), index);
+
+         jump(op.m_reentry);
+         cannotExtend.link(this);
+     }
+
+     sub32(matchCount, index);
+     m_backtrackingState.fallthrough();
+ }
+
+ // Generates the match for a character class that must occur exactly once.
+ // For an inverted term a class hit is the failure; otherwise falling
+ // through the class test is the failure. Failures go to the op's m_jumps.
+ void generateCharacterClassOnce(size_t opIndex)
+ {
+     YarrOp& op = m_ops[opIndex];
+     PatternTerm* term = op.m_term;
+
+     const RegisterID character = regT0;
+
+     JumpList inClass;
+     readCharacter(term->inputPosition - m_checked, character);
+     matchCharacterClass(character, inClass, term->characterClass);
+
+     if (term->invert()) {
+         op.m_jumps.append(inClass);
+         return;
+     }
+
+     op.m_jumps.append(jump());
+     inClass.link(this);
+ }
+ // Backtracking for a once-only character class: hands the op's m_jumps to
+ // the backtracking state.
+ void backtrackCharacterClassOnce(size_t opIndex)
+ {
+     m_backtrackingState.append(m_ops[opIndex].m_jumps);
+ }
+
+ // Generates a counted loop matching a character class repeated a fixed
+ // number of times. The counter starts at index - quantityCount and the loop
+ // runs until it equals index; a failing character jumps out through the
+ // op's m_jumps.
+ void generateCharacterClassFixed(size_t opIndex)
+ {
+     YarrOp& op = m_ops[opIndex];
+     PatternTerm* term = op.m_term;
+
+     const RegisterID character = regT0;
+     const RegisterID position = regT1;
+
+     move(index, position);
+     sub32(Imm32(term->quantityCount.unsafeGet()), position);
+
+     Label loopTop(this);
+     JumpList inClass;
+
+     // The displacement compensates for the counter having started
+     // quantityCount characters before index (overflow-checked).
+     if (m_charSize != Char8)
+         load16(BaseIndex(input, position, TimesTwo, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(sizeof(UChar))).unsafeGet()), character);
+     else
+         load8(BaseIndex(input, position, TimesOne, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(sizeof(char))).unsafeGet()), character);
+     matchCharacterClass(character, inClass, term->characterClass);
+
+     if (!term->invert()) {
+         op.m_jumps.append(jump());
+         inClass.link(this);
+     } else
+         op.m_jumps.append(inClass);
+
+     add32(TrustedImm32(1), position);
+     branch32(NotEqual, position, index).linkTo(loopTop, this);
+ }
+ // Backtracking for a fixed-count character class: hands the op's m_jumps
+ // to the backtracking state.
+ void backtrackCharacterClassFixed(size_t opIndex)
+ {
+     m_backtrackingState.append(m_ops[opIndex].m_jumps);
+ }
+
+ // Generates a greedy match of a character class: consumes matching
+ // characters (advancing index) until one fails, the end of input is hit,
+ // or the quantifier's maximum is reached, then stores the match count in
+ // the term's frame slot. m_reentry marks the store, where backtracking
+ // rejoins.
+ void generateCharacterClassGreedy(size_t opIndex)
+ {
+     YarrOp& op = m_ops[opIndex];
+     PatternTerm* term = op.m_term;
+
+     const RegisterID character = regT0;
+     const RegisterID matchCount = regT1;
+
+     move(TrustedImm32(0), matchCount);
+
+     JumpList stopMatching;
+     Label loopTop(this);
+     stopMatching.append(atEndOfInput());
+
+     readCharacter(term->inputPosition - m_checked, character);
+     if (!term->invert()) {
+         // A class hit continues the loop; falling through stops it.
+         JumpList inClass;
+         matchCharacterClass(character, inClass, term->characterClass);
+         stopMatching.append(jump());
+         inClass.link(this);
+     } else {
+         // Inverted term: a class hit stops the loop.
+         matchCharacterClass(character, stopMatching, term->characterClass);
+     }
+
+     add32(TrustedImm32(1), matchCount);
+     add32(TrustedImm32(1), index);
+     if (term->quantityCount == quantifyInfinite)
+         jump(loopTop);
+     else {
+         branch32(NotEqual, matchCount, Imm32(term->quantityCount.unsafeGet())).linkTo(loopTop, this);
+         stopMatching.append(jump());
+     }
+
+     stopMatching.link(this);
+     op.m_reentry = label();
+
+     storeToFrame(matchCount, term->frameLocation);
+ }
+ // Backtracking for a greedy character class: reloads the match count from
+ // the frame; when it is zero the failure propagates further back,
+ // otherwise one character is given back (count and index decremented) and
+ // matching resumes at the op's m_reentry.
+ void backtrackCharacterClassGreedy(size_t opIndex)
+ {
+     YarrOp& op = m_ops[opIndex];
+     PatternTerm* term = op.m_term;
+
+     const RegisterID matchCount = regT1;
+
+     m_backtrackingState.link(this);
+
+     loadFromFrame(term->frameLocation, matchCount);
+
+     Jump nothingToGiveBack = branchTest32(Zero, matchCount);
+     m_backtrackingState.append(nothingToGiveBack);
+
+     sub32(TrustedImm32(1), matchCount);
+     sub32(TrustedImm32(1), index);
+     jump(op.m_reentry);
+ }
+
+ // Generates a non-greedy match of a character class: initially matches
+ // nothing, storing a zero count in the term's frame slot. m_reentry marks
+ // the store, where the backtracking code rejoins after matching one more.
+ void generateCharacterClassNonGreedy(size_t opIndex)
+ {
+     YarrOp& op = m_ops[opIndex];
+     const RegisterID matchCount = regT1;
+
+     move(TrustedImm32(0), matchCount);
+     op.m_reentry = label();
+     storeToFrame(matchCount, op.m_term->frameLocation);
+ }
+ // Backtracking for a non-greedy character class: tries to match one more
+ // character and, on success, resumes at the op's m_reentry. When no
+ // further character can be matched, index is rewound by the count matched
+ // so far and the failure falls through to earlier backtracking code.
+ void backtrackCharacterClassNonGreedy(size_t opIndex)
+ {
+     YarrOp& op = m_ops[opIndex];
+     PatternTerm* term = op.m_term;
+
+     const RegisterID character = regT0;
+     const RegisterID matchCount = regT1;
+
+     JumpList cannotExtend;
+
+     m_backtrackingState.link(this);
+
+     loadFromFrame(term->frameLocation, matchCount);
+
+     // Give up at the end of input or once the maximum count is reached.
+     cannotExtend.append(atEndOfInput());
+     cannotExtend.append(branch32(Equal, matchCount, Imm32(term->quantityCount.unsafeGet())));
+
+     JumpList inClass;
+     readCharacter(term->inputPosition - m_checked, character);
+     matchCharacterClass(character, inClass, term->characterClass);
+
+     if (!term->invert()) {
+         cannotExtend.append(jump());
+         inClass.link(this);
+     } else
+         cannotExtend.append(inClass);
+
+     add32(TrustedImm32(1), matchCount);
+     add32(TrustedImm32(1), index);
+
+     jump(op.m_reentry);
+
+     cannotExtend.link(this);
+     sub32(matchCount, index);
+     m_backtrackingState.fallthrough();
+ }
+
+ // Generates the code for a dot-star enclosure term: widens the current
+ // match to the whole enclosing line. The recorded match start is moved
+ // back to just after the preceding newline (or to 0) and index is moved
+ // forward to the next newline (or to length). When the pattern is not
+ // multiline, the term's bol/eol anchors additionally require those
+ // positions to be 0 / length; violations are appended to the op's m_jumps.
+ void generateDotStarEnclosure(size_t opIndex)
+ {
+     YarrOp& op = m_ops[opIndex];
+     PatternTerm* term = op.m_term;
+
+     const RegisterID character = regT0;
+     const RegisterID cursor = regT1;
+
+     JumpList hitPrecedingNewline;
+     JumpList startFound;
+     JumpList endFound;
+
+     ASSERT(!m_pattern.m_body->m_hasFixedSize);
+     getMatchStart(cursor);
+
+     // Scan backwards from the match start for a newline or position 0.
+     startFound.append(branchTest32(Zero, cursor));
+     Label scanBackwards(this);
+     sub32(TrustedImm32(1), cursor);
+     if (m_charSize != Char8)
+         load16(BaseIndex(input, cursor, TimesTwo, 0), character);
+     else
+         load8(BaseIndex(input, cursor, TimesOne, 0), character);
+     matchCharacterClass(character, hitPrecedingNewline, m_pattern.newlineCharacterClass());
+     branchTest32(NonZero, cursor).linkTo(scanBackwards, this);
+     startFound.append(jump());
+
+     hitPrecedingNewline.link(this);
+     add32(TrustedImm32(1), cursor); // Advance past newline
+     startFound.link(this);
+
+     const bool requireStartOfInput = !m_pattern.m_multiline && term->anchors.bolAnchor;
+     if (requireStartOfInput)
+         op.m_jumps.append(branchTest32(NonZero, cursor));
+
+     ASSERT(!m_pattern.m_body->m_hasFixedSize);
+     setMatchStart(cursor);
+
+     // Scan forwards from index for a newline or the end of the input.
+     move(index, cursor);
+
+     Label scanForwards(this);
+     endFound.append(branch32(Equal, cursor, length));
+     if (m_charSize != Char8)
+         load16(BaseIndex(input, cursor, TimesTwo, 0), character);
+     else
+         load8(BaseIndex(input, cursor, TimesOne, 0), character);
+     matchCharacterClass(character, endFound, m_pattern.newlineCharacterClass());
+     add32(TrustedImm32(1), cursor);
+     jump(scanForwards);
+
+     endFound.link(this);
+
+     const bool requireEndOfInput = !m_pattern.m_multiline && term->anchors.eolAnchor;
+     if (requireEndOfInput)
+         op.m_jumps.append(branch32(NotEqual, cursor, length));
+
+     move(cursor, index);
+ }
+
+ // Backtracking for a dot-star enclosure: hands the op's m_jumps to the
+ // backtracking state.
+ void backtrackDotStarEnclosure(size_t opIndex)
+ {
+     m_backtrackingState.append(m_ops[opIndex].m_jumps);
+ }
+
+ // Code generation for simple terms (pattern characters, character classes,
+ // and assertions). Dispatches on the term's type, and for characters and
+ // classes on its quantifier, to the generate* functions above. Back
+ // references are not generated; they set m_shouldFallBack instead.
+ void generateTerm(size_t opIndex)
+ {
+     PatternTerm* term = m_ops[opIndex].m_term;
+
+     switch (term->type) {
+     case PatternTerm::TypePatternCharacter:
+         if (term->quantityType == QuantifierFixedCount) {
+             if (term->quantityCount == 1)
+                 generatePatternCharacterOnce(opIndex);
+             else
+                 generatePatternCharacterFixed(opIndex);
+         } else if (term->quantityType == QuantifierGreedy)
+             generatePatternCharacterGreedy(opIndex);
+         else if (term->quantityType == QuantifierNonGreedy)
+             generatePatternCharacterNonGreedy(opIndex);
+         break;
+
+     case PatternTerm::TypeCharacterClass:
+         if (term->quantityType == QuantifierFixedCount) {
+             if (term->quantityCount == 1)
+                 generateCharacterClassOnce(opIndex);
+             else
+                 generateCharacterClassFixed(opIndex);
+         } else if (term->quantityType == QuantifierGreedy)
+             generateCharacterClassGreedy(opIndex);
+         else if (term->quantityType == QuantifierNonGreedy)
+             generateCharacterClassNonGreedy(opIndex);
+         break;
+
+     case PatternTerm::TypeAssertionBOL:
+         generateAssertionBOL(opIndex);
+         break;
+
+     case PatternTerm::TypeAssertionEOL:
+         generateAssertionEOL(opIndex);
+         break;
+
+     case PatternTerm::TypeAssertionWordBoundary:
+         generateAssertionWordBoundary(opIndex);
+         break;
+
+     case PatternTerm::TypeForwardReference:
+         // Nothing is generated for forward references.
+         break;
+
+     case PatternTerm::TypeDotStarEnclosure:
+         generateDotStarEnclosure(opIndex);
+         break;
+
+     case PatternTerm::TypeParenthesesSubpattern:
+     case PatternTerm::TypeParentheticalAssertion:
+         ASSERT_NOT_REACHED();
+         // No break: continues into the back-reference case below.
+     case PatternTerm::TypeBackReference:
+         m_shouldFallBack = true;
+         break;
+     }
+ }
+ void backtrackTerm(size_t opIndex)
+ {
+ // Emit the backtracking code for the simple term at opIndex, dispatching on its type and quantifier.
+ YarrOp& op = m_ops[opIndex];
+ PatternTerm* term = op.m_term;
+ switch (term->type) {
+ case PatternTerm::TypePatternCharacter:
+ switch (term->quantityType) {
+ case QuantifierFixedCount:
+ if (term->quantityCount == 1)
+ backtrackPatternCharacterOnce(opIndex);
+ else
+ backtrackPatternCharacterFixed(opIndex);
+ break;
+ case QuantifierGreedy:
+ backtrackPatternCharacterGreedy(opIndex);
+ break;
+ case QuantifierNonGreedy:
+ backtrackPatternCharacterNonGreedy(opIndex);
+ break;
+ }
+ break;
+
+ case PatternTerm::TypeCharacterClass:
+ switch (term->quantityType) {
+ case QuantifierFixedCount:
+ if (term->quantityCount == 1)
+ backtrackCharacterClassOnce(opIndex);
+ else
+ backtrackCharacterClassFixed(opIndex);
+ break;
+ case QuantifierGreedy:
+ backtrackCharacterClassGreedy(opIndex);
+ break;
+ case QuantifierNonGreedy:
+ backtrackCharacterClassNonGreedy(opIndex);
+ break;
+ }
+ break;
+
+ case PatternTerm::TypeAssertionBOL:
+ backtrackAssertionBOL(opIndex);
+ break;
+
+ case PatternTerm::TypeAssertionEOL:
+ backtrackAssertionEOL(opIndex);
+ break;
+
+ case PatternTerm::TypeAssertionWordBoundary:
+ backtrackAssertionWordBoundary(opIndex);
+ break;
+
+ case PatternTerm::TypeForwardReference:
+ break;
+
+ case PatternTerm::TypeDotStarEnclosure:
+ backtrackDotStarEnclosure(opIndex);
+ break;
+
+ case PatternTerm::TypeParenthesesSubpattern:
+ case PatternTerm::TypeParentheticalAssertion:
+ ASSERT_NOT_REACHED();
+ // Deliberate fall through: if this is ever reached, request fallback (m_shouldFallBack) exactly as generateTerm() does.
+ case PatternTerm::TypeBackReference:
+ m_shouldFallBack = true;
+ break;
+ }
+ }
+
+ void generate()
+ {
+ // Forwards generate the matching code: visit every op in m_ops in order, first to last.
+ ASSERT(m_ops.size());
+ size_t opIndex = 0;
+
+ do {
+ YarrOp& op = m_ops[opIndex];
+ switch (op.m_op) {
+
+ case OpTerm:
+ generateTerm(opIndex);
+ break;
+
+ // OpBodyAlternativeBegin/Next/End
+ //
+ // These nodes wrap the set of alternatives in the body of the regular expression.
+ // There may be either one or two chains of OpBodyAlternative nodes, one representing
+ // the 'once through' sequence of alternatives (if any exist), and one representing
+ // the repeating alternatives (again, if any exist).
+ //
+ // Upon normal entry to the Begin alternative, we will check that input is available.
+ // Reentry to the Begin alternative will take place after the check has taken place,
+ // and will assume that the input position has already been progressed as appropriate.
+ //
+ // Entry to subsequent Next/End alternatives occurs when the prior alternative has
+ // successfully completed a match - return a success state from JIT code.
+ //
+ // Next alternatives allow for reentry optimized to suit backtracking from its
+ // preceding alternative. It expects the input position to still be set to a position
+ // appropriate to its predecessor, and it will only perform an input check if the
+ // predecessor had a minimum size less than its own.
+ //
+ // In the case of 'once through' expressions, the End node will also have a reentry
+ // point to jump to when the last alternative fails. Again, this expects the input
+ // position to still reflect that expected by the prior alternative.
+ case OpBodyAlternativeBegin: {
+ PatternAlternative* alternative = op.m_alternative;
+
+ // Upon entry at the head of the set of alternatives, check if input is available
+ // to run the first alternative. (This progresses the input position).
+ op.m_jumps.append(jumpIfNoAvailableInput(alternative->m_minimumSize));
+ // We will reenter after the check, and assume the input position to have been
+ // set as appropriate to this alternative.
+ op.m_reentry = label();
+
+ m_checked += alternative->m_minimumSize;
+ break;
+ }
+ case OpBodyAlternativeNext:
+ case OpBodyAlternativeEnd: {
+ PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative;
+ PatternAlternative* alternative = op.m_alternative;
+
+ // If we get here, the prior alternative matched - return success.
+
+ // Adjust the stack pointer to remove the pattern's frame.
+ removeCallFrame();
+
+ // Load appropriate values into the return register and the first output
+ // slot, and return. In the case of pattern with a fixed size, we will
+ // not have yet set the value in the first output slot.
+ ASSERT(index != returnRegister);
+ if (m_pattern.m_body->m_hasFixedSize) {
+ move(index, returnRegister);
+ if (priorAlternative->m_minimumSize)
+ sub32(Imm32(priorAlternative->m_minimumSize), returnRegister);
+ if (compileMode == IncludeSubpatterns)
+ store32(returnRegister, output);
+ } else
+ getMatchStart(returnRegister);
+ if (compileMode == IncludeSubpatterns)
+ store32(index, Address(output, 4));
+ move(index, returnRegister2);
+
+ generateReturn();
+
+ // This is the divide between the tail of the prior alternative, above, and
+ // the head of the subsequent alternative, below.
+
+ if (op.m_op == OpBodyAlternativeNext) {
+ // This is the reentry point for the Next alternative. We expect any code
+ // that jumps here to do so with the input position matching that of the
+ // PRIOR alternative, and we will only check input availability if we
+ // need to progress it forwards.
+ op.m_reentry = label();
+ if (alternative->m_minimumSize > priorAlternative->m_minimumSize) {
+ add32(Imm32(alternative->m_minimumSize - priorAlternative->m_minimumSize), index);
+ op.m_jumps.append(jumpIfNoAvailableInput());
+ } else if (priorAlternative->m_minimumSize > alternative->m_minimumSize)
+ sub32(Imm32(priorAlternative->m_minimumSize - alternative->m_minimumSize), index);
+ } else if (op.m_nextOp == notFound) {
+ // This is the reentry point for the End of 'once through' alternatives,
+ // jumped to when the last alternative fails to match.
+ op.m_reentry = label();
+ sub32(Imm32(priorAlternative->m_minimumSize), index);
+ }
+
+ if (op.m_op == OpBodyAlternativeNext)
+ m_checked += alternative->m_minimumSize;
+ m_checked -= priorAlternative->m_minimumSize;
+ break;
+ }
+
+ // OpSimpleNestedAlternativeBegin/Next/End
+ // OpNestedAlternativeBegin/Next/End
+ //
+ // These nodes are used to handle sets of alternatives that are nested within
+ // subpatterns and parenthetical assertions. The 'simple' forms are used where
+ // we do not need to be able to backtrack back into any alternative other than
+ // the last, the normal forms allow backtracking into any alternative.
+ //
+ // Each Begin/Next node is responsible for planting an input check to ensure
+ // sufficient input is available on entry. Next nodes additionally need to
+ // jump to the end - Next nodes use the End node's m_jumps list to hold this
+ // set of jumps.
+ //
+ // In the non-simple forms, successful alternative matches must store a
+ // 'return address' using a DataLabelPtr, used to store the address to jump
+ // to when backtracking, to get to the code for the appropriate alternative.
+ case OpSimpleNestedAlternativeBegin:
+ case OpNestedAlternativeBegin: {
+ PatternTerm* term = op.m_term;
+ PatternAlternative* alternative = op.m_alternative;
+ PatternDisjunction* disjunction = term->parentheses.disjunction;
+
+ // Calculate how much input we need to check for, and if non-zero check.
+ op.m_checkAdjust = alternative->m_minimumSize;
+ if ((term->quantityType == QuantifierFixedCount) && (term->type != PatternTerm::TypeParentheticalAssertion))
+ op.m_checkAdjust -= disjunction->m_minimumSize;
+ if (op.m_checkAdjust)
+ op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust));
+
+ m_checked += op.m_checkAdjust;
+ break;
+ }
+ case OpSimpleNestedAlternativeNext:
+ case OpNestedAlternativeNext: {
+ PatternTerm* term = op.m_term;
+ PatternAlternative* alternative = op.m_alternative;
+ PatternDisjunction* disjunction = term->parentheses.disjunction;
+
+ // In the non-simple case, store a 'return address' so we can backtrack correctly.
+ if (op.m_op == OpNestedAlternativeNext) {
+ unsigned parenthesesFrameLocation = term->frameLocation;
+ unsigned alternativeFrameLocation = parenthesesFrameLocation;
+ if (term->quantityType != QuantifierFixedCount)
+ alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
+ op.m_returnAddress = storeToFrameWithPatch(alternativeFrameLocation);
+ }
+
+ if (term->quantityType != QuantifierFixedCount && !m_ops[op.m_previousOp].m_alternative->m_minimumSize) {
+ // If the previous alternative matched without consuming characters then
+ // backtrack to try to match while consuming some input.
+ op.m_zeroLengthMatch = branch32(Equal, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*)));
+ }
+
+ // If we reach here then the last alternative has matched - jump to the
+ // End node, to skip over any further alternatives.
+ //
+ // FIXME: this is logically O(N^2) (though N can be expected to be very
+ // small). We could avoid this either by adding an extra jump to the JIT
+ // data structures, or by making backtracking code that jumps to Next
+ // alternatives responsible for checking that input is available (if
+ // we didn't need to plant the input checks, then m_jumps would be free).
+ YarrOp* endOp = &m_ops[op.m_nextOp];
+ while (endOp->m_nextOp != notFound) {
+ ASSERT(endOp->m_op == OpSimpleNestedAlternativeNext || endOp->m_op == OpNestedAlternativeNext);
+ endOp = &m_ops[endOp->m_nextOp];
+ }
+ ASSERT(endOp->m_op == OpSimpleNestedAlternativeEnd || endOp->m_op == OpNestedAlternativeEnd);
+ endOp->m_jumps.append(jump());
+
+ // This is the entry point for the next alternative.
+ op.m_reentry = label();
+
+ // Calculate how much input we need to check for, and if non-zero check.
+ op.m_checkAdjust = alternative->m_minimumSize;
+ if ((term->quantityType == QuantifierFixedCount) && (term->type != PatternTerm::TypeParentheticalAssertion))
+ op.m_checkAdjust -= disjunction->m_minimumSize;
+ if (op.m_checkAdjust)
+ op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust));
+
+ YarrOp& lastOp = m_ops[op.m_previousOp];
+ m_checked -= lastOp.m_checkAdjust;
+ m_checked += op.m_checkAdjust;
+ break;
+ }
+ case OpSimpleNestedAlternativeEnd:
+ case OpNestedAlternativeEnd: {
+ PatternTerm* term = op.m_term;
+
+ // In the non-simple case, store a 'return address' so we can backtrack correctly.
+ if (op.m_op == OpNestedAlternativeEnd) {
+ unsigned parenthesesFrameLocation = term->frameLocation;
+ unsigned alternativeFrameLocation = parenthesesFrameLocation;
+ if (term->quantityType != QuantifierFixedCount)
+ alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
+ op.m_returnAddress = storeToFrameWithPatch(alternativeFrameLocation);
+ }
+
+ if (term->quantityType != QuantifierFixedCount && !m_ops[op.m_previousOp].m_alternative->m_minimumSize) {
+ // If the previous alternative matched without consuming characters then
+ // backtrack to try to match while consuming some input.
+ op.m_zeroLengthMatch = branch32(Equal, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*)));
+ }
+
+ // If this set of alternatives contains more than one alternative,
+ // then the Next nodes will have planted jumps to the End, and added
+ // them to this node's m_jumps list.
+ op.m_jumps.link(this);
+ op.m_jumps.clear();
+
+ YarrOp& lastOp = m_ops[op.m_previousOp];
+ m_checked -= lastOp.m_checkAdjust;
+ break;
+ }
+
+ // OpParenthesesSubpatternOnceBegin/End
+ //
+ // These nodes support (optionally) capturing subpatterns, that have a
+ // quantity count of 1 (this covers fixed once, and ?/?? quantifiers).
+ case OpParenthesesSubpatternOnceBegin: {
+ PatternTerm* term = op.m_term;
+ unsigned parenthesesFrameLocation = term->frameLocation;
+ const RegisterID indexTemporary = regT0;
+ ASSERT(term->quantityCount == 1);
+
+ // Upon entry to a Greedy quantified set of parentheses, store the index.
+ // We'll use this for two purposes:
+ // - To indicate which iteration we are on of matching the remainder of
+ // the expression after the parentheses - the first, including the
+ // match within the parentheses, or the second having skipped over them.
+ // - To check for empty matches, which must be rejected.
+ //
+ // At the head of a NonGreedy set of parentheses we'll immediately set the
+ // value on the stack to -1 (indicating a match skipping the subpattern),
+ // and plant a jump to the end. We'll also plant a label to backtrack to
+ // to reenter the subpattern later, with a store to set up index on the
+ // second iteration.
+ //
+ // FIXME: for capturing parens, could use the index in the capture array?
+ if (term->quantityType == QuantifierGreedy)
+ storeToFrame(index, parenthesesFrameLocation);
+ else if (term->quantityType == QuantifierNonGreedy) {
+ storeToFrame(TrustedImm32(-1), parenthesesFrameLocation);
+ op.m_jumps.append(jump());
+ op.m_reentry = label();
+ storeToFrame(index, parenthesesFrameLocation);
+ }
+
+ // If the parentheses are capturing, store the starting index value to the
+ // captures array, offsetting as necessary.
+ //
+ // FIXME: could avoid offsetting this value in JIT code, apply
+ // offsets only afterwards, at the point the results array is
+ // being accessed.
+ if (term->capture() && compileMode == IncludeSubpatterns) {
+ int inputOffset = term->inputPosition - m_checked;
+ if (term->quantityType == QuantifierFixedCount)
+ inputOffset -= term->parentheses.disjunction->m_minimumSize;
+ if (inputOffset) {
+ move(index, indexTemporary);
+ add32(Imm32(inputOffset), indexTemporary);
+ setSubpatternStart(indexTemporary, term->parentheses.subpatternId);
+ } else
+ setSubpatternStart(index, term->parentheses.subpatternId);
+ }
+ break;
+ }
+ case OpParenthesesSubpatternOnceEnd: {
+ PatternTerm* term = op.m_term;
+ const RegisterID indexTemporary = regT0;
+ ASSERT(term->quantityCount == 1);
+
+#ifndef NDEBUG
+ // Runtime ASSERT to make sure that the nested alternative handled the
+ // "no input consumed" check.
+ if (term->quantityType != QuantifierFixedCount && !term->parentheses.disjunction->m_minimumSize) {
+ Jump pastBreakpoint;
+ pastBreakpoint = branch32(NotEqual, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*)));
+ breakpoint();
+ pastBreakpoint.link(this);
+ }
+#endif
+
+ // If the parentheses are capturing, store the ending index value to the
+ // captures array, offsetting as necessary.
+ //
+ // FIXME: could avoid offsetting this value in JIT code, apply
+ // offsets only afterwards, at the point the results array is
+ // being accessed.
+ if (term->capture() && compileMode == IncludeSubpatterns) {
+ int inputOffset = term->inputPosition - m_checked;
+ if (inputOffset) {
+ move(index, indexTemporary);
+ add32(Imm32(inputOffset), indexTemporary);
+ setSubpatternEnd(indexTemporary, term->parentheses.subpatternId);
+ } else
+ setSubpatternEnd(index, term->parentheses.subpatternId);
+ }
+
+ // If the parentheses are quantified Greedy then add a label to jump back
+ // to if we get a failed match from after the parentheses. For NonGreedy
+ // parentheses, link the jump from before the subpattern to here.
+ if (term->quantityType == QuantifierGreedy)
+ op.m_reentry = label();
+ else if (term->quantityType == QuantifierNonGreedy) {
+ YarrOp& beginOp = m_ops[op.m_previousOp];
+ beginOp.m_jumps.link(this);
+ }
+ break;
+ }
+
+ // OpParenthesesSubpatternTerminalBegin/End
+ case OpParenthesesSubpatternTerminalBegin: {
+ PatternTerm* term = op.m_term;
+ ASSERT(term->quantityType == QuantifierGreedy);
+ ASSERT(term->quantityCount == quantifyInfinite);
+ ASSERT(!term->capture());
+
+ // Upon entry set a label to loop back to.
+ op.m_reentry = label();
+
+ // Store the start index of the current match; we need to reject zero
+ // length matches.
+ storeToFrame(index, term->frameLocation);
+ break;
+ }
+ case OpParenthesesSubpatternTerminalEnd: {
+ YarrOp& beginOp = m_ops[op.m_previousOp];
+#ifndef NDEBUG
+ PatternTerm* term = op.m_term;
+
+ // Runtime ASSERT to make sure that the nested alternative handled the
+ // "no input consumed" check.
+ Jump pastBreakpoint;
+ pastBreakpoint = branch32(NotEqual, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*)));
+ breakpoint();
+ pastBreakpoint.link(this);
+#endif
+
+ // We know that the match is non-zero, we can accept it and
+ // loop back up to the head of the subpattern.
+ jump(beginOp.m_reentry);
+
+ // This is the entry point to jump to when we stop matching - we will
+ // do so once the subpattern cannot match any more.
+ op.m_reentry = label();
+ break;
+ }
+
+ // OpParentheticalAssertionBegin/End
+ case OpParentheticalAssertionBegin: {
+ PatternTerm* term = op.m_term;
+
+ // Store the current index - assertions should not update index, so
+ // we will need to restore it upon a successful match.
+ unsigned parenthesesFrameLocation = term->frameLocation;
+ storeToFrame(index, parenthesesFrameLocation);
+
+ // Step index (and m_checked) back by the distance between m_checked and the term's inputPosition.
+ op.m_checkAdjust = m_checked - term->inputPosition;
+ if (op.m_checkAdjust)
+ sub32(Imm32(op.m_checkAdjust), index);
+
+ m_checked -= op.m_checkAdjust;
+ break;
+ }
+ case OpParentheticalAssertionEnd: {
+ PatternTerm* term = op.m_term;
+
+ // Restore the input index value.
+ unsigned parenthesesFrameLocation = term->frameLocation;
+ loadFromFrame(parenthesesFrameLocation, index);
+
+ // If inverted, a successful match of the assertion must be treated
+ // as a failure, so jump to backtracking.
+ if (term->invert()) {
+ op.m_jumps.append(jump());
+ op.m_reentry = label();
+ }
+
+ YarrOp& lastOp = m_ops[op.m_previousOp];
+ m_checked += lastOp.m_checkAdjust;
+ break;
+ }
+
+ case OpMatchFailed:
+ removeCallFrame();
+ move(TrustedImmPtr((void*)WTF::notFound), returnRegister);
+ move(TrustedImm32(0), returnRegister2);
+ generateReturn();
+ break;
+ }
+
+ ++opIndex;
+ } while (opIndex < m_ops.size());
+ }
+
+ void backtrack()
+ {
+ // Backwards generate the backtracking code.
+ size_t opIndex = m_ops.size();
+ ASSERT(opIndex);
+
+ do {
+ --opIndex;
+ YarrOp& op = m_ops[opIndex];
+ switch (op.m_op) {
+
+ case OpTerm:
+ backtrackTerm(opIndex);
+ break;
+
+ // OpBodyAlternativeBegin/Next/End
+ //
+ // For each Begin/Next node representing an alternative, we need to decide what to do
+ // in two circumstances:
+ // - If we backtrack back into this node, from within the alternative.
+ // - If the input check at the head of the alternative fails (if this exists).
+ //
+ // We treat these two cases differently since in the former case we have slightly
+ // more information - since we are backtracking out of a prior alternative we know
+ // that at least enough input was available to run it. For example, given the regular
+ // expression /a|b/, if we backtrack out of the first alternative (a failed pattern
+ // character match of 'a'), then we need not perform an additional input availability
+ // check before running the second alternative.
+ //
+ // Backtracking required differs for the last alternative, which in the case of the
+ // repeating set of alternatives must loop. The code generated for the last alternative
+ // will also be used to handle all input check failures from any prior alternatives -
+ // these require similar functionality, in seeking the next available alternative for
+ // which there is sufficient input.
+ //
+ // Since backtracking of all other alternatives simply requires us to link backtracks
+ // to the reentry point for the subsequent alternative, we will only be generating any
+ // code when backtracking the last alternative.
+ case OpBodyAlternativeBegin:
+ case OpBodyAlternativeNext: {
+ PatternAlternative* alternative = op.m_alternative;
+
+ if (op.m_op == OpBodyAlternativeNext) {
+ PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative;
+ m_checked += priorAlternative->m_minimumSize;
+ }
+ m_checked -= alternative->m_minimumSize;
+
+ // Is this the last alternative? If not, then if we backtrack to this point we just
+ // need to jump to try to match the next alternative.
+ if (m_ops[op.m_nextOp].m_op != OpBodyAlternativeEnd) {
+ m_backtrackingState.linkTo(m_ops[op.m_nextOp].m_reentry, this);
+ break;
+ }
+ YarrOp& endOp = m_ops[op.m_nextOp];
+
+ YarrOp* beginOp = &op;
+ while (beginOp->m_op != OpBodyAlternativeBegin) {
+ ASSERT(beginOp->m_op == OpBodyAlternativeNext);
+ beginOp = &m_ops[beginOp->m_previousOp];
+ }
+
+ bool onceThrough = endOp.m_nextOp == notFound;
+
+ // First, generate code to handle cases where we backtrack out of an attempted match
+ // of the last alternative. If this is a 'once through' set of alternatives then we
+ // have nothing to do - link this straight through to the End.
+ if (onceThrough)
+ m_backtrackingState.linkTo(endOp.m_reentry, this);
+ else {
+ // If we don't need to move the input position, and the pattern has a fixed size
+ // (in which case we omit the store of the start index until the pattern has matched)
+ // then we can just link the backtrack out of the last alternative straight to the
+ // head of the first alternative.
+ if (m_pattern.m_body->m_hasFixedSize
+ && (alternative->m_minimumSize > beginOp->m_alternative->m_minimumSize)
+ && (alternative->m_minimumSize - beginOp->m_alternative->m_minimumSize == 1))
+ m_backtrackingState.linkTo(beginOp->m_reentry, this);
+ else {
+ // We need to generate a trampoline of code to execute before looping back
+ // around to the first alternative.
+ m_backtrackingState.link(this);
+
+ // If the pattern size is not fixed, then store the start index, for use if we match.
+ if (!m_pattern.m_body->m_hasFixedSize) {
+ if (alternative->m_minimumSize == 1)
+ setMatchStart(index);
+ else {
+ move(index, regT0);
+ if (alternative->m_minimumSize)
+ sub32(Imm32(alternative->m_minimumSize - 1), regT0);
+ else
+ add32(TrustedImm32(1), regT0);
+ setMatchStart(regT0);
+ }
+ }
+
+ // Generate code to loop. Check whether the last alternative is longer than the
+ // first (e.g. /a|xy/ or /a|xyz/).
+ if (alternative->m_minimumSize > beginOp->m_alternative->m_minimumSize) {
+ // We want to loop, and increment input position. If the delta is 1, it is
+ // already correctly incremented, if more than one then decrement as appropriate.
+ unsigned delta = alternative->m_minimumSize - beginOp->m_alternative->m_minimumSize;
+ ASSERT(delta);
+ if (delta != 1)
+ sub32(Imm32(delta - 1), index);
+ jump(beginOp->m_reentry);
+ } else {
+ // If the first alternative has minimum size 0xFFFFFFFFu, then there cannot
+ // be sufficient input available to handle this, so just fall through.
+ unsigned delta = beginOp->m_alternative->m_minimumSize - alternative->m_minimumSize;
+ if (delta != 0xFFFFFFFFu) {
+ // We need to check input because we are incrementing the input.
+ add32(Imm32(delta + 1), index);
+ checkInput().linkTo(beginOp->m_reentry, this);
+ }
+ }
+ }
+ }
+
+ // We can reach this point in the code in two ways:
+ // - Fallthrough from the code above (a repeating alternative backtracked out of its
+ // last alternative, and did not have sufficient input to run the first).
+ // - We will loop back up to the following label when a repeating alternative loops,
+ // following a failed input check.
+ //
+ // Either way, we have just failed the input check for the first alternative.
+ Label firstInputCheckFailed(this);
+
+ // Generate code to handle input check failures from alternatives except the last.
+ // prevOp is the alternative we're handling a bail out from (initially Begin), and
+ // nextOp is the alternative we will be attempting to reenter into.
+ //
+ // We will link input check failures from the forwards matching path back to the code
+ // that can handle them.
+ YarrOp* prevOp = beginOp;
+ YarrOp* nextOp = &m_ops[beginOp->m_nextOp];
+ while (nextOp->m_op != OpBodyAlternativeEnd) {
+ prevOp->m_jumps.link(this);
+
+ // We only get here if an input check fails, it is only worth checking again
+ // if the next alternative has a minimum size less than the last.
+ if (prevOp->m_alternative->m_minimumSize > nextOp->m_alternative->m_minimumSize) {
+ // FIXME: if we added an extra label to YarrOp, we could avoid needing to
+ // subtract delta back out, and reduce this code. Should performance test
+ // the benefit of this.
+ unsigned delta = prevOp->m_alternative->m_minimumSize - nextOp->m_alternative->m_minimumSize;
+ sub32(Imm32(delta), index);
+ Jump fail = jumpIfNoAvailableInput();
+ add32(Imm32(delta), index);
+ jump(nextOp->m_reentry);
+ fail.link(this);
+ } else if (prevOp->m_alternative->m_minimumSize < nextOp->m_alternative->m_minimumSize)
+ add32(Imm32(nextOp->m_alternative->m_minimumSize - prevOp->m_alternative->m_minimumSize), index);
+ prevOp = nextOp;
+ nextOp = &m_ops[nextOp->m_nextOp];
+ }
+
+ // We fall through to here if there is insufficient input to run the last alternative.
+
+ // If there is insufficient input to run the last alternative, then for 'once through'
+ // alternatives we are done - just jump back up into the forwards matching path at the End.
+ if (onceThrough) {
+ op.m_jumps.linkTo(endOp.m_reentry, this);
+ jump(endOp.m_reentry);
+ break;
+ }
+
+ // For repeating alternatives, link any input check failure from the last alternative to
+ // this point.
+ op.m_jumps.link(this);
+
+ bool needsToUpdateMatchStart = !m_pattern.m_body->m_hasFixedSize;
+
+ // Check for cases where input position is already incremented by 1 for the last
+ // alternative (this is particularly useful where the minimum size of the body
+ // disjunction is 0, e.g. /a*|b/).
+ if (needsToUpdateMatchStart && alternative->m_minimumSize == 1) {
+ // index is already incremented by 1, so just store it now!
+ setMatchStart(index);
+ needsToUpdateMatchStart = false;
+ }
+
+ // Check whether there is sufficient input to loop. Increment the input position by
+ // one, and check. Also add in the minimum disjunction size before checking - there
+ // is no point in looping if we're just going to fail all the input checks around
+ // the next iteration.
+ ASSERT(alternative->m_minimumSize >= m_pattern.m_body->m_minimumSize);
+ if (alternative->m_minimumSize == m_pattern.m_body->m_minimumSize) {
+ // If the last alternative had the same minimum size as the disjunction,
+ // just simply increment input pos by 1, no adjustment based on minimum size.
+ add32(TrustedImm32(1), index);
+ } else {
+ // If the minimum for the last alternative was one greater than that
+ // for the disjunction, we're already progressed by 1, nothing to do!
+ unsigned delta = (alternative->m_minimumSize - m_pattern.m_body->m_minimumSize) - 1;
+ if (delta)
+ sub32(Imm32(delta), index);
+ }
+ Jump matchFailed = jumpIfNoAvailableInput();
+
+ if (needsToUpdateMatchStart) {
+ if (!m_pattern.m_body->m_minimumSize)
+ setMatchStart(index);
+ else {
+ move(index, regT0);
+ sub32(Imm32(m_pattern.m_body->m_minimumSize), regT0);
+ setMatchStart(regT0);
+ }
+ }
+
+ // Calculate how much more input the first alternative requires than the minimum
+ // for the body as a whole. If no more is needed then we dont need an additional
+ // input check here - jump straight back up to the start of the first alternative.
+ if (beginOp->m_alternative->m_minimumSize == m_pattern.m_body->m_minimumSize)
+ jump(beginOp->m_reentry);
+ else {
+ if (beginOp->m_alternative->m_minimumSize > m_pattern.m_body->m_minimumSize)
+ add32(Imm32(beginOp->m_alternative->m_minimumSize - m_pattern.m_body->m_minimumSize), index);
+ else
+ sub32(Imm32(m_pattern.m_body->m_minimumSize - beginOp->m_alternative->m_minimumSize), index);
+ checkInput().linkTo(beginOp->m_reentry, this);
+ jump(firstInputCheckFailed);
+ }
+
+ // We jump to here if we iterate to the point that there is insufficient input to
+ // run any matches, and need to return a failure state from JIT code.
+ matchFailed.link(this);
+
+ removeCallFrame();
+ move(TrustedImmPtr((void*)WTF::notFound), returnRegister);
+ move(TrustedImm32(0), returnRegister2);
+ generateReturn();
+ break;
+ }
+ case OpBodyAlternativeEnd: {
+ // We should never backtrack back into a body disjunction.
+ ASSERT(m_backtrackingState.isEmpty());
+
+ PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative;
+ m_checked += priorAlternative->m_minimumSize;
+ break;
+ }
+
+ // OpSimpleNestedAlternativeBegin/Next/End
+ // OpNestedAlternativeBegin/Next/End
+ //
+ // Generate code for when we backtrack back out of an alternative into
+ // a Begin or Next node, or when the entry input count check fails. If
+ // there are more alternatives we need to jump to the next alternative,
+ // if not we backtrack back out of the current set of parentheses.
+ //
+ // In the case of non-simple nested assertions we need to also link the
+ // 'return address' appropriately to backtrack back out into the correct
+ // alternative.
+ case OpSimpleNestedAlternativeBegin:
+ case OpSimpleNestedAlternativeNext:
+ case OpNestedAlternativeBegin:
+ case OpNestedAlternativeNext: {
+ YarrOp& nextOp = m_ops[op.m_nextOp];
+ bool isBegin = op.m_previousOp == notFound;
+ bool isLastAlternative = nextOp.m_nextOp == notFound;
+ ASSERT(isBegin == (op.m_op == OpSimpleNestedAlternativeBegin || op.m_op == OpNestedAlternativeBegin));
+ ASSERT(isLastAlternative == (nextOp.m_op == OpSimpleNestedAlternativeEnd || nextOp.m_op == OpNestedAlternativeEnd));
+
+ // Treat an input check failure the same as a failed match.
+ m_backtrackingState.append(op.m_jumps);
+
+ // Set the backtracks to jump to the appropriate place. We may need
+ // to link the backtracks in one of three different way depending on
+ // the type of alternative we are dealing with:
+ // - A single alternative, with no siblings.
+ // - The last alternative of a set of two or more.
+ // - An alternative other than the last of a set of two or more.
+ //
+ // In the case of a single alternative on its own, we don't need to
+ // jump anywhere - if the alternative fails to match we can just
+ // continue to backtrack out of the parentheses without jumping.
+ //
+ // In the case of the last alternative in a set of more than one, we
+ // need to jump to return back out to the beginning. We'll do so by
+ // adding a jump to the End node's m_jumps list, and linking this
+ // when we come to generate the Begin node. For alternatives other
+ // than the last, we need to jump to the next alternative.
+ //
+ // If the alternative had adjusted the input position we must link
+ // backtracking to here, correct, and then jump on. If not we can
+ // link the backtracks directly to their destination.
+ if (op.m_checkAdjust) {
+ // Handle the cases where we need to link the backtracks here.
+ m_backtrackingState.link(this);
+ sub32(Imm32(op.m_checkAdjust), index);
+ if (!isLastAlternative) {
+ // An alternative that is not the last should jump to its successor.
+ jump(nextOp.m_reentry);
+ } else if (!isBegin) {
+ // The last of more than one alternatives must jump back to the beginning.
+ nextOp.m_jumps.append(jump());
+ } else {
+ // A single alternative on its own can fall through.
+ m_backtrackingState.fallthrough();
+ }
+ } else {
+ // Handle the cases where we can link the backtracks directly to their destinations.
+ if (!isLastAlternative) {
+ // An alternative that is not the last should jump to its successor.
+ m_backtrackingState.linkTo(nextOp.m_reentry, this);
+ } else if (!isBegin) {
+ // The last of more than one alternatives must jump back to the beginning.
+ m_backtrackingState.takeBacktracksToJumpList(nextOp.m_jumps, this);
+ }
+ // In the case of a single alternative on its own do nothing - it can fall through.
+ }
+
+ // If there is a backtrack jump from a zero length match link it here.
+ if (op.m_zeroLengthMatch.isSet())
+ m_backtrackingState.append(op.m_zeroLengthMatch);
+
+ // At this point we've handled the backtracking back into this node.
+ // Now link any backtracks that need to jump to here.
+
+ // For non-simple alternatives, link the alternative's 'return address'
+ // so that we backtrack back out into the previous alternative.
+ if (op.m_op == OpNestedAlternativeNext)
+ m_backtrackingState.append(op.m_returnAddress);
+
+ // If there is more than one alternative, then the last alternative will
+ // have planted a jump to be linked to the end. This jump was added to the
+ // End node's m_jumps list. If we are back at the beginning, link it here.
+ if (isBegin) {
+ YarrOp* endOp = &m_ops[op.m_nextOp];
+ while (endOp->m_nextOp != notFound) {
+ ASSERT(endOp->m_op == OpSimpleNestedAlternativeNext || endOp->m_op == OpNestedAlternativeNext);
+ endOp = &m_ops[endOp->m_nextOp];
+ }
+ ASSERT(endOp->m_op == OpSimpleNestedAlternativeEnd || endOp->m_op == OpNestedAlternativeEnd);
+ m_backtrackingState.append(endOp->m_jumps);
+ }
+
+ if (!isBegin) {
+ YarrOp& lastOp = m_ops[op.m_previousOp];
+ m_checked += lastOp.m_checkAdjust;
+ }
+ m_checked -= op.m_checkAdjust;
+ break;
+ }
+ case OpSimpleNestedAlternativeEnd:
+ case OpNestedAlternativeEnd: {
+ PatternTerm* term = op.m_term;
+
+ // If there is a backtrack jump from a zero length match link it here.
+ if (op.m_zeroLengthMatch.isSet())
+ m_backtrackingState.append(op.m_zeroLengthMatch);
+
+ // If we backtrack into the end of a simple subpattern do nothing;
+ // just continue through into the last alternative. If we backtrack
+ // into the end of a non-simple set of alternatives we need to jump
+ // to the backtracking return address set up during generation.
+ if (op.m_op == OpNestedAlternativeEnd) {
+ m_backtrackingState.link(this);
+
+ // Plant a jump to the return address.
+ unsigned parenthesesFrameLocation = term->frameLocation;
+ unsigned alternativeFrameLocation = parenthesesFrameLocation;
+ if (term->quantityType != QuantifierFixedCount)
+ alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
+ loadFromFrameAndJump(alternativeFrameLocation);
+
+ // Link the DataLabelPtr associated with the end of the last
+ // alternative to this point.
+ m_backtrackingState.append(op.m_returnAddress);
+ }
+
+ YarrOp& lastOp = m_ops[op.m_previousOp];
+ m_checked += lastOp.m_checkAdjust;
+ break;
+ }
+
+ // OpParenthesesSubpatternOnceBegin/End
+ //
+ // When we are backtracking back out of a capturing subpattern we need
+ // to clear the start index in the matches output array, to record that
+ // this subpattern has not been captured.
+ //
+ // When backtracking back out of a Greedy quantified subpattern we need
+ // to catch this, and try running the remainder of the alternative after
+ // the subpattern again, skipping the parentheses.
+ //
+ // Upon backtracking back into a quantified set of parentheses we need to
+ // check whether we were currently skipping the subpattern. If not, we
+ // can backtrack into them, if we were we need to either backtrack back
+ // out of the start of the parentheses, or jump back to the forwards
+ // matching start, depending on whether the match is Greedy or NonGreedy.
+ case OpParenthesesSubpatternOnceBegin: {
+ PatternTerm* term = op.m_term;
+ ASSERT(term->quantityCount == 1);
+
+ // We only need to backtrack to this point if capturing or greedy.
+ if ((term->capture() && compileMode == IncludeSubpatterns) || term->quantityType == QuantifierGreedy) {
+ m_backtrackingState.link(this);
+
+ // If capturing, clear the capture (we only need to reset start).
+ if (term->capture() && compileMode == IncludeSubpatterns)
+ clearSubpatternStart(term->parentheses.subpatternId);
+
+ // If Greedy, jump to the end.
+ if (term->quantityType == QuantifierGreedy) {
+ // Clear the flag in the stackframe indicating we ran through the subpattern.
+ unsigned parenthesesFrameLocation = term->frameLocation;
+ storeToFrame(TrustedImm32(-1), parenthesesFrameLocation);
+ // Jump to after the parentheses, skipping the subpattern.
+ jump(m_ops[op.m_nextOp].m_reentry);
+ // A backtrack from after the parentheses, when skipping the subpattern,
+ // will jump back to here.
+ op.m_jumps.link(this);
+ }
+
+ m_backtrackingState.fallthrough();
+ }
+ break;
+ }
+ case OpParenthesesSubpatternOnceEnd: {
+ PatternTerm* term = op.m_term;
+
+ if (term->quantityType != QuantifierFixedCount) {
+ m_backtrackingState.link(this);
+
+ // Check whether we should backtrack back into the parentheses, or if we
+ // are currently in a state where we had skipped over the subpattern
+ // (in which case the flag value on the stack will be -1).
+ unsigned parenthesesFrameLocation = term->frameLocation;
+ Jump hadSkipped = branch32(Equal, Address(stackPointerRegister, parenthesesFrameLocation * sizeof(void*)), TrustedImm32(-1));
+
+ if (term->quantityType == QuantifierGreedy) {
+ // For Greedy parentheses, we skip after having already tried going
+ // through the subpattern, so if we get here we're done.
+ YarrOp& beginOp = m_ops[op.m_previousOp];
+ beginOp.m_jumps.append(hadSkipped);
+ } else {
+ // For NonGreedy parentheses, we try skipping the subpattern first,
+ // so if we get here we need to try running through the subpattern
+ // next. Jump back to the start of the parentheses in the forwards
+ // matching path.
+ ASSERT(term->quantityType == QuantifierNonGreedy);
+ YarrOp& beginOp = m_ops[op.m_previousOp];
+ hadSkipped.linkTo(beginOp.m_reentry, this);
+ }
+
+ m_backtrackingState.fallthrough();
+ }
+
+ m_backtrackingState.append(op.m_jumps);
+ break;
+ }
+
+ // OpParenthesesSubpatternTerminalBegin/End
+ //
+ // Terminal subpatterns will always match - there is nothing after them to
+ // force a backtrack, and they have a minimum count of 0, and as such will
+ // always produce an acceptable result.
+ case OpParenthesesSubpatternTerminalBegin: {
+ // We will backtrack to this point once the subpattern cannot match any
+ // more. Since no match is accepted as a successful match (we are Greedy
+ // quantified with a minimum of zero) jump back to the forwards matching
+ // path at the end.
+ YarrOp& endOp = m_ops[op.m_nextOp];
+ m_backtrackingState.linkTo(endOp.m_reentry, this);
+ break;
+ }
+ case OpParenthesesSubpatternTerminalEnd:
+ // We should never be backtracking to here (hence the 'terminal' in the name).
+ ASSERT(m_backtrackingState.isEmpty());
+ m_backtrackingState.append(op.m_jumps);
+ break;
+
+ // OpParentheticalAssertionBegin/End
+ case OpParentheticalAssertionBegin: {
+ PatternTerm* term = op.m_term;
+ YarrOp& endOp = m_ops[op.m_nextOp];
+
+ // We need to handle the backtracks upon backtracking back out
+ // of a parenthetical assertion if either we need to correct
+ // the input index, or the assertion was inverted.
+ if (op.m_checkAdjust || term->invert()) {
+ m_backtrackingState.link(this);
+
+ if (op.m_checkAdjust)
+ add32(Imm32(op.m_checkAdjust), index);
+
+ // In an inverted assertion failure to match the subpattern
+ // is treated as a successful match - jump to the end of the
+ // subpattern. We already have adjusted the input position
+ // back to that before the assertion, which is correct.
+ if (term->invert())
+ jump(endOp.m_reentry);
+
+ m_backtrackingState.fallthrough();
+ }
+
+ // The End node's jump list will contain any backtracks into
+ // the end of the assertion. Also, if inverted, we will have
+ // added the failure caused by a successful match to this.
+ m_backtrackingState.append(endOp.m_jumps);
+
+ m_checked += op.m_checkAdjust;
+ break;
+ }
+ case OpParentheticalAssertionEnd: {
+ // FIXME: We should really be clearing any nested subpattern
+ // matches on bailing out from after the pattern. Firefox has
+ // this bug too (presumably because they use YARR!)
+
+ // Never backtrack into an assertion; later failures bail to before the begin.
+ m_backtrackingState.takeBacktracksToJumpList(op.m_jumps, this);
+
+ YarrOp& lastOp = m_ops[op.m_previousOp];
+ m_checked -= lastOp.m_checkAdjust;
+ break;
+ }
+
+ case OpMatchFailed:
+ break;
+ }
+
+ } while (opIndex);
+ }
+
+ // Compilation methods:
+ // ====================
+
+ // opCompileParenthesesSubpattern
+ // Emits ops for a subpattern (set of parentheses). These consist
+ // of a set of alternatives wrapped in an outer set of nodes for
+ // the parentheses.
+ // Supported types of parentheses are 'Once' (quantityCount == 1)
+ // and 'Terminal' (non-capturing parentheses quantified as greedy
+ // and infinite).
+ // Alternatives will use the 'Simple' set of ops if either the
+ // subpattern is terminal (in which case we will never need to
+ // backtrack), or if the subpattern only contains one alternative.
+ void opCompileParenthesesSubpattern(PatternTerm* term)
+ {
+ YarrOpCode parenthesesBeginOpCode;
+ YarrOpCode parenthesesEndOpCode;
+ YarrOpCode alternativeBeginOpCode = OpSimpleNestedAlternativeBegin; // Default to the 'Simple' alternative ops; upgraded below if needed.
+ YarrOpCode alternativeNextOpCode = OpSimpleNestedAlternativeNext;
+ YarrOpCode alternativeEndOpCode = OpSimpleNestedAlternativeEnd;
+
+ // We can currently only compile quantity 1 subpatterns that are
+ // not copies. We generate a copy in the case of a range quantifier,
+ // e.g. /(?:x){3,9}/, or /(?:x)+/ (These are effectively expanded to
+ // /(?:x){3,3}(?:x){0,6}/ and /(?:x)(?:x)*/ respectively). The problem
+ // comes where the subpattern is capturing, in which case we would
+ // need to restore the capture from the first subpattern upon a
+ // failure in the second.
+ if (term->quantityCount == 1 && !term->parentheses.isCopy) {
+ // Select the 'Once' nodes.
+ parenthesesBeginOpCode = OpParenthesesSubpatternOnceBegin;
+ parenthesesEndOpCode = OpParenthesesSubpatternOnceEnd;
+
+ // If there is more than one alternative we cannot use the 'simple' nodes.
+ if (term->parentheses.disjunction->m_alternatives.size() != 1) {
+ alternativeBeginOpCode = OpNestedAlternativeBegin;
+ alternativeNextOpCode = OpNestedAlternativeNext;
+ alternativeEndOpCode = OpNestedAlternativeEnd;
+ }
+ } else if (term->parentheses.isTerminal) {
+ // Select the 'Terminal' nodes.
+ parenthesesBeginOpCode = OpParenthesesSubpatternTerminalBegin;
+ parenthesesEndOpCode = OpParenthesesSubpatternTerminalEnd;
+ } else {
+ // This subpattern is not supported by the JIT.
+ m_shouldFallBack = true; // No ops are appended for this term; compile() checks this flag after opCompileBody().
+ return;
+ }
+ // Record where the parentheses Begin op lives; its links are filled in at the end, once the End op exists.
+ size_t parenBegin = m_ops.size();
+ m_ops.append(parenthesesBeginOpCode);
+ // Open the chain of alternative ops; the alternative Begin op has no predecessor.
+ m_ops.append(alternativeBeginOpCode);
+ m_ops.last().m_previousOp = notFound;
+ m_ops.last().m_term = term;
+ Vector<PatternAlternative*>& alternatives = term->parentheses.disjunction->m_alternatives;
+ for (unsigned i = 0; i < alternatives.size(); ++i) {
+ size_t lastOpIndex = m_ops.size() - 1; // Index of the Begin/Next op that heads this alternative.
+ // Emit the ops for the alternative's terms, followed by a Next op that closes it.
+ PatternAlternative* nestedAlternative = alternatives[i];
+ opCompileAlternative(nestedAlternative);
+
+ size_t thisOpIndex = m_ops.size();
+ m_ops.append(YarrOp(alternativeNextOpCode));
+ // The references are taken only after the append above (presumably because appending may move the vector's storage).
+ YarrOp& lastOp = m_ops[lastOpIndex];
+ YarrOp& thisOp = m_ops[thisOpIndex];
+
+ lastOp.m_alternative = nestedAlternative; // The heading op owns the alternative it introduces.
+ lastOp.m_nextOp = thisOpIndex;
+ thisOp.m_previousOp = lastOpIndex;
+ thisOp.m_term = term;
+ }
+ YarrOp& lastOp = m_ops.last(); // The final Next op appended by the loop is converted into the End op.
+ ASSERT(lastOp.m_op == alternativeNextOpCode); // Holds only if the loop ran at least once, i.e. the disjunction is non-empty.
+ lastOp.m_op = alternativeEndOpCode;
+ lastOp.m_alternative = 0;
+ lastOp.m_nextOp = notFound;
+ // Append the parentheses End op and cross-link it with the parentheses Begin op.
+ size_t parenEnd = m_ops.size();
+ m_ops.append(parenthesesEndOpCode);
+
+ m_ops[parenBegin].m_term = term;
+ m_ops[parenBegin].m_previousOp = notFound;
+ m_ops[parenBegin].m_nextOp = parenEnd;
+ m_ops[parenEnd].m_term = term;
+ m_ops[parenEnd].m_previousOp = parenBegin;
+ m_ops[parenEnd].m_nextOp = notFound;
+ }
+
+ // opCompileParentheticalAssertion
+ // Emits ops for a parenthetical assertion. These consist of an
+ // OpSimpleNestedAlternativeBegin/Next/End set of nodes wrapping
+ // the alternatives, with these wrapped by an outer pair of
+ // OpParentheticalAssertionBegin/End nodes.
+ // We can always use the OpSimpleNestedAlternative nodes in the
+ // case of parenthetical assertions since these only ever match
+ // once, and will never backtrack back into the assertion.
+ void opCompileParentheticalAssertion(PatternTerm* term)
+ {
+ size_t parenBegin = m_ops.size(); // Index of the assertion Begin op; linked to the End op at the bottom.
+ m_ops.append(OpParentheticalAssertionBegin);
+ // Open the chain of 'Simple' alternative ops; the alternative Begin op has no predecessor.
+ m_ops.append(OpSimpleNestedAlternativeBegin);
+ m_ops.last().m_previousOp = notFound;
+ m_ops.last().m_term = term;
+ Vector<PatternAlternative*>& alternatives = term->parentheses.disjunction->m_alternatives;
+ for (unsigned i = 0; i < alternatives.size(); ++i) {
+ size_t lastOpIndex = m_ops.size() - 1; // Index of the Begin/Next op that heads this alternative.
+ // Emit the ops for the alternative's terms, followed by a Next op that closes it.
+ PatternAlternative* nestedAlternative = alternatives[i];
+ opCompileAlternative(nestedAlternative);
+
+ size_t thisOpIndex = m_ops.size();
+ m_ops.append(YarrOp(OpSimpleNestedAlternativeNext));
+ // The references are taken only after the append above (presumably because appending may move the vector's storage).
+ YarrOp& lastOp = m_ops[lastOpIndex];
+ YarrOp& thisOp = m_ops[thisOpIndex];
+
+ lastOp.m_alternative = nestedAlternative; // The heading op owns the alternative it introduces.
+ lastOp.m_nextOp = thisOpIndex;
+ thisOp.m_previousOp = lastOpIndex;
+ thisOp.m_term = term;
+ }
+ YarrOp& lastOp = m_ops.last(); // The final Next op appended by the loop is converted into the End op.
+ ASSERT(lastOp.m_op == OpSimpleNestedAlternativeNext); // Holds only if the loop ran at least once, i.e. the disjunction is non-empty.
+ lastOp.m_op = OpSimpleNestedAlternativeEnd;
+ lastOp.m_alternative = 0;
+ lastOp.m_nextOp = notFound;
+ // Append the assertion End op and cross-link it with the assertion Begin op.
+ size_t parenEnd = m_ops.size();
+ m_ops.append(OpParentheticalAssertionEnd);
+
+ m_ops[parenBegin].m_term = term;
+ m_ops[parenBegin].m_previousOp = notFound;
+ m_ops[parenBegin].m_nextOp = parenEnd;
+ m_ops[parenEnd].m_term = term;
+ m_ops[parenEnd].m_previousOp = parenBegin;
+ m_ops[parenEnd].m_nextOp = notFound;
+ }
+
+ // opCompileAlternative
+ // Called to emit nodes for all terms in an alternative.
+ void opCompileAlternative(PatternAlternative* alternative)
+ {
+ // Give optimizeAlternative() its chance to rework the alternative
+ // before any ops are emitted for it.
+ optimizeAlternative(alternative);
+
+ // Walk the terms in order. The two parenthetical term types expand
+ // into their own begin/end op sequences; every other term becomes a
+ // single op appended directly to m_ops.
+ unsigned termIndex = 0;
+ while (termIndex < alternative->m_terms.size()) {
+ PatternTerm* currentTerm = &alternative->m_terms[termIndex];
+ ++termIndex;
+
+ if (currentTerm->type == PatternTerm::TypeParenthesesSubpattern)
+ opCompileParenthesesSubpattern(currentTerm);
+ else if (currentTerm->type == PatternTerm::TypeParentheticalAssertion)
+ opCompileParentheticalAssertion(currentTerm);
+ else
+ m_ops.append(currentTerm);
+ }
+ }
+
+ // opCompileBody
+ // This method compiles the body disjunction of the regular expression.
+ // The body consists of two sets of alternatives - zero or more 'once
+ // through' (BOL anchored) alternatives, followed by zero or more
+ // repeated alternatives.
+ // For each of these two sets of alternatives, if not empty they will be
+ // wrapped in a set of OpBodyAlternativeBegin/Next/End nodes (with the
+ // 'begin' node referencing the first alternative, and 'next' nodes
+ // referencing any further alternatives). The begin/next/end nodes are
+ // linked together in a doubly linked list. In the case of repeating
+ // alternatives, the end node is also linked back to the beginning.
+ // If no repeating alternatives exist, then a OpMatchFailed node exists
+ // to return the failing result.
+ void opCompileBody(PatternDisjunction* disjunction)
+ {
+ Vector<PatternAlternative*>& alternatives = disjunction->m_alternatives;
+ size_t currentAlternativeIndex = 0; // Shared cursor: the repeated set starts where the once-through set stops.
+
+ // Emit the 'once through' alternatives.
+ if (alternatives.size() && alternatives[0]->onceThrough()) {
+ m_ops.append(YarrOp(OpBodyAlternativeBegin));
+ m_ops.last().m_previousOp = notFound;
+ // Consume the leading run of alternatives for which onceThrough() is true.
+ do {
+ size_t lastOpIndex = m_ops.size() - 1; // Index of the Begin/Next op that heads this alternative.
+ PatternAlternative* alternative = alternatives[currentAlternativeIndex];
+ opCompileAlternative(alternative);
+
+ size_t thisOpIndex = m_ops.size();
+ m_ops.append(YarrOp(OpBodyAlternativeNext));
+ // The references are taken only after the append above (presumably because appending may move the vector's storage).
+ YarrOp& lastOp = m_ops[lastOpIndex];
+ YarrOp& thisOp = m_ops[thisOpIndex];
+
+ lastOp.m_alternative = alternative;
+ lastOp.m_nextOp = thisOpIndex;
+ thisOp.m_previousOp = lastOpIndex;
+
+ ++currentAlternativeIndex;
+ } while (currentAlternativeIndex < alternatives.size() && alternatives[currentAlternativeIndex]->onceThrough());
+ // Convert the trailing Next op into the End op; the once-through chain does not loop back.
+ YarrOp& lastOp = m_ops.last();
+
+ ASSERT(lastOp.m_op == OpBodyAlternativeNext);
+ lastOp.m_op = OpBodyAlternativeEnd;
+ lastOp.m_alternative = 0;
+ lastOp.m_nextOp = notFound;
+ }
+ // No alternatives remain for the repeated set (this includes an empty disjunction): emit OpMatchFailed and stop.
+ if (currentAlternativeIndex == alternatives.size()) {
+ m_ops.append(YarrOp(OpMatchFailed));
+ return;
+ }
+
+ // Emit the repeated alternatives.
+ size_t repeatLoop = m_ops.size(); // Index of the repeated set's Begin op; the End op links back to it.
+ m_ops.append(YarrOp(OpBodyAlternativeBegin));
+ m_ops.last().m_previousOp = notFound;
+ do {
+ size_t lastOpIndex = m_ops.size() - 1; // Index of the Begin/Next op that heads this alternative.
+ PatternAlternative* alternative = alternatives[currentAlternativeIndex];
+ ASSERT(!alternative->onceThrough()); // All once-through alternatives are expected to precede the repeated ones.
+ opCompileAlternative(alternative);
+
+ size_t thisOpIndex = m_ops.size();
+ m_ops.append(YarrOp(OpBodyAlternativeNext));
+ // The references are taken only after the append above (presumably because appending may move the vector's storage).
+ YarrOp& lastOp = m_ops[lastOpIndex];
+ YarrOp& thisOp = m_ops[thisOpIndex];
+
+ lastOp.m_alternative = alternative;
+ lastOp.m_nextOp = thisOpIndex;
+ thisOp.m_previousOp = lastOpIndex;
+
+ ++currentAlternativeIndex;
+ } while (currentAlternativeIndex < alternatives.size());
+ YarrOp& lastOp = m_ops.last(); // Convert the trailing Next op into the End op.
+ ASSERT(lastOp.m_op == OpBodyAlternativeNext);
+ lastOp.m_op = OpBodyAlternativeEnd;
+ lastOp.m_alternative = 0;
+ lastOp.m_nextOp = repeatLoop; // Unlike the once-through set, the End op links back to the Begin op.
+ }
+
+ void generateEnter() // Emits the per-CPU prologue: saves the registers that generateReturn() pops and, on some targets, loads arguments.
+ {
+#if CPU(X86_64)
+ push(X86Registers::ebp);
+ move(stackPointerRegister, X86Registers::ebp); // Establish ebp as the frame pointer.
+ push(X86Registers::ebx);
+#elif CPU(X86)
+ push(X86Registers::ebp);
+ move(stackPointerRegister, X86Registers::ebp); // Establish ebp as the frame pointer used for the argument loads below.
+ // TODO: do we need spill registers to fill the output pointer if there are no sub captures?
+ push(X86Registers::ebx);
+ push(X86Registers::edi);
+ push(X86Registers::esi);
+ // load output into edi (2 = saved ebp + return address).
+ #if COMPILER(MSVC)
+ loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), input); // MSVC build: all four arguments are read from the stack frame.
+ loadPtr(Address(X86Registers::ebp, 3 * sizeof(void*)), index);
+ loadPtr(Address(X86Registers::ebp, 4 * sizeof(void*)), length);
+ if (compileMode == IncludeSubpatterns)
+ loadPtr(Address(X86Registers::ebp, 5 * sizeof(void*)), output);
+ #else
+ if (compileMode == IncludeSubpatterns)
+ loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), output); // Non-MSVC build: only 'output' is read from the stack (presumably the other three arrive in registers via YARR_CALL's regparm(3) - confirm).
+ #endif
+#elif CPU(ARM)
+ push(ARMRegisters::r4);
+ push(ARMRegisters::r5);
+ push(ARMRegisters::r6);
+#if CPU(ARM_TRADITIONAL)
+ push(ARMRegisters::r8); // scratch register
+#endif
+ if (compileMode == IncludeSubpatterns)
+ move(ARMRegisters::r3, output); // 'output' is copied out of r3 only when subpatterns are being recorded.
+#elif CPU(SH4)
+ push(SH4Registers::r11);
+ push(SH4Registers::r13);
+#elif CPU(MIPS)
+ // Do nothing.
+#endif
+ }
+
+ void generateReturn() // Emits the per-CPU epilogue: pops what generateEnter() pushed, in reverse order, then returns.
+ {
+#if CPU(X86_64)
+ pop(X86Registers::ebx);
+ pop(X86Registers::ebp);
+#elif CPU(X86)
+ pop(X86Registers::esi); // Reverse of the ebp, ebx, edi, esi push order in generateEnter().
+ pop(X86Registers::edi);
+ pop(X86Registers::ebx);
+ pop(X86Registers::ebp);
+#elif CPU(ARM)
+#if CPU(ARM_TRADITIONAL)
+ pop(ARMRegisters::r8); // scratch register
+#endif
+ pop(ARMRegisters::r6);
+ pop(ARMRegisters::r5);
+ pop(ARMRegisters::r4);
+#elif CPU(SH4)
+ pop(SH4Registers::r13);
+ pop(SH4Registers::r11);
+#elif CPU(MIPS)
+ // Do nothing
+#endif
+ ret(); // Emitted for every CPU, including those with nothing to restore.
+ }
+
+public:
+ YarrGenerator(YarrPattern& pattern, YarrCharSize charSize) // Prepares a generator for 'pattern' at the given character size; no code is emitted until compile().
+ : m_pattern(pattern)
+ , m_charSize(charSize)
+ , m_charScale(m_charSize == Char8 ? TimesOne: TimesTwo) // Reads the m_charSize member, so it relies on m_charSize being declared (and thus initialised) first.
+ , m_shouldFallBack(false)
+ , m_checked(0)
+ {
+ }
+
+ void compile(JSGlobalData* globalData, YarrCodeBlock& jitObject) // Generates code for m_pattern and stores it in jitObject, or flags jitObject for fall-back.
+ {
+ generateEnter();
+ // Early-out path: if the hasInput jump from checkInput() is not taken, return notFound / 0 immediately.
+ Jump hasInput = checkInput();
+ move(TrustedImmPtr((void*)WTF::notFound), returnRegister);
+ move(TrustedImm32(0), returnRegister2);
+ generateReturn();
+ hasInput.link(this);
+ // When recording subpatterns, store -1 into every even-indexed int slot of 'output' (slot 0 plus one per subpattern).
+ if (compileMode == IncludeSubpatterns) {
+ for (unsigned i = 0; i < m_pattern.m_numSubpatterns + 1; ++i)
+ store32(TrustedImm32(-1), Address(output, (i << 1) * sizeof(int)));
+ }
+ // A body without a fixed size records the current index as the match start here.
+ if (!m_pattern.m_body->m_hasFixedSize)
+ setMatchStart(index);
+
+ initCallFrame();
+
+ // Compile the pattern to the internal 'YarrOp' representation.
+ opCompileBody(m_pattern.m_body);
+
+ // If we encountered anything we can't handle in the JIT code
+ // (e.g. backreferences) then return early.
+ if (m_shouldFallBack) {
+ jitObject.setFallBack(true); // Nothing is linked or stored: the return happens before the LinkBuffer is created.
+ return;
+ }
+ // Emit the forwards matching path followed by the backtracking path.
+ generate();
+ backtrack();
+
+ // Link & finalize the code.
+ LinkBuffer linkBuffer(*globalData, this, REGEXP_CODE_ID);
+ m_backtrackingState.linkDataLabels(linkBuffer);
+ // Store the finalized code in the jitObject slot matching the compile mode and character size.
+ if (compileMode == MatchOnly) {
+ if (m_charSize == Char8)
+ jitObject.set8BitCodeMatchOnly(FINALIZE_CODE(linkBuffer, ("Match-only 8-bit regular expression")));
+ else
+ jitObject.set16BitCodeMatchOnly(FINALIZE_CODE(linkBuffer, ("Match-only 16-bit regular expression")));
+ } else {
+ if (m_charSize == Char8)
+ jitObject.set8BitCode(FINALIZE_CODE(linkBuffer, ("8-bit regular expression")));
+ else
+ jitObject.set16BitCode(FINALIZE_CODE(linkBuffer, ("16-bit regular expression")));
+ }
+ jitObject.setFallBack(m_shouldFallBack); // Re-publishes the flag after generation; NOTE(review): confirm whether generate()/backtrack() can set it.
+ }
+
+private:
+ YarrPattern& m_pattern;
+
+ YarrCharSize m_charSize;
+
+ Scale m_charScale;
+
+ // Used to detect regular expression constructs that are not currently
+ // supported in the JIT; fall back to the interpreter when this is detected.
+ bool m_shouldFallBack;
+
+ // The regular expression expressed as a linear sequence of operations.
+ Vector<YarrOp, 128> m_ops;
+
+ // This records the current input offset being applied due to the current
+ // set of alternatives we are nested within. E.g. when matching the
+ // character 'b' within the regular expression /abc/, we will know that
+ // the minimum size for the alternative is 3, checked upon entry to the
+ // alternative, and that 'b' is at offset 1 from the start, and as such
+ // when matching 'b' we need to apply an offset of -2 to the load.
+ //
+ // FIXME: This should go away. Rather than tracking this value throughout
+ // code generation, we should gather this information up front & store it
+ // on the YarrOp structure.
+ int m_checked;
+
+ // This class records state whilst generating the backtracking path of code.
+ BacktrackingState m_backtrackingState;
+};
+
+void jitCompile(YarrPattern& pattern, YarrCharSize charSize, JSGlobalData* globalData, YarrCodeBlock& jitObject, YarrJITCompileMode mode)
+{
+ // The compile mode is a template argument of YarrGenerator, so the
+ // runtime 'mode' value has to be mapped onto one of the two
+ // instantiations. Anything other than MatchOnly compiles with
+ // subpattern recording.
+ switch (mode) {
+ case MatchOnly: {
+ YarrGenerator<MatchOnly> matchOnlyGenerator(pattern, charSize);
+ matchOnlyGenerator.compile(globalData, jitObject);
+ break;
+ }
+ default: {
+ YarrGenerator<IncludeSubpatterns> capturingGenerator(pattern, charSize);
+ capturingGenerator.compile(globalData, jitObject);
+ break;
+ }
+ }
+}
+
+}}
+
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef YarrJIT_h
+#define YarrJIT_h
+
+#if ENABLE(YARR_JIT)
+
+#include "JSGlobalData.h"
+#include "MacroAssemblerCodeRef.h"
+#include "MatchResult.h"
+#include "Yarr.h"
+#include "YarrPattern.h"
+
+#if CPU(X86) && !COMPILER(MSVC)
+#define YARR_CALL __attribute__ ((regparm (3)))
+#else
+#define YARR_CALL
+#endif
+
+namespace JSC {
+
+class JSGlobalData;
+class ExecutablePool;
+
+namespace Yarr {
+
+// YarrCodeBlock
+//
+// Holds the machine code produced by jitCompile() for one regular expression.
+// Up to four variants can be stored side by side: code for 8-bit and for
+// 16-bit input, each either recording subpattern results into an 'output'
+// array or match-only. A separate flag records that the JIT could not handle
+// the pattern and that the caller should fall back to the interpreter.
+class YarrCodeBlock {
+ // Signatures of the generated entry points. On X86_64 the generated code
+ // returns a MatchResult directly; elsewhere it returns an
+ // EncodedMatchResult from which execute() constructs a MatchResult.
+#if CPU(X86_64)
+ typedef MatchResult (*YarrJITCode8)(const LChar* input, unsigned start, unsigned length, int* output) YARR_CALL;
+ typedef MatchResult (*YarrJITCode16)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL;
+ typedef MatchResult (*YarrJITCodeMatchOnly8)(const LChar* input, unsigned start, unsigned length) YARR_CALL;
+ typedef MatchResult (*YarrJITCodeMatchOnly16)(const UChar* input, unsigned start, unsigned length) YARR_CALL;
+#else
+ typedef EncodedMatchResult (*YarrJITCode8)(const LChar* input, unsigned start, unsigned length, int* output) YARR_CALL;
+ typedef EncodedMatchResult (*YarrJITCode16)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL;
+ typedef EncodedMatchResult (*YarrJITCodeMatchOnly8)(const LChar* input, unsigned start, unsigned length) YARR_CALL;
+ typedef EncodedMatchResult (*YarrJITCodeMatchOnly16)(const UChar* input, unsigned start, unsigned length) YARR_CALL;
+#endif
+
+public:
+ // Creates an empty code block: no compiled variants, fall-back not requested.
+ YarrCodeBlock()
+ : m_needFallBack(false)
+ {
+ }
+
+ ~YarrCodeBlock()
+ {
+ }
+
+ // Fall-back flag, set by the generator when the pattern cannot be JIT compiled.
+ void setFallBack(bool fallback) { m_needFallBack = fallback; }
+ bool isFallBack() { return m_needFallBack; }
+
+ // Subpattern-recording variants. A variant counts as present when its
+ // code reference has a non-zero size.
+ bool has8BitCode() { return m_ref8.size(); }
+ bool has16BitCode() { return m_ref16.size(); }
+ void set8BitCode(MacroAssemblerCodeRef ref) { m_ref8 = ref; }
+ void set16BitCode(MacroAssemblerCodeRef ref) { m_ref16 = ref; }
+
+ // Match-only variants (no 'output' array is passed to these).
+ bool has8BitCodeMatchOnly() { return m_matchOnly8.size(); }
+ bool has16BitCodeMatchOnly() { return m_matchOnly16.size(); }
+ void set8BitCodeMatchOnly(MacroAssemblerCodeRef matchOnly) { m_matchOnly8 = matchOnly; }
+ void set16BitCodeMatchOnly(MacroAssemblerCodeRef matchOnly) { m_matchOnly16 = matchOnly; }
+
+ // Runs the 8-bit subpattern-recording code. The matching variant must
+ // already have been set (asserted).
+ MatchResult execute(const LChar* input, unsigned start, unsigned length, int* output)
+ {
+ ASSERT(has8BitCode());
+ return MatchResult(reinterpret_cast<YarrJITCode8>(m_ref8.code().executableAddress())(input, start, length, output));
+ }
+
+ // Runs the 16-bit subpattern-recording code.
+ MatchResult execute(const UChar* input, unsigned start, unsigned length, int* output)
+ {
+ ASSERT(has16BitCode());
+ return MatchResult(reinterpret_cast<YarrJITCode16>(m_ref16.code().executableAddress())(input, start, length, output));
+ }
+
+ // Runs the 8-bit match-only code.
+ MatchResult execute(const LChar* input, unsigned start, unsigned length)
+ {
+ ASSERT(has8BitCodeMatchOnly());
+ return MatchResult(reinterpret_cast<YarrJITCodeMatchOnly8>(m_matchOnly8.code().executableAddress())(input, start, length));
+ }
+
+ // Runs the 16-bit match-only code.
+ MatchResult execute(const UChar* input, unsigned start, unsigned length)
+ {
+ ASSERT(has16BitCodeMatchOnly());
+ return MatchResult(reinterpret_cast<YarrJITCodeMatchOnly16>(m_matchOnly16.code().executableAddress())(input, start, length));
+ }
+
+#if ENABLE(REGEXP_TRACING)
+ // Returns the entry address of the first compiled variant that is
+ // present, checked in the order 8-bit, 16-bit, 8-bit match-only,
+ // 16-bit match-only, or 0 when nothing has been compiled.
+ // This used to read a member named 'm_ref', which this class does not
+ // have, so the class could not compile with REGEXP_TRACING enabled.
+ void *getAddr()
+ {
+ if (has8BitCode())
+ return m_ref8.code().executableAddress();
+ if (has16BitCode())
+ return m_ref16.code().executableAddress();
+ if (has8BitCodeMatchOnly())
+ return m_matchOnly8.code().executableAddress();
+ if (has16BitCodeMatchOnly())
+ return m_matchOnly16.code().executableAddress();
+ return 0;
+ }
+#endif
+
+ // Drops all four compiled variants and resets the fall-back flag.
+ void clear()
+ {
+ m_ref8 = MacroAssemblerCodeRef();
+ m_ref16 = MacroAssemblerCodeRef();
+ m_matchOnly8 = MacroAssemblerCodeRef();
+ m_matchOnly16 = MacroAssemblerCodeRef();
+ m_needFallBack = false;
+ }
+
+private:
+ MacroAssemblerCodeRef m_ref8; // 8-bit input, records subpatterns.
+ MacroAssemblerCodeRef m_ref16; // 16-bit input, records subpatterns.
+ MacroAssemblerCodeRef m_matchOnly8; // 8-bit input, match-only.
+ MacroAssemblerCodeRef m_matchOnly16; // 16-bit input, match-only.
+ bool m_needFallBack; // True when the JIT declined the pattern.
+};
+
+enum YarrJITCompileMode { // Selects which YarrGenerator instantiation jitCompile() runs.
+ MatchOnly, // Generated code takes no 'output' array; stored via set8BitCodeMatchOnly()/set16BitCodeMatchOnly().
+ IncludeSubpatterns // Generated code also takes an 'output' array; stored via set8BitCode()/set16BitCode().
+};
+void jitCompile(YarrPattern&, YarrCharSize, JSGlobalData*, YarrCodeBlock& jitObject, YarrJITCompileMode = IncludeSubpatterns);
+
+} } // namespace JSC::Yarr
+
+#endif
+
+#endif // YarrJIT_h
--- /dev/null
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef YarrParser_h
+#define YarrParser_h
+
+#include "Yarr.h"
+#include <wtf/ASCIICType.h>
+#include <wtf/text/WTFString.h>
+#include <wtf/unicode/Unicode.h>
+
+namespace JSC { namespace Yarr {
+
+#define REGEXP_ERROR_PREFIX "Invalid regular expression: "
+
+enum BuiltInCharacterClassID { // Identifies a built-in character class when it is reported to a parser delegate.
+ DigitClassID, // presumably the \d class - confirm against parseEscape()
+ SpaceClassID, // presumably the \s class - confirm against parseEscape()
+ WordClassID, // presumably the \w class - confirm against parseEscape()
+ NewlineClassID, // NOTE(review): no escape visibly maps to this here; confirm where it is used.
+};
+
+// The Parser class should not be used directly - only via the Yarr::parse() method.
+template<class Delegate, typename CharType>
+class Parser {
+private:
+ template<class FriendDelegate>
+ friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit);
+
+ enum ErrorCode {
+ NoError,
+ PatternTooLarge,
+ QuantifierOutOfOrder,
+ QuantifierWithoutAtom,
+ QuantifierTooLarge,
+ MissingParentheses,
+ ParenthesesUnmatched,
+ ParenthesesTypeInvalid,
+ CharacterClassUnmatched,
+ CharacterClassOutOfOrder,
+ EscapeUnterminated,
+ NumberOfErrorCodes
+ };
+
+ /*
+ * CharacterClassParserDelegate:
+ *
+ * The class CharacterClassParserDelegate is used in the parsing of character
+ * classes. This class handles detection of character ranges. This class
+ * implements enough of the delegate interface such that it can be passed to
+ * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
+ * to perform the parsing of escape characters in character sets.
+ */
+ class CharacterClassParserDelegate {
+ public:
+ CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) // Wraps 'delegate'; errors found while parsing the class are written through 'err'.
+ : m_delegate(delegate)
+ , m_err(err)
+ , m_state(Empty) // Nothing cached yet.
+ , m_character(0)
+ {
+ }
+
+ /*
+ * begin():
+ *
+ * Called at beginning of construction.
+ */
+ void begin(bool invert) // Forwards the start of the character class, with its inversion flag, to the wrapped delegate.
+ {
+ m_delegate.atomCharacterClassBegin(invert);
+ }
+
+ /*
+ * atomPatternCharacter():
+ *
+ * This method is called either from parseCharacterClass() (for an unescaped
+ * character in a character class), or from parseEscape(). In the former case
+ * the value true will be passed for the argument 'hyphenIsRange', and in this
+ * mode we will allow a hyphen to be treated as indicating a range (i.e. /[a-z]/
+ * is different to /[a\-z]/).
+ */
+ void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
+ {
+ switch (m_state) { // One character is held back in m_character so that a following '-' can turn it into the start of a range.
+ case AfterCharacterClass:
+ // Following a builtin character class we need look out for a hyphen.
+ // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
+ // If we see a hyphen following a character class then unlike usual
+ // we'll report it to the delegate immediately, and put ourself into
+ // a poisoned state. Any following calls to add another character or
+ // character class will result in an error. (A hyphen following a
+ // character-class is itself valid, but only at the end of a regex).
+ if (hyphenIsRange && ch == '-') {
+ m_delegate.atomCharacterClassAtom('-');
+ m_state = AfterCharacterClassHyphen;
+ return;
+ }
+ // Otherwise just fall through - cached character so treat this as Empty.
+
+ case Empty:
+ m_character = ch; // Hold the character back; it is reported once we know whether it starts a range.
+ m_state = CachedCharacter;
+ return;
+
+ case CachedCharacter:
+ if (hyphenIsRange && ch == '-')
+ m_state = CachedCharacterHyphen; // The cached character may be the start of a range; keep it cached.
+ else {
+ m_delegate.atomCharacterClassAtom(m_character); // Flush the cached character and cache the new one instead.
+ m_character = ch;
+ }
+ return;
+
+ case CachedCharacterHyphen:
+ if (ch < m_character) {
+ m_err = CharacterClassOutOfOrder; // Range end precedes range start; the state is left unchanged.
+ return;
+ }
+ m_delegate.atomCharacterClassRange(m_character, ch);
+ m_state = Empty;
+ return;
+
+ // See comment in atomBuiltInCharacterClass below.
+ // This too is technically an error, per ECMA-262, and again
+ // we chose to allow this. Note a subtlety here that while we
+ // diverge from the spec's definition of CharacterRange we do
+ // remain in compliance with the grammar. For example, consider
+ // the expression /[\d-a-z]/. We comply with the grammar in
+ // this case by not allowing a-z to be matched as a range.
+ case AfterCharacterClassHyphen:
+ m_delegate.atomCharacterClassAtom(ch); // Reported at once and not cached, so it cannot begin a range.
+ m_state = Empty;
+ return;
+ }
+ }
+
+ /*
+ * atomBuiltInCharacterClass():
+ *
+ * Adds a built-in character class, called by parseEscape().
+ */
+ void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
+ {
+ switch (m_state) {
+ case CachedCharacter:
+ // Flush the currently cached character, then fall through.
+ m_delegate.atomCharacterClassAtom(m_character);
+
+ case Empty:
+ case AfterCharacterClass:
+ // Remember that a built-in class was just added so that a following
+ // hyphen is handled by atomPatternCharacter() as a literal.
+ m_state = AfterCharacterClass;
+ m_delegate.atomCharacterClassBuiltIn(classID, invert);
+ return;
+
+ // If we hit either of these cases, we have an invalid range that
+ // looks something like /[x-\d]/ or /[\d-\d]/.
+ // According to ECMA-262 this should be a syntax error, but
+ // empirical testing shows this to break existing web content. Instead we
+ // comply with the ECMA-262 grammar, and assume the grammar to
+ // have matched the range correctly, but tweak our interpretation
+ // of CharacterRange. Effectively we implicitly handle the hyphen
+ // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
+ case CachedCharacterHyphen:
+ // Report the pending range start and the hyphen as two plain atoms.
+ m_delegate.atomCharacterClassAtom(m_character);
+ m_delegate.atomCharacterClassAtom('-');
+ // fall through
+ case AfterCharacterClassHyphen:
+ // In this state the hyphen was already reported by atomPatternCharacter().
+ m_delegate.atomCharacterClassBuiltIn(classID, invert);
+ m_state = Empty;
+ return;
+ }
+ }
+
+ /*
+ * end():
+ *
+ * Called at end of construction.
+ */
+ void end()
+ {
+     // A character may still be held back waiting to see whether it starts a
+     // range; if a hyphen was pending too, both are reported as plain atoms.
+     const bool hyphenPending = (m_state == CachedCharacterHyphen);
+     const bool characterPending = hyphenPending || (m_state == CachedCharacter);
+
+     if (characterPending)
+         m_delegate.atomCharacterClassAtom(m_character);
+     if (hyphenPending)
+         m_delegate.atomCharacterClassAtom('-');
+
+     m_delegate.atomCharacterClassEnd();
+ }
+
+ // parseEscape() should never call these delegate methods when
+ // invoked with inCharacterClass set.
+ // Stub: present only so parseEscape() can be instantiated with this delegate type.
+ NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
+ // Stub: parseEscape() only attempts a back reference when !inCharacterClass.
+ NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
+
+ private:
+ // Receiver of the atomCharacterClass*() callbacks.
+ Delegate& m_delegate;
+ // Error slot shared with the caller; written when a range is out of order.
+ ErrorCode& m_err;
+ // Tracks what was most recently seen inside the character class.
+ enum CharacterClassConstructionState {
+ Empty, // nothing is pending
+ CachedCharacter, // m_character is held back, not yet reported
+ CachedCharacterHyphen, // m_character followed by a range-forming '-'
+ AfterCharacterClass, // a built-in class (e.g. \d) was just reported
+ AfterCharacterClassHyphen, // a built-in class followed by '-' (already reported)
+ } m_state;
+ // Only meaningful in the CachedCharacter and CachedCharacterHyphen states.
+ UChar m_character;
+ };
+
+ // Sets up a parser over 'pattern'; nothing is parsed until parse() is called.
+ // 'delegate' receives the token callbacks. 'backReferenceLimit' is the largest
+ // number parseEscape() will report as a back reference.
+ Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit)
+ : m_delegate(delegate)
+ , m_backReferenceLimit(backReferenceLimit)
+ , m_err(NoError)
+ , m_data(pattern.getCharacters<CharType>())
+ , m_size(pattern.length())
+ , m_index(0)
+ , m_parenthesesNestingDepth(0)
+ {
+ }
+
+ /*
+ * parseEscape():
+ *
+ * Helper for parseTokens() AND parseCharacterClass().
+ * Unlike the other parser methods, this function does not report tokens
+ * directly to the member delegate (m_delegate), instead tokens are
+ * emitted to the delegate provided as an argument. In the case of atom
+ * escapes, parseTokens() will call parseEscape() passing m_delegate as
+ * an argument, and as such the escape will be reported to the delegate.
+ *
+ * However this method may also be used by parseCharacterClass(), in which
+ * case a CharacterClassParserDelegate will be passed as the delegate that
+ * tokens should be added to. A boolean flag is also provided to indicate
+ * whether that an escape in a CharacterClass is being parsed (some parsing
+ * rules change in this context).
+ *
+ * The boolean value returned by this method indicates whether the token
+ * parsed was an atom (outside of a character class \b and \B will be
+ * interpreted as assertions).
+ */
+ template<bool inCharacterClass, class EscapeDelegate>
+ bool parseEscape(EscapeDelegate& delegate)
+ {
+ ASSERT(!m_err);
+ ASSERT(peek() == '\\');
+ consume();
+
+ // A backslash as the very last character of the pattern is an error.
+ if (atEndOfPattern()) {
+ m_err = EscapeUnterminated;
+ return false;
+ }
+
+ switch (peek()) {
+ // Assertions
+ case 'b':
+ consume();
+ // Inside a character class this is the backspace character, not an assertion.
+ if (inCharacterClass)
+ delegate.atomPatternCharacter('\b');
+ else {
+ delegate.assertionWordBoundary(false);
+ return false;
+ }
+ break;
+ case 'B':
+ consume();
+ // Inside a character class this is a literal 'B', not an assertion.
+ if (inCharacterClass)
+ delegate.atomPatternCharacter('B');
+ else {
+ delegate.assertionWordBoundary(true);
+ return false;
+ }
+ break;
+
+ // CharacterClassEscape
+ case 'd':
+ consume();
+ delegate.atomBuiltInCharacterClass(DigitClassID, false);
+ break;
+ case 's':
+ consume();
+ delegate.atomBuiltInCharacterClass(SpaceClassID, false);
+ break;
+ case 'w':
+ consume();
+ delegate.atomBuiltInCharacterClass(WordClassID, false);
+ break;
+ case 'D':
+ consume();
+ delegate.atomBuiltInCharacterClass(DigitClassID, true);
+ break;
+ case 'S':
+ consume();
+ delegate.atomBuiltInCharacterClass(SpaceClassID, true);
+ break;
+ case 'W':
+ consume();
+ delegate.atomBuiltInCharacterClass(WordClassID, true);
+ break;
+
+ // DecimalEscape
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9': {
+ // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
+ // First, try to parse this as backreference.
+ if (!inCharacterClass) {
+ ParseState state = saveState();
+
+ unsigned backReference = consumeNumber();
+ if (backReference <= m_backReferenceLimit) {
+ delegate.atomBackReference(backReference);
+ break;
+ }
+
+ // Number too large to be a back reference: rewind to the first digit.
+ restoreState(state);
+ }
+
+ // Not a backreference, and not octal.
+ // The digit is left unconsumed: only a literal backslash is reported
+ // here and the caller's loop parses the digit as the next token.
+ if (peek() >= '8') {
+ delegate.atomPatternCharacter('\\');
+ break;
+ }
+
+ // Fall-through to handle this as an octal escape.
+ }
+
+ // Octal escape
+ case '0':
+ delegate.atomPatternCharacter(consumeOctal());
+ break;
+
+ // ControlEscape
+ case 'f':
+ consume();
+ delegate.atomPatternCharacter('\f');
+ break;
+ case 'n':
+ consume();
+ delegate.atomPatternCharacter('\n');
+ break;
+ case 'r':
+ consume();
+ delegate.atomPatternCharacter('\r');
+ break;
+ case 't':
+ consume();
+ delegate.atomPatternCharacter('\t');
+ break;
+ case 'v':
+ consume();
+ delegate.atomPatternCharacter('\v');
+ break;
+
+ // ControlLetter
+ case 'c': {
+ // Saved before consuming the 'c' so that a failed parse can rewind to it.
+ ParseState state = saveState();
+ consume();
+ if (!atEndOfPattern()) {
+ int control = consume();
+
+ // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
+ if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
+ delegate.atomPatternCharacter(control & 0x1f);
+ break;
+ }
+ }
+ // Not a valid control escape: report a literal backslash and leave
+ // the 'c' (and whatever follows) to be parsed as ordinary input.
+ restoreState(state);
+ delegate.atomPatternCharacter('\\');
+ break;
+ }
+
+ // HexEscape
+ case 'x': {
+ consume();
+ // tryConsumeHex() returns -1 and consumes nothing unless two hex digits follow.
+ int x = tryConsumeHex(2);
+ if (x == -1)
+ delegate.atomPatternCharacter('x');
+ else
+ delegate.atomPatternCharacter(x);
+ break;
+ }
+
+ // UnicodeEscape
+ case 'u': {
+ consume();
+ // As for \x, but requires four hex digits; otherwise a literal 'u' is reported.
+ int u = tryConsumeHex(4);
+ if (u == -1)
+ delegate.atomPatternCharacter('u');
+ else
+ delegate.atomPatternCharacter(u);
+ break;
+ }
+
+ // IdentityEscape
+ default:
+ delegate.atomPatternCharacter(consume());
+ }
+
+ // Every path that reaches here reported an atom to the delegate.
+ return true;
+ }
+
+ /*
+ * parseAtomEscape(), parseCharacterClassEscape():
+ *
+ * These methods alias to parseEscape().
+ */
+ bool parseAtomEscape()
+ {
+ // Outside a character class: tokens go to the parser's own delegate.
+ // Returns parseEscape()'s result (false for \b, \B and on error).
+ return parseEscape<false>(m_delegate);
+ }
+ void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
+ {
+ // Inside a character class: tokens go to the supplied class delegate and
+ // the atom/assertion result of parseEscape() is not needed.
+ parseEscape<true>(delegate);
+ }
+
+ /*
+ * parseCharacterClass():
+ *
+ * Helper for parseTokens(); calls directly and indirectly (via parseCharacterClassEscape)
+ * to an instance of CharacterClassParserDelegate, to describe the character class to the
+ * delegate.
+ */
+ void parseCharacterClass()
+ {
+     ASSERT(!m_err);
+     ASSERT(peek() == '[');
+     consume();
+
+     // The helper delegate turns raw characters into atoms and ranges; a
+     // leading '^' marks the class as inverted.
+     CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
+     characterClassConstructor.begin(tryConsume('^'));
+
+     for (;;) {
+         if (atEndOfPattern())
+             break;
+
+         const int next = peek();
+         if (next == ']') {
+             // Closing bracket: flush any pending state and finish.
+             consume();
+             characterClassConstructor.end();
+             return;
+         }
+
+         if (next == '\\')
+             parseCharacterClassEscape(characterClassConstructor);
+         else
+             characterClassConstructor.atomPatternCharacter(consume(), true);
+
+         if (m_err)
+             return;
+     }
+
+     // The pattern ended before the closing ']' was seen.
+     m_err = CharacterClassUnmatched;
+ }
+
+ /*
+ * parseParenthesesBegin():
+ *
+ * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
+ */
+ void parseParenthesesBegin()
+ {
+     ASSERT(!m_err);
+     ASSERT(peek() == '(');
+     consume();
+
+     if (!tryConsume('?')) {
+         // Plain "(": an ordinary capturing group.
+         m_delegate.atomParenthesesSubpatternBegin();
+     } else if (atEndOfPattern()) {
+         // "(?" with nothing after it.
+         m_err = ParenthesesTypeInvalid;
+         return;
+     } else {
+         // "(?" must be followed by ':', '=' or '!'.
+         const int groupType = consume();
+         if (groupType == ':')
+             m_delegate.atomParenthesesSubpatternBegin(false);
+         else if (groupType == '=')
+             m_delegate.atomParentheticalAssertionBegin();
+         else if (groupType == '!')
+             m_delegate.atomParentheticalAssertionBegin(true);
+         else
+             m_err = ParenthesesTypeInvalid;
+     }
+
+     ++m_parenthesesNestingDepth;
+ }
+
+ /*
+ * parseParenthesesEnd():
+ *
+ * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
+ */
+ void parseParenthesesEnd()
+ {
+     ASSERT(!m_err);
+     ASSERT(peek() == ')');
+     consume();
+
+     // A ')' with no open group is a parse error. Return before touching the
+     // counter: it is unsigned, so decrementing it at zero would wrap around.
+     if (!m_parenthesesNestingDepth) {
+         m_err = ParenthesesUnmatched;
+         return;
+     }
+
+     m_delegate.atomParenthesesEnd();
+     --m_parenthesesNestingDepth;
+ }
+
+ /*
+ * parseQuantifier():
+ *
+ * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
+ */
+ void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
+ {
+     ASSERT(!m_err);
+     ASSERT(min <= max);
+
+     // A minimum equal to UINT_MAX is rejected as too large.
+     if (min == UINT_MAX) {
+         m_err = QuantifierTooLarge;
+         return;
+     }
+
+     // There must be a preceding atom for the quantifier to apply to.
+     if (!lastTokenWasAnAtom) {
+         m_err = QuantifierWithoutAtom;
+         return;
+     }
+
+     // A trailing '?' makes the quantifier non-greedy.
+     const bool nonGreedy = tryConsume('?');
+     m_delegate.quantifyAtom(min, max, !nonGreedy);
+ }
+
+ /*
+ * parseTokens():
+ *
+ * This method loops over the input pattern reporting tokens to the delegate.
+ * The method returns when a parse error is detected, or the end of the pattern
+ * is reached. One piece of state is tracked around the loop, which is whether
+ * the last token passed to the delegate was an atom (this is necessary to detect
+ * a parse error when a quantifier is provided without an atom to quantify).
+ */
+ void parseTokens()
+ {
+ // Whether the most recent token can be the target of a quantifier.
+ bool lastTokenWasAnAtom = false;
+
+ while (!atEndOfPattern()) {
+ switch (peek()) {
+ case '|':
+ consume();
+ m_delegate.disjunction();
+ lastTokenWasAnAtom = false;
+ break;
+
+ case '(':
+ parseParenthesesBegin();
+ lastTokenWasAnAtom = false;
+ break;
+
+ case ')':
+ // A closed group counts as an atom, so it may be quantified.
+ parseParenthesesEnd();
+ lastTokenWasAnAtom = true;
+ break;
+
+ case '^':
+ consume();
+ m_delegate.assertionBOL();
+ lastTokenWasAnAtom = false;
+ break;
+
+ case '$':
+ consume();
+ m_delegate.assertionEOL();
+ lastTokenWasAnAtom = false;
+ break;
+
+ case '.':
+ // '.' is reported as the inverted newline class.
+ consume();
+ m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
+ lastTokenWasAnAtom = true;
+ break;
+
+ case '[':
+ parseCharacterClass();
+ lastTokenWasAnAtom = true;
+ break;
+
+ case '\\':
+ // parseAtomEscape() returns false for the \b and \B assertions (and on error).
+ lastTokenWasAnAtom = parseAtomEscape();
+ break;
+
+ case '*':
+ consume();
+ parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
+ lastTokenWasAnAtom = false;
+ break;
+
+ case '+':
+ consume();
+ parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
+ lastTokenWasAnAtom = false;
+ break;
+
+ case '?':
+ consume();
+ parseQuantifier(lastTokenWasAnAtom, 0, 1);
+ lastTokenWasAnAtom = false;
+ break;
+
+ case '{': {
+ // Saved so that an incomplete quantifier can be re-read as a literal '{'.
+ ParseState state = saveState();
+
+ consume();
+ if (peekIsDigit()) {
+ unsigned min = consumeNumber();
+ unsigned max = min;
+
+ // "{n,}" has no upper bound; "{n,m}" has an explicit one.
+ if (tryConsume(','))
+ max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
+
+ if (tryConsume('}')) {
+ if (min <= max)
+ parseQuantifier(lastTokenWasAnAtom, min, max);
+ else
+ m_err = QuantifierOutOfOrder;
+ lastTokenWasAnAtom = false;
+ break;
+ }
+ }
+
+ restoreState(state);
+ } // if we did not find a complete quantifier, fall through to the default case.
+
+ default:
+ m_delegate.atomPatternCharacter(consume());
+ lastTokenWasAnAtom = true;
+ }
+
+ // Stop at the first error reported by any of the helpers above.
+ if (m_err)
+ return;
+ }
+
+ // End of pattern reached with at least one group still open.
+ if (m_parenthesesNestingDepth > 0)
+ m_err = MissingParentheses;
+ }
+
+ /*
+ * parse():
+ *
+ * This method calls parseTokens() to parse over the input and converts any
+ * error code to a const char* for a result.
+ */
+ const char* parse()
+ {
+ // Oversized patterns are rejected up front without being scanned.
+ if (m_size > MAX_PATTERN_SIZE)
+ m_err = PatternTooLarge;
+ else
+ parseTokens();
+ ASSERT(atEndOfPattern() || m_err);
+
+ // The order of this array must match the ErrorCode enum.
+ static const char* errorMessages[NumberOfErrorCodes] = {
+ 0, // NoError
+ REGEXP_ERROR_PREFIX "regular expression too large",
+ REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
+ REGEXP_ERROR_PREFIX "nothing to repeat",
+ REGEXP_ERROR_PREFIX "number too large in {} quantifier",
+ REGEXP_ERROR_PREFIX "missing )",
+ REGEXP_ERROR_PREFIX "unmatched parentheses",
+ REGEXP_ERROR_PREFIX "unrecognized character after (?",
+ REGEXP_ERROR_PREFIX "missing terminating ] for character class",
+ REGEXP_ERROR_PREFIX "range out of order in character class",
+ REGEXP_ERROR_PREFIX "\\ at end of pattern"
+ };
+
+ // m_err indexes the table directly; NoError yields a null pointer (success).
+ return errorMessages[m_err];
+ }
+
+ // Misc helper functions:
+
+ // A saved parse position: the value of m_index at the time of saveState().
+ typedef unsigned ParseState;
+
+ // Returns the current read position so it can later be passed to restoreState().
+ ParseState saveState()
+ {
+ return m_index;
+ }
+
+ // Rewinds the read position to one previously returned by saveState().
+ void restoreState(ParseState state)
+ {
+ m_index = state;
+ }
+
+ // True once every character of the pattern has been consumed.
+ bool atEndOfPattern()
+ {
+ ASSERT(m_index <= m_size);
+ return m_index == m_size;
+ }
+
+ // Returns the character at the read position without consuming it.
+ // Must not be called at the end of the pattern (asserted).
+ int peek()
+ {
+ ASSERT(m_index < m_size);
+ return m_data[m_index];
+ }
+
+ // True if there is a next character and it is an ASCII digit; safe at end of pattern.
+ bool peekIsDigit()
+ {
+ return !atEndOfPattern() && WTF::isASCIIDigit(peek());
+ }
+
+ // Returns the numeric value (0-9) of the next character without consuming it.
+ // The next character must be a digit (asserted).
+ unsigned peekDigit()
+ {
+ ASSERT(peekIsDigit());
+ return peek() - '0';
+ }
+
+ // Returns the character at the read position and advances past it.
+ // Must not be called at the end of the pattern (asserted).
+ int consume()
+ {
+ ASSERT(m_index < m_size);
+ return m_data[m_index++];
+ }
+
+ // Consumes the next character and returns its numeric value (0-9).
+ // The next character must be a digit (asserted).
+ unsigned consumeDigit()
+ {
+ ASSERT(peekIsDigit());
+ return consume() - '0';
+ }
+
+ // Consumes a run of decimal digits and returns its value. The next character
+ // must be a digit (asserted in consumeDigit()). Consumption stops before any
+ // digit that would push the value past UINT_MAX; that digit and any after it
+ // are left unconsumed.
+ unsigned consumeNumber()
+ {
+     unsigned n = consumeDigit();
+     while (peekIsDigit()) {
+         const unsigned digit = peekDigit();
+         // Exact pre-check for n * 10 + digit > UINT_MAX. Testing the wrapped
+         // result against n is not sufficient: 1000000000 * 10 wraps to
+         // 1410065408, which is still >= 1000000000.
+         if (n > (UINT_MAX - digit) / 10)
+             break;
+         n = n * 10 + digit;
+         consume();
+     }
+     return n;
+ }
+
+ // Consumes an octal escape value. Further digits are taken only while the
+ // value so far is below 32, which bounds the result to at most three digits.
+ unsigned consumeOctal()
+ {
+     ASSERT(WTF::isASCIIOctalDigit(peek()));
+
+     unsigned value = consumeDigit();
+     for (;;) {
+         if (value >= 32 || atEndOfPattern() || !WTF::isASCIIOctalDigit(peek()))
+             break;
+         value = value * 8 + consumeDigit();
+     }
+     return value;
+ }
+
+ // Consumes the next character only if it equals ch; reports whether it did.
+ bool tryConsume(UChar ch)
+ {
+     if (!atEndOfPattern() && m_data[m_index] == ch) {
+         ++m_index;
+         return true;
+     }
+     return false;
+ }
+
+ // Tries to consume exactly 'count' hex digits. Returns their value, or -1
+ // with the read position restored if fewer than 'count' hex digits follow.
+ int tryConsumeHex(int count)
+ {
+     const ParseState start = saveState();
+
+     int value = 0;
+     for (; count; --count) {
+         const bool haveHexDigit = !atEndOfPattern() && WTF::isASCIIHexDigit(peek());
+         if (!haveHexDigit) {
+             restoreState(start);
+             return -1;
+         }
+         value = (value << 4) | WTF::toASCIIHexValue(consume());
+     }
+     return value;
+ }
+
+ // Receiver of all assertion*/atom*/quantifyAtom/disjunction callbacks.
+ Delegate& m_delegate;
+ // Largest number that parseEscape() will report as a back reference.
+ unsigned m_backReferenceLimit;
+ // First error encountered, or NoError; checked after each token.
+ ErrorCode m_err;
+ // Pattern characters, their count, and the current read position.
+ const CharType* m_data;
+ unsigned m_size;
+ unsigned m_index;
+ // Number of '(' seen that have not yet been closed by ')'.
+ unsigned m_parenthesesNestingDepth;
+
+ // Derived by empirical testing of compile time in PCRE and WREC.
+ static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
+};
+
+/*
+ * Yarr::parse():
+ *
+ * The parse method is passed a pattern to be parsed and a delegate upon which
+ * callbacks will be made to record the parsed tokens forming the regex.
+ * Yarr::parse() returns null on success, or a const C string providing an error
+ * message where a parse error occurs.
+ *
+ * The Delegate must implement the following interface:
+ *
+ * void assertionBOL();
+ * void assertionEOL();
+ * void assertionWordBoundary(bool invert);
+ *
+ * void atomPatternCharacter(UChar ch);
+ * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
+ * void atomCharacterClassBegin(bool invert)
+ * void atomCharacterClassAtom(UChar ch)
+ * void atomCharacterClassRange(UChar begin, UChar end)
+ * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
+ * void atomCharacterClassEnd()
+ * void atomParenthesesSubpatternBegin(bool capture = true);
+ * void atomParentheticalAssertionBegin(bool invert = false);
+ * void atomParenthesesEnd();
+ * void atomBackReference(unsigned subpatternId);
+ *
+ * void quantifyAtom(unsigned min, unsigned max, bool greedy);
+ *
+ * void disjunction();
+ *
+ * The regular expression is described by a sequence of assertion*() and atom*()
+ * callbacks to the delegate, describing the terms in the regular expression.
+ * Following an atom a quantifyAtom() call may occur to indicate that the previous
+ * atom should be quantified. In the case of atoms described across multiple
+ * calls (parentheses and character classes) the call to quantifyAtom() will come
+ * after the call to the atom*End() method, never after atom*Begin().
+ *
+ * Character classes may either be described by a single call to
+ * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
+ * In the latter case, ...Begin() will be called, followed by a sequence of
+ * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
+ *
+ * Sequences of atoms and assertions are broken into alternatives via calls to
+ * disjunction(). Assertions, atoms, and disjunctions emitted between calls to
+ * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
+ * atomParenthesesBegin() is passed a subpatternId. In the case of a regular
+ * capturing subpattern, this will be the subpatternId associated with these
+ * parentheses, and will also by definition be the lowest subpatternId of these
+ * parentheses and of any nested parentheses. The atomParenthesesEnd() method
+ * is passed the subpatternId of the last capturing subexpression nested within
+ * these parentheses. In the case of a capturing subpattern with no nested
+ * capturing subpatterns, the same subpatternId will be passed to the begin and
+ * end functions. In the case of non-capturing subpatterns the subpatternId
+ * passed to the begin method is also the first possible subpatternId that might
+ * be nested within these parentheses. If a set of non-capturing parentheses does
+ * not contain any capturing subpatterns, then the subpatternId passed to begin
+ * will be greater than the subpatternId passed to end.
+ */
+
+template<class Delegate>
+const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite)
+{
+    // Instantiate the parser for the character width the string actually stores.
+    return pattern.is8Bit()
+        ? Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse()
+        : Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse();
+}
+
+} } // namespace JSC::Yarr
+
+#endif // YarrParser_h
--- /dev/null
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "YarrPattern.h"
+
+#include "Yarr.h"
+#include "YarrCanonicalizeUCS2.h"
+#include "YarrParser.h"
+#include <wtf/Vector.h>
+
+using namespace WTF;
+
+namespace JSC { namespace Yarr {
+
+#include "RegExpJitTables.h"
+
+// Accumulates the contents of a character class into four sorted vectors:
+// single characters and ranges, each kept separately for ASCII (<= 0x7f) and
+// non-ASCII code units. charClass() packages the result as a CharacterClass.
+class CharacterClassConstructor {
+public:
+ CharacterClassConstructor(bool isCaseInsensitive = false)
+ : m_isCaseInsensitive(isCaseInsensitive)
+ {
+ }
+
+ // Discards everything accumulated so far.
+ void reset()
+ {
+ m_matches.clear();
+ m_ranges.clear();
+ m_matchesUnicode.clear();
+ m_rangesUnicode.clear();
+ }
+
+ // Merges the characters and ranges of an existing class into this one.
+ // Entries are copied as-is; no case folding is applied here.
+ void append(const CharacterClass* other)
+ {
+ for (size_t i = 0; i < other->m_matches.size(); ++i)
+ addSorted(m_matches, other->m_matches[i]);
+ for (size_t i = 0; i < other->m_ranges.size(); ++i)
+ addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
+ for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
+ addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
+ for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
+ addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
+ }
+
+ // Adds a single character, plus its case variants when matching case-insensitively.
+ void putChar(UChar ch)
+ {
+ // Handle ascii cases.
+ if (ch <= 0x7f) {
+ if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
+ addSorted(m_matches, toASCIIUpper(ch));
+ addSorted(m_matches, toASCIILower(ch));
+ } else
+ addSorted(m_matches, ch);
+ return;
+ }
+
+ // Simple case, not a case-insensitive match.
+ if (!m_isCaseInsensitive) {
+ addSorted(m_matchesUnicode, ch);
+ return;
+ }
+
+ // Add multiple matches, if necessary.
+ UCS2CanonicalizationRange* info = rangeInfoFor(ch);
+ if (info->type == CanonicalizeUnique)
+ addSorted(m_matchesUnicode, ch);
+ else
+ putUnicodeIgnoreCase(ch, info);
+ }
+
+ // Adds a non-ASCII character together with its canonical equivalents.
+ // 'info' must be the canonicalization entry covering ch and must not be
+ // of type CanonicalizeUnique (both asserted).
+ void putUnicodeIgnoreCase(UChar ch, UCS2CanonicalizationRange* info)
+ {
+ ASSERT(m_isCaseInsensitive);
+ ASSERT(ch > 0x7f);
+ ASSERT(ch >= info->begin && ch <= info->end);
+ ASSERT(info->type != CanonicalizeUnique);
+ if (info->type == CanonicalizeSet) {
+ // The set is a zero-terminated list; ch is reused as the loop variable.
+ for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
+ addSorted(m_matchesUnicode, ch);
+ } else {
+ addSorted(m_matchesUnicode, ch);
+ addSorted(m_matchesUnicode, getCanonicalPair(info, ch));
+ }
+ }
+
+ // Adds the inclusive range [lo, hi], split at 0x7f between the ASCII and
+ // non-ASCII vectors, plus case-equivalent ranges when case-insensitive.
+ void putRange(UChar lo, UChar hi)
+ {
+ if (lo <= 0x7f) {
+ // Clamp the upper bound to the ASCII portion of the range.
+ char asciiLo = lo;
+ char asciiHi = std::min(hi, (UChar)0x7f);
+ addSortedRange(m_ranges, lo, asciiHi);
+
+ if (m_isCaseInsensitive) {
+ // Mirror the part overlapping 'A'..'Z' onto lower case, and the
+ // part overlapping 'a'..'z' onto upper case.
+ if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
+ addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
+ if ((asciiLo <= 'z') && (asciiHi >= 'a'))
+ addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
+ }
+ }
+ if (hi <= 0x7f)
+ return;
+
+ // From here on only the non-ASCII part [max(lo, 0x80), hi] is handled.
+ lo = std::max(lo, (UChar)0x80);
+ addSortedRange(m_rangesUnicode, lo, hi);
+
+ if (!m_isCaseInsensitive)
+ return;
+
+ // Walk the canonicalization entries covering [lo, hi], one per iteration.
+ // NOTE(review): advancing with ++info assumes the entries are contiguous
+ // and ordered by code unit - confirm against YarrCanonicalizeUCS2.
+ UCS2CanonicalizationRange* info = rangeInfoFor(lo);
+ while (true) {
+ // Handle the range [lo .. end]
+ UChar end = std::min<UChar>(info->end, hi);
+
+ switch (info->type) {
+ case CanonicalizeUnique:
+ // Nothing to do - no canonical equivalents.
+ break;
+ case CanonicalizeSet: {
+ UChar ch;
+ for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
+ addSorted(m_matchesUnicode, ch);
+ break;
+ }
+ case CanonicalizeRangeLo:
+ addSortedRange(m_rangesUnicode, lo + info->value, end + info->value);
+ break;
+ case CanonicalizeRangeHi:
+ addSortedRange(m_rangesUnicode, lo - info->value, end - info->value);
+ break;
+ case CanonicalizeAlternatingAligned:
+ // Use addSortedRange since there is likely an abutting range to combine with.
+ if (lo & 1)
+ addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
+ if (!(end & 1))
+ addSortedRange(m_rangesUnicode, end + 1, end + 1);
+ break;
+ case CanonicalizeAlternatingUnaligned:
+ // Use addSortedRange since there is likely an abutting range to combine with.
+ if (!(lo & 1))
+ addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
+ if (end & 1)
+ addSortedRange(m_rangesUnicode, end + 1, end + 1);
+ break;
+ }
+
+ if (hi == end)
+ return;
+
+ // Move on to the next table entry and continue from its first code unit.
+ ++info;
+ lo = info->begin;
+ };
+
+ }
+
+ // Allocates a new CharacterClass and swaps the accumulated vectors into it.
+ // NOTE(review): the result is a raw pointer; the callers visible in this
+ // file store it in m_pattern.m_userCharacterClasses - confirm that list owns it.
+ CharacterClass* charClass()
+ {
+ CharacterClass* characterClass = new CharacterClass(0);
+
+ characterClass->m_matches.swap(m_matches);
+ characterClass->m_ranges.swap(m_ranges);
+ characterClass->m_matchesUnicode.swap(m_matchesUnicode);
+ characterClass->m_rangesUnicode.swap(m_rangesUnicode);
+
+ return characterClass;
+ }
+
+private:
+ // Inserts ch into the sorted vector 'matches'; does nothing if already present.
+ void addSorted(Vector<UChar>& matches, UChar ch)
+ {
+ unsigned pos = 0;
+ unsigned range = matches.size();
+
+ // binary chop, find position to insert char.
+ while (range) {
+ unsigned index = range >> 1;
+
+ int val = matches[pos+index] - ch;
+ if (!val)
+ return;
+ else if (val > 0)
+ range = index;
+ else {
+ pos += (index+1);
+ range -= (index+1);
+ }
+ }
+
+ if (pos == matches.size())
+ matches.append(ch);
+ else
+ matches.insert(pos, ch);
+ }
+
+ // Inserts the inclusive range [lo, hi] into the sorted vector 'ranges',
+ // merging it with any ranges it overlaps or directly abuts.
+ void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
+ {
+ unsigned end = ranges.size();
+
+ // Simple linear scan - I doubt there are that many ranges anyway...
+ // feel free to fix this with something faster (eg binary chop).
+ for (unsigned i = 0; i < end; ++i) {
+ // does the new range fall before the current position in the array
+ if (hi < ranges[i].begin) {
+ // optional optimization: concatenate appending ranges? - may not be worthwhile.
+ if (hi == (ranges[i].begin - 1)) {
+ ranges[i].begin = lo;
+ return;
+ }
+ ranges.insert(i, CharacterRange(lo, hi));
+ return;
+ }
+ // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the beginning.
+ // If the new range starts at or before the end of the last range, then they overlap (if it starts one after the
+ // end of the last range they concatenate, which is just as good).
+ if (lo <= (ranges[i].end + 1)) {
+ // found an intersect! we'll replace this entry in the array.
+ ranges[i].begin = std::min(ranges[i].begin, lo);
+ ranges[i].end = std::max(ranges[i].end, hi);
+
+ // now check if the new range can subsume any subsequent ranges.
+ unsigned next = i+1;
+ // each iteration of the loop we will either remove something from the list, or break the loop.
+ while (next < ranges.size()) {
+ if (ranges[next].begin <= (ranges[i].end + 1)) {
+ // the next entry now overlaps / concatenates this one.
+ ranges[i].end = std::max(ranges[i].end, ranges[next].end);
+ ranges.remove(next);
+ } else
+ break;
+ }
+
+ return;
+ }
+ }
+
+ // CharacterRange comes after all existing ranges.
+ ranges.append(CharacterRange(lo, hi));
+ }
+
+ // When set, putChar()/putRange() also add case-equivalent characters.
+ bool m_isCaseInsensitive;
+
+ // ASCII (<= 0x7f) single characters and ranges, kept sorted.
+ Vector<UChar> m_matches;
+ Vector<CharacterRange> m_ranges;
+ // Non-ASCII single characters and ranges, kept sorted.
+ Vector<UChar> m_matchesUnicode;
+ Vector<CharacterRange> m_rangesUnicode;
+};
+
+class YarrPatternConstructor {
+public:
+ // Binds to 'pattern' and starts it off with a new top-level disjunction
+ // holding one empty alternative; the disjunction is also recorded in
+ // m_pattern.m_disjunctions.
+ YarrPatternConstructor(YarrPattern& pattern)
+ : m_pattern(pattern)
+ , m_characterClassConstructor(pattern.m_ignoreCase)
+ , m_invertParentheticalAssertion(false)
+ {
+ m_pattern.m_body = new PatternDisjunction();
+ m_alternative = m_pattern.m_body->addNewAlternative();
+ m_pattern.m_disjunctions.append(m_pattern.m_body);
+ }
+
+ // Intentionally empty: nothing is released here.
+ ~YarrPatternConstructor()
+ {
+ }
+
+ // Resets the pattern and the character class builder, then recreates the
+ // top-level disjunction with one empty alternative, as the constructor does.
+ void reset()
+ {
+ m_pattern.reset();
+ m_characterClassConstructor.reset();
+
+ m_pattern.m_body = new PatternDisjunction();
+ m_alternative = m_pattern.m_body->addNewAlternative();
+ m_pattern.m_disjunctions.append(m_pattern.m_body);
+ }
+
+ // Records a '^' assertion. When it is the first term of the current
+ // alternative, and we are not inside an inverted parenthetical assertion,
+ // the alternative and the pattern are also flagged as BOL-anchored.
+ void assertionBOL()
+ {
+     // Logical AND is what is meant here; the operands are both boolean, so
+     // the previous bitwise '&' produced the same value but obscured the intent.
+     if (!m_alternative->m_terms.size() && !m_invertParentheticalAssertion) {
+         m_alternative->m_startsWithBOL = true;
+         m_alternative->m_containsBOL = true;
+         m_pattern.m_containsBOL = true;
+     }
+     m_alternative->m_terms.append(PatternTerm::BOL());
+ }
+ // Records a '$' assertion as a term of the current alternative.
+ void assertionEOL()
+ {
+ m_alternative->m_terms.append(PatternTerm::EOL());
+ }
+ // Records a word-boundary assertion; the parser passes invert=true for \B.
+ void assertionWordBoundary(bool invert)
+ {
+ m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
+ }
+
+ // Adds a literal character to the current alternative. For a
+ // case-insensitive pattern, a non-ASCII character that has canonical
+ // equivalents is instead added as a small character class containing
+ // all of its variants.
+ void atomPatternCharacter(UChar ch)
+ {
+     bool plainCharacter = !m_pattern.m_ignoreCase || isASCII(ch);
+     UCS2CanonicalizationRange* info = 0;
+     if (!plainCharacter) {
+         // Only consult the canonicalization table when it can matter.
+         info = rangeInfoFor(ch);
+         plainCharacter = (info->type == CanonicalizeUnique);
+     }
+
+     if (plainCharacter) {
+         m_alternative->m_terms.append(PatternTerm(ch));
+         return;
+     }
+
+     m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
+     CharacterClass* variants = m_characterClassConstructor.charClass();
+     m_pattern.m_userCharacterClasses.append(variants);
+     m_alternative->m_terms.append(PatternTerm(variants, false));
+ }
+
+ // Adds a built-in class (\d, \s, \w, or the newline class used for '.')
+ // as a standalone term, using the class object obtained from the pattern.
+ // 'invert' is stored on the term rather than selecting a different class.
+ void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
+ {
+ switch (classID) {
+ case DigitClassID:
+ m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
+ break;
+ case SpaceClassID:
+ m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
+ break;
+ case WordClassID:
+ m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
+ break;
+ case NewlineClassID:
+ m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
+ break;
+ }
+ }
+
+ // Starts a [...] class; the inversion flag is kept until atomCharacterClassEnd().
+ void atomCharacterClassBegin(bool invert = false)
+ {
+ m_invertCharacterClass = invert;
+ }
+
+ // Adds a single character to the class being built.
+ void atomCharacterClassAtom(UChar ch)
+ {
+ m_characterClassConstructor.putChar(ch);
+ }
+
+ // Adds the inclusive range [begin, end] to the class being built.
+ void atomCharacterClassRange(UChar begin, UChar end)
+ {
+ m_characterClassConstructor.putRange(begin, end);
+ }
+
+ // Merges a built-in class (\d, \s, \w or an inverted form) into the [...]
+ // class being built. Inversion is handled by appending the complementary
+ // built-in class. The newline class is never expected here.
+ void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
+ {
+     ASSERT(classID != NewlineClassID);
+
+     if (classID == DigitClassID) {
+         if (invert)
+             m_characterClassConstructor.append(m_pattern.nondigitsCharacterClass());
+         else
+             m_characterClassConstructor.append(m_pattern.digitsCharacterClass());
+         return;
+     }
+
+     if (classID == SpaceClassID) {
+         if (invert)
+             m_characterClassConstructor.append(m_pattern.nonspacesCharacterClass());
+         else
+             m_characterClassConstructor.append(m_pattern.spacesCharacterClass());
+         return;
+     }
+
+     if (classID == WordClassID) {
+         if (invert)
+             m_characterClassConstructor.append(m_pattern.nonwordcharCharacterClass());
+         else
+             m_characterClassConstructor.append(m_pattern.wordcharCharacterClass());
+         return;
+     }
+
+     ASSERT_NOT_REACHED();
+ }
+
+ // Finishes the [...] class: takes the accumulated class from the builder,
+ // records it in the pattern's user classes, and adds it as a term carrying
+ // the inversion flag saved by atomCharacterClassBegin().
+ void atomCharacterClassEnd()
+ {
+ CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
+ m_pattern.m_userCharacterClasses.append(newCharacterClass);
+ m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass));
+ }
+
+ // Opens a parenthesised subpattern and makes its first alternative current.
+ void atomParenthesesSubpatternBegin(bool capture = true)
+ {
+ // The id this group receives if it captures; for a non-capturing group
+ // it is the id the next capturing group would receive.
+ unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
+ if (capture)
+ m_pattern.m_numSubpatterns++;
+
+ // The new disjunction's parent is the alternative that contains the group.
+ PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
+ m_pattern.m_disjunctions.append(parenthesesDisjunction);
+ m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false));
+ m_alternative = parenthesesDisjunction->addNewAlternative();
+ }
+
+ // Opens a lookahead assertion - the parser passes invert=false for (?= and
+ // invert=true for (?! - and makes its first alternative current. The
+ // inversion flag is remembered until the matching atomParenthesesEnd().
+ void atomParentheticalAssertionBegin(bool invert = false)
+ {
+ PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
+ m_pattern.m_disjunctions.append(parenthesesDisjunction);
+ m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert));
+ m_alternative = parenthesesDisjunction->addNewAlternative();
+ m_invertParentheticalAssertion = invert;
+ }
+
+ // Closes the innermost open parentheses: makes the enclosing alternative
+ // current again, propagates BOL-anchoring information outwards, and
+ // records the last subpattern id seen inside the group.
+ void atomParenthesesEnd()
+ {
+     ASSERT(m_alternative->m_parent);
+     ASSERT(m_alternative->m_parent->m_parent);
+
+     PatternDisjunction* closedDisjunction = m_alternative->m_parent;
+     m_alternative = closedDisjunction->m_parent;
+
+     // The term for these parentheses is the last one of the enclosing alternative.
+     PatternTerm& parenthesesTerm = m_alternative->lastTerm();
+
+     // Count how many alternatives inside the group begin with a BOL assertion.
+     const unsigned alternativeCount = closedDisjunction->m_alternatives.size();
+     unsigned anchoredCount = 0;
+     for (unsigned index = 0; index != alternativeCount; ++index) {
+         if (closedDisjunction->m_alternatives[index]->m_startsWithBOL)
+             ++anchoredCount;
+     }
+
+     if (anchoredCount > 0) {
+         m_alternative->m_containsBOL = true;
+         // Only when every alternative is anchored is the enclosing one anchored too.
+         if (anchoredCount == alternativeCount)
+             m_alternative->m_startsWithBOL = true;
+     }
+
+     parenthesesTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
+     m_invertParentheticalAssertion = false;
+ }
+
+ // Appends a term for the back reference \subpatternId. References to a
+ // group that has not been opened yet, or that is still open around the
+ // current position, are appended as forward references instead.
+ void atomBackReference(unsigned subpatternId)
+ {
+     ASSERT(subpatternId);
+     m_pattern.m_containsBackreferences = true;
+     if (subpatternId > m_pattern.m_maxBackReference)
+         m_pattern.m_maxBackReference = subpatternId;
+
+     // The referenced group has not been seen yet.
+     if (subpatternId > m_pattern.m_numSubpatterns) {
+         m_alternative->m_terms.append(PatternTerm::ForwardReference());
+         return;
+     }
+
+     ASSERT(m_alternative);
+
+     // Walk outwards through every group that is still open. The last term of
+     // each enclosing alternative is the group we are currently inside.
+     // Note to self: if we waited until the AST was baked, we could also remove forwards refs
+     for (PatternAlternative* enclosing = m_alternative->m_parent->m_parent; enclosing; enclosing = enclosing->m_parent->m_parent) {
+         PatternTerm& openGroup = enclosing->lastTerm();
+         ASSERT((openGroup.type == PatternTerm::TypeParenthesesSubpattern) || (openGroup.type == PatternTerm::TypeParentheticalAssertion));
+
+         const bool refersToOpenGroup = (openGroup.type == PatternTerm::TypeParenthesesSubpattern)
+             && openGroup.capture()
+             && (subpatternId == openGroup.parentheses.subpatternId);
+         if (refersToOpenGroup) {
+             m_alternative->m_terms.append(PatternTerm::ForwardReference());
+             return;
+         }
+     }
+
+     m_alternative->m_terms.append(PatternTerm(subpatternId));
+ }
+
+ // Returns a deep copy of 'disjunction', registered with the pattern. When
+ // filterStartsWithBOL is true, alternatives flagged m_startsWithBOL are left
+ // out of the copy. Returns 0 (and registers nothing) if no alternative
+ // survives the filter.
+ PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
+ {
+     PatternDisjunction* clone = 0;
+
+     for (unsigned altIndex = 0; altIndex < disjunction->m_alternatives.size(); ++altIndex) {
+         PatternAlternative* source = disjunction->m_alternatives[altIndex];
+         if (filterStartsWithBOL && source->m_startsWithBOL)
+             continue;
+
+         // Create the copy lazily so a fully filtered disjunction yields 0.
+         if (!clone)
+             clone = new PatternDisjunction(disjunction->m_parent);
+
+         PatternAlternative* target = clone->addNewAlternative();
+         for (unsigned termIndex = 0; termIndex < source->m_terms.size(); ++termIndex)
+             target->m_terms.append(copyTerm(source->m_terms[termIndex], filterStartsWithBOL));
+     }
+
+     if (!clone)
+         return 0;
+
+     m_pattern.m_disjunctions.append(clone);
+     return clone;
+ }
+
+ // Returns a copy of 'term'. Terms that refer to a nested disjunction
+ // (parenthesised subpatterns and parenthetical assertions) get a deep copy
+ // of that disjunction; every other term is copied member-wise.
+ PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
+ {
+     PatternTerm duplicate(term);
+
+     const bool hasNestedDisjunction = (term.type == PatternTerm::TypeParenthesesSubpattern)
+         || (term.type == PatternTerm::TypeParentheticalAssertion);
+     if (hasNestedDisjunction)
+         duplicate.parentheses.disjunction = copyDisjunction(duplicate.parentheses.disjunction, filterStartsWithBOL);
+
+     return duplicate;
+ }
+
+ // Applies the quantifier {min,max} to the most recently appended term of the
+ // current alternative. 'greedy' selects QuantifierGreedy or
+ // QuantifierNonGreedy for the variable part of the repetition.
+ //  - max == 0: the term is removed.
+ //  - parenthetical assertion: removed when min == 0, otherwise left unquantified.
+ //  - min == 0: the term becomes a (non-)greedy term with count max.
+ //  - min == max: the term becomes a fixed-count term.
+ //  - otherwise: the term is fixed at min, and a copy quantified with the
+ //    remaining count (or quantifyInfinite) is appended after it.
+ void quantifyAtom(unsigned min, unsigned max, bool greedy)
+ {
+ ASSERT(min <= max);
+ ASSERT(m_alternative->m_terms.size());
+
+ if (!max) {
+ m_alternative->removeLastTerm();
+ return;
+ }
+
+ PatternTerm& term = m_alternative->lastTerm();
+ // Only atoms may be quantified: the enum values up to and including
+ // TypeAssertionWordBoundary are the plain assertions.
+ ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
+ ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount));
+
+ if (term.type == PatternTerm::TypeParentheticalAssertion) {
+ // If an assertion is quantified with a minimum count of zero, it can simply be removed.
+ // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never
+ // results in any input being consumed, however the continuation passed to the assertion
+ // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will
+ // reject all zero length matches (see step 2.1). A match from the continuation of the
+ // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all
+ // this is that matches from the assertion are not required, and won't be accepted anyway,
+ // so no need to ever run it.
+ if (!min)
+ m_alternative->removeLastTerm();
+ // We never need to run an assertion more than once. Subsequent iterations will be run
+ // with the same start index (since assertions are non-capturing) and the same captures
+ // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the
+ // same result and captures. If the first match succeeds then the subsequent (min - 1)
+ // matches will too. Any additional optional matches will fail (on the same basis as the
+ // minimum zero quantified assertions, above), but this will still result in a match.
+ return;
+ }
+
+ if (min == 0)
+ term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
+ else if (min == max)
+ term.quantify(min, QuantifierFixedCount);
+ else {
+ // Split into a mandatory fixed part followed by an optional variable part.
+ term.quantify(min, QuantifierFixedCount);
+ // 'term' is not used after this append; the new copy is reached through lastTerm().
+ m_alternative->m_terms.append(copyTerm(term));
+ // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
+ m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
+ // Mark the duplicated group so later passes can tell it from the original.
+ if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
+ m_alternative->lastTerm().parentheses.isCopy = true;
+ }
+ }
+
+ // Handles '|': starts a new, empty alternative in the disjunction that owns
+ // the current alternative and makes it the insertion point.
+ void disjunction()
+ {
+     PatternDisjunction* enclosing = m_alternative->m_parent;
+     m_alternative = enclosing->addNewAlternative();
+ }
+
+ // Walks the terms of 'alternative' in order and assigns each term its
+ // inputPosition and, where the term reserves YarrStackSpaceForBackTrackInfo*
+ // space, its frameLocation. Also sets alternative->m_hasFixedSize and
+ // alternative->m_minimumSize.
+ //   currentCallFrameSize  - call frame size already in use on entry.
+ //   initialInputPosition  - input offset at which this alternative starts.
+ // Returns the call frame size after all terms have been accounted for.
+ unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition)
+ {
+ alternative->m_hasFixedSize = true;
+ // Checked<> arithmetic is used for the running input offset.
+ Checked<unsigned> currentInputPosition = initialInputPosition;
+
+ for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
+ PatternTerm& term = alternative->m_terms[i];
+
+ switch (term.type) {
+ // Plain assertions only record where they are checked.
+ case PatternTerm::TypeAssertionBOL:
+ case PatternTerm::TypeAssertionEOL:
+ case PatternTerm::TypeAssertionWordBoundary:
+ term.inputPosition = currentInputPosition.unsafeGet();
+ break;
+
+ // Back references reserve frame space and make the alternative variable-sized.
+ case PatternTerm::TypeBackReference:
+ term.inputPosition = currentInputPosition.unsafeGet();
+ term.frameLocation = currentCallFrameSize;
+ currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
+ alternative->m_hasFixedSize = false;
+ break;
+
+ case PatternTerm::TypeForwardReference:
+ break;
+
+ // Fixed-count characters advance the input offset; variable counts
+ // reserve frame space instead and do not advance it.
+ case PatternTerm::TypePatternCharacter:
+ term.inputPosition = currentInputPosition.unsafeGet();
+ if (term.quantityType != QuantifierFixedCount) {
+ term.frameLocation = currentCallFrameSize;
+ currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
+ alternative->m_hasFixedSize = false;
+ } else
+ currentInputPosition += term.quantityCount;
+ break;
+
+ // Same scheme as pattern characters, with the character-class frame size.
+ case PatternTerm::TypeCharacterClass:
+ term.inputPosition = currentInputPosition.unsafeGet();
+ if (term.quantityType != QuantifierFixedCount) {
+ term.frameLocation = currentCallFrameSize;
+ currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
+ alternative->m_hasFixedSize = false;
+ } else
+ currentInputPosition += term.quantityCount;
+ break;
+
+ case PatternTerm::TypeParenthesesSubpattern:
+ // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
+ term.frameLocation = currentCallFrameSize;
+ if (term.quantityCount == 1 && !term.parentheses.isCopy) {
+ // Count of one and not a quantifier-generated copy: the nested
+ // disjunction shares this alternative's call frame.
+ if (term.quantityType != QuantifierFixedCount)
+ currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
+ currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet());
+ // If quantity is fixed, then pre-check its minimum size.
+ if (term.quantityType == QuantifierFixedCount)
+ currentInputPosition += term.parentheses.disjunction->m_minimumSize;
+ term.inputPosition = currentInputPosition.unsafeGet();
+ } else if (term.parentheses.isTerminal) {
+ // Flagged by checkForTerminalParentheses(); also shares this frame.
+ currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
+ currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet());
+ term.inputPosition = currentInputPosition.unsafeGet();
+ } else {
+ // General case: the nested disjunction is laid out from frame
+ // offset 0 and its returned frame size is not added here.
+ term.inputPosition = currentInputPosition.unsafeGet();
+ setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet());
+ currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
+ }
+ // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
+ alternative->m_hasFixedSize = false;
+ break;
+
+ // Assertions do not advance the input offset; their nested disjunction
+ // is laid out after the assertion's own frame space.
+ case PatternTerm::TypeParentheticalAssertion:
+ term.inputPosition = currentInputPosition.unsafeGet();
+ term.frameLocation = currentCallFrameSize;
+ currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet());
+ break;
+
+ // The enclosure is positioned at the start of the alternative.
+ case PatternTerm::TypeDotStarEnclosure:
+ alternative->m_hasFixedSize = false;
+ term.inputPosition = initialInputPosition;
+ break;
+ }
+ }
+
+ // Minimum size is the total input advanced by fixed-count terms.
+ alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet();
+ return currentCallFrameSize;
+ }
+
+ // Lays out every alternative of 'disjunction' starting from the same call
+ // frame size and input position, then stores on the disjunction the smallest
+ // minimum input size, the largest call frame size, and whether all
+ // alternatives have a fixed size. Returns the largest call frame size.
+ unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition)
+ {
+     // Nested disjunctions with more than one alternative reserve extra frame space.
+     const bool isPatternBody = (disjunction == m_pattern.m_body);
+     if (!isPatternBody && (disjunction->m_alternatives.size() > 1))
+         initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
+
+     unsigned smallestMinimumSize = UINT_MAX;
+     unsigned largestCallFrameSize = 0;
+     bool everyAlternativeIsFixed = true;
+
+     for (unsigned index = 0; index < disjunction->m_alternatives.size(); ++index) {
+         PatternAlternative* alternative = disjunction->m_alternatives[index];
+         const unsigned frameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition);
+
+         if (alternative->m_minimumSize < smallestMinimumSize)
+             smallestMinimumSize = alternative->m_minimumSize;
+         if (frameSize > largestCallFrameSize)
+             largestCallFrameSize = frameSize;
+         if (!alternative->m_hasFixedSize)
+             everyAlternativeIsFixed = false;
+     }
+
+     ASSERT(smallestMinimumSize != UINT_MAX);
+     ASSERT(largestCallFrameSize >= initialCallFrameSize);
+
+     disjunction->m_hasFixedSize = everyAlternativeIsFixed;
+     disjunction->m_minimumSize = smallestMinimumSize;
+     disjunction->m_callFrameSize = largestCallFrameSize;
+     return largestCallFrameSize;
+ }
+
+ // Computes input positions and call frame locations for the whole pattern,
+ // starting from an empty call frame at input offset zero.
+ void setupOffsets()
+ {
+     const unsigned initialCallFrameSize = 0;
+     const unsigned initialInputPosition = 0;
+     setupDisjunctionOffsets(m_pattern.m_body, initialCallFrameSize, initialInputPosition);
+ }
+
+ // Flags parentheses that will never need to be backtracked into, so that no
+ // state from prior iterations has to be stored for them. A term is flagged
+ // when all of the following hold:
+ //  * it is the last term of an alternative of the main body disjunction,
+ //  * it is a non-capturing group quantified unbounded greedy (*),
+ //  * the pattern contains no capturing subpatterns at all.
+ void checkForTerminalParentheses()
+ {
+     // This check is much too crude; should be just checking whether the candidate
+     // node contains nested capturing subpatterns, not the whole expression!
+     if (m_pattern.m_numSubpatterns)
+         return;
+
+     Vector<PatternAlternative*>& bodyAlternatives = m_pattern.m_body->m_alternatives;
+     for (size_t index = 0; index < bodyAlternatives.size(); ++index) {
+         Vector<PatternTerm>& terms = bodyAlternatives[index]->m_terms;
+         if (!terms.size())
+             continue;
+
+         PatternTerm& trailingTerm = terms.last();
+         const bool isUnboundedGreedyGroup = trailingTerm.type == PatternTerm::TypeParenthesesSubpattern
+             && trailingTerm.quantityType == QuantifierGreedy
+             && trailingTerm.quantityCount == quantifyInfinite
+             && !trailingTerm.capture();
+         if (isUnboundedGreedyGroup)
+             trailingTerm.parentheses.isTerminal = true;
+     }
+ }
+
+ // Unrolls patterns that contain beginning of line (^) anchoring.
+ // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
+ // The original alternatives are marked "onceThrough"; copies of the
+ // alternatives that do not start with BOL are appended after them.
+ // This relies on the parsing code tagging alternatives with m_containsBOL and
+ // m_startsWithBOL and rolling those up to containing alternatives.
+ // At this point, this is only valid for non-multiline expressions.
+ void optimizeBOL()
+ {
+     if (!m_pattern.m_containsBOL || m_pattern.m_multiline)
+         return;
+
+     PatternDisjunction* body = m_pattern.m_body;
+
+     // Copy first, filtering out BOL-anchored alternatives; may be 0 if none remain.
+     PatternDisjunction* loopingCopy = copyDisjunction(body, true);
+
+     for (unsigned index = 0; index < body->m_alternatives.size(); ++index)
+         body->m_alternatives[index]->setOnceThrough();
+
+     if (!loopingCopy)
+         return;
+
+     // Hand the copied alternatives over to the body...
+     for (unsigned index = 0; index < loopingCopy->m_alternatives.size(); ++index)
+         body->m_alternatives.append(loopingCopy->m_alternatives[index]);
+
+     // ...and empty the donor so it no longer refers to them.
+     loopingCopy->m_alternatives.clear();
+ }
+
+ // Returns true if any term of 'alternative' in the index range
+ // [firstTermIndex, lastTermIndex] (both bounds inclusive) is capturing, or is
+ // a parenthesised subpattern that contains a capturing term at any depth.
+ // An alternative without terms never contains capturing terms.
+ bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t lastTermIndex)
+ {
+     Vector<PatternTerm>& terms = alternative->m_terms;
+
+     // The inclusive range cannot describe "no terms"; bail out before indexing.
+     if (!terms.size())
+         return false;
+     ASSERT(lastTermIndex < terms.size());
+
+     for (size_t termIndex = firstTermIndex; termIndex <= lastTermIndex; ++termIndex) {
+         PatternTerm& term = terms[termIndex];
+
+         if (term.m_capture)
+             return true;
+
+         if (term.type == PatternTerm::TypeParenthesesSubpattern) {
+             PatternDisjunction* nestedDisjunction = term.parentheses.disjunction;
+             for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) {
+                 PatternAlternative* nestedAlternative = nestedDisjunction->m_alternatives[alt];
+                 const size_t nestedTermCount = nestedAlternative->m_terms.size();
+
+                 // A group may hold an empty alternative (e.g. "(?:)" or "(?:a|)").
+                 // Computing "size() - 1" for it would wrap around to SIZE_MAX and
+                 // make the recursive call index far past the end of the vector.
+                 if (!nestedTermCount)
+                     continue;
+
+                 if (containsCapturingTerms(nestedAlternative, 0, nestedTermCount - 1))
+                     return true;
+             }
+         }
+     }
+
+     return false;
+ }
+
+ // This optimization identifies alternatives in the form of
+ // [^].*[?]<expression>.*[$] for expressions that don't have any
+ // capturing terms. The alternative is changed to <expression>
+ // followed by processing of the dot stars to find and adjust the
+ // beginning and the end of the match.
+ //
+ // It only applies when the body has exactly one alternative. Both wrapper
+ // terms must use the newline character class with an unbounded count: a
+ // bounded wrapper such as ".?" or ".{0,3}" also has a greedy/non-greedy
+ // quantifier type, but it must not be replaced by a dot-star enclosure.
+ void optimizeDotStarWrappedExpressions()
+ {
+     Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives;
+     if (alternatives.size() != 1)
+         return;
+
+     PatternAlternative* alternative = alternatives[0];
+     Vector<PatternTerm>& terms = alternative->m_terms;
+     if (terms.size() >= 3) {
+         bool startsWithBOL = false;
+         bool endsWithEOL = false;
+         size_t termIndex, firstExpressionTerm, lastExpressionTerm;
+
+         // Optional leading '^'.
+         termIndex = 0;
+         if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) {
+             startsWithBOL = true;
+             ++termIndex;
+         }
+
+         // Leading wrapper: ".*" or ".*?".
+         PatternTerm& firstNonAnchorTerm = terms[termIndex];
+         if ((firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass)
+             || (firstNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass())
+             || !((firstNonAnchorTerm.quantityType == QuantifierGreedy) || (firstNonAnchorTerm.quantityType == QuantifierNonGreedy))
+             || !(firstNonAnchorTerm.quantityCount == quantifyInfinite))
+             return;
+
+         firstExpressionTerm = termIndex + 1;
+
+         // Optional trailing '$'.
+         termIndex = terms.size() - 1;
+         if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) {
+             endsWithEOL = true;
+             --termIndex;
+         }
+
+         // Trailing wrapper: greedy ".*" only.
+         PatternTerm& lastNonAnchorTerm = terms[termIndex];
+         if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass)
+             || (lastNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass())
+             || (lastNonAnchorTerm.quantityType != QuantifierGreedy)
+             || !(lastNonAnchorTerm.quantityCount == quantifyInfinite))
+             return;
+
+         lastExpressionTerm = termIndex - 1;
+
+         // Nothing between the two wrappers (or both wrappers are the same term).
+         if (firstExpressionTerm > lastExpressionTerm)
+             return;
+
+         if (!containsCapturingTerms(alternative, firstExpressionTerm, lastExpressionTerm)) {
+             // Drop the trailing wrapper (and '$') first so the leading indices stay valid.
+             for (termIndex = terms.size() - 1; termIndex > lastExpressionTerm; --termIndex)
+                 terms.remove(termIndex);
+
+             // Then drop the leading wrapper (and '^'), back to front.
+             for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex)
+                 terms.remove(termIndex - 1);
+
+             // The enclosure term remembers which anchors were present.
+             terms.append(PatternTerm(startsWithBOL, endsWithEOL));
+
+             m_pattern.m_containsBOL = false;
+         }
+     }
+ }
+
+private:
+ YarrPattern& m_pattern;
+ PatternAlternative* m_alternative;
+ CharacterClassConstructor m_characterClassConstructor;
+ bool m_invertCharacterClass;
+ bool m_invertParentheticalAssertion;
+};
+
+// Parses 'patternString' into this pattern and runs the post-parse passes
+// (terminal parentheses, dot-star enclosure, BOL unrolling, offset setup).
+// Returns 0 on success, or the error string reported by parse().
+const char* YarrPattern::compile(const String& patternString)
+{
+ YarrPatternConstructor constructor(*this);
+
+ if (const char* error = parse(constructor, patternString))
+ return error;
+
+ // If the pattern contains illegal backreferences reset & reparse.
+ // Quoting Netscape's "What's new in JavaScript 1.2",
+ // "Note: if the number of left parentheses is less than the number specified
+ // in \#, the \# is taken as an octal escape as described in the next row."
+ if (containsIllegalBackReference()) {
+ // Remember the group count from the first parse; it is passed to the reparse.
+ unsigned numSubpatterns = m_numSubpatterns;
+
+ constructor.reset();
+ // 'error' is only declared when assertions are enabled, so that it is
+ // not an unused variable when ASSERT() is disabled.
+#if !ASSERT_DISABLED
+ const char* error =
+#endif
+ parse(constructor, patternString, numSubpatterns);
+
+ ASSERT(!error);
+ ASSERT(numSubpatterns == m_numSubpatterns);
+ }
+
+ // Post-parse passes, run in this order; offsets are computed last.
+ constructor.checkForTerminalParentheses();
+ constructor.optimizeDotStarWrappedExpressions();
+ constructor.optimizeBOL();
+
+ constructor.setupOffsets();
+
+ return 0;
+}
+
+// Constructs a pattern from 'pattern' with the given flags and compiles it
+// immediately. '*error' receives the result of compile(): 0 on success,
+// otherwise an error string. 'error' must not be null.
+YarrPattern::YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error)
+    : m_ignoreCase(ignoreCase)
+    , m_multiline(multiline)
+    , m_containsBackreferences(false)
+    , m_containsBOL(false)
+    , m_numSubpatterns(0)
+    , m_maxBackReference(0)
+    // Never leave the body pointer indeterminate, even if compilation bails out early.
+    , m_body(0)
+    , newlineCached(0)
+    , digitsCached(0)
+    , spacesCached(0)
+    , wordcharCached(0)
+    , nondigitsCached(0)
+    , nonspacesCached(0)
+    , nonwordcharCached(0)
+{
+    *error = compile(pattern);
+}
+
+} }
--- /dev/null
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef YarrPattern_h
+#define YarrPattern_h
+
+#include <wtf/CheckedArithmetic.h>
+#include <wtf/RefCounted.h>
+#include <wtf/Vector.h>
+#include <wtf/text/WTFString.h>
+#include <wtf/unicode/Unicode.h>
+
+namespace JSC { namespace Yarr {
+
+struct PatternDisjunction;
+
+// A range of UChar values described by its two end points.
+struct CharacterRange {
+    UChar begin;
+    UChar end;
+
+    CharacterRange(UChar rangeBegin, UChar rangeEnd)
+        : begin(rangeBegin)
+        , end(rangeEnd)
+    {
+    }
+};
+
+// Reference-counted pairing of a lookup table pointer with an "inverted"
+// flag. Instances can only be obtained through create().
+struct CharacterClassTable : RefCounted<CharacterClassTable> {
+    const char* m_table;
+    bool m_inverted;
+
+    // Allocates a new table object and hands ownership to the returned PassRefPtr.
+    static PassRefPtr<CharacterClassTable> create(const char* table, bool inverted)
+    {
+        CharacterClassTable* instance = new CharacterClassTable(table, inverted);
+        return adoptRef(instance);
+    }
+
+private:
+    CharacterClassTable(const char* tablePointer, bool isInverted)
+        : m_table(tablePointer)
+        , m_inverted(isInverted)
+    {
+    }
+};
+
+// A set of characters, stored as individual matches plus ranges, with
+// separate vectors carrying the "Unicode" suffix.
+// NOTE(review): the exact ASCII/non-ASCII split between the plain and
+// *Unicode vectors is not visible in this header - confirm against the
+// character class constructor.
+struct CharacterClass {
+ WTF_MAKE_FAST_ALLOCATED;
+public:
+ // All CharacterClass instances have to have the full set of matches and ranges,
+ // they may have an optional table for faster lookups (which must match the
+ // specified matches and ranges)
+ CharacterClass(PassRefPtr<CharacterClassTable> table)
+ : m_table(table)
+ {
+ }
+ // Individual characters and ranges.
+ Vector<UChar> m_matches;
+ Vector<CharacterRange> m_ranges;
+ Vector<UChar> m_matchesUnicode;
+ Vector<CharacterRange> m_rangesUnicode;
+ // Optional lookup table (see constructor comment).
+ RefPtr<CharacterClassTable> m_table;
+};
+
+// How a term's quantityCount is to be interpreted.
+enum QuantifierType {
+ // Default for every newly constructed term (with a count of 1).
+ QuantifierFixedCount,
+ // Variable repetition; selected when a quantifier is greedy.
+ QuantifierGreedy,
+ // Variable repetition; selected when a quantifier is not greedy.
+ QuantifierNonGreedy,
+};
+
+// One term of a pattern alternative. 'type' selects which member of the
+// anonymous union is meaningful. Every constructor leaves the term quantified
+// as QuantifierFixedCount with a count of 1; none of them sets inputPosition
+// or frameLocation.
+struct PatternTerm {
+ // NOTE: code elsewhere compares against these enumerators by order
+ // (e.g. "type > TypeAssertionWordBoundary"), so keep the three plain
+ // assertions first.
+ enum Type {
+ TypeAssertionBOL,
+ TypeAssertionEOL,
+ TypeAssertionWordBoundary,
+ TypePatternCharacter,
+ TypeCharacterClass,
+ TypeBackReference,
+ TypeForwardReference,
+ TypeParenthesesSubpattern,
+ TypeParentheticalAssertion,
+ TypeDotStarEnclosure,
+ } type;
+ // Set only by the parentheses constructor; false for all other terms.
+ bool m_capture :1;
+ // Inversion flag for character classes, word boundaries and parenthetical assertions.
+ bool m_invert :1;
+ union {
+ // TypePatternCharacter
+ UChar patternCharacter;
+ // TypeCharacterClass
+ CharacterClass* characterClass;
+ // TypeBackReference
+ unsigned backReferenceSubpatternId;
+ // TypeParenthesesSubpattern / TypeParentheticalAssertion
+ struct {
+ PatternDisjunction* disjunction;
+ unsigned subpatternId;
+ // Not initialised by the constructor; assigned later by the pattern builder.
+ unsigned lastSubpatternId;
+ bool isCopy;
+ bool isTerminal;
+ } parentheses;
+ // TypeDotStarEnclosure
+ struct {
+ bool bolAnchor : 1;
+ bool eolAnchor : 1;
+ } anchors;
+ };
+ QuantifierType quantityType;
+ Checked<unsigned> quantityCount;
+ // Not initialised by any constructor in this struct.
+ int inputPosition;
+ unsigned frameLocation;
+
+ // Literal character term.
+ PatternTerm(UChar ch)
+ : type(PatternTerm::TypePatternCharacter)
+ , m_capture(false)
+ , m_invert(false)
+ {
+ patternCharacter = ch;
+ quantityType = QuantifierFixedCount;
+ quantityCount = 1;
+ }
+
+ // Character class term; the class pointer is stored, not copied.
+ PatternTerm(CharacterClass* charClass, bool invert)
+ : type(PatternTerm::TypeCharacterClass)
+ , m_capture(false)
+ , m_invert(invert)
+ {
+ characterClass = charClass;
+ quantityType = QuantifierFixedCount;
+ quantityCount = 1;
+ }
+
+ // Parenthesised term (subpattern or assertion) referring to 'disjunction'.
+ PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false)
+ : type(type)
+ , m_capture(capture)
+ , m_invert(invert)
+ {
+ parentheses.disjunction = disjunction;
+ parentheses.subpatternId = subpatternId;
+ parentheses.isCopy = false;
+ parentheses.isTerminal = false;
+ quantityType = QuantifierFixedCount;
+ quantityCount = 1;
+ }
+
+ // Term with no union payload; used by the static factories below.
+ PatternTerm(Type type, bool invert = false)
+ : type(type)
+ , m_capture(false)
+ , m_invert(invert)
+ {
+ quantityType = QuantifierFixedCount;
+ quantityCount = 1;
+ }
+
+ // Back reference to the subpattern with id 'spatternId'.
+ PatternTerm(unsigned spatternId)
+ : type(TypeBackReference)
+ , m_capture(false)
+ , m_invert(false)
+ {
+ backReferenceSubpatternId = spatternId;
+ quantityType = QuantifierFixedCount;
+ quantityCount = 1;
+ }
+
+ // Dot-star enclosure term recording which anchors were present.
+ PatternTerm(bool bolAnchor, bool eolAnchor)
+ : type(TypeDotStarEnclosure)
+ , m_capture(false)
+ , m_invert(false)
+ {
+ anchors.bolAnchor = bolAnchor;
+ anchors.eolAnchor = eolAnchor;
+ quantityType = QuantifierFixedCount;
+ quantityCount = 1;
+ }
+
+ static PatternTerm ForwardReference()
+ {
+ return PatternTerm(TypeForwardReference);
+ }
+
+ static PatternTerm BOL()
+ {
+ return PatternTerm(TypeAssertionBOL);
+ }
+
+ static PatternTerm EOL()
+ {
+ return PatternTerm(TypeAssertionEOL);
+ }
+
+ static PatternTerm WordBoundary(bool invert)
+ {
+ return PatternTerm(TypeAssertionWordBoundary, invert);
+ }
+
+ bool invert()
+ {
+ return m_invert;
+ }
+
+ bool capture()
+ {
+ return m_capture;
+ }
+
+ // Replaces both the count and the quantifier type.
+ void quantify(unsigned count, QuantifierType type)
+ {
+ quantityCount = count;
+ quantityType = type;
+ }
+};
+
+// One alternative of a disjunction: an ordered list of terms plus flags.
+// m_minimumSize is not set by the constructor.
+struct PatternAlternative {
+    WTF_MAKE_FAST_ALLOCATED;
+public:
+    // 'disjunction' is the disjunction this alternative belongs to.
+    PatternAlternative(PatternDisjunction* disjunction)
+        : m_parent(disjunction)
+        , m_onceThrough(false)
+        , m_hasFixedSize(false)
+        , m_startsWithBOL(false)
+        , m_containsBOL(false)
+    {
+    }
+
+    // The most recently appended term; the alternative must not be empty.
+    PatternTerm& lastTerm()
+    {
+        ASSERT(m_terms.size());
+        return m_terms.last();
+    }
+
+    // Drops the most recently appended term; the alternative must not be empty.
+    void removeLastTerm()
+    {
+        ASSERT(m_terms.size());
+        const size_t remainingTerms = m_terms.size() - 1;
+        m_terms.shrink(remainingTerms);
+    }
+
+    void setOnceThrough() { m_onceThrough = true; }
+
+    bool onceThrough() { return m_onceThrough; }
+
+    Vector<PatternTerm> m_terms;
+    PatternDisjunction* m_parent;
+    unsigned m_minimumSize;
+    bool m_onceThrough : 1;
+    bool m_hasFixedSize : 1;
+    bool m_startsWithBOL : 1;
+    bool m_containsBOL : 1;
+};
+
+// A list of alternatives. The disjunction owns its alternatives and deletes
+// them in its destructor. m_minimumSize and m_callFrameSize are not set by
+// the constructor.
+struct PatternDisjunction {
+    WTF_MAKE_FAST_ALLOCATED;
+public:
+    // 'parent' is the alternative containing this disjunction, or 0.
+    PatternDisjunction(PatternAlternative* parent = 0)
+        : m_parent(parent)
+        , m_hasFixedSize(false)
+    {
+    }
+
+    ~PatternDisjunction() { deleteAllValues(m_alternatives); }
+
+    // Appends a new, empty alternative and returns it.
+    PatternAlternative* addNewAlternative()
+    {
+        m_alternatives.append(new PatternAlternative(this));
+        return m_alternatives.last();
+    }
+
+    Vector<PatternAlternative*> m_alternatives;
+    PatternAlternative* m_parent;
+    unsigned m_minimumSize;
+    unsigned m_callFrameSize;
+    bool m_hasFixedSize;
+};
+
+// You probably don't want to be calling these functions directly
+// (please to be calling newlineCharacterClass() et al on your
+// friendly neighborhood YarrPattern instance to get nicely
+// cached copies).
+CharacterClass* newlineCreate();
+CharacterClass* digitsCreate();
+CharacterClass* spacesCreate();
+CharacterClass* wordcharCreate();
+CharacterClass* nondigitsCreate();
+CharacterClass* nonspacesCreate();
+CharacterClass* nonwordcharCreate();
+
+// A term (held by value) together with a vector of further TermChain nodes.
+struct TermChain {
+    TermChain(PatternTerm initialTerm)
+        : term(initialTerm)
+    {
+    }
+
+    PatternTerm term;
+    Vector<TermChain> hotTerms;
+};
+
+// A compiled regular expression pattern. The pattern owns every disjunction
+// in m_disjunctions and every character class in m_userCharacterClasses and
+// deletes them on destruction and on reset().
+// NOTE(review): the struct has raw owning pointers but no visible copy
+// protection - copying an instance would presumably double-delete; confirm
+// that callers never copy it.
+struct YarrPattern {
+ // Compiles 'pattern'; '*error' receives 0 on success or an error string.
+ JS_EXPORT_PRIVATE YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error);
+
+ ~YarrPattern()
+ {
+ deleteAllValues(m_disjunctions);
+ deleteAllValues(m_userCharacterClasses);
+ }
+
+ // Returns the pattern to its pre-parse state. The flags m_ignoreCase and
+ // m_multiline, and m_body, are not touched here.
+ void reset()
+ {
+ m_numSubpatterns = 0;
+ m_maxBackReference = 0;
+
+ m_containsBackreferences = false;
+ m_containsBOL = false;
+
+ // The cached classes are owned by m_userCharacterClasses (see the
+ // accessors below), so only the cache pointers are cleared here.
+ newlineCached = 0;
+ digitsCached = 0;
+ spacesCached = 0;
+ wordcharCached = 0;
+ nondigitsCached = 0;
+ nonspacesCached = 0;
+ nonwordcharCached = 0;
+
+ deleteAllValues(m_disjunctions);
+ m_disjunctions.clear();
+ deleteAllValues(m_userCharacterClasses);
+ m_userCharacterClasses.clear();
+ }
+
+ // True when a back reference names a group number larger than the
+ // number of subpatterns in the pattern.
+ bool containsIllegalBackReference()
+ {
+ return m_maxBackReference > m_numSubpatterns;
+ }
+
+ // Each accessor below creates its built-in class on first use, stores it
+ // in m_userCharacterClasses (which owns it) and caches the pointer.
+ CharacterClass* newlineCharacterClass()
+ {
+ if (!newlineCached)
+ m_userCharacterClasses.append(newlineCached = newlineCreate());
+ return newlineCached;
+ }
+ CharacterClass* digitsCharacterClass()
+ {
+ if (!digitsCached)
+ m_userCharacterClasses.append(digitsCached = digitsCreate());
+ return digitsCached;
+ }
+ CharacterClass* spacesCharacterClass()
+ {
+ if (!spacesCached)
+ m_userCharacterClasses.append(spacesCached = spacesCreate());
+ return spacesCached;
+ }
+ CharacterClass* wordcharCharacterClass()
+ {
+ if (!wordcharCached)
+ m_userCharacterClasses.append(wordcharCached = wordcharCreate());
+ return wordcharCached;
+ }
+ CharacterClass* nondigitsCharacterClass()
+ {
+ if (!nondigitsCached)
+ m_userCharacterClasses.append(nondigitsCached = nondigitsCreate());
+ return nondigitsCached;
+ }
+ CharacterClass* nonspacesCharacterClass()
+ {
+ if (!nonspacesCached)
+ m_userCharacterClasses.append(nonspacesCached = nonspacesCreate());
+ return nonspacesCached;
+ }
+ CharacterClass* nonwordcharCharacterClass()
+ {
+ if (!nonwordcharCached)
+ m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate());
+ return nonwordcharCached;
+ }
+
+ bool m_ignoreCase : 1;
+ bool m_multiline : 1;
+ bool m_containsBackreferences : 1;
+ bool m_containsBOL : 1;
+ // Number of capturing subpatterns.
+ unsigned m_numSubpatterns;
+ // Largest group number named by any back reference.
+ unsigned m_maxBackReference;
+ // Top-level disjunction of the pattern.
+ // NOTE(review): not assigned anywhere in this header - presumably set by
+ // the pattern constructor class; confirm.
+ PatternDisjunction* m_body;
+ Vector<PatternDisjunction*, 4> m_disjunctions;
+ Vector<CharacterClass*> m_userCharacterClasses;
+
+private:
+ const char* compile(const String& patternString);
+
+ // Lazily filled caches for the built-in character classes.
+ CharacterClass* newlineCached;
+ CharacterClass* digitsCached;
+ CharacterClass* spacesCached;
+ CharacterClass* wordcharCached;
+ CharacterClass* nondigitsCached;
+ CharacterClass* nonspacesCached;
+ CharacterClass* nonwordcharCached;
+};
+
+} } // namespace JSC::Yarr
+
+#endif // YarrPattern_h
--- /dev/null
+/*
+ * Copyright (C) 2011 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "YarrSyntaxChecker.h"
+
+#include "YarrParser.h"
+
+namespace JSC { namespace Yarr {
+
+// Parser delegate whose callbacks all do nothing. Running parse() with it
+// builds no pattern; only the parser's own return value is of interest.
+class SyntaxChecker {
+public:
+ void assertionBOL() {}
+ void assertionEOL() {}
+ void assertionWordBoundary(bool) {}
+ void atomPatternCharacter(UChar) {}
+ void atomBuiltInCharacterClass(BuiltInCharacterClassID, bool) {}
+ void atomCharacterClassBegin(bool = false) {}
+ void atomCharacterClassAtom(UChar) {}
+ void atomCharacterClassRange(UChar, UChar) {}
+ void atomCharacterClassBuiltIn(BuiltInCharacterClassID, bool) {}
+ void atomCharacterClassEnd() {}
+ void atomParenthesesSubpatternBegin(bool = true) {}
+ void atomParentheticalAssertionBegin(bool = false) {}
+ void atomParenthesesEnd() {}
+ void atomBackReference(unsigned) {}
+ void quantifyAtom(unsigned, unsigned, bool) {}
+ void disjunction() {}
+};
+
+// Runs the parser over 'pattern' with a delegate that ignores every callback
+// and returns whatever parse() returns.
+const char* checkSyntax(const String& pattern)
+{
+    SyntaxChecker ignoringDelegate;
+    const char* parseResult = parse(ignoringDelegate, pattern);
+    return parseResult;
+}
+
+}} // JSC::YARR
--- /dev/null
+/*
+ * Copyright (C) 2011 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef YarrSyntaxChecker_h
+#define YarrSyntaxChecker_h
+
+#include <wtf/text/WTFString.h>
+
+namespace JSC { namespace Yarr {
+
+const char* checkSyntax(const String& pattern);
+
+}} // JSC::YARR
+
+#endif // YarrSyntaxChecker_h
+
--- /dev/null
+# -------------------------------------------------------------------
+# Project file for YARR
+#
+# See 'Tools/qmake/README' for an overview of the build system
+# -------------------------------------------------------------------
+
+SOURCES += \
+ $$PWD/YarrInterpreter.cpp \
+ $$PWD/YarrPattern.cpp \
+ $$PWD/YarrSyntaxChecker.cpp \
+ $$PWD/YarrCanonicalizeUCS2.cpp
+
RegExpObject *ExecutionEngine::newRegExpObject(const QString &pattern, int flags)
{
bool global = (flags & IR::RegExp::RegExp_Global);
- QRegularExpression::PatternOptions options = 0;
+ bool ignoreCase = false;
+ bool multiline = false;
if (flags & IR::RegExp::RegExp_IgnoreCase)
- options |= QRegularExpression::CaseInsensitiveOption;
+ ignoreCase = true;
if (flags & IR::RegExp::RegExp_Multiline)
- options |= QRegularExpression::MultilineOption;
+ multiline = true;
- QRegularExpression re(pattern, options);
- return newRegExpObject(re, global);
+ return newRegExpObject(RegExp::create(this, pattern, ignoreCase, multiline), global);
}
-RegExpObject *ExecutionEngine::newRegExpObject(const QRegularExpression &re, bool global)
+RegExpObject *ExecutionEngine::newRegExpObject(PassRefPtr<RegExp> re, bool global)
{
RegExpObject *object = new (memoryManager) RegExpObject(re, global);
object->prototype = regExpPrototype;
#include <qmljs_environment.h>
#include <setjmp.h>
+#include <wtf/PassRefPtr.h>
+#include <wtf/BumpPointerAllocator.h>
+
namespace QQmlJS {
namespace Debugging {
struct TypeErrorPrototype;
struct URIErrorPrototype;
+class RegExp;
+
struct ExecutionEngine
{
MemoryManager *memoryManager;
EvalISelFactory *iselFactory;
ExecutionContext *current;
ExecutionContext *rootContext;
+ WTF::BumpPointerAllocator bumperPointerAllocator; // Used by Yarr Regex engine.
Debugging::Debugger *debugger;
FunctionObject *newDateCtor(ExecutionContext *ctx);
RegExpObject *newRegExpObject(const QString &pattern, int flags);
- RegExpObject *newRegExpObject(const QRegularExpression &re, bool global);
+ RegExpObject *newRegExpObject(PassRefPtr<RegExp> re, bool global);
FunctionObject *newRegExpCtor(ExecutionContext *ctx);
Object *newErrorObject(const Value &value);
QString n = name->toQString();
Value v = Value::undefinedValue();
if (n == QLatin1String("source"))
- v = Value::fromString(ctx, value.pattern());
+ v = Value::fromString(ctx, value->pattern());
else if (n == QLatin1String("global"))
v = Value::fromBoolean(global);
else if (n == QLatin1String("ignoreCase"))
- v = Value::fromBoolean(value.patternOptions() & QRegularExpression::CaseInsensitiveOption);
+ v = Value::fromBoolean(value->ignoreCase());
else if (n == QLatin1String("multiline"))
- v = Value::fromBoolean(value.patternOptions() & QRegularExpression::MultilineOption);
+ v = Value::fromBoolean(value->multiLine());
else if (n == QLatin1String("lastIndex"))
v = lastIndex;
if (v.type() != Value::Undefined_Type) {
*hasProperty = true;
return v;
}
-
return Object::__get__(ctx, name, hasProperty);
}
#include "qv4propertydescriptor.h"
#include "qv4propertytable.h"
#include "qv4objectiterator.h"
+#include "qv4regexp.h"
#include <QtCore/QString>
#include <QtCore/QHash>
-#include <QtCore/QRegularExpression>
#include <QtCore/QScopedPointer>
#include <cstdio>
#include <cassert>
};
struct RegExpObject: Object {
- QRegularExpression value;
+ RefPtr<RegExp> value;
Value lastIndex;
bool global;
- RegExpObject(const QRegularExpression &value, bool global): value(value), lastIndex(Value::fromInt32(0)), global(global) {}
+ RegExpObject(PassRefPtr<RegExp> value, bool global): value(value), lastIndex(Value::fromInt32(0)), global(global) {}
virtual QString className() { return QStringLiteral("RegExp"); }
virtual RegExpObject *asRegExpObject() { return this; }
virtual Value __get__(ExecutionContext *ctx, String *name, bool *hasProperty);
#include <QtCore/qmath.h>
#include <QtCore/QDateTime>
#include <QtCore/QStringList>
-#include <QtCore/QRegularExpression>
#include <QtCore/QDebug>
#include <cmath>
#include <qmath.h>
r = __qmljs_to_string(r, ctx);
bool global = false;
- QRegularExpression::PatternOptions options = QRegularExpression::NoPatternOption;
+ bool ignoreCase = false;
+ bool multiLine = false;
if (!f.isUndefined()) {
f = __qmljs_to_string(f, ctx);
QString str = f.stringValue()->toQString();
for (int i = 0; i < str.length(); ++i) {
if (str.at(i) == QChar('g') && !global) {
global = true;
- } else if (str.at(i) == QChar('i') && !(options & QRegularExpression::CaseInsensitiveOption)) {
- options |= QRegularExpression::CaseInsensitiveOption;
- } else if (str.at(i) == QChar('m') && !(options & QRegularExpression::MultilineOption)) {
- options |= QRegularExpression::MultilineOption;
+ } else if (str.at(i) == QChar('i') && !ignoreCase) {
+ ignoreCase = true;
+ } else if (str.at(i) == QChar('m') && !multiLine) {
+ multiLine = true;
} else {
ctx->throwTypeError();
}
}
}
- QRegularExpression re(r.stringValue()->toQString(), options);
- if (!re.isValid())
+ RefPtr<RegExp> re = RegExp::create(ctx->engine, r.stringValue()->toQString(), ignoreCase, multiLine);
+ if (!re->isValid())
ctx->throwTypeError();
RegExpObject *o = ctx->engine->newRegExpObject(re, global);
if (offset < 0 || offset > s.length())
return Value::nullValue();
- QRegularExpressionMatch match = r->value.match(s, offset);
- if (!match.hasMatch())
+ uint* matchOffsets = (uint*)alloca(r->value->captureCount() * 2 * sizeof(uint));
+ int result = r->value->match(s, offset, matchOffsets);
+ if (result == -1)
return Value::nullValue();
// fill in result data
ArrayObject *array = ctx->engine->newArrayObject(ctx)->asArrayObject();
- int captured = match.lastCapturedIndex();
- for (int i = 0; i <= captured; ++i)
- array->array.push_back(Value::fromString(ctx, match.captured(i)));
+ for (int i = 0; i < r->value->captureCount(); ++i) {
+ int start = matchOffsets[i * 2];
+ int end = matchOffsets[i * 2 + 1];
+ if (start != -1 && end != -1)
+ array->array.push_back(Value::fromString(ctx, s.mid(start, end - start)));
+ }
- array->__put__(ctx, QLatin1String("index"), Value::fromInt32(match.capturedStart(0)));
+ array->__put__(ctx, QLatin1String("index"), Value::fromInt32(result));
array->__put__(ctx, QLatin1String("input"), arg);
if (r->global)
- r->lastIndex = Value::fromInt32(match.capturedEnd(0));
+ r->lastIndex = Value::fromInt32(matchOffsets[1]);
return Value::fromObject(array);
}
if (!r)
ctx->throwTypeError();
- QString result = QChar('/') + r->value.pattern();
+ QString result = QChar('/') + r->value->pattern();
result += QChar('/');
- QRegularExpression::PatternOptions o = r->value.patternOptions();
// ### 'g' option missing
- if (o & QRegularExpression::CaseInsensitiveOption)
+ if (r->value->ignoreCase())
result += QChar('i');
- if (o & QRegularExpression::MultilineOption)
+ if (r->value->multiLine())
result += QChar('m');
return Value::fromString(ctx, result);
}
struct RegExpPrototype: RegExpObject
{
- RegExpPrototype(): RegExpObject(QRegularExpression(), false) {}
+ RegExpPrototype(): RegExpObject(RegExp::create(0, QString()), false) {}
void init(ExecutionContext *ctx, const Value &ctor);
static Value method_exec(ExecutionContext *ctx);
--- /dev/null
+/****************************************************************************
+**
+** Copyright (C) 2012 Digia Plc and/or its subsidiary(-ies).
+** Contact: http://www.qt-project.org/legal
+**
+** This file is part of the V4VM module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and Digia. For licensing terms and
+** conditions see http://qt.digia.com/licensing. For further information
+** use the contact form at http://qt.digia.com/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Digia gives you certain additional
+** rights. These rights are described in the Digia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3.0 as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU General Public License version 3.0 requirements will be
+** met: http://www.gnu.org/copyleft/gpl.html.
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#include "qv4regexp.h"
+
+#include "qmljs_engine.h"
+
+namespace QQmlJS {
+namespace VM {
+
+// Runs the compiled pattern against `string`, beginning at offset `start`.
+//
+// The Yarr interpreter writes its capture offsets into `matchOffsets`, and its
+// return value is passed straight back to the caller. When no byte code is
+// available (isValid() is false) the interpreter is not invoked and
+// JSC::Yarr::offsetNoMatch is returned instead.
+int RegExp::match(const QString &string, int start, uint *matchOffsets)
+{
+    if (isValid()) {
+        // Keep the converted string in a named local so the 16-bit character
+        // buffer handed to the interpreter stays alive for the whole call.
+        const WTF::String input(string);
+        return JSC::Yarr::interpret(m_byteCode.get(), input.characters16(), string.length(), start, matchOffsets);
+    }
+
+    return JSC::Yarr::offsetNoMatch;
+}
+
+// Stores the pattern text and flags, then tries to compile the pattern to
+// Yarr byte code.
+//
+// Compilation is skipped when `engine` is null, and abandoned when the Yarr
+// pattern parser reports an error. In both cases m_byteCode stays empty and
+// m_subPatternCount stays 0, so isValid() reports false afterwards.
+RegExp::RegExp(ExecutionEngine* engine, const QString &pattern, bool ignoreCase, bool multiline)
+    : m_pattern(pattern)
+    , m_subPatternCount(0)
+    , m_ignoreCase(ignoreCase)
+    , m_multiLine(multiline)
+{
+    if (engine) {
+        const char* parseError = 0;
+        JSC::Yarr::YarrPattern parsed(WTF::String(pattern), ignoreCase, multiline, &parseError);
+        if (!parseError) {
+            m_subPatternCount = parsed.m_numSubpatterns;
+            // The byte compiler uses the engine-owned bump pointer allocator.
+            m_byteCode = JSC::Yarr::byteCompile(parsed, &engine->bumperPointerAllocator);
+        }
+    }
+}
+
+} // end of namespace VM
+} // end of namespace QQmlJS
+
+
--- /dev/null
+/****************************************************************************
+**
+** Copyright (C) 2012 Digia Plc and/or its subsidiary(-ies).
+** Contact: http://www.qt-project.org/legal
+**
+** This file is part of the V4VM module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and Digia. For licensing terms and
+** conditions see http://qt.digia.com/licensing. For further information
+** use the contact form at http://qt.digia.com/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Digia gives you certain additional
+** rights. These rights are described in the Digia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3.0 as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU General Public License version 3.0 requirements will be
+** met: http://www.gnu.org/copyleft/gpl.html.
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+#ifndef QV4REGEXP_H
+#define QV4REGEXP_H
+
+#include <QString>
+#include <QVector>
+
+#include <wtf/RefCounted.h>
+#include <wtf/RefPtr.h>
+#include <wtf/FastAllocBase.h>
+#include <wtf/BumpPointerAllocator.h>
+
+#include <limits.h>
+
+#include <yarr/Yarr.h>
+#include <yarr/YarrInterpreter.h>
+
+namespace QQmlJS {
+namespace VM {
+
+struct ExecutionEngine;
+
+// Reference-counted wrapper around a pattern compiled to Yarr byte code,
+// together with the pattern text and the flags it was created with.
+class RegExp : public RefCounted<RegExp>
+{
+public:
+    // Sole way to obtain an instance (the constructor is private); the new
+    // object is adopted into a ref-counting pointer before it is handed out.
+    static PassRefPtr<RegExp> create(ExecutionEngine* engine, const QString& pattern, bool ignoreCase = false, bool multiline = false)
+    {
+        RegExp* fresh = new RegExp(engine, pattern, ignoreCase, multiline);
+        return adoptRef(fresh);
+    }
+
+    // Pattern text and flags exactly as they were passed to create().
+    QString pattern() const { return m_pattern; }
+    bool ignoreCase() const { return m_ignoreCase; }
+    bool multiLine() const { return m_multiLine; }
+
+    // True when compiled byte code is held.
+    bool isValid() const { return m_byteCode.get() != 0; }
+
+    // Stored sub-pattern count plus one.
+    int captureCount() const { return 1 + m_subPatternCount; }
+
+    // Matches `string` from offset `start`, filling `matchOffsets`.
+    // NOTE(review): callers appear to size matchOffsets as captureCount() * 2
+    // entries (one start/end pair per capture) - confirm against the Yarr
+    // interpreter's requirements.
+    int match(const QString& string, int start, uint *matchOffsets);
+
+private:
+    Q_DISABLE_COPY(RegExp);
+    RegExp(ExecutionEngine* engine, const QString& pattern, bool ignoreCase, bool multiline);
+
+    QString m_pattern;
+    OwnPtr<JSC::Yarr::BytecodePattern> m_byteCode; // empty when nothing was compiled
+    int m_subPatternCount;
+    bool m_ignoreCase;
+    bool m_multiLine;
+};
+
+} // end of namespace VM
+} // end of namespace QQmlJS
+
+#endif // QV4REGEXP_H
S15.10.2.3_A1_T2 failing
S15.10.2.5_A1_T4 failing
S15.10.2.12_A6_T1 failing
-S15.10.2.13_A2_T1 failing
-S15.10.2.13_A2_T2 failing
-S15.10.2.13_A2_T8 failing
S15.10.2.15_A1_T1 failing
S15.10.2.15_A1_T10 failing
S15.10.2.15_A1_T11 failing
15.10.2.15-6-1 failing
15.10.2.2-1 failing
15.10.2.5-3-1 failing
-S15.10.2.10_A2.1_T3 failing
-S15.10.2.10_A4.1_T1 failing
-S15.10.2.10_A4.1_T2 failing
-S15.10.2.10_A4.1_T3 failing
-S15.10.2.10_A5.1_T1 failing
-S15.10.2.11_A1_T5 failing
-S15.10.2.11_A1_T7 failing
S15.10.2.12_A1_T1 failing
-S15.10.2.12_A1_T2 failing
-S15.10.2.12_A1_T5 failing
S15.10.2.12_A2_T1 failing
-S15.10.2.12_A2_T2 failing
-S15.10.2.12_A2_T5 failing
S15.10.2.12_A3_T1 failing
S15.10.2.12_A4_T1 failing
S15.10.2.12_A5_T1 failing
S15.10.4.1_A5_T7 failing
S15.10.4.1_A5_T8 failing
S15.10.4.1_A5_T9 failing
-S15.10.4.1_A8_T2 failing
S15.10.4.1_A9_T1 failing
S15.10.4.1_A9_T2 failing
S15.10.4.1_A9_T3 failing
15.4.4.14-9-b-i-5 failing
15.4.4.16-7-c-i-6 failing
15.4.4.17-7-c-i-6 failing
-
qv4managed.cpp \
qv4array.cpp \
qv4string.cpp \
- qv4objectiterator.cpp
+ qv4objectiterator.cpp \
+ qv4regexp.cpp
HEADERS += \
qv4codegen_p.h \
qv4string.h \
qv4propertydescriptor.h \
qv4propertytable.h \
- qv4objectiterator.h
+ qv4objectiterator.h \
+ qv4regexp.h
llvm {