Add Yarr regex engine from JSC

author Simon Hausmann <simon.hausmann@digia.com>

Mon, 14 Jan 2013 15:53:43 +0000 (16:53 +0100)

committer Simon Hausmann <simon.hausmann@digia.com>

Thu, 17 Jan 2013 13:24:04 +0000 (14:24 +0100)
author Simon Hausmann <simon.hausmann@digia.com>
Mon, 14 Jan 2013 15:53:43 +0000 (16:53 +0100)
committer Simon Hausmann <simon.hausmann@digia.com>
Thu, 17 Jan 2013 13:24:04 +0000 (14:24 +0100)
diff --git a/.gitignore b/.gitignore

index 17784d7c7e34e5b78e1a9758bc251ed9fd8a96ab..ec496ad5797680f9f0562697711e5f578c34a5a6 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ udis86_itab.*
  *.pro.user
  *.bc
  *.ll
+RegExpJitTables.h
diff --git a/masm/config.h b/masm/config.h

index 1ced4e454c631cc2a005f9832eb2f5bc754d107c..5f59f311e3ac95e7b750ea5f138b042efbb92625 100644 (file)
--- a/masm/config.h
+++ b/masm/config.h
@@ -45,6 +45,8 @@
  #include <wtf/Platform.h>
  #ifdef __cplusplus
  #include <wtf/Vector.h>
+#include <wtf/FastAllocBase.h>
+#include <wtf/RefPtr.h>
  #include <cmath>
  #else
  #include <math.h>
diff --git a/masm/create_regex_tables b/masm/create_regex_tables

new file mode 100644 (file)

index 0000000..bd799ba
--- /dev/null
+++ b/masm/create_regex_tables
@@ -0,0 +1,121 @@
+# Copyright (C) 2010 Apple Inc. All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+# 
+# THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+import sys
+
+# Character classes to generate, keyed by class name. Each "data" entry is a
+# single character, a single code point, or an inclusive (first, last) range
+# given as characters or code points. "UseTable" asks for a 65536-entry lookup
+# table; "Inverse" names the class whose table is referenced (flagged as
+# inverted) instead of emitting a second table.
+types = {
+    "wordchar": { "UseTable" : True, "data": ['_', ('0','9'), ('A', 'Z'), ('a','z')]},
+    "nonwordchar": { "UseTable" : True, "Inverse": "wordchar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0xffff)]},
+    "newline": { "UseTable" : False, "data": ['\n', '\r', 0x2028, 0x2029]},
+    "spaces": { "UseTable" : True, "data": [' ', ('\t', '\r'), 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000, (0x2000, 0x200a), 0xfeff]},
+    "nonspaces": { "UseTable" : True, "Inverse": "spaces", "data": [(0, ord('\t') - 1), (ord('\r') + 1, ord(' ') - 1), (ord(' ') + 1, 0x009f), (0x00a1, 0x167f), (0x1681, 0x180d), (0x180f, 0x1fff), (0x200b, 0x2027), (0x202a, 0x202e), (0x2030, 0x205e), (0x2060, 0x2fff), (0x3001, 0xfefe), (0xff00, 0xffff)]},
+    "digits": { "UseTable" : False, "data": [('0', '9')]},
+    "nondigits": { "UseTable" : False, "Inverse": "digits", "data": [(0, ord('0') - 1), (ord('9') + 1, 0xffff)] }
+}
+entriesPerLine = 50
+tableSize = 0x10000
+noTablesFlag = "--no-tables"
+emitTables = (len(sys.argv) < 2 or sys.argv[1] != noTablesFlag)
+
+# The last command line argument names the output file. The flag itself is
+# never an output path: without this check "create_regex_tables --no-tables"
+# would write its result into a file literally called "--no-tables" instead
+# of printing it to stdout.
+outputPath = None
+if len(sys.argv) > 1 and sys.argv[-1] != noTablesFlag:
+    outputPath = sys.argv[-1]
+
+
+def toCodePoint(value):
+    """Return value as an integer code point (accepts a one-character str or an int)."""
+    if isinstance(value, str):
+        return ord(value)
+    return value
+
+
+def collectRanges(data):
+    """Return the sorted, inclusive (first, last) code point ranges described by data.
+
+    A range that straddles the ASCII boundary is split into an ASCII part
+    ending at 0x7f and a non-ASCII part starting at 0x80, because the
+    generated code stores ASCII and non-ASCII ranges in separate lists.
+    """
+    ranges = []
+    for entry in data:
+        if isinstance(entry, (str, int)):
+            first = last = toCodePoint(entry)
+        else:
+            first, last = [toCodePoint(bound) for bound in entry]
+        if first <= 0x7f < last:
+            ranges.append((first, 0x7f))
+            first = 0x80
+        ranges.append((first, last))
+    ranges.sort()
+    return ranges
+
+
+def buildTable(name, ranges):
+    """Return the C source of the 65536-entry membership table for class name."""
+    cells = ["0"] * tableSize
+    for (first, last) in ranges:
+        for codePoint in range(first, last + 1):
+            cells[codePoint] = "1"
+    pieces = ["static const char _%sData[65536] = {\n" % name]
+    for index, cell in enumerate(cells):
+        pieces.append(cell)
+        # Every entry except the last is followed by a comma, and each full
+        # row of entriesPerLine entries is followed by a line break.
+        if index + 1 < tableSize:
+            pieces.append(",")
+            if (index + 1) % entriesPerLine == 0:
+                pieces.append("\n")
+    pieces.append("\n};\n\n")
+    return "".join(pieces)
+
+
+def buildCreateFunction(name, classes, ranges):
+    """Return the C++ source of the <name>Create() factory for one character class."""
+    lines = ["CharacterClass* %sCreate()\n" % name, "{\n"]
+    if emitTables and classes["UseTable"]:
+        if "Inverse" in classes:
+            # Inverse classes share the table of the class they invert.
+            lines.append("    CharacterClass* characterClass = new CharacterClass(CharacterClassTable::create(_%sData, true));\n" % (classes["Inverse"]))
+        else:
+            lines.append("    CharacterClass* characterClass = new CharacterClass(CharacterClassTable::create(_%sData, false));\n" % (name))
+    else:
+        lines.append("    CharacterClass* characterClass = new CharacterClass(0);\n")
+    for (first, last) in ranges:
+        isUnicode = (first > 127) or (last > 127)
+        if first == last:
+            if isUnicode:
+                lines.append("    characterClass->m_matchesUnicode.append(0x%04x);\n" % first)
+            else:
+                lines.append("    characterClass->m_matches.append(0x%02x);\n" % first)
+        elif isUnicode:
+            lines.append("    characterClass->m_rangesUnicode.append(CharacterRange(0x%04x, 0x%04x));\n" % (first, last))
+        else:
+            lines.append("    characterClass->m_ranges.append(CharacterRange(0x%02x, 0x%02x));\n" % (first, last))
+    lines.append("    return characterClass;\n")
+    lines.append("}\n\n")
+    return "".join(lines)
+
+
+arrays = ""
+functions = ""
+for name, classes in types.items():
+    ranges = collectRanges(classes["data"])
+    # Only non-inverse classes own a table; inverse classes reference it.
+    if emitTables and classes["UseTable"] and "Inverse" not in classes:
+        arrays += buildTable(name, ranges)
+    functions += buildCreateFunction(name, classes, ranges)
+
+if outputPath is not None:
+    # 'with' closes the file even if a write fails.
+    with open(outputPath, "w") as f:
+        f.write(arrays)
+        f.write(functions)
+else:
+    print(arrays)
+    print(functions)
+
diff --git a/masm/masm.pri b/masm/masm.pri

index d0033f59c1fb1557a176c0f657326e196fd535ea..a6d11f633c9f541286002e284e2bf48ec77b4b32 100644 (file)
--- a/masm/masm.pri
+++ b/masm/masm.pri
@@ -30,7 +30,7 @@ HEADERS += $$PWD/wtf/PageReservation.h
  SOURCES += $$PWD/stubs/WTFStubs.cpp
  HEADERS += $$PWD/stubs/WTFStubs.h
  
-DEFINES += WTF_EXPORT_PRIVATE=""
+DEFINES += WTF_EXPORT_PRIVATE="" JS_EXPORT_PRIVATE=""
  
  DEFINES += ENABLE_LLINT=0
  DEFINES += ENABLE_DFG_JIT=0
@@ -42,6 +42,7 @@ DEFINES += BUILDING_QT__
  
  INCLUDEPATH += $$PWD/jit
  INCLUDEPATH += $$PWD/assembler
+INCLUDEPATH += $$PWD/runtime
  INCLUDEPATH += $$PWD/wtf
  INCLUDEPATH += $$PWD/stubs
  INCLUDEPATH += $$PWD/stubs/wtf
@@ -61,6 +62,21 @@ SOURCES += $$PWD/disassembler/udis86/udis86_syn-att.c
  SOURCES += $$PWD/disassembler/udis86/udis86_syn.c
  SOURCES += $$PWD/disassembler/udis86/udis86_syn-intel.c
  
+DEFINES += ENABLE_YARR_JIT=0
+SOURCES += \
+    $$PWD/yarr/YarrCanonicalizeUCS2.cpp \
+    $$PWD/yarr/YarrInterpreter.cpp \
+    $$PWD/yarr/YarrPattern.cpp \
+    $$PWD/yarr/YarrSyntaxChecker.cpp
+
+HEADERS += $$PWD/yarr/*.h
+
+retgen.output = RegExpJitTables.h
+retgen.script = $$PWD/create_regex_tables
+retgen.input = retgen.script
+retgen.CONFIG += no_link
+retgen.commands = python $$retgen.script > ${QMAKE_FILE_OUT}
+QMAKE_EXTRA_COMPILERS += retgen
  
  ITAB = $$PWD/disassembler/udis86/optable.xml
  udis86.output = udis86_itab.h
diff --git a/masm/runtime/MatchResult.h b/masm/runtime/MatchResult.h

new file mode 100644 (file)

index 0000000..d87c851
--- /dev/null
+++ b/masm/runtime/MatchResult.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2012 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef MatchResult_h
+#define MatchResult_h
+
+typedef uint64_t EncodedMatchResult;
+
+// Start and end positions of a match attempt. A failed match is represented
+// by start == WTF::notFound (see failed()).
+struct MatchResult {
+    ALWAYS_INLINE MatchResult(size_t start, size_t end)
+        : start(start)
+        , end(end)
+    {
+    }
+
+    // Unpacks a result that was packed into a single 64-bit value by
+    // reinterpreting the encoded bits as the { start, end } pair.
+    // NOTE(review): the 64-bit 'encoded' member only overlays both fields
+    // when size_t is 32 bits wide; with a 64-bit size_t 'end' is read from
+    // bytes that were never written. Confirm this constructor is only used
+    // on targets where sizeof(size_t) == 4.
+    explicit ALWAYS_INLINE MatchResult(EncodedMatchResult encoded)
+    {
+        union u {
+            uint64_t encoded;
+            struct s {
+                size_t start;
+                size_t end;
+            } split;
+        } value;
+        value.encoded = encoded;
+        start = value.split.start;
+        end = value.split.end;
+    }
+
+    // The canonical "no match" value.
+    ALWAYS_INLINE static MatchResult failed()
+    {
+        return MatchResult(WTF::notFound, 0);
+    }
+
+    // True when the match succeeded. Const-qualified so that results held
+    // by const reference can be tested as well.
+    ALWAYS_INLINE operator bool() const
+    {
+        return start != WTF::notFound;
+    }
+
+    // True when start and end coincide.
+    ALWAYS_INLINE bool empty() const
+    {
+        return start == end;
+    }
+
+    size_t start;
+    size_t end;
+};
+
+#endif
diff --git a/masm/stubs/wtf/PassOwnPtr.h b/masm/stubs/wtf/PassOwnPtr.h

index c8d3bf3f750abbf4772349ab3c5e0b5ec7599ad9..f9b84e7b57ef82a8975ac6d12d7926e4414efe8b 100644 (file)
--- a/masm/stubs/wtf/PassOwnPtr.h
+++ b/masm/stubs/wtf/PassOwnPtr.h
@@ -43,7 +43,30 @@
  
  #include <qscopedpointer.h>
  
-#define OwnPtr QScopedPointer
+template <typename T> class PassOwnPtr;
+template <typename PtrType> PassOwnPtr<PtrType> adoptPtr(PtrType*);
+
+// Minimal stand-in for WTF's OwnPtr, implemented on top of QScopedPointer.
+template <typename T>
+struct OwnPtr : public QScopedPointer<T>
+{
+    OwnPtr() {}
+    // Adopts the pointer handed over by a PassOwnPtr (obtained via leakRef()).
+    OwnPtr(const PassOwnPtr<T> &ptr)
+        : QScopedPointer<T>(ptr.leakRef())
+    {}
+
+    // Assignment transfers ownership rather than copying: the pointer is
+    // take()n out of 'other' (hence the const_cast on the const reference)
+    // and this object is reset() to it.
+    OwnPtr& operator=(const OwnPtr<T>& other)
+    {
+        this->reset(const_cast<OwnPtr<T> &>(other).take());
+        return *this;
+    }
+
+    // WTF-style accessor; forwards to QScopedPointer::data().
+    T* get() const { return this->data(); }
+
+    // Gives up ownership, wrapping the take()n pointer with adoptPtr().
+    PassOwnPtr<T> release()
+    {
+        return adoptPtr(this->take());
+    }
+};
  
  template <typename T>
  class PassOwnPtr {
diff --git a/masm/stubs/wtf/RefCounted.h b/masm/stubs/wtf/RefCounted.h

index f905ace8ad0738d2c0cab519023670eaf0493d63..4fc9ad9074fb4123ef2f07fac4c63b349cac81a0 100644 (file)
--- a/masm/stubs/wtf/RefCounted.h
+++ b/masm/stubs/wtf/RefCounted.h
@@ -41,6 +41,8 @@
  #ifndef REFCOUNTED_H
  #define REFCOUNTED_H
  
+#include "PassRefPtr.h"
+
  template <typename Base>
  class RefCounted {
  public:
diff --git a/masm/stubs/wtf/Vector.h b/masm/stubs/wtf/Vector.h

index 2682824da8600568c307a9e3cb371e88f6f8fe28..1feea851e1168aa83179f6c042a1c33cc52d3794 100644 (file)
--- a/masm/stubs/wtf/Vector.h
+++ b/masm/stubs/wtf/Vector.h
@@ -43,6 +43,8 @@
  
  #include <vector>
  #include <wtf/Assertions.h>
+#include <wtf/NotFound.h>
+#include <qalgorithms.h>
  
  namespace WTF {
  
@@ -55,12 +57,39 @@ public:
      inline void append(const T& value)
      { this->push_back(value); }
  
+    // Appends copies of all elements of 'vector' to the end of this vector.
+    inline void append(const Vector<T>& vector)
+    {
+        this->insert(this->end(), vector.begin(), vector.end());
+    }
+
+    using std::vector<T>::insert;
+
+    // Index-based insert: places 'value' before the element currently at
+    // 'position' by forwarding to the iterator-based std::vector::insert.
+    inline void insert(size_t position, T value)
+    { this->insert(this->begin() + position, value); }
+
      inline void grow(size_t size)
      { this->resize(size); }
+
+    // Truncates the vector to its first 'size' elements by erasing the tail.
+    // NOTE(review): 'size' must not exceed the current size, otherwise
+    // begin() + size lies past end().
+    inline void shrink(size_t size)
+    { this->erase(this->begin() + size, this->end()); }
+
+    // Erases the single element at index 'position'.
+    inline void remove(size_t position)
+    { this->erase(this->begin() + position); }
+
+    // WTF-style spelling of std::vector::empty().
+    inline bool isEmpty() const
+    {
+        return this->begin() == this->end();
+    }
+
+    // Returns a reference to the final element; the vector must not be empty.
+    inline T &last()
+    {
+        return this->back();
+    }
  };
  
+// Passes the vector to qDeleteAll(); the vector itself is taken by const
+// reference and is not modified here.
+template <typename T, int capacity>
+void deleteAllValues(const Vector<T, capacity> &vector)
+{
+    qDeleteAll(vector);
+}
+
  }
  
  using WTF::Vector;
+using WTF::deleteAllValues;
  
  #endif // VECTOR_H
diff --git a/masm/stubs/wtf/text/CString.h b/masm/stubs/wtf/text/CString.h

new file mode 100644 (file)

index 0000000..c9a65e5
--- /dev/null
+++ b/masm/stubs/wtf/text/CString.h
@@ -0,0 +1,44 @@
+/****************************************************************************
+**
+** Copyright (C) 2012 Digia Plc and/or its subsidiary(-ies).
+** Contact: http://www.qt-project.org/legal
+**
+** This file is part of the V4VM module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and Digia.  For licensing terms and
+** conditions see http://qt.digia.com/licensing.  For further information
+** use the contact form at http://qt.digia.com/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Digia gives you certain additional
+** rights.  These rights are described in the Digia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3.0 as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU General Public License version 3.0 requirements will be
+** met: http://www.gnu.org/copyleft/gpl.html.
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+#ifndef CSTRING_H
+#define CSTRING_H
+
+#endif // CSTRING_H
diff --git a/masm/stubs/wtf/text/WTFString.h b/masm/stubs/wtf/text/WTFString.h

new file mode 100644 (file)

index 0000000..d157dc7
--- /dev/null
+++ b/masm/stubs/wtf/text/WTFString.h
@@ -0,0 +1,75 @@
+/****************************************************************************
+**
+** Copyright (C) 2012 Digia Plc and/or its subsidiary(-ies).
+** Contact: http://www.qt-project.org/legal
+**
+** This file is part of the V4VM module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and Digia.  For licensing terms and
+** conditions see http://qt.digia.com/licensing.  For further information
+** use the contact form at http://qt.digia.com/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Digia gives you certain additional
+** rights.  These rights are described in the Digia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3.0 as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU General Public License version 3.0 requirements will be
+** met: http://www.gnu.org/copyleft/gpl.html.
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+#ifndef WTFSTRING_H
+#define WTFSTRING_H
+
+#include <QString>
+#include <wtf/ASCIICType.h>
+#include <wtf/unicode/Unicode.h>
+
+namespace WTF {
+
+// Stand-in for WTF::String backed by QString. It always reports itself as a
+// 16-bit string: is8Bit() is false and characters8() returns a null pointer.
+class String : public QString
+{
+public:
+    String(const QString& s) : QString(s) {}
+    bool is8Bit() const { return false; }
+    const unsigned char *characters8() const { return 0; }
+    // Reinterprets QString's constData() as an array of UChar.
+    const UChar *characters16() const { return reinterpret_cast<const UChar*>(constData()); }
+
+    // Character-type dispatch; only the unsigned char and UChar
+    // specializations that follow the class are defined.
+    template <typename T>
+    const T* getCharacters() const;
+
+};
+
+// 8-bit specialization: forwards to characters8().
+template <>
+inline const unsigned char* String::getCharacters<unsigned char>() const { return characters8(); }
+// 16-bit specialization: forwards to characters16().
+template <>
+inline const UChar* String::getCharacters<UChar>() const { return characters16(); }
+
+}
+
+// Don't import WTF::String into the global namespace to avoid conflicts with QQmlJS::VM::String
+namespace JSC {
+    using WTF::String;
+}
+
+#endif // WTFSTRING_H
diff --git a/masm/stubs/wtf/unicode/Unicode.h b/masm/stubs/wtf/unicode/Unicode.h

new file mode 100644 (file)

index 0000000..d61bc64
--- /dev/null
+++ b/masm/stubs/wtf/unicode/Unicode.h
@@ -0,0 +1,59 @@
+/****************************************************************************
+**
+** Copyright (C) 2012 Digia Plc and/or its subsidiary(-ies).
+** Contact: http://www.qt-project.org/legal
+**
+** This file is part of the V4VM module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and Digia.  For licensing terms and
+** conditions see http://qt.digia.com/licensing.  For further information
+** use the contact form at http://qt.digia.com/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Digia gives you certain additional
+** rights.  These rights are described in the Digia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3.0 as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU General Public License version 3.0 requirements will be
+** met: http://www.gnu.org/copyleft/gpl.html.
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+#ifndef UNICODE_H
+#define UNICODE_H
+
+#include <QChar>
+
+typedef unsigned char LChar;
+typedef uint16_t UChar;
+
+// Case conversion helpers for single UTF-16 code units, delegating to the
+// static QChar conversion functions.
+namespace Unicode {
+    // Lower-case form of 'ch' as computed by QChar::toLower().
+    inline UChar toLower(UChar ch)
+    {
+        const UChar lowered = QChar::toLower(ch);
+        return lowered;
+    }
+
+    // Upper-case form of 'ch' as computed by QChar::toUpper().
+    inline UChar toUpper(UChar ch)
+    {
+        const UChar raised = QChar::toUpper(ch);
+        return raised;
+    }
+}
+
+#endif // UNICODE_H
diff --git a/masm/wtf/ASCIICType.h b/masm/wtf/ASCIICType.h

new file mode 100644 (file)

index 0000000..18e108e
--- /dev/null
+++ b/masm/wtf/ASCIICType.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright (C) 2007, 2008, 2009, 2011 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1.  Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer. 
+ * 2.  Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution. 
+ * 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
+ *     its contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef WTF_ASCIICType_h
+#define WTF_ASCIICType_h
+
+#include <wtf/Assertions.h>
+
+// The behavior of many of the functions in the <ctype.h> header is dependent
+// on the current locale. But in the WebKit project, all uses of those functions
+// are in code processing something that's not locale-specific. These equivalents
+// for some of the <ctype.h> functions are named more explicitly, not dependent
+// on the C library locale, and we should also optimize them as needed.
+
+// All functions return false or leave the character unchanged if passed a character
+// that is outside the range 0-7F. So they can be used on Unicode strings or
+// characters if the intent is to do processing only if the character is ASCII.
+
+namespace WTF {
+
+// True when no bit above the low seven is set.
+template<typename CharType> inline bool isASCII(CharType c)
+{
+    return !(c & ~0x7F);
+}
+
+// Folds case by setting bit 0x20, then range-checks against 'a'..'z'.
+template<typename CharType> inline bool isASCIIAlpha(CharType c)
+{
+    return (c | 0x20) >= 'a' && (c | 0x20) <= 'z';
+}
+
+// True for '0'..'9'.
+template<typename CharType> inline bool isASCIIDigit(CharType c)
+{
+    return c >= '0' && c <= '9';
+}
+
+// True for ASCII digits and ASCII letters.
+template<typename CharType> inline bool isASCIIAlphanumeric(CharType c)
+{
+    return isASCIIDigit(c) || isASCIIAlpha(c);
+}
+
+// True for '0'..'9', 'a'..'f' and 'A'..'F' (letters are case-folded with 0x20).
+template<typename CharType> inline bool isASCIIHexDigit(CharType c)
+{
+    return isASCIIDigit(c) || ((c | 0x20) >= 'a' && (c | 0x20) <= 'f');
+}
+
+// True for 'a'..'z'.
+template<typename CharType> inline bool isASCIILower(CharType c)
+{
+    return c >= 'a' && c <= 'z';
+}
+
+// True for '0'..'7'. The two comparisons are combined with bitwise '&', so
+// both are always evaluated.
+template<typename CharType> inline bool isASCIIOctalDigit(CharType c)
+{
+    return (c >= '0') & (c <= '7');
+}
+
+// True for the range ' ' (space) through '~'.
+template<typename CharType> inline bool isASCIIPrintable(CharType c)
+{
+    return c >= ' ' && c <= '~';
+}
+
+/*
+ Statistics from a run of Apple's page load test for callers of isASCIISpace:
+
+ character          count
+ ---------          -----
+ non-spaces         689383
+ 20  space          294720
+ 0A  \n             89059
+ 09  \t             28320
+ 0D  \r             0
+ 0C  \f             0
+ 0B  \v             0
+ */
+// True for space and for 0x09..0x0D. The leading 'c <= ' '' test rejects
+// every larger character with a single comparison.
+template<typename CharType> inline bool isASCIISpace(CharType c)
+{
+    return c <= ' ' && (c == ' ' || (c <= 0xD && c >= 0x9));
+}
+
+// True for 'A'..'Z'.
+template<typename CharType> inline bool isASCIIUpper(CharType c)
+{
+    return c >= 'A' && c <= 'Z';
+}
+
+// Sets bit 0x20 only when c is in 'A'..'Z'; other characters are unchanged.
+template<typename CharType> inline CharType toASCIILower(CharType c)
+{
+    return c | ((c >= 'A' && c <= 'Z') << 5);
+}
+
+template<typename CharType> inline CharType toASCIILowerUnchecked(CharType character)
+{
+    // This function can be used for comparing any input character
+    // to a lowercase English character. The isASCIIAlphaCaselessEqual
+    // below should be used for regular comparison of ASCII alpha
+    // characters, but switch statements in CSS tokenizer require
+    // direct use of this function.
+    return character | 0x20;
+}
+
+// Clears bit 0x20 only when c is in 'a'..'z'; other characters are unchanged.
+template<typename CharType> inline CharType toASCIIUpper(CharType c)
+{
+    return c & ~((c >= 'a' && c <= 'z') << 5);
+}
+
+// Numeric value of a single hex digit. Characters below 'A' are decimal
+// digits; for letters the '& 0xF' mask maps upper and lower case onto the
+// same values 10..15.
+template<typename CharType> inline int toASCIIHexValue(CharType c)
+{
+    ASSERT(isASCIIHexDigit(c));
+    return c < 'A' ? c - '0' : (c - 'A' + 10) & 0xF;
+}
+
+// Value of a two-digit hex number: upperValue supplies the high nibble,
+// lowerValue the low nibble.
+template<typename CharType> inline int toASCIIHexValue(CharType upperValue, CharType lowerValue)
+{
+    ASSERT(isASCIIHexDigit(upperValue) && isASCIIHexDigit(lowerValue));
+    return ((toASCIIHexValue(upperValue) << 4) & 0xF0) | toASCIIHexValue(lowerValue);
+}
+
+// Upper-case hex digit for the low four bits of c.
+inline char lowerNibbleToASCIIHexDigit(char c)
+{
+    char nibble = c & 0xF;
+    return nibble < 10 ? '0' + nibble : 'A' + nibble - 10;
+}
+
+// Upper-case hex digit for bits 4..7 of c.
+inline char upperNibbleToASCIIHexDigit(char c)
+{
+    char nibble = (c >> 4) & 0xF;
+    return nibble < 10 ? '0' + nibble : 'A' + nibble - 10;
+}
+
+template<typename CharType> inline bool isASCIIAlphaCaselessEqual(CharType cssCharacter, char character)
+{
+    // This function compares a (preferably) constant ASCII
+    // lowercase letter to any input character.
+    ASSERT(character >= 'a' && character <= 'z');
+    return LIKELY(toASCIILowerUnchecked(cssCharacter) == character);
+}
+
+}
+
+using WTF::isASCII;
+using WTF::isASCIIAlpha;
+using WTF::isASCIIAlphanumeric;
+using WTF::isASCIIDigit;
+using WTF::isASCIIHexDigit;
+using WTF::isASCIILower;
+using WTF::isASCIIOctalDigit;
+using WTF::isASCIIPrintable;
+using WTF::isASCIISpace;
+using WTF::isASCIIUpper;
+using WTF::toASCIIHexValue;
+using WTF::toASCIILower;
+using WTF::toASCIILowerUnchecked;
+using WTF::toASCIIUpper;
+using WTF::lowerNibbleToASCIIHexDigit;
+using WTF::upperNibbleToASCIIHexDigit;
+using WTF::isASCIIAlphaCaselessEqual;
+
+#endif
diff --git a/masm/wtf/BumpPointerAllocator.h b/masm/wtf/BumpPointerAllocator.h

new file mode 100644 (file)

index 0000000..3b2cfd9
--- /dev/null
+++ b/masm/wtf/BumpPointerAllocator.h
@@ -0,0 +1,252 @@
+/*
+ * Copyright (C) 2010 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef BumpPointerAllocator_h
+#define BumpPointerAllocator_h
+
+#include <algorithm>
+#include <wtf/PageAllocation.h>
+#include <wtf/PageBlock.h>
+
+namespace WTF {
+
+#define MINIMUM_BUMP_POOL_SIZE 0x1000
+
+class BumpPointerPool {
+public:
+    // ensureCapacity will check whether the current pool has capacity to
+    // allocate 'size' bytes of memory  If it does not, it will attempt to
+    // allocate a new pool (which will be added to this one in a chain).
+    //
+    // If allocation fails (out of memory) this method will return null.
+    // If the return value is non-null, then callers should update any
+    // references they have to this current (possibly full) BumpPointerPool
+    // to instead point to the newly returned BumpPointerPool.
+    BumpPointerPool* ensureCapacity(size_t size)
+    {
+        void* allocationEnd = static_cast<char*>(m_current) + size;
+        ASSERT(allocationEnd > m_current); // check for overflow
+        if (allocationEnd <= static_cast<void*>(this))
+            return this;
+        return ensureCapacityCrossPool(this, size);
+    }
+
+    // alloc should only be called after calling ensureCapacity; as such
+    // alloc will never fail.
+    void* alloc(size_t size)
+    {
+        void* current = m_current;
+        void* allocationEnd = static_cast<char*>(current) + size;
+        ASSERT(allocationEnd > current); // check for overflow
+        ASSERT(allocationEnd <= static_cast<void*>(this));
+        m_current = allocationEnd;
+        return current;
+    }
+
+    // The dealloc method releases memory allocated using alloc.  Memory
+    // must be released in a LIFO fashion, e.g. if the client calls alloc
+    // four times, returning pointer A, B, C, D, then the only valid order
+    // in which these may be deallocaed is D, C, B, A.
+    //
+    // The client may optionally skip some deallocations.  In the example
+    // above, it would be valid to only explicitly dealloc C, A (D being
+    // dealloced along with C, B along with A).
+    //
+    // If pointer was not allocated from this pool (or pools) then dealloc
+    // will CRASH().  Callers should update any references they have to
+    // this current BumpPointerPool to instead point to the returned
+    // BumpPointerPool.
+    BumpPointerPool* dealloc(void* position)
+    {
+        if ((position >= m_start) && (position <= static_cast<void*>(this))) {
+            ASSERT(position <= m_current);
+            m_current = position;
+            return this;
+        }
+        return deallocCrossPool(this, position);
+    }
+
+private:
+    // Placement operator new, returns the last 'size' bytes of allocation for use as this.
+    void* operator new(size_t size, const PageAllocation& allocation)
+    {
+        ASSERT(size < allocation.size());
+        return reinterpret_cast<char*>(reinterpret_cast<intptr_t>(allocation.base()) + allocation.size()) - size;
+    }
+
+    BumpPointerPool(const PageAllocation& allocation)
+        : m_current(allocation.base())
+        , m_start(allocation.base())
+        , m_next(0)
+        , m_previous(0)
+        , m_allocation(allocation)
+    {
+    }
+
+    static BumpPointerPool* create(size_t minimumCapacity = 0)
+    {
+        // Add size of BumpPointerPool object, check for overflow.
+        minimumCapacity += sizeof(BumpPointerPool);
+        if (minimumCapacity < sizeof(BumpPointerPool))
+            return 0;
+
+        size_t poolSize = std::max(static_cast<size_t>(MINIMUM_BUMP_POOL_SIZE), WTF::pageSize());
+        while (poolSize < minimumCapacity) {
+            poolSize <<= 1;
+            // The following if check relies on MINIMUM_BUMP_POOL_SIZE being a power of 2!
+            ASSERT(!(MINIMUM_BUMP_POOL_SIZE & (MINIMUM_BUMP_POOL_SIZE - 1)));
+            if (!poolSize)
+                return 0;
+        }
+
+        PageAllocation allocation = PageAllocation::allocate(poolSize);
+        if (!!allocation)
+            return new (allocation) BumpPointerPool(allocation);
+        return 0;
+    }
+
+    void shrink()
+    {
+        ASSERT(!m_previous);
+        m_current = m_start;
+        while (m_next) {
+            BumpPointerPool* nextNext = m_next->m_next;
+            m_next->destroy();
+            m_next = nextNext;
+        }
+    }
+
+    void destroy()
+    {
+        m_allocation.deallocate();
+    }
+
+    static BumpPointerPool* ensureCapacityCrossPool(BumpPointerPool* previousPool, size_t size)
+    {
+        // The pool passed should not have capacity, so we'll start with the next one.
+        ASSERT(previousPool);
+        ASSERT((static_cast<char*>(previousPool->m_current) + size) > previousPool->m_current); // check for overflow
+        ASSERT((static_cast<char*>(previousPool->m_current) + size) > static_cast<void*>(previousPool));
+        BumpPointerPool* pool = previousPool->m_next;
+
+        while (true) {
+            if (!pool) {
+                // We've run to the end; allocate a new pool.
+                pool = BumpPointerPool::create(size);
+                previousPool->m_next = pool;
+                pool->m_previous = previousPool;
+                return pool;
+            }
+
+            // 
+            void* current = pool->m_current;
+            void* allocationEnd = static_cast<char*>(current) + size;
+            ASSERT(allocationEnd > current); // check for overflow
+            if (allocationEnd <= static_cast<void*>(pool))
+                return pool;
+        }
+    }
+
+    static BumpPointerPool* deallocCrossPool(BumpPointerPool* pool, void* position)
+    {
+        // Should only be called if position is not in the current pool.
+        ASSERT((position < pool->m_start) || (position > static_cast<void*>(pool)));
+
+        while (true) {
+            // Unwind the current pool to the start, move back in the chain to the previous pool.
+            pool->m_current = pool->m_start;
+            pool = pool->m_previous;
+
+            // position was nowhere in the chain!
+            if (!pool)
+                CRASH();
+
+            if ((position >= pool->m_start) && (position <= static_cast<void*>(pool))) {
+                ASSERT(position <= pool->m_current);
+                pool->m_current = position;
+                return pool;
+            }
+        }
+    }
+
+    void* m_current;
+    void* m_start;
+    BumpPointerPool* m_next;
+    BumpPointerPool* m_previous;
+    PageAllocation m_allocation;
+
+    friend class BumpPointerAllocator;
+};
+
+// A BumpPointerAllocator manages a set of BumpPointerPool objects, which
+// can be used for LIFO (stack like) allocation.
+//
+// To begin allocating using this class call startAllocator().  The result
+// of this method will be null if the initial pool allocation fails, or a
+// pointer to a BumpPointerPool object that can be used to perform
+// allocations.  Whilst running no memory will be released until
+// stopAllocator() is called.  At this point all allocations made through
+// this allocator will be reaped, and underlying memory may be freed.
+//
+// (In practice we will still hold on to the initial pool to allow allocation
+// to be quickly restarted, but additional pools will be freed).
+//
+// This allocator is non-reentrant; it is incumbent on the clients to ensure
+// startAllocator() is not called again until stopAllocator() has been called.
+class BumpPointerAllocator {
+public:
+    BumpPointerAllocator()
+        : m_head(0)
+    {
+    }
+
+    ~BumpPointerAllocator()
+    {
+        if (m_head)
+            m_head->destroy();
+    }
+
+    BumpPointerPool* startAllocator()
+    {
+        if (!m_head)
+            m_head = BumpPointerPool::create();
+        return m_head;
+    }
+
+    void stopAllocator()
+    {
+        if (m_head)
+            m_head->shrink();
+    }
+
+private:
+    BumpPointerPool* m_head;
+};
+
+}
+
+using WTF::BumpPointerAllocator;
+
+#endif // BumpPointerAllocator_h
diff --git a/masm/yarr/Yarr.h b/masm/yarr/Yarr.h

new file mode 100644 (file)

index 0000000..d393e9f
--- /dev/null
+++ b/masm/yarr/Yarr.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef Yarr_h
+#define Yarr_h
+
+#include "YarrInterpreter.h"
+#include "YarrPattern.h"
+
+namespace JSC { namespace Yarr {
+
+#define YarrStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers.
+#define YarrStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers.
+#define YarrStackSpaceForBackTrackInfoBackReference 2
+#define YarrStackSpaceForBackTrackInfoAlternative 1 // One per alternative.
+#define YarrStackSpaceForBackTrackInfoParentheticalAssertion 1
+#define YarrStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers.
+#define YarrStackSpaceForBackTrackInfoParenthesesTerminal 1
+#define YarrStackSpaceForBackTrackInfoParentheses 2
+
+static const unsigned quantifyInfinite = UINT_MAX;
+static const unsigned offsetNoMatch = (unsigned)-1;
+
+// The below limit restricts the number of "recursive" match calls in order to
+// avoid spending exponential time on complex regular expressions.
+static const unsigned matchLimit = 1000000;
+
+enum JSRegExpResult {
+    JSRegExpMatch = 1,
+    JSRegExpNoMatch = 0,
+    JSRegExpErrorNoMatch = -1,
+    JSRegExpErrorHitLimit = -2,
+    JSRegExpErrorNoMemory = -3,
+    JSRegExpErrorInternal = -4
+};
+
+enum YarrCharSize {
+    Char8,
+    Char16
+};
+
+} } // namespace JSC::Yarr
+
+#endif // Yarr_h
+
diff --git a/masm/yarr/YarrCanonicalizeUCS2.cpp b/masm/yarr/YarrCanonicalizeUCS2.cpp

new file mode 100644 (file)

index 0000000..7bb3d08
--- /dev/null
+++ b/masm/yarr/YarrCanonicalizeUCS2.cpp
@@ -0,0 +1,463 @@
+/*
+ * Copyright (C) 2012 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js
+
+#include "config.h"
+#include "YarrCanonicalizeUCS2.h"
+
+namespace JSC { namespace Yarr {
+
+#include <stdint.h>
+
+// Each ucs2CharacterSetN array lists UCS2 code points in ascending order and
+// ends with a 0 terminator.
+// NOTE(review): presumably all members of one set canonicalize to the same
+// character (e.g. set 0 is U+01C4/U+01C5/U+01C6) - confirm against the
+// generator script.  This file is autogenerated, so a lasting change to
+// these comments belongs in the generator.
+uint16_t ucs2CharacterSet0[] = { 0x01c4u, 0x01c5u, 0x01c6u, 0 };
+uint16_t ucs2CharacterSet1[] = { 0x01c7u, 0x01c8u, 0x01c9u, 0 };
+uint16_t ucs2CharacterSet2[] = { 0x01cau, 0x01cbu, 0x01ccu, 0 };
+uint16_t ucs2CharacterSet3[] = { 0x01f1u, 0x01f2u, 0x01f3u, 0 };
+uint16_t ucs2CharacterSet4[] = { 0x0392u, 0x03b2u, 0x03d0u, 0 };
+uint16_t ucs2CharacterSet5[] = { 0x0395u, 0x03b5u, 0x03f5u, 0 };
+uint16_t ucs2CharacterSet6[] = { 0x0398u, 0x03b8u, 0x03d1u, 0 };
+uint16_t ucs2CharacterSet7[] = { 0x0345u, 0x0399u, 0x03b9u, 0x1fbeu, 0 };
+uint16_t ucs2CharacterSet8[] = { 0x039au, 0x03bau, 0x03f0u, 0 };
+uint16_t ucs2CharacterSet9[] = { 0x00b5u, 0x039cu, 0x03bcu, 0 };
+uint16_t ucs2CharacterSet10[] = { 0x03a0u, 0x03c0u, 0x03d6u, 0 };
+uint16_t ucs2CharacterSet11[] = { 0x03a1u, 0x03c1u, 0x03f1u, 0 };
+uint16_t ucs2CharacterSet12[] = { 0x03a3u, 0x03c2u, 0x03c3u, 0 };
+uint16_t ucs2CharacterSet13[] = { 0x03a6u, 0x03c6u, 0x03d5u, 0 };
+uint16_t ucs2CharacterSet14[] = { 0x1e60u, 0x1e61u, 0x1e9bu, 0 };
+
+// Table of pointers to the ucs2CharacterSetN arrays, in index order (entry N
+// points at ucs2CharacterSetN); UCS2_CANONICALIZATION_SETS is its length.
+// NOTE(review): the CanonicalizeSet entries of rangeInfo appear to carry an
+// index into this table in their third field (e.g. 0x00b5 -> 0x0009, and
+// ucs2CharacterSet9 contains 0x00b5) - confirm against YarrCanonicalizeUCS2.h.
+static const size_t UCS2_CANONICALIZATION_SETS = 15;
+uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {
+    ucs2CharacterSet0,
+    ucs2CharacterSet1,
+    ucs2CharacterSet2,
+    ucs2CharacterSet3,
+    ucs2CharacterSet4,
+    ucs2CharacterSet5,
+    ucs2CharacterSet6,
+    ucs2CharacterSet7,
+    ucs2CharacterSet8,
+    ucs2CharacterSet9,
+    ucs2CharacterSet10,
+    ucs2CharacterSet11,
+    ucs2CharacterSet12,
+    ucs2CharacterSet13,
+    ucs2CharacterSet14,
+};
+
+const size_t UCS2_CANONICALIZATION_RANGES = 364;
+UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {
+    { 0x0000u, 0x0040u, 0x0000u, CanonicalizeUnique },
+    { 0x0041u, 0x005au, 0x0020u, CanonicalizeRangeLo },
+    { 0x005bu, 0x0060u, 0x0000u, CanonicalizeUnique },
+    { 0x0061u, 0x007au, 0x0020u, CanonicalizeRangeHi },
+    { 0x007bu, 0x00b4u, 0x0000u, CanonicalizeUnique },
+    { 0x00b5u, 0x00b5u, 0x0009u, CanonicalizeSet },
+    { 0x00b6u, 0x00bfu, 0x0000u, CanonicalizeUnique },
+    { 0x00c0u, 0x00d6u, 0x0020u, CanonicalizeRangeLo },
+    { 0x00d7u, 0x00d7u, 0x0000u, CanonicalizeUnique },
+    { 0x00d8u, 0x00deu, 0x0020u, CanonicalizeRangeLo },
+    { 0x00dfu, 0x00dfu, 0x0000u, CanonicalizeUnique },
+    { 0x00e0u, 0x00f6u, 0x0020u, CanonicalizeRangeHi },
+    { 0x00f7u, 0x00f7u, 0x0000u, CanonicalizeUnique },
+    { 0x00f8u, 0x00feu, 0x0020u, CanonicalizeRangeHi },
+    { 0x00ffu, 0x00ffu, 0x0079u, CanonicalizeRangeLo },
+    { 0x0100u, 0x012fu, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x0130u, 0x0131u, 0x0000u, CanonicalizeUnique },
+    { 0x0132u, 0x0137u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x0138u, 0x0138u, 0x0000u, CanonicalizeUnique },
+    { 0x0139u, 0x0148u, 0x0000u, CanonicalizeAlternatingUnaligned },
+    { 0x0149u, 0x0149u, 0x0000u, CanonicalizeUnique },
+    { 0x014au, 0x0177u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x0178u, 0x0178u, 0x0079u, CanonicalizeRangeHi },
+    { 0x0179u, 0x017eu, 0x0000u, CanonicalizeAlternatingUnaligned },
+    { 0x017fu, 0x017fu, 0x0000u, CanonicalizeUnique },
+    { 0x0180u, 0x0180u, 0x00c3u, CanonicalizeRangeLo },
+    { 0x0181u, 0x0181u, 0x00d2u, CanonicalizeRangeLo },
+    { 0x0182u, 0x0185u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x0186u, 0x0186u, 0x00ceu, CanonicalizeRangeLo },
+    { 0x0187u, 0x0188u, 0x0000u, CanonicalizeAlternatingUnaligned },
+    { 0x0189u, 0x018au, 0x00cdu, CanonicalizeRangeLo },
+    { 0x018bu, 0x018cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+    { 0x018du, 0x018du, 0x0000u, CanonicalizeUnique },
+    { 0x018eu, 0x018eu, 0x004fu, CanonicalizeRangeLo },
+    { 0x018fu, 0x018fu, 0x00cau, CanonicalizeRangeLo },
+    { 0x0190u, 0x0190u, 0x00cbu, CanonicalizeRangeLo },
+    { 0x0191u, 0x0192u, 0x0000u, CanonicalizeAlternatingUnaligned },
+    { 0x0193u, 0x0193u, 0x00cdu, CanonicalizeRangeLo },
+    { 0x0194u, 0x0194u, 0x00cfu, CanonicalizeRangeLo },
+    { 0x0195u, 0x0195u, 0x0061u, CanonicalizeRangeLo },
+    { 0x0196u, 0x0196u, 0x00d3u, CanonicalizeRangeLo },
+    { 0x0197u, 0x0197u, 0x00d1u, CanonicalizeRangeLo },
+    { 0x0198u, 0x0199u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x019au, 0x019au, 0x00a3u, CanonicalizeRangeLo },
+    { 0x019bu, 0x019bu, 0x0000u, CanonicalizeUnique },
+    { 0x019cu, 0x019cu, 0x00d3u, CanonicalizeRangeLo },
+    { 0x019du, 0x019du, 0x00d5u, CanonicalizeRangeLo },
+    { 0x019eu, 0x019eu, 0x0082u, CanonicalizeRangeLo },
+    { 0x019fu, 0x019fu, 0x00d6u, CanonicalizeRangeLo },
+    { 0x01a0u, 0x01a5u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x01a6u, 0x01a6u, 0x00dau, CanonicalizeRangeLo },
+    { 0x01a7u, 0x01a8u, 0x0000u, CanonicalizeAlternatingUnaligned },
+    { 0x01a9u, 0x01a9u, 0x00dau, CanonicalizeRangeLo },
+    { 0x01aau, 0x01abu, 0x0000u, CanonicalizeUnique },
+    { 0x01acu, 0x01adu, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x01aeu, 0x01aeu, 0x00dau, CanonicalizeRangeLo },
+    { 0x01afu, 0x01b0u, 0x0000u, CanonicalizeAlternatingUnaligned },
+    { 0x01b1u, 0x01b2u, 0x00d9u, CanonicalizeRangeLo },
+    { 0x01b3u, 0x01b6u, 0x0000u, CanonicalizeAlternatingUnaligned },
+    { 0x01b7u, 0x01b7u, 0x00dbu, CanonicalizeRangeLo },
+    { 0x01b8u, 0x01b9u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x01bau, 0x01bbu, 0x0000u, CanonicalizeUnique },
+    { 0x01bcu, 0x01bdu, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x01beu, 0x01beu, 0x0000u, CanonicalizeUnique },
+    { 0x01bfu, 0x01bfu, 0x0038u, CanonicalizeRangeLo },
+    { 0x01c0u, 0x01c3u, 0x0000u, CanonicalizeUnique },
+    { 0x01c4u, 0x01c6u, 0x0000u, CanonicalizeSet },
+    { 0x01c7u, 0x01c9u, 0x0001u, CanonicalizeSet },
+    { 0x01cau, 0x01ccu, 0x0002u, CanonicalizeSet },
+    { 0x01cdu, 0x01dcu, 0x0000u, CanonicalizeAlternatingUnaligned },
+    { 0x01ddu, 0x01ddu, 0x004fu, CanonicalizeRangeHi },
+    { 0x01deu, 0x01efu, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x01f0u, 0x01f0u, 0x0000u, CanonicalizeUnique },
+    { 0x01f1u, 0x01f3u, 0x0003u, CanonicalizeSet },
+    { 0x01f4u, 0x01f5u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x01f6u, 0x01f6u, 0x0061u, CanonicalizeRangeHi },
+    { 0x01f7u, 0x01f7u, 0x0038u, CanonicalizeRangeHi },
+    { 0x01f8u, 0x021fu, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x0220u, 0x0220u, 0x0082u, CanonicalizeRangeHi },
+    { 0x0221u, 0x0221u, 0x0000u, CanonicalizeUnique },
+    { 0x0222u, 0x0233u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x0234u, 0x0239u, 0x0000u, CanonicalizeUnique },
+    { 0x023au, 0x023au, 0x2a2bu, CanonicalizeRangeLo },
+    { 0x023bu, 0x023cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+    { 0x023du, 0x023du, 0x00a3u, CanonicalizeRangeHi },
+    { 0x023eu, 0x023eu, 0x2a28u, CanonicalizeRangeLo },
+    { 0x023fu, 0x0240u, 0x2a3fu, CanonicalizeRangeLo },
+    { 0x0241u, 0x0242u, 0x0000u, CanonicalizeAlternatingUnaligned },
+    { 0x0243u, 0x0243u, 0x00c3u, CanonicalizeRangeHi },
+    { 0x0244u, 0x0244u, 0x0045u, CanonicalizeRangeLo },
+    { 0x0245u, 0x0245u, 0x0047u, CanonicalizeRangeLo },
+    { 0x0246u, 0x024fu, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x0250u, 0x0250u, 0x2a1fu, CanonicalizeRangeLo },
+    { 0x0251u, 0x0251u, 0x2a1cu, CanonicalizeRangeLo },
+    { 0x0252u, 0x0252u, 0x2a1eu, CanonicalizeRangeLo },
+    { 0x0253u, 0x0253u, 0x00d2u, CanonicalizeRangeHi },
+    { 0x0254u, 0x0254u, 0x00ceu, CanonicalizeRangeHi },
+    { 0x0255u, 0x0255u, 0x0000u, CanonicalizeUnique },
+    { 0x0256u, 0x0257u, 0x00cdu, CanonicalizeRangeHi },
+    { 0x0258u, 0x0258u, 0x0000u, CanonicalizeUnique },
+    { 0x0259u, 0x0259u, 0x00cau, CanonicalizeRangeHi },
+    { 0x025au, 0x025au, 0x0000u, CanonicalizeUnique },
+    { 0x025bu, 0x025bu, 0x00cbu, CanonicalizeRangeHi },
+    { 0x025cu, 0x025fu, 0x0000u, CanonicalizeUnique },
+    { 0x0260u, 0x0260u, 0x00cdu, CanonicalizeRangeHi },
+    { 0x0261u, 0x0262u, 0x0000u, CanonicalizeUnique },
+    { 0x0263u, 0x0263u, 0x00cfu, CanonicalizeRangeHi },
+    { 0x0264u, 0x0264u, 0x0000u, CanonicalizeUnique },
+    { 0x0265u, 0x0265u, 0xa528u, CanonicalizeRangeLo },
+    { 0x0266u, 0x0267u, 0x0000u, CanonicalizeUnique },
+    { 0x0268u, 0x0268u, 0x00d1u, CanonicalizeRangeHi },
+    { 0x0269u, 0x0269u, 0x00d3u, CanonicalizeRangeHi },
+    { 0x026au, 0x026au, 0x0000u, CanonicalizeUnique },
+    { 0x026bu, 0x026bu, 0x29f7u, CanonicalizeRangeLo },
+    { 0x026cu, 0x026eu, 0x0000u, CanonicalizeUnique },
+    { 0x026fu, 0x026fu, 0x00d3u, CanonicalizeRangeHi },
+    { 0x0270u, 0x0270u, 0x0000u, CanonicalizeUnique },
+    { 0x0271u, 0x0271u, 0x29fdu, CanonicalizeRangeLo },
+    { 0x0272u, 0x0272u, 0x00d5u, CanonicalizeRangeHi },
+    { 0x0273u, 0x0274u, 0x0000u, CanonicalizeUnique },
+    { 0x0275u, 0x0275u, 0x00d6u, CanonicalizeRangeHi },
+    { 0x0276u, 0x027cu, 0x0000u, CanonicalizeUnique },
+    { 0x027du, 0x027du, 0x29e7u, CanonicalizeRangeLo },
+    { 0x027eu, 0x027fu, 0x0000u, CanonicalizeUnique },
+    { 0x0280u, 0x0280u, 0x00dau, CanonicalizeRangeHi },
+    { 0x0281u, 0x0282u, 0x0000u, CanonicalizeUnique },
+    { 0x0283u, 0x0283u, 0x00dau, CanonicalizeRangeHi },
+    { 0x0284u, 0x0287u, 0x0000u, CanonicalizeUnique },
+    { 0x0288u, 0x0288u, 0x00dau, CanonicalizeRangeHi },
+    { 0x0289u, 0x0289u, 0x0045u, CanonicalizeRangeHi },
+    { 0x028au, 0x028bu, 0x00d9u, CanonicalizeRangeHi },
+    { 0x028cu, 0x028cu, 0x0047u, CanonicalizeRangeHi },
+    { 0x028du, 0x0291u, 0x0000u, CanonicalizeUnique },
+    { 0x0292u, 0x0292u, 0x00dbu, CanonicalizeRangeHi },
+    { 0x0293u, 0x0344u, 0x0000u, CanonicalizeUnique },
+    { 0x0345u, 0x0345u, 0x0007u, CanonicalizeSet },
+    { 0x0346u, 0x036fu, 0x0000u, CanonicalizeUnique },
+    { 0x0370u, 0x0373u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x0374u, 0x0375u, 0x0000u, CanonicalizeUnique },
+    { 0x0376u, 0x0377u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x0378u, 0x037au, 0x0000u, CanonicalizeUnique },
+    { 0x037bu, 0x037du, 0x0082u, CanonicalizeRangeLo },
+    { 0x037eu, 0x0385u, 0x0000u, CanonicalizeUnique },
+    { 0x0386u, 0x0386u, 0x0026u, CanonicalizeRangeLo },
+    { 0x0387u, 0x0387u, 0x0000u, CanonicalizeUnique },
+    { 0x0388u, 0x038au, 0x0025u, CanonicalizeRangeLo },
+    { 0x038bu, 0x038bu, 0x0000u, CanonicalizeUnique },
+    { 0x038cu, 0x038cu, 0x0040u, CanonicalizeRangeLo },
+    { 0x038du, 0x038du, 0x0000u, CanonicalizeUnique },
+    { 0x038eu, 0x038fu, 0x003fu, CanonicalizeRangeLo },
+    { 0x0390u, 0x0390u, 0x0000u, CanonicalizeUnique },
+    { 0x0391u, 0x0391u, 0x0020u, CanonicalizeRangeLo },
+    { 0x0392u, 0x0392u, 0x0004u, CanonicalizeSet },
+    { 0x0393u, 0x0394u, 0x0020u, CanonicalizeRangeLo },
+    { 0x0395u, 0x0395u, 0x0005u, CanonicalizeSet },
+    { 0x0396u, 0x0397u, 0x0020u, CanonicalizeRangeLo },
+    { 0x0398u, 0x0398u, 0x0006u, CanonicalizeSet },
+    { 0x0399u, 0x0399u, 0x0007u, CanonicalizeSet },
+    { 0x039au, 0x039au, 0x0008u, CanonicalizeSet },
+    { 0x039bu, 0x039bu, 0x0020u, CanonicalizeRangeLo },
+    { 0x039cu, 0x039cu, 0x0009u, CanonicalizeSet },
+    { 0x039du, 0x039fu, 0x0020u, CanonicalizeRangeLo },
+    { 0x03a0u, 0x03a0u, 0x000au, CanonicalizeSet },
+    { 0x03a1u, 0x03a1u, 0x000bu, CanonicalizeSet },
+    { 0x03a2u, 0x03a2u, 0x0000u, CanonicalizeUnique },
+    { 0x03a3u, 0x03a3u, 0x000cu, CanonicalizeSet },
+    { 0x03a4u, 0x03a5u, 0x0020u, CanonicalizeRangeLo },
+    { 0x03a6u, 0x03a6u, 0x000du, CanonicalizeSet },
+    { 0x03a7u, 0x03abu, 0x0020u, CanonicalizeRangeLo },
+    { 0x03acu, 0x03acu, 0x0026u, CanonicalizeRangeHi },
+    { 0x03adu, 0x03afu, 0x0025u, CanonicalizeRangeHi },
+    { 0x03b0u, 0x03b0u, 0x0000u, CanonicalizeUnique },
+    { 0x03b1u, 0x03b1u, 0x0020u, CanonicalizeRangeHi },
+    { 0x03b2u, 0x03b2u, 0x0004u, CanonicalizeSet },
+    { 0x03b3u, 0x03b4u, 0x0020u, CanonicalizeRangeHi },
+    { 0x03b5u, 0x03b5u, 0x0005u, CanonicalizeSet },
+    { 0x03b6u, 0x03b7u, 0x0020u, CanonicalizeRangeHi },
+    { 0x03b8u, 0x03b8u, 0x0006u, CanonicalizeSet },
+    { 0x03b9u, 0x03b9u, 0x0007u, CanonicalizeSet },
+    { 0x03bau, 0x03bau, 0x0008u, CanonicalizeSet },
+    { 0x03bbu, 0x03bbu, 0x0020u, CanonicalizeRangeHi },
+    { 0x03bcu, 0x03bcu, 0x0009u, CanonicalizeSet },
+    { 0x03bdu, 0x03bfu, 0x0020u, CanonicalizeRangeHi },
+    { 0x03c0u, 0x03c0u, 0x000au, CanonicalizeSet },
+    { 0x03c1u, 0x03c1u, 0x000bu, CanonicalizeSet },
+    { 0x03c2u, 0x03c3u, 0x000cu, CanonicalizeSet },
+    { 0x03c4u, 0x03c5u, 0x0020u, CanonicalizeRangeHi },
+    { 0x03c6u, 0x03c6u, 0x000du, CanonicalizeSet },
+    { 0x03c7u, 0x03cbu, 0x0020u, CanonicalizeRangeHi },
+    { 0x03ccu, 0x03ccu, 0x0040u, CanonicalizeRangeHi },
+    { 0x03cdu, 0x03ceu, 0x003fu, CanonicalizeRangeHi },
+    { 0x03cfu, 0x03cfu, 0x0008u, CanonicalizeRangeLo },
+    { 0x03d0u, 0x03d0u, 0x0004u, CanonicalizeSet },
+    { 0x03d1u, 0x03d1u, 0x0006u, CanonicalizeSet },
+    { 0x03d2u, 0x03d4u, 0x0000u, CanonicalizeUnique },
+    { 0x03d5u, 0x03d5u, 0x000du, CanonicalizeSet },
+    { 0x03d6u, 0x03d6u, 0x000au, CanonicalizeSet },
+    { 0x03d7u, 0x03d7u, 0x0008u, CanonicalizeRangeHi },
+    { 0x03d8u, 0x03efu, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x03f0u, 0x03f0u, 0x0008u, CanonicalizeSet },
+    { 0x03f1u, 0x03f1u, 0x000bu, CanonicalizeSet },
+    { 0x03f2u, 0x03f2u, 0x0007u, CanonicalizeRangeLo },
+    { 0x03f3u, 0x03f4u, 0x0000u, CanonicalizeUnique },
+    { 0x03f5u, 0x03f5u, 0x0005u, CanonicalizeSet },
+    { 0x03f6u, 0x03f6u, 0x0000u, CanonicalizeUnique },
+    { 0x03f7u, 0x03f8u, 0x0000u, CanonicalizeAlternatingUnaligned },
+    { 0x03f9u, 0x03f9u, 0x0007u, CanonicalizeRangeHi },
+    { 0x03fau, 0x03fbu, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x03fcu, 0x03fcu, 0x0000u, CanonicalizeUnique },
+    { 0x03fdu, 0x03ffu, 0x0082u, CanonicalizeRangeHi },
+    { 0x0400u, 0x040fu, 0x0050u, CanonicalizeRangeLo },
+    { 0x0410u, 0x042fu, 0x0020u, CanonicalizeRangeLo },
+    { 0x0430u, 0x044fu, 0x0020u, CanonicalizeRangeHi },
+    { 0x0450u, 0x045fu, 0x0050u, CanonicalizeRangeHi },
+    { 0x0460u, 0x0481u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x0482u, 0x0489u, 0x0000u, CanonicalizeUnique },
+    { 0x048au, 0x04bfu, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x04c0u, 0x04c0u, 0x000fu, CanonicalizeRangeLo },
+    { 0x04c1u, 0x04ceu, 0x0000u, CanonicalizeAlternatingUnaligned },
+    { 0x04cfu, 0x04cfu, 0x000fu, CanonicalizeRangeHi },
+    { 0x04d0u, 0x0527u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x0528u, 0x0530u, 0x0000u, CanonicalizeUnique },
+    { 0x0531u, 0x0556u, 0x0030u, CanonicalizeRangeLo },
+    { 0x0557u, 0x0560u, 0x0000u, CanonicalizeUnique },
+    { 0x0561u, 0x0586u, 0x0030u, CanonicalizeRangeHi },
+    { 0x0587u, 0x109fu, 0x0000u, CanonicalizeUnique },
+    { 0x10a0u, 0x10c5u, 0x1c60u, CanonicalizeRangeLo },
+    { 0x10c6u, 0x1d78u, 0x0000u, CanonicalizeUnique },
+    { 0x1d79u, 0x1d79u, 0x8a04u, CanonicalizeRangeLo },
+    { 0x1d7au, 0x1d7cu, 0x0000u, CanonicalizeUnique },
+    { 0x1d7du, 0x1d7du, 0x0ee6u, CanonicalizeRangeLo },
+    { 0x1d7eu, 0x1dffu, 0x0000u, CanonicalizeUnique },
+    { 0x1e00u, 0x1e5fu, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x1e60u, 0x1e61u, 0x000eu, CanonicalizeSet },
+    { 0x1e62u, 0x1e95u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x1e96u, 0x1e9au, 0x0000u, CanonicalizeUnique },
+    { 0x1e9bu, 0x1e9bu, 0x000eu, CanonicalizeSet },
+    { 0x1e9cu, 0x1e9fu, 0x0000u, CanonicalizeUnique },
+    { 0x1ea0u, 0x1effu, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x1f00u, 0x1f07u, 0x0008u, CanonicalizeRangeLo },
+    { 0x1f08u, 0x1f0fu, 0x0008u, CanonicalizeRangeHi },
+    { 0x1f10u, 0x1f15u, 0x0008u, CanonicalizeRangeLo },
+    { 0x1f16u, 0x1f17u, 0x0000u, CanonicalizeUnique },
+    { 0x1f18u, 0x1f1du, 0x0008u, CanonicalizeRangeHi },
+    { 0x1f1eu, 0x1f1fu, 0x0000u, CanonicalizeUnique },
+    { 0x1f20u, 0x1f27u, 0x0008u, CanonicalizeRangeLo },
+    { 0x1f28u, 0x1f2fu, 0x0008u, CanonicalizeRangeHi },
+    { 0x1f30u, 0x1f37u, 0x0008u, CanonicalizeRangeLo },
+    { 0x1f38u, 0x1f3fu, 0x0008u, CanonicalizeRangeHi },
+    { 0x1f40u, 0x1f45u, 0x0008u, CanonicalizeRangeLo },
+    { 0x1f46u, 0x1f47u, 0x0000u, CanonicalizeUnique },
+    { 0x1f48u, 0x1f4du, 0x0008u, CanonicalizeRangeHi },
+    { 0x1f4eu, 0x1f50u, 0x0000u, CanonicalizeUnique },
+    { 0x1f51u, 0x1f51u, 0x0008u, CanonicalizeRangeLo },
+    { 0x1f52u, 0x1f52u, 0x0000u, CanonicalizeUnique },
+    { 0x1f53u, 0x1f53u, 0x0008u, CanonicalizeRangeLo },
+    { 0x1f54u, 0x1f54u, 0x0000u, CanonicalizeUnique },
+    { 0x1f55u, 0x1f55u, 0x0008u, CanonicalizeRangeLo },
+    { 0x1f56u, 0x1f56u, 0x0000u, CanonicalizeUnique },
+    { 0x1f57u, 0x1f57u, 0x0008u, CanonicalizeRangeLo },
+    { 0x1f58u, 0x1f58u, 0x0000u, CanonicalizeUnique },
+    { 0x1f59u, 0x1f59u, 0x0008u, CanonicalizeRangeHi },
+    { 0x1f5au, 0x1f5au, 0x0000u, CanonicalizeUnique },
+    { 0x1f5bu, 0x1f5bu, 0x0008u, CanonicalizeRangeHi },
+    { 0x1f5cu, 0x1f5cu, 0x0000u, CanonicalizeUnique },
+    { 0x1f5du, 0x1f5du, 0x0008u, CanonicalizeRangeHi },
+    { 0x1f5eu, 0x1f5eu, 0x0000u, CanonicalizeUnique },
+    { 0x1f5fu, 0x1f5fu, 0x0008u, CanonicalizeRangeHi },
+    { 0x1f60u, 0x1f67u, 0x0008u, CanonicalizeRangeLo },
+    { 0x1f68u, 0x1f6fu, 0x0008u, CanonicalizeRangeHi },
+    { 0x1f70u, 0x1f71u, 0x004au, CanonicalizeRangeLo },
+    { 0x1f72u, 0x1f75u, 0x0056u, CanonicalizeRangeLo },
+    { 0x1f76u, 0x1f77u, 0x0064u, CanonicalizeRangeLo },
+    { 0x1f78u, 0x1f79u, 0x0080u, CanonicalizeRangeLo },
+    { 0x1f7au, 0x1f7bu, 0x0070u, CanonicalizeRangeLo },
+    { 0x1f7cu, 0x1f7du, 0x007eu, CanonicalizeRangeLo },
+    { 0x1f7eu, 0x1fafu, 0x0000u, CanonicalizeUnique },
+    { 0x1fb0u, 0x1fb1u, 0x0008u, CanonicalizeRangeLo },
+    { 0x1fb2u, 0x1fb7u, 0x0000u, CanonicalizeUnique },
+    { 0x1fb8u, 0x1fb9u, 0x0008u, CanonicalizeRangeHi },
+    { 0x1fbau, 0x1fbbu, 0x004au, CanonicalizeRangeHi },
+    { 0x1fbcu, 0x1fbdu, 0x0000u, CanonicalizeUnique },
+    { 0x1fbeu, 0x1fbeu, 0x0007u, CanonicalizeSet },
+    { 0x1fbfu, 0x1fc7u, 0x0000u, CanonicalizeUnique },
+    { 0x1fc8u, 0x1fcbu, 0x0056u, CanonicalizeRangeHi },
+    { 0x1fccu, 0x1fcfu, 0x0000u, CanonicalizeUnique },
+    { 0x1fd0u, 0x1fd1u, 0x0008u, CanonicalizeRangeLo },
+    { 0x1fd2u, 0x1fd7u, 0x0000u, CanonicalizeUnique },
+    { 0x1fd8u, 0x1fd9u, 0x0008u, CanonicalizeRangeHi },
+    { 0x1fdau, 0x1fdbu, 0x0064u, CanonicalizeRangeHi },
+    { 0x1fdcu, 0x1fdfu, 0x0000u, CanonicalizeUnique },
+    { 0x1fe0u, 0x1fe1u, 0x0008u, CanonicalizeRangeLo },
+    { 0x1fe2u, 0x1fe4u, 0x0000u, CanonicalizeUnique },
+    { 0x1fe5u, 0x1fe5u, 0x0007u, CanonicalizeRangeLo },
+    { 0x1fe6u, 0x1fe7u, 0x0000u, CanonicalizeUnique },
+    { 0x1fe8u, 0x1fe9u, 0x0008u, CanonicalizeRangeHi },
+    { 0x1feau, 0x1febu, 0x0070u, CanonicalizeRangeHi },
+    { 0x1fecu, 0x1fecu, 0x0007u, CanonicalizeRangeHi },
+    { 0x1fedu, 0x1ff7u, 0x0000u, CanonicalizeUnique },
+    { 0x1ff8u, 0x1ff9u, 0x0080u, CanonicalizeRangeHi },
+    { 0x1ffau, 0x1ffbu, 0x007eu, CanonicalizeRangeHi },
+    { 0x1ffcu, 0x2131u, 0x0000u, CanonicalizeUnique },
+    { 0x2132u, 0x2132u, 0x001cu, CanonicalizeRangeLo },
+    { 0x2133u, 0x214du, 0x0000u, CanonicalizeUnique },
+    { 0x214eu, 0x214eu, 0x001cu, CanonicalizeRangeHi },
+    { 0x214fu, 0x215fu, 0x0000u, CanonicalizeUnique },
+    { 0x2160u, 0x216fu, 0x0010u, CanonicalizeRangeLo },
+    { 0x2170u, 0x217fu, 0x0010u, CanonicalizeRangeHi },
+    { 0x2180u, 0x2182u, 0x0000u, CanonicalizeUnique },
+    { 0x2183u, 0x2184u, 0x0000u, CanonicalizeAlternatingUnaligned },
+    { 0x2185u, 0x24b5u, 0x0000u, CanonicalizeUnique },
+    { 0x24b6u, 0x24cfu, 0x001au, CanonicalizeRangeLo },
+    { 0x24d0u, 0x24e9u, 0x001au, CanonicalizeRangeHi },
+    { 0x24eau, 0x2bffu, 0x0000u, CanonicalizeUnique },
+    { 0x2c00u, 0x2c2eu, 0x0030u, CanonicalizeRangeLo },
+    { 0x2c2fu, 0x2c2fu, 0x0000u, CanonicalizeUnique },
+    { 0x2c30u, 0x2c5eu, 0x0030u, CanonicalizeRangeHi },
+    { 0x2c5fu, 0x2c5fu, 0x0000u, CanonicalizeUnique },
+    { 0x2c60u, 0x2c61u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x2c62u, 0x2c62u, 0x29f7u, CanonicalizeRangeHi },
+    { 0x2c63u, 0x2c63u, 0x0ee6u, CanonicalizeRangeHi },
+    { 0x2c64u, 0x2c64u, 0x29e7u, CanonicalizeRangeHi },
+    { 0x2c65u, 0x2c65u, 0x2a2bu, CanonicalizeRangeHi },
+    { 0x2c66u, 0x2c66u, 0x2a28u, CanonicalizeRangeHi },
+    { 0x2c67u, 0x2c6cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+    { 0x2c6du, 0x2c6du, 0x2a1cu, CanonicalizeRangeHi },
+    { 0x2c6eu, 0x2c6eu, 0x29fdu, CanonicalizeRangeHi },
+    { 0x2c6fu, 0x2c6fu, 0x2a1fu, CanonicalizeRangeHi },
+    { 0x2c70u, 0x2c70u, 0x2a1eu, CanonicalizeRangeHi },
+    { 0x2c71u, 0x2c71u, 0x0000u, CanonicalizeUnique },
+    { 0x2c72u, 0x2c73u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x2c74u, 0x2c74u, 0x0000u, CanonicalizeUnique },
+    { 0x2c75u, 0x2c76u, 0x0000u, CanonicalizeAlternatingUnaligned },
+    { 0x2c77u, 0x2c7du, 0x0000u, CanonicalizeUnique },
+    { 0x2c7eu, 0x2c7fu, 0x2a3fu, CanonicalizeRangeHi },
+    { 0x2c80u, 0x2ce3u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0x2ce4u, 0x2ceau, 0x0000u, CanonicalizeUnique },
+    { 0x2cebu, 0x2ceeu, 0x0000u, CanonicalizeAlternatingUnaligned },
+    { 0x2cefu, 0x2cffu, 0x0000u, CanonicalizeUnique },
+    { 0x2d00u, 0x2d25u, 0x1c60u, CanonicalizeRangeHi },
+    { 0x2d26u, 0xa63fu, 0x0000u, CanonicalizeUnique },
+    { 0xa640u, 0xa66du, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0xa66eu, 0xa67fu, 0x0000u, CanonicalizeUnique },
+    { 0xa680u, 0xa697u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0xa698u, 0xa721u, 0x0000u, CanonicalizeUnique },
+    { 0xa722u, 0xa72fu, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0xa730u, 0xa731u, 0x0000u, CanonicalizeUnique },
+    { 0xa732u, 0xa76fu, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0xa770u, 0xa778u, 0x0000u, CanonicalizeUnique },
+    { 0xa779u, 0xa77cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+    { 0xa77du, 0xa77du, 0x8a04u, CanonicalizeRangeHi },
+    { 0xa77eu, 0xa787u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0xa788u, 0xa78au, 0x0000u, CanonicalizeUnique },
+    { 0xa78bu, 0xa78cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+    { 0xa78du, 0xa78du, 0xa528u, CanonicalizeRangeHi },
+    { 0xa78eu, 0xa78fu, 0x0000u, CanonicalizeUnique },
+    { 0xa790u, 0xa791u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0xa792u, 0xa79fu, 0x0000u, CanonicalizeUnique },
+    { 0xa7a0u, 0xa7a9u, 0x0000u, CanonicalizeAlternatingAligned },
+    { 0xa7aau, 0xff20u, 0x0000u, CanonicalizeUnique },
+    { 0xff21u, 0xff3au, 0x0020u, CanonicalizeRangeLo },
+    { 0xff3bu, 0xff40u, 0x0000u, CanonicalizeUnique },
+    { 0xff41u, 0xff5au, 0x0020u, CanonicalizeRangeHi },
+    { 0xff5bu, 0xffffu, 0x0000u, CanonicalizeUnique },
+};
+
+const size_t LATIN_CANONICALIZATION_RANGES = 20; // Number of entries in latinRangeInfo.
+LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = { // Entries are { begin, end, value, type }; ranges are inclusive, sorted and contiguous from 0x0000 to 0xffff.
+    { 0x0000u, 0x0040u, 0x0000u, CanonicalizeLatinSelf },
+    { 0x0041u, 0x005au, 0x0000u, CanonicalizeLatinMask0x20 },
+    { 0x005bu, 0x0060u, 0x0000u, CanonicalizeLatinSelf },
+    { 0x0061u, 0x007au, 0x0000u, CanonicalizeLatinMask0x20 },
+    { 0x007bu, 0x00bfu, 0x0000u, CanonicalizeLatinSelf },
+    { 0x00c0u, 0x00d6u, 0x0000u, CanonicalizeLatinMask0x20 },
+    { 0x00d7u, 0x00d7u, 0x0000u, CanonicalizeLatinSelf },
+    { 0x00d8u, 0x00deu, 0x0000u, CanonicalizeLatinMask0x20 },
+    { 0x00dfu, 0x00dfu, 0x0000u, CanonicalizeLatinSelf },
+    { 0x00e0u, 0x00f6u, 0x0000u, CanonicalizeLatinMask0x20 },
+    { 0x00f7u, 0x00f7u, 0x0000u, CanonicalizeLatinSelf },
+    { 0x00f8u, 0x00feu, 0x0000u, CanonicalizeLatinMask0x20 },
+    { 0x00ffu, 0x00ffu, 0x0000u, CanonicalizeLatinSelf },
+    { 0x0100u, 0x0177u, 0x0000u, CanonicalizeLatinInvalid },
+    { 0x0178u, 0x0178u, 0x00ffu, CanonicalizeLatinOther },
+    { 0x0179u, 0x039bu, 0x0000u, CanonicalizeLatinInvalid },
+    { 0x039cu, 0x039cu, 0x00b5u, CanonicalizeLatinOther },
+    { 0x039du, 0x03bbu, 0x0000u, CanonicalizeLatinInvalid },
+    { 0x03bcu, 0x03bcu, 0x00b5u, CanonicalizeLatinOther },
+    { 0x03bdu, 0xffffu, 0x0000u, CanonicalizeLatinInvalid },
+};
+
+} } // JSC::Yarr
+
diff --git a/masm/yarr/YarrCanonicalizeUCS2.h b/masm/yarr/YarrCanonicalizeUCS2.h

new file mode 100644 (file)

index 0000000..be0ead4
--- /dev/null
+++ b/masm/yarr/YarrCanonicalizeUCS2.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2012 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef YarrCanonicalizeUCS2_H
+#define YarrCanonicalizeUCS2_H
+
+#include <stdint.h>
+#include <wtf/unicode/Unicode.h>
+
+namespace JSC { namespace Yarr {
+
+// This set of data (autogenerated using YarrCanonicalizeUCS2.js into YarrCanonicalizeUCS2.cpp)
+// provides information for each UCS2 code point as to the set of code points that it should
+// match under the ES5.1 case-insensitive RegExp matching rules, specified in 15.10.2.8.
+enum UCS2CanonicalizationType {
+    CanonicalizeUnique,               // No canonically equal values, e.g. 0x0.
+    CanonicalizeSet,                  // Value is an index into characterSetInfo, selecting the set of equivalent code points.
+    CanonicalizeRangeLo,              // Lower member of a pair; value is the positive delta up to its partner, e.g. 0x41 has value 0x20, -> 0x61.
+    CanonicalizeRangeHi,              // Upper member of a pair; value is the positive delta down to its partner, e.g. 0x61 has value 0x20, -> 0x41.
+    CanonicalizeAlternatingAligned,   // Aligned consecutive pair (even code point first), e.g. 0x1f4,0x1f5.
+    CanonicalizeAlternatingUnaligned, // Unaligned consecutive pair (odd code point first), e.g. 0x241,0x242.
+};
+struct UCS2CanonicalizationRange { uint16_t begin, end, value, type; }; // Inclusive range [begin, end]; type holds a UCS2CanonicalizationType and value is its payload.
+extern const size_t UCS2_CANONICALIZATION_RANGES; // Number of entries in rangeInfo.
+extern uint16_t* characterSetInfo[]; // Zero-terminated code point sets, indexed by the value of a CanonicalizeSet range.
+extern UCS2CanonicalizationRange rangeInfo[]; // Sorted by code point; searched by rangeInfoFor().
+
+// This table is similar to the full rangeInfo table, however this maps from UCS2 codepoints to
+// the set of Latin1 codepoints that could match.
+enum LatinCanonicalizationType {
+    CanonicalizeLatinSelf,     // This character is in the Latin1 range, but has no canonical equivalent in the range.
+    CanonicalizeLatinMask0x20, // One of a pair of characters, under the mask 0x20.
+    CanonicalizeLatinOther,    // This character is not in the Latin1 range, but canonicalizes to another that is (stored in value).
+    CanonicalizeLatinInvalid,  // Cannot match against Latin1 input.
+};
+struct LatinCanonicalizationRange { uint16_t begin, end, value, type; }; // Inclusive range [begin, end]; type holds a LatinCanonicalizationType.
+extern const size_t LATIN_CANONICALIZATION_RANGES; // Number of entries in latinRangeInfo.
+extern LatinCanonicalizationRange latinRangeInfo[];
+
+// Binary-searches rangeInfo for the entry whose [begin, end] contains ch. This searches in log2 time over ~364 entries, so should typically result in 8 compares.
+inline UCS2CanonicalizationRange* rangeInfoFor(UChar ch)
+{
+    UCS2CanonicalizationRange* info = rangeInfo; // Start of the sub-array still being searched.
+    size_t entries = UCS2_CANONICALIZATION_RANGES; // Number of entries in that sub-array.
+
+    while (true) { // There is no not-found exit: this relies on some entry of rangeInfo containing ch.
+        size_t candidate = entries >> 1; // Probe the middle entry.
+        UCS2CanonicalizationRange* candidateInfo = info + candidate;
+        if (ch < candidateInfo->begin)
+            entries = candidate; // ch is below the probe: keep the entries before it.
+        else if (ch <= candidateInfo->end)
+            return candidateInfo; // begin <= ch <= end: this is the containing range.
+        else {
+            info = candidateInfo + 1; // ch is above the probe: keep the entries after it.
+            entries -= (candidate + 1);
+        }
+    }
+}
+
+// Returns the other member of ch's canonical pair, given the range info that contains ch. Should only be called for characters that have one canonically matching value.
+inline UChar getCanonicalPair(UCS2CanonicalizationRange* info, UChar ch)
+{
+    ASSERT(ch >= info->begin && ch <= info->end); // info must be the range containing ch.
+    switch (info->type) {
+    case CanonicalizeRangeLo:
+        return ch + info->value; // Partner lies value above ch.
+    case CanonicalizeRangeHi:
+        return ch - info->value; // Partner lies value below ch.
+    case CanonicalizeAlternatingAligned:
+        return ch ^ 1; // Pair starts on an even code point: flip the low bit.
+    case CanonicalizeAlternatingUnaligned:
+        return ((ch - 1) ^ 1) + 1; // Pair starts on an odd code point: shift down by one, flip the low bit, shift back.
+    default:
+        ASSERT_NOT_REACHED(); // CanonicalizeUnique and CanonicalizeSet do not describe a single partner.
+    }
+    ASSERT_NOT_REACHED();
+    return 0; // Fallback value if the assertions above do not stop execution.
+}
+
+// Returns true if no other UCS2 codepoint can match this value, i.e. ch falls in a CanonicalizeUnique range.
+inline bool isCanonicallyUnique(UChar ch)
+{
+    return rangeInfoFor(ch)->type == CanonicalizeUnique;
+}
+
+// Returns true if values are equal, under the canonicalization rules.
+inline bool areCanonicallyEquivalent(UChar a, UChar b)
+{
+    UCS2CanonicalizationRange* info = rangeInfoFor(a); // Only a's range is looked up; b is tested against a's equivalents.
+    switch (info->type) {
+    case CanonicalizeUnique:
+        return a == b;
+    case CanonicalizeSet: {
+        for (uint16_t* set = characterSetInfo[info->value]; (a = *set); ++set) { // Walks the set until its 0 terminator; a is reused to hold the current member.
+            if (a == b)
+                return true;
+        }
+        return false;
+    }
+    case CanonicalizeRangeLo:
+        return (a == b) || (a + info->value == b); // b is a itself, or a's partner above it.
+    case CanonicalizeRangeHi:
+        return (a == b) || (a - info->value == b); // b is a itself, or a's partner below it.
+    case CanonicalizeAlternatingAligned:
+        return (a | 1) == (b | 1); // Both fall in the same even-first pair once the low bit is forced on.
+    case CanonicalizeAlternatingUnaligned:
+        return ((a - 1) | 1) == ((b - 1) | 1); // Same test after shifting the odd-first pair down by one.
+    }
+
+    ASSERT_NOT_REACHED();
+    return false;
+}
+
+} } // JSC::Yarr
+
+#endif
diff --git a/masm/yarr/YarrCanonicalizeUCS2.js b/masm/yarr/YarrCanonicalizeUCS2.js

new file mode 100644 (file)

index 0000000..00361dd
--- /dev/null
+++ b/masm/yarr/YarrCanonicalizeUCS2.js
@@ -0,0 +1,219 @@
+/*
+ * Copyright (C) 2012 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+// Returns the canonical (upper-cased) char code for ch. See ES 5.1, 15.10.2.8
+function canonicalize(ch)
+{
+    var u = String.fromCharCode(ch).toUpperCase();
+    if (u.length > 1) // Upper-casing expanded to more than one character: ch canonicalizes to itself.
+        return ch;
+    var cu = u.charCodeAt(0);
+    if (ch >= 128 && cu < 128) // A non-ASCII character is not allowed to canonicalize to an ASCII one.
+        return ch;
+    return cu;
+}
+
+var MAX_UCS2 = 0xFFFF; // Highest char code processed by the passes below.
+var MAX_LATIN = 0xFF; // Highest char code treated as Latin1.
+
+var groupedCanonically = [];
+// Pass 1: populate groupedCanonically - this is a mapping from canonicalized
+// values back to the set of character codes that canonicalize to them.
+for (var i = 0; i <= MAX_UCS2; ++i) {
+    var ch = canonicalize(i);
+    if (!groupedCanonically[ch])
+        groupedCanonically[ch] = [];
+    groupedCanonically[ch].push(i); // Codes are pushed in ascending order of i.
+}
+
+var typeInfo = []; // typeInfo[ch] is a "<UCS2 type name>:<value>" string.
+var latinTypeInfo = []; // latinTypeInfo[ch] is a "<Latin type name>:<value>" string.
+var characterSetInfo = []; // One sorted array of char codes per group of more than two characters.
+// Pass 2: populate typeInfo & characterSetInfo. For every character calculate
+// a typeInfo value, described by the types above, and a value payload.
+for (cu in groupedCanonically) { // NOTE(review): cu is not declared with var at this scope, so this assigns an implicit global.
+    // The set of characters that canonicalize to cu
+    var characters = groupedCanonically[cu];
+
+    // If there is only one, it is unique.
+    if (characters.length == 1) {
+        typeInfo[characters[0]] = "CanonicalizeUnique:0";
+        latinTypeInfo[characters[0]] = characters[0] <= MAX_LATIN ? "CanonicalizeLatinSelf:0" : "CanonicalizeLatinInvalid:0";
+        continue;
+    }
+
+    // Sort the array numerically, ascending.
+    characters.sort(function(x,y){return x-y;});
+
+    // If there are more than two characters, create an entry in characterSetInfo.
+    if (characters.length > 2) {
+        for (i in characters)
+            typeInfo[characters[i]] = "CanonicalizeSet:" + characterSetInfo.length; // Payload is the index the set is about to get.
+        characterSetInfo.push(characters);
+
+        if (characters[1] <= MAX_LATIN) // Sorted, so this means at least two members are Latin1.
+            throw new Error("sets with more than one latin character not supported!");
+        if (characters[0] <= MAX_LATIN) {
+            for (i in characters)
+                latinTypeInfo[characters[i]] = "CanonicalizeLatinOther:" + characters[0];
+            latinTypeInfo[characters[0]] = "CanonicalizeLatinSelf:0"; // Overrides the entry the loop above wrote for the Latin1 member itself.
+        } else {
+            for (i in characters)
+                latinTypeInfo[characters[i]] = "CanonicalizeLatinInvalid:0";
+        }
+
+        continue;
+    }
+
+    // We have a pair, mark alternating ranges, otherwise track whether this is the low or high partner.
+    var lo = characters[0];
+    var hi = characters[1];
+    var delta = hi - lo;
+    if (delta == 1) {
+        var type = lo & 1 ? "CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0"; // Odd lo means the pair is unaligned.
+        typeInfo[lo] = type;
+        typeInfo[hi] = type;
+    } else {
+        typeInfo[lo] = "CanonicalizeRangeLo:" + delta;
+        typeInfo[hi] = "CanonicalizeRangeHi:" + delta;
+    }
+
+    if (lo > MAX_LATIN) { // Neither member is Latin1.
+        latinTypeInfo[lo] = "CanonicalizeLatinInvalid:0"; 
+        latinTypeInfo[hi] = "CanonicalizeLatinInvalid:0";
+    } else if (hi > MAX_LATIN) { // Only lo is Latin1; hi records lo as its Latin1 equivalent.
+        latinTypeInfo[lo] = "CanonicalizeLatinSelf:0"; 
+        latinTypeInfo[hi] = "CanonicalizeLatinOther:" + lo;
+    } else {
+        if (delta != 0x20 || lo & 0x20) // Both are Latin1: they must differ only in bit 0x20, with lo having it clear.
+            throw new Error("pairs of latin characters that don't mask with 0x20 not supported!");
+        latinTypeInfo[lo] = "CanonicalizeLatinMask0x20:0";
+        latinTypeInfo[hi] = "CanonicalizeLatinMask0x20:0";
+    }
+}
+
+var rangeInfo = [];
+// Pass 3: coalesce runs of consecutive characters with identical typeInfo strings into ranges.
+for (var end = 0; end <= MAX_UCS2; ++end) {
+    var begin = end;
+    var type = typeInfo[end];
+    while (end < MAX_UCS2 && typeInfo[end + 1] == type) // Extend the range while the next character has the same type and value.
+        ++end;
+    rangeInfo.push({begin:begin, end:end, type:type}); // end is inclusive; the loop increment then moves past it.
+}
+
+var latinRangeInfo = [];
+// Pass 4: coalesce runs of consecutive characters with identical latinTypeInfo strings into ranges.
+for (var end = 0; end <= MAX_UCS2; ++end) {
+    var begin = end;
+    var type = latinTypeInfo[end];
+    while (end < MAX_UCS2 && latinTypeInfo[end + 1] == type) // Extend the range while the next character has the same type and value.
+        ++end;
+    latinRangeInfo.push({begin:begin, end:end, type:type}); // end is inclusive; the loop increment then moves past it.
+}
+
+
+// Helper function to convert a number (or numeric string) to a fixed width hex representation of a C uint16_t, e.g. "0x0041u".
+function hex(x)
+{
+    var s = Number(x).toString(16);
+    while (s.length < 4)
+        s = 0 + s; // number + string concatenates, so this prepends the character "0".
+    return "0x" + s + "u";
+}
+
+var copyright = (
+    "/*"                                                                            + "\n" +
+    " * Copyright (C) 2012 Apple Inc. All rights reserved."                         + "\n" +
+    " *"                                                                            + "\n" +
+    " * Redistribution and use in source and binary forms, with or without"         + "\n" +
+    " * modification, are permitted provided that the following conditions"         + "\n" +
+    " * are met:"                                                                   + "\n" +
+    " * 1. Redistributions of source code must retain the above copyright"          + "\n" +
+    " *    notice, this list of conditions and the following disclaimer."           + "\n" +
+    " * 2. Redistributions in binary form must reproduce the above copyright"       + "\n" +
+    " *    notice, this list of conditions and the following disclaimer in the"     + "\n" +
+    " *    documentation and/or other materials provided with the distribution."    + "\n" +
+    " *"                                                                            + "\n" +
+    " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY"                  + "\n" +
+    " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE"          + "\n" +
+    " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR"         + "\n" +
+    " * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR"                   + "\n" +
+    " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,"      + "\n" +
+    " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,"        + "\n" +
+    " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR"         + "\n" +
+    " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY"        + "\n" +
+    " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT"               + "\n" +
+    " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE"      + "\n" +
+    " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "      + "\n" +
+    " */");
+
+print(copyright); // Everything printed from here on forms the generated C++ source.
+print();
+print("// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js");
+print();
+print('#include "config.h"');
+print('#include "YarrCanonicalizeUCS2.h"');
+print();
+print("namespace JSC { namespace Yarr {");
+print();
+print("#include <stdint.h>"); // NOTE(review): printed after the namespace is opened, so this #include is emitted inside JSC::Yarr.
+print();
+
+for (i in characterSetInfo) { // One zero-terminated uint16_t array per character set.
+    var characters = ""
+    var set = characterSetInfo[i];
+    for (var j in set)
+        characters += hex(set[j]) + ", ";
+    print("uint16_t ucs2CharacterSet" + i + "[] = { " + characters + "0 };");
+}
+print();
+print("static const size_t UCS2_CANONICALIZATION_SETS = " + characterSetInfo.length + ";");
+print("uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {");
+for (i in characterSetInfo)
+print("    ucs2CharacterSet" + i + ","); // Body of the for loop above, despite the missing indentation.
+print("};");
+print();
+print("const size_t UCS2_CANONICALIZATION_RANGES = " + rangeInfo.length + ";");
+print("UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {");
+for (i in rangeInfo) {
+    var info = rangeInfo[i];
+    var typeAndValue = info.type.split(':'); // [type name, value] from the "<type>:<value>" string.
+    print("    { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },");
+}
+print("};");
+print();
+print("const size_t LATIN_CANONICALIZATION_RANGES = " + latinRangeInfo.length + ";");
+print("LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = {");
+for (i in latinRangeInfo) {
+    var info = latinRangeInfo[i];
+    var typeAndValue = info.type.split(':'); // [type name, value] from the "<type>:<value>" string.
+    print("    { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },");
+}
+print("};");
+print();
+print("} } // JSC::Yarr");
+print();
+
diff --git a/masm/yarr/YarrInterpreter.cpp b/masm/yarr/YarrInterpreter.cpp

new file mode 100644 (file)

index 0000000..31603f6
--- /dev/null
+++ b/masm/yarr/YarrInterpreter.cpp
@@ -0,0 +1,1964 @@
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "YarrInterpreter.h"
+
+#include "Yarr.h"
+#include "YarrCanonicalizeUCS2.h"
+#include <wtf/BumpPointerAllocator.h>
+#include <wtf/DataLog.h>
+#include <wtf/text/CString.h>
+#include <wtf/text/WTFString.h>
+
+#ifndef NDEBUG
+#include <stdio.h>
+#endif
+
+using namespace WTF;
+
+namespace JSC { namespace Yarr {
+
+template<typename CharType>
+class Interpreter {
+public:
+    struct ParenthesesDisjunctionContext;
+
+    struct BackTrackInfoPatternCharacter { // NOTE(review): these BackTrackInfo* records presumably overlay slots of DisjunctionContext::frame - confirm at the use sites.
+        uintptr_t matchAmount;
+    };
+    struct BackTrackInfoCharacterClass {
+        uintptr_t matchAmount;
+    };
+    struct BackTrackInfoBackReference {
+        uintptr_t begin; // Not really needed for greedy quantifiers.
+        uintptr_t matchAmount; // Not really needed for fixed quantifiers.
+    };
+    struct BackTrackInfoAlternative {
+        uintptr_t offset;
+    };
+    struct BackTrackInfoParentheticalAssertion {
+        uintptr_t begin;
+    };
+    struct BackTrackInfoParenthesesOnce {
+        uintptr_t begin;
+    };
+    struct BackTrackInfoParenthesesTerminal {
+        uintptr_t begin;
+    };
+    struct BackTrackInfoParentheses {
+        uintptr_t matchAmount; // Incremented by appendParenthesesDisjunctionContext, decremented by popParenthesesDisjunctionContext.
+        ParenthesesDisjunctionContext* lastContext; // Head of a singly linked list (via next) of contexts, most recently appended first.
+    };
+
+    static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
+    { // Pushes context onto the front of backTrack's context list and bumps its match count.
+        context->next = backTrack->lastContext; // The new context links to the previous head.
+        backTrack->lastContext = context;
+        ++backTrack->matchAmount;
+    }
+
+    static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
+    { // Unlinks the head of backTrack's context list and drops its match count; the context itself is not freed here.
+        ASSERT(backTrack->matchAmount);
+        ASSERT(backTrack->lastContext);
+        backTrack->lastContext = backTrack->lastContext->next;
+        --backTrack->matchAmount;
+    }
+
+    struct DisjunctionContext
+    { // Fixed header followed by a variable-length frame; allocDisjunctionContext() computes the real size.
+        DisjunctionContext()
+            : term(0) // Only term is initialised; matchBegin, matchEnd and frame are left unset by the constructor.
+        {
+        }
+
+        void* operator new(size_t, void* where)
+        { // Placement form: constructs in storage supplied by the caller.
+            return where;
+        }
+
+        int term;
+        unsigned matchBegin;
+        unsigned matchEnd;
+        uintptr_t frame[1]; // Declared with one element; the allocator sizes the actual tail from m_frameSize.
+    };
+
+    DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
+    { // Allocates a DisjunctionContext from allocatorPool with room for disjunction->m_frameSize frame words.
+        size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); // Struct size minus the one-element frame placeholder, plus the real frame.
+        allocatorPool = allocatorPool->ensureCapacity(size); // The returned pool replaces the current one.
+        if (!allocatorPool)
+            CRASH(); // A null pool from ensureCapacity is treated as fatal.
+        return new (allocatorPool->alloc(size)) DisjunctionContext();
+    }
+
+    void freeDisjunctionContext(DisjunctionContext* context)
+    { // Hands context back to allocatorPool; the pool returned by dealloc becomes the current pool.
+        allocatorPool = allocatorPool->dealloc(context);
+    }
+
+    struct ParenthesesDisjunctionContext
+    { // List node holding a backup of nested subpattern output, followed in memory by a DisjunctionContext.
+        ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term)
+            : next(0)
+        {
+            unsigned firstSubpatternId = term.atom.subpatternId;
+            unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
+
+            for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) { // Two output slots per subpattern.
+                subpatternBackup[i] = output[(firstSubpatternId << 1) + i]; // Save the current value...
+                output[(firstSubpatternId << 1) + i] = offsetNoMatch; // ...then reset the slot to offsetNoMatch.
+            }
+
+            new (getDisjunctionContext(term)) DisjunctionContext(); // Construct the trailing DisjunctionContext in place.
+        }
+
+        void* operator new(size_t, void* where)
+        { // Placement form: constructs in storage supplied by the caller.
+            return where;
+        }
+
+        void restoreOutput(unsigned* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
+        { // Copies the saved slots back into output, undoing the reset done by the constructor.
+            for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
+                output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
+        }
+
+        DisjunctionContext* getDisjunctionContext(ByteTerm& term)
+        { // The DisjunctionContext starts right after the last used subpatternBackup slot.
+            return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
+        }
+
+        ParenthesesDisjunctionContext* next; // Link used by appendParenthesesDisjunctionContext / popParenthesesDisjunctionContext.
+        unsigned subpatternBackup[1]; // Declared with one element; the allocator sizes the actual array from m_numSubpatterns.
+    };
+
+    ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term)
+    { // Size = this struct with its real backup array (two slots per nested subpattern) + a DisjunctionContext with its real frame.
+        size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(unsigned) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
+        allocatorPool = allocatorPool->ensureCapacity(size); // The returned pool replaces the current one.
+        if (!allocatorPool)
+            CRASH(); // A null pool from ensureCapacity is treated as fatal.
+        return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
+    }
+
+    void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
+    { // Hands context back to allocatorPool; the pool returned by dealloc becomes the current pool.
+        allocatorPool = allocatorPool->dealloc(context);
+    }
+
+    class InputStream { // Cursor (pos) over a caller-owned character buffer of the given length.
+    public:
+        InputStream(const CharType* input, unsigned start, unsigned length)
+            : input(input)
+            , pos(start)
+            , length(length)
+        {
+        }
+
+        void next()
+        { // Advances the cursor by one; no bounds check is made here.
+            ++pos;
+        }
+
+        void rewind(unsigned amount)
+        { // Moves the cursor back by amount; going before position 0 is only caught by the ASSERT.
+            ASSERT(pos >= amount);
+            pos -= amount;
+        }
+
+        int read()
+        { // Returns the character at the cursor, or -1 if the cursor is not inside the input.
+            ASSERT(pos < length);
+            if (pos < length)
+                return input[pos];
+            return -1;
+        }
+
+        int readPair()
+        { // Packs two consecutive characters into one int: input[pos] in the low bits, input[pos + 1] shifted left by 16.
+            ASSERT(pos + 1 < length);
+            return input[pos] | input[pos + 1] << 16; // << binds tighter than |, so only the second character is shifted.
+        }
+
+        int readChecked(unsigned negativePositionOffest)
+        { // Reads the character negativePositionOffest places before the cursor.
+            if (pos < negativePositionOffest)
+                CRASH(); // The subtraction below would wrap around.
+            unsigned p = pos - negativePositionOffest;
+            ASSERT(p < length);
+            return input[p];
+        }
+
+        int reread(unsigned from)
+        { // Reads the character at absolute index from, independent of the cursor.
+            ASSERT(from < length);
+            return input[from];
+        }
+
+        int prev()
+        { // Returns the character just before the cursor, or -1 at position 0 or for empty input.
+            ASSERT(!(pos > length));
+            if (pos && length)
+                return input[pos - 1];
+            return -1;
+        }
+
+        unsigned getPos()
+        {
+            return pos;
+        }
+
+        void setPos(unsigned p)
+        { // Sets the cursor without any validation.
+            pos = p;
+        }
+
+        bool atStart()
+        {
+            return pos == 0;
+        }
+
+        bool atEnd()
+        {
+            return pos == length;
+        }
+
+        unsigned end()
+        { // Returns the input length, i.e. the one-past-the-end index.
+            return length;
+        }
+
+        bool checkInput(unsigned count)
+        { // If count more characters are available, advances the cursor past them and returns true; otherwise leaves it alone.
+            if (((pos + count) <= length) && ((pos + count) >= pos)) { // The second test rejects unsigned wrap-around of pos + count.
+                pos += count;
+                return true;
+            }
+            return false;
+        }
+
+        void uncheckInput(unsigned count)
+        { // Moves the cursor back by count, undoing a successful checkInput(count).
+            if (pos < count)
+                CRASH(); // The subtraction below would wrap around.
+            pos -= count;
+        }
+
+        bool atStart(unsigned negativePositionOffest)
+        { // True if the position negativePositionOffest before the cursor is index 0.
+            return pos == negativePositionOffest;
+        }
+
+        bool atEnd(unsigned negativePositionOffest)
+        { // True if the position negativePositionOffest before the cursor is the end of the input.
+            if (pos < negativePositionOffest)
+                CRASH(); // The subtraction below would wrap around.
+            return (pos - negativePositionOffest) == length;
+        }
+
+        bool isAvailableInput(unsigned offset)
+        { // Same test as checkInput, but without moving the cursor.
+            return (((pos + offset) <= length) && ((pos + offset) >= pos));
+        }
+
+    private:
+        const CharType* input; // Not owned; never written through.
+        unsigned pos;
+        unsigned length;
+    };
+
+    bool testCharacterClass(CharacterClass* characterClass, int ch)
+    { // Returns true if ch is listed in characterClass, either as a single match or inside one of its ranges.
+        if (ch & 0xFF80) { // Some bit in 0xFF80 is set, i.e. ch is outside 0x00-0x7f: consult the Unicode lists.
+            for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
+                if (ch == characterClass->m_matchesUnicode[i])
+                    return true;
+            for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i)
+                if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end)) // Ranges are inclusive.
+                    return true;
+        } else { // Otherwise consult the non-Unicode lists.
+            for (unsigned i = 0; i < characterClass->m_matches.size(); ++i)
+                if (ch == characterClass->m_matches[i])
+                    return true;
+            for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i)
+                if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end)) // Ranges are inclusive.
+                    return true;
+        }
+
+        return false;
+    }
+
+    bool checkCharacter(int testChar, unsigned negativeInputOffset)
+    { // True if the input character negativeInputOffset places before the cursor equals testChar.
+        return testChar == input.readChecked(negativeInputOffset);
+    }
+
+    bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset)
+    { // True if the input character negativeInputOffset places before the cursor equals either loChar or hiChar.
+        int ch = input.readChecked(negativeInputOffset);
+        return (loChar == ch) || (hiChar == ch);
+    }
+
+    // Tests the input character read at `negativeInputOffset` against
+    // `characterClass`; `invert` flips the outcome (negated class).
+    bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset)
+    {
+        int ch = input.readChecked(negativeInputOffset);
+        bool inClass = testCharacterClass(characterClass, ch);
+        // With two bools, "differs from invert" is the same as "invert ? !inClass : inClass".
+        return inClass != invert;
+    }
+
+    // Attempts to match the already-captured text [matchBegin, matchEnd) once
+    // more at the current input position.
+    //
+    // Returns false if there is not enough input or any character differs; in
+    // the mismatch case the matchSize characters claimed by checkInput() are
+    // released again with uncheckInput(). On success the claimed input stays
+    // claimed.
+    bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset)
+    {
+        unsigned matchSize = (unsigned)(matchEnd - matchBegin);
+
+        // NOTE(review): checkInput() is defined outside this view; it presumably
+        // advances the position by matchSize when that much input is available.
+        if (!input.checkInput(matchSize))
+            return false;
+
+        if (pattern->m_ignoreCase) {
+            for (unsigned i = 0; i < matchSize; ++i) {
+                // oldCh: character from the earlier capture; ch: the candidate
+                // character, addressed backwards from the current position.
+                int oldCh = input.reread(matchBegin + i);
+                int ch = input.readChecked(negativeInputOffset + matchSize - i);
+
+                if (oldCh == ch)
+                    continue;
+
+                // The definition for canonicalize (see ES 5.1, 15.10.2.8) means that
+                // unicode values are never allowed to match against ascii ones.
+                if (isASCII(oldCh) || isASCII(ch)) {
+                    if (toASCIIUpper(oldCh) == toASCIIUpper(ch))
+                        continue;
+                } else if (areCanonicallyEquivalent(oldCh, ch))
+                    continue;
+
+                // Mismatch: give back everything claimed above.
+                input.uncheckInput(matchSize);
+                return false;
+            }
+        } else {
+            for (unsigned i = 0; i < matchSize; ++i) {
+                if (!checkCharacter(input.reread(matchBegin + i), negativeInputOffset + matchSize - i)) {
+                    // Mismatch: give back everything claimed above.
+                    input.uncheckInput(matchSize);
+                    return false;
+                }
+            }
+        }
+
+        return true;
+    }
+
+    // Beginning-of-line assertion: holds at the start of the input, or, for
+    // multiline patterns, when the character read at inputPosition + 1 is a
+    // newline.
+    bool matchAssertionBOL(ByteTerm& term)
+    {
+        if (input.atStart(term.inputPosition))
+            return true;
+        if (!pattern->m_multiline)
+            return false;
+        return testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1));
+    }
+
+    // End-of-line assertion: holds at the end of the input, or, for multiline
+    // patterns, when the character at the asserted position is a newline.
+    // A zero inputPosition uses the offset-free atEnd()/read() accessors.
+    bool matchAssertionEOL(ByteTerm& term)
+    {
+        if (!term.inputPosition) {
+            if (input.atEnd())
+                return true;
+            if (!pattern->m_multiline)
+                return false;
+            return testCharacterClass(pattern->newlineCharacterClass, input.read());
+        }
+
+        if (input.atEnd(term.inputPosition))
+            return true;
+        if (!pattern->m_multiline)
+            return false;
+        return testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition));
+    }
+
+    bool matchAssertionWordBoundary(ByteTerm& term)
+    {
+        bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition + 1));
+        bool readIsWordchar;
+        if (term.inputPosition)
+            readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
+        else
+            readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
+
+        bool wordBoundary = prevIsWordchar != readIsWordchar;
+        return term.invert() ? !wordBoundary : wordBoundary;
+    }
+
+    // Backtracks into a quantified single-character term. Returns true when a
+    // different repetition count was set up and forward matching may resume;
+    // false when this term has no further alternatives.
+    bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
+    {
+        BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
+
+        switch (term.atom.quantityType) {
+        case QuantifierFixedCount:
+            // A fixed count has no other repetition count to try.
+            break;
+
+        case QuantifierGreedy:
+            // Give back one matched character, if any were matched.
+            if (backTrack->matchAmount) {
+                --backTrack->matchAmount;
+                input.uncheckInput(1);
+                return true;
+            }
+            break;
+
+        case QuantifierNonGreedy:
+            // Try to take one more character, up to the quantifier's limit.
+            if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
+                ++backTrack->matchAmount;
+                if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1))
+                    return true;
+            }
+            // Could not extend: release everything counted in matchAmount.
+            input.uncheckInput(backTrack->matchAmount);
+            break;
+        }
+
+        return false;
+    }
+
+    // Backtracks into a quantified case-insensitive character term. Mirrors
+    // backtrackPatternCharacter but tests against both case variants. Returns
+    // true when forward matching may resume, false when no alternative is left.
+    bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
+    {
+        BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
+
+        switch (term.atom.quantityType) {
+        case QuantifierFixedCount:
+            // A fixed count has no other repetition count to try.
+            break;
+
+        case QuantifierGreedy:
+            // Give back one matched character, if any were matched.
+            if (backTrack->matchAmount) {
+                --backTrack->matchAmount;
+                input.uncheckInput(1);
+                return true;
+            }
+            break;
+
+        case QuantifierNonGreedy:
+            // Try to take one more character, up to the quantifier's limit.
+            if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
+                ++backTrack->matchAmount;
+                if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1))
+                    return true;
+            }
+            // Could not extend: release everything counted in matchAmount.
+            input.uncheckInput(backTrack->matchAmount);
+            break;
+        }
+
+        return false;
+    }
+
+    // Forward-matches a quantified character class term. Returns true when the
+    // term matched (possibly zero times for greedy / non-greedy quantifiers).
+    // For greedy and non-greedy quantifiers the number of repetitions taken is
+    // stored in the term's backtrack frame slot.
+    bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
+    {
+        ASSERT(term.type == ByteTerm::TypeCharacterClass);
+        BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
+
+        switch (term.atom.quantityType) {
+        case QuantifierFixedCount: {
+            // No checkInput() here: the characters are read at offsets relative
+            // to the current position (presumably pre-checked by the caller).
+            for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
+                if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount))
+                    return false;
+            }
+            return true;
+        }
+
+        case QuantifierGreedy: {
+            // Take as many characters as possible, up to quantityCount.
+            unsigned matchAmount = 0;
+            while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
+                if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
+                    // The character just claimed does not match; give it back.
+                    input.uncheckInput(1);
+                    break;
+                }
+                ++matchAmount;
+            }
+            backTrack->matchAmount = matchAmount;
+
+            return true;
+        }
+
+        case QuantifierNonGreedy:
+            // Start with zero repetitions; backtracking adds more on demand.
+            backTrack->matchAmount = 0;
+            return true;
+        }
+
+        ASSERT_NOT_REACHED();
+        return false;
+    }
+
+    // Backtracks into a quantified character class term. Returns true when a
+    // different repetition count was set up and forward matching may resume;
+    // false when this term has no further alternatives.
+    bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
+    {
+        ASSERT(term.type == ByteTerm::TypeCharacterClass);
+        BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
+
+        switch (term.atom.quantityType) {
+        case QuantifierFixedCount:
+            // A fixed count has no other repetition count to try.
+            break;
+
+        case QuantifierGreedy:
+            // Give back one matched character, if any were matched.
+            if (backTrack->matchAmount) {
+                --backTrack->matchAmount;
+                input.uncheckInput(1);
+                return true;
+            }
+            break;
+
+        case QuantifierNonGreedy:
+            // Try to take one more character, up to the quantifier's limit.
+            if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
+                ++backTrack->matchAmount;
+                if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1))
+                    return true;
+            }
+            // Could not extend: release everything counted in matchAmount.
+            input.uncheckInput(backTrack->matchAmount);
+            break;
+        }
+
+        return false;
+    }
+
+    // Forward-matches a (possibly quantified) back reference to the capture
+    // identified by term.atom.subpatternId. A capture that is unset or empty
+    // makes the back reference match trivially without consuming input.
+    bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
+    {
+        ASSERT(term.type == ByteTerm::TypeBackReference);
+        BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
+
+        // Captures are stored as (begin, end) pairs at output[2 * id] and output[2 * id + 1].
+        unsigned matchBegin = output[(term.atom.subpatternId << 1)];
+        unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
+
+        // If the end position of the referenced capture has not been set yet, the
+        // back reference sits inside the very parentheses it refers to. In that
+        // case it matches the empty string, just like a reference to parentheses
+        // with a zero-width match.
+        // E.g.: /(a\1)/
+        if (matchEnd == offsetNoMatch)
+            return true;
+
+        // The referenced capture did not participate in the match at all.
+        if (matchBegin == offsetNoMatch)
+            return true;
+
+        ASSERT(matchBegin <= matchEnd);
+
+        // Zero-width capture: nothing to consume.
+        if (matchBegin == matchEnd)
+            return true;
+
+        switch (term.atom.quantityType) {
+        case QuantifierFixedCount: {
+            // Remember the starting position so a partial run can be undone.
+            backTrack->begin = input.getPos();
+            for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
+                if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
+                    input.setPos(backTrack->begin);
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        case QuantifierGreedy: {
+            // Consume as many copies as possible, up to quantityCount.
+            unsigned matchAmount = 0;
+            while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
+                ++matchAmount;
+            backTrack->matchAmount = matchAmount;
+            return true;
+        }
+
+        case QuantifierNonGreedy:
+            // Start with zero copies; backtracking adds more on demand.
+            backTrack->begin = input.getPos();
+            backTrack->matchAmount = 0;
+            return true;
+        }
+
+        ASSERT_NOT_REACHED();
+        return false;
+    }
+
+    // Backtracks into a (possibly quantified) back reference. Returns true when
+    // a different repetition count was set up and forward matching may resume;
+    // false when this term has no further alternatives.
+    bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
+    {
+        ASSERT(term.type == ByteTerm::TypeBackReference);
+        BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
+
+        // Captures are stored as (begin, end) pairs at output[2 * id] and output[2 * id + 1].
+        unsigned matchBegin = output[(term.atom.subpatternId << 1)];
+        unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
+
+        // An unset capture matched trivially, so there is nothing to undo.
+        // NOTE(review): matchBackReference also returns early when
+        // matchEnd == offsetNoMatch (without writing the backtrack frame); that
+        // case is not special-cased here - confirm it cannot reach the switch below.
+        if (matchBegin == offsetNoMatch)
+            return false;
+
+        ASSERT(matchBegin <= matchEnd);
+
+        // A zero-width capture matched trivially as well.
+        if (matchBegin == matchEnd)
+            return false;
+
+        switch (term.atom.quantityType) {
+        case QuantifierFixedCount:
+            // for quantityCount == 1, could rewind.
+            input.setPos(backTrack->begin);
+            break;
+
+        case QuantifierGreedy:
+            // Give back one consumed copy of the captured text, if any.
+            if (backTrack->matchAmount) {
+                --backTrack->matchAmount;
+                input.rewind(matchEnd - matchBegin);
+                return true;
+            }
+            break;
+
+        case QuantifierNonGreedy:
+            // Try to consume one more copy, up to the quantifier's limit.
+            if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
+                ++backTrack->matchAmount;
+                return true;
+            }
+            // Could not extend: restore the position saved by matchBackReference.
+            input.setPos(backTrack->begin);
+            break;
+        }
+
+        return false;
+    }
+
+    void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
+    {
+        if (term.capture()) {
+            unsigned subpatternId = term.atom.subpatternId;
+            output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
+            output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
+        }
+    }
+    // Asks `context` to restore the output entries for this term's subpattern
+    // and the subpatterns nested in its disjunction (m_numSubpatterns of them).
+    void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
+    {
+        unsigned subpatternCount = term.atom.parenthesesDisjunction->m_numSubpatterns;
+        unsigned firstId = term.atom.subpatternId;
+        context->restoreOutput(output, firstId, subpatternCount);
+    }
+    // Backtracks through the stack of recorded parentheses iterations, newest
+    // first, until one of them yields an alternative match.
+    //
+    // Returns JSRegExpMatch as soon as an iteration re-matches, JSRegExpNoMatch
+    // when every iteration has been popped, or any other (error) result
+    // produced by matchDisjunction.
+    JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
+    {
+        while (backTrack->matchAmount) {
+            ParenthesesDisjunctionContext* context = backTrack->lastContext;
+
+            // The trailing `true` asks matchDisjunction to resume by backtracking.
+            JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true);
+            if (result == JSRegExpMatch)
+                return JSRegExpMatch;
+
+            // This iteration is exhausted: restore the captures, then pop and free it.
+            resetMatches(term, context);
+            popParenthesesDisjunctionContext(backTrack);
+            freeParenthesesDisjunctionContext(context);
+
+            // Propagate anything other than a plain no-match (e.g. an error).
+            if (result != JSRegExpNoMatch)
+                return result;
+        }
+
+        return JSRegExpNoMatch;
+    }
+
+    // Forward-matches the opening of parentheses whose quantityCount is 1.
+    // Always returns true; for a non-greedy quantifier the body is skipped by
+    // advancing context->term past it.
+    bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
+    {
+        ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+        ASSERT(term.atom.quantityCount == 1);
+
+        BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
+
+        switch (term.atom.quantityType) {
+        case QuantifierGreedy: {
+            // set this speculatively; if we get to the parens end this will be true.
+            backTrack->begin = input.getPos();
+            break;
+        }
+        case QuantifierNonGreedy: {
+            // Prefer matching nothing first: mark the group as not entered and
+            // jump over its body.
+            backTrack->begin = notFound;
+            context->term += term.atom.parenthesesWidth;
+            return true;
+        }
+        case QuantifierFixedCount:
+            break;
+        }
+
+        // Record where the capture starts.
+        if (term.capture()) {
+            unsigned subpatternId = term.atom.subpatternId;
+            output[(subpatternId << 1)] = input.getPos() - term.inputPosition;
+        }
+
+        return true;
+    }
+
+    // Forward-matches the closing of parentheses whose quantityCount is 1.
+    // Records the capture end; for greedy / non-greedy quantifiers the match is
+    // rejected when the group consumed no input.
+    bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
+    {
+        ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
+        ASSERT(term.atom.quantityCount == 1);
+
+        if (term.capture()) {
+            unsigned subpatternId = term.atom.subpatternId;
+            output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
+        }
+
+        if (term.atom.quantityType == QuantifierFixedCount)
+            return true;
+
+        // Fails when the position is unchanged since the group was entered.
+        BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
+        return backTrack->begin != input.getPos();
+    }
+
+    // Backtracks to the opening of parentheses whose quantityCount is 1.
+    // Clears the capture; a greedy group then gets a second chance by being
+    // skipped entirely (returns true), other quantifier types fail (false).
+    bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
+    {
+        ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+        ASSERT(term.atom.quantityCount == 1);
+
+        BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
+
+        // The group's body failed, so its capture no longer holds.
+        if (term.capture()) {
+            unsigned subpatternId = term.atom.subpatternId;
+            output[(subpatternId << 1)] = offsetNoMatch;
+            output[(subpatternId << 1) + 1] = offsetNoMatch;
+        }
+
+        switch (term.atom.quantityType) {
+        case QuantifierGreedy:
+            // if we backtrack to this point, there is another chance - try matching nothing.
+            ASSERT(backTrack->begin != notFound);
+            backTrack->begin = notFound;
+            context->term += term.atom.parenthesesWidth;
+            return true;
+        case QuantifierNonGreedy:
+            ASSERT(backTrack->begin != notFound);
+            // Intentional fallthrough: non-greedy already tried the empty match.
+        case QuantifierFixedCount:
+            break;
+        }
+
+        return false;
+    }
+
+    // Backtracks to the closing of parentheses whose quantityCount is 1.
+    // If the group was skipped (begin == notFound): a greedy group has nothing
+    // left to try, while a non-greedy group is now entered for the first time
+    // (returns true with context->term moved back to the group's begin term).
+    bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
+    {
+        ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
+        ASSERT(term.atom.quantityCount == 1);
+
+        BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
+
+        switch (term.atom.quantityType) {
+        case QuantifierGreedy:
+            if (backTrack->begin == notFound) {
+                context->term -= term.atom.parenthesesWidth;
+                return false;
+            }
+            // Intentional fallthrough: begin != notFound here, so the check
+            // below is false and control reaches the final break.
+        case QuantifierNonGreedy:
+            if (backTrack->begin == notFound) {
+                backTrack->begin = input.getPos();
+                if (term.capture()) {
+                    // Technically this access to inputPosition should be accessing the begin term's
+                    // inputPosition, but for repeats other than fixed these values should be
+                    // the same anyway! (We don't pre-check for greedy or non-greedy matches.)
+                    ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+                    ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
+                    unsigned subpatternId = term.atom.subpatternId;
+                    output[subpatternId << 1] = input.getPos() + term.inputPosition;
+                }
+                context->term -= term.atom.parenthesesWidth;
+                return true;
+            }
+            // Intentional fallthrough.
+        case QuantifierFixedCount:
+            break;
+        }
+
+        return false;
+    }
+
+    bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
+    {
+        ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
+        ASSERT(term.atom.quantityType == QuantifierGreedy);
+        ASSERT(term.atom.quantityCount == quantifyInfinite);
+        ASSERT(!term.capture());
+
+        BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
+        backTrack->begin = input.getPos();
+        return true;
+    }
+
+    bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context)
+    {
+        ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd);
+
+        BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
+        // Empty match is a failed match.
+        if (backTrack->begin == input.getPos())
+            return false;
+
+        // Successful match! Okay, what's next? - loop around and try to match moar!
+        context->term -= (term.atom.parenthesesWidth + 1);
+        return true;
+    }
+
+    bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
+    {
+        ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
+        ASSERT(term.atom.quantityType == QuantifierGreedy);
+        ASSERT(term.atom.quantityCount == quantifyInfinite);
+        ASSERT(!term.capture());
+
+        // If we backtrack to this point, we have failed to match this iteration of the parens.
+        // Since this is greedy / zero minimum a failed is also accepted as a match!
+        context->term += term.atom.parenthesesWidth;
+        return true;
+    }
+
+    // Must never run: 'terminal' parentheses close the regex, so anything
+    // matched beyond them is already a successful overall match and nothing
+    // can backtrack into this term.
+    bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
+    {
+        ASSERT_NOT_REACHED();
+        return false;
+    }
+
+    // Enters a parenthetical assertion: saves the current input position in
+    // the backtrack frame so the end terms can restore it.
+    bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
+    {
+        ASSERT(term.atom.quantityCount == 1);
+        ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
+
+        BackTrackInfoParentheticalAssertion* frameInfo = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
+        frameInfo->begin = input.getPos();
+        return true;
+    }
+
+    bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
+    {
+        ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
+        ASSERT(term.atom.quantityCount == 1);
+
+        BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
+
+        input.setPos(backTrack->begin);
+
+        // We've reached the end of the parens; if they are inverted, this is failure.
+        if (term.invert()) {
+            context->term -= term.atom.parenthesesWidth;
+            return false;
+        }
+
+        return true;
+    }
+
+    bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
+    {
+        ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
+        ASSERT(term.atom.quantityCount == 1);
+
+        // We've failed to match parens; if they are inverted, this is win!
+        if (term.invert()) {
+            context->term += term.atom.parenthesesWidth;
+            return true;
+        }
+
+        return false;
+    }
+
+    bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
+    {
+        ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
+        ASSERT(term.atom.quantityCount == 1);
+
+        BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
+
+        input.setPos(backTrack->begin);
+
+        context->term -= term.atom.parenthesesWidth;
+        return false;
+    }
+
+    // Forward-matches a general quantified parentheses subpattern. Each
+    // successful iteration gets its own ParenthesesDisjunctionContext, which is
+    // appended to the list tracked by the term's backtrack frame slot
+    // (matchAmount / lastContext).
+    //
+    // Returns JSRegExpMatch, JSRegExpNoMatch, or an error result propagated
+    // from matching the body.
+    JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
+    {
+        ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
+
+        BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
+        ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
+
+        // Start with an empty list of iterations.
+        backTrack->matchAmount = 0;
+        backTrack->lastContext = 0;
+
+        // NOTE: the local `context` variables declared in the cases below
+        // shadow the DisjunctionContext* parameter of the same name.
+        switch (term.atom.quantityType) {
+        case QuantifierFixedCount: {
+            // While we haven't yet reached our fixed limit,
+            while (backTrack->matchAmount < term.atom.quantityCount) {
+                // Try to do a match, and if it succeeds, add it to the list.
+                ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
+                JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+                if (result == JSRegExpMatch)
+                    appendParenthesesDisjunctionContext(backTrack, context);
+                else {
+                    // The match failed; try to find an alternate point to carry on from.
+                    resetMatches(term, context);
+                    freeParenthesesDisjunctionContext(context);
+
+                    // Propagate anything other than a plain no-match (e.g. an error).
+                    if (result != JSRegExpNoMatch)
+                        return result;
+                    JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
+                    if (backtrackResult != JSRegExpMatch)
+                        return backtrackResult;
+                }
+            }
+
+            ASSERT(backTrack->matchAmount == term.atom.quantityCount);
+            // The capture reflects the last iteration.
+            ParenthesesDisjunctionContext* context = backTrack->lastContext;
+            recordParenthesesMatch(term, context);
+            return JSRegExpMatch;
+        }
+
+        case QuantifierGreedy: {
+            // Collect as many iterations as possible; the first failing
+            // iteration ends the loop.
+            while (backTrack->matchAmount < term.atom.quantityCount) {
+                ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
+                JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+                if (result == JSRegExpMatch)
+                    appendParenthesesDisjunctionContext(backTrack, context);
+                else {
+                    resetMatches(term, context);
+                    freeParenthesesDisjunctionContext(context);
+
+                    if (result != JSRegExpNoMatch)
+                        return result;
+
+                    break;
+                }
+            }
+
+            // Zero iterations is still a match; only record a capture if
+            // at least one iteration exists.
+            if (backTrack->matchAmount) {
+                ParenthesesDisjunctionContext* context = backTrack->lastContext;
+                recordParenthesesMatch(term, context);
+            }
+            return JSRegExpMatch;
+        }
+
+        case QuantifierNonGreedy:
+            // Begin with zero iterations; backtrackParentheses adds more on demand.
+            return JSRegExpMatch;
+        }
+
+        ASSERT_NOT_REACHED();
+        return JSRegExpErrorNoMatch;
+    }
+
+    // Rules for backtracking differ depending on whether this is greedy or non-greedy.
+    //
+    // Greedy matches never should try just adding more - you should already have done
+    // the 'more' cases.  Always backtrack, at least a leetle bit.  However cases where
+    // you backtrack an item off the list needs checking, since we'll never have matched
+    // the one less case.  Tracking forwards, still add as much as possible.
+    //
+    // Non-greedy, we've already done the one less case, so don't match on popping.
+    // We haven't done the one more case, so always try to add that.
+    //
+    // Backtracks into a general quantified parentheses subpattern, using the
+    // list of iteration contexts built by matchParentheses.
+    //
+    // Returns JSRegExpMatch when an alternative state was found,
+    // JSRegExpNoMatch when none exists, or an error result propagated from
+    // matching the body.
+    JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
+    {
+        ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
+
+        BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
+        ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
+
+        // NOTE: the local `context` variables declared in the cases below
+        // shadow the DisjunctionContext* parameter of the same name.
+        switch (term.atom.quantityType) {
+        case QuantifierFixedCount: {
+            ASSERT(backTrack->matchAmount == term.atom.quantityCount);
+
+            ParenthesesDisjunctionContext* context = 0;
+            // First find an earlier iteration that can be matched differently.
+            JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
+
+            if (result != JSRegExpMatch)
+                return result;
+
+            // While we haven't yet reached our fixed limit,
+            while (backTrack->matchAmount < term.atom.quantityCount) {
+                // Try to do a match, and if it succeeds, add it to the list.
+                context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
+                result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+
+                if (result == JSRegExpMatch)
+                    appendParenthesesDisjunctionContext(backTrack, context);
+                else {
+                    // The match failed; try to find an alternate point to carry on from.
+                    resetMatches(term, context);
+                    freeParenthesesDisjunctionContext(context);
+
+                    // Propagate anything other than a plain no-match (e.g. an error).
+                    if (result != JSRegExpNoMatch)
+                        return result;
+                    JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
+                    if (backtrackResult != JSRegExpMatch)
+                        return backtrackResult;
+                }
+            }
+
+            ASSERT(backTrack->matchAmount == term.atom.quantityCount);
+            // The capture reflects the last iteration.
+            context = backTrack->lastContext;
+            recordParenthesesMatch(term, context);
+            return JSRegExpMatch;
+        }
+
+        case QuantifierGreedy: {
+            // Nothing was matched, so there is nothing to give back.
+            if (!backTrack->matchAmount)
+                return JSRegExpNoMatch;
+
+            // Ask the most recent iteration for an alternative (trailing `true`
+            // resumes it in backtracking mode).
+            ParenthesesDisjunctionContext* context = backTrack->lastContext;
+            JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
+            if (result == JSRegExpMatch) {
+                // The iteration re-matched; greedily add further iterations again.
+                while (backTrack->matchAmount < term.atom.quantityCount) {
+                    ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
+                    JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+                    if (parenthesesResult == JSRegExpMatch)
+                        appendParenthesesDisjunctionContext(backTrack, context);
+                    else {
+                        resetMatches(term, context);
+                        freeParenthesesDisjunctionContext(context);
+
+                        if (parenthesesResult != JSRegExpNoMatch)
+                            return parenthesesResult;
+
+                        break;
+                    }
+                }
+            } else {
+                // The iteration is exhausted: drop it, leaving one fewer iteration.
+                resetMatches(term, context);
+                popParenthesesDisjunctionContext(backTrack);
+                freeParenthesesDisjunctionContext(context);
+
+                if (result != JSRegExpNoMatch)
+                    return result;
+            }
+
+            // Only record a capture if at least one iteration remains.
+            if (backTrack->matchAmount) {
+                ParenthesesDisjunctionContext* context = backTrack->lastContext;
+                recordParenthesesMatch(term, context);
+            }
+            return JSRegExpMatch;
+        }
+
+        case QuantifierNonGreedy: {
+            // If we've not reached the limit, try to add one more match.
+            if (backTrack->matchAmount < term.atom.quantityCount) {
+                ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
+                JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+                if (result == JSRegExpMatch) {
+                    appendParenthesesDisjunctionContext(backTrack, context);
+                    recordParenthesesMatch(term, context);
+                    return JSRegExpMatch;
+                }
+
+                resetMatches(term, context);
+                freeParenthesesDisjunctionContext(context);
+
+                if (result != JSRegExpNoMatch)
+                    return result;
+            }
+
+            // Nope - okay backtrack looking for an alternative.
+            while (backTrack->matchAmount) {
+                ParenthesesDisjunctionContext* context = backTrack->lastContext;
+                JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
+                if (result == JSRegExpMatch) {
+                    // successful backtrack! we're back in the game!
+                    if (backTrack->matchAmount) {
+                        context = backTrack->lastContext;
+                        recordParenthesesMatch(term, context);
+                    }
+                    return JSRegExpMatch;
+                }
+
+                // pop a match off the stack
+                resetMatches(term, context);
+                popParenthesesDisjunctionContext(backTrack);
+                freeParenthesesDisjunctionContext(context);
+
+                if (result != JSRegExpNoMatch)
+                    return result;
+            }
+
+            return JSRegExpNoMatch;
+        }
+        }
+
+        ASSERT_NOT_REACHED();
+        return JSRegExpErrorNoMatch;
+    }
+
+    bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context)
+    {
+        UNUSED_PARAM(term);
+        unsigned matchBegin = context->matchBegin;
+
+        if (matchBegin) {
+            for (matchBegin--; true; matchBegin--) {
+                if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) {
+                    ++matchBegin;
+                    break;
+                }
+
+                if (!matchBegin)
+                    break;
+            }
+        }
+
+        unsigned matchEnd = input.getPos();
+
+        for (; (matchEnd != input.end())
+             && (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { }
+
+        if (((matchBegin && term.anchors.m_bol)
+             || ((matchEnd != input.end()) && term.anchors.m_eol))
+            && !pattern->m_multiline)
+            return false;
+
+        context->matchBegin = matchBegin;
+        context->matchEnd = matchEnd;
+        return true;
+    }
+
+// Helper macros for the term loop in matchDisjunction; they rely on the local
+// names `context` and `disjunction` and on the labels `matchAgain` / `backtrack`.
+// MATCH_NEXT: advance to the next term and continue matching forwards.
+#define MATCH_NEXT() { ++context->term; goto matchAgain; }
+// BACKTRACK: step back to the previous term and continue at the backtrack label.
+#define BACKTRACK() { --context->term; goto backtrack; }
+// currentTerm: the ByteTerm that context->term currently indexes.
+#define currentTerm() (disjunction->terms[context->term])
+    JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
+    {
+        if (!--remainingMatchCount)
+            return JSRegExpErrorHitLimit;
+
+        if (btrack)
+            BACKTRACK();
+
+        context->matchBegin = input.getPos();
+        context->term = 0;
+
+    matchAgain:
+        ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
+
+        switch (currentTerm().type) {
+        case ByteTerm::TypeSubpatternBegin:
+            MATCH_NEXT();
+        case ByteTerm::TypeSubpatternEnd:
+            context->matchEnd = input.getPos();
+            return JSRegExpMatch;
+
+        case ByteTerm::TypeBodyAlternativeBegin:
+            MATCH_NEXT();
+        case ByteTerm::TypeBodyAlternativeDisjunction:
+        case ByteTerm::TypeBodyAlternativeEnd:
+            context->matchEnd = input.getPos();
+            return JSRegExpMatch;
+
+        case ByteTerm::TypeAlternativeBegin:
+            MATCH_NEXT();
+        case ByteTerm::TypeAlternativeDisjunction:
+        case ByteTerm::TypeAlternativeEnd: {
+            int offset = currentTerm().alternative.end;
+            BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
+            backTrack->offset = offset;
+            context->term += offset;
+            MATCH_NEXT();
+        }
+
+        case ByteTerm::TypeAssertionBOL:
+            if (matchAssertionBOL(currentTerm()))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeAssertionEOL:
+            if (matchAssertionEOL(currentTerm()))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeAssertionWordBoundary:
+            if (matchAssertionWordBoundary(currentTerm()))
+                MATCH_NEXT();
+            BACKTRACK();
+
+        case ByteTerm::TypePatternCharacterOnce:
+        case ByteTerm::TypePatternCharacterFixed: {
+            for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
+                if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount))
+                    BACKTRACK();
+            }
+            MATCH_NEXT();
+        }
+        case ByteTerm::TypePatternCharacterGreedy: {
+            BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
+            unsigned matchAmount = 0;
+            while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
+                if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) {
+                    input.uncheckInput(1);
+                    break;
+                }
+                ++matchAmount;
+            }
+            backTrack->matchAmount = matchAmount;
+
+            MATCH_NEXT();
+        }
+        case ByteTerm::TypePatternCharacterNonGreedy: {
+            BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
+            backTrack->matchAmount = 0;
+            MATCH_NEXT();
+        }
+
+        case ByteTerm::TypePatternCasedCharacterOnce:
+        case ByteTerm::TypePatternCasedCharacterFixed: {
+            for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
+                if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount))
+                    BACKTRACK();
+            }
+            MATCH_NEXT();
+        }
+        case ByteTerm::TypePatternCasedCharacterGreedy: {
+            BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
+            unsigned matchAmount = 0;
+            while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
+                if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) {
+                    input.uncheckInput(1);
+                    break;
+                }
+                ++matchAmount;
+            }
+            backTrack->matchAmount = matchAmount;
+
+            MATCH_NEXT();
+        }
+        case ByteTerm::TypePatternCasedCharacterNonGreedy: {
+            BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
+            backTrack->matchAmount = 0;
+            MATCH_NEXT();
+        }
+
+        case ByteTerm::TypeCharacterClass:
+            if (matchCharacterClass(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeBackReference:
+            if (matchBackReference(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeParenthesesSubpattern: {
+            JSRegExpResult result = matchParentheses(currentTerm(), context);
+
+            if (result == JSRegExpMatch) {
+                MATCH_NEXT();
+            }  else if (result != JSRegExpNoMatch)
+                return result;
+
+            BACKTRACK();
+        }
+        case ByteTerm::TypeParenthesesSubpatternOnceBegin:
+            if (matchParenthesesOnceBegin(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeParenthesesSubpatternOnceEnd:
+            if (matchParenthesesOnceEnd(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
+            if (matchParenthesesTerminalBegin(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
+            if (matchParenthesesTerminalEnd(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeParentheticalAssertionBegin:
+            if (matchParentheticalAssertionBegin(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeParentheticalAssertionEnd:
+            if (matchParentheticalAssertionEnd(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+
+        case ByteTerm::TypeCheckInput:
+            if (input.checkInput(currentTerm().checkInputCount))
+                MATCH_NEXT();
+            BACKTRACK();
+
+        case ByteTerm::TypeUncheckInput:
+            input.uncheckInput(currentTerm().checkInputCount);
+            MATCH_NEXT();
+                
+        case ByteTerm::TypeDotStarEnclosure:
+            if (matchDotStarEnclosure(currentTerm(), context))
+                return JSRegExpMatch;
+            BACKTRACK();
+        }
+
+        // We should never fall-through to here.
+        ASSERT_NOT_REACHED();
+
+    backtrack:
+        ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
+
+        switch (currentTerm().type) {
+        case ByteTerm::TypeSubpatternBegin:
+            return JSRegExpNoMatch;
+        case ByteTerm::TypeSubpatternEnd:
+            ASSERT_NOT_REACHED();
+
+        case ByteTerm::TypeBodyAlternativeBegin:
+        case ByteTerm::TypeBodyAlternativeDisjunction: {
+            int offset = currentTerm().alternative.next;
+            context->term += offset;
+            if (offset > 0)
+                MATCH_NEXT();
+
+            if (input.atEnd())
+                return JSRegExpNoMatch;
+
+            input.next();
+
+            context->matchBegin = input.getPos();
+
+            if (currentTerm().alternative.onceThrough)
+                context->term += currentTerm().alternative.next;
+
+            MATCH_NEXT();
+        }
+        case ByteTerm::TypeBodyAlternativeEnd:
+            ASSERT_NOT_REACHED();
+
+        case ByteTerm::TypeAlternativeBegin:
+        case ByteTerm::TypeAlternativeDisjunction: {
+            int offset = currentTerm().alternative.next;
+            context->term += offset;
+            if (offset > 0)
+                MATCH_NEXT();
+            BACKTRACK();
+        }
+        case ByteTerm::TypeAlternativeEnd: {
+            // We should never backtrack back into an alternative of the main body of the regex.
+            BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
+            unsigned offset = backTrack->offset;
+            context->term -= offset;
+            BACKTRACK();
+        }
+
+        case ByteTerm::TypeAssertionBOL:
+        case ByteTerm::TypeAssertionEOL:
+        case ByteTerm::TypeAssertionWordBoundary:
+            BACKTRACK();
+
+        case ByteTerm::TypePatternCharacterOnce:
+        case ByteTerm::TypePatternCharacterFixed:
+        case ByteTerm::TypePatternCharacterGreedy:
+        case ByteTerm::TypePatternCharacterNonGreedy:
+            if (backtrackPatternCharacter(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypePatternCasedCharacterOnce:
+        case ByteTerm::TypePatternCasedCharacterFixed:
+        case ByteTerm::TypePatternCasedCharacterGreedy:
+        case ByteTerm::TypePatternCasedCharacterNonGreedy:
+            if (backtrackPatternCasedCharacter(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeCharacterClass:
+            if (backtrackCharacterClass(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeBackReference:
+            if (backtrackBackReference(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeParenthesesSubpattern: {
+            JSRegExpResult result = backtrackParentheses(currentTerm(), context);
+
+            if (result == JSRegExpMatch) {
+                MATCH_NEXT();
+            } else if (result != JSRegExpNoMatch)
+                return result;
+
+            BACKTRACK();
+        }
+        case ByteTerm::TypeParenthesesSubpatternOnceBegin:
+            if (backtrackParenthesesOnceBegin(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeParenthesesSubpatternOnceEnd:
+            if (backtrackParenthesesOnceEnd(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
+            if (backtrackParenthesesTerminalBegin(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
+            if (backtrackParenthesesTerminalEnd(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeParentheticalAssertionBegin:
+            if (backtrackParentheticalAssertionBegin(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeParentheticalAssertionEnd:
+            if (backtrackParentheticalAssertionEnd(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+
+        case ByteTerm::TypeCheckInput:
+            input.uncheckInput(currentTerm().checkInputCount);
+            BACKTRACK();
+
+        case ByteTerm::TypeUncheckInput:
+            input.checkInput(currentTerm().checkInputCount);
+            BACKTRACK();
+
+        case ByteTerm::TypeDotStarEnclosure:
+            ASSERT_NOT_REACHED();
+        }
+
+        ASSERT_NOT_REACHED();
+        return JSRegExpErrorNoMatch;
+    }
+
+    JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
+    {
+        JSRegExpResult result = matchDisjunction(disjunction, context, btrack);
+
+        if (result == JSRegExpMatch) {
+            while (context->matchBegin == context->matchEnd) {
+                result = matchDisjunction(disjunction, context, true);
+                if (result != JSRegExpMatch)
+                    return result;
+            }
+            return JSRegExpMatch;
+        }
+
+        return result;
+    }
+
+    unsigned interpret()
+    {
+        if (!input.isAvailableInput(0))
+            return offsetNoMatch;
+
+        for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i)
+            output[i << 1] = offsetNoMatch;
+
+        allocatorPool = pattern->m_allocator->startAllocator();
+        if (!allocatorPool)
+            CRASH();
+
+        DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
+
+        JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false);
+        if (result == JSRegExpMatch) {
+            output[0] = context->matchBegin;
+            output[1] = context->matchEnd;
+        }
+
+        freeDisjunctionContext(context);
+
+        pattern->m_allocator->stopAllocator();
+
+        ASSERT((result == JSRegExpMatch) == (output[0] != offsetNoMatch));
+        return output[0];
+    }
+
+    // Prepares an interpreter run of 'pattern' over the 'length' characters at 'input',
+    // starting at offset 'start'. Match offsets are later written into 'output' by interpret().
+    // No pool is held yet (allocatorPool is 0 until interpret() starts the allocator) and the
+    // match budget starts at matchLimit.
+    Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start)
+        : pattern(pattern)
+        , output(output)
+        , input(input, start, length)
+        , allocatorPool(0)
+        , remainingMatchCount(matchLimit)
+    {
+    }
+
+private:
+    // Compiled pattern being executed (supplies the body disjunction and the allocator).
+    BytecodePattern* pattern;
+    // Caller-provided array that receives match offsets; interpret() writes the overall match
+    // into output[0] / output[1].
+    unsigned* output;
+    // Cursor over the subject string.
+    InputStream input;
+    // Pool obtained from the pattern's allocator for the duration of interpret().
+    BumpPointerPool* allocatorPool;
+    // Budget decremented on every matchDisjunction() call; reaching zero aborts the match
+    // with JSRegExpErrorHitLimit.
+    unsigned remainingMatchCount;
+};
+
+
+
+class ByteCompiler {
+    // Bookkeeping record pushed when a parenthesised group is opened and popped again by
+    // popParenthesesStack() when the group is closed.
+    struct ParenthesesStackEntry {
+        // Index of the group's opening term within m_bodyDisjunction->terms.
+        unsigned beginTerm;
+        // Value of m_currentAlternativeIndex to restore once the group has been closed.
+        unsigned savedAlternativeIndex;
+
+        ParenthesesStackEntry(unsigned openingTerm, unsigned enclosingAlternativeIndex)
+            : beginTerm(openingTerm)
+            , savedAlternativeIndex(enclosingAlternativeIndex)
+        {
+        }
+    };
+
+public:
+    // Binds the compiler to the parsed pattern it will translate. No bytecode is produced
+    // here; that happens in compile().
+    ByteCompiler(YarrPattern& pattern)
+        : m_pattern(pattern)
+    {
+        m_currentAlternativeIndex = 0;
+    }
+
+    // Translates the bound YarrPattern into bytecode: opens the body disjunction, emits every
+    // alternative of the pattern body, closes the disjunction and wraps the result (together
+    // with the nested parentheses disjunctions collected on the way) in a BytecodePattern that
+    // uses 'allocator'. Ownership of the new BytecodePattern passes to the caller.
+    PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator)
+    {
+        PatternDisjunction* body = m_pattern.m_body;
+        const bool firstAlternativeOnceThrough = body->m_alternatives[0]->onceThrough();
+
+        regexBegin(m_pattern.m_numSubpatterns, body->m_callFrameSize, firstAlternativeOnceThrough);
+        emitDisjunction(body);
+        regexEnd();
+
+        BytecodePattern* compiled = new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator);
+        return adoptPtr(compiled);
+    }
+
+    // Emits a CheckInput term for 'count' characters at the end of the body disjunction.
+    void checkInput(unsigned count)
+    {
+        ByteTerm checkTerm = ByteTerm::CheckInput(count);
+        m_bodyDisjunction->terms.append(checkTerm);
+    }
+
+    // Emits an UncheckInput term for 'count' characters at the end of the body disjunction.
+    void uncheckInput(unsigned count)
+    {
+        ByteTerm uncheckTerm = ByteTerm::UncheckInput(count);
+        m_bodyDisjunction->terms.append(uncheckTerm);
+    }
+    
+    // Emits a beginning-of-line assertion term for the given input position.
+    void assertionBOL(unsigned inputPosition)
+    {
+        ByteTerm bolTerm = ByteTerm::BOL(inputPosition);
+        m_bodyDisjunction->terms.append(bolTerm);
+    }
+
+    // Emits an end-of-line assertion term for the given input position.
+    void assertionEOL(unsigned inputPosition)
+    {
+        ByteTerm eolTerm = ByteTerm::EOL(inputPosition);
+        m_bodyDisjunction->terms.append(eolTerm);
+    }
+
+    // Emits a word-boundary assertion term; 'invert' is forwarded to the term unchanged.
+    void assertionWordBoundary(bool invert, unsigned inputPosition)
+    {
+        ByteTerm boundaryTerm = ByteTerm::WordBoundary(invert, inputPosition);
+        m_bodyDisjunction->terms.append(boundaryTerm);
+    }
+
+    void atomPatternCharacter(UChar ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+    {
+        if (m_pattern.m_ignoreCase) {
+            UChar lo = Unicode::toLower(ch);
+            UChar hi = Unicode::toUpper(ch);
+
+            if (lo != hi) {
+                m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
+                return;
+            }
+        }
+
+        m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
+    }
+
+    void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+    {
+        m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
+
+        m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet();
+        m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
+        m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+    }
+
+    void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+    {
+        ASSERT(subpatternId);
+
+        m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
+
+        m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet();
+        m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
+        m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+    }
+
+    void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
+    {
+        int beginTerm = m_bodyDisjunction->terms.size();
+
+        m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
+        m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+        m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
+        m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
+
+        m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
+        m_currentAlternativeIndex = beginTerm + 1;
+    }
+
+    void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
+    {
+        int beginTerm = m_bodyDisjunction->terms.size();
+
+        m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition));
+        m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+        m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
+        m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
+
+        m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
+        m_currentAlternativeIndex = beginTerm + 1;
+    }
+
+    void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
+    {
+        // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin,
+        // then fix this up at the end! - simplifying this should make it much clearer.
+        // https://bugs.webkit.org/show_bug.cgi?id=50136
+
+        int beginTerm = m_bodyDisjunction->terms.size();
+
+        m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
+        m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+        m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
+        m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
+
+        m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
+        m_currentAlternativeIndex = beginTerm + 1;
+    }
+
+    void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
+    {
+        int beginTerm = m_bodyDisjunction->terms.size();
+
+        m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0));
+        m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+        m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
+        m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
+
+        m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
+        m_currentAlternativeIndex = beginTerm + 1;
+    }
+
+    void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+    {
+        unsigned beginTerm = popParenthesesStack();
+        closeAlternative(beginTerm + 1);
+        unsigned endTerm = m_bodyDisjunction->terms.size();
+
+        ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin);
+
+        bool invert = m_bodyDisjunction->terms[beginTerm].invert();
+        unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
+
+        m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition));
+        m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
+        m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
+        m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
+
+        m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
+        m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
+        m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
+        m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
+    }
+
+    // Emits a DotStarEnclosure term carrying the two anchor flags.
+    void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored)
+    {
+        ByteTerm enclosureTerm = ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored);
+        m_bodyDisjunction->terms.append(enclosureTerm);
+    }
+
+    unsigned popParenthesesStack()
+    {
+        ASSERT(m_parenthesesStack.size());
+        int stackEnd = m_parenthesesStack.size() - 1;
+        unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
+        m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
+        m_parenthesesStack.shrink(stackEnd);
+
+        ASSERT(beginTerm < m_bodyDisjunction->terms.size());
+        ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
+
+        return beginTerm;
+    }
+
+#ifndef NDEBUG
+    // Debug helper: logs the address of 'disjunction' followed by the numeric type of each of
+    // its terms on one line.
+    void dumpDisjunction(ByteDisjunction* disjunction)
+    {
+        dataLogF("ByteDisjunction(%p):\n\t", disjunction);
+
+        unsigned index = 0;
+        while (index < disjunction->terms.size()) {
+            dataLogF("{ %d } ", disjunction->terms[index].type);
+            ++index;
+        }
+
+        dataLogF("\n");
+    }
+#endif
+
+    void closeAlternative(int beginTerm)
+    {
+        int origBeginTerm = beginTerm;
+        ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
+        int endIndex = m_bodyDisjunction->terms.size();
+
+        unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
+
+        if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
+            m_bodyDisjunction->terms.remove(beginTerm);
+        else {
+            while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
+                beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
+                ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
+                m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
+                m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
+            }
+
+            m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
+
+            m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
+            m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
+        }
+    }
+
+    void closeBodyAlternative()
+    {
+        int beginTerm = 0;
+        int origBeginTerm = 0;
+        ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
+        int endIndex = m_bodyDisjunction->terms.size();
+
+        unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
+
+        while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
+            beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
+            ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
+            m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
+            m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
+        }
+
+        m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
+
+        m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
+        m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
+    }
+
+    void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
+    {
+        unsigned beginTerm = popParenthesesStack();
+        closeAlternative(beginTerm + 1);
+        unsigned endTerm = m_bodyDisjunction->terms.size();
+
+        ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+
+        ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
+
+        bool capture = parenthesesBegin.capture();
+        unsigned subpatternId = parenthesesBegin.atom.subpatternId;
+
+        unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
+        ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
+
+        parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
+        for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
+            parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
+        parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
+
+        m_bodyDisjunction->terms.shrink(beginTerm);
+
+        m_allParenthesesInfo.append(parenthesesDisjunction);
+        m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, inputPosition));
+
+        m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
+        m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
+        m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
+    }
+
+    void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+    {
+        unsigned beginTerm = popParenthesesStack();
+        closeAlternative(beginTerm + 1);
+        unsigned endTerm = m_bodyDisjunction->terms.size();
+
+        ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+
+        bool capture = m_bodyDisjunction->terms[beginTerm].capture();
+        unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
+
+        m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition));
+        m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
+        m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
+        m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
+
+        m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
+        m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
+        m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
+        m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
+    }
+
+    void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+    {
+        unsigned beginTerm = popParenthesesStack();
+        closeAlternative(beginTerm + 1);
+        unsigned endTerm = m_bodyDisjunction->terms.size();
+
+        ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
+
+        bool capture = m_bodyDisjunction->terms[beginTerm].capture();
+        unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
+
+        m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition));
+        m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
+        m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
+        m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
+
+        m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
+        m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
+        m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
+        m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
+    }
+
+    // Starts a new body disjunction for the pattern: allocates it, emits the initial
+    // BodyAlternativeBegin term (frame location 0) and makes that term the current alternative.
+    void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
+    {
+        ByteDisjunction* body = new ByteDisjunction(numSubpatterns, callFrameSize);
+        m_bodyDisjunction = adoptPtr(body);
+
+        ByteTerm firstAlternative = ByteTerm::BodyAlternativeBegin(onceThrough);
+        m_bodyDisjunction->terms.append(firstAlternative);
+        m_bodyDisjunction->terms[0].frameLocation = 0;
+
+        m_currentAlternativeIndex = 0;
+    }
+
+    // Finishes the body disjunction started by regexBegin() by closing its chain of body
+    // alternatives.
+    void regexEnd()
+    {
+        closeBodyAlternative();
+    }
+
+    // Starts the next top-level alternative: links the current alternative term to the
+    // position of the new one via its 'next' offset, emits a BodyAlternativeDisjunction term
+    // there, and makes it the current alternative.
+    void alternativeBodyDisjunction(bool onceThrough)
+    {
+        int separatorIndex = m_bodyDisjunction->terms.size();
+
+        // Written before the append below; the reference is not used afterwards.
+        ByteTerm& previousAlternative = m_bodyDisjunction->terms[m_currentAlternativeIndex];
+        previousAlternative.alternative.next = separatorIndex - m_currentAlternativeIndex;
+
+        ByteTerm separator = ByteTerm::BodyAlternativeDisjunction(onceThrough);
+        m_bodyDisjunction->terms.append(separator);
+
+        m_currentAlternativeIndex = separatorIndex;
+    }
+
+    // Starts the next alternative inside a parenthesised group: links the current alternative
+    // term to the position of the new one via its 'next' offset, emits an
+    // AlternativeDisjunction term there, and makes it the current alternative.
+    void alternativeDisjunction()
+    {
+        int separatorIndex = m_bodyDisjunction->terms.size();
+
+        // Written before the append below; the reference is not used afterwards.
+        ByteTerm& previousAlternative = m_bodyDisjunction->terms[m_currentAlternativeIndex];
+        previousAlternative.alternative.next = separatorIndex - m_currentAlternativeIndex;
+
+        ByteTerm separator = ByteTerm::AlternativeDisjunction();
+        m_bodyDisjunction->terms.append(separator);
+
+        m_currentAlternativeIndex = separatorIndex;
+    }
+
+    // Emits byte terms for every alternative of 'disjunction', in order.
+    //
+    // inputCountAlreadyChecked is the starting value of the per-alternative
+    // running count (currentCountAlreadyChecked); term positions are handed to
+    // the atom*/assertion* helpers as that count minus term.inputPosition.
+    // parenthesesInputCountAlreadyChecked is subtracted from each
+    // alternative's m_minimumSize to decide how much input still needs a
+    // checkInput() term.  Nested disjunctions (parentheses and parenthetical
+    // assertions) are emitted by recursion.
+    void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0)
+    {
+        for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
+            // Each alternative restarts from the count supplied by the caller.
+            unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
+
+            PatternAlternative* alternative = disjunction->m_alternatives[alt];
+
+            // Every alternative after the first is preceded by a separator
+            // term; the top-level pattern body uses the 'body' flavour.
+            if (alt) {
+                if (disjunction == m_pattern.m_body)
+                    alternativeBodyDisjunction(alternative->onceThrough());
+                else
+                    alternativeDisjunction();
+            }
+
+            // Only the part of the minimum size that the caller has not
+            // already covered is checked here.
+            unsigned minimumSize = alternative->m_minimumSize;
+            ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
+            unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
+
+            if (countToCheck) {
+                checkInput(countToCheck);
+                currentCountAlreadyChecked += countToCheck;
+            }
+
+            for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
+                PatternTerm& term = alternative->m_terms[i];
+
+                switch (term.type) {
+                case PatternTerm::TypeAssertionBOL:
+                    assertionBOL(currentCountAlreadyChecked - term.inputPosition);
+                    break;
+
+                case PatternTerm::TypeAssertionEOL:
+                    assertionEOL(currentCountAlreadyChecked - term.inputPosition);
+                    break;
+
+                case PatternTerm::TypeAssertionWordBoundary:
+                    assertionWordBoundary(term.invert(), currentCountAlreadyChecked - term.inputPosition);
+                    break;
+
+                case PatternTerm::TypePatternCharacter:
+                    atomPatternCharacter(term.patternCharacter, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
+                    break;
+
+                case PatternTerm::TypeCharacterClass:
+                    atomCharacterClass(term.characterClass, term.invert(), currentCountAlreadyChecked- term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
+                    break;
+
+                case PatternTerm::TypeBackReference:
+                    atomBackReference(term.backReferenceSubpatternId, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
+                        break;
+
+                case PatternTerm::TypeForwardReference:
+                    // Nothing is emitted for a forward reference.
+                    break;
+
+                case PatternTerm::TypeParenthesesSubpattern: {
+                    // Three emission strategies: single non-copied group
+                    // ('once'), terminal group, and the general case.
+                    // NOTE(review): delegateEndInputOffset below is an unsigned
+                    // difference that appears to rely on modular wrap-around
+                    // when it is later subtracted again - confirm before
+                    // changing any of the operand types.
+                    unsigned disjunctionAlreadyCheckedCount = 0;
+                    if (term.quantityCount == 1 && !term.parentheses.isCopy) {
+                        unsigned alternativeFrameLocation = term.frameLocation;
+                        // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame.
+                        if (term.quantityType == QuantifierFixedCount)
+                            disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
+                        else
+                            alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
+                        unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
+                        atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, alternativeFrameLocation);
+                        emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
+                        atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
+                    } else if (term.parentheses.isTerminal) {
+                        // Terminal group: the alternative frame location always
+                        // skips a ParenthesesOnce backtrack slot, and nothing
+                        // is pre-checked (disjunctionAlreadyCheckedCount is 0).
+                        unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
+                        atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce);
+                        emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
+                        atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
+                    } else {
+                        // General case: the nested disjunction is emitted with
+                        // no pre-checked parentheses input, and its
+                        // m_callFrameSize is handed to the End helper.
+                        unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
+                        atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, 0);
+                        emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
+                        atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
+                    }
+                    break;
+                }
+
+                case PatternTerm::TypeParentheticalAssertion: {
+                    unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion;
+
+                    ASSERT(currentCountAlreadyChecked >= static_cast<unsigned>(term.inputPosition));
+                    unsigned positiveInputOffset = currentCountAlreadyChecked - static_cast<unsigned>(term.inputPosition);
+                    // If more input has been checked than the assertion body's
+                    // minimum size, give the surplus back before emitting the
+                    // body; it is reclaimed after the assertion ends.
+                    unsigned uncheckAmount = 0;
+                    if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) {
+                        uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize;
+                        uncheckInput(uncheckAmount);
+                        currentCountAlreadyChecked -= uncheckAmount;
+                    }
+
+                    atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation);
+                    emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount);
+                    atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType);
+                    if (uncheckAmount) {
+                        checkInput(uncheckAmount);
+                        currentCountAlreadyChecked += uncheckAmount;
+                    }
+                    break;
+                }
+
+                case PatternTerm::TypeDotStarEnclosure:
+                    assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor);
+                    break;
+                }
+            }
+        }
+    }
+
+private:
+    YarrPattern& m_pattern;
+    OwnPtr<ByteDisjunction> m_bodyDisjunction;
+    unsigned m_currentAlternativeIndex;
+    Vector<ParenthesesStackEntry> m_parenthesesStack;
+    Vector<ByteDisjunction*> m_allParenthesesInfo;
+};
+
+// Compiles 'pattern' to bytecode by running a ByteCompiler over it; the
+// allocator is forwarded to ByteCompiler::compile().
+PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator)
+{
+    ByteCompiler compiler(pattern);
+    return compiler.compile(allocator);
+}
+
+// Runs the bytecode interpreter over a String.  Strings stored as 8-bit use
+// the LChar interpreter; all others use the UChar interpreter.
+unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned start, unsigned* output)
+{
+    if (!input.is8Bit()) {
+        Interpreter<UChar> wideInterpreter(bytecode, output, input.characters16(), input.length(), start);
+        return wideInterpreter.interpret();
+    }
+
+    Interpreter<LChar> narrowInterpreter(bytecode, output, input.characters8(), input.length(), start);
+    return narrowInterpreter.interpret();
+}
+
+// Runs the bytecode interpreter over a raw 8-bit (LChar) buffer of 'length'
+// characters.
+unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output)
+{
+    Interpreter<LChar> narrowInterpreter(bytecode, output, input, length, start);
+    return narrowInterpreter.interpret();
+}
+
+// Runs the bytecode interpreter over a raw 16-bit (UChar) buffer of 'length'
+// characters.
+unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output)
+{
+    Interpreter<UChar> wideInterpreter(bytecode, output, input, length, start);
+    return wideInterpreter.interpret();
+}
+
+// These should be the same for both UChar & LChar.
+// Each assertion verifies at compile time that a BackTrackInfo struct
+// occupies exactly the number of uintptr_t-sized slots named by the matching
+// YarrStackSpaceFor* constant.  Only the UChar instantiation is checked.
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
+
+
+} }
diff --git a/masm/yarr/YarrInterpreter.h b/masm/yarr/YarrInterpreter.h

new file mode 100644 (file)

index 0000000..fb60bd9
--- /dev/null
+++ b/masm/yarr/YarrInterpreter.h
@@ -0,0 +1,385 @@
+/*
+ * Copyright (C) 2009, 2010 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef YarrInterpreter_h
+#define YarrInterpreter_h
+
+#include "YarrPattern.h"
+#include <wtf/PassOwnPtr.h>
+#include <wtf/unicode/Unicode.h>
+
+namespace WTF {
+class BumpPointerAllocator;
+}
+using WTF::BumpPointerAllocator;
+
+namespace JSC { namespace Yarr {
+
+class ByteDisjunction;
+
+// One instruction of the Yarr bytecode.  'type' selects which member of the
+// anonymous union is meaningful:
+//   - atom:            character / cased character / character class /
+//                      subpattern id, plus quantifier information;
+//   - alternative:     links between alternative marker terms ('next' is
+//                      written by the byte compiler as a relative distance
+//                      between consecutive alternative terms);
+//   - anchors:         flags for TypeDotStarEnclosure;
+//   - checkInputCount: operand of TypeCheckInput / TypeUncheckInput.
+//
+// Every constructor leaves the term with a defined frameLocation; those
+// without a frameLocation parameter set it to 0, and callers that need a
+// real value assign it afterwards.
+struct ByteTerm {
+    enum Type {
+        TypeBodyAlternativeBegin,
+        TypeBodyAlternativeDisjunction,
+        TypeBodyAlternativeEnd,
+        TypeAlternativeBegin,
+        TypeAlternativeDisjunction,
+        TypeAlternativeEnd,
+        TypeSubpatternBegin,
+        TypeSubpatternEnd,
+        TypeAssertionBOL,
+        TypeAssertionEOL,
+        TypeAssertionWordBoundary,
+        TypePatternCharacterOnce,
+        TypePatternCharacterFixed,
+        TypePatternCharacterGreedy,
+        TypePatternCharacterNonGreedy,
+        TypePatternCasedCharacterOnce,
+        TypePatternCasedCharacterFixed,
+        TypePatternCasedCharacterGreedy,
+        TypePatternCasedCharacterNonGreedy,
+        TypeCharacterClass,
+        TypeBackReference,
+        TypeParenthesesSubpattern,
+        TypeParenthesesSubpatternOnceBegin,
+        TypeParenthesesSubpatternOnceEnd,
+        TypeParenthesesSubpatternTerminalBegin,
+        TypeParenthesesSubpatternTerminalEnd,
+        TypeParentheticalAssertionBegin,
+        TypeParentheticalAssertionEnd,
+        TypeCheckInput,
+        TypeUncheckInput,
+        TypeDotStarEnclosure,
+    } type;
+    union {
+        struct {
+            union {
+                UChar patternCharacter;
+                struct {
+                    UChar lo;
+                    UChar hi;
+                } casedCharacter;
+                CharacterClass* characterClass;
+                unsigned subpatternId;
+            };
+            union {
+                ByteDisjunction* parenthesesDisjunction;
+                unsigned parenthesesWidth;
+            };
+            QuantifierType quantityType;
+            unsigned quantityCount;
+        } atom;
+        struct {
+            int next;
+            int end;
+            bool onceThrough;
+        } alternative;
+        struct {
+            bool m_bol : 1;
+            bool m_eol : 1;
+        } anchors;
+        unsigned checkInputCount;
+    };
+    unsigned frameLocation;
+    bool m_capture : 1;
+    bool m_invert : 1;
+    unsigned inputPosition;
+
+    // Single pattern character.  The term type is derived from the
+    // quantifier: a fixed count of exactly 1 becomes the 'Once' variant.
+    ByteTerm(UChar ch, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+        : frameLocation(frameLocation)
+        , m_capture(false)
+        , m_invert(false)
+    {
+        switch (quantityType) {
+        case QuantifierFixedCount:
+            type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed;
+            break;
+        case QuantifierGreedy:
+            type = ByteTerm::TypePatternCharacterGreedy;
+            break;
+        case QuantifierNonGreedy:
+            type = ByteTerm::TypePatternCharacterNonGreedy;
+            break;
+        }
+
+        atom.patternCharacter = ch;
+        atom.quantityType = quantityType;
+        atom.quantityCount = quantityCount.unsafeGet();
+        inputPosition = inputPos;
+    }
+
+    // Pattern character stored as a lo/hi pair (the 'Cased' term types).
+    // The term type is derived from the quantifier as above.
+    ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+        : frameLocation(frameLocation)
+        , m_capture(false)
+        , m_invert(false)
+    {
+        switch (quantityType) {
+        case QuantifierFixedCount:
+            type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed;
+            break;
+        case QuantifierGreedy:
+            type = ByteTerm::TypePatternCasedCharacterGreedy;
+            break;
+        case QuantifierNonGreedy:
+            type = ByteTerm::TypePatternCasedCharacterNonGreedy;
+            break;
+        }
+
+        atom.casedCharacter.lo = lo;
+        atom.casedCharacter.hi = hi;
+        atom.quantityType = quantityType;
+        atom.quantityCount = quantityCount.unsafeGet();
+        inputPosition = inputPos;
+    }
+
+    // Character class term; starts as a fixed count of 1 with
+    // frameLocation 0.
+    ByteTerm(CharacterClass* characterClass, bool invert, int inputPos)
+        : type(ByteTerm::TypeCharacterClass)
+        , frameLocation(0)
+        , m_capture(false)
+        , m_invert(invert)
+    {
+        atom.characterClass = characterClass;
+        atom.quantityType = QuantifierFixedCount;
+        atom.quantityCount = 1;
+        inputPosition = inputPos;
+    }
+
+    // Parentheses term that refers to a nested ByteDisjunction; starts as a
+    // fixed count of 1 with frameLocation 0.
+    ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, int inputPos)
+        : type(type)
+        , frameLocation(0)
+        , m_capture(capture)
+        , m_invert(false)
+    {
+        atom.subpatternId = subpatternId;
+        atom.parenthesesDisjunction = parenthesesInfo;
+        atom.quantityType = QuantifierFixedCount;
+        atom.quantityCount = 1;
+        inputPosition = inputPos;
+    }
+
+    // Bare term of the given type.  Used by the static factories below,
+    // which then fill in whichever union member the type needs.
+    // frameLocation and inputPosition start at 0.
+    ByteTerm(Type type, bool invert = false)
+        : type(type)
+        , frameLocation(0)
+        , m_capture(false)
+        , m_invert(invert)
+        , inputPosition(0)
+    {
+        atom.quantityType = QuantifierFixedCount;
+        atom.quantityCount = 1;
+    }
+
+    // Term identified by a subpattern id (used for back references);
+    // starts as a fixed count of 1 with frameLocation 0.
+    ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, int inputPos)
+        : type(type)
+        , frameLocation(0)
+        , m_capture(capture)
+        , m_invert(invert)
+    {
+        atom.subpatternId = subpatternId;
+        atom.quantityType = QuantifierFixedCount;
+        atom.quantityCount = 1;
+        inputPosition = inputPos;
+    }
+
+    static ByteTerm BOL(int inputPos)
+    {
+        ByteTerm term(TypeAssertionBOL);
+        term.inputPosition = inputPos;
+        return term;
+    }
+
+    static ByteTerm CheckInput(Checked<unsigned> count)
+    {
+        ByteTerm term(TypeCheckInput);
+        term.checkInputCount = count.unsafeGet();
+        return term;
+    }
+
+    static ByteTerm UncheckInput(Checked<unsigned> count)
+    {
+        ByteTerm term(TypeUncheckInput);
+        term.checkInputCount = count.unsafeGet();
+        return term;
+    }
+
+    static ByteTerm EOL(int inputPos)
+    {
+        ByteTerm term(TypeAssertionEOL);
+        term.inputPosition = inputPos;
+        return term;
+    }
+
+    static ByteTerm WordBoundary(bool invert, int inputPos)
+    {
+        ByteTerm term(TypeAssertionWordBoundary, invert);
+        term.inputPosition = inputPos;
+        return term;
+    }
+
+    static ByteTerm BackReference(unsigned subpatternId, int inputPos)
+    {
+        return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos);
+    }
+
+    // The alternative-marker factories below all start with next == 0 and
+    // end == 0; only the body Begin/Disjunction markers carry a
+    // caller-supplied onceThrough flag.
+    static ByteTerm BodyAlternativeBegin(bool onceThrough)
+    {
+        ByteTerm term(TypeBodyAlternativeBegin);
+        term.alternative.next = 0;
+        term.alternative.end = 0;
+        term.alternative.onceThrough = onceThrough;
+        return term;
+    }
+
+    static ByteTerm BodyAlternativeDisjunction(bool onceThrough)
+    {
+        ByteTerm term(TypeBodyAlternativeDisjunction);
+        term.alternative.next = 0;
+        term.alternative.end = 0;
+        term.alternative.onceThrough = onceThrough;
+        return term;
+    }
+
+    static ByteTerm BodyAlternativeEnd()
+    {
+        ByteTerm term(TypeBodyAlternativeEnd);
+        term.alternative.next = 0;
+        term.alternative.end = 0;
+        term.alternative.onceThrough = false;
+        return term;
+    }
+
+    static ByteTerm AlternativeBegin()
+    {
+        ByteTerm term(TypeAlternativeBegin);
+        term.alternative.next = 0;
+        term.alternative.end = 0;
+        term.alternative.onceThrough = false;
+        return term;
+    }
+
+    static ByteTerm AlternativeDisjunction()
+    {
+        ByteTerm term(TypeAlternativeDisjunction);
+        term.alternative.next = 0;
+        term.alternative.end = 0;
+        term.alternative.onceThrough = false;
+        return term;
+    }
+
+    static ByteTerm AlternativeEnd()
+    {
+        ByteTerm term(TypeAlternativeEnd);
+        term.alternative.next = 0;
+        term.alternative.end = 0;
+        term.alternative.onceThrough = false;
+        return term;
+    }
+
+    static ByteTerm SubpatternBegin()
+    {
+        return ByteTerm(TypeSubpatternBegin);
+    }
+
+    static ByteTerm SubpatternEnd()
+    {
+        return ByteTerm(TypeSubpatternEnd);
+    }
+
+    static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor)
+    {
+        ByteTerm term(TypeDotStarEnclosure);
+        term.anchors.m_bol = bolAnchor;
+        term.anchors.m_eol = eolAnchor;
+        return term;
+    }
+
+    // Read-only accessors for the flag bitfields.
+    bool invert() const
+    {
+        return m_invert;
+    }
+
+    bool capture() const
+    {
+        return m_capture;
+    }
+};
+
+// A sequence of ByteTerms together with the subpattern count and frame size
+// supplied at construction.  'terms' starts empty and is filled by the
+// byte compiler.
+class ByteDisjunction {
+    WTF_MAKE_FAST_ALLOCATED;
+public:
+    ByteDisjunction(unsigned numSubpatterns, unsigned frameSize)
+        : m_numSubpatterns(numSubpatterns)
+        , m_frameSize(frameSize)
+    {
+    }
+
+    Vector<ByteTerm> terms; // The emitted terms, in execution order.
+    unsigned m_numSubpatterns;
+    unsigned m_frameSize;
+};
+
+// A compiled pattern: the body disjunction plus everything it points at.
+// The object owns the body (via OwnPtr) and deletes, on destruction, every
+// ByteDisjunction in m_allParenthesesInfo and every CharacterClass in
+// m_userCharacterClasses.
+// NOTE(review): nothing here prevents copying, and a copy would delete the
+// same raw pointers twice - confirm no caller copies a BytecodePattern.
+struct BytecodePattern {
+    WTF_MAKE_FAST_ALLOCATED;
+public:
+    // Takes ownership of 'body', of every pointer listed in
+    // 'allParenthesesInfo', and of the user character classes currently held
+    // by 'pattern' (whose list is cleared).  The list of parentheses
+    // disjunctions is taken by const reference so that it is copied only
+    // once, into m_allParenthesesInfo.
+    BytecodePattern(PassOwnPtr<ByteDisjunction> body, const Vector<ByteDisjunction*>& allParenthesesInfo, YarrPattern& pattern, BumpPointerAllocator* allocator)
+        : m_body(body)
+        , m_ignoreCase(pattern.m_ignoreCase)
+        , m_multiline(pattern.m_multiline)
+        , m_allocator(allocator)
+    {
+        newlineCharacterClass = pattern.newlineCharacterClass();
+        wordcharCharacterClass = pattern.wordcharCharacterClass();
+
+        m_allParenthesesInfo.append(allParenthesesInfo);
+        m_userCharacterClasses.append(pattern.m_userCharacterClasses);
+        // 'Steal' the YarrPattern's CharacterClasses!  We clear its
+        // array, so that it won't delete them on destruction.  We'll
+        // take responsibility for that.
+        pattern.m_userCharacterClasses.clear();
+    }
+
+    ~BytecodePattern()
+    {
+        deleteAllValues(m_allParenthesesInfo);
+        deleteAllValues(m_userCharacterClasses);
+    }
+
+    OwnPtr<ByteDisjunction> m_body;
+    bool m_ignoreCase;
+    bool m_multiline;
+    // Each BytecodePattern is associated with a RegExp, each RegExp is associated
+    // with a JSGlobalData.  Cache a pointer to our JSGlobalData's m_regExpAllocator.
+    BumpPointerAllocator* m_allocator;
+
+    // Obtained from the YarrPattern at construction; not deleted by this
+    // object's destructor.
+    CharacterClass* newlineCharacterClass;
+    CharacterClass* wordcharCharacterClass;
+
+private:
+    // Owned raw pointers, deleted in the destructor.
+    Vector<ByteDisjunction*> m_allParenthesesInfo;
+    Vector<CharacterClass*> m_userCharacterClasses;
+};
+
+// Compiles a YarrPattern into a BytecodePattern (implemented in
+// YarrInterpreter.cpp on top of ByteCompiler).
+JS_EXPORT_PRIVATE PassOwnPtr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*);
+// Run a compiled pattern against input, beginning at 'start'.  The String
+// overload selects the LChar or UChar interpreter from input.is8Bit(); the
+// other two take a raw character buffer of 'length' characters.
+JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const String& input, unsigned start, unsigned* output);
+unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output);
+unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output);
+
+} } // namespace JSC::Yarr
+
+#endif // YarrInterpreter_h
diff --git a/masm/yarr/YarrJIT.cpp b/masm/yarr/YarrJIT.cpp

new file mode 100644 (file)

index 0000000..ce84e2c
--- /dev/null
+++ b/masm/yarr/YarrJIT.cpp
@@ -0,0 +1,2667 @@
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "config.h"
+#include "YarrJIT.h"
+
+#include <wtf/ASCIICType.h>
+#include "LinkBuffer.h"
+#include "Options.h"
+#include "Yarr.h"
+#include "YarrCanonicalizeUCS2.h"
+
+#if ENABLE(YARR_JIT)
+
+using namespace WTF;
+
+namespace JSC { namespace Yarr {
+
+template<YarrJITCompileMode compileMode>
+class YarrGenerator : private MacroAssembler {
+    friend void jitCompile(JSGlobalData*, YarrCodeBlock& jitObject, const String& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline);
+
+    // Fixed register assignments for the generated code, one set per
+    // supported CPU (ARM, MIPS, SH4, X86, X86_64); no other CPU gets a
+    // definition here.  Within this class 'input' is the base and 'index' the
+    // scaled index used by readCharacter(), and 'index' is compared against
+    // 'length' by the input-availability helpers.  The remaining roles
+    // presumably follow their names (output buffer, two scratch registers,
+    // return value registers) - confirm against the code that uses them.
+#if CPU(ARM)
+    static const RegisterID input = ARMRegisters::r0;
+    static const RegisterID index = ARMRegisters::r1;
+    static const RegisterID length = ARMRegisters::r2;
+    static const RegisterID output = ARMRegisters::r4;
+
+    static const RegisterID regT0 = ARMRegisters::r5;
+    static const RegisterID regT1 = ARMRegisters::r6;
+
+    static const RegisterID returnRegister = ARMRegisters::r0;
+    static const RegisterID returnRegister2 = ARMRegisters::r1;
+#elif CPU(MIPS)
+    static const RegisterID input = MIPSRegisters::a0;
+    static const RegisterID index = MIPSRegisters::a1;
+    static const RegisterID length = MIPSRegisters::a2;
+    static const RegisterID output = MIPSRegisters::a3;
+
+    static const RegisterID regT0 = MIPSRegisters::t4;
+    static const RegisterID regT1 = MIPSRegisters::t5;
+
+    static const RegisterID returnRegister = MIPSRegisters::v0;
+    static const RegisterID returnRegister2 = MIPSRegisters::v1;
+#elif CPU(SH4)
+    static const RegisterID input = SH4Registers::r4;
+    static const RegisterID index = SH4Registers::r5;
+    static const RegisterID length = SH4Registers::r6;
+    static const RegisterID output = SH4Registers::r7;
+
+    static const RegisterID regT0 = SH4Registers::r0;
+    static const RegisterID regT1 = SH4Registers::r1;
+
+    static const RegisterID returnRegister = SH4Registers::r0;
+    static const RegisterID returnRegister2 = SH4Registers::r1;
+#elif CPU(X86)
+    static const RegisterID input = X86Registers::eax;
+    static const RegisterID index = X86Registers::edx;
+    static const RegisterID length = X86Registers::ecx;
+    static const RegisterID output = X86Registers::edi;
+
+    static const RegisterID regT0 = X86Registers::ebx;
+    static const RegisterID regT1 = X86Registers::esi;
+
+    static const RegisterID returnRegister = X86Registers::eax;
+    static const RegisterID returnRegister2 = X86Registers::edx;
+#elif CPU(X86_64)
+    static const RegisterID input = X86Registers::edi;
+    static const RegisterID index = X86Registers::esi;
+    static const RegisterID length = X86Registers::edx;
+    static const RegisterID output = X86Registers::ecx;
+
+    static const RegisterID regT0 = X86Registers::eax;
+    static const RegisterID regT1 = X86Registers::ebx;
+
+    static const RegisterID returnRegister = X86Registers::eax;
+    static const RegisterID returnRegister2 = X86Registers::edx;
+#endif
+
+    // Reorders adjacent terms in place: whenever a fixed-count character
+    // class is immediately followed by a fixed-count pattern character, the
+    // two are swapped so that the pattern character comes first.
+    void optimizeAlternative(PatternAlternative* alternative)
+    {
+        const unsigned termCount = alternative->m_terms.size();
+        if (termCount < 2)
+            return; // No adjacent pair to examine.
+
+        for (unsigned second = 1; second < termCount; ++second) {
+            PatternTerm& classCandidate = alternative->m_terms[second - 1];
+            PatternTerm& characterCandidate = alternative->m_terms[second];
+
+            if (classCandidate.type != PatternTerm::TypeCharacterClass || classCandidate.quantityType != QuantifierFixedCount)
+                continue;
+            if (characterCandidate.type != PatternTerm::TypePatternCharacter || characterCandidate.quantityType != QuantifierFixedCount)
+                continue;
+
+            // Exchange the pair through a temporary copy.
+            PatternTerm savedClassTerm = classCandidate;
+            classCandidate = characterCandidate;
+            characterCandidate = savedClassTerm;
+        }
+    }
+
+    // Emits compare-and-branch code that tests 'character' against 'count'
+    // entries of 'ranges'.  Each loop iteration picks the middle remaining
+    // range, recurses to emit the tests for the ranges below it, then
+    // continues with the ranges above it.  Single-character 'matches' are
+    // consumed through *matchIndex as they are passed: those below a range get
+    // their own equality test, those inside a range are skipped.
+    //
+    // Jumps taken on a match are appended to matchDest; jumps taken on a
+    // definite miss are appended to failures.  Control falls through when the
+    // character is above every range handled here.
+    // Presumably 'ranges' and 'matches' are sorted ascending - confirm.
+    void matchCharacterClassRange(RegisterID character, JumpList& failures, JumpList& matchDest, const CharacterRange* ranges, unsigned count, unsigned* matchIndex, const UChar* matches, unsigned matchCount)
+    {
+        do {
+            // pick which range we're going to generate
+            int which = count >> 1;
+            // NOTE(review): the bounds are narrowed to char here; presumably
+            // this path only ever sees values that fit - confirm.
+            char lo = ranges[which].begin;
+            char hi = ranges[which].end;
+
+            // check if there are any ranges or matches below lo.  If not, just jl to failure -
+            // if there is anything else to check, check that first, if it falls through jmp to failure.
+            if ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) {
+                Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo));
+
+                // generate code for all ranges before this one
+                if (which)
+                    matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount);
+
+                // Remaining single characters below lo each get an equality test.
+                while ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) {
+                    matchDest.append(branch32(Equal, character, Imm32((unsigned short)matches[*matchIndex])));
+                    ++*matchIndex;
+                }
+                failures.append(jump());
+
+                loOrAbove.link(this);
+            } else if (which) {
+                Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo));
+
+                matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount);
+                failures.append(jump());
+
+                loOrAbove.link(this);
+            } else
+                failures.append(branch32(LessThan, character, Imm32((unsigned short)lo)));
+
+            // Single characters that fall inside this range need no test of their own.
+            while ((*matchIndex < matchCount) && (matches[*matchIndex] <= hi))
+                ++*matchIndex;
+
+            matchDest.append(branch32(LessThanOrEqual, character, Imm32((unsigned short)hi)));
+            // fall through to here, the value is above hi.
+
+            // shuffle along & loop around if there are any more matches to handle.
+            unsigned next = which + 1;
+            ranges += next;
+            count -= next;
+        } while (count);
+    }
+
+    // Emits code that jumps to matchDest when 'character' belongs to
+    // charClass and falls through otherwise.
+    //
+    // Strategy, in order:
+    //   1. If the class has a lookup table, emit one byte test on the table
+    //      entry indexed by the character and return.
+    //   2. If the class has unicode matches/ranges, characters <= 0x7f skip
+    //      them; other characters are tested against them and, on no match,
+    //      jump straight to the end (unicodeFail).
+    //   3. The remaining m_ranges / m_matches are tested, either through
+    //      matchCharacterClassRange() or, when there are only single
+    //      characters, one equality test each.
+    //
+    // Note that the single-character path may modify the 'character'
+    // register (see the or32 below).
+    void matchCharacterClass(RegisterID character, JumpList& matchDest, const CharacterClass* charClass)
+    {
+        if (charClass->m_table) {
+            ExtendedAddress tableEntry(character, reinterpret_cast<intptr_t>(charClass->m_table->m_table));
+            // An inverted table matches on a zero entry, a normal one on non-zero.
+            matchDest.append(branchTest8(charClass->m_table->m_inverted ? Zero : NonZero, tableEntry));
+            return;
+        }
+        Jump unicodeFail;
+        if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) {
+            Jump isAscii = branch32(LessThanOrEqual, character, TrustedImm32(0x7f));
+
+            if (charClass->m_matchesUnicode.size()) {
+                for (unsigned i = 0; i < charClass->m_matchesUnicode.size(); ++i) {
+                    UChar ch = charClass->m_matchesUnicode[i];
+                    matchDest.append(branch32(Equal, character, Imm32(ch)));
+                }
+            }
+
+            if (charClass->m_rangesUnicode.size()) {
+                for (unsigned i = 0; i < charClass->m_rangesUnicode.size(); ++i) {
+                    UChar lo = charClass->m_rangesUnicode[i].begin;
+                    UChar hi = charClass->m_rangesUnicode[i].end;
+
+                    // Below lo: skip to the next range; otherwise match if <= hi.
+                    Jump below = branch32(LessThan, character, Imm32(lo));
+                    matchDest.append(branch32(LessThanOrEqual, character, Imm32(hi)));
+                    below.link(this);
+                }
+            }
+
+            // Not <= 0x7f and no unicode entry matched: skip the tests below.
+            unicodeFail = jump();
+            isAscii.link(this);
+        }
+
+        if (charClass->m_ranges.size()) {
+            unsigned matchIndex = 0;
+            JumpList failures;
+            matchCharacterClassRange(character, failures, matchDest, charClass->m_ranges.begin(), charClass->m_ranges.size(), &matchIndex, charClass->m_matches.begin(), charClass->m_matches.size());
+            // Single characters above every range still need their own test.
+            while (matchIndex < charClass->m_matches.size())
+                matchDest.append(branch32(Equal, character, Imm32((unsigned short)charClass->m_matches[matchIndex++])));
+
+            failures.link(this);
+        } else if (charClass->m_matches.size()) {
+            // optimization: gather 'a','A' etc back together, can mask & test once.
+            Vector<char> matchesAZaz;
+
+            for (unsigned i = 0; i < charClass->m_matches.size(); ++i) {
+                char ch = charClass->m_matches[i];
+                if (m_pattern.m_ignoreCase) {
+                    // Lower-case ASCII letters are deferred to the masked test
+                    // below; upper-case ones are dropped here (presumably the
+                    // class holds both cases when ignoring case - confirm).
+                    if (isASCIILower(ch)) {
+                        matchesAZaz.append(ch);
+                        continue;
+                    }
+                    if (isASCIIUpper(ch))
+                        continue;
+                }
+                matchDest.append(branch32(Equal, character, Imm32((unsigned short)ch)));
+            }
+
+            if (unsigned countAZaz = matchesAZaz.size()) {
+                // ORs 32 into the character register so that one comparison
+                // against the lower-case letter covers both cases.
+                or32(TrustedImm32(32), character);
+                for (unsigned i = 0; i < countAZaz; ++i)
+                    matchDest.append(branch32(Equal, character, TrustedImm32(matchesAZaz[i])));
+            }
+        }
+
+        if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size())
+            unicodeFail.link(this);
+    }
+
+    // Advances index by countToCheck (when non-zero) and returns a jump that
+    // is taken when index is then above length.  The increment is not undone
+    // on the taken path, so the jump target sees the already-advanced index.
+    Jump jumpIfNoAvailableInput(unsigned countToCheck = 0)
+    {
+        if (countToCheck != 0)
+            add32(Imm32(countToCheck), index);
+
+        Jump inputExhausted = branch32(Above, index, length);
+        return inputExhausted;
+    }
+
+    // Adds countToCheck to 'index' unconditionally, then emits a branch that is
+    // taken while 'index' is below or equal to 'length'.
+    Jump jumpIfAvailableInput(unsigned countToCheck)
+    {
+        add32(Imm32(countToCheck), index);
+        Jump inputAvailable = branch32(BelowOrEqual, index, length);
+        return inputAvailable;
+    }
+
+    // Emits a branch taken when 'index' is below or equal to 'length'; unlike
+    // jumpIfAvailableInput() this does not modify 'index'.
+    Jump checkInput()
+    {
+        return branch32(BelowOrEqual, index, length);
+    }
+
+    // Emits a branch taken when 'index' equals 'length'.
+    Jump atEndOfInput()
+    {
+        return branch32(Equal, index, length);
+    }
+
+    // Emits a branch taken when 'index' differs from 'length'.
+    Jump notAtEndOfInput()
+    {
+        return branch32(NotEqual, index, length);
+    }
+
+    Jump jumpIfCharNotEquals(UChar ch, int inputPosition, RegisterID character)
+    {
+        readCharacter(inputPosition, character);
+
+        // For case-insesitive compares, non-ascii characters that have different
+        // upper & lower case representations are converted to a character class.
+        ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch));
+        if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) {
+            or32(TrustedImm32(0x20), character);
+            ch |= 0x20;
+        }
+
+        return branch32(NotEqual, character, Imm32(ch));
+    }
+
+    // Loads one input character into reg, addressed as input + index (scaled by
+    // the character width) + inputPosition characters. Uses an 8-bit load for
+    // Char8 input and a 16-bit load otherwise.
+    void readCharacter(int inputPosition, RegisterID reg)
+    {
+        if (m_charSize != Char8) {
+            load16(BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), reg);
+            return;
+        }
+        load8(BaseIndex(input, index, TimesOne, inputPosition * sizeof(char)), reg);
+    }
+
+    // Stores a register into the frame slot numbered frameLocation (via poke()).
+    void storeToFrame(RegisterID reg, unsigned frameLocation)
+    {
+        poke(reg, frameLocation);
+    }
+
+    // Stores an immediate into the frame slot numbered frameLocation (via poke()).
+    void storeToFrame(TrustedImm32 imm, unsigned frameLocation)
+    {
+        poke(imm, frameLocation);
+    }
+
+    // Emits a patchable pointer store (initially a null pointer) to the frame
+    // slot at frameLocation and returns the label used to patch the value later.
+    DataLabelPtr storeToFrameWithPatch(unsigned frameLocation)
+    {
+        Address frameSlot(stackPointerRegister, frameLocation * sizeof(void*));
+        return storePtrWithPatch(TrustedImmPtr(0), frameSlot);
+    }
+
+    // Loads the frame slot numbered frameLocation into reg (via peek()).
+    void loadFromFrame(unsigned frameLocation, RegisterID reg)
+    {
+        peek(reg, frameLocation);
+    }
+
+    // Emits an indirect jump through the pointer held in the frame slot at
+    // frameLocation.
+    void loadFromFrameAndJump(unsigned frameLocation)
+    {
+        Address frameSlot(stackPointerRegister, frameLocation * sizeof(void*));
+        jump(frameSlot);
+    }
+
+    // Reserves the pattern body's call frame by lowering the stack pointer by
+    // m_callFrameSize pointer-sized slots; emits nothing when the size is zero.
+    void initCallFrame()
+    {
+        const unsigned frameSlots = m_pattern.m_body->m_callFrameSize;
+        if (!frameSlots)
+            return;
+        subPtr(Imm32(frameSlots * sizeof(void*)), stackPointerRegister);
+    }
+    // Releases the call frame reserved by initCallFrame() by raising the stack
+    // pointer by the same amount; emits nothing when the size is zero.
+    void removeCallFrame()
+    {
+        const unsigned frameSlots = m_pattern.m_body->m_callFrameSize;
+        if (!frameSlots)
+            return;
+        addPtr(Imm32(frameSlots * sizeof(void*)), stackPointerRegister);
+    }
+
+    // Used to record subpatterns, should only be called if compileMode is IncludeSubpatterns.
+    // Stores reg into the output vector entry at int index (subpattern * 2),
+    // i.e. the start slot of the given subpattern.
+    void setSubpatternStart(RegisterID reg, unsigned subpattern)
+    {
+        ASSERT(subpattern);
+        // FIXME: should be able to ASSERT(compileMode == IncludeSubpatterns), but then this function is conditionally NORETURN. :-(
+        const size_t startOffset = (subpattern << 1) * sizeof(int);
+        store32(reg, Address(output, startOffset));
+    }
+    // Stores reg into the output vector entry at int index (subpattern * 2 + 1),
+    // i.e. the end slot of the given subpattern.
+    void setSubpatternEnd(RegisterID reg, unsigned subpattern)
+    {
+        ASSERT(subpattern);
+        // FIXME: should be able to ASSERT(compileMode == IncludeSubpatterns), but then this function is conditionally NORETURN. :-(
+        const size_t endOffset = ((subpattern << 1) + 1) * sizeof(int);
+        store32(reg, Address(output, endOffset));
+    }
+    // Writes -1 into the start slot (int index subpattern * 2) of the given
+    // subpattern in the output vector.
+    void clearSubpatternStart(unsigned subpattern)
+    {
+        ASSERT(subpattern);
+        // FIXME: should be able to ASSERT(compileMode == IncludeSubpatterns), but then this function is conditionally NORETURN. :-(
+        const size_t startOffset = (subpattern << 1) * sizeof(int);
+        store32(TrustedImm32(-1), Address(output, startOffset));
+    }
+
+    // We use one of three different strategies to track the start of the current match,
+    // while matching.
+    // 1) If the pattern has a fixed size, do nothing! - we calculate the value lazily
+    //    at the end of matching. This is irrespective of compileMode, and in this case
+    //    these methods should never be called.
+    // 2) If we're compiling IncludeSubpatterns, 'output' contains a pointer to an output
+    //    vector, store the match start in the output vector.
+    // 3) If we're compiling MatchOnly, 'output' is unused, store the match start directly
+    //    in this register.
+    void setMatchStart(RegisterID reg)
+    {
+        ASSERT(!m_pattern.m_body->m_hasFixedSize);
+        if (compileMode == IncludeSubpatterns)
+            store32(reg, output);
+        else
+            move(reg, output);
+    }
+    // Retrieves the match start recorded by setMatchStart() into reg: copied
+    // from the 'output' register itself unless compiling IncludeSubpatterns, in
+    // which case it is loaded through the pointer held in 'output'.
+    void getMatchStart(RegisterID reg)
+    {
+        ASSERT(!m_pattern.m_body->m_hasFixedSize);
+        if (compileMode != IncludeSubpatterns) {
+            move(output, reg);
+            return;
+        }
+        load32(output, reg);
+    }
+
+    // Opcode stored in each YarrOp (see YarrOp::m_op), identifying what kind of
+    // node the op represents.
+    enum YarrOpCode {
+        // These nodes wrap body alternatives - those in the main disjunction,
+        // rather than subpatterns or assertions. These are chained together in
+        // a doubly linked list, with a 'begin' node for the first alternative,
+        // a 'next' node for each subsequent alternative, and an 'end' node at
+        // the end. In the case of repeating alternatives, the 'end' node also
+        // has a reference back to 'begin'.
+        OpBodyAlternativeBegin,
+        OpBodyAlternativeNext,
+        OpBodyAlternativeEnd,
+        // Similar to the body alternatives, but used for subpatterns with two
+        // or more alternatives.
+        OpNestedAlternativeBegin,
+        OpNestedAlternativeNext,
+        OpNestedAlternativeEnd,
+        // Used for alternatives in subpatterns where there is only a single
+        // alternative (backtracking is easier in these cases), or for alternatives
+        // which never need to be backtracked (those in parenthetical assertions,
+        // terminal subpatterns).
+        OpSimpleNestedAlternativeBegin,
+        OpSimpleNestedAlternativeNext,
+        OpSimpleNestedAlternativeEnd,
+        // Used to wrap 'Once' subpattern matches (quantityCount == 1).
+        OpParenthesesSubpatternOnceBegin,
+        OpParenthesesSubpatternOnceEnd,
+        // Used to wrap 'Terminal' subpattern matches (at the end of the regexp).
+        OpParenthesesSubpatternTerminalBegin,
+        OpParenthesesSubpatternTerminalEnd,
+        // Used to wrap parenthetical assertions.
+        OpParentheticalAssertionBegin,
+        OpParentheticalAssertionEnd,
+        // Wraps all simple terms (pattern characters, character classes).
+        OpTerm,
+        // Where an expression contains only 'once through' body alternatives
+        // and no repeating ones, this op is used to return match failure.
+        OpMatchFailed
+    };
+
+    // This structure is used to hold the compiled opcode information,
+    // including reference back to the original PatternTerm/PatternAlternatives,
+    // and JIT compilation data structures.
+    struct YarrOp {
+        explicit YarrOp(PatternTerm* term)
+            : m_op(OpTerm)
+            , m_term(term)
+            , m_isDeadCode(false)
+        {
+        }
+
+        explicit YarrOp(YarrOpCode op)
+            : m_op(op)
+            , m_isDeadCode(false)
+        {
+        }
+
+        // The operation, as a YarrOpCode, and also a reference to the PatternTerm.
+        YarrOpCode m_op;
+        PatternTerm* m_term;
+
+        // For alternatives, this holds the PatternAlternative and doubly linked
+        // references to this alternative's siblings. In the case of the
+        // OpBodyAlternativeEnd node at the end of a section of repeating nodes,
+        // m_nextOp will reference the OpBodyAlternativeBegin node of the first
+        // repeating alternative.
+        PatternAlternative* m_alternative;
+        size_t m_previousOp;
+        size_t m_nextOp;
+
+        // Used to record a set of Jumps out of the generated code, typically
+        // used for jumps out to backtracking code, and a single reentry back
+        // into the code for a node (likely where a backtrack will trigger
+        // rematching).
+        Label m_reentry;
+        JumpList m_jumps;
+
+        // Used for backtracking when the prior alternative did not consume any
+        // characters but matched.
+        Jump m_zeroLengthMatch;
+
+        // This flag is used to null out the second pattern character, when
+        // two are fused to match a pair together.
+        bool m_isDeadCode;
+
+        // Currently used in the case of some of the more complex management of
+        // 'm_checked', to cache the offset used in this alternative, to avoid
+        // recalculating it.
+        int m_checkAdjust;
+
+        // Used by OpNestedAlternativeNext/End to hold the pointer to the
+        // value that will be pushed into the pattern's frame to return to,
+        // upon backtracking back into the disjunction.
+        DataLabelPtr m_returnAddress;
+    };
+
+    // BacktrackingState
+    // This class encapsulates information about the state of code generation
+    // whilst generating the code for backtracking, when a term fails to match.
+    // Upon entry to code generation of the backtracking code for a given node,
+    // the Backtracking state will hold references to all control flow sources
+    // that are outputs in need of further backtracking from the prior node
+    // generated (which is the subsequent operation in the regular expression,
+    // and in the m_ops Vector, since we generated backtracking backwards).
+    // These references to control flow take the form of:
+    //  - A jump list of jumps, to be linked to code that will backtrack them
+    //    further.
+    //  - A set of DataLabelPtr values, to be populated with values to be
+    //    treated effectively as return addresses backtracking into complex
+    //    subpatterns.
+    //  - A flag indicating that the current sequence of generated code up to
+    //    this point requires backtracking.
+    class BacktrackingState {
+    public:
+        BacktrackingState()
+            : m_pendingFallthrough(false)
+        {
+        }
+
+        // Add a jump or jumps, a return address, or set the flag indicating
+        // that the current 'fallthrough' control flow requires backtracking.
+        void append(const Jump& jump)
+        {
+            m_laterFailures.append(jump);
+        }
+        void append(JumpList& jumpList)
+        {
+            m_laterFailures.append(jumpList);
+        }
+        void append(const DataLabelPtr& returnAddress)
+        {
+            m_pendingReturns.append(returnAddress);
+        }
+        void fallthrough()
+        {
+            ASSERT(!m_pendingFallthrough);
+            m_pendingFallthrough = true;
+        }
+
+        // These methods clear the backtracking state, either linking to the
+        // current location, a provided label, or copying the backtracking out
+        // to a JumpList. All actions may require code generation to take place,
+        // and as such are passed a pointer to the assembler.
+        void link(MacroAssembler* assembler)
+        {
+            if (m_pendingReturns.size()) {
+                Label here(assembler);
+                for (unsigned i = 0; i < m_pendingReturns.size(); ++i)
+                    m_backtrackRecords.append(ReturnAddressRecord(m_pendingReturns[i], here));
+                m_pendingReturns.clear();
+            }
+            m_laterFailures.link(assembler);
+            m_laterFailures.clear();
+            m_pendingFallthrough = false;
+        }
+        void linkTo(Label label, MacroAssembler* assembler)
+        {
+            if (m_pendingReturns.size()) {
+                for (unsigned i = 0; i < m_pendingReturns.size(); ++i)
+                    m_backtrackRecords.append(ReturnAddressRecord(m_pendingReturns[i], label));
+                m_pendingReturns.clear();
+            }
+            if (m_pendingFallthrough)
+                assembler->jump(label);
+            m_laterFailures.linkTo(label, assembler);
+            m_laterFailures.clear();
+            m_pendingFallthrough = false;
+        }
+        void takeBacktracksToJumpList(JumpList& jumpList, MacroAssembler* assembler)
+        {
+            if (m_pendingReturns.size()) {
+                Label here(assembler);
+                for (unsigned i = 0; i < m_pendingReturns.size(); ++i)
+                    m_backtrackRecords.append(ReturnAddressRecord(m_pendingReturns[i], here));
+                m_pendingReturns.clear();
+                m_pendingFallthrough = true;
+            }
+            if (m_pendingFallthrough)
+                jumpList.append(assembler->jump());
+            jumpList.append(m_laterFailures);
+            m_laterFailures.clear();
+            m_pendingFallthrough = false;
+        }
+
+        bool isEmpty()
+        {
+            return m_laterFailures.empty() && m_pendingReturns.isEmpty() && !m_pendingFallthrough;
+        }
+
+        // Called at the end of code generation to link all return addresses.
+        void linkDataLabels(LinkBuffer& linkBuffer)
+        {
+            ASSERT(isEmpty());
+            for (unsigned i = 0; i < m_backtrackRecords.size(); ++i)
+                linkBuffer.patch(m_backtrackRecords[i].m_dataLabel, linkBuffer.locationOf(m_backtrackRecords[i].m_backtrackLocation));
+        }
+
+    private:
+        struct ReturnAddressRecord {
+            ReturnAddressRecord(DataLabelPtr dataLabel, Label backtrackLocation)
+                : m_dataLabel(dataLabel)
+                , m_backtrackLocation(backtrackLocation)
+            {
+            }
+
+            DataLabelPtr m_dataLabel;
+            Label m_backtrackLocation;
+        };
+
+        JumpList m_laterFailures;
+        bool m_pendingFallthrough;
+        Vector<DataLabelPtr, 4> m_pendingReturns;
+        Vector<ReturnAddressRecord, 4> m_backtrackRecords;
+    };
+
+    // Generation methods:
+    // ===================
+
+    // Default backtracking shared by many terms: such terms jump out of the
+    // forwards matching path on any failed condition and collect those jumps in
+    // the op's m_jumps list. When no special handling is needed, backtracking
+    // simply hands m_jumps over to the backtracking state.
+    void backtrackTermDefault(size_t opIndex)
+    {
+        m_backtrackingState.append(m_ops[opIndex].m_jumps);
+    }
+
+    void generateAssertionBOL(size_t opIndex)
+    {
+        YarrOp& op = m_ops[opIndex];
+        PatternTerm* term = op.m_term;
+
+        if (m_pattern.m_multiline) {
+            const RegisterID character = regT0;
+
+            JumpList matchDest;
+            if (!term->inputPosition)
+                matchDest.append(branch32(Equal, index, Imm32(m_checked)));
+
+            readCharacter((term->inputPosition - m_checked) - 1, character);
+            matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass());
+            op.m_jumps.append(jump());
+
+            matchDest.link(this);
+        } else {
+            // Erk, really should poison out these alternatives early. :-/
+            if (term->inputPosition)
+                op.m_jumps.append(jump());
+            else
+                op.m_jumps.append(branch32(NotEqual, index, Imm32(m_checked)));
+        }
+    }
+    // Backtracking for a beginning-of-line assertion: uses the default handling.
+    void backtrackAssertionBOL(size_t opIndex)
+    {
+        backtrackTermDefault(opIndex);
+    }
+
+    void generateAssertionEOL(size_t opIndex)
+    {
+        YarrOp& op = m_ops[opIndex];
+        PatternTerm* term = op.m_term;
+
+        if (m_pattern.m_multiline) {
+            const RegisterID character = regT0;
+
+            JumpList matchDest;
+            if (term->inputPosition == m_checked)
+                matchDest.append(atEndOfInput());
+
+            readCharacter(term->inputPosition - m_checked, character);
+            matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass());
+            op.m_jumps.append(jump());
+
+            matchDest.link(this);
+        } else {
+            if (term->inputPosition == m_checked)
+                op.m_jumps.append(notAtEndOfInput());
+            // Erk, really should poison out these alternatives early. :-/
+            else
+                op.m_jumps.append(jump());
+        }
+    }
+    // Backtracking for an end-of-line assertion: uses the default handling.
+    void backtrackAssertionEOL(size_t opIndex)
+    {
+        backtrackTermDefault(opIndex);
+    }
+
+    // Also falls though on nextIsNotWordChar.
+    void matchAssertionWordchar(size_t opIndex, JumpList& nextIsWordChar, JumpList& nextIsNotWordChar)
+    {
+        YarrOp& op = m_ops[opIndex];
+        PatternTerm* term = op.m_term;
+
+        const RegisterID character = regT0;
+
+        if (term->inputPosition == m_checked)
+            nextIsNotWordChar.append(atEndOfInput());
+
+        readCharacter((term->inputPosition - m_checked), character);
+        matchCharacterClass(character, nextIsWordChar, m_pattern.wordcharCharacterClass());
+    }
+
+    // Generates the matching code for a word-boundary assertion (or its
+    // inverse, when term->invert() is set). The previous character is
+    // classified first, then the next character is classified separately on
+    // each of the two resulting paths. Combinations that fail the assertion are
+    // collected in op.m_jumps; the succeeding combinations are linked to the
+    // end of the emitted code.
+    void generateAssertionWordBoundary(size_t opIndex)
+    {
+        YarrOp& op = m_ops[opIndex];
+        PatternTerm* term = op.m_term;
+
+        const RegisterID character = regT0;
+
+        // Classify the previous character. At input position zero with
+        // index == m_checked the read is skipped and control joins the
+        // 'not a wordchar' path.
+        Jump atBegin;
+        JumpList matchDest;
+        if (!term->inputPosition)
+            atBegin = branch32(Equal, index, Imm32(m_checked));
+        readCharacter((term->inputPosition - m_checked) - 1, character);
+        matchCharacterClass(character, matchDest, m_pattern.wordcharCharacterClass());
+        if (!term->inputPosition)
+            atBegin.link(this);
+
+        // We fall through to here if the last character was not a wordchar.
+        JumpList nonWordCharThenWordChar;
+        JumpList nonWordCharThenNonWordChar;
+        if (term->invert()) {
+            // The list arguments are swapped for the inverted assertion, so the
+            // fall through of matchAssertionWordchar is routed explicitly.
+            matchAssertionWordchar(opIndex, nonWordCharThenNonWordChar, nonWordCharThenWordChar);
+            nonWordCharThenWordChar.append(jump());
+        } else {
+            matchAssertionWordchar(opIndex, nonWordCharThenWordChar, nonWordCharThenNonWordChar);
+            nonWordCharThenNonWordChar.append(jump());
+        }
+        op.m_jumps.append(nonWordCharThenNonWordChar);
+
+        // We jump here if the last character was a wordchar.
+        matchDest.link(this);
+        JumpList wordCharThenWordChar;
+        JumpList wordCharThenNonWordChar;
+        if (term->invert()) {
+            matchAssertionWordchar(opIndex, wordCharThenNonWordChar, wordCharThenWordChar);
+            wordCharThenWordChar.append(jump());
+        } else {
+            matchAssertionWordchar(opIndex, wordCharThenWordChar, wordCharThenNonWordChar);
+            // This can fall-through!
+        }
+
+        op.m_jumps.append(wordCharThenWordChar);
+
+        // Both remaining lists continue with the code emitted after this term.
+        nonWordCharThenWordChar.link(this);
+        wordCharThenNonWordChar.link(this);
+    }
+    // Backtracking for a word-boundary assertion: uses the default handling.
+    void backtrackAssertionWordBoundary(size_t opIndex)
+    {
+        backtrackTermDefault(opIndex);
+    }
+
+    // Generates the matching code for a pattern character that must match
+    // exactly once. Directly following ops that are also single, fixed-count
+    // pattern characters at consecutive input positions are fused into this one
+    // (up to 4 characters for 8-bit input, 2 for 16-bit input) and compared with
+    // a single wider load; the fused ops are marked m_isDeadCode so that they
+    // generate nothing themselves. Mismatches jump via op.m_jumps.
+    void generatePatternCharacterOnce(size_t opIndex)
+    {
+        YarrOp& op = m_ops[opIndex];
+
+        // Already handled as part of an earlier, fused op.
+        if (op.m_isDeadCode)
+            return;
+        
+        // m_ops always ends with a OpBodyAlternativeEnd or OpMatchFailed
+        // node, so there must always be at least one more node.
+        ASSERT(opIndex + 1 < m_ops.size());
+        YarrOp* nextOp = &m_ops[opIndex + 1];
+
+        PatternTerm* term = op.m_term;
+        UChar ch = term->patternCharacter;
+
+        if ((ch > 0xff) && (m_charSize == Char8)) {
+            // Have a 16 bit pattern character and an 8 bit string - short circuit
+            op.m_jumps.append(jump());
+            return;
+        }
+
+        const RegisterID character = regT0;
+        int maxCharactersAtOnce = m_charSize == Char8 ? 4 : 2;
+        // Holds 32 (0x20) at the position of every fused ASCII letter when
+        // matching case-insensitively.
+        unsigned ignoreCaseMask = 0;
+        // The fused characters, the first one in the lowest bits.
+        int allCharacters = ch;
+        int numberCharacters;
+        int startTermPosition = term->inputPosition;
+
+        // For case-insensitive compares, non-ascii characters that have different
+        // upper & lower case representations are converted to a character class.
+        ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch));
+
+        if (m_pattern.m_ignoreCase && isASCIIAlpha(ch))
+            ignoreCaseMask |= 32;
+
+        // Gather as many of the following ops as can be fused.
+        for (numberCharacters = 1; numberCharacters < maxCharactersAtOnce && nextOp->m_op == OpTerm; ++numberCharacters, nextOp = &m_ops[opIndex + numberCharacters]) {
+            PatternTerm* nextTerm = nextOp->m_term;
+            
+            if (nextTerm->type != PatternTerm::TypePatternCharacter
+                || nextTerm->quantityType != QuantifierFixedCount
+                || nextTerm->quantityCount != 1
+                || nextTerm->inputPosition != (startTermPosition + numberCharacters))
+                break;
+
+            nextOp->m_isDeadCode = true;
+
+            int shiftAmount = (m_charSize == Char8 ? 8 : 16) * numberCharacters;
+
+            UChar currentCharacter = nextTerm->patternCharacter;
+
+            if ((currentCharacter > 0xff) && (m_charSize == Char8)) {
+                // Have a 16 bit pattern character and an 8 bit string - short circuit
+                op.m_jumps.append(jump());
+                return;
+            }
+
+            // For case-insensitive compares, non-ascii characters that have different
+            // upper & lower case representations are converted to a character class.
+            ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(currentCharacter) || isCanonicallyUnique(currentCharacter));
+
+            // NOTE(review): shiftAmount reaches 24 (8-bit input) or 16 (16-bit
+            // input), so a character with its top bit set appears to be shifted
+            // into the sign bit of an int here - confirm this is acceptable for
+            // the compilers being targeted.
+            allCharacters |= (currentCharacter << shiftAmount);
+
+            if ((m_pattern.m_ignoreCase) && (isASCIIAlpha(currentCharacter)))
+                ignoreCaseMask |= 32 << shiftAmount;                    
+        }
+
+        if (m_charSize == Char8) {
+            switch (numberCharacters) {
+            case 1:
+                op.m_jumps.append(jumpIfCharNotEquals(ch, startTermPosition - m_checked, character));
+                return;
+            case 2: {
+                BaseIndex address(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar));
+                load16Unaligned(address, character);
+                break;
+            }
+            case 3: {
+                // Compare the first two characters with a 16-bit load, then the
+                // third one on its own.
+                BaseIndex highAddress(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar));
+                load16Unaligned(highAddress, character);
+                if (ignoreCaseMask)
+                    or32(Imm32(ignoreCaseMask), character);
+                op.m_jumps.append(branch32(NotEqual, character, Imm32((allCharacters & 0xffff) | ignoreCaseMask)));
+                op.m_jumps.append(jumpIfCharNotEquals(allCharacters >> 16, startTermPosition + 2 - m_checked, character));
+                return;
+            }
+            case 4: {
+                BaseIndex address(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar));
+                load32WithUnalignedHalfWords(address, character);
+                break;
+            }
+            }
+        } else {
+            switch (numberCharacters) {
+            case 1:
+                op.m_jumps.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character));
+                return;
+            case 2:
+                BaseIndex address(input, index, TimesTwo, (term->inputPosition - m_checked) * sizeof(UChar));
+                load32WithUnalignedHalfWords(address, character);
+                break;
+            }
+        }
+
+        // Shared tail for the cases that loaded all fused characters at once:
+        // apply the case mask to the loaded value and to the expected value,
+        // then compare.
+        if (ignoreCaseMask)
+            or32(Imm32(ignoreCaseMask), character);
+        op.m_jumps.append(branch32(NotEqual, character, Imm32(allCharacters | ignoreCaseMask)));
+        return;
+    }
+    // Backtracking for a single pattern character: uses the default handling.
+    void backtrackPatternCharacterOnce(size_t opIndex)
+    {
+        backtrackTermDefault(opIndex);
+    }
+
+    // Generates the matching code for a pattern character with a fixed repeat
+    // count: a loop comparing quantityCount consecutive characters against ch.
+    // Any mismatch jumps via op.m_jumps.
+    void generatePatternCharacterFixed(size_t opIndex)
+    {
+        YarrOp& op = m_ops[opIndex];
+        PatternTerm* term = op.m_term;
+        UChar ch = term->patternCharacter;
+
+        const RegisterID character = regT0;
+        const RegisterID countRegister = regT1;
+
+        // countRegister runs from (index - quantityCount) up to index.
+        move(index, countRegister);
+        sub32(Imm32(term->quantityCount.unsafeGet()), countRegister);
+
+        Label loop(this);
+        // The constant offset adds quantityCount back in, compensating for the
+        // subtraction applied to countRegister above.
+        BaseIndex address(input, countRegister, m_charScale, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(m_charSize == Char8 ? sizeof(char) : sizeof(UChar))).unsafeGet());
+
+        if (m_charSize == Char8)
+            load8(address, character);
+        else
+            load16(address, character);
+
+        // For case-insensitive compares, non-ascii characters that have different
+        // upper & lower case representations are converted to a character class.
+        ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch));
+        if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) {
+            or32(TrustedImm32(0x20), character);
+            ch |= 0x20;
+        }
+
+        op.m_jumps.append(branch32(NotEqual, character, Imm32(ch)));
+        // Step to the next character; loop until countRegister reaches index.
+        add32(TrustedImm32(1), countRegister);
+        branch32(NotEqual, countRegister, index).linkTo(loop, this);
+    }
+    // Backtracking for a fixed-count pattern character: uses the default handling.
+    void backtrackPatternCharacterFixed(size_t opIndex)
+    {
+        backtrackTermDefault(opIndex);
+    }
+
+    // Generates the matching code for a greedily quantified pattern character:
+    // a loop that consumes matching characters, advancing 'index' and counting
+    // them in countRegister, until a mismatch, the end of input or the quantity
+    // limit is reached. The count is then stored in the term's frame slot.
+    void generatePatternCharacterGreedy(size_t opIndex)
+    {
+        YarrOp& op = m_ops[opIndex];
+        PatternTerm* term = op.m_term;
+        UChar ch = term->patternCharacter;
+
+        const RegisterID character = regT0;
+        const RegisterID countRegister = regT1;
+
+        move(TrustedImm32(0), countRegister);
+
+        // Unless have a 16 bit pattern character and an 8 bit string - short circuit
+        if (!((ch > 0xff) && (m_charSize == Char8))) {
+            JumpList failures;
+            Label loop(this);
+            failures.append(atEndOfInput());
+            failures.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character));
+
+            add32(TrustedImm32(1), countRegister);
+            add32(TrustedImm32(1), index);
+            if (term->quantityCount == quantifyInfinite)
+                jump(loop);
+            else
+                branch32(NotEqual, countRegister, Imm32(term->quantityCount.unsafeGet())).linkTo(loop, this);
+
+            failures.link(this);
+        }
+        // Backtracking re-enters here after giving back one character (see
+        // backtrackPatternCharacterGreedy), so the reduced count is stored again.
+        op.m_reentry = label();
+
+        storeToFrame(countRegister, term->frameLocation);
+    }
+    void backtrackPatternCharacterGreedy(size_t opIndex)
+    {
+        YarrOp& op = m_ops[opIndex];
+        PatternTerm* term = op.m_term;
+
+        const RegisterID countRegister = regT1;
+
+        m_backtrackingState.link(this);
+
+        loadFromFrame(term->frameLocation, countRegister);
+        m_backtrackingState.append(branchTest32(Zero, countRegister));
+        sub32(TrustedImm32(1), countRegister);
+        sub32(TrustedImm32(1), index);
+        jump(op.m_reentry);
+    }
+
+    // Generates the forward path for a non-greedily quantified pattern
+    // character: it starts with a count of zero, which is stored in the term's
+    // frame slot. The reentry label is placed before the store, so re-entering
+    // from the backtracking code saves the updated count.
+    void generatePatternCharacterNonGreedy(size_t opIndex)
+    {
+        YarrOp& op = m_ops[opIndex];
+        const RegisterID matchCount = regT1;
+
+        move(TrustedImm32(0), matchCount);
+        op.m_reentry = label();
+        storeToFrame(matchCount, op.m_term->frameLocation);
+    }
+    // Backtracking for a non-greedily quantified pattern character: tries to
+    // match one more character and, on success, re-enters the forward path at
+    // op.m_reentry. If no further character can be matched, 'index' is rewound
+    // by the number of characters matched so far and backtracking falls through
+    // to the previous term.
+    void backtrackPatternCharacterNonGreedy(size_t opIndex)
+    {
+        YarrOp& op = m_ops[opIndex];
+        PatternTerm* term = op.m_term;
+        UChar ch = term->patternCharacter;
+
+        const RegisterID character = regT0;
+        const RegisterID countRegister = regT1;
+
+        m_backtrackingState.link(this);
+
+        loadFromFrame(term->frameLocation, countRegister);
+
+        // Unless have a 16 bit pattern character and an 8 bit string - short circuit
+        if (!((ch > 0xff) && (m_charSize == Char8))) {
+            JumpList nonGreedyFailures;
+            nonGreedyFailures.append(atEndOfInput());
+            // The count limit is only checked for bounded quantifiers.
+            if (term->quantityCount != quantifyInfinite)
+                nonGreedyFailures.append(branch32(Equal, countRegister, Imm32(term->quantityCount.unsafeGet())));
+            nonGreedyFailures.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character));
+
+            add32(TrustedImm32(1), countRegister);
+            add32(TrustedImm32(1), index);
+
+            jump(op.m_reentry);
+            nonGreedyFailures.link(this);
+        }
+
+        // Undo every character this term consumed.
+        sub32(countRegister, index);
+        m_backtrackingState.fallthrough();
+    }
+
+    void generateCharacterClassOnce(size_t opIndex)
+    {
+        YarrOp& op = m_ops[opIndex];
+        PatternTerm* term = op.m_term;
+
+        const RegisterID character = regT0;
+
+        JumpList matchDest;
+        readCharacter(term->inputPosition - m_checked, character);
+        matchCharacterClass(character, matchDest, term->characterClass);
+
+        if (term->invert())
+            op.m_jumps.append(matchDest);
+        else {
+            op.m_jumps.append(jump());
+            matchDest.link(this);
+        }
+    }
+    // Backtracking for a single character class: uses the default handling.
+    void backtrackCharacterClassOnce(size_t opIndex)
+    {
+        backtrackTermDefault(opIndex);
+    }
+
+    // Generates the matching code for a character class with a fixed repeat
+    // count: a loop testing quantityCount consecutive characters against the
+    // class. Any character failing the (possibly inverted) test jumps via
+    // op.m_jumps.
+    void generateCharacterClassFixed(size_t opIndex)
+    {
+        YarrOp& op = m_ops[opIndex];
+        PatternTerm* term = op.m_term;
+
+        const RegisterID character = regT0;
+        const RegisterID countRegister = regT1;
+
+        // countRegister runs from (index - quantityCount) up to index.
+        move(index, countRegister);
+        sub32(Imm32(term->quantityCount.unsafeGet()), countRegister);
+
+        Label loop(this);
+        JumpList matchDest;
+        // The constant offset adds quantityCount back in, compensating for the
+        // subtraction applied to countRegister above.
+        if (m_charSize == Char8)
+            load8(BaseIndex(input, countRegister, TimesOne, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(sizeof(char))).unsafeGet()), character);
+        else
+            load16(BaseIndex(input, countRegister, TimesTwo, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(sizeof(UChar))).unsafeGet()), character);
+        matchCharacterClass(character, matchDest, term->characterClass);
+
+        if (term->invert())
+            op.m_jumps.append(matchDest);
+        else {
+            op.m_jumps.append(jump());
+            matchDest.link(this);
+        }
+
+        // Step to the next character; loop until countRegister reaches index.
+        add32(TrustedImm32(1), countRegister);
+        branch32(NotEqual, countRegister, index).linkTo(loop, this);
+    }
+    // Backtracking for a fixed-count character class: uses the default handling.
+    void backtrackCharacterClassFixed(size_t opIndex)
+    {
+        backtrackTermDefault(opIndex);
+    }
+
+    // Generates the matching code for a greedily quantified character class: a
+    // loop that consumes characters passing the (possibly inverted) class test,
+    // advancing 'index' and counting them in countRegister, until a character
+    // fails the test, the end of input is reached or the quantity limit is hit.
+    // The count is then stored in the term's frame slot.
+    void generateCharacterClassGreedy(size_t opIndex)
+    {
+        YarrOp& op = m_ops[opIndex];
+        PatternTerm* term = op.m_term;
+
+        const RegisterID character = regT0;
+        const RegisterID countRegister = regT1;
+
+        move(TrustedImm32(0), countRegister);
+
+        JumpList failures;
+        Label loop(this);
+        failures.append(atEndOfInput());
+
+        if (term->invert()) {
+            // Inverted class: being a member of the class ends the loop.
+            readCharacter(term->inputPosition - m_checked, character);
+            matchCharacterClass(character, failures, term->characterClass);
+        } else {
+            JumpList matchDest;
+            readCharacter(term->inputPosition - m_checked, character);
+            matchCharacterClass(character, matchDest, term->characterClass);
+            failures.append(jump());
+            matchDest.link(this);
+        }
+
+        add32(TrustedImm32(1), countRegister);
+        add32(TrustedImm32(1), index);
+        if (term->quantityCount != quantifyInfinite) {
+            branch32(NotEqual, countRegister, Imm32(term->quantityCount.unsafeGet())).linkTo(loop, this);
+            // Reaching the quantity limit leaves the loop through 'failures' too.
+            failures.append(jump());
+        } else
+            jump(loop);
+
+        failures.link(this);
+        // Backtracking re-enters here after giving back one character (see
+        // backtrackCharacterClassGreedy), so the reduced count is stored again.
+        op.m_reentry = label();
+
+        storeToFrame(countRegister, term->frameLocation);
+    }
+    void backtrackCharacterClassGreedy(size_t opIndex)
+    {
+        YarrOp& op = m_ops[opIndex];
+        PatternTerm* term = op.m_term;
+
+        const RegisterID countRegister = regT1;
+
+        m_backtrackingState.link(this);
+
+        loadFromFrame(term->frameLocation, countRegister);
+        m_backtrackingState.append(branchTest32(Zero, countRegister));
+        sub32(TrustedImm32(1), countRegister);
+        sub32(TrustedImm32(1), index);
+        jump(op.m_reentry);
+    }
+
+    // Plants the forward-matching code for a non-greedy character class
+    // term. The forward path consumes no characters: it zeroes the count,
+    // records a reentry label, and saves the count in the term's frame slot.
+    void generateCharacterClassNonGreedy(size_t opIndex)
+    {
+        const RegisterID matchCount = regT1;
+        YarrOp& op = m_ops[opIndex];
+
+        move(TrustedImm32(0), matchCount);
+
+        // The reentry label sits just before the store, so code jumping here
+        // with a new value in the count register gets it saved to the frame.
+        op.m_reentry = label();
+        storeToFrame(matchCount, op.m_term->frameLocation);
+    }
+    // Plants the backtracking code for a non-greedy character class term.
+    // Each time we backtrack into the term it tries to consume one more
+    // character; when that is not possible it gives back everything it has
+    // consumed and continues backtracking.
+    void backtrackCharacterClassNonGreedy(size_t opIndex)
+    {
+        YarrOp& op = m_ops[opIndex];
+        PatternTerm* term = op.m_term;
+
+        const RegisterID character = regT0;
+        const RegisterID countRegister = regT1;
+
+        JumpList nonGreedyFailures;
+
+        m_backtrackingState.link(this);
+
+        // Reload the number of characters consumed so far.
+        loadFromFrame(term->frameLocation, countRegister);
+
+        // We cannot consume another character if the input is exhausted or
+        // the count has already reached the term's quantity count.
+        nonGreedyFailures.append(atEndOfInput());
+        nonGreedyFailures.append(branch32(Equal, countRegister, Imm32(term->quantityCount.unsafeGet())));
+
+        JumpList matchDest;
+        readCharacter(term->inputPosition - m_checked, character);
+        matchCharacterClass(character, matchDest, term->characterClass);
+
+        // For an inverted class a hit on the class is a failure; otherwise a
+        // hit skips over the jump to the failure path.
+        if (term->invert())
+            nonGreedyFailures.append(matchDest);
+        else {
+            nonGreedyFailures.append(jump());
+            matchDest.link(this);
+        }
+
+        // One more character consumed.
+        add32(TrustedImm32(1), countRegister);
+        add32(TrustedImm32(1), index);
+
+        // Resume at the reentry label planted by the forward path.
+        jump(op.m_reentry);
+
+        // Failure: rewind index by the number of characters consumed.
+        nonGreedyFailures.link(this);
+        sub32(countRegister, index);
+        // NOTE(review): presumably this records that execution falling off
+        // the end of this code continues backtracking - confirm against
+        // BacktrackingState.
+        m_backtrackingState.fallthrough();
+    }
+
+    // Plants the code for a dot-star enclosure term. The match start is moved
+    // backwards to just after the preceding newline (or to position 0), and
+    // index is moved forwards to the next newline (or to the end of input).
+    // For non-multiline patterns the term's BOL/EOL anchors are then checked,
+    // with failures appended to op.m_jumps.
+    void generateDotStarEnclosure(size_t opIndex)
+    {
+        YarrOp& op = m_ops[opIndex];
+        PatternTerm* term = op.m_term;
+
+        const RegisterID character = regT0;
+        const RegisterID matchPos = regT1;
+
+        JumpList foundBeginningNewLine;
+        JumpList saveStartIndex;
+        JumpList foundEndingNewLine;
+
+        ASSERT(!m_pattern.m_body->m_hasFixedSize);
+        getMatchStart(matchPos);
+
+        // Scan backwards from the match start, one character at a time, until
+        // a newline is found or position 0 is reached.
+        saveStartIndex.append(branchTest32(Zero, matchPos));
+        Label findBOLLoop(this);
+        sub32(TrustedImm32(1), matchPos);
+        if (m_charSize == Char8)
+            load8(BaseIndex(input, matchPos, TimesOne, 0), character);
+        else
+            load16(BaseIndex(input, matchPos, TimesTwo, 0), character);
+        matchCharacterClass(character, foundBeginningNewLine, m_pattern.newlineCharacterClass());
+        branchTest32(NonZero, matchPos).linkTo(findBOLLoop, this);
+        saveStartIndex.append(jump());
+
+        foundBeginningNewLine.link(this);
+        add32(TrustedImm32(1), matchPos); // Advance past newline
+        saveStartIndex.link(this);
+
+        // With a BOL anchor in a non-multiline pattern, the start found above
+        // must be position 0.
+        if (!m_pattern.m_multiline && term->anchors.bolAnchor)
+            op.m_jumps.append(branchTest32(NonZero, matchPos));
+
+        ASSERT(!m_pattern.m_body->m_hasFixedSize);
+        setMatchStart(matchPos);
+
+        // Scan forwards from the current index until a newline is found or
+        // the end of the input is reached.
+        move(index, matchPos);
+
+        Label findEOLLoop(this);        
+        foundEndingNewLine.append(branch32(Equal, matchPos, length));
+        if (m_charSize == Char8)
+            load8(BaseIndex(input, matchPos, TimesOne, 0), character);
+        else
+            load16(BaseIndex(input, matchPos, TimesTwo, 0), character);
+        matchCharacterClass(character, foundEndingNewLine, m_pattern.newlineCharacterClass());
+        add32(TrustedImm32(1), matchPos);
+        jump(findEOLLoop);
+
+        foundEndingNewLine.link(this);
+
+        // With an EOL anchor in a non-multiline pattern, the end found above
+        // must be the end of the input.
+        if (!m_pattern.m_multiline && term->anchors.eolAnchor)
+            op.m_jumps.append(branch32(NotEqual, matchPos, length));
+
+        // The match now extends to the position found by the forward scan.
+        move(matchPos, index);
+    }
+
+    // Backtracking for a dot-star enclosure term. No dedicated code is
+    // planted here; the work is delegated to backtrackTermDefault().
+    void backtrackDotStarEnclosure(size_t opIndex)
+    {
+        backtrackTermDefault(opIndex);
+    }
+    
+    // Code generation/backtracking for simple terms
+    // (pattern characters, character classes, and assertions).
+    // These methods farm out work to the set of functions above.
+    // Plants the forward-matching code for a simple term by dispatching on
+    // the term's type and, for pattern characters and character classes, on
+    // its quantifier type (with a dedicated path for a fixed count of one).
+    void generateTerm(size_t opIndex)
+    {
+        YarrOp& op = m_ops[opIndex];
+        PatternTerm* term = op.m_term;
+
+        switch (term->type) {
+        case PatternTerm::TypePatternCharacter:
+            switch (term->quantityType) {
+            case QuantifierFixedCount:
+                if (term->quantityCount == 1)
+                    generatePatternCharacterOnce(opIndex);
+                else
+                    generatePatternCharacterFixed(opIndex);
+                break;
+            case QuantifierGreedy:
+                generatePatternCharacterGreedy(opIndex);
+                break;
+            case QuantifierNonGreedy:
+                generatePatternCharacterNonGreedy(opIndex);
+                break;
+            }
+            break;
+
+        case PatternTerm::TypeCharacterClass:
+            switch (term->quantityType) {
+            case QuantifierFixedCount:
+                if (term->quantityCount == 1)
+                    generateCharacterClassOnce(opIndex);
+                else
+                    generateCharacterClassFixed(opIndex);
+                break;
+            case QuantifierGreedy:
+                generateCharacterClassGreedy(opIndex);
+                break;
+            case QuantifierNonGreedy:
+                generateCharacterClassNonGreedy(opIndex);
+                break;
+            }
+            break;
+
+        case PatternTerm::TypeAssertionBOL:
+            generateAssertionBOL(opIndex);
+            break;
+
+        case PatternTerm::TypeAssertionEOL:
+            generateAssertionEOL(opIndex);
+            break;
+
+        case PatternTerm::TypeAssertionWordBoundary:
+            generateAssertionWordBoundary(opIndex);
+            break;
+
+        case PatternTerm::TypeForwardReference:
+            // No code is planted for forward references.
+            break;
+
+        case PatternTerm::TypeParenthesesSubpattern:
+        case PatternTerm::TypeParentheticalAssertion:
+            // These term types are not expected here. There is no break: in
+            // builds where ASSERT_NOT_REACHED() does not stop execution,
+            // control falls through and sets m_shouldFallBack.
+            ASSERT_NOT_REACHED();
+        case PatternTerm::TypeBackReference:
+            // No code is planted for back references; only the
+            // m_shouldFallBack flag is set.
+            m_shouldFallBack = true;
+            break;
+        case PatternTerm::TypeDotStarEnclosure:
+            generateDotStarEnclosure(opIndex);
+            break;
+        }
+    }
+    void backtrackTerm(size_t opIndex)
+    {
+        YarrOp& op = m_ops[opIndex];
+        PatternTerm* term = op.m_term;
+
+        switch (term->type) {
+        case PatternTerm::TypePatternCharacter:
+            switch (term->quantityType) {
+            case QuantifierFixedCount:
+                if (term->quantityCount == 1)
+                    backtrackPatternCharacterOnce(opIndex);
+                else
+                    backtrackPatternCharacterFixed(opIndex);
+                break;
+            case QuantifierGreedy:
+                backtrackPatternCharacterGreedy(opIndex);
+                break;
+            case QuantifierNonGreedy:
+                backtrackPatternCharacterNonGreedy(opIndex);
+                break;
+            }
+            break;
+
+        case PatternTerm::TypeCharacterClass:
+            switch (term->quantityType) {
+            case QuantifierFixedCount:
+                if (term->quantityCount == 1)
+                    backtrackCharacterClassOnce(opIndex);
+                else
+                    backtrackCharacterClassFixed(opIndex);
+                break;
+            case QuantifierGreedy:
+                backtrackCharacterClassGreedy(opIndex);
+                break;
+            case QuantifierNonGreedy:
+                backtrackCharacterClassNonGreedy(opIndex);
+                break;
+            }
+            break;
+
+        case PatternTerm::TypeAssertionBOL:
+            backtrackAssertionBOL(opIndex);
+            break;
+
+        case PatternTerm::TypeAssertionEOL:
+            backtrackAssertionEOL(opIndex);
+            break;
+
+        case PatternTerm::TypeAssertionWordBoundary:
+            backtrackAssertionWordBoundary(opIndex);
+            break;
+
+        case PatternTerm::TypeForwardReference:
+            break;
+
+        case PatternTerm::TypeParenthesesSubpattern:
+        case PatternTerm::TypeParentheticalAssertion:
+            ASSERT_NOT_REACHED();
+
+        case PatternTerm::TypeDotStarEnclosure:
+            backtrackDotStarEnclosure(opIndex);
+            break;
+
+        case PatternTerm::TypeBackReference:
+            m_shouldFallBack = true;
+            break;
+        }
+    }
+
+    // Plants the forward-matching code for the whole pattern by walking
+    // m_ops from first to last and emitting code for each op in turn.
+    // m_checked is adjusted along the way to track how much input the code
+    // planted so far has already checked for.
+    void generate()
+    {
+        // Forwards generate the matching code.
+        ASSERT(m_ops.size());
+        size_t opIndex = 0;
+
+        do {
+            YarrOp& op = m_ops[opIndex];
+            switch (op.m_op) {
+
+            case OpTerm:
+                generateTerm(opIndex);
+                break;
+
+            // OpBodyAlternativeBegin/Next/End
+            //
+            // These nodes wrap the set of alternatives in the body of the regular expression.
+            // There may be either one or two chains of OpBodyAlternative nodes, one representing
+            // the 'once through' sequence of alternatives (if any exist), and one representing
+            // the repeating alternatives (again, if any exist).
+            //
+            // Upon normal entry to the Begin alternative, we will check that input is available.
+            // Reentry to the Begin alternative will take place after the check has taken place,
+            // and will assume that the input position has already been progressed as appropriate.
+            //
+            // Entry to subsequent Next/End alternatives occurs when the prior alternative has
+            // successfully completed a match - return a success state from JIT code.
+            //
+            // Next alternatives allow for reentry optimized to suit backtracking from its
+            // preceding alternative. It expects the input position to still be set to a position
+            // appropriate to its predecessor, and it will only perform an input check if the
+            // predecessor had a minimum size less than its own.
+            //
+            // In the case of 'once through' expressions, the End node will also have a reentry
+            // point to jump to when the last alternative fails. Again, this expects the input
+            // position to still reflect that expected by the prior alternative.
+            case OpBodyAlternativeBegin: {
+                PatternAlternative* alternative = op.m_alternative;
+
+                // Upon entry at the head of the set of alternatives, check if input is available
+                // to run the first alternative. (This progresses the input position).
+                op.m_jumps.append(jumpIfNoAvailableInput(alternative->m_minimumSize));
+                // We will reenter after the check, and assume the input position to have been
+                // set as appropriate to this alternative.
+                op.m_reentry = label();
+
+                m_checked += alternative->m_minimumSize;
+                break;
+            }
+            case OpBodyAlternativeNext:
+            case OpBodyAlternativeEnd: {
+                PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative;
+                PatternAlternative* alternative = op.m_alternative;
+
+                // If we get here, the prior alternative matched - return success.
+                
+                // Adjust the stack pointer to remove the pattern's frame.
+                removeCallFrame();
+
+                // Load appropriate values into the return register and the first output
+                // slot, and return. In the case of pattern with a fixed size, we will
+                // not have yet set the value in the first output slot.
+                ASSERT(index != returnRegister);
+                if (m_pattern.m_body->m_hasFixedSize) {
+                    move(index, returnRegister);
+                    if (priorAlternative->m_minimumSize)
+                        sub32(Imm32(priorAlternative->m_minimumSize), returnRegister);
+                    if (compileMode == IncludeSubpatterns)
+                        store32(returnRegister, output);
+                } else
+                    getMatchStart(returnRegister);
+                if (compileMode == IncludeSubpatterns)
+                    store32(index, Address(output, 4));
+                move(index, returnRegister2);
+
+                generateReturn();
+
+                // This is the divide between the tail of the prior alternative, above, and
+                // the head of the subsequent alternative, below.
+
+                if (op.m_op == OpBodyAlternativeNext) {
+                    // This is the reentry point for the Next alternative. We expect any code
+                    // that jumps here to do so with the input position matching that of the
+                    // PRIOR alternative, and we will only check input availability if we
+                    // need to progress it forwards.
+                    op.m_reentry = label();
+                    if (alternative->m_minimumSize > priorAlternative->m_minimumSize) {
+                        add32(Imm32(alternative->m_minimumSize - priorAlternative->m_minimumSize), index);
+                        op.m_jumps.append(jumpIfNoAvailableInput());
+                    } else if (priorAlternative->m_minimumSize > alternative->m_minimumSize)
+                        sub32(Imm32(priorAlternative->m_minimumSize - alternative->m_minimumSize), index);
+                } else if (op.m_nextOp == notFound) {
+                    // This is the reentry point for the End of 'once through' alternatives,
+                    // jumped to when the last alternative fails to match.
+                    op.m_reentry = label();
+                    sub32(Imm32(priorAlternative->m_minimumSize), index);
+                }
+
+                // Track the checked amount: the prior alternative's minimum size no longer
+                // applies and, for a Next node, this alternative's minimum size now does.
+                if (op.m_op == OpBodyAlternativeNext)
+                    m_checked += alternative->m_minimumSize;
+                m_checked -= priorAlternative->m_minimumSize;
+                break;
+            }
+
+            // OpSimpleNestedAlternativeBegin/Next/End
+            // OpNestedAlternativeBegin/Next/End
+            //
+            // These nodes are used to handle sets of alternatives that are nested within
+            // subpatterns and parenthetical assertions. The 'simple' forms are used where
+            // we do not need to be able to backtrack back into any alternative other than
+            // the last, the normal forms allow backtracking into any alternative.
+            //
+            // Each Begin/Next node is responsible for planting an input check to ensure
+            // sufficient input is available on entry. Next nodes additionally need to
+            // jump to the end - Next nodes use the End node's m_jumps list to hold this
+            // set of jumps.
+            //
+            // In the non-simple forms, successful alternative matches must store a
+            // 'return address' using a DataLabelPtr, used to store the address to jump
+            // to when backtracking, to get to the code for the appropriate alternative.
+            case OpSimpleNestedAlternativeBegin:
+            case OpNestedAlternativeBegin: {
+                PatternTerm* term = op.m_term;
+                PatternAlternative* alternative = op.m_alternative;
+                PatternDisjunction* disjunction = term->parentheses.disjunction;
+
+                // Calculate how much input we need to check for, and if non-zero check.
+                op.m_checkAdjust = alternative->m_minimumSize;
+                if ((term->quantityType == QuantifierFixedCount) && (term->type != PatternTerm::TypeParentheticalAssertion))
+                    op.m_checkAdjust -= disjunction->m_minimumSize;
+                if (op.m_checkAdjust)
+                    op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust));
+
+                m_checked += op.m_checkAdjust;
+                break;
+            }
+            case OpSimpleNestedAlternativeNext:
+            case OpNestedAlternativeNext: {
+                PatternTerm* term = op.m_term;
+                PatternAlternative* alternative = op.m_alternative;
+                PatternDisjunction* disjunction = term->parentheses.disjunction;
+
+                // In the non-simple case, store a 'return address' so we can backtrack correctly.
+                if (op.m_op == OpNestedAlternativeNext) {
+                    unsigned parenthesesFrameLocation = term->frameLocation;
+                    unsigned alternativeFrameLocation = parenthesesFrameLocation;
+                    if (term->quantityType != QuantifierFixedCount)
+                        alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
+                    op.m_returnAddress = storeToFrameWithPatch(alternativeFrameLocation);
+                }
+
+                if (term->quantityType != QuantifierFixedCount && !m_ops[op.m_previousOp].m_alternative->m_minimumSize) {
+                    // If the previous alternative matched without consuming characters then
+                    // backtrack to try to match while consuming some input.
+                    op.m_zeroLengthMatch = branch32(Equal, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*)));
+                }
+
+                // If we reach here then the last alternative has matched - jump to the
+                // End node, to skip over any further alternatives.
+                //
+                // FIXME: this is logically O(N^2) (though N can be expected to be very
+                // small). We could avoid this either by adding an extra jump to the JIT
+                // data structures, or by making backtracking code that jumps to Next
+                // alternatives responsible for checking that input is available (if
+                // we didn't need to plant the input checks, then m_jumps would be free).
+                YarrOp* endOp = &m_ops[op.m_nextOp];
+                while (endOp->m_nextOp != notFound) {
+                    ASSERT(endOp->m_op == OpSimpleNestedAlternativeNext || endOp->m_op == OpNestedAlternativeNext);
+                    endOp = &m_ops[endOp->m_nextOp];
+                }
+                ASSERT(endOp->m_op == OpSimpleNestedAlternativeEnd || endOp->m_op == OpNestedAlternativeEnd);
+                endOp->m_jumps.append(jump());
+
+                // This is the entry point for the next alternative.
+                op.m_reentry = label();
+
+                // Calculate how much input we need to check for, and if non-zero check.
+                op.m_checkAdjust = alternative->m_minimumSize;
+                if ((term->quantityType == QuantifierFixedCount) && (term->type != PatternTerm::TypeParentheticalAssertion))
+                    op.m_checkAdjust -= disjunction->m_minimumSize;
+                if (op.m_checkAdjust)
+                    op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust));
+
+                // Swap the previous alternative's check adjustment for this one's.
+                YarrOp& lastOp = m_ops[op.m_previousOp];
+                m_checked -= lastOp.m_checkAdjust;
+                m_checked += op.m_checkAdjust;
+                break;
+            }
+            case OpSimpleNestedAlternativeEnd:
+            case OpNestedAlternativeEnd: {
+                PatternTerm* term = op.m_term;
+
+                // In the non-simple case, store a 'return address' so we can backtrack correctly.
+                if (op.m_op == OpNestedAlternativeEnd) {
+                    unsigned parenthesesFrameLocation = term->frameLocation;
+                    unsigned alternativeFrameLocation = parenthesesFrameLocation;
+                    if (term->quantityType != QuantifierFixedCount)
+                        alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
+                    op.m_returnAddress = storeToFrameWithPatch(alternativeFrameLocation);
+                }
+
+                if (term->quantityType != QuantifierFixedCount && !m_ops[op.m_previousOp].m_alternative->m_minimumSize) {
+                    // If the previous alternative matched without consuming characters then
+                    // backtrack to try to match while consuming some input.
+                    op.m_zeroLengthMatch = branch32(Equal, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*)));
+                }
+
+                // If this set of alternatives contains more than one alternative,
+                // then the Next nodes will have planted jumps to the End, and added
+                // them to this node's m_jumps list.
+                op.m_jumps.link(this);
+                op.m_jumps.clear();
+
+                // Undo the last alternative's check adjustment.
+                YarrOp& lastOp = m_ops[op.m_previousOp];
+                m_checked -= lastOp.m_checkAdjust;
+                break;
+            }
+
+            // OpParenthesesSubpatternOnceBegin/End
+            //
+            // These nodes support (optionally) capturing subpatterns, that have a
+            // quantity count of 1 (this covers fixed once, and ?/?? quantifiers). 
+            case OpParenthesesSubpatternOnceBegin: {
+                PatternTerm* term = op.m_term;
+                unsigned parenthesesFrameLocation = term->frameLocation;
+                const RegisterID indexTemporary = regT0;
+                ASSERT(term->quantityCount == 1);
+
+                // Upon entry to a Greedy quantified set of parentheses store the index.
+                // We'll use this for two purposes:
+                //  - To indicate which iteration we are on of matching the remainder of
+                //    the expression after the parentheses - the first, including the
+                //    match within the parentheses, or the second having skipped over them.
+                //  - To check for empty matches, which must be rejected.
+                //
+                // At the head of a NonGreedy set of parentheses we'll immediately set the
+                // value on the stack to -1 (indicating a match skipping the subpattern),
+                // and plant a jump to the end. We'll also plant a label to backtrack to
+                // to reenter the subpattern later, with a store to set up index on the
+                // second iteration.
+                //
+                // FIXME: for capturing parens, could use the index in the capture array?
+                if (term->quantityType == QuantifierGreedy)
+                    storeToFrame(index, parenthesesFrameLocation);
+                else if (term->quantityType == QuantifierNonGreedy) {
+                    storeToFrame(TrustedImm32(-1), parenthesesFrameLocation);
+                    op.m_jumps.append(jump());
+                    op.m_reentry = label();
+                    storeToFrame(index, parenthesesFrameLocation);
+                }
+
+                // If the parentheses are capturing, store the starting index value to the
+                // captures array, offsetting as necessary.
+                //
+                // FIXME: could avoid offsetting this value in JIT code, apply
+                // offsets only afterwards, at the point the results array is
+                // being accessed.
+                if (term->capture() && compileMode == IncludeSubpatterns) {
+                    int inputOffset = term->inputPosition - m_checked;
+                    if (term->quantityType == QuantifierFixedCount)
+                        inputOffset -= term->parentheses.disjunction->m_minimumSize;
+                    if (inputOffset) {
+                        move(index, indexTemporary);
+                        add32(Imm32(inputOffset), indexTemporary);
+                        setSubpatternStart(indexTemporary, term->parentheses.subpatternId);
+                    } else
+                        setSubpatternStart(index, term->parentheses.subpatternId);
+                }
+                break;
+            }
+            case OpParenthesesSubpatternOnceEnd: {
+                PatternTerm* term = op.m_term;
+                const RegisterID indexTemporary = regT0;
+                ASSERT(term->quantityCount == 1);
+
+#ifndef NDEBUG
+                // Runtime ASSERT to make sure that the nested alternative handled the
+                // "no input consumed" check.
+                if (term->quantityType != QuantifierFixedCount && !term->parentheses.disjunction->m_minimumSize) {
+                    Jump pastBreakpoint;
+                    pastBreakpoint = branch32(NotEqual, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*)));
+                    breakpoint();
+                    pastBreakpoint.link(this);
+                }
+#endif
+
+                // If the parentheses are capturing, store the ending index value to the
+                // captures array, offsetting as necessary.
+                //
+                // FIXME: could avoid offsetting this value in JIT code, apply
+                // offsets only afterwards, at the point the results array is
+                // being accessed.
+                if (term->capture() && compileMode == IncludeSubpatterns) {
+                    int inputOffset = term->inputPosition - m_checked;
+                    if (inputOffset) {
+                        move(index, indexTemporary);
+                        add32(Imm32(inputOffset), indexTemporary);
+                        setSubpatternEnd(indexTemporary, term->parentheses.subpatternId);
+                    } else
+                        setSubpatternEnd(index, term->parentheses.subpatternId);
+                }
+
+                // If the parentheses are quantified Greedy then add a label to jump back
+                // to if we get a failed match from after the parentheses. For NonGreedy
+                // parentheses, link the jump from before the subpattern to here.
+                if (term->quantityType == QuantifierGreedy)
+                    op.m_reentry = label();
+                else if (term->quantityType == QuantifierNonGreedy) {
+                    YarrOp& beginOp = m_ops[op.m_previousOp];
+                    beginOp.m_jumps.link(this);
+                }
+                break;
+            }
+
+            // OpParenthesesSubpatternTerminalBegin/End
+            //
+            // As the ASSERTs below state, these nodes are only used for non-capturing,
+            // Greedy parentheses with an infinite quantity count.
+            case OpParenthesesSubpatternTerminalBegin: {
+                PatternTerm* term = op.m_term;
+                ASSERT(term->quantityType == QuantifierGreedy);
+                ASSERT(term->quantityCount == quantifyInfinite);
+                ASSERT(!term->capture());
+
+                // Upon entry set a label to loop back to.
+                op.m_reentry = label();
+
+                // Store the start index of the current match; we need to reject zero
+                // length matches.
+                storeToFrame(index, term->frameLocation);
+                break;
+            }
+            case OpParenthesesSubpatternTerminalEnd: {
+                YarrOp& beginOp = m_ops[op.m_previousOp];
+#ifndef NDEBUG
+                PatternTerm* term = op.m_term;
+
+                // Runtime ASSERT to make sure that the nested alternative handled the
+                // "no input consumed" check.
+                Jump pastBreakpoint;
+                pastBreakpoint = branch32(NotEqual, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*)));
+                breakpoint();
+                pastBreakpoint.link(this);
+#endif
+
+                // We know that the match is non-zero, we can accept it and
+                // loop back up to the head of the subpattern.
+                jump(beginOp.m_reentry);
+
+                // This is the entry point to jump to when we stop matching - we will
+                // do so once the subpattern cannot match any more.
+                op.m_reentry = label();
+                break;
+            }
+
+            // OpParentheticalAssertionBegin/End
+            case OpParentheticalAssertionBegin: {
+                PatternTerm* term = op.m_term;
+
+                // Store the current index - assertions should not update index, so
+                // we will need to restore it upon a successful match.
+                unsigned parenthesesFrameLocation = term->frameLocation;
+                storeToFrame(index, parenthesesFrameLocation);
+
+                // Rewind index by the difference between the amount of input checked
+                // so far and the assertion's input position, and record the adjustment
+                // so that the End node can undo it in m_checked.
+                op.m_checkAdjust = m_checked - term->inputPosition;
+                if (op.m_checkAdjust)
+                    sub32(Imm32(op.m_checkAdjust), index);
+
+                m_checked -= op.m_checkAdjust;
+                break;
+            }
+            case OpParentheticalAssertionEnd: {
+                PatternTerm* term = op.m_term;
+
+                // Restore the input index value.
+                unsigned parenthesesFrameLocation = term->frameLocation;
+                loadFromFrame(parenthesesFrameLocation, index);
+
+                // If inverted, a successful match of the assertion must be treated
+                // as a failure, so jump to backtracking.
+                if (term->invert()) {
+                    op.m_jumps.append(jump());
+                    op.m_reentry = label();
+                }
+
+                // Undo the adjustment made by the matching Begin node.
+                YarrOp& lastOp = m_ops[op.m_previousOp];
+                m_checked += lastOp.m_checkAdjust;
+                break;
+            }
+
+            case OpMatchFailed:
+                // Return a failure result: WTF::notFound in the first return register
+                // and 0 in the second.
+                removeCallFrame();
+                move(TrustedImmPtr((void*)WTF::notFound), returnRegister);
+                move(TrustedImm32(0), returnRegister2);
+                generateReturn();
+                break;
+            }
+
+            ++opIndex;
+        } while (opIndex < m_ops.size());
+    }
+
+    void backtrack()
+    {
+        // Backwards generate the backtracking code.
+        size_t opIndex = m_ops.size();
+        ASSERT(opIndex);
+
+        do {
+            --opIndex;
+            YarrOp& op = m_ops[opIndex];
+            switch (op.m_op) {
+
+            case OpTerm:
+                backtrackTerm(opIndex);
+                break;
+
+            // OpBodyAlternativeBegin/Next/End
+            //
+            // For each Begin/Next node representing an alternative, we need to decide what to do
+            // in two circumstances:
+            //  - If we backtrack back into this node, from within the alternative.
+            //  - If the input check at the head of the alternative fails (if this exists).
+            //
+            // We treat these two cases differently since in the former case we have slightly
+            // more information - since we are backtracking out of a prior alternative we know
+            // that at least enough input was available to run it. For example, given the regular
+            // expression /a|b/, if we backtrack out of the first alternative (a failed pattern
+            // character match of 'a'), then we need not perform an additional input availability
+            // check before running the second alternative.
+            //
+            // Backtracking required differs for the last alternative, which in the case of the
+            // repeating set of alternatives must loop. The code generated for the last alternative
+            // will also be used to handle all input check failures from any prior alternatives -
+            // these require similar functionality, in seeking the next available alternative for
+            // which there is sufficient input.
+            //
+            // Since backtracking of all other alternatives simply requires us to link backtracks
+            // to the reentry point for the subsequent alternative, we will only be generating any
+            // code when backtracking the last alternative.
+            case OpBodyAlternativeBegin:
+            case OpBodyAlternativeNext: {
+                PatternAlternative* alternative = op.m_alternative;
+
+                if (op.m_op == OpBodyAlternativeNext) {
+                    PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative;
+                    m_checked += priorAlternative->m_minimumSize;
+                }
+                m_checked -= alternative->m_minimumSize;
+
+                // Is this the last alternative? If not, then if we backtrack to this point we just
+                // need to jump to try to match the next alternative.
+                if (m_ops[op.m_nextOp].m_op != OpBodyAlternativeEnd) {
+                    m_backtrackingState.linkTo(m_ops[op.m_nextOp].m_reentry, this);
+                    break;
+                }
+                YarrOp& endOp = m_ops[op.m_nextOp];
+
+                YarrOp* beginOp = &op;
+                while (beginOp->m_op != OpBodyAlternativeBegin) {
+                    ASSERT(beginOp->m_op == OpBodyAlternativeNext);
+                    beginOp = &m_ops[beginOp->m_previousOp];
+                }
+
+                bool onceThrough = endOp.m_nextOp == notFound;
+
+                // First, generate code to handle cases where we backtrack out of an attempted match
+                // of the last alternative. If this is a 'once through' set of alternatives then we
+                // have nothing to do - link this straight through to the End.
+                if (onceThrough)
+                    m_backtrackingState.linkTo(endOp.m_reentry, this);
+                else {
+                    // If we don't need to move the input position, and the pattern has a fixed size
+                    // (in which case we omit the store of the start index until the pattern has matched)
+                    // then we can just link the backtrack out of the last alternative straight to the
+                    // head of the first alternative.
+                    if (m_pattern.m_body->m_hasFixedSize
+                        && (alternative->m_minimumSize > beginOp->m_alternative->m_minimumSize)
+                        && (alternative->m_minimumSize - beginOp->m_alternative->m_minimumSize == 1))
+                        m_backtrackingState.linkTo(beginOp->m_reentry, this);
+                    else {
+                        // We need to generate a trampoline of code to execute before looping back
+                        // around to the first alternative.
+                        m_backtrackingState.link(this);
+
+                        // If the pattern size is not fixed, then store the start index, for use if we match.
+                        if (!m_pattern.m_body->m_hasFixedSize) {
+                            if (alternative->m_minimumSize == 1)
+                                setMatchStart(index);
+                            else {
+                                move(index, regT0);
+                                if (alternative->m_minimumSize)
+                                    sub32(Imm32(alternative->m_minimumSize - 1), regT0);
+                                else
+                                    add32(TrustedImm32(1), regT0);
+                                setMatchStart(regT0);
+                            }
+                        }
+
+                        // Generate code to loop. Check whether the last alternative is longer than the
+                        // first (e.g. /a|xy/ or /a|xyz/).
+                        if (alternative->m_minimumSize > beginOp->m_alternative->m_minimumSize) {
+                            // We want to loop, and increment input position. If the delta is 1, it is
+                            // already correctly incremented, if more than one then decrement as appropriate.
+                            unsigned delta = alternative->m_minimumSize - beginOp->m_alternative->m_minimumSize;
+                            ASSERT(delta);
+                            if (delta != 1)
+                                sub32(Imm32(delta - 1), index);
+                            jump(beginOp->m_reentry);
+                        } else {
+                            // If the first alternative has minimum size 0xFFFFFFFFu, then there cannot
+                            // be sufficient input available to handle this, so just fall through.
+                            unsigned delta = beginOp->m_alternative->m_minimumSize - alternative->m_minimumSize;
+                            if (delta != 0xFFFFFFFFu) {
+                                // We need to check input because we are incrementing the input.
+                                add32(Imm32(delta + 1), index);
+                                checkInput().linkTo(beginOp->m_reentry, this);
+                            }
+                        }
+                    }
+                }
+
+                // We can reach this point in the code in two ways:
+                //  - Fallthrough from the code above (a repeating alternative backtracked out of its
+                //    last alternative, and did not have sufficient input to run the first).
+                //  - We will loop back up to the following label when a repeating alternative loops,
+                //    following a failed input check.
+                //
+                // Either way, we have just failed the input check for the first alternative.
+                Label firstInputCheckFailed(this);
+
+                // Generate code to handle input check failures from alternatives except the last.
+                // prevOp is the alternative we're handling a bail out from (initially Begin), and
+                // nextOp is the alternative we will be attempting to reenter into.
+                // 
+                // We will link input check failures from the forwards matching path back to the code
+                // that can handle them.
+                YarrOp* prevOp = beginOp;
+                YarrOp* nextOp = &m_ops[beginOp->m_nextOp];
+                while (nextOp->m_op != OpBodyAlternativeEnd) {
+                    prevOp->m_jumps.link(this);
+
+                    // We only get here if an input check fails, it is only worth checking again
+                    // if the next alternative has a minimum size less than the last.
+                    if (prevOp->m_alternative->m_minimumSize > nextOp->m_alternative->m_minimumSize) {
+                        // FIXME: if we added an extra label to YarrOp, we could avoid needing to
+                        // subtract delta back out, and reduce this code. Should performance test
+                        // the benefit of this.
+                        unsigned delta = prevOp->m_alternative->m_minimumSize - nextOp->m_alternative->m_minimumSize;
+                        sub32(Imm32(delta), index);
+                        Jump fail = jumpIfNoAvailableInput();
+                        add32(Imm32(delta), index);
+                        jump(nextOp->m_reentry);
+                        fail.link(this);
+                    } else if (prevOp->m_alternative->m_minimumSize < nextOp->m_alternative->m_minimumSize)
+                        add32(Imm32(nextOp->m_alternative->m_minimumSize - prevOp->m_alternative->m_minimumSize), index);
+                    prevOp = nextOp;
+                    nextOp = &m_ops[nextOp->m_nextOp];
+                }
+
+                // We fall through to here if there is insufficient input to run the last alternative.
+
+                // If there is insufficient input to run the last alternative, then for 'once through'
+                // alternatives we are done - just jump back up into the forwards matching path at the End.
+                if (onceThrough) {
+                    op.m_jumps.linkTo(endOp.m_reentry, this);
+                    jump(endOp.m_reentry);
+                    break;
+                }
+
+                // For repeating alternatives, link any input check failure from the last alternative to
+                // this point.
+                op.m_jumps.link(this);
+
+                bool needsToUpdateMatchStart = !m_pattern.m_body->m_hasFixedSize;
+
+                // Check for cases where input position is already incremented by 1 for the last
+                // alternative (this is particularly useful where the minimum size of the body
+                // disjunction is 0, e.g. /a*|b/).
+                if (needsToUpdateMatchStart && alternative->m_minimumSize == 1) {
+                    // index is already incremented by 1, so just store it now!
+                    setMatchStart(index);
+                    needsToUpdateMatchStart = false;
+                }
+
+                // Check whether there is sufficient input to loop. Increment the input position by
+                // one, and check. Also add in the minimum disjunction size before checking - there
+                // is no point in looping if we're just going to fail all the input checks around
+                // the next iteration.
+                ASSERT(alternative->m_minimumSize >= m_pattern.m_body->m_minimumSize);
+                if (alternative->m_minimumSize == m_pattern.m_body->m_minimumSize) {
+                    // If the last alternative had the same minimum size as the disjunction,
+                    // just simply increment input pos by 1, no adjustment based on minimum size.
+                    add32(TrustedImm32(1), index);
+                } else {
+                    // If the minimum for the last alternative was one greater than that
+                    // for the disjunction, we've already progressed by 1, nothing to do!
+                    unsigned delta = (alternative->m_minimumSize - m_pattern.m_body->m_minimumSize) - 1;
+                    if (delta)
+                        sub32(Imm32(delta), index);
+                }
+                Jump matchFailed = jumpIfNoAvailableInput();
+
+                if (needsToUpdateMatchStart) {
+                    if (!m_pattern.m_body->m_minimumSize)
+                        setMatchStart(index);
+                    else {
+                        move(index, regT0);
+                        sub32(Imm32(m_pattern.m_body->m_minimumSize), regT0);
+                        setMatchStart(regT0);
+                    }
+                }
+
+                // Calculate how much more input the first alternative requires than the minimum
+                // for the body as a whole. If no more is needed then we don't need an additional
+                // input check here - jump straight back up to the start of the first alternative.
+                if (beginOp->m_alternative->m_minimumSize == m_pattern.m_body->m_minimumSize)
+                    jump(beginOp->m_reentry);
+                else {
+                    if (beginOp->m_alternative->m_minimumSize > m_pattern.m_body->m_minimumSize)
+                        add32(Imm32(beginOp->m_alternative->m_minimumSize - m_pattern.m_body->m_minimumSize), index);
+                    else
+                        sub32(Imm32(m_pattern.m_body->m_minimumSize - beginOp->m_alternative->m_minimumSize), index);
+                    checkInput().linkTo(beginOp->m_reentry, this);
+                    jump(firstInputCheckFailed);
+                }
+
+                // We jump to here if we iterate to the point that there is insufficient input to
+                // run any matches, and need to return a failure state from JIT code.
+                matchFailed.link(this);
+
+                removeCallFrame();
+                move(TrustedImmPtr((void*)WTF::notFound), returnRegister);
+                move(TrustedImm32(0), returnRegister2);
+                generateReturn();
+                break;
+            }
+            case OpBodyAlternativeEnd: {
+                // We should never backtrack back into a body disjunction.
+                ASSERT(m_backtrackingState.isEmpty());
+
+                PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative;
+                m_checked += priorAlternative->m_minimumSize;
+                break;
+            }
+
+            // OpSimpleNestedAlternativeBegin/Next/End
+            // OpNestedAlternativeBegin/Next/End
+            //
+            // Generate code for when we backtrack back out of an alternative into
+            // a Begin or Next node, or when the entry input count check fails. If
+            // there are more alternatives we need to jump to the next alternative,
+            // if not we backtrack back out of the current set of parentheses.
+            //
+            // In the case of non-simple nested assertions we need to also link the
+            // 'return address' appropriately to backtrack back out into the correct
+            // alternative.
+            case OpSimpleNestedAlternativeBegin:
+            case OpSimpleNestedAlternativeNext:
+            case OpNestedAlternativeBegin:
+            case OpNestedAlternativeNext: {
+                YarrOp& nextOp = m_ops[op.m_nextOp];
+                bool isBegin = op.m_previousOp == notFound;
+                bool isLastAlternative = nextOp.m_nextOp == notFound;
+                ASSERT(isBegin == (op.m_op == OpSimpleNestedAlternativeBegin || op.m_op == OpNestedAlternativeBegin));
+                ASSERT(isLastAlternative == (nextOp.m_op == OpSimpleNestedAlternativeEnd || nextOp.m_op == OpNestedAlternativeEnd));
+
+                // Treat an input check failure the same as a failed match.
+                m_backtrackingState.append(op.m_jumps);
+
+                // Set the backtracks to jump to the appropriate place. We may need
+                // to link the backtracks in one of three different ways depending on
+                // the type of alternative we are dealing with:
+                //  - A single alternative, with no siblings.
+                //  - The last alternative of a set of two or more.
+                //  - An alternative other than the last of a set of two or more.
+                //
+                // In the case of a single alternative on its own, we don't need to
+                // jump anywhere - if the alternative fails to match we can just
+                // continue to backtrack out of the parentheses without jumping.
+                //
+                // In the case of the last alternative in a set of more than one, we
+                // need to jump to return back out to the beginning. We'll do so by
+                // adding a jump to the End node's m_jumps list, and linking this
+                // when we come to generate the Begin node. For alternatives other
+                // than the last, we need to jump to the next alternative.
+                //
+                // If the alternative had adjusted the input position we must link
+                // backtracking to here, correct, and then jump on. If not we can
+                // link the backtracks directly to their destination.
+                if (op.m_checkAdjust) {
+                    // Handle the cases where we need to link the backtracks here.
+                    m_backtrackingState.link(this);
+                    sub32(Imm32(op.m_checkAdjust), index);
+                    if (!isLastAlternative) {
+                        // An alternative that is not the last should jump to its successor.
+                        jump(nextOp.m_reentry);
+                    } else if (!isBegin) {
+                        // The last of more than one alternatives must jump back to the beginning.
+                        nextOp.m_jumps.append(jump());
+                    } else {
+                        // A single alternative on its own can fall through.
+                        m_backtrackingState.fallthrough();
+                    }
+                } else {
+                    // Handle the cases where we can link the backtracks directly to their destinations.
+                    if (!isLastAlternative) {
+                        // An alternative that is not the last should jump to its successor.
+                        m_backtrackingState.linkTo(nextOp.m_reentry, this);
+                    } else if (!isBegin) {
+                        // The last of more than one alternatives must jump back to the beginning.
+                        m_backtrackingState.takeBacktracksToJumpList(nextOp.m_jumps, this);
+                    }
+                    // In the case of a single alternative on its own do nothing - it can fall through.
+                }
+
+                // If there is a backtrack jump from a zero length match link it here.
+                if (op.m_zeroLengthMatch.isSet())
+                    m_backtrackingState.append(op.m_zeroLengthMatch);
+
+                // At this point we've handled the backtracking back into this node.
+                // Now link any backtracks that need to jump to here.
+
+                // For non-simple alternatives, link the alternative's 'return address'
+                // so that we backtrack back out into the previous alternative.
+                if (op.m_op == OpNestedAlternativeNext)
+                    m_backtrackingState.append(op.m_returnAddress);
+
+                // If there is more than one alternative, then the last alternative will
+                // have planted a jump to be linked to the end. This jump was added to the
+                // End node's m_jumps list. If we are back at the beginning, link it here.
+                if (isBegin) {
+                    YarrOp* endOp = &m_ops[op.m_nextOp];
+                    while (endOp->m_nextOp != notFound) {
+                        ASSERT(endOp->m_op == OpSimpleNestedAlternativeNext || endOp->m_op == OpNestedAlternativeNext);
+                        endOp = &m_ops[endOp->m_nextOp];
+                    }
+                    ASSERT(endOp->m_op == OpSimpleNestedAlternativeEnd || endOp->m_op == OpNestedAlternativeEnd);
+                    m_backtrackingState.append(endOp->m_jumps);
+                }
+
+                if (!isBegin) {
+                    YarrOp& lastOp = m_ops[op.m_previousOp];
+                    m_checked += lastOp.m_checkAdjust;
+                }
+                m_checked -= op.m_checkAdjust;
+                break;
+            }
+            case OpSimpleNestedAlternativeEnd:
+            case OpNestedAlternativeEnd: {
+                PatternTerm* term = op.m_term;
+
+                // If there is a backtrack jump from a zero length match link it here.
+                if (op.m_zeroLengthMatch.isSet())
+                    m_backtrackingState.append(op.m_zeroLengthMatch);
+
+                // If we backtrack into the end of a simple subpattern do nothing;
+                // just continue through into the last alternative. If we backtrack
+                // into the end of a non-simple set of alternatives we need to jump
+                // to the backtracking return address set up during generation.
+                if (op.m_op == OpNestedAlternativeEnd) {
+                    m_backtrackingState.link(this);
+
+                    // Plant a jump to the return address.
+                    unsigned parenthesesFrameLocation = term->frameLocation;
+                    unsigned alternativeFrameLocation = parenthesesFrameLocation;
+                    if (term->quantityType != QuantifierFixedCount)
+                        alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
+                    loadFromFrameAndJump(alternativeFrameLocation);
+
+                    // Link the DataLabelPtr associated with the end of the last
+                    // alternative to this point.
+                    m_backtrackingState.append(op.m_returnAddress);
+                }
+
+                YarrOp& lastOp = m_ops[op.m_previousOp];
+                m_checked += lastOp.m_checkAdjust;
+                break;
+            }
+
+            // OpParenthesesSubpatternOnceBegin/End
+            //
+            // When we are backtracking back out of a capturing subpattern we need
+            // to clear the start index in the matches output array, to record that
+            // this subpattern has not been captured.
+            //
+            // When backtracking back out of a Greedy quantified subpattern we need
+            // to catch this, and try running the remainder of the alternative after
+            // the subpattern again, skipping the parentheses.
+            //
+            // Upon backtracking back into a quantified set of parentheses we need to
+            // check whether we were currently skipping the subpattern. If not, we
+            // can backtrack into them, if we were we need to either backtrack back
+            // out of the start of the parentheses, or jump back to the forwards
+            // matching start, depending of whether the match is Greedy or NonGreedy.
+            case OpParenthesesSubpatternOnceBegin: {
+                PatternTerm* term = op.m_term;
+                ASSERT(term->quantityCount == 1);
+
+                // We only need to backtrack to this point if capturing or greedy.
+                if ((term->capture() && compileMode == IncludeSubpatterns) || term->quantityType == QuantifierGreedy) {
+                    m_backtrackingState.link(this);
+
+                    // If capturing, clear the capture (we only need to reset start).
+                    if (term->capture() && compileMode == IncludeSubpatterns)
+                        clearSubpatternStart(term->parentheses.subpatternId);
+
+                    // If Greedy, jump to the end.
+                    if (term->quantityType == QuantifierGreedy) {
+                        // Clear the flag in the stackframe indicating we ran through the subpattern.
+                        unsigned parenthesesFrameLocation = term->frameLocation;
+                        storeToFrame(TrustedImm32(-1), parenthesesFrameLocation);
+                        // Jump to after the parentheses, skipping the subpattern.
+                        jump(m_ops[op.m_nextOp].m_reentry);
+                        // A backtrack from after the parentheses, when skipping the subpattern,
+                        // will jump back to here.
+                        op.m_jumps.link(this);
+                    }
+
+                    m_backtrackingState.fallthrough();
+                }
+                break;
+            }
+            case OpParenthesesSubpatternOnceEnd: {
+                PatternTerm* term = op.m_term;
+
+                if (term->quantityType != QuantifierFixedCount) {
+                    m_backtrackingState.link(this);
+
+                    // Check whether we should backtrack back into the parentheses, or if we
+                    // are currently in a state where we had skipped over the subpattern
+                    // (in which case the flag value on the stack will be -1).
+                    unsigned parenthesesFrameLocation = term->frameLocation;
+                    Jump hadSkipped = branch32(Equal, Address(stackPointerRegister, parenthesesFrameLocation * sizeof(void*)), TrustedImm32(-1));
+
+                    if (term->quantityType == QuantifierGreedy) {
+                        // For Greedy parentheses, we skip after having already tried going
+                        // through the subpattern, so if we get here we're done.
+                        YarrOp& beginOp = m_ops[op.m_previousOp];
+                        beginOp.m_jumps.append(hadSkipped);
+                    } else {
+                        // For NonGreedy parentheses, we try skipping the subpattern first,
+                        // so if we get here we need to try running through the subpattern
+                        // next. Jump back to the start of the parentheses in the forwards
+                        // matching path.
+                        ASSERT(term->quantityType == QuantifierNonGreedy);
+                        YarrOp& beginOp = m_ops[op.m_previousOp];
+                        hadSkipped.linkTo(beginOp.m_reentry, this);
+                    }
+
+                    m_backtrackingState.fallthrough();
+                }
+
+                m_backtrackingState.append(op.m_jumps);
+                break;
+            }
+
+            // OpParenthesesSubpatternTerminalBegin/End
+            //
+            // Terminal subpatterns will always match - there is nothing after them to
+            // force a backtrack, and they have a minimum count of 0, and as such will
+            // always produce an acceptable result.
+            case OpParenthesesSubpatternTerminalBegin: {
+                // We will backtrack to this point once the subpattern cannot match any
+                // more. Since no match is accepted as a successful match (we are Greedy
+                // quantified with a minimum of zero) jump back to the forwards matching
+                // path at the end.
+                YarrOp& endOp = m_ops[op.m_nextOp];
+                m_backtrackingState.linkTo(endOp.m_reentry, this);
+                break;
+            }
+            case OpParenthesesSubpatternTerminalEnd:
+                // We should never be backtracking to here (hence the 'terminal' in the name).
+                ASSERT(m_backtrackingState.isEmpty());
+                m_backtrackingState.append(op.m_jumps);
+                break;
+
+            // OpParentheticalAssertionBegin/End
+            case OpParentheticalAssertionBegin: {
+                PatternTerm* term = op.m_term;
+                YarrOp& endOp = m_ops[op.m_nextOp];
+
+                // We need to handle the backtracks upon backtracking back out
+                // of a parenthetical assertion if either we need to correct
+                // the input index, or the assertion was inverted.
+                if (op.m_checkAdjust || term->invert()) {
+                     m_backtrackingState.link(this);
+
+                    if (op.m_checkAdjust)
+                        add32(Imm32(op.m_checkAdjust), index);
+
+                    // In an inverted assertion failure to match the subpattern
+                    // is treated as a successful match - jump to the end of the
+                    // subpattern. We already have adjusted the input position
+                    // back to that before the assertion, which is correct.
+                    if (term->invert())
+                        jump(endOp.m_reentry);
+
+                    m_backtrackingState.fallthrough();
+                }
+
+                // The End node's jump list will contain any backtracks into
+                // the end of the assertion. Also, if inverted, we will have
+                // added the failure caused by a successful match to this.
+                m_backtrackingState.append(endOp.m_jumps);
+
+                m_checked += op.m_checkAdjust;
+                break;
+            }
+            case OpParentheticalAssertionEnd: {
+                // FIXME: We should really be clearing any nested subpattern
+                // matches on bailing out from after the pattern. Firefox has
+                // this bug too (presumably because they use YARR!)
+
+                // Never backtrack into an assertion; later failures bail to before the begin.
+                m_backtrackingState.takeBacktracksToJumpList(op.m_jumps, this);
+
+                YarrOp& lastOp = m_ops[op.m_previousOp];
+                m_checked -= lastOp.m_checkAdjust;
+                break;
+            }
+
+            case OpMatchFailed:
+                break;
+            }
+
+        } while (opIndex);
+    }
+
+    // Compilation methods:
+    // ====================
+
+    // opCompileParenthesesSubpattern
+    // Emits ops for a subpattern (set of parentheses). These consist
+    // of a set of alternatives wrapped in an outer set of nodes for
+    // the parentheses.
+    // Supported types of parentheses are 'Once' (quantityCount == 1)
+    // and 'Terminal' (non-capturing parentheses quantified as greedy
+    // and infinite).
+    // Alternatives will use the 'Simple' set of ops if either the
+    // subpattern is terminal (in which case we will never need to
+    // backtrack), or if the subpattern only contains one alternative.
+    void opCompileParenthesesSubpattern(PatternTerm* term)
+    {
+        YarrOpCode parenthesesBeginOpCode;
+        YarrOpCode parenthesesEndOpCode;
+        YarrOpCode alternativeBeginOpCode = OpSimpleNestedAlternativeBegin;
+        YarrOpCode alternativeNextOpCode = OpSimpleNestedAlternativeNext;
+        YarrOpCode alternativeEndOpCode = OpSimpleNestedAlternativeEnd;
+
+        // We can currently only compile quantity 1 subpatterns that are
+        // not copies. We generate a copy in the case of a range quantifier,
+        // e.g. /(?:x){3,9}/, or /(?:x)+/ (These are effectively expanded to
+        // /(?:x){3,3}(?:x){0,6}/ and /(?:x)(?:x)*/ repectively). The problem
+        // comes where the subpattern is capturing, in which case we would
+        // need to restore the capture from the first subpattern upon a
+        // failure in the second.
+        if (term->quantityCount == 1 && !term->parentheses.isCopy) {
+            // Select the 'Once' nodes.
+            parenthesesBeginOpCode = OpParenthesesSubpatternOnceBegin;
+            parenthesesEndOpCode = OpParenthesesSubpatternOnceEnd;
+
+            // If there is more than one alternative we cannot use the 'simple' nodes.
+            if (term->parentheses.disjunction->m_alternatives.size() != 1) {
+                alternativeBeginOpCode = OpNestedAlternativeBegin;
+                alternativeNextOpCode = OpNestedAlternativeNext;
+                alternativeEndOpCode = OpNestedAlternativeEnd;
+            }
+        } else if (term->parentheses.isTerminal) {
+            // Select the 'Terminal' nodes.
+            parenthesesBeginOpCode = OpParenthesesSubpatternTerminalBegin;
+            parenthesesEndOpCode = OpParenthesesSubpatternTerminalEnd;
+        } else {
+            // This subpattern is not supported by the JIT.
+            m_shouldFallBack = true;
+            return;
+        }
+
+        size_t parenBegin = m_ops.size();
+        m_ops.append(parenthesesBeginOpCode);
+
+        m_ops.append(alternativeBeginOpCode);
+        m_ops.last().m_previousOp = notFound;
+        m_ops.last().m_term = term;
+        Vector<PatternAlternative*>& alternatives =  term->parentheses.disjunction->m_alternatives;
+        for (unsigned i = 0; i < alternatives.size(); ++i) {
+            size_t lastOpIndex = m_ops.size() - 1;
+
+            PatternAlternative* nestedAlternative = alternatives[i];
+            opCompileAlternative(nestedAlternative);
+
+            size_t thisOpIndex = m_ops.size();
+            m_ops.append(YarrOp(alternativeNextOpCode));
+
+            YarrOp& lastOp = m_ops[lastOpIndex];
+            YarrOp& thisOp = m_ops[thisOpIndex];
+
+            lastOp.m_alternative = nestedAlternative;
+            lastOp.m_nextOp = thisOpIndex;
+            thisOp.m_previousOp = lastOpIndex;
+            thisOp.m_term = term;
+        }
+        YarrOp& lastOp = m_ops.last();
+        ASSERT(lastOp.m_op == alternativeNextOpCode);
+        lastOp.m_op = alternativeEndOpCode;
+        lastOp.m_alternative = 0;
+        lastOp.m_nextOp = notFound;
+
+        size_t parenEnd = m_ops.size();
+        m_ops.append(parenthesesEndOpCode);
+
+        m_ops[parenBegin].m_term = term;
+        m_ops[parenBegin].m_previousOp = notFound;
+        m_ops[parenBegin].m_nextOp = parenEnd;
+        m_ops[parenEnd].m_term = term;
+        m_ops[parenEnd].m_previousOp = parenBegin;
+        m_ops[parenEnd].m_nextOp = notFound;
+    }
+
+    // opCompileParentheticalAssertion
+    // Emits ops for a parenthetical assertion. These consist of an
+    // OpSimpleNestedAlternativeBegin/Next/End set of nodes wrapping
+    // the alternatives, with these wrapped by an outer pair of
+    // OpParentheticalAssertionBegin/End nodes.
+    // We can always use the OpSimpleNestedAlternative nodes in the
+    // case of parenthetical assertions since these only ever match
+    // once, and will never backtrack back into the assertion.
+    void opCompileParentheticalAssertion(PatternTerm* term)
+    {
+        size_t parenBegin = m_ops.size();
+        m_ops.append(OpParentheticalAssertionBegin);
+
+        m_ops.append(OpSimpleNestedAlternativeBegin);
+        m_ops.last().m_previousOp = notFound;
+        m_ops.last().m_term = term;
+        Vector<PatternAlternative*>& alternatives =  term->parentheses.disjunction->m_alternatives;
+        for (unsigned i = 0; i < alternatives.size(); ++i) {
+            size_t lastOpIndex = m_ops.size() - 1;
+
+            PatternAlternative* nestedAlternative = alternatives[i];
+            opCompileAlternative(nestedAlternative);
+
+            size_t thisOpIndex = m_ops.size();
+            m_ops.append(YarrOp(OpSimpleNestedAlternativeNext));
+
+            YarrOp& lastOp = m_ops[lastOpIndex];
+            YarrOp& thisOp = m_ops[thisOpIndex];
+
+            lastOp.m_alternative = nestedAlternative;
+            lastOp.m_nextOp = thisOpIndex;
+            thisOp.m_previousOp = lastOpIndex;
+            thisOp.m_term = term;
+        }
+        YarrOp& lastOp = m_ops.last();
+        ASSERT(lastOp.m_op == OpSimpleNestedAlternativeNext);
+        lastOp.m_op = OpSimpleNestedAlternativeEnd;
+        lastOp.m_alternative = 0;
+        lastOp.m_nextOp = notFound;
+
+        size_t parenEnd = m_ops.size();
+        m_ops.append(OpParentheticalAssertionEnd);
+
+        m_ops[parenBegin].m_term = term;
+        m_ops[parenBegin].m_previousOp = notFound;
+        m_ops[parenBegin].m_nextOp = parenEnd;
+        m_ops[parenEnd].m_term = term;
+        m_ops[parenEnd].m_previousOp = parenBegin;
+        m_ops[parenEnd].m_nextOp = notFound;
+    }
+
+    // opCompileAlternative
+    // Called to emit nodes for all terms in an alternative.
+    void opCompileAlternative(PatternAlternative* alternative)
+    {
+        optimizeAlternative(alternative);
+
+        for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
+            PatternTerm* term = &alternative->m_terms[i];
+
+            switch (term->type) {
+            case PatternTerm::TypeParenthesesSubpattern:
+                opCompileParenthesesSubpattern(term);
+                break;
+
+            case PatternTerm::TypeParentheticalAssertion:
+                opCompileParentheticalAssertion(term);
+                break;
+
+            default:
+                m_ops.append(term);
+            }
+        }
+    }
+
+    // opCompileBody
+    // This method compiles the body disjunction of the regular expression.
+    // The body consists of two sets of alternatives - zero or more 'once
+    // through' (BOL anchored) alternatives, followed by zero or more
+    // repeated alternatives.
+    // For each of these two sets of alternatives, if not empty they will be
+    // wrapped in a set of OpBodyAlternativeBegin/Next/End nodes (with the
+    // 'begin' node referencing the first alternative, and 'next' nodes
+    // referencing any further alternatives). The begin/next/end nodes are
+    // linked together in a doubly linked list. In the case of repeating
+    // alternatives, the end node is also linked back to the beginning.
+    // If no repeating alternatives exist, then a OpMatchFailed node exists
+    // to return the failing result.
+    void opCompileBody(PatternDisjunction* disjunction)
+    {
+        Vector<PatternAlternative*>& alternatives =  disjunction->m_alternatives;
+        // Index of the next alternative to emit; shared by both phases below
+        // so the repeated phase resumes where the once-through phase stopped.
+        size_t currentAlternativeIndex = 0;
+
+        // Emit the 'once through' alternatives.
+        if (alternatives.size() && alternatives[0]->onceThrough()) {
+            m_ops.append(YarrOp(OpBodyAlternativeBegin));
+            m_ops.last().m_previousOp = notFound;
+
+            do {
+                // The op that opens this alternative is the most recently
+                // emitted one (the 'begin' node or the previous 'next' node).
+                size_t lastOpIndex = m_ops.size() - 1;
+                PatternAlternative* alternative = alternatives[currentAlternativeIndex];
+                opCompileAlternative(alternative);
+
+                size_t thisOpIndex = m_ops.size();
+                m_ops.append(YarrOp(OpBodyAlternativeNext));
+
+                // References are taken only after the append above, since
+                // appending may reallocate the vector's storage.
+                YarrOp& lastOp = m_ops[lastOpIndex];
+                YarrOp& thisOp = m_ops[thisOpIndex];
+
+                lastOp.m_alternative = alternative;
+                lastOp.m_nextOp = thisOpIndex;
+                thisOp.m_previousOp = lastOpIndex;
+                
+                ++currentAlternativeIndex;
+            } while (currentAlternativeIndex < alternatives.size() && alternatives[currentAlternativeIndex]->onceThrough());
+
+            YarrOp& lastOp = m_ops.last();
+
+            // The trailing 'next' node has no alternative following it in this
+            // set, so it is retagged as the 'end' node.
+            ASSERT(lastOp.m_op == OpBodyAlternativeNext);
+            lastOp.m_op = OpBodyAlternativeEnd;
+            lastOp.m_alternative = 0;
+            lastOp.m_nextOp = notFound;
+        }
+
+        // No repeated alternatives remain (this also covers an empty
+        // disjunction): emit the node that reports failure and stop.
+        if (currentAlternativeIndex == alternatives.size()) {
+            m_ops.append(YarrOp(OpMatchFailed));
+            return;
+        }
+
+        // Emit the repeated alternatives.
+        // repeatLoop remembers the 'begin' node so the 'end' node can link
+        // back to it.
+        size_t repeatLoop = m_ops.size();
+        m_ops.append(YarrOp(OpBodyAlternativeBegin));
+        m_ops.last().m_previousOp = notFound;
+        do {
+            size_t lastOpIndex = m_ops.size() - 1;
+            PatternAlternative* alternative = alternatives[currentAlternativeIndex];
+            // Once-through alternatives are expected to precede all repeated ones.
+            ASSERT(!alternative->onceThrough());
+            opCompileAlternative(alternative);
+
+            size_t thisOpIndex = m_ops.size();
+            m_ops.append(YarrOp(OpBodyAlternativeNext));
+
+            YarrOp& lastOp = m_ops[lastOpIndex];
+            YarrOp& thisOp = m_ops[thisOpIndex];
+
+            lastOp.m_alternative = alternative;
+            lastOp.m_nextOp = thisOpIndex;
+            thisOp.m_previousOp = lastOpIndex;
+            
+            ++currentAlternativeIndex;
+        } while (currentAlternativeIndex < alternatives.size());
+        YarrOp& lastOp = m_ops.last();
+        ASSERT(lastOp.m_op == OpBodyAlternativeNext);
+        lastOp.m_op = OpBodyAlternativeEnd;
+        lastOp.m_alternative = 0;
+        // Unlike the once-through set, the 'end' node links back to the
+        // 'begin' node of the repeated set.
+        lastOp.m_nextOp = repeatLoop;
+    }
+
+    // generateEnter
+    // Emits the function prologue of the generated matcher: saves the
+    // registers that the matching generateReturn() restores and, on some
+    // targets, moves incoming arguments into the registers the generated
+    // code uses. Which registers are involved depends on the target CPU.
+    void generateEnter()
+    {
+#if CPU(X86_64)
+        push(X86Registers::ebp);
+        move(stackPointerRegister, X86Registers::ebp);
+        push(X86Registers::ebx);
+#elif CPU(X86)
+        push(X86Registers::ebp);
+        move(stackPointerRegister, X86Registers::ebp);
+        // TODO: do we need spill registers to fill the output pointer if there are no sub captures?
+        push(X86Registers::ebx);
+        push(X86Registers::edi);
+        push(X86Registers::esi);
+        // load output into edi (2 = saved ebp + return address).
+    #if COMPILER(MSVC)
+        // With MSVC all four arguments are read from the stack, starting just
+        // above the saved ebp and the return address.
+        loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), input);
+        loadPtr(Address(X86Registers::ebp, 3 * sizeof(void*)), index);
+        loadPtr(Address(X86Registers::ebp, 4 * sizeof(void*)), length);
+        if (compileMode == IncludeSubpatterns)
+            loadPtr(Address(X86Registers::ebp, 5 * sizeof(void*)), output);
+    #else
+        // Only 'output' is loaded from the stack here.
+        // NOTE(review): presumably the first three arguments already arrive in
+        // registers via the regparm(3) YARR_CALL convention - confirm.
+        if (compileMode == IncludeSubpatterns)
+            loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), output);
+    #endif
+#elif CPU(ARM)
+        push(ARMRegisters::r4);
+        push(ARMRegisters::r5);
+        push(ARMRegisters::r6);
+#if CPU(ARM_TRADITIONAL)
+        push(ARMRegisters::r8); // scratch register
+#endif
+        // NOTE(review): r3 presumably carries the fourth argument (the output
+        // vector) on entry - confirm against the ARM calling convention.
+        if (compileMode == IncludeSubpatterns)
+            move(ARMRegisters::r3, output);
+#elif CPU(SH4)
+        push(SH4Registers::r11);
+        push(SH4Registers::r13);
+#elif CPU(MIPS)
+        // Do nothing.
+#endif
+    }
+
+    // generateReturn
+    // Emits the function epilogue of the generated matcher: pops the
+    // registers pushed by generateEnter() in the reverse order, then emits
+    // the return instruction.
+    void generateReturn()
+    {
+#if CPU(X86_64)
+        pop(X86Registers::ebx);
+        pop(X86Registers::ebp);
+#elif CPU(X86)
+        pop(X86Registers::esi);
+        pop(X86Registers::edi);
+        pop(X86Registers::ebx);
+        pop(X86Registers::ebp);
+#elif CPU(ARM)
+#if CPU(ARM_TRADITIONAL)
+        pop(ARMRegisters::r8); // scratch register
+#endif
+        pop(ARMRegisters::r6);
+        pop(ARMRegisters::r5);
+        pop(ARMRegisters::r4);
+#elif CPU(SH4)
+        pop(SH4Registers::r13);
+        pop(SH4Registers::r11);
+#elif CPU(MIPS)
+        // Do nothing
+#endif
+        ret();
+    }
+
+public:
+    // Creates a generator for 'pattern' that emits code for subject strings
+    // of the given character size. The scale stored in m_charScale is derived
+    // from that size: TimesOne for Char8, TimesTwo otherwise.
+    YarrGenerator(YarrPattern& pattern, YarrCharSize charSize)
+        : m_pattern(pattern)
+        , m_charSize(charSize)
+        , m_charScale(m_charSize == Char8 ? TimesOne: TimesTwo)
+        , m_shouldFallBack(false)
+        , m_checked(0)
+    {
+    }
+
+    // compile
+    // Generates machine code for m_pattern and stores the result in
+    // 'jitObject'. If an unsupported construct is found while building the
+    // YarrOp list, jitObject is flagged as needing fallback and no code is
+    // installed.
+    void compile(JSGlobalData* globalData, YarrCodeBlock& jitObject)
+    {
+        generateEnter();
+
+        // The code emitted between checkInput() and hasInput.link() is the
+        // path taken when the jump is not followed: it loads notFound / 0 into
+        // the return registers and returns.
+        Jump hasInput = checkInput();
+        move(TrustedImmPtr((void*)WTF::notFound), returnRegister);
+        move(TrustedImm32(0), returnRegister2);
+        generateReturn();
+        hasInput.link(this);
+
+        // Store -1 into every even-indexed int slot of the output vector, one
+        // per subpattern plus one for the overall match.
+        if (compileMode == IncludeSubpatterns) {
+            for (unsigned i = 0; i < m_pattern.m_numSubpatterns + 1; ++i)
+                store32(TrustedImm32(-1), Address(output, (i << 1) * sizeof(int)));
+        }
+
+        if (!m_pattern.m_body->m_hasFixedSize)
+            setMatchStart(index);
+
+        initCallFrame();
+
+        // Compile the pattern to the internal 'YarrOp' representation.
+        opCompileBody(m_pattern.m_body);
+
+        // If we encountered anything we can't handle in the JIT code
+        // (e.g. backreferences) then return early.
+        if (m_shouldFallBack) {
+            jitObject.setFallBack(true);
+            return;
+        }
+
+        generate();
+        backtrack();
+
+        // Link & finalize the code.
+        LinkBuffer linkBuffer(*globalData, this, REGEXP_CODE_ID);
+        m_backtrackingState.linkDataLabels(linkBuffer);
+
+        // Install the finalized code in the slot matching this generator's
+        // compile mode and character size.
+        if (compileMode == MatchOnly) {
+            if (m_charSize == Char8)
+                jitObject.set8BitCodeMatchOnly(FINALIZE_CODE(linkBuffer, ("Match-only 8-bit regular expression")));
+            else
+                jitObject.set16BitCodeMatchOnly(FINALIZE_CODE(linkBuffer, ("Match-only 16-bit regular expression")));
+        } else {
+            if (m_charSize == Char8)
+                jitObject.set8BitCode(FINALIZE_CODE(linkBuffer, ("8-bit regular expression")));
+            else
+                jitObject.set16BitCode(FINALIZE_CODE(linkBuffer, ("16-bit regular expression")));
+        }
+        // m_shouldFallBack is re-read here rather than assumed false.
+        // NOTE(review): presumably generate()/backtrack() can also set it - confirm.
+        jitObject.setFallBack(m_shouldFallBack);
+    }
+
+private:
+    YarrPattern& m_pattern;
+
+    YarrCharSize m_charSize;
+
+    Scale m_charScale;
+
+    // Used to detect regular expression constructs that are not currently
+    // supported in the JIT; fall back to the interpreter when this is detected.
+    bool m_shouldFallBack;
+
+    // The regular expression expressed as a linear sequence of operations.
+    Vector<YarrOp, 128> m_ops;
+
+    // This records the current input offset being applied due to the current
+    // set of alternatives we are nested within. E.g. when matching the
+    // character 'b' within the regular expression /abc/, we will know that
+    // the minimum size for the alternative is 3, checked upon entry to the
+    // alternative, and that 'b' is at offset 1 from the start, and as such
+    // when matching 'b' we need to apply an offset of -2 to the load.
+    //
+    // FIXME: This should go away. Rather than tracking this value throughout
+    // code generation, we should gather this information up front & store it
+    // on the YarrOp structure.
+    int m_checked;
+
+    // This class records state whilst generating the backtracking path of code.
+    BacktrackingState m_backtrackingState;
+};
+
+// Compiles 'pattern' for subject strings of the given character size and
+// stores the result in 'jitObject'. The compile mode is a template argument
+// of YarrGenerator, so the runtime 'mode' value is dispatched here to the
+// matching instantiation.
+void jitCompile(YarrPattern& pattern, YarrCharSize charSize, JSGlobalData* globalData, YarrCodeBlock& jitObject, YarrJITCompileMode mode)
+{
+    if (mode != MatchOnly) {
+        YarrGenerator<IncludeSubpatterns> capturingGenerator(pattern, charSize);
+        capturingGenerator.compile(globalData, jitObject);
+        return;
+    }
+
+    YarrGenerator<MatchOnly> matchOnlyGenerator(pattern, charSize);
+    matchOnlyGenerator.compile(globalData, jitObject);
+}
+
+}}
+
+#endif
diff --git a/masm/yarr/YarrJIT.h b/masm/yarr/YarrJIT.h

new file mode 100644 (file)

index 0000000..bb7033f
--- /dev/null
+++ b/masm/yarr/YarrJIT.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef YarrJIT_h
+#define YarrJIT_h
+
+#if ENABLE(YARR_JIT)
+
+#include "JSGlobalData.h"
+#include "MacroAssemblerCodeRef.h"
+#include "MatchResult.h"
+#include "Yarr.h"
+#include "YarrPattern.h"
+
+#if CPU(X86) && !COMPILER(MSVC)
+#define YARR_CALL __attribute__ ((regparm (3)))
+#else
+#define YARR_CALL
+#endif
+
+namespace JSC {
+
+class JSGlobalData;
+class ExecutablePool;
+
+namespace Yarr {
+
+class YarrCodeBlock {
+#if CPU(X86_64)
+    typedef MatchResult (*YarrJITCode8)(const LChar* input, unsigned start, unsigned length, int* output) YARR_CALL;
+    typedef MatchResult (*YarrJITCode16)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL;
+    typedef MatchResult (*YarrJITCodeMatchOnly8)(const LChar* input, unsigned start, unsigned length) YARR_CALL;
+    typedef MatchResult (*YarrJITCodeMatchOnly16)(const UChar* input, unsigned start, unsigned length) YARR_CALL;
+#else
+    typedef EncodedMatchResult (*YarrJITCode8)(const LChar* input, unsigned start, unsigned length, int* output) YARR_CALL;
+    typedef EncodedMatchResult (*YarrJITCode16)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL;
+    typedef EncodedMatchResult (*YarrJITCodeMatchOnly8)(const LChar* input, unsigned start, unsigned length) YARR_CALL;
+    typedef EncodedMatchResult (*YarrJITCodeMatchOnly16)(const UChar* input, unsigned start, unsigned length) YARR_CALL;
+#endif
+
+public:
+    YarrCodeBlock()
+        : m_needFallBack(false)
+    {
+    }
+
+    ~YarrCodeBlock()
+    {
+    }
+
+    void setFallBack(bool fallback) { m_needFallBack = fallback; }
+    bool isFallBack() { return m_needFallBack; }
+
+    bool has8BitCode() { return m_ref8.size(); }
+    bool has16BitCode() { return m_ref16.size(); }
+    void set8BitCode(MacroAssemblerCodeRef ref) { m_ref8 = ref; }
+    void set16BitCode(MacroAssemblerCodeRef ref) { m_ref16 = ref; }
+
+    bool has8BitCodeMatchOnly() { return m_matchOnly8.size(); }
+    bool has16BitCodeMatchOnly() { return m_matchOnly16.size(); }
+    void set8BitCodeMatchOnly(MacroAssemblerCodeRef matchOnly) { m_matchOnly8 = matchOnly; }
+    void set16BitCodeMatchOnly(MacroAssemblerCodeRef matchOnly) { m_matchOnly16 = matchOnly; }
+
+    MatchResult execute(const LChar* input, unsigned start, unsigned length, int* output)
+    {
+        ASSERT(has8BitCode());
+        return MatchResult(reinterpret_cast<YarrJITCode8>(m_ref8.code().executableAddress())(input, start, length, output));
+    }
+
+    MatchResult execute(const UChar* input, unsigned start, unsigned length, int* output)
+    {
+        ASSERT(has16BitCode());
+        return MatchResult(reinterpret_cast<YarrJITCode16>(m_ref16.code().executableAddress())(input, start, length, output));
+    }
+
+    MatchResult execute(const LChar* input, unsigned start, unsigned length)
+    {
+        ASSERT(has8BitCodeMatchOnly());
+        return MatchResult(reinterpret_cast<YarrJITCodeMatchOnly8>(m_matchOnly8.code().executableAddress())(input, start, length));
+    }
+
+    MatchResult execute(const UChar* input, unsigned start, unsigned length)
+    {
+        ASSERT(has16BitCodeMatchOnly());
+        return MatchResult(reinterpret_cast<YarrJITCodeMatchOnly16>(m_matchOnly16.code().executableAddress())(input, start, length));
+    }
+
+#if ENABLE(REGEXP_TRACING)
+    void *getAddr() { return m_ref.code().executableAddress(); }
+#endif
+
+    void clear()
+    {
+        m_ref8 = MacroAssemblerCodeRef();
+        m_ref16 = MacroAssemblerCodeRef();
+        m_matchOnly8 = MacroAssemblerCodeRef();
+        m_matchOnly16 = MacroAssemblerCodeRef();
+        m_needFallBack = false;
+    }
+
+private:
+    MacroAssemblerCodeRef m_ref8;
+    MacroAssemblerCodeRef m_ref16;
+    MacroAssemblerCodeRef m_matchOnly8;
+    MacroAssemblerCodeRef m_matchOnly16;
+    bool m_needFallBack;
+};
+
+// Selects which variant of the matcher jitCompile() generates.
+enum YarrJITCompileMode {
+    // Generated entry point takes no output vector.
+    MatchOnly,
+    // Generated entry point also takes an int* output vector.
+    IncludeSubpatterns
+};
+void jitCompile(YarrPattern&, YarrCharSize, JSGlobalData*, YarrCodeBlock& jitObject, YarrJITCompileMode = IncludeSubpatterns);
+
+} } // namespace JSC::Yarr
+
+#endif
+
+#endif // YarrJIT_h
diff --git a/masm/yarr/YarrParser.h b/masm/yarr/YarrParser.h

new file mode 100644 (file)

index 0000000..4bab1a0
--- /dev/null
+++ b/masm/yarr/YarrParser.h
@@ -0,0 +1,880 @@
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef YarrParser_h
+#define YarrParser_h
+
+#include "Yarr.h"
+#include <wtf/ASCIICType.h>
+#include <wtf/text/WTFString.h>
+#include <wtf/unicode/Unicode.h>
+
+namespace JSC { namespace Yarr {
+
+#define REGEXP_ERROR_PREFIX "Invalid regular expression: "
+
+// Identifies a built-in character class when it is reported to a delegate
+// via atomBuiltInCharacterClass(); inversion is passed as a separate flag.
+enum BuiltInCharacterClassID {
+    DigitClassID,   // \d and \D
+    SpaceClassID,   // \s and \S
+    WordClassID,    // \w and \W
+    NewlineClassID, // NOTE(review): not produced by parseEscape(); presumably used for '.' - confirm.
+};
+
+// The Parser class should not be used directly - only via the Yarr::parse() method.
+template<class Delegate, typename CharType>
+class Parser {
+private:
+    template<class FriendDelegate>
+    friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit);
+
+    // Parse error kinds recorded in m_err. NoError is zero, so the error
+    // state can be tested as a boolean (see the ASSERT(!m_err) checks).
+    enum ErrorCode {
+        NoError,
+        PatternTooLarge,
+        QuantifierOutOfOrder,
+        QuantifierWithoutAtom,
+        QuantifierTooLarge,
+        MissingParentheses,
+        ParenthesesUnmatched,
+        ParenthesesTypeInvalid,
+        CharacterClassUnmatched,
+        CharacterClassOutOfOrder, // range whose end character is less than its start, e.g. /[z-a]/
+        EscapeUnterminated,       // pattern ends directly after a backslash
+        NumberOfErrorCodes        // NOTE(review): presumably a count sentinel, not a real error - confirm.
+    };
+
+    /*
+     * CharacterClassParserDelegate:
+     *
+     * The class CharacterClassParserDelegate is used in the parsing of character
+     * classes.  This class handles detection of character ranges.  This class
+     * implements enough of the delegate interface such that it can be passed to
+     * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
+     * to perform the parsing of escape characters in character sets.
+     */
+    class CharacterClassParserDelegate {
+    public:
+        // 'delegate' receives the character class atoms; 'err' is a reference
+        // to the error slot that is written when an out-of-order range is seen.
+        CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
+            : m_delegate(delegate)
+            , m_err(err)
+            , m_state(Empty)
+            , m_character(0)
+        {
+        }
+
+        /*
+         * begin():
+         *
+         * Called at beginning of construction.
+         */
+        void begin(bool invert)
+        {
+            m_delegate.atomCharacterClassBegin(invert);
+        }
+
+        /*
+         * atomPatternCharacter():
+         *
+         * This method is called either from parseCharacterClass() (for an unescaped
+         * character in a character class), or from parseEscape(). In the former case
+         * the value true will be passed for the argument 'hyphenIsRange', and in this
+         * mode we will allow a hyphen to be treated as indicating a range (i.e. /[a-z]/
+         * is different to /[a\-z]/).
+         */
+        void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
+        {
+            switch (m_state) {
+            case AfterCharacterClass:
+                // Following a builtin character class we need look out for a hyphen.
+                // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
+                // If we see a hyphen following a character class then unlike usual
+                // we'll report it to the delegate immediately, and put ourself into
+                // a poisoned state. Any following calls to add another character or
+                // character class will result in an error. (A hyphen following a
+                // character-class is itself valid, but only  at the end of a regex).
+                if (hyphenIsRange && ch == '-') {
+                    m_delegate.atomCharacterClassAtom('-');
+                    m_state = AfterCharacterClassHyphen;
+                    return;
+                }
+                // Otherwise just fall through - cached character so treat this as Empty.
+
+            case Empty:
+                // Hold the character back: it may turn out to be the start of a range.
+                m_character = ch;
+                m_state = CachedCharacter;
+                return;
+
+            case CachedCharacter:
+                if (hyphenIsRange && ch == '-')
+                    m_state = CachedCharacterHyphen;
+                else {
+                    // Not a range: flush the cached character and cache the new one.
+                    m_delegate.atomCharacterClassAtom(m_character);
+                    m_character = ch;
+                }
+                return;
+
+            case CachedCharacterHyphen:
+                // 'ch' completes a range that starts at the cached character.
+                if (ch < m_character) {
+                    m_err = CharacterClassOutOfOrder;
+                    return;
+                }
+                m_delegate.atomCharacterClassRange(m_character, ch);
+                m_state = Empty;
+                return;
+
+                // See comment in atomBuiltInCharacterClass below.
+                // This too is technically an error, per ECMA-262, and again
+                // we chose to allow this.  Note a subtlety here that while we
+                // diverge from the spec's definition of CharacterRange we do
+                // remain in compliance with the grammar.  For example, consider
+                // the expression /[\d-a-z]/.  We comply with the grammar in
+                // this case by not allowing a-z to be matched as a range.
+            case AfterCharacterClassHyphen:
+                m_delegate.atomCharacterClassAtom(ch);
+                m_state = Empty;
+                return;
+            }
+        }
+
+        /*
+         * atomBuiltInCharacterClass():
+         *
+         * Adds a built-in character class, called by parseEscape().
+         */
+        void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
+        {
+            switch (m_state) {
+            case CachedCharacter:
+                // Flush the currently cached character, then fall through.
+                m_delegate.atomCharacterClassAtom(m_character);
+
+            case Empty:
+            case AfterCharacterClass:
+                m_state = AfterCharacterClass;
+                m_delegate.atomCharacterClassBuiltIn(classID, invert);
+                return;
+
+                // If we hit either of these cases, we have an invalid range that
+                // looks something like /[x-\d]/ or /[\d-\d]/.
+                // According to ECMA-262 this should be a syntax error, but
+                // empirical testing shows this to break the web.  Instead we
+                // comply with the ECMA-262 grammar, and assume the grammar to
+                // have matched the range correctly, but tweak our interpretation
+                // of CharacterRange.  Effectively we implicitly handle the hyphen
+                // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
+            case CachedCharacterHyphen:
+                m_delegate.atomCharacterClassAtom(m_character);
+                m_delegate.atomCharacterClassAtom('-');
+                // fall through
+            case AfterCharacterClassHyphen:
+                m_delegate.atomCharacterClassBuiltIn(classID, invert);
+                m_state = Empty;
+                return;
+            }
+        }
+
+        /*
+         * end():
+         *
+         * Called at end of construction.
+         */
+        void end()
+        {
+            // Flush whatever is still cached: a lone character, or a character
+            // followed by a trailing hyphen (which is then a literal '-').
+            if (m_state == CachedCharacter)
+                m_delegate.atomCharacterClassAtom(m_character);
+            else if (m_state == CachedCharacterHyphen) {
+                m_delegate.atomCharacterClassAtom(m_character);
+                m_delegate.atomCharacterClassAtom('-');
+            }
+            m_delegate.atomCharacterClassEnd();
+        }
+
+        // parseEscape() should never call these delegate methods when
+        // invoked with inCharacterClass set.
+        NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
+        NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
+
+    private:
+        Delegate& m_delegate;
+        ErrorCode& m_err;
+        // Progress of the range detection; see atomPatternCharacter() and
+        // atomBuiltInCharacterClass() for the transitions.
+        enum CharacterClassConstructionState {
+            Empty,                     // nothing pending
+            CachedCharacter,           // m_character is held back, not yet reported
+            CachedCharacterHyphen,     // m_character followed by a range hyphen
+            AfterCharacterClass,       // a built-in class was just reported
+            AfterCharacterClassHyphen, // built-in class followed by an already reported '-'
+        } m_state;
+        // The held-back character; meaningful in the CachedCharacter* states.
+        UChar m_character;
+    };
+
+    // Sets up a parser over 'pattern', starting at index 0 with no error and
+    // no open parentheses. Tokens are reported to 'delegate'. A decimal
+    // escape is treated as a backreference only if its number does not
+    // exceed 'backReferenceLimit' (see parseEscape()).
+    Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit)
+        : m_delegate(delegate)
+        , m_backReferenceLimit(backReferenceLimit)
+        , m_err(NoError)
+        , m_data(pattern.getCharacters<CharType>())
+        , m_size(pattern.length())
+        , m_index(0)
+        , m_parenthesesNestingDepth(0)
+    {
+    }
+
+    /*
+     * parseEscape():
+     *
+     * Helper for parseTokens() AND parseCharacterClass().
+     * Unlike the other parser methods, this function does not report tokens
+     * directly to the member delegate (m_delegate), instead tokens are
+     * emitted to the delegate provided as an argument.  In the case of atom
+     * escapes, parseTokens() will call parseEscape() passing m_delegate as
+     * an argument, and as such the escape will be reported to the delegate.
+     *
+     * However this method may also be used by parseCharacterClass(), in which
+     * case a CharacterClassParserDelegate will be passed as the delegate that
+     * tokens should be added to.  A boolean flag is also provided to indicate
+     * whether an escape in a CharacterClass is being parsed (some parsing
+     * rules change in this context).
+     *
+     * The boolean value returned by this method indicates whether the token
+     * parsed was an atom (outside of a character class \b and \B will be
+     * interpreted as assertions).
+     */
+    template<bool inCharacterClass, class EscapeDelegate>
+    bool parseEscape(EscapeDelegate& delegate)
+    {
+        ASSERT(!m_err);
+        ASSERT(peek() == '\\');
+        // Step over the backslash; the switch below dispatches on the
+        // character that follows it.
+        consume();
+
+        if (atEndOfPattern()) {
+            m_err = EscapeUnterminated;
+            return false;
+        }
+
+        switch (peek()) {
+        // Assertions
+        case 'b':
+            consume();
+            // Inside a character class \b is the backspace character, not a
+            // word boundary.
+            if (inCharacterClass)
+                delegate.atomPatternCharacter('\b');
+            else {
+                delegate.assertionWordBoundary(false);
+                return false;
+            }
+            break;
+        case 'B':
+            consume();
+            // Inside a character class \B is reported as a literal 'B'.
+            if (inCharacterClass)
+                delegate.atomPatternCharacter('B');
+            else {
+                delegate.assertionWordBoundary(true);
+                return false;
+            }
+            break;
+
+        // CharacterClassEscape
+        // (the second argument is the 'invert' flag: upper-case forms invert)
+        case 'd':
+            consume();
+            delegate.atomBuiltInCharacterClass(DigitClassID, false);
+            break;
+        case 's':
+            consume();
+            delegate.atomBuiltInCharacterClass(SpaceClassID, false);
+            break;
+        case 'w':
+            consume();
+            delegate.atomBuiltInCharacterClass(WordClassID, false);
+            break;
+        case 'D':
+            consume();
+            delegate.atomBuiltInCharacterClass(DigitClassID, true);
+            break;
+        case 'S':
+            consume();
+            delegate.atomBuiltInCharacterClass(SpaceClassID, true);
+            break;
+        case 'W':
+            consume();
+            delegate.atomBuiltInCharacterClass(WordClassID, true);
+            break;
+
+        // DecimalEscape
+        case '1':
+        case '2':
+        case '3':
+        case '4':
+        case '5':
+        case '6':
+        case '7':
+        case '8':
+        case '9': {
+            // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
+            // First, try to parse this as backreference.
+            if (!inCharacterClass) {
+                ParseState state = saveState();
+
+                unsigned backReference = consumeNumber();
+                if (backReference <= m_backReferenceLimit) {
+                    delegate.atomBackReference(backReference);
+                    break;
+                }
+
+                // Number too large to be a backreference: undo the digits
+                // consumed above and reinterpret them below.
+                restoreState(state);
+            }
+            
+            // Not a backreference, and not octal.
+            // Only a literal backslash is reported here; the digit itself is
+            // not consumed, so it stays in the input for the caller.
+            if (peek() >= '8') {
+                delegate.atomPatternCharacter('\\');
+                break;
+            }
+
+            // Fall-through to handle this as an octal escape.
+        }
+
+        // Octal escape
+        case '0':
+            delegate.atomPatternCharacter(consumeOctal());
+            break;
+
+        // ControlEscape
+        case 'f':
+            consume();
+            delegate.atomPatternCharacter('\f');
+            break;
+        case 'n':
+            consume();
+            delegate.atomPatternCharacter('\n');
+            break;
+        case 'r':
+            consume();
+            delegate.atomPatternCharacter('\r');
+            break;
+        case 't':
+            consume();
+            delegate.atomPatternCharacter('\t');
+            break;
+        case 'v':
+            consume();
+            delegate.atomPatternCharacter('\v');
+            break;
+
+        // ControlLetter
+        case 'c': {
+            // The state is saved before the 'c' is consumed so that an invalid
+            // control escape can be undone below.
+            ParseState state = saveState();
+            consume();
+            if (!atEndOfPattern()) {
+                int control = consume();
+
+                // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
+                if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
+                    // The control character is the low five bits of the letter.
+                    delegate.atomPatternCharacter(control & 0x1f);
+                    break;
+                }
+            }
+            // Not a valid control escape: restore the saved state and report
+            // just a literal backslash.
+            restoreState(state);
+            delegate.atomPatternCharacter('\\');
+            break;
+        }
+
+        // HexEscape
+        case 'x': {
+            consume();
+            // tryConsumeHex() reports failure as -1; the escape then degrades
+            // to a literal 'x'.
+            int x = tryConsumeHex(2);
+            if (x == -1)
+                delegate.atomPatternCharacter('x');
+            else
+                delegate.atomPatternCharacter(x);
+            break;
+        }
+
+        // UnicodeEscape
+        case 'u': {
+            consume();
+            // Same scheme as \x, with four hex digits and a literal 'u' on failure.
+            int u = tryConsumeHex(4);
+            if (u == -1)
+                delegate.atomPatternCharacter('u');
+            else
+                delegate.atomPatternCharacter(u);
+            break;
+        }
+
+        // IdentityEscape
+        default:
+            delegate.atomPatternCharacter(consume());
+        }
+        
+        // Every path that reaches this point reported an atom.
+        return true;
+    }
+
+    /*
+     * parseAtomEscape(), parseCharacterClassEscape():
+     *
+     * These methods alias to parseEscape().
+     */
+    bool parseAtomEscape()
+    {
+        // Escape outside a character class: instantiate parseEscape() with
+        // inCharacterClass = false and report directly to the pattern delegate.
+        // parseTokens() stores the result as its "last token was an atom" flag.
+        return parseEscape<false>(m_delegate);
+    }
+    void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
+    {
+        // Escape inside a character class: instantiate parseEscape() with
+        // inCharacterClass = true and report to the character class delegate.
+        // The bool returned by parseEscape() is not needed here and is dropped.
+        parseEscape<true>(delegate);
+    }
+
+    /*
+     * parseCharacterClass():
+     *
+     * Helper for parseTokens(); calls directly and indirectly (via parseCharacterClassEscape)
+     * to an instance of CharacterClassParserDelegate, to describe the character class to the
+     * delegate.
+     */
+    void parseCharacterClass()
+    {
+        ASSERT(!m_err);
+        ASSERT(peek() == '[');
+        consume();
+
+        CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
+
+        characterClassConstructor.begin(tryConsume('^'));
+
+        while (!atEndOfPattern()) {
+            switch (peek()) {
+            case ']':
+                consume();
+                characterClassConstructor.end();
+                return;
+
+            case '\\':
+                parseCharacterClassEscape(characterClassConstructor);
+                break;
+
+            default:
+                characterClassConstructor.atomPatternCharacter(consume(), true);
+            }
+
+            if (m_err)
+                return;
+        }
+
+        m_err = CharacterClassUnmatched;
+    }
+
+    /*
+     * parseParenthesesBegin():
+     *
+     * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
+     */
+    void parseParenthesesBegin()
+    {
+        ASSERT(!m_err);
+        ASSERT(peek() == '(');
+        consume();
+
+        if (tryConsume('?')) {
+            if (atEndOfPattern()) {
+                m_err = ParenthesesTypeInvalid;
+                return;
+            }
+
+            switch (consume()) {
+            case ':':
+                m_delegate.atomParenthesesSubpatternBegin(false);
+                break;
+            
+            case '=':
+                m_delegate.atomParentheticalAssertionBegin();
+                break;
+
+            case '!':
+                m_delegate.atomParentheticalAssertionBegin(true);
+                break;
+            
+            default:
+                m_err = ParenthesesTypeInvalid;
+            }
+        } else
+            m_delegate.atomParenthesesSubpatternBegin();
+
+        ++m_parenthesesNestingDepth;
+    }
+
+    /*
+     * parseParenthesesEnd():
+     *
+     * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
+     */
+    void parseParenthesesEnd()
+    {
+        ASSERT(!m_err);
+        ASSERT(peek() == ')');
+        consume();
+
+        // A ')' with no open group is a parse error.  Return before touching the
+        // counter: it is unsigned, so decrementing it at zero would wrap around
+        // to a huge bogus nesting depth.
+        if (!m_parenthesesNestingDepth) {
+            m_err = ParenthesesUnmatched;
+            return;
+        }
+
+        m_delegate.atomParenthesesEnd();
+        --m_parenthesesNestingDepth;
+    }
+
+    /*
+     * parseQuantifier():
+     *
+     * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
+     */
+    void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
+    {
+        ASSERT(!m_err);
+        ASSERT(min <= max);
+
+        if (min == UINT_MAX) {
+            m_err = QuantifierTooLarge;
+            return;
+        }
+
+        if (lastTokenWasAnAtom)
+            m_delegate.quantifyAtom(min, max, !tryConsume('?'));
+        else
+            m_err = QuantifierWithoutAtom;
+    }
+
+    /*
+     * parseTokens():
+     *
+     * This method loops over the input pattern reporting tokens to the delegate.
+     * The method returns when a parse error is detected, or the end of the pattern
+     * is reached.  One piece of state is tracked around the loop, which is whether
+     * the last token passed to the delegate was an atom (this is necessary to detect
+     * a parse error when a quantifier provided without an atom to quantify).
+     */
+    void parseTokens()
+    {
+        // Whether the token most recently reported to the delegate may be
+        // followed by a quantifier; passed on to parseQuantifier().
+        bool lastTokenWasAnAtom = false;
+
+        while (!atEndOfPattern()) {
+            switch (peek()) {
+            case '|':
+                consume();
+                m_delegate.disjunction();
+                lastTokenWasAnAtom = false;
+                break;
+
+            case '(':
+                parseParenthesesBegin();
+                lastTokenWasAnAtom = false;
+                break;
+
+            case ')':
+                // A closed group counts as an atom and may be quantified.
+                parseParenthesesEnd();
+                lastTokenWasAnAtom = true;
+                break;
+
+            case '^':
+                consume();
+                m_delegate.assertionBOL();
+                lastTokenWasAnAtom = false;
+                break;
+
+            case '$':
+                consume();
+                m_delegate.assertionEOL();
+                lastTokenWasAnAtom = false;
+                break;
+
+            case '.':
+                // '.' is reported as the inverted built-in newline class.
+                consume();
+                m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
+                lastTokenWasAnAtom = true;
+                break;
+
+            case '[':
+                parseCharacterClass();
+                lastTokenWasAnAtom = true;
+                break;
+
+            case '\\':
+                // The escape parser reports whether what it emitted was an atom.
+                lastTokenWasAnAtom = parseAtomEscape();
+                break;
+
+            case '*':
+                consume();
+                parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
+                lastTokenWasAnAtom = false;
+                break;
+
+            case '+':
+                consume();
+                parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
+                lastTokenWasAnAtom = false;
+                break;
+
+            case '?':
+                consume();
+                parseQuantifier(lastTokenWasAnAtom, 0, 1);
+                lastTokenWasAnAtom = false;
+                break;
+
+            case '{': {
+                // Try to read "{min}", "{min,}" or "{min,max}".  The position is
+                // saved first so that an incomplete form can be rewound and the
+                // '{' treated as an ordinary pattern character instead.
+                ParseState state = saveState();
+
+                consume();
+                if (peekIsDigit()) {
+                    unsigned min = consumeNumber();
+                    unsigned max = min;
+                    
+                    // "{min,}" has no upper bound.
+                    if (tryConsume(','))
+                        max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
+
+                    if (tryConsume('}')) {
+                        if (min <= max)
+                            parseQuantifier(lastTokenWasAnAtom, min, max);
+                        else
+                            m_err = QuantifierOutOfOrder;
+                        lastTokenWasAnAtom = false;
+                        break;
+                    }
+                }
+
+                restoreState(state);
+            } // if we did not find a complete quantifier, fall through to the default case.
+
+            default:
+                m_delegate.atomPatternCharacter(consume());
+                lastTokenWasAnAtom = true;
+            }
+
+            // Any helper above may have recorded an error; stop at the first one.
+            if (m_err)
+                return;
+        }
+
+        // End of input reached with groups still open.
+        if (m_parenthesesNestingDepth > 0)
+            m_err = MissingParentheses;
+    }
+
+    /*
+     * parse():
+     *
+     * This method calls parseTokens() to parse over the input and converts any
+     * error code to a const char* for a result.
+     */
+    const char* parse()
+    {
+        if (m_size > MAX_PATTERN_SIZE)
+            m_err = PatternTooLarge;
+        else
+            parseTokens();
+        ASSERT(atEndOfPattern() || m_err);
+
+        // The order of this array must match the ErrorCode enum.
+        static const char* errorMessages[NumberOfErrorCodes] = {
+            0, // NoError
+            REGEXP_ERROR_PREFIX "regular expression too large",
+            REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
+            REGEXP_ERROR_PREFIX "nothing to repeat",
+            REGEXP_ERROR_PREFIX "number too large in {} quantifier",
+            REGEXP_ERROR_PREFIX "missing )",
+            REGEXP_ERROR_PREFIX "unmatched parentheses",
+            REGEXP_ERROR_PREFIX "unrecognized character after (?",
+            REGEXP_ERROR_PREFIX "missing terminating ] for character class",
+            REGEXP_ERROR_PREFIX "range out of order in character class",
+            REGEXP_ERROR_PREFIX "\\ at end of pattern"
+        };
+
+        return errorMessages[m_err];
+    }
+
+    // Misc helper functions:
+
+    // A saved parse position; currently just the index into the pattern.
+    typedef unsigned ParseState;
+    
+    // Returns the current read position so that it can be restored later.
+    ParseState saveState()
+    {
+        return m_index;
+    }
+
+    // Rewinds (or moves) the read position to one obtained from saveState().
+    void restoreState(ParseState state)
+    {
+        m_index = state;
+    }
+
+    // True once every character of the pattern has been consumed.
+    bool atEndOfPattern()
+    {
+        ASSERT(m_index <= m_size);
+        return m_index == m_size;
+    }
+
+    // Returns the next character without consuming it.  Must not be called at
+    // the end of the pattern (asserted).
+    int peek()
+    {
+        ASSERT(m_index < m_size);
+        return m_data[m_index];
+    }
+
+    bool peekIsDigit()
+    {
+        // Safe to call at the end of the pattern: there is no next character
+        // then, so the answer is simply "no".
+        if (atEndOfPattern())
+            return false;
+        return WTF::isASCIIDigit(peek());
+    }
+
+    // Returns the numeric value (0-9) of the next character without consuming
+    // it.  The caller must have established that it is an ASCII digit.
+    unsigned peekDigit()
+    {
+        ASSERT(peekIsDigit());
+        return peek() - '0';
+    }
+
+    // Returns the next character and advances past it.  Must not be called at
+    // the end of the pattern (asserted).
+    int consume()
+    {
+        ASSERT(m_index < m_size);
+        return m_data[m_index++];
+    }
+
+    // Consumes the next character, which must be an ASCII digit, and returns
+    // its numeric value (0-9).
+    unsigned consumeDigit()
+    {
+        ASSERT(peekIsDigit());
+        return consume() - '0';
+    }
+
+    unsigned consumeNumber()
+    {
+        unsigned n = consumeDigit();
+        // check for overflow.
+        for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
+            n = newValue;
+            consume();
+        }
+        return n;
+    }
+
+    unsigned consumeOctal()
+    {
+        ASSERT(WTF::isASCIIOctalDigit(peek()));
+
+        // Accumulate octal digits.  Accumulation stops once the value has
+        // reached 32, so the result never exceeds 31 * 8 + 7 = 255.
+        unsigned value = consumeDigit();
+        for (;;) {
+            if (value >= 32)
+                break;
+            if (atEndOfPattern() || !WTF::isASCIIOctalDigit(peek()))
+                break;
+            value = value * 8 + consumeDigit();
+        }
+        return value;
+    }
+
+    bool tryConsume(UChar ch)
+    {
+        // Consume the next character only if there is one and it equals ch;
+        // otherwise leave the position untouched.
+        if (!atEndOfPattern() && m_data[m_index] == ch) {
+            ++m_index;
+            return true;
+        }
+        return false;
+    }
+
+    int tryConsumeHex(int count)
+    {
+        ParseState state = saveState();
+
+        int n = 0;
+        while (count--) {
+            if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
+                restoreState(state);
+                return -1;
+            }
+            n = (n << 4) | WTF::toASCIIHexValue(consume());
+        }
+        return n;
+    }
+
+    // Receiver of the parse callbacks.
+    Delegate& m_delegate;
+    // A numeric escape is reported as a backreference only if its value does not
+    // exceed this limit (see the digit cases of parseEscape()).
+    unsigned m_backReferenceLimit;
+    // First parse error recorded; zero (NoError) while parsing is succeeding.
+    ErrorCode m_err;
+    // The pattern's characters, their count, and the index of the next one to read.
+    const CharType* m_data;
+    unsigned m_size;
+    unsigned m_index;
+    // Incremented by parseParenthesesBegin(), decremented by parseParenthesesEnd().
+    unsigned m_parenthesesNestingDepth;
+
+    // Derived by empirical testing of compile time in PCRE and WREC.
+    static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
+};
+
+/*
+ * Yarr::parse():
+ *
+ * The parse method is passed a pattern to be parsed and a delegate upon which
+ * callbacks will be made to record the parsed tokens forming the regex.
+ * Yarr::parse() returns null on success, or a const C string providing an error
+ * message where a parse error occurs.
+ *
+ * The Delegate must implement the following interface:
+ *
+ *    void assertionBOL();
+ *    void assertionEOL();
+ *    void assertionWordBoundary(bool invert);
+ *
+ *    void atomPatternCharacter(UChar ch);
+ *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
+ *    void atomCharacterClassBegin(bool invert)
+ *    void atomCharacterClassAtom(UChar ch)
+ *    void atomCharacterClassRange(UChar begin, UChar end)
+ *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
+ *    void atomCharacterClassEnd()
+ *    void atomParenthesesSubpatternBegin(bool capture = true);
+ *    void atomParentheticalAssertionBegin(bool invert = false);
+ *    void atomParenthesesEnd();
+ *    void atomBackReference(unsigned subpatternId);
+ *
+ *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
+ *
+ *    void disjunction();
+ *
+ * The regular expression is described by a sequence of assertion*() and atom*()
+ * callbacks to the delegate, describing the terms in the regular expression.
+ * Following an atom a quantifyAtom() call may occur to indicate that the previous
+ * atom should be quantified.  In the case of atoms described across multiple
+ * calls (parentheses and character classes) the call to quantifyAtom() will come
+ * after the call to the atom*End() method, never after atom*Begin().
+ *
+ * Character classes may either be described by a single call to
+ * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
+ * In the latter case, ...Begin() will be called, followed by a sequence of
+ * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
+ *
+ * Sequences of atoms and assertions are broken into alternatives via calls to
+ * disjunction().  Assertions, atoms, and disjunctions emitted between calls to
+ * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
+ * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
+ * capturing subpattern, this will be the subpatternId associated with these
+ * parentheses, and will also by definition be the lowest subpatternId of these
+ * parentheses and of any nested parentheses.  The atomParenthesesEnd() method
+ * is passed the subpatternId of the last capturing subexpression nested within
+ * these parentheses.  In the case of a capturing subpattern with no nested
+ * capturing subpatterns, the same subpatternId will be passed to the begin and
+ * end functions.  In the case of non-capturing subpatterns the subpatternId
+ * passed to the begin method is also the first possible subpatternId that might
+ * be nested within these parentheses.  If a set of non-capturing parentheses does
+ * not contain any capturing subpatterns, then the subpatternId passed to begin
+ * will be greater than the subpatternId passed to end.
+ */
+
+template<class Delegate>
+const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite)
+{
+    // Pick the Parser instantiation that matches the string's character width.
+    if (!pattern.is8Bit()) {
+        Parser<Delegate, UChar> wideParser(delegate, pattern, backReferenceLimit);
+        return wideParser.parse();
+    }
+
+    Parser<Delegate, LChar> narrowParser(delegate, pattern, backReferenceLimit);
+    return narrowParser.parse();
+}
+
+} } // namespace JSC::Yarr
+
+#endif // YarrParser_h
diff --git a/masm/yarr/YarrPattern.cpp b/masm/yarr/YarrPattern.cpp

new file mode 100644 (file)

index 0000000..c953a38
--- /dev/null
+++ b/masm/yarr/YarrPattern.cpp
@@ -0,0 +1,874 @@
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "config.h"
+#include "YarrPattern.h"
+
+#include "Yarr.h"
+#include "YarrCanonicalizeUCS2.h"
+#include "YarrParser.h"
+#include <wtf/Vector.h>
+
+using namespace WTF;
+
+namespace JSC { namespace Yarr {
+
+#include "RegExpJitTables.h"
+
+// Accumulates the contents of a character class as sorted vectors of single
+// characters and of ranges.  Characters up to 0x7f go into m_matches / m_ranges,
+// everything above into m_matchesUnicode / m_rangesUnicode.  When constructed
+// case-insensitive, the other-case equivalents of everything added are added too.
+// charClass() moves the accumulated data into a newly allocated CharacterClass.
+class CharacterClassConstructor {
+public:
+    CharacterClassConstructor(bool isCaseInsensitive = false)
+        : m_isCaseInsensitive(isCaseInsensitive)
+    {
+    }
+    
+    // Discards everything accumulated so far.
+    void reset()
+    {
+        m_matches.clear();
+        m_ranges.clear();
+        m_matchesUnicode.clear();
+        m_rangesUnicode.clear();
+    }
+
+    // Merges every match and range of 'other' into this constructor.  The
+    // entries are added as they are; no case folding is applied to them here.
+    void append(const CharacterClass* other)
+    {
+        for (size_t i = 0; i < other->m_matches.size(); ++i)
+            addSorted(m_matches, other->m_matches[i]);
+        for (size_t i = 0; i < other->m_ranges.size(); ++i)
+            addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
+        for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
+            addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
+        for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
+            addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
+    }
+
+    // Adds a single character, plus its case variants when case-insensitive.
+    void putChar(UChar ch)
+    {
+        // Handle ascii cases.
+        if (ch <= 0x7f) {
+            if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
+                addSorted(m_matches, toASCIIUpper(ch));
+                addSorted(m_matches, toASCIILower(ch));
+            } else
+                addSorted(m_matches, ch);
+            return;
+        }
+
+        // Simple case, not a case-insensitive match.
+        if (!m_isCaseInsensitive) {
+            addSorted(m_matchesUnicode, ch);
+            return;
+        }
+
+        // Add multiple matches, if necessary.
+        UCS2CanonicalizationRange* info = rangeInfoFor(ch);
+        if (info->type == CanonicalizeUnique)
+            addSorted(m_matchesUnicode, ch);
+        else
+            putUnicodeIgnoreCase(ch, info);
+    }
+
+    // Adds a non-ASCII character together with its canonical equivalents.
+    // 'info' must be the canonicalization range containing ch and must not be
+    // of type CanonicalizeUnique (both asserted).
+    void putUnicodeIgnoreCase(UChar ch, UCS2CanonicalizationRange* info)
+    {
+        ASSERT(m_isCaseInsensitive);
+        ASSERT(ch > 0x7f);
+        ASSERT(ch >= info->begin && ch <= info->end);
+        ASSERT(info->type != CanonicalizeUnique);
+        if (info->type == CanonicalizeSet) {
+            // The set is a zero-terminated array; add every member of it.
+            for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
+                addSorted(m_matchesUnicode, ch);
+        } else {
+            addSorted(m_matchesUnicode, ch);
+            addSorted(m_matchesUnicode, getCanonicalPair(info, ch));
+        }
+    }
+
+    // Adds the inclusive range [lo, hi], split at 0x7f/0x80 between the ASCII
+    // and the Unicode vectors, plus case equivalents when case-insensitive.
+    void putRange(UChar lo, UChar hi)
+    {
+        if (lo <= 0x7f) {
+            char asciiLo = lo;
+            char asciiHi = std::min(hi, (UChar)0x7f);
+            addSortedRange(m_ranges, lo, asciiHi);
+            
+            if (m_isCaseInsensitive) {
+                // Mirror the part overlapping 'A'-'Z' into lower case, and the
+                // part overlapping 'a'-'z' into upper case.
+                if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
+                    addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
+                if ((asciiLo <= 'z') && (asciiHi >= 'a'))
+                    addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
+            }
+        }
+        if (hi <= 0x7f)
+            return;
+
+        // From here on only the non-ASCII part [max(lo, 0x80), hi] is handled.
+        lo = std::max(lo, (UChar)0x80);
+        addSortedRange(m_rangesUnicode, lo, hi);
+        
+        if (!m_isCaseInsensitive)
+            return;
+
+        // Walk the canonicalization ranges covering [lo, hi] one at a time,
+        // adding the equivalents of the slice that falls inside each of them.
+        UCS2CanonicalizationRange* info = rangeInfoFor(lo);
+        while (true) {
+            // Handle the range [lo .. end]
+            UChar end = std::min<UChar>(info->end, hi);
+
+            switch (info->type) {
+            case CanonicalizeUnique:
+                // Nothing to do - no canonical equivalents.
+                break;
+            case CanonicalizeSet: {
+                UChar ch;
+                for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
+                    addSorted(m_matchesUnicode, ch);
+                break;
+            }
+            case CanonicalizeRangeLo:
+                addSortedRange(m_rangesUnicode, lo + info->value, end + info->value);
+                break;
+            case CanonicalizeRangeHi:
+                addSortedRange(m_rangesUnicode, lo - info->value, end - info->value);
+                break;
+            case CanonicalizeAlternatingAligned:
+                // Use addSortedRange since there is likely an abutting range to combine with.
+                if (lo & 1)
+                    addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
+                if (!(end & 1))
+                    addSortedRange(m_rangesUnicode, end + 1, end + 1);
+                break;
+            case CanonicalizeAlternatingUnaligned:
+                // Use addSortedRange since there is likely an abutting range to combine with.
+                if (!(lo & 1))
+                    addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
+                if (end & 1)
+                    addSortedRange(m_rangesUnicode, end + 1, end + 1);
+                break;
+            }
+
+            if (hi == end)
+                return;
+
+            // Continue with the next canonicalization range.
+            ++info;
+            lo = info->begin;
+        };
+
+    }
+
+    // Allocates a new CharacterClass and swaps the accumulated vectors into it.
+    // The caller receives the raw pointer; within this class nothing deletes it.
+    // NOTE(review): this leaves the constructor empty for reuse only if a freshly
+    // constructed CharacterClass starts with empty vectors - confirm.
+    CharacterClass* charClass()
+    {
+        CharacterClass* characterClass = new CharacterClass(0);
+
+        characterClass->m_matches.swap(m_matches);
+        characterClass->m_ranges.swap(m_ranges);
+        characterClass->m_matchesUnicode.swap(m_matchesUnicode);
+        characterClass->m_rangesUnicode.swap(m_rangesUnicode);
+
+        return characterClass;
+    }
+
+private:
+    // Inserts ch into the sorted vector 'matches' unless it is already present.
+    void addSorted(Vector<UChar>& matches, UChar ch)
+    {
+        unsigned pos = 0;
+        unsigned range = matches.size();
+
+        // binary chop, find position to insert char.
+        while (range) {
+            unsigned index = range >> 1;
+
+            int val = matches[pos+index] - ch;
+            if (!val)
+                return;
+            else if (val > 0)
+                range = index;
+            else {
+                pos += (index+1);
+                range -= (index+1);
+            }
+        }
+        
+        if (pos == matches.size())
+            matches.append(ch);
+        else
+            matches.insert(pos, ch);
+    }
+
+    // Adds the inclusive range [lo, hi] to the sorted vector 'ranges', merging
+    // it with any existing ranges that it overlaps or directly abuts.
+    void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
+    {
+        unsigned end = ranges.size();
+        
+        // Simple linear scan - I doubt there are that many ranges anyway...
+        // feel free to fix this with something faster (eg binary chop).
+        for (unsigned i = 0; i < end; ++i) {
+            // does the new range fall before the current position in the array
+            if (hi < ranges[i].begin) {
+                // optional optimization: concatenate appending ranges? - may not be worthwhile.
+                if (hi == (ranges[i].begin - 1)) {
+                    ranges[i].begin = lo;
+                    return;
+                }
+                ranges.insert(i, CharacterRange(lo, hi));
+                return;
+            }
+            // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the beginning
+            // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
+            // end of the last range they concatenate, which is just as good.
+            if (lo <= (ranges[i].end + 1)) {
+                // found an intersect! we'll replace this entry in the array.
+                ranges[i].begin = std::min(ranges[i].begin, lo);
+                ranges[i].end = std::max(ranges[i].end, hi);
+
+                // now check if the new range can subsume any subsequent ranges.
+                unsigned next = i+1;
+                // each iteration of the loop we will either remove something from the list, or break the loop.
+                while (next < ranges.size()) {
+                    if (ranges[next].begin <= (ranges[i].end + 1)) {
+                        // the next entry now overlaps / concatenates this one.
+                        ranges[i].end = std::max(ranges[i].end, ranges[next].end);
+                        ranges.remove(next);
+                    } else
+                        break;
+                }
+                
+                return;
+            }
+        }
+
+        // CharacterRange comes after all existing ranges.
+        ranges.append(CharacterRange(lo, hi));
+    }
+
+    // When true, case equivalents are added alongside each character and range.
+    bool m_isCaseInsensitive;
+
+    // Sorted single characters and ranges: first the <= 0x7f part, then the rest.
+    Vector<UChar> m_matches;
+    Vector<CharacterRange> m_ranges;
+    Vector<UChar> m_matchesUnicode;
+    Vector<CharacterRange> m_rangesUnicode;
+};
+
+class YarrPatternConstructor {
+public:
+    // Binds the constructor to 'pattern' and gives the pattern a fresh body
+    // disjunction with one empty alternative, which becomes the current
+    // alternative.  The body is also registered in m_pattern.m_disjunctions.
+    YarrPatternConstructor(YarrPattern& pattern)
+        : m_pattern(pattern)
+        , m_characterClassConstructor(pattern.m_ignoreCase)
+        , m_invertParentheticalAssertion(false)
+    {
+        m_pattern.m_body = new PatternDisjunction();
+        m_alternative = m_pattern.m_body->addNewAlternative();
+        m_pattern.m_disjunctions.append(m_pattern.m_body);
+    }
+
+    // Nothing to release here: the objects created above are handed to m_pattern.
+    ~YarrPatternConstructor()
+    {
+    }
+
+    // Returns the pattern and the character class constructor to their initial
+    // state, then sets up a new body disjunction exactly as the constructor does.
+    void reset()
+    {
+        m_pattern.reset();
+        m_characterClassConstructor.reset();
+
+        m_pattern.m_body = new PatternDisjunction();
+        m_alternative = m_pattern.m_body->addNewAlternative();
+        m_pattern.m_disjunctions.append(m_pattern.m_body);
+    }
+    
+    // Records a '^' assertion in the current alternative.
+    void assertionBOL()
+    {
+        // The alternative (and the pattern) is flagged as BOL-anchored only when
+        // the '^' is the alternative's very first term and
+        // m_invertParentheticalAssertion is clear.  This is a logical test, so
+        // it uses && rather than bitwise &.
+        if (!m_alternative->m_terms.size() && !m_invertParentheticalAssertion) {
+            m_alternative->m_startsWithBOL = true;
+            m_alternative->m_containsBOL = true;
+            m_pattern.m_containsBOL = true;
+        }
+        m_alternative->m_terms.append(PatternTerm::BOL());
+    }
+    // Records a '$' assertion as the next term of the current alternative.
+    void assertionEOL()
+    {
+        m_alternative->m_terms.append(PatternTerm::EOL());
+    }
+    // Records a word-boundary assertion as the next term of the current
+    // alternative; 'invert' is passed through to the term unchanged.
+    void assertionWordBoundary(bool invert)
+    {
+        m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
+    }
+
+    void atomPatternCharacter(UChar ch)
+    {
+        // We handle case-insensitive checking of unicode characters which do have both
+        // cases by handling them as if they were defined using a CharacterClass.
+        if (!m_pattern.m_ignoreCase || isASCII(ch)) {
+            m_alternative->m_terms.append(PatternTerm(ch));
+            return;
+        }
+
+        UCS2CanonicalizationRange* info = rangeInfoFor(ch);
+        if (info->type == CanonicalizeUnique) {
+            m_alternative->m_terms.append(PatternTerm(ch));
+            return;
+        }
+
+        m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
+        CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
+        m_pattern.m_userCharacterClasses.append(newCharacterClass);
+        m_alternative->m_terms.append(PatternTerm(newCharacterClass, false));
+    }
+
+    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
+    {
+        switch (classID) {
+        case DigitClassID:
+            m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
+            break;
+        case SpaceClassID:
+            m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
+            break;
+        case WordClassID:
+            m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
+            break;
+        case NewlineClassID:
+            m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
+            break;
+        }
+    }
+
+    // Starts a user character class.  Only the inversion flag is recorded here;
+    // it is applied when atomCharacterClassEnd() creates the term.
+    void atomCharacterClassBegin(bool invert = false)
+    {
+        m_invertCharacterClass = invert;
+    }
+
+    // Adds a single character to the character class under construction.
+    void atomCharacterClassAtom(UChar ch)
+    {
+        m_characterClassConstructor.putChar(ch);
+    }
+
+    // Adds the character range begin..end to the character class under construction.
+    void atomCharacterClassRange(UChar begin, UChar end)
+    {
+        m_characterClassConstructor.putRange(begin, end);
+    }
+
+    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
+    {
+        ASSERT(classID != NewlineClassID);
+
+        switch (classID) {
+        case DigitClassID:
+            m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
+            break;
+        
+        case SpaceClassID:
+            m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
+            break;
+        
+        case WordClassID:
+            m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
+            break;
+        
+        default:
+            ASSERT_NOT_REACHED();
+        }
+    }
+
+    // Finishes the user character class: takes the accumulated class from the
+    // constructor, registers it in m_pattern.m_userCharacterClasses and appends
+    // a term for it, inverted as requested in atomCharacterClassBegin().
+    void atomCharacterClassEnd()
+    {
+        CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
+        m_pattern.m_userCharacterClasses.append(newCharacterClass);
+        m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass));
+    }
+
+    void atomParenthesesSubpatternBegin(bool capture = true)
+    {
+        unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
+        if (capture)
+            m_pattern.m_numSubpatterns++;
+
+        PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
+        m_pattern.m_disjunctions.append(parenthesesDisjunction);
+        m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false));
+        m_alternative = parenthesesDisjunction->addNewAlternative();
+    }
+
+    // Opens a parenthetical assertion: creates a nested disjunction parented on
+    // the current alternative, appends a non-capturing assertion term for it and
+    // makes its first alternative current.
+    void atomParentheticalAssertionBegin(bool invert = false)
+    {
+        PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
+        m_pattern.m_disjunctions.append(parenthesesDisjunction);
+        m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert));
+        m_alternative = parenthesesDisjunction->addNewAlternative();
+        // Remembered so that assertionBOL() does not flag a '^' seen while this
+        // is set; cleared again by atomParenthesesEnd().
+        m_invertParentheticalAssertion = invert;
+    }
+
+    // Closes the innermost open group or assertion: steps back out to the enclosing alternative,
+    // bubbles the group's BOL flags up to it and stamps the group's term with the current subpattern count.
+    void atomParenthesesEnd()
+    {
+        ASSERT(m_alternative->m_parent);
+        ASSERT(m_alternative->m_parent->m_parent);
+
+        PatternDisjunction* closedGroup = m_alternative->m_parent;
+        m_alternative = closedGroup->m_parent;
+        PatternTerm& groupTerm = m_alternative->lastTerm();
+
+        // Count the alternatives of the closed group that start with BOL.
+        const unsigned alternativeCount = closedGroup->m_alternatives.size();
+        unsigned anchoredCount = 0;
+        for (unsigned index = 0; index < alternativeCount; ++index)
+            anchoredCount += closedGroup->m_alternatives[index]->m_startsWithBOL ? 1 : 0;
+
+        // One anchored alternative is enough for the enclosing alternative to contain BOL;
+        // it only starts with BOL when every alternative of the group does.
+        if (anchoredCount) {
+            const bool everyAlternativeAnchored = anchoredCount == alternativeCount;
+            m_alternative->m_containsBOL = true;
+            if (everyAlternativeAnchored)
+                m_alternative->m_startsWithBOL = true;
+        }
+
+        groupTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
+        m_invertParentheticalAssertion = false;
+    }
+
+    // Appends a backreference term for \subpatternId; a reference to a group that is not yet opened, or that still encloses this position, becomes a ForwardReference term.
+    void atomBackReference(unsigned subpatternId)
+    {
+        ASSERT(subpatternId);
+        m_pattern.m_containsBackreferences = true;
+        if (m_pattern.m_maxBackReference < subpatternId)
+            m_pattern.m_maxBackReference = subpatternId;
+
+        if (subpatternId > m_pattern.m_numSubpatterns) {
+            m_alternative->m_terms.append(PatternTerm::ForwardReference());
+            return;
+        }
+
+        ASSERT(m_alternative);
+        // Note to self: if we waited until the AST was baked, we could also remove forwards refs
+        for (PatternAlternative* enclosing = m_alternative->m_parent->m_parent; enclosing; enclosing = enclosing->m_parent->m_parent) {
+            PatternTerm& groupTerm = enclosing->lastTerm();
+            ASSERT((groupTerm.type == PatternTerm::TypeParenthesesSubpattern) || (groupTerm.type == PatternTerm::TypeParentheticalAssertion));
+            const bool isCapturingGroup = (groupTerm.type == PatternTerm::TypeParenthesesSubpattern) && groupTerm.capture();
+            if (isCapturingGroup && (subpatternId == groupTerm.parentheses.subpatternId)) {
+                m_alternative->m_terms.append(PatternTerm::ForwardReference());
+                return;
+            }
+        }
+
+        m_alternative->m_terms.append(PatternTerm(subpatternId));
+    }
+
+    // Clones |disjunction| term by term (nested groups are cloned recursively through copyTerm). When
+    // filterStartsWithBOL is true, alternatives flagged m_startsWithBOL are left out; returns 0 if none remain.
+    PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
+    {
+        PatternDisjunction* clone = 0;
+        for (unsigned altIndex = 0; altIndex < disjunction->m_alternatives.size(); ++altIndex) {
+            PatternAlternative* source = disjunction->m_alternatives[altIndex];
+            if (filterStartsWithBOL && source->m_startsWithBOL)
+                continue;
+
+            // The clone is created lazily so that a fully filtered disjunction yields 0.
+            if (!clone)
+                clone = new PatternDisjunction(disjunction->m_parent);
+            PatternAlternative* target = clone->addNewAlternative();
+            for (unsigned termIndex = 0; termIndex < source->m_terms.size(); ++termIndex)
+                target->m_terms.append(copyTerm(source->m_terms[termIndex], filterStartsWithBOL));
+        }
+
+        if (clone)
+            m_pattern.m_disjunctions.append(clone);
+        return clone;
+    }
+    
+    // Copies |term|; a group or assertion term additionally gets its own clone of the nested disjunction.
+    PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
+    {
+        PatternTerm duplicate(term);
+        const bool hasNestedDisjunction = (term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion);
+        if (hasNestedDisjunction)
+            duplicate.parentheses.disjunction = copyDisjunction(term.parentheses.disjunction, filterStartsWithBOL);
+        return duplicate;
+    }
+    
+    void quantifyAtom(unsigned min, unsigned max, bool greedy) // Applies the quantifier {min,max} to the last term of the current alternative.
+    {
+        ASSERT(min <= max);
+        ASSERT(m_alternative->m_terms.size());
+
+        if (!max) { // {0}: the atom is never matched, so its term is removed.
+            m_alternative->removeLastTerm();
+            return;
+        }
+
+        PatternTerm& term = m_alternative->lastTerm();
+        ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
+        ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount));
+
+        if (term.type == PatternTerm::TypeParentheticalAssertion) {
+            // If an assertion is quantified with a minimum count of zero, it can simply be removed.
+            // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never
+            // results in any input being consumed, however the continuation passed to the assertion
+            // (called in steps 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will
+            // reject all zero length matches (see step 2.1). A match from the continuation of the
+            // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all
+            // this is that matches from the assertion are not required, and won't be accepted anyway,
+            // so no need to ever run it.
+            if (!min)
+                m_alternative->removeLastTerm();
+            // We never need to run an assertion more than once. Subsequent iterations will be run
+            // with the same start index (since assertions are non-capturing) and the same captures
+            // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the
+            // same result and captures. If the first match succeeds then the subsequent (min - 1)
+            // matches will too. Any additional optional matches will fail (on the same basis as the
+            // minimum zero quantified assertions, above), but this will still result in a match.
+            return;
+        }
+
+        if (min == 0)
+            term.quantify(max, greedy   ? QuantifierGreedy : QuantifierNonGreedy);
+        else if (min == max)
+            term.quantify(min, QuantifierFixedCount);
+        else { // 0 < min < max: a fixed-count term for the first |min| matches, then a copy for the optional remainder.
+            term.quantify(min, QuantifierFixedCount);
+            m_alternative->m_terms.append(copyTerm(term));
+            // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
+            m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
+            if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
+                m_alternative->lastTerm().parentheses.isCopy = true;
+        }
+    }
+
+    void disjunction() // Starts a new alternative in the disjunction that owns the current one and makes it current.
+    {
+        m_alternative = m_alternative->m_parent->addNewAlternative();
+    }
+
+    unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) // Assigns inputPosition/frameLocation to each term, records the alternative's minimum size, returns the resulting call frame size.
+    {
+        alternative->m_hasFixedSize = true;
+        Checked<unsigned> currentInputPosition = initialInputPosition; // Checked<> comes from wtf/CheckedArithmetic.h
+
+        for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
+            PatternTerm& term = alternative->m_terms[i];
+
+            switch (term.type) {
+            case PatternTerm::TypeAssertionBOL:
+            case PatternTerm::TypeAssertionEOL:
+            case PatternTerm::TypeAssertionWordBoundary:
+                term.inputPosition = currentInputPosition.unsafeGet();
+                break;
+
+            case PatternTerm::TypeBackReference:
+                term.inputPosition = currentInputPosition.unsafeGet();
+                term.frameLocation = currentCallFrameSize;
+                currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
+                alternative->m_hasFixedSize = false;
+                break;
+
+            case PatternTerm::TypeForwardReference: // nothing is assigned for forward references
+                break;
+
+            case PatternTerm::TypePatternCharacter:
+                term.inputPosition = currentInputPosition.unsafeGet();
+                if (term.quantityType != QuantifierFixedCount) {
+                    term.frameLocation = currentCallFrameSize;
+                    currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
+                    alternative->m_hasFixedSize = false;
+                } else
+                    currentInputPosition += term.quantityCount;
+                break;
+
+            case PatternTerm::TypeCharacterClass:
+                term.inputPosition = currentInputPosition.unsafeGet();
+                if (term.quantityType != QuantifierFixedCount) {
+                    term.frameLocation = currentCallFrameSize;
+                    currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
+                    alternative->m_hasFixedSize = false;
+                } else
+                    currentInputPosition += term.quantityCount;
+                break;
+
+            case PatternTerm::TypeParenthesesSubpattern:
+                // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
+                term.frameLocation = currentCallFrameSize;
+                if (term.quantityCount == 1 && !term.parentheses.isCopy) {
+                    if (term.quantityType != QuantifierFixedCount)
+                        currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
+                    currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet());
+                    // If quantity is fixed, then pre-check its minimum size.
+                    if (term.quantityType == QuantifierFixedCount)
+                        currentInputPosition += term.parentheses.disjunction->m_minimumSize;
+                    term.inputPosition = currentInputPosition.unsafeGet();
+                } else if (term.parentheses.isTerminal) {
+                    currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
+                    currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet());
+                    term.inputPosition = currentInputPosition.unsafeGet();
+                } else {
+                    term.inputPosition = currentInputPosition.unsafeGet();
+                    setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet()); // nested layout starts from frame size 0; its return value is not used here
+                    currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
+                }
+                // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
+                alternative->m_hasFixedSize = false;
+                break;
+
+            case PatternTerm::TypeParentheticalAssertion:
+                term.inputPosition = currentInputPosition.unsafeGet();
+                term.frameLocation = currentCallFrameSize;
+                currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet());
+                break;
+
+            case PatternTerm::TypeDotStarEnclosure:
+                alternative->m_hasFixedSize = false;
+                term.inputPosition = initialInputPosition;
+                break;
+            }
+        }
+
+        alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet(); // input consumed by the fixed-count terms counted above
+        return currentCallFrameSize;
+    }
+
+    // Lays out every alternative of |disjunction| from the same starting point and stores on it the smallest minimum size, the largest call frame size and whether all alternatives are fixed-size.
+    unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition)
+    {
+        if ((disjunction->m_alternatives.size() > 1) && (disjunction != m_pattern.m_body))
+            initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
+
+        unsigned smallestMinimum = UINT_MAX;
+        unsigned largestFrame = 0;
+        bool allFixedSize = true;
+        for (unsigned index = 0; index < disjunction->m_alternatives.size(); ++index) {
+            PatternAlternative* current = disjunction->m_alternatives[index];
+            const unsigned frameSize = setupAlternativeOffsets(current, initialCallFrameSize, initialInputPosition);
+            if (current->m_minimumSize < smallestMinimum)
+                smallestMinimum = current->m_minimumSize;
+            if (frameSize > largestFrame)
+                largestFrame = frameSize;
+            allFixedSize = allFixedSize && current->m_hasFixedSize;
+        }
+        ASSERT(smallestMinimum != UINT_MAX);
+        ASSERT(largestFrame >= initialCallFrameSize);
+        disjunction->m_hasFixedSize = allFixedSize;
+        disjunction->m_minimumSize = smallestMinimum;
+        disjunction->m_callFrameSize = largestFrame;
+        return largestFrame;
+    }
+
+    void setupOffsets() // Lays out the whole pattern, starting from the body with call frame size 0 and input position 0.
+    {
+        setupDisjunctionOffsets(m_pattern.m_body, 0, 0);
+    }
+
+    // Flags groups that will never need to be backtracked into, so that no state from
+    // prior iterations has to be stored for them. A group is flagged terminal only when
+    // all of the following hold:
+    //   * the whole pattern has no capturing subpatterns,
+    //   * the group is the last term of an alternative of the main body disjunction
+    //     (i.e. it sits at the end of the regular expression),
+    //   * the group is non-capturing and quantified unbounded greedy (*).
+    void checkForTerminalParentheses()
+    {
+        // This check is much too crude; should be just checking whether the candidate
+        // node contains nested capturing subpatterns, not the whole expression!
+        if (m_pattern.m_numSubpatterns)
+            return;
+
+        Vector<PatternAlternative*>& bodyAlternatives = m_pattern.m_body->m_alternatives;
+        for (size_t index = 0; index < bodyAlternatives.size(); ++index) {
+            Vector<PatternTerm>& terms = bodyAlternatives[index]->m_terms;
+            if (!terms.size())
+                continue;
+
+            // Tests run in the same order as before: type, quantifier type, count, capture.
+            PatternTerm& finalTerm = terms.last();
+            const bool isGreedyGroup = (finalTerm.type == PatternTerm::TypeParenthesesSubpattern) && (finalTerm.quantityType == QuantifierGreedy);
+            if (isGreedyGroup && (finalTerm.quantityCount == quantifyInfinite) && !finalTerm.capture())
+                finalTerm.parentheses.isTerminal = true;
+        }
+    }
+
+    // Unrolls patterns that use beginning-of-line (^) anchoring, e.g. /^a|^b|c/ becomes /^a|^b|c/
+    // executed once, followed by /c/ which loops. Relies on the parser having tagged alternatives
+    // with m_containsBOL / m_startsWithBOL and rolled those flags up to containing alternatives.
+    void optimizeBOL()
+    {
+        // At this point, this is only valid for non-multiline expressions.
+        if (m_pattern.m_multiline || !m_pattern.m_containsBOL)
+            return;
+
+        PatternDisjunction* body = m_pattern.m_body;
+        PatternDisjunction* loopingCopy = copyDisjunction(body, true);
+
+        // Every alternative present before the copies are appended runs only once.
+        Vector<PatternAlternative*>& bodyAlternatives = body->m_alternatives;
+        for (unsigned index = 0; index < bodyAlternatives.size(); ++index)
+            bodyAlternatives[index]->setOnceThrough();
+
+        if (!loopingCopy)
+            return;
+
+        // Hand the copied alternatives over to the body, then empty the copy's own list.
+        Vector<PatternAlternative*>& copied = loopingCopy->m_alternatives;
+        for (unsigned index = 0; index < copied.size(); ++index)
+            bodyAlternatives.append(copied[index]);
+        copied.clear();
+    }
+
+    // Returns true if any term of |alternative| with index in [firstTermIndex, lastTermIndex] (inclusive) captures; nested subpatterns are searched recursively.
+    bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t lastTermIndex)
+    {
+        Vector<PatternTerm>& terms = alternative->m_terms;
+        // Bound the scan by terms.size() as well: for an empty nested alternative, e.g. (?:), the
+        // recursive call below passes size() - 1, which wraps around and would index past the end.
+        for (size_t termIndex = firstTermIndex; termIndex < terms.size() && termIndex <= lastTermIndex; ++termIndex) {
+            PatternTerm& term = terms[termIndex];
+            if (term.m_capture)
+                return true;
+
+            if (term.type == PatternTerm::TypeParenthesesSubpattern) {
+                PatternDisjunction* nestedDisjunction = term.parentheses.disjunction;
+                for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) {
+                    if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt], 0, nestedDisjunction->m_alternatives[alt]->m_terms.size() - 1))
+                        return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    // This optimization identifies a single-alternative body of the form
+    // [^].*[?]<expression>.*[$] (optional ^ and $ anchors, leading .* greedy or
+    // non-greedy, trailing .* greedy) whose <expression> has no capturing terms.
+    // The alternative is reduced to <expression> followed by a TypeDotStarEnclosure
+    // term that records which of the two anchors were present.
+    void optimizeDotStarWrappedExpressions()
+    {
+        Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives;
+        if (alternatives.size() != 1)
+            return;
+
+        PatternAlternative* alternative = alternatives[0];
+        Vector<PatternTerm>& terms = alternative->m_terms;
+        if (terms.size() >= 3) { // at least: leading .*, one expression term, trailing .*
+            bool startsWithBOL = false;
+            bool endsWithEOL = false;
+            size_t termIndex, firstExpressionTerm, lastExpressionTerm;
+
+            termIndex = 0;
+            if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) {
+                startsWithBOL = true;
+                ++termIndex;
+            }
+            
+            PatternTerm& firstNonAnchorTerm = terms[termIndex];
+            if ((firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (firstNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || !((firstNonAnchorTerm.quantityType == QuantifierGreedy) || (firstNonAnchorTerm.quantityType == QuantifierNonGreedy)))
+                return;
+            
+            firstExpressionTerm = termIndex + 1;
+            
+            termIndex = terms.size() - 1;
+            if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) {
+                endsWithEOL = true;
+                --termIndex;
+            }
+            
+            PatternTerm& lastNonAnchorTerm = terms[termIndex];
+            if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (lastNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || (lastNonAnchorTerm.quantityType != QuantifierGreedy))
+                return;
+            
+            lastExpressionTerm = termIndex - 1; // termIndex is at least 1 here because terms.size() >= 3
+
+            if (firstExpressionTerm > lastExpressionTerm) // nothing left between the two .* terms
+                return;
+
+            if (!containsCapturingTerms(alternative, firstExpressionTerm, lastExpressionTerm)) {
+                for (termIndex = terms.size() - 1; termIndex > lastExpressionTerm; --termIndex) // drop the trailing .* and optional $
+                    terms.remove(termIndex);
+
+                for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex) // drop the leading optional ^ and .*
+                    terms.remove(termIndex - 1);
+
+                terms.append(PatternTerm(startsWithBOL, endsWithEOL));
+                
+                m_pattern.m_containsBOL = false;
+            }
+        }
+    }
+
+private:
+    YarrPattern& m_pattern;
+    PatternAlternative* m_alternative;
+    CharacterClassConstructor m_characterClassConstructor;
+    bool m_invertCharacterClass;
+    bool m_invertParentheticalAssertion;
+};
+
+const char* YarrPattern::compile(const String& patternString) // Parses patternString into this pattern and runs the post-parse passes; returns parse()'s error string, or 0.
+{
+    YarrPatternConstructor constructor(*this);
+
+    if (const char* error = parse(constructor, patternString))
+        return error;
+    
+    // If the pattern contains illegal backreferences reset & reparse.
+    // Quoting Netscape's "What's new in JavaScript 1.2",
+    //      "Note: if the number of left parentheses is less than the number specified
+    //       in \#, the \# is taken as an octal escape as described in the next row."
+    if (containsIllegalBackReference()) {
+        unsigned numSubpatterns = m_numSubpatterns; // passed to the second parse as its backreference limit
+
+        constructor.reset();
+#if !ASSERT_DISABLED
+        const char* error =
+#endif
+            parse(constructor, patternString, numSubpatterns);
+
+        ASSERT(!error);
+        ASSERT(numSubpatterns == m_numSubpatterns);
+    }
+
+    constructor.checkForTerminalParentheses(); // the three passes below rewrite the parsed terms, in this order
+    constructor.optimizeDotStarWrappedExpressions();
+    constructor.optimizeBOL();
+        
+    constructor.setupOffsets(); // runs last, on the final term lists
+
+    return 0;
+}
+
+YarrPattern::YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error) // Builds the pattern by compiling |pattern| immediately.
+    : m_ignoreCase(ignoreCase)
+    , m_multiline(multiline)
+    , m_containsBackreferences(false)
+    , m_containsBOL(false)
+    , m_numSubpatterns(0)
+    , m_maxBackReference(0)
+    , newlineCached(0) // the built-in character classes are created lazily by the *CharacterClass() getters
+    , digitsCached(0)
+    , spacesCached(0)
+    , wordcharCached(0)
+    , nondigitsCached(0)
+    , nonspacesCached(0)
+    , nonwordcharCached(0)
+{
+    *error = compile(pattern); // |error| is dereferenced unconditionally; it receives compile()'s result (0 when compile() reports no error)
+}
+
+} }
diff --git a/masm/yarr/YarrPattern.h b/masm/yarr/YarrPattern.h

new file mode 100644 (file)

index 0000000..14e89b8
--- /dev/null
+++ b/masm/yarr/YarrPattern.h
@@ -0,0 +1,421 @@
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef YarrPattern_h
+#define YarrPattern_h
+
+#include <wtf/CheckedArithmetic.h>
+#include <wtf/RefCounted.h>
+#include <wtf/Vector.h>
+#include <wtf/text/WTFString.h>
+#include <wtf/unicode/Unicode.h>
+
+namespace JSC { namespace Yarr {
+
+struct PatternDisjunction;
+
+struct CharacterRange { // A range of UChar values delimited by |begin| and |end| (presumably inclusive - confirm against the matchers).
+    UChar begin;
+    UChar end;
+
+    CharacterRange(UChar begin, UChar end)
+        : begin(begin)
+        , end(end)
+    {
+    }
+};
+
+struct CharacterClassTable : RefCounted<CharacterClassTable> { // Ref-counted holder of a lookup table pointer and its inverted flag.
+    const char* m_table; // stored as given; this struct has no destructor that frees it
+    bool m_inverted;
+    static PassRefPtr<CharacterClassTable> create(const char* table, bool inverted) // the only way to construct one: the constructor is private
+    {
+        return adoptRef(new CharacterClassTable(table, inverted));
+    }
+
+private:
+    CharacterClassTable(const char* table, bool inverted)
+        : m_table(table)
+        , m_inverted(inverted)
+    {
+    }
+};
+
+struct CharacterClass { // Describes a character class as lists of single characters and of ranges.
+    WTF_MAKE_FAST_ALLOCATED;
+public:
+    // All CharacterClass instances have to have the full set of matches and ranges,
+    // they may have an optional table for faster lookups (which must match the
+    // specified matches and ranges)
+    CharacterClass(PassRefPtr<CharacterClassTable> table)
+        : m_table(table)
+    {
+    }
+    Vector<UChar> m_matches;
+    Vector<CharacterRange> m_ranges;
+    Vector<UChar> m_matchesUnicode; // NOTE(review): presumably the non-ASCII counterparts of m_matches / m_ranges - confirm
+    Vector<CharacterRange> m_rangesUnicode;
+    RefPtr<CharacterClassTable> m_table; // optional lookup table (see the comment above the constructor)
+};
+
+enum QuantifierType { // How a term's quantityCount is to be interpreted.
+    QuantifierFixedCount, // the default set by every PatternTerm constructor (with a count of 1)
+    QuantifierGreedy,
+    QuantifierNonGreedy,
+};
+
+struct PatternTerm { // One term of a pattern alternative: a type tag, a type-dependent payload and a quantifier.
+    enum Type {
+        TypeAssertionBOL,
+        TypeAssertionEOL,
+        TypeAssertionWordBoundary,
+        TypePatternCharacter,
+        TypeCharacterClass,
+        TypeBackReference,
+        TypeForwardReference,
+        TypeParenthesesSubpattern,
+        TypeParentheticalAssertion,
+        TypeDotStarEnclosure,
+    } type;
+    bool m_capture :1;
+    bool m_invert :1;
+    union { // payload; each constructor fills in the member that belongs to the |type| it sets
+        UChar patternCharacter;
+        CharacterClass* characterClass;
+        unsigned backReferenceSubpatternId;
+        struct {
+            PatternDisjunction* disjunction;
+            unsigned subpatternId;
+            unsigned lastSubpatternId; // not set by the constructor
+            bool isCopy;
+            bool isTerminal;
+        } parentheses;
+        struct {
+            bool bolAnchor : 1;
+            bool eolAnchor : 1;
+        } anchors;
+    };
+    QuantifierType quantityType;
+    Checked<unsigned> quantityCount;
+    int inputPosition; // not set by any constructor
+    unsigned frameLocation; // not set by any constructor
+
+    PatternTerm(UChar ch) // TypePatternCharacter term for the single character |ch|
+        : type(PatternTerm::TypePatternCharacter)
+        , m_capture(false)
+        , m_invert(false)
+    {
+        patternCharacter = ch;
+        quantityType = QuantifierFixedCount;
+        quantityCount = 1;
+    }
+
+    PatternTerm(CharacterClass* charClass, bool invert) // TypeCharacterClass term; the pointer is stored, not copied
+        : type(PatternTerm::TypeCharacterClass)
+        , m_capture(false)
+        , m_invert(invert)
+    {
+        characterClass = charClass;
+        quantityType = QuantifierFixedCount;
+        quantityCount = 1;
+    }
+
+    PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false) // group or assertion term wrapping |disjunction|
+        : type(type)
+        , m_capture(capture)
+        , m_invert(invert)
+    {
+        parentheses.disjunction = disjunction;
+        parentheses.subpatternId = subpatternId;
+        parentheses.isCopy = false;
+        parentheses.isTerminal = false;
+        quantityType = QuantifierFixedCount;
+        quantityCount = 1;
+    }
+    
+    PatternTerm(Type type, bool invert = false) // payload-free term; used by the static factory functions below
+        : type(type)
+        , m_capture(false)
+        , m_invert(invert)
+    {
+        quantityType = QuantifierFixedCount;
+        quantityCount = 1;
+    }
+
+    PatternTerm(unsigned spatternId) // TypeBackReference term referring to subpattern |spatternId|
+        : type(TypeBackReference)
+        , m_capture(false)
+        , m_invert(false)
+    {
+        backReferenceSubpatternId = spatternId;
+        quantityType = QuantifierFixedCount;
+        quantityCount = 1;
+    }
+
+    PatternTerm(bool bolAnchor, bool eolAnchor) // TypeDotStarEnclosure term recording the two anchor flags
+        : type(TypeDotStarEnclosure)
+        , m_capture(false)
+        , m_invert(false)
+    {
+        anchors.bolAnchor = bolAnchor;
+        anchors.eolAnchor = eolAnchor;
+        quantityType = QuantifierFixedCount;
+        quantityCount = 1;
+    }
+    
+    static PatternTerm ForwardReference()
+    {
+        return PatternTerm(TypeForwardReference);
+    }
+
+    static PatternTerm BOL()
+    {
+        return PatternTerm(TypeAssertionBOL);
+    }
+
+    static PatternTerm EOL()
+    {
+        return PatternTerm(TypeAssertionEOL);
+    }
+
+    static PatternTerm WordBoundary(bool invert)
+    {
+        return PatternTerm(TypeAssertionWordBoundary, invert);
+    }
+    
+    bool invert()
+    {
+        return m_invert;
+    }
+
+    bool capture()
+    {
+        return m_capture;
+    }
+    
+    void quantify(unsigned count, QuantifierType type) // overwrites both the count and the quantifier type
+    {
+        quantityCount = count;
+        quantityType = type;
+    }
+};
+
+struct PatternAlternative { // One alternative of a disjunction: an ordered list of terms plus layout and BOL flags.
+    WTF_MAKE_FAST_ALLOCATED;
+public:
+    PatternAlternative(PatternDisjunction* disjunction)
+        : m_parent(disjunction)
+        , m_onceThrough(false)
+        , m_hasFixedSize(false)
+        , m_startsWithBOL(false)
+        , m_containsBOL(false)
+    {
+    }
+
+    PatternTerm& lastTerm() // requires at least one term (asserted)
+    {
+        ASSERT(m_terms.size());
+        return m_terms[m_terms.size() - 1];
+    }
+    
+    void removeLastTerm() // requires at least one term (asserted)
+    {
+        ASSERT(m_terms.size());
+        m_terms.shrink(m_terms.size() - 1);
+    }
+    
+    void setOnceThrough()
+    {
+        m_onceThrough = true;
+    }
+    
+    bool onceThrough()
+    {
+        return m_onceThrough;
+    }
+
+    Vector<PatternTerm> m_terms;
+    PatternDisjunction* m_parent; // the disjunction passed to the constructor
+    unsigned m_minimumSize; // not initialised by the constructor
+    bool m_onceThrough : 1;
+    bool m_hasFixedSize : 1;
+    bool m_startsWithBOL : 1;
+    bool m_containsBOL : 1;
+};
+
+struct PatternDisjunction { // A list of alternatives; owns them and deletes them in its destructor.
+    WTF_MAKE_FAST_ALLOCATED;
+public:
+    PatternDisjunction(PatternAlternative* parent = 0)
+        : m_parent(parent)
+        , m_hasFixedSize(false)
+    {
+    }
+    
+    ~PatternDisjunction()
+    {
+        deleteAllValues(m_alternatives);
+    }
+
+    PatternAlternative* addNewAlternative() // appends a fresh, empty alternative whose parent is this disjunction and returns it
+    {
+        PatternAlternative* alternative = new PatternAlternative(this);
+        m_alternatives.append(alternative);
+        return alternative;
+    }
+
+    Vector<PatternAlternative*> m_alternatives;
+    PatternAlternative* m_parent; // enclosing alternative, or 0 when none was given
+    unsigned m_minimumSize; // not initialised by the constructor
+    unsigned m_callFrameSize; // not initialised by the constructor
+    bool m_hasFixedSize;
+};
+
+// These factory functions are not normally meant to be called directly.
+// Call newlineCharacterClass() and its siblings on a YarrPattern
+// instance instead: those call the matching function below once and
+// return the cached copy on later calls.
+CharacterClass* newlineCreate();
+CharacterClass* digitsCreate();
+CharacterClass* spacesCreate();
+CharacterClass* wordcharCreate();
+CharacterClass* nondigitsCreate();
+CharacterClass* nonspacesCreate();
+CharacterClass* nonwordcharCreate();
+
+struct TermChain { // A PatternTerm (held by value) together with a vector of further TermChain nodes.
+    TermChain(PatternTerm term)
+        : term(term)
+    {}
+
+    PatternTerm term;
+    Vector<TermChain> hotTerms;
+};
+
+struct YarrPattern { // A compiled regular expression: flags, subpattern counts and the disjunctions and character classes it owns.
+    JS_EXPORT_PRIVATE YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error);
+
+    ~YarrPattern() // deletes everything registered in m_disjunctions and m_userCharacterClasses
+    {
+        deleteAllValues(m_disjunctions);
+        deleteAllValues(m_userCharacterClasses);
+    }
+
+    void reset() // returns counters, flags and caches to their initial values and frees all owned objects; m_body is not touched here
+    {
+        m_numSubpatterns = 0;
+        m_maxBackReference = 0;
+
+        m_containsBackreferences = false;
+        m_containsBOL = false;
+
+        newlineCached = 0;
+        digitsCached = 0;
+        spacesCached = 0;
+        wordcharCached = 0;
+        nondigitsCached = 0;
+        nonspacesCached = 0;
+        nonwordcharCached = 0;
+
+        deleteAllValues(m_disjunctions);
+        m_disjunctions.clear();
+        deleteAllValues(m_userCharacterClasses);
+        m_userCharacterClasses.clear();
+    }
+
+    bool containsIllegalBackReference() // true when some backreference number exceeds the number of subpatterns
+    {
+        return m_maxBackReference > m_numSubpatterns;
+    }
+
+    CharacterClass* newlineCharacterClass() // like the six getters below: created on first use, registered in m_userCharacterClasses, then cached
+    {
+        if (!newlineCached)
+            m_userCharacterClasses.append(newlineCached = newlineCreate());
+        return newlineCached;
+    }
+    CharacterClass* digitsCharacterClass()
+    {
+        if (!digitsCached)
+            m_userCharacterClasses.append(digitsCached = digitsCreate());
+        return digitsCached;
+    }
+    CharacterClass* spacesCharacterClass()
+    {
+        if (!spacesCached)
+            m_userCharacterClasses.append(spacesCached = spacesCreate());
+        return spacesCached;
+    }
+    CharacterClass* wordcharCharacterClass()
+    {
+        if (!wordcharCached)
+            m_userCharacterClasses.append(wordcharCached = wordcharCreate());
+        return wordcharCached;
+    }
+    CharacterClass* nondigitsCharacterClass()
+    {
+        if (!nondigitsCached)
+            m_userCharacterClasses.append(nondigitsCached = nondigitsCreate());
+        return nondigitsCached;
+    }
+    CharacterClass* nonspacesCharacterClass()
+    {
+        if (!nonspacesCached)
+            m_userCharacterClasses.append(nonspacesCached = nonspacesCreate());
+        return nonspacesCached;
+    }
+    CharacterClass* nonwordcharCharacterClass()
+    {
+        if (!nonwordcharCached)
+            m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate());
+        return nonwordcharCached;
+    }
+
+    bool m_ignoreCase : 1;
+    bool m_multiline : 1;
+    bool m_containsBackreferences : 1;
+    bool m_containsBOL : 1;
+    unsigned m_numSubpatterns;
+    unsigned m_maxBackReference;
+    PatternDisjunction* m_body; // NOTE(review): never assigned in this struct - presumably set up during compile(); confirm
+    Vector<PatternDisjunction*, 4> m_disjunctions; // owned: deleted in the destructor and in reset()
+    Vector<CharacterClass*> m_userCharacterClasses; // owned: deleted in the destructor and in reset()
+
+private:
+    const char* compile(const String& patternString);
+
+    CharacterClass* newlineCached; // caches for the getters above; 0 until first use
+    CharacterClass* digitsCached;
+    CharacterClass* spacesCached;
+    CharacterClass* wordcharCached;
+    CharacterClass* nondigitsCached;
+    CharacterClass* nonspacesCached;
+    CharacterClass* nonwordcharCached;
+};
+
+} } // namespace JSC::Yarr
+
+#endif // YarrPattern_h
diff --git a/masm/yarr/YarrSyntaxChecker.cpp b/masm/yarr/YarrSyntaxChecker.cpp

new file mode 100644 (file)

index 0000000..aa98c4a
--- /dev/null
+++ b/masm/yarr/YarrSyntaxChecker.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2011 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "config.h"
+#include "YarrSyntaxChecker.h"
+
+#include "YarrParser.h"
+
+namespace JSC { namespace Yarr {
+
+// Parser delegate whose callbacks are all empty: it records nothing about the
+// pattern it is shown. checkSyntax() hands an instance to parse(), so the only
+// observable outcome is whatever parse() itself returns.
+// NOTE(review): the method names, parameter lists and default arguments
+// presumably mirror the delegate interface that parse() in YarrParser.h
+// expects -- confirm there before changing any signature.
+class SyntaxChecker {
+public:
+    // Assertion callbacks (no-ops).
+    void assertionBOL() {}
+    void assertionEOL() {}
+    void assertionWordBoundary(bool) {}
+    // Atom callbacks outside a character class (no-ops).
+    void atomPatternCharacter(UChar) {}
+    void atomBuiltInCharacterClass(BuiltInCharacterClassID, bool) {}
+    // Character-class callbacks, from Begin through End (no-ops).
+    void atomCharacterClassBegin(bool = false) {}
+    void atomCharacterClassAtom(UChar) {}
+    void atomCharacterClassRange(UChar, UChar) {}
+    void atomCharacterClassBuiltIn(BuiltInCharacterClassID, bool) {}
+    void atomCharacterClassEnd() {}
+    // Parentheses and back-reference callbacks (no-ops).
+    void atomParenthesesSubpatternBegin(bool = true) {}
+    void atomParentheticalAssertionBegin(bool = false) {}
+    void atomParenthesesEnd() {}
+    void atomBackReference(unsigned) {}
+    // Quantifier and alternation callbacks (no-ops).
+    void quantifyAtom(unsigned, unsigned, bool) {}
+    void disjunction() {}
+};
+
+// Runs the Yarr parser over |pattern| with a do-nothing delegate and returns
+// the parser's result unchanged.
+// NOTE(review): presumably a null result means the pattern parsed cleanly and
+// a non-null result is an error message -- confirm against YarrParser.h.
+const char* checkSyntax(const String& pattern)
+{
+    SyntaxChecker delegate;
+    const char* parseResult = parse(delegate, pattern);
+    return parseResult;
+}
+
+}} // namespace JSC::Yarr
diff --git a/masm/yarr/YarrSyntaxChecker.h b/masm/yarr/YarrSyntaxChecker.h

new file mode 100644 (file)

index 0000000..104ced3
--- /dev/null
+++ b/masm/yarr/YarrSyntaxChecker.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2011 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef YarrSyntaxChecker_h
+#define YarrSyntaxChecker_h
+
+#include <wtf/text/WTFString.h>
+
+namespace JSC { namespace Yarr {
+
+const char* checkSyntax(const String& pattern);
+
+}} // namespace JSC::Yarr
+
+#endif // YarrSyntaxChecker_h
+
diff --git a/masm/yarr/yarr.pri b/masm/yarr/yarr.pri

new file mode 100644 (file)

index 0000000..7e9b4d3
--- /dev/null
+++ b/masm/yarr/yarr.pri
@@ -0,0 +1,12 @@
+# -------------------------------------------------------------------
+# Project file for YARR
+#
+# See 'Tools/qmake/README' for an overview of the build system
+# -------------------------------------------------------------------
+
+SOURCES += \
+    $$PWD/YarrInterpreter.cpp \
+    $$PWD/YarrPattern.cpp \
+    $$PWD/YarrSyntaxChecker.cpp \
+    $$PWD/YarrCanonicalizeUCS2.cpp
+
diff --git a/qmljs_engine.cpp b/qmljs_engine.cpp

index 2d34b3dc8d554af64874301fcca8dff24145cb0c..a1f99f52c95d0e1326b3d49b3719f3e2449648d2 100644 (file)
--- a/qmljs_engine.cpp
+++ b/qmljs_engine.cpp
@@ -357,17 +357,17 @@ FunctionObject *ExecutionEngine::newDateCtor(ExecutionContext *ctx)
  RegExpObject *ExecutionEngine::newRegExpObject(const QString &pattern, int flags)
  {
      bool global = (flags & IR::RegExp::RegExp_Global);
-    QRegularExpression::PatternOptions options = 0;
+    bool ignoreCase = false;
+    bool multiline = false;
      if (flags & IR::RegExp::RegExp_IgnoreCase)
-        options |= QRegularExpression::CaseInsensitiveOption;
+        ignoreCase = true;
      if (flags & IR::RegExp::RegExp_Multiline)
-        options |= QRegularExpression::MultilineOption;
+        multiline = true;
  
-    QRegularExpression re(pattern, options);
-    return newRegExpObject(re, global);
+    return newRegExpObject(RegExp::create(this, pattern, ignoreCase, multiline), global);
  }
  
-RegExpObject *ExecutionEngine::newRegExpObject(const QRegularExpression &re, bool global)
+RegExpObject *ExecutionEngine::newRegExpObject(PassRefPtr<RegExp> re, bool global)
  {
      RegExpObject *object = new (memoryManager) RegExpObject(re, global);
      object->prototype = regExpPrototype;
diff --git a/qmljs_engine.h b/qmljs_engine.h

index 191268f4c4857d75b5fe287428b354b857ef1c0b..83e8e62bb80d01c04d9362c516182cd3f240a51e 100644 (file)
--- a/qmljs_engine.h
+++ b/qmljs_engine.h
@@ -46,6 +46,9 @@
  #include <qmljs_environment.h>
  #include <setjmp.h>
  
+#include <wtf/PassRefPtr.h>
+#include <wtf/BumpPointerAllocator.h>
+
  namespace QQmlJS {
  
  namespace Debugging {
@@ -88,12 +91,15 @@ struct SyntaxErrorPrototype;
  struct TypeErrorPrototype;
  struct URIErrorPrototype;
  
+class RegExp;
+
  struct ExecutionEngine
  {
      MemoryManager *memoryManager;
      EvalISelFactory *iselFactory;
      ExecutionContext *current;
      ExecutionContext *rootContext;
+    WTF::BumpPointerAllocator bumperPointerAllocator; // Used by Yarr Regex engine.
  
      Debugging::Debugger *debugger;
  
@@ -195,7 +201,7 @@ struct ExecutionEngine
      FunctionObject *newDateCtor(ExecutionContext *ctx);
  
      RegExpObject *newRegExpObject(const QString &pattern, int flags);
-    RegExpObject *newRegExpObject(const QRegularExpression &re, bool global);
+    RegExpObject *newRegExpObject(PassRefPtr<RegExp> re, bool global);
      FunctionObject *newRegExpCtor(ExecutionContext *ctx);
  
      Object *newErrorObject(const Value &value);
diff --git a/qmljs_objects.cpp b/qmljs_objects.cpp

index fbc976d54b4e867c7b45437520d1e35cda868a2e..fee53784210ecb3f15c3e26d31509bdabafbccd2 100644 (file)
--- a/qmljs_objects.cpp
+++ b/qmljs_objects.cpp
@@ -1114,13 +1114,13 @@ Value RegExpObject::__get__(ExecutionContext *ctx, String *name, bool *hasProper
      QString n = name->toQString();
      Value v = Value::undefinedValue();
      if (n == QLatin1String("source"))
-        v = Value::fromString(ctx, value.pattern());
+        v = Value::fromString(ctx, value->pattern());
      else if (n == QLatin1String("global"))
          v = Value::fromBoolean(global);
      else if (n == QLatin1String("ignoreCase"))
-        v = Value::fromBoolean(value.patternOptions() & QRegularExpression::CaseInsensitiveOption);
+        v = Value::fromBoolean(value->ignoreCase());
      else if (n == QLatin1String("multiline"))
-        v = Value::fromBoolean(value.patternOptions() & QRegularExpression::MultilineOption);
+        v = Value::fromBoolean(value->multiLine());
      else if (n == QLatin1String("lastIndex"))
          v = lastIndex;
      if (v.type() != Value::Undefined_Type) {
@@ -1128,7 +1128,6 @@ Value RegExpObject::__get__(ExecutionContext *ctx, String *name, bool *hasProper
              *hasProperty = true;
          return v;
      }
-
      return Object::__get__(ctx, name, hasProperty);
  }
  
diff --git a/qmljs_objects.h b/qmljs_objects.h

index 93796622fdc369928536af6878515db4710eb3be..d00486816f29c207334052d57cff35ee0e79278b 100644 (file)
--- a/qmljs_objects.h
+++ b/qmljs_objects.h
@@ -52,10 +52,10 @@
  #include "qv4propertydescriptor.h"
  #include "qv4propertytable.h"
  #include "qv4objectiterator.h"
+#include "qv4regexp.h"
  
  #include <QtCore/QString>
  #include <QtCore/QHash>
-#include <QtCore/QRegularExpression>
  #include <QtCore/QScopedPointer>
  #include <cstdio>
  #include <cassert>
@@ -362,10 +362,10 @@ struct IsFiniteFunction: FunctionObject
  };
  
  struct RegExpObject: Object {
-    QRegularExpression value;
+    RefPtr<RegExp> value;
      Value lastIndex;
      bool global;
-    RegExpObject(const QRegularExpression &value, bool global): value(value), lastIndex(Value::fromInt32(0)), global(global) {}
+    RegExpObject(PassRefPtr<RegExp> value, bool global): value(value), lastIndex(Value::fromInt32(0)), global(global) {}
      virtual QString className() { return QStringLiteral("RegExp"); }
      virtual RegExpObject *asRegExpObject() { return this; }
      virtual Value __get__(ExecutionContext *ctx, String *name, bool *hasProperty);
diff --git a/qv4ecmaobjects.cpp b/qv4ecmaobjects.cpp

index 9310676603dcfb528b7d35b27f46dbb32669c6b1..3392fae6ce779aac3296bcf8341d92e540147f8e 100644 (file)
--- a/qv4ecmaobjects.cpp
+++ b/qv4ecmaobjects.cpp
@@ -46,7 +46,6 @@
  #include <QtCore/qmath.h>
  #include <QtCore/QDateTime>
  #include <QtCore/QStringList>
-#include <QtCore/QRegularExpression>
  #include <QtCore/QDebug>
  #include <cmath>
  #include <qmath.h>
@@ -2846,25 +2845,26 @@ Value RegExpCtor::construct(ExecutionContext *ctx)
          r = __qmljs_to_string(r, ctx);
  
      bool global = false;
-    QRegularExpression::PatternOptions options = QRegularExpression::NoPatternOption;
+    bool ignoreCase = false;
+    bool multiLine = false;
      if (!f.isUndefined()) {
          f = __qmljs_to_string(f, ctx);
          QString str = f.stringValue()->toQString();
          for (int i = 0; i < str.length(); ++i) {
              if (str.at(i) == QChar('g') && !global) {
                  global = true;
-            } else if (str.at(i) == QChar('i') && !(options & QRegularExpression::CaseInsensitiveOption)) {
-                options |= QRegularExpression::CaseInsensitiveOption;
-            } else if (str.at(i) == QChar('m') && !(options & QRegularExpression::MultilineOption)) {
-                options |= QRegularExpression::MultilineOption;
+            } else if (str.at(i) == QChar('i') && !ignoreCase) {
+                ignoreCase = true;
+            } else if (str.at(i) == QChar('m') && !multiLine) {
+                multiLine = true;
              } else {
                  ctx->throwTypeError();
              }
          }
      }
  
-    QRegularExpression re(r.stringValue()->toQString(), options);
-    if (!re.isValid())
+    RefPtr<RegExp> re = RegExp::create(ctx->engine, r.stringValue()->toQString(), ignoreCase, multiLine);
+    if (!re->isValid())
          ctx->throwTypeError();
  
      RegExpObject *o = ctx->engine->newRegExpObject(re, global);
@@ -2905,21 +2905,25 @@ Value RegExpPrototype::method_exec(ExecutionContext *ctx)
      if (offset < 0 || offset > s.length())
          return Value::nullValue();
  
-    QRegularExpressionMatch match = r->value.match(s, offset);
-    if (!match.hasMatch())
+    uint* matchOffsets = (uint*)alloca(r->value->captureCount() * 2 * sizeof(uint));
+    int result = r->value->match(s, offset, matchOffsets);
+    if (result == -1)
          return Value::nullValue();
  
      // fill in result data
      ArrayObject *array = ctx->engine->newArrayObject(ctx)->asArrayObject();
-    int captured = match.lastCapturedIndex();
-    for (int i = 0; i <= captured; ++i)
-        array->array.push_back(Value::fromString(ctx, match.captured(i)));
+    for (int i = 0; i < r->value->captureCount(); ++i) {
+        int start = matchOffsets[i * 2];
+        int end = matchOffsets[i * 2 + 1];
+        if (start != -1 && end != -1)
+            array->array.push_back(Value::fromString(ctx, s.mid(start, end - start)));
+    }
  
-    array->__put__(ctx, QLatin1String("index"), Value::fromInt32(match.capturedStart(0)));
+    array->__put__(ctx, QLatin1String("index"), Value::fromInt32(result));
      array->__put__(ctx, QLatin1String("input"), arg);
  
      if (r->global)
-        r->lastIndex = Value::fromInt32(match.capturedEnd(0));
+        r->lastIndex = Value::fromInt32(matchOffsets[1]);
  
      return Value::fromObject(array);
  }
@@ -2936,13 +2940,12 @@ Value RegExpPrototype::method_toString(ExecutionContext *ctx)
      if (!r)
          ctx->throwTypeError();
  
-    QString result = QChar('/') + r->value.pattern();
+    QString result = QChar('/') + r->value->pattern();
      result += QChar('/');
-    QRegularExpression::PatternOptions o = r->value.patternOptions();
      // ### 'g' option missing
-    if (o & QRegularExpression::CaseInsensitiveOption)
+    if (r->value->ignoreCase())
          result += QChar('i');
-    if (o & QRegularExpression::MultilineOption)
+    if (r->value->multiLine())
          result += QChar('m');
      return Value::fromString(ctx, result);
  }
diff --git a/qv4ecmaobjects_p.h b/qv4ecmaobjects_p.h

index 35a3f6529ef81b87bbe29da47f31abd68178e9bb..0576eb45f30d2de534a3d6b26b187d64247c01e7 100644 (file)
--- a/qv4ecmaobjects_p.h
+++ b/qv4ecmaobjects_p.h
@@ -295,7 +295,7 @@ struct RegExpCtor: FunctionObject
  
  struct RegExpPrototype: RegExpObject
  {
-    RegExpPrototype(): RegExpObject(QRegularExpression(), false) {}
+    RegExpPrototype(): RegExpObject(RegExp::create(0, QString()), false) {}
      void init(ExecutionContext *ctx, const Value &ctor);
  
      static Value method_exec(ExecutionContext *ctx);
diff --git a/qv4regexp.cpp b/qv4regexp.cpp

new file mode 100644 (file)

index 0000000..64d9fef
--- /dev/null
+++ b/qv4regexp.cpp
@@ -0,0 +1,76 @@
+/****************************************************************************
+**
+** Copyright (C) 2012 Digia Plc and/or its subsidiary(-ies).
+** Contact: http://www.qt-project.org/legal
+**
+** This file is part of the V4VM module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and Digia.  For licensing terms and
+** conditions see http://qt.digia.com/licensing.  For further information
+** use the contact form at http://qt.digia.com/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Digia gives you certain additional
+** rights.  These rights are described in the Digia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3.0 as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU General Public License version 3.0 requirements will be
+** met: http://www.gnu.org/copyleft/gpl.html.
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#include "qv4regexp.h"
+
+#include "qmljs_engine.h"
+
+namespace QQmlJS {
+namespace VM {
+
+// Runs the compiled bytecode against |string|, beginning at offset |start|.
+//
+// Returns JSC::Yarr::offsetNoMatch without touching |matchOffsets| when this
+// RegExp is not valid; otherwise returns the result of the Yarr interpreter,
+// which is handed |matchOffsets| as its output buffer.
+// NOTE(review): the caller in this change sizes |matchOffsets| as
+// captureCount() * 2 entries and reads it as (start, end) pairs -- confirm
+// that matches what the interpreter writes.
+int RegExp::match(const QString &string, int start, uint *matchOffsets)
+{
+    if (isValid()) {
+        // Keep the converted string alive in a named local for the duration of the call.
+        WTF::String subject(string);
+        return JSC::Yarr::interpret(m_byteCode.get(), subject.characters16(), string.length(), start, matchOffsets);
+    }
+
+    return JSC::Yarr::offsetNoMatch;
+}
+
+// Stores the pattern text and flags, then tries to compile the pattern.
+//
+// Compilation is skipped when |engine| is null, and abandoned when the Yarr
+// pattern constructor reports an error through its error out-parameter. In
+// both cases m_byteCode is never assigned and m_subPatternCount stays 0.
+// On success the bytecode is produced using the engine's bump-pointer
+// allocator.
+RegExp::RegExp(ExecutionEngine* engine, const QString &pattern, bool ignoreCase, bool multiline)
+    : m_pattern(pattern)
+    , m_subPatternCount(0)
+    , m_ignoreCase(ignoreCase)
+    , m_multiLine(multiline)
+{
+    if (engine) {
+        const char* compileError = 0;
+        JSC::Yarr::YarrPattern parsedPattern(WTF::String(pattern), ignoreCase, multiline, &compileError);
+        if (!compileError) {
+            m_subPatternCount = parsedPattern.m_numSubpatterns;
+            m_byteCode = JSC::Yarr::byteCompile(parsedPattern, &engine->bumperPointerAllocator);
+        }
+    }
+}
+
+} // end of namespace VM
+} // end of namespace QQmlJS
+
+
diff --git a/qv4regexp.h b/qv4regexp.h

new file mode 100644 (file)

index 0000000..3b7a980
--- /dev/null
+++ b/qv4regexp.h
@@ -0,0 +1,92 @@
+/****************************************************************************
+**
+** Copyright (C) 2012 Digia Plc and/or its subsidiary(-ies).
+** Contact: http://www.qt-project.org/legal
+**
+** This file is part of the V4VM module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and Digia.  For licensing terms and
+** conditions see http://qt.digia.com/licensing.  For further information
+** use the contact form at http://qt.digia.com/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Digia gives you certain additional
+** rights.  These rights are described in the Digia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3.0 as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU General Public License version 3.0 requirements will be
+** met: http://www.gnu.org/copyleft/gpl.html.
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+#ifndef QV4REGEXP_H
+#define QV4REGEXP_H
+
+#include <QString>
+#include <QVector>
+
+#include <wtf/RefCounted.h>
+#include <wtf/RefPtr.h>
+#include <wtf/FastAllocBase.h>
+#include <wtf/BumpPointerAllocator.h>
+
+#include <limits.h>
+
+#include <yarr/Yarr.h>
+#include <yarr/YarrInterpreter.h>
+
+namespace QQmlJS {
+namespace VM {
+
+struct ExecutionEngine;
+
+// Reference-counted regular expression: keeps the pattern source and flags it
+// was created with, plus the Yarr interpreter bytecode compiled from them.
+// Instances are created through create(); the constructor is private and
+// copying is disabled.
+class RegExp : public RefCounted<RegExp>
+{
+public:
+    // Allocates a RegExp and adopts the new object into a PassRefPtr.
+    static PassRefPtr<RegExp> create(ExecutionEngine* engine, const QString& pattern, bool ignoreCase = false, bool multiline = false)
+    {
+        return adoptRef(new RegExp(engine, pattern, ignoreCase, multiline));
+    }
+
+    // Pattern source and flags exactly as given at construction.
+    QString pattern() const { return m_pattern; }
+    bool ignoreCase() const { return m_ignoreCase; }
+    bool multiLine() const { return m_multiLine; }
+
+    // True when compiled bytecode is present.
+    bool isValid() const { return m_byteCode.get() != 0; }
+
+    // Sub-pattern count plus one.
+    // NOTE(review): the extra slot presumably stands for the overall match -- confirm.
+    int captureCount() const { return m_subPatternCount + 1; }
+
+    // Matches |string| from offset |start|, writing offsets to |matchOffsets|.
+    int match(const QString& string, int start, uint *matchOffsets);
+
+private:
+    Q_DISABLE_COPY(RegExp);
+    RegExp(ExecutionEngine* engine, const QString& pattern, bool ignoreCase, bool multiline);
+
+    QString m_pattern;
+    OwnPtr<JSC::Yarr::BytecodePattern> m_byteCode; // stays unset if compilation is skipped or fails
+    int m_subPatternCount;
+    bool m_ignoreCase;
+    bool m_multiLine;
+};
+
+} // end of namespace VM
+} // end of namespace QQmlJS
+
+#endif // QV4REGEXP_H
diff --git a/tests/TestExpectations b/tests/TestExpectations

index 839ba7c3a2d825d528680841c801948d67c58cdb..56b55097f61b55839ac14c213f7bbff2abc74efd 100644 (file)
--- a/tests/TestExpectations
+++ b/tests/TestExpectations
@@ -392,9 +392,6 @@ S15.10.2.3_A1_T17 failing
  S15.10.2.3_A1_T2 failing
  S15.10.2.5_A1_T4 failing
  S15.10.2.12_A6_T1 failing
-S15.10.2.13_A2_T1 failing
-S15.10.2.13_A2_T2 failing
-S15.10.2.13_A2_T8 failing
  S15.10.2.15_A1_T1 failing
  S15.10.2.15_A1_T10 failing
  S15.10.2.15_A1_T11 failing
@@ -444,19 +441,8 @@ S15.10.1_A1_T9 failing
  15.10.2.15-6-1 failing
  15.10.2.2-1 failing
  15.10.2.5-3-1 failing
-S15.10.2.10_A2.1_T3 failing
-S15.10.2.10_A4.1_T1 failing
-S15.10.2.10_A4.1_T2 failing
-S15.10.2.10_A4.1_T3 failing
-S15.10.2.10_A5.1_T1 failing
-S15.10.2.11_A1_T5 failing
-S15.10.2.11_A1_T7 failing
  S15.10.2.12_A1_T1 failing
-S15.10.2.12_A1_T2 failing
-S15.10.2.12_A1_T5 failing
  S15.10.2.12_A2_T1 failing
-S15.10.2.12_A2_T2 failing
-S15.10.2.12_A2_T5 failing
  S15.10.2.12_A3_T1 failing
  S15.10.2.12_A4_T1 failing
  S15.10.2.12_A5_T1 failing
@@ -484,7 +470,6 @@ S15.10.4.1_A5_T6 failing
  S15.10.4.1_A5_T7 failing
  S15.10.4.1_A5_T8 failing
  S15.10.4.1_A5_T9 failing
-S15.10.4.1_A8_T2 failing
  S15.10.4.1_A9_T1 failing
  S15.10.4.1_A9_T2 failing
  S15.10.4.1_A9_T3 failing
@@ -2808,4 +2793,3 @@ S15.4.4.13_A1_T2 failing
  15.4.4.14-9-b-i-5 failing
  15.4.4.16-7-c-i-6 failing
  15.4.4.17-7-c-i-6 failing
-
diff --git a/v4.pro b/v4.pro

index 2132f1266434bb9b728725bcf891bba94b030928..7213e6c7ad5210090887b5c420ee2b3ba10392b4 100644 (file)
--- a/v4.pro
+++ b/v4.pro
@@ -28,7 +28,8 @@ SOURCES += main.cpp \
      qv4managed.cpp \
      qv4array.cpp \
      qv4string.cpp \
-    qv4objectiterator.cpp
+    qv4objectiterator.cpp \
+    qv4regexp.cpp
  
  HEADERS += \
      qv4codegen_p.h \
@@ -51,7 +52,8 @@ HEADERS += \
      qv4string.h \
      qv4propertydescriptor.h \
      qv4propertytable.h \
-    qv4objectiterator.h
+    qv4objectiterator.h \
+    qv4regexp.h
  
  llvm {
author	Simon Hausmann <simon.hausmann@digia.com>
	Mon, 14 Jan 2013 15:53:43 +0000 (16:53 +0100)
committer	Simon Hausmann <simon.hausmann@digia.com>
	Thu, 17 Jan 2013 13:24:04 +0000 (14:24 +0100)
.gitignore		patch \| blob \| history
masm/config.h		patch \| blob \| history
masm/create_regex_tables	[new file with mode: 0644]	patch \| blob
masm/masm.pri		patch \| blob \| history
masm/runtime/MatchResult.h	[new file with mode: 0644]	patch \| blob
masm/stubs/wtf/PassOwnPtr.h		patch \| blob \| history
masm/stubs/wtf/RefCounted.h		patch \| blob \| history
masm/stubs/wtf/Vector.h		patch \| blob \| history
masm/stubs/wtf/text/CString.h	[new file with mode: 0644]	patch \| blob
masm/stubs/wtf/text/WTFString.h	[new file with mode: 0644]	patch \| blob
masm/stubs/wtf/unicode/Unicode.h	[new file with mode: 0644]	patch \| blob
masm/wtf/ASCIICType.h	[new file with mode: 0644]	patch \| blob
masm/wtf/BumpPointerAllocator.h	[new file with mode: 0644]	patch \| blob
masm/yarr/Yarr.h	[new file with mode: 0644]	patch \| blob
masm/yarr/YarrCanonicalizeUCS2.cpp	[new file with mode: 0644]	patch \| blob
masm/yarr/YarrCanonicalizeUCS2.h	[new file with mode: 0644]	patch \| blob
masm/yarr/YarrCanonicalizeUCS2.js	[new file with mode: 0644]	patch \| blob
masm/yarr/YarrInterpreter.cpp	[new file with mode: 0644]	patch \| blob
masm/yarr/YarrInterpreter.h	[new file with mode: 0644]	patch \| blob
masm/yarr/YarrJIT.cpp	[new file with mode: 0644]	patch \| blob
masm/yarr/YarrJIT.h	[new file with mode: 0644]	patch \| blob
masm/yarr/YarrParser.h	[new file with mode: 0644]	patch \| blob
masm/yarr/YarrPattern.cpp	[new file with mode: 0644]	patch \| blob
masm/yarr/YarrPattern.h	[new file with mode: 0644]	patch \| blob
masm/yarr/YarrSyntaxChecker.cpp	[new file with mode: 0644]	patch \| blob
masm/yarr/YarrSyntaxChecker.h	[new file with mode: 0644]	patch \| blob
masm/yarr/yarr.pri	[new file with mode: 0644]	patch \| blob
qmljs_engine.cpp		patch \| blob \| history
qmljs_engine.h		patch \| blob \| history
qmljs_objects.cpp		patch \| blob \| history
qmljs_objects.h		patch \| blob \| history
qv4ecmaobjects.cpp		patch \| blob \| history
qv4ecmaobjects_p.h		patch \| blob \| history
qv4regexp.cpp	[new file with mode: 0644]	patch \| blob
qv4regexp.h	[new file with mode: 0644]	patch \| blob
tests/TestExpectations		patch \| blob \| history
v4.pro		patch \| blob \| history