NEON intrinsic should be used with arithmetic mode of composite filter
authorzherczeg@webkit.org <zherczeg@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Thu, 17 May 2012 09:12:54 +0000 (09:12 +0000)
committerzherczeg@webkit.org <zherczeg@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Thu, 17 May 2012 09:12:54 +0000 (09:12 +0000)
https://bugs.webkit.org/show_bug.cgi?id=86622

Reviewed by Nikolas Zimmermann.

Rewrite hand written assembly code to increase portability and readibility
of the code. Remove the unnecessary FECompositeArithmeticNEON.cpp from the
project

Existing tests cover this issue.

* CMakeLists.txt:
* GNUmakefile.list.am:
* Target.pri:
* WebCore.gypi:
* WebCore.vcproj/WebCore.vcproj:
* WebCore.xcodeproj/project.pbxproj:
* platform/graphics/filters/FEComposite.cpp:
(WebCore):
(WebCore::computeArithmeticPixels):
(WebCore::arithmeticSoftware):
(WebCore::FEComposite::platformArithmeticSoftware):
* platform/graphics/filters/FEComposite.h:
* platform/graphics/filters/arm/FECompositeArithmeticNEON.cpp: Removed.
* platform/graphics/filters/arm/FECompositeArithmeticNEON.h:
(WebCore):
(WebCore::FEComposite::computeArithmeticPixelsNeon):
(WebCore::FEComposite::platformArithmeticNeon):

git-svn-id: http://svn.webkit.org/repository/webkit/trunk@117418 268f45cc-cd09-0410-ab3c-d52691b4dbfc

Source/WebCore/CMakeLists.txt
Source/WebCore/ChangeLog
Source/WebCore/GNUmakefile.list.am
Source/WebCore/Target.pri
Source/WebCore/WebCore.gypi
Source/WebCore/WebCore.vcproj/WebCore.vcproj
Source/WebCore/WebCore.xcodeproj/project.pbxproj
Source/WebCore/platform/graphics/filters/FEComposite.cpp
Source/WebCore/platform/graphics/filters/FEComposite.h
Source/WebCore/platform/graphics/filters/arm/FECompositeArithmeticNEON.cpp [deleted file]
Source/WebCore/platform/graphics/filters/arm/FECompositeArithmeticNEON.h

index 9de4866..b200757 100644 (file)
@@ -1213,7 +1213,6 @@ SET(WebCore_SOURCES
     platform/graphics/filters/SourceAlpha.cpp
     platform/graphics/filters/SourceGraphic.cpp
 
-    platform/graphics/filters/arm/FECompositeArithmeticNEON.cpp
     platform/graphics/filters/arm/FECompositeArithmeticNEON.h
     platform/graphics/filters/arm/FEGaussianBlurNEON.cpp
     platform/graphics/filters/arm/FEGaussianBlurNEON.h
index 6da4b40..def731f 100644 (file)
@@ -1,3 +1,34 @@
+2012-05-17  Zoltan Herczeg  <zherczeg@webkit.org>
+
+        NEON intrinsic should be used with arithmetic mode of composite filter
+        https://bugs.webkit.org/show_bug.cgi?id=86622
+
+        Reviewed by Nikolas Zimmermann.
+
+        Rewrite hand written assembly code to increase portability and readibility
+        of the code. Remove the unnecessary FECompositeArithmeticNEON.cpp from the
+        project
+
+        Existing tests cover this issue.
+
+        * CMakeLists.txt:
+        * GNUmakefile.list.am:
+        * Target.pri:
+        * WebCore.gypi:
+        * WebCore.vcproj/WebCore.vcproj:
+        * WebCore.xcodeproj/project.pbxproj:
+        * platform/graphics/filters/FEComposite.cpp:
+        (WebCore):
+        (WebCore::computeArithmeticPixels):
+        (WebCore::arithmeticSoftware):
+        (WebCore::FEComposite::platformArithmeticSoftware):
+        * platform/graphics/filters/FEComposite.h:
+        * platform/graphics/filters/arm/FECompositeArithmeticNEON.cpp: Removed.
+        * platform/graphics/filters/arm/FECompositeArithmeticNEON.h:
+        (WebCore):
+        (WebCore::FEComposite::computeArithmeticPixelsNeon):
+        (WebCore::FEComposite::platformArithmeticNeon):
+
 2012-05-17  Takashi Sakamoto  <tasak@google.com>
 
         showNodePath will be useful for debugging purpose.
index 9544867..60b4c8e 100644 (file)
@@ -3214,7 +3214,6 @@ webcore_sources += \
        Source/WebCore/platform/graphics/filters/SourceGraphic.h \
        Source/WebCore/platform/graphics/filters/SpotLightSource.cpp \
        Source/WebCore/platform/graphics/filters/SpotLightSource.h \
-       Source/WebCore/platform/graphics/filters/arm/FECompositeArithmeticNEON.cpp \
        Source/WebCore/platform/graphics/filters/arm/FECompositeArithmeticNEON.h \
        Source/WebCore/platform/graphics/filters/arm/FEGaussianBlurNEON.cpp \
        Source/WebCore/platform/graphics/filters/arm/FEGaussianBlurNEON.h \
index 7bd6022..cf32210 100644 (file)
@@ -3405,7 +3405,6 @@ contains(DEFINES, ENABLE_FILTERS=1) {
         platform/graphics/filters/SpotLightSource.cpp \
         platform/graphics/filters/SourceAlpha.cpp \
         platform/graphics/filters/SourceGraphic.cpp \
-        platform/graphics/filters/arm/FECompositeArithmeticNEON.cpp \
         platform/graphics/filters/arm/FELightingNEON.cpp \
         platform/graphics/filters/arm/FEGaussianBlurNEON.cpp \
 }
index e0ee239..9974efc 100644 (file)
             'platform/graphics/filters/SourceGraphic.h',
             'platform/graphics/filters/SpotLightSource.cpp',
             'platform/graphics/filters/SpotLightSource.h',
-            'platform/graphics/filters/arm/FECompositeArithmeticNEON.cpp',
             'platform/graphics/filters/arm/FECompositeArithmeticNEON.h',
             'platform/graphics/filters/arm/FEGaussianBlurNEON.cpp',
             'platform/graphics/filters/arm/FEGaussianBlurNEON.h',
index b92068b..0fe7b98 100755 (executable)
                                                >
                                        </File>
                                        <File
-                                               RelativePath="..\platform\graphics\filters\arm\FECompositeArithmeticNEON.cpp"
-                                               >
-                                       </File>
-                                       <File
                                                RelativePath="..\platform\graphics\filters\arm\FECompositeArithmeticNEON.h"
                                                >
                                        </File>
index a930b03..902ed7c 100644 (file)
                49E912AC0EFAC906009D0CAF /* AnimationList.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 49E912A70EFAC906009D0CAF /* AnimationList.cpp */; };
                49E912AD0EFAC906009D0CAF /* AnimationList.h in Headers */ = {isa = PBXBuildFile; fileRef = 49E912A80EFAC906009D0CAF /* AnimationList.h */; settings = {ATTRIBUTES = (Private, ); }; };
                49E912AE0EFAC906009D0CAF /* TimingFunction.h in Headers */ = {isa = PBXBuildFile; fileRef = 49E912A90EFAC906009D0CAF /* TimingFunction.h */; settings = {ATTRIBUTES = (Private, ); }; };
-               49ECEB671499790D00CDD3A4 /* FECompositeArithmeticNEON.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 49ECEB5D1499790D00CDD3A4 /* FECompositeArithmeticNEON.cpp */; };
                49ECEB681499790D00CDD3A4 /* FECompositeArithmeticNEON.h in Headers */ = {isa = PBXBuildFile; fileRef = 49ECEB5E1499790D00CDD3A4 /* FECompositeArithmeticNEON.h */; };
                49ECEB691499790D00CDD3A4 /* FEGaussianBlurNEON.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 49ECEB5F1499790D00CDD3A4 /* FEGaussianBlurNEON.cpp */; };
                49ECEB6A1499790D00CDD3A4 /* FEGaussianBlurNEON.h in Headers */ = {isa = PBXBuildFile; fileRef = 49ECEB601499790D00CDD3A4 /* FEGaussianBlurNEON.h */; };
                49E912A70EFAC906009D0CAF /* AnimationList.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = AnimationList.cpp; path = animation/AnimationList.cpp; sourceTree = "<group>"; };
                49E912A80EFAC906009D0CAF /* AnimationList.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = AnimationList.h; path = animation/AnimationList.h; sourceTree = "<group>"; };
                49E912A90EFAC906009D0CAF /* TimingFunction.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = TimingFunction.h; path = animation/TimingFunction.h; sourceTree = "<group>"; };
-               49ECEB5D1499790D00CDD3A4 /* FECompositeArithmeticNEON.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = FECompositeArithmeticNEON.cpp; sourceTree = "<group>"; };
                49ECEB5E1499790D00CDD3A4 /* FECompositeArithmeticNEON.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = FECompositeArithmeticNEON.h; sourceTree = "<group>"; };
                49ECEB5F1499790D00CDD3A4 /* FEGaussianBlurNEON.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = FEGaussianBlurNEON.cpp; sourceTree = "<group>"; };
                49ECEB601499790D00CDD3A4 /* FEGaussianBlurNEON.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = FEGaussianBlurNEON.h; sourceTree = "<group>"; };
                49ECEB5C1499790D00CDD3A4 /* arm */ = {
                        isa = PBXGroup;
                        children = (
-                               49ECEB5D1499790D00CDD3A4 /* FECompositeArithmeticNEON.cpp */,
                                49ECEB5E1499790D00CDD3A4 /* FECompositeArithmeticNEON.h */,
                                49ECEB5F1499790D00CDD3A4 /* FEGaussianBlurNEON.cpp */,
                                49ECEB601499790D00CDD3A4 /* FEGaussianBlurNEON.h */,
                                A75E8B8A0E1DE2D6007F2481 /* FEColorMatrix.cpp in Sources */,
                                A75E8B8C0E1DE2D6007F2481 /* FEComponentTransfer.cpp in Sources */,
                                A75E8B8E0E1DE2D6007F2481 /* FEComposite.cpp in Sources */,
-                               49ECEB671499790D00CDD3A4 /* FECompositeArithmeticNEON.cpp in Sources */,
                                84730D781248F0B300D3A9C9 /* FEConvolveMatrix.cpp in Sources */,
                                50D403C714768C9400D30BB5 /* FECustomFilter.cpp in Sources */,
                                84730D7A1248F0B300D3A9C9 /* FEDiffuseLighting.cpp in Sources */,
index 970b0d2..ddfa1e4 100644 (file)
@@ -124,7 +124,7 @@ void FEComposite::correctFilterResultIfNeeded()
     forceValidPreMultipliedPixels();
 }
 
-template <int b1, int b2, int b3, int b4>
+template <int b1, int b4>
 static inline void computeArithmeticPixels(unsigned char* source, unsigned char* destination, int pixelArrayLength,
                                     float k1, float k2, float k3, float k4)
 {
@@ -138,13 +138,9 @@ static inline void computeArithmeticPixels(unsigned char* source, unsigned char*
     while (--pixelArrayLength >= 0) {
         unsigned char i1 = *source;
         unsigned char i2 = *destination;
-        float result = 0;
+        float result = k2 * i1 + k3 * i2;
         if (b1)
             result += scaledK1 * i1 * i2;
-        if (b2)
-            result += k2 * i1;
-        if (b3)
-            result += k3 * i2;
         if (b4)
             result += scaledK4;
 
@@ -164,19 +160,19 @@ static inline void arithmeticSoftware(unsigned char* source, unsigned char* dest
 {
     if (!k4) {
         if (!k1) {
-            computeArithmeticPixels<0, 1, 1, 0>(source, destination, pixelArrayLength, k1, k2, k3, k4);
+            computeArithmeticPixels<0, 0>(source, destination, pixelArrayLength, k1, k2, k3, k4);
             return;
         }
 
-        computeArithmeticPixels<1, 1, 1, 0>(source, destination, pixelArrayLength, k1, k2, k3, k4);
+        computeArithmeticPixels<1, 0>(source, destination, pixelArrayLength, k1, k2, k3, k4);
         return;
     }
 
     if (!k1) {
-        computeArithmeticPixels<0, 1, 1, 1>(source, destination, pixelArrayLength, k1, k2, k3, k4);
+        computeArithmeticPixels<0, 1>(source, destination, pixelArrayLength, k1, k2, k3, k4);
         return;
     }
-    computeArithmeticPixels<1, 1, 1, 1>(source, destination, pixelArrayLength, k1, k2, k3, k4);
+    computeArithmeticPixels<1, 1>(source, destination, pixelArrayLength, k1, k2, k3, k4);
 }
 
 inline void FEComposite::platformArithmeticSoftware(Uint8ClampedArray* source, Uint8ClampedArray* destination,
@@ -185,10 +181,9 @@ inline void FEComposite::platformArithmeticSoftware(Uint8ClampedArray* source, U
     int length = source->length();
     ASSERT(length == static_cast<int>(destination->length()));
     // The selection here eventually should happen dynamically.
-#if CPU(ARM_NEON) && CPU(ARM_TRADITIONAL) && COMPILER(GCC)
+#if HAVE(ARM_NEON_INTRINSICS)
     ASSERT(!(length & 0x3));
-    float coefficients[4]  = { k1, k2, k3, k4 };
-    platformArithmeticNeon(source->data(), destination->data(), length, coefficients);
+    platformArithmeticNeon(source->data(), destination->data(), length, k1, k2, k3, k4);
 #else
     arithmeticSoftware(source->data(), destination->data(), length, k1, k2, k3, k4);
 #endif
index 42695f6..60fded1 100644 (file)
@@ -74,8 +74,13 @@ protected:
 private:
     FEComposite(Filter*, const CompositeOperationType&, float, float, float, float);
 
-    inline void platformArithmeticSoftware(Uint8ClampedArray* source, Uint8ClampedArray* destination, float k1, float k2, float k3, float k4);
-    inline void platformArithmeticNeon(unsigned char* source, unsigned  char* destination, unsigned pixelArrayLength, float* kArray);
+    inline void platformArithmeticSoftware(Uint8ClampedArray* source, Uint8ClampedArray* destination,
+        float k1, float k2, float k3, float k4);
+    template <int b1, int b4>
+    static inline void computeArithmeticPixelsNeon(unsigned char* source, unsigned  char* destination,
+        unsigned pixelArrayLength, float k1, float k2, float k3, float k4);
+    static inline void platformArithmeticNeon(unsigned char* source, unsigned  char* destination,
+        unsigned pixelArrayLength, float k1, float k2, float k3, float k4);
 
     CompositeOperationType m_type;
     float m_k1;
diff --git a/Source/WebCore/platform/graphics/filters/arm/FECompositeArithmeticNEON.cpp b/Source/WebCore/platform/graphics/filters/arm/FECompositeArithmeticNEON.cpp
deleted file mode 100644 (file)
index 57e0cf0..0000000
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (C) 2011 University of Szeged
- * Copyright (C) 2011 Felician Marton
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#if ENABLE(FILTERS)
-#include "FECompositeArithmeticNEON.h"
-
-#if CPU(ARM_NEON) && CPU(ARM_TRADITIONAL) && COMPILER(GCC)
-
-namespace WebCore {
-
-#define ASSTRING(str) #str
-#define TOSTRING(value) ASSTRING(value)
-
-#define NL "\n"
-
-#define SOURCE_R     "r0"
-#define DEST_R       "r1"
-#define END_R        "r2"
-#define K_R          "r3"
-#define NEXTPIXEL_R  "r12"
-
-#define TEMP_Q       "q0"
-#define TEMP_D0      "d0"
-#define TEMP_D00     "d0[0]"
-#define TEMP_D01     "d0[1]"
-#define TEMP_D10     "d1[0]"
-#define TEMP_D11     "d1[1]"
-#define PIXEL1_Q     "q1"
-#define PIXEL1_D0    "d2"
-#define PIXEL1_D00   "d2[0]"
-#define PIXEL2_Q     "q2"
-#define PIXEL2_D0    "d4"
-#define PIXEL2_D00   "d4[0]"
-#define BYTEMAX_Q    "q3"
-#define K1_Q         "q8"
-#define K2_Q         "q9"
-#define K3_Q         "q10"
-#define K4_Q         "q11"
-
-asm ( // NOLINT
-".globl " TOSTRING(neonDrawCompositeArithmetic) NL
-TOSTRING(neonDrawCompositeArithmetic) ":" NL
-    "cmp " END_R ", #0" NL
-    "bxeq lr"  NL
-    // Set the end of the source register.
-    "add " END_R ", " SOURCE_R ", " END_R NL
-
-    "vld1.f32 {" TEMP_Q "}, [" K_R "]" NL
-    "ldr " K_R ", [" K_R "]" NL
-    "vdup.f32 " K1_Q ", " TEMP_D00 NL
-    "vdup.f32 " K2_Q ", " TEMP_D01 NL
-    "vdup.f32 " K3_Q ", " TEMP_D10 NL
-    "vdup.f32 " K4_Q ", " TEMP_D11 NL
-
-    "vmov.i32 " BYTEMAX_Q ", #0xFF" NL
-    "vcvt.f32.u32 " TEMP_Q ", " BYTEMAX_Q NL
-    "vmul.f32 " K4_Q ", " K4_Q ", " TEMP_Q NL
-
-    "mov " NEXTPIXEL_R ", #4" NL
-    "cmp " K_R ", #0" NL
-    "beq .arithmeticK1IsZero" NL
-
-    "vrecpe.f32 " TEMP_Q ", " TEMP_Q NL
-    "vmul.f32 " K1_Q ", " K1_Q ", " TEMP_Q NL
-
-".arithmeticK1IsNonZero:" NL
-
-    "vld1.u32 " PIXEL1_D00 ", [ " SOURCE_R "], " NEXTPIXEL_R NL
-    "vld1.u32 " PIXEL2_D00 ", [" DEST_R "]" NL
-
-    "vmovl.u8 " PIXEL1_Q ", " PIXEL1_D0 NL
-    "vmovl.u16 " PIXEL1_Q ", " PIXEL1_D0 NL
-    "vcvt.f32.u32 " PIXEL1_Q ", " PIXEL1_Q NL
-    "vmovl.u8 " PIXEL2_Q ", " PIXEL2_D0 NL
-    "vmovl.u16 " PIXEL2_Q ", " PIXEL2_D0 NL
-    "vcvt.f32.u32 " PIXEL2_Q ", " PIXEL2_Q NL
-
-    "vmul.f32 " TEMP_Q ", " PIXEL1_Q ", " PIXEL2_Q NL
-    "vmul.f32 " TEMP_Q ", " TEMP_Q ", " K1_Q NL
-    "vmla.f32 " TEMP_Q ", " PIXEL1_Q ", " K2_Q NL
-    "vmla.f32 " TEMP_Q ", " PIXEL2_Q ", " K3_Q NL
-    "vadd.f32 " TEMP_Q ", " K4_Q NL
-
-    // Convert result to uint so negative values are converted to zero.
-    "vcvt.u32.f32 " TEMP_Q ", " TEMP_Q NL
-    "vmin.u32 " TEMP_Q ", " TEMP_Q ", " BYTEMAX_Q NL
-    "vmovn.u32 " TEMP_D0 ", " TEMP_Q NL
-    "vmovn.u16 " TEMP_D0 ", " TEMP_Q NL
-
-    "vst1.u32 " TEMP_D00 ", [" DEST_R "], " NEXTPIXEL_R NL
-
-    "cmp " SOURCE_R ", " END_R NL
-    "bcc .arithmeticK1IsNonZero" NL
-    "bx lr" NL
-
-".arithmeticK1IsZero:" NL
-
-    "vld1.u32 " PIXEL1_D00 ", [ " SOURCE_R "], " NEXTPIXEL_R NL
-    "vld1.u32 " PIXEL2_D00 ", [" DEST_R "]" NL
-
-    "vmovl.u8 " PIXEL1_Q ", " PIXEL1_D0 NL
-    "vmovl.u16 " PIXEL1_Q ", " PIXEL1_D0 NL
-    "vcvt.f32.u32 " PIXEL1_Q ", " PIXEL1_Q NL
-    "vmovl.u8 " PIXEL2_Q ", " PIXEL2_D0 NL
-    "vmovl.u16 " PIXEL2_Q ", " PIXEL2_D0 NL
-    "vcvt.f32.u32 " PIXEL2_Q ", " PIXEL2_Q NL
-
-    "vmul.f32 " TEMP_Q ", " PIXEL1_Q ", " K2_Q NL
-    "vmla.f32 " TEMP_Q ", " PIXEL2_Q ", " K3_Q NL
-    "vadd.f32 " TEMP_Q ", " K4_Q NL
-
-    // Convert result to uint so negative values are converted to zero.
-    "vcvt.u32.f32 " TEMP_Q ", " TEMP_Q NL
-    "vmin.u32 " TEMP_Q ", " TEMP_Q ", " BYTEMAX_Q NL
-    "vmovn.u32 " TEMP_D0 ", " TEMP_Q NL
-    "vmovn.u16 " TEMP_D0 ", " TEMP_Q NL
-
-    "vst1.u32 " TEMP_D00 ", [" DEST_R "], " NEXTPIXEL_R NL
-
-    "cmp " SOURCE_R ", " END_R NL
-    "bcc .arithmeticK1IsZero" NL
-    "bx lr" NL
-); // NOLINT
-
-} // namespace WebCore
-
-#endif // CPU(ARM_NEON) && COMPILER(GCC)
-
-#endif // ENABLE(FILTERS)
-
index 64cec1d..50f34b7 100644 (file)
 
 #include <wtf/Platform.h>
 
-#if ENABLE(FILTERS)
-#if CPU(ARM_NEON) && CPU(ARM_TRADITIONAL) && COMPILER(GCC)
+#if ENABLE(FILTERS) && HAVE(ARM_NEON_INTRINSICS)
 
 #include "FEComposite.h"
+#include <arm_neon.h>
 
 namespace WebCore {
 
-extern "C" {
-void neonDrawCompositeArithmetic(unsigned char* source, unsigned char* destination, unsigned pixelArrayLength, float* coefficients);
+template <int b1, int b4>
+inline void FEComposite::computeArithmeticPixelsNeon(unsigned char* source, unsigned char* destination,
+    unsigned pixelArrayLength, float k1, float k2, float k3, float k4)
+{
+    float32x4_t k1x4 = vdupq_n_f32(k1 / 255);
+    float32x4_t k2x4 = vdupq_n_f32(k2);
+    float32x4_t k3x4 = vdupq_n_f32(k3);
+    float32x4_t k4x4 = vdupq_n_f32(k4 * 255);
+    uint32x4_t max255 = vdupq_n_u32(255);
+
+    uint32_t* sourcePixel = reinterpret_cast<uint32_t*>(source);
+    uint32_t* destinationPixel = reinterpret_cast<uint32_t*>(destination);
+    uint32_t* destinationEndPixel = destinationPixel + (pixelArrayLength >> 2);
+
+    while (destinationPixel < destinationEndPixel) {
+        uint32x2_t temporary1 = vset_lane_u32(*sourcePixel, temporary1, 0);
+        uint16x4_t temporary2 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(temporary1)));
+        float32x4_t sourcePixelAsFloat = vcvtq_f32_u32(vmovl_u16(temporary2));
+
+        temporary1 = vset_lane_u32(*destinationPixel, temporary1, 0);
+        temporary2 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(temporary1)));
+        float32x4_t destinationPixelAsFloat = vcvtq_f32_u32(vmovl_u16(temporary2));
+
+        float32x4_t result = vmulq_f32(sourcePixelAsFloat, k2x4);
+        result = vmlaq_f32(result, destinationPixelAsFloat, k3x4);
+        if (b1)
+            result = vmlaq_f32(result, vmulq_f32(sourcePixelAsFloat, destinationPixelAsFloat), k1x4);
+        if (b4)
+            result = vaddq_f32(result, k4x4);
+
+        // Convert result to uint so negative values are converted to zero.
+        uint16x4_t temporary3 = vmovn_u32(vminq_u32(vcvtq_u32_f32(result), max255));
+        uint8x8_t temporary4 = vmovn_u16(vcombine_u16(temporary3, temporary3));
+        *destinationPixel++ = vget_lane_u32(vreinterpret_u32_u8(temporary4), 0);
+        ++sourcePixel;
+    }
 }
 
-inline void FEComposite::platformArithmeticNeon(unsigned char* source, unsigned char* destination, unsigned pixelArrayLength, float* coefficients)
+inline void FEComposite::platformArithmeticNeon(unsigned char* source, unsigned char* destination,
+    unsigned pixelArrayLength, float k1, float k2, float k3, float k4)
 {
-    neonDrawCompositeArithmetic(source, destination, pixelArrayLength, coefficients);
+    if (!k4) {
+        if (!k1) {
+            computeArithmeticPixelsNeon<0, 0>(source, destination, pixelArrayLength, k1, k2, k3, k4);
+            return;
+        }
+
+        computeArithmeticPixelsNeon<1, 0>(source, destination, pixelArrayLength, k1, k2, k3, k4);
+        return;
+    }
+
+    if (!k1) {
+        computeArithmeticPixelsNeon<0, 1>(source, destination, pixelArrayLength, k1, k2, k3, k4);
+        return;
+    }
+    computeArithmeticPixelsNeon<1, 1>(source, destination, pixelArrayLength, k1, k2, k3, k4);
 }
 
 } // namespace WebCore
 
-#endif // CPU(ARM_NEON) && COMPILER(GCC)
-#endif // ENABLE(FILTERS)
+#endif // ENABLE(FILTERS) && HAVE(ARM_NEON_INTRINSICS)
 
 #endif // FECompositeArithmeticNEON_h