From: Ian Rickards Date: Tue, 21 Apr 2009 21:32:36 +0000 (-0400) Subject: Add support for ARM NEON fast paths X-Git-Tag: pixman-0.15.6~54 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f2af00bf02dcf3e7e27ac3e035d41c387fc9400b;p=platform%2Fupstream%2Fpixman.git Add support for ARM NEON fast paths Currently disabled --- diff --git a/pixman/pixman-arm-detect-win32.asm b/pixman/pixman-arm-detect-win32.asm index dfe363e..8f5d5eb 100644 --- a/pixman/pixman-arm-detect-win32.asm +++ b/pixman/pixman-arm-detect-win32.asm @@ -1,9 +1,20 @@ area pixman_msvc, code, readonly - export pixman_msvc_try_armv6_op + export pixman_msvc_try_arm_simd_op -pixman_msvc_try_armv6_op - uqadd8 r0,r0,r1 +pixman_msvc_try_arm_simd_op + ;; I don't think the msvc arm asm knows how to do SIMD insns + ;; uqadd8 r3,r3,r3 + dcd 0xe6633f93 + mov pc,lr + endp + + export pixman_msvc_try_arm_neon_op + +pixman_msvc_try_arm_neon_op + ;; I don't think the msvc arm asm knows how to do NEON insns + ;; veor d0,d0,d0 + dcd 0xf3000110 mov pc,lr endp diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c new file mode 100644 index 0000000..56fb247 --- /dev/null +++ b/pixman/pixman-arm-neon.c @@ -0,0 +1,1387 @@ +/* + * Copyright © 2009 ARM Ltd + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of ARM Ltd not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. ARM Ltd makes no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ *
+ * Author: Ian Rickards (ian.rickards@arm.com)
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-arm-neon.h"
+
+#include <arm_neon.h>
+
+
+#if !defined(__ARMCC_VERSION) && !defined(FORCE_NO_NEON_INLINE_ASM)
+// [both armcc & gcc set __GNUC__]
+// Use GNU style inline asm on gcc, for best performance
+// Use intrinsics on armcc
+// This switch determines if any GNU style inline asm is allowed
+#define USE_NEON_INLINE_ASM
+#endif
+
+
+static force_inline uint8x8x4_t unpack0565(uint16x8_t rgb)
+{
+    uint16x8_t gb, b;
+    uint8x8x4_t res;
+
+    res.val[3] = vdup_n_u8(0);
+    gb = vshrq_n_u16(rgb, 5);
+    b = vshrq_n_u16(rgb, 5+6);
+    res.val[0] = vmovn_u16(rgb);  // get low 5 bits
+    res.val[1] = vmovn_u16(gb);   // get mid 6 bits
+    res.val[2] = vmovn_u16(b);    // get top 5 bits
+
+    res.val[0] = vshl_n_u8(res.val[0], 3); // shift to top
+    res.val[1] = vshl_n_u8(res.val[1], 2); // shift to top
+    res.val[2] = vshl_n_u8(res.val[2], 3); // shift to top
+
+    res.val[0] = vsri_n_u8(res.val[0], res.val[0], 5);
+    res.val[1] = vsri_n_u8(res.val[1], res.val[1], 6);
+    res.val[2] = vsri_n_u8(res.val[2], res.val[2], 5);
+
+    return res;
+}
+
+static force_inline uint16x8_t pack0565(uint8x8x4_t s)
+{
+    uint16x8_t rgb, val_g, val_r;
+
+    rgb = vshll_n_u8(s.val[2],8);
+    val_g = vshll_n_u8(s.val[1],8);
+    val_r = vshll_n_u8(s.val[0],8);
+    rgb = vsriq_n_u16(rgb, val_g, 5);
+    rgb = vsriq_n_u16(rgb, val_r, 5+6);
+
+    return rgb;
+}
+
+static force_inline uint8x8_t neon2mul(uint8x8_t x, uint8x8_t alpha)
+{
+    uint16x8_t tmp,tmp2;
+    uint8x8_t res;
+
+    tmp = vmull_u8(x,alpha);
+    tmp2 = vrshrq_n_u16(tmp,8);
+    res = vraddhn_u16(tmp,tmp2);
+
+    return res;
+}
+
+static force_inline uint8x8x4_t neon8mul(uint8x8x4_t x, uint8x8_t alpha)
+{
+    uint16x8x4_t tmp;
+    uint8x8x4_t res;
+    uint16x8_t qtmp1,qtmp2;
+
+    tmp.val[0] = vmull_u8(x.val[0],alpha);
+    tmp.val[1] = vmull_u8(x.val[1],alpha);
+    tmp.val[2] = vmull_u8(x.val[2],alpha);
+    tmp.val[3] = vmull_u8(x.val[3],alpha);
+
+    qtmp1 = vrshrq_n_u16(tmp.val[0],8);
+    qtmp2 = vrshrq_n_u16(tmp.val[1],8);
+    res.val[0] = vraddhn_u16(tmp.val[0],qtmp1);
+    qtmp1 = vrshrq_n_u16(tmp.val[2],8);
+    res.val[1] = vraddhn_u16(tmp.val[1],qtmp2);
+    qtmp2 = vrshrq_n_u16(tmp.val[3],8);
+    res.val[2] = vraddhn_u16(tmp.val[2],qtmp1);
+    res.val[3] = vraddhn_u16(tmp.val[3],qtmp2);
+
+    return res;
+}
+
+static force_inline uint8x8x4_t neon8qadd(uint8x8x4_t x, uint8x8x4_t y)
+{
+    uint8x8x4_t res;
+
+    res.val[0] = vqadd_u8(x.val[0],y.val[0]);
+    res.val[1] = vqadd_u8(x.val[1],y.val[1]);
+    res.val[2] = vqadd_u8(x.val[2],y.val[2]);
+    res.val[3] = vqadd_u8(x.val[3],y.val[3]);
+
+    return res;
+}
+
+
+void
+fbCompositeSrcAdd_8000x8000neon (pixman_op_t op,
+                                 pixman_image_t * pSrc,
+                                 pixman_image_t * pMask,
+                                 pixman_image_t * pDst,
+                                 int16_t      xSrc,
+                                 int16_t      ySrc,
+                                 int16_t      xMask,
+                                 int16_t      yMask,
+                                 int16_t      xDst,
+                                 int16_t      yDst,
+                                 uint16_t     width,
+                                 uint16_t     height)
+{
+    uint8_t     *dstLine, *dst;
+    uint8_t     *srcLine, *src;
+    int dstStride, srcStride;
+    uint16_t    w;
+
+    fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
+    fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
+
+    if (width>=8)
+    {
+        // Use overlapping 8-pixel method
+        while (height--)
+        {
+            dst = dstLine;
+            dstLine += dstStride;
+            src = srcLine;
+            srcLine += srcStride;
+            w = width;
+
+            uint8_t *keep_dst;
+
+#ifndef USE_NEON_INLINE_ASM
+            uint8x8_t sval,dval,temp;
+
+            sval = vld1_u8((void*)src);
+            dval = vld1_u8((void*)dst);
+            keep_dst = dst;
+
+            temp = vqadd_u8(dval,sval);
+
+            src += (w & 7);
+            dst += (w & 7);
+            w -= (w & 7);
+
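+            /* Overlapping-vector trick: the first 8-pixel result (temp)
+             * covers pixels [0,8).  Advancing src/dst by (w & 7) makes the
+             * remaining count a multiple of 8, so the next vector re-covers
+             * the last 8-(w&7) of those pixels.  Each store is deferred by
+             * one iteration (via keep_dst), so the overlapped destination
+             * bytes are always re-loaded before the previous result lands on
+             * them; the overlapped pixels are simply computed twice with
+             * identical inputs. */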
while (w) + { + sval = vld1_u8((void*)src); + dval = vld1_u8((void*)dst); + + vst1_u8((void*)keep_dst,temp); + keep_dst = dst; + + temp = vqadd_u8(dval,sval); + + src+=8; + dst+=8; + w-=8; + } + vst1_u8((void*)keep_dst,temp); +#else + asm volatile ( +// avoid using d8-d15 (q4-q7) aapcs callee-save registers + "vld1.8 {d0}, [%[src]]\n\t" + "vld1.8 {d4}, [%[dst]]\n\t" + "mov %[keep_dst], %[dst]\n\t" + + "and ip, %[w], #7\n\t" + "add %[src], %[src], ip\n\t" + "add %[dst], %[dst], ip\n\t" + "subs %[w], %[w], ip\n\t" + "b 9f\n\t" +// LOOP + "2:\n\t" + "vld1.8 {d0}, [%[src]]!\n\t" + "vld1.8 {d4}, [%[dst]]!\n\t" + "vst1.8 {d20}, [%[keep_dst]]\n\t" + "sub %[keep_dst], %[dst], #8\n\t" + "subs %[w], %[w], #8\n\t" + "9:\n\t" + "vqadd.u8 d20, d0, d4\n\t" + + "bne 2b\n\t" + + "1:\n\t" + "vst1.8 {d20}, [%[keep_dst]]\n\t" + + : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "+r" (keep_dst) + : + : "ip", "cc", "memory", "d0","d4", + "d20" + ); +#endif + } + } + else + { + while (height--) + { + dst = dstLine; + dstLine += dstStride; + src = srcLine; + srcLine += srcStride; + w = width; + uint8x8_t sval, dval; + uint8_t *dst4, *dst2; + + if (w&4) + { + sval = vreinterpret_u8_u32(vld1_lane_u32((void*)src,vreinterpret_u32_u8(sval),1)); + dval = vreinterpret_u8_u32(vld1_lane_u32((void*)dst,vreinterpret_u32_u8(dval),1)); + dst4=dst; + src+=4; + dst+=4; + } + if (w&2) + { + sval = vreinterpret_u8_u16(vld1_lane_u16((void*)src,vreinterpret_u16_u8(sval),1)); + dval = vreinterpret_u8_u16(vld1_lane_u16((void*)dst,vreinterpret_u16_u8(dval),1)); + dst2=dst; + src+=2; + dst+=2; + } + if (w&1) + { + sval = vld1_lane_u8((void*)src,sval,1); + dval = vld1_lane_u8((void*)dst,dval,1); + } + + dval = vqadd_u8(dval,sval); + + if (w&1) + vst1_lane_u8((void*)dst,dval,1); + if (w&2) + vst1_lane_u16((void*)dst2,vreinterpret_u16_u8(dval),1); + if (w&4) + vst1_lane_u32((void*)dst4,vreinterpret_u32_u8(dval),1); + } + } +} + + +void +fbCompositeSrc_8888x8888neon (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint32_t *dstLine, *dst; + uint32_t *srcLine, *src; + int dstStride, srcStride; + uint32_t w; + + fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); + fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + + if (width>=8) + { + // Use overlapping 8-pixel method + while (height--) + { + dst = dstLine; + dstLine += dstStride; + src = srcLine; + srcLine += srcStride; + w = width; + + uint32_t *keep_dst; + +#ifndef USE_NEON_INLINE_ASM + uint8x8x4_t sval,dval,temp; + + sval = vld4_u8((void*)src); + dval = vld4_u8((void*)dst); + keep_dst = dst; + + temp = neon8mul(dval,vmvn_u8(sval.val[3])); + temp = neon8qadd(sval,temp); + + src += (w & 7); + dst += (w & 7); + w -= (w & 7); + + while (w) + { + sval = vld4_u8((void*)src); + dval = vld4_u8((void*)dst); + + vst4_u8((void*)keep_dst,temp); + keep_dst = dst; + + temp = neon8mul(dval,vmvn_u8(sval.val[3])); + temp = neon8qadd(sval,temp); + + src+=8; + dst+=8; + w-=8; + } + vst4_u8((void*)keep_dst,temp); +#else + asm volatile ( +// avoid using d8-d15 (q4-q7) aapcs callee-save registers + "vld4.8 {d0-d3}, [%[src]]\n\t" + "vld4.8 {d4-d7}, [%[dst]]\n\t" + "mov %[keep_dst], %[dst]\n\t" + + "and ip, %[w], #7\n\t" + "add %[src], %[src], ip, LSL#2\n\t" + "add %[dst], %[dst], ip, LSL#2\n\t" + "subs %[w], %[w], ip\n\t" + "b 9f\n\t" +// LOOP + "2:\n\t" + "vld4.8 {d0-d3}, 
[%[src]]!\n\t"
+                        "vld4.8  {d4-d7}, [%[dst]]!\n\t"
+                        "vst4.8  {d20-d23}, [%[keep_dst]]\n\t"
+                        "sub     %[keep_dst], %[dst], #8*4\n\t"
+                        "subs    %[w], %[w], #8\n\t"
+                        "9:\n\t"
+                        "vmvn.8  d31, d3\n\t"
+                        "vmull.u8 q10, d31, d4\n\t"
+                        "vmull.u8 q11, d31, d5\n\t"
+                        "vmull.u8 q12, d31, d6\n\t"
+                        "vmull.u8 q13, d31, d7\n\t"
+                        "vrshr.u16 q8, q10, #8\n\t"
+                        "vrshr.u16 q9, q11, #8\n\t"
+                        "vraddhn.u16 d20, q10, q8\n\t"
+                        "vraddhn.u16 d21, q11, q9\n\t"
+                        "vrshr.u16 q8, q12, #8\n\t"
+                        "vrshr.u16 q9, q13, #8\n\t"
+                        "vraddhn.u16 d22, q12, q8\n\t"
+                        "vraddhn.u16 d23, q13, q9\n\t"
+// result in d20-d23
+                        "vqadd.u8 d20, d0, d20\n\t"
+                        "vqadd.u8 d21, d1, d21\n\t"
+                        "vqadd.u8 d22, d2, d22\n\t"
+                        "vqadd.u8 d23, d3, d23\n\t"
+
+                        "bne 2b\n\t"
+
+                        "1:\n\t"
+                        "vst4.8  {d20-d23}, [%[keep_dst]]\n\t"
+
+                        : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "+r" (keep_dst)
+                        :
+                        : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
+                          "d16","d17","d18","d19","d20","d21","d22","d23"
+                        );
+#endif
+        }
+    }
+    else
+    {
+        uint8x8_t alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL));
+
+        // Handle width<8
+        while (height--)
+        {
+            dst = dstLine;
+            dstLine += dstStride;
+            src = srcLine;
+            srcLine += srcStride;
+            w = width;
+
+            while (w>=2)
+            {
+                uint8x8_t sval,dval;
+
+                /* two 32-bit pixels packed into D-reg; ad-hoc vectorization */
+                sval = vreinterpret_u8_u32(vld1_u32((void*)src));
+                dval = vreinterpret_u8_u32(vld1_u32((void*)dst));
+                dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval),alpha_selector));
+                vst1_u8((void*)dst,vqadd_u8(sval,dval));
+
+                src+=2;
+                dst+=2;
+                w-=2;
+            }
+
+            if (w)
+            {
+                uint8x8_t sval,dval;
+
+                /* single 32-bit pixel in lane 0 */
+                sval = vreinterpret_u8_u32(vld1_dup_u32((void*)src));  // only interested in lane 0
+                dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst));  // only interested in lane 0
+                dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval),alpha_selector));
+                vst1_lane_u32((void*)dst,vreinterpret_u32_u8(vqadd_u8(sval,dval)),0);
+            }
+        }
+    }
+}
+
+
+
+void
+fbCompositeSrc_x888x0565neon (pixman_op_t op,
+                              pixman_image_t * pSrc,
+                              pixman_image_t * pMask,
+                              pixman_image_t * pDst,
+                              int16_t      xSrc,
+                              int16_t      ySrc,
+                              int16_t      xMask,
+                              int16_t      yMask,
+                              int16_t      xDst,
+                              int16_t      yDst,
+                              uint16_t     width,
+                              uint16_t     height)
+{
+    uint16_t    *dstLine, *dst;
+    uint32_t    *srcLine, *src;
+    int dstStride, srcStride;
+    uint32_t    w;
+
+    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
+    fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+
+    if (width>=8)
+    {
+        while (height--)
+        {
+            dst = dstLine;
+            dstLine += dstStride;
+            src = srcLine;
+            srcLine += srcStride;
+            w = width;
+
+            do {
+                while (w>=8)
+                {
+#ifndef USE_NEON_INLINE_ASM
+                    vst1q_u16(dst, pack0565(vld4_u8((void*)src)));
+#else
+                    asm volatile (
+                    "vld4.8       {d4-d7}, [%[src]]\n\t"
+                    "vshll.u8     q0, d6, #8\n\t"
+                    "vshll.u8     q1, d5, #8\n\t"
+                    "vsri.u16     q0, q1, #5\t\n"
+                    "vshll.u8     q1, d4, #8\n\t"
+                    "vsri.u16     q0, q1, #11\t\n"
+                    "vst1.16      {q0}, [%[dst]]\n\t"
+                    :
+                    : [dst] "r" (dst), [src] "r" (src)
+                    : "memory", "d0","d1","d2","d3","d4","d5","d6","d7"
+                    );
+#endif
+                    src+=8;
+                    dst+=8;
+                    w-=8;
+                }
+                if (w != 0)
+                {
+                    src -= (8-w);
+                    dst -= (8-w);
+                    w = 8;  // do another vector
+                }
+            } while (w!=0);
+        }
+    }
+    else
+    {
+        // Handle width<8
+        while (height--)
+        {
+            dst = dstLine;
+            dstLine += dstStride;
+            src = srcLine;
+            srcLine += srcStride;
+            w = width;
+
+            while (w>=2)
+            {
+                uint32x2_t sval, rgb, g, b;
+                sval = vld1_u32(src);
+                rgb = vshr_n_u32(sval,8-5);    // r (5 bits)
+                g = vshr_n_u32(sval,8+8-6);    // g to bottom byte
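+                /* Per 32-bit x8r8g8b8 lane: the >>3 above leaves blue's top
+                 * 5 bits at bits 4:0; the vsli below inserts green's top 6
+                 * bits at bit 5 and red's top 5 bits at bit 11, so u16 lanes
+                 * 0 and 2 each hold a finished r5g6b5 pixel. */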
rgb = vsli_n_u32(rgb, g, 5); + b = vshr_n_u32(sval,8+8+8-5); // b to bottom byte + rgb = vsli_n_u32(rgb, b, 11); + vst1_lane_u16(dst++,vreinterpret_u16_u32(rgb),0); + vst1_lane_u16(dst++,vreinterpret_u16_u32(rgb),2); + src+=2; + w-=2; + } + if (w) + { + uint32x2_t sval, rgb, g, b; + sval = vld1_dup_u32(src); + rgb = vshr_n_u32(sval,8-5); // r (5 bits) + g = vshr_n_u32(sval,8+8-6); // g to bottom byte + rgb = vsli_n_u32(rgb, g, 5); + b = vshr_n_u32(sval,8+8+8-5); // b to bottom byte + rgb = vsli_n_u32(rgb, b, 11); + vst1_lane_u16(dst++,vreinterpret_u16_u32(rgb),0); + } + } + } +} + + +void +fbCompositeSrc_8888x8x8888neon (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint32_t *dstLine, *dst; + uint32_t *srcLine, *src; + uint32_t mask; + int dstStride, srcStride; + uint32_t w; + uint8x8_t mask_alpha; + + fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); + fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + + fbComposeGetSolid (pMask, mask, pDst->bits.format); + mask_alpha = vdup_n_u8((mask) >> 24); + + if (width>=8) + { + // Use overlapping 8-pixel method + while (height--) + { + dst = dstLine; + dstLine += dstStride; + src = srcLine; + srcLine += srcStride; + w = width; + + uint32_t *keep_dst; + +#ifndef USE_NEON_INLINE_ASM + uint8x8x4_t sval,dval,temp; + + sval = vld4_u8((void*)src); + dval = vld4_u8((void*)dst); + keep_dst = dst; + + sval = neon8mul(sval,mask_alpha); + temp = neon8mul(dval,vmvn_u8(sval.val[3])); + temp = neon8qadd(sval,temp); + + src += (w & 7); + dst += (w & 7); + w -= (w & 7); + + while (w) + { + sval = vld4_u8((void*)src); + dval = vld4_u8((void*)dst); + + vst4_u8((void*)keep_dst,temp); + keep_dst = dst; + + sval = neon8mul(sval,mask_alpha); + temp = neon8mul(dval,vmvn_u8(sval.val[3])); + temp = neon8qadd(sval,temp); + + src+=8; + dst+=8; + w-=8; + } + vst4_u8((void*)keep_dst,temp); +#else + asm volatile ( +// avoid using d8-d15 (q4-q7) aapcs callee-save registers + "vdup.32 d30, %[mask]\n\t" + "vdup.8 d30, d30[3]\n\t" + + "vld4.8 {d0-d3}, [%[src]]\n\t" + "vld4.8 {d4-d7}, [%[dst]]\n\t" + "mov %[keep_dst], %[dst]\n\t" + + "and ip, %[w], #7\n\t" + "add %[src], %[src], ip, LSL#2\n\t" + "add %[dst], %[dst], ip, LSL#2\n\t" + "subs %[w], %[w], ip\n\t" + "b 9f\n\t" +// LOOP + "2:\n\t" + "vld4.8 {d0-d3}, [%[src]]!\n\t" + "vld4.8 {d4-d7}, [%[dst]]!\n\t" + "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" + "sub %[keep_dst], %[dst], #8*4\n\t" + "subs %[w], %[w], #8\n\t" + + "9:\n\t" + "vmull.u8 q10, d30, d0\n\t" + "vmull.u8 q11, d30, d1\n\t" + "vmull.u8 q12, d30, d2\n\t" + "vmull.u8 q13, d30, d3\n\t" + "vrshr.u16 q8, q10, #8\n\t" + "vrshr.u16 q9, q11, #8\n\t" + "vraddhn.u16 d0, q10, q8\n\t" + "vraddhn.u16 d1, q11, q9\n\t" + "vrshr.u16 q9, q13, #8\n\t" + "vrshr.u16 q8, q12, #8\n\t" + "vraddhn.u16 d3, q13, q9\n\t" + "vraddhn.u16 d2, q12, q8\n\t" + + "vmvn.8 d31, d3\n\t" + "vmull.u8 q10, d31, d4\n\t" + "vmull.u8 q11, d31, d5\n\t" + "vmull.u8 q12, d31, d6\n\t" + "vmull.u8 q13, d31, d7\n\t" + "vrshr.u16 q8, q10, #8\n\t" + "vrshr.u16 q9, q11, #8\n\t" + "vraddhn.u16 d20, q10, q8\n\t" + "vrshr.u16 q8, q12, #8\n\t" + "vraddhn.u16 d21, q11, q9\n\t" + "vrshr.u16 q9, q13, #8\n\t" + "vraddhn.u16 d22, q12, q8\n\t" + "vraddhn.u16 d23, q13, q9\n\t" +// result in d20-d23 + "vqadd.u8 d20, d0, d20\n\t" + "vqadd.u8 d21, d1, d21\n\t" + "vqadd.u8 d22, d2, d22\n\t" + "vqadd.u8 d23, d3, 
d23\n\t" + + "bne 2b\n\t" + + "1:\n\t" + "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" + + : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "+r" (keep_dst) + : [mask] "r" (mask) + : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", + "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27", + "d30","d31" + ); +#endif + } + } + else + { + uint8x8_t alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL)); + + // Handle width<8 + while (height--) + { + dst = dstLine; + dstLine += dstStride; + src = srcLine; + srcLine += srcStride; + w = width; + + while (w>=2) + { + uint8x8_t sval,dval; + + sval = vreinterpret_u8_u32(vld1_u32((void*)src)); + dval = vreinterpret_u8_u32(vld1_u32((void*)dst)); + + /* sval * const alpha_mul */ + sval = neon2mul(sval,mask_alpha); + + /* dval * 255-(src alpha) */ + dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval), alpha_selector)); + + vst1_u8((void*)dst,vqadd_u8(sval,dval)); + + src+=2; + dst+=2; + w-=2; + } + + if (w) + { + uint8x8_t sval,dval; + + sval = vreinterpret_u8_u32(vld1_dup_u32((void*)src)); + dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst)); + + /* sval * const alpha_mul */ + sval = neon2mul(sval,mask_alpha); + + /* dval * 255-(src alpha) */ + dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval), alpha_selector)); + + vst1_lane_u32((void*)dst,vreinterpret_u32_u8(vqadd_u8(sval,dval)),0); + } + } + } +} + + + +void +fbCompositeSolidMask_nx8x0565neon (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint32_t src, srca; + uint16_t *dstLine, *dst; + uint8_t *maskLine, *mask; + int dstStride, maskStride; + uint32_t w; + uint8x8_t sval2; + uint8x8x4_t sval8; + + fbComposeGetSolid(pSrc, src, pDst->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + sval2=vreinterpret_u8_u32(vdup_n_u32(src)); + sval8.val[0]=vdup_lane_u8(sval2,0); + sval8.val[1]=vdup_lane_u8(sval2,1); + sval8.val[2]=vdup_lane_u8(sval2,2); + sval8.val[3]=vdup_lane_u8(sval2,3); + + fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); + fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + + if (width>=8) + { + // Use overlapping 8-pixel method, modified to avoid rewritten dest being reused + while (height--) + { + uint16_t *keep_dst; + + dst = dstLine; + dstLine += dstStride; + mask = maskLine; + maskLine += maskStride; + w = width; + +#ifndef USE_NEON_INLINE_ASM + uint8x8_t alpha; + uint16x8_t dval, temp; + uint8x8x4_t sval8temp; + + alpha = vld1_u8((void*)mask); + dval = vld1q_u16((void*)dst); + keep_dst = dst; + + sval8temp = neon8mul(sval8,alpha); + temp = pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3])))); + + mask += (w & 7); + dst += (w & 7); + w -= (w & 7); + + while (w) + { + dval = vld1q_u16((void*)dst); + alpha = vld1_u8((void*)mask); + + vst1q_u16((void*)keep_dst,temp); + keep_dst = dst; + + sval8temp = neon8mul(sval8,alpha); + temp = pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3])))); + + mask+=8; + dst+=8; + w-=8; + } + vst1q_u16((void*)keep_dst,temp); +#else + asm volatile ( + "vdup.32 d0, %[src]\n\t" + "vdup.8 d1, d0[1]\n\t" + "vdup.8 d2, d0[2]\n\t" + "vdup.8 d3, d0[3]\n\t" + "vdup.8 d0, d0[0]\n\t" + + "vld1.8 {q12}, [%[dst]]\n\t" + "vld1.8 {d31}, [%[mask]]\n\t" + "mov %[keep_dst], %[dst]\n\t" + + "and ip, %[w], #7\n\t" + "add %[mask], %[mask], 
ip\n\t" + "add %[dst], %[dst], ip, LSL#1\n\t" + "subs %[w], %[w], ip\n\t" + "b 9f\n\t" +// LOOP + "2:\n\t" + + "vld1.16 {q12}, [%[dst]]!\n\t" + "vld1.8 {d31}, [%[mask]]!\n\t" + "vst1.16 {q10}, [%[keep_dst]]\n\t" + "sub %[keep_dst], %[dst], #8*2\n\t" + "subs %[w], %[w], #8\n\t" + "9:\n\t" +// expand 0565 q12 to 8888 {d4-d7} + "vmovn.u16 d4, q12\t\n" + "vshr.u16 q11, q12, #5\t\n" + "vshr.u16 q10, q12, #6+5\t\n" + "vmovn.u16 d5, q11\t\n" + "vmovn.u16 d6, q10\t\n" + "vshl.u8 d4, d4, #3\t\n" + "vshl.u8 d5, d5, #2\t\n" + "vshl.u8 d6, d6, #3\t\n" + "vsri.u8 d4, d4, #5\t\n" + "vsri.u8 d5, d5, #6\t\n" + "vsri.u8 d6, d6, #5\t\n" + + "vmull.u8 q10, d31, d0\n\t" + "vmull.u8 q11, d31, d1\n\t" + "vmull.u8 q12, d31, d2\n\t" + "vmull.u8 q13, d31, d3\n\t" + "vrshr.u16 q8, q10, #8\n\t" + "vrshr.u16 q9, q11, #8\n\t" + "vraddhn.u16 d20, q10, q8\n\t" + "vraddhn.u16 d21, q11, q9\n\t" + "vrshr.u16 q9, q13, #8\n\t" + "vrshr.u16 q8, q12, #8\n\t" + "vraddhn.u16 d23, q13, q9\n\t" + "vraddhn.u16 d22, q12, q8\n\t" + +// duplicate in 4/2/1 & 8pix vsns + "vmvn.8 d30, d23\n\t" + "vmull.u8 q14, d30, d6\n\t" + "vmull.u8 q13, d30, d5\n\t" + "vmull.u8 q12, d30, d4\n\t" + "vrshr.u16 q8, q14, #8\n\t" + "vrshr.u16 q9, q13, #8\n\t" + "vraddhn.u16 d6, q14, q8\n\t" + "vrshr.u16 q8, q12, #8\n\t" + "vraddhn.u16 d5, q13, q9\n\t" + "vqadd.u8 d6, d6, d22\n\t" // moved up + "vraddhn.u16 d4, q12, q8\n\t" +// intentionally don't calculate alpha +// result in d4-d6 + +// "vqadd.u8 d6, d6, d22\n\t" ** moved up + "vqadd.u8 d5, d5, d21\n\t" + "vqadd.u8 d4, d4, d20\n\t" + +// pack 8888 {d20-d23} to 0565 q10 + "vshll.u8 q10, d6, #8\n\t" + "vshll.u8 q3, d5, #8\n\t" + "vshll.u8 q2, d4, #8\n\t" + "vsri.u16 q10, q3, #5\t\n" + "vsri.u16 q10, q2, #11\t\n" + + "bne 2b\n\t" + + "1:\n\t" + "vst1.16 {q10}, [%[keep_dst]]\n\t" + + : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "+r" (keep_dst) + : [src] "r" (src) + : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", + "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", + "d30","d31" + ); +#endif + } + } + else + { + while (height--) + { + void *dst4, *dst2; + + dst = dstLine; + dstLine += dstStride; + mask = maskLine; + maskLine += maskStride; + w = width; + + +#ifndef USE_NEON_INLINE_ASM + uint8x8_t alpha; + uint16x8_t dval, temp; + uint8x8x4_t sval8temp; + + if (w&4) + { + alpha = vreinterpret_u8_u32(vld1_lane_u32((void*)mask,vreinterpret_u32_u8(alpha),1)); + dval = vreinterpretq_u16_u64(vld1q_lane_u64((void*)dst,vreinterpretq_u64_u16(dval),1)); + dst4=dst; + mask+=4; + dst+=4; + } + if (w&2) + { + alpha = vreinterpret_u8_u16(vld1_lane_u16((void*)mask,vreinterpret_u16_u8(alpha),1)); + dval = vreinterpretq_u16_u32(vld1q_lane_u32((void*)dst,vreinterpretq_u32_u16(dval),1)); + dst2=dst; + mask+=2; + dst+=2; + } + if (w&1) + { + alpha = vld1_lane_u8((void*)mask,alpha,1); + dval = vld1q_lane_u16((void*)dst,dval,1); + } + + sval8temp = neon8mul(sval8,alpha); + temp = pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3])))); + + if (w&1) + vst1q_lane_u16((void*)dst,temp,1); + if (w&2) + vst1q_lane_u32((void*)dst2,vreinterpretq_u32_u16(temp),1); + if (w&4) + vst1q_lane_u64((void*)dst4,vreinterpretq_u64_u16(temp),1); +#else + asm volatile ( + "vdup.32 d0, %[src]\n\t" + "vdup.8 d1, d0[1]\n\t" + "vdup.8 d2, d0[2]\n\t" + "vdup.8 d3, d0[3]\n\t" + "vdup.8 d0, d0[0]\n\t" + + "tst %[w], #4\t\n" + "beq skip_load4\t\n" + + "vld1.64 {d25}, [%[dst]]\n\t" + "vld1.32 {d31[1]}, [%[mask]]\n\t" + "mov %[dst4], %[dst]\t\n" + "add %[mask], 
%[mask], #4\t\n" + "add %[dst], %[dst], #4*2\t\n" + + "skip_load4:\t\n" + "tst %[w], #2\t\n" + "beq skip_load2\t\n" + "vld1.32 {d24[1]}, [%[dst]]\n\t" + "vld1.16 {d31[1]}, [%[mask]]\n\t" + "mov %[dst2], %[dst]\t\n" + "add %[mask], %[mask], #2\t\n" + "add %[dst], %[dst], #2*2\t\n" + + "skip_load2:\t\n" + "tst %[w], #1\t\n" + "beq skip_load1\t\n" + "vld1.16 {d24[1]}, [%[dst]]\n\t" + "vld1.8 {d31[1]}, [%[mask]]\n\t" + + "skip_load1:\t\n" +// expand 0565 q12 to 8888 {d4-d7} + "vmovn.u16 d4, q12\t\n" + "vshr.u16 q11, q12, #5\t\n" + "vshr.u16 q10, q12, #6+5\t\n" + "vmovn.u16 d5, q11\t\n" + "vmovn.u16 d6, q10\t\n" + "vshl.u8 d4, d4, #3\t\n" + "vshl.u8 d5, d5, #2\t\n" + "vshl.u8 d6, d6, #3\t\n" + "vsri.u8 d4, d4, #5\t\n" + "vsri.u8 d5, d5, #6\t\n" + "vsri.u8 d6, d6, #5\t\n" + + "vmull.u8 q10, d31, d0\n\t" + "vmull.u8 q11, d31, d1\n\t" + "vmull.u8 q12, d31, d2\n\t" + "vmull.u8 q13, d31, d3\n\t" + "vrshr.u16 q8, q10, #8\n\t" + "vrshr.u16 q9, q11, #8\n\t" + "vraddhn.u16 d20, q10, q8\n\t" + "vraddhn.u16 d21, q11, q9\n\t" + "vrshr.u16 q9, q13, #8\n\t" + "vrshr.u16 q8, q12, #8\n\t" + "vraddhn.u16 d23, q13, q9\n\t" + "vraddhn.u16 d22, q12, q8\n\t" + +// duplicate in 4/2/1 & 8pix vsns + "vmvn.8 d30, d23\n\t" + "vmull.u8 q14, d30, d6\n\t" + "vmull.u8 q13, d30, d5\n\t" + "vmull.u8 q12, d30, d4\n\t" + "vrshr.u16 q8, q14, #8\n\t" + "vrshr.u16 q9, q13, #8\n\t" + "vraddhn.u16 d6, q14, q8\n\t" + "vrshr.u16 q8, q12, #8\n\t" + "vraddhn.u16 d5, q13, q9\n\t" + "vqadd.u8 d6, d6, d22\n\t" // moved up + "vraddhn.u16 d4, q12, q8\n\t" +// intentionally don't calculate alpha +// result in d4-d6 + +// "vqadd.u8 d6, d6, d22\n\t" ** moved up + "vqadd.u8 d5, d5, d21\n\t" + "vqadd.u8 d4, d4, d20\n\t" + +// pack 8888 {d20-d23} to 0565 q10 + "vshll.u8 q10, d6, #8\n\t" + "vshll.u8 q3, d5, #8\n\t" + "vshll.u8 q2, d4, #8\n\t" + "vsri.u16 q10, q3, #5\t\n" + "vsri.u16 q10, q2, #11\t\n" + + "tst %[w], #1\n\t" + "beq skip_store1\t\n" + "vst1.16 {d20[1]}, [%[dst]]\t\n" + "skip_store1:\t\n" + "tst %[w], #2\n\t" + "beq skip_store2\t\n" + "vst1.32 {d20[1]}, [%[dst2]]\t\n" + "skip_store2:\t\n" + "tst %[w], #4\n\t" + "beq skip_store4\t\n" + "vst1.16 {d21}, [%[dst4]]\t\n" + "skip_store4:\t\n" + + : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [dst4] "+r" (dst4), [dst2] "+r" (dst2) + : [src] "r" (src) + : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", + "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", + "d30","d31" + ); +#endif + } + } +} + + +void +fbCompositeSolidMask_nx8x8888neon (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint32_t src, srca; + uint32_t *dstLine, *dst; + uint8_t *maskLine, *mask; + int dstStride, maskStride; + uint32_t w; + uint8x8_t sval2; + uint8x8x4_t sval8; + uint8x8_t mask_selector=vreinterpret_u8_u64(vcreate_u64(0x0101010100000000ULL)); + uint8x8_t alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL)); + + fbComposeGetSolid(pSrc, src, pDst->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + sval2=vreinterpret_u8_u32(vdup_n_u32(src)); + sval8.val[0]=vdup_lane_u8(sval2,0); + sval8.val[1]=vdup_lane_u8(sval2,1); + sval8.val[2]=vdup_lane_u8(sval2,2); + sval8.val[3]=vdup_lane_u8(sval2,3); + + fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); + fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + + if 
(width>=8) + { + // Use overlapping 8-pixel method, modified to avoid rewritten dest being reused + while (height--) + { + uint32_t *keep_dst; + + dst = dstLine; + dstLine += dstStride; + mask = maskLine; + maskLine += maskStride; + w = width; + +#ifndef USE_NEON_INLINE_ASM + uint8x8_t alpha; + uint8x8x4_t dval, temp; + + alpha = vld1_u8((void*)mask); + dval = vld4_u8((void*)dst); + keep_dst = dst; + + temp = neon8mul(sval8,alpha); + dval = neon8mul(dval,vmvn_u8(temp.val[3])); + temp = neon8qadd(temp,dval); + + mask += (w & 7); + dst += (w & 7); + w -= (w & 7); + + while (w) + { + alpha = vld1_u8((void*)mask); + dval = vld4_u8((void*)dst); + + vst4_u8((void*)keep_dst,temp); + keep_dst = dst; + + temp = neon8mul(sval8,alpha); + dval = neon8mul(dval,vmvn_u8(temp.val[3])); + temp = neon8qadd(temp,dval); + + mask+=8; + dst+=8; + w-=8; + } + vst4_u8((void*)keep_dst,temp); +#else + asm volatile ( + "vdup.32 d0, %[src]\n\t" + "vdup.8 d1, d0[1]\n\t" + "vdup.8 d2, d0[2]\n\t" + "vdup.8 d3, d0[3]\n\t" + "vdup.8 d0, d0[0]\n\t" + + "vld4.8 {d4-d7}, [%[dst]]\n\t" + "vld1.8 {d31}, [%[mask]]\n\t" + "mov %[keep_dst], %[dst]\n\t" + + "and ip, %[w], #7\n\t" + "add %[mask], %[mask], ip\n\t" + "add %[dst], %[dst], ip, LSL#2\n\t" + "subs %[w], %[w], ip\n\t" + "b 9f\n\t" +// LOOP + "2:\n\t" + "vld4.8 {d4-d7}, [%[dst]]!\n\t" + "vld1.8 {d31}, [%[mask]]!\n\t" + "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" + "sub %[keep_dst], %[dst], #8*4\n\t" + "subs %[w], %[w], #8\n\t" + "9:\n\t" + + "vmull.u8 q10, d31, d0\n\t" + "vmull.u8 q11, d31, d1\n\t" + "vmull.u8 q12, d31, d2\n\t" + "vmull.u8 q13, d31, d3\n\t" + "vrshr.u16 q8, q10, #8\n\t" + "vrshr.u16 q9, q11, #8\n\t" + "vraddhn.u16 d20, q10, q8\n\t" + "vraddhn.u16 d21, q11, q9\n\t" + "vrshr.u16 q9, q13, #8\n\t" + "vrshr.u16 q8, q12, #8\n\t" + "vraddhn.u16 d23, q13, q9\n\t" + "vraddhn.u16 d22, q12, q8\n\t" + + "vmvn.8 d30, d23\n\t" + "vmull.u8 q12, d30, d4\n\t" + "vmull.u8 q13, d30, d5\n\t" + "vmull.u8 q14, d30, d6\n\t" + "vmull.u8 q15, d30, d7\n\t" + + "vrshr.u16 q8, q12, #8\n\t" + "vrshr.u16 q9, q13, #8\n\t" + "vraddhn.u16 d4, q12, q8\n\t" + "vrshr.u16 q8, q14, #8\n\t" + "vraddhn.u16 d5, q13, q9\n\t" + "vrshr.u16 q9, q15, #8\n\t" + "vraddhn.u16 d6, q14, q8\n\t" + "vraddhn.u16 d7, q15, q9\n\t" +// result in d4-d7 + + "vqadd.u8 d20, d4, d20\n\t" + "vqadd.u8 d21, d5, d21\n\t" + "vqadd.u8 d22, d6, d22\n\t" + "vqadd.u8 d23, d7, d23\n\t" + + "bne 2b\n\t" + + "1:\n\t" + "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" + + : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "+r" (keep_dst) + : [src] "r" (src) + : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", + "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", + "d30","d31" + ); +#endif + } + } + else + { + while (height--) + { + uint8x8_t alpha; + + dst = dstLine; + dstLine += dstStride; + mask = maskLine; + maskLine += maskStride; + w = width; + + while (w>=2) + { + uint8x8_t dval, temp, res; + + alpha = vtbl1_u8(vreinterpret_u8_u16(vld1_dup_u16((void*)mask)), mask_selector); + dval = vld1_u8((void*)dst); + + temp = neon2mul(sval2,alpha); + res = vqadd_u8(temp,neon2mul(dval,vtbl1_u8(vmvn_u8(temp), alpha_selector))); + + vst1_u8((void*)dst,res); + + mask+=2; + dst+=2; + w-=2; + } + if (w) + { + uint8x8_t dval, temp, res; + + alpha = vtbl1_u8(vld1_dup_u8((void*)mask), mask_selector); + dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst)); + + temp = neon2mul(sval2,alpha); + res = vqadd_u8(temp,neon2mul(dval,vtbl1_u8(vmvn_u8(temp), alpha_selector))); + + 
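+                /* res = (src IN mask) OVER dst: temp is src scaled by the
+                 * mask alpha, and dst is scaled by 255 minus temp's alpha,
+                 * broadcast to all channels by the table lookup.  neon2mul
+                 * computes an exact round(x*a/255): with t = x*a, vraddhn of
+                 * t and vrshr(t,#8) yields (t + ((t+128)>>8) + 128) >> 8. */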
vst1_lane_u32((void*)dst,vreinterpret_u32_u8(res),0); + } + } + } +} + + +void +fbCompositeSrcAdd_8888x8x8neon (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint8_t *dstLine, *dst; + uint8_t *maskLine, *mask; + int dstStride, maskStride; + uint32_t w; + uint32_t src; + uint8x8_t sa; + + fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); + fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + fbComposeGetSolid (pSrc, src, pDst->bits.format); + sa = vdup_n_u8((src) >> 24); + + if (width>=8) + { + // Use overlapping 8-pixel method, modified to avoid rewritten dest being reused + while (height--) + { + dst = dstLine; + dstLine += dstStride; + mask = maskLine; + maskLine += maskStride; + w = width; + + uint8x8_t mval, dval, res; + uint8_t *keep_dst; + + mval = vld1_u8((void *)mask); + dval = vld1_u8((void *)dst); + keep_dst = dst; + + res = vqadd_u8(neon2mul(mval,sa),dval); + + mask += (w & 7); + dst += (w & 7); + w -= w & 7; + + while (w) + { + mval = vld1_u8((void *)mask); + dval = vld1_u8((void *)dst); + vst1_u8((void *)keep_dst, res); + keep_dst = dst; + + res = vqadd_u8(neon2mul(mval,sa),dval); + + mask += 8; + dst += 8; + w -= 8; + } + vst1_u8((void *)keep_dst, res); + } + } + else + { + // Use 4/2/1 load/store method to handle 1-7 pixels + while (height--) + { + dst = dstLine; + dstLine += dstStride; + mask = maskLine; + maskLine += maskStride; + w = width; + + uint8x8_t mval, dval, res; + uint8_t *dst4, *dst2; + + if (w&4) + { + mval = vreinterpret_u8_u32(vld1_lane_u32((void *)mask, vreinterpret_u32_u8(mval), 1)); + dval = vreinterpret_u8_u32(vld1_lane_u32((void *)dst, vreinterpret_u32_u8(dval), 1)); + + dst4 = dst; + mask += 4; + dst += 4; + } + if (w&2) + { + mval = vreinterpret_u8_u16(vld1_lane_u16((void *)mask, vreinterpret_u16_u8(mval), 1)); + dval = vreinterpret_u8_u16(vld1_lane_u16((void *)dst, vreinterpret_u16_u8(dval), 1)); + dst2 = dst; + mask += 2; + dst += 2; + } + if (w&1) + { + mval = vld1_lane_u8((void *)mask, mval, 1); + dval = vld1_lane_u8((void *)dst, dval, 1); + } + + res = vqadd_u8(neon2mul(mval,sa),dval); + + if (w&1) + vst1_lane_u8((void *)dst, res, 1); + if (w&2) + vst1_lane_u16((void *)dst2, vreinterpret_u16_u8(res), 1); + if (w&4) + vst1_lane_u32((void *)dst4, vreinterpret_u32_u8(res), 1); + } + } +} + diff --git a/pixman/pixman-arm-neon.h b/pixman/pixman-arm-neon.h new file mode 100644 index 0000000..acfe8a4 --- /dev/null +++ b/pixman/pixman-arm-neon.h @@ -0,0 +1,137 @@ +/* + * Copyright © 2009 ARM Ltd + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of ARM Ltd not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. ARM Ltd makes no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. 
+ * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author: Ian Rickards (ian.rickards@arm.com) + * + */ + +#include "pixman-private.h" + +#ifdef USE_ARM_NEON + +pixman_bool_t pixman_have_arm_neon(void); + +#else +#define pixman_have_arm_neon() FALSE +#endif + +#ifdef USE_ARM_NEON + +void +fbCompositeSrcAdd_8000x8000neon (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height); + +void +fbCompositeSrc_8888x8888neon (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height); + +void +fbCompositeSrc_8888x8x8888neon (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height); + +void +fbCompositeSolidMask_nx8x0565neon (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height); + +void +fbCompositeSolidMask_nx8x8888neon (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height); + +void +fbCompositeSrc_x888x0565neon (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height); + +void +fbCompositeSrcAdd_8888x8x8neon (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height); + +#endif /* USE_ARM_NEON */ diff --git a/pixman/pixman-pict.c b/pixman/pixman-pict.c index 82d7160..7decc75 100644 --- a/pixman/pixman-pict.c +++ b/pixman/pixman-pict.c @@ -1619,6 +1619,33 @@ static const FastPathInfo vmx_fast_paths[] = }; #endif +#ifdef USE_ARM_NEON +static const FastPathInfo arm_neon_fast_paths[] = +{ +#if 0 + { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeSrcAdd_8888x8x8neon, 0 }, + { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeSrcAdd_8000x8000neon, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_x888x0565neon, 0 }, + { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_x888x0565neon, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_x888x0565neon, 0 }, + { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_x888x0565neon, 0 }, + { PIXMAN_OP_OVER, 
PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888neon,      0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888neon,      0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888neon,      0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888neon,      0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888neon,    NEED_SOLID_MASK },
+    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888neon,    NEED_SOLID_MASK },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   fbCompositeSolidMask_nx8x0565neon, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   fbCompositeSolidMask_nx8x0565neon, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888neon, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888neon, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888neon, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8x8888neon, 0 },
+#endif
+    { PIXMAN_OP_NONE },
+};
+#endif
+
 #ifdef USE_ARM_SIMD
 static const FastPathInfo arm_simd_fast_paths[] =
 {
@@ -2018,6 +2045,11 @@ pixman_image_composite (pixman_op_t op,
 	info = get_fast_path (vmx_fast_paths, op, pSrc, pMask, pDst, pixbuf);
 #endif
 
+#ifdef USE_ARM_NEON
+    if (!info && pixman_have_arm_neon())
+	info = get_fast_path (arm_neon_fast_paths, op, pSrc, pMask, pDst, pixbuf);
+#endif
+
 #ifdef USE_ARM_SIMD
     if (!info && pixman_have_arm_simd())
 	info = get_fast_path (arm_simd_fast_paths, op, pSrc, pMask, pDst, pixbuf);
@@ -2195,17 +2227,22 @@ pixman_bool_t pixman_have_vmx (void) {
 #endif /* __APPLE__ */
 #endif /* USE_VMX */
 
-#ifdef USE_ARM_SIMD
+#if defined(USE_ARM_SIMD) || defined(USE_ARM_NEON)
+
+#if defined(_MSC_VER)
+
+#if defined(USE_ARM_SIMD)
+extern int pixman_msvc_try_arm_simd_op();
+
 pixman_bool_t
 pixman_have_arm_simd (void)
 {
-#ifdef _MSC_VER
     static pixman_bool_t initialized = FALSE;
    static pixman_bool_t have_arm_simd = FALSE;
 
    if (!initialized) {
        __try {
-           pixman_msvc_try_armv6_op();
+           pixman_msvc_try_arm_simd_op();
            have_arm_simd = TRUE;
        } __except(GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
            have_arm_simd = FALSE;
@@ -2214,11 +2251,116 @@ pixman_have_arm_simd (void)
    }
 
    return have_arm_simd;
-#else
-    return TRUE;
-#endif
 }
-#endif
+#endif /* USE_ARM_SIMD */
+
+#if defined(USE_ARM_NEON)
+extern int pixman_msvc_try_arm_neon_op();
+
+pixman_bool_t
+pixman_have_arm_neon (void)
+{
+    static pixman_bool_t initialized = FALSE;
+    static pixman_bool_t have_arm_neon = FALSE;
+
+    if (!initialized) {
+        __try {
+            pixman_msvc_try_arm_neon_op();
+            have_arm_neon = TRUE;
+        } __except(GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
+            have_arm_neon = FALSE;
+        }
+        initialized = TRUE;
+    }
+
+    return have_arm_neon;
+}
+#endif /* USE_ARM_NEON */
+
+#else /* linux ELF */
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <elf.h>
+
+static pixman_bool_t arm_has_v7 = FALSE;
+static pixman_bool_t arm_has_v6 = FALSE;
+static pixman_bool_t arm_has_vfp = FALSE;
+static pixman_bool_t arm_has_neon = FALSE;
+static pixman_bool_t arm_has_iwmmxt = FALSE;
+static pixman_bool_t arm_tests_initialized = FALSE;
+
+static void
+pixman_arm_read_auxv() {
+    int fd;
+    Elf32_auxv_t aux;
+
+    fd = open("/proc/self/auxv", O_RDONLY);
+    if (fd > 0) {
+        while (read(fd, &aux,
sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t)) { + if (aux.a_type == AT_HWCAP) { + uint32_t hwcap = aux.a_un.a_val; + if (getenv("ARM_FORCE_HWCAP")) + hwcap = strtoul(getenv("ARM_FORCE_HWCAP"), NULL, 0); + // hardcode these values to avoid depending on specific versions + // of the hwcap header, e.g. HWCAP_NEON + arm_has_vfp = (hwcap & 64) != 0; + arm_has_iwmmxt = (hwcap & 512) != 0; + // this flag is only present on kernel 2.6.29 + arm_has_neon = (hwcap & 4096) != 0; + } else if (aux.a_type == AT_PLATFORM) { + const char *plat = (const char*) aux.a_un.a_val; + if (getenv("ARM_FORCE_PLATFORM")) + plat = getenv("ARM_FORCE_PLATFORM"); + if (strncmp(plat, "v7l", 3) == 0) { + arm_has_v7 = TRUE; + arm_has_v6 = TRUE; + } else if (strncmp(plat, "v6l", 3) == 0) { + arm_has_v6 = TRUE; + } + } + } + close (fd); + + // if we don't have 2.6.29, we have to do this hack; set + // the env var to trust HWCAP. + if (!getenv("ARM_TRUST_HWCAP") && arm_has_v7) + arm_has_neon = TRUE; + } + + arm_tests_initialized = TRUE; +} + +#if defined(USE_ARM_SIMD) +pixman_bool_t +pixman_have_arm_simd (void) +{ + if (!arm_tests_initialized) + pixman_arm_read_auxv(); + + return arm_has_v6; +} +#endif /* USE_ARM_SIMD */ + +#if defined(USE_ARM_NEON) +pixman_bool_t +pixman_have_arm_neon (void) +{ + if (!arm_tests_initialized) + pixman_arm_read_auxv(); + + return arm_has_neon; +} +#endif /* USE_ARM_NEON */ + +#endif /* linux */ + +#endif /* USE_ARM_SIMD || USE_ARM_NEON */ #ifdef USE_MMX /* The CPU detection code needs to be in a file not compiled with