Add more NEON fast paths
author     Jonathan Morton <jmorton@sd070.hel.movial.fi>
           Wed, 3 Jun 2009 14:43:41 +0000 (10:43 -0400)
committer  Jeff Muizelaar <jrmuizel@jeff-desktop.(none)>
           Wed, 3 Jun 2009 14:43:41 +0000 (10:43 -0400)
pixman/pixman-arm-neon.c

index 39b7f53..434f737 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2009 ARM Ltd
+ * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
  *
  * Permission to use, copy, modify, distribute, and sell this software and its
  * documentation for any purpose is hereby granted without fee, provided that
@@ -21,6 +21,8 @@
  * SOFTWARE.
  *
  * Author:  Ian Rickards (ian.rickards@arm.com) 
+ * Author:  Jonathan Morton (jonathan.morton@movial.com)
+ * Author:  Markku Vire (markku.vire@movial.com)
  *
  */
 
 #include "pixman-arm-neon.h"
 
 #include <arm_neon.h>
+#include <string.h>
 
+// armcc provides __pld as an intrinsic; GCC does not, so map it to __builtin_prefetch
+#if !defined(__ARMCC_VERSION) && !defined(__pld)
+#define __pld(_x) __builtin_prefetch(_x)
+#endif
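With that in place, both toolchains share a single spelling for the PLD cache hint. As a small illustration (the helper name is ours, not part of the patch), the scanline preload loops used further down boil down to:

	static inline void preload_scanline_example (const uint8_t *line, int bytes)
	{
		/* Issue one PLD hint per 64-byte cache line of the scanline. */
		while (bytes > 0) {
			__pld (line);
			line  += 64;
			bytes -= 64;
		}
	}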
 
 static force_inline uint8x8x4_t unpack0565(uint16x8_t rgb)
 {
@@ -1391,22 +1398,659 @@ fbCompositeSrcAdd_8888x8x8neon (
     }
 }
 
+#ifdef USE_GCC_INLINE_ASM
+
+void
+fbCompositeSrc_16x16neon (
+       pixman_implementation_t * impl,
+       pixman_op_t op,
+       pixman_image_t * pSrc,
+       pixman_image_t * pMask,
+       pixman_image_t * pDst,
+       int32_t      xSrc,
+       int32_t      ySrc,
+       int32_t      xMask,
+       int32_t      yMask,
+       int32_t      xDst,
+       int32_t      yDst,
+       int32_t      width,
+       int32_t      height)
+{
+       uint16_t    *dstLine, *srcLine;
+       uint32_t     dstStride, srcStride;
+
+       if(!height || !width)
+               return;
+
+       /* We simply copy 16-bit-aligned pixels from one place to another. */
+       fbComposeGetStart (pSrc, xSrc, ySrc, uint16_t, srcStride, srcLine, 1);
+       fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+
+       /* Preload the first input scanline */
+       {
+               uint16_t *srcPtr = srcLine;
+               uint32_t count = width;
+
+               asm volatile (
+               "0: @ loop                                                      \n"
+               "       subs    %[count], %[count], #32                         \n"
+               "       pld     [%[src]]                                        \n"
+               "       add     %[src], %[src], #64                             \n"
+               "       bgt 0b                                                  \n"
+
+               // Clobbered input registers marked as input/outputs
+               : [src] "+r" (srcPtr), [count] "+r" (count)
+               : // no unclobbered inputs
+               : "cc"
+               );
+       }
+
+       while(height--) {
+               uint16_t *dstPtr = dstLine;
+               uint16_t *srcPtr = srcLine;
+               uint32_t count = width;
+               uint32_t tmp = 0;
+
+               // Uses multi-register loads/stores and preloading to maximise bandwidth.
+               // Each pixel is one halfword, so a quadword contains 8 pixels.
+               // The preload frequency assumes a 64-byte cache line.
+               asm volatile (
+               "       cmp       %[count], #64                         \n"
+               "       blt 1f    @ skip oversized fragments            \n"
+               "0: @ start with eight quadwords at a time              \n"
+               "       pld       [%[src], %[srcStride], LSL #1]        \n" // preload from next scanline
+               "       sub       %[count], %[count], #64               \n"
+               "       vld1.16   {d16,d17,d18,d19}, [%[src]]!          \n"
+               "       vld1.16   {d20,d21,d22,d23}, [%[src]]!          \n"
+               "       pld       [%[src], %[srcStride], LSL #1]        \n" // preload from next scanline
+               "       vld1.16   {d24,d25,d26,d27}, [%[src]]!          \n"
+               "       vld1.16   {d28,d29,d30,d31}, [%[src]]!          \n"
+               "       cmp       %[count], #64                         \n"
+               "       vst1.16   {d16,d17,d18,d19}, [%[dst]]!          \n"
+               "       vst1.16   {d20,d21,d22,d23}, [%[dst]]!          \n"
+               "       vst1.16   {d24,d25,d26,d27}, [%[dst]]!          \n"
+               "       vst1.16   {d28,d29,d30,d31}, [%[dst]]!          \n"
+               "       bge 0b                                          \n"
+               "       cmp       %[count], #0                          \n"
+               "       beq 7f    @ aligned fastpath                    \n"
+               "1: @ four quadwords                                    \n"
+               "       tst       %[count], #32                         \n"
+               "       beq 2f    @ skip oversized fragment             \n"
+               "       pld       [%[src], %[srcStride], LSL #1]        \n" // preload from next scanline
+               "       vld1.16   {d16,d17,d18,d19}, [%[src]]!          \n"
+               "       vld1.16   {d20,d21,d22,d23}, [%[src]]!          \n"
+               "       vst1.16   {d16,d17,d18,d19}, [%[dst]]!          \n"
+               "       vst1.16   {d20,d21,d22,d23}, [%[dst]]!          \n"
+               "2: @ two quadwords                                     \n"
+               "       tst       %[count], #16                         \n"
+               "       beq 3f    @ skip oversized fragment             \n"
+               "       pld       [%[src], %[srcStride], LSL #1]        \n" // preload from next scanline
+               "       vld1.16   {d16,d17,d18,d19}, [%[src]]!          \n"
+               "       vst1.16   {d16,d17,d18,d19}, [%[dst]]!          \n"
+               "3: @ one quadword                                      \n"
+               "       tst       %[count], #8                          \n"
+               "       beq 4f    @ skip oversized fragment             \n"
+               "       vld1.16   {d16,d17}, [%[src]]!                  \n"
+               "       vst1.16   {d16,d17}, [%[dst]]!                  \n"
+               "4: @ one doubleword                                    \n"
+               "       tst       %[count], #4                          \n"
+               "       beq 5f    @ skip oversized fragment             \n"
+               "       vld1.16   {d16}, [%[src]]!                      \n"
+               "       vst1.16   {d16}, [%[dst]]!                      \n"
+               "5: @ one word                                          \n"
+               "       tst       %[count], #2                          \n"
+               "       beq 6f    @ skip oversized fragment             \n"
+               "       ldr       %[tmp], [%[src]], #4                  \n"
+               "       str       %[tmp], [%[dst]], #4                  \n"
+               "6: @ one halfword                                      \n"
+               "       tst       %[count], #1                          \n"
+               "       beq 7f    @ skip oversized fragment             \n"
+               "       ldrh      %[tmp], [%[src]]                      \n"
+               "       strh      %[tmp], [%[dst]]                      \n"
+               "7: @ end                                               \n"
+
+               // Clobbered input registers marked as input/outputs
+               : [dst] "+r" (dstPtr), [src] "+r" (srcPtr), [count] "+r" (count), [tmp] "+r" (tmp)
+
+               // Unclobbered input
+               : [srcStride] "r" (srcStride)
+
+               // Clobbered vector registers
+               // NB: these are the quad aliases of the double registers used in the asm
+               : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
+               );
+
+               srcLine += srcStride;
+               dstLine += dstStride;
+       }
+}
+
+#endif /* USE_GCC_INLINE_ASM */
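fbCompositeSrc_16x16neon above is only compiled when GCC inline assembly is available. For toolchains without it (e.g. armcc), a minimal intrinsics sketch of the same straight r5g6b5 copy, one scanline at a time, could look like the following; the helper name and the plain 8-pixel granularity are our assumptions, not part of this patch:

	static inline void copy_scanline_0565_example (uint16_t *dst, const uint16_t *src, uint32_t width)
	{
		/* One quadword holds 8 r5g6b5 pixels. */
		while (width >= 8) {
			vst1q_u16 (dst, vld1q_u16 (src));
			src += 8;
			dst += 8;
			width -= 8;
		}
		/* Copy any remaining pixels one halfword at a time. */
		while (width--)
			*dst++ = *src++;
	}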
+
+void
+fbCompositeSrc_24x16neon (
+       pixman_implementation_t * impl,
+       pixman_op_t op,
+       pixman_image_t * pSrc,
+       pixman_image_t * pMask,
+       pixman_image_t * pDst,
+       int32_t      xSrc,
+       int32_t      ySrc,
+       int32_t      xMask,
+       int32_t      yMask,
+       int32_t      xDst,
+       int32_t      yDst,
+       int32_t      width,
+       int32_t      height)
+{
+       uint16_t    *dstLine;
+       uint32_t    *srcLine;
+       uint32_t     dstStride, srcStride;
+
+       if(!width || !height)
+               return;
+
+       /* We simply copy pixels from one place to another, assuming that the source's alpha is opaque. */
+       fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
+       fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+
+       /* Preload the first input scanline */
+       {
+               uint8_t *srcPtr = (uint8_t*) srcLine;
+               uint32_t count = (width + 15) / 16;
+
+#ifdef USE_GCC_INLINE_ASM
+               asm volatile (
+               "0: @ loop                                              \n"
+               "       subs    %[count], %[count], #1                  \n"
+               "       pld     [%[src]]                                \n"
+               "       add     %[src], %[src], #64                     \n"
+               "       bgt 0b                                          \n"
+
+               // Clobbered input registers marked as input/outputs
+               : [src] "+r" (srcPtr), [count] "+r" (count)
+               : // no unclobbered inputs
+               : "cc"
+               );
+#else
+               do {
+                       __pld(srcPtr);
+                       srcPtr += 64;
+               } while(--count);
+#endif
+       }
+
+       while(height--) {
+               uint16_t *dstPtr = dstLine;
+               uint32_t *srcPtr = srcLine;
+               uint32_t count = width;
+               const uint32_t RBmask = 0x1F;
+               const uint32_t Gmask = 0x3F;
+
+               // If you're going to complain about a goto, take a long hard look
+               // at the massive blocks of assembler this skips over.  ;-)
+               if(count < 8)
+                       goto smallStuff;
+
+#ifdef USE_GCC_INLINE_ASM
+
+               // This is not as aggressive as the RGB565-source case.
+               // Generally the source is in cached RAM when the formats are different, so we use preload.
+               // We don't need to blend, so we are not reading from the uncached framebuffer.
+               asm volatile (
+               "       cmp       %[count], #16                                                                         \n"
+               "       blt 1f    @ skip oversized fragments                                                            \n"
+               "0: @ start with sixteen pixels at a time                                                               \n"
+               "       sub       %[count], %[count], #16                                                               \n"
+               "       pld      [%[src], %[srcStride], lsl #2]         @ preload from next scanline                    \n"
+               "       vld4.8    {d0,d1,d2,d3}, [%[src]]!              @ d3 is alpha and ignored, d2-0 are rgb.        \n"
+               "       vld4.8    {d4,d5,d6,d7}, [%[src]]!              @ d7 is alpha and ignored, d6-4 are rgb.        \n"
+               "       vshll.u8  q8, d2, #8                            @ expand first red for repacking                \n"
+               "       vshll.u8  q10, d1, #8                           @ expand first green for repacking              \n"
+               "       vshll.u8  q11, d0, #8                           @ expand first blue for repacking               \n"
+               "       vshll.u8  q9, d6, #8                            @ expand second red for repacking               \n"
+               "       vsri.u16  q8, q10, #5                           @ insert first green after red                  \n"
+               "       vshll.u8  q10, d5, #8                           @ expand second green for repacking             \n"
+               "       vsri.u16  q8, q11, #11                          @ insert first blue after green                 \n"
+               "       vshll.u8  q11, d4, #8                           @ expand second blue for repacking              \n"
+               "       vsri.u16  q9, q10, #5                           @ insert second green after red                 \n"
+               "       vsri.u16  q9, q11, #11                          @ insert second blue after green                \n"
+               "       cmp       %[count], #16                                                                         \n"
+               "       vst1.16   {d16,d17,d18,d19}, [%[dst]]!          @ store 16 pixels                               \n"
+               "       bge 0b                                                                                          \n"
+               "1: @ end of main loop  \n"
+               "       cmp       %[count], #8                          @ can we still do an 8-pixel block?             \n"
+               "       blt 2f                                                                                          \n"
+               "       sub       %[count], %[count], #8        \n"
+               "       pld      [%[src], %[srcStride], lsl #2]         @ preload from next scanline                    \n"
+               "       vld4.8    {d0,d1,d2,d3}, [%[src]]!              @ d3 is alpha and ignored, d2-0 are rgb.        \n"
+               "       vshll.u8  q8, d2, #8                            @ expand first red for repacking                \n"
+               "       vshll.u8  q10, d1, #8                           @ expand first green for repacking              \n"
+               "       vshll.u8  q11, d0, #8                           @ expand first blue for repacking               \n"
+               "       vsri.u16  q8, q10, #5                           @ insert first green after red                  \n"
+               "       vsri.u16  q8, q11, #11                          @ insert first blue after green                 \n"
+               "       vst1.16   {d16,d17}, [%[dst]]!          @ store 8 pixels                                \n"
+               "2: @ end                                                                                               \n"
+
+               // Clobbered input and working registers marked as input/outputs
+               : [dst] "+r" (dstPtr), [src] "+r" (srcPtr), [count] "+r" (count)
+
+               // Unclobbered input
+               : [srcStride] "r" (srcStride)
+
+               // Clobbered vector registers
+               // NB: these are the quad aliases of the double registers used in the asm
+               : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "cc", "memory"
+               );
+#else
+               // A copy of the above code, in intrinsics form.
+               // This should be pretty self-documenting...
+               while(count >= 16) {
+                       uint8x8x4_t pixelSetA, pixelSetB;
+                       uint16x8_t redA, greenA, blueA;
+                       uint16x8_t redB, greenB, blueB;
+                       uint16x8_t destPixelsA, destPixelsB;
+
+                       count -= 16;
+                       __pld(srcPtr + srcStride);
+                       pixelSetA = vld4_u8((uint8_t*)(srcPtr));
+                       pixelSetB = vld4_u8((uint8_t*)(srcPtr+8));
+                       srcPtr += 16;
+
+                       redA   = vshll_n_u8(pixelSetA.val[2], 8);
+                       greenA = vshll_n_u8(pixelSetA.val[1], 8);
+                       blueA  = vshll_n_u8(pixelSetA.val[0], 8);
+                       redB   = vshll_n_u8(pixelSetB.val[2], 8);
+                       greenB = vshll_n_u8(pixelSetB.val[1], 8);
+                       blueB  = vshll_n_u8(pixelSetB.val[0], 8);
+                       destPixelsA = vsriq_n_u16(redA, greenA, 5);
+                       destPixelsB = vsriq_n_u16(redB, greenB, 5);
+                       destPixelsA = vsriq_n_u16(destPixelsA, blueA, 11);
+                       destPixelsB = vsriq_n_u16(destPixelsB, blueB, 11);
+
+                       // There doesn't seem to be an intrinsic for the double-quadword variant
+                       vst1q_u16(dstPtr  , destPixelsA);
+                       vst1q_u16(dstPtr+8, destPixelsB);
+                       dstPtr += 16;
+               }
+
+               // 8-pixel loop
+               if(count >= 8) {
+                       uint8x8x4_t pixelSetA;
+                       uint16x8_t redA, greenA, blueA;
+                       uint16x8_t destPixelsA;
+
+                       __pld(srcPtr + srcStride);
+                       count -= 8;
+                       pixelSetA = vld4_u8((uint8_t*)(srcPtr));
+                       srcPtr += 8;
+
+                       redA   = vshll_n_u8(pixelSetA.val[2], 8);
+                       greenA = vshll_n_u8(pixelSetA.val[1], 8);
+                       blueA  = vshll_n_u8(pixelSetA.val[0], 8);
+                       destPixelsA = vsriq_n_u16(redA, greenA, 5);
+                       destPixelsA = vsriq_n_u16(destPixelsA, blueA, 11);
+
+                       vst1q_u16(dstPtr  , destPixelsA);
+                       dstPtr += 8;
+               }
+
+#endif // USE_GCC_INLINE_ASM
+
+       smallStuff:
+
+               if(count)
+                       __pld(srcPtr + srcStride);
+
+               while(count >= 2) {
+                       uint32_t srcPixelA = *srcPtr++;
+                       uint32_t srcPixelB = *srcPtr++;
+
+                       // ARM is really good at shift-then-ALU ops.
+                       // This should be a total of six shift-ANDs and five shift-ORs.
+                       uint32_t dstPixelsA;
+                       uint32_t dstPixelsB;
+
+                       dstPixelsA  = ((srcPixelA >>  3) & RBmask);
+                       dstPixelsA |= ((srcPixelA >> 10) &  Gmask) << 5;
+                       dstPixelsA |= ((srcPixelA >> 19) & RBmask) << 11;
+
+                       dstPixelsB  = ((srcPixelB >>  3) & RBmask);
+                       dstPixelsB |= ((srcPixelB >> 10) &  Gmask) << 5;
+                       dstPixelsB |= ((srcPixelB >> 19) & RBmask) << 11;
+
+                       // little-endian mode only
+                       *((uint32_t*) dstPtr) = dstPixelsA | (dstPixelsB << 16);
+                       dstPtr += 2;
+                       count -= 2;
+               }
+
+               if(count) {
+                       uint32_t srcPixel = *srcPtr++;
+
+                       // ARM is really good at shift-then-ALU ops.
+                       // This block should end up as three shift-ANDs and two shift-ORs.
+                       uint32_t tmpBlue  = (srcPixel >>  3) & RBmask;
+                       uint32_t tmpGreen = (srcPixel >> 10) & Gmask;
+                       uint32_t tmpRed   = (srcPixel >> 19) & RBmask;
+                       uint16_t dstPixel = (tmpRed << 11) | (tmpGreen << 5) | tmpBlue;
+
+                       *dstPtr++ = dstPixel;
+                       count--;
+               }
+
+               srcLine += srcStride;
+               dstLine += dstStride;
+       }
+}
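As a worked example of the scalar x8r8g8b8 to r5g6b5 packing in the tail above (the helper below is illustrative only, not part of pixman): the pixel 0x00ff8040 keeps the top 5 bits of red, 6 of green and 5 of blue, producing 0xfc08.

	static inline uint16_t pack_8888_to_0565_example (uint32_t src)
	{
		uint32_t blue  = (src >>  3) & 0x1F;   /* source bits  7..3  */
		uint32_t green = (src >> 10) & 0x3F;   /* source bits 15..10 */
		uint32_t red   = (src >> 19) & 0x1F;   /* source bits 23..19 */
		return (uint16_t) ((red << 11) | (green << 5) | blue);
	}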
+
+
+pixman_bool_t
+pixman_fill_neon (uint32_t *bits,
+                 int stride,
+                 int bpp,
+                 int x,
+                 int y,
+                 int width,
+                 int height,
+                 uint32_t _xor)
+{
+    uint32_t byte_stride, color;
+    char *dst;
+
+    /* stride is always multiple of 32bit units in pixman */
+    byte_stride = stride * sizeof(uint32_t);
+
+    switch (bpp)
+    {
+       case 8:
+           dst = ((char *) bits) + y * byte_stride + x;
+           _xor &= 0xff;
+           color = _xor << 24 | _xor << 16 | _xor << 8 | _xor;
+           break;
+       case 16:
+           dst = ((char *) bits) + y * byte_stride + x * 2;
+           _xor &= 0xffff;
+           color = _xor << 16 | _xor;
+           width *= 2;     /* width to bytes */
+           break;
+       case 32:
+           dst = ((char *) bits) + y * byte_stride + x * 4;
+           color = _xor;
+           width *= 4;     /* width to bytes */
+           break;
+       default:
+           return FALSE;
+    }
+
+#ifdef USE_GCC_INLINE_ASM
+    if (width < 16)
+       /* Special case for widths too small to benefit from wide 128-bit
+          stores. We don't waste time trying to align the writes, since
+          there are only a few of them per scanline anyway. */
+       asm volatile (
+       "cmp            %[height], #0\n" /* Check if empty fill */
+       "beq            3f\n"
+       "vdup.32        d0, %[color]\n"  /* Duplicate the fill color into a NEON register */
+
+       /* Check if we have a such width that can easily be handled by single
+          operation for each scanline. This significantly reduces the number
+          of test/branch instructions for each scanline */
+       "cmp            %[width], #8\n"
+       "beq            4f\n"
+       "cmp            %[width], #4\n"
+       "beq            5f\n"
+       "cmp            %[width], #2\n"
+       "beq            6f\n"
+
+       /* Loop starts here for each scanline */
+       "1:\n"
+       "mov            r4, %[dst]\n"    /* Starting address of the current line */
+       "tst            %[width], #8\n"
+       "beq            2f\n"
+       "vst1.8         {d0}, [r4]!\n"
+       "2:\n"
+       "tst            %[width], #4\n"
+       "beq            2f\n"
+       "str            %[color], [r4]!\n"
+       "2:\n"
+       "tst            %[width], #2\n"
+       "beq            2f\n"
+       "strh           %[color], [r4]!\n"
+       "2:\n"
+       "tst            %[width], #1\n"
+       "beq            2f\n"
+       "strb           %[color], [r4]!\n"
+       "2:\n"
+
+       "subs           %[height], %[height], #1\n"
+       "add            %[dst], %[dst], %[byte_stride]\n"
+       "bne            1b\n"
+       "b              3f\n"
+
+       /* Special fillers for those widths that we can do with single operation */
+       "4:\n"
+       "subs           %[height], %[height], #1\n"
+       "vst1.8         {d0}, [%[dst]]\n"
+       "add            %[dst], %[dst], %[byte_stride]\n"
+       "bne            4b\n"
+       "b              3f\n"
+
+       "5:\n"
+       "subs           %[height], %[height], #1\n"
+       "str            %[color], [%[dst]]\n"
+       "add            %[dst], %[dst], %[byte_stride]\n"
+       "bne            5b\n"
+       "b              3f\n"
+
+       "6:\n"
+       "subs           %[height], %[height], #1\n"
+       "strh           %[color], [%[dst]]\n"
+       "add            %[dst], %[dst], %[byte_stride]\n"
+       "bne            6b\n"
+
+       "3:\n"
+       /* %[height] and %[dst] are updated in the loop, so list them as outputs */
+       : [height] "+r" (height), [dst] "+r" (dst)
+       : [color] "r" (color), [width] "r" (width), [byte_stride] "r" (byte_stride)
+       : "memory", "cc", "d0", "r4", "r5");
+    else
+       asm volatile (
+       "cmp            %[height], #0\n" /* Check if empty fill */
+       "beq            5f\n"
+       "vdup.32        q0, %[color]\n"  /* Duplicate the fill color into a NEON register */
+
+       /* Loop starts here for each scanline */
+       "1:\n"
+       "mov            r4, %[dst]\n"    /* Starting address of the current line */
+       "mov            r5, %[width]\n"  /* We're going to write this many bytes */
+       "ands           r6, r4, #15\n"   /* Are we at the 128-bit aligned address? */
+       "beq            2f\n"            /* Jump to the best case */
+
+       /* We're not 128-bit aligned: However, we know that we can get to the
+          next aligned location, since the fill is at least 16 bytes wide */
+       "rsb            r6, r6, #16\n"   /* We would need to go forward this much */
+       "sub            r5, r5, r6\n"    /* Update bytes left */
+       "tst            r6, #1\n"
+       "beq            6f\n"
+       "vst1.8         {d0[0]}, [r4]!\n"/* Store a byte, now we are halfword aligned */
+       "6:\n"
+       "tst            r6, #2\n"
+       "beq            6f\n"
+       "vst1.16        {d0[0]}, [r4, :16]!\n"/* Store a halfword, now we are 32-bit aligned */
+       "6:\n"
+       "tst            r6, #4\n"
+       "beq            6f\n"
+       "vst1.32        {d0[0]}, [r4, :32]!\n"/* Store a word, now we're 64-bit aligned */
+       "6:\n"
+       "tst            r6, #8\n"
+       "beq            2f\n"
+       "vst1.64        {d0}, [r4, :64]!\n"    /* Store a doubleword, now we're 128-bit aligned */
+
+       /* The good case: We're 128-bit aligned for this scanline */
+       "2:\n"
+       "and            r6, r5, #15\n"        /* Number of trailing bytes */
+       "cmp            r5, r6\n"             /* Do we have at least one qword to write? */
+       "beq            6f\n"                 /* No, we just write the tail */
+       "lsr            r5, r5, #4\n"         /* This many full qwords to write */
+
+       /* The main block: Do 128-bit aligned writes */
+       "3:\n"
+       "subs           r5, r5, #1\n"
+       "vst1.64        {d0,d1}, [r4, :128]!\n"
+       "bne            3b\n"
+
+       /* Handle the trailing bytes: do 64, 32, 16 and 8-bit aligned writes as needed.
+          We know that we're currently at a 128-bit aligned address, so we can just
+          pick the biggest operations that the remaining write width allows */
+       "6:\n"
+       "cmp            r6, #0\n"
+       "beq            4f\n"
+       "tst            r6, #8\n"
+       "beq            6f\n"
+       "vst1.64        {d0}, [r4, :64]!\n"
+       "6:\n"
+       "tst            r6, #4\n"
+       "beq            6f\n"
+       "vst1.32        {d0[0]}, [r4, :32]!\n"
+       "6:\n"
+       "tst            r6, #2\n"
+       "beq            6f\n"
+       "vst1.16        {d0[0]}, [r4, :16]!\n"
+       "6:\n"
+       "tst            r6, #1\n"
+       "beq            4f\n"
+       "vst1.8         {d0[0]}, [r4]!\n"
+       "4:\n"
+
+       /* Handle the next scanline */
+       "subs           %[height], %[height], #1\n"
+       "add            %[dst], %[dst], %[byte_stride]\n"
+       "bne            1b\n"
+       "5:\n"
+       /* %[height] and %[dst] are updated in the loop, so list them as outputs */
+       : [height] "+r" (height), [dst] "+r" (dst)
+       : [color] "r" (color), [width] "r" (width), [byte_stride] "r" (byte_stride)
+       : "memory", "cc", "q0", "d0", "d1", "r4", "r5", "r6");
+
+    return TRUE;
+
+#else
+
+    // TODO: intrinsic version for armcc
+    return FALSE;
+
+#endif
+}
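For context, a hedged usage sketch of the fill helper (the surface geometry and the caller below are made up for illustration): fill a 100x50 rectangle at (10,20) of a 320-pixel-wide r5g6b5 surface with solid red, falling back when the helper declines.

	static void fill_rect_0565_example (uint32_t *bits)
	{
		int stride = (320 * 2) / 4;   /* stride is in 32-bit units */

		if (!pixman_fill_neon (bits, stride, 16, 10, 20, 100, 50, 0xf800))
		{
			/* bpp not supported (or no inline asm): use the generic fill path */
		}
	}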
+
+#ifdef USE_GCC_INLINE_ASM
+
+// TODO: is a more generic way of doing this being introduced?
+#define NEON_SCANLINE_BUFFER_PIXELS (1024)
+
+static inline void QuadwordCopy_neon(
+       void* dst,
+       void* src,
+       uint32_t count,       // of quadwords
+       uint32_t trailerCount // of bytes
+)
+{
+       // Uses aligned multi-register loads to maximise read bandwidth
+       // on uncached memory such as framebuffers
+       // The accesses do not have the aligned qualifiers, so that the copy
+       // may convert between aligned-uncached and unaligned-cached memory.
+       // It is assumed that the CPU can infer the alignment from the address.
+       asm volatile (
+       "       cmp       %[count], #8                          \n"
+       "       blt 1f    @ skip oversized fragments            \n"
+       "0: @ start with eight quadwords at a time              \n"
+       "       sub       %[count], %[count], #8                \n"
+       "       vld1.8    {d16,d17,d18,d19}, [%[src]]!          \n"
+       "       vld1.8    {d20,d21,d22,d23}, [%[src]]!          \n"
+       "       vld1.8    {d24,d25,d26,d27}, [%[src]]!          \n"
+       "       vld1.8    {d28,d29,d30,d31}, [%[src]]!          \n"
+       "       cmp       %[count], #8                          \n"
+       "       vst1.8    {d16,d17,d18,d19}, [%[dst]]!          \n"
+       "       vst1.8    {d20,d21,d22,d23}, [%[dst]]!          \n"
+       "       vst1.8    {d24,d25,d26,d27}, [%[dst]]!          \n"
+       "       vst1.8    {d28,d29,d30,d31}, [%[dst]]!          \n"
+       "       bge 0b                                          \n"
+       "1: @ four quadwords                                    \n"
+       "       tst       %[count], #4                          \n"
+       "       beq 2f    @ skip oversized fragment             \n"
+       "       vld1.8    {d16,d17,d18,d19}, [%[src]]!          \n"
+       "       vld1.8    {d20,d21,d22,d23}, [%[src]]!          \n"
+       "       vst1.8    {d16,d17,d18,d19}, [%[dst]]!          \n"
+       "       vst1.8    {d20,d21,d22,d23}, [%[dst]]!          \n"
+       "2: @ two quadwords                                     \n"
+       "       tst       %[count], #2                          \n"
+       "       beq 3f    @ skip oversized fragment             \n"
+       "       vld1.8    {d16,d17,d18,d19}, [%[src]]!          \n"
+       "       vst1.8    {d16,d17,d18,d19}, [%[dst]]!          \n"
+       "3: @ one quadword                                      \n"
+       "       tst       %[count], #1                          \n"
+       "       beq 4f    @ skip oversized fragment             \n"
+       "       vld1.8    {d16,d17}, [%[src]]!                  \n"
+       "       vst1.8    {d16,d17}, [%[dst]]!                  \n"
+       "4: @ end                                               \n"
+
+       // Clobbered input registers marked as input/outputs
+       : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
+
+       // No unclobbered inputs
+       :
+
+       // Clobbered vector registers
+       // NB: these are the quad aliases of the double registers used in the asm
+       : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
+       );
+
+       if(trailerCount) {
+               uint8_t *tDst = dst, *tSrc = src;
+
+               while(trailerCount >= 4) {
+                       *((uint32_t*) tDst) = *((uint32_t*) tSrc);
+                       tDst += 4;
+                       tSrc += 4;
+                       trailerCount -= 4;
+               }
+
+               if(trailerCount >= 2) {
+                       *((uint16_t*) tDst) = *((uint16_t*) tSrc);
+                       tDst += 2;
+                       tSrc += 2;
+                       trailerCount -= 2;
+               }
+
+               if(trailerCount) {
+                       *tDst++ = *tSrc++;
+                       trailerCount--;
+               }
+       }
+}
+
+#endif  // USE_GCC_INLINE_ASM
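QuadwordCopy_neon expects its length split into whole 16-byte quadwords plus a byte trailer; a minimal sketch of how a caller might derive both from an arbitrary byte count (the wrapper name is ours, not part of this patch):

	static inline void copy_bytes_example (void *dst, void *src, uint32_t bytes)
	{
		/* Whole quadwords first, then the 0-15 trailing bytes. */
		QuadwordCopy_neon (dst, src, bytes >> 4, bytes & 15);
	}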
+
 static const FastPathInfo arm_neon_fast_path_array[] = 
 {
     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       fbCompositeSrcAdd_8888x8x8neon,        0 },
     { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null,     PIXMAN_a8,       fbCompositeSrcAdd_8000x8000neon,       0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   fbCompositeSolidMask_nx8x0565neon,     0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   fbCompositeSolidMask_nx8x0565neon,     0 },
     { PIXMAN_OP_SRC,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   fbCompositeSrc_x888x0565neon,          0 },
     { PIXMAN_OP_SRC,  PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   fbCompositeSrc_x888x0565neon,          0 },
     { PIXMAN_OP_SRC,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   fbCompositeSrc_x888x0565neon,          0 },
     { PIXMAN_OP_SRC,  PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   fbCompositeSrc_x888x0565neon,          0 },
+#ifdef USE_GCC_INLINE_ASM
+//  { PIXMAN_OP_SRC,  PIXMAN_r5g6b5,   PIXMAN_null,     PIXMAN_r5g6b5,   fbCompositeSrc_16x16neon,              0 },
+//  { PIXMAN_OP_SRC,  PIXMAN_b5g6r5,   PIXMAN_null,     PIXMAN_b5g6r5,   fbCompositeSrc_16x16neon,              0 },
+#endif
     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888neon,          0 },
     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888neon,          0 },
     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888neon,          0 },
     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888neon,          0 },
     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888neon,        NEED_SOLID_MASK },
     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888neon,        NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   fbCompositeSolidMask_nx8x0565neon,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   fbCompositeSolidMask_nx8x0565neon,     0 },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888neon,     0 },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888neon,     0 },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888neon,     0 },
@@ -1462,10 +2106,13 @@ pixman_blt_neon (
        int width, int height)
 {
 
-#if 0  // Relies on code which isn't upstreamed yet
+       if(!width || !height)
+               return TRUE;
+
+#ifdef USE_GCC_INLINE_ASM
 
-       // accelerate only straight copies
-       if(src_bpp != dst_bpp || (src_bpp & 7) || !width || !height)
+       // accelerate only straight copies involving complete bytes
+       if(src_bpp != dst_bpp || (src_bpp & 7))
                return FALSE;
 
        {