From 8eeeca993252edc39da9c5c57545b81215fafc81 Mon Sep 17 00:00:00 2001 From: Jonathan Morton Date: Tue, 16 Jun 2009 12:08:29 -0400 Subject: [PATCH] [NEON] Replace Ian's glyph-blitter with a better one. Each scanline of the destination is bulk-loaded into a cached buffer on the stack (using the QuadWordCopy routine) before being processed. This is the primary benefit on uncached framebuffers, since it is necessary to minimise the number of accesses to such things and avoid write-to-read turnarounds. This also simplifies edge handling, since QuadWordCopy() can do a precise writeback efficiently via the write-combiner, allowing the main routine to "over-read" the scanline edge safely when required. This is why the glyph's mask data is also copied into a temporary buffer of known size. Each group of 8 pixels is then processed using fewer instructions, taking advantage of the lower precision requirements of the 6-bit destination (so a simpler pixel multiply can be used) and using a more efficient bit-repacking method. (As an aside, this patch removes nearly twice as much code as it introduces. Most of this is due to duplication of Ian's inner loop, since he has to handle narrow cases separately. RVCT support is of course preserved.) We measured the doubling of performance by rendering 96-pixel height glyph strings, which are fillrate limited rather than latency/overhead limited. The performance is also improved, albeit by a smaller amount, on the more usual smaller text, demonstrating that internal overhead is not a problem. --- pixman/pixman-arm-neon.c | 515 ++++++++++++++++------------------------------- 1 file changed, 178 insertions(+), 337 deletions(-) diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c index 467a0dd..1aefb5a 100644 --- a/pixman/pixman-arm-neon.c +++ b/pixman/pixman-arm-neon.c @@ -632,343 +632,6 @@ fbCompositeSrc_8888x8x8888neon ( } - -void -fbCompositeSolidMask_nx8x0565neon ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint16_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint32_t w; - uint8x8_t sval2; - uint8x8x4_t sval8; - - fbComposeGetSolid(pSrc, src, pDst->bits.format); - - srca = src >> 24; - if (src == 0) - return; - - sval2=vreinterpret_u8_u32(vdup_n_u32(src)); - sval8.val[0]=vdup_lane_u8(sval2,0); - sval8.val[1]=vdup_lane_u8(sval2,1); - sval8.val[2]=vdup_lane_u8(sval2,2); - sval8.val[3]=vdup_lane_u8(sval2,3); - - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); - - if (width>=8) - { - // Use overlapping 8-pixel method, modified to avoid rewritten dest being reused - while (height--) - { - uint16_t *keep_dst=0; - - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; - -#ifndef USE_GCC_INLINE_ASM - uint8x8_t alpha; - uint16x8_t dval, temp; - uint8x8x4_t sval8temp; - - alpha = vld1_u8((void*)mask); - dval = vld1q_u16((void*)dst); - keep_dst = dst; - - sval8temp = neon8mul(sval8,alpha); - temp = pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3])))); - - mask += (w & 7); - dst += (w & 7); - w -= (w & 7); - - while (w) - { - dval = vld1q_u16((void*)dst); - alpha = vld1_u8((void*)mask); - - 
vst1q_u16((void*)keep_dst,temp); - keep_dst = dst; - - sval8temp = neon8mul(sval8,alpha); - temp = pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3])))); - - mask+=8; - dst+=8; - w-=8; - } - vst1q_u16((void*)keep_dst,temp); -#else - asm volatile ( - "vdup.32 d0, %[src]\n\t" - "vdup.8 d1, d0[1]\n\t" - "vdup.8 d2, d0[2]\n\t" - "vdup.8 d3, d0[3]\n\t" - "vdup.8 d0, d0[0]\n\t" - - "vld1.8 {q12}, [%[dst]]\n\t" - "vld1.8 {d31}, [%[mask]]\n\t" - "mov %[keep_dst], %[dst]\n\t" - - "and ip, %[w], #7\n\t" - "add %[mask], %[mask], ip\n\t" - "add %[dst], %[dst], ip, LSL#1\n\t" - "subs %[w], %[w], ip\n\t" - "b 9f\n\t" -// LOOP - "2:\n\t" - - "vld1.16 {q12}, [%[dst]]!\n\t" - "vld1.8 {d31}, [%[mask]]!\n\t" - "vst1.16 {q10}, [%[keep_dst]]\n\t" - "sub %[keep_dst], %[dst], #8*2\n\t" - "subs %[w], %[w], #8\n\t" - "9:\n\t" -// expand 0565 q12 to 8888 {d4-d7} - "vmovn.u16 d4, q12\t\n" - "vshr.u16 q11, q12, #5\t\n" - "vshr.u16 q10, q12, #6+5\t\n" - "vmovn.u16 d5, q11\t\n" - "vmovn.u16 d6, q10\t\n" - "vshl.u8 d4, d4, #3\t\n" - "vshl.u8 d5, d5, #2\t\n" - "vshl.u8 d6, d6, #3\t\n" - "vsri.u8 d4, d4, #5\t\n" - "vsri.u8 d5, d5, #6\t\n" - "vsri.u8 d6, d6, #5\t\n" - - "vmull.u8 q10, d31, d0\n\t" - "vmull.u8 q11, d31, d1\n\t" - "vmull.u8 q12, d31, d2\n\t" - "vmull.u8 q13, d31, d3\n\t" - "vrshr.u16 q8, q10, #8\n\t" - "vrshr.u16 q9, q11, #8\n\t" - "vraddhn.u16 d20, q10, q8\n\t" - "vraddhn.u16 d21, q11, q9\n\t" - "vrshr.u16 q9, q13, #8\n\t" - "vrshr.u16 q8, q12, #8\n\t" - "vraddhn.u16 d23, q13, q9\n\t" - "vraddhn.u16 d22, q12, q8\n\t" - -// duplicate in 4/2/1 & 8pix vsns - "vmvn.8 d30, d23\n\t" - "vmull.u8 q14, d30, d6\n\t" - "vmull.u8 q13, d30, d5\n\t" - "vmull.u8 q12, d30, d4\n\t" - "vrshr.u16 q8, q14, #8\n\t" - "vrshr.u16 q9, q13, #8\n\t" - "vraddhn.u16 d6, q14, q8\n\t" - "vrshr.u16 q8, q12, #8\n\t" - "vraddhn.u16 d5, q13, q9\n\t" - "vqadd.u8 d6, d6, d22\n\t" // moved up - "vraddhn.u16 d4, q12, q8\n\t" -// intentionally don't calculate alpha -// result in d4-d6 - -// "vqadd.u8 d6, d6, d22\n\t" ** moved up - "vqadd.u8 d5, d5, d21\n\t" - "vqadd.u8 d4, d4, d20\n\t" - -// pack 8888 {d20-d23} to 0565 q10 - "vshll.u8 q10, d6, #8\n\t" - "vshll.u8 q3, d5, #8\n\t" - "vshll.u8 q2, d4, #8\n\t" - "vsri.u16 q10, q3, #5\t\n" - "vsri.u16 q10, q2, #11\t\n" - - "bne 2b\n\t" - - "1:\n\t" - "vst1.16 {q10}, [%[keep_dst]]\n\t" - - : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst) - : [src] "r" (src) - : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", - "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", - "d30","d31" - ); -#endif - } - } - else - { - while (height--) - { - void *dst4=0, *dst2=0; - - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; - - -#ifndef USE_GCC_INLINE_ASM - uint8x8_t alpha; - uint16x8_t dval, temp; - uint8x8x4_t sval8temp; - - if (w&4) - { - alpha = vreinterpret_u8_u32(vld1_lane_u32((void*)mask,vreinterpret_u32_u8(alpha),1)); - dval = vreinterpretq_u16_u64(vld1q_lane_u64((void*)dst,vreinterpretq_u64_u16(dval),1)); - dst4=dst; - mask+=4; - dst+=4; - } - if (w&2) - { - alpha = vreinterpret_u8_u16(vld1_lane_u16((void*)mask,vreinterpret_u16_u8(alpha),1)); - dval = vreinterpretq_u16_u32(vld1q_lane_u32((void*)dst,vreinterpretq_u32_u16(dval),1)); - dst2=dst; - mask+=2; - dst+=2; - } - if (w&1) - { - alpha = vld1_lane_u8((void*)mask,alpha,1); - dval = vld1q_lane_u16((void*)dst,dval,1); - } - - sval8temp = neon8mul(sval8,alpha); - temp = 
pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3])))); - - if (w&1) - vst1q_lane_u16((void*)dst,temp,1); - if (w&2) - vst1q_lane_u32((void*)dst2,vreinterpretq_u32_u16(temp),1); - if (w&4) - vst1q_lane_u64((void*)dst4,vreinterpretq_u64_u16(temp),1); -#else - asm volatile ( - "vdup.32 d0, %[src]\n\t" - "vdup.8 d1, d0[1]\n\t" - "vdup.8 d2, d0[2]\n\t" - "vdup.8 d3, d0[3]\n\t" - "vdup.8 d0, d0[0]\n\t" - - "tst %[w], #4\t\n" - "beq skip_load4\t\n" - - "vld1.64 {d25}, [%[dst]]\n\t" - "vld1.32 {d31[1]}, [%[mask]]\n\t" - "mov %[dst4], %[dst]\t\n" - "add %[mask], %[mask], #4\t\n" - "add %[dst], %[dst], #4*2\t\n" - - "skip_load4:\t\n" - "tst %[w], #2\t\n" - "beq skip_load2\t\n" - "vld1.32 {d24[1]}, [%[dst]]\n\t" - "vld1.16 {d31[1]}, [%[mask]]\n\t" - "mov %[dst2], %[dst]\t\n" - "add %[mask], %[mask], #2\t\n" - "add %[dst], %[dst], #2*2\t\n" - - "skip_load2:\t\n" - "tst %[w], #1\t\n" - "beq skip_load1\t\n" - "vld1.16 {d24[1]}, [%[dst]]\n\t" - "vld1.8 {d31[1]}, [%[mask]]\n\t" - - "skip_load1:\t\n" -// expand 0565 q12 to 8888 {d4-d7} - "vmovn.u16 d4, q12\t\n" - "vshr.u16 q11, q12, #5\t\n" - "vshr.u16 q10, q12, #6+5\t\n" - "vmovn.u16 d5, q11\t\n" - "vmovn.u16 d6, q10\t\n" - "vshl.u8 d4, d4, #3\t\n" - "vshl.u8 d5, d5, #2\t\n" - "vshl.u8 d6, d6, #3\t\n" - "vsri.u8 d4, d4, #5\t\n" - "vsri.u8 d5, d5, #6\t\n" - "vsri.u8 d6, d6, #5\t\n" - - "vmull.u8 q10, d31, d0\n\t" - "vmull.u8 q11, d31, d1\n\t" - "vmull.u8 q12, d31, d2\n\t" - "vmull.u8 q13, d31, d3\n\t" - "vrshr.u16 q8, q10, #8\n\t" - "vrshr.u16 q9, q11, #8\n\t" - "vraddhn.u16 d20, q10, q8\n\t" - "vraddhn.u16 d21, q11, q9\n\t" - "vrshr.u16 q9, q13, #8\n\t" - "vrshr.u16 q8, q12, #8\n\t" - "vraddhn.u16 d23, q13, q9\n\t" - "vraddhn.u16 d22, q12, q8\n\t" - -// duplicate in 4/2/1 & 8pix vsns - "vmvn.8 d30, d23\n\t" - "vmull.u8 q14, d30, d6\n\t" - "vmull.u8 q13, d30, d5\n\t" - "vmull.u8 q12, d30, d4\n\t" - "vrshr.u16 q8, q14, #8\n\t" - "vrshr.u16 q9, q13, #8\n\t" - "vraddhn.u16 d6, q14, q8\n\t" - "vrshr.u16 q8, q12, #8\n\t" - "vraddhn.u16 d5, q13, q9\n\t" - "vqadd.u8 d6, d6, d22\n\t" // moved up - "vraddhn.u16 d4, q12, q8\n\t" -// intentionally don't calculate alpha -// result in d4-d6 - -// "vqadd.u8 d6, d6, d22\n\t" ** moved up - "vqadd.u8 d5, d5, d21\n\t" - "vqadd.u8 d4, d4, d20\n\t" - -// pack 8888 {d20-d23} to 0565 q10 - "vshll.u8 q10, d6, #8\n\t" - "vshll.u8 q3, d5, #8\n\t" - "vshll.u8 q2, d4, #8\n\t" - "vsri.u16 q10, q3, #5\t\n" - "vsri.u16 q10, q2, #11\t\n" - - "tst %[w], #1\n\t" - "beq skip_store1\t\n" - "vst1.16 {d20[1]}, [%[dst]]\t\n" - "skip_store1:\t\n" - "tst %[w], #2\n\t" - "beq skip_store2\t\n" - "vst1.32 {d20[1]}, [%[dst2]]\t\n" - "skip_store2:\t\n" - "tst %[w], #4\n\t" - "beq skip_store4\t\n" - "vst1.16 {d21}, [%[dst4]]\t\n" - "skip_store4:\t\n" - - : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [dst4] "+r" (dst4), [dst2] "+r" (dst2) - : [src] "r" (src) - : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", - "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", - "d30","d31" - ); -#endif - } - } -} - - - void fbCompositeSolidMask_nx8x8888neon ( pixman_implementation_t * impl, @@ -1964,6 +1627,184 @@ static inline void QuadwordCopy_neon( } } +static inline void SolidOver565_8pix_neon( + uint32_t glyphColour, + uint16_t *dest, + uint8_t *inMask, + uint32_t destStride, // bytes, not elements + uint32_t maskStride, + uint32_t count // 8-pixel groups +) +{ + // Inner loop of glyph blitter (solid colour, alpha mask) + +#ifdef USE_GCC_INLINE_ASM + + asm volatile ( + " 
vld4.8 {d20[],d21[],d22[],d23[]}, [%[glyphColour]] @ splat solid colour components \n" + "0: @ loop \n" + " vld1.16 {d0,d1}, [%[dest]] @ load first pixels from framebuffer \n" + " vld1.8 {d17}, [%[inMask]] @ load alpha mask of glyph \n" + " vmull.u8 q9, d17, d23 @ apply glyph colour alpha to mask \n" + " vshrn.u16 d17, q9, #8 @ reformat it to match original mask \n" + " vmvn d18, d17 @ we need the inverse mask for the background \n" + " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n" + " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n" + " vshrn.u16 d4, q0, #3 @ unpack green \n" + " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n" + " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n" + " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n" + " vmull.u8 q1, d2, d18 @ apply inverse mask to background red... \n" + " vmull.u8 q2, d4, d18 @ ...green... \n" + " vmull.u8 q3, d6, d18 @ ...blue \n" + " subs %[count], %[count], #1 @ decrement/test loop counter \n" + " vmlal.u8 q1, d17, d22 @ add masked foreground red... \n" + " vmlal.u8 q2, d17, d21 @ ...green... \n" + " vmlal.u8 q3, d17, d20 @ ...blue \n" + " add %[inMask], %[inMask], %[maskStride] @ advance mask pointer, while we wait \n" + " vsri.16 q1, q2, #5 @ pack green behind red \n" + " vsri.16 q1, q3, #11 @ pack blue into pixels \n" + " vst1.16 {d2,d3}, [%[dest]] @ store composited pixels \n" + " add %[dest], %[dest], %[destStride] @ advance framebuffer pointer \n" + " bne 0b @ next please \n" + + // Clobbered registers marked as input/outputs + : [dest] "+r" (dest), [inMask] "+r" (inMask), [count] "+r" (count) + + // Inputs + : [destStride] "r" (destStride), [maskStride] "r" (maskStride), [glyphColour] "r" (&glyphColour) + + // Clobbers, including the inputs we modify, and potentially lots of memory + : "q0", "q1", "q2", "q3", "d17", "q9", "q10", "q11", "q12", "cc", "memory" + ); + +#else + + uint8x8x4_t solidColour = vld4_dup_u8((uint8_t*) &glyphColour); + + while(count--) + { + uint16x8_t pixels = vld1q_u16(dest); + uint8x8_t mask = vshrn_n_u16(vmull_u8(solidColour.val[3], vld1_u8(inMask)), 8); + uint8x8_t iMask = vmvn_u8(mask); + + uint8x8_t tRed = vshrn_n_u16(pixels, 8); + uint8x8_t tGreen = vshrn_n_u16(pixels, 3); + uint8x8_t tBlue = vshrn_n_u16(vsli_n_u8(pixels, pixels, 5), 2); + + uint16x8_t sRed = vmull_u8(vsri_n_u8(tRed , tRed , 5), iMask); + uint16x8_t sGreen = vmull_u8(vsri_n_u8(tGreen, tGreen, 6), iMask); + uint16x8_t sBlue = vmull_u8( tBlue , iMask); + + sRed = vmlal(sRed , mask, solidColour.val[2]); + sGreen = vmlal(sGreen, mask, solidColour.val[1]); + sBlue = vmlal(sBlue , mask, solidColour.val[0]); + + pixels = vsri_n_u16(sRed, sGreen, 5); + pixels = vsri_n_u16(pixels, sBlue, 11); + vst1q_u16(dest, pixels); + + dest += destStride; + mask += maskStride; + } + +#endif +} + +void +fbCompositeSolidMask_nx8x0565neon ( + pixman_implementation_t * impl, + pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int32_t xSrc, + int32_t ySrc, + int32_t xMask, + int32_t yMask, + int32_t xDst, + int32_t yDst, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint16_t *dstLine, *alignedLine; + uint8_t *maskLine; + uint32_t dstStride, maskStride; + uint32_t kernelCount, copyCount; + uint8_t kernelOffset, copyOffset; + + fbComposeGetSolid(pSrc, src, pDst->bits.format); + + // bail out if fully transparent or degenerate + srca = src >> 24; + if(srca == 0) + return; + if(width == 0 || height == 0) + return; + + if(width > 
NEON_SCANLINE_BUFFER_PIXELS) { + // split the blit, so we can use a fixed-size scanline buffer + // TODO: there must be a more elegant way of doing this. + int x; + for(x=0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) { + fbCompositeSolidMask_nx8x0565neon(impl, op, pSrc, pMask, pDst, xSrc+x, ySrc, xMask+x, yMask, xDst+x, yDst, + (x+NEON_SCANLINE_BUFFER_PIXELS > width) ? width-x : NEON_SCANLINE_BUFFER_PIXELS, height); + } + return; + } + + fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); + fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + + // keep within minimum number of aligned quadwords on width + // while also keeping the minimum number of columns to process + { + unsigned long alignedLeft = (unsigned long)(dstLine) & ~0xF; + unsigned long alignedRight = (((unsigned long)(dstLine + width)) + 0xF) & ~0xF; + unsigned long ceilingLength = (((unsigned long) width) * sizeof(*dstLine) + 0xF) & ~0xF; + + // the fast copy must always be quadword aligned + copyOffset = dstLine - ((uint16_t*) alignedLeft); + alignedLine = dstLine - copyOffset; + copyCount = (uint32_t) ((alignedRight - alignedLeft) >> 4); + + if(alignedRight - alignedLeft > ceilingLength) { + // unaligned routine is tightest, and will not overrun + kernelCount = (uint32_t) (ceilingLength >> 4); + kernelOffset = copyOffset; + } else { + // aligned routine is equally tight, so it is safer to align + kernelCount = copyCount; + kernelOffset = 0; + } + } + + { + uint16_t scanLine[NEON_SCANLINE_BUFFER_PIXELS + 8]; // deliberately not initialised + uint8_t glyphLine[NEON_SCANLINE_BUFFER_PIXELS + 8]; + int y = height; + + // row-major order + // left edge, middle block, right edge + for( ; y--; maskLine += maskStride, alignedLine += dstStride, dstLine += dstStride) { + // We don't want to overrun the edges of the glyph, so realign the edge data into known buffers + QuadwordCopy_neon(glyphLine + copyOffset, maskLine, width >> 4, width & 0xF); + + // Uncached framebuffer access is really, really slow if we do it piecemeal. + // It should be much faster if we grab it all at once. + // One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth. + QuadwordCopy_neon(scanLine, alignedLine, copyCount, 0); + + // Apply the actual filter + SolidOver565_8pix_neon(src, scanLine + kernelOffset, glyphLine + kernelOffset, 8 * sizeof(*dstLine), 8, kernelCount); + + // Copy the modified scanline back + QuadwordCopy_neon(dstLine, scanLine + copyOffset, width >> 3, (width & 7) * 2); + } + } +} + static const FastPathInfo arm_neon_fast_path_array[] = { { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeSrcAdd_8888x8x8neon, 0 }, -- 2.7.4
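
The commit message refers to "a more efficient bit-repacking method" for the 6-bit-per-channel destination. The following standalone sketch (not part of the patch; the function name and layout are illustrative only) shows, in C intrinsics equivalent to the assembly above, the vshrn/vsri/vsli idiom the new inner loop uses to expand one group of eight r5g6b5 pixels to 8-bit channels and pack them back again:

#include <arm_neon.h>
#include <stdint.h>

// Hypothetical helper, for illustration only -- not part of pixman.
static inline void unpack_repack_0565_8pix(const uint16_t *src, uint16_t *dst)
{
    uint16x8_t pix = vld1q_u16(src);

    // Unpack r5g6b5 into 8-bit channels, replicating the top bits of
    // each field so that 5/6-bit values become full-range 8-bit ones.
    uint8x8_t red   = vshrn_n_u16(pix, 8);                        // red now in the top 5 bits
    uint8x8_t green = vshrn_n_u16(pix, 3);                        // green now in the top 6 bits
    uint8x8_t blue  = vshrn_n_u16(vsliq_n_u16(pix, pix, 5), 2);   // blue duplicated, take the top 8

    red   = vsri_n_u8(red,   red,   5);   // extend 5 bits to 8
    green = vsri_n_u8(green, green, 6);   // extend 6 bits to 8

    // ... per-channel blending on 8-bit values would happen here ...

    // Repack: widen each channel into the top byte of a 16-bit lane,
    // then shift-insert green and blue underneath red.
    uint16x8_t out = vshll_n_u8(red, 8);
    out = vsriq_n_u16(out, vshll_n_u8(green, 8), 5);
    out = vsriq_n_u16(out, vshll_n_u8(blue,  8), 11);

    vst1q_u16(dst, out);
}

Because the final vsri pack keeps only the top 5 or 6 bits of each 16-bit product, the blend feeding it can get away with a plain narrowing shift rather than the rounding vrshr/vraddhn correction used elsewhere, which appears to be the "simpler pixel multiply" advantage the commit message describes.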
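
The scanline-staging structure described in the commit message (bulk-load a scanline into a stack buffer, composite it in 8-pixel groups that may over-read only inside the buffer, then write it back once with an exact length) can also be summarised independently of the NEON details. A minimal sketch under those assumptions, with hypothetical names and plain memcpy standing in for QuadwordCopy_neon:

#include <stdint.h>
#include <string.h>

#define SCANLINE_PIXELS 1024   // assumed buffer size, cf. NEON_SCANLINE_BUFFER_PIXELS

typedef void (*kernel8_t)(uint16_t *dst, const uint8_t *mask, uint32_t groups);

static void
composite_via_scanline_buffer(uint16_t *dst, uint32_t dstStride,        // strides in elements
                              const uint8_t *mask, uint32_t maskStride,
                              uint32_t width, uint32_t height, kernel8_t kernel)
{
    // Padded so an 8-pixel kernel may safely read a little past 'width'.
    uint16_t lineBuf[SCANLINE_PIXELS + 8];
    uint8_t  maskBuf[SCANLINE_PIXELS + 8];

    while (height--) {
        memcpy(lineBuf, dst,  width * sizeof(*dst));   // one bulk read of the (uncached) destination
        memcpy(maskBuf, mask, width);                  // stage the glyph data the same way

        kernel(lineBuf, maskBuf, (width + 7) / 8);     // whole 8-pixel groups, over-reading only the buffers

        memcpy(dst, lineBuf, width * sizeof(*dst));    // one exact-length bulk write back

        dst  += dstStride;
        mask += maskStride;
    }
}

The routine in the patch additionally quadword-aligns the staging copies and lets QuadwordCopy_neon handle the precise tail writeback, but the read-once/write-once shape per scanline is the same.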