From 8eeeca993252edc39da9c5c57545b81215fafc81 Mon Sep 17 00:00:00 2001 From: Jonathan Morton Date: Tue, 16 Jun 2009 12:08:29 -0400 Subject: [PATCH] [NEON] Replace Ian's glyph-blitter with a better one. Each scanline of the destination is bulk-loaded into a cached buffer on the stack (using the QuadWordCopy routine) before being processed. This is the primary benefit on uncached framebuffers, since it is necessary to minimise the number of accesses to such things and avoid write-to-read turnarounds. This also simplifies edge handling, since QuadWordCopy() can do a precise writeback efficiently via the write-combiner, allowing the main routine to "over-read" the scanline edge safely when required. This is why the glyph's mask data is also copied into a temporary buffer of known size. Each group of 8 pixels is then processed using fewer instructions, taking advantage of the lower precision requirements of the 6-bit destination (so a simpler pixel multiply can be used) and using a more efficient bit-repacking method. (As an aside, this patch removes nearly twice as much code as it introduces. Most of this is due to duplication of Ian's inner loop, since he has to handle narrow cases separately. RVCT support is of course preserved.) We measured the doubling of performance by rendering 96-pixel height glyph strings, which are fillrate limited rather than latency/overhead limited. The performance is also improved, albeit by a smaller amount, on the more usual smaller text, demonstrating that internal overhead is not a problem. --- pixman/pixman-arm-neon.c | 515 ++++++++++++++++------------------------------- 1 file changed, 178 insertions(+), 337 deletions(-) diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c index 467a0dd..1aefb5a 100644 --- a/pixman/pixman-arm-neon.c +++ b/pixman/pixman-arm-neon.c @@ -632,343 +632,6 @@ fbCompositeSrc_8888x8x8888neon ( } - -void -fbCompositeSolidMask_nx8x0565neon ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint16_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint32_t w; - uint8x8_t sval2; - uint8x8x4_t sval8; - - fbComposeGetSolid(pSrc, src, pDst->bits.format); - - srca = src >> 24; - if (src == 0) - return; - - sval2=vreinterpret_u8_u32(vdup_n_u32(src)); - sval8.val[0]=vdup_lane_u8(sval2,0); - sval8.val[1]=vdup_lane_u8(sval2,1); - sval8.val[2]=vdup_lane_u8(sval2,2); - sval8.val[3]=vdup_lane_u8(sval2,3); - - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); - - if (width>=8) - { - // Use overlapping 8-pixel method, modified to avoid rewritten dest being reused - while (height--) - { - uint16_t *keep_dst=0; - - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; - -#ifndef USE_GCC_INLINE_ASM - uint8x8_t alpha; - uint16x8_t dval, temp; - uint8x8x4_t sval8temp; - - alpha = vld1_u8((void*)mask); - dval = vld1q_u16((void*)dst); - keep_dst = dst; - - sval8temp = neon8mul(sval8,alpha); - temp = pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3])))); - - mask += (w & 7); - dst += (w & 7); - w -= (w & 7); - - while (w) - { - dval = vld1q_u16((void*)dst); - alpha = vld1_u8((void*)mask); - - 
vst1q_u16((void*)keep_dst,temp); - keep_dst = dst; - - sval8temp = neon8mul(sval8,alpha); - temp = pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3])))); - - mask+=8; - dst+=8; - w-=8; - } - vst1q_u16((void*)keep_dst,temp); -#else - asm volatile ( - "vdup.32 d0, %[src]\n\t" - "vdup.8 d1, d0[1]\n\t" - "vdup.8 d2, d0[2]\n\t" - "vdup.8 d3, d0[3]\n\t" - "vdup.8 d0, d0[0]\n\t" - - "vld1.8 {q12}, [%[dst]]\n\t" - "vld1.8 {d31}, [%[mask]]\n\t" - "mov %[keep_dst], %[dst]\n\t" - - "and ip, %[w], #7\n\t" - "add %[mask], %[mask], ip\n\t" - "add %[dst], %[dst], ip, LSL#1\n\t" - "subs %[w], %[w], ip\n\t" - "b 9f\n\t" -// LOOP - "2:\n\t" - - "vld1.16 {q12}, [%[dst]]!\n\t" - "vld1.8 {d31}, [%[mask]]!\n\t" - "vst1.16 {q10}, [%[keep_dst]]\n\t" - "sub %[keep_dst], %[dst], #8*2\n\t" - "subs %[w], %[w], #8\n\t" - "9:\n\t" -// expand 0565 q12 to 8888 {d4-d7} - "vmovn.u16 d4, q12\t\n" - "vshr.u16 q11, q12, #5\t\n" - "vshr.u16 q10, q12, #6+5\t\n" - "vmovn.u16 d5, q11\t\n" - "vmovn.u16 d6, q10\t\n" - "vshl.u8 d4, d4, #3\t\n" - "vshl.u8 d5, d5, #2\t\n" - "vshl.u8 d6, d6, #3\t\n" - "vsri.u8 d4, d4, #5\t\n" - "vsri.u8 d5, d5, #6\t\n" - "vsri.u8 d6, d6, #5\t\n" - - "vmull.u8 q10, d31, d0\n\t" - "vmull.u8 q11, d31, d1\n\t" - "vmull.u8 q12, d31, d2\n\t" - "vmull.u8 q13, d31, d3\n\t" - "vrshr.u16 q8, q10, #8\n\t" - "vrshr.u16 q9, q11, #8\n\t" - "vraddhn.u16 d20, q10, q8\n\t" - "vraddhn.u16 d21, q11, q9\n\t" - "vrshr.u16 q9, q13, #8\n\t" - "vrshr.u16 q8, q12, #8\n\t" - "vraddhn.u16 d23, q13, q9\n\t" - "vraddhn.u16 d22, q12, q8\n\t" - -// duplicate in 4/2/1 & 8pix vsns - "vmvn.8 d30, d23\n\t" - "vmull.u8 q14, d30, d6\n\t" - "vmull.u8 q13, d30, d5\n\t" - "vmull.u8 q12, d30, d4\n\t" - "vrshr.u16 q8, q14, #8\n\t" - "vrshr.u16 q9, q13, #8\n\t" - "vraddhn.u16 d6, q14, q8\n\t" - "vrshr.u16 q8, q12, #8\n\t" - "vraddhn.u16 d5, q13, q9\n\t" - "vqadd.u8 d6, d6, d22\n\t" // moved up - "vraddhn.u16 d4, q12, q8\n\t" -// intentionally don't calculate alpha -// result in d4-d6 - -// "vqadd.u8 d6, d6, d22\n\t" ** moved up - "vqadd.u8 d5, d5, d21\n\t" - "vqadd.u8 d4, d4, d20\n\t" - -// pack 8888 {d20-d23} to 0565 q10 - "vshll.u8 q10, d6, #8\n\t" - "vshll.u8 q3, d5, #8\n\t" - "vshll.u8 q2, d4, #8\n\t" - "vsri.u16 q10, q3, #5\t\n" - "vsri.u16 q10, q2, #11\t\n" - - "bne 2b\n\t" - - "1:\n\t" - "vst1.16 {q10}, [%[keep_dst]]\n\t" - - : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst) - : [src] "r" (src) - : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", - "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", - "d30","d31" - ); -#endif - } - } - else - { - while (height--) - { - void *dst4=0, *dst2=0; - - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; - - -#ifndef USE_GCC_INLINE_ASM - uint8x8_t alpha; - uint16x8_t dval, temp; - uint8x8x4_t sval8temp; - - if (w&4) - { - alpha = vreinterpret_u8_u32(vld1_lane_u32((void*)mask,vreinterpret_u32_u8(alpha),1)); - dval = vreinterpretq_u16_u64(vld1q_lane_u64((void*)dst,vreinterpretq_u64_u16(dval),1)); - dst4=dst; - mask+=4; - dst+=4; - } - if (w&2) - { - alpha = vreinterpret_u8_u16(vld1_lane_u16((void*)mask,vreinterpret_u16_u8(alpha),1)); - dval = vreinterpretq_u16_u32(vld1q_lane_u32((void*)dst,vreinterpretq_u32_u16(dval),1)); - dst2=dst; - mask+=2; - dst+=2; - } - if (w&1) - { - alpha = vld1_lane_u8((void*)mask,alpha,1); - dval = vld1q_lane_u16((void*)dst,dval,1); - } - - sval8temp = neon8mul(sval8,alpha); - temp = 
pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3])))); - - if (w&1) - vst1q_lane_u16((void*)dst,temp,1); - if (w&2) - vst1q_lane_u32((void*)dst2,vreinterpretq_u32_u16(temp),1); - if (w&4) - vst1q_lane_u64((void*)dst4,vreinterpretq_u64_u16(temp),1); -#else - asm volatile ( - "vdup.32 d0, %[src]\n\t" - "vdup.8 d1, d0[1]\n\t" - "vdup.8 d2, d0[2]\n\t" - "vdup.8 d3, d0[3]\n\t" - "vdup.8 d0, d0[0]\n\t" - - "tst %[w], #4\t\n" - "beq skip_load4\t\n" - - "vld1.64 {d25}, [%[dst]]\n\t" - "vld1.32 {d31[1]}, [%[mask]]\n\t" - "mov %[dst4], %[dst]\t\n" - "add %[mask], %[mask], #4\t\n" - "add %[dst], %[dst], #4*2\t\n" - - "skip_load4:\t\n" - "tst %[w], #2\t\n" - "beq skip_load2\t\n" - "vld1.32 {d24[1]}, [%[dst]]\n\t" - "vld1.16 {d31[1]}, [%[mask]]\n\t" - "mov %[dst2], %[dst]\t\n" - "add %[mask], %[mask], #2\t\n" - "add %[dst], %[dst], #2*2\t\n" - - "skip_load2:\t\n" - "tst %[w], #1\t\n" - "beq skip_load1\t\n" - "vld1.16 {d24[1]}, [%[dst]]\n\t" - "vld1.8 {d31[1]}, [%[mask]]\n\t" - - "skip_load1:\t\n" -// expand 0565 q12 to 8888 {d4-d7} - "vmovn.u16 d4, q12\t\n" - "vshr.u16 q11, q12, #5\t\n" - "vshr.u16 q10, q12, #6+5\t\n" - "vmovn.u16 d5, q11\t\n" - "vmovn.u16 d6, q10\t\n" - "vshl.u8 d4, d4, #3\t\n" - "vshl.u8 d5, d5, #2\t\n" - "vshl.u8 d6, d6, #3\t\n" - "vsri.u8 d4, d4, #5\t\n" - "vsri.u8 d5, d5, #6\t\n" - "vsri.u8 d6, d6, #5\t\n" - - "vmull.u8 q10, d31, d0\n\t" - "vmull.u8 q11, d31, d1\n\t" - "vmull.u8 q12, d31, d2\n\t" - "vmull.u8 q13, d31, d3\n\t" - "vrshr.u16 q8, q10, #8\n\t" - "vrshr.u16 q9, q11, #8\n\t" - "vraddhn.u16 d20, q10, q8\n\t" - "vraddhn.u16 d21, q11, q9\n\t" - "vrshr.u16 q9, q13, #8\n\t" - "vrshr.u16 q8, q12, #8\n\t" - "vraddhn.u16 d23, q13, q9\n\t" - "vraddhn.u16 d22, q12, q8\n\t" - -// duplicate in 4/2/1 & 8pix vsns - "vmvn.8 d30, d23\n\t" - "vmull.u8 q14, d30, d6\n\t" - "vmull.u8 q13, d30, d5\n\t" - "vmull.u8 q12, d30, d4\n\t" - "vrshr.u16 q8, q14, #8\n\t" - "vrshr.u16 q9, q13, #8\n\t" - "vraddhn.u16 d6, q14, q8\n\t" - "vrshr.u16 q8, q12, #8\n\t" - "vraddhn.u16 d5, q13, q9\n\t" - "vqadd.u8 d6, d6, d22\n\t" // moved up - "vraddhn.u16 d4, q12, q8\n\t" -// intentionally don't calculate alpha -// result in d4-d6 - -// "vqadd.u8 d6, d6, d22\n\t" ** moved up - "vqadd.u8 d5, d5, d21\n\t" - "vqadd.u8 d4, d4, d20\n\t" - -// pack 8888 {d20-d23} to 0565 q10 - "vshll.u8 q10, d6, #8\n\t" - "vshll.u8 q3, d5, #8\n\t" - "vshll.u8 q2, d4, #8\n\t" - "vsri.u16 q10, q3, #5\t\n" - "vsri.u16 q10, q2, #11\t\n" - - "tst %[w], #1\n\t" - "beq skip_store1\t\n" - "vst1.16 {d20[1]}, [%[dst]]\t\n" - "skip_store1:\t\n" - "tst %[w], #2\n\t" - "beq skip_store2\t\n" - "vst1.32 {d20[1]}, [%[dst2]]\t\n" - "skip_store2:\t\n" - "tst %[w], #4\n\t" - "beq skip_store4\t\n" - "vst1.16 {d21}, [%[dst4]]\t\n" - "skip_store4:\t\n" - - : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [dst4] "+r" (dst4), [dst2] "+r" (dst2) - : [src] "r" (src) - : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", - "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", - "d30","d31" - ); -#endif - } - } -} - - - void fbCompositeSolidMask_nx8x8888neon ( pixman_implementation_t * impl, @@ -1964,6 +1627,184 @@ static inline void QuadwordCopy_neon( } } +static inline void SolidOver565_8pix_neon( + uint32_t glyphColour, + uint16_t *dest, + uint8_t *inMask, + uint32_t destStride, // bytes, not elements + uint32_t maskStride, + uint32_t count // 8-pixel groups +) +{ + // Inner loop of glyph blitter (solid colour, alpha mask) + +#ifdef USE_GCC_INLINE_ASM + + asm volatile ( + " 
vld4.8 {d20[],d21[],d22[],d23[]}, [%[glyphColour]] @ splat solid colour components \n" + "0: @ loop \n" + " vld1.16 {d0,d1}, [%[dest]] @ load first pixels from framebuffer \n" + " vld1.8 {d17}, [%[inMask]] @ load alpha mask of glyph \n" + " vmull.u8 q9, d17, d23 @ apply glyph colour alpha to mask \n" + " vshrn.u16 d17, q9, #8 @ reformat it to match original mask \n" + " vmvn d18, d17 @ we need the inverse mask for the background \n" + " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n" + " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n" + " vshrn.u16 d4, q0, #3 @ unpack green \n" + " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n" + " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n" + " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n" + " vmull.u8 q1, d2, d18 @ apply inverse mask to background red... \n" + " vmull.u8 q2, d4, d18 @ ...green... \n" + " vmull.u8 q3, d6, d18 @ ...blue \n" + " subs %[count], %[count], #1 @ decrement/test loop counter \n" + " vmlal.u8 q1, d17, d22 @ add masked foreground red... \n" + " vmlal.u8 q2, d17, d21 @ ...green... \n" + " vmlal.u8 q3, d17, d20 @ ...blue \n" + " add %[inMask], %[inMask], %[maskStride] @ advance mask pointer, while we wait \n" + " vsri.16 q1, q2, #5 @ pack green behind red \n" + " vsri.16 q1, q3, #11 @ pack blue into pixels \n" + " vst1.16 {d2,d3}, [%[dest]] @ store composited pixels \n" + " add %[dest], %[dest], %[destStride] @ advance framebuffer pointer \n" + " bne 0b @ next please \n" + + // Clobbered registers marked as input/outputs + : [dest] "+r" (dest), [inMask] "+r" (inMask), [count] "+r" (count) + + // Inputs + : [destStride] "r" (destStride), [maskStride] "r" (maskStride), [glyphColour] "r" (&glyphColour) + + // Clobbers, including the inputs we modify, and potentially lots of memory + : "q0", "q1", "q2", "q3", "d17", "q9", "q10", "q11", "q12", "cc", "memory" + ); + +#else + + uint8x8x4_t solidColour = vld4_dup_u8((uint8_t*) &glyphColour); + + while(count--) + { + uint16x8_t pixels = vld1q_u16(dest); + uint8x8_t mask = vshrn_n_u16(vmull_u8(solidColour.val[3], vld1_u8(inMask)), 8); + uint8x8_t iMask = vmvn_u8(mask); + + uint8x8_t tRed = vshrn_n_u16(pixels, 8); + uint8x8_t tGreen = vshrn_n_u16(pixels, 3); + uint8x8_t tBlue = vshrn_n_u16(vsli_n_u8(pixels, pixels, 5), 2); + + uint16x8_t sRed = vmull_u8(vsri_n_u8(tRed , tRed , 5), iMask); + uint16x8_t sGreen = vmull_u8(vsri_n_u8(tGreen, tGreen, 6), iMask); + uint16x8_t sBlue = vmull_u8( tBlue , iMask); + + sRed = vmlal(sRed , mask, solidColour.val[2]); + sGreen = vmlal(sGreen, mask, solidColour.val[1]); + sBlue = vmlal(sBlue , mask, solidColour.val[0]); + + pixels = vsri_n_u16(sRed, sGreen, 5); + pixels = vsri_n_u16(pixels, sBlue, 11); + vst1q_u16(dest, pixels); + + dest += destStride; + mask += maskStride; + } + +#endif +} + +void +fbCompositeSolidMask_nx8x0565neon ( + pixman_implementation_t * impl, + pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int32_t xSrc, + int32_t ySrc, + int32_t xMask, + int32_t yMask, + int32_t xDst, + int32_t yDst, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint16_t *dstLine, *alignedLine; + uint8_t *maskLine; + uint32_t dstStride, maskStride; + uint32_t kernelCount, copyCount; + uint8_t kernelOffset, copyOffset; + + fbComposeGetSolid(pSrc, src, pDst->bits.format); + + // bail out if fully transparent or degenerate + srca = src >> 24; + if(srca == 0) + return; + if(width == 0 || height == 0) + return; + + if(width > 
NEON_SCANLINE_BUFFER_PIXELS) { + // split the blit, so we can use a fixed-size scanline buffer + // TODO: there must be a more elegant way of doing this. + int x; + for(x=0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) { + fbCompositeSolidMask_nx8x0565neon(impl, op, pSrc, pMask, pDst, xSrc+x, ySrc, xMask+x, yMask, xDst+x, yDst, + (x+NEON_SCANLINE_BUFFER_PIXELS > width) ? width-x : NEON_SCANLINE_BUFFER_PIXELS, height); + } + return; + } + + fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); + fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + + // keep within minimum number of aligned quadwords on width + // while also keeping the minimum number of columns to process + { + unsigned long alignedLeft = (unsigned long)(dstLine) & ~0xF; + unsigned long alignedRight = (((unsigned long)(dstLine + width)) + 0xF) & ~0xF; + unsigned long ceilingLength = (((unsigned long) width) * sizeof(*dstLine) + 0xF) & ~0xF; + + // the fast copy must always be quadword aligned + copyOffset = dstLine - ((uint16_t*) alignedLeft); + alignedLine = dstLine - copyOffset; + copyCount = (uint32_t) ((alignedRight - alignedLeft) >> 4); + + if(alignedRight - alignedLeft > ceilingLength) { + // unaligned routine is tightest, and will not overrun + kernelCount = (uint32_t) (ceilingLength >> 4); + kernelOffset = copyOffset; + } else { + // aligned routine is equally tight, so it is safer to align + kernelCount = copyCount; + kernelOffset = 0; + } + } + + { + uint16_t scanLine[NEON_SCANLINE_BUFFER_PIXELS + 8]; // deliberately not initialised + uint8_t glyphLine[NEON_SCANLINE_BUFFER_PIXELS + 8]; + int y = height; + + // row-major order + // left edge, middle block, right edge + for( ; y--; maskLine += maskStride, alignedLine += dstStride, dstLine += dstStride) { + // We don't want to overrun the edges of the glyph, so realign the edge data into known buffers + QuadwordCopy_neon(glyphLine + copyOffset, maskLine, width >> 4, width & 0xF); + + // Uncached framebuffer access is really, really slow if we do it piecemeal. + // It should be much faster if we grab it all at once. + // One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth. + QuadwordCopy_neon(scanLine, alignedLine, copyCount, 0); + + // Apply the actual filter + SolidOver565_8pix_neon(src, scanLine + kernelOffset, glyphLine + kernelOffset, 8 * sizeof(*dstLine), 8, kernelCount); + + // Copy the modified scanline back + QuadwordCopy_neon(dstLine, scanLine + copyOffset, width >> 3, (width & 7) * 2); + } + } +} + static const FastPathInfo arm_neon_fast_path_array[] = { { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeSrcAdd_8888x8x8neon, 0 }, -- 2.7.4
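
The commit message refers to "a more efficient bit-repacking method" for the 6-bit-per-channel destination. The following standalone sketch (not part of the patch; the function name and layout are illustrative only) shows, in C intrinsics equivalent to the assembly above, the vshrn/vsri/vsli idiom the new inner loop uses to expand one group of eight r5g6b5 pixels to 8-bit channels and pack them back again:

#include <arm_neon.h>
#include <stdint.h>

// Hypothetical helper, for illustration only -- not part of pixman.
static inline void unpack_repack_0565_8pix(const uint16_t *src, uint16_t *dst)
{
    uint16x8_t pix = vld1q_u16(src);

    // Unpack r5g6b5 into 8-bit channels, replicating the top bits of
    // each field so that 5/6-bit values become full-range 8-bit ones.
    uint8x8_t red   = vshrn_n_u16(pix, 8);                        // red now in the top 5 bits
    uint8x8_t green = vshrn_n_u16(pix, 3);                        // green now in the top 6 bits
    uint8x8_t blue  = vshrn_n_u16(vsliq_n_u16(pix, pix, 5), 2);   // blue duplicated, take the top 8

    red   = vsri_n_u8(red,   red,   5);   // extend 5 bits to 8
    green = vsri_n_u8(green, green, 6);   // extend 6 bits to 8

    // ... per-channel blending on 8-bit values would happen here ...

    // Repack: widen each channel into the top byte of a 16-bit lane,
    // then shift-insert green and blue underneath red.
    uint16x8_t out = vshll_n_u8(red, 8);
    out = vsriq_n_u16(out, vshll_n_u8(green, 8), 5);
    out = vsriq_n_u16(out, vshll_n_u8(blue,  8), 11);

    vst1q_u16(dst, out);
}

Because the final vsri pack keeps only the top 5 or 6 bits of each 16-bit product, the blend feeding it can get away with a plain narrowing shift rather than the rounding vrshr/vraddhn correction used elsewhere, which appears to be the "simpler pixel multiply" advantage the commit message describes.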
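
The scanline-staging structure described in the commit message (bulk-load a scanline into a stack buffer, composite it in 8-pixel groups that may over-read only inside the buffer, then write it back once with an exact length) can also be summarised independently of the NEON details. A minimal sketch under those assumptions, with hypothetical names and plain memcpy standing in for QuadwordCopy_neon:

#include <stdint.h>
#include <string.h>

#define SCANLINE_PIXELS 1024   // assumed buffer size, cf. NEON_SCANLINE_BUFFER_PIXELS

typedef void (*kernel8_t)(uint16_t *dst, const uint8_t *mask, uint32_t groups);

static void
composite_via_scanline_buffer(uint16_t *dst, uint32_t dstStride,        // strides in elements
                              const uint8_t *mask, uint32_t maskStride,
                              uint32_t width, uint32_t height, kernel8_t kernel)
{
    // Padded so an 8-pixel kernel may safely read a little past 'width'.
    uint16_t lineBuf[SCANLINE_PIXELS + 8];
    uint8_t  maskBuf[SCANLINE_PIXELS + 8];

    while (height--) {
        memcpy(lineBuf, dst,  width * sizeof(*dst));   // one bulk read of the (uncached) destination
        memcpy(maskBuf, mask, width);                  // stage the glyph data the same way

        kernel(lineBuf, maskBuf, (width + 7) / 8);     // whole 8-pixel groups, over-reading only the buffers

        memcpy(dst, lineBuf, width * sizeof(*dst));    // one exact-length bulk write back

        dst  += dstStride;
        mask += maskStride;
    }
}

The routine in the patch additionally quadword-aligns the staging copies and lets QuadwordCopy_neon handle the precise tail writeback, but the read-once/write-once shape per scanline is the same.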