From b1cb5922f785310ef790811b52e4e2b0c85dfccc Mon Sep 17 00:00:00 2001
From: Jonathan Morton
Date: Mon, 15 Jun 2009 16:09:32 +0300
Subject: [PATCH] Add RVCT support for straight blitter.

---
 pixman/pixman-arm-neon.c | 71 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 50 insertions(+), 21 deletions(-)

diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 5fd82ab..467a0dd 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -1837,7 +1837,6 @@ pixman_fill_neon (uint32_t *bits,
 #endif
 }
 
-#ifdef USE_GCC_INLINE_ASM
 // TODO: is there a more generic way of doing this being introduced?
 #define NEON_SCANLINE_BUFFER_PIXELS (1024)
 
@@ -1849,11 +1848,16 @@ static inline void QuadwordCopy_neon(
     uint32_t trailerCount  // of bytes
 )
 {
+    uint8_t *tDst = dst, *tSrc = src;
+
     // Uses aligned multi-register loads to maximise read bandwidth
     // on uncached memory such as framebuffers
     // The accesses do not have the aligned qualifiers, so that the copy
     // may convert between aligned-uncached and unaligned-cached memory.
     // It is assumed that the CPU can infer alignedness from the address.
+
+#ifdef USE_GCC_INLINE_ASM
+
     asm volatile (
     "        cmp        %[count], #8                         \n"
     "        blt        1f    @ skip oversized fragments     \n"
@@ -1889,7 +1893,7 @@ static inline void QuadwordCopy_neon(
     "4:      @ end                                           \n"
 
     // Clobbered input registers marked as input/outputs
-    : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
+    : [dst] "+r" (tDst), [src] "+r" (tSrc), [count] "+r" (count)
 
     // No unclobbered inputs
     :
@@ -1899,32 +1903,67 @@ static inline void QuadwordCopy_neon(
     : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
     );
 
+#else
+
+    while(count >= 8) {
+        uint8x16x4_t t1 = vld4q_u8(tSrc);
+        uint8x16x4_t t2 = vld4q_u8(tSrc + sizeof(uint8x16x4_t));
+        tSrc += sizeof(uint8x16x4_t) * 2;
+        vst4q_u8(tDst, t1);
+        vst4q_u8(tDst + sizeof(uint8x16x4_t), t2);
+        tDst += sizeof(uint8x16x4_t) * 2;
+        count -= 8;
+    }
+
+    if(count & 4) {
+        uint8x16x4_t t1 = vld4q_u8(tSrc);
+        tSrc += sizeof(uint8x16x4_t);
+        vst4q_u8(tDst, t1);
+        tDst += sizeof(uint8x16x4_t);
+    }
+
+    if(count & 2) {
+        uint8x8x4_t t1 = vld4_u8(tSrc);
+        tSrc += sizeof(uint8x8x4_t);
+        vst4_u8(tDst, t1);
+        tDst += sizeof(uint8x8x4_t);
+    }
+
+    if(count & 1) {
+        uint8x16_t t1 = vld1q_u8(tSrc);
+        tSrc += sizeof(uint8x16_t);
+        vst1q_u8(tDst, t1);
+        tDst += sizeof(uint8x16_t);
+    }
+
+#endif  // !USE_GCC_INLINE_ASM
+
     if(trailerCount) {
-        uint8_t *tDst = dst, *tSrc = src;
+        if(trailerCount & 8) {
+            uint8x8_t t1 = vld1_u8(tSrc);
+            tSrc += sizeof(uint8x8_t);
+            vst1_u8(tDst, t1);
+            tDst += sizeof(uint8x8_t);
+        }
 
-        while(trailerCount >= 4) {
+        if(trailerCount & 4) {
             *((uint32_t*) tDst) = *((uint32_t*) tSrc);
             tDst += 4;
             tSrc += 4;
-            trailerCount -= 4;
         }
 
-        if(trailerCount >= 2) {
+        if(trailerCount & 2) {
             *((uint16_t*) tDst) = *((uint16_t*) tSrc);
             tDst += 2;
             tSrc += 2;
-            trailerCount -= 2;
         }
 
-        if(trailerCount) {
+        if(trailerCount & 1) {
             *tDst++ = *tSrc++;
-            trailerCount--;
         }
     }
 }
 
-#endif // USE_GCC_INLINE_ASM
-
 static const FastPathInfo arm_neon_fast_path_array[] =
 {
     { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeSrcAdd_8888x8x8neon, 0 },
@@ -1999,12 +2038,9 @@ pixman_blt_neon (
     int dst_x, int dst_y,
     int width, int height)
 {
-
     if(!width || !height)
         return TRUE;
 
-#ifdef USE_GCC_INLINE_ASM
-
     // accelerate only straight copies involving complete bytes
     if(src_bpp != dst_bpp || (src_bpp & 7))
         return FALSE;
@@ -2027,13 +2063,6 @@ pixman_blt_neon (
     }
 
     return TRUE;
-
-#else /* USE_GCC_INLINE_ASM */
-
-    // TODO: intrinsic version for armcc
-    return FALSE;
-
-#endif
 }
 
 static pixman_bool_t
-- 
2.7.4
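
Note (below the mail signature, not part of the patch): the intrinsics fallback added above implies that QuadwordCopy_neon takes count in whole 16-byte quadwords and trailerCount in leftover bytes. The sketch that follows is only a hypothetical illustration of how a caller might split a scanline's byte length into that pair; copy_scanline is not a pixman function, and memcpy stands in for the NEON body so the sketch compiles on any toolchain.

/* Hypothetical illustration only -- not pixman API. */
#include <stdint.h>
#include <string.h>

static void copy_scanline(uint8_t *dst, const uint8_t *src, uint32_t bytes)
{
    uint32_t quadwords = bytes >> 4;  /* whole 16-byte units for the vector loop    */
    uint32_t trailer   = bytes & 15;  /* 0..15 bytes handled by the trailer path    */

    /* On a NEON build this would be:
     * QuadwordCopy_neon(dst, src, quadwords, trailer);
     * Portable stand-in so the sketch is self-contained: */
    memcpy(dst, src, ((size_t) quadwords << 4) + trailer);
}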