From b1cb5922f785310ef790811b52e4e2b0c85dfccc Mon Sep 17 00:00:00 2001
From: Jonathan Morton
Date: Mon, 15 Jun 2009 16:09:32 +0300
Subject: [PATCH] Add RVCT support for straight blitter.

---
 pixman/pixman-arm-neon.c | 71 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 50 insertions(+), 21 deletions(-)

diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 5fd82ab..467a0dd 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -1837,7 +1837,6 @@ pixman_fill_neon (uint32_t *bits,
 #endif
 }
 
-#ifdef USE_GCC_INLINE_ASM
 // TODO: is there a more generic way of doing this being introduced?
 #define NEON_SCANLINE_BUFFER_PIXELS (1024)
 
@@ -1849,11 +1848,16 @@ static inline void QuadwordCopy_neon(
     uint32_t trailerCount  // of bytes
 )
 {
+    uint8_t *tDst = dst, *tSrc = src;
+
     // Uses aligned multi-register loads to maximise read bandwidth
     // on uncached memory such as framebuffers
     // The accesses do not have the aligned qualifiers, so that the copy
     // may convert between aligned-uncached and unaligned-cached memory.
     // It is assumed that the CPU can infer alignedness from the address.
+
+#ifdef USE_GCC_INLINE_ASM
+
     asm volatile (
     "        cmp        %[count], #8                         \n"
     "        blt        1f    @ skip oversized fragments     \n"
@@ -1889,7 +1893,7 @@ static inline void QuadwordCopy_neon(
     "4:      @ end                                           \n"
 
     // Clobbered input registers marked as input/outputs
-    : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
+    : [dst] "+r" (tDst), [src] "+r" (tSrc), [count] "+r" (count)
 
     // No unclobbered inputs
     :
@@ -1899,32 +1903,67 @@ static inline void QuadwordCopy_neon(
     : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
     );
 
+#else
+
+    while(count >= 8) {
+        uint8x16x4_t t1 = vld4q_u8(tSrc);
+        uint8x16x4_t t2 = vld4q_u8(tSrc + sizeof(uint8x16x4_t));
+        tSrc += sizeof(uint8x16x4_t) * 2;
+        vst4q_u8(tDst, t1);
+        vst4q_u8(tDst + sizeof(uint8x16x4_t), t2);
+        tDst += sizeof(uint8x16x4_t) * 2;
+        count -= 8;
+    }
+
+    if(count & 4) {
+        uint8x16x4_t t1 = vld4q_u8(tSrc);
+        tSrc += sizeof(uint8x16x4_t);
+        vst4q_u8(tDst, t1);
+        tDst += sizeof(uint8x16x4_t);
+    }
+
+    if(count & 2) {
+        uint8x8x4_t t1 = vld4_u8(tSrc);
+        tSrc += sizeof(uint8x8x4_t);
+        vst4_u8(tDst, t1);
+        tDst += sizeof(uint8x8x4_t);
+    }
+
+    if(count & 1) {
+        uint8x16_t t1 = vld1q_u8(tSrc);
+        tSrc += sizeof(uint8x16_t);
+        vst1q_u8(tDst, t1);
+        tDst += sizeof(uint8x16_t);
+    }
+
+#endif  // !USE_GCC_INLINE_ASM
+
     if(trailerCount) {
-        uint8_t *tDst = dst, *tSrc = src;
+        if(trailerCount & 8) {
+            uint8x8_t t1 = vld1_u8(tSrc);
+            tSrc += sizeof(uint8x8_t);
+            vst1_u8(tDst, t1);
+            tDst += sizeof(uint8x8_t);
+        }
 
-        while(trailerCount >= 4) {
+        if(trailerCount & 4) {
             *((uint32_t*) tDst) = *((uint32_t*) tSrc);
             tDst += 4;
             tSrc += 4;
-            trailerCount -= 4;
         }
 
-        if(trailerCount >= 2) {
+        if(trailerCount & 2) {
             *((uint16_t*) tDst) = *((uint16_t*) tSrc);
             tDst += 2;
             tSrc += 2;
-            trailerCount -= 2;
         }
 
-        if(trailerCount) {
+        if(trailerCount & 1) {
             *tDst++ = *tSrc++;
-            trailerCount--;
         }
     }
 }
 
-#endif // USE_GCC_INLINE_ASM
-
 static const FastPathInfo arm_neon_fast_path_array[] =
 {
     { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeSrcAdd_8888x8x8neon, 0 },
@@ -1999,12 +2038,9 @@ pixman_blt_neon (
     int dst_x, int dst_y,
     int width, int height)
 {
-
     if(!width || !height)
         return TRUE;
 
-#ifdef USE_GCC_INLINE_ASM
-
     // accelerate only straight copies involving complete bytes
     if(src_bpp != dst_bpp || (src_bpp & 7))
         return FALSE;
@@ -2027,13 +2063,6 @@ pixman_blt_neon (
     }
 
     return TRUE;
-
-#else /* USE_GCC_INLINE_ASM */
-
-    // TODO: intrinsic version for armcc
-    return FALSE;
-
-#endif
 }
 
 static pixman_bool_t
-- 
2.7.4
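
Note (below the mail signature, not part of the patch): the intrinsics fallback added above implies that QuadwordCopy_neon takes count in whole 16-byte quadwords and trailerCount in leftover bytes. The sketch that follows is only a hypothetical illustration of how a caller might split a scanline's byte length into that pair; copy_scanline is not a pixman function, and memcpy stands in for the NEON body so the sketch compiles on any toolchain.

/* Hypothetical illustration only -- not pixman API. */
#include <stdint.h>
#include <string.h>

static void copy_scanline(uint8_t *dst, const uint8_t *src, uint32_t bytes)
{
    uint32_t quadwords = bytes >> 4;  /* whole 16-byte units for the vector loop    */
    uint32_t trailer   = bytes & 15;  /* 0..15 bytes handled by the trailer path    */

    /* On a NEON build this would be:
     * QuadwordCopy_neon(dst, src, quadwords, trailer);
     * Portable stand-in so the sketch is self-contained: */
    memcpy(dst, src, ((size_t) quadwords << 4) + trailer);
}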