From: raster Date: Wed, 26 Jan 2011 08:45:11 +0000 (+0000) Subject: well neon asm was wrongly documented - and wrong before anyway. it's X-Git-Tag: 2.0_alpha~240^2~1261 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d03185f8ee8fab6bef7e7be3475f55937c34aab3;p=framework%2Fuifw%2Fevas.git well neon asm was wrongly documented - and wrong before anyway. it's disabled for now and uses memcpy. bizarre stuff occurs if i try use it though. git-svn-id: svn+ssh://svn.enlightenment.org/var/svn/e/trunk/evas@56304 7cbeb6ba-43b4-40fd-8cce-4c39aea84d33 --- diff --git a/src/lib/engines/common/evas_op_copy/op_copy_pixel_neon.c b/src/lib/engines/common/evas_op_copy/op_copy_pixel_neon.c index e4afbbb..5b8bd60 100644 --- a/src/lib/engines/common/evas_op_copy/op_copy_pixel_neon.c +++ b/src/lib/engines/common/evas_op_copy/op_copy_pixel_neon.c @@ -3,47 +3,47 @@ #ifdef BUILD_NEON static void _op_copy_p_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c __UNUSED__, DATA32 *d, int l) { +//#define USENEON 1 +#ifndef USENEON + memcpy(d, s, l * sizeof(DATA32)); + return; +#else DATA32 *e; -// if (((unsigned long)s & 0xf) || ((unsigned long)d & 0xf)) -// { - memcpy(d, s, l * sizeof(DATA32)); -// return; -// } -/* e = d + l - 23; if (e > d) { int dl; - asm volatile ( - ".fpu neon \n\t" - "asmloop2: \n\t" - "cmp %[e], %[d] \n\t" // compare current and end ptr - "pld [%[s], #64] \n\t" // preload 64 bytes ahead - "pld [%[s], #256] \n\t" // preload 256 bytes ahead - "pld [%[s], #320] \n\t" // preload 320 bytes ahead - "vld1.64 {d0-d3}, [%[s]]! \n\t" // load 256bits (32 bytes 8 pix) - "vld1.64 {d4-d7} , [%[s]]! \n\t" // load 256bits (32 bytes 8 pix) - "vld1.64 {d8-d11}, [%[s]]! \n\t" // load 256bits (32 bytes 8 pix) - "vst1.64 {d0-d3}, [%[d]]! \n\t" // store 256bits (32 bytes 8 pix) - "vst1.64 {d4-d7}, [%[d]]! \n\t" // store 256bits (32 bytes 8 pix) - "vst1.64 {d8-d11}, [%[d]]! 
\n\t" // store 256bits (32 bytes 8 pix) - "bhi asmloop2 \n\t" - : // output regs - : [s] "r" (s), [e] "r" (e), [d] "r" (d) // input - : "q0", "q1", "q2", "q3", "q4", "q5", - "d0", "d1", "d2", "d3", "d4", "d5", - "d6", "d7", "d8", "d9", "d10", "d11", - "memory" // clobbered - ); - e = d + l; - dl = l - (l % 24); - s = s + dl; - d = d + dl; + + asm volatile + (".fpu neon \n\t" + "_op_copy_p_dp_neon_asmloop: \n\t" + "pld [%[s], #192] \n\t" // preload 192 bytes ahead + "pld [%[s], #320] \n\t" // preload 320 bytes ahead + "vld1.32 {d0-d3}, [%[s]]! \n\t" // load 256bits (32 bytes 8 pix), 32bit aligned + "vld1.32 {d4-d7} , [%[s]]! \n\t" // load 256bits (32 bytes 8 pix), 32bit aligned + "vld1.32 {d8-d11}, [%[s]]! \n\t" // load 256bits (32 bytes 8 pix), 32bit aligned + "vst1.32 {d0-d3}, [%[d]]! \n\t" // store 256bits (32 bytes 8 pix), 32bit aligned + "vst1.32 {d4-d7}, [%[d]]! \n\t" // store 256bits (32 bytes 8 pix), 32bit aligned + "vst1.32 {d8-d11}, [%[d]]! \n\t" // store 256bits (32 bytes 8 pix), 32bit aligned + "cmp %[e], %[d] \n\t" // compare current and end ptr + "bgt _op_copy_p_dp_neon_asmloop \n\t" + : /*out*/ + : /*in */ [s] "r" (s), [e] "r" (e), [d] "r" (d) + : /*clobber*/ + "q0", "q1", "q2","q3", "q4", "q5", "q6", + "d0", "d1", "d2", "d3", + "d4", "d5", "d6", "d7", + "d8", "d9", "d10", "d11", + "memory" // clobbered + ); + dl = l % 24; // dl is how many pixels are left over at the end (l modulo 24) + l = l - dl; // jump to there at the end of the run? + s = s + l; + d = d + l; } - for (; d < e; d++, s++) { - *d = *s; - } - */ + e += 23; + for (;d < e; d++, s++) *d = *s; +#endif } #define _op_copy_pan_dp_neon _op_copy_p_dp_neon