Zbigniew Kosinski <z.kosinski@samsung.com>
Paulo Cavalcanti <paulo.cavalcanti@linux.intel.com>
Jean-Philippe Andre <jp.andre@samsung.com>
+Yury Usischev <y.usishchev@samsung.com>
Ecore
+2013-08-02 Yury Usischev
+
+ * Evas: Add NEON optimizations for several scaling/map routines.
+
2013-08-02 Cedric Bail
* Evas: change mapping policy for image loader (RANDOM during header,
- Use Eo array of callbacks to reduce the callback memory footprint of Evas_Object_Box and Evas_Object_Table.
- Optimized path for when the map uses the same color for all corners.
- Asynchronous preload of GL texture.
+ - Add NEON assembly for upscaling and map routines.
* Ecore_Con:
- Rebase dns.c against upstream
* Edje:
#ifdef SCALE_USING_MMX
pxor_r2r(mm0, mm0);
MOV_A2R(ALPHA_255, mm5)
+#elif defined SCALE_USING_NEON
+ FPU_NEON;
+ VMOV_I2R_NEON(q2, #255);
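+ // q2 now holds 255 in every 16-bit lane; it is used as the reg255
+ // constant by the NEON blend macros below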
#endif
line = &(spans[y - ystart]);
#ifdef SMOOTH
{
# ifdef SCALE_USING_MMX
-# ifdef COLMUL
-# ifdef COLSAME
+# ifdef COLMUL
+# ifdef COLSAME
MOV_P2R(c1, mm7, mm0); // col
-# endif
# endif
# endif
- while (ww > 0)
+# endif
+# ifdef SCALE_USING_NEON
+# ifdef COLMUL
+# ifndef COLBLACK
+ // this can be done here because c1 and c2 are constant across the loop
+ FPU_NEON;
+ VMOV_M2R_NEON(d18, c1);
+ VEOR_NEON(q8);
+ VMOV_M2R_NEON(d19, c2);
+ VZIP_NEON(q9, q8);
+ VMOV_R2R_NEON(d19, d16);
+ // at this point q9 holds c1 and c2, each byte widened to a 16-bit lane (d18 = c1, d19 = c2)
+# endif
+# endif
+# endif
+ while (ww > 0)
{
# ifdef COLBLACK
*d = 0xff000000; // col
# endif
# endif
MOV_R2P(mm1, *d, mm0);
+# elif defined SCALE_USING_NEON
+ // skip the NEON work when all four texels are zero; the check is not
+ // strictly required, but the result is the same either way
+ if (val1 | val2 | val3 | val4)
+ {
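+ // bilinear blend of the four texels: val1/val2 and val3/val4 are
+ // interpolated by ru, then the two results are interpolated by rv;
+ // with COLMUL the colors c1 and c2 are interpolated as well and
+ // multiplied into the result, matching the scalar fallback below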
+ FPU_NEON;
+# ifdef COLMUL
+ // initialize alpha for interpolation of c1 and c2
+ VDUP_NEON(d15, cv >> 16);
+ // copy c1 and c2 since the interpolation below overwrites them
+ VMOV_R2R_NEON(q6, q9);
+ cv += cd; // col
+# endif
+ VMOV_M2R_NEON(d8, val1);
+ VEOR_NEON(q0);
+ VMOV_M2R_NEON(d9, val3);
+ VMOV_M2R_NEON(d10, val2);
+ VEOR_NEON(q1);
+ VMOV_M2R_NEON(d11, val4);
+ VDUP_NEON(q3, ru);
+ VDUP_NEON(d14, rv);
+ VZIP_NEON(q4, q0);
+ VZIP_NEON(q5, q1);
+ VMOV_R2R_NEON(d9, d0);
+ VMOV_R2R_NEON(d11, d2);
+ // by this point all required data is in the right registers
+ INTERP_256_NEON(q3, q5, q4, q2); // interpolate val1,val2 and val3,val4
+ VSWP_NEON(d9, d12); // move result of val3,val4 interpolation (and c1 if COLMUL is defined) for next step
+ INTERP_256_NEON(q7, q6, q4, q2); // second stage of interpolation, also here c1 and c2 are interpolated
+# ifdef COLMUL
+ MUL4_SYM_NEON(d8, d9, d4); // multiply the interpolated texel (d8) by the interpolated color (d9)
+# endif
+ VMOV_R2M_NEON(q4, d8, d); // save result to d
+ }
+ else
+ *d = val1;
# else
val1 = INTERP_256(ru, val2, val1);
val3 = INTERP_256(ru, val4, val3);
}
#else
{
+# ifdef SCALE_USING_NEON
+# ifdef COLMUL
+# ifndef COLBLACK
+ // c1 and c2 are constant across the loop
+ FPU_NEON;
+ VMOV_M2R_NEON(d10, c1);
+ VEOR_NEON(q0);
+ VMOV_M2R_NEON(d11, c2);
+ VZIP_NEON(q5, q0);
+ VMOV_R2R_NEON(d11, d0);
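+ // q5 now holds c1 (d10) and c2 (d11), each byte widened to a 16-bit lane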
+# endif
+# endif
+# endif
while (ww > 0)
{
# ifdef COLMUL
-# ifndef COLBLACK
+# ifndef COLBLACK
DATA32 val1;
# ifdef COLSAME
# else
# ifdef COLMUL
val1 = *s; // col
# ifdef COLSAME
+# ifdef SCALE_USING_NEON
*d = MUL4_SYM(c1, val1);
-# else
+# else
+ *d = MUL4_SYM(c1, val1); // XXX: do this in neon
+# endif
+# else
+# ifdef SCALE_USING_NEON
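+ // NEON counterpart of the scalar fallback below: interpolate c1 and c2
+ // by cv, then multiply the result with the source texel val1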
+ FPU_NEON;
+ VMOV_M2R_NEON(d12, val1);
+ VMOV_R2R_NEON(q4, q5);
+ VEOR_NEON(q1);
+ VDUP_NEON(d15, cv >> 16);
+ VZIP_NEON(q6, q1);
+ INTERP_256_NEON(d15, d9, d8, d4); // interpolate c1 and c2
+ MUL4_SYM_NEON(d8, d12, d4); // multiply
+ VMOV_R2M_NEON(q4, d8, d); // save result
+# else
cval = INTERP_256((cv >> 16), c2, c1); // col
*d = MUL4_SYM(cval, val1);
cv += cd; // col
+# endif
# endif
# else
*d = *s;
# include "evas_scale_smooth_scaler.c"
#endif
+#ifdef BUILD_NEON
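+/* re-include the generic scaler with SCALE_USING_NEON defined to generate */
+/* the NEON variant of the smooth scaler */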
+# undef SCALE_FUNC
+# undef SCALE_USING_NEON
+# define SCALE_USING_NEON
+# define SCALE_FUNC evas_common_scale_rgba_in_to_out_clip_smooth_neon
+# include "evas_scale_smooth_scaler.c"
+# undef SCALE_USING_NEON
+#endif
+
#undef SCALE_FUNC
#define SCALE_FUNC _evas_common_scale_rgba_in_to_out_clip_smooth_c
#undef SCALE_USING_MMX
if (mmx)
cb = evas_common_scale_rgba_in_to_out_clip_smooth_mmx;
else
+#endif
+#ifdef BUILD_NEON
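+ /* pick the NEON scaler only when the CPU reports NEON support at runtime */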
+ if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+ cb = evas_common_scale_rgba_in_to_out_clip_smooth_neon;
+ else
#endif
cb = evas_common_scale_rgba_in_to_out_clip_smooth_c;
src_region_x, src_region_y, src_region_w, src_region_h,
dst_region_x, dst_region_y, dst_region_w, dst_region_h);
else
+#endif
+#ifdef BUILD_NEON
+ if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+ _evas_common_scale_rgba_in_to_out_clip_smooth_neon
+ (src, dst,
+ dst_clip_x, dst_clip_y, dst_clip_w, dst_clip_h,
+ mul_col, render_op,
+ src_region_x, src_region_y, src_region_w, src_region_h,
+ dst_region_x, dst_region_y, dst_region_w, dst_region_h);
+ else
#endif
_evas_common_scale_rgba_in_to_out_clip_smooth_c
(src, dst,
dst_region_w, dst_region_h);
else
# endif
+#ifdef BUILD_NEON
+ if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+ evas_common_scale_rgba_in_to_out_clip_smooth_neon(src, dst, dc,
+ src_region_x, src_region_y,
+ src_region_w, src_region_h,
+ dst_region_x, dst_region_y,
+ dst_region_w, dst_region_h);
+ else
+#endif
evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
src_region_x, src_region_y,
src_region_w, src_region_h,
dst_region_w, dst_region_h);
else
# endif
- evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
+#ifdef BUILD_NEON
+ if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+ evas_common_scale_rgba_in_to_out_clip_smooth_neon(src, dst, dc,
+ src_region_x, src_region_y,
+ src_region_w, src_region_h,
+ dst_region_x, dst_region_y,
+ dst_region_w, dst_region_h);
+ else
+#endif
+ evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
src_region_x, src_region_y,
src_region_w, src_region_h,
dst_region_x, dst_region_y,
MOV_A2R(ay, mm4)
pxor_r2r(mm0, mm0);
MOV_A2R(ALPHA_255, mm5)
+#elif defined SCALE_USING_NEON
+ FPU_NEON;
+ VDUP_NEON(d12, ay);
+ VMOV_I2R_NEON(q2, #255);
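+ // d12 holds ay in every 16-bit lane and q2 holds 255; both are
+ // reused by the NEON blend inside the loop below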
#endif
pbuf = buf; pbuf_end = buf + dst_clip_w;
sxx = sxx0;
INTERP_256_R2R(mm4, mm2, mm1, mm5)
MOV_R2P(mm1, *pbuf, mm0)
pbuf++;
+#elif defined SCALE_USING_NEON
+ if (p0 | p1 | p2 | p3)
+ {
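+ // bilinear blend: p0/p1 and p2/p3 are interpolated by ax, then the
+ // two results are interpolated by ay, as in the scalar path below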
+ FPU_NEON;
+ VMOV_M2R_NEON(d8, p0);
+ VEOR_NEON(q0);
+ VMOV_M2R_NEON(d9, p2);
+ VMOV_M2R_NEON(d10, p1);
+ VEOR_NEON(q1);
+ VMOV_M2R_NEON(d11, p3);
+ VDUP_NEON(q3, ax);
+ VZIP_NEON(q4, q0);
+ VZIP_NEON(q5, q1);
+ VMOV_R2R_NEON(d9, d0);
+ VMOV_R2R_NEON(d11, d2);
+ INTERP_256_NEON(q3, q5, q4, q2);
+ INTERP_256_NEON(d12, d9, d8, d5);
+ VMOV_R2M_NEON(q4, d8, pbuf);
+ pbuf++;
+ }
+ else
+ *pbuf++ = p0;
#else
if (p0 | p1)
p0 = INTERP_256(ax, p1, p0);
#endif
+/* some useful NEON macros */
+
+#ifdef BUILD_NEON
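+/* make the assembler accept NEON mnemonics in the inline asm that follows */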
+#define FPU_NEON \
+ __asm__ __volatile__(".fpu neon \n\t");
+
+/* copy the contents of reg2 into reg1 (the destination operand comes first) */
+#define VMOV_R2R_NEON(reg1, reg2) \
+ __asm__ __volatile__("vmov " #reg1 ", " #reg2 " \n\t" ::: #reg1);
+
+/* copy a 32bit value into the low 32 bits of register reg */
+#define VMOV_M2R_NEON(reg, value) \
+ __asm__ __volatile__("vmov.32 " #reg "[0], %[val] \n\t" :: [val] "r" (value) : #reg);
+
+/* narrow the eight 16-bit lanes of regq to bytes in regd (vqmovn.u16), then */
+/* store the low 32 bits (one pixel) to the memory location pointed to by pointer */
+#define VMOV_R2M_NEON(regq, regd, pointer) \
+ __asm__ __volatile__("vqmovn.u16 " #regd ", " #regq " \n\t" \
+ "vst1.32 {" #regd "[0]}, [%[p]] \n\t" :: [p] "r" (pointer) : #regd, "memory");
+
+/* replicate the immediate imm into every 16-bit lane of register reg */
+#define VMOV_I2R_NEON(reg, imm) \
+ __asm__ __volatile__("vmov.i16 " #reg ", " #imm " \n\t" ::: #reg);
+
+/* duplicate the low 16 bits of value into every 16-bit lane of register reg */
+#define VDUP_NEON(reg, value) \
+ __asm__ __volatile__("vdup.16 " #reg ", %[val] \n\t" :: [val] "r" (value) : #reg);
+
+/* interleave the bytes of reg1 and reg2 */
+#define VZIP_NEON(reg1, reg2) \
+ __asm__ __volatile__("vzip.8 " #reg1 ", " #reg2 " \n\t" ::: #reg1 , #reg2);
+
+/* swap contents of two registers */
+#define VSWP_NEON(reg1, reg2) \
+ __asm__ __volatile__("vswp " #reg1 ", " #reg2 " \n\t" ::: #reg1 , #reg2);
+
+/* set register to zero */
+#define VEOR_NEON(reg) \
+ __asm__ __volatile__("veor " #reg ", " #reg ", " #reg " \n\t" ::: #reg);
+
+/* interpolate every RGBA channel between regy and regx with weight rega; the result is left in regy */
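+/* per 8-bit channel this computes regy = regy + (((regx - regy) * rega) >> 8), */
+/* masked with reg255; this matches the scalar INTERP_256 blend */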
+#define INTERP_256_NEON(rega, regx, regy, reg255) \
+ __asm__ __volatile__("vsub.i16 " #regx ", " #regx ", " #regy " \n\t" \
+ "vmul.u16 " #regx ", " #regx ", " #rega " \n\t" \
+ "vsri.16 " #regx ", " #regx ", #8 \n\t" \
+ "vadd.i16 " #regx ", " #regx ", " #regy " \n\t" \
+ "vand " #regy ", " #regx ", " #reg255 " \n\t" \
+ ::: #regx, #regy );
+
+/* multiply every RGBA channel of regx and regy; the result is left in regx */
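+/* per 8-bit channel this computes regx = ((regx * regy + 255) >> 8) & 0xff, */
+/* matching the scalar MUL4_SYM multiply; reg255 must hold 255 in each 16-bit lane */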
+#define MUL4_SYM_NEON(regx, regy, reg255) \
+ __asm__ __volatile__("vmul.u16 " #regx ", " #regx ", " #regy " \n\t" \
+ "vadd.i16 " #regx ", " #regx ", " #reg255 " \n\t" \
+ "vsri.16 " #regx ", " #regx ", #8 \n\t" \
+ "vand " #regx ", " #regx ", " #reg255 " \n\t" \
+ ::: #regx );
+
+#endif
/* some useful SSE3 inline functions */