Add NEON optimizations for upscaling and map routines in evas.
author Yury Usischev <y.usishchev@samsung.com>
Fri, 2 Aug 2013 09:06:55 +0000 (18:06 +0900)
committer Carsten Haitzler (Rasterman) <raster@rasterman.com>
Fri, 2 Aug 2013 09:06:55 +0000 (18:06 +0900)
AUTHORS
ChangeLog
NEWS
src/lib/evas/common/evas_map_image_core.c
src/lib/evas/common/evas_map_image_loop.c
src/lib/evas/common/evas_scale_smooth.c
src/lib/evas/common/evas_scale_smooth_scaler_up.c
src/lib/evas/include/evas_blend_ops.h

diff --git a/AUTHORS b/AUTHORS
index a42f1e411e76d079bbd22a6b5008c6a0baa1f8ec..e0e0cecbe4d697e074a67df335d6dac5ddd97c9f 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -125,6 +125,7 @@ Patryk Kaczmarek <patryk.k@samsung.com>
 Zbigniew Kosinski <z.kosinski@samsung.com>
 Paulo Cavalcanti <paulo.cavalcanti@linux.intel.com>
 Jean-Philippe Andre <jp.andre@samsung.com>
+Yury Usischev <y.usishchev@samsung.com>
 
 
 Ecore
diff --git a/ChangeLog b/ChangeLog
index 4cd2a4f3648f7a323ce7a9a3711a5ba9cf586cdf..d45dab1af318ef359b52e06910e20b5bb3270e7e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2013-08-02  Yury Usischev
+
+        * Add neon optimizations for several scaling/map routines in evas
+
 2013-08-02  Cedric Bail
 
         * Evas: change mapping policy for image loader (RANDOM during header,
diff --git a/NEWS b/NEWS
index bbbdc067638b0bc5e3aaf3fbd0588cc31738ce50..243bf6d12aa2c1f0408abfbbb54fdb3bd77a0508 100644
--- a/NEWS
+++ b/NEWS
@@ -201,6 +201,7 @@ Improvements:
      - Use eo array of callbacks to reduce callbacks memory footprint of Evas_Object_Box and Evas_Object_Table.
      - Optimized path for when map use the same color for all corner.
      - Asynchronous preload of GL texture.
+     - Add neon assembly for upscaling and map routines
     * Ecore_Con:
      - Rebase dns.c against upstream
     * Edje:
diff --git a/src/lib/evas/common/evas_map_image_core.c b/src/lib/evas/common/evas_map_image_core.c
index 7e44c4b161635091c40f35061e5ffdb7f1a955b7..6e2be0e30ae510ae12910f6dea906044b5fe1c66 100644
--- a/src/lib/evas/common/evas_map_image_core.c
+++ b/src/lib/evas/common/evas_map_image_core.c
@@ -19,6 +19,9 @@
 #ifdef SCALE_USING_MMX
              pxor_r2r(mm0, mm0);
              MOV_A2R(ALPHA_255, mm5)
+#elif defined SCALE_USING_NEON
+             FPU_NEON;
+             VMOV_I2R_NEON(q2, #255);
 #endif
                
              line = &(spans[y - ystart]);
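Both paths in the hunk above preload an all-255 constant before entering the span loop: the MMX path keeps ALPHA_255 in mm5, while the NEON path uses VMOV_I2R_NEON(q2, #255) to replicate 255 into every 16-bit lane of q2, where the INTERP_256_NEON and MUL4_SYM_NEON macros added in evas_blend_ops.h below expect it as their mask and rounding constant.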
diff --git a/src/lib/evas/common/evas_map_image_loop.c b/src/lib/evas/common/evas_map_image_loop.c
index fc322860aa2fbc814280dc7998e15244f90cfc16..a8a49eb7f439311be9998df7948c0c7e719f9b2f 100644
--- a/src/lib/evas/common/evas_map_image_loop.c
+++ b/src/lib/evas/common/evas_map_image_loop.c
@@ -1,13 +1,27 @@
 #ifdef SMOOTH
 {
 # ifdef SCALE_USING_MMX
-#   ifdef COLMUL
-#    ifdef COLSAME
+#  ifdef COLMUL
+#   ifdef COLSAME
    MOV_P2R(c1, mm7, mm0); // col
-#    endif   
 #   endif   
 #  endif   
-   while (ww > 0)
+# endif
+# ifdef SCALE_USING_NEON
+#  ifdef COLMUL
+#   ifndef COLBLACK
+   // c1 and c2 are constant across the loop, so this setup can be done once here
+   FPU_NEON;
+   VMOV_M2R_NEON(d18, c1);
+   VEOR_NEON(q8);
+   VMOV_M2R_NEON(d19, c2);
+   VZIP_NEON(q9, q8);
+   VMOV_R2R_NEON(d19, d16);
+   // at this point c1 and c2 are spread across the q9 register
+#   endif
+#  endif
+# endif
+     while (ww > 0)
      {
 # ifdef COLBLACK
         *d = 0xff000000; // col
 #    endif        
 #   endif                            
         MOV_R2P(mm1, *d, mm0);
+#  elif defined SCALE_USING_NEON
+        // skip the NEON work when all four texels are zero; the result (*d = val1 = 0) is the same either way
+        if (val1 | val2 | val3 | val4)
+          {
+            FPU_NEON;
+#   ifdef COLMUL
+            // initialize alpha for interpolation of c1 and c2
+            VDUP_NEON(d15, cv >> 16);
+            // copy c1 and c2, since the interpolation below overwrites them
+            VMOV_R2R_NEON(q6, q9);
+            cv += cd; // col
+#   endif
+            VMOV_M2R_NEON(d8, val1);
+            VEOR_NEON(q0);
+            VMOV_M2R_NEON(d9, val3);
+            VMOV_M2R_NEON(d10, val2);
+            VEOR_NEON(q1);
+            VMOV_M2R_NEON(d11, val4);
+            VDUP_NEON(q3, ru);
+            VDUP_NEON(d14, rv);
+            VZIP_NEON(q4, q0);
+            VZIP_NEON(q5, q1);
+            VMOV_R2R_NEON(d9, d0);
+            VMOV_R2R_NEON(d11, d2);
+            // by this point all required data is in the right registers
+            INTERP_256_NEON(q3, q5, q4, q2); // first stage: interpolate val1,val2 and val3,val4
+            VSWP_NEON(d9, d12); // move the val3/val4 result (and c1 when COLMUL is set) into place for the next stage
+            INTERP_256_NEON(q7, q6, q4, q2); // second stage; c1 and c2 are interpolated here as well
+#   ifdef COLMUL
+            MUL4_SYM_NEON(d8, d9, d4); // modulate the texel by the interpolated color
+#   endif
+            VMOV_R2M_NEON(q4, d8, d); // save result to d
+          }
+        else
+          *d = val1;
 #  else
         val1 = INTERP_256(ru, val2, val1);
         val3 = INTERP_256(ru, val4, val3);
 }
 #else
 {
+# ifdef SCALE_USING_NEON
+#  ifdef COLMUL
+#   ifndef COLBLACK
+   // c1 and c2 are constant across the loop, so spread them once here
+   FPU_NEON;
+   VMOV_M2R_NEON(d10, c1);
+   VEOR_NEON(q0);
+   VMOV_M2R_NEON(d11, c2);
+   VZIP_NEON(q5, q0);
+   VMOV_R2R_NEON(d11, d0);
+#   endif
+#  endif
+# endif
    while (ww > 0)
      {
 # ifdef COLMUL
-#  ifndef COLBLACK        
+#  ifndef COLBLACK
         DATA32 val1;
 #   ifdef COLSAME
 #   else        
 #  ifdef COLMUL
         val1 = *s; // col
 #   ifdef COLSAME
+#    ifdef SCALE_USING_NEON
+        *d = MUL4_SYM(c1, val1); // XXX: do this in neon
+#    else
        *d = MUL4_SYM(c1, val1);
-#   else        
+#    endif
+#   else
+#    ifdef SCALE_USING_NEON
+        FPU_NEON;
+        VMOV_M2R_NEON(d12, val1);
+        VMOV_R2R_NEON(q4, q5);
+        VEOR_NEON(q1);
+        VDUP_NEON(d15, cv >> 16);
+        VZIP_NEON(q6, q1);
+        INTERP_256_NEON(d15, d9, d8, d4); // interpolate c1 and c2
+        MUL4_SYM_NEON(d8, d12, d4); // multiply
+        VMOV_R2M_NEON(q4, d8, d); // save result
+#    else
         cval = INTERP_256((cv >> 16), c2, c1); // col
         *d = MUL4_SYM(cval, val1);
         cv += cd; // col              
+#    endif
 #   endif        
 #  else
         *d = *s;
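Taken together, the SMOOTH-path NEON block above performs the usual two-stage bilinear blend, fused with the per-span color modulation when COLMUL is defined. A scalar sketch of the same per-pixel kernel, written with evas's existing INTERP_256/MUL4_SYM macros purely for illustration (variable names follow the loop above):

   /* illustration only -- the NEON block does these steps with two
    * INTERP_256_NEON calls plus one MUL4_SYM_NEON */
   DATA32 top, bot, texel;

   top   = INTERP_256(ru, val2, val1);    /* blend the upper texel pair  */
   bot   = INTERP_256(ru, val4, val3);    /* blend the lower texel pair  */
   texel = INTERP_256(rv, bot, top);      /* blend the two results       */
#ifdef COLMUL
   cval  = INTERP_256(cv >> 16, c2, c1);  /* interpolate the span colors */
   texel = MUL4_SYM(cval, texel);         /* modulate texel by the color */
   cv += cd;
#endif
   *d = texel;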
diff --git a/src/lib/evas/common/evas_scale_smooth.c b/src/lib/evas/common/evas_scale_smooth.c
index 02dbe7d44ddac5f137f1a927d4f91f7caa9825be..61bda22b0a4087653f1ca91722dc1d54199484c3 100644
--- a/src/lib/evas/common/evas_scale_smooth.c
+++ b/src/lib/evas/common/evas_scale_smooth.c
@@ -97,6 +97,15 @@ scale_calc_a_points(int *p, int s, int d, int c, int cc)
 # include "evas_scale_smooth_scaler.c"
 #endif
 
+#ifdef BUILD_NEON
+# undef SCALE_FUNC
+# undef SCALE_USING_NEON
+# define SCALE_USING_NEON
+# define SCALE_FUNC evas_common_scale_rgba_in_to_out_clip_smooth_neon
+# include "evas_scale_smooth_scaler.c"
+# undef SCALE_USING_NEON
+#endif
+
 #undef SCALE_FUNC
 #define SCALE_FUNC _evas_common_scale_rgba_in_to_out_clip_smooth_c
 #undef SCALE_USING_MMX
@@ -196,6 +205,11 @@ evas_common_scale_rgba_in_to_out_clip_smooth(RGBA_Image *src, RGBA_Image *dst,
    if (mmx)
      cb = evas_common_scale_rgba_in_to_out_clip_smooth_mmx;
    else
+#endif
+#ifdef BUILD_NEON
+     if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+       cb = evas_common_scale_rgba_in_to_out_clip_smooth_neon;
+   else
 #endif
      cb = evas_common_scale_rgba_in_to_out_clip_smooth_c;
 
@@ -222,6 +236,16 @@ evas_common_scale_rgba_smooth_draw(RGBA_Image *src, RGBA_Image *dst, int dst_cli
         src_region_x, src_region_y, src_region_w, src_region_h,
         dst_region_x, dst_region_y, dst_region_w, dst_region_h);
    else
+#endif
+#ifdef BUILD_NEON
+     if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+       _evas_common_scale_rgba_in_to_out_clip_smooth_neon
+     (src, dst,
+         dst_clip_x, dst_clip_y, dst_clip_w, dst_clip_h,
+         mul_col, render_op,
+         src_region_x, src_region_y, src_region_w, src_region_h,
+         dst_region_x, dst_region_y, dst_region_w, dst_region_h);
+   else
 #endif
      _evas_common_scale_rgba_in_to_out_clip_smooth_c
        (src, dst,
@@ -263,6 +287,15 @@ evas_common_scale_rgba_in_to_out_clip_smooth_do(const Cutout_Rects *reuse,
                                               dst_region_w, dst_region_h);
        else
 # endif
+#ifdef BUILD_NEON
+          if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+            evas_common_scale_rgba_in_to_out_clip_smooth_neon(src, dst, dc,
+                                                              src_region_x, src_region_y,
+                                                              src_region_w, src_region_h,
+                                                              dst_region_x, dst_region_y,
+                                                              dst_region_w, dst_region_h);
+        else
+#endif
          evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
                                                          src_region_x, src_region_y,
                                                          src_region_w, src_region_h,
@@ -287,7 +320,16 @@ evas_common_scale_rgba_in_to_out_clip_smooth_do(const Cutout_Rects *reuse,
                                               dst_region_w, dst_region_h);
        else
 # endif
-         evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
+#ifdef BUILD_NEON
+          if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+            evas_common_scale_rgba_in_to_out_clip_smooth_neon(src, dst, dc,
+                                                              src_region_x, src_region_y,
+                                                              src_region_w, src_region_h,
+                                                              dst_region_x, dst_region_y,
+                                                              dst_region_w, dst_region_h);
+        else
+#endif
+            evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
                                                          src_region_x, src_region_y,
                                                          src_region_w, src_region_h,
                                                          dst_region_x, dst_region_y,
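The dispatch changes above follow evas's standard ISA pattern: the scaler body lives in a template file (evas_scale_smooth_scaler.c) that is re-included once per instruction set with different macros defined, and the matching variant is selected at run time via evas_common_cpu_has_feature(). A minimal self-contained sketch of the same trick, with hypothetical names rather than the actual evas code:

   /* scale_template.c -- included several times with different macros */
   static void
   SCALE_FUNC(uint32_t *dst, const uint32_t *src, int n)
   {
      for (int i = 0; i < n; i++)
        {
   #ifdef SCALE_USING_NEON
           dst[i] = src[i]; /* a real build would use NEON asm/intrinsics */
   #else
           dst[i] = src[i]; /* plain C fallback */
   #endif
        }
   }

   /* scale.c -- instantiate one C and one NEON variant of the same body */
   #include <stdint.h>

   #define SCALE_FUNC scale_line_c
   #include "scale_template.c"
   #undef SCALE_FUNC

   #ifdef BUILD_NEON
   # define SCALE_USING_NEON
   # define SCALE_FUNC scale_line_neon
   # include "scale_template.c"
   # undef SCALE_FUNC
   # undef SCALE_USING_NEON
   #endif

   /* pick the variant once, based on runtime CPU features */
   static void (*scale_line)(uint32_t *, const uint32_t *, int);

   static void
   scale_init(void)
   {
      scale_line = scale_line_c;
   #ifdef BUILD_NEON
      if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
        scale_line = scale_line_neon;
   #endif
   }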
diff --git a/src/lib/evas/common/evas_scale_smooth_scaler_up.c b/src/lib/evas/common/evas_scale_smooth_scaler_up.c
index e43e0c7a6cad878a8ff0d50e0b89075a9f2960ea..4b21d598dd8d8eb3f939c5311f794cbcb2edc5bb 100644
--- a/src/lib/evas/common/evas_scale_smooth_scaler_up.c
+++ b/src/lib/evas/common/evas_scale_smooth_scaler_up.c
            MOV_A2R(ay, mm4)
            pxor_r2r(mm0, mm0);
            MOV_A2R(ALPHA_255, mm5)
+#elif defined SCALE_USING_NEON
+           FPU_NEON;
+           VDUP_NEON(d12, ay);
+           VMOV_I2R_NEON(q2, #255);
 #endif
            pbuf = buf;  pbuf_end = buf + dst_clip_w;
            sxx = sxx0;
                INTERP_256_R2R(mm4, mm2, mm1, mm5)
                MOV_R2P(mm1, *pbuf, mm0)
                pbuf++;
+#elif defined SCALE_USING_NEON
+               if (p0 | p1 | p2 | p3)
+                 {
+                   FPU_NEON;
+                   VMOV_M2R_NEON(d8, p0);
+                   VEOR_NEON(q0);
+                   VMOV_M2R_NEON(d9, p2);
+                   VMOV_M2R_NEON(d10, p1);
+                   VEOR_NEON(q1);
+                   VMOV_M2R_NEON(d11, p3);
+                   VDUP_NEON(q3, ax);
+                   VZIP_NEON(q4, q0);
+                   VZIP_NEON(q5, q1);
+                   VMOV_R2R_NEON(d9, d0);
+                   VMOV_R2R_NEON(d11, d2);
+                   INTERP_256_NEON(q3, q5, q4, q2);
+                   INTERP_256_NEON(d12, d9, d8, d5);
+                   VMOV_R2M_NEON(q4, d8, pbuf);
+                   pbuf++;
+                 }
+               else
+                 *pbuf++ = p0;
 #else
                if (p0 | p1)
                  p0 = INTERP_256(ax, p1, p0);
diff --git a/src/lib/evas/include/evas_blend_ops.h b/src/lib/evas/include/evas_blend_ops.h
index 0a78843579acd758f03fe2aa457ef6dafc65fc0b..3ae94379ec2bdfa2f0161d1d97df2d1cb7c23551 100644
--- a/src/lib/evas/include/evas_blend_ops.h
+++ b/src/lib/evas/include/evas_blend_ops.h
@@ -186,6 +186,64 @@ extern const DATA32 ALPHA_256;
 
 #endif
 
+/* some useful NEON macros */
+
+#ifdef BUILD_NEON
+#define FPU_NEON \
+       __asm__ __volatile__(".fpu neon \n\t");
+
+/* copy reg2 into reg1 (vmov dst, src) */
+#define VMOV_R2R_NEON(reg1, reg2) \
+       __asm__ __volatile__("vmov " #reg1 ", " #reg2 " \n\t" ::: #reg1);
+
+/* copy a 32-bit value into the low 32 bits of register reg */
+#define VMOV_M2R_NEON(reg, value) \
+       __asm__ __volatile__("vmov.32 " #reg "[0], %[val] \n\t" :: [val] "r" (value) : #reg); 
+
+/* narrow the 16-bit lanes of regq to bytes and store the resulting low */
+/* 32 bits to the memory location pointed to by pointer, using the 64-bit */
+/* register regd as a temporary */
+#define VMOV_R2M_NEON(regq, regd, pointer) \
+       __asm__ __volatile__("vqmovn.u16 " #regd ", " #regq " \n\t" \
+                            "vst1.32 {" #regd "[0]}, [%[p]] \n\t" :: [p] "r" (pointer) : #regd, "memory");
+
+/* replicate the 16-bit immediate imm across every lane of register reg */
+#define VMOV_I2R_NEON(reg, imm) \
+       __asm__ __volatile__("vmov.i16 " #reg ", " #imm " \n\t" ::: #reg);
+
+/* replicate a 16-bit value across every lane of register reg */
+#define VDUP_NEON(reg, value) \
+       __asm__ __volatile__("vdup.16 " #reg ", %[val] \n\t" :: [val] "r" (value) : #reg); 
+
+/* interleave contents of reg1 and reg2 */
+#define VZIP_NEON(reg1, reg2) \
+       __asm__ __volatile__("vzip.8 " #reg1 ", " #reg2 " \n\t" ::: #reg1 , #reg2);
+
+/* swap contents of two registers */
+#define VSWP_NEON(reg1, reg2) \
+       __asm__ __volatile__("vswp " #reg1 ", " #reg2 " \n\t" ::: #reg1 , #reg2);
+
+/* set register to zero */
+#define VEOR_NEON(reg) \
+       __asm__ __volatile__("veor " #reg ", " #reg ", " #reg " \n\t" ::: #reg);
+
+/* interpolate each RGBA channel of regy towards regx by factor rega */
+/* (0..256); reg255 must hold 255 in each lane; the result is left in regy */
+#define INTERP_256_NEON(rega, regx, regy, reg255) \
+       __asm__ __volatile__("vsub.i16 " #regx ", " #regx ", " #regy " \n\t" \
+                            "vmul.u16 " #regx ", " #regx ", " #rega " \n\t" \
+                            "vsri.16 " #regx ", " #regx ", #8 \n\t" \
+                            "vadd.i16 " #regx ", " #regx ", " #regy " \n\t" \
+                            "vand " #regy ", " #regx ", " #reg255 " \n\t" \
+                            ::: #regx, #regy );
+
+/* symmetric per-channel multiply of regx and regy: ((x * y + 255) >> 8); */
+/* reg255 must hold 255 in each lane; the result is left in regx */
+#define MUL4_SYM_NEON(regx, regy, reg255) \
+       __asm__ __volatile__("vmul.u16 " #regx ", " #regx ", " #regy " \n\t" \
+                            "vadd.i16 " #regx ", " #regx ", " #reg255 " \n\t" \
+                            "vsri.16 " #regx ", " #regx ", #8 \n\t" \
+                            "vand " #regx ", " #regx ", " #reg255 " \n\t" \
+                            ::: #regx );
+
+#endif
 
 /* some useful SSE3 inline functions */
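For reference, the two arithmetic macros above compute the following per 8-bit channel (each channel sits widened in a 16-bit lane, with reg255 holding 255 in every lane). The helper functions below mirror evas's existing INTERP_256/MUL4_SYM semantics and are illustrations, not part of the patch:

   #include <stdint.h>

   /* INTERP_256_NEON: blend y towards x by factor a (0..256); */
   /* result lane = (y + (((x - y) * a) >> 8)) & 0xff           */
   static inline uint8_t
   interp_256_lane(unsigned int a, uint8_t x, uint8_t y)
   {
      return (uint8_t)((y + ((((int)x - (int)y) * (int)a) >> 8)) & 0xff);
   }

   /* MUL4_SYM_NEON: symmetric multiply, so 255 * 255 maps back to 255; */
   /* result lane = ((x * y + 255) >> 8) & 0xff                         */
   static inline uint8_t
   mul4_sym_lane(uint8_t x, uint8_t y)
   {
      return (uint8_t)(((x * y + 255) >> 8) & 0xff);
   }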