Add NEON optimizations for upscaling and map routines in evas.
author Yury Usischev <y.usishchev@samsung.com>
Fri, 2 Aug 2013 09:06:55 +0000 (18:06 +0900)
committer Carsten Haitzler (Rasterman) <raster@rasterman.com>
Fri, 2 Aug 2013 09:06:55 +0000 (18:06 +0900)
AUTHORS
ChangeLog
NEWS
src/lib/evas/common/evas_map_image_core.c
src/lib/evas/common/evas_map_image_loop.c
src/lib/evas/common/evas_scale_smooth.c
src/lib/evas/common/evas_scale_smooth_scaler_up.c
src/lib/evas/include/evas_blend_ops.h

diff --git a/AUTHORS b/AUTHORS
index a42f1e411e76d079bbd22a6b5008c6a0baa1f8ec..e0e0cecbe4d697e074a67df335d6dac5ddd97c9f 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -125,6 +125,7 @@ Patryk Kaczmarek <patryk.k@samsung.com>
 Zbigniew Kosinski <z.kosinski@samsung.com>
 Paulo Cavalcanti <paulo.cavalcanti@linux.intel.com>
 Jean-Philippe Andre <jp.andre@samsung.com>
+Yury Usischev <y.usishchev@samsung.com>
 
 
 Ecore
diff --git a/ChangeLog b/ChangeLog
index 4cd2a4f3648f7a323ce7a9a3711a5ba9cf586cdf..d45dab1af318ef359b52e06910e20b5bb3270e7e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2013-08-02  Yury Usischev
+
+        * Add neon optimizations for several scaling/map routines in evas
+
 2013-08-02  Cedric Bail
 
         * Evas: change mapping policy for image loader (RANDOM during header,
diff --git a/NEWS b/NEWS
index bbbdc067638b0bc5e3aaf3fbd0588cc31738ce50..243bf6d12aa2c1f0408abfbbb54fdb3bd77a0508 100644
--- a/NEWS
+++ b/NEWS
@@ -201,6 +201,7 @@ Improvements:
      - Use eo array of callbacks to reduce callbacks memory footprint of Evas_Object_Box and Evas_Object_Table.
      - Optimized path for when map use the same color for all corner.
      - Asynchronous preload of GL texture.
+     - Add neon assembly for upscaling and map routines
     * Ecore_Con:
      - Rebase dns.c against upstream
     * Edje:
diff --git a/src/lib/evas/common/evas_map_image_core.c b/src/lib/evas/common/evas_map_image_core.c
index 7e44c4b161635091c40f35061e5ffdb7f1a955b7..6e2be0e30ae510ae12910f6dea906044b5fe1c66 100644
--- a/src/lib/evas/common/evas_map_image_core.c
+++ b/src/lib/evas/common/evas_map_image_core.c
@@ -19,6 +19,9 @@
 #ifdef SCALE_USING_MMX
              pxor_r2r(mm0, mm0);
              MOV_A2R(ALPHA_255, mm5)
+#elif defined SCALE_USING_NEON
+             FPU_NEON;
+             VMOV_I2R_NEON(q2, #255);
 #endif
                
              line = &(spans[y - ystart]);
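Both paths in the hunk above preload an all-255 constant before entering the span loop: the MMX path keeps ALPHA_255 in mm5, while the NEON path uses VMOV_I2R_NEON(q2, #255) to replicate 255 into every 16-bit lane of q2, where the INTERP_256_NEON and MUL4_SYM_NEON macros added in evas_blend_ops.h below expect it as their mask and rounding constant.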
diff --git a/src/lib/evas/common/evas_map_image_loop.c b/src/lib/evas/common/evas_map_image_loop.c
index fc322860aa2fbc814280dc7998e15244f90cfc16..a8a49eb7f439311be9998df7948c0c7e719f9b2f 100644
--- a/src/lib/evas/common/evas_map_image_loop.c
+++ b/src/lib/evas/common/evas_map_image_loop.c
@@ -1,13 +1,27 @@
 #ifdef SMOOTH
 {
 # ifdef SCALE_USING_MMX
-#   ifdef COLMUL
-#    ifdef COLSAME
+#  ifdef COLMUL
+#   ifdef COLSAME
    MOV_P2R(c1, mm7, mm0); // col
-#    endif   
 #   endif   
 #  endif   
-   while (ww > 0)
+# endif
+# ifdef SCALE_USING_NEON
+#  ifdef COLMUL
+#   ifndef COLBLACK
+   // c1 and c2 are constant across the loop, so this setup can be done once here
+   FPU_NEON;
+   VMOV_M2R_NEON(d18, c1);
+   VEOR_NEON(q8);
+   VMOV_M2R_NEON(d19, c2);
+   VZIP_NEON(q9, q8);
+   VMOV_R2R_NEON(d19, d16);
+   // at this point c1 and c2 are spread across the q9 register
+#   endif
+#  endif
+# endif
+     while (ww > 0)
      {
 # ifdef COLBLACK
         *d = 0xff000000; // col
 #    endif        
 #   endif                            
         MOV_R2P(mm1, *d, mm0);
+#  elif defined SCALE_USING_NEON
+        // skip the NEON work when all four texels are zero; the result (*d = val1 = 0) is the same either way
+        if (val1 | val2 | val3 | val4)
+          {
+            FPU_NEON;
+#   ifdef COLMUL
+            // initialize alpha for interpolation of c1 and c2
+            VDUP_NEON(d15, cv >> 16);
+            // copy c1 and c2, since the interpolation below overwrites them
+            VMOV_R2R_NEON(q6, q9);
+            cv += cd; // col
+#   endif
+            VMOV_M2R_NEON(d8, val1);
+            VEOR_NEON(q0);
+            VMOV_M2R_NEON(d9, val3);
+            VMOV_M2R_NEON(d10, val2);
+            VEOR_NEON(q1);
+            VMOV_M2R_NEON(d11, val4);
+            VDUP_NEON(q3, ru);
+            VDUP_NEON(d14, rv);
+            VZIP_NEON(q4, q0);
+            VZIP_NEON(q5, q1);
+            VMOV_R2R_NEON(d9, d0);
+            VMOV_R2R_NEON(d11, d2);
+            // by this point all required data is in the right registers
+            INTERP_256_NEON(q3, q5, q4, q2); // first stage: interpolate val1,val2 and val3,val4
+            VSWP_NEON(d9, d12); // move the val3/val4 result (and c1 when COLMUL is set) into place for the next stage
+            INTERP_256_NEON(q7, q6, q4, q2); // second stage; c1 and c2 are interpolated here as well
+#   ifdef COLMUL
+            MUL4_SYM_NEON(d8, d9, d4); // modulate the texel by the interpolated color
+#   endif
+            VMOV_R2M_NEON(q4, d8, d); // save result to d
+          }
+        else
+          *d = val1;
 #  else
         val1 = INTERP_256(ru, val2, val1);
         val3 = INTERP_256(ru, val4, val3);
 }
 #else
 {
+# ifdef SCALE_USING_NEON
+#  ifdef COLMUL
+#   ifndef COLBLACK
+   // c1 and c2 are constant across the loop, so spread them once here
+   FPU_NEON;
+   VMOV_M2R_NEON(d10, c1);
+   VEOR_NEON(q0);
+   VMOV_M2R_NEON(d11, c2);
+   VZIP_NEON(q5, q0);
+   VMOV_R2R_NEON(d11, d0);
+#   endif
+#  endif
+# endif
    while (ww > 0)
      {
 # ifdef COLMUL
-#  ifndef COLBLACK        
+#  ifndef COLBLACK
         DATA32 val1;
 #   ifdef COLSAME
 #   else        
 #  ifdef COLMUL
         val1 = *s; // col
 #   ifdef COLSAME
+#    ifdef SCALE_USING_NEON
+        *d = MUL4_SYM(c1, val1); // XXX: do this in neon
+#    else
        *d = MUL4_SYM(c1, val1);
-#   else        
+#    endif
+#   else
+#    ifdef SCALE_USING_NEON
+        FPU_NEON;
+        VMOV_M2R_NEON(d12, val1);
+        VMOV_R2R_NEON(q4, q5);
+        VEOR_NEON(q1);
+        VDUP_NEON(d15, cv >> 16);
+        VZIP_NEON(q6, q1);
+        INTERP_256_NEON(d15, d9, d8, d4); // interpolate c1 and c2
+        MUL4_SYM_NEON(d8, d12, d4); // multiply
+        VMOV_R2M_NEON(q4, d8, d); // save result
+#    else
         cval = INTERP_256((cv >> 16), c2, c1); // col
         *d = MUL4_SYM(cval, val1);
         cv += cd; // col              
+#    endif
 #   endif        
 #  else
         *d = *s;
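Taken together, the SMOOTH-path NEON block above performs the usual two-stage bilinear blend, fused with the per-span color modulation when COLMUL is defined. A scalar sketch of the same per-pixel kernel, written with evas's existing INTERP_256/MUL4_SYM macros purely for illustration (variable names follow the loop above):

   /* illustration only -- the NEON block does these steps with two
    * INTERP_256_NEON calls plus one MUL4_SYM_NEON */
   DATA32 top, bot, texel;

   top   = INTERP_256(ru, val2, val1);    /* blend the upper texel pair  */
   bot   = INTERP_256(ru, val4, val3);    /* blend the lower texel pair  */
   texel = INTERP_256(rv, bot, top);      /* blend the two results       */
#ifdef COLMUL
   cval  = INTERP_256(cv >> 16, c2, c1);  /* interpolate the span colors */
   texel = MUL4_SYM(cval, texel);         /* modulate texel by the color */
   cv += cd;
#endif
   *d = texel;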
diff --git a/src/lib/evas/common/evas_scale_smooth.c b/src/lib/evas/common/evas_scale_smooth.c
index 02dbe7d44ddac5f137f1a927d4f91f7caa9825be..61bda22b0a4087653f1ca91722dc1d54199484c3 100644
--- a/src/lib/evas/common/evas_scale_smooth.c
+++ b/src/lib/evas/common/evas_scale_smooth.c
@@ -97,6 +97,15 @@ scale_calc_a_points(int *p, int s, int d, int c, int cc)
 # include "evas_scale_smooth_scaler.c"
 #endif
 
+#ifdef BUILD_NEON
+# undef SCALE_FUNC
+# undef SCALE_USING_NEON
+# define SCALE_USING_NEON
+# define SCALE_FUNC evas_common_scale_rgba_in_to_out_clip_smooth_neon
+# include "evas_scale_smooth_scaler.c"
+# undef SCALE_USING_NEON
+#endif
+
 #undef SCALE_FUNC
 #define SCALE_FUNC _evas_common_scale_rgba_in_to_out_clip_smooth_c
 #undef SCALE_USING_MMX
@@ -196,6 +205,11 @@ evas_common_scale_rgba_in_to_out_clip_smooth(RGBA_Image *src, RGBA_Image *dst,
    if (mmx)
      cb = evas_common_scale_rgba_in_to_out_clip_smooth_mmx;
    else
+#endif
+#ifdef BUILD_NEON
+     if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+       cb = evas_common_scale_rgba_in_to_out_clip_smooth_neon;
+   else
 #endif
      cb = evas_common_scale_rgba_in_to_out_clip_smooth_c;
 
@@ -222,6 +236,16 @@ evas_common_scale_rgba_smooth_draw(RGBA_Image *src, RGBA_Image *dst, int dst_cli
         src_region_x, src_region_y, src_region_w, src_region_h,
         dst_region_x, dst_region_y, dst_region_w, dst_region_h);
    else
+#endif
+#ifdef BUILD_NEON
+     if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+       _evas_common_scale_rgba_in_to_out_clip_smooth_neon
+     (src, dst,
+         dst_clip_x, dst_clip_y, dst_clip_w, dst_clip_h,
+         mul_col, render_op,
+         src_region_x, src_region_y, src_region_w, src_region_h,
+         dst_region_x, dst_region_y, dst_region_w, dst_region_h);
+   else
 #endif
      _evas_common_scale_rgba_in_to_out_clip_smooth_c
        (src, dst,
@@ -263,6 +287,15 @@ evas_common_scale_rgba_in_to_out_clip_smooth_do(const Cutout_Rects *reuse,
                                               dst_region_w, dst_region_h);
        else
 # endif
+#ifdef BUILD_NEON
+          if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+            evas_common_scale_rgba_in_to_out_clip_smooth_neon(src, dst, dc,
+                                                              src_region_x, src_region_y,
+                                                              src_region_w, src_region_h,
+                                                              dst_region_x, dst_region_y,
+                                                              dst_region_w, dst_region_h);
+        else
+#endif
          evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
                                                          src_region_x, src_region_y,
                                                          src_region_w, src_region_h,
@@ -287,7 +320,16 @@ evas_common_scale_rgba_in_to_out_clip_smooth_do(const Cutout_Rects *reuse,
                                               dst_region_w, dst_region_h);
        else
 # endif
-         evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
+#ifdef BUILD_NEON
+          if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+            evas_common_scale_rgba_in_to_out_clip_smooth_neon(src, dst, dc,
+                                                              src_region_x, src_region_y,
+                                                              src_region_w, src_region_h,
+                                                              dst_region_x, dst_region_y,
+                                                              dst_region_w, dst_region_h);
+        else
+#endif
+            evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
                                                          src_region_x, src_region_y,
                                                          src_region_w, src_region_h,
                                                          dst_region_x, dst_region_y,
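The dispatch changes above follow evas's standard ISA pattern: the scaler body lives in a template file (evas_scale_smooth_scaler.c) that is re-included once per instruction set with different macros defined, and the matching variant is selected at run time via evas_common_cpu_has_feature(). A minimal self-contained sketch of the same trick, with hypothetical names rather than the actual evas code:

   /* scale_template.c -- included several times with different macros */
   static void
   SCALE_FUNC(uint32_t *dst, const uint32_t *src, int n)
   {
      for (int i = 0; i < n; i++)
        {
   #ifdef SCALE_USING_NEON
           dst[i] = src[i]; /* a real build would use NEON asm/intrinsics */
   #else
           dst[i] = src[i]; /* plain C fallback */
   #endif
        }
   }

   /* scale.c -- instantiate one C and one NEON variant of the same body */
   #include <stdint.h>

   #define SCALE_FUNC scale_line_c
   #include "scale_template.c"
   #undef SCALE_FUNC

   #ifdef BUILD_NEON
   # define SCALE_USING_NEON
   # define SCALE_FUNC scale_line_neon
   # include "scale_template.c"
   # undef SCALE_FUNC
   # undef SCALE_USING_NEON
   #endif

   /* pick the variant once, based on runtime CPU features */
   static void (*scale_line)(uint32_t *, const uint32_t *, int);

   static void
   scale_init(void)
   {
      scale_line = scale_line_c;
   #ifdef BUILD_NEON
      if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
        scale_line = scale_line_neon;
   #endif
   }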
diff --git a/src/lib/evas/common/evas_scale_smooth_scaler_up.c b/src/lib/evas/common/evas_scale_smooth_scaler_up.c
index e43e0c7a6cad878a8ff0d50e0b89075a9f2960ea..4b21d598dd8d8eb3f939c5311f794cbcb2edc5bb 100644
--- a/src/lib/evas/common/evas_scale_smooth_scaler_up.c
+++ b/src/lib/evas/common/evas_scale_smooth_scaler_up.c
            MOV_A2R(ay, mm4)
            pxor_r2r(mm0, mm0);
            MOV_A2R(ALPHA_255, mm5)
+#elif defined SCALE_USING_NEON
+           FPU_NEON;
+           VDUP_NEON(d12, ay);
+           VMOV_I2R_NEON(q2, #255);
 #endif
            pbuf = buf;  pbuf_end = buf + dst_clip_w;
            sxx = sxx0;
                INTERP_256_R2R(mm4, mm2, mm1, mm5)
                MOV_R2P(mm1, *pbuf, mm0)
                pbuf++;
+#elif defined SCALE_USING_NEON
+               if (p0 | p1 | p2 | p3)
+                 {
+                   FPU_NEON;
+                   VMOV_M2R_NEON(d8, p0);
+                   VEOR_NEON(q0);
+                   VMOV_M2R_NEON(d9, p2);
+                   VMOV_M2R_NEON(d10, p1);
+                   VEOR_NEON(q1);
+                   VMOV_M2R_NEON(d11, p3);
+                   VDUP_NEON(q3, ax);
+                   VZIP_NEON(q4, q0);
+                   VZIP_NEON(q5, q1);
+                   VMOV_R2R_NEON(d9, d0);
+                   VMOV_R2R_NEON(d11, d2);
+                   INTERP_256_NEON(q3, q5, q4, q2);
+                   INTERP_256_NEON(d12, d9, d8, d5);
+                   VMOV_R2M_NEON(q4, d8, pbuf);
+                   pbuf++;
+                 }
+               else
+                 *pbuf++ = p0;
 #else
                if (p0 | p1)
                  p0 = INTERP_256(ax, p1, p0);
diff --git a/src/lib/evas/include/evas_blend_ops.h b/src/lib/evas/include/evas_blend_ops.h
index 0a78843579acd758f03fe2aa457ef6dafc65fc0b..3ae94379ec2bdfa2f0161d1d97df2d1cb7c23551 100644
--- a/src/lib/evas/include/evas_blend_ops.h
+++ b/src/lib/evas/include/evas_blend_ops.h
@@ -186,6 +186,64 @@ extern const DATA32 ALPHA_256;
 
 #endif
 
+/* some useful NEON macros */
+
+#ifdef BUILD_NEON
+#define FPU_NEON \
+       __asm__ __volatile__(".fpu neon \n\t");
+
+/* copy reg2 into reg1 (vmov dst, src) */
+#define VMOV_R2R_NEON(reg1, reg2) \
+       __asm__ __volatile__("vmov " #reg1 ", " #reg2 " \n\t" ::: #reg1);
+
+/* copy a 32-bit value into the low 32 bits of register reg */
+#define VMOV_M2R_NEON(reg, value) \
+       __asm__ __volatile__("vmov.32 " #reg "[0], %[val] \n\t" :: [val] "r" (value) : #reg); 
+
+/* narrow the 16-bit lanes of regq to bytes and store the resulting low */
+/* 32 bits to the memory location pointed to by pointer, using the 64-bit */
+/* register regd as a temporary */
+#define VMOV_R2M_NEON(regq, regd, pointer) \
+       __asm__ __volatile__("vqmovn.u16 " #regd ", " #regq " \n\t" \
+                            "vst1.32 {" #regd "[0]}, [%[p]] \n\t" :: [p] "r" (pointer) : #regd, "memory");
+
+/* replicate the 16-bit immediate imm across every lane of register reg */
+#define VMOV_I2R_NEON(reg, imm) \
+       __asm__ __volatile__("vmov.i16 " #reg ", " #imm " \n\t" ::: #reg);
+
+/* replicate a 16-bit value across every lane of register reg */
+#define VDUP_NEON(reg, value) \
+       __asm__ __volatile__("vdup.16 " #reg ", %[val] \n\t" :: [val] "r" (value) : #reg); 
+
+/* interleave contents of reg1 and reg2 */
+#define VZIP_NEON(reg1, reg2) \
+       __asm__ __volatile__("vzip.8 " #reg1 ", " #reg2 " \n\t" ::: #reg1 , #reg2);
+
+/* swap contents of two registers */
+#define VSWP_NEON(reg1, reg2) \
+       __asm__ __volatile__("vswp " #reg1 ", " #reg2 " \n\t" ::: #reg1 , #reg2);
+
+/* set register to zero */
+#define VEOR_NEON(reg) \
+       __asm__ __volatile__("veor " #reg ", " #reg ", " #reg " \n\t" ::: #reg);
+
+/* interpolate each RGBA channel of regy towards regx by factor rega */
+/* (0..256); reg255 must hold 255 in each lane; the result is left in regy */
+#define INTERP_256_NEON(rega, regx, regy, reg255) \
+       __asm__ __volatile__("vsub.i16 " #regx ", " #regx ", " #regy " \n\t" \
+                            "vmul.u16 " #regx ", " #regx ", " #rega " \n\t" \
+                            "vsri.16 " #regx ", " #regx ", #8 \n\t" \
+                            "vadd.i16 " #regx ", " #regx ", " #regy " \n\t" \
+                            "vand " #regy ", " #regx ", " #reg255 " \n\t" \
+                            ::: #regx, #regy );
+
+/* symmetric per-channel multiply of regx and regy: ((x * y + 255) >> 8); */
+/* reg255 must hold 255 in each lane; the result is left in regx */
+#define MUL4_SYM_NEON(regx, regy, reg255) \
+       __asm__ __volatile__("vmul.u16 " #regx ", " #regx ", " #regy " \n\t" \
+                            "vadd.i16 " #regx ", " #regx ", " #reg255 " \n\t" \
+                            "vsri.16 " #regx ", " #regx ", #8 \n\t" \
+                            "vand " #regx ", " #regx ", " #reg255 " \n\t" \
+                            ::: #regx );
+
+#endif
 
 /* some useful SSE3 inline functions */
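For reference, the two arithmetic macros above compute the following per 8-bit channel (each channel sits widened in a 16-bit lane, with reg255 holding 255 in every lane). The helper functions below mirror evas's existing INTERP_256/MUL4_SYM semantics and are illustrations, not part of the patch:

   #include <stdint.h>

   /* INTERP_256_NEON: blend y towards x by factor a (0..256); */
   /* result lane = (y + (((x - y) * a) >> 8)) & 0xff           */
   static inline uint8_t
   interp_256_lane(unsigned int a, uint8_t x, uint8_t y)
   {
      return (uint8_t)((y + ((((int)x - (int)y) * (int)a) >> 8)) & 0xff);
   }

   /* MUL4_SYM_NEON: symmetric multiply, so 255 * 255 maps back to 255; */
   /* result lane = ((x * y + 255) >> 8) & 0xff                         */
   static inline uint8_t
   mul4_sym_lane(uint8_t x, uint8_t y)
   {
      return (uint8_t)(((x * y + 255) >> 8) & 0xff);
   }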