evas - fix neon blend code used for text rendering to not leave dirty end
author Carsten Haitzler (Rasterman) <raster@rasterman.com>
Fri, 15 Nov 2013 10:16:03 +0000 (19:16 +0900)
committer Carsten Haitzler (Rasterman) <raster@rasterman.com>
Fri, 15 Nov 2013 10:17:01 +0000 (19:17 +0900)
src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c

index da7cd3e..252f276 100644
 #ifdef BUILD_NEON
 static void
 _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
-   DATA32 *e;
-
-   DEBUG_FNCOUNT("");
-
-#define AP "blend_mas_c_dp_"
-     asm volatile (
-       ".fpu neon                                              \n\t"
-       "       vdup.i32        q15, %[c]                       \n\t"
-       "       vmov.i8         q14,    #1                      \n\t"
-
-               // If aligned already - straight to quads
-       "       andS            %[tmp], %[d],$0xf               \n\t"
-       "       beq             "AP"quadloops                   \n\t"
-
-       "       andS            %[tmp], %[d],$0x4               \n\t"
-       "       beq             "AP"dualloop                    \n\t"
-
-       AP"singleloop:                                          \n\t"
-       "       vld1.8          d0[0],  [%[m]]!                 \n\t"
-       "       vld1.32         d4[0],  [%[d]]                  \n\t"
-       "       vdup.u8         d0,     d0[0]                   \n\t"
-       "       vmull.u8        q4,     d0, d30                 \n\t"
-       "       vqrshrn.u16     d12,    q4, #8                  \n\t"
-       "       vmvn.u16        d14,    d12                     \n\t"
-       "       vshr.u32        d16,    d14, #24                \n\t"
-       "       vmul.u32        d16,    d16, d28                \n\t"
-       "       vmull.u8        q7,     d16, d4                 \n\t"
-       "       vqrshrn.u16     d0,     q7, #8                  \n\t"
-       "       vqadd.u8        d0,     d0, d12                 \n\t"
-       "       vst1.32         d0[0],  [%[d]]!                 \n\t"
-
-               // Can we go the fast path?
-       "       andS            %[tmp], %[d],$0xf               \n\t"
-       "       beq             "AP"quadloops                   \n\t"
-
-       AP"dualloop:                                            \n\t"
-       "       sub             %[tmp], %[e], %[d]              \n\t"
-       "       cmp             %[tmp], #16                     \n\t"
-       "       blt             "AP"loopout                     \n\t"
-
-       "       vld1.16         d0[0],  [%[m]]!                 \n\t"
-       "       vldm            %[d],   {d4}                    \n\t"
-       "       vmovl.u8        q0,     d0                      \n\t"
-       "       vmovl.u8        q0,     d0                      \n\t"
-       "       vmul.u32        q0,     q14                     \n\t"
-       "       vmull.u8        q4,     d0, d30                 \n\t"
-       "       vqrshrn.u16     d12,    q4, #8                  \n\t"
-       "       vmvn.u16        d14,    d12                     \n\t"
-       "       vshr.u32        d16,    d14, #24                \n\t"
-       "       vmul.u32        d16,    d16, d28                \n\t"
-       "       vmull.u8        q7,     d16, d4                 \n\t"
-       "       vqrshrn.u16     d0,     q7, #8                  \n\t"
-       "       vqadd.u8        q0,     q0, q6                  \n\t"
-       "       vstm            %[d]!,  {d0}                    \n\t"
-
-       AP"quadloops:                                           \n\t"
-       "       sub             %[tmp], %[e], %[d]              \n\t"
-       "       cmp             %[tmp], #16                     \n\t"
-       "       blt             "AP"loopout                     \n\t"
-
-
-       "       sub             %[tmp], %[e], #15               \n\t"
-
-       "       sub             %[d],   #16                     \n\t"
-       AP"fastloop:"
-       "       add             %[d],   #16                     \n\t"
-       "       cmp             %[tmp], %[d]                    \n\t"
-       "       ble             "AP"loopout                     \n\t"
-       AP"quadloopint:                                         \n\t"
-       "       ldr             %[x],   [%[m]]                  \n\t"
-       "       add             %[m], #4                        \n\t"
-       "       cmp             %[x],   #0                      \n\t"
-       "       beq             "AP"fastloop                    \n\t"
-       "       vmov.32         d0[0],  %[x]                    \n\t"
-       "       vldm            %[d], {d4,d5}                   \n\t"
-
-       // Expand M: Fixme: Can we do this quicker?
-       "       vmovl.u8        q0,     d0                      \n\t"
-       "       vmovl.u8        q0,     d0                      \n\t"
-       "       vmul.u32        q0,     q14                     \n\t"
-
-       // Multiply     a * c
-       "       vmull.u8        q4,     d0, d30                 \n\t"
-       "       vmull.u8        q5,     d1, d31                 \n\t"
-
-       // Shorten
-       "       vqrshrn.u16     d12,    q4, #8                  \n\t"
-       "       vqrshrn.u16     d13,    q5, #8                  \n\t"
-
-       // extract negated alpha
-       "       vmvn.u16        q7,     q6                      \n\t"
-       "       vshr.u32        q8,     q7, #24                 \n\t"
-       "       vmul.u32        q8,     q8, q14                 \n\t"
-
-       // Multiply
-       "       vmull.u8        q7,     d16, d4                 \n\t"
-       "       vmull.u8        q8,     d17, d5                 \n\t"
-
-       "       vqrshrn.u16     d0,     q7, #8                  \n\t"
-       "       vqrshrn.u16     d1,     q8, #8                  \n\t"
-
-       // Add
-       "       vqadd.u8        q0,     q0, q6                  \n\t"
-
-       "       vstm            %[d]!,  {d0,d1}                 \n\t"
-
-       "       cmp             %[tmp], %[d]                    \n\t"
-       "       bhi             "AP"quadloopint                 \n\t"
-
-       AP"loopout:                                             \n\t"
-#if NEONDEBUG
-               "cmp            %[d], %[e]              \n\t"
-               "ble            "AP"foo         \n\t"
-               "cmp            %[tmp], %[m]    \n\t"
-               "sub            %[x],   %[x]            \n\t"
-               "vst1.32        d0[0], [%[x]]           \n\t"
-       AP"foo: \n\t"
-#endif
-
-       "       cmp             %[d], %[e]                      \n\t"
-       "       beq             "AP"done                        \n\t"
-       "       sub             %[tmp],%[e], %[d]               \n\t"
-       "       cmp             %[tmp],#4                       \n\t"
-       "       beq             "AP"singleout                   \n\t"
-
-       AP "dualloop2:                                  \n\t"
-               "sub            %[tmp],%[e],$0x8        \n\t"
-       "       vld1.16         d0[0],  [%[m]]!                 \n\t"
-       "       vldm            %[d],   {d4}                    \n\t"
-       "       vmovl.u8        q0,     d0                      \n\t"
-       "       vmovl.u8        q0,     d0                      \n\t"
-       "       vmul.u32        q0,     q14                     \n\t"
-       "       vmull.u8        q4,     d0, d30                 \n\t"
-       "       vqrshrn.u16     d12,    q4, #8                  \n\t"
-       "       vmvn.u16        d14,    d12                     \n\t"
-       "       vshr.u32        d16,    d14, #24                \n\t"
-       "       vmul.u32        d16,    d16, d28                \n\t"
-       "       vmull.u8        q7,     d16, d4                 \n\t"
-       "       vqrshrn.u16     d0,     q7, #8                  \n\t"
-       "       vqadd.u8        q0,     q0, q6                  \n\t"
-       "       vstm            %[d]!,  {d0}                    \n\t"
-
-       "       cmp             %[e], %[d]              \n\t"
-       "       beq             "AP"done                \n\t"
-
-       AP"singleout:                                           \n\t"
-       "       vld1.8          d0[0],  [%[m]]!                 \n\t"
-       "       vld1.32         d4[0],  [%[d]]                  \n\t"
-       "       vdup.u8         d0,     d0[0]                   \n\t"
-       "       vmull.u8        q4,     d0, d30                 \n\t"
-       "       vqrshrn.u16     d12,    q4, #8                  \n\t"
-       "       vmvn.u16        d14,    d12                     \n\t"
-       "       vshr.u32        d16,    d14, #24                \n\t"
-       "       vmul.u32        d16,    d16, d28                \n\t"
-       "       vmull.u8        q7,     d16, d4                 \n\t"
-       "       vqrshrn.u16     d0,     q7, #8                  \n\t"
-       "       vqadd.u8        q0,     q0, q6                  \n\t"
-       "       vst1.32         d0[0],  [%[d]]!                 \n\t"
-
-       AP"done:                                                \n\t"
-#if NEONDEBUG
-               "cmp            %[d], %[e]              \n\t"
-               "beq            "AP"reallydone          \n\t"
-               "sub            %[tmp], %[tmp]          \n\t"
-               "vst1.32        d0[0], [%[tmp]]         \n\t"
-       AP"reallydone:"
-#endif
-       : // Out
-       :  [e] "r" (d + l), [d] "r" (d), [c] "r" (c),
-               [tmp] "r" (7), [m] "r" (m), [x] "r" (0)
-          : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q8","q14","q15",
-                       "memory" // clobbered
-       );
-#undef AP
+   // the main loop processes data in pairs, so the count must be even
+   DATA32 *e = d + l - (l % 2);
+
+   // everything we only need to do once per call:
+   // loading of 'c' and initialization of the constant registers
+   __asm__ __volatile__
+   (
+       ".fpu neon                                      \n\t"
+       "       vmov.32         d30[0], %[c]            \n\t"
+       "       vmov.i16        q10,    #255            \n\t"
+       "       vmov.i16        q11,    #256            \n\t"
+       "       veor            d29,    d29, d29        \n\t"
+       "       vzip.8          d30,    d29             \n\t"
+       "       vmov            d31,    d30             \n\t"
+     :
+     : [c] "r" (c)
+     : "q10", "q11", "q15", "d29"
+   );
+   while (d < e)
+     {
+        // main loop: blend two pixels per iteration
+        __asm__ __volatile__
+        (
+            // load pair '*d' and '*(d+1)' into vector register
+            "       vldm            %[d],   {d4}            \n\t"
+
+            // load '*m' and '*(m+1)'
+            "       veor            q0,     q0, q0          \n\t"
+            "       vld1.8          d0[0],  [%[m]]!         \n\t"
+            "       vld1.8          d1[0],  [%[m]]!         \n\t"
+
+            // widen the values from d so that each 8-bit channel
+            // is followed by 8 bits of zeros; each 32-bit pixel
+            // now occupies one 64-bit register
+            "       veor            d5,     d5, d5          \n\t"
+            "       vzip.8          d4,     d5              \n\t"
+
+            // duplicate each *m value across all lanes of its register
+            "       vdup.u16        d0,     d0[0]           \n\t"
+            "       vdup.u16        d1,     d1[0]           \n\t"
+
+            // multiply a * c
+            "       vmul.u16        q13,    q0, q15         \n\t"
+            "       vadd.i16        q13,    q13, q10        \n\t"
+            "       vsri.16         q13,    q13, #8         \n\t"
+            "       vand            q13,    q13, q10        \n\t"
+
+            // extract negated alpha
+            "       vdup.u16        d24,    d26[3]          \n\t"
+            "       vdup.u16        d25,    d27[3]          \n\t"
+            "       vsub.i16        q12,    q11, q12        \n\t"
+
+            // multiply alpha * (*d) and add a*c
+            "       vmul.u16        q2,     q2, q12         \n\t"
+            "       vsri.16         q2,     q2, #8          \n\t"
+            "       vand            q2,     q2, q10         \n\t"
+            "       vadd.i16        q2,     q2, q13         \n\t"
+            "       vand            q2,     q2, q10         \n\t"
+
+            // save results
+            "       vqmovn.u16      d4,     q2              \n\t"
+            "       vstm            %[d]!,  {d4}            \n\t"
+          : [d] "+r" (d), [m] "+r" (m)
+          : [c] "r" (c)
+          : "q0", "q2", "q15", "q13", "q12", "q11", "q10",
+            "memory"
+        );
+     }
+   if (l % 2)
+     {
+        // same as the main loop, but for a single trailing element
+        __asm__ __volatile__
+        (
+            "       vld1.32         d4[0],  [%[d]]          \n\t"
+
+            "       veor            d0,     d0, d0          \n\t"
+            "       vld1.8          d0[0],  [%[m]]!         \n\t"
+
+            "       veor            d5,     d5, d5          \n\t"
+            "       vzip.8          d4,     d5              \n\t"
+
+            "       vdup.u16        d0,     d0[0]           \n\t"
+
+            "       vmul.u16        d26,    d0, d30         \n\t"
+            "       vadd.i16        d26,    d26, d20        \n\t"
+            "       vsri.16         d26,    d26, #8         \n\t"
+            "       vand            d26,    d26, d20        \n\t"
+
+            "       vdup.u16        d24,    d26[3]          \n\t"
+
+            "       vsub.i16        d24,    d22, d24        \n\t"
+            "       vmul.u16        d4,     d4, d24         \n\t"
+            "       vsri.16         d4,     d4, #8          \n\t"
+            "       vand            d4,     d4, d20         \n\t"
+            "       vadd.i16        d4,     d4, d26         \n\t"
+            "       vand            d4,     d4, d20         \n\t"
+
+            "       vqmovn.u16      d4,     q2              \n\t"
+            "       vst1.32         {d4[0]}, [%[d]]!        \n\t"
+          : [d] "+r" (d), [m] "+r" (m)
+          : [c] "r" (c)
+          : "q0", "q2", "q15", "q13", "q12", "q11", "q10",
+            "memory"
+        );
+     }
 }
 #endif
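
For reference, a minimal scalar sketch of the per-pixel math the new NEON path implements (the function name, the per-channel loops and the repeated DATA32/DATA8 typedefs are illustrative only, not part of the patch or of the evas macros): the color is scaled per channel by the 8-bit mask, the negated alpha of that masked color is applied to the destination, and the two results are added.

#include <stdint.h>

typedef uint32_t DATA32;
typedef uint8_t  DATA8;

/* scalar equivalent of the blend above, one pixel at a time */
static void
_op_blend_mas_c_dp_scalar(DATA8 *m, DATA32 c, DATA32 *d, int l)
{
   for (int i = 0; i < l; i++, d++, m++)
     {
        DATA32 a = *m;                 /* 8-bit mask value */
        DATA32 mc = 0, out = 0;

        /* mc = mask * color: per channel (a * ch + 255) >> 8,
         * the same rounding as the vmul/vadd/vsri sequence above */
        for (int sh = 0; sh < 32; sh += 8)
          mc |= ((((c >> sh) & 0xff) * a + 0xff) >> 8) << sh;

        /* negated alpha of the masked color */
        DATA32 na = 256 - (mc >> 24);

        /* out = mc + (dst * na) >> 8 per channel, truncated to
         * 8 bits like the final vand/vqmovn in the asm */
        for (int sh = 0; sh < 32; sh += 8)
          {
             DATA32 dch = (((*d >> sh) & 0xff) * na) >> 8;
             DATA32 mch = (mc >> sh) & 0xff;
             out |= ((dch + mch) & 0xff) << sh;
          }
        *d = out;
     }
}

The loop structure of the sketch mirrors the new code's design: a main path over l - (l % 2) pixels processed in pairs, followed by a single trailing pixel when l is odd, so exactly l destination pixels are written.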