Neon improvements for ARM Cortex.
author    nash <nash@7cbeb6ba-43b4-40fd-8cce-4c39aea84d33>
Mon, 10 May 2010 09:24:11 +0000 (09:24 +0000)
committer nash <nash@7cbeb6ba-43b4-40fd-8cce-4c39aea84d33>
Mon, 10 May 2010 09:24:11 +0000 (09:24 +0000)
Samsung Electronics has just allowed me to release the first set of ARM NEON
patches under the Evas licence.  They are quietly helping to improve EFL, and
this is another product of that help.

These patches have been tested on a Cortex-A8 and show consistent improvement
across the board.  In expedite, some tests improve by up to 100%, and
practical real-world examples show that rendering-limited applications see
similar gains: for instance, one application went from 17fps to 30fps and
another from 40fps to 63fps.

The patches are pure NEON assembly (intrinsics tend to generate worse code).
To build them under GCC you will need a recent GCC and the following C flags:
-mfloat-abi=softfp -mfpu=neon
I also recommend -O2 and -ffast-math.
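
For example, a cross build might pass the flags along these lines (a
hypothetical invocation; adjust the compiler name and configure options to
your toolchain):

  CC=arm-linux-gnueabi-gcc \
  CFLAGS="-O2 -ffast-math -mfloat-abi=softfp -mfpu=neon" \
  ./configure && make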

If you have any problems please let me know.

git-svn-id: http://svn.enlightenment.org/svn/e/trunk/evas@48733 7cbeb6ba-43b4-40fd-8cce-4c39aea84d33

src/lib/engines/common/evas_op_blend/op_blend_color_neon.c
src/lib/engines/common/evas_op_blend/op_blend_mask_color_neon.c
src/lib/engines/common/evas_op_blend/op_blend_pixel_color_neon.c
src/lib/engines/common/evas_op_blend/op_blend_pixel_neon.c
src/lib/engines/common/evas_op_copy/op_copy_color_neon.c

index ce6ed84..52c2083 100644 (file)
 #ifdef BUILD_NEON
 static void
 _op_blend_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) {
-   DATA32 *e, a = 256 - (c >> 24);
-   UNROLL8_PLD_WHILE(d, l, e,
-                     {
-                        *d = c + MUL_256(a, *d);
-                        d++;
-                     });
+       DATA32 *e, *tmp = 0;
+#define AP     "B_C_DP"
+   asm volatile (
+       "vdup.u32       q6, %[c]                        \n\t"
+       "vmov.i8        q5, #1                          \n\t"
+       "vmvn.u8        q7,q6                           \n\t"
+       "vshr.u32       q7, q7, $0x18                   \n\t"
+       "vmul.u32       q7,q5, q7                       \n\t"
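+       // q6 now holds 'c' in every lane; q7 holds (255 - c.alpha) in every byte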
+       "bic            %[e], #3                        \n\t"
+       "bic            %[d], #3                        \n\t"
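+       // 'd' and the end pointer rounded down to a whole 32-bit pixel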
+
+       AP "loopchoose:                                 \n\t"
+               // If aligned already - straight to quads
+               "andS           %[tmp], %[d],$0x1f              \n\t"
+               "beq            "AP"quadloops                   \n\t"
+
+               "andS           %[tmp], %[d],$0x4               \n\t"
+               "beq            "AP"dualloop                    \n\t"
+
+       // Only ever executes once, fall through to dual
+       AP "singleloop:                                 \n\t"
+               // Use 'tmp' not 'd'
+               "vld1.32        d0[0], [%[d]]           \n\t"
+               // Only touch d1
+               "vmull.u8       q0, d0, d14             \n\t"
+               "vshrn.u16      d0, q0, #8              \n\t"
+               "vadd.u8        d0, d12, d0             \n\t"
+               "vst1.32        d0[0], [%[d]]           \n\t"
+
+               "add            %[d], #4                \n\t"
+
+               // Can we go the fast path?
+               "andS           %[tmp], %[d],$0x1f      \n\t"
+               "beq            "AP"quadloops           \n\t"
+
+       AP "dualloop:                                   \n\t"
+               "sub            %[tmp], %[e], %[d]      \n\t"
+               "cmp            %[tmp], #32             \n\t"
+               "blt            "AP"loopout                     \n\t"
+
+
+       AP "dualloopint:                                        \n\t"
+               "vldr.32        d0, [%[d]]              \n\t"
+               "vmull.u8       q1, d0, d14             \n\t"
+               "vshrn.u16      d0, q1, #8              \n\t"
+               "vqadd.u8       d0, d0, d12             \n\t"
+
+               "vstm           %[d]!, {d0}             \n\t"
+
+               "ands           %[tmp], %[d], $0x1f     \n\t"
+               "bne            "AP"dualloopint         \n\t"
+
+       AP "quadloops:                                  \n\t"
+               "sub            %[tmp], %[e], %[d]      \n\t"
+               "cmp            %[tmp], #32             \n\t"
+               "blt            "AP"loopout                     \n\t"
+
+               "sub            %[tmp],%[e],#31 \n\t"
+
+       AP "quadloopint:\n\t"
+               "vldm   %[d],   {d0,d1,d2,d3}           \n\t"
+
+               "vmull.u8       q2, d0, d14             \n\t"
+               "vmull.u8       q3, d1, d15             \n\t"
+               "vmull.u8       q4, d2, d14             \n\t"
+               "vmull.u8       q5, d3, d15             \n\t"
+
+               "vshrn.u16      d0, q2, #8              \n\t"
+               "vshrn.u16      d1, q3, #8              \n\t"
+               "vshrn.u16      d2, q4, #8              \n\t"
+               "vshrn.u16      d3, q5, #8              \n\t"
+
+               "vqadd.u8       q0, q6, q0              \n\t"
+               "vqadd.u8       q1, q6, q1              \n\t"
+
+               "vstm   %[d]!,  {d0,d1,d2,d3}           \n\t"
+
+               "cmp     %[tmp], %[d]\n\t"
+                "bhi "AP"quadloopint\n\t"
+
+       AP "loopout:                                    \n\t"
+               "cmp            %[d], %[e]\n\t"
+                "beq           "AP"done\n\t"
+               "sub            %[tmp],%[e], %[d]       \n\t"
+               "cmp            %[tmp],#8               \n\t"
+               "blt            "AP"singleloop2         \n\t"
+
+       AP "dualloop2:                                  \n\t"
+               "sub            %[tmp],%[e],$0x7        \n\t"
+       AP "dualloop2int:                                       \n\t"
+               "vldr.64        d0, [%[d]]              \n\t"
+               "vmull.u8       q1, d0, d14             \n\t"
+               "vshrn.u16      d0, q1, #8              \n\t"
+               "vqadd.u8       d0, d0, d12             \n\t"
+
+               "vstr.64        d0, [%[d]]              \n\t"
+
+               "add            %[d], #8                \n\t"
+               "cmp            %[tmp], %[d]            \n\t"
+               "bhi            "AP"dualloop2int                \n\t"
+
+               // Single ??
+               "cmp            %[e], %[d]              \n\t"
+               "beq            "AP"done                \n\t"
+
+       AP "singleloop2:                                        \n\t"
+               "vld1.32        d0[0], [%[d]]           \n\t"
+               "vmull.u8       q1, d0, d14             \n\t"
+               "vshrn.u16      d0, q1, #8              \n\t"
+               "vqadd.u8       d0, d0, d12             \n\t"
+
+               "vst1.32        d0[0], [%[d]]           \n\t"
+
+       AP "done:\n\t"
+
+         : // output regs
+         // Input
+          :  [e] "r" (e = d + l), [d] "r" (d), [c] "r" (c), [tmp] "r" (tmp)
+          : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","memory" // clobbered
+
+       );
+#undef AP
+
 }
 
 #define _op_blend_caa_dp_neon _op_blend_c_dp_neon
index 09e6e8c..7664248 100644 (file)
@@ -6,51 +6,367 @@ static void
 _op_blend_mas_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int l) {
    DATA32 *e;
    int alpha = 256 - (c >> 24);
-   UNROLL8_PLD_WHILE(d, l, e,
-                     {
-                        DATA32 a = *m;
-                        switch(a)
-                          {
-                          case 0:
-                             break;
-                          case 255:
-                             *d = c + MUL_256(alpha, *d);
-                             break;
-                          default:
-                               {
-                                  DATA32 mc = MUL_SYM(a, c);
-                                  a = 256 - (mc >> 24);
-                                  *d = mc + MUL_256(a, *d);
-                               }
-                             break;
-                          }
-                        m++;  d++;
-                     });
+#define AP "blend_mas_c_dp_"
+     asm volatile (
+       "       vdup.i32        q15, %[c]                       \n\t"
+       "       vmov.i8         q14,    #1                      \n\t"
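+               // q15 now holds 'c' in every lane; q14 holds 0x01 in every byte,
+               // used to spread mask/alpha values across a pixel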
+
+               // If aligned already - straight to quads
+       "       andS            %[tmp], %[d],$0xf               \n\t"
+       "       beq             "AP"quadloops                   \n\t"
+
+       "       andS            %[tmp], %[d],$0x4               \n\t"
+       "       beq             "AP"dualloop                    \n\t"
+
+       AP"singleloop:                                          \n\t"
+       "       vld1.8          d0[0],  [%[m]]!                 \n\t"
+       "       vld1.32         d4[0],  [%[d]]                  \n\t"
+       "       vdup.u8         d0,     d0[0]                   \n\t"
+       "       vmull.u8        q4,     d0, d30                 \n\t"
+       "       vshrn.u16       d12,    q4, #8                  \n\t"
+       "       vmvn.u16        d14,    d12                     \n\t"
+       "       vshr.u32        d16,    d14, #24                \n\t"
+       "       vmul.u32        d16,    d16, d28                \n\t"
+       "       vmull.u8        q7,     d16, d4                 \n\t"
+       "       vshrn.u16       d0,     q7, #8                  \n\t"
+       "       vqadd.u8        d0,     d0, d12                 \n\t"
+       "       vst1.32         d0[0],  [%[d]]!                 \n\t"
+
+               // Can we go the fast path?
+       "       andS            %[tmp], %[d],$0xf               \n\t"
+       "       beq             "AP"quadloops                   \n\t"
+
+       AP"dualloop:                                            \n\t"
+       "       sub             %[tmp], %[e], %[d]              \n\t"
+       "       cmp             %[tmp], #16                     \n\t"
+       "       blt             "AP"loopout                     \n\t"
+
+       "       vld1.16         d0[0],  [%[m]]!                 \n\t"
+       "       vldm            %[d],   {d4}                    \n\t"
+       "       vmovl.u8        q0,     d0                      \n\t"
+       "       vmovl.u8        q0,     d0                      \n\t"
+       "       vmul.u32        q0,     q14                     \n\t"
+       "       vmull.u8        q4,     d0, d30                 \n\t"
+       "       vshrn.u16       d12,    q4, #8                  \n\t"
+       "       vmvn.u16        d14,    d12                     \n\t"
+       "       vshr.u32        d16,    d14, #24                \n\t"
+       "       vmul.u32        d16,    d16, d28                \n\t"
+       "       vmull.u8        q7,     d16, d4                 \n\t"
+       "       vshrn.u16       d0,     q7, #8                  \n\t"
+       "       vqadd.u8        q0,     q0, q6                  \n\t"
+       "       vstm            %[d]!,  {d0}                    \n\t"
+
+       AP"quadloops:                                           \n\t"
+       "       sub             %[tmp], %[e], %[d]              \n\t"
+       "       cmp             %[tmp], #16                     \n\t"
+       "       blt             "AP"loopout                     \n\t"
+       "       sub             %[tmp], %[e], #15               \n\t"
+
+       "       sub             %[d],   #16                     \n\t"
+       AP"fastloop:"
+       "       add             %[d],   #16                     \n\t"
+       "       cmp             %[tmp], %[d]                    \n\t"
+       "       ble             "AP"loopout                     \n\t"
+       AP"quadloopint:                                         \n\t"
+//     "       vld1.32         d0[0],  [%[m]]!                 \n\t"
+       "       ldr.32          %[x],   [%[m]]                  \n\t"
+       "       add %[m], #4                                    \n\t"
+       "       cmp             %[x],   #0                      \n\t"
+       "       beq             "AP"fastloop                    \n\t"
+       "       vmov.32         d0[0],  %[x]                    \n\t"
+       "       vldm            %[d], {d4,d5}                   \n\t"
+
+       // Expand M: Fixme: Can we do this quicker?
+       "       vmovl.u8        q0,     d0                      \n\t"
+       "       vmovl.u8        q0,     d0                      \n\t"
+       "       vmul.u32        q0,     q14                     \n\t"
+
+       // Multiply     a * c
+       "       vmull.u8        q4,     d0, d30                 \n\t"
+       "       vmull.u8        q5,     d1, d31                 \n\t"
+
+       // Shorten
+       "       vshrn.u16       d12,    q4, #8                  \n\t"
+       "       vshrn.u16       d13,    q5, #8                  \n\t"
+
+       // extract negated alpha
+       "       vmvn.u16        q7,     q6                      \n\t"
+       "       vshr.u32        q8,     q7, #24                 \n\t"
+       "       vmul.u32        q8,     q8, q14                 \n\t"
+
+       // Multiply
+       "       vmull.u8        q7,     d16, d4                 \n\t"
+       "       vmull.u8        q8,     d17, d5                 \n\t"
+
+       "       vshrn.u16       d0,     q7, #8                  \n\t"
+       "       vshrn.u16       d1,     q8, #8                  \n\t"
+
+       // Add
+       "       vqadd.u8        q0,     q0, q6                  \n\t"
+
+       "       vstm            %[d]!,  {d0,d1}                 \n\t"
+
+       "       cmp             %[tmp], %[d]                    \n\t"
+       "       bhi             "AP"quadloopint                 \n\t"
+
+       AP"loopout:                                             \n\t"
+       "       cmp             %[d], %[e]                      \n\t"
+       "       beq             "AP"done                        \n\t"
+       "       sub             %[tmp],%[e], %[d]               \n\t"
+       "       cmp             %[tmp],#4                       \n\t"
+       "       beq             "AP"singleout                   \n\t"
+
+       AP "dualloop2:                                  \n\t"
+               "sub            %[tmp],%[e],$0x7        \n\t"
+       "       vld1.16         d0[0],  [%[m]]!                 \n\t"
+       "       vldm            %[d],   {d4}                    \n\t"
+       "       vmovl.u8        q0,     d0                      \n\t"
+       "       vmovl.u8        q0,     d0                      \n\t"
+       "       vmul.u32        q0,     q14                     \n\t"
+       "       vmull.u8        q4,     d0, d30                 \n\t"
+       "       vshrn.u16       d12,    q4, #8                  \n\t"
+       "       vmvn.u16        d14,    d12                     \n\t"
+       "       vshr.u32        d16,    d14, #24                \n\t"
+       "       vmul.u32        d16,    d16, d28                \n\t"
+       "       vmull.u8        q7,     d16, d4                 \n\t"
+       "       vshrn.u16       d0,     q7, #8                  \n\t"
+       "       vqadd.u8        q0,     q0, q6                  \n\t"
+       "       vstm            %[d]!,  {d0}                    \n\t"
+
+       "       cmp             %[e], %[d]              \n\t"
+       "       beq             "AP"done                \n\t"
+
+       AP"singleout:                                           \n\t"
+       "       vld1.8          d0[0],  [%[m]]!                 \n\t"
+       "       vld1.32         d4[0],  [%[d]]                  \n\t"
+       "       vdup.u8         d0,     d0[0]                   \n\t"
+       "       vmull.u8        q4,     d0, d30                 \n\t"
+       "       vshrn.u16       d12,    q4, #8                  \n\t"
+       "       vmvn.u16        d14,    d12                     \n\t"
+       "       vshr.u32        d16,    d14, #24                \n\t"
+       "       vmul.u32        d16,    d16, d28                \n\t"
+       "       vmull.u8        q7,     d16, d4                 \n\t"
+       "       vshrn.u16       d0,     q7, #8                  \n\t"
+       "       vqadd.u8        q0,     q0, q6                  \n\t"
+       "       vst1.32         d0[0],  [%[d]]!                 \n\t"
+
+       AP"done:                                                \n\t"
+
+       : // Out
+       :  [e] "r" (d + l), [d] "r" (d), [c] "r" (c),
+               [tmp] "r" (7), [m] "r" (m), [x] "r" (0)
+          : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q8","q14","q15",
+                       "memory" // clobbered
+       );
+#undef AP
 }
+#endif
 
+#ifdef BUILD_NEON
 static void
 _op_blend_mas_can_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int l) {
-   DATA32 *e;
+   DATA32 *e,*tmp;
    int alpha;
-   UNROLL8_PLD_WHILE(d, l, e,
-                     {
-                        alpha = *m;
-                        switch(alpha)
-                          {
-                          case 0:
-                             break;
-                          case 255:
-                             *d = c;
-                             break;
-                          default:
-                             alpha++;
-                             *d = INTERP_256(alpha, c, *d);
-                             break;
-                          }
-                        m++;  d++;
-                     });
+#define AP     "_blend_mas_can_dp_neon_"
+     asm volatile (
+               "vdup.u32       q9,     %[c]            \n\t"
+               "vmov.i8        q15,    #1              \n\t"
+               "vmov.i8        q14,    #0              \n\t"
+
+               // Make C 16 bit (C in q3/q2)
+               "vmovl.u8       q3,     d19             \n\t"
+               "vmovl.u8       q2,     d18             \n\t"
+
+               // Which loop to start
+       "       andS            %[tmp], %[d],$0xf       \n\t"
+       "       beq             "AP"quadloop            \n\t"
+
+       "       andS            %[tmp], %[d], #4        \n\t"
+       "       beq             "AP"dualloop            \n\t"
+
+
+       AP"singleloop:                                  \n\t"
+       "       vld1.8          d0[0],  [%[m]]!         \n\t"
+       "       vld1.32         d8[0],  [%[d]]          \n\t"
+       "       vdup.u8         d0,     d0[0]           \n\t"
+       "       vshr.u8         d0,     d0,     #1      \n\t"
+       "       vmovl.u8        q0,     d0              \n\t"
+       "       vmovl.u8        q4,     d8              \n\t"
+       "       vsub.s16        q6,     q2,     q4      \n\t"
+       "       vmul.s16        q6,     q0              \n\t"
+       "       vshr.s16        q6,     #7              \n\t"
+       "       vadd.s16        q6,     q4              \n\t"
+       "       vqmovun.s16     d2,     q6              \n\t"
+       "       vst1.32         d2[0],  [%[d]]!         \n\t"
+
+       "       andS            %[tmp], %[d],   #15     \n\t"
+       "       beq             "AP"quadloop            \n\t"
+
+       AP"dualloop:                                    \n\t"
+       "       vld1.16 d0[0],  [%[m]]!         \n\t"
+       "       vldm            %[d],           {d8}    \n\t"
+       "       vmovl.u8        q0,     d0              \n\t"
+       "       vmovl.u8        q0,     d0              \n\t"
+       "       vmul.u32        d0,     d0,     d30     \n\t"
+       "       vshr.u8 d0,     d0, #1          \n\t"
+       "       vmovl.u8        q0,     d0              \n\t"
+       "       vmovl.u8        q4,     d8              \n\t"
+       "       vsub.s16        q6,     q2, q4          \n\t"
+       "       vmul.s16        q6,     q0              \n\t"
+       "       vshr.s16        q6,  #7                 \n\t"
+       "       vadd.s16        q6,  q4                 \n\t"
+       "       vqmovun.s16     d2,  q6                 \n\t"
+       "       vstm            %[d]!,  {d2}    \n\t"
+
+       AP"quadloop:                                    \n\t"
+       "       sub             %[tmp], %[e], %[d]      \n\t"
+       "       cmp             %[tmp], #16             \n\t"
+       "       blt             "AP"loopout             \n\t"
+       "       sub             %[tmp], %[e], #15       \n\t"
+
+       "       sub             %[d],   #16             \n\t"
+       AP"fastloop:                                    \n\t"
+       "       add             %[d],   #16             \n\t"
+       "       cmp             %[tmp], %[d]            \n\t"
+       "       ble             "AP"loopout             \n\t"
+
+       AP"quadloopint:                                 \n\t"
+               // Load the mask: 4 bytes: It has d0/d1
+       "       ldr.32          %[x],   [%[m]]          \n\t"
+       "       add             %[m], #4                \n\t"
+       "       cmp             %[x],   #0              \n\t"
+       "       beq             "AP"fastloop            \n\t"
+       "       vmov.32         d0[0],  %[x]            \n\t"
+
+               // Load d into d8/d9 q4
+       "       vldm            %[d],   {d8,d9}         \n\t"
+       "       cmp             %[x],   $0xffffffff     \n\t"
+       "       beq             "AP"quadstore           \n\t"
+
+
+               // Get the alpha channel ready (m)
+       "       vmovl.u8        q0,     d0              \n\t"
+       "       vmovl.u8        q0,     d0              \n\t"
+       "       vmul.u32        q0,     q0,q15          \n\t"
+               // Lop a bit off to prevent overflow
+       "       vshr.u8 q0,     q0, #1          \n\t"
+
+               // Now make it 16 bit
+       "       vmovl.u8        q1,     d1              \n\t"
+       "       vmovl.u8        q0,     d0              \n\t"
+
+               // 16 bit 'd'
+       "       vmovl.u8        q5,     d9              \n\t"
+       "       vmovl.u8        q4,     d8              \n\t"
+
+               // Diff 'd' & 'c'
+       "       vsub.s16        q7,     q3, q5          \n\t"
+       "       vsub.s16        q6,     q2, q4          \n\t"
+
+       "       vmul.s16        q7,     q1              \n\t"
+       "       vmul.s16        q6,     q0              \n\t"
+
+               // Shift results a bit
+       "       vshr.s16        q7,  #7                 \n\t"
+       "       vshr.s16        q6,  #7                 \n\t"
+
+               // Add 'd'
+       "       vadd.s16        q7,  q5                 \n\t"
+       "       vadd.s16        q6,  q4                 \n\t"
+
+               // Make sure none are negative
+       "       vqmovun.s16     d9,  q7                 \n\t"
+       "       vqmovun.s16     d8,  q6                 \n\t"
+
+       "       vstm            %[d]!,  {d8,d9} \n\t"
+
+       "       cmp             %[tmp], %[d]            \n\t"
+       "       bhi             "AP"quadloopint         \n\t"
+       "       b               "AP"loopout             \n\t"
+
+       AP"quadstore:                                   \n\t"
+       "       vstm            %[d]!,  {d18,d19}       \n\t"
+       "       cmp             %[tmp], %[d]            \n\t"
+       "       bhi             "AP"quadloopint         \n\t"
+
+
+       AP"loopout:                                     \n\t"
+
+       "       cmp             %[e], %[d]              \n\t"
+       "       beq             "AP"done                \n\t"
+
+       "       sub             %[tmp],%[e], %[d]       \n\t"
+       "       cmp             %[tmp],#8               \n\t"
+
+       "       blt             "AP"onebyte             \n\t"
+
+               // Load the mask: 2 bytes: It has d0
+       "       vld1.16 d0[0],  [%[m]]!         \n\t"
+
+               // Load d into d8/d9 q4
+       "       vldm            %[d],           {d8}    \n\t"
+
+               // Get the alpha channel ready (m)
+       "       vmovl.u8        q0,     d0              \n\t"
+       "       vmovl.u8        q0,     d0              \n\t"
+       "       vmul.u32        d0,     d0,     d30     \n\t"
+               // Lop a bit off to prevent overflow
+       "       vshr.u8 d0,     d0, #1          \n\t"
+
+               // Now make it 16 bit
+       "       vmovl.u8        q0,     d0              \n\t"
+
+               // 16 bit 'd'
+       "       vmovl.u8        q4,     d8              \n\t"
+
+               // Diff 'd' & 'c'
+       "       vsub.s16        q6,     q2, q4          \n\t"
+
+       "       vmul.s16        q6,     q0              \n\t"
+
+               // Shift results a bit
+       "       vshr.s16        q6,  #7                 \n\t"
+
+               // Add 'd'
+               "vadd.s16       q6,  q4                 \n\t"
+
+               // Make sure none are negative
+               "vqmovun.s16    d2,  q6                 \n\t"
+
+               "vstm           %[d]!,  {d2}    \n\t"
+
+               "cmp            %[e], %[d]              \n\t"
+               "beq            "AP"done                \n\t"
+
+       AP"onebyte:                                     \n\t"
+               "vld1.8 d0[0],  [%[m]]!                 \n\t"
+               "vld1.32        d8[0],  [%[d]]          \n\t"
+               "vdup.u8        d0,     d0[0]           \n\t"
+               "vshr.u8        d0,     d0, #1          \n\t"
+               "vmovl.u8       q0,     d0              \n\t"
+               "vmovl.u8       q4,     d8              \n\t"
+               "vsub.s16       q6,     q2, q4          \n\t"
+               "vmul.s16       q6,     q0              \n\t"
+               "vshr.s16       q6,  #7                 \n\t"
+               "vadd.s16       q6,  q4                 \n\t"
+               "vqmovun.s16    d2,  q6                 \n\t"
+               "vst1.32        d2[0], [%[d]]!          \n\t"
+
+       AP"done:                                        \n\t"
+
+         : // output regs
+         // Input
+          :  [e] "r" (e = d + l), [d] "r" (d), [c] "r" (c),
+               [m] "r" (m), [tmp] "r" (7), [x] "r" (33)
+          : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q14","q15",
+                       "memory" // clobbered
+
+     );
+#undef AP
 }
+#endif
 
+#ifdef BUILD_NEON
 #define _op_blend_mas_cn_dp_neon _op_blend_mas_can_dp_neon
 #define _op_blend_mas_caa_dp_neon _op_blend_mas_c_dp_neon
 
index c152b2c..6442d5e 100644 (file)
 
 /* blend pixel x color --> dst */
-
 #ifdef BUILD_NEON
+/* Note: Optimisation is based on keeping _dest_ aligned: else it's a pair of
+ * reads, then two writes, a miss on read is 'just' two reads */
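+/* The loops below peel off a leading pixel and then a pixel pair until 'd'
+ * is 16-byte aligned, run the main four-pixel loop, and mop up any trailing
+ * pixels with the dual/single tails. */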
 static void
 _op_blend_p_c_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) {
-   DATA32 *e;
-   int alpha;
-   UNROLL8_PLD_WHILE(d, l, e,
-                     {
-                        DATA32 sc = MUL4_SYM(c, *s);
-                        alpha = 256 - (sc >> 24);
-                        *d = sc + MUL_256(alpha, *d);
-                        d++;
-                        s++;
-                     });
+#define AP     "blend_p_c_dp_"
+   asm volatile (
+               // Load 'c'
+               "vdup.u32       q7, %[c]                        \n\t"
+               "vmov.i8        q6, #1                          \n\t"
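+               // q7 now holds 'c' in every lane; q6 holds 0x01 in every byte,
+               // used to spread alpha values across a pixel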
+
+               // Choose a loop
+               "andS           %[tmp], %[d], $0xf      \n\t"
+               "beq            "AP"quadstart           \n\t"
+
+               "andS           %[tmp],%[d], $0x4       \n\t"
+               "beq            "AP"dualloop            \n\t"
+
+       AP"singleloop:"
+               "vld1.32        d0[0],  [%[s]]!         \n\t"
+               "vld1.32        d2[0],  [%[d]]          \n\t"
+               //  Multiply s * c (= sc)
+               "vmull.u8       q4,     d0,d14          \n\t"
+               // sc in d8
+               "vshrn.u16      d4,     q4, #8          \n\t"
+
+               // sca in d6
+               "vmvn.u32       d6,     d4              \n\t"
+               "vshr.u32       d6,     d6, #24         \n\t"
+
+               "vmul.u32       d6,     d12, d6         \n\t"
+
+               /* d * alpha */
+               "vmull.u8       q4,     d6, d2          \n\t"
+               "vshrn.u16      d0,     q4, #8          \n\t"
+
+               "vqadd.u8       d2,     d0, d4          \n\t"
+
+               // Save dsc + sc
+               "vst1.32        d2[0],  [%[d]]!         \n\t"
+
+               // Now where?
+               // Can we go the fast path?
+               "andS           %[tmp], %[d],$0xf       \n\t"
+               "beq            "AP"quadstart           \n\t"
+
+       AP"dualloop:                                    \n\t"
+               // Check we have enough to bother with!
+               "sub            %[tmp], %[e], %[d]      \n\t"
+               "cmp            %[tmp], #16             \n\t"
+               "blt            "AP"loopout             \n\t"
+
+               //  load 's' -> q0, 'd' -> q1
+               "vldm           %[s]!,  {d0}            \n\t"
+               "vldm           %[d],   {d2}            \n\t"
+               //  Multiply s * c (= sc)
+               "vmull.u8       q4,     d0,d14          \n\t"
+               // sc in d8
+               "vshrn.u16      d4,     q4, #8          \n\t"
+
+               // sca in d6
+               "vmvn.u32       d6,     d4              \n\t"
+               "vshr.u32       d6,     d6, #24         \n\t"
+
+               "vmul.u32       d6,     d12, d6         \n\t"
+
+               /* d * alpha */
+               "vmull.u8       q4,     d6, d2          \n\t"
+               "vshrn.u16      d0,     q4, #8          \n\t"
+
+               "vqadd.u8       d2,     d0, d4          \n\t"
+
+               // Save dsc + sc
+               "vst1.32        d2,     [%[d]]!         \n\t"
+
+       AP"quadstart:                                   \n\t"
+               "sub            %[tmp], %[e], %[d]      \n\t"
+               "cmp            %[tmp], #16             \n\t"
+               "blt            "AP"loopout             \n\t"
+
+               "sub            %[tmp], %[e], #15       \n\t"
+
+       AP"quadloop:\n\t"
+               //  load 's' -> q0, 'd' -> q1
+               "vldm   %[s]!, {d0,d1}          \n\t"
+               "vldm   %[d], {d2,d3}           \n\t"
+               //  Multiply s * c (= sc)
+               "vmull.u8       q4,     d0,d14  \n\t"
+               "vmull.u8       q5,     d1,d14  \n\t"
+
+               // Get sc & sc alpha
+               "vshrn.u16      d4,     q4, #8          \n\t"
+               "vshrn.u16      d5,     q5, #8          \n\t"
+                       // sc is now in q2, 8bpp
+               // Shift out, then spread alpha for q2
+               "vmvn.u32       q3,     q2              \n\t"
+               "vshr.u32       q3,     q3, $0x18       \n\t"
+               "vmul.u32       q3,     q6,q3           \n\t"
+
+               //  Multiply 'd' by sc.alpha (dsca)
+               "vmull.u8       q4,     d6,d2           \n\t"
+               "vmull.u8       q5,     d7,d3           \n\t"
+
+               "vshrn.u16      d0,     q4, #8          \n\t"
+               "vshrn.u16      d1,     q5, #8          \n\t"
+
+               "vqadd.u8       q1,     q0, q2          \n\t"
+
+               // Save dsc + sc
+               "vstm           %[d]!,  {d2,d3}         \n\t"
+
+               "cmp            %[tmp], %[d]            \n\t"
+
+               "bhi            "AP"quadloop            \n\t"
+
+       /* Trailing stuff */
+       AP"loopout:                                     \n\t"
+
+               "cmp            %[d], %[e]              \n\t"
+                "beq           "AP"done\n\t"
+               "sub            %[tmp],%[e], %[d]       \n\t"
+               "cmp            %[tmp],$0x04            \n\t"
+               "beq            "AP"singleloop2         \n\t"
+
+               "sub            %[tmp], %[e], #7        \n\t"
+       /* Dual loop */
+       AP"dualloop2:                                   \n\t"
+               "vldm           %[s]!, {d0}             \n\t"
+               "vldm           %[d], {d2}              \n\t"
+               //  Multiply s * c (= sc)
+               "vmull.u8       q4,     d0,d14          \n\t"
+               // sc in d8
+               "vshrn.u16      d4,     q4, #8          \n\t"
+
+               // sca in d6
+               // XXX: I can probably squash one of these 3
+               "vmvn.u32       d6,     d4              \n\t"
+               "vshr.u32       d6,     d6, #24         \n\t"
+               "vmul.u32       d6,     d6, d12         \n\t"
+
+               /* d * alpha */
+               "vmull.u8       q4,     d6, d2          \n\t"
+               "vshrn.u16      d0,     q4, #8          \n\t"
+
+               "vqadd.u8       d2,     d0, d4          \n\t"
+
+               // Save dsc + sc
+               "vstm           %[d]!,  {d2}            \n\t"
+
+               "cmp            %[tmp], %[d]            \n\t"
+               "bhi            "AP"dualloop2           \n\t"
+
+               "cmp            %[d], %[e]              \n\t"
+                "beq           "AP"done                \n\t"
+
+       AP"singleloop2:                                 \n\t"
+               "vld1.32        d0[0],  [%[s]]!         \n\t"
+               "vld1.32        d2[0],  [%[d]]          \n\t"
+               //  Multiply s * c (= sc)
+               "vmull.u8       q4,     d0,d14          \n\t"
+               // sc in d8
+               "vshrn.u16      d4,     q4, #8          \n\t"
+
+               // sca in d6
+               "vmvn.u32       d6,     d4              \n\t"
+               "vshr.u32       d6,     d6, #24         \n\t"
+               "vmul.u32       d6,     d12,d6          \n\t"
+
+               /* d * alpha */
+               "vmull.u8       q4,     d6, d2          \n\t"
+               "vshrn.u16      d0,     q4, #8          \n\t"
+
+               "vqadd.u8       d2,     d0, d4          \n\t"
+
+               // Save dsc + sc
+               "vst1.32        d2[0],  [%[d]]!         \n\t"
+
+
+       AP"done:"
+               : // No output
+               //
+               : [s] "r" (s), [e] "r" (d + l), [d] "r" (d), [c] "r" (c),
+                       [tmp] "r" (12)
+               : "q0","q1","q2","q3","q4","q5","q6","q7","memory"
+       );
+#undef AP
 }
 
 static void
@@ -28,14 +200,205 @@ _op_blend_pan_can_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, i
 
 static void
 _op_blend_pan_caa_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) {
-   DATA32 *e;
-   c = 1 + (c & 0xff);
-   UNROLL8_PLD_WHILE(d, l, e,
-                     {
-                        *d = INTERP_256(c, *s, *d);
-                        d++;
-                        s++;
-                     });
+#define AP     "_op_blend_pan_caa_dp_"
+   DATA32 *e = d + l, *tmp = (void*)73;
+      asm volatile (
+               /* Set up 'c' */
+               "vdup.u8     d14, %[c]          \n\t"
+               "vmov.i8     d15, #1            \n\t"
+               "vaddl.u8   q15, d14, d15       \n\t"
+               "vshr.u8        q15,#1          \n\t"
+
+               // Pick a loop
+               "andS           %[tmp], %[d], $0xf      \n\t"
+               "beq            "AP"quadstart           \n\t"
+
+               "andS           %[tmp], %[d], $0x4      \n\t"
+               "beq            "AP"dualstart           \n\t"
+
+       AP"singleloop:                                  \n\t"
+               "vld1.32        d4[0],  [%[d]]          \n\t"
+               "vld1.32        d0[0],  [%[s]]!         \n\t"
+
+               // Long version of 'd'
+               "vmovl.u8       q8, d4                  \n\t"
+
+               // Long version of 's'
+               "vmovl.u8       q6, d0                  \n\t"
+
+               // d8 = s -d
+               "vsub.s16       d8, d12, d16            \n\t"
+
+               // Multiply
+               "vmul.s16       d8, d8, d30             \n\t"
+
+               // Shift down
+               "vshr.s16       d8, #7                  \n\t"
+
+               // Add 'd'
+               "vqadd.s16      d8, d8, d16             \n\t"
+
+               // Shrink to save
+               "vqmovun.s16    d0,  q4                 \n\t"
+               "vst1.32        d0[0], [%[d]]!          \n\t"
+
+               // Now where?
+               "andS           %[tmp], %[d], $0xf      \n\t"
+               "beq            "AP"quadstart           \n\t"
+
+       AP"dualstart:                                   \n\t"
+               // Check we have enough
+               "sub            %[tmp], %[e], %[d]      \n\t"
+               "cmp            %[tmp], #16             \n\t"
+               "blt            "AP"loopout             \n\t"
+
+       AP"dualloop:"
+               "vldm           %[d],   {d4}            \n\t"
+               "vldm           %[s]!,  {d0}            \n\t"
+
+               // Long version of d
+               "vmovl.u8       q8, d4          \n\t"
+
+               // Long version of s
+               "vmovl.u8       q6, d0          \n\t"
+
+               // q4/q5 = s-d
+               "vsub.s16       q4, q6, q8      \n\t"
+
+               // Multiply
+               "vmul.s16       q4,  q4,q15     \n\t"
+
+               // Shift down
+               "vshr.s16       q4, #7          \n\t"
+
+               // Add d
+               "vqadd.s16      q4, q4, q8      \n\t"
+
+               // Shrink to save
+               "vqmovun.s16    d0,  q4         \n\t"
+
+               "vstm           %[d]!,  {d0}    \n\t"
+       AP"quadstart:                                   \n\t"
+               "sub            %[tmp], %[e], %[d]      \n\t"
+               "cmp            %[tmp], #16             \n\t"
+               "blt            "AP"loopout             \n\t"
+
+               "sub            %[tmp], %[e], #15       \n\t"
+
+       AP"quadloop:                            \n\t"
+               //  load 's' -> q0, 'd' -> q2
+               "vldm   %[d],  {d4,d5}          \n\t"
+               "vldm   %[s]!, {d0,d1}          \n\t"
+
+               // Long version of d
+               "vmovl.u8       q8, d4          \n\t"
+               "vmovl.u8       q9, d5          \n\t"
+
+               // Long version of s
+               "vmovl.u8       q6, d0          \n\t"
+               "vmovl.u8       q7, d1          \n\t"
+
+               // q4/q5 = s-d
+               "vsub.s16       q4, q6, q8      \n\t"
+               "vsub.s16       q5, q7, q9      \n\t"
+
+               // Multiply
+               "vmul.s16       q4,  q4,q15     \n\t"
+               "vmul.s16       q5,  q5,q15     \n\t"
+
+               // Shift down
+               "vshr.s16       q4, #7          \n\t"
+               "vshr.s16       q5, #7          \n\t"
+
+               // Add d
+               "vqadd.s16      q4, q4, q8      \n\t"
+               "vqadd.s16      q5, q5, q9      \n\t"
+
+               // Shrink to save
+               "vqmovun.s16    d0,  q4         \n\t"
+               "vqmovun.s16    d1,  q5         \n\t"
+               "vstm           %[d]!,  {d0,d1} \n\t"
+               "cmp            %[tmp], %[d]            \n\t"
+
+               "bhi "AP"quadloop\n\t"
+
+
+               "b "AP"done\n\t"
+       AP"loopout:                                     \n\t"
+               "cmp            %[d], %[e]              \n\t"
+                "beq           "AP"done\n\t"
+               "sub            %[tmp],%[e], %[d]       \n\t"
+               "cmp            %[tmp],$0x04            \n\t"
+               "beq            "AP"singleloop2         \n\t"
+
+       AP"dualloop2:                                   \n\t"
+               "vldm           %[d],   {d4}            \n\t"
+               "vldm           %[s]!,  {d0}            \n\t"
+
+               // Long version of d
+               "vmovl.u8       q8, d4          \n\t"
+
+               // Long version of s
+               "vmovl.u8       q6, d0          \n\t"
+
+               // q4/q5 = s-d
+               "vsub.s16       q4, q6, q8      \n\t"
+
+               // Multiply
+               "vmul.s16       q4,  q4,q15     \n\t"
+
+               // Shift down
+               "vshr.s16       q4, #7          \n\t"
+
+               // Add d
+               "vqadd.s16      q4, q4, q8      \n\t"
+
+               // Shrink to save
+               "vqmovun.s16    d0,  q4         \n\t"
+
+               "vstm           %[d]!,  {d0}    \n\t"
+
+               "cmp            %[d], %[e]              \n\t"
+                "beq           "AP"done                \n\t"
+
+       AP"singleloop2:                                 \n\t"
+               "vld1.32        d4[0],  [%[d]]          \n\t"
+               "vld1.32        d0[0],  [%[s]]!         \n\t"
+
+               // Long version of 'd'
+               "vmovl.u8       q8, d4                  \n\t"
+
+               // Long version of 's'
+               "vmovl.u8       q6, d0                  \n\t"
+
+               // d8 = s -d
+               "vsub.s16       d8, d12, d16            \n\t"
+
+               // Multiply
+               "vmul.s16       d8, d8, d30             \n\t"
+
+               // Shift down
+               "vshr.s16       d8, #7                  \n\t"
+
+               // Add 'd'
+               "vqadd.s16      d8, d8, d16             \n\t"
+
+               // Shrink to save
+               "vqmovun.s16    d0,  q4                 \n\t"
+
+               "vst1.32        d0[0], [%[d]]           \n\t"
+
+
+       AP"done:                                        \n\t"
+
+       // No output
+       :
+       // Input
+       : [s] "r" (s), [d] "r" (d), [e] "r" (e), [c] "r" (c), [tmp] "r" (tmp)
+       // Clobbered
+       : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "memory"
+      );
+#undef AP
 }
 
 #define _op_blend_pas_c_dp_neon _op_blend_p_c_dp_neon
index c59e66d..1bdfa6a 100644 (file)
 #ifdef BUILD_NEON
 static void
 _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
-   DATA32 *e = d + l;
-   while (d < e) {
-      l = 256 - (*s >> 24);
-      *d = *s++ + MUL_256(l, *d);
-      d++;
-   }
+#define AP "blend_p_dp_"
+  asm volatile (
+       //** init
+       "vmov.i8        q8,     $0x1            \n\t"
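+       // q8 holds 0x01 in every byte, used to spread the negated source alpha across a pixel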
+
+       AP "loopchoose:                                 \n\t"
+               // If aligned already - straight to octs
+               "andS           %[tmp], %[d],$0x1f              \n\t"
+               "beq            "AP"octloops                    \n\t"
+
+               "andS           %[tmp], %[d],$0xf               \n\t"
+               "beq            "AP"quadloops                   \n\t"
+
+               "andS           %[tmp], %[d],$0x4               \n\t"
+               "beq            "AP"dualloop                    \n\t"
+
+       // Only ever executes once, fall through to dual
+       AP "singleloop:                                 \n\t"
+               "vld1.32        d0[0],  [%[s]]!         \n\t"
+               "vld1.32        d4[0],  [%[d]]          \n\t"
+
+               "vmvn.u8        d8,     d0              \n\t"
+               "vshr.u32       d8,     d8, #24         \n\t"
+
+               "vmul.u32       d8,     d16, d8         \n\t"
+
+               "vmull.u8       q6,     d4,d8           \n\t"
+               "vshrn.u16      d8,     q6, #8          \n\t"
+               // Add to 's'
+               "vqadd.u8       q2,     q4,q0           \n\t"
+
+               "vst1.32        d4[0],  [%[d]]          \n\t"
+               "add            %[d],   #4              \n\t"
+
+               // Can we go the fast path?
+               "andS           %[tmp], %[d],$0x1f      \n\t"
+               "beq            "AP"octloops            \n\t"
+
+               "andS           %[tmp], %[d],$0x0f      \n\t"
+               "beq            "AP"quadloops           \n\t"
+
+
+       AP "dualloop:                                   \n\t"
+               "sub            %[tmp], %[e], %[d]      \n\t"
+               "cmp            %[tmp], #32             \n\t"
+               "blt            "AP"loopout             \n\t"
+
+       AP "dualloopint:                                \n\t"
+               //** Dual Loop
+               "vldm           %[s]!, {d0}             \n\t"
+               "vldr           d4,     [%[d]]          \n\t"
+
+               "vmvn.u8        d8,     d0              \n\t"
+               "vshr.u32       d8,     d8, #24         \n\t"
+
+               "vmul.u32       d8,     d16, d8         \n\t"
+
+               "vmull.u8       q6,     d4,d8           \n\t"
+               "vshrn.u16      d8,     q6, #8          \n\t"
+               // Add to 's'
+               "vqadd.u8       d4,     d8,d0           \n\t"
+               "vstr           d4,     [%[d]]          \n\t"
+               "add            %[d],   #8              \n\t"
+
+               "ands           %[tmp], %[d], $0x1f     \n\t"
+               "beq            "AP"octloops            \n\t"
+
+       AP"quadloops:                                   \n\t"
+               "sub            %[tmp], %[e], %[d]      \n\t"
+               "cmp            %[tmp], #32             \n\t"
+               "blt            "AP"loopout             \n\t"
+
+               "vldm           %[s]!,  {d0,d1}         \n\t"
+               "vldm           %[d],   {d4,d5}         \n\t"
+
+
+               // Copy s.a into q2 (>> 24) & subtract from 255
+               "vmvn.u8        q4,     q0              \n\t"
+               "vshr.u32       q4,     q4,$0x18        \n\t"
+
+               // Multiply into all fields
+               "vmul.u32       q4,     q8,q4           \n\t"
+
+               // a * d  (clobbering 'd'/q7)
+               "vmull.u8       q6,     d4,d8           \n\t"
+               "vmull.u8       q2,     d5,d9           \n\t"
+
+               // Shift & narrow it
+               "vshrn.u16      d8,     q6, #8          \n\t"
+               "vshrn.u16      d9,     q2, #8          \n\t"
+
+               // Add to s
+               "vqadd.u8       q2,     q4,q0           \n\t"
+
+               // Write it
+               "vstm           %[d]!,  {d4,d5}         \n\t"
+
+       AP "octloops:                                   \n\t"
+               "sub            %[tmp], %[e], %[d]      \n\t"
+               "cmp            %[tmp], #32             \n\t"
+               "ble            "AP"loopout             \n\t"
+
+               "sub            %[tmp],%[e],#64 \n\t"
+
+
+       AP "octloopint:\n\t"
+               //** Oct loop
+               "vldm   %[s]!,  {d0,d1,d2,d3}           \n\t"
+               "vldm   %[d],   {d4,d5,d6,d7}           \n\t"
+
+
+               // Copy s.a into q2 (>> 24) & subtract from 255
+               "vmvn.u8        q4,     q0              \n\t"
+                       "vmvn.u8        q5,     q1      \n\t"
+               "vshr.u32       q4,     q4,$0x18        \n\t"
+                       "vshr.u32       q5,     q5,$0x18\n\t"
+
+               // Multiply into all fields
+               "vmul.u32       q4,     q8,q4           \n\t"
+                       "vmul.u32       q5,     q8,q5   \n\t"
+
+
+               // a * d  (clobbering 'd'/q7)
+               "vmull.u8       q6,     d4,d8           \n\t"
+               "vmull.u8       q2,     d5,d9           \n\t"
+                       "vmull.u8       q7,     d6,d10  \n\t"
+                       "vmull.u8       q3,     d7,d11  \n\t"
+
+               "cmp     %[tmp], %[d]\n\t"
+
+               // Shift & narrow it
+               "vshrn.u16      d8,     q6, #8          \n\t"
+               "vshrn.u16      d9,     q2, #8          \n\t"
+                       "vshrn.u16      d10,    q7, #8  \n\t"
+                       "vshrn.u16      d11,    q3, #8  \n\t"
+
+
+               // Add to s
+               "vqadd.u8       q2,     q4,q0           \n\t"
+                       "vqadd.u8       q3,     q5,q1   \n\t"
+
+               // Write it
+               "vstm           %[d]!,  {d4,d5,d6,d7}   \n\t"
+
+                "bhi    "AP"octloopint\n\t"
+
+       AP "loopout:                                    \n\t"
+//"sub %[tmp], %[d], #4\n\t"
+//"vmov.i16    d0, $0xff00 \n\t"
+//"vst1.32     d0[0],  [%[tmp]]                \n\t"
+
+               "cmp            %[d], %[e]\n\t"
+                "beq           "AP"done\n\t"
+               "sub            %[tmp],%[e], %[d]       \n\t"
+               "cmp            %[tmp],$0x04            \n\t"
+               "ble            "AP"singleloop2         \n\t"
+
+       AP "dualloop2:                                  \n\t"
+               "sub            %[tmp],%[e],$0x7        \n\t"
+       AP "dualloop2int:                               \n\t"
+               //** Trailing double
+       
+               "vldm           %[s]!,  {d0}            \n\t"
+               "vldm           %[d],   {d4}            \n\t"
+
+               "vmvn.u8        d8,     d0              \n\t"
+               "vshr.u32       d8,     d8, #24         \n\t"
+
+               "vmul.u32       d8,     d16, d8         \n\t"
+
+               "vmull.u8       q6,     d4,d8           \n\t"
+               "vshrn.u16      d8,     q6, #8          \n\t"
+               // Add to 's'
+               "vqadd.u8       d4,     d8,d0           \n\t"
+
+               "vstr.32        d4,     [%[d]]          \n\t"
+               "add            %[d],   #8              \n\t"
+
+               "cmp            %[tmp], %[d]            \n\t"
+               "bhi            "AP"dualloop2int                \n\t"
+
+               // Single ??
+               "cmp            %[e], %[d]              \n\t"
+               "beq            "AP"done                \n\t"
+
+       AP"singleloop2:                                 \n\t"
+               "vld1.32        d0[0],  [%[s]]          \n\t"
+               "vld1.32        d4[0],  [%[d]]          \n\t"
+
+               "vmvn.u8        d8,     d0              \n\t"
+               "vshr.u32       d8,     d8, #24         \n\t"
+
+               "vmul.u32       d8,     d8, d16         \n\t"
+
+               "vmull.u8       q6,     d8,d4           \n\t"
+               "vshrn.u16      d8,     q6, #8          \n\t"
+               // Add to 's'
+               "vqadd.u8       d0,     d0,d8           \n\t"
+               "vst1.32        d0[0],  [%[d]]          \n\t"
+
+               //** Trailing single
+
+       AP"done:\n\t"
+//"sub %[tmp], %[e], #4 \n\t"
+//"vmov.i32    d0, $0xffff0000 \n\t"
+//"vst1.32     d0[0],  [%[tmp]]                \n\t"
+
+
+         : // output regs
+         // Input
+          :  [e] "r" (d + l), [d] "r" (d), [s] "r" (s), [c] "r" (c),
+                       [tmp] "r" (7)
+          : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q8","memory" // clobbered
+   );
+#undef AP
+
 }
 
 static void
 _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
-#if 0
-#ifdef NEON_INSTRINSICS_OK
-#else
-   DATA32 *e = d + l;
-#if 1
-   if (l >= 4)
-     {
-        e -= 4;
-        asm volatile (
-//                      "vmov.i32 q3, $0xff000000\n\t"
-//                      "asmloop3:\n\t"
-//                      "vld1.32 {d0-d1}, [%[s]]!\n\t"
-//                      "vmov.32 q2, q0\n\t"
-//                      "vand.32 q2, q2, q3\n\t"
-//                      "vceq.i32 q2, q2, #0\n\t"
-//                      "beq blank\n\t"
-//                      "vmov.32 d3, d0\n\t"
-//                      "vmovl.u8 q0, d1\n\t"
-//                      "vmovl.u8 q1, d3\n\t"
-//                      "\n\t"
-//                      "vmovn.u16 d1, q0\n\t"
-//                      "vmovn.u16 d3, q1\n\t"
-//                      "vmov.32 d0, d3\n\t"
-//                      "\n\t"
-//                      "vst1.32 {d0-d1}, [%[d]]!\n\t"
-                      
-//                      "cmp %[e], %[d]\n\t" // if d < e ...
-//                      "bhi asmloop3\n\t" // (if d < e) ... goto asmloop3
-//                      "b done\n\t"
-                      
-//                      "blank:\n\t"
-//                      "add %[s], %[s], #16\n\t"
-//                      "add %[d], %[d], #16\n\t"
-//                      "cmp %[e], %[d]\n\t" // if d < e ...
-//                      "bhi asmloop3\n\t" // (if d < e) ... goto asmloop3
-                      
-//                      "done:\n\t"
-                      "asmloop3:\n\t"
-                      "vld4.8 {d0-d3}, [%[s]]\n\t" // d0-d3 = s
-                      "vld4.8 {d4-d7}, [%[d]]\n\t" // d4-d7 = d
-                      "vmvn.8  d31, d3\n\t" // d31 = 255 - s.a
-                      "vmull.u8 q4, d31, d4\n\t"
-                      "vmull.u8 q5, d31, d5\n\t"
-                      "vmull.u8 q6, d31, d6\n\t"
-                      "vmull.u8 q7, d31, d7\n\t"
-                      "vrshr.u16 q8, q4, #8\n\t"
-                      "vrshr.u16 q9, q5, #8\n\t"
-                      "vraddhn.u16 d20, q4, q8\n\t"
-                      "vrshr.u16 q8, q6, #8\n\t"
-                      "vraddhn.u16 d21, q5, q9\n\t"
-                      "vrshr.u16 q9, q7, #8\n\t"
-                      "vraddhn.u16 d22, q6, q8\n\t"
-                      "vraddhn.u16 d23, q7, q9\n\t"
-                      "vqadd.u8 d20, d0, d20\n\t"
-                      "vqadd.u8 d21, d1, d21\n\t"
-                      "vqadd.u8 d22, d2, d22\n\t"
-                      "vqadd.u8 d23, d3, d23\n\t"
-                      "vst4.8 {d20-d23}, [%[d]]!\n\t"
-                      "vst4.8 {d20-d23}, [%[d]]\n\t"
-                      "add %[s], %[s], #4\n\t" // s++
-                      "add %[d], %[d], #4\n\t" // d++
-                      "cmp %[e], %[d]\n\t" // if d < e ...
-                      "bhi asmloop3\n\t" // (if d < e) ... goto asmloop3
-                      : // output regs
-                      : [s] "r" (s), [e] "r" (e), [d] "r" (d) // input
-                      : "d0", "d1", "memory" // clobbered
-                      );
-        e += 4;
-     }
-#endif   
-   while (d < e)
-     {
-        switch (*s & 0xff000000)
-          {
-          case 0:
-             break;
-          case 0xff000000:
-             *d = *s;
-             break;
-          default :
-             l = 256 - (*s >> 24);
-             *d = *s + MUL_256(l, *d);
-             break;
-          }
-        s++;  d++;
-     }
-#endif   
-#else   
-   DATA32 *e = d + l;
-   while (d < e)
-     {
-        switch (*s & 0xff000000)
-          {
-          case 0:
-             break;
-          case 0xff000000:
-             *d = *s;
-             break;
-          default :
-             l = 256 - (*s >> 24);
-             *d = *s + MUL_256(l, *d);
-             break;
-          }
-        s++;  d++;
-     }
-#endif   
+#define AP "blend_pas_dp_"
+   DATA32 *e = d + l,*tmp  = e + 32,*pl=(void*)912;
+      asm volatile (
+               "vmov.i8        q8,     #1                      \n\t"
+       AP"loopchoose:                                          \n\t"
+               // If aligned - go as fast as we can
+               "andS   %[tmp], %[d],   #31             \n\t"
+               "beq    "AP"quadstart                           \n\t"
+
+               // See if we can at least do our double loop
+               "andS   %[tmp], %[d], $0x7                      \n\t"
+               "beq    "AP"dualstart                           \n\t"
+
+       // Ugly single word version
+       AP "singleloop:                                         \n\t"
+               "vld1.32        d0[0], [%[s]]!                  \n\t"
+               "vld1.32        d4[0], [%[d]]                   \n\t"
+
+               "vmvn.u8        d8,     d0                      \n\t"
+
+               "vshr.u32       d8,     d8,$0x18                \n\t"
+
+               // Multiply into all fields
+               "vmul.u32       d8,     d8, d16                 \n\t"
+
+               // Multiply out
+               "vmull.u8       q6,     d8, d4                  \n\t"
+
+               "vshrn.u16      d8,     q6, #8                  \n\t"
+
+               // Add to s
+               "vqadd.u8       d0,     d0,d8                   \n\t"
+               "vst1.32        d0[0], [%[d]]!                  \n\t"
+
+       AP"dualstart:                                           \n\t"
+               "sub            %[tmp], %[e], %[d]              \n\t"
+               "cmp            %[tmp], #32                     \n\t"
+               "blt            "AP"loopout                     \n\t"
+
+               // If aligned - go as fast as we can
+               "andS           %[tmp], %[d], #31               \n\t"
+               "beq            "AP"quadstart                   \n\t"
+
+
+       AP"dualloop:                                            \n\t"
+
+               "vldm   %[s]!,  {d0}                            \n\t"
+               "vldm   %[d],   {d4}                            \n\t"
+
+               // Subtract from 255 (i.e. negate) and extract alpha channel
+               "vmvn.u8        d8,     d0                      \n\t"
+               "vshr.u32       d8,     d8,$0x18                \n\t"
+
+               // Multiply into all fields
+               "vmul.u32       d8,     d8, d16                 \n\t"
+
+               // Multiply out
+               "vmull.u8       q6,     d8, d4                  \n\t"
+
+               "vshrn.u16      d8,     q6, #8                  \n\t"
+
+               // Add to s
+               "vqadd.u8       d0,     d0,d8                   \n\t"
+               "vstm           %[d]!,  {d0}                    \n\t"
+
+               "andS           %[tmp], %[d], $0x1f             \n\t"
+               "bne            "AP"dualloop                    \n\t"
+
+
+        AP"quadstart:                                          \n\t"
+               "sub            %[tmp], %[e], %[d]              \n\t"
+               "cmp            %[tmp], #32                     \n\t"
+               "blt            "AP"loopout                     \n\t"
+
+               "sub            %[tmp], %[e],  #31              \n\t"
+
+        AP"quadloop:\n\t"
+               "vldm   %[s]!,  {d0,d1,d2,d3}                   \n\t"
+               "vldm   %[d],   {d4,d5,d6,d7}                   \n\t"
+
+               // Subtract from 255 (i.e. negate) and extract alpha channel
+               "vmvn.u8        q4,     q0                      \n\t"
+                       "vmvn.u8        q5,     q1              \n\t"
+               "vshr.u32       q4,     q4,$0x18                \n\t"
+                       "vshr.u32       q5,     q5,$0x18        \n\t"
+
+               // Prepare to preload
+               "add    %[pl], %[s], #32\n\t"
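+               // pl is 32 bytes past the current source pointer; preload it
+               // while the multiplies run, then do the same for the destination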
+
+               // Multiply into all fields
+               "vmul.u32       q4,     q4, q8                  \n\t"
+                       "vmul.u32       q5,     q5, q8          \n\t"
+               "pld    [%[pl]]\n\t"
+
+               // Multiply out
+               "vmull.u8       q6,     d8, d4                  \n\t"
+                       "vmull.u8       q7,     d10, d6         \n\t"
+               "vmull.u8       q2,     d9, d5                  \n\t"
+                       "vmull.u8       q3,     d11, d7         \n\t"
+
+               "add    %[pl], %[d], #32\n\t"
+
+               "vshrn.u16      d8,     q6, #8                  \n\t"
+                       "vshrn.u16      d10,    q7, #8          \n\t"
+               "vshrn.u16      d9,     q2, #8                  \n\t"
+                       "vshrn.u16      d11,    q3, #8          \n\t"
+               "pld    [%[pl]]\n\t"
+
+               "cmp            %[tmp], %[pl]                   \n\t"
+               // Add to s
+               "vqadd.u8       q0,     q0,q4                   \n\t"
+                       "vqadd.u8       q1,     q1,q5           \n\t"
+
+               "vstm           %[d]!,  {d0,d1,d2,d3}           \n\t"
+
+               "bhi            "AP"quadloop                    \n\t"
+
+       AP "loopout:                                            \n\t"
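+               // Tail: fewer than 8 pixels left.  Blend two at a time, then
+               // at most one final pixel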
+               "cmp            %[d], %[e]                      \n\t"
+                "beq           "AP"done                        \n\t"
+
+               "sub            %[tmp],%[e], %[d]               \n\t"
+               "cmp            %[tmp],$0x04                    \n\t"
+               "beq            "AP"singleloop2                 \n\t"
+
+               "sub            %[tmp],%[e],$0x7        \n\t"
+
+       AP"dualloop2:                                           \n\t"
+               "vldm   %[s]!,  {d0}                            \n\t"
+               "vldm   %[d],   {d4}                            \n\t"
+
+               // Subtract from 255 (i.e. negate) and extract alpha channel
+               "vmvn.u8        d8,     d0                      \n\t"
+               "vshr.u32       d8,     d8,$0x18                \n\t"
+
+               // Multiply into all fields
+               "vmul.u32       d8,     d8, d16                 \n\t"
+
+               // Multiply out
+               "vmull.u8       q6,     d8, d4                  \n\t"
+
+               "vshrn.u16      d8,     q6, #8                  \n\t"
+
+               // Add to s
+               "vqadd.u8       d0,     d0,d8                   \n\t"
+
+               "vstm           %[d]!,  {d0}                    \n\t"
+               "cmp            %[tmp], %[d]                    \n\t"
+
+               "bhi            "AP"dualloop2                   \n\t"
+
+               // One pixel left?
+               "cmp            %[e], %[d]              \n\t"
+               "beq            "AP"done                \n\t"
+
+       AP "singleloop2:                                        \n\t"
+               "vld1.32        d0[0], [%[s]]                   \n\t"
+               "vld1.32        d4[0], [%[d]]                   \n\t"
+
+               "vmvn.u8        d8,     d0                      \n\t"
+
+               "vshr.u32       d8,     d8,$0x18                \n\t"
+
+               // Multiply into all fields
+               "vmul.u32       d8,     d8, d16                 \n\t"
+
+               // Multiply out
+               "vmull.u8       q6,     d8, d4                  \n\t"
+
+               "vshrn.u16      d8,     q6, #8                  \n\t"
+
+               // Add to s
+               "vqadd.u8       d0,     d0,d8                   \n\t"
+
+               "vst1.32        d0[0], [%[d]]                   \n\t"
+       AP "done:\n\t"
+
+
+         : /* Out */
+         : /* In */  [s] "r" (s), [e] "r" (e), [d] "r" (d), [tmp] "r" (tmp),
+               [pl] "r" (pl)
+         : /* Clobbered */
+                "q0","q1","q2","q3","q4","q5","q6","q7","q8","cc","memory"
+      );
+#undef AP
 }
 
 #define _op_blend_pan_dp_neon NULL
index c0fc00d..8ec17cc 100644 (file)
@@ -4,58 +4,87 @@
 #ifdef BUILD_NEON
 static void
 _op_copy_c_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
-   uint32_t *e;
-   uint32_t dalign = ((uint32_t)d) & 0xf; // get alignment
-   // handle unaligned stores - stores not aligned to 16bytes may suck
-   if (dalign > 0)
-     {
-        dalign = (16 - dalign) >> 2;
-        if (l < dalign) dalign = l;
-        l -= dalign;
-        e = d + dalign;
-        for (; d < e; d++) {
-           *d = c; // OP
-        }
-        if (l <= 0) return;
-     }
-   e = d + l;
-#ifdef NEON_INSTRINSICS_OK
-   e -= 15;
-   // expand the color in c to a 128 bit register as "cccc" i.e 4 pixels of c
-   uint32x4_t col = vdupq_n_u32(c);
-   // fill a run of 4x4 (16) pixels with the color
-   for (; d < e; d += 16) {
-      vst1q_u32(d+0, col); // OP
-      vst1q_u32(d+4, col); // OP
-      vst1q_u32(d+8, col); // OP
-      vst1q_u32(d+12, col); // OP
-   }
-   e += 15;
-#else
-   if ((e - d) >= 16)
-     {
-        e -= 31;
-        asm volatile (
-                      "vdup.32 q8, %[c]\n\t"
-                      "asmloop1:\n\t"
-//                      "pld [%[d], #128]\n\t"
-                      "cmp %[e], %[d]\n\t"
-                      "vst1.32 {d16-d17}, [%[d],:128]!\n\t"
-                      "vst1.32 {d16-d17}, [%[d],:128]!\n\t"
-                      "vst1.32 {d16-d17}, [%[d],:128]!\n\t"
-                      "vst1.32 {d16-d17}, [%[d],:128]!\n\t"
-                      "bhi asmloop1\n\t"
-                      : // output regs
-                      : [c] "r" (c), [e] "r" (e), [d] "r" (d) // input
-                      : "q8", "d16", "d17", "memory" // clobbered
-                      );
-        e += 31;
-     }
-#endif
-   // fixup any leftover pixels in the run
-   for (; d < e; d++) {
-      *d = c; // OP
-   }
+#define AP "COPY_C_DP_"
+   uint32_t *e = d + l, *tmp;
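+   /*
+    * Scalar equivalent: for (; d < e; d++) *d = c;
+    * d is aligned to a 32-byte boundary with one 4-byte and then 8-byte
+    * stores, after which the quad loop writes 8 pixels per pass.
+    */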
+   asm volatile (
+
+               "vdup.i32       q0,     %[c]            \n\t"
+
+               // Already 32-byte aligned? Go straight to the quad stores
+               "andS           %[tmp], %[d], $0x1f     \n\t"
+               "beq            "AP"quadstart           \n\t"
+
+               // 8-byte aligned already? Then skip the single store
+               "andS           %[tmp], %[d], $0x4      \n\t"
+               "beq            "AP"dualstart           \n\t"
+
+       // Only once
+       AP"singleloop:                                  \n\t"
+               "vst1.32        d0[0],  [%[d]]          \n\t"
+               "add            %[d], #4                \n\t"
+
+       // Up to 3 times
+       AP"dualstart:                                   \n\t"
+               "sub            %[tmp], %[e], %[d]      \n\t"
+               "cmp            %[tmp], #32             \n\t"
+               "blt            "AP"loopout             \n\t"
+
+       AP"dualloop:                                    \n\t"
+               "vstr.64        d0, [%[d]]              \n\t"
+
+               "add            %[d], #8                \n\t"
+               "andS           %[tmp], %[d], $0x1f     \n\t"
+               "bne            "AP"dualloop            \n\t"
+
+
+       AP"quadstart:                                   \n\t"
+               "sub            %[tmp], %[e], %[d]      \n\t"
+               "cmp            %[tmp], #32             \n\t"
+               "blt            "AP"loopout             \n\t"
+
+               "vmov           q1, q0                  \n\t"
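+               // q0 and q1 both hold the colour, so each vstm below
+               // writes 8 pixels (32 bytes)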
+               "sub            %[tmp],%[e],#31         \n\t"
+
+       AP "quadloop:                                   \n\t"
+               "vstm           %[d]!,  {d0,d1,d2,d3}   \n\t"
+
+               "cmp            %[tmp], %[d]            \n\t"
+                "bhi           "AP"quadloop            \n\t"
+
+
+       AP "loopout:                                    \n\t"
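+               // Tail: fewer than 32 bytes left - 8-byte stores, then at
+               // most one final 4-byte store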
+               "cmp            %[d], %[e]              \n\t"
+                "beq           "AP"done                \n\t"
+               "sub            %[tmp],%[e], %[d]       \n\t"
+               "cmp            %[tmp],$0x04            \n\t"
+               "beq            "AP"singleloop2         \n\t"
+
+       AP "dualloop2:                                  \n\t"
+               "sub            %[tmp],%[e],#7          \n\t"
+       AP "dualloop2int:                               \n\t"
+               "vstr.64        d0, [%[d]]              \n\t"
+
+               "add            %[d], #8                \n\t"
+               "cmp            %[tmp], %[d]            \n\t"
+               "bhi            "AP"dualloop2int        \n\t"
+
+               // One pixel left?
+               "cmp            %[e], %[d]              \n\t"
+               "beq            "AP"done                \n\t"
+
+       AP "singleloop2:                                \n\t"
+               "vst1.32        d0[0], [%[d]]           \n\t"
+
+       AP "done:\n\t"
+
+               : // No output regs
+               // Input
+               : [c] "r" (c), [e] "r" (e), [d] "r" (d),[tmp] "r" (tmp)
+               // Clobbered
+               : "q0","q1","cc","memory"
+   );
+#undef AP
 }
 
 #define _op_copy_cn_dp_neon _op_copy_c_dp_neon