#ifdef BUILD_NEON
static void
_op_blend_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) {
- DATA32 *e, a = 256 - (c >> 24);
- UNROLL8_PLD_WHILE(d, l, e,
- {
- *d = c + MUL_256(a, *d);
- d++;
- });
+   DATA32 *e = d + l, *tmp;
+#define AP "B_C_DP"
+ asm volatile (
+ "vdup.u32 q6, %[c] \n\t"
+ "vmov.i8 q5, #1 \n\t"
+ "vmvn.u8 q7,q6 \n\t"
+ "vshr.u32 q7, q7, $0x18 \n\t"
+ "vmul.u32 q7,q5, q7 \n\t"
+ "bic %[e], #3 \n\t"
+ "bic %[d], #3 \n\t"
+
+ AP "loopchoose: \n\t"
+ // If aligned already - straight to quads
+ "andS %[tmp], %[d],$0x1f \n\t"
+ "beq "AP"quadloops \n\t"
+
+ "andS %[tmp], %[d],$0x4 \n\t"
+ "beq "AP"dualloop \n\t"
+
+ // Only ever executes once, fall through to dual
+ AP "singleloop: \n\t"
+   // Blend one pixel so 'd' becomes 8-byte aligned
+ "vld1.32 d0[0], [%[d]] \n\t"
+   // d * inverse-alpha, 16 bit, into q0
+ "vmull.u8 q0, d0, d14 \n\t"
+ "vshrn.u16 d0, q0, #8 \n\t"
+ "vadd.u8 d0, d12, d0 \n\t"
+ "vst1.32 d0[0], [%[d]] \n\t"
+
+ "add %[d], #4 \n\t"
+
+ // Can we go the fast path?
+ "andS %[tmp], %[d],$0x1f \n\t"
+ "beq "AP"quadloops \n\t"
+
+ AP "dualloop: \n\t"
+ "sub %[tmp], %[e], %[d] \n\t"
+ "cmp %[tmp], #32 \n\t"
+ "blt "AP"loopout \n\t"
+
+
+ AP "dualloopint: \n\t"
+ "vldr.32 d0, [%[d]] \n\t"
+ "vmull.u8 q1, d0, d14 \n\t"
+ "vshrn.u16 d0, q1, #8 \n\t"
+ "vqadd.u8 d0, d0, d12 \n\t"
+
+ "vstm %[d]!, {d0} \n\t"
+
+ "ands %[tmp], %[d], $0x1f \n\t"
+ "bne "AP"dualloopint \n\t"
+
+ AP "quadloops: \n\t"
+ "sub %[tmp], %[e], %[d] \n\t"
+ "cmp %[tmp], #32 \n\t"
+ "blt "AP"loopout \n\t"
+
+ "sub %[tmp],%[e],#31 \n\t"
+
+ AP "quadloopint:\n\t"
+ "vldm %[d], {d0,d1,d2,d3} \n\t"
+
+ "vmull.u8 q2, d0, d14 \n\t"
+ "vmull.u8 q3, d1, d15 \n\t"
+ "vmull.u8 q4, d2, d14 \n\t"
+ "vmull.u8 q5, d3, d15 \n\t"
+
+ "vshrn.u16 d0, q2, #8 \n\t"
+ "vshrn.u16 d1, q3, #8 \n\t"
+ "vshrn.u16 d2, q4, #8 \n\t"
+ "vshrn.u16 d3, q5, #8 \n\t"
+
+ "vqadd.u8 q0, q6, q0 \n\t"
+ "vqadd.u8 q1, q6, q1 \n\t"
+
+ "vstm %[d]!, {d0,d1,d2,d3} \n\t"
+
+ "cmp %[tmp], %[d]\n\t"
+ "bhi "AP"quadloopint\n\t"
+
+ AP "loopout: \n\t"
+ "cmp %[d], %[e]\n\t"
+ "beq "AP"done\n\t"
+ "sub %[tmp],%[e], %[d] \n\t"
+ "cmp %[tmp],#8 \n\t"
+ "blt "AP"singleloop2 \n\t"
+
+ AP "dualloop2: \n\t"
+ "sub %[tmp],%[e],$0x7 \n\t"
+ AP "dualloop2int: \n\t"
+ "vldr.64 d0, [%[d]] \n\t"
+ "vmull.u8 q1, d0, d14 \n\t"
+ "vshrn.u16 d0, q1, #8 \n\t"
+ "vqadd.u8 d0, d0, d12 \n\t"
+
+ "vstr.64 d0, [%[d]] \n\t"
+
+ "add %[d], #8 \n\t"
+ "cmp %[tmp], %[d] \n\t"
+ "bhi "AP"dualloop2int \n\t"
+
+   // A single pixel left?
+ "cmp %[e], %[d] \n\t"
+ "beq "AP"done \n\t"
+
+ AP "singleloop2: \n\t"
+ "vld1.32 d0[0], [%[d]] \n\t"
+ "vmull.u8 q1, d0, d14 \n\t"
+ "vshrn.u16 d0, q1, #8 \n\t"
+ "vqadd.u8 d0, d0, d12 \n\t"
+
+ "vst1.32 d0[0], [%[d]] \n\t"
+
+ AP "done:\n\t"
+
+     : // output regs: the asm advances 'd' and writes 'e'/'tmp'
+       [e] "+r" (e), [d] "+r" (d), [tmp] "=&r" (tmp)
+     : // Input
+       [c] "r" (c)
+ : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","memory" // clobbered
+
+ );
+#undef AP
+
}
#define _op_blend_caa_dp_neon _op_blend_c_dp_neon
static void
_op_blend_mas_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int l) {
DATA32 *e;
-   int alpha = 256 - (c >> 24);
+   DATA32 *tmp, x;
- UNROLL8_PLD_WHILE(d, l, e,
- {
- DATA32 a = *m;
- switch(a)
- {
- case 0:
- break;
- case 255:
- *d = c + MUL_256(alpha, *d);
- break;
- default:
- {
- DATA32 mc = MUL_SYM(a, c);
- a = 256 - (mc >> 24);
- *d = mc + MUL_256(a, *d);
- }
- break;
- }
- m++; d++;
- });
+#define AP "blend_mas_c_dp_"
+ asm volatile (
+ " vdup.i32 q15, %[c] \n\t"
+ " vmov.i8 q14, #1 \n\t"
+
+ // If aligned already - straight to quads
+ " andS %[tmp], %[d],$0xf \n\t"
+ " beq "AP"quadloops \n\t"
+
+ " andS %[tmp], %[d],$0x4 \n\t"
+ " beq "AP"dualloop \n\t"
+
+ AP"singleloop: \n\t"
+ " vld1.8 d0[0], [%[m]]! \n\t"
+ " vld1.32 d4[0], [%[d]] \n\t"
+ " vdup.u8 d0, d0[0] \n\t"
+ " vmull.u8 q4, d0, d30 \n\t"
+ " vshrn.u16 d12, q4, #8 \n\t"
+ " vmvn.u16 d14, d12 \n\t"
+ " vshr.u32 d16, d14, #24 \n\t"
+ " vmul.u32 d16, d16, d28 \n\t"
+ " vmull.u8 q7, d16, d4 \n\t"
+ " vshrn.u16 d0, q7, #8 \n\t"
+ " vqadd.u8 d0, d0, d12 \n\t"
+ " vst1.32 d0[0], [%[d]]! \n\t"
+
+ // Can we go the fast path?
+ " andS %[tmp], %[d],$0xf \n\t"
+ " beq "AP"quadloops \n\t"
+
+ AP"dualloop: \n\t"
+ " sub %[tmp], %[e], %[d] \n\t"
+ " cmp %[tmp], #16 \n\t"
+ " blt "AP"loopout \n\t"
+
+ " vld1.16 d0[0], [%[m]]! \n\t"
+ " vldm %[d], {d4} \n\t"
+ " vmovl.u8 q0, d0 \n\t"
+ " vmovl.u8 q0, d0 \n\t"
+ " vmul.u32 q0, q14 \n\t"
+ " vmull.u8 q4, d0, d30 \n\t"
+ " vshrn.u16 d12, q4, #8 \n\t"
+ " vmvn.u16 d14, d12 \n\t"
+ " vshr.u32 d16, d14, #24 \n\t"
+ " vmul.u32 d16, d16, d28 \n\t"
+ " vmull.u8 q7, d16, d4 \n\t"
+ " vshrn.u16 d0, q7, #8 \n\t"
+ " vqadd.u8 q0, q0, q6 \n\t"
+ " vstm %[d]!, {d0} \n\t"
+
+ AP"quadloops: \n\t"
+ " sub %[tmp], %[e], %[d] \n\t"
+ " cmp %[tmp], #16 \n\t"
+ " blt "AP"loopout \n\t"
+ " sub %[tmp], %[e], #15 \n\t"
+
+ " sub %[d], #16 \n\t"
+ AP"fastloop:"
+ " add %[d], #16 \n\t"
+ " cmp %[tmp], %[d] \n\t"
+ " ble "AP"loopout \n\t"
+ AP"quadloopint: \n\t"
+// " vld1.32 d0[0], [%[m]]! \n\t"
+ " ldr.32 %[x], [%[m]] \n\t"
+ " add %[m], #4 \n\t"
+ " cmp %[x], #0 \n\t"
+ " beq "AP"fastloop \n\t"
+ " vmov.32 d0[0], %[x] \n\t"
+ " vldm %[d], {d4,d5} \n\t"
+
+ // Expand M: Fixme: Can we do this quicker?
+ " vmovl.u8 q0, d0 \n\t"
+ " vmovl.u8 q0, d0 \n\t"
+ " vmul.u32 q0, q14 \n\t"
+
+ // Multiply a * c
+ " vmull.u8 q4, d0, d30 \n\t"
+ " vmull.u8 q5, d1, d31 \n\t"
+
+ // Shorten
+ " vshrn.u16 d12, q4, #8 \n\t"
+ " vshrn.u16 d13, q5, #8 \n\t"
+
+ // extract negated alpha
+ " vmvn.u16 q7, q6 \n\t"
+ " vshr.u32 q8, q7, #24 \n\t"
+ " vmul.u32 q8, q8, q14 \n\t"
+
+ // Multiply
+ " vmull.u8 q7, d16, d4 \n\t"
+ " vmull.u8 q8, d17, d5 \n\t"
+
+ " vshrn.u16 d0, q7, #8 \n\t"
+ " vshrn.u16 d1, q8, #8 \n\t"
+
+ // Add
+ " vqadd.u8 q0, q0, q6 \n\t"
+
+ " vstm %[d]!, {d0,d1} \n\t"
+
+ " cmp %[tmp], %[d] \n\t"
+ " bhi "AP"quadloopint \n\t"
+
+ AP"loopout: \n\t"
+ " cmp %[d], %[e] \n\t"
+ " beq "AP"done \n\t"
+ " sub %[tmp],%[e], %[d] \n\t"
+ " cmp %[tmp],#4 \n\t"
+ " beq "AP"singleout \n\t"
+
+ AP "dualloop2: \n\t"
+ "sub %[tmp],%[e],$0x7 \n\t"
+ " vld1.16 d0[0], [%[m]]! \n\t"
+ " vldm %[d], {d4} \n\t"
+ " vmovl.u8 q0, d0 \n\t"
+ " vmovl.u8 q0, d0 \n\t"
+ " vmul.u32 q0, q14 \n\t"
+ " vmull.u8 q4, d0, d30 \n\t"
+ " vshrn.u16 d12, q4, #8 \n\t"
+ " vmvn.u16 d14, d12 \n\t"
+ " vshr.u32 d16, d14, #24 \n\t"
+ " vmul.u32 d16, d16, d28 \n\t"
+ " vmull.u8 q7, d16, d4 \n\t"
+ " vshrn.u16 d0, q7, #8 \n\t"
+ " vqadd.u8 q0, q0, q6 \n\t"
+ " vstm %[d]!, {d0} \n\t"
+
+ " cmp %[e], %[d] \n\t"
+ " beq "AP"done \n\t"
+
+ AP"singleout: \n\t"
+ " vld1.8 d0[0], [%[m]]! \n\t"
+ " vld1.32 d4[0], [%[d]] \n\t"
+ " vdup.u8 d0, d0[0] \n\t"
+ " vmull.u8 q4, d0, d30 \n\t"
+ " vshrn.u16 d12, q4, #8 \n\t"
+ " vmvn.u16 d14, d12 \n\t"
+ " vshr.u32 d16, d14, #24 \n\t"
+ " vmul.u32 d16, d16, d28 \n\t"
+ " vmull.u8 q7, d16, d4 \n\t"
+ " vshrn.u16 d0, q7, #8 \n\t"
+ " vqadd.u8 q0, q0, q6 \n\t"
+ " vst1.32 d0[0], [%[d]]! \n\t"
+
+ AP"done: \n\t"
+
+     : // Out
+       [d] "+r" (d), [m] "+r" (m), [tmp] "=&r" (tmp), [x] "=&r" (x)
+     : // In
+       [e] "r" (e = d + l), [c] "r" (c)
+ : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q8","q14","q15",
+ "memory" // clobbered
+ );
+#undef AP
}
+#endif
+#ifdef BUILD_NEON
static void
_op_blend_mas_can_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int l) {
- DATA32 *e;
+   DATA32 *e, *tmp, x;
-   int alpha;
- UNROLL8_PLD_WHILE(d, l, e,
- {
- alpha = *m;
- switch(alpha)
- {
- case 0:
- break;
- case 255:
- *d = c;
- break;
- default:
- alpha++;
- *d = INTERP_256(alpha, c, *d);
- break;
- }
- m++; d++;
- });
+#define AP "_blend_mas_can_dp_neon_"
+ asm volatile (
+ "vdup.u32 q9, %[c] \n\t"
+ "vmov.i8 q15, #1 \n\t"
+ "vmov.i8 q14, #0 \n\t"
+
+ // Make C 16 bit (C in q3/q2)
+ "vmovl.u8 q3, d19 \n\t"
+ "vmovl.u8 q2, d18 \n\t"
+
+ // Which loop to start
+ " andS %[tmp], %[d],$0xf \n\t"
+ " beq "AP"quadloop \n\t"
+
+ " andS %[tmp], %[d], #4 \n\t"
+ " beq "AP"dualloop \n\t"
+
+
+ AP"singleloop: \n\t"
+ " vld1.8 d0[0], [%[m]]! \n\t"
+ " vld1.32 d8[0], [%[d]] \n\t"
+ " vdup.u8 d0, d0[0] \n\t"
+ " vshr.u8 d0, d0, #1 \n\t"
+ " vmovl.u8 q0, d0 \n\t"
+ " vmovl.u8 q4, d8 \n\t"
+ " vsub.s16 q6, q2, q4 \n\t"
+ " vmul.s16 q6, q0 \n\t"
+ " vshr.s16 q6, #7 \n\t"
+ " vadd.s16 q6, q4 \n\t"
+ " vqmovun.s16 d2, q6 \n\t"
+ " vst1.32 d2[0], [%[d]]! \n\t"
+
+ " andS %[tmp], %[d], #15 \n\t"
+ " beq "AP"quadloop \n\t"
+
+ AP"dualloop: \n\t"
+ " vld1.16 d0[0], [%[m]]! \n\t"
+ " vldm %[d], {d8} \n\t"
+ " vmovl.u8 q0, d0 \n\t"
+ " vmovl.u8 q0, d0 \n\t"
+ " vmul.u32 d0, d0, d30 \n\t"
+ " vshr.u8 d0, d0, #1 \n\t"
+ " vmovl.u8 q0, d0 \n\t"
+ " vmovl.u8 q4, d8 \n\t"
+ " vsub.s16 q6, q2, q4 \n\t"
+ " vmul.s16 q6, q0 \n\t"
+ " vshr.s16 q6, #7 \n\t"
+ " vadd.s16 q6, q4 \n\t"
+ " vqmovun.s16 d2, q6 \n\t"
+ " vstm %[d]!, {d2} \n\t"
+
+ AP"quadloop: \n\t"
+ " sub %[tmp], %[e], %[d] \n\t"
+ " cmp %[tmp], #16 \n\t"
+ " blt "AP"loopout \n\t"
+ " sub %[tmp], %[e], #15 \n\t"
+
+ " sub %[d], #16 \n\t"
+ AP"fastloop: \n\t"
+ " add %[d], #16 \n\t"
+ " cmp %[tmp], %[d] \n\t"
+ " ble "AP"loopout \n\t"
+
+ AP"quadloopint: \n\t"
+ // Load the mask: 4 bytes: It has d0/d1
+ " ldr.32 %[x], [%[m]] \n\t"
+ " add %[m], #4 \n\t"
+ " cmp %[x], #0 \n\t"
+ " beq "AP"fastloop \n\t"
+ " vmov.32 d0[0], %[x] \n\t"
+
+ // Load d into d8/d9 q4
+ " vldm %[d], {d8,d9} \n\t"
+ " cmp %[x], $0xffffffff \n\t"
+ " beq "AP"quadstore \n\t"
+
+
+ // Get the alpha channel ready (m)
+ " vmovl.u8 q0, d0 \n\t"
+ " vmovl.u8 q0, d0 \n\t"
+ " vmul.u32 q0, q0,q15 \n\t"
+ // Lop a bit off to prevent overflow
+ " vshr.u8 q0, q0, #1 \n\t"
+
+ // Now make it 16 bit
+ " vmovl.u8 q1, d1 \n\t"
+ " vmovl.u8 q0, d0 \n\t"
+
+ // 16 bit 'd'
+ " vmovl.u8 q5, d9 \n\t"
+ " vmovl.u8 q4, d8 \n\t"
+
+ // Diff 'd' & 'c'
+ " vsub.s16 q7, q3, q5 \n\t"
+ " vsub.s16 q6, q2, q4 \n\t"
+
+ " vmul.s16 q7, q1 \n\t"
+ " vmul.s16 q6, q0 \n\t"
+
+ // Shift results a bit
+ " vshr.s16 q7, #7 \n\t"
+ " vshr.s16 q6, #7 \n\t"
+
+ // Add 'd'
+ " vadd.s16 q7, q5 \n\t"
+ " vadd.s16 q6, q4 \n\t"
+
+ // Make sure none are negative
+ " vqmovun.s16 d9, q7 \n\t"
+ " vqmovun.s16 d8, q6 \n\t"
+
+ " vstm %[d]!, {d8,d9} \n\t"
+
+ " cmp %[tmp], %[d] \n\t"
+ " bhi "AP"quadloopint \n\t"
+ " b "AP"loopout \n\t"
+
+ AP"quadstore: \n\t"
+ " vstm %[d]!, {d18,d19} \n\t"
+ " cmp %[tmp], %[d] \n\t"
+ " bhi "AP"quadloopint \n\t"
+
+
+ AP"loopout: \n\t"
+
+ " cmp %[e], %[d] \n\t"
+ " beq "AP"done \n\t"
+
+ " sub %[tmp],%[e], %[d] \n\t"
+ " cmp %[tmp],#8 \n\t"
+
+ " blt "AP"onebyte \n\t"
+
+ // Load the mask: 2 bytes: It has d0
+ " vld1.16 d0[0], [%[m]]! \n\t"
+
+ // Load d into d8/d9 q4
+ " vldm %[d], {d8} \n\t"
+
+ // Get the alpha channel ready (m)
+ " vmovl.u8 q0, d0 \n\t"
+ " vmovl.u8 q0, d0 \n\t"
+ " vmul.u32 d0, d0, d30 \n\t"
+ // Lop a bit off to prevent overflow
+ " vshr.u8 d0, d0, #1 \n\t"
+
+ // Now make it 16 bit
+ " vmovl.u8 q0, d0 \n\t"
+
+ // 16 bit 'd'
+ " vmovl.u8 q4, d8 \n\t"
+
+ // Diff 'd' & 'c'
+ " vsub.s16 q6, q2, q4 \n\t"
+
+ " vmul.s16 q6, q0 \n\t"
+
+ // Shift results a bit
+ " vshr.s16 q6, #7 \n\t"
+
+ // Add 'd'
+ "vadd.s16 q6, q4 \n\t"
+
+ // Make sure none are negative
+ "vqmovun.s16 d2, q6 \n\t"
+
+ "vstm %[d]!, {d2} \n\t"
+
+ "cmp %[e], %[d] \n\t"
+ "beq "AP"done \n\t"
+
+ AP"onebyte: \n\t"
+ "vld1.8 d0[0], [%[m]]! \n\t"
+ "vld1.32 d8[0], [%[d]] \n\t"
+ "vdup.u8 d0, d0[0] \n\t"
+ "vshr.u8 d0, d0, #1 \n\t"
+ "vmovl.u8 q0, d0 \n\t"
+ "vmovl.u8 q4, d8 \n\t"
+ "vsub.s16 q6, q2, q4 \n\t"
+ "vmul.s16 q6, q0 \n\t"
+ "vshr.s16 q6, #7 \n\t"
+ "vadd.s16 q6, q4 \n\t"
+ "vqmovun.s16 d2, q6 \n\t"
+ "vst1.32 d2[0], [%[d]]! \n\t"
+
+ AP"done: \n\t"
+
+     : // output regs
+       [d] "+r" (d), [m] "+r" (m), [tmp] "=&r" (tmp), [x] "=&r" (x)
+     : // Input
+       [e] "r" (e = d + l), [c] "r" (c)
+ : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q14","q15",
+ "memory" // clobbered
+
+ );
+#undef AP
}
+#endif
+#ifdef BUILD_NEON
#define _op_blend_mas_cn_dp_neon _op_blend_mas_can_dp_neon
#define _op_blend_mas_caa_dp_neon _op_blend_mas_c_dp_neon
/* blend pixel x color --> dst */
-
#ifdef BUILD_NEON
+/* Note: The optimisation relies on keeping _dest_ aligned: an unaligned
+ * store costs a pair of reads then two writes; a miss on read is 'just'
+ * two reads */
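+/* Scalar equivalent of the loop body (from the C path being replaced):
+ *   DATA32 sc = MUL4_SYM(c, *s);
+ *   *d = sc + MUL_256(256 - (sc >> 24), *d);
+ */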
static void
_op_blend_p_c_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) {
- DATA32 *e;
- int alpha;
- UNROLL8_PLD_WHILE(d, l, e,
- {
- DATA32 sc = MUL4_SYM(c, *s);
- alpha = 256 - (sc >> 24);
- *d = sc + MUL_256(alpha, *d);
- d++;
- s++;
- });
+#define AP "blend_p_c_dp_"
+ asm volatile (
+ // Load 'c'
+ "vdup.u32 q7, %[c] \n\t"
+ "vmov.i8 q6, #1 \n\t"
+
+ // Choose a loop
+ "andS %[tmp], %[d], $0xf \n\t"
+ "beq "AP"quadstart \n\t"
+
+ "andS %[tmp],%[d], $0x4 \n\t"
+ "beq "AP"dualloop \n\t"
+
+ AP"singleloop:"
+ "vld1.32 d0[0], [%[s]]! \n\t"
+ "vld1.32 d2[0], [%[d]] \n\t"
+    // Multiply s * c (sc, 16 bit, in q4)
+    "vmull.u8 q4, d0,d14 \n\t"
+    // Narrow: sc in d4
+    "vshrn.u16 d4, q4, #8 \n\t"
+
+    // sca in d6
+    "vmvn.u32 d6, d4 \n\t"
+ "vshr.u32 d6, d6, #24 \n\t"
+
+ "vmul.u32 d6, d12, d6 \n\t"
+
+ /* d * alpha */
+ "vmull.u8 q4, d6, d2 \n\t"
+ "vshrn.u16 d0, q4, #8 \n\t"
+
+ "vqadd.u8 d2, d0, d4 \n\t"
+
+ // Save dsc + sc
+ "vst1.32 d2[0], [%[d]]! \n\t"
+
+ // Now where?
+ // Can we go the fast path?
+ "andS %[tmp], %[d],$0xf \n\t"
+ "beq "AP"quadstart \n\t"
+
+ AP"dualloop: \n\t"
+ // Check we have enough to bother with!
+ "sub %[tmp], %[e], %[d] \n\t"
+ "cmp %[tmp], #16 \n\t"
+ "blt "AP"loopout \n\t"
+
+ // load 's' -> q0, 'd' -> q1
+ "vldm %[s]!, {d0} \n\t"
+ "vldm %[d], {d2} \n\t"
+    // Multiply s * c (sc, 16 bit, in q4)
+    "vmull.u8 q4, d0,d14 \n\t"
+    // Narrow: sc in d4
+    "vshrn.u16 d4, q4, #8 \n\t"
+
+    // sca in d6
+ "vmvn.u32 d6, d4 \n\t"
+ "vshr.u32 d6, d6, #24 \n\t"
+
+ "vmul.u32 d6, d12, d6 \n\t"
+
+ /* d * alpha */
+ "vmull.u8 q4, d6, d2 \n\t"
+ "vshrn.u16 d0, q4, #8 \n\t"
+
+ "vqadd.u8 d2, d0, d4 \n\t"
+
+ // Save dsc + sc
+ "vst1.32 d2, [%[d]]! \n\t"
+
+ AP"quadstart: \n\t"
+ "sub %[tmp], %[e], %[d] \n\t"
+ "cmp %[tmp], #16 \n\t"
+ "blt "AP"loopout \n\t"
+
+ "sub %[tmp], %[e], #15 \n\t"
+
+ AP"quadloop:\n\t"
+ // load 's' -> q0, 'd' -> q1
+ "vldm %[s]!, {d0,d1} \n\t"
+ "vldm %[d], {d2,d3} \n\t"
+    // Multiply s * c (= sc)
+ "vmull.u8 q4, d0,d14 \n\t"
+ "vmull.u8 q5, d1,d14 \n\t"
+
+ // Get sc & sc alpha
+ "vshrn.u16 d4, q4, #8 \n\t"
+ "vshrn.u16 d5, q5, #8 \n\t"
+ // sc is now in q2, 8bpp
+ // Shift out, then spread alpha for q2
+ "vmvn.u32 q3, q2 \n\t"
+ "vshr.u32 q3, q3, $0x18 \n\t"
+ "vmul.u32 q3, q6,q3 \n\t"
+
+ // Multiply 'd' by sc.alpha (dsca)
+ "vmull.u8 q4, d6,d2 \n\t"
+ "vmull.u8 q5, d7,d3 \n\t"
+
+ "vshrn.u16 d0, q4, #8 \n\t"
+ "vshrn.u16 d1, q5, #8 \n\t"
+
+ "vqadd.u8 q1, q0, q2 \n\t"
+
+ // Save dsc + sc
+ "vstm %[d]!, {d2,d3} \n\t"
+
+ "cmp %[tmp], %[d] \n\t"
+
+ "bhi "AP"quadloop \n\t"
+
+ /* Trailing stuff */
+ AP"loopout: \n\t"
+
+ "cmp %[d], %[e] \n\t"
+ "beq "AP"done\n\t"
+ "sub %[tmp],%[e], %[d] \n\t"
+ "cmp %[tmp],$0x04 \n\t"
+ "beq "AP"singleloop2 \n\t"
+
+ "sub %[tmp], %[e], #7 \n\t"
+ /* Dual loop */
+ AP"dualloop2: \n\t"
+ "vldm %[s]!, {d0} \n\t"
+ "vldm %[d], {d2} \n\t"
+    // Multiply s * c (sc, 16 bit, in q4)
+    "vmull.u8 q4, d0,d14 \n\t"
+    // Narrow: sc in d4
+    "vshrn.u16 d4, q4, #8 \n\t"
+
+    // sca in d6
+ // XXX: I can probably squash one of these 3
+ "vmvn.u32 d6, d4 \n\t"
+ "vshr.u32 d6, d6, #24 \n\t"
+ "vmul.u32 d6, d6, d12 \n\t"
+
+ /* d * alpha */
+ "vmull.u8 q4, d6, d2 \n\t"
+ "vshrn.u16 d0, q4, #8 \n\t"
+
+ "vqadd.u8 d2, d0, d4 \n\t"
+
+ // Save dsc + sc
+ "vstm %[d]!, {d2} \n\t"
+
+ "cmp %[tmp], %[d] \n\t"
+ "bhi "AP"dualloop2 \n\t"
+
+ "cmp %[d], %[e] \n\t"
+ "beq "AP"done \n\t"
+
+ AP"singleloop2: \n\t"
+ "vld1.32 d0[0], [%[s]]! \n\t"
+ "vld1.32 d2[0], [%[d]] \n\t"
+    // Multiply s * c (sc, 16 bit, in q4)
+    "vmull.u8 q4, d0,d14 \n\t"
+    // Narrow: sc in d4
+ "vshrn.u16 d4, q4, #8 \n\t"
+
+ // sca in d6
+ "vmvn.u32 d6, d4 \n\t"
+ "vshr.u32 d6, d6, #24 \n\t"
+ "vmul.u32 d6, d12,d6 \n\t"
+
+ /* d * alpha */
+ "vmull.u8 q4, d6, d2 \n\t"
+ "vshrn.u16 d0, q4, #8 \n\t"
+
+ "vqadd.u8 d2, d0, d4 \n\t"
+
+ // Save dsc + sc
+ "vst1.32 d2[0], [%[d]]! \n\t"
+
+
+ AP"done:"
+     : // Output: pointers the asm advances
+       [s] "+r" (s), [d] "+r" (d), [tmp] "=&r" (tmp)
+     : // Input
+       [e] "r" (d + l), [c] "r" (c)
+ : "q0","q1","q2","q3","q4","q5","q6","q7","memory"
+ );
+#undef AP
}
static void
_op_blend_pan_caa_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) {
- DATA32 *e;
- c = 1 + (c & 0xff);
- UNROLL8_PLD_WHILE(d, l, e,
- {
- *d = INTERP_256(c, *s, *d);
- d++;
- s++;
- });
+#define AP "_op_blend_pan_caa_dp_"
+   DATA32 *e = d + l, *tmp;
+ asm volatile (
+ /* Set up 'c' */
+ "vdup.u8 d14, %[c] \n\t"
+ "vmov.i8 d15, #1 \n\t"
+ "vaddl.u8 q15, d14, d15 \n\t"
+ "vshr.u8 q15,#1 \n\t"
+
+ // Pick a loop
+ "andS %[tmp], %[d], $0xf \n\t"
+ "beq "AP"quadstart \n\t"
+
+ "andS %[tmp], %[d], $0x4 \n\t"
+ "beq "AP"dualstart \n\t"
+
+ AP"singleloop: \n\t"
+ "vld1.32 d4[0], [%[d]] \n\t"
+ "vld1.32 d0[0], [%[s]]! \n\t"
+
+ // Long version of 'd'
+ "vmovl.u8 q8, d4 \n\t"
+
+ // Long version of 's'
+ "vmovl.u8 q6, d0 \n\t"
+
+ // d8 = s -d
+ "vsub.s16 d8, d12, d16 \n\t"
+
+ // Multiply
+ "vmul.s16 d8, d8, d30 \n\t"
+
+ // Shift down
+ "vshr.s16 d8, #7 \n\t"
+
+ // Add 'd'
+ "vqadd.s16 d8, d8, d16 \n\t"
+
+ // Shrink to save
+ "vqmovun.s16 d0, q4 \n\t"
+ "vst1.32 d0[0], [%[d]]! \n\t"
+
+ // Now where?
+ "andS %[tmp], %[d], $0xf \n\t"
+ "beq "AP"quadstart \n\t"
+
+ AP"dualstart: \n\t"
+ // Check we have enough
+ "sub %[tmp], %[e], %[d] \n\t"
+ "cmp %[tmp], #16 \n\t"
+ "blt "AP"loopout \n\t"
+
+ AP"dualloop:"
+ "vldm %[d], {d4} \n\t"
+ "vldm %[s]!, {d0} \n\t"
+
+ // Long version of d
+ "vmovl.u8 q8, d4 \n\t"
+
+ // Long version of s
+ "vmovl.u8 q6, d0 \n\t"
+
+ // q4/q5 = s-d
+ "vsub.s16 q4, q6, q8 \n\t"
+
+ // Multiply
+ "vmul.s16 q4, q4,q15 \n\t"
+
+ // Shift down
+ "vshr.s16 q4, #7 \n\t"
+
+ // Add d
+ "vqadd.s16 q4, q4, q8 \n\t"
+
+ // Shrink to save
+ "vqmovun.s16 d0, q4 \n\t"
+
+ "vstm %[d]!, {d0} \n\t"
+ AP"quadstart: \n\t"
+ "sub %[tmp], %[e], %[d] \n\t"
+ "cmp %[tmp], #16 \n\t"
+ "blt "AP"loopout \n\t"
+
+ "sub %[tmp], %[e], #15 \n\t"
+
+ AP"quadloop: \n\t"
+ // load 's' -> q0, 'd' -> q2
+ "vldm %[d], {d4,d5} \n\t"
+ "vldm %[s]!, {d0,d1} \n\t"
+
+ // Long version of d
+ "vmovl.u8 q8, d4 \n\t"
+ "vmovl.u8 q9, d5 \n\t"
+
+ // Long version of s
+ "vmovl.u8 q6, d0 \n\t"
+ "vmovl.u8 q7, d1 \n\t"
+
+ // q4/q5 = s-d
+ "vsub.s16 q4, q6, q8 \n\t"
+ "vsub.s16 q5, q7, q9 \n\t"
+
+ // Multiply
+ "vmul.s16 q4, q4,q15 \n\t"
+ "vmul.s16 q5, q5,q15 \n\t"
+
+ // Shift down
+ "vshr.s16 q4, #7 \n\t"
+ "vshr.s16 q5, #7 \n\t"
+
+ // Add d
+ "vqadd.s16 q4, q4, q8 \n\t"
+ "vqadd.s16 q5, q5, q9 \n\t"
+
+ // Shrink to save
+ "vqmovun.s16 d0, q4 \n\t"
+ "vqmovun.s16 d1, q5 \n\t"
+ "vstm %[d]!, {d0,d1} \n\t"
+ "cmp %[tmp], %[d] \n\t"
+
+ "bhi "AP"quadloop\n\t"
+
+
+ "b "AP"done\n\t"
+ AP"loopout: \n\t"
+ "cmp %[d], %[e] \n\t"
+ "beq "AP"done\n\t"
+ "sub %[tmp],%[e], %[d] \n\t"
+ "cmp %[tmp],$0x04 \n\t"
+ "beq "AP"singleloop2 \n\t"
+
+ AP"dualloop2: \n\t"
+ "vldm %[d], {d4} \n\t"
+ "vldm %[s]!, {d0} \n\t"
+
+ // Long version of d
+ "vmovl.u8 q8, d4 \n\t"
+
+ // Long version of s
+ "vmovl.u8 q6, d0 \n\t"
+
+ // q4/q5 = s-d
+ "vsub.s16 q4, q6, q8 \n\t"
+
+ // Multiply
+ "vmul.s16 q4, q4,q15 \n\t"
+
+ // Shift down
+ "vshr.s16 q4, #7 \n\t"
+
+ // Add d
+ "vqadd.s16 q4, q4, q8 \n\t"
+
+ // Shrink to save
+ "vqmovun.s16 d0, q4 \n\t"
+
+ "vstm %[d]!, {d0} \n\t"
+
+ "cmp %[d], %[e] \n\t"
+ "beq "AP"done \n\t"
+
+ AP"singleloop2: \n\t"
+ "vld1.32 d4[0], [%[d]] \n\t"
+ "vld1.32 d0[0], [%[s]]! \n\t"
+
+ // Long version of 'd'
+ "vmovl.u8 q8, d4 \n\t"
+
+ // Long version of 's'
+ "vmovl.u8 q6, d0 \n\t"
+
+ // d8 = s -d
+ "vsub.s16 d8, d12, d16 \n\t"
+
+ // Multiply
+ "vmul.s16 d8, d8, d30 \n\t"
+
+ // Shift down
+ "vshr.s16 d8, #7 \n\t"
+
+ // Add 'd'
+ "vqadd.s16 d8, d8, d16 \n\t"
+
+ // Shrink to save
+ "vqmovun.s16 d0, q4 \n\t"
+
+ "vst1.32 d0[0], [%[d]] \n\t"
+
+
+ AP"done: \n\t"
+
+    // Output: 's', 'd' and 'tmp' are written inside the asm
+    : [s] "+r" (s), [d] "+r" (d), [tmp] "=&r" (tmp)
+    // Input
+    : [e] "r" (e), [c] "r" (c)
+    // Clobbered
+    : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q15", "memory"
+ );
+#undef AP
}
#define _op_blend_pas_c_dp_neon _op_blend_p_c_dp_neon
#ifdef BUILD_NEON
static void
_op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
- DATA32 *e = d + l;
- while (d < e) {
- l = 256 - (*s >> 24);
- *d = *s++ + MUL_256(l, *d);
- d++;
- }
+#define AP "blend_p_dp_"
+ asm volatile (
+ //** init
+ "vmov.i8 q8, $0x1 \n\t"
+
+ AP "loopchoose: \n\t"
+ // If aligned already - straight to octs
+ "andS %[tmp], %[d],$0x1f \n\t"
+ "beq "AP"octloops \n\t"
+
+ "andS %[tmp], %[d],$0xf \n\t"
+ "beq "AP"quadloops \n\t"
+
+ "andS %[tmp], %[d],$0x4 \n\t"
+ "beq "AP"dualloop \n\t"
+
+ // Only ever executes once, fall through to dual
+ AP "singleloop: \n\t"
+ "vld1.32 d0[0], [%[s]]! \n\t"
+ "vld1.32 d4[0], [%[d]] \n\t"
+
+ "vmvn.u8 d8, d0 \n\t"
+ "vshr.u32 d8, d8, #24 \n\t"
+
+ "vmul.u32 d8, d16, d8 \n\t"
+
+ "vmull.u8 q6, d4,d8 \n\t"
+ "vshrn.u16 d8, q6, #8 \n\t"
+ // Add to 's'
+ "vqadd.u8 q2, q4,q0 \n\t"
+
+ "vst1.32 d4[0], [%[d]] \n\t"
+ "add %[d], #4 \n\t"
+
+ // Can we go the fast path?
+ "andS %[tmp], %[d],$0x1f \n\t"
+ "beq "AP"octloops \n\t"
+
+ "andS %[tmp], %[d],$0x0f \n\t"
+ "beq "AP"quadloops \n\t"
+
+
+ AP "dualloop: \n\t"
+ "sub %[tmp], %[e], %[d] \n\t"
+ "cmp %[tmp], #32 \n\t"
+ "blt "AP"loopout \n\t"
+
+ AP "dualloopint: \n\t"
+ //** Dual Loop
+ "vldm %[s]!, {d0} \n\t"
+ "vldr d4, [%[d]] \n\t"
+
+ "vmvn.u8 d8, d0 \n\t"
+ "vshr.u32 d8, d8, #24 \n\t"
+
+ "vmul.u32 d8, d16, d8 \n\t"
+
+ "vmull.u8 q6, d4,d8 \n\t"
+ "vshrn.u16 d8, q6, #8 \n\t"
+ // Add to 's'
+ "vqadd.u8 d4, d8,d0 \n\t"
+ "vstr d4, [%[d]] \n\t"
+ "add %[d], #8 \n\t"
+
+ "ands %[tmp], %[d], $0x1f \n\t"
+ "beq "AP"octloops \n\t"
+
+ AP"quadloops: \n\t"
+ "sub %[tmp], %[e], %[d] \n\t"
+ "cmp %[tmp], #32 \n\t"
+ "blt "AP"loopout \n\t"
+
+ "vldm %[s]!, {d0,d1) \n\t"
+ "vldm %[d], {d4,d5} \n\t"
+
+
+    // Invert 's' and shift: (255 - s.a) ends up in q4
+ "vmvn.u8 q4, q0 \n\t"
+ "vshr.u32 q4, q4,$0x18 \n\t"
+
+ // Multiply into all fields
+ "vmul.u32 q4, q8,q4 \n\t"
+
+    // a * d (clobbering the freshly loaded 'd')
+ "vmull.u8 q6, d4,d8 \n\t"
+ "vmull.u8 q2, d5,d9 \n\t"
+
+ // Shift & narrow it
+ "vshrn.u16 d8, q6, #8 \n\t"
+ "vshrn.u16 d9, q2, #8 \n\t"
+
+ // Add to s
+ "vqadd.u8 q2, q4,q0 \n\t"
+
+ // Write it
+ "vstm %[d]!, {d4,d5} \n\t"
+
+ AP "octloops: \n\t"
+ "sub %[tmp], %[e], %[d] \n\t"
+ "cmp %[tmp], #32 \n\t"
+ "ble "AP"loopout \n\t"
+
+ "sub %[tmp],%[e],#64 \n\t"
+
+
+ AP "octloopint:\n\t"
+ //** Oct loop
+ "vldm %[s]!, {d0,d1,d2,d3) \n\t"
+ "vldm %[d], {d4,d5,d6,d7} \n\t"
+
+
+    // Invert 's' and shift: (255 - s.a) ends up in q4/q5
+ "vmvn.u8 q4, q0 \n\t"
+ "vmvn.u8 q5, q1 \n\t"
+ "vshr.u32 q4, q4,$0x18 \n\t"
+ "vshr.u32 q5, q5,$0x18\n\t"
+
+ // Multiply into all fields
+ "vmul.u32 q4, q8,q4 \n\t"
+ "vmul.u32 q5, q8,q5 \n\t"
+
+
+ // a * d (clobbering 'd'/q7)
+ "vmull.u8 q6, d4,d8 \n\t"
+ "vmull.u8 q2, d5,d9 \n\t"
+ "vmull.u8 q7, d6,d10 \n\t"
+ "vmull.u8 q3, d7,d11 \n\t"
+
+ "cmp %[tmp], %[d]\n\t"
+
+ // Shift & narrow it
+ "vshrn.u16 d8, q6, #8 \n\t"
+ "vshrn.u16 d9, q2, #8 \n\t"
+ "vshrn.u16 d10, q7, #8 \n\t"
+ "vshrn.u16 d11, q3, #8 \n\t"
+
+
+ // Add to s
+ "vqadd.u8 q2, q4,q0 \n\t"
+ "vqadd.u8 q3, q5,q1 \n\t"
+
+ // Write it
+ "vstm %[d]!, {d4,d5,d6,d7} \n\t"
+
+ "bhi "AP"octloopint\n\t"
+
+ AP "loopout: \n\t"
+//"sub %[tmp], %[d], #4\n\t"
+//"vmov.i16 d0, $0xff00 \n\t"
+//"vst1.32 d0[0], [%[tmp]] \n\t"
+
+ "cmp %[d], %[e]\n\t"
+ "beq "AP"done\n\t"
+ "sub %[tmp],%[e], %[d] \n\t"
+ "cmp %[tmp],$0x04 \n\t"
+ "ble "AP"singleloop2 \n\t"
+
+ AP "dualloop2: \n\t"
+ "sub %[tmp],%[e],$0x7 \n\t"
+ AP "dualloop2int: \n\t"
+ //** Trailing double
+
+ "vldm %[s]!, {d0} \n\t"
+ "vldm %[d], {d4} \n\t"
+
+ "vmvn.u8 d8, d0 \n\t"
+ "vshr.u32 d8, d8, #24 \n\t"
+
+ "vmul.u32 d8, d16, d8 \n\t"
+
+ "vmull.u8 q6, d4,d8 \n\t"
+ "vshrn.u16 d8, q6, #8 \n\t"
+ // Add to 's'
+ "vqadd.u8 d4, d8,d0 \n\t"
+
+ "vstr.32 d4, [%[d]] \n\t"
+ "add %[d], #8 \n\t"
+
+ "cmp %[tmp], %[d] \n\t"
+ "bhi "AP"dualloop2int \n\t"
+
+    // A single pixel left?
+ "cmp %[e], %[d] \n\t"
+ "beq "AP"done \n\t"
+
+ AP"singleloop2: \n\t"
+ "vld1.32 d0[0], [%[s]] \n\t"
+ "vld1.32 d4[0], [%[d]] \n\t"
+
+ "vmvn.u8 d8, d0 \n\t"
+ "vshr.u32 d8, d8, #24 \n\t"
+
+ "vmul.u32 d8, d8, d16 \n\t"
+
+ "vmull.u8 q6, d8,d4 \n\t"
+ "vshrn.u16 d8, q6, #8 \n\t"
+ // Add to 's'
+ "vqadd.u8 d0, d0,d8 \n\t"
+ "vst1.32 d0[0], [%[d]] \n\t"
+
+ //** Trailing single
+
+ AP"done:\n\t"
+//"sub %[tmp], %[e], #4 \n\t"
+//"vmov.i32 d0, $0xffff0000 \n\t"
+//"vst1.32 d0[0], [%[tmp]] \n\t"
+
+
+     : // output regs
+       [s] "+r" (s), [d] "+r" (d), [tmp] "=&r" (tmp)
+     : // Input
+       [e] "r" (d + l), [c] "r" (c)
+ : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q8","memory" // clobbered
+ );
+#undef AP
+
}
static void
_op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
-#if 0
-#ifdef NEON_INSTRINSICS_OK
-#else
- DATA32 *e = d + l;
-#if 1
- if (l >= 4)
- {
- e -= 4;
- asm volatile (
-// "vmov.i32 q3, $0xff000000\n\t"
-// "asmloop3:\n\t"
-// "vld1.32 {d0-d1}, [%[s]]!\n\t"
-// "vmov.32 q2, q0\n\t"
-// "vand.32 q2, q2, q3\n\t"
-// "vceq.i32 q2, q2, #0\n\t"
-// "beq blank\n\t"
-// "vmov.32 d3, d0\n\t"
-// "vmovl.u8 q0, d1\n\t"
-// "vmovl.u8 q1, d3\n\t"
-// "\n\t"
-// "vmovn.u16 d1, q0\n\t"
-// "vmovn.u16 d3, q1\n\t"
-// "vmov.32 d0, d3\n\t"
-// "\n\t"
-// "vst1.32 {d0-d1}, [%[d]]!\n\t"
-
-// "cmp %[e], %[d]\n\t" // if d < e ...
-// "bhi asmloop3\n\t" // (if d < e) ... goto asmloop3
-// "b done\n\t"
-
-// "blank:\n\t"
-// "add %[s], %[s], #16\n\t"
-// "add %[d], %[d], #16\n\t"
-// "cmp %[e], %[d]\n\t" // if d < e ...
-// "bhi asmloop3\n\t" // (if d < e) ... goto asmloop3
-
-// "done:\n\t"
- "asmloop3:\n\t"
- "vld4.8 {d0-d3}, [%[s]]\n\t" // d0-d3 = s
- "vld4.8 {d4-d7}, [%[d]]\n\t" // d4-d7 = d
- "vmvn.8 d31, d3\n\t" // d31 = 255 - s.a
- "vmull.u8 q4, d31, d4\n\t"
- "vmull.u8 q5, d31, d5\n\t"
- "vmull.u8 q6, d31, d6\n\t"
- "vmull.u8 q7, d31, d7\n\t"
- "vrshr.u16 q8, q4, #8\n\t"
- "vrshr.u16 q9, q5, #8\n\t"
- "vraddhn.u16 d20, q4, q8\n\t"
- "vrshr.u16 q8, q6, #8\n\t"
- "vraddhn.u16 d21, q5, q9\n\t"
- "vrshr.u16 q9, q7, #8\n\t"
- "vraddhn.u16 d22, q6, q8\n\t"
- "vraddhn.u16 d23, q7, q9\n\t"
- "vqadd.u8 d20, d0, d20\n\t"
- "vqadd.u8 d21, d1, d21\n\t"
- "vqadd.u8 d22, d2, d22\n\t"
- "vqadd.u8 d23, d3, d23\n\t"
- "vst4.8 {d20-d23}, [%[d]]!\n\t"
- "vst4.8 {d20-d23}, [%[d]]\n\t"
- "add %[s], %[s], #4\n\t" // s++
- "add %[d], %[d], #4\n\t" // d++
- "cmp %[e], %[d]\n\t" // if d < e ...
- "bhi asmloop3\n\t" // (if d < e) ... goto asmloop3
- : // output regs
- : [s] "r" (s), [e] "r" (e), [d] "r" (d) // input
- : "d0", "d1", "memory" // clobbered
- );
- e += 4;
- }
-#endif
- while (d < e)
- {
- switch (*s & 0xff000000)
- {
- case 0:
- break;
- case 0xff000000:
- *d = *s;
- break;
- default :
- l = 256 - (*s >> 24);
- *d = *s + MUL_256(l, *d);
- break;
- }
- s++; d++;
- }
-#endif
-#else
- DATA32 *e = d + l;
- while (d < e)
- {
- switch (*s & 0xff000000)
- {
- case 0:
- break;
- case 0xff000000:
- *d = *s;
- break;
- default :
- l = 256 - (*s >> 24);
- *d = *s + MUL_256(l, *d);
- break;
- }
- s++; d++;
- }
-#endif
+#define AP "blend_pas_dp_"
+   DATA32 *e = d + l, *tmp, *pl;
+ asm volatile (
+ "vmov.i8 q8, #1 \n\t"
+ AP"loopchoose: \n\t"
+    // If aligned - go as fast as we can
+ "andS %[tmp], %[d], #31 \n\t"
+ "beq "AP"quadstart \n\t"
+
+ // See if we can at least do our double loop
+ "andS %[tmp], %[d], $0x7 \n\t"
+ "beq "AP"dualstart \n\t"
+
+ // Ugly single word version
+ AP "singleloop: \n\t"
+ "vld1.32 d0[0], [%[s]]! \n\t"
+ "vld1.32 d4[0], [%[d]] \n\t"
+
+ "vmvn.u8 d8, d0 \n\t"
+
+ "vshr.u32 d8, d8,$0x18 \n\t"
+
+    // Multiply into all fields
+ "vmul.u32 d8, d8, d16 \n\t"
+
+ // Multiply out
+ "vmull.u8 q6, d8, d4 \n\t"
+
+ "vshrn.u16 d8, q6, #8 \n\t"
+
+ // Add to s
+ "vqadd.u8 d0, d0,d8 \n\t"
+ "vst1.32 d0[0], [%[d]]! \n\t"
+
+ AP"dualstart: \n\t"
+ "sub %[tmp], %[e], %[d] \n\t"
+ "cmp %[tmp], #32 \n\t"
+ "blt "AP"loopout \n\t"
+
+    // If aligned - go as fast as we can
+ "andS %[tmp], %[d], #31 \n\t"
+ "beq "AP"quadstart \n\t"
+
+
+ AP"dualloop: \n\t"
+
+ "vldm %[s]!, {d0) \n\t"
+ "vldm %[d], {d4} \n\t"
+
+ // Subtract from 255 (ie negate) and extract alpha channel
+ "vmvn.u8 d8, d0 \n\t"
+ "vshr.u32 d8, d8,$0x18 \n\t"
+
+    // Multiply into all fields
+ "vmul.u32 d8, d8, d16 \n\t"
+
+ // Multiply out
+ "vmull.u8 q6, d8, d4 \n\t"
+
+ "vshrn.u16 d8, q6, #8 \n\t"
+
+ // Add to s
+ "vqadd.u8 d0, d0,d8 \n\t"
+ "vstm %[d]!, {d0} \n\t"
+
+ "andS %[tmp], %[d], $0x1f \n\t"
+ "bne "AP"dualloop \n\t"
+
+
+ AP"quadstart: \n\t"
+ "sub %[tmp], %[e], %[d] \n\t"
+ "cmp %[tmp], #32 \n\t"
+ "blt "AP"loopout \n\t"
+
+ "sub %[tmp], %[e], #31 \n\t"
+
+ AP"quadloop:\n\t"
+ "vldm %[s]!, {d0,d1,d2,d3) \n\t"
+ "vldm %[d], {d4,d5,d6,d7} \n\t"
+
+ // Subtract from 255 (ie negate) and extract alpha channel
+ "vmvn.u8 q4, q0 \n\t"
+ "vmvn.u8 q5, q1 \n\t"
+ "vshr.u32 q4, q4,$0x18 \n\t"
+ "vshr.u32 q5, q5,$0x18 \n\t"
+
+ // Prepare to preload
+ "add %[pl], %[s], #32\n\t"
+
+    // Multiply into all fields
+ "vmul.u32 q4, q4, q8 \n\t"
+ "vmul.u32 q5, q5, q8 \n\t"
+ "pld [%[pl]]\n\t"
+
+ // Multiply out
+ "vmull.u8 q6, d8, d4 \n\t"
+ "vmull.u8 q7, d10, d6 \n\t"
+ "vmull.u8 q2, d9, d5 \n\t"
+ "vmull.u8 q3, d11, d7 \n\t"
+
+ "add %[pl], %[d], #32\n\t"
+
+ "vshrn.u16 d8, q6, #8 \n\t"
+ "vshrn.u16 d10, q7, #8 \n\t"
+ "vshrn.u16 d9, q2, #8 \n\t"
+ "vshrn.u16 d11, q3, #8 \n\t"
+ "pld [%[pl]]\n\t"
+
+ "cmp %[tmp], %[pl] \n\t"
+ // Add to s
+ "vqadd.u8 q0, q0,q4 \n\t"
+ "vqadd.u8 q1, q1,q5 \n\t"
+
+ "vstm %[d]!, {d0,d1,d2,d3} \n\t"
+
+ "bhi "AP"quadloop \n\t"
+
+ AP "loopout: \n\t"
+ "cmp %[d], %[e] \n\t"
+ "beq "AP"done \n\t"
+
+ "sub %[tmp],%[e], %[d] \n\t"
+ "cmp %[tmp],$0x04 \n\t"
+ "beq "AP"singleloop2 \n\t"
+
+ "sub %[tmp],%[e],$0x7 \n\t"
+
+ AP"dualloop2: \n\t"
+ "vldm %[s]!, {d0) \n\t"
+ "vldm %[d], {d4} \n\t"
+
+ // Subtract from 255 (ie negate) and extract alpha channel
+ "vmvn.u8 d8, d0 \n\t"
+ "vshr.u32 d8, d8,$0x18 \n\t"
+
+    // Multiply into all fields
+ "vmul.u32 d8, d8, d16 \n\t"
+
+ // Multiply out
+ "vmull.u8 q6, d8, d4 \n\t"
+
+ "vshrn.u16 d8, q6, #8 \n\t"
+
+ // Add to s
+ "vqadd.u8 d0, d0,d8 \n\t"
+
+ "vstm %[d]!, {d0} \n\t"
+ "cmp %[tmp], %[d] \n\t"
+
+ "bhi "AP"dualloop2 \n\t"
+
+    // A single pixel left?
+ "cmp %[e], %[d] \n\t"
+ "beq "AP"done \n\t"
+
+ AP "singleloop2: \n\t"
+ "vld1.32 d0[0], [%[s]] \n\t"
+ "vld1.32 d4[0], [%[d]] \n\t"
+
+ "vmvn.u8 d8, d0 \n\t"
+
+ "vshr.u32 d8, d8,$0x18 \n\t"
+
+    // Multiply into all fields
+ "vmul.u32 d8, d8, d16 \n\t"
+
+ // Multiply out
+ "vmull.u8 q6, d8, d4 \n\t"
+
+ "vshrn.u16 d8, q6, #8 \n\t"
+
+ // Add to s
+ "vqadd.u8 d0, d0,d8 \n\t"
+
+ "vst1.32 d0[0], [%[d]] \n\t"
+ AP "done:\n\t"
+
+
+    : /* Out */ [s] "+r" (s), [d] "+r" (d), [tmp] "=&r" (tmp),
+      [pl] "=&r" (pl)
+    : /* In */ [e] "r" (e)
+ : /* Clobbered */
+ "q0","q1","q2","q3","q4","q5","q6","q7","q8","memory"
+ );
+#undef AP
}
#define _op_blend_pan_dp_neon NULL
#ifdef BUILD_NEON
static void
_op_copy_c_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
- uint32_t *e;
- uint32_t dalign = ((uint32_t)d) & 0xf; // get alignment
- // handle unaligned stores - stores not aligned to 16bytes may suck
- if (dalign > 0)
- {
- dalign = (16 - dalign) >> 2;
- if (l < dalign) dalign = l;
- l -= dalign;
- e = d + dalign;
- for (; d < e; d++) {
- *d = c; // OP
- }
- if (l <= 0) return;
- }
- e = d + l;
-#ifdef NEON_INSTRINSICS_OK
- e -= 15;
- // expand the color in c to a 128 bit register as "cccc" i.e 4 pixels of c
- uint32x4_t col = vdupq_n_u32(c);
- // fill a run of 4x4 (16) pixels with the color
- for (; d < e; d += 16) {
- vst1q_u32(d+0, col); // OP
- vst1q_u32(d+4, col); // OP
- vst1q_u32(d+8, col); // OP
- vst1q_u32(d+12, col); // OP
- }
- e += 15;
-#else
- if ((e - d) >= 16)
- {
- e -= 31;
- asm volatile (
- "vdup.32 q8, %[c]\n\t"
- "asmloop1:\n\t"
-// "pld [%[d], #128]\n\t"
- "cmp %[e], %[d]\n\t"
- "vst1.32 {d16-d17}, [%[d],:128]!\n\t"
- "vst1.32 {d16-d17}, [%[d],:128]!\n\t"
- "vst1.32 {d16-d17}, [%[d],:128]!\n\t"
- "vst1.32 {d16-d17}, [%[d],:128]!\n\t"
- "bhi asmloop1\n\t"
- : // output regs
- : [c] "r" (c), [e] "r" (e), [d] "r" (d) // input
- : "q8", "d16", "d17", "memory" // clobbered
- );
- e += 31;
- }
-#endif
- // fixup any leftover pixels in the run
- for (; d < e; d++) {
- *d = c; // OP
- }
+#define AP "COPY_C_DP_"
+   uint32_t *e = d + l, *tmp;
+ asm volatile (
+
+ "vdup.i32 q0, %[c] \n\t"
+
+ // Can we do 32 byte?
+ "andS %[tmp], %[d], $0x1f \n\t"
+ "beq "AP"quadstart \n\t"
+
+ // Can we do at least 16 byte?
+ "andS %[tmp], %[d], $0x4 \n\t"
+ "beq "AP"dualstart \n\t"
+
+ // Only once
+ AP"singleloop: \n\t"
+ "vst1.32 d0[0], [%[d]] \n\t"
+ "add %[d], #4 \n\t"
+
+ // Up to 3 times
+ AP"dualstart: \n\t"
+ "sub %[tmp], %[e], %[d] \n\t"
+ "cmp %[tmp], #32 \n\t"
+ "blt "AP"loopout \n\t"
+
+ AP"dualloop: \n\t"
+ "vstr.32 d0, [%[d]] \n\t"
+
+ "add %[d], #8 \n\t"
+ "andS %[tmp], %[d], $0x1f \n\t"
+ "bne "AP"dualloop \n\t"
+
+
+ AP"quadstart: \n\t"
+ "sub %[tmp], %[e], %[d] \n\t"
+ "cmp %[tmp], #32 \n\t"
+ "blt "AP"loopout \n\t"
+
+ "vmov q1, q0 \n\t"
+ "sub %[tmp],%[e],#31 \n\t"
+
+ AP "quadloop: \n\t"
+ "vstm %[d]!, {d0,d1,d2,d3} \n\t"
+
+ "cmp %[tmp], %[d] \n\t"
+ "bhi "AP"quadloop \n\t"
+
+
+ AP "loopout: \n\t"
+ "cmp %[d], %[e] \n\t"
+ "beq "AP"done \n\t"
+ "sub %[tmp],%[e], %[d] \n\t"
+ "cmp %[tmp],$0x04 \n\t"
+ "beq "AP"singleloop2 \n\t"
+
+ AP "dualloop2: \n\t"
+ "sub %[tmp],%[e],#7 \n\t"
+ AP "dualloop2int: \n\t"
+ "vstr.64 d0, [%[d]] \n\t"
+
+ "add %[d], #8 \n\t"
+ "cmp %[tmp], %[d] \n\t"
+ "bhi "AP"dualloop2int \n\t"
+
+    // A single pixel left?
+ "cmp %[e], %[d] \n\t"
+ "beq "AP"done \n\t"
+
+ AP "singleloop2: \n\t"
+ "vst1.32 d0[0], [%[d]] \n\t"
+
+ AP "done:\n\t"
+
+     : // Output: the asm advances 'd'; 'tmp' is scratch
+       [d] "+r" (d), [tmp] "=&r" (tmp)
+     : // Input
+       [c] "r" (c), [e] "r" (e)
+ // Clobbered
+ : "q0","q1","memory"
+
+
+ );
}
#define _op_copy_cn_dp_neon _op_copy_c_dp_neon