asm volatile (
".fpu neon \n\t"
//** init
- "vmov.i8 q8, $0x1 \n\t"
+ "vmov.i8 q8, $0x1 \n\t"
AP "loopchoose: \n\t"
// If aligned already - straight to octs
- "andS %[tmp], %[d],$0x1f \n\t"
- "beq "AP"octloops \n\t"
+ "andS %[tmp], %[d],$0x1f \n\t"
+ "beq "AP"octloops \n\t"
- "andS %[tmp], %[d],$0xf \n\t"
- "beq "AP"quadloops \n\t"
+ "andS %[tmp], %[d],$0xf \n\t"
+ "beq "AP"quadloops \n\t"
- "andS %[tmp], %[d],$0x4 \n\t"
- "beq "AP"dualloop \n\t"
+ "andS %[tmp], %[d],$0x4 \n\t"
+ "beq "AP"dualloop \n\t"
// Only ever executes once, fall through to dual
AP "singleloop: \n\t"
"cmp %[tmp], #32 \n\t"
"ble "AP"loopout \n\t"
- "sub %[tmp],%[e],#64 \n\t"
+ "sub %[tmp],%[e],#64 \n\t"
AP "octloopint:\n\t"
"bhi "AP"octloopint\n\t"
AP "loopout: \n\t"
-//"sub %[tmp], %[d], #4\n\t"
-//"vmov.i16 d0, $0xff00 \n\t"
-//"vst1.32 d0[0], [%[tmp]] \n\t"
-
- "cmp %[d], %[e]\n\t"
- "beq "AP"done\n\t"
+ "cmp %[d], %[e] \n\t"
+ "beq "AP"done \n\t"
"sub %[tmp],%[e], %[d] \n\t"
"cmp %[tmp],$0x04 \n\t"
"ble "AP"singleloop2 \n\t"
"add %[d], #8 \n\t"
"cmp %[tmp], %[d] \n\t"
- "bhi "AP"dualloop2int \n\t"
+ "bhi "AP"dualloop2int \n\t"
// Single ??
"cmp %[e], %[d] \n\t"
#define AP "blend_pas_dp_"
DATA32 *e = d + l,*tmp = e + 32,*pl=(void*)912;
asm volatile (
- ".fpu neon \n\t"
+ ".fpu neon \n\t"
"vmov.i8 q8, #1 \n\t"
AP"loopchoose: \n\t"
// If aligned - go as fast as we can
- "andS %[tmp], %[d], #31 \n\t"
+ "andS %[tmp], %[d], #31 \n\t"
"beq "AP"quadstart \n\t"
// See if we can at least do our double loop
"vshr.u32 q5, q5,$0x18 \n\t"
// Prepare to preload
- "add %[pl], %[s], #32\n\t"
+ "add %[pl], %[s], #32 \n\t"
// Multiply into all fields
"vmul.u32 q4, q4, q8 \n\t"
"vmul.u32 q5, q5, q8 \n\t"
- "pld [%[pl]]\n\t"
+ "pld [%[pl]] \n\t"
// Multiply out
"vmull.u8 q6, d8, d4 \n\t"
"vmull.u8 q2, d9, d5 \n\t"
"vmull.u8 q3, d11, d7 \n\t"
- "add %[pl], %[d], #32\n\t"
+ "add %[pl], %[d], #32 \n\t"
"vqrshrn.u16 d8, q6, #8 \n\t"
"vqrshrn.u16 d10, q7, #8 \n\t"
"vqrshrn.u16 d9, q2, #8 \n\t"
"vqrshrn.u16 d11, q3, #8 \n\t"
- "pld [%[pl]]\n\t"
+ "pld [%[pl]] \n\t"
"cmp %[tmp], %[pl] \n\t"
// Add to s
"cmp %[tmp],$0x04 \n\t"
"beq "AP"singleloop2 \n\t"
- "sub %[tmp],%[e],$0x7 \n\t"
+ "sub %[tmp],%[e],$0x7 \n\t"
// dualloop2: each pass loads d0 (64 bits) from [s] with writeback ('!')
// and d4 (64 bits) from [d].
// NOTE(review): presumably two 32-bit pixels per pass - confirm against DATA32.
AP"dualloop2: \n\t"
- "vldm %[s]!, {d0) \n\t"
- "vldm %[d], {d4} \n\t"
// The vldm register list must be closed with '}'; "{d0)" does not assemble.
+ "vldm %[s]!, {d0} \n\t"
+ "vldm %[d], {d4} \n\t"
// Subtract from 255 (ie negate) and extract alpha channel
"vmvn.u8 d8, d0 \n\t"
// Loop back while the condition flags read "unsigned higher".
// NOTE(review): the compare that sets these flags is not visible in this span.
"bhi "AP"dualloop2 \n\t"
// Single ??
- "cmp %[e], %[d] \n\t"
- "beq "AP"done \n\t"
+ "cmp %[e], %[d] \n\t"
+ "beq "AP"done \n\t"
AP "singleloop2: \n\t"
"vld1.32 d0[0], [%[s]] \n\t"