w = width;
#ifndef USE_GCC_INLINE_ASM
- sval = vld1_u8 ((void*)src);
- dval = vld1_u8 ((void*)dst);
+ sval = vld1_u8 ((void *)src);
+ dval = vld1_u8 ((void *)dst);
keep_dst = dst;
temp = vqadd_u8 (dval, sval);
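/* vqadd_u8 saturates per byte, so a single instruction applies the ADD operator to all eight loaded channel bytes. */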
while (w)
{
- sval = vld1_u8 ((void*)src);
- dval = vld1_u8 ((void*)dst);
+ sval = vld1_u8 ((void *)src);
+ dval = vld1_u8 ((void *)dst);
- vst1_u8 ((void*)keep_dst, temp);
+ vst1_u8 ((void *)keep_dst, temp);
keep_dst = dst;
temp = vqadd_u8 (dval, sval);
w -= 8;
}
- vst1_u8 ((void*)keep_dst, temp);
+ vst1_u8 ((void *)keep_dst, temp);
#else
asm volatile (
/* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
if (w & 4)
{
sval = vreinterpret_u8_u32 (
- vld1_lane_u32 ((void*)src, vreinterpret_u32_u8 (sval), 1));
+ vld1_lane_u32 ((void *)src, vreinterpret_u32_u8 (sval), 1));
dval = vreinterpret_u8_u32 (
- vld1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (dval), 1));
+ vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1));
dst4 = dst;
src += 4;
if (w & 2)
{
sval = vreinterpret_u8_u16 (
- vld1_lane_u16 ((void*)src, vreinterpret_u16_u8 (sval), 1));
+ vld1_lane_u16 ((void *)src, vreinterpret_u16_u8 (sval), 1));
dval = vreinterpret_u8_u16 (
- vld1_lane_u16 ((void*)dst, vreinterpret_u16_u8 (dval), 1));
+ vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1));
dst2 = dst;
src += 2;
vst1_lane_u8 (dst, dval, 1);
if (w & 2)
- vst1_lane_u16 ((void*)dst2, vreinterpret_u16_u8 (dval), 1);
+ vst1_lane_u16 ((void *)dst2, vreinterpret_u16_u8 (dval), 1);
if (w & 4)
- vst1_lane_u32 ((void*)dst4, vreinterpret_u32_u8 (dval), 1);
+ vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (dval), 1);
}
}
}
w = width;
#ifndef USE_GCC_INLINE_ASM
- sval = vld4_u8 ((void*)src);
- dval = vld4_u8 ((void*)dst);
+ sval = vld4_u8 ((void *)src);
+ dval = vld4_u8 ((void *)dst);
keep_dst = dst;
temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
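/* vmvn_u8 inverts the de-interleaved alpha plane, so this computes the dest * (255 - srcA) term of the OVER operator. */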
while (w)
{
- sval = vld4_u8 ((void*)src);
- dval = vld4_u8 ((void*)dst);
+ sval = vld4_u8 ((void *)src);
+ dval = vld4_u8 ((void *)dst);
- vst4_u8 ((void*)keep_dst, temp);
+ vst4_u8 ((void *)keep_dst, temp);
keep_dst = dst;
temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
w -= 8;
}
- vst4_u8 ((void*)keep_dst, temp);
+ vst4_u8 ((void *)keep_dst, temp);
#else
asm volatile (
/* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
uint8x8_t sval, dval;
/* two 32-bit pixels packed into D-reg; ad-hoc vectorization */
- sval = vreinterpret_u8_u32 (vld1_u32 ((void*)src));
- dval = vreinterpret_u8_u32 (vld1_u32 ((void*)dst));
+ sval = vreinterpret_u8_u32 (vld1_u32 ((void *)src));
+ dval = vreinterpret_u8_u32 (vld1_u32 ((void *)dst));
dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
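/* vtbl1_u8 with alpha_selector replicates each pixel's inverted alpha byte across that pixel's four channel lanes before the multiply. */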
- vst1_u8 ((void*)dst, vqadd_u8 (sval, dval));
+ vst1_u8 ((void *)dst, vqadd_u8 (sval, dval));
src += 2;
dst += 2;
uint8x8_t sval, dval;
/* single 32-bit pixel in lane 0 */
- sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)src)); /* only interested in lane 0 */
- dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)dst)); /* only interested in lane 0 */
+ sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)src)); /* only interested in lane 0 */
+ dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst)); /* only interested in lane 0 */
dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
- vst1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
+ vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
}
}
}
#ifndef USE_GCC_INLINE_ASM
uint8x8x4_t sval, dval, temp;
- sval = vld4_u8 ((void*)src);
- dval = vld4_u8 ((void*)dst);
+ sval = vld4_u8 ((void *)src);
+ dval = vld4_u8 ((void *)dst);
keep_dst = dst;
sval = neon8mul (sval, mask_alpha);
while (w)
{
- sval = vld4_u8 ((void*)src);
- dval = vld4_u8 ((void*)dst);
+ sval = vld4_u8 ((void *)src);
+ dval = vld4_u8 ((void *)dst);
- vst4_u8 ((void*)keep_dst, temp);
+ vst4_u8 ((void *)keep_dst, temp);
keep_dst = dst;
sval = neon8mul (sval, mask_alpha);
dst += 8;
w -= 8;
}
- vst4_u8 ((void*)keep_dst, temp);
+ vst4_u8 ((void *)keep_dst, temp);
#else
asm volatile (
/* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
{
uint8x8_t sval, dval;
- sval = vreinterpret_u8_u32 (vld1_u32 ((void*)src));
- dval = vreinterpret_u8_u32 (vld1_u32 ((void*)dst));
+ sval = vreinterpret_u8_u32 (vld1_u32 ((void *)src));
+ dval = vreinterpret_u8_u32 (vld1_u32 ((void *)dst));
/* sval * const alpha_mul */
sval = neon2mul (sval, mask_alpha);
/* dval * 255-(src alpha) */
dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
- vst1_u8 ((void*)dst, vqadd_u8 (sval, dval));
+ vst1_u8 ((void *)dst, vqadd_u8 (sval, dval));
src += 2;
dst += 2;
{
uint8x8_t sval, dval;
- sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)src));
- dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)dst));
+ sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)src));
+ dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst));
/* sval * const alpha_mul */
sval = neon2mul (sval, mask_alpha);
/* dval * 255-(src alpha) */
dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
- vst1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
+ vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
}
}
}
uint16x8_t dval, temp;
uint8x8x4_t sval8temp;
- alpha = vld1_u8 ((void*)mask);
- dval = vld1q_u16 ((void*)dst);
+ alpha = vld1_u8 ((void *)mask);
+ dval = vld1q_u16 ((void *)dst);
keep_dst = dst;
- sval8temp = neon8mul (sval8,alpha);
- temp = pack0565 (neon8qadd (sval8temp,neon8mul (unpack0565 (dval),vmvn_u8 (sval8temp.val[3]))));
+ sval8temp = neon8mul (sval8, alpha);
+ temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
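/* unpack0565 expands r5g6b5 into 8-bit planes, neon8mul scales them by the inverted source alpha, neon8qadd adds the mask-scaled source with saturation, and pack0565 repacks to 16 bpp. */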
mask += (w & 7);
dst += (w & 7);
while (w)
{
- dval = vld1q_u16 ((void*)dst);
- alpha = vld1_u8 ((void*)mask);
+ dval = vld1q_u16 ((void *)dst);
+ alpha = vld1_u8 ((void *)mask);
- vst1q_u16 ((void*)keep_dst,temp);
+ vst1q_u16 ((void *)keep_dst, temp);
keep_dst = dst;
- sval8temp = neon8mul (sval8,alpha);
- temp = pack0565 (neon8qadd (sval8temp,neon8mul (unpack0565 (dval),vmvn_u8 (sval8temp.val[3]))));
+ sval8temp = neon8mul (sval8, alpha);
+ temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
mask+=8;
dst+=8;
w-=8;
}
- vst1q_u16 ((void*)keep_dst,temp);
+ vst1q_u16 ((void *)keep_dst, temp);
#else
asm volatile (
"vdup.32 d0, %[src]\n\t"
if (w&4)
{
- alpha = vreinterpret_u8_u32 (vld1_lane_u32 ((void*)mask,vreinterpret_u32_u8 (alpha),1));
- dval = vreinterpretq_u16_u64 (vld1q_lane_u64 ((void*)dst,vreinterpretq_u64_u16 (dval),1));
+ alpha = vreinterpret_u8_u32 (vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (alpha), 1));
+ dval = vreinterpretq_u16_u64 (vld1q_lane_u64 ((void *)dst, vreinterpretq_u64_u16 (dval), 1));
dst4=dst;
mask+=4;
dst+=4;
}
if (w&2)
{
- alpha = vreinterpret_u8_u16 (vld1_lane_u16 ((void*)mask,vreinterpret_u16_u8 (alpha),1));
- dval = vreinterpretq_u16_u32 (vld1q_lane_u32 ((void*)dst,vreinterpretq_u32_u16 (dval),1));
+ alpha = vreinterpret_u8_u16 (vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (alpha), 1));
+ dval = vreinterpretq_u16_u32 (vld1q_lane_u32 ((void *)dst, vreinterpretq_u32_u16 (dval), 1));
dst2=dst;
mask+=2;
dst+=2;
}
if (w&1)
{
- alpha = vld1_lane_u8 ((void*)mask,alpha,1);
- dval = vld1q_lane_u16 ((void*)dst,dval,1);
+ alpha = vld1_lane_u8 ((void *)mask, alpha, 1);
+ dval = vld1q_lane_u16 ((void *)dst, dval, 1);
}
- sval8temp = neon8mul (sval8,alpha);
- temp = pack0565 (neon8qadd (sval8temp,neon8mul (unpack0565 (dval),vmvn_u8 (sval8temp.val[3]))));
+ sval8temp = neon8mul (sval8, alpha);
+ temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
if (w&1)
- vst1q_lane_u16 ((void*)dst,temp,1);
+ vst1q_lane_u16 ((void *)dst, temp, 1);
if (w&2)
- vst1q_lane_u32 ((void*)dst2,vreinterpretq_u32_u16 (temp),1);
+ vst1q_lane_u32 ((void *)dst2, vreinterpretq_u32_u16 (temp), 1);
if (w&4)
- vst1q_lane_u64 ((void*)dst4,vreinterpretq_u64_u16 (temp),1);
+ vst1q_lane_u64 ((void *)dst4, vreinterpretq_u64_u16 (temp), 1);
#else
asm volatile (
"vdup.32 d0, %[src]\n\t"
uint8x8_t alpha;
uint8x8x4_t dval, temp;
- alpha = vld1_u8 ((void*)mask);
- dval = vld4_u8 ((void*)dst);
+ alpha = vld1_u8 ((void *)mask);
+ dval = vld4_u8 ((void *)dst);
keep_dst = dst;
temp = neon8mul (sval8, alpha);
while (w)
{
- alpha = vld1_u8 ((void*)mask);
- dval = vld4_u8 ((void*)dst);
+ alpha = vld1_u8 ((void *)mask);
+ dval = vld4_u8 ((void *)dst);
- vst4_u8 ((void*)keep_dst, temp);
+ vst4_u8 ((void *)keep_dst, temp);
keep_dst = dst;
temp = neon8mul (sval8, alpha);
dst += 8;
w -= 8;
}
- vst4_u8 ((void*)keep_dst, temp);
+ vst4_u8 ((void *)keep_dst, temp);
#else
asm volatile (
"vdup.32 d0, %[src]\n\t"
uint8x8_t dval, temp, res;
alpha = vtbl1_u8 (
- vreinterpret_u8_u16 (vld1_dup_u16 ((void*)mask)), mask_selector);
- dval = vld1_u8 ((void*)dst);
+ vreinterpret_u8_u16 (vld1_dup_u16 ((void *)mask)), mask_selector);
+ dval = vld1_u8 ((void *)dst);
temp = neon2mul (sval2, alpha);
res = vqadd_u8 (
temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));
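/* OVER with the mask pre-applied to the solid source: temp = src * maskA, then dest is scaled by temp's inverted alpha and added with saturation. */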
- vst1_u8 ((void*)dst, res);
+ vst1_u8 ((void *)dst, res);
mask += 2;
dst += 2;
{
uint8x8_t dval, temp, res;
- alpha = vtbl1_u8 (vld1_dup_u8 ((void*)mask), mask_selector);
- dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)dst));
+ alpha = vtbl1_u8 (vld1_dup_u8 ((void *)mask), mask_selector);
+ dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst));
temp = neon2mul (sval2, alpha);
res = vqadd_u8 (
temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));
- vst1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (res), 0);
+ vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (res), 0);
}
}
}
/* preload from next scanline */
" pld [%[src], %[src_stride], LSL #1] \n"
" sub %[count], %[count], #64 \n"
- " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
- " vld1.16 {d20,d21,d22,d23}, [%[src]]! \n"
+ " vld1.16 {d16, d17, d18, d19}, [%[src]]! \n"
+ " vld1.16 {d20, d21, d22, d23}, [%[src]]! \n"
/* preload from next scanline */
" pld [%[src], %[src_stride], LSL #1] \n"
- " vld1.16 {d24,d25,d26,d27}, [%[src]]! \n"
- " vld1.16 {d28,d29,d30,d31}, [%[src]]! \n"
+ " vld1.16 {d24, d25, d26, d27}, [%[src]]! \n"
+ " vld1.16 {d28, d29, d30, d31}, [%[src]]! \n"
" cmp %[count], #64 \n"
- " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
- " vst1.16 {d20,d21,d22,d23}, [%[dst]]! \n"
- " vst1.16 {d24,d25,d26,d27}, [%[dst]]! \n"
- " vst1.16 {d28,d29,d30,d31}, [%[dst]]! \n"
+ " vst1.16 {d16, d17, d18, d19}, [%[dst]]! \n"
+ " vst1.16 {d20, d21, d22, d23}, [%[dst]]! \n"
+ " vst1.16 {d24, d25, d26, d27}, [%[dst]]! \n"
+ " vst1.16 {d28, d29, d30, d31}, [%[dst]]! \n"
" bge 0b \n"
" cmp %[count], #0 \n"
" beq 7f @ aligned fastpath \n"
" beq 2f @ skip oversized fragment \n"
/* preload from next scanline */
" pld [%[src], %[src_stride], LSL #1] \n"
- " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
- " vld1.16 {d20,d21,d22,d23}, [%[src]]! \n"
- " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
- " vst1.16 {d20,d21,d22,d23}, [%[dst]]! \n"
+ " vld1.16 {d16, d17, d18, d19}, [%[src]]! \n"
+ " vld1.16 {d20, d21, d22, d23}, [%[src]]! \n"
+ " vst1.16 {d16, d17, d18, d19}, [%[dst]]! \n"
+ " vst1.16 {d20, d21, d22, d23}, [%[dst]]! \n"
"2: @ two quadwords \n"
" tst %[count], #16 \n"
" beq 3f @ skip oversized fragment \n"
/* preload from next scanline */
" pld [%[src], %[src_stride], LSL #1] \n"
- " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
- " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
+ " vld1.16 {d16, d17, d18, d19}, [%[src]]! \n"
+ " vst1.16 {d16, d17, d18, d19}, [%[dst]]! \n"
"3: @ one quadword \n"
" tst %[count], #8 \n"
" beq 4f @ skip oversized fragment \n"
- " vld1.16 {d16,d17}, [%[src]]! \n"
- " vst1.16 {d16,d17}, [%[dst]]! \n"
+ " vld1.16 {d16, d17}, [%[src]]! \n"
+ " vst1.16 {d16, d17}, [%[dst]]! \n"
"4: @ one doubleword \n"
" tst %[count], #4 \n"
" beq 5f @ skip oversized fragment \n"
"0: @ start with sixteen pixels at a time \n"
" sub %[count], %[count], #16 \n"
" pld [%[src], %[src_stride], lsl #2] @ preload from next scanline \n"
- " vld4.8 {d0,d1,d2,d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
- " vld4.8 {d4,d5,d6,d7}, [%[src]]! @ d7 is alpha and ignored, d6-4 are rgb. \n"
+ " vld4.8 {d0, d1, d2, d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
+ " vld4.8 {d4, d5, d6, d7}, [%[src]]! @ d7 is alpha and ignored, d6-4 are rgb. \n"
" vshll.u8 q8, d2, #8 @ expand first red for repacking \n"
" vshll.u8 q10, d1, #8 @ expand first green for repacking \n"
" vshll.u8 q11, d0, #8 @ expand first blue for repacking \n"
" vsri.u16 q9, q10, #5 @ insert second green after red \n"
" vsri.u16 q9, q11, #11 @ insert second blue after green \n"
" cmp %[count], #16 \n"
- " vst1.16 {d16,d17,d18,d19}, [%[dst]]! @ store 16 pixels \n"
+ " vst1.16 {d16, d17, d18, d19}, [%[dst]]! @ store 16 pixels \n"
" bge 0b \n"
"1: @ end of main loop \n"
" cmp %[count], #8 @ can we still do an 8-pixel block? \n"
" blt 2f \n"
" sub %[count], %[count], #8 \n"
" pld [%[src], %[src_stride], lsl #2] @ preload from next scanline \n"
- " vld4.8 {d0,d1,d2,d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
+ " vld4.8 {d0, d1, d2, d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
" vshll.u8 q8, d2, #8 @ expand first red for repacking \n"
" vshll.u8 q10, d1, #8 @ expand first green for repacking \n"
" vshll.u8 q11, d0, #8 @ expand first blue for repacking \n"
" vsri.u16 q8, q10, #5 @ insert first green after red \n"
" vsri.u16 q8, q11, #11 @ insert first blue after green \n"
- " vst1.16 {d16,d17}, [%[dst]]! @ store 8 pixels \n"
+ " vst1.16 {d16, d17}, [%[dst]]! @ store 8 pixels \n"
"2: @ end \n"
/* Clobbered input and working registers marked as input/outputs */
/* The main block: Do 128-bit aligned writes */
"3:\n"
"subs r5, r5, #1\n"
- "vst1.64 {d0,d1}, [r4, :128]!\n"
+ "vst1.64 {d0, d1}, [r4, :128]!\n"
"bne 3b\n"
/* Handle the trailing bytes: Do 64, 32, 16 and 8-bit aligned writes as needed.
#define NEON_SCANLINE_BUFFER_PIXELS (1024)
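/* Copies 'count' 16-byte quadwords, then 'trailer_count' leftover bytes. */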
static inline void
-neon_quadword_copy (void* dst,
- void* src,
+neon_quadword_copy (void *dst,
+ void *src,
uint32_t count, /* of quadwords */
uint32_t trailer_count /* of bytes */)
{
" blt 1f @ skip oversized fragments \n"
"0: @ start with eight quadwords at a time \n"
" sub %[count], %[count], #8 \n"
- " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
- " vld1.8 {d20,d21,d22,d23}, [%[src]]! \n"
- " vld1.8 {d24,d25,d26,d27}, [%[src]]! \n"
- " vld1.8 {d28,d29,d30,d31}, [%[src]]! \n"
+ " vld1.8 {d16, d17, d18, d19}, [%[src]]! \n"
+ " vld1.8 {d20, d21, d22, d23}, [%[src]]! \n"
+ " vld1.8 {d24, d25, d26, d27}, [%[src]]! \n"
+ " vld1.8 {d28, d29, d30, d31}, [%[src]]! \n"
" cmp %[count], #8 \n"
- " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
- " vst1.8 {d20,d21,d22,d23}, [%[dst]]! \n"
- " vst1.8 {d24,d25,d26,d27}, [%[dst]]! \n"
- " vst1.8 {d28,d29,d30,d31}, [%[dst]]! \n"
+ " vst1.8 {d16, d17, d18, d19}, [%[dst]]! \n"
+ " vst1.8 {d20, d21, d22, d23}, [%[dst]]! \n"
+ " vst1.8 {d24, d25, d26, d27}, [%[dst]]! \n"
+ " vst1.8 {d28, d29, d30, d31}, [%[dst]]! \n"
" bge 0b \n"
"1: @ four quadwords \n"
" tst %[count], #4 \n"
" beq 2f @ skip oversized fragment \n"
- " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
- " vld1.8 {d20,d21,d22,d23}, [%[src]]! \n"
- " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
- " vst1.8 {d20,d21,d22,d23}, [%[dst]]! \n"
+ " vld1.8 {d16, d17, d18, d19}, [%[src]]! \n"
+ " vld1.8 {d20, d21, d22, d23}, [%[src]]! \n"
+ " vst1.8 {d16, d17, d18, d19}, [%[dst]]! \n"
+ " vst1.8 {d20, d21, d22, d23}, [%[dst]]! \n"
"2: @ two quadwords \n"
" tst %[count], #2 \n"
" beq 3f @ skip oversized fragment \n"
- " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
- " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
+ " vld1.8 {d16, d17, d18, d19}, [%[src]]! \n"
+ " vst1.8 {d16, d17, d18, d19}, [%[dst]]! \n"
"3: @ one quadword \n"
" tst %[count], #1 \n"
" beq 4f @ skip oversized fragment \n"
- " vld1.8 {d16,d17}, [%[src]]! \n"
- " vst1.8 {d16,d17}, [%[dst]]! \n"
+ " vld1.8 {d16, d17}, [%[src]]! \n"
+ " vst1.8 {d16, d17}, [%[dst]]! \n"
"4: @ end \n"
/* Clobbered input registers marked as input/outputs */
#ifdef USE_GCC_INLINE_ASM
asm volatile (
- " vld4.8 {d20[],d21[],d22[],d23[]}, [%[glyph_colour]] @ splat solid colour components \n"
+ " vld4.8 {d20[], d21[], d22[], d23[]}, [%[glyph_colour]] @ splat solid colour components \n"
"0: @ loop \n"
- " vld1.16 {d0,d1}, [%[dest]] @ load first pixels from framebuffer \n"
+ " vld1.16 {d0, d1}, [%[dest]] @ load first pixels from framebuffer \n"
" vld1.8 {d17}, [%[in_mask]] @ load alpha mask of glyph \n"
" vmull.u8 q9, d17, d23 @ apply glyph colour alpha to mask \n"
" vshrn.u16 d17, q9, #8 @ reformat it to match original mask \n"
" add %[in_mask], %[in_mask], %[mask_stride] @ advance mask pointer, while we wait \n"
" vsri.16 q1, q2, #5 @ pack green behind red \n"
" vsri.16 q1, q3, #11 @ pack blue into pixels \n"
- " vst1.16 {d2,d3}, [%[dest]] @ store composited pixels \n"
+ " vst1.16 {d2, d3}, [%[dest]] @ store composited pixels \n"
" add %[dest], %[dest], %[dest_stride] @ advance framebuffer pointer \n"
" bne 0b @ next please \n"
* (solid colour without alpha mask)
*/
asm volatile (
- " vld4.8 {d20[],d21[],d22[],d23[]}, [%[colour]] @ solid colour load/splat \n"
+ " vld4.8 {d20[], d21[], d22[], d23[]}, [%[colour]] @ solid colour load/splat \n"
" vmull.u8 q12, d23, d22 @ premultiply alpha red \n"
" vmull.u8 q13, d23, d21 @ premultiply alpha green \n"
" vmull.u8 q14, d23, d20 @ premultiply alpha blue \n"
" vmvn d18, d23 @ inverse alpha for background \n"
"0: @ loop\n"
- " vld1.16 {d0,d1}, [%[dest]] @ load first pixels from framebuffer \n"
+ " vld1.16 {d0, d1}, [%[dest]] @ load first pixels from framebuffer \n"
" vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
" vshrn.u16 d4, q0, #3 @ unpack green \n"
" vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
" subs %[count], %[count], #1 @ decrement/test loop counter \n"
" vsri.16 q0, q1, #5 @ pack green behind red \n"
" vsri.16 q0, q2, #11 @ pack blue into pixels \n"
- " vst1.16 {d0,d1}, [%[dest]] @ store composited pixels \n"
+ " vst1.16 {d0, d1}, [%[dest]] @ store composited pixels \n"
" add %[dest], %[dest], %[dest_stride] @ advance framebuffer pointer \n"
" bne 0b @ next please \n"
asm volatile (
"0: @ loop\n"
" pld [%[src], %[src_stride]] @ preload from next scanline \n"
- " vld1.16 {d0,d1}, [%[dest]] @ load pixels from framebuffer \n"
- " vld4.8 {d20,d21,d22,d23},[%[src]]! @ load source image pixels \n"
+ " vld1.16 {d0, d1}, [%[dest]] @ load pixels from framebuffer \n"
+ " vld4.8 {d20, d21, d22, d23},[%[src]]! @ load source image pixels \n"
" vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
" vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
" vshrn.u16 d4, q0, #3 @ unpack green \n"
" vmlal.u8 q3, d23, d20 @ ...blue \n"
" vsri.16 q1, q2, #5 @ pack green behind red \n"
" vsri.16 q1, q3, #11 @ pack blue into pixels \n"
- " vst1.16 {d2,d3}, [%[dest]]! @ store composited pixels \n"
+ " vst1.16 {d2, d3}, [%[dest]]! @ store composited pixels \n"
" bne 0b @ next please \n"
/* Clobbered registers marked as input/outputs */