From ece8d13bf77d050662bb9db9716576dabff37554 Mon Sep 17 00:00:00 2001
From: =?utf8?q?S=C3=B8ren=20Sandmann?=
Date: Sun, 19 Jun 2011 20:29:08 -0400
Subject: [PATCH] ARM: Fix two bugs in neon_composite_over_n_8888_0565_ca().

The first bug is that a vmull.u8 instruction would store its result in
the q1 register, clobbering the d2 register used later on. The second
is that a vraddhn instruction would overwrite d25, corrupting the q12
register used later.

Fixing the second bug caused a pipeline bubble where the d18 register
would be unavailable for a clock cycle. This is fixed by swapping the
instruction with its successor.
---
 pixman/pixman-arm-neon-asm.S | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 833f18c..7cddf7e 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1514,11 +1514,11 @@ generate_composite_function \
  * output: updated src in {d0, d1, d2 } [B, G, R]
  * updated mask in {d24, d25, d26} [B, G, R]
  */
- vmull.u8 q1, d25, d9
+ vmull.u8 q6, d26, d10
  vqadd.u8 q8, q0, q8
  vmull.u8 q0, d24, d8
  vqadd.u8 d22, d2, d22
- vmull.u8 q6, d26, d10
+ vmull.u8 q1, d25, d9
  /*
  * convert the result in d16, d17, d22 to r5g6b5 and store
  * it into {d28, d29}
@@ -1541,6 +1541,7 @@ generate_composite_function \
  vrshr.u16 q11, q12, #8
  vrshr.u16 q8, q9, #8
  vrshr.u16 q6, q13, #8
+ vraddhn.u16 d24, q12, q11
  vraddhn.u16 d25, q9, q8
  /*
  * convert 8 r5g6b5 pixel data from {d4, d5} to planar
@@ -1549,11 +1550,10 @@ generate_composite_function \
  */
  vshrn.u16 d17, q2, #3
  vshrn.u16 d18, q2, #8
- vraddhn.u16 d24, q12, q11
  vraddhn.u16 d26, q13, q6
  vsli.u16 q2, q2, #5
- vsri.u8 d18, d18, #5
  vsri.u8 d17, d17, #6
+ vsri.u8 d18, d18, #5
  /*
  * 'combine_over_ca' replacement
  *
-- 
2.7.4