From 44174ce51d1ed5a1bf988b9dd9218d8cbd379de3 Mon Sep 17 00:00:00 2001
From: Nemanja Lukic
Date: Mon, 15 Apr 2013 19:32:58 +0200
Subject: [PATCH] MIPS: DSPr2: Fix for bug in in_n_8 routine.

The rounding logic was not implemented correctly: logical shifts were used
instead of the rounding version of the 8-bit shift. The code also performed
unnecessary multiplications, which can be avoided by packing four destination
(a8) pixels into one 32-bit register, and it made unnecessary spills onto the
stack. The code is rewritten to address these issues.

The bug was revealed by increasing the number of iterations in blitters-test.

Performance numbers on MIPS-74kc @ 1GHz:

lowlevel-blt-bench results

Reference (before):
in_n_8 =  L1:  21.20  L2:  22.86  M: 21.42 ( 14.21%)  HT: 15.97  VT: 15.69  R: 15.47  RT:  8.00 (  48Kops/s)

Optimized (first implementation, with bug):
in_n_8 =  L1:  89.38  L2:  86.07  M: 65.48 ( 43.44%)  HT: 44.64  VT: 41.50  R: 40.77  RT: 16.94 (  66Kops/s)

Optimized (with bug fix, and code revisited):
in_n_8 =  L1: 102.33  L2:  95.65  M: 70.54 ( 46.84%)  HT: 48.35  VT: 45.06  R: 43.20  RT: 17.60 (  66Kops/s)
---
 pixman/pixman-mips-dspr2-asm.S | 118 +++++++++++++++++------------------------
 1 file changed, 48 insertions(+), 70 deletions(-)

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index b94e66f..3a4d914 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -2974,96 +2974,74 @@ END(pixman_composite_over_reverse_n_8888_asm_mips)
 LEAF_MIPS_DSPR2(pixman_composite_in_n_8_asm_mips)
 /*
  * a0 - dst (a8)
- * a1 - src (a8r8g8b8)
+ * a1 - src (32bit constant)
  * a2 - w
  */
-    beqz      a2, 5f
+    li        t9, 0x00ff00ff
+    beqz      a2, 3f
     nop
-
-    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
-    move      t7, a1
-    srl       t5, t7, 24
-    replv.ph  t5, t5
-    srl       t9, a2, 2 /* t1 = how many multiples of 4 src pixels */
-    beqz      t9, 2f    /* branch if less than 4 src pixels */
+    srl       t7, a2, 2 /* t7 = how many multiples of 4 dst pixels */
+    beqz      t7, 1f    /* branch if less than 4 src pixels */
     nop
-1:
-    addiu     t9, t9, -1
-    addiu     a2, a2, -4
+    srl       t8, a1, 24
+    replv.ph  t8, t8
+
+0:
+    beqz      t7, 1f
+    addiu     t7, t7, -1
     lbu       t0, 0(a0)
     lbu       t1, 1(a0)
     lbu       t2, 2(a0)
     lbu       t3, 3(a0)
-    muleu_s.ph.qbl s0, t0, t5
-    muleu_s.ph.qbr s1, t0, t5
-    muleu_s.ph.qbl s2, t1, t5
-    muleu_s.ph.qbr s3, t1, t5
-    muleu_s.ph.qbl s4, t2, t5
-    muleu_s.ph.qbr s5, t2, t5
-    muleu_s.ph.qbl s6, t3, t5
-    muleu_s.ph.qbr s7, t3, t5
-
-    shrl.ph   t4, s0, 8
-    shrl.ph   t6, s1, 8
-    shrl.ph   t7, s2, 8
-    shrl.ph   t8, s3, 8
-    addq.ph   t0, s0, t4
-    addq.ph   t1, s1, t6
-    addq.ph   t2, s2, t7
-    addq.ph   t3, s3, t8
-    shra_r.ph t0, t0, 8
-    shra_r.ph t1, t1, 8
+    precr_sra.ph.w t1, t0, 0
+    precr_sra.ph.w t3, t2, 0
+    precr.qb.ph    t0, t3, t1
+
+    muleu_s.ph.qbl t2, t0, t8
+    muleu_s.ph.qbr t3, t0, t8
+    shra_r.ph t4, t2, 8
+    shra_r.ph t5, t3, 8
+    and       t4, t4, t9
+    and       t5, t5, t9
+    addq.ph   t2, t2, t4
+    addq.ph   t3, t3, t5
     shra_r.ph t2, t2, 8
     shra_r.ph t3, t3, 8
-    shrl.ph   t4, s4, 8
-    shrl.ph   t6, s5, 8
-    shrl.ph   t7, s6, 8
-    shrl.ph   t8, s7, 8
-    addq.ph   s0, s4, t4
-    addq.ph   s1, s5, t6
-    addq.ph   s2, s6, t7
-    addq.ph   s3, s7, t8
-    shra_r.ph t4, s0, 8
-    shra_r.ph t6, s1, 8
-    shra_r.ph t7, s2, 8
-    shra_r.ph t8, s3, 8
-
-    precr.qb.ph s0, t0, t1
-    precr.qb.ph s1, t2, t3
-    precr.qb.ph s2, t4, t6
-    precr.qb.ph s3, t7, t8
+    precr.qb.ph t2, t2, t3
 
-    sb        s0, 0(a0)
-    sb        s1, 1(a0)
-    sb        s2, 2(a0)
-    sb        s3, 3(a0)
-    bgtz      t9, 1b
+    sb        t2, 0(a0)
+    srl       t2, t2, 8
+    sb        t2, 1(a0)
+    srl       t2, t2, 8
+    sb        t2, 2(a0)
+    srl       t2, t2, 8
+    sb        t2, 3(a0)
+    addiu     a2, a2, -4
+    b         0b
     addiu     a0, a0, 4
-2:
-    beqz      a2, 4f
+
+1:
+    beqz      a2, 3f
     nop
-3:
-    lbu       t1, 0(a0)
+    srl       t8, a1, 24
+2:
+    lbu       t0, 0(a0)
+
+    mul       t2, t0, t8
+    shra_r.ph t3, t2, 8
+    andi      t3, t3, 0x00ff
+    addq.ph   t2, t2, t3
+    shra_r.ph t2, t2, 8
-    muleu_s.ph.qbl t4, t1, t5
-    muleu_s.ph.qbr t7, t1, t5
-    shrl.ph   t6, t4, 8
-    shrl.ph   t0, t7, 8
-    addq.ph   t8, t4, t6
-    addq.ph   t9, t7, t0
-    shra_r.ph t8, t8, 8
-    shra_r.ph t9, t9, 8
-    precr.qb.ph t2, t8, t9
     sb        t2, 0(a0)
     addiu     a2, a2, -1
-    bnez      a2, 3b
+    bnez      a2, 2b
     addiu     a0, a0, 1
-4:
-    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
-5:
+
+3:
     j         ra
     nop
-- 
2.7.4
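
Note on the rounding: the per-pixel operation this routine implements is
dst = src_alpha * dst / 255, rounded to nearest. A minimal scalar C sketch of
that rounding follows (illustration only, not part of the patch; the helper
name is hypothetical):

    #include <stdint.h>

    /* round(a * b / 255) for 8-bit a and b, using the exact divide-by-255
     * trick: bias the product by 0x80 before each 8-bit shift.  That bias
     * is what shra_r.ph (shift right with rounding) supplies in the fixed
     * code; the buggy version used the plain logical shift shrl.ph for the
     * inner shift, so that term was truncated instead of rounded. */
    static uint8_t mul_un8_rounded(uint8_t a, uint8_t b)
    {
        uint32_t t = (uint32_t)a * b + 0x80;
        return (uint8_t)((t + (t >> 8)) >> 8);
    }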