From 44174ce51d1ed5a1bf988b9dd9218d8cbd379de3 Mon Sep 17 00:00:00 2001
From: Nemanja Lukic
Date: Mon, 15 Apr 2013 19:32:58 +0200
Subject: [PATCH] MIPS: DSPr2: Fix for bug in in_n_8 routine.

The rounding logic was not implemented correctly: logical shifts were used
instead of the rounding version of the 8-bit shift. The code also performed
unnecessary multiplications, which can be avoided by packing four destination
(a8) pixels into one 32-bit register, and it made unnecessary spills onto the
stack. The code is rewritten to address these issues.

The bug was revealed by increasing the number of iterations in blitters-test.

Performance numbers on MIPS-74kc @ 1GHz:

lowlevel-blt-bench results

Reference (before):
in_n_8 =  L1:  21.20  L2:  22.86  M: 21.42 ( 14.21%)  HT: 15.97  VT: 15.69  R: 15.47  RT:  8.00 (  48Kops/s)

Optimized (first implementation, with bug):
in_n_8 =  L1:  89.38  L2:  86.07  M: 65.48 ( 43.44%)  HT: 44.64  VT: 41.50  R: 40.77  RT: 16.94 (  66Kops/s)

Optimized (with bug fix, and code revisited):
in_n_8 =  L1: 102.33  L2:  95.65  M: 70.54 ( 46.84%)  HT: 48.35  VT: 45.06  R: 43.20  RT: 17.60 (  66Kops/s)
---
 pixman/pixman-mips-dspr2-asm.S | 118 +++++++++++++++++------------------------
 1 file changed, 48 insertions(+), 70 deletions(-)

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index b94e66f..3a4d914 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -2974,96 +2974,74 @@ END(pixman_composite_over_reverse_n_8888_asm_mips)
 LEAF_MIPS_DSPR2(pixman_composite_in_n_8_asm_mips)
 /*
  * a0 - dst (a8)
- * a1 - src (a8r8g8b8)
+ * a1 - src (32bit constant)
  * a2 - w
  */
-    beqz      a2, 5f
+    li        t9, 0x00ff00ff
+    beqz      a2, 3f
     nop
-
-    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
-    move      t7, a1
-    srl       t5, t7, 24
-    replv.ph  t5, t5
-    srl       t9, a2, 2 /* t1 = how many multiples of 4 src pixels */
-    beqz      t9, 2f    /* branch if less than 4 src pixels */
+    srl       t7, a2, 2 /* t7 = how many multiples of 4 dst pixels */
+    beqz      t7, 1f    /* branch if less than 4 src pixels */
     nop
-1:
-    addiu     t9, t9, -1
-    addiu     a2, a2, -4
+    srl       t8, a1, 24
+    replv.ph  t8, t8
+
+0:
+    beqz      t7, 1f
+    addiu     t7, t7, -1
     lbu       t0, 0(a0)
     lbu       t1, 1(a0)
     lbu       t2, 2(a0)
     lbu       t3, 3(a0)
-    muleu_s.ph.qbl s0, t0, t5
-    muleu_s.ph.qbr s1, t0, t5
-    muleu_s.ph.qbl s2, t1, t5
-    muleu_s.ph.qbr s3, t1, t5
-    muleu_s.ph.qbl s4, t2, t5
-    muleu_s.ph.qbr s5, t2, t5
-    muleu_s.ph.qbl s6, t3, t5
-    muleu_s.ph.qbr s7, t3, t5
-
-    shrl.ph   t4, s0, 8
-    shrl.ph   t6, s1, 8
-    shrl.ph   t7, s2, 8
-    shrl.ph   t8, s3, 8
-    addq.ph   t0, s0, t4
-    addq.ph   t1, s1, t6
-    addq.ph   t2, s2, t7
-    addq.ph   t3, s3, t8
-    shra_r.ph t0, t0, 8
-    shra_r.ph t1, t1, 8
+    precr_sra.ph.w t1, t0, 0
+    precr_sra.ph.w t3, t2, 0
+    precr.qb.ph    t0, t3, t1
+
+    muleu_s.ph.qbl t2, t0, t8
+    muleu_s.ph.qbr t3, t0, t8
+    shra_r.ph t4, t2, 8
+    shra_r.ph t5, t3, 8
+    and       t4, t4, t9
+    and       t5, t5, t9
+    addq.ph   t2, t2, t4
+    addq.ph   t3, t3, t5
     shra_r.ph t2, t2, 8
     shra_r.ph t3, t3, 8
-    shrl.ph   t4, s4, 8
-    shrl.ph   t6, s5, 8
-    shrl.ph   t7, s6, 8
-    shrl.ph   t8, s7, 8
-    addq.ph   s0, s4, t4
-    addq.ph   s1, s5, t6
-    addq.ph   s2, s6, t7
-    addq.ph   s3, s7, t8
-    shra_r.ph t4, s0, 8
-    shra_r.ph t6, s1, 8
-    shra_r.ph t7, s2, 8
-    shra_r.ph t8, s3, 8
-
-    precr.qb.ph s0, t0, t1
-    precr.qb.ph s1, t2, t3
-    precr.qb.ph s2, t4, t6
-    precr.qb.ph s3, t7, t8
+    precr.qb.ph t2, t2, t3
 
-    sb        s0, 0(a0)
-    sb        s1, 1(a0)
-    sb        s2, 2(a0)
-    sb        s3, 3(a0)
-    bgtz      t9, 1b
+    sb        t2, 0(a0)
+    srl       t2, t2, 8
+    sb        t2, 1(a0)
+    srl       t2, t2, 8
+    sb        t2, 2(a0)
+    srl       t2, t2, 8
+    sb        t2, 3(a0)
+    addiu     a2, a2, -4
+    b         0b
     addiu     a0, a0, 4
-2:
-    beqz      a2, 4f
+
+1:
+    beqz      a2, 3f
     nop
-3:
-    lbu       t1, 0(a0)
+    srl       t8, a1, 24
+2:
+    lbu       t0, 0(a0)
+
+    mul       t2, t0, t8
+    shra_r.ph t3, t2, 8
+    andi      t3, t3, 0x00ff
+    addq.ph   t2, t2, t3
+    shra_r.ph t2, t2, 8
-    muleu_s.ph.qbl t4, t1, t5
-    muleu_s.ph.qbr t7, t1, t5
-    shrl.ph   t6, t4, 8
-    shrl.ph   t0, t7, 8
-    addq.ph   t8, t4, t6
-    addq.ph   t9, t7, t0
-    shra_r.ph t8, t8, 8
-    shra_r.ph t9, t9, 8
-    precr.qb.ph t2, t8, t9
     sb        t2, 0(a0)
     addiu     a2, a2, -1
-    bnez      a2, 3b
+    bnez      a2, 2b
     addiu     a0, a0, 1
-4:
-    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
-5:
+
+3:
     j         ra
     nop
-- 
2.7.4
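
Note on the rounding: the per-pixel operation this routine implements is
dst = src_alpha * dst / 255, rounded to nearest. A minimal scalar C sketch of
that rounding follows (illustration only, not part of the patch; the helper
name is hypothetical):

    #include <stdint.h>

    /* round(a * b / 255) for 8-bit a and b, using the exact divide-by-255
     * trick: bias the product by 0x80 before each 8-bit shift.  That bias
     * is what shra_r.ph (shift right with rounding) supplies in the fixed
     * code; the buggy version used the plain logical shift shrl.ph for the
     * inner shift, so that term was truncated instead of rounded. */
    static uint8_t mul_un8_rounded(uint8_t a, uint8_t b)
    {
        uint32_t t = (uint32_t)a * b + 0x80;
        return (uint8_t)((t + (t >> 8)) >> 8);
    }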