From d2ee5631ae42d031289ae80352e02bafa3f06ed4 Mon Sep 17 00:00:00 2001
From: Nemanja Lukic
Date: Sun, 11 Mar 2012 18:52:25 +0100
Subject: [PATCH] MIPS: DSPr2: Added over_n_8888_8888_ca and
 over_n_8888_0565_ca fast paths.

Performance numbers before/after on MIPS-74kc @ 1GHz

Referent (before):

lowlevel-blt-bench:
  over_n_8888_8888_ca = L1:   8.32  L2:   7.65  M:  6.38 ( 51.08%)
                        HT:   5.78  VT:   5.74  R:  5.84  RT: 4.39 ( 37Kops/s)
  over_n_8888_0565_ca = L1:   7.40  L2:   6.95  M:  6.16 ( 41.06%)
                        HT:   5.72  VT:   5.52  R:  5.63  RT: 4.28 ( 36Kops/s)

cairo-perf-trace:
[ # ]  backend  test               min(s)   median(s)  stddev.  count
[ # ]  image:   pixman 0.25.3
[ 0 ]  image    xfce4-terminal-a1  138.223  139.070    0.33%    6/6
[ # ]  image16: pixman 0.25.3
[ 0 ]  image16  xfce4-terminal-a1  132.763  132.939    0.06%    5/6

Optimized:

lowlevel-blt-bench:
  over_n_8888_8888_ca = L1:  19.35  L2:  23.84  M: 13.68 (109.39%)
                        HT:  11.39  VT:  11.19  R: 11.27  RT: 6.90 ( 47Kops/s)
  over_n_8888_0565_ca = L1:  18.68  L2:  17.00  M: 12.56 ( 83.70%)
                        HT:  10.72  VT:  10.45  R: 10.43  RT: 5.79 ( 43Kops/s)

cairo-perf-trace:
[ # ]  backend  test               min(s)   median(s)  stddev.  count
[ # ]  image:   pixman 0.25.3
[ 0 ]  image    xfce4-terminal-a1  130.400  131.720    0.46%    6/6
[ # ]  image16: pixman 0.25.3
[ 0 ]  image16  xfce4-terminal-a1  125.830  126.604    0.34%    6/6
---
 pixman/pixman-mips-dspr2-asm.S | 219 ++++++++++++++++++++++++++++++
 pixman/pixman-mips-dspr2-asm.h | 296 +++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-mips-dspr2.c     |  12 ++
 pixman/pixman-mips-dspr2.h     |  42 ++++++
 4 files changed, 569 insertions(+)
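Before diving into the assembly, it helps to have the per-pixel math in scalar form. Both fast paths implement the component-alpha OVER operator with a solid source: every mask channel scales its source channel, and each destination channel is attenuated by (source alpha x mask channel). The C sketch below is illustrative only, with names invented for this note; mul_un8 performs the same rounded byte multiply that the MIPS_*_MUL_* macros added to pixman-mips-dspr2-asm.h implement.

    #include <stdint.h>

    /* Rounded multiply of two UN8 values: round(a * b / 255). */
    static uint8_t
    mul_un8 (uint8_t a, uint8_t b)
    {
        uint16_t t = (uint16_t)a * b + 0x80;
        return (uint8_t)((t + (t >> 8)) >> 8);
    }

    /* Component-alpha OVER with a solid a8r8g8b8 source, per channel:
     * dst = src * m + dst * (1 - srca * m)                           */
    static uint32_t
    over_n_8888_ca_pixel (uint32_t src, uint32_t mask, uint32_t dst)
    {
        uint8_t  srca   = src >> 24;
        uint32_t result = 0;
        int      shift;

        for (shift = 0; shift <= 24; shift += 8)
        {
            uint8_t  s = (uint8_t)(src  >> shift);
            uint8_t  m = (uint8_t)(mask >> shift);
            uint8_t  d = (uint8_t)(dst  >> shift);
            /* saturating add, matching addu_s.qb in the assembly */
            uint16_t c = mul_un8 (s, m) +
                         mul_un8 (d, 255 - mul_un8 (srca, m));

            result |= (uint32_t)(c > 255 ? 255 : c) << shift;
        }
        return result;
    }

The special cases in the assembly fall out of this formula: a zero mask leaves the destination untouched, and a fully set mask with an opaque source (srca == 0xff) reduces to a plain copy, which is why both routines test for those values before doing any multiplies.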
diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index f1087a7..6a0fc18 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -308,3 +308,222 @@ LEAF_MIPS_DSPR2(pixman_composite_src_x888_8888_asm_mips)
     nop

 END(pixman_composite_src_x888_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_8888_ca_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (32-bit constant)
+ * a2 - mask (a8r8g8b8)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3, s4, s5, s6, s7
+    beqz      a3, 4f
+     nop
+    li        t6, 0xff
+    addiu     t7, zero, -1           /* t7 = 0xffffffff */
+    srl       t8, a1, 24             /* t8 = srca */
+    li        t9, 0x00ff00ff
+    addiu     t1, a3, -1
+    beqz      t1, 3f                 /* last pixel */
+     nop
+    beq       t8, t6, 2f             /* if (srca == 0xff) */
+     nop
+1:
+                                     /* a1 = src */
+    lw        t0, 0(a2)              /* t0 = mask */
+    lw        t1, 4(a2)              /* t1 = mask */
+    or        t2, t0, t1
+    beqz      t2, 12f                /* if (t0 == 0) && (t1 == 0) */
+     addiu    a2, a2, 8
+    and       t3, t0, t1
+    move      s0, t8                 /* s0 = srca */
+    move      s1, t8                 /* s1 = srca */
+    move      t4, a1                 /* t4 = src */
+    move      t5, a1                 /* t5 = src */
+    lw        t2, 0(a0)              /* t2 = dst */
+    beq       t3, t7, 11f            /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
+     lw       t3, 4(a0)              /* t3 = dst */
+    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5
+    MIPS_2xUN8x4_MUL_2xUN8   t0, t1, t8, t8, s0, s1, t9, s2, s3, s4, s5, s6, s7
+11:
+    not       s0, s0
+    not       s1, s1
+    MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, s0, s1, s2, s3, t9, t0, t1, s4, s5, s6, s7
+    addu_s.qb t0, t4, s2
+    addu_s.qb t1, t5, s3
+    sw        t0, 0(a0)
+    sw        t1, 4(a0)
+12:
+    addiu     a3, a3, -2
+    addiu     t1, a3, -1
+    bgtz      t1, 1b
+     addiu    a0, a0, 8
+    b         3f
+     nop
+2:
+                                     /* a1 = src */
+    lw        t0, 0(a2)              /* t0 = mask */
+    lw        t1, 4(a2)              /* t1 = mask */
+    or        t2, t0, t1
+    beqz      t2, 22f                /* if (t0 == 0) && (t1 == 0) */
+     addiu    a2, a2, 8
+    and       t2, t0, t1
+    move      s0, a1
+    beq       t2, t7, 21f            /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
+     move     s1, a1
+    lw        t2, 0(a0)              /* t2 = dst */
+    lw        t3, 4(a0)              /* t3 = dst */
+    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5
+    not       t0, t0
+    not       t1, t1
+    MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, t0, t1, s0, s1, t9, s2, s3, s4, s5, s6, s7
+    addu_s.qb s0, t4, s0
+    addu_s.qb s1, t5, s1
+21:
+    sw        s0, 0(a0)
+    sw        s1, 4(a0)
+22:
+    addiu     a3, a3, -2
+    addiu     t1, a3, -1
+    bgtz      t1, 2b
+     addiu    a0, a0, 8
+3:
+    blez      a3, 4f
+     nop
+                                     /* a1 = src */
+    lw        t1, 0(a2)              /* t1 = mask */
+    beqz      t1, 4f
+     nop
+    move      s0, t8                 /* s0 = srca */
+    move      t2, a1                 /* t2 = src */
+    beq       t1, t7, 31f
+     lw       t0, 0(a0)              /* t0 = dst */
+
+    MIPS_UN8x4_MUL_UN8x4 a1, t1, t2, t9, t3, t4, t5, t6
+    MIPS_UN8x4_MUL_UN8   t1, t8, s0, t9, t3, t4, t5
+31:
+    not       s0, s0
+    MIPS_UN8x4_MUL_UN8x4 t0, s0, t3, t9, t4, t5, t6, t1
+    addu_s.qb t0, t2, t3
+    sw        t0, 0(a0)
+4:
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3, s4, s5, s6, s7
+    j         ra
+     nop
+
+END(pixman_composite_over_n_8888_8888_ca_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_0565_ca_asm_mips)
+/*
+ * a0 - dst  (r5g6b5)
+ * a1 - src  (32-bit constant)
+ * a2 - mask (a8r8g8b8)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+    beqz      a3, 4f
+     nop
+    li        t5, 0xf800f800
+    li        t6, 0x07e007e0
+    li        t7, 0x001F001F
+    li        t9, 0x00ff00ff
+
+    srl       t8, a1, 24             /* t8 = srca */
+    addiu     t1, a3, -1
+    beqz      t1, 3f                 /* last pixel */
+     nop
+    li        s0, 0xff               /* s0 = 0xff */
+    addiu     s1, zero, -1           /* s1 = 0xffffffff */
+
+    beq       t8, s0, 2f             /* if (srca == 0xff) */
+     nop
+1:
+                                     /* a1 = src */
+    lw        t0, 0(a2)              /* t0 = mask */
+    lw        t1, 4(a2)              /* t1 = mask */
+    or        t2, t0, t1
+    beqz      t2, 12f                /* if (t0 == 0) && (t1 == 0) */
+     addiu    a2, a2, 8
+    and       t3, t0, t1
+    move      t0, t8
+    move      t1, a1
+    lhu       t2, 0(a0)              /* t2 = dst */
+    beq       t3, s1, 11f            /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
+     lhu      t3, 2(a0)              /* t3 = dst */
+    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, s2, s3, t9, t4, s4, s5, s6, s7, s8
+    MIPS_2xUN8x4_MUL_2xUN8   t0, t1, t8, t8, t0, t1, t9, t4, s4, s5, s6, s7, s8
+11:
+    not       t0, t0
+    not       t1, t1
+    CONVERT_2x0565_TO_2x8888 t2, t3, s4, s5, t6, t7, t4, s6, s7, s8
+    MIPS_2xUN8x4_MUL_2xUN8x4 s4, s5, t0, t1, s4, s5, t9, t4, s6, s7, s8, t0, t1
+    addu_s.qb s2, s2, s4
+    addu_s.qb s3, s3, s5
+    CONVERT_2x8888_TO_2x0565 s2, s3, t2, t3, t5, t6, t7, s1, s2
+    sh        t2, 0(a0)
+    sh        t3, 2(a0)
+12:
+    addiu     a3, a3, -2
+    addiu     t1, a3, -1
+    bgtz      t1, 1b
+     addiu    a0, a0, 4
+    b         3f
+     nop
+2:
+                                     /* a1 = src */
+    lw        t0, 0(a2)              /* t0 = mask */
+    lw        t1, 4(a2)              /* t1 = mask */
+    or        t2, t0, t1
+    beqz      t2, 22f                /* if (t0 == 0) && (t1 == 0) */
+     addiu    a2, a2, 8
+    and       t3, t0, t1
+    move      t2, a1
+    beq       t3, s1, 21f            /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
+     move     t3, a1
+    lhu       t2, 0(a0)              /* t2 = dst */
+    lhu       t3, 2(a0)              /* t3 = dst */
+    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, s2, s3, t9, t4, s4, s5, s6, s7, s8
+    not       t0, t0
+    not       t1, t1
+    CONVERT_2x0565_TO_2x8888 t2, t3, s4, s5, t6, t7, t4, s6, s7, s8
+    MIPS_2xUN8x4_MUL_2xUN8x4 s4, s5, t0, t1, s4, s5, t9, t4, s6, s7, s8, t2, t3
+    addu_s.qb t2, s2, s4
+    addu_s.qb t3, s3, s5
+21:
+    CONVERT_2x8888_TO_2x0565 t2, t3, t0, t1, t5, t6, t7, s2, s3
+    sh        t0, 0(a0)
+    sh        t1, 2(a0)
+22:
+    addiu     a3, a3, -2
+    addiu     t1, a3, -1
+    bgtz      t1, 2b
+     addiu    a0, a0, 4
+3:
+    blez      a3, 4f
+     nop
+                                     /* a1 = src */
+    lw        t1, 0(a2)              /* t1 = mask */
+    beqz      t1, 4f
+     nop
+    move      s0, t8                 /* s0 = srca */
+    move      t2, a1                 /* t2 = src */
+    beq       t1, t7, 31f
+     lhu      t0, 0(a0)              /* t0 = dst */
+
+    MIPS_UN8x4_MUL_UN8x4 a1, t1, t2, t9, t3, t4, t5, t6
+    MIPS_UN8x4_MUL_UN8   t1, t8, s0, t9, t3, t4, t5
+31:
+    not       s0, s0
+    CONVERT_1x0565_TO_1x8888 t0, s1, s2, s3
+    MIPS_UN8x4_MUL_UN8x4 s1, s0, t3, t9, t4, t5, t6, t1
+    addu_s.qb t0, t2, t3
+    CONVERT_1x8888_TO_1x0565 t0, s1, s2, s3
+    sh        s1, 0(a0)
+4:
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+    j         ra
+     nop
+
+END(pixman_composite_over_n_8888_0565_ca_asm_mips)
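The r5g6b5 routine performs the same blend as the a8r8g8b8 one, but widens each destination pixel to a8r8g8b8 first and narrows the result afterwards (the CONVERT_* macros from pixman-mips-dspr2-asm.h). In C, the two conversions look roughly like this; it is a sketch with invented names, using the usual bit-replication trick on expansion so that a full 5-bit 0x1f widens to a full 8-bit 0xff (the alpha handling here is a guess at the macro's behavior, and the blended alpha is discarded again on the way back to 565 in any case):

    #include <stdint.h>

    /* Widen r5g6b5 to a8r8g8b8, replicating top bits into low bits. */
    static uint32_t
    convert_0565_to_8888 (uint16_t p)
    {
        uint32_t r = (p >> 11) & 0x1f;
        uint32_t g = (p >> 5)  & 0x3f;
        uint32_t b =  p        & 0x1f;

        r = (r << 3) | (r >> 2);
        g = (g << 2) | (g >> 4);
        b = (b << 3) | (b >> 2);

        return 0xff000000 | (r << 16) | (g << 8) | b;
    }

    /* Narrow a8r8g8b8 back to r5g6b5 by truncating each channel. */
    static uint16_t
    convert_8888_to_0565 (uint32_t p)
    {
        return (uint16_t)(((p >> 8) & 0xf800) |
                          ((p >> 5) & 0x07e0) |
                          ((p >> 3) & 0x001f));
    }

The 0xf800f800 / 0x07e007e0 / 0x001F001F constants loaded at the top of the routine are two-pixel-wide versions of these channel masks.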
diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
index e07cda4..12ff42c 100644
--- a/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman-mips-dspr2-asm.h
@@ -96,6 +96,170 @@ LEAF_MIPS32R2(symbol)                                    \
	.size function,.-function

 /*
+ * Checks if the stack offset is big enough for storing/restoring regs_num
+ * registers to/from the stack. The stack offset must be greater than or
+ * equal to the number of bytes needed to store the registers
+ * (regs_num * 4). Since the MIPS ABI reserves the first 16 bytes of the
+ * stack frame for the function's register arguments (already passed in
+ * a0-a3), that space can be reused and the required stack size reduced.
+ */
+.macro CHECK_STACK_OFFSET regs_num, stack_offset
+.if \stack_offset < \regs_num * 4 - 16
+.error "Stack offset too small."
+.endif
+.endm
+
+/*
+ * Saves a set of registers on the stack. The maximum number of registers
+ * that can be saved is limited to 14 (a0-a3, v0-v1 and s0-s7). The stack
+ * offset is the number of bytes subtracted from the stack pointer (sp)
+ * before the registers are pushed, to provide enough room on the stack
+ * (the offset must be a multiple of 4 and big enough, as checked by the
+ * CHECK_STACK_OFFSET macro). This macro is intended to be used in
+ * combination with the RESTORE_REGS_FROM_STACK macro. Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
+                          r2 = 0, r3 = 0, r4 = 0, \
+                          r5 = 0, r6 = 0, r7 = 0, \
+                          r8 = 0, r9 = 0, r10 = 0, \
+                          r11 = 0, r12 = 0, r13 = 0, \
+                          r14 = 0
+    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+        .error "Stack offset must be positive and a multiple of 4."
+    .endif
+    .if \stack_offset != 0
+        addiu sp, sp, -\stack_offset
+    .endif
+    sw \r1, 0(sp)
+    .if \r2 != 0
+        sw \r2, 4(sp)
+    .endif
+    .if \r3 != 0
+        sw \r3, 8(sp)
+    .endif
+    .if \r4 != 0
+        sw \r4, 12(sp)
+    .endif
+    .if \r5 != 0
+        CHECK_STACK_OFFSET 5, \stack_offset
+        sw \r5, 16(sp)
+    .endif
+    .if \r6 != 0
+        CHECK_STACK_OFFSET 6, \stack_offset
+        sw \r6, 20(sp)
+    .endif
+    .if \r7 != 0
+        CHECK_STACK_OFFSET 7, \stack_offset
+        sw \r7, 24(sp)
+    .endif
+    .if \r8 != 0
+        CHECK_STACK_OFFSET 8, \stack_offset
+        sw \r8, 28(sp)
+    .endif
+    .if \r9 != 0
+        CHECK_STACK_OFFSET 9, \stack_offset
+        sw \r9, 32(sp)
+    .endif
+    .if \r10 != 0
+        CHECK_STACK_OFFSET 10, \stack_offset
+        sw \r10, 36(sp)
+    .endif
+    .if \r11 != 0
+        CHECK_STACK_OFFSET 11, \stack_offset
+        sw \r11, 40(sp)
+    .endif
+    .if \r12 != 0
+        CHECK_STACK_OFFSET 12, \stack_offset
+        sw \r12, 44(sp)
+    .endif
+    .if \r13 != 0
+        CHECK_STACK_OFFSET 13, \stack_offset
+        sw \r13, 48(sp)
+    .endif
+    .if \r14 != 0
+        CHECK_STACK_OFFSET 14, \stack_offset
+        sw \r14, 52(sp)
+    .endif
+.endm
+
+/*
+ * Restores a set of registers from the stack. The maximum number of
+ * registers that can be restored is limited to 14 (a0-a3, v0-v1 and
+ * s0-s7). The stack offset is the number of bytes added to the stack
+ * pointer (sp) after the registers are restored (the offset must be a
+ * multiple of 4 and big enough, as checked by the CHECK_STACK_OFFSET
+ * macro). This macro is intended to be used in combination with the
+ * SAVE_REGS_ON_STACK macro. Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
+                               r2 = 0, r3 = 0, r4 = 0, \
+                               r5 = 0, r6 = 0, r7 = 0, \
+                               r8 = 0, r9 = 0, r10 = 0, \
+                               r11 = 0, r12 = 0, r13 = 0, \
+                               r14 = 0
+    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+        .error "Stack offset must be positive and a multiple of 4."
+    .endif
+    lw \r1, 0(sp)
+    .if \r2 != 0
+        lw \r2, 4(sp)
+    .endif
+    .if \r3 != 0
+        lw \r3, 8(sp)
+    .endif
+    .if \r4 != 0
+        lw \r4, 12(sp)
+    .endif
+    .if \r5 != 0
+        CHECK_STACK_OFFSET 5, \stack_offset
+        lw \r5, 16(sp)
+    .endif
+    .if \r6 != 0
+        CHECK_STACK_OFFSET 6, \stack_offset
+        lw \r6, 20(sp)
+    .endif
+    .if \r7 != 0
+        CHECK_STACK_OFFSET 7, \stack_offset
+        lw \r7, 24(sp)
+    .endif
+    .if \r8 != 0
+        CHECK_STACK_OFFSET 8, \stack_offset
+        lw \r8, 28(sp)
+    .endif
+    .if \r9 != 0
+        CHECK_STACK_OFFSET 9, \stack_offset
+        lw \r9, 32(sp)
+    .endif
+    .if \r10 != 0
+        CHECK_STACK_OFFSET 10, \stack_offset
+        lw \r10, 36(sp)
+    .endif
+    .if \r11 != 0
+        CHECK_STACK_OFFSET 11, \stack_offset
+        lw \r11, 40(sp)
+    .endif
+    .if \r12 != 0
+        CHECK_STACK_OFFSET 12, \stack_offset
+        lw \r12, 44(sp)
+    .endif
+    .if \r13 != 0
+        CHECK_STACK_OFFSET 13, \stack_offset
+        lw \r13, 48(sp)
+    .endif
+    .if \r14 != 0
+        CHECK_STACK_OFFSET 14, \stack_offset
+        lw \r14, 52(sp)
+    .endif
+    .if \stack_offset != 0
+        addiu sp, sp, \stack_offset
+    .endif
+.endm
+
+/*
  * Conversion of single r5g6b5 pixel (in_565) to single a8r8g8b8 pixel
  * returned in (out_8888) register. Requires two temporary registers
  * (scratch1 and scratch2).
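The offset arithmetic is easy to get wrong, so here is a throwaway C model of CHECK_STACK_OFFSET (names invented for this note): the first four 4-byte slots live in the caller-provided 16-byte argument area, so only registers beyond the fourth need locally reserved bytes.

    #include <assert.h>

    /* Model of CHECK_STACK_OFFSET: saving regs_num registers needs only
     * (regs_num * 4 - 16) locally reserved bytes, because slots for the
     * first four registers come from the caller's argument area.       */
    static int
    stack_offset_ok (int regs_num, int stack_offset)
    {
        return stack_offset >= regs_num * 4 - 16;
    }

    int
    main (void)
    {
        assert ( stack_offset_ok (8, 16)); /* s0-s7, 8888 routine above */
        assert ( stack_offset_ok (9, 20)); /* s0-s8, 0565 routine above */
        assert (!stack_offset_ok (9, 16)); /* would trip the .error     */
        return 0;
    }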
@@ -203,4 +367,136 @@ LEAF_MIPS32R2(symbol)                                   \
     srl  \out2_565, \out1_565, 16
 .endm

+/*
+ * Multiplies a single pixel (a8r8g8b8) by an 8-bit value (a8), channel by
+ * channel. It requires the maskLSR register for the rounding step; maskLSR
+ * must have the following value:
+ *     li maskLSR, 0x00ff00ff
+ */
+.macro MIPS_UN8x4_MUL_UN8 s_8888,  \
+                          m_8,     \
+                          d_8888,  \
+                          maskLSR, \
+                          scratch1, scratch2, scratch3
+    replv.ph        \m_8, \m_8                        /*  0 | M | 0 | M */
+    muleu_s.ph.qbl  \scratch1, \s_8888, \m_8          /*    A*M  |  R*M */
+    muleu_s.ph.qbr  \scratch2, \s_8888, \m_8          /*    G*M  |  B*M */
+    shra_r.ph       \scratch3, \scratch1, 8
+    shra_r.ph       \d_8888, \scratch2, 8
+    and             \scratch3, \scratch3, \maskLSR    /* 0 |A*M| 0 |R*M */
+    and             \d_8888, \d_8888, \maskLSR        /* 0 |G*M| 0 |B*M */
+    addq.ph         \scratch1, \scratch1, \scratch3   /* A*M+A*M | R*M+R*M */
+    addq.ph         \scratch2, \scratch2, \d_8888     /* G*M+G*M | B*M+B*M */
+    shra_r.ph       \scratch1, \scratch1, 8
+    shra_r.ph       \scratch2, \scratch2, 8
+    precr.qb.ph     \d_8888, \scratch1, \scratch2
+.endm
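In scalar form, this macro computes a division by 255 per channel as two rounded shifts by eight, which is what the shra_r.ph/addq.ph pairs above do. A C model of one pixel follows (illustrative only; the DSPr2 code processes the A|R and G|B 16-bit lane pairs in parallel rather than channel by channel):

    #include <stdint.h>

    /* What MIPS_UN8x4_MUL_UN8 computes: round(c * m / 255) per channel,
     * done as t = c * m; t += round(t / 256); t = round(t / 256).      */
    static uint32_t
    un8x4_mul_un8 (uint32_t pixel, uint8_t m)
    {
        uint32_t result = 0;
        int      shift;

        for (shift = 0; shift <= 24; shift += 8)
        {
            uint32_t t = ((pixel >> shift) & 0xff) * m;

            t += (t + 0x80) >> 8;  /* fold in the rounded high byte */
            t  = (t + 0x80) >> 8;  /* final rounded shift           */
            result |= (t & 0xff) << shift;
        }
        return result;
    }

The three macros that follow are variants of the same scheme: a two-pixel version for the unrolled loops, and UN8x4-by-UN8x4 versions where the multiplier is a full pixel rather than a single byte.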
+/*
+ * Multiplies two pixels (a8r8g8b8) by two 8-bit values (a8), channel by
+ * channel. It requires the maskLSR register for the rounding step; maskLSR
+ * must have the following value:
+ *     li maskLSR, 0x00ff00ff
+ */
+.macro MIPS_2xUN8x4_MUL_2xUN8 s1_8888, \
+                              s2_8888, \
+                              m1_8,    \
+                              m2_8,    \
+                              d1_8888, \
+                              d2_8888, \
+                              maskLSR, \
+                              scratch1, scratch2, scratch3, \
+                              scratch4, scratch5, scratch6
+    replv.ph        \m1_8, \m1_8                      /*  0 | M1 | 0 | M1 */
+    replv.ph        \m2_8, \m2_8                      /*  0 | M2 | 0 | M2 */
+    muleu_s.ph.qbl  \scratch1, \s1_8888, \m1_8        /*  A1*M1  |  R1*M1 */
+    muleu_s.ph.qbr  \scratch2, \s1_8888, \m1_8        /*  G1*M1  |  B1*M1 */
+    muleu_s.ph.qbl  \scratch3, \s2_8888, \m2_8        /*  A2*M2  |  R2*M2 */
+    muleu_s.ph.qbr  \scratch4, \s2_8888, \m2_8        /*  G2*M2  |  B2*M2 */
+    shra_r.ph       \scratch5, \scratch1, 8
+    shra_r.ph       \d1_8888, \scratch2, 8
+    shra_r.ph       \scratch6, \scratch3, 8
+    shra_r.ph       \d2_8888, \scratch4, 8
+    and             \scratch5, \scratch5, \maskLSR    /* 0 |A1*M1| 0 |R1*M1 */
+    and             \d1_8888, \d1_8888, \maskLSR      /* 0 |G1*M1| 0 |B1*M1 */
+    and             \scratch6, \scratch6, \maskLSR    /* 0 |A2*M2| 0 |R2*M2 */
+    and             \d2_8888, \d2_8888, \maskLSR      /* 0 |G2*M2| 0 |B2*M2 */
+    addq.ph         \scratch1, \scratch1, \scratch5
+    addq.ph         \scratch2, \scratch2, \d1_8888
+    addq.ph         \scratch3, \scratch3, \scratch6
+    addq.ph         \scratch4, \scratch4, \d2_8888
+    shra_r.ph       \scratch1, \scratch1, 8
+    shra_r.ph       \scratch2, \scratch2, 8
+    shra_r.ph       \scratch3, \scratch3, 8
+    shra_r.ph       \scratch4, \scratch4, 8
+    precr.qb.ph     \d1_8888, \scratch1, \scratch2
+    precr.qb.ph     \d2_8888, \scratch3, \scratch4
+.endm
+
+/*
+ * Multiplies a pixel (a8r8g8b8) by another pixel (a8r8g8b8), channel by
+ * channel. It requires the maskLSR register for the rounding step; maskLSR
+ * must have the following value:
+ *     li maskLSR, 0x00ff00ff
+ */
+.macro MIPS_UN8x4_MUL_UN8x4 s_8888,  \
+                            m_8888,  \
+                            d_8888,  \
+                            maskLSR, \
+                            scratch1, scratch2, scratch3, scratch4
+    preceu.ph.qbl   \scratch1, \m_8888                /*  0 | A | 0 | R */
+    preceu.ph.qbr   \scratch2, \m_8888                /*  0 | G | 0 | B */
+    muleu_s.ph.qbl  \scratch3, \s_8888, \scratch1     /*  A*A  |  R*R */
+    muleu_s.ph.qbr  \scratch4, \s_8888, \scratch2     /*  G*G  |  B*B */
+    shra_r.ph       \scratch1, \scratch3, 8
+    shra_r.ph       \scratch2, \scratch4, 8
+    and             \scratch1, \scratch1, \maskLSR    /* 0 |A*A| 0 |R*R */
+    and             \scratch2, \scratch2, \maskLSR    /* 0 |G*G| 0 |B*B */
+    addq.ph         \scratch1, \scratch1, \scratch3
+    addq.ph         \scratch2, \scratch2, \scratch4
+    shra_r.ph       \scratch1, \scratch1, 8
+    shra_r.ph       \scratch2, \scratch2, 8
+    precr.qb.ph     \d_8888, \scratch1, \scratch2
+.endm
+
+/*
+ * Multiplies two pixels (a8r8g8b8) by two other pixels (a8r8g8b8), channel
+ * by channel. It requires the maskLSR register for the rounding step;
+ * maskLSR must have the following value:
+ *     li maskLSR, 0x00ff00ff
+ */
+
+.macro MIPS_2xUN8x4_MUL_2xUN8x4 s1_8888, \
+                                s2_8888, \
+                                m1_8888, \
+                                m2_8888, \
+                                d1_8888, \
+                                d2_8888, \
+                                maskLSR, \
+                                scratch1, scratch2, scratch3, \
+                                scratch4, scratch5, scratch6
+    preceu.ph.qbl   \scratch1, \m1_8888               /*  0 | A | 0 | R */
+    preceu.ph.qbr   \scratch2, \m1_8888               /*  0 | G | 0 | B */
+    preceu.ph.qbl   \scratch3, \m2_8888               /*  0 | A | 0 | R */
+    preceu.ph.qbr   \scratch4, \m2_8888               /*  0 | G | 0 | B */
+    muleu_s.ph.qbl  \scratch5, \s1_8888, \scratch1    /*  A*A  |  R*R */
+    muleu_s.ph.qbr  \scratch6, \s1_8888, \scratch2    /*  G*G  |  B*B */
+    muleu_s.ph.qbl  \scratch1, \s2_8888, \scratch3    /*  A*A  |  R*R */
+    muleu_s.ph.qbr  \scratch2, \s2_8888, \scratch4    /*  G*G  |  B*B */
+    shra_r.ph       \scratch3, \scratch5, 8
+    shra_r.ph       \scratch4, \scratch6, 8
+    shra_r.ph       \d1_8888, \scratch1, 8
+    shra_r.ph       \d2_8888, \scratch2, 8
+    and             \scratch3, \scratch3, \maskLSR    /* 0 |A*A| 0 |R*R */
+    and             \scratch4, \scratch4, \maskLSR    /* 0 |G*G| 0 |B*B */
+    and             \d1_8888, \d1_8888, \maskLSR      /* 0 |A*A| 0 |R*R */
+    and             \d2_8888, \d2_8888, \maskLSR      /* 0 |G*G| 0 |B*B */
+    addq.ph         \scratch3, \scratch3, \scratch5
+    addq.ph         \scratch4, \scratch4, \scratch6
+    addq.ph         \d1_8888, \d1_8888, \scratch1
+    addq.ph         \d2_8888, \d2_8888, \scratch2
+    shra_r.ph       \scratch3, \scratch3, 8
+    shra_r.ph       \scratch4, \scratch4, 8
+    shra_r.ph       \scratch5, \d1_8888, 8
+    shra_r.ph       \scratch6, \d2_8888, 8
+    precr.qb.ph     \d1_8888, \scratch3, \scratch4
+    precr.qb.ph     \d2_8888, \scratch5, \scratch6
+.endm
+
 #endif //PIXMAN_MIPS_DSPR2_ASM_H
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 2beada3..018770a 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -49,6 +49,11 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_8888_8888,
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888,
                                     uint8_t, 3, uint8_t, 3)

+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_8888_ca,
+                                       uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_0565_ca,
+                                       uint32_t, 1, uint16_t, 1)
+
 static pixman_bool_t
 pixman_fill_mips (uint32_t *bits,
                   int       stride,
@@ -184,6 +189,13 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, mips_composite_src_x888_8888),
     PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, mips_composite_src_0888_0888),

+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mips_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mips_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mips_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mips_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mips_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mips_composite_over_n_8888_0565_ca),
+
     { PIXMAN_OP_NONE },
 };
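Macro-generated bindings can be hard to read, so here is the first invocation above expanded by hand against the PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST definition added in pixman-mips-dspr2.h below (whitespace adjusted, nothing else changed):

    void
    pixman_composite_over_n_8888_8888_ca_asm_mips (uint32_t *dst,
                                                   uint32_t  src,
                                                   uint32_t *mask,
                                                   int32_t   w);

    static void
    mips_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                        pixman_composite_info_t *info)
    {
        PIXMAN_COMPOSITE_ARGS (info);
        uint32_t *dst_line, *dst;
        uint32_t *mask_line, *mask;
        int32_t   dst_stride, mask_stride;
        uint32_t  src;

        src = _pixman_image_get_solid (
            imp, src_image, dest_image->bits.format);

        /* (SKIP_ZERO_SRC & SKIP_ZERO_SRC) folds to nonzero */
        if ((SKIP_ZERO_SRC & SKIP_ZERO_SRC) && src == 0)
            return;

        PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t,
                               dst_stride, dst_line, 1);
        PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t,
                               mask_stride, mask_line, 1);

        while (height--)
        {
            dst = dst_line;
            dst_line += dst_stride;
            mask = mask_line;
            mask_line += mask_stride;
            pixman_composite_over_n_8888_8888_ca_asm_mips (dst, src, mask,
                                                           width);
        }
    }

The per-scanline split keeps all pointer and stride bookkeeping in C; the assembly only ever sees one dense row of width pixels.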
diff --git a/pixman/pixman-mips-dspr2.h b/pixman/pixman-mips-dspr2.h
index a40e7c8..cc35d02 100644
--- a/pixman/pixman-mips-dspr2.h
+++ b/pixman/pixman-mips-dspr2.h
@@ -85,4 +85,46 @@ mips_composite_##name (pixman_implementation_t *imp,              \
                        pixman_composite_info_t *info)                  \
     }                                                                   \
 }

+/*****************************************************************/
+
+#define PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST(flags, name,              \
+                                              mask_type, mask_cnt,      \
+                                              dst_type, dst_cnt)        \
+void                                                                    \
+pixman_composite_##name##_asm_mips (dst_type *dst,                      \
+                                    uint32_t  src,                      \
+                                    mask_type *mask,                    \
+                                    int32_t   w);                       \
+                                                                        \
+static void                                                             \
+mips_composite_##name (pixman_implementation_t *imp,                    \
+                       pixman_composite_info_t *info)                   \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);                                       \
+    dst_type *dst_line, *dst;                                           \
+    mask_type *mask_line, *mask;                                        \
+    int32_t dst_stride, mask_stride;                                    \
+    uint32_t src;                                                       \
+                                                                        \
+    src = _pixman_image_get_solid (                                     \
+        imp, src_image, dest_image->bits.format);                       \
+                                                                        \
+    if ((flags & SKIP_ZERO_SRC) && src == 0)                            \
+        return;                                                         \
+                                                                        \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,       \
+                           mask_stride, mask_line, mask_cnt);           \
+                                                                        \
+    while (height--)                                                    \
+    {                                                                   \
+        dst = dst_line;                                                 \
+        dst_line += dst_stride;                                         \
+        mask = mask_line;                                               \
+        mask_line += mask_stride;                                       \
+        pixman_composite_##name##_asm_mips (dst, src, mask, width);     \
+    }                                                                   \
+}
+
 #endif //PIXMAN_MIPS_DSPR2_H
-- 
2.7.4