From 43914d68d1c87a9da6f53e6b0a12941c97bb0e5d Mon Sep 17 00:00:00 2001
From: Nemanja Lukic
Date: Wed, 27 Feb 2013 14:39:45 +0100
Subject: [PATCH] MIPS: DSPr2: Added more fast-paths for OVER operation:
 - over_8888_0565
 - over_n_8_8

Performance numbers before/after on MIPS-74kc @ 1GHz:

lowlevel-blt-bench results

Referent (before):
 over_8888_0565 = L1: 14.30 L2: 13.22 M: 10.43 ( 41.56%) HT: 12.51 VT: 12.95 R: 11.82 RT: 7.34 ( 49Kops/s)
 over_n_8_8 = L1: 12.77 L2: 16.93 M: 15.03 ( 29.94%) HT: 10.78 VT: 10.72 R: 10.29 RT: 4.92 ( 33Kops/s)

Optimized:
 over_8888_0565 = L1: 26.03 L2: 22.92 M: 15.68 ( 62.43%) HT: 16.19 VT: 16.27 R: 14.93 RT: 8.60 ( 52Kops/s)
 over_n_8_8 = L1: 62.00 L2: 55.17 M: 40.29 ( 80.23%) HT: 26.77 VT: 25.64 R: 24.13 RT: 10.01 ( 47Kops/s)
---
 pixman/pixman-mips-dspr2-asm.S | 198 +++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-mips-dspr2.c | 7 ++
 2 files changed, 205 insertions(+)

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index ddfacef..299f739 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -658,6 +658,126 @@ LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_0565_ca_asm_mips)
 
 END(pixman_composite_over_n_8888_0565_ca_asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_composite_over_n_8_8_asm_mips)
+/* OVER fast path "over_n_8_8": constant src, a8 mask, a8 dst, w pixels.
+ * a0 - dst (a8)
+ * a1 - src (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, v0
+    li               t9, 0x00ff00ff /* t9 = mask applied by the `and` steps in the 4-pixel loop */
+    beqz             a3, 3f /* w == 0: nothing to do */
+     nop
+    srl              v0, a3, 2 /* v0 = how many multiples of 4 dst pixels */
+    beqz             v0, 1f /* branch if fewer than 4 dst pixels */
+     nop
+
+    srl              t8, a1, 24 /* t8 = top byte of src (presumably its alpha channel) */
+    replv.ph         t8, t8 /* NOTE(review): presumably copies that value into both halfwords of t8 - confirm in the DSP ASE manual */
+
+0: /* main loop: 4 pixels per iteration, v0 = iterations left */
+    beqz             v0, 1f /* no full group of 4 left: go to the tail */
+     addiu           v0, v0, -1
+    lbu              t0, 0(a2) /* t0..t3 = 4 mask bytes */
+    lbu              t1, 1(a2)
+    lbu              t2, 2(a2)
+    lbu              t3, 3(a2)
+    lbu              t4, 0(a0) /* t4..t7 = 4 dst bytes */
+    lbu              t5, 1(a0)
+    lbu              t6, 2(a0)
+    lbu              t7, 3(a0)
+
+    addiu            a2, a2, 4 /* mask += 4 */
+
+    precr_sra.ph.w   t1, t0, 0
+    precr_sra.ph.w   t3, t2, 0
+    precr_sra.ph.w   t5, t4, 0
+    precr_sra.ph.w   t7, t6, 0
+
+    precr.qb.ph      t0, t3, t1 /* NOTE(review): presumably t0 now holds the 4 mask bytes packed into one word - confirm */
+    precr.qb.ph      t1, t7, t5 /* NOTE(review): presumably t1 now holds the 4 dst bytes packed into one word - confirm */
+
+    muleu_s.ph.qbl   t2, t0, t8 /* presumably mask * src top byte, two bytes per instruction */
+    muleu_s.ph.qbr   t3, t0, t8
+    shra_r.ph        t4, t2, 8 /* NOTE(review): this shift/and/add/shift sequence looks like a rounded divide by 255 - confirm */
+    shra_r.ph        t5, t3, 8
+    and              t4, t4, t9
+    and              t5, t5, t9
+    addq.ph          t2, t2, t4
+    addq.ph          t3, t3, t5
+    shra_r.ph        t2, t2, 8
+    shra_r.ph        t3, t3, 8
+    precr.qb.ph      t0, t2, t3 /* t0 = the two scaled halves repacked into one word */
+    not              t6, t0 /* t6 = bitwise complement of t0 */
+
+    preceu.ph.qbl    t7, t6
+    preceu.ph.qbr    t6, t6
+
+    muleu_s.ph.qbl   t2, t1, t7 /* presumably dst * complemented scaled mask, two bytes per instruction */
+    muleu_s.ph.qbr   t3, t1, t6
+    shra_r.ph        t4, t2, 8 /* same shift/and/add/shift sequence as above */
+    shra_r.ph        t5, t3, 8
+    and              t4, t4, t9
+    and              t5, t5, t9
+    addq.ph          t2, t2, t4
+    addq.ph          t3, t3, t5
+    shra_r.ph        t2, t2, 8
+    shra_r.ph        t3, t3, 8
+    precr.qb.ph      t1, t2, t3
+
+    addu_s.qb        t2, t0, t1 /* t2 = t0 + t1 (presumably per-byte, saturating) */
+
+    sb               t2, 0(a0) /* write the 4 result bytes, lowest byte first, shifting t2 right by 8 between stores */
+    srl              t2, t2, 8
+    sb               t2, 1(a0)
+    srl              t2, t2, 8
+    sb               t2, 2(a0)
+    srl              t2, t2, 8
+    sb               t2, 3(a0)
+    addiu            a3, a3, -4 /* w -= 4 */
+    b                0b
+     addiu           a0, a0, 4 /* dst += 4 */
+
+1: /* tail: remaining pixels, one per iteration */
+    beqz             a3, 3f /* none left */
+     nop
+    srl              t8, a1, 24 /* t8 = top byte of src, as a plain scalar here (no replv.ph) */
+2:
+    lbu              t0, 0(a2) /* t0 = mask byte */
+    lbu              t1, 0(a0) /* t1 = dst byte */
+    addiu            a2, a2, 1 /* mask += 1 */
+
+    mul              t2, t0, t8 /* t2 = mask * src top byte */
+    shra_r.ph        t3, t2, 8
+    andi             t3, t3, 0x00ff
+    addq.ph          t2, t2, t3
+    shra_r.ph        t2, t2, 8
+    not              t3, t2
+    andi             t3, t3, 0x00ff /* t3 = low byte of ~t2 */
+
+
+    mul              t4, t1, t3 /* t4 = dst * t3 */
+    shra_r.ph        t5, t4, 8
+    andi             t5, t5, 0x00ff
+    addq.ph          t4, t4, t5
+    shra_r.ph        t4, t4, 8
+    andi             t4, t4, 0x00ff
+
+    addu_s.qb        t2, t2, t4
+    sb               t2, 0(a0) /* only the low byte of t2 is stored */
+    addiu            a3, a3, -1 /* w -= 1 */
+    bnez             a3, 2b /* loop until w reaches 0 */
+     addiu           a0, a0, 1 /* dst += 1 */
+
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j                ra
+     nop
+
+END(pixman_composite_over_n_8_8_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_composite_over_n_8_8888_asm_mips)
 /*
  * a0 - dst (a8r8g8b8)
@@ -1342,6 +1462,84 @@ LEAF_MIPS_DSPR2(pixman_composite_over_8888_8888_asm_mips)
 
 END(pixman_composite_over_8888_8888_asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_composite_over_8888_0565_asm_mips)
+/* OVER fast path "over_8888_0565": a8r8g8b8 src onto r5g6b5 dst, w pixels.
+ * a0 - dst (r5g6b5)
+ * a1 - src (a8r8g8b8)
+ * a2 - w
+ */
+
+    SAVE_REGS_ON_STACK 8, s0, s1, s2, s3, s4, s5
+    li               t4, 0x00ff00ff /* t4 is handed to the *_MUL_* macros below */
+    li               s3, 0xf800f800 /* s3, s4, s5 are handed to the CONVERT_2x* macros below */
+    li               s4, 0x07e007e0
+    li               s5, 0x001F001F
+    beqz             a2, 3f /* w == 0: nothing to do */
+     nop
+    addiu            t1, a2, -1
+    beqz             t1, 2f /* w == 1: only the single-pixel tail runs */
+     nop
+1: /* main loop: 2 pixels per iteration */
+    lw               t0, 0(a1) /* t0 = source (a8r8g8b8) */
+    lw               t1, 4(a1) /* t1 = source (a8r8g8b8) */
+    lhu              t2, 0(a0) /* t2 = destination (r5g6b5) */
+    lhu              t3, 2(a0) /* t3 = destination (r5g6b5) */
+    addiu            a1, a1, 8 /* src += 2 pixels */
+
+    not              t5, t0
+    srl              t5, t5, 24 /* t5 = (~t0) >> 24, i.e. 255 - top byte of the first src pixel */
+    not              t6, t1
+    srl              t6, t6, 24 /* t6 = (~t1) >> 24, same for the second src pixel */
+
+    or               t7, t5, t6
+    beqz             t7, 11f /* both src top bytes are 0xff: skip the blend, just convert and store */
+     or              t8, t0, t1
+    beqz             t8, 12f /* both src words are 0: skip the store. NOTE(review): no nop follows, so the first instruction of the macro below presumably sits in the delay slot - confirm this is intended */
+
+    CONVERT_2x0565_TO_2x8888 t2, t3, s0, s1, s4, s5, t7, t8, t9, s2
+    MIPS_2xUN8x4_MUL_2xUN8 s0, s1, t5, t6, t7, t8, t4, t9, t2, t3, s2, s0, s1
+
+    addu_s.qb        t0, t7, t0 /* t0 += t7 and t1 += t8 (t7, t8 presumably are the MUL macro outputs) */
+    addu_s.qb        t1, t8, t1
+11:
+    CONVERT_2x8888_TO_2x0565 t0, t1, t7, t8, s3, s4, s5, t2, t3
+    sh               t7, 0(a0) /* store both 16-bit results */
+    sh               t8, 2(a0)
+12:
+    addiu            a2, a2, -2 /* w -= 2 */
+    addiu            t1, a2, -1
+    bgtz             t1, 1b /* loop again while w > 1 */
+     addiu           a0, a0, 4 /* dst += 2 pixels (2 bytes each) */
+2: /* tail: at most one pixel left */
+    beqz             a2, 3f /* none left */
+     nop
+
+    lw               t0, 0(a1) /* t0 = source (a8r8g8b8) */
+    lhu              t1, 0(a0) /* t1 = destination (r5g6b5) */
+    addiu            a1, a1, 4
+
+    not              t2, t0
+    srl              t2, t2, 24 /* t2 = (~t0) >> 24, i.e. 255 - top byte of src */
+
+    beqz             t2, 21f /* src top byte is 0xff: store the converted src directly */
+     nop
+    beqz             t0, 3f /* src word is 0: leave dst untouched. NOTE(review): again no nop after this branch - confirm the delay-slot instruction is harmless */
+
+    CONVERT_1x0565_TO_1x8888 t1, s0, t8, t9
+    MIPS_UN8x4_MUL_UN8 s0, t2, t3, t4, t5, t6, t7
+
+    addu_s.qb        t0, t3, t0 /* t0 += t3 (t3 presumably is the MUL macro output) */
+21:
+    CONVERT_1x8888_TO_1x0565 t0, s0, t8, t9
+    sh               s0, 0(a0)
+
+3:
+    RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5
+    j                ra
+     nop
+
+END(pixman_composite_over_8888_0565_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_composite_over_n_0565_asm_mips)
 /*
  * a0 - dst (r5g6b5)
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index e14e1c4..cdc71cd 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -50,6 +50,8 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888,
                                     uint8_t, 3, uint8_t, 3)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_8888,
                                     uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_0565, /* presumably binds the new over_8888_0565 asm routine - confirm against the macro definition */
+                                    uint32_t, 1, uint16_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8_8,
                                     uint8_t, 1, uint8_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8888_8888,
@@ -67,6 +69,8 @@ PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_8888_ca,
                                        uint32_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_0565_ca,
                                        uint32_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8, /* presumably binds the new over_n_8_8 asm routine - confirm against the macro definition */
+                                       uint8_t, 1, uint8_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8888,
                                        uint8_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_0565,
@@ -290,6 +294,7 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mips_composite_over_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mips_composite_over_n_8888_0565_ca),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mips_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, mips_composite_over_n_8_8), /* new entry; arguments presumably are (op, src, mask, dst, func) */
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mips_composite_over_n_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mips_composite_over_n_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mips_composite_over_n_8_8888),
@@ -318,6 +323,8 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mips_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mips_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mips_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mips_composite_over_8888_0565), /* new entry: same routine registered for both channel orders */
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mips_composite_over_8888_0565),
     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mips_composite_add_n_8_8),
     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, mips_composite_add_n_8_8888),
     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, mips_composite_add_n_8_8888),
-- 
2.7.4