From b660eb30b4e5f690d191b26a500a6ba224986b3a Mon Sep 17 00:00:00 2001 From: Nemanja Lukic Date: Fri, 14 Sep 2012 09:31:24 +0200 Subject: [PATCH] MIPS: DSPr2: Added fast-paths for OVER operation: - over_8888_n_0565 - over_8888_8_0565 Performance numbers before/after on MIPS-74kc @ 1GHz: lowlevel-blt-bench results Referent (before): over_8888_n_0565 = L1: 8.95 L2: 8.33 M: 6.95 ( 27.74%) HT: 4.27 VT: 4.07 R: 4.01 RT: 1.74 ( 19Kops/s) over_8888_8_0565 = L1: 8.86 L2: 8.11 M: 6.72 ( 35.71%) HT: 5.68 VT: 5.62 R: 5.47 RT: 3.35 ( 30Kops/s) Optimized: over_8888_n_0565 = L1: 18.76 L2: 17.55 M: 13.11 ( 52.19%) HT: 11.35 VT: 11.10 R: 10.88 RT: 6.94 ( 47Kops/s) over_8888_8_0565 = L1: 18.14 L2: 16.79 M: 12.10 ( 64.25%) HT: 10.24 VT: 9.98 R: 9.63 RT: 5.89 ( 43Kops/s) --- pixman/pixman-mips-dspr2-asm.S | 115 +++++++++++++++++++++++++++++++++++++++++ pixman/pixman-mips-dspr2.c | 8 +++ 2 files changed, 123 insertions(+) diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S index 165f177..dc44f9c 100644 --- a/pixman/pixman-mips-dspr2-asm.S +++ b/pixman/pixman-mips-dspr2-asm.S @@ -800,6 +800,63 @@ LEAF_MIPS_DSPR2(pixman_composite_over_8888_n_8888_asm_mips) END(pixman_composite_over_8888_n_8888_asm_mips) +LEAF_MIPS_DSPR2(pixman_composite_over_8888_n_0565_asm_mips) +/* + * a0 - dst (r5g6b5) + * a1 - src (a8r8g8b8) + * a2 - mask (32bit constant) + * a3 - w + */ + + SAVE_REGS_ON_STACK 0, s0, s1, s2, s3 + li t6, 0x00ff00ff + li t7, 0xf800f800 + li t8, 0x07e007e0 + li t9, 0x001F001F + beqz a3, 3f + nop + srl a2, a2, 24 /* keep only the top byte of the mask constant */ + addiu t1, a3, -1 + beqz t1, 2f /* w == 1: skip the two-pixel loop */ + nop +1: + lw t0, 0(a1) /* t0 = source (a8r8g8b8) */ + lw t1, 4(a1) /* t1 = source (a8r8g8b8) */ + /* a2 = mask (top byte of the 32bit constant) */ + lhu t2, 0(a0) /* t2 = destination (r5g6b5) */ + lhu t3, 2(a0) /* t3 = destination (r5g6b5) */ + addiu a1, a1, 8 + + CONVERT_2x0565_TO_2x8888 t2, t3, t4, t5, t8, t9, s0, s1, t2, t3 + OVER_2x8888_2x8_2x8888 t0, t1, a2, a2, t4, t5, \ + t2, t3, t6, t0, t1, s0, s1, s2, s3 + 
CONVERT_2x8888_TO_2x0565 t2, t3, t4, t5, t7, t8, t9, s0, s1 + + sh t4, 0(a0) + sh t5, 2(a0) + addiu a3, a3, -2 + addiu t1, a3, -1 + bgtz t1, 1b + addiu a0, a0, 4 +2: + beqz a3, 3f + nop + lw t0, 0(a1) /* t0 = source (a8r8g8b8) */ + /* a2 = mask (top byte of the 32bit constant) */ + lhu t1, 0(a0) /* t1 = destination (r5g6b5) */ + + CONVERT_1x0565_TO_1x8888 t1, t2, t4, t5 + OVER_8888_8_8888 t0, a2, t2, t1, t6, t3, t4, t5, t7 + CONVERT_1x8888_TO_1x0565 t1, t3, t4, t5 + + sh t3, 0(a0) +3: + RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3 + j ra + nop + +END(pixman_composite_over_8888_n_0565_asm_mips) + LEAF_MIPS_DSPR2(pixman_composite_over_8888_8_8888_asm_mips) /* * a0 - dst (a8r8g8b8) @@ -851,6 +908,64 @@ LEAF_MIPS_DSPR2(pixman_composite_over_8888_8_8888_asm_mips) END(pixman_composite_over_8888_8_8888_asm_mips) +LEAF_MIPS_DSPR2(pixman_composite_over_8888_8_0565_asm_mips) +/* + * a0 - dst (r5g6b5) + * a1 - src (a8r8g8b8) + * a2 - mask (a8) + * a3 - w + */ + + SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5 + li t6, 0x00ff00ff + li t7, 0xf800f800 + li t8, 0x07e007e0 + li t9, 0x001F001F + beqz a3, 3f + nop + addiu t1, a3, -1 + beqz t1, 2f /* w == 1: skip the two-pixel loop */ + nop +1: + lw t0, 0(a1) /* t0 = source (a8r8g8b8) */ + lw t1, 4(a1) /* t1 = source (a8r8g8b8) */ + lbu t2, 0(a2) /* t2 = mask (a8) */ + lbu t3, 1(a2) /* t3 = mask (a8) */ + lhu t4, 0(a0) /* t4 = destination (r5g6b5) */ + lhu t5, 2(a0) /* t5 = destination (r5g6b5) */ + addiu a1, a1, 8 + addiu a2, a2, 2 + + CONVERT_2x0565_TO_2x8888 t4, t5, s0, s1, t8, t9, s2, s3, s4, s5 + OVER_2x8888_2x8_2x8888 t0, t1, t2, t3, s0, s1, \ + t4, t5, t6, s2, s3, s4, s5, t0, t1 + CONVERT_2x8888_TO_2x0565 t4, t5, s0, s1, t7, t8, t9, s2, s3 + + sh s0, 0(a0) + sh s1, 2(a0) + addiu a3, a3, -2 + addiu t1, a3, -1 + bgtz t1, 1b + addiu a0, a0, 4 +2: + beqz a3, 3f + nop + lw t0, 0(a1) /* t0 = source (a8r8g8b8) */ + lbu t1, 0(a2) /* t1 = mask (a8) */ + lhu t2, 0(a0) /* t2 = destination (r5g6b5) */ + + CONVERT_1x0565_TO_1x8888 t2, t3, t4, t5 + OVER_8888_8_8888 t0, t1, t3, t2, t6, t4, t5, t7, 
t8 + CONVERT_1x8888_TO_1x0565 t2, t3, t4, t5 + + sh t3, 0(a0) +3: + RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5 + j ra + nop + +END(pixman_composite_over_8888_8_0565_asm_mips) + LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_mips) /* * a0 - *dst diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c index a0ed50c..8666496 100644 --- a/pixman/pixman-mips-dspr2.c +++ b/pixman/pixman-mips-dspr2.c @@ -60,9 +60,13 @@ PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_0565, PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_8888_n_8888, uint32_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_8888_n_0565, + uint32_t, 1, uint16_t, 1) PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8_8888, uint32_t, 1, uint8_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8_0565, uint32_t, 1, + uint8_t, 1, uint16_t, 1) PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 8888_8888, SRC, uint32_t, uint32_t) @@ -242,10 +246,14 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] = PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mips_composite_over_8888_n_8888), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mips_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, r5g6b5, mips_composite_over_8888_n_0565), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, b5g6r5, mips_composite_over_8888_n_0565), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, mips_composite_over_8888_8_8888), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, mips_composite_over_8888_8_8888), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, mips_composite_over_8888_8_8888), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, mips_composite_over_8888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, r5g6b5, mips_composite_over_8888_8_0565), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, b5g6r5, mips_composite_over_8888_8_0565), SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, 
a8r8g8b8, mips_8888_8888), SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8888), -- 2.7.4