From: Nemanja Lukic Date: Mon, 2 Jul 2012 18:54:19 +0000 (+0200) Subject: MIPS: DSPr2: Added several bilinear fast paths with a8 mask X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=707a8be11280c4d395e662e869d4a98d75bb5571;p=platform%2Fupstream%2Fpixman.git MIPS: DSPr2: Added several bilinear fast paths with a8 mask Performance numbers before/after on MIPS-74kc @ 1GHz: lowlevel-blt-bench -b Referent (before): src_8888_8_8888 = L1: 6.37 L2: 6.08 M: 5.46 ( 32.57%) HT: 4.64 VT: 4.61 R: 4.52 RT: 2.85 ( 23Kops/s) src_8888_8_0565 = L1: 5.89 L2: 5.66 M: 5.11 ( 23.71%) HT: 4.36 VT: 4.34 R: 4.26 RT: 2.71 ( 22Kops/s) src_0565_8_x888 = L1: 3.32 L2: 3.27 M: 3.17 ( 14.71%) HT: 2.86 VT: 2.84 R: 2.81 RT: 2.07 ( 19Kops/s) src_0565_8_0565 = L1: 3.19 L2: 3.15 M: 3.05 ( 10.11%) HT: 2.75 VT: 2.74 R: 2.71 RT: 2.00 ( 18Kops/s) over_8888_8_8888 = L1: 4.99 L2: 4.71 M: 4.11 ( 27.22%) HT: 3.59 VT: 3.58 R: 3.50 RT: 2.36 ( 21Kops/s) add_8888_8_8888 = L1: 5.60 L2: 5.26 M: 4.52 ( 29.95%) HT: 3.92 VT: 3.89 R: 3.80 RT: 2.49 ( 21Kops/s) Optimized: src_8888_8_8888 = L1: 13.19 L2: 12.13 M: 9.75 ( 58.22%) HT: 8.60 VT: 8.44 R: 7.90 RT: 5.06 ( 33Kops/s) src_8888_8_0565 = L1: 11.64 L2: 10.81 M: 9.18 ( 42.63%) HT: 8.04 VT: 7.90 R: 7.57 RT: 5.02 ( 32Kops/s) src_0565_8_x888 = L1: 8.34 L2: 7.95 M: 7.29 ( 33.85%) HT: 6.55 VT: 6.48 R: 6.25 RT: 4.35 ( 30Kops/s) src_0565_8_0565 = L1: 7.71 L2: 7.35 M: 6.90 ( 22.90%) HT: 6.14 VT: 6.10 R: 5.94 RT: 4.07 ( 29Kops/s) over_8888_8_8888 = L1: 9.73 L2: 8.99 M: 7.15 ( 47.41%) HT: 6.40 VT: 6.30 R: 6.11 RT: 4.28 ( 30Kops/s) add_8888_8_8888 = L1: 13.01 L2: 11.72 M: 8.70 ( 57.68%) HT: 7.59 VT: 7.46 R: 7.20 RT: 4.74 ( 32Kops/s) --- diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S index 48f108e..ac56746 100644 --- a/pixman/pixman-mips-dspr2-asm.S +++ b/pixman/pixman-mips-dspr2-asm.S @@ -749,6 +749,278 @@ LEAF_MIPS_DSPR2(pixman_composite_over_n_8_0565_asm_mips) END(pixman_composite_over_n_8_0565_asm_mips) +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_mips) +/* + * a0 - *dst + * a1 - *mask + * a2 - *src_top + * a3 - *src_bottom + * 16(sp) - wt + * 20(sp) - wb + * 24(sp) - vx + * 28(sp) - unit_x + * 32(sp) - w + */ + + lw v1, 32(sp) + beqz v1, 1f + nop + + SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8 + + lw s0, 44(sp) /* s0 = wt */ + lw s1, 48(sp) /* s1 = wb */ + lw s2, 52(sp) /* s2 = vx */ + lw s3, 56(sp) /* s3 = unit_x */ + li v0, BILINEAR_INTERPOLATION_RANGE + li s8, 0x00ff00ff + + sll s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) + sll s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +0: + andi t4, s2, 0xffff /* t4 = (short)vx */ + srl t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */ + subu t5, v0, t4 /* t5 = ( 256 - (vx>>8)) */ + + mul s4, s0, t5 /* s4 = wt*(256-(vx>>8)) */ + mul s5, s0, t4 /* s5 = wt*(vx>>8) */ + mul s6, s1, t5 /* s6 = wb*(256-(vx>>8)) */ + mul s7, s1, t4 /* s7 = wb*(vx>>8) */ + + sra t9, s2, 16 + sll t9, t9, 2 + addiu t8, t9, 4 + lwx t0, t9(a2) /* t0 = tl */ + lwx t1, t8(a2) /* t1 = tr */ + addiu v1, v1, -1 + lwx t2, t9(a3) /* t2 = bl */ + lwx t3, t8(a3) /* t3 = br */ + + BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7 + lbu t1, 0(a1) /* t1 = mask */ + addiu a1, a1, 1 + MIPS_UN8x4_MUL_UN8 t0, t1, t0, s8, t2, t3, t4 + + addu s2, s2, s3 /* vx += unit_x; */ + sw t0, 0(a0) + bnez v1, 0b + addiu a0, a0, 4 + + RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8 +1: + j ra + nop + +END(pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_mips) +/* + * a0 - *dst + * a1 - *mask + * a2 - *src_top + * a3 - *src_bottom + * 16(sp) - wt + * 20(sp) - wb + * 24(sp) - vx + * 28(sp) - unit_x + * 32(sp) - w + */ + + lw v1, 32(sp) + beqz v1, 1f + nop + + SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8 + + lw s0, 44(sp) /* s0 = wt */ + lw s1, 48(sp) /* s1 = wb */ + lw s2, 52(sp) /* s2 = vx */ + lw s3, 56(sp) /* s3 = unit_x */ + li v0, BILINEAR_INTERPOLATION_RANGE + li s8, 0x00ff00ff + + sll s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) + sll s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +0: + andi t4, s2, 0xffff /* t4 = (short)vx */ + srl t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */ + subu t5, v0, t4 /* t5 = ( 256 - (vx>>8)) */ + + mul s4, s0, t5 /* s4 = wt*(256-(vx>>8)) */ + mul s5, s0, t4 /* s5 = wt*(vx>>8) */ + mul s6, s1, t5 /* s6 = wb*(256-(vx>>8)) */ + mul s7, s1, t4 /* s7 = wb*(vx>>8) */ + + sra t9, s2, 16 + sll t9, t9, 2 + addiu t8, t9, 4 + lwx t0, t9(a2) /* t0 = tl */ + lwx t1, t8(a2) /* t1 = tr */ + addiu v1, v1, -1 + lwx t2, t9(a3) /* t2 = bl */ + lwx t3, t8(a3) /* t3 = br */ + + BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7 + lbu t1, 0(a1) /* t1 = mask */ + addiu a1, a1, 1 + MIPS_UN8x4_MUL_UN8 t0, t1, t0, s8, t2, t3, t4 + CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3 + + addu s2, s2, s3 /* vx += unit_x; */ + sh t1, 0(a0) + bnez v1, 0b + addiu a0, a0, 2 + + RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8 +1: + j ra + nop + +END(pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_mips) +/* + * a0 - *dst + * a1 - *mask + * a2 - *src_top + * a3 - *src_bottom + * 16(sp) - wt + * 20(sp) - wb + * 24(sp) - vx + * 28(sp) - unit_x + * 32(sp) - w + */ + + lw t0, 32(sp) + beqz t0, 1f + nop + + SAVE_REGS_ON_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra + + lw s0, 48(sp) /* s0 = wt */ + lw s1, 52(sp) /* s1 = wb */ + lw s2, 56(sp) /* s2 = vx */ + lw s3, 60(sp) /* s3 = unit_x */ + lw ra, 64(sp) /* ra = w */ + li v0, 0x00ff00ff + li v1, 0x07e007e0 + li s8, 0x001f001f + + sll s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) + sll s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +0: + andi t4, s2, 0xffff /* t4 = (short)vx */ + srl t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */ + li t5, BILINEAR_INTERPOLATION_RANGE + subu t5, t5, t4 /* t5 = ( 256 - (vx>>8)) */ + + mul s4, s0, t5 /* s4 = wt*(256-(vx>>8)) */ + mul s5, s0, t4 /* s5 = wt*(vx>>8) */ + mul s6, s1, t5 /* s6 = wb*(256-(vx>>8)) */ + mul s7, s1, t4 /* s7 = wb*(vx>>8) */ + + sra t9, s2, 16 + sll t9, t9, 1 + addiu t8, t9, 2 + lhx t0, t9(a2) /* t0 = tl */ + lhx t1, t8(a2) /* t1 = tr */ + andi t1, t1, 0xffff + addiu ra, ra, -1 + lhx t2, t9(a3) /* t2 = bl */ + lhx t3, t8(a3) /* t3 = br */ + andi t3, t3, 0xffff + + CONVERT_2x0565_TO_2x8888 t0, t1, t0, t1, v1, s8, t4, t5, t6, t7 + CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, v1, s8, t4, t5, t6, t7 + BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7 + lbu t1, 0(a1) /* t1 = mask */ + addiu a1, a1, 1 + MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t2, t3, t4 + + addu s2, s2, s3 /* vx += unit_x; */ + sw t0, 0(a0) + bnez ra, 0b + addiu a0, a0, 4 + + RESTORE_REGS_FROM_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra +1: + j ra + nop + +END(pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_mips) +/* + * a0 - *dst + * a1 - *mask + * a2 - *src_top + * a3 - *src_bottom + * 16(sp) - wt + * 20(sp) - wb + * 24(sp) - vx + * 28(sp) - unit_x + * 32(sp) - w + */ + + lw t0, 32(sp) + beqz t0, 1f + nop + + SAVE_REGS_ON_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra + + lw s0, 48(sp) /* s0 = wt */ + lw s1, 52(sp) /* s1 = wb */ + lw s2, 56(sp) /* s2 = vx */ + lw s3, 60(sp) /* s3 = unit_x */ + lw ra, 64(sp) /* ra = w */ + li v0, 0x00ff00ff + li v1, 0x07e007e0 + li s8, 0x001f001f + + sll s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) + sll s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +0: + andi t4, s2, 0xffff /* t4 = (short)vx */ + srl t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */ + li t5, BILINEAR_INTERPOLATION_RANGE + subu t5, t5, t4 /* t5 = ( 256 - (vx>>8)) */ + + mul s4, s0, t5 /* s4 = wt*(256-(vx>>8)) */ + mul s5, s0, t4 /* s5 = wt*(vx>>8) */ + mul s6, s1, t5 /* s6 = wb*(256-(vx>>8)) */ + mul s7, s1, t4 /* s7 = wb*(vx>>8) */ + + sra t9, s2, 16 + sll t9, t9, 1 + addiu t8, t9, 2 + lhx t0, t9(a2) /* t0 = tl */ + lhx t1, t8(a2) /* t1 = tr */ + andi t1, t1, 0xffff + addiu ra, ra, -1 + lhx t2, t9(a3) /* t2 = bl */ + lhx t3, t8(a3) /* t3 = br */ + andi t3, t3, 0xffff + + CONVERT_2x0565_TO_2x8888 t0, t1, t0, t1, v1, s8, t4, t5, t6, t7 + CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, v1, s8, t4, t5, t6, t7 + BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7 + lbu t1, 0(a1) /* t1 = mask */ + addiu a1, a1, 1 + MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t2, t3, t4 + CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3 + + addu s2, s2, s3 /* vx += unit_x; */ + sh t1, 0(a0) + bnez ra, 0b + addiu a0, a0, 2 + + RESTORE_REGS_FROM_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra +1: + j ra + nop + +END(pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_mips) + LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_mips) /* * a0 - dst (a8r8g8b8) @@ -815,3 +1087,68 @@ LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_mips) nop END(pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_mips) +/* + * a0 - *dst + * a1 - *mask + * a2 - *src_top + * a3 - *src_bottom + * 16(sp) - wt + * 20(sp) - wb + * 24(sp) - vx + * 28(sp) - unit_x + * 32(sp) - w + */ + + lw v1, 32(sp) + beqz v1, 1f + nop + + SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8 + + lw s0, 44(sp) /* s0 = wt */ + lw s1, 48(sp) /* s1 = wb */ + lw s2, 52(sp) /* s2 = vx */ + lw s3, 56(sp) /* s3 = unit_x */ + li v0, BILINEAR_INTERPOLATION_RANGE + li s8, 0x00ff00ff + + sll s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) + sll s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +0: + andi t4, s2, 0xffff /* t4 = (short)vx */ + srl t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */ + subu t5, v0, t4 /* t5 = ( 256 - (vx>>8)) */ + + mul s4, s0, t5 /* s4 = wt*(256-(vx>>8)) */ + mul s5, s0, t4 /* s5 = wt*(vx>>8) */ + mul s6, s1, t5 /* s6 = wb*(256-(vx>>8)) */ + mul s7, s1, t4 /* s7 = wb*(vx>>8) */ + + sra t9, s2, 16 + sll t9, t9, 2 + addiu t8, t9, 4 + lwx t0, t9(a2) /* t0 = tl */ + lwx t1, t8(a2) /* t1 = tr */ + addiu v1, v1, -1 + lwx t2, t9(a3) /* t2 = bl */ + lwx t3, t8(a3) /* t3 = br */ + + BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7 + lbu t1, 0(a1) /* t1 = mask */ + lw t2, 0(a0) /* t2 = dst */ + addiu a1, a1, 1 + MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, t1, t2, t0, s8, t3, t4, t5 + + addu s2, s2, s3 /* vx += unit_x; */ + sw t0, 0(a0) + bnez v1, 0b + addiu a0, a0, 4 + + RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8 +1: + j ra + nop + +END(pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_mips) diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h index 7cf3281..24b049e 100644 --- a/pixman/pixman-mips-dspr2-asm.h +++ b/pixman/pixman-mips-dspr2-asm.h @@ -566,6 +566,19 @@ LEAF_MIPS32R2(symbol) \ addu_s.qb \out2_8888, \d2_8888, \scratch2 .endm +.macro MIPS_UN8x4_MUL_UN8_ADD_UN8x4 s_8888, \ + m_8, \ + d_8888, \ + out_8888, \ + maskLSR, \ + scratch1, scratch2, scratch3 + MIPS_UN8x4_MUL_UN8 \s_8888, \m_8, \ + \out_8888, \maskLSR, \ + \scratch1, \scratch2, \scratch3 + + addu_s.qb \out_8888, \out_8888, \d_8888 +.endm + .macro BILINEAR_INTERPOLATE_SINGLE_PIXEL tl, tr, bl, br, \ scratch1, scratch2, \ alpha, red, green, blue \ diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c index 06d4335..66c0e5d 100644 --- a/pixman/pixman-mips-dspr2.c +++ b/pixman/pixman-mips-dspr2.c @@ -58,8 +58,18 @@ PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8888, PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_0565, uint8_t, 1, uint16_t, 1) +PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 8888_8_8888, SRC, + uint32_t, uint32_t) +PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 8888_8_0565, SRC, + uint32_t, uint16_t) +PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 0565_8_x888, SRC, + uint16_t, uint32_t) +PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 0565_8_0565, SRC, + uint16_t, uint16_t) PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, 8888_8_8888, OVER, uint32_t, uint32_t) +PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, 8888_8_8888, ADD, + uint32_t, uint32_t) static pixman_bool_t pixman_fill_mips (uint32_t *bits, @@ -209,9 +219,21 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] = PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mips_composite_over_n_8_0565), PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mips_composite_over_n_8_0565), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mips_8888_8_8888), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8_8888), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mips_8888_8_8888), + + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, r5g6b5, mips_8888_8_0565), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, r5g6b5, mips_8888_8_0565), + + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, x8r8g8b8, mips_0565_8_x888), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, mips_0565_8_0565), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mips_8888_8_8888), SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mips_8888_8_8888), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, mips_8888_8_8888), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, mips_8888_8_8888), { PIXMAN_OP_NONE }, };