From: Nemanja Lukic Date: Wed, 22 Feb 2012 13:23:48 +0000 (+0100) Subject: MIPS: DSPr2: Added fast-paths for SRC operation. X-Git-Tag: pixman-0.25.2~9 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=e7574d336b7c812a888fac22f99f1b0e9a3518b0;p=platform%2Fupstream%2Fpixman.git MIPS: DSPr2: Added fast-paths for SRC operation. Following fast-path functions are implemented (routines 4, 5 and 6 utilize same fast-memcpy routine): 1. src_x888_8888 2. src_8888_0565 3. src_0565_8888 4. src_0565_0565 5. src_8888_8888 6. src_0888_0888 Performance numbers before/after on MIPS-74kc @ 1GHz Referent (before): lowlevel-blt-bench: src_x888_8888 = L1: 199.35 L2: 96.54 M: 18.87 (100.68%) HT: 17.12 VT: 16.24 R: 15.43 RT: 9.33 ( 61Kops/s) src_8888_0565 = L1: 71.22 L2: 51.95 M: 24.19 ( 96.17%) HT: 20.71 VT: 19.92 R: 18.15 RT: 9.92 ( 63Kops/s) src_0565_8888 = L1: 38.82 L2: 36.22 M: 18.60 ( 73.95%) HT: 14.47 VT: 13.19 R: 12.97 RT: 6.61 ( 49Kops/s) src_0565_0565 = L1: 286.05 L2: 155.02 M: 37.68 (100.54%) HT: 31.08 VT: 28.07 R: 26.26 RT: 11.93 ( 68Kops/s) src_8888_8888 = L1: 454.32 L2: 139.15 M: 19.30 (102.98%) HT: 17.73 VT: 16.08 R: 16.62 RT: 10.45 ( 64Kops/s) src_0888_0888 = L1: 190.47 L2: 106.14 M: 25.26 (101.08%) HT: 21.88 VT: 20.32 R: 18.83 RT: 10.10 ( 63Kops/s) cairo-perf-trace: [ # ] backend test min(s) median(s) stddev. 
count [ # ] image: pixman 0.25.1 [ 0] image firefox-asteroids 421.215 421.325 0.01% 4/6 [ 1] image firefox-planet-gnome 647.708 648.486 0.13% 6/6 [ 2] image gnome-system-monitor 276.073 277.506 0.38% 6/6 [ 3] image gnome-terminal-vim 263.866 265.229 0.39% 6/6 [ 4] image poppler 123.576 124.003 0.15% 6/6 Optimized (with these optimizations): lowlevel-blt-bench: src_x888_8888 = L1: 369.50 L2: 99.37 M: 27.19 (145.07%) HT: 20.24 VT: 19.48 R: 19.00 RT: 10.22 ( 63Kops/s) src_8888_0565 = L1: 105.65 L2: 67.87 M: 25.41 (101.00%) HT: 20.78 VT: 19.84 R: 18.52 RT: 9.81 ( 63Kops/s) src_0565_8888 = L1: 77.10 L2: 63.04 M: 23.37 ( 92.90%) HT: 20.29 VT: 19.37 R: 18.14 RT: 10.02 ( 63Kops/s) src_0565_0565 = L1: 519.02 L2: 241.32 M: 62.35 (166.34%) HT: 33.74 VT: 27.63 R: 26.12 RT: 11.70 ( 67Kops/s) src_8888_8888 = L1: 390.48 L2: 113.99 M: 30.32 (161.77%) HT: 19.55 VT: 17.05 R: 17.13 RT: 10.19 ( 63Kops/s) src_0888_0888 = L1: 349.74 L2: 156.68 M: 40.68 (162.78%) HT: 25.58 VT: 20.57 R: 20.20 RT: 9.96 ( 63Kops/s) cairo-perf-trace: [ # ] backend test min(s) median(s) stddev. 
count [ # ] image: pixman 0.25.1 [ 0] image firefox-asteroids 400.050 400.308 0.04% 6/6 [ 1] image firefox-planet-gnome 628.978 629.364 0.07% 6/6 [ 2] image gnome-system-monitor 270.247 270.313 0.03% 6/6 [ 3] image gnome-terminal-vim 256.413 257.641 0.21% 6/6 [ 4] image poppler 119.540 120.023 0.21% 6/6 --- diff --git a/pixman/Makefile.am b/pixman/Makefile.am index a7fba33..fb7e047 100644 --- a/pixman/Makefile.am +++ b/pixman/Makefile.am @@ -107,7 +107,10 @@ if USE_MIPS_DSPR2 noinst_LTLIBRARIES += libpixman-mips-dspr2.la libpixman_mips_dspr2_la_SOURCES = \ pixman-mips-dspr2.c \ - pixman-mips-dspr2.h + pixman-mips-dspr2.h \ + pixman-mips-dspr2-asm.S \ + pixman-mips-dspr2-asm.h \ + pixman-mips-memcpy-asm.S libpixman_mips_dspr2_la_CFLAGS = $(DEP_CFLAGS) libpixman_mips_dspr2_la_LIBADD = $(DEP_LIBS) libpixman_1_la_LIBADD += libpixman-mips-dspr2.la diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S new file mode 100644 index 0000000..0a4c87e --- /dev/null +++ b/pixman/pixman-mips-dspr2-asm.S @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. 
``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Nemanja Lukic (nlukic@mips.com) + */ + +#include "pixman-mips-dspr2-asm.h" + +LEAF_MIPS_DSPR2(pixman_composite_src_8888_0565_asm_mips) +/* + * Converts w a8r8g8b8 pixels read from a1 into r5g6b5 pixels stored at a0. + * Pixels are handled two per loop pass, an odd last pixel is converted + * on its own. Uses t0-t8 as scratch. + * + * a0 - dst (r5g6b5) + * a1 - src (a8r8g8b8) + * a2 - w + */ + + beqz a2, 3f /* w is 0, nothing to convert */ + nop + addiu t1, a2, -1 + beqz t1, 2f /* w is 1, only the single pixel tail runs */ + nop + li t4, 0xf800f800 /* maskR for CONVERT_2x8888_TO_2x0565 */ + li t5, 0x07e007e0 /* maskG */ + li t6, 0x001f001f /* maskB */ +1: + lw t0, 0(a1) + lw t1, 4(a1) + addiu a1, a1, 8 + addiu a2, a2, -2 /* two pixels consumed */ + + CONVERT_2x8888_TO_2x0565 t0, t1, t2, t3, t4, t5, t6, t7, t8 + + sh t2, 0(a0) + sh t3, 2(a0) + + addiu t2, a2, -1 + bgtz t2, 1b /* loop while at least two pixels remain */ + addiu a0, a0, 4 /* branch delay slot, advances dst by two pixels */ +2: + beqz a2, 3f /* no odd pixel left */ + nop + lw t0, 0(a1) + + CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3 + + sh t1, 0(a0) +3: + j ra + nop + +END(pixman_composite_src_8888_0565_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_src_0565_8888_asm_mips) +/* + * Converts w r5g6b5 pixels read from a1 into a8r8g8b8 pixels stored at a0. + * Same loop shape as the 8888 to 0565 routine: two pixels per pass, + * then an optional single pixel tail. Uses t0-t9 as scratch. + * + * a0 - dst (a8r8g8b8) + * a1 - src (r5g6b5) + * a2 - w + */ + + beqz a2, 3f /* w is 0, nothing to convert */ + nop + addiu t1, a2, -1 + beqz t1, 2f /* w is 1, only the single pixel tail runs */ + nop + li t4, 0x07e007e0 /* maskG for CONVERT_2x0565_TO_2x8888 */ + li t5, 0x001F001F /* maskB */ +1: + lhu t0, 0(a1) + lhu t1, 2(a1) + addiu a1, a1, 4 + addiu a2, a2, -2 /* two pixels consumed */ + + CONVERT_2x0565_TO_2x8888 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9 + + sw t2, 0(a0) + sw t3, 4(a0) + + addiu t2, a2, -1 + bgtz t2, 1b /* loop while at least two pixels remain */ + addiu a0, a0, 8 /* branch delay slot, advances dst by two pixels */ +2: + beqz a2, 3f /* no odd pixel left */ + nop + lhu t0, 0(a1) + + CONVERT_1x0565_TO_1x8888 t0, t1, t2, t3 + + sw t1, 0(a0) 
+3: + j ra + nop + +END(pixman_composite_src_0565_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_src_x888_8888_asm_mips) +/* + * Copies w x8r8g8b8 pixels from a1 to a0 with the top byte forced to 0xff. + * Full blocks of 8 pixels are copied unrolled, the remaining pixels are + * copied one per pass. Uses t0-t9 as scratch. + * + * a0 - dst (a8r8g8b8) + * a1 - src (x8r8g8b8) + * a2 - w + */ + + beqz a2, 4f /* w is 0, nothing to copy */ + nop + li t9, 0xff000000 /* alpha bits ORed into every pixel */ + srl t8, a2, 3 /* t8 = how many multiples of 8 src pixels */ + beqz t8, 3f /* branch if less than 8 src pixels */ + nop +1: + addiu t8, t8, -1 + beqz t8, 2f /* the last block of 8 is copied at 2: without pref */ + addiu a2, a2, -8 /* branch delay slot, runs on both paths */ + pref 0, 32(a1) /* prefetch the next block of src */ + lw t0, 0(a1) + lw t1, 4(a1) + lw t2, 8(a1) + lw t3, 12(a1) + lw t4, 16(a1) + lw t5, 20(a1) + lw t6, 24(a1) + lw t7, 28(a1) + addiu a1, a1, 32 + or t0, t0, t9 + or t1, t1, t9 + or t2, t2, t9 + or t3, t3, t9 + or t4, t4, t9 + or t5, t5, t9 + or t6, t6, t9 + or t7, t7, t9 + pref 30, 32(a0) /* hint 30 on the next dst block, another block of 8 follows here */ + sw t0, 0(a0) + sw t1, 4(a0) + sw t2, 8(a0) + sw t3, 12(a0) + sw t4, 16(a0) + sw t5, 20(a0) + sw t6, 24(a0) + sw t7, 28(a0) + b 1b + addiu a0, a0, 32 /* branch delay slot */ +2: /* final full block of 8 pixels, same copy without the pref instructions */ + lw t0, 0(a1) + lw t1, 4(a1) + lw t2, 8(a1) + lw t3, 12(a1) + lw t4, 16(a1) + lw t5, 20(a1) + lw t6, 24(a1) + lw t7, 28(a1) + addiu a1, a1, 32 + or t0, t0, t9 + or t1, t1, t9 + or t2, t2, t9 + or t3, t3, t9 + or t4, t4, t9 + or t5, t5, t9 + or t6, t6, t9 + or t7, t7, t9 + sw t0, 0(a0) + sw t1, 4(a0) + sw t2, 8(a0) + sw t3, 12(a0) + sw t4, 16(a0) + sw t5, 20(a0) + sw t6, 24(a0) + sw t7, 28(a0) + beqz a2, 4f /* done when w was a multiple of 8 */ + addiu a0, a0, 32 /* branch delay slot */ +3: /* leftover pixels, one per pass */ + lw t0, 0(a1) + addiu a1, a1, 4 + addiu a2, a2, -1 + or t1, t0, t9 + sw t1, 0(a0) + bnez a2, 3b + addiu a0, a0, 4 /* branch delay slot */ +4: + jr ra + nop + +END(pixman_composite_src_x888_8888_asm_mips) diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h new file mode 100644 index 0000000..e07cda4 --- /dev/null +++ b/pixman/pixman-mips-dspr2-asm.h @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * Author: Nemanja Lukic (nlukic@mips.com) + */ + +#ifndef PIXMAN_MIPS_DSPR2_ASM_H +#define PIXMAN_MIPS_DSPR2_ASM_H + /* Symbolic names for the MIPS general purpose registers $0 to $31. fp and s8 both name $30. */ +#define zero $0 +#define AT $1 +#define v0 $2 +#define v1 $3 +#define a0 $4 +#define a1 $5 +#define a2 $6 +#define a3 $7 +#define t0 $8 +#define t1 $9 +#define t2 $10 +#define t3 $11 +#define t4 $12 +#define t5 $13 +#define t6 $14 +#define t7 $15 +#define s0 $16 +#define s1 $17 +#define s2 $18 +#define s3 $19 +#define s4 $20 +#define s5 $21 +#define s6 $22 +#define s7 $23 +#define t8 $24 +#define t9 $25 +#define k0 $26 +#define k1 $27 +#define gp $28 +#define sp $29 +#define fp $30 +#define s8 $30 +#define ra $31 + +/* + * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2 + * + * Exports the symbol as a function with an empty frame, saves the + * assembler settings with .set push and then selects arch=mips32r2, + * noreorder and noat. The matching END restores them with .set pop. + * With noreorder in effect the routine body has to fill every branch + * delay slot itself. + */ +#define LEAF_MIPS32R2(symbol) \ + .globl symbol; \ + .align 2; \ + .type symbol, @function; \ + .ent symbol, 0; \ +symbol: .frame sp, 0, ra; \ + .set push; \ + .set arch=mips32r2; \ + .set noreorder; \ + .set noat; + +/* + * LEAF_MIPS_DSPR2 - declare leaf routine for MIPS DSPr2 + * + * Same as LEAF_MIPS32R2 with the dspr2 instruction set enabled on top. + */ +#define LEAF_MIPS_DSPR2(symbol) \ +LEAF_MIPS32R2(symbol) \ + .set dspr2; + +/* + * END - mark end of function + * + * Restores the settings saved by the LEAF macro and records the + * size of the function. + */ +#define END(function) \ + .set pop; \ + .end function; \ + .size function,.-function + +/* + * Conversion of single r5g6b5 pixel (in_565) to single a8r8g8b8 pixel + * returned in (out_8888) register. Requires two temporary registers + * (scratch1 and scratch2). 
+ */ +.macro CONVERT_1x0565_TO_1x8888 in_565, \ + out_8888, \ + scratch1, scratch2 + lui \out_8888, 0xff00 /* alpha byte set to 0xff */ + sll \scratch1, \in_565, 0x3 + andi \scratch2, \scratch1, 0xff /* blue, 5 bits moved to bits 7..3 */ + ext \scratch1, \in_565, 0x2, 0x3 /* top 3 blue bits fill bits 2..0 */ + or \scratch1, \scratch2, \scratch1 + or \out_8888, \out_8888, \scratch1 + + sll \scratch1, \in_565, 0x5 + andi \scratch1, \scratch1, 0xfc00 /* green, 6 bits moved to bits 15..10 */ + srl \scratch2, \in_565, 0x1 + andi \scratch2, \scratch2, 0x300 /* top 2 green bits fill bits 9..8 */ + or \scratch2, \scratch1, \scratch2 + or \out_8888, \out_8888, \scratch2 + + andi \scratch1, \in_565, 0xf800 /* red, 5 bits still at bits 15..11 */ + srl \scratch2, \scratch1, 0x5 + andi \scratch2, \scratch2, 0xff00 /* top 3 red bits at bits 10..8 */ + or \scratch1, \scratch1, \scratch2 + sll \scratch1, \scratch1, 0x8 /* red byte moved up to bits 23..16 */ + or \out_8888, \out_8888, \scratch1 +.endm + +/* + * Conversion of two r5g6b5 pixels (in1_565 and in2_565) to two a8r8g8b8 pixels + * returned in (out1_8888 and out2_8888) registers. Requires four scratch + * registers (scratch1 ... scratch4). It also requires maskG and maskB for + * color component extractions. These masks must have following values: + * li maskG, 0x07e007e0 + * li maskB, 0x001F001F + * Both pixels are packed into one word and widened with paired halfword + * operations, out2_8888 is used as a working register until the last + * instruction. + */ +.macro CONVERT_2x0565_TO_2x8888 in1_565, in2_565, \ + out1_8888, out2_8888, \ + maskG, maskB, \ + scratch1, scratch2, scratch3, scratch4 + sll \scratch1, \in1_565, 16 + or \scratch1, \scratch1, \in2_565 /* in1 in the upper halfword, in2 in the lower */ + lui \out2_8888, 0xff00 + ori \out2_8888, \out2_8888, 0xff00 /* 0xff00ff00, alpha byte for both pixels */ + shrl.ph \scratch2, \scratch1, 11 + and \scratch3, \scratch1, \maskG + shra.ph \scratch4, \scratch2, 2 + shll.ph \scratch2, \scratch2, 3 + shll.ph \scratch3, \scratch3, 5 + or \scratch2, \scratch2, \scratch4 + shrl.qb \scratch4, \scratch3, 6 + or \out2_8888, \out2_8888, \scratch2 /* alpha and red bytes of both pixels */ + or \scratch3, \scratch3, \scratch4 + and \scratch1, \scratch1, \maskB + shll.ph \scratch2, \scratch1, 3 + shra.ph \scratch4, \scratch1, 2 + or \scratch2, \scratch2, \scratch4 + or \scratch3, \scratch2, \scratch3 /* green and blue bytes of both pixels */ + precrq.ph.w \out1_8888, \out2_8888, \scratch3 /* pixel 1, taken from the upper halfwords */ + precr_sra.ph.w \out2_8888, \scratch3, 0 /* pixel 2, taken from the lower halfwords */ +.endm + +/* + * Conversion of single a8r8g8b8 pixel (in_8888) to single 
r5g6b5 pixel + * returned in (out_565) register. Requires two temporary registers + * (scratch1 and scratch2). The low bits of each channel and the + * alpha byte are dropped. + */ +.macro CONVERT_1x8888_TO_1x0565 in_8888, \ + out_565, \ + scratch1, scratch2 + ext \out_565, \in_8888, 0x3, 0x5 /* blue, source bits 7..3 */ + srl \scratch1, \in_8888, 0x5 + andi \scratch1, \scratch1, 0x07e0 /* green, source bits 15..10 moved to 10..5 */ + srl \scratch2, \in_8888, 0x8 + andi \scratch2, \scratch2, 0xf800 /* red, source bits 23..19 moved to 15..11 */ + or \out_565, \out_565, \scratch1 + or \out_565, \out_565, \scratch2 +.endm + +/* + * Conversion of two a8r8g8b8 pixels (in1_8888 and in2_8888) to two r5g6b5 + * pixels returned in (out1_565 and out2_565) registers. Requires two temporary + * registers (scratch1 and scratch2). It also requires maskR, maskG and maskB + * for color component extractions. These masks must have following values: + * li maskR, 0xf800f800 + * li maskG, 0x07e007e0 + * li maskB, 0x001F001F + * Value of input register in2_8888 is lost. + * On exit out1_565 holds pixel 1 in its low halfword and still carries + * pixel 2 in its upper halfword, out2_565 holds pixel 2 only. + */ +.macro CONVERT_2x8888_TO_2x0565 in1_8888, in2_8888, \ + out1_565, out2_565, \ + maskR, maskG, maskB, \ + scratch1, scratch2 + precrq.ph.w \scratch1, \in2_8888, \in1_8888 /* alpha and red halfwords of in2 and in1 */ + precr_sra.ph.w \in2_8888, \in1_8888, 0 /* green and blue halfwords of in2 and in1 */ + shll.ph \scratch1, \scratch1, 8 + srl \in2_8888, \in2_8888, 3 + and \scratch2, \in2_8888, \maskB + and \scratch1, \scratch1, \maskR + srl \in2_8888, \in2_8888, 2 + and \out2_565, \in2_8888, \maskG + or \out2_565, \out2_565, \scratch2 + or \out1_565, \out2_565, \scratch1 /* both r5g6b5 pixels, pixel 1 in the low halfword */ + srl \out2_565, \out1_565, 16 /* pixel 2 on its own */ +.endm + +#endif //PIXMAN_MIPS_DSPR2_ASM_H diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c index 518dae1..e331853 100644 --- a/pixman/pixman-mips-dspr2.c +++ b/pixman/pixman-mips-dspr2.c @@ -36,8 +36,41 @@ #include "pixman-private.h" #include "pixman-mips-dspr2.h" /* One mips_composite_NAME wrapper per SRC fast path. The three bound with DO_FAST_MEMCPY are same-format copies. */ +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_x888_8888, + uint32_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_8888_0565, + uint32_t, 1, uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0565_8888, + uint16_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, 
src_0565_0565, + uint16_t, 1, uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_8888_8888, + uint32_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888, + uint8_t, 3, uint8_t, 3) + static const pixman_fast_path_t mips_dspr2_fast_paths[] = { /* SRC operator entries only, the table ends with the PIXMAN_OP_NONE entry. The bgr formats reuse the rgb routines. */ + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mips_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mips_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, mips_composite_src_8888_0565), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, mips_composite_src_8888_0565), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, mips_composite_src_8888_0565), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, mips_composite_src_8888_0565), + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, a8r8g8b8, mips_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, x8r8g8b8, mips_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, mips_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, mips_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mips_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mips_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mips_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mips_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mips_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mips_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, mips_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, mips_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, mips_composite_src_0888_0888), + { PIXMAN_OP_NONE }, }; diff --git a/pixman/pixman-mips-dspr2.h b/pixman/pixman-mips-dspr2.h 
index 4c764a8..449c42a 100644 --- a/pixman/pixman-mips-dspr2.h +++ b/pixman/pixman-mips-dspr2.h @@ -35,4 +35,50 @@ #include "pixman-private.h" #include "pixman-inlines.h" /* Values for the flags argument of the binding macro below. Only DO_FAST_MEMCPY is tested in this header, SKIP_ZERO_SRC and SKIP_ZERO_MASK are not referenced in the visible part of the patch. */ +#define SKIP_ZERO_SRC 1 +#define SKIP_ZERO_MASK 2 +#define DO_FAST_MEMCPY 3 + /* Assembly byte copy routine, n_bytes is a byte count. */ +void +pixman_mips_fast_memcpy (void *dst, void *src, uint32_t n_bytes); + +/****************************************************************/ + /* PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST declares the assembly row routine pixman_composite_NAME_asm_mips (dst, src, w) and defines the static wrapper mips_composite_NAME (imp, info). The wrapper fetches the first source and destination lines with PIXMAN_IMAGE_GET_LINE and then handles one row per loop pass. When flags equals DO_FAST_MEMCPY the row is copied with pixman_mips_fast_memcpy, otherwise the row routine is called with the width in pixels. src_cnt and dst_cnt are passed through as the last argument of PIXMAN_IMAGE_GET_LINE. */ +#define PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST(flags, name, \ + src_type, src_cnt, \ + dst_type, dst_cnt) \ +void \ +pixman_composite_##name##_asm_mips (dst_type *dst, \ + src_type *src, \ + int32_t w); \ + \ +static void \ +mips_composite_##name (pixman_implementation_t *imp, \ + pixman_composite_info_t *info) \ +{ \ + PIXMAN_COMPOSITE_ARGS (info); \ + dst_type *dst_line, *dst; \ + src_type *src_line, *src; \ + int32_t dst_stride, src_stride; \ + int bpp = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8; /* bytes per destination pixel, used on the memcpy path only */ \ + \ + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type, \ + src_stride, src_line, src_cnt); \ + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type, \ + dst_stride, dst_line, dst_cnt); \ + \ + while (height--) \ + { \ + dst = dst_line; \ + dst_line += dst_stride; \ + src = src_line; \ + src_line += src_stride; \ + \ + if (flags == DO_FAST_MEMCPY) /* copy the row as width * bpp raw bytes */ \ + pixman_mips_fast_memcpy (dst, src, width * bpp); \ + else \ + pixman_composite_##name##_asm_mips (dst, src, width); \ + } \ +} + #endif //PIXMAN_MIPS_DSPR2_H diff --git a/pixman/pixman-mips-memcpy-asm.S b/pixman/pixman-mips-memcpy-asm.S new file mode 100644 index 0000000..9ad6da5 --- /dev/null +++ b/pixman/pixman-mips-memcpy-asm.S @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "pixman-mips-dspr2-asm.h" + +/* + * This routine could be optimized for MIPS64. The current code only + * uses MIPS32 instructions. 
+ */ + +#ifdef EB +# define LWHI lwl /* high part is left in big-endian */ +# define SWHI swl /* high part is left in big-endian */ +# define LWLO lwr /* low part is right in big-endian */ +# define SWLO swr /* low part is right in big-endian */ +#else +# define LWHI lwr /* high part is right in little-endian */ +# define SWHI swr /* high part is right in little-endian */ +# define LWLO lwl /* low part is left in little-endian */ +# define SWLO swl /* low part is left in little-endian */ +#endif + +LEAF_MIPS32R2(pixman_mips_fast_memcpy) + /* a0 = dst, a1 = src, a2 = byte count, v0 returns the original dst. Copies shorter than 8 bytes go straight to the byte loop at $last8. Longer copies first align dst to a word, then move 64-byte chunks, one optional 32-byte chunk, whole words and finally single bytes. The $ua_ labels repeat the same steps with LWHI and LWLO pairs for a src that cannot be word-aligned together with dst. */ + slti AT, a2, 8 + bne AT, zero, $last8 + move v0, a0 /* memcpy returns the dst pointer */ + +/* Test if the src and dst are word-aligned, or can be made word-aligned */ + xor t8, a1, a0 + andi t8, t8, 0x3 /* t8 is a0/a1 word-displacement */ + + bne t8, zero, $unaligned + negu a3, a0 + + andi a3, a3, 0x3 /* we need to copy a3 bytes to make a0/a1 aligned */ + beq a3, zero, $chk16w /* when a3=0 then the dst (a0) is word-aligned */ + subu a2, a2, a3 /* now a2 is the remaining bytes count */ + + LWHI t8, 0(a1) + addu a1, a1, a3 + SWHI t8, 0(a0) + addu a0, a0, a3 + +/* Now the dst/src are mutually word-aligned with word-aligned addresses */ +$chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? 
*/ + /* t8 is the byte count after 64-byte chunks */ + + beq a2, t8, $chk8w /* if a2==t8, no 64-byte chunks */ + /* There will be at most 1 32-byte chunk after it */ + subu a3, a2, t8 /* subtract from a2 the remainder */ + /* Here a3 counts bytes in 16w chunks */ + addu a3, a0, a3 /* Now a3 is the final dst after 64-byte chunks */ + + addu t0, a0, a2 /* t0 is the "past the end" address */ + +/* + * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past + * the "t0-32" address + * This means: for x=128 the last "safe" a0 address is "t0-160" + * Alternatively, for x=64 the last "safe" a0 address is "t0-96" + * In the current version we use "pref 30, 128(a0)", so "t0-160" is the limit + */ + subu t9, t0, 160 /* t9 is the "last safe pref 30, 128(a0)" address */ + + pref 0, 0(a1) /* bring the first line of src, addr 0 */ + pref 0, 32(a1) /* bring the second line of src, addr 32 */ + pref 0, 64(a1) /* bring the third line of src, addr 64 */ + pref 30, 32(a0) /* safe, as we have at least 64 bytes ahead */ +/* In case the a0 > t9 don't use "pref 30" at all */ + sgtu v1, a0, t9 + bgtz v1, $loop16w /* skip "pref 30, 64(a0)" for too short arrays */ + nop +/* otherwise, start with using pref30 */ + pref 30, 64(a0) +$loop16w: + pref 0, 96(a1) + lw t0, 0(a1) + bgtz v1, $skip_pref30_96 /* skip "pref 30, 96(a0)" */ + lw t1, 4(a1) + pref 30, 96(a0) /* continue setting up the dest, addr 96 */ +$skip_pref30_96: + lw t2, 8(a1) + lw t3, 12(a1) + lw t4, 16(a1) + lw t5, 20(a1) + lw t6, 24(a1) + lw t7, 28(a1) + pref 0, 128(a1) /* bring the next lines of src, addr 128 */ + + sw t0, 0(a0) + sw t1, 4(a0) + sw t2, 8(a0) + sw t3, 12(a0) + sw t4, 16(a0) + sw t5, 20(a0) + sw t6, 24(a0) + sw t7, 28(a0) + + lw t0, 32(a1) + bgtz v1, $skip_pref30_128 /* skip "pref 30, 128(a0)" */ + lw t1, 36(a1) + pref 30, 128(a0) /* continue setting up the dest, addr 128 */ +$skip_pref30_128: + lw t2, 40(a1) + lw t3, 44(a1) + lw t4, 48(a1) + lw t5, 52(a1) + lw t6, 56(a1) + lw t7, 60(a1) + pref 0, 
160(a1) /* bring the next lines of src, addr 160 */ + + sw t0, 32(a0) + sw t1, 36(a0) + sw t2, 40(a0) + sw t3, 44(a0) + sw t4, 48(a0) + sw t5, 52(a0) + sw t6, 56(a0) + sw t7, 60(a0) + + addiu a0, a0, 64 /* adding 64 to dest */ + sgtu v1, a0, t9 + bne a0, a3, $loop16w + addiu a1, a1, 64 /* adding 64 to src */ + move a2, t8 + +/* Here we have src and dest word-aligned but less than 64-bytes to go */ + +$chk8w: + pref 0, 0x0(a1) + andi t8, a2, 0x1f /* is there a 32-byte chunk? */ + /* the t8 is the remainder count past 32-bytes */ + beq a2, t8, $chk1w /* when a2=t8, no 32-byte chunk */ + nop + + lw t0, 0(a1) + lw t1, 4(a1) + lw t2, 8(a1) + lw t3, 12(a1) + lw t4, 16(a1) + lw t5, 20(a1) + lw t6, 24(a1) + lw t7, 28(a1) + addiu a1, a1, 32 + + sw t0, 0(a0) + sw t1, 4(a0) + sw t2, 8(a0) + sw t3, 12(a0) + sw t4, 16(a0) + sw t5, 20(a0) + sw t6, 24(a0) + sw t7, 28(a0) + addiu a0, a0, 32 + +$chk1w: + andi a2, t8, 0x3 /* now a2 is the remainder past 1w chunks */ + beq a2, t8, $last8 + subu a3, t8, a2 /* a3 is count of bytes in 1w chunks */ + addu a3, a0, a3 /* now a3 is the dst address past the 1w chunks */ + +/* copying in words (4-byte chunks) */ +$wordCopy_loop: + lw t3, 0(a1) /* the first t3 may be equal t0 ... optimize? */ + addiu a1, a1, 4 + addiu a0, a0, 4 + bne a0, a3, $wordCopy_loop + sw t3, -4(a0) + +/* For the last (<8) bytes */ +$last8: + blez a2, leave + addu a3, a0, a2 /* a3 is the address just past the last dst byte */ +$last8loop: + lb v1, 0(a1) + addiu a1, a1, 1 + addiu a0, a0, 1 + bne a0, a3, $last8loop + sb v1, -1(a0) + +leave: j ra + nop + +/* + * UNALIGNED case + */ + +$unaligned: + /* got here with a3="negu a0" */ + andi a3, a3, 0x3 /* test if the a0 is word aligned */ + beqz a3, $ua_chk16w + subu a2, a2, a3 /* bytes left after initial a3 bytes */ + + LWHI v1, 0(a1) + LWLO v1, 3(a1) + addu a1, a1, a3 /* a3 may be here 1, 2 or 3 */ + SWHI v1, 0(a0) + addu a0, a0, a3 /* below the dst will be word aligned (NOTE1) */ + +$ua_chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? 
*/ + /* t8 is the byte count after 64-byte chunks */ + beq a2, t8, $ua_chk8w /* if a2==t8, no 64-byte chunks */ + /* There will be at most 1 32-byte chunk after it */ + subu a3, a2, t8 /* subtract from a2 the remainder */ + /* Here a3 counts bytes in 16w chunks */ + addu a3, a0, a3 /* Now a3 is the final dst after 64-byte chunks */ + + addu t0, a0, a2 /* t0 is the "past the end" address */ + + subu t9, t0, 160 /* t9 is the "last safe pref 30, 128(a0)" address */ + + pref 0, 0(a1) /* bring the first line of src, addr 0 */ + pref 0, 32(a1) /* bring the second line of src, addr 32 */ + pref 0, 64(a1) /* bring the third line of src, addr 64 */ + pref 30, 32(a0) /* safe, as we have at least 64 bytes ahead */ +/* In case the a0 > t9 don't use "pref 30" at all */ + sgtu v1, a0, t9 + bgtz v1, $ua_loop16w /* skip "pref 30, 64(a0)" for too short arrays */ + nop +/* otherwise, start with using pref30 */ + pref 30, 64(a0) +$ua_loop16w: + pref 0, 96(a1) + LWHI t0, 0(a1) + LWLO t0, 3(a1) + LWHI t1, 4(a1) + bgtz v1, $ua_skip_pref30_96 + LWLO t1, 7(a1) + pref 30, 96(a0) /* continue setting up the dest, addr 96 */ +$ua_skip_pref30_96: + LWHI t2, 8(a1) + LWLO t2, 11(a1) + LWHI t3, 12(a1) + LWLO t3, 15(a1) + LWHI t4, 16(a1) + LWLO t4, 19(a1) + LWHI t5, 20(a1) + LWLO t5, 23(a1) + LWHI t6, 24(a1) + LWLO t6, 27(a1) + LWHI t7, 28(a1) + LWLO t7, 31(a1) + pref 0, 128(a1) /* bring the next lines of src, addr 128 */ + + sw t0, 0(a0) + sw t1, 4(a0) + sw t2, 8(a0) + sw t3, 12(a0) + sw t4, 16(a0) + sw t5, 20(a0) + sw t6, 24(a0) + sw t7, 28(a0) + + LWHI t0, 32(a1) + LWLO t0, 35(a1) + LWHI t1, 36(a1) + bgtz v1, $ua_skip_pref30_128 + LWLO t1, 39(a1) + pref 30, 128(a0) /* continue setting up the dest, addr 128 */ +$ua_skip_pref30_128: + LWHI t2, 40(a1) + LWLO t2, 43(a1) + LWHI t3, 44(a1) + LWLO t3, 47(a1) + LWHI t4, 48(a1) + LWLO t4, 51(a1) + LWHI t5, 52(a1) + LWLO t5, 55(a1) + LWHI t6, 56(a1) + LWLO t6, 59(a1) + LWHI t7, 60(a1) + LWLO t7, 63(a1) + pref 0, 160(a1) /* bring the next lines of src, 
addr 160 */ + + sw t0, 32(a0) + sw t1, 36(a0) + sw t2, 40(a0) + sw t3, 44(a0) + sw t4, 48(a0) + sw t5, 52(a0) + sw t6, 56(a0) + sw t7, 60(a0) + + addiu a0, a0, 64 /* adding 64 to dest */ + sgtu v1, a0, t9 + bne a0, a3, $ua_loop16w + addiu a1, a1, 64 /* adding 64 to src */ + move a2, t8 + +/* Here dest is word-aligned (src is not) and less than 64-bytes are left to go */ + +$ua_chk8w: + pref 0, 0x0(a1) + andi t8, a2, 0x1f /* is there a 32-byte chunk? */ + /* the t8 is the remainder count */ + beq a2, t8, $ua_chk1w /* when a2=t8, no 32-byte chunk */ + + LWHI t0, 0(a1) /* NOTE(review): no nop follows the beq above, so under noreorder this load presumably sits in its delay slot and also runs when the branch is taken, confirm this is intended */ + LWLO t0, 3(a1) + LWHI t1, 4(a1) + LWLO t1, 7(a1) + LWHI t2, 8(a1) + LWLO t2, 11(a1) + LWHI t3, 12(a1) + LWLO t3, 15(a1) + LWHI t4, 16(a1) + LWLO t4, 19(a1) + LWHI t5, 20(a1) + LWLO t5, 23(a1) + LWHI t6, 24(a1) + LWLO t6, 27(a1) + LWHI t7, 28(a1) + LWLO t7, 31(a1) + addiu a1, a1, 32 + + sw t0, 0(a0) + sw t1, 4(a0) + sw t2, 8(a0) + sw t3, 12(a0) + sw t4, 16(a0) + sw t5, 20(a0) + sw t6, 24(a0) + sw t7, 28(a0) + addiu a0, a0, 32 + +$ua_chk1w: + andi a2, t8, 0x3 /* now a2 is the remainder past 1w chunks */ + beq a2, t8, $ua_smallCopy + subu a3, t8, a2 /* a3 is count of bytes in 1w chunks */ + addu a3, a0, a3 /* now a3 is the dst address past the 1w chunks */ + +/* copying in words (4-byte chunks) */ +$ua_wordCopy_loop: + LWHI v1, 0(a1) + LWLO v1, 3(a1) + addiu a1, a1, 4 + addiu a0, a0, 4 /* note: dst=a0 is word aligned here, see NOTE1 */ + bne a0, a3, $ua_wordCopy_loop + sw v1, -4(a0) + +/* Now less than 4 bytes (value in a2) left to copy */ +$ua_smallCopy: + beqz a2, leave + addu a3, a0, a2 /* a3 is the address just past the last dst byte */ +$ua_smallCopy_loop: + lb v1, 0(a1) + addiu a1, a1, 1 + addiu a0, a0, 1 + bne a0, a3, $ua_smallCopy_loop + sb v1, -1(a0) + + j ra + nop + +END(pixman_mips_fast_memcpy)