From 618a08e6aa03b38e8dc71ac610f7fdd55e8a8558 Mon Sep 17 00:00:00 2001
From: Nemanja Lukic
Date: Thu, 3 May 2012 00:03:42 +0200
Subject: [PATCH] MIPS: DSPr2: Added over_n_8_8888 and over_n_8_0565 fast paths.

Performance numbers before/after on MIPS-74kc @ 1GHz

Reference (before):
lowlevel-blt-bench:
  over_n_8_8888 = L1: 10.40  L2: 9.79  M: 8.47 ( 33.62%)  HT: 7.64  VT: 7.59  R: 7.48  RT: 5.30 ( 40Kops/s)
  over_n_8_0565 = L1:  7.40  L2: 7.23  M: 6.78 ( 17.94%)  HT: 6.23  VT: 6.17  R: 6.14  RT: 4.62 ( 37Kops/s)

Optimized:
lowlevel-blt-bench:
  over_n_8_8888 = L1: 27.25  L2: 26.24  M: 18.15 ( 72.12%)  HT: 14.52  VT: 14.31  R: 13.83  RT: 7.57 ( 48Kops/s)
  over_n_8_0565 = L1: 18.91  L2: 17.59  M: 15.06 ( 39.90%)  HT: 12.18  VT: 11.98  R: 11.83  RT: 6.80 ( 46Kops/s)
---
 pixman/pixman-mips-dspr2-asm.S | 224 +++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-mips-dspr2-asm.h |  67 ++++++++++++
 pixman/pixman-mips-dspr2.c     |  10 ++
 3 files changed, 301 insertions(+)

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 6a0fc18..68ad33f 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -527,3 +527,227 @@ LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_0565_ca_asm_mips)
      nop
 
 END(pixman_composite_over_n_8888_0565_ca_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_n_8_8888_asm_mips)
+/*
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 4, s0, s1, s2, s3, s4
+    beqz      a3, 4f
+     nop
+    li        t4, 0x00ff00ff
+    li        t5, 0xff
+    addiu     t0, a3, -1
+    beqz      t0, 3f               /* last pixel */
+     srl      t6, a1, 24           /* t6 = srca */
+    not       s4, a1
+    beq       t5, t6, 2f           /* if (srca == 0xff) */
+     srl      s4, s4, 24
+1:
+    /* a1 = src */
+    lbu       t0, 0(a2)            /* t0 = mask */
+    lbu       t1, 1(a2)            /* t1 = mask */
+    or        t2, t0, t1
+    beqz      t2, 111f             /* if (t0 == 0) && (t1 == 0) */
+     addiu    a2, a2, 2
+    and       t3, t0, t1
+
+    lw        t2, 0(a0)            /* t2 = dst */
+    beq       t3, t5, 11f          /* if (t0 == 0xff) && (t1 == 0xff) */
+     lw       t3, 4(a0)            /* t3 = dst */
+
+    MIPS_2xUN8x4_MUL_2xUN8 a1, a1, t0, t1, s0, s1, t4, t6, t7, t8, t9, s2, s3
+    not       s2, s0
+    not       s3, s1
+    srl       s2, s2, 24
+    srl       s3, s3, 24
+    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, s2, s3, t2, t3, t4, t0, t1, t6, t7, t8, t9
+    addu_s.qb s2, t2, s0
+    addu_s.qb s3, t3, s1
+    sw        s2, 0(a0)
+    b         111f
+     sw       s3, 4(a0)
+11:
+    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, s4, s4, t2, t3, t4, t0, t1, t6, t7, t8, t9
+    addu_s.qb s2, t2, a1
+    addu_s.qb s3, t3, a1
+    sw        s2, 0(a0)
+    sw        s3, 4(a0)
+
+111:
+    addiu     a3, a3, -2
+    addiu     t0, a3, -1
+    bgtz      t0, 1b
+     addiu    a0, a0, 8
+    b         3f
+     nop
+2:
+    /* a1 = src */
+    lbu       t0, 0(a2)            /* t0 = mask */
+    lbu       t1, 1(a2)            /* t1 = mask */
+    or        t2, t0, t1
+    beqz      t2, 222f             /* if (t0 == 0) && (t1 == 0) */
+     addiu    a2, a2, 2
+    and       t3, t0, t1
+    beq       t3, t5, 22f          /* if (t0 == 0xff) && (t1 == 0xff) */
+     nop
+    lw        t2, 0(a0)            /* t2 = dst */
+    lw        t3, 4(a0)            /* t3 = dst */
+
+    OVER_2x8888_2x8_2x8888 a1, a1, t0, t1, t2, t3, \
+                           t6, t7, t4, t8, t9, s0, s1, s2, s3
+    sw        t6, 0(a0)
+    b         222f
+     sw       t7, 4(a0)
+22:
+    sw        a1, 0(a0)
+    sw        a1, 4(a0)
+222:
+    addiu     a3, a3, -2
+    addiu     t0, a3, -1
+    bgtz      t0, 2b
+     addiu    a0, a0, 8
+3:
+    blez      a3, 4f
+     nop
+    /* a1 = src */
+    lbu       t0, 0(a2)            /* t0 = mask */
+    beqz      t0, 4f               /* if (t0 == 0) */
+     addiu    a2, a2, 1
+    move      t3, a1
+    beq       t0, t5, 31f          /* if (t0 == 0xff) */
+     lw       t1, 0(a0)            /* t1 = dst */
+
+    MIPS_UN8x4_MUL_UN8 a1, t0, t3, t4, t6, t7, t8
+31:
+    not       t2, t3
+    srl       t2, t2, 24
+    MIPS_UN8x4_MUL_UN8 t1, t2, t1, t4, t6, t7, t8
+    addu_s.qb t2, t1, t3
+    sw        t2, 0(a0)
+4:
+    RESTORE_REGS_FROM_STACK 4, s0, s1, s2, s3, s4
+    j         ra
+     nop
+
+END(pixman_composite_over_n_8_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_n_8_0565_asm_mips)
+/*
+ * a0 - dst (r5g6b5)
+ * a1 - src (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+    SAVE_REGS_ON_STACK 24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8
+    beqz      a3, 4f
+     nop
+    li        t4, 0x00ff00ff
+    li        t5, 0xff
+    li        t6, 0xf800f800
+    li        t7, 0x07e007e0
+    li        t8, 0x001F001F
+    addiu     t1, a3, -1
+    beqz      t1, 3f               /* last pixel */
+     srl      t0, a1, 24           /* t0 = srca */
+    not       v0, a1
+    beq       t0, t5, 2f           /* if (srca == 0xff) */
+     srl      v0, v0, 24
+1:
+    /* a1 = src */
+    lbu       t0, 0(a2)            /* t0 = mask */
+    lbu       t1, 1(a2)            /* t1 = mask */
+    or        t2, t0, t1
+    beqz      t2, 111f             /* if (t0 == 0) && (t1 == 0) */
+     addiu    a2, a2, 2
+    lhu       t2, 0(a0)            /* t2 = dst */
+    lhu       t3, 2(a0)            /* t3 = dst */
+    CONVERT_2x0565_TO_2x8888 t2, t3, s0, s1, t7, t8, t9, s2, s3, s4
+    and       t9, t0, t1
+    beq       t9, t5, 11f          /* if (t0 == 0xff) && (t1 == 0xff) */
+     nop
+
+    MIPS_2xUN8x4_MUL_2xUN8 a1, a1, t0, t1, s2, s3, t4, t9, s4, s5, s6, s7, s8
+    not       s4, s2
+    not       s5, s3
+    srl       s4, s4, 24
+    srl       s5, s5, 24
+    MIPS_2xUN8x4_MUL_2xUN8 s0, s1, s4, s5, s0, s1, t4, t9, t0, t1, s6, s7, s8
+    addu_s.qb s4, s2, s0
+    addu_s.qb s5, s3, s1
+    CONVERT_2x8888_TO_2x0565 s4, s5, t2, t3, t6, t7, t8, s0, s1
+    sh        t2, 0(a0)
+    b         111f
+     sh       t3, 2(a0)
+11:
+    MIPS_2xUN8x4_MUL_2xUN8 s0, s1, v0, v0, s0, s1, t4, t9, t0, t1, s6, s7, s8
+    addu_s.qb s4, a1, s0
+    addu_s.qb s5, a1, s1
+    CONVERT_2x8888_TO_2x0565 s4, s5, t2, t3, t6, t7, t8, s0, s1
+    sh        t2, 0(a0)
+    sh        t3, 2(a0)
+111:
+    addiu     a3, a3, -2
+    addiu     t0, a3, -1
+    bgtz      t0, 1b
+     addiu    a0, a0, 4
+    b         3f
+     nop
+2:
+    CONVERT_1x8888_TO_1x0565 a1, s0, s1, s2
+21:
+    /* a1 = src */
+    lbu       t0, 0(a2)            /* t0 = mask */
+    lbu       t1, 1(a2)            /* t1 = mask */
+    or        t2, t0, t1
+    beqz      t2, 222f             /* if (t0 == 0) && (t1 == 0) */
+     addiu    a2, a2, 2
+    and       t9, t0, t1
+    move      s2, s0
+    beq       t9, t5, 22f          /* if (t0 == 0xff) && (t1 == 0xff) */
+     move     s3, s0
+    lhu       t2, 0(a0)            /* t2 = dst */
+    lhu       t3, 2(a0)            /* t3 = dst */
+
+    CONVERT_2x0565_TO_2x8888 t2, t3, s2, s3, t7, t8, s4, s5, s6, s7
+    OVER_2x8888_2x8_2x8888 a1, a1, t0, t1, s2, s3, \
+                           t2, t3, t4, t9, s4, s5, s6, s7, s8
+    CONVERT_2x8888_TO_2x0565 t2, t3, s2, s3, t6, t7, t8, s4, s5
+22:
+    sh        s2, 0(a0)
+    sh        s3, 2(a0)
+222:
+    addiu     a3, a3, -2
+    addiu     t0, a3, -1
+    bgtz      t0, 21b
+     addiu    a0, a0, 4
+3:
+    blez      a3, 4f
+     nop
+    /* a1 = src */
+    lbu       t0, 0(a2)            /* t0 = mask */
+    beqz      t0, 4f               /* if (t0 == 0) */
+     nop
+    lhu       t1, 0(a0)            /* t1 = dst */
+    CONVERT_1x0565_TO_1x8888 t1, t2, t3, t7
+    beq       t0, t5, 31f          /* if (t0 == 0xff) */
+     move     t3, a1
+
+    MIPS_UN8x4_MUL_UN8 a1, t0, t3, t4, t7, t8, t9
+31:
+    not       t6, t3
+    srl       t6, t6, 24
+    MIPS_UN8x4_MUL_UN8 t2, t6, t2, t4, t7, t8, t9
+    addu_s.qb t1, t2, t3
+    CONVERT_1x8888_TO_1x0565 t1, t2, t3, t7
+    sh        t2, 0(a0)
+4:
+    RESTORE_REGS_FROM_STACK 24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8
+    j         ra
+     nop
+
+END(pixman_composite_over_n_8_0565_asm_mips)
diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
index 12ff42c..8383060 100644
--- a/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman-mips-dspr2-asm.h
@@ -499,4 +499,71 @@ LEAF_MIPS32R2(symbol)                                   \
     precr.qb.ph \d2_8888, \scratch5, \scratch6
 .endm
 
+/*
+ * OVER operation on a single a8r8g8b8 source pixel (s_8888) and a single
+ * a8r8g8b8 destination pixel (d_8888), using an a8 mask (m_8). It also
+ * requires maskLSR for the rounding process; maskLSR must have this value:
+ *     li maskLSR, 0x00ff00ff
+ */
+.macro OVER_8888_8_8888 s_8888,   \
+                        m_8,      \
+                        d_8888,   \
+                        out_8888, \
+                        maskLSR,  \
+                        scratch1, scratch2, scratch3, scratch4
+    MIPS_UN8x4_MUL_UN8 \s_8888, \m_8, \
+                       \scratch1, \maskLSR, \
+                       \scratch2, \scratch3, \scratch4
+
+    not                \scratch2, \scratch1
+    srl                \scratch2, \scratch2, 24
+
+    MIPS_UN8x4_MUL_UN8 \d_8888, \scratch2, \
+                       \d_8888, \maskLSR, \
+                       \scratch3, \scratch4, \out_8888
+
+    addu_s.qb          \out_8888, \d_8888, \scratch1
+.endm
+
+/*
+ * OVER operation on two a8r8g8b8 source pixels (s1_8888 and s2_8888) and two
+ * a8r8g8b8 destination pixels (d1_8888 and d2_8888) using a8 masks (m1_8 and
+ * m2_8). It also requires maskLSR for the rounding process; maskLSR must
+ * have the following value:
+ *     li maskLSR, 0x00ff00ff
+ */
+.macro OVER_2x8888_2x8_2x8888 s1_8888,   \
+                              s2_8888,   \
+                              m1_8,      \
+                              m2_8,      \
+                              d1_8888,   \
+                              d2_8888,   \
+                              out1_8888, \
+                              out2_8888, \
+                              maskLSR,   \
+                              scratch1, scratch2, scratch3, \
+                              scratch4, scratch5, scratch6
+    MIPS_2xUN8x4_MUL_2xUN8 \s1_8888, \s2_8888, \
+                           \m1_8, \m2_8, \
+                           \scratch1, \scratch2, \
+                           \maskLSR, \
+                           \scratch3, \scratch4, \out1_8888, \
+                           \out2_8888, \scratch5, \scratch6
+
+    not                    \scratch3, \scratch1
+    srl                    \scratch3, \scratch3, 24
+    not                    \scratch4, \scratch2
+    srl                    \scratch4, \scratch4, 24
+
+    MIPS_2xUN8x4_MUL_2xUN8 \d1_8888, \d2_8888, \
+                           \scratch3, \scratch4, \
+                           \d1_8888, \d2_8888, \
+                           \maskLSR, \
+                           \scratch5, \scratch6, \out1_8888, \
+                           \out2_8888, \scratch3, \scratch4
+
+    addu_s.qb              \out1_8888, \d1_8888, \scratch1
+    addu_s.qb              \out2_8888, \d2_8888, \scratch2
+.endm
+
 #endif //PIXMAN_MIPS_DSPR2_ASM_H
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 018770a..7081734 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -53,6 +53,10 @@ PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_8888_ca,
                                        uint32_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_0565_ca,
                                        uint32_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8888,
+                                       uint8_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_0565,
+                                       uint8_t, 1, uint16_t, 1)
 
 static pixman_bool_t
 pixman_fill_mips (uint32_t *bits,
@@ -195,6 +199,12 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mips_composite_over_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mips_composite_over_n_8888_0565_ca),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mips_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mips_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mips_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mips_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mips_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mips_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mips_composite_over_n_8_0565),
 
     { PIXMAN_OP_NONE },
 };
-- 
2.7.4
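
For reference, both new fast paths implement the same per-pixel operation:
OVER of a solid a8r8g8b8 source through an a8 mask. The C sketch below is
illustrative only and is not part of the patch; the helper names
(un8x4_mul_un8, un8x4_add_sat, over_n_8_8888_pixel) are made up for this
note and are not pixman APIs. un8x4_mul_un8 mirrors what MIPS_UN8x4_MUL_UN8
computes with maskLSR = 0x00ff00ff, and un8x4_add_sat mirrors the addu_s.qb
instruction; the assembly above performs the same math on two pixels per
loop iteration, with extra branches to skip zero masks and to shortcut
fully opaque masks.

#include <stdint.h>

/* Multiply all four 8-bit channels of x by the 8-bit value a, with the
 * usual pixman rounding (what MIPS_UN8x4_MUL_UN8 computes when maskLSR
 * holds 0x00ff00ff). */
uint32_t
un8x4_mul_un8 (uint32_t x, uint32_t a)
{
    uint32_t rb = (x & 0x00ff00ff) * a + 0x00800080;
    uint32_t ag = ((x >> 8) & 0x00ff00ff) * a + 0x00800080;

    rb = ((rb + ((rb >> 8) & 0x00ff00ff)) >> 8) & 0x00ff00ff;
    ag = (ag + ((ag >> 8) & 0x00ff00ff)) & 0xff00ff00;

    return ag | rb;
}

/* Per-byte saturating add, the C counterpart of the addu_s.qb instruction. */
uint32_t
un8x4_add_sat (uint32_t x, uint32_t y)
{
    uint32_t result = 0;
    int i;

    for (i = 0; i < 32; i += 8)
    {
        uint32_t s = ((x >> i) & 0xff) + ((y >> i) & 0xff);
        result |= (s > 0xff ? 0xff : s) << i;
    }
    return result;
}

/* OVER of a solid a8r8g8b8 source over one a8r8g8b8 destination pixel,
 * through an a8 mask:  dst = (src IN mask) ADD dst * ~alpha(src IN mask). */
uint32_t
over_n_8_8888_pixel (uint32_t src, uint8_t mask, uint32_t dst)
{
    uint32_t s, inv_alpha;

    if (mask == 0)
        return dst;                     /* fully transparent mask: skip */

    s = (mask == 0xff) ? src : un8x4_mul_un8 (src, mask);
    inv_alpha = (~s) >> 24;             /* 255 - alpha of the masked source */

    return un8x4_add_sat (un8x4_mul_un8 (dst, inv_alpha), s);
}

The over_n_8_0565 path performs the same computation; it only adds the
CONVERT_2x0565_TO_2x8888 / CONVERT_2x8888_TO_2x0565 steps to expand the
r5g6b5 destination to a8r8g8b8 before the blend and pack it back afterwards.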