MIPS: DSPr2: Added fast-paths for OVER operation: - over_0565_n_0565 - over_0565_8_0565
author Nemanja Lukic <nemanja.lukic@rt-rk.com>
Fri, 14 Sep 2012 07:31:25 +0000 (09:31 +0200)
committer Søren Sandmann Pedersen <ssp@redhat.com>
Mon, 24 Sep 2012 21:12:57 +0000 (17:12 -0400)
Performance numbers before/after on MIPS-74kc @ 1GHz:

lowlevel-blt-bench results

Reference (before):
        over_0565_n_0565 =  L1:   7.56  L2:   7.24  M:  6.16 ( 16.38%)  HT:  4.01  VT:  3.84  R:  3.79  RT:  1.66 (  18Kops/s)
        over_0565_8_0565 =  L1:   7.43  L2:   7.05  M:  5.98 ( 23.85%)  HT:  5.27  VT:  5.23  R:  5.09  RT:  3.14 (  28Kops/s)

Optimized:
        over_0565_n_0565 =  L1:  15.47  L2:  14.52  M: 12.30 ( 32.65%)  HT: 10.76  VT: 10.57  R: 10.27  RT:  6.63 (  46Kops/s)
        over_0565_8_0565 =  L1:  15.47  L2:  14.61  M: 11.78 ( 46.92%)  HT: 10.00  VT:  9.84  R:  9.40  RT:  5.81 (  43Kops/s)

pixman/pixman-mips-dspr2-asm.S
pixman/pixman-mips-dspr2.c

index dc44f9c..a4426f2 100644 (file)
@@ -857,6 +857,65 @@ LEAF_MIPS_DSPR2(pixman_composite_over_8888_n_0565_asm_mips)
 
 END(pixman_composite_over_8888_n_0565_asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_composite_over_0565_n_0565_asm_mips)
+/*
+ * a0 - dst  (r5g6b5), read and written in place
+ * a1 - src  (r5g6b5)
+ * a2 - mask (32bit constant), reduced to its top byte before the loop
+ * a3 - w    (pixel count: handled two at a time, then one odd pixel)
+ */
+
+    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5
+    li       t6, 0x00ff00ff /* t6 = constant handed to the OVER_* macros */
+    li       t7, 0xf800f800 /* t7 = bits 15..11 of each halfword */
+    li       t8, 0x07e007e0 /* t8 = bits 10..5  of each halfword */
+    li       t9, 0x001F001F /* t9 = bits  4..0  of each halfword */
+    beqz     a3, 3f /* w == 0: nothing to composite, go restore and return */
+     nop
+    srl      a2, a2, 24 /* keep only the top byte of the mask (presumably alpha - confirm) */
+    addiu    t1, a3, -1 /* t1 = w - 1 */
+    beqz     t1, 2f /* w == 1: skip the paired loop, use the single-pixel tail */
+     nop
+1: /* main loop: two pixels per iteration */
+    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */
+    lhu      t1, 2(a1) /* t1 = source      (r5g6b5) */
+                       /* a2 = mask        (32bit constant) */
+    lhu      t2, 0(a0) /* t2 = destination (r5g6b5) */
+    lhu      t3, 2(a0) /* t3 = destination (r5g6b5) */
+    addiu    a1, a1, 4 /* src advances by two 16-bit pixels */
+
+    CONVERT_2x0565_TO_2x8888 t0, t1, t4, t5, t8, t9, s0, s1, s2, s3 /* source pair; presumably expanded into t4, t5 - confirm against macro */
+    CONVERT_2x0565_TO_2x8888 t2, t3, s0, s1, t8, t9, s2, s3, s4, s5 /* destination pair; presumably expanded into s0, s1 - confirm */
+    OVER_2x8888_2x8_2x8888   t4, t5, a2, a2, s0, s1, \
+                             t0, t1, t6, s2, s3, s4, s5, t4, t5 /* same mask byte (a2) for both pixels; presumably result in t0, t1 - confirm */
+    CONVERT_2x8888_TO_2x0565 t0, t1, s0, s1, t7, t8, t9, s2, s3 /* presumably packs t0, t1 into s0, s1, which are stored below - confirm */
+
+    sh       s0, 0(a0) /* write first pixel of the pair */
+    sh       s1, 2(a0) /* write second pixel of the pair */
+    addiu    a3, a3, -2 /* w -= 2 */
+    addiu    t1, a3, -1 /* t1 = w - 1 */
+    bgtz     t1, 1b /* loop again while w - 1 > 0, i.e. at least two pixels remain */
+     addiu   a0, a0, 4 /* dst advances by two pixels; sits in the slot right after the branch */
+2: /* tail: zero or one pixel left */
+    beqz     a3, 3f /* no odd pixel remains */
+     nop
+    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */
+                       /* a2 = mask        (32bit constant) */
+    lhu      t1, 0(a0) /* t1 = destination (r5g6b5) */
+
+    CONVERT_1x0565_TO_1x8888 t0, t2, t4, t5 /* source; presumably expanded into t2 - confirm against macro */
+    CONVERT_1x0565_TO_1x8888 t1, t3, t4, t5 /* destination; presumably expanded into t3 - confirm */
+    OVER_8888_8_8888         t2, a2, t3, t0, t6, t1, t4, t5, t7 /* NOTE(review): t7 (mask constant) is an operand here; presumably scratch, no pixels follow - confirm */
+    CONVERT_1x8888_TO_1x0565 t0, t3, t4, t5 /* presumably packs t0 into t3, which is stored below - confirm */
+
+    sh       t3, 0(a0) /* write the final pixel */
+3: /* common exit, also taken directly when w == 0 */
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5
+    j        ra /* return to caller */
+     nop
+
+END(pixman_composite_over_0565_n_0565_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_composite_over_8888_8_8888_asm_mips)
 /*
  * a0 - dst  (a8r8g8b8)
@@ -966,6 +1025,66 @@ LEAF_MIPS_DSPR2(pixman_composite_over_8888_8_0565_asm_mips)
 
 END(pixman_composite_over_8888_8_0565_asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_composite_over_0565_8_0565_asm_mips)
+/*
+ * a0 - dst  (r5g6b5), read and written in place
+ * a1 - src  (r5g6b5)
+ * a2 - mask (a8), one byte read per pixel
+ * a3 - w    (pixel count: handled two at a time, then one odd pixel)
+ */
+
+    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5
+    li       t4, 0xf800f800 /* t4 = bits 15..11 of each halfword */
+    li       t5, 0x07e007e0 /* t5 = bits 10..5  of each halfword */
+    li       t6, 0x001F001F /* t6 = bits  4..0  of each halfword */
+    li       t7, 0x00ff00ff /* t7 = constant handed to the OVER_* macros */
+    beqz     a3, 3f /* w == 0: nothing to composite, go restore and return */
+     nop
+    addiu    t1, a3, -1 /* t1 = w - 1 */
+    beqz     t1, 2f /* w == 1: skip the paired loop, use the single-pixel tail */
+     nop
+1: /* main loop: two pixels per iteration */
+    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */
+    lhu      t1, 2(a1) /* t1 = source      (r5g6b5) */
+    lbu      t2, 0(a2) /* t2 = mask        (a8) */
+    lbu      t3, 1(a2) /* t3 = mask        (a8) */
+    lhu      t8, 0(a0) /* t8 = destination (r5g6b5) */
+    lhu      t9, 2(a0) /* t9 = destination (r5g6b5) */
+    addiu    a1, a1, 4 /* src advances by two 16-bit pixels */
+    addiu    a2, a2, 2 /* mask advances by two 8-bit entries */
+
+    CONVERT_2x0565_TO_2x8888 t0, t1, s0, s1, t5, t6, s2, s3, s4, s5 /* source pair; presumably expanded into s0, s1 - confirm against macro */
+    CONVERT_2x0565_TO_2x8888 t8, t9, s2, s3, t5, t6, s4, s5, t0, t1 /* destination pair; presumably expanded into s2, s3 - confirm */
+    OVER_2x8888_2x8_2x8888   s0, s1, t2, t3, s2, s3, \
+                             t0, t1, t7, s4, s5, t8, t9, s0, s1 /* per-pixel mask bytes t2, t3; presumably result in t0, t1 - confirm */
+    CONVERT_2x8888_TO_2x0565 t0, t1, s0, s1, t4, t5, t6, s2, s3 /* presumably packs t0, t1 into s0, s1, which are stored below - confirm */
+
+    sh       s0, 0(a0) /* write first pixel of the pair */
+    sh       s1, 2(a0) /* write second pixel of the pair */
+    addiu    a3, a3, -2 /* w -= 2 */
+    addiu    t1, a3, -1 /* t1 = w - 1 */
+    bgtz     t1, 1b /* loop again while w - 1 > 0, i.e. at least two pixels remain */
+     addiu   a0, a0, 4 /* dst advances by two pixels; sits in the slot right after the branch */
+2: /* tail: zero or one pixel left */
+    beqz     a3, 3f /* no odd pixel remains */
+     nop
+    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */
+    lbu      t1, 0(a2) /* t1 = mask        (a8) */
+    lhu      t2, 0(a0) /* t2 = destination (r5g6b5) */
+
+    CONVERT_1x0565_TO_1x8888 t0, t3, t4, t5 /* source; presumably expanded into t3 - NOTE(review): t4, t5 held mask constants, presumably scratch here - confirm */
+    CONVERT_1x0565_TO_1x8888 t2, t4, t5, t6 /* destination; presumably expanded into t4 - NOTE(review): t5, t6 reused the same way - confirm */
+    OVER_8888_8_8888         t3, t1, t4, t0, t7, t2, t5, t6, t8 /* mask byte t1; presumably result in t0 - confirm against macro */
+    CONVERT_1x8888_TO_1x0565 t0, t3, t4, t5 /* presumably packs t0 into t3, which is stored below - confirm */
+
+    sh       t3, 0(a0) /* write the final pixel */
+3: /* common exit, also taken directly when w == 0 */
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5
+    j        ra /* return to caller */
+     nop
+
+END(pixman_composite_over_0565_8_0565_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_mips)
 /*
  * a0     - *dst
index 8666496..55eefef 100644 (file)
@@ -62,11 +62,15 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_8888_n_8888,
                                       uint32_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_8888_n_0565,
                                       uint32_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_0565_n_0565,
+                                      uint16_t, 1, uint16_t, 1)
 
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8_8888, uint32_t, 1,
                                          uint8_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8_0565, uint32_t, 1,
                                          uint8_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_0565_8_0565, uint16_t, 1,
+                                         uint8_t, 1, uint16_t, 1)
 
 PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 8888_8888, SRC,
                                           uint32_t, uint32_t)
@@ -248,12 +252,16 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    x8r8g8b8, mips_composite_over_8888_n_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    r5g6b5,   mips_composite_over_8888_n_0565),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid,    b5g6r5,   mips_composite_over_8888_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   solid,    r5g6b5,   mips_composite_over_0565_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   solid,    b5g6r5,   mips_composite_over_0565_n_0565),
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       a8r8g8b8, mips_composite_over_8888_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       x8r8g8b8, mips_composite_over_8888_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       a8b8g8r8, mips_composite_over_8888_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       x8b8g8r8, mips_composite_over_8888_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       r5g6b5,   mips_composite_over_8888_8_0565),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       b5g6r5,   mips_composite_over_8888_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   a8,       r5g6b5,   mips_composite_over_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   a8,       b5g6r5,   mips_composite_over_0565_8_0565),
 
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mips_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8888),