ARMv6: Replacement add_8_8, over_8888_8888, over_8888_n_8888 and over_n_8_8888 routines
author     Ben Avison <bavison@riscosopen.org>
Sat, 19 Jan 2013 16:16:53 +0000 (16:16 +0000)
committer  Siarhei Siamashka <siarhei.siamashka@gmail.com>
Tue, 29 Jan 2013 19:48:03 +0000 (21:48 +0200)
Improved by adding preloads, combining writes and using the SEL
instruction.
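
The per-channel arithmetic is unchanged from the code being replaced: each
channel is multiplied by an 8-bit factor, 0x80 is added for rounding, and the
result is multiplied by 257/256 to approximate division by 255. A rough scalar
C sketch of the OVER operator built on that trick (illustrative only, not part
of the patch; it assumes premultiplied-alpha pixels so the per-channel sums
cannot overflow, where the assembly uses uqadd8 to saturate instead):

    #include <stdint.h>

    /* round (chan * factor / 255) via the add-0x80, multiply-by-257/256 trick */
    static inline uint8_t mul_div_255 (uint8_t chan, uint8_t factor)
    {
        uint32_t t = chan * factor + 0x80;
        return (uint8_t) ((t + (t >> 8)) >> 8);
    }

    /* OVER for one a8r8g8b8 pixel: dst = src + dst * (255 - src_alpha) / 255 */
    static inline uint32_t over_pixel (uint32_t src, uint32_t dst)
    {
        uint8_t  ialpha = 255 - (uint8_t) (src >> 24);
        uint32_t result = src;
        int      shift;

        for (shift = 0; shift < 32; shift += 8)
            result += (uint32_t) mul_div_255 ((dst >> shift) & 0xff, ialpha) << shift;
        return result;
    }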

add_8_8

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  62.1   0.2      543.4  12.4    100.0%      +774.9%
L2  38.7   0.4      116.8  1.7     100.0%      +201.8%
M   40.0   0.1      110.1  0.5     100.0%      +175.3%
HT  30.9   0.2      43.4   0.5     100.0%      +40.4%
VT  30.6   0.3      39.2   0.5     100.0%      +28.0%
R   21.3   0.2      35.4   0.4     100.0%      +66.6%
RT  8.6    0.2      10.2   0.3     100.0%      +19.4%

over_8888_8888

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  32.3   0.1      38.0   0.2     100.0%      +17.7%
L2  15.9   0.4      30.6   0.5     100.0%      +92.8%
M   13.3   0.0      25.6   0.0     100.0%      +92.9%
HT  10.5   0.1      15.5   0.1     100.0%      +47.1%
VT  10.4   0.1      14.6   0.1     100.0%      +40.8%
R   10.3   0.1      15.8   0.1     100.0%      +53.3%
RT  6.0    0.1      7.6    0.1     100.0%      +25.9%

over_8888_n_8888

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  17.6   0.1      21.0   0.1     100.0%      +19.2%
L2  11.2   0.2      19.2   0.1     100.0%      +71.2%
M   10.2   0.0      19.6   0.0     100.0%      +92.6%
HT  8.4    0.0      11.9   0.1     100.0%      +41.7%
VT  8.3    0.0      11.3   0.1     100.0%      +36.4%
R   8.3    0.0      11.8   0.1     100.0%      +43.1%
RT  5.1    0.1      6.2    0.1     100.0%      +21.3%

over_n_8_8888

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  17.5   0.1      22.8   0.8     100.0%      +30.1%
L2  14.2   0.3      21.7   0.2     100.0%      +52.6%
M   12.0   0.0      22.3   0.0     100.0%      +84.8%
HT  10.5   0.1      14.1   0.1     100.0%      +34.5%
VT  10.0   0.1      13.5   0.1     100.0%      +35.3%
R   9.4    0.0      12.9   0.2     100.0%      +37.7%
RT  5.5    0.1      6.5    0.2     100.0%      +19.2%
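
The SEL trick referred to above: each of the over_* init macros performs a
uadd8 of the constant 0x00800080 with itself, which carries out of byte lanes
0 and 2 and so sets GE[3:0] to 0101; a single sel then merges the even-channel
and odd-channel halves of a pixel (each result sitting in the high byte of a
16-bit lane) where the old code needed separate uxtb16/orr (or and/uxtab16)
sequences. A rough C equivalent of that final merge (illustrative only;
sel_merge is a hypothetical name, not something in the patch):

    #include <stdint.h>

    /* With GE[3:0] = 0101, "mov tmp, tmp, ror #8; sel word, tmp, word" takes
     * bytes 0 and 2 from the rotated even-channel register and bytes 1 and 3
     * from the odd-channel register. */
    static inline uint32_t sel_merge (uint32_t even_ror8, uint32_t odd)
    {
        return (even_ror8 & 0x00ff00ffu) | (odd & 0xff00ff00u);
    }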

pixman/pixman-arm-simd-asm-scaled.S
pixman/pixman-arm-simd-asm.S
pixman/pixman-arm-simd.c

diff --git a/pixman/pixman-arm-simd-asm-scaled.S b/pixman/pixman-arm-simd-asm-scaled.S
index b438001d3d0c0cf219398513d627abe7c3b14a7d..71109954885567537ebb47d52e64da93e82172c2 100644
 fname:
 .endm
 
-/*
- * The code below was generated by gcc 4.3.4 from the commented out
- * functions in 'pixman-arm-simd.c' file with the following optimization
- * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer"
- *
- * TODO: replace gcc generated code with hand tuned versions because
- * the code quality is not very good, introduce symbolic register
- * aliases for better readability and maintainability.
- */
-
-pixman_asm_function pixman_composite_add_8_8_asm_armv6
-       push    {r4, r5, r6, r7, r8, r9, r10, r11}
-       mov     r10, r1
-       sub     sp, sp, #4
-       subs    r10, r10, #1
-       mov     r11, r0
-       mov     r8, r2
-       str     r3, [sp]
-       ldr     r7, [sp, #36]
-       bcc     0f
-6:     cmp     r11, #0
-       beq     1f
-       orr     r3, r8, r7
-       tst     r3, #3
-       beq     2f
-       mov     r1, r8
-       mov     r0, r7
-       mov     r12, r11
-       b       3f
-5:     tst     r3, #3
-       beq     4f
-3:     ldrb    r2, [r0], #1
-       subs    r12, r12, #1
-       ldrb    r3, [r1]
-       uqadd8  r3, r2, r3
-       strb    r3, [r1], #1
-       orr     r3, r1, r0
-       bne     5b
-1:     ldr     r3, [sp]
-       add     r8, r8, r3
-       ldr     r3, [sp, #40]
-       add     r7, r7, r3
-10:    subs    r10, r10, #1
-       bcs     6b
-0:     add     sp, sp, #4
-       pop     {r4, r5, r6, r7, r8, r9, r10, r11}
-       bx      lr
-2:     mov     r12, r11
-       mov     r1, r8
-       mov     r0, r7
-4:     cmp     r12, #3
-       subgt   r6, r12, #4
-       movgt   r9, r12
-       lsrgt   r5, r6, #2
-       addgt   r3, r5, #1
-       movgt   r12, #0
-       lslgt   r4, r3, #2
-       ble     7f
-8:     ldr     r3, [r0, r12]
-       ldr     r2, [r1, r12]
-       uqadd8  r3, r3, r2
-       str     r3, [r1, r12]
-       add     r12, r12, #4
-       cmp     r12, r4
-       bne     8b
-       sub     r3, r9, #4
-       bic     r3, r3, #3
-       add     r3, r3, #4
-       subs    r12, r6, r5, lsl #2
-       add     r1, r1, r3
-       add     r0, r0, r3
-       beq     1b
-7:     mov     r4, #0
-9:     ldrb    r3, [r1, r4]
-       ldrb    r2, [r0, r4]
-       uqadd8  r3, r2, r3
-       strb    r3, [r1, r4]
-       add     r4, r4, #1
-       cmp     r4, r12
-       bne     9b
-       ldr     r3, [sp]
-       add     r8, r8, r3
-       ldr     r3, [sp, #40]
-       add     r7, r7, r3
-       b       10b
-.endfunc
-
-pixman_asm_function pixman_composite_over_8888_8888_asm_armv6
-       push    {r4, r5, r6, r7, r8, r9, r10, r11}
-       sub     sp, sp, #20
-       cmp     r1, #0
-       mov     r12, r2
-       str     r1, [sp, #12]
-       str     r0, [sp, #16]
-       ldr     r2, [sp, #52]
-       beq     0f
-       lsl     r3, r3, #2
-       str     r3, [sp]
-       ldr     r3, [sp, #56]
-       mov     r10, #0
-       lsl     r3, r3, #2
-       str     r3, [sp, #8]
-       mov     r11, r3
-       b       1f
-6:     ldr     r11, [sp, #8]
-1:     ldr     r9, [sp]
-       mov     r0, r12
-       add     r12, r12, r9
-       mov     r1, r2
-       str     r12, [sp, #4]
-       add     r2, r2, r11
-       ldr     r12, [sp, #16]
-       ldr     r3, =0x00800080
-       ldr     r9, =0xff00ff00
-       mov     r11, #255
-       cmp     r12, #0
-       beq     4f
-5:     ldr     r5, [r1], #4
-       ldr     r4, [r0]
-       sub     r8, r11, r5, lsr #24
-       uxtb16  r6, r4
-       uxtb16  r7, r4, ror #8
-       mla     r6, r6, r8, r3
-       mla     r7, r7, r8, r3
-       uxtab16 r6, r6, r6, ror #8
-       uxtab16 r7, r7, r7, ror #8
-       and     r7, r7, r9
-       uxtab16 r6, r7, r6, ror #8
-       uqadd8  r5, r6, r5
-       str     r5, [r0], #4
-       subs    r12, r12, #1
-       bne     5b
-4:     ldr     r3, [sp, #12]
-       add     r10, r10, #1
-       cmp     r10, r3
-       ldr     r12, [sp, #4]
-       bne     6b
-0:     add     sp, sp, #20
-       pop     {r4, r5, r6, r7, r8, r9, r10, r11}
-       bx      lr
-.endfunc
-
-pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6
-       push    {r4, r5, r6, r7, r8, r9, r10, r11}
-       sub     sp, sp, #28
-       cmp     r1, #0
-       str     r1, [sp, #12]
-       ldrb    r1, [sp, #71]
-       mov     r12, r2
-       str     r0, [sp, #16]
-       ldr     r2, [sp, #60]
-       str     r1, [sp, #24]
-       beq     0f
-       lsl     r3, r3, #2
-       str     r3, [sp, #20]
-       ldr     r3, [sp, #64]
-       mov     r10, #0
-       lsl     r3, r3, #2
-       str     r3, [sp, #8]
-       mov     r11, r3
-       b       1f
-5:     ldr     r11, [sp, #8]
-1:     ldr     r4, [sp, #20]
-       mov     r0, r12
-       mov     r1, r2
-       add     r12, r12, r4
-       add     r2, r2, r11
-       str     r12, [sp]
-       str     r2, [sp, #4]
-       ldr     r12, [sp, #16]
-       ldr     r2, =0x00800080
-       ldr     r3, [sp, #24]
-       mov     r11, #255
-       cmp     r12, #0
-       beq     3f
-4:     ldr     r5, [r1], #4
-       ldr     r4, [r0]
-       uxtb16  r6, r5
-       uxtb16  r7, r5, ror #8
-       mla     r6, r6, r3, r2
-       mla     r7, r7, r3, r2
-       uxtab16 r6, r6, r6, ror #8
-       uxtab16 r7, r7, r7, ror #8
-       uxtb16  r6, r6, ror #8
-       uxtb16  r7, r7, ror #8
-       orr     r5, r6, r7, lsl #8
-       uxtb16  r6, r4
-       uxtb16  r7, r4, ror #8
-       sub     r8, r11, r5, lsr #24
-       mla     r6, r6, r8, r2
-       mla     r7, r7, r8, r2
-       uxtab16 r6, r6, r6, ror #8
-       uxtab16 r7, r7, r7, ror #8
-       uxtb16  r6, r6, ror #8
-       uxtb16  r7, r7, ror #8
-       orr     r6, r6, r7, lsl #8
-       uqadd8  r5, r6, r5
-       str     r5, [r0], #4
-       subs    r12, r12, #1
-       bne     4b
-3:     ldr     r1, [sp, #12]
-       add     r10, r10, #1
-       cmp     r10, r1
-       ldr     r12, [sp]
-       ldr     r2, [sp, #4]
-       bne     5b
-0:     add     sp, sp, #28
-       pop     {r4, r5, r6, r7, r8, r9, r10, r11}
-       bx      lr
-.endfunc
-
-pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
-       push    {r4, r5, r6, r7, r8, r9, r10, r11}
-       sub     sp, sp, #28
-       cmp     r1, #0
-       ldr     r9, [sp, #60]
-       str     r1, [sp, #12]
-       bic     r1, r9, #-16777216
-       str     r1, [sp, #20]
-       mov     r12, r2
-       lsr     r1, r9, #8
-       ldr     r2, [sp, #20]
-       bic     r1, r1, #-16777216
-       bic     r2, r2, #65280
-       bic     r1, r1, #65280
-       str     r2, [sp, #20]
-       str     r0, [sp, #16]
-       str     r1, [sp, #4]
-       ldr     r2, [sp, #68]
-       beq     0f
-       lsl     r3, r3, #2
-       str     r3, [sp, #24]
-       mov     r0, #0
-       b       1f
-5:     ldr     r3, [sp, #24]
-1:     ldr     r4, [sp, #72]
-       mov     r10, r12
-       mov     r1, r2
-       add     r12, r12, r3
-       add     r2, r2, r4
-       str     r12, [sp, #8]
-       str     r2, [sp]
-       ldr     r12, [sp, #16]
-       ldr     r11, =0x00800080
-       ldr     r2, [sp, #4]
-       ldr     r3, [sp, #20]
-       cmp     r12, #0
-       beq     3f
-4:     ldrb    r5, [r1], #1
-       ldr     r4, [r10]
-       mla     r6, r3, r5, r11
-       mla     r7, r2, r5, r11
-       uxtab16 r6, r6, r6, ror #8
-       uxtab16 r7, r7, r7, ror #8
-       uxtb16  r6, r6, ror #8
-       uxtb16  r7, r7, ror #8
-       orr     r5, r6, r7, lsl #8
-       uxtb16  r6, r4
-       uxtb16  r7, r4, ror #8
-       mvn     r8, r5
-       lsr     r8, r8, #24
-       mla     r6, r6, r8, r11
-       mla     r7, r7, r8, r11
-       uxtab16 r6, r6, r6, ror #8
-       uxtab16 r7, r7, r7, ror #8
-       uxtb16  r6, r6, ror #8
-       uxtb16  r7, r7, ror #8
-       orr     r6, r6, r7, lsl #8
-       uqadd8  r5, r6, r5
-       str     r5, [r10], #4
-       subs    r12, r12, #1
-       bne     4b
-3:     ldr     r4, [sp, #12]
-       add     r0, r0, #1
-       cmp     r0, r4
-       ldr     r12, [sp, #8]
-       ldr     r2, [sp]
-       bne     5b
-0:     add     sp, sp, #28
-       pop     {r4, r5, r6, r7, r8, r9, r10, r11}
-       bx      lr
-.endfunc
-
 /*
  * Note: This code is only using armv5te instructions (not even armv6),
  *       but is scheduled for ARM Cortex-A8 pipeline. So it might need to
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index a3e2d04524ae6270d604e81a9d44a0e2892715b2..c2096887907612b3ffb8b0991bb89b7965e0d693 100644
@@ -303,3 +303,311 @@ generate_composite_function \
 
 /******************************************************************************/
 
+.macro add_8_8_8pixels  cond, dst1, dst2
+        uqadd8&cond  WK&dst1, WK&dst1, MASK
+        uqadd8&cond  WK&dst2, WK&dst2, STRIDE_M
+.endm
+
+.macro add_8_8_4pixels  cond, dst
+        uqadd8&cond  WK&dst, WK&dst, MASK
+.endm
+
+.macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+    WK4     .req    MASK
+    WK5     .req    STRIDE_M
+ .if numbytes == 16
+        pixld   cond, 8, 4, SRC, unaligned_src
+        pixld   cond, 16, firstreg, DST, 0
+        add_8_8_8pixels cond, firstreg, %(firstreg+1)
+        pixld   cond, 8, 4, SRC, unaligned_src
+ .else
+        pixld   cond, numbytes, 4, SRC, unaligned_src
+        pixld   cond, numbytes, firstreg, DST, 0
+ .endif
+    .unreq  WK4
+    .unreq  WK5
+.endm
+
+.macro add_8_8_process_tail  cond, numbytes, firstreg
+ .if numbytes == 16
+        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
+ .elseif numbytes == 8
+        add_8_8_8pixels cond, firstreg, %(firstreg+1)
+ .else
+        add_8_8_4pixels cond, firstreg
+ .endif
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
+    2, /* prefetch distance */ \
+    nop_macro, /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    add_8_8_process_head, \
+    add_8_8_process_tail
+
+/******************************************************************************/
+
+.macro over_8888_8888_init
+        /* Hold loop invariant in MASK */
+        ldr     MASK, =0x00800080
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, MASK, MASK
+        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
+.endm
+
+.macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+    WK4     .req    STRIDE_D
+    WK5     .req    STRIDE_S
+    WK6     .req    STRIDE_M
+    WK7     .req    ORIG_W
+        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
+        pixld   , numbytes, firstreg, DST, 0
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+.macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
+        /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
+        teq     WK&reg0, #0
+ .if numbytes > 4
+        teqeq   WK&reg1, #0
+  .if numbytes > 8
+        teqeq   WK&reg2, #0
+        teqeq   WK&reg3, #0
+  .endif
+ .endif
+.endm
+
+.macro over_8888_8888_prepare  next
+        mov     WK&next, WK&next, lsr #24
+.endm
+
+.macro over_8888_8888_1pixel src, dst, offset, next
+        /* src = destination component multiplier */
+        rsb     WK&src, WK&src, #255
+        /* Split even/odd bytes of dst into SCRATCH/dst */
+        uxtb16  SCRATCH, WK&dst
+        uxtb16  WK&dst, WK&dst, ror #8
+        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
+        mla     SCRATCH, SCRATCH, WK&src, MASK
+        mla     WK&dst, WK&dst, WK&src, MASK
+        /* Where we would have had a stall between the result of the first MLA and the shifter input,
+         * reload the complete source pixel */
+        ldr     WK&src, [SRC, #offset]
+        /* Multiply by 257/256 to approximate 256/255 */
+        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+        /* In this stall, start processing the next pixel */
+ .if offset < -4
+        mov     WK&next, WK&next, lsr #24
+ .endif
+        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
+        /* Recombine even/odd bytes of multiplied destination */
+        mov     SCRATCH, SCRATCH, ror #8
+        sel     WK&dst, SCRATCH, WK&dst
+        /* Saturated add of source to multiplied destination */
+        uqadd8  WK&dst, WK&dst, WK&src
+.endm
+
+.macro over_8888_8888_process_tail  cond, numbytes, firstreg
+    WK4     .req    STRIDE_D
+    WK5     .req    STRIDE_S
+    WK6     .req    STRIDE_M
+    WK7     .req    ORIG_W
+        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
+        beq     10f
+        over_8888_8888_prepare  %(4+firstreg)
+ .set PROCESS_REG, firstreg
+ .set PROCESS_OFF, -numbytes
+ .rept numbytes / 4
+        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
+  .set PROCESS_REG, PROCESS_REG+1
+  .set PROCESS_OFF, PROCESS_OFF+4
+ .endr
+        pixst   , numbytes, firstreg, DST
+10:
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
+    2, /* prefetch distance */ \
+    over_8888_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    over_8888_8888_process_head, \
+    over_8888_8888_process_tail
+
+/******************************************************************************/
+
+/* Multiply each byte of a word by a byte.
+ * Useful when there aren't any obvious ways to fill the stalls with other instructions.
+ * word  Register containing 4 bytes
+ * byte  Register containing byte multiplier (bits 8-31 must be 0)
+ * tmp   Scratch register
+ * half  Register containing the constant 0x00800080
+ * GE[3:0] bits must contain 0101
+ */
+.macro mul_8888_8  word, byte, tmp, half
+        /* Split even/odd bytes of word apart */
+        uxtb16  tmp, word
+        uxtb16  word, word, ror #8
+        /* Multiply bytes together with rounding, then by 257/256 */
+        mla     tmp, tmp, byte, half
+        mla     word, word, byte, half /* 1 stall follows */
+        uxtab16 tmp, tmp, tmp, ror #8  /* 1 stall follows */
+        uxtab16 word, word, word, ror #8
+        /* Recombine bytes */
+        mov     tmp, tmp, ror #8
+        sel     word, tmp, word
+.endm
+
+/******************************************************************************/
+
+.macro over_8888_n_8888_init
+        /* Mask is constant */
+        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
+        /* Hold loop invariant in STRIDE_M */
+        ldr     STRIDE_M, =0x00800080
+        /* We only want the alpha bits of the constant mask */
+        mov     MASK, MASK, lsr #24
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, STRIDE_M, STRIDE_M
+        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
+.endm
+
+.macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+    WK4     .req    Y
+    WK5     .req    STRIDE_D
+    WK6     .req    STRIDE_S
+    WK7     .req    ORIG_W
+        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
+        pixld   , numbytes, firstreg, DST, 0
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+.macro over_8888_n_8888_1pixel src, dst
+        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
+        sub     WK7, WK6, WK&src, lsr #24
+        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
+        uqadd8  WK&dst, WK&dst, WK&src
+.endm
+
+.macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
+    WK4     .req    Y
+    WK5     .req    STRIDE_D
+    WK6     .req    STRIDE_S
+    WK7     .req    ORIG_W
+        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
+        beq     10f
+        mov     WK6, #255
+ .set PROCESS_REG, firstreg
+ .rept numbytes / 4
+  .if numbytes == 16 && PROCESS_REG == 2
+        /* We're using WK6 and WK7 as temporaries, so halfway through
+         * the 4 pixels, reload the last two source pixels, this time
+         * into WK4 and WK5 */
+        ldmdb   SRC, {WK4, WK5}
+  .endif
+        over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
+  .set PROCESS_REG, PROCESS_REG+1
+ .endr
+        pixst   , numbytes, firstreg, DST
+10:
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
+    2, /* prefetch distance */ \
+    over_8888_n_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    over_8888_n_8888_process_head, \
+    over_8888_n_8888_process_tail
+
+/******************************************************************************/
+
+.macro over_n_8_8888_init
+        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
+        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
+        ldr     SCRATCH, =0x00800080
+        uxtb16  STRIDE_S, SRC
+        uxtb16  SRC, SRC, ror #8
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, SCRATCH, SCRATCH
+        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
+.endm
+
+.macro over_n_8_8888_newline
+        ldr     STRIDE_D, =0x00800080
+        b       1f
+ .ltorg
+1:
+.endm
+
+.macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+    WK4     .req    STRIDE_M
+        pixld   , numbytes/4, 4, MASK, unaligned_mask
+        pixld   , numbytes, firstreg, DST, 0
+    .unreq  WK4
+.endm
+
+.macro over_n_8_8888_1pixel src, dst
+        uxtb    Y, WK4, ror #src*8
+        /* Trailing part of multiplication of source */
+        mla     SCRATCH, STRIDE_S, Y, STRIDE_D
+        mla     Y, SRC, Y, STRIDE_D
+        mov     ORIG_W, #255
+        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+        uxtab16 Y, Y, Y, ror #8
+        mov     SCRATCH, SCRATCH, ror #8
+        sub     ORIG_W, ORIG_W, Y, lsr #24
+        sel     Y, SCRATCH, Y
+        /* Then multiply the destination */
+        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
+        uqadd8  WK&dst, WK&dst, Y
+.endm
+
+.macro over_n_8_8888_process_tail  cond, numbytes, firstreg
+    WK4     .req    STRIDE_M
+        teq     WK4, #0
+        beq     10f
+ .set PROCESS_REG, firstreg
+ .rept numbytes / 4
+        over_n_8_8888_1pixel  %(PROCESS_REG-firstreg), %(PROCESS_REG)
+  .set PROCESS_REG, PROCESS_REG+1
+ .endr
+        pixst   , numbytes, firstreg, DST
+10:
+    .unreq  WK4
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
+    2, /* prefetch distance */ \
+    over_n_8_8888_init, \
+    over_n_8_8888_newline, \
+    nop_macro, /* cleanup */ \
+    over_n_8_8888_process_head, \
+    over_n_8_8888_process_tail
+
+/******************************************************************************/
+
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 09a5036a12c3d96c8dd830c53d8fc000f884a930..af062e19dcb3711167cfefe57d9dcca447f60bc3 100644
 #include "pixman-arm-common.h"
 #include "pixman-inlines.h"
 
-#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
-
-void
-pixman_composite_add_8_8_asm_armv6 (int32_t  width,
-                                   int32_t  height,
-                                   uint8_t *dst_line,
-                                   int32_t  dst_stride,
-                                   uint8_t *src_line,
-                                   int32_t  src_stride)
-{
-    uint8_t *dst, *src;
-    int32_t w;
-    uint8_t s, d;
-
-    while (height--)
-    {
-       dst = dst_line;
-       dst_line += dst_stride;
-       src = src_line;
-       src_line += src_stride;
-       w = width;
-
-       /* ensure both src and dst are properly aligned before doing 32 bit reads
-        * we'll stay in this loop if src and dst have differing alignments
-        */
-       while (w && (((uintptr_t)dst & 3) || ((uintptr_t)src & 3)))
-       {
-           s = *src;
-           d = *dst;
-           asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
-           *dst = d;
-
-           dst++;
-           src++;
-           w--;
-       }
-
-       while (w >= 4)
-       {
-           asm ("uqadd8 %0, %1, %2"
-                : "=r" (*(uint32_t*)dst)
-                : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
-           dst += 4;
-           src += 4;
-           w -= 4;
-       }
-
-       while (w)
-       {
-           s = *src;
-           d = *dst;
-           asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
-           *dst = d;
-
-           dst++;
-           src++;
-           w--;
-       }
-    }
-
-}
-
-void
-pixman_composite_over_8888_8888_asm_armv6 (int32_t   width,
-                                           int32_t   height,
-                                           uint32_t *dst_line,
-                                           int32_t   dst_stride,
-                                           uint32_t *src_line,
-                                           int32_t   src_stride)
-{
-    uint32_t    *dst;
-    uint32_t    *src;
-    int32_t w;
-    uint32_t component_half = 0x800080;
-    uint32_t upper_component_mask = 0xff00ff00;
-    uint32_t alpha_mask = 0xff;
-
-    while (height--)
-    {
-       dst = dst_line;
-       dst_line += dst_stride;
-       src = src_line;
-       src_line += src_stride;
-       w = width;
-
-/* #define inner_branch */
-       asm volatile (
-           "cmp %[w], #0\n\t"
-           "beq 2f\n\t"
-           "1:\n\t"
-           /* load src */
-           "ldr r5, [%[src]], #4\n\t"
-#ifdef inner_branch
-           /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
-            * The 0x0 case also allows us to avoid doing an unecessary data
-            * write which is more valuable so we only check for that
-            */
-           "cmp r5, #0\n\t"
-           "beq 3f\n\t"
-
-           /* = 255 - alpha */
-           "sub r8, %[alpha_mask], r5, lsr #24\n\t"
-
-           "ldr r4, [%[dest]] \n\t"
-
-#else
-           "ldr r4, [%[dest]] \n\t"
-
-           /* = 255 - alpha */
-           "sub r8, %[alpha_mask], r5, lsr #24\n\t"
-#endif
-           "uxtb16 r6, r4\n\t"
-           "uxtb16 r7, r4, ror #8\n\t"
-
-           /* multiply by 257 and divide by 65536 */
-           "mla r6, r6, r8, %[component_half]\n\t"
-           "mla r7, r7, r8, %[component_half]\n\t"
-
-           "uxtab16 r6, r6, r6, ror #8\n\t"
-           "uxtab16 r7, r7, r7, ror #8\n\t"
-
-           /* recombine the 0xff00ff00 bytes of r6 and r7 */
-           "and r7, r7, %[upper_component_mask]\n\t"
-           "uxtab16 r6, r7, r6, ror #8\n\t"
-
-           "uqadd8 r5, r6, r5\n\t"
-
-#ifdef inner_branch
-           "3:\n\t"
-
-#endif
-           "str r5, [%[dest]], #4\n\t"
-           /* increment counter and jmp to top */
-           "subs       %[w], %[w], #1\n\t"
-           "bne        1b\n\t"
-           "2:\n\t"
-           : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
-           : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
-             [alpha_mask] "r" (alpha_mask)
-           : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
-           );
-    }
-}
-
-void
-pixman_composite_over_8888_n_8888_asm_armv6 (int32_t   width,
-                                             int32_t   height,
-                                             uint32_t *dst_line,
-                                             int32_t   dst_stride,
-                                             uint32_t *src_line,
-                                             int32_t   src_stride,
-                                             uint32_t  mask)
-{
-    uint32_t *dst;
-    uint32_t *src;
-    int32_t w;
-    uint32_t component_half = 0x800080;
-    uint32_t alpha_mask = 0xff;
-
-    mask = (mask) >> 24;
-
-    while (height--)
-    {
-       dst = dst_line;
-       dst_line += dst_stride;
-       src = src_line;
-       src_line += src_stride;
-       w = width;
-
-/* #define inner_branch */
-       asm volatile (
-           "cmp %[w], #0\n\t"
-           "beq 2f\n\t"
-           "1:\n\t"
-           /* load src */
-           "ldr r5, [%[src]], #4\n\t"
-#ifdef inner_branch
-           /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
-            * The 0x0 case also allows us to avoid doing an unecessary data
-            * write which is more valuable so we only check for that
-            */
-           "cmp r5, #0\n\t"
-           "beq 3f\n\t"
-
-#endif
-           "ldr r4, [%[dest]] \n\t"
-
-           "uxtb16 r6, r5\n\t"
-           "uxtb16 r7, r5, ror #8\n\t"
-
-           /* multiply by alpha (r8) then by 257 and divide by 65536 */
-           "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
-           "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
-
-           "uxtab16 r6, r6, r6, ror #8\n\t"
-           "uxtab16 r7, r7, r7, ror #8\n\t"
-
-           "uxtb16 r6, r6, ror #8\n\t"
-           "uxtb16 r7, r7, ror #8\n\t"
-
-           /* recombine */
-           "orr r5, r6, r7, lsl #8\n\t"
-
-           "uxtb16 r6, r4\n\t"
-           "uxtb16 r7, r4, ror #8\n\t"
-
-           /* 255 - alpha */
-           "sub r8, %[alpha_mask], r5, lsr #24\n\t"
-
-           /* multiply by alpha (r8) then by 257 and divide by 65536 */
-           "mla r6, r6, r8, %[component_half]\n\t"
-           "mla r7, r7, r8, %[component_half]\n\t"
-
-           "uxtab16 r6, r6, r6, ror #8\n\t"
-           "uxtab16 r7, r7, r7, ror #8\n\t"
-
-           "uxtb16 r6, r6, ror #8\n\t"
-           "uxtb16 r7, r7, ror #8\n\t"
-
-           /* recombine */
-           "orr r6, r6, r7, lsl #8\n\t"
-
-           "uqadd8 r5, r6, r5\n\t"
-
-#ifdef inner_branch
-           "3:\n\t"
-
-#endif
-           "str r5, [%[dest]], #4\n\t"
-           /* increment counter and jmp to top */
-           "subs       %[w], %[w], #1\n\t"
-           "bne        1b\n\t"
-           "2:\n\t"
-           : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
-           : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
-             [alpha_mask] "r" (alpha_mask)
-           : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
-           );
-    }
-}
-
-void
-pixman_composite_over_n_8_8888_asm_armv6 (int32_t   width,
-                                          int32_t   height,
-                                          uint32_t *dst_line,
-                                          int32_t   dst_stride,
-                                          uint32_t  src,
-                                          int32_t   unused,
-                                          uint8_t  *mask_line,
-                                          int32_t   mask_stride)
-{
-    uint32_t  srca;
-    uint32_t *dst;
-    uint8_t  *mask;
-    int32_t w;
-
-    srca = src >> 24;
-
-    uint32_t component_mask = 0xff00ff;
-    uint32_t component_half = 0x800080;
-
-    uint32_t src_hi = (src >> 8) & component_mask;
-    uint32_t src_lo = src & component_mask;
-
-    while (height--)
-    {
-       dst = dst_line;
-       dst_line += dst_stride;
-       mask = mask_line;
-       mask_line += mask_stride;
-       w = width;
-
-/* #define inner_branch */
-       asm volatile (
-           "cmp %[w], #0\n\t"
-           "beq 2f\n\t"
-           "1:\n\t"
-           /* load mask */
-           "ldrb r5, [%[mask]], #1\n\t"
-#ifdef inner_branch
-           /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
-            * The 0x0 case also allows us to avoid doing an unecessary data
-            * write which is more valuable so we only check for that
-            */
-           "cmp r5, #0\n\t"
-           "beq 3f\n\t"
-
-#endif
-           "ldr r4, [%[dest]] \n\t"
-
-           /* multiply by alpha (r8) then by 257 and divide by 65536 */
-           "mla r6, %[src_lo], r5, %[component_half]\n\t"
-           "mla r7, %[src_hi], r5, %[component_half]\n\t"
-
-           "uxtab16 r6, r6, r6, ror #8\n\t"
-           "uxtab16 r7, r7, r7, ror #8\n\t"
-
-           "uxtb16 r6, r6, ror #8\n\t"
-           "uxtb16 r7, r7, ror #8\n\t"
-
-           /* recombine */
-           "orr r5, r6, r7, lsl #8\n\t"
-
-           "uxtb16 r6, r4\n\t"
-           "uxtb16 r7, r4, ror #8\n\t"
-
-           /* we could simplify this to use 'sub' if we were
-            * willing to give up a register for alpha_mask
-            */
-           "mvn r8, r5\n\t"
-           "mov r8, r8, lsr #24\n\t"
-
-           /* multiply by alpha (r8) then by 257 and divide by 65536 */
-           "mla r6, r6, r8, %[component_half]\n\t"
-           "mla r7, r7, r8, %[component_half]\n\t"
-
-           "uxtab16 r6, r6, r6, ror #8\n\t"
-           "uxtab16 r7, r7, r7, ror #8\n\t"
-
-           "uxtb16 r6, r6, ror #8\n\t"
-           "uxtb16 r7, r7, ror #8\n\t"
-
-           /* recombine */
-           "orr r6, r6, r7, lsl #8\n\t"
-
-           "uqadd8 r5, r6, r5\n\t"
-
-#ifdef inner_branch
-           "3:\n\t"
-
-#endif
-           "str r5, [%[dest]], #4\n\t"
-           /* increment counter and jmp to top */
-           "subs       %[w], %[w], #1\n\t"
-           "bne        1b\n\t"
-           "2:\n\t"
-           : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
-           : [component_half] "r" (component_half),
-             [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
-           : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
-    }
-}
-
-#endif
-
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8888_8888,
                                   uint32_t, 1, uint32_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_8888,