pixman/pixman-arm-simd.c

   1 /*
   2  * Copyright © 2008 Mozilla Corporation
   3  *
   4  * Permission to use, copy, modify, distribute, and sell this software and its
   5  * documentation for any purpose is hereby granted without fee, provided that
   6  * the above copyright notice appear in all copies and that both that
   7  * copyright notice and this permission notice appear in supporting
   8  * documentation, and that the name of Mozilla Corporation not be used in
   9  * advertising or publicity pertaining to distribution of the software without
  10  * specific, written prior permission.  Mozilla Corporation makes no
  11  * representations about the suitability of this software for any purpose.  It
  12  * is provided "as is" without express or implied warranty.
  13  *
  14  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
  15  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  16  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
  17  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  18  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
  19  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
  20  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
  21  * SOFTWARE.
  22  *
  23  * Author:  Jeff Muizelaar (jeff@infidigm.net)
  24  *
  25  */
  26 #ifdef HAVE_CONFIG_H
  27 #include <config.h>
  28 #endif
  29
  30 #include "pixman-private.h"
  31 #include "pixman-arm-common.h"
  32 #include "pixman-inlines.h"
  33
  34 #if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
  35
  36 void
  37 pixman_composite_add_8_8_asm_armv6 (int32_t  width,
  38                                     int32_t  height,
  39                                     uint8_t *dst_line,
  40                                     int32_t  dst_stride,
  41                                     uint8_t *src_line,
  42                                     int32_t  src_stride)
  43 {
  44     uint8_t *dst, *src;
  45     int32_t w;
  46     uint8_t s, d;
  47
  48     while (height--)
  49     {
  50         dst = dst_line;
  51         dst_line += dst_stride;
  52         src = src_line;
  53         src_line += src_stride;
  54         w = width;
  55
  56         /* ensure both src and dst are properly aligned before doing 32 bit reads
  57          * we'll stay in this loop if src and dst have differing alignments
  58          */
  59         while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
  60         {
  61             s = *src;
  62             d = *dst;
  63             asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
  64             *dst = d;
  65
  66             dst++;
  67             src++;
  68             w--;
  69         }
  70
  71         while (w >= 4)
  72         {
  73             asm ("uqadd8 %0, %1, %2"
  74                  : "=r" (*(uint32_t*)dst)
  75                  : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
  76             dst += 4;
  77             src += 4;
  78             w -= 4;
  79         }
  80
  81         while (w)
  82         {
  83             s = *src;
  84             d = *dst;
  85             asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
  86             *dst = d;
  87
  88             dst++;
  89             src++;
  90             w--;
  91         }
  92     }
  93
  94 }
  95
  96 void
  97 pixman_composite_over_8888_8888_asm_armv6 (int32_t   width,
  98                                            int32_t   height,
  99                                            uint32_t *dst_line,
 100                                            int32_t   dst_stride,
 101                                            uint32_t *src_line,
 102                                            int32_t   src_stride)
 103 {
 104     uint32_t    *dst;
 105     uint32_t    *src;
 106     int32_t w;
 107     uint32_t component_half = 0x800080;
 108     uint32_t upper_component_mask = 0xff00ff00;
 109     uint32_t alpha_mask = 0xff;
 110
 111     while (height--)
 112     {
 113         dst = dst_line;
 114         dst_line += dst_stride;
 115         src = src_line;
 116         src_line += src_stride;
 117         w = width;
 118
 119 /* #define inner_branch */
 120         asm volatile (
 121             "cmp %[w], #0\n\t"
 122             "beq 2f\n\t"
 123             "1:\n\t"
 124             /* load src */
 125             "ldr r5, [%[src]], #4\n\t"
 126 #ifdef inner_branch
 127             /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
 128              * The 0x0 case also allows us to avoid doing an unecessary data
 129              * write which is more valuable so we only check for that
 130              */
 131             "cmp r5, #0\n\t"
 132             "beq 3f\n\t"
 133
 134             /* = 255 - alpha */
 135             "sub r8, %[alpha_mask], r5, lsr #24\n\t"
 136
 137             "ldr r4, [%[dest]] \n\t"
 138
 139 #else
 140             "ldr r4, [%[dest]] \n\t"
 141
 142             /* = 255 - alpha */
 143             "sub r8, %[alpha_mask], r5, lsr #24\n\t"
 144 #endif
 145             "uxtb16 r6, r4\n\t"
 146             "uxtb16 r7, r4, ror #8\n\t"
 147
 148             /* multiply by 257 and divide by 65536 */
 149             "mla r6, r6, r8, %[component_half]\n\t"
 150             "mla r7, r7, r8, %[component_half]\n\t"
 151
 152             "uxtab16 r6, r6, r6, ror #8\n\t"
 153             "uxtab16 r7, r7, r7, ror #8\n\t"
 154
 155             /* recombine the 0xff00ff00 bytes of r6 and r7 */
 156             "and r7, r7, %[upper_component_mask]\n\t"
 157             "uxtab16 r6, r7, r6, ror #8\n\t"
 158
 159             "uqadd8 r5, r6, r5\n\t"
 160
 161 #ifdef inner_branch
 162             "3:\n\t"
 163
 164 #endif
 165             "str r5, [%[dest]], #4\n\t"
 166             /* increment counter and jmp to top */
 167             "subs       %[w], %[w], #1\n\t"
 168             "bne        1b\n\t"
 169             "2:\n\t"
 170             : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
 171             : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
 172               [alpha_mask] "r" (alpha_mask)
 173             : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
 174             );
 175     }
 176 }
 177
 178 void
 179 pixman_composite_over_8888_n_8888_asm_armv6 (int32_t   width,
 180                                              int32_t   height,
 181                                              uint32_t *dst_line,
 182                                              int32_t   dst_stride,
 183                                              uint32_t *src_line,
 184                                              int32_t   src_stride,
 185                                              uint32_t  mask)
 186 {
 187     uint32_t *dst;
 188     uint32_t *src;
 189     int32_t w;
 190     uint32_t component_half = 0x800080;
 191     uint32_t alpha_mask = 0xff;
 192
 193     mask = (mask) >> 24;
 194
 195     while (height--)
 196     {
 197         dst = dst_line;
 198         dst_line += dst_stride;
 199         src = src_line;
 200         src_line += src_stride;
 201         w = width;
 202
 203 /* #define inner_branch */
 204         asm volatile (
 205             "cmp %[w], #0\n\t"
 206             "beq 2f\n\t"
 207             "1:\n\t"
 208             /* load src */
 209             "ldr r5, [%[src]], #4\n\t"
 210 #ifdef inner_branch
 211             /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
 212              * The 0x0 case also allows us to avoid doing an unecessary data
 213              * write which is more valuable so we only check for that
 214              */
 215             "cmp r5, #0\n\t"
 216             "beq 3f\n\t"
 217
 218 #endif
 219             "ldr r4, [%[dest]] \n\t"
 220
 221             "uxtb16 r6, r5\n\t"
 222             "uxtb16 r7, r5, ror #8\n\t"
 223
 224             /* multiply by alpha (r8) then by 257 and divide by 65536 */
 225             "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
 226             "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
 227
 228             "uxtab16 r6, r6, r6, ror #8\n\t"
 229             "uxtab16 r7, r7, r7, ror #8\n\t"
 230
 231             "uxtb16 r6, r6, ror #8\n\t"
 232             "uxtb16 r7, r7, ror #8\n\t"
 233
 234             /* recombine */
 235             "orr r5, r6, r7, lsl #8\n\t"
 236
 237             "uxtb16 r6, r4\n\t"
 238             "uxtb16 r7, r4, ror #8\n\t"
 239
 240             /* 255 - alpha */
 241             "sub r8, %[alpha_mask], r5, lsr #24\n\t"
 242
 243             /* multiply by alpha (r8) then by 257 and divide by 65536 */
 244             "mla r6, r6, r8, %[component_half]\n\t"
 245             "mla r7, r7, r8, %[component_half]\n\t"
 246
 247             "uxtab16 r6, r6, r6, ror #8\n\t"
 248             "uxtab16 r7, r7, r7, ror #8\n\t"
 249
 250             "uxtb16 r6, r6, ror #8\n\t"
 251             "uxtb16 r7, r7, ror #8\n\t"
 252
 253             /* recombine */
 254             "orr r6, r6, r7, lsl #8\n\t"
 255
 256             "uqadd8 r5, r6, r5\n\t"
 257
 258 #ifdef inner_branch
 259             "3:\n\t"
 260
 261 #endif
 262             "str r5, [%[dest]], #4\n\t"
 263             /* increment counter and jmp to top */
 264             "subs       %[w], %[w], #1\n\t"
 265             "bne        1b\n\t"
 266             "2:\n\t"
 267             : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
 268             : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
 269               [alpha_mask] "r" (alpha_mask)
 270             : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
 271             );
 272     }
 273 }
 274
 275 void
 276 pixman_composite_over_n_8_8888_asm_armv6 (int32_t   width,
 277                                           int32_t   height,
 278                                           uint32_t *dst_line,
 279                                           int32_t   dst_stride,
 280                                           uint32_t  src,
 281                                           int32_t   unused,
 282                                           uint8_t  *mask_line,
 283                                           int32_t   mask_stride)
 284 {
 285     uint32_t  srca;
 286     uint32_t *dst;
 287     uint8_t  *mask;
 288     int32_t w;
 289
 290     srca = src >> 24;
 291
 292     uint32_t component_mask = 0xff00ff;
 293     uint32_t component_half = 0x800080;
 294
 295     uint32_t src_hi = (src >> 8) & component_mask;
 296     uint32_t src_lo = src & component_mask;
 297
 298     while (height--)
 299     {
 300         dst = dst_line;
 301         dst_line += dst_stride;
 302         mask = mask_line;
 303         mask_line += mask_stride;
 304         w = width;
 305
 306 /* #define inner_branch */
 307         asm volatile (
 308             "cmp %[w], #0\n\t"
 309             "beq 2f\n\t"
 310             "1:\n\t"
 311             /* load mask */
 312             "ldrb r5, [%[mask]], #1\n\t"
 313 #ifdef inner_branch
 314             /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
 315              * The 0x0 case also allows us to avoid doing an unecessary data
 316              * write which is more valuable so we only check for that
 317              */
 318             "cmp r5, #0\n\t"
 319             "beq 3f\n\t"
 320
 321 #endif
 322             "ldr r4, [%[dest]] \n\t"
 323
 324             /* multiply by alpha (r8) then by 257 and divide by 65536 */
 325             "mla r6, %[src_lo], r5, %[component_half]\n\t"
 326             "mla r7, %[src_hi], r5, %[component_half]\n\t"
 327
 328             "uxtab16 r6, r6, r6, ror #8\n\t"
 329             "uxtab16 r7, r7, r7, ror #8\n\t"
 330
 331             "uxtb16 r6, r6, ror #8\n\t"
 332             "uxtb16 r7, r7, ror #8\n\t"
 333
 334             /* recombine */
 335             "orr r5, r6, r7, lsl #8\n\t"
 336
 337             "uxtb16 r6, r4\n\t"
 338             "uxtb16 r7, r4, ror #8\n\t"
 339
 340             /* we could simplify this to use 'sub' if we were
 341              * willing to give up a register for alpha_mask
 342              */
 343             "mvn r8, r5\n\t"
 344             "mov r8, r8, lsr #24\n\t"
 345
 346             /* multiply by alpha (r8) then by 257 and divide by 65536 */
 347             "mla r6, r6, r8, %[component_half]\n\t"
 348             "mla r7, r7, r8, %[component_half]\n\t"
 349
 350             "uxtab16 r6, r6, r6, ror #8\n\t"
 351             "uxtab16 r7, r7, r7, ror #8\n\t"
 352
 353             "uxtb16 r6, r6, ror #8\n\t"
 354             "uxtb16 r7, r7, ror #8\n\t"
 355
 356             /* recombine */
 357             "orr r6, r6, r7, lsl #8\n\t"
 358
 359             "uqadd8 r5, r6, r5\n\t"
 360
 361 #ifdef inner_branch
 362             "3:\n\t"
 363
 364 #endif
 365             "str r5, [%[dest]], #4\n\t"
 366             /* increment counter and jmp to top */
 367             "subs       %[w], %[w], #1\n\t"
 368             "bne        1b\n\t"
 369             "2:\n\t"
 370             : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
 371             : [component_half] "r" (component_half),
 372               [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
 373             : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
 374     }
 375 }
 376
 377 #endif
 378
 379 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
 380                                    uint8_t, 1, uint8_t, 1)
 381 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
 382                                    uint32_t, 1, uint32_t, 1)
 383
 384 PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
 385                                      uint32_t, 1, uint32_t, 1)
 386
 387 PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
 388                                       uint8_t, 1, uint32_t, 1)
 389
 390 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
 391                                         uint16_t, uint16_t)
 392 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
 393                                         uint32_t, uint32_t)
 394
 395 static const pixman_fast_path_t arm_simd_fast_paths[] =
 396 {
 397     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
 398     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
 399     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
 400     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888),
 401     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888),
 402     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888),
 403     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
 404     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
 405
 406     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8),
 407
 408     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
 409     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888),
 410     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
 411     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
 412
 413     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
 414     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
 415
 416     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888),
 417     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888),
 418     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888),
 419     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888),
 420     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888),
 421     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888),
 422
 423     { PIXMAN_OP_NONE },
 424 };
 425
 426 pixman_implementation_t *
 427 _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
 428 {
 429     pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
 430
 431     return imp;
 432 }