/*
 * Copyright © 2008 Mozilla Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Mozilla Corporation not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  Mozilla Corporation makes no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Jeff Muizelaar (jeff@infidigm.net)
 *
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "pixman-private.h"

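/*
 * For reference: the assembly in this file leans on two scalar identities.
 * The illustrative helpers below (hypothetical names, not called by any of
 * the fast paths) spell them out in plain C.
 */

/* uqadd8 performs this per-byte saturating add on four bytes at once */
static inline uint8_t
illustrative_add_saturate_u8 (uint8_t a, uint8_t b)
{
    unsigned sum = (unsigned)a + (unsigned)b;

    return sum > 0xff ? 0xff : (uint8_t)sum;
}

/*
 * Rounding division by 255: for 0 <= v <= 255 * 255, computing
 * t = v + 0x80 and then (t + (t >> 8)) >> 8 yields v / 255 rounded to
 * nearest.  The mla/uxtab16/uxtb16 sequences below compute exactly this,
 * two 8-bit channels per 32-bit register.
 */
static inline unsigned
illustrative_div_255 (unsigned v)
{
    unsigned t = v + 0x80;

    return (t + (t >> 8)) >> 8;
}
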
static void
arm_composite_add_8000_8000 (pixman_implementation_t * impl,
                             pixman_op_t               op,
                             pixman_image_t *          src_image,
                             pixman_image_t *          mask_image,
                             pixman_image_t *          dst_image,
                             int32_t                   src_x,
                             int32_t                   src_y,
                             int32_t                   mask_x,
                             int32_t                   mask_y,
                             int32_t                   dest_x,
                             int32_t                   dest_y,
                             int32_t                   width,
                             int32_t                   height)
{
    uint8_t     *dst_line, *dst;
    uint8_t     *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;
    uint8_t s, d;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        /* Align both src and dst before doing 32-bit reads; if the two
         * pointers have differing alignments, we never leave this loop
         * and the whole scanline is handled byte by byte.
         */
        while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
        {
            s = *src;
            d = *dst;
            /* d = saturate (d + s); with the "+r" constraint, GCC numbers
             * the tied input half of operand 0 as %2 */
            asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
            *dst = d;

            dst++;
            src++;
            w--;
        }

        /* four saturating byte adds per instruction once both pointers
         * are word-aligned */
        while (w >= 4)
        {
            asm ("uqadd8 %0, %1, %2"
                 : "=r" (*(uint32_t*)dst)
                 : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
            dst += 4;
            src += 4;
            w -= 4;
        }

        /* byte-wise tail */
        while (w)
        {
            s = *src;
            d = *dst;
            asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
            *dst = d;

            dst++;
            src++;
            w--;
        }
    }
}
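
/*
 * The loop above follows the usual head/body/tail pattern: a byte-wise
 * head until both pointers are word-aligned, a word loop doing four
 * saturating byte adds per uqadd8, and a byte-wise tail.  Per byte it is
 * equivalent to *dst = illustrative_add_saturate_u8 (*dst, *src) using
 * the sketch near the top of the file.
 */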

static void
arm_composite_over_8888_8888 (pixman_implementation_t * impl,
                              pixman_op_t               op,
                              pixman_image_t *          src_image,
                              pixman_image_t *          mask_image,
                              pixman_image_t *          dst_image,
                              int32_t                   src_x,
                              int32_t                   src_y,
                              int32_t                   mask_x,
                              int32_t                   mask_y,
                              int32_t                   dest_x,
                              int32_t                   dest_y,
                              int32_t                   width,
                              int32_t                   height)
{
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;
    uint32_t component_half = 0x800080;
    uint32_t upper_component_mask = 0xff00ff00;
    uint32_t alpha_mask = 0xff;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

/* #define inner_branch */
        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load src */
            "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
             * The 0x0 case also lets us skip an unnecessary data write, which
             * is the more valuable saving, so we only check for that.
             */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"

            /* = 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"

            "ldr r4, [%[dest]] \n\t"

#else
            "ldr r4, [%[dest]] \n\t"

            /* = 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"
#endif
            "uxtb16 r6, r4\n\t"
            "uxtb16 r7, r4, ror #8\n\t"

            /* multiply by 255 - alpha (r8), then by 257, and divide by 65536 */
            "mla r6, r6, r8, %[component_half]\n\t"
            "mla r7, r7, r8, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            /* recombine the 0xff00ff00 bytes of r6 and r7 */
            "and r7, r7, %[upper_component_mask]\n\t"
            "uxtab16 r6, r7, r6, ror #8\n\t"

            "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
            "3:\n\t"

#endif
            "str r5, [%[dest]], #4\n\t"
            /* decrement loop counter and branch back to the top */
            "subs       %[w], %[w], #1\n\t"
            "bne        1b\n\t"
            "2:\n\t"
            : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
            : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
              [alpha_mask] "r" (alpha_mask)
            : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
            );
    }
}
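
/*
 * A minimal scalar model of the premultiplied OVER operator computed
 * above (hypothetical helper, for illustration only): each destination
 * channel becomes src + dst * (255 - src_alpha) / 255, with the division
 * done by the rounding trick shown at the top of the file.  Valid
 * premultiplied input cannot overflow a channel, so a plain add stands
 * in for the defensive uqadd8.
 */
static inline uint32_t
illustrative_over (uint32_t src, uint32_t dst)
{
    unsigned inv_a = 255 - (src >> 24);
    uint32_t result = src;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        unsigned d = (dst >> shift) & 0xff;

        result += illustrative_div_255 (d * inv_a) << shift;
    }

    return result;
}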

static void
arm_composite_over_8888_n_8888 (pixman_implementation_t * impl,
                                pixman_op_t               op,
                                pixman_image_t *          src_image,
                                pixman_image_t *          mask_image,
                                pixman_image_t *          dst_image,
                                int32_t                   src_x,
                                int32_t                   src_y,
                                int32_t                   mask_x,
                                int32_t                   mask_y,
                                int32_t                   dest_x,
                                int32_t                   dest_y,
                                int32_t                   width,
                                int32_t                   height)
{
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    int dst_stride, src_stride;
    int32_t w;
    uint32_t component_half = 0x800080;
    uint32_t alpha_mask = 0xff;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
    mask >>= 24;

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

/* #define inner_branch */
        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load src */
            "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
             * The 0x0 case also lets us skip an unnecessary data write, which
             * is the more valuable saving, so we only check for that.
             */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"

#endif
            "ldr r4, [%[dest]] \n\t"

            "uxtb16 r6, r5\n\t"
            "uxtb16 r7, r5, ror #8\n\t"

            /* multiply by the constant mask alpha, then by 257, and divide by 65536 */
            "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
            "mla r7, r7, %[mask_alpha], %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r5, r6, r7, lsl #8\n\t"

            "uxtb16 r6, r4\n\t"
            "uxtb16 r7, r4, ror #8\n\t"

            /* 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"

            /* multiply by 255 - alpha (r8), then by 257, and divide by 65536 */
            "mla r6, r6, r8, %[component_half]\n\t"
            "mla r7, r7, r8, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r6, r6, r7, lsl #8\n\t"

            "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
            "3:\n\t"

#endif
            "str r5, [%[dest]], #4\n\t"
            /* decrement loop counter and branch back to the top */
            "subs       %[w], %[w], #1\n\t"
            "bne        1b\n\t"
            "2:\n\t"
            : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
            : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
              [alpha_mask] "r" (alpha_mask)
            : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
            );
    }
}
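
/*
 * The variant above is OVER with the source first scaled by a constant
 * mask alpha.  A scalar model of the scaling step (hypothetical, for
 * illustration): multiply every channel, alpha included, by m / 255, so
 * that per pixel the function behaves like
 * illustrative_over (illustrative_in (*src, mask), *dst).
 */
static inline uint32_t
illustrative_in (uint32_t src, unsigned m)
{
    uint32_t result = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        unsigned c = (src >> shift) & 0xff;

        result |= illustrative_div_255 (c * m) << shift;
    }

    return result;
}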

static void
arm_composite_over_n_8_8888 (pixman_implementation_t * impl,
                             pixman_op_t               op,
                             pixman_image_t *          src_image,
                             pixman_image_t *          mask_image,
                             pixman_image_t *          dst_image,
                             int32_t                   src_x,
                             int32_t                   src_y,
                             int32_t                   mask_x,
                             int32_t                   mask_y,
                             int32_t                   dest_x,
                             int32_t                   dest_y,
                             int32_t                   width,
                             int32_t                   height)
{
    uint32_t src;
    uint32_t *dst_line, *dst;
    uint8_t  *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t component_mask = 0xff00ff;
    uint32_t component_half = 0x800080;
    uint32_t src_hi, src_lo;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    /* bail out if fully transparent */
    if (src == 0)
        return;

    /* pre-split the solid color into its even and odd components */
    src_hi = (src >> 8) & component_mask;
    src_lo = src & component_mask;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

/* #define inner_branch */
        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load mask */
            "ldrb r5, [%[mask]], #1\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
             * The 0x0 case also lets us skip an unnecessary data write, which
             * is the more valuable saving, so we only check for that.
             */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"

#endif
            "ldr r4, [%[dest]] \n\t"

            /* multiply by the mask alpha (r5), then by 257, and divide by 65536 */
            "mla r6, %[src_lo], r5, %[component_half]\n\t"
            "mla r7, %[src_hi], r5, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r5, r6, r7, lsl #8\n\t"

            "uxtb16 r6, r4\n\t"
            "uxtb16 r7, r4, ror #8\n\t"

            /* we could simplify this to use 'sub' if we were
             * willing to give up a register for alpha_mask
             */
            "mvn r8, r5\n\t"
            "mov r8, r8, lsr #24\n\t"

            /* multiply by 255 - alpha (r8), then by 257, and divide by 65536 */
            "mla r6, r6, r8, %[component_half]\n\t"
            "mla r7, r7, r8, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r6, r6, r7, lsl #8\n\t"

            "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
            "3:\n\t"

#endif
            "str r5, [%[dest]], #4\n\t"
            /* decrement loop counter and branch back to the top */
            "subs       %[w], %[w], #1\n\t"
            "bne        1b\n\t"
            "2:\n\t"
            : [w] "+r" (w), [dest] "+r" (dst), [mask] "+r" (mask)
            : [component_half] "r" (component_half),
              [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
            : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
    }
}
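
/*
 * In the same scalar terms, the function above computes
 * *dst = illustrative_over (illustrative_in (src, *mask), *dst) for a
 * solid source and an a8 mask; pre-splitting the color into src_hi and
 * src_lo reduces the per-pixel scaling to two mla instructions.
 */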

static const pixman_fast_path_t arm_simd_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, arm_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, arm_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, arm_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, arm_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, arm_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, arm_composite_over_8888_n_8888),

    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, arm_composite_add_8000_8000),

    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, arm_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, arm_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, arm_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, arm_composite_over_n_8_8888),

    { PIXMAN_OP_NONE },
};
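
/*
 * Each entry matches a composite request by (operator, source format,
 * mask format, destination format); the table is scanned in order, the
 * first match wins, and { PIXMAN_OP_NONE } terminates the list.  Requests
 * that match nothing fall through to the fallback implementation chained
 * in below.
 */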

pixman_implementation_t *
_pixman_implementation_create_arm_simd (void)
{
    pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
    pixman_implementation_t *imp = _pixman_implementation_create (general, arm_simd_fast_paths);

    return imp;
}