Rename FastPathInfo to pixman_fast_path_t
[profile/ivi/pixman.git] / pixman / pixman-arm-simd.c
1 /*
2  * Copyright © 2008 Mozilla Corporation
3  *
4  * Permission to use, copy, modify, distribute, and sell this software and its
5  * documentation for any purpose is hereby granted without fee, provided that
6  * the above copyright notice appear in all copies and that both that
7  * copyright notice and this permission notice appear in supporting
8  * documentation, and that the name of Mozilla Corporation not be used in
9  * advertising or publicity pertaining to distribution of the software without
10  * specific, written prior permission.  Mozilla Corporation makes no
11  * representations about the suitability of this software for any purpose.  It
12  * is provided "as is" without express or implied warranty.
13  *
14  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
20  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
21  * SOFTWARE.
22  *
23  * Author:  Jeff Muizelaar (jeff@infidigm.net)
24  *
25  */
26 #ifdef HAVE_CONFIG_H
27 #include <config.h>
28 #endif
29
30 #include "pixman-arm-simd.h"
31
32 void
33 fbCompositeSrcAdd_8000x8000arm (
34                             pixman_implementation_t * impl,
35                             pixman_op_t op,
36                                 pixman_image_t * pSrc,
37                                 pixman_image_t * pMask,
38                                 pixman_image_t * pDst,
39                                 int32_t      xSrc,
40                                 int32_t      ySrc,
41                                 int32_t      xMask,
42                                 int32_t      yMask,
43                                 int32_t      xDst,
44                                 int32_t      yDst,
45                                 int32_t      width,
46                                 int32_t      height)
47 {
48     uint8_t     *dstLine, *dst;
49     uint8_t     *srcLine, *src;
50     int dstStride, srcStride;
51     uint16_t    w;
52     uint8_t     s, d;
53
54     fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
55     fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
56
57     while (height--)
58     {
59         dst = dstLine;
60         dstLine += dstStride;
61         src = srcLine;
62         srcLine += srcStride;
63         w = width;
64
65         /* ensure both src and dst are properly aligned before doing 32 bit reads
66          * we'll stay in this loop if src and dst have differing alignments */
67         while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
68         {
69             s = *src;
70             d = *dst;
71             asm("uqadd8 %0, %1, %2" : "+r"(d) : "r"(s));
72             *dst = d;
73
74             dst++;
75             src++;
76             w--;
77         }
78
79         while (w >= 4)
80         {
81             asm("uqadd8 %0, %1, %2" : "=r"(*(uint32_t*)dst) : "r"(*(uint32_t*)src), "r"(*(uint32_t*)dst));
82             dst += 4;
83             src += 4;
84             w -= 4;
85         }
86
87         while (w)
88         {
89             s = *src;
90             d = *dst;
91             asm("uqadd8 %0, %1, %2" : "+r"(d) : "r"(s));
92             *dst = d;
93
94             dst++;
95             src++;
96             w--;
97         }
98     }
99
100 }
101
102 void
103 fbCompositeSrc_8888x8888arm (
104                             pixman_implementation_t * impl,
105                             pixman_op_t op,
106                          pixman_image_t * pSrc,
107                          pixman_image_t * pMask,
108                          pixman_image_t * pDst,
109                          int32_t      xSrc,
110                          int32_t      ySrc,
111                          int32_t      xMask,
112                          int32_t      yMask,
113                          int32_t      xDst,
114                          int32_t      yDst,
115                          int32_t      width,
116                          int32_t      height)
117 {
118     uint32_t    *dstLine, *dst;
119     uint32_t    *srcLine, *src;
120     int dstStride, srcStride;
121     uint16_t    w;
122     uint32_t component_half = 0x800080;
123     uint32_t upper_component_mask = 0xff00ff00;
124     uint32_t alpha_mask = 0xff;
125
126     fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
127     fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
128
129     while (height--)
130     {
131         dst = dstLine;
132         dstLine += dstStride;
133         src = srcLine;
134         srcLine += srcStride;
135         w = width;
136
137 //#define inner_branch
138         asm volatile (
139                         "cmp %[w], #0\n\t"
140                         "beq 2f\n\t"
141                         "1:\n\t"
142                         /* load src */
143                         "ldr r5, [%[src]], #4\n\t"
144 #ifdef inner_branch
145                         /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
146                          * The 0x0 case also allows us to avoid doing an unecessary data
147                          * write which is more valuable so we only check for that */
148                         "cmp r5, #0\n\t"
149                         "beq 3f\n\t"
150
151                         /* = 255 - alpha */
152                         "sub r8, %[alpha_mask], r5, lsr #24\n\t"
153
154                         "ldr r4, [%[dest]] \n\t"
155
156 #else
157                         "ldr r4, [%[dest]] \n\t"
158
159                         /* = 255 - alpha */
160                         "sub r8, %[alpha_mask], r5, lsr #24\n\t"
161 #endif
162                         "uxtb16 r6, r4\n\t"
163                         "uxtb16 r7, r4, ror #8\n\t"
164
165                         /* multiply by 257 and divide by 65536 */
166                         "mla r6, r6, r8, %[component_half]\n\t"
167                         "mla r7, r7, r8, %[component_half]\n\t"
168
169                         "uxtab16 r6, r6, r6, ror #8\n\t"
170                         "uxtab16 r7, r7, r7, ror #8\n\t"
171
172                         /* recombine the 0xff00ff00 bytes of r6 and r7 */
173                         "and r7, r7, %[upper_component_mask]\n\t"
174                         "uxtab16 r6, r7, r6, ror #8\n\t"
175
176                         "uqadd8 r5, r6, r5\n\t"
177
178 #ifdef inner_branch
179                         "3:\n\t"
180
181 #endif
182                         "str r5, [%[dest]], #4\n\t"
183                         /* increment counter and jmp to top */
184                         "subs   %[w], %[w], #1\n\t"
185                         "bne    1b\n\t"
186                         "2:\n\t"
187                         : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
188                         : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
189                           [alpha_mask] "r" (alpha_mask)
190                         : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
191                         );
192     }
193 }
194
195 void
196 fbCompositeSrc_8888x8x8888arm (
197                             pixman_implementation_t * impl,
198                             pixman_op_t op,
199                                pixman_image_t * pSrc,
200                                pixman_image_t * pMask,
201                                pixman_image_t * pDst,
202                                int32_t  xSrc,
203                                int32_t  ySrc,
204                                int32_t      xMask,
205                                int32_t      yMask,
206                                int32_t      xDst,
207                                int32_t      yDst,
208                                int32_t      width,
209                                int32_t      height)
210 {
211     uint32_t    *dstLine, *dst;
212     uint32_t    *srcLine, *src;
213     uint32_t    mask;
214     int dstStride, srcStride;
215     uint16_t    w;
216     uint32_t component_half = 0x800080;
217     uint32_t alpha_mask = 0xff;
218
219     fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
220     fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
221
222     fbComposeGetSolid (pMask, mask, pDst->bits.format);
223     mask = (mask) >> 24;
224
225     while (height--)
226     {
227         dst = dstLine;
228         dstLine += dstStride;
229         src = srcLine;
230         srcLine += srcStride;
231         w = width;
232
233 //#define inner_branch
234         asm volatile (
235                         "cmp %[w], #0\n\t"
236                         "beq 2f\n\t"
237                         "1:\n\t"
238                         /* load src */
239                         "ldr r5, [%[src]], #4\n\t"
240 #ifdef inner_branch
241                         /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
242                          * The 0x0 case also allows us to avoid doing an unecessary data
243                          * write which is more valuable so we only check for that */
244                         "cmp r5, #0\n\t"
245                         "beq 3f\n\t"
246
247 #endif
248                         "ldr r4, [%[dest]] \n\t"
249
250                         "uxtb16 r6, r5\n\t"
251                         "uxtb16 r7, r5, ror #8\n\t"
252
253                         /* multiply by alpha (r8) then by 257 and divide by 65536 */
254                         "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
255                         "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
256
257                         "uxtab16 r6, r6, r6, ror #8\n\t"
258                         "uxtab16 r7, r7, r7, ror #8\n\t"
259
260                         "uxtb16 r6, r6, ror #8\n\t"
261                         "uxtb16 r7, r7, ror #8\n\t"
262
263                         /* recombine */
264                         "orr r5, r6, r7, lsl #8\n\t"
265
266                         "uxtb16 r6, r4\n\t"
267                         "uxtb16 r7, r4, ror #8\n\t"
268
269                         /* 255 - alpha */
270                         "sub r8, %[alpha_mask], r5, lsr #24\n\t"
271
272                         /* multiply by alpha (r8) then by 257 and divide by 65536 */
273                         "mla r6, r6, r8, %[component_half]\n\t"
274                         "mla r7, r7, r8, %[component_half]\n\t"
275
276                         "uxtab16 r6, r6, r6, ror #8\n\t"
277                         "uxtab16 r7, r7, r7, ror #8\n\t"
278
279                         "uxtb16 r6, r6, ror #8\n\t"
280                         "uxtb16 r7, r7, ror #8\n\t"
281
282                         /* recombine */
283                         "orr r6, r6, r7, lsl #8\n\t"
284
285                         "uqadd8 r5, r6, r5\n\t"
286
287 #ifdef inner_branch
288                         "3:\n\t"
289
290 #endif
291                         "str r5, [%[dest]], #4\n\t"
292                         /* increment counter and jmp to top */
293                         "subs   %[w], %[w], #1\n\t"
294                         "bne    1b\n\t"
295                         "2:\n\t"
296                         : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
297                         : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
298                           [alpha_mask] "r" (alpha_mask)
299                         : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
300                         );
301     }
302 }
303
304 void
305 fbCompositeSolidMask_nx8x8888arm (
306                             pixman_implementation_t * impl,
307                             pixman_op_t      op,
308                                pixman_image_t * pSrc,
309                                pixman_image_t * pMask,
310                                pixman_image_t * pDst,
311                                int32_t      xSrc,
312                                int32_t      ySrc,
313                                int32_t      xMask,
314                                int32_t      yMask,
315                                int32_t      xDst,
316                                int32_t      yDst,
317                                int32_t      width,
318                                int32_t      height)
319 {
320     uint32_t     src, srca;
321     uint32_t    *dstLine, *dst;
322     uint8_t     *maskLine, *mask;
323     int          dstStride, maskStride;
324     uint16_t     w;
325
326     fbComposeGetSolid(pSrc, src, pDst->bits.format);
327
328     srca = src >> 24;
329     if (src == 0)
330         return;
331
332     uint32_t component_mask = 0xff00ff;
333     uint32_t component_half = 0x800080;
334
335     uint32_t src_hi = (src >> 8) & component_mask;
336     uint32_t src_lo = src & component_mask;
337
338     fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
339     fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
340
341     while (height--)
342     {
343         dst = dstLine;
344         dstLine += dstStride;
345         mask = maskLine;
346         maskLine += maskStride;
347         w = width;
348
349 //#define inner_branch
350         asm volatile (
351                         "cmp %[w], #0\n\t"
352                         "beq 2f\n\t"
353                         "1:\n\t"
354                         /* load mask */
355                         "ldrb r5, [%[mask]], #1\n\t"
356 #ifdef inner_branch
357                         /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
358                          * The 0x0 case also allows us to avoid doing an unecessary data
359                          * write which is more valuable so we only check for that */
360                         "cmp r5, #0\n\t"
361                         "beq 3f\n\t"
362
363 #endif
364                         "ldr r4, [%[dest]] \n\t"
365
366                         /* multiply by alpha (r8) then by 257 and divide by 65536 */
367                         "mla r6, %[src_lo], r5, %[component_half]\n\t"
368                         "mla r7, %[src_hi], r5, %[component_half]\n\t"
369
370                         "uxtab16 r6, r6, r6, ror #8\n\t"
371                         "uxtab16 r7, r7, r7, ror #8\n\t"
372
373                         "uxtb16 r6, r6, ror #8\n\t"
374                         "uxtb16 r7, r7, ror #8\n\t"
375
376                         /* recombine */
377                         "orr r5, r6, r7, lsl #8\n\t"
378
379                         "uxtb16 r6, r4\n\t"
380                         "uxtb16 r7, r4, ror #8\n\t"
381
382                         /* we could simplify this to use 'sub' if we were
383                          * willing to give up a register for alpha_mask */
384                         "mvn r8, r5\n\t"
385                         "mov r8, r8, lsr #24\n\t"
386
387                         /* multiply by alpha (r8) then by 257 and divide by 65536 */
388                         "mla r6, r6, r8, %[component_half]\n\t"
389                         "mla r7, r7, r8, %[component_half]\n\t"
390
391                         "uxtab16 r6, r6, r6, ror #8\n\t"
392                         "uxtab16 r7, r7, r7, ror #8\n\t"
393
394                         "uxtb16 r6, r6, ror #8\n\t"
395                         "uxtb16 r7, r7, ror #8\n\t"
396
397                         /* recombine */
398                         "orr r6, r6, r7, lsl #8\n\t"
399
400                         "uqadd8 r5, r6, r5\n\t"
401
402 #ifdef inner_branch
403                         "3:\n\t"
404
405 #endif
406                         "str r5, [%[dest]], #4\n\t"
407                         /* increment counter and jmp to top */
408                         "subs   %[w], %[w], #1\n\t"
409                         "bne    1b\n\t"
410                         "2:\n\t"
411                         : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
412                         : [component_half] "r" (component_half),
413                           [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
414                         : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
415                         );
416     }
417 }
418
/* Fast-path dispatch table: each entry matches (op, src format, mask format,
 * dest format) and names the routine that handles that combination, plus a
 * flags word (NEED_SOLID_MASK entries require the mask to be a solid image).
 * The table is terminated by the PIXMAN_OP_NONE sentinel entry. */
static const pixman_fast_path_t arm_simd_fast_path_array[] =
{
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888arm,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888arm,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888arm,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888arm,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888arm,    NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888arm,    NEED_SOLID_MASK },

    { PIXMAN_OP_ADD, PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       fbCompositeSrcAdd_8000x8000arm,   0 },

    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888arm,     0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888arm,     0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888arm,     0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8x8888arm,     0 },

    { PIXMAN_OP_NONE },
};

/* Exported alias for the table; presumably consumed outside this file
 * (declared in a header we cannot see here) — hence not static. */
const pixman_fast_path_t *const arm_simd_fast_paths = arm_simd_fast_path_array;
439
440 static void
441 arm_simd_composite (pixman_implementation_t *imp,
442                 pixman_op_t     op,
443                 pixman_image_t *src,
444                 pixman_image_t *mask,
445                 pixman_image_t *dest,
446                 int32_t         src_x,
447                 int32_t         src_y,
448                 int32_t         mask_x,
449                 int32_t         mask_y,
450                 int32_t         dest_x,
451                 int32_t         dest_y,
452                 int32_t        width,
453                 int32_t        height)
454 {
455     if (_pixman_run_fast_path (arm_simd_fast_paths, imp,
456                                op, src, mask, dest,
457                                src_x, src_y,
458                                mask_x, mask_y,
459                                dest_x, dest_y,
460                                width, height))
461     {
462         return;
463     }
464
465     _pixman_implementation_composite (imp->delegate, op,
466                                       src, mask, dest,
467                                       src_x, src_y,
468                                       mask_x, mask_y,
469                                       dest_x, dest_y,
470                                       width, height);
471 }
472
473 pixman_implementation_t *
474 _pixman_implementation_create_arm_simd (void)
475 {
476     pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
477     pixman_implementation_t *imp = _pixman_implementation_create (general);
478
479     imp->composite = arm_simd_composite;
480
481     return imp;
482 }