Initialize the ARM SIMD fast path array.
/*
 * Copyright © 2008 Mozilla Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Mozilla Corporation not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  Mozilla Corporation makes no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Jeff Muizelaar (jeff@infidigm.net)
 *
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "pixman-arm-simd.h"

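/* These fast paths rely on the ARMv6 SIMD media instructions (uqadd8,
 * uxtb16, uxtab16), so they only run on ARMv6-or-later cores; the caller
 * is expected to install this table only when run-time CPU detection
 * permits it. */
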
void
fbCompositeSrcAdd_8000x8000arm (pixman_op_t op,
                                pixman_image_t * pSrc,
                                pixman_image_t * pMask,
                                pixman_image_t * pDst,
                                int16_t      xSrc,
                                int16_t      ySrc,
                                int16_t      xMask,
                                int16_t      yMask,
                                int16_t      xDst,
                                int16_t      yDst,
                                uint16_t     width,
                                uint16_t     height)
{
    uint8_t     *dstLine, *dst;
    uint8_t     *srcLine, *src;
    int         dstStride, srcStride;
    uint16_t    w;
    uint8_t     s, d;

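    /* fbComposeGetStart yields a pointer to the pixel at the requested
     * (x, y) origin and the per-row stride, both in units of the named
     * type (uint8_t here, since a8 pixels are one byte each). */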
    fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
    fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

        /* Ensure both src and dst are word-aligned before doing 32-bit reads;
         * if they have differing alignments we never satisfy this test and
         * stay in this byte-at-a-time loop for the whole row. */
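        /* uqadd8 adds the four byte lanes of its operands in parallel,
         * saturating each lane at 0xff, which is exactly the ADD operator
         * on a8 data.  The "0" matching constraint ties the d input to
         * output operand 0, so %2 names the same register as %0. */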
        while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
        {
            s = *src;
            d = *dst;
            asm ("uqadd8 %0, %1, %2" : "=r" (d) : "r" (s), "0" (d));
            *dst = d;

            dst++;
            src++;
            w--;
        }

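        /* both pointers are now word-aligned: handle four a8 pixels per
         * iteration with a single 32-bit saturating add */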
        while (w >= 4)
        {
            asm ("uqadd8 %0, %1, %2"
                 : "=r" (*(uint32_t*)dst)
                 : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
            dst += 4;
            src += 4;
            w -= 4;
        }

        while (w)
        {
            s = *src;
            d = *dst;
            asm ("uqadd8 %0, %1, %2" : "=r" (d) : "r" (s), "0" (d));
            *dst = d;

            dst++;
            src++;
            w--;
        }
    }
}

void
fbCompositeSrc_8888x8888arm (pixman_op_t op,
                             pixman_image_t * pSrc,
                             pixman_image_t * pMask,
                             pixman_image_t * pDst,
                             int16_t      xSrc,
                             int16_t      ySrc,
                             int16_t      xMask,
                             int16_t      yMask,
                             int16_t      xDst,
                             int16_t      yDst,
                             uint16_t     width,
                             uint16_t     height)
{
    uint32_t    *dstLine, *dst;
    uint32_t    *srcLine, *src;
    int         dstStride, srcStride;
    uint16_t    w;
    uint32_t    component_half = 0x800080;
    uint32_t    upper_component_mask = 0xff00ff00;
    uint32_t    alpha_mask = 0xff;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

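        /* per pixel this is the OVER operator on premultiplied data:
         *     dst = src + dst * (255 - src.alpha) / 255
         * applied to all four components, eight bits at a time */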
//#define inner_branch
        asm volatile (
                        "cmp %[w], #0\n\t"
                        "beq 2f\n\t"
                        "1:\n\t"
                        /* load src */
                        "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
                        /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
                         * The 0x0 case also allows us to avoid doing an unnecessary data
                         * write, which is more valuable, so we only check for that. */
                        "cmp r5, #0\n\t"
                        "beq 3f\n\t"

                        /* = 255 - alpha */
                        "sub r8, %[alpha_mask], r5, lsr #24\n\t"

                        "ldr r4, [%[dest]]\n\t"

#else
                        "ldr r4, [%[dest]]\n\t"

                        /* = 255 - alpha */
                        "sub r8, %[alpha_mask], r5, lsr #24\n\t"
#endif
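                        /* on a little-endian core uxtb16 splits dst into
                         * 16-bit lanes: r6 takes the even bytes (blue and
                         * red), r7 the odd bytes (green and alpha), giving
                         * each component headroom for the multiply below */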
                        "uxtb16 r6, r4\n\t"
                        "uxtb16 r7, r4, ror #8\n\t"

                        /* multiply each component by (255 - alpha) and add
                         * the 0x80 rounding term; the uxtab16s below fold in
                         * the high byte of each lane, dividing by 255 via the
                         * x/255 ~= (x*257) >> 16 identity */
                        "mla r6, r6, r8, %[component_half]\n\t"
                        "mla r7, r7, r8, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"
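                        /* worked example for one 16-bit lane: v = 0xff and
                         * 255 - alpha = 0xff give t = 0xff*0xff + 0x80 =
                         * 0xfe81 from the mla, then t + (t >> 8) = 0xff7f,
                         * whose high byte 0xff is v * (255 - alpha) / 255 */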

                        /* recombine the 0xff00ff00 bytes of r6 and r7 */
                        "and r7, r7, %[upper_component_mask]\n\t"
                        "uxtab16 r6, r7, r6, ror #8\n\t"

                        "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
                        "3:\n\t"

#endif
                        "str r5, [%[dest]], #4\n\t"
                        /* decrement the counter and branch back to the top */
                        "subs   %[w], %[w], #1\n\t"
                        "bne    1b\n\t"
                        "2:\n\t"
                        : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
                        : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
                          [alpha_mask] "r" (alpha_mask)
                        : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
                        );
    }
}

void
fbCompositeSrc_8888x8x8888arm (pixman_op_t op,
                               pixman_image_t * pSrc,
                               pixman_image_t * pMask,
                               pixman_image_t * pDst,
                               int16_t      xSrc,
                               int16_t      ySrc,
                               int16_t      xMask,
                               int16_t      yMask,
                               int16_t      xDst,
                               int16_t      yDst,
                               uint16_t     width,
                               uint16_t     height)
{
    uint32_t    *dstLine, *dst;
    uint32_t    *srcLine, *src;
    uint32_t    mask;
    int         dstStride, srcStride;
    uint16_t    w;
    uint32_t    component_half = 0x800080;
    uint32_t    alpha_mask = 0xff;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

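    /* the mask is known to be a solid a8 image here (the fast path entry
     * is flagged NEED_SOLID_MASK), so reduce it to its 8-bit alpha once,
     * outside the loop */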
    fbComposeGetSolid (pMask, mask, pDst->bits.format);
    mask >>= 24;

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

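        /* per pixel: scale src by the constant mask alpha m, then apply
         * OVER:  dst = src*m/255 + dst * (255 - (src*m/255).alpha) / 255 */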
//#define inner_branch
        asm volatile (
                        "cmp %[w], #0\n\t"
                        "beq 2f\n\t"
                        "1:\n\t"
                        /* load src */
                        "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
                        /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
                         * The 0x0 case also allows us to avoid doing an unnecessary data
                         * write, which is more valuable, so we only check for that. */
                        "cmp r5, #0\n\t"
                        "beq 3f\n\t"

#endif
                        "ldr r4, [%[dest]]\n\t"

                        "uxtb16 r6, r5\n\t"
                        "uxtb16 r7, r5, ror #8\n\t"

                        /* multiply each src component by the mask alpha
                         * (%[mask_alpha]), adding the 0x80 rounding term;
                         * the *257 >> 16 division by 255 follows below */
                        "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
                        "mla r7, r7, %[mask_alpha], %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r5, r6, r7, lsl #8\n\t"

                        "uxtb16 r6, r4\n\t"
                        "uxtb16 r7, r4, ror #8\n\t"

                        /* 255 - alpha */
                        "sub r8, %[alpha_mask], r5, lsr #24\n\t"

                        /* multiply each dst component by 255 - alpha (r8),
                         * then divide by 255 via the *257 >> 16 trick */
                        "mla r6, r6, r8, %[component_half]\n\t"
                        "mla r7, r7, r8, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r6, r6, r7, lsl #8\n\t"

                        "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
                        "3:\n\t"

#endif
                        "str r5, [%[dest]], #4\n\t"
                        /* decrement the counter and branch back to the top */
                        "subs   %[w], %[w], #1\n\t"
                        "bne    1b\n\t"
                        "2:\n\t"
                        : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
                        : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
                          [alpha_mask] "r" (alpha_mask)
                        : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
                        );
    }
}

void
fbCompositeSolidMask_nx8x8888arm (pixman_op_t op,
                                  pixman_image_t * pSrc,
                                  pixman_image_t * pMask,
                                  pixman_image_t * pDst,
                                  int16_t      xSrc,
                                  int16_t      ySrc,
                                  int16_t      xMask,
                                  int16_t      yMask,
                                  int16_t      xDst,
                                  int16_t      yDst,
                                  uint16_t     width,
                                  uint16_t     height)
{
    uint32_t     src, srca;
    uint32_t    *dstLine, *dst;
    uint8_t     *maskLine, *mask;
    int          dstStride, maskStride;
    uint16_t     w;

    fbComposeGetSolid (pSrc, src, pDst->bits.format);

    srca = src >> 24;
    if (src == 0)
        return;

    uint32_t component_mask = 0xff00ff;
    uint32_t component_half = 0x800080;

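    /* split the solid source into its even (blue/red) and odd (green/alpha)
     * byte lanes once, outside the loop; each per-pixel mla below can then
     * scale both lanes of a half with a single multiply */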
    uint32_t src_hi = (src >> 8) & component_mask;
    uint32_t src_lo = src & component_mask;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        mask = maskLine;
        maskLine += maskStride;
        w = width;

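        /* per pixel: scale the solid src by the 8-bit mask value m, then
         * apply OVER:  dst = src*m/255 + dst * (255 - (src*m/255).alpha) / 255 */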
//#define inner_branch
        asm volatile (
                        "cmp %[w], #0\n\t"
                        "beq 2f\n\t"
                        "1:\n\t"
                        /* load mask */
                        "ldrb r5, [%[mask]], #1\n\t"
#ifdef inner_branch
                        /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
                         * The 0x0 case also allows us to avoid doing an unnecessary data
                         * write, which is more valuable, so we only check for that. */
                        "cmp r5, #0\n\t"
                        "beq 3f\n\t"

#endif
                        "ldr r4, [%[dest]]\n\t"

                        /* multiply the src lanes by the mask value (r5),
                         * adding the 0x80 rounding term; divide by 255 below */
                        "mla r6, %[src_lo], r5, %[component_half]\n\t"
                        "mla r7, %[src_hi], r5, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r5, r6, r7, lsl #8\n\t"

                        "uxtb16 r6, r4\n\t"
                        "uxtb16 r7, r4, ror #8\n\t"

                        /* we could simplify this to use 'sub' if we were
                         * willing to give up a register for alpha_mask */
                        "mvn r8, r5\n\t"
                        "mov r8, r8, lsr #24\n\t"

                        /* multiply each dst component by 255 - alpha (r8),
                         * then divide by 255 via the *257 >> 16 trick */
                        "mla r6, r6, r8, %[component_half]\n\t"
                        "mla r7, r7, r8, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r6, r6, r7, lsl #8\n\t"

                        "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
                        "3:\n\t"

#endif
                        "str r5, [%[dest]], #4\n\t"
                        /* decrement the counter and branch back to the top */
                        "subs   %[w], %[w], #1\n\t"
                        "bne    1b\n\t"
                        "2:\n\t"
                        : [w] "+r" (w), [dest] "+r" (dst), [mask] "+r" (mask)
                        : [component_half] "r" (component_half),
                          [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
                        : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
                        );
    }
}

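/* Each entry maps (operator, src format, mask format, dst format) to the
 * routine that implements it, plus per-entry flags; the list is terminated
 * by a PIXMAN_OP_NONE sentinel. */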
static const FastPathInfo arm_simd_fast_path_array[] =
{
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888arm,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888arm,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888arm,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888arm,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,   PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888arm,    NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,   PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888arm,    NEED_SOLID_MASK },

    { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null, PIXMAN_a8,       fbCompositeSrcAdd_8000x8000arm,   0 },

    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888arm, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888arm, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888arm, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8x8888arm, 0 },

    { PIXMAN_OP_NONE },
};

const FastPathInfo *const arm_simd_fast_paths = arm_simd_fast_path_array;