Check alignment of 'src' pointer in optimized ARM routines
pixman/pixman-arm-simd.c
/*
 * Copyright © 2008 Mozilla Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Mozilla Corporation not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  Mozilla Corporation makes no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Jeff Muizelaar (jeff@infidigm.net)
 *
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "pixman-arm-simd.h"

void
fbCompositeSrcAdd_8000x8000arm (pixman_op_t op,
                                pixman_image_t * pSrc,
                                pixman_image_t * pMask,
                                pixman_image_t * pDst,
                                int16_t      xSrc,
                                int16_t      ySrc,
                                int16_t      xMask,
                                int16_t      yMask,
                                int16_t      xDst,
                                int16_t      yDst,
                                uint16_t     width,
                                uint16_t     height)
{
    uint8_t     *dstLine, *dst;
    uint8_t     *srcLine, *src;
    int dstStride, srcStride;
    uint16_t    w;
    uint8_t     s, d;

    fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
    fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

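        /* Process leading bytes one at a time until both src and dst are
         * 32-bit aligned; if their alignments differ they can never line up,
         * so this loop then handles the whole scanline a byte at a time. */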
        while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
        {
            s = *src;
            d = *dst;
            asm("uqadd8 %0, %1, %0" : "+r"(d) : "r"(s));
            *dst = d;

            dst++;
            src++;
            w--;
        }

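        /* uqadd8 adds the four bytes of a word in parallel, saturating each
         * byte at 0xff, so the aligned loop handles four pixels per
         * iteration. */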
        while (w >= 4)
        {
            asm("uqadd8 %0, %1, %2" : "=r"(*(uint32_t*)dst) : "r"(*(uint32_t*)src), "r"(*(uint32_t*)dst));
            dst += 4;
            src += 4;
            w -= 4;
        }

        while (w)
        {
            s = *src;
            d = *dst;
            asm("uqadd8 %0, %1, %0" : "+r"(d) : "r"(s));
            *dst = d;

            dst++;
            src++;
            w--;
        }
    }

}
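
/* Illustrative only: a plain C sketch of the per-byte saturating ADD that the
 * uqadd8 loops above implement for a8 surfaces.  The helper name is
 * hypothetical and the block is not compiled into pixman. */
#if 0
static void
fbCompositeSrcAdd_a8_ref (uint8_t *dst, const uint8_t *src, uint16_t w)
{
    while (w--)
    {
        unsigned int t = *dst + *src++;          /* widen to avoid overflow */
        *dst++ = t > 0xff ? 0xff : (uint8_t) t;  /* saturate like uqadd8 */
    }
}
#endif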

void
fbCompositeSrc_8888x8888arm (pixman_op_t op,
                             pixman_image_t * pSrc,
                             pixman_image_t * pMask,
                             pixman_image_t * pDst,
                             int16_t      xSrc,
                             int16_t      ySrc,
                             int16_t      xMask,
                             int16_t      yMask,
                             int16_t      xDst,
                             int16_t      yDst,
                             uint16_t     width,
                             uint16_t     height)
{
    uint32_t    *dstLine, *dst;
    uint32_t    *srcLine, *src;
    int dstStride, srcStride;
    uint16_t    w;
    uint32_t component_half = 0x800080;
    uint32_t upper_component_mask = 0xff00ff00;
    uint32_t alpha_mask = 0xff;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

//#define inner_branch
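        /* For each pixel the assembly below computes the premultiplied OVER
         * operator, dest = src + dest * (255 - src.alpha) / 255, on all four
         * components at once: uxtb16 splits the destination into two
         * half-words per register, mla multiplies each half-word by the
         * inverse alpha and adds the 0x80 rounding bias, and the uxtab16/ror
         * steps fold the high byte of each half-word back in to approximate
         * the division by 255 before uqadd8 adds the source with per-byte
         * saturation. */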
        asm volatile (
                        "cmp %[w], #0\n\t"
                        "beq 2f\n\t"
                        "1:\n\t"
                        /* load src */
                        "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
                        /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
                         * The 0x0 case also allows us to avoid doing an unnecessary data
                         * write, which is more valuable, so we only check for that. */
                        "cmp r5, #0\n\t"
                        "beq 3f\n\t"

                        /* = 255 - alpha */
                        "sub r8, %[alpha_mask], r5, lsr #24\n\t"

                        "ldr r4, [%[dest]] \n\t"

#else
                        "ldr r4, [%[dest]] \n\t"

                        /* = 255 - alpha */
                        "sub r8, %[alpha_mask], r5, lsr #24\n\t"
#endif
                        "uxtb16 r6, r4\n\t"
                        "uxtb16 r7, r4, ror #8\n\t"

                        /* multiply by 257 and divide by 65536 */
                        "mla r6, r6, r8, %[component_half]\n\t"
                        "mla r7, r7, r8, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        /* recombine the 0xff00ff00 bytes of r6 and r7 */
                        "and r7, r7, %[upper_component_mask]\n\t"
                        "uxtab16 r6, r7, r6, ror #8\n\t"

                        "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
                        "3:\n\t"

#endif
                        "str r5, [%[dest]], #4\n\t"
                        /* decrement counter and loop back to the top */
                        "subs   %[w], %[w], #1\n\t"
                        "bne    1b\n\t"
                        "2:\n\t"
                        : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
                        : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
                          [alpha_mask] "r" (alpha_mask)
                        : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
                        );
    }
}
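
/* Illustrative only: a plain C sketch of the per-pixel blend performed by the
 * assembly in fbCompositeSrc_8888x8888arm, using the same 0x80 rounding bias
 * to approximate division by 255.  The helper name is hypothetical and the
 * block is not compiled into pixman. */
#if 0
static uint32_t
fbOver8888_ref (uint32_t src, uint32_t dest)
{
    uint32_t ia = 0xff - (src >> 24);                           /* 255 - src alpha */
    uint32_t rb = (dest & 0x00ff00ff) * ia + 0x00800080;        /* components 0, 2 */
    uint32_t ag = ((dest >> 8) & 0x00ff00ff) * ia + 0x00800080; /* components 1, 3 */

    rb = ((rb + ((rb >> 8) & 0x00ff00ff)) >> 8) & 0x00ff00ff;
    ag = ((ag + ((ag >> 8) & 0x00ff00ff)) >> 8) & 0x00ff00ff;

    /* The assembly uses uqadd8 here; for valid premultiplied pixels the plain
     * add below cannot overflow any byte. */
    return src + (rb | (ag << 8));
}
#endif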

void
fbCompositeSrc_8888x8x8888arm (pixman_op_t op,
                               pixman_image_t * pSrc,
                               pixman_image_t * pMask,
                               pixman_image_t * pDst,
                               int16_t      xSrc,
                               int16_t      ySrc,
                               int16_t      xMask,
                               int16_t      yMask,
                               int16_t      xDst,
                               int16_t      yDst,
                               uint16_t     width,
                               uint16_t     height)
{
    uint32_t    *dstLine, *dst;
    uint32_t    *srcLine, *src;
    uint32_t    mask;
    int dstStride, srcStride;
    uint16_t    w;
    uint32_t component_half = 0x800080;
    uint32_t alpha_mask = 0xff;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

    fbComposeGetSolid (pMask, mask, pDst->bits.format);
    mask = (mask) >> 24;

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

//#define inner_branch
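        /* The assembly below implements "src IN mask OVER dest": the source
         * components are first scaled by the constant mask alpha, then the
         * result is composited over the destination with the same
         * inverse-alpha multiply and divide-by-255 approximation used in
         * fbCompositeSrc_8888x8888arm above. */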
        asm volatile (
                        "cmp %[w], #0\n\t"
                        "beq 2f\n\t"
                        "1:\n\t"
                        /* load src */
                        "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
                        /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
                         * The 0x0 case also allows us to avoid doing an unnecessary data
                         * write, which is more valuable, so we only check for that. */
                        "cmp r5, #0\n\t"
                        "beq 3f\n\t"

#endif
                        "ldr r4, [%[dest]] \n\t"

                        "uxtb16 r6, r5\n\t"
                        "uxtb16 r7, r5, ror #8\n\t"

                        /* multiply by the mask alpha, then by 257 and divide by 65536 */
                        "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
                        "mla r7, r7, %[mask_alpha], %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r5, r6, r7, lsl #8\n\t"

                        "uxtb16 r6, r4\n\t"
                        "uxtb16 r7, r4, ror #8\n\t"

                        /* 255 - alpha */
                        "sub r8, %[alpha_mask], r5, lsr #24\n\t"

                        /* multiply by the inverse alpha (r8), then by 257 and divide by 65536 */
                        "mla r6, r6, r8, %[component_half]\n\t"
                        "mla r7, r7, r8, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r6, r6, r7, lsl #8\n\t"

                        "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
                        "3:\n\t"

#endif
                        "str r5, [%[dest]], #4\n\t"
                        /* decrement counter and loop back to the top */
                        "subs   %[w], %[w], #1\n\t"
                        "bne    1b\n\t"
                        "2:\n\t"
                        : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
                        : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
                          [alpha_mask] "r" (alpha_mask)
                        : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
                        );
    }
}

void
fbCompositeSolidMask_nx8x8888arm (pixman_op_t      op,
                                  pixman_image_t * pSrc,
                                  pixman_image_t * pMask,
                                  pixman_image_t * pDst,
                                  int16_t      xSrc,
                                  int16_t      ySrc,
                                  int16_t      xMask,
                                  int16_t      yMask,
                                  int16_t      xDst,
                                  int16_t      yDst,
                                  uint16_t     width,
                                  uint16_t     height)
{
    uint32_t     src, srca;
    uint32_t    *dstLine, *dst;
    uint8_t     *maskLine, *mask;
    int          dstStride, maskStride;
    uint16_t     w;

    fbComposeGetSolid(pSrc, src, pDst->bits.format);

    srca = src >> 24;
    if (src == 0)
        return;

    uint32_t component_mask = 0xff00ff;
    uint32_t component_half = 0x800080;

    uint32_t src_hi = (src >> 8) & component_mask;
    uint32_t src_lo = src & component_mask;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        mask = maskLine;
        maskLine += maskStride;
        w = width;

//#define inner_branch
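        /* The solid source was split into src_hi/src_lo outside the loop, so
         * the assembly below only has to scale those halves by the 8-bit mask
         * value of each pixel ("src IN mask") and then composite the result
         * over the destination, reusing the same divide-by-255 approximation
         * as the routines above. */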
        asm volatile (
                        "cmp %[w], #0\n\t"
                        "beq 2f\n\t"
                        "1:\n\t"
                        /* load mask */
                        "ldrb r5, [%[mask]], #1\n\t"
#ifdef inner_branch
                        /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
                         * The 0x0 case also allows us to avoid doing an unnecessary data
                         * write, which is more valuable, so we only check for that. */
                        "cmp r5, #0\n\t"
                        "beq 3f\n\t"

#endif
                        "ldr r4, [%[dest]] \n\t"

                        /* multiply by the mask (r5), then by 257 and divide by 65536 */
                        "mla r6, %[src_lo], r5, %[component_half]\n\t"
                        "mla r7, %[src_hi], r5, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r5, r6, r7, lsl #8\n\t"

                        "uxtb16 r6, r4\n\t"
                        "uxtb16 r7, r4, ror #8\n\t"

                        /* we could simplify this to use 'sub' if we were
                         * willing to give up a register for alpha_mask */
                        "mvn r8, r5\n\t"
                        "mov r8, r8, lsr #24\n\t"

                        /* multiply by the inverse alpha (r8), then by 257 and divide by 65536 */
                        "mla r6, r6, r8, %[component_half]\n\t"
                        "mla r7, r7, r8, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r6, r6, r7, lsl #8\n\t"

                        "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
                        "3:\n\t"

#endif
                        "str r5, [%[dest]], #4\n\t"
                        /* decrement counter and loop back to the top */
                        "subs   %[w], %[w], #1\n\t"
                        "bne    1b\n\t"
                        "2:\n\t"
                        : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
                        : [component_half] "r" (component_half),
                          [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
                        : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
                        );
    }
}