ARM: Removal of unused/broken NEON code
pixman/pixman-arm-neon.c (profile/ivi/pixman.git)
1 /*
2  * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
3  *
4  * Permission to use, copy, modify, distribute, and sell this software and its
5  * documentation for any purpose is hereby granted without fee, provided that
6  * the above copyright notice appear in all copies and that both that
7  * copyright notice and this permission notice appear in supporting
8  * documentation, and that the name of ARM Ltd not be used in
9  * advertising or publicity pertaining to distribution of the software without
10  * specific, written prior permission.  ARM Ltd makes no
11  * representations about the suitability of this software for any purpose.  It
12  * is provided "as is" without express or implied warranty.
13  *
14  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
20  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
21  * SOFTWARE.
22  *
23  * Author:  Ian Rickards (ian.rickards@arm.com)
24  * Author:  Jonathan Morton (jonathan.morton@movial.com)
25  * Author:  Markku Vire (markku.vire@movial.com)
26  *
27  */
28
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32
33 #include <arm_neon.h>
34 #include <string.h>
35 #include "pixman-private.h"
36
37 /* Deal with an intrinsic that is defined differently in GCC */
38 #if !defined(__ARMCC_VERSION) && !defined(__pld)
39 #define __pld(_x) __builtin_prefetch (_x)
40 #endif
41
42 static force_inline uint8x8x4_t
43 unpack0565 (uint16x8_t rgb)
44 {
45     uint16x8_t gb, b;
46     uint8x8x4_t res;
47
48     res.val[3] = vdup_n_u8 (0);
49     gb = vshrq_n_u16 (rgb, 5);
50     b = vshrq_n_u16 (rgb, 5 + 6);
51
52     res.val[0] = vmovn_u16 (rgb);  /* get low 5 bits */
53     res.val[1] = vmovn_u16 (gb);   /* get mid 6 bits */
54     res.val[2] = vmovn_u16 (b);    /* get top 5 bits */
55
56     res.val[0] = vshl_n_u8 (res.val[0], 3); /* shift to top */
57     res.val[1] = vshl_n_u8 (res.val[1], 2); /* shift to top */
58     res.val[2] = vshl_n_u8 (res.val[2], 3); /* shift to top */
59
60     res.val[0] = vsri_n_u8 (res.val[0], res.val[0], 5);
61     res.val[1] = vsri_n_u8 (res.val[1], res.val[1], 6);
62     res.val[2] = vsri_n_u8 (res.val[2], res.val[2], 5);
63
64     return res;
65 }
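/* Editor's note: an illustrative scalar equivalent of unpack0565() above,
 * assuming the usual r5g6b5 layout (red in bits 15-11, green in 10-5, blue
 * in 4-0).  Each field is shifted to the top of a byte and its high bits are
 * replicated into the low bits, which is what the vshl/vsri pairs do.  This
 * helper is not part of pixman; it is only a sketch for reference.
 */
static inline void
unpack0565_scalar (uint16_t rgb, uint8_t *r, uint8_t *g, uint8_t *b)
{
    uint8_t r5 = (rgb >> 11) & 0x1f;
    uint8_t g6 = (rgb >> 5)  & 0x3f;
    uint8_t b5 = rgb         & 0x1f;

    *r = (r5 << 3) | (r5 >> 2);   /* bit replication, e.g. 0x1f -> 0xff */
    *g = (g6 << 2) | (g6 >> 4);
    *b = (b5 << 3) | (b5 >> 2);
}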
66
67 #ifdef USE_GCC_INLINE_ASM
68 /* Some versions of gcc have problems with vshll_n_u8 intrinsic (Bug 23576) */
69 #define vshll_n_u8(a, n) ({ uint16x8_t r; \
70     asm ("vshll.u8 %q0, %P1, %2\n" : "=w" (r) : "w" (a), "i" (n)); r; })
71 #endif
72
73 static force_inline uint16x8_t
74 pack0565 (uint8x8x4_t s)
75 {
76     uint16x8_t rgb, val_g, val_r;
77
78     rgb = vshll_n_u8 (s.val[2], 8);
79     val_g = vshll_n_u8 (s.val[1], 8);
80     val_r = vshll_n_u8 (s.val[0], 8);
81     rgb = vsriq_n_u16 (rgb, val_g, 5);
82     rgb = vsriq_n_u16 (rgb, val_r, 5 + 6);
83
84     return rgb;
85 }
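/* Editor's note: the matching scalar sketch for pack0565() above.  The NEON
 * version widens each byte into the top of a 16-bit lane (vshll #8) and then
 * uses vsri to insert the next channel below it; per pixel this is the usual
 * truncating 8888 -> 0565 pack shown here.  Illustrative only.
 */
static inline uint16_t
pack0565_scalar (uint8_t r, uint8_t g, uint8_t b)
{
    return (uint16_t) (((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}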
86
87 static force_inline uint8x8_t
88 neon2mul (uint8x8_t x,
89           uint8x8_t alpha)
90 {
91     uint16x8_t tmp, tmp2;
92     uint8x8_t res;
93
94     tmp = vmull_u8 (x, alpha);
95     tmp2 = vrshrq_n_u16 (tmp, 8);
96     res = vraddhn_u16 (tmp, tmp2);
97
98     return res;
99 }
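/* Editor's note: neon2mul() is the usual "multiply two 8-bit values and
 * divide by 255 with rounding" building block.  With t = x * alpha, the
 * vrshr/vraddhn pair computes (t + ((t + 128) >> 8) + 128) >> 8, which for
 * 8-bit inputs matches the correctly rounded t / 255.  A scalar sketch:
 */
static inline uint8_t
mul_div_255_scalar (uint8_t x, uint8_t alpha)
{
    uint32_t t = (uint32_t) x * alpha;

    return (uint8_t) ((t + ((t + 128) >> 8) + 128) >> 8);
}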
100
101 static force_inline uint8x8x4_t
102 neon8mul (uint8x8x4_t x,
103           uint8x8_t   alpha)
104 {
105     uint16x8x4_t tmp;
106     uint8x8x4_t res;
107     uint16x8_t qtmp1, qtmp2;
108
109     tmp.val[0] = vmull_u8 (x.val[0], alpha);
110     tmp.val[1] = vmull_u8 (x.val[1], alpha);
111     tmp.val[2] = vmull_u8 (x.val[2], alpha);
112     tmp.val[3] = vmull_u8 (x.val[3], alpha);
113
114     qtmp1 = vrshrq_n_u16 (tmp.val[0], 8);
115     qtmp2 = vrshrq_n_u16 (tmp.val[1], 8);
116     res.val[0] = vraddhn_u16 (tmp.val[0], qtmp1);
117     qtmp1 = vrshrq_n_u16 (tmp.val[2], 8);
118     res.val[1] = vraddhn_u16 (tmp.val[1], qtmp2);
119     qtmp2 = vrshrq_n_u16 (tmp.val[3], 8);
120     res.val[2] = vraddhn_u16 (tmp.val[2], qtmp1);
121     res.val[3] = vraddhn_u16 (tmp.val[3], qtmp2);
122
123     return res;
124 }
125
126 static force_inline uint8x8x4_t
127 neon8qadd (uint8x8x4_t x,
128            uint8x8x4_t y)
129 {
130     uint8x8x4_t res;
131
132     res.val[0] = vqadd_u8 (x.val[0], y.val[0]);
133     res.val[1] = vqadd_u8 (x.val[1], y.val[1]);
134     res.val[2] = vqadd_u8 (x.val[2], y.val[2]);
135     res.val[3] = vqadd_u8 (x.val[3], y.val[3]);
136
137     return res;
138 }
139
140 static void
141 neon_composite_add_8000_8000 (pixman_implementation_t * impl,
142                               pixman_op_t               op,
143                               pixman_image_t *          src_image,
144                               pixman_image_t *          mask_image,
145                               pixman_image_t *          dst_image,
146                               int32_t                   src_x,
147                               int32_t                   src_y,
148                               int32_t                   mask_x,
149                               int32_t                   mask_y,
150                               int32_t                   dest_x,
151                               int32_t                   dest_y,
152                               int32_t                   width,
153                               int32_t                   height)
154 {
155     uint8_t     *dst_line, *dst;
156     uint8_t     *src_line, *src;
157     int dst_stride, src_stride;
158     uint16_t w;
159
160     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
161     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
162
163     if (width >= 8)
164     {
165         /* Use overlapping 8-pixel method */
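        /* Editor's note on the "overlapping 8-pixel" scheme used throughout
         * this file: the first vector covers pixels 0..7, then the pointers
         * advance by (width & 7), after which the remaining width is an exact
         * multiple of 8, so the first two groups overlap.  Stores are deferred
         * by one iteration through keep_dst, so every load still reads the
         * original destination data; the overlapped pixels are simply computed
         * twice from identical inputs, giving identical results.
         */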
166         while (height--)
167         {
168             uint8_t *keep_dst = 0;
169             uint8x8_t sval, dval, temp;
170
171             dst = dst_line;
172             dst_line += dst_stride;
173             src = src_line;
174             src_line += src_stride;
175             w = width;
176
177 #ifndef USE_GCC_INLINE_ASM
178             sval = vld1_u8 ((void *)src);
179             dval = vld1_u8 ((void *)dst);
180             keep_dst = dst;
181
182             temp = vqadd_u8 (dval, sval);
183
184             src += (w & 7);
185             dst += (w & 7);
186             w -= (w & 7);
187
188             while (w)
189             {
190                 sval = vld1_u8 ((void *)src);
191                 dval = vld1_u8 ((void *)dst);
192
193                 vst1_u8 ((void *)keep_dst, temp);
194                 keep_dst = dst;
195
196                 temp = vqadd_u8 (dval, sval);
197
198                 src += 8;
199                 dst += 8;
200                 w -= 8;
201             }
202
203             vst1_u8 ((void *)keep_dst, temp);
204 #else
205             asm volatile (
206 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
207                 "vld1.8  {d0}, [%[src]]\n\t"
208                 "vld1.8  {d4}, [%[dst]]\n\t"
209                 "mov     %[keep_dst], %[dst]\n\t"
210
211                 "and ip, %[w], #7\n\t"
212                 "add %[src], %[src], ip\n\t"
213                 "add %[dst], %[dst], ip\n\t"
214                 "subs %[w], %[w], ip\n\t"
215                 "b 9f\n\t"
216 /* LOOP */
217                 "2:\n\t"
218                 "vld1.8  {d0}, [%[src]]!\n\t"
219                 "vld1.8  {d4}, [%[dst]]!\n\t"
220                 "vst1.8  {d20}, [%[keep_dst]]\n\t"
221                 "sub     %[keep_dst], %[dst], #8\n\t"
222                 "subs %[w], %[w], #8\n\t"
223                 "9:\n\t"
224                 "vqadd.u8 d20, d0, d4\n\t"
225
226                 "bne 2b\n\t"
227
228                 "1:\n\t"
229                 "vst1.8  {d20}, [%[keep_dst]]\n\t"
230
231                 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
232                 :
233                 : "ip", "cc", "memory", "d0", "d4",
234                 "d20"
235                 );
236 #endif
237         }
238     }
239     else
240     {
241         const uint8_t nil = 0;
242         const uint8x8_t vnil = vld1_dup_u8 (&nil);
243
244         while (height--)
245         {
246             uint8x8_t sval = vnil, dval = vnil;
247             uint8_t *dst4 = 0, *dst2 = 0;
248
249             dst = dst_line;
250             dst_line += dst_stride;
251             src = src_line;
252             src_line += src_stride;
253             w = width;
254
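            /* Editor's note: the 1-7 pixel tail is gathered into one D
             * register: a 4-byte chunk goes into u32 lane 1 (bytes 4-7), a
             * 2-byte chunk into u16 lane 1 (bytes 2-3) and a final byte into
             * u8 lane 1 (byte 1).  The lanes are disjoint, so a single
             * vqadd_u8 processes all of them, and the matching lane stores
             * below scatter the results back out.
             */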
255             if (w & 4)
256             {
257                 sval = vreinterpret_u8_u32 (
258                     vld1_lane_u32 ((void *)src, vreinterpret_u32_u8 (sval), 1));
259                 dval = vreinterpret_u8_u32 (
260                     vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1));
261
262                 dst4 = dst;
263                 src += 4;
264                 dst += 4;
265             }
266
267             if (w & 2)
268             {
269                 sval = vreinterpret_u8_u16 (
270                     vld1_lane_u16 ((void *)src, vreinterpret_u16_u8 (sval), 1));
271                 dval = vreinterpret_u8_u16 (
272                     vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1));
273
274                 dst2 = dst;
275                 src += 2;
276                 dst += 2;
277             }
278
279             if (w & 1)
280             {
281                 sval = vld1_lane_u8 (src, sval, 1);
282                 dval = vld1_lane_u8 (dst, dval, 1);
283             }
284
285             dval = vqadd_u8 (dval, sval);
286
287             if (w & 1)
288                 vst1_lane_u8 (dst, dval, 1);
289
290             if (w & 2)
291                 vst1_lane_u16 ((void *)dst2, vreinterpret_u16_u8 (dval), 1);
292
293             if (w & 4)
294                 vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (dval), 1);
295         }
296     }
297 }
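/* Editor's note: PIXMAN_OP_ADD on a8 data, as implemented above, is just a
 * per-byte saturating add, dest = MIN (dest + src, 255); vqadd.u8 provides
 * the saturation directly and no division by 255 is involved.
 */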
298
299 static void
300 neon_composite_over_8888_8888 (pixman_implementation_t * impl,
301                                pixman_op_t               op,
302                                pixman_image_t *          src_image,
303                                pixman_image_t *          mask_image,
304                                pixman_image_t *          dst_image,
305                                int32_t                   src_x,
306                                int32_t                   src_y,
307                                int32_t                   mask_x,
308                                int32_t                   mask_y,
309                                int32_t                   dest_x,
310                                int32_t                   dest_y,
311                                int32_t                   width,
312                                int32_t                   height)
313 {
314     uint32_t    *dst_line, *dst;
315     uint32_t    *src_line, *src;
316     int dst_stride, src_stride;
317     uint32_t w;
318
319     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
320     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
321
322     if (width >= 8)
323     {
324         /* Use overlapping 8-pixel method */
325         while (height--)
326         {
327             uint32_t *keep_dst = 0;
328             uint8x8x4_t sval, dval, temp;
329
330             dst = dst_line;
331             dst_line += dst_stride;
332             src = src_line;
333             src_line += src_stride;
334             w = width;
335
336 #ifndef USE_GCC_INLINE_ASM
337             sval = vld4_u8 ((void *)src);
338             dval = vld4_u8 ((void *)dst);
339             keep_dst = dst;
340
341             temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
342             temp = neon8qadd (sval, temp);
343
344             src += (w & 7);
345             dst += (w & 7);
346             w -= (w & 7);
347
348             while (w)
349             {
350                 sval = vld4_u8 ((void *)src);
351                 dval = vld4_u8 ((void *)dst);
352
353                 vst4_u8 ((void *)keep_dst, temp);
354                 keep_dst = dst;
355
356                 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
357                 temp = neon8qadd (sval, temp);
358
359                 src += 8;
360                 dst += 8;
361                 w -= 8;
362             }
363
364             vst4_u8 ((void *)keep_dst, temp);
365 #else
366             asm volatile (
367 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
368                 "vld4.8  {d0-d3}, [%[src]]\n\t"
369                 "vld4.8  {d4-d7}, [%[dst]]\n\t"
370                 "mov     %[keep_dst], %[dst]\n\t"
371
372                 "and ip, %[w], #7\n\t"
373                 "add %[src], %[src], ip, LSL#2\n\t"
374                 "add %[dst], %[dst], ip, LSL#2\n\t"
375                 "subs %[w], %[w], ip\n\t"
376                 "b 9f\n\t"
377 /* LOOP */
378                 "2:\n\t"
379                 "vld4.8  {d0-d3}, [%[src]]!\n\t"
380                 "vld4.8  {d4-d7}, [%[dst]]!\n\t"
381                 "vst4.8  {d20-d23}, [%[keep_dst]]\n\t"
382                 "sub     %[keep_dst], %[dst], #8*4\n\t"
383                 "subs %[w], %[w], #8\n\t"
384                 "9:\n\t"
385                 "vmvn.8  d31, d3\n\t"
386                 "vmull.u8 q10, d31, d4\n\t"
387                 "vmull.u8 q11, d31, d5\n\t"
388                 "vmull.u8 q12, d31, d6\n\t"
389                 "vmull.u8 q13, d31, d7\n\t"
390                 "vrshr.u16 q8, q10, #8\n\t"
391                 "vrshr.u16 q9, q11, #8\n\t"
392                 "vraddhn.u16 d20, q10, q8\n\t"
393                 "vraddhn.u16 d21, q11, q9\n\t"
394                 "vrshr.u16 q8, q12, #8\n\t"
395                 "vrshr.u16 q9, q13, #8\n\t"
396                 "vraddhn.u16 d22, q12, q8\n\t"
397                 "vraddhn.u16 d23, q13, q9\n\t"
398 /* result in d20-d23 */
399                 "vqadd.u8 d20, d0, d20\n\t"
400                 "vqadd.u8 d21, d1, d21\n\t"
401                 "vqadd.u8 d22, d2, d22\n\t"
402                 "vqadd.u8 d23, d3, d23\n\t"
403
404                 "bne 2b\n\t"
405
406                 "1:\n\t"
407                 "vst4.8  {d20-d23}, [%[keep_dst]]\n\t"
408
409                 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
410                 :
411                 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
412                 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23"
413                 );
414 #endif
415         }
416     }
417     else
418     {
419         uint8x8_t alpha_selector = vreinterpret_u8_u64 (
420             vcreate_u64 (0x0707070703030303ULL));
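        /* Editor's note: with two a8r8g8b8 pixels packed into one D register,
         * byte 3 holds the first pixel's alpha and byte 7 the second's.  Used
         * as a vtbl1_u8 index vector, alpha_selector broadcasts each
         * (inverted) alpha across its own pixel's four bytes, so one
         * neon2mul() call scales both pixels by their respective alphas.
         */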
421
422         /* Handle width < 8 */
423         while (height--)
424         {
425             dst = dst_line;
426             dst_line += dst_stride;
427             src = src_line;
428             src_line += src_stride;
429             w = width;
430
431             while (w >= 2)
432             {
433                 uint8x8_t sval, dval;
434
435                 /* two 32-bit pixels packed into D-reg; ad-hoc vectorization */
436                 sval = vreinterpret_u8_u32 (vld1_u32 ((void *)src));
437                 dval = vreinterpret_u8_u32 (vld1_u32 ((void *)dst));
438                 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
439                 vst1_u8 ((void *)dst, vqadd_u8 (sval, dval));
440
441                 src += 2;
442                 dst += 2;
443                 w -= 2;
444             }
445
446             if (w)
447             {
448                 uint8x8_t sval, dval;
449
450                 /* single 32-bit pixel in lane 0 */
451                 sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)src));  /* only interested in lane 0 */
452                 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst));  /* only interested in lane 0 */
453                 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
454                 vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
455             }
456         }
457     }
458 }
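/* Editor's note: neon_composite_over_8888_8888() above is plain premultiplied
 * Porter-Duff OVER, applied per channel with the helpers defined earlier:
 *
 *     dest = src + dest * (255 - src_alpha) / 255      (saturating add)
 *
 * The *_n_* variants that follow perform the same blend with the source first
 * multiplied by a constant or per-pixel a8 mask alpha.
 */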
459
460 static void
461 neon_composite_over_8888_n_8888 (pixman_implementation_t * impl,
462                                  pixman_op_t               op,
463                                  pixman_image_t *          src_image,
464                                  pixman_image_t *          mask_image,
465                                  pixman_image_t *          dst_image,
466                                  int32_t                   src_x,
467                                  int32_t                   src_y,
468                                  int32_t                   mask_x,
469                                  int32_t                   mask_y,
470                                  int32_t                   dest_x,
471                                  int32_t                   dest_y,
472                                  int32_t                   width,
473                                  int32_t                   height)
474 {
475     uint32_t    *dst_line, *dst;
476     uint32_t    *src_line, *src;
477     uint32_t mask;
478     int dst_stride, src_stride;
479     uint32_t w;
480     uint8x8_t mask_alpha;
481
482     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
483     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
484
485     mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
486     mask_alpha = vdup_n_u8 ((mask) >> 24);
487
488     if (width >= 8)
489     {
490         /* Use overlapping 8-pixel method */
491         while (height--)
492         {
493             dst = dst_line;
494             dst_line += dst_stride;
495             src = src_line;
496             src_line += src_stride;
497             w = width;
498
499             uint32_t *keep_dst = 0;
500
501 #ifndef USE_GCC_INLINE_ASM
502             uint8x8x4_t sval, dval, temp;
503
504             sval = vld4_u8 ((void *)src);
505             dval = vld4_u8 ((void *)dst);
506             keep_dst = dst;
507
508             sval = neon8mul (sval, mask_alpha);
509             temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
510             temp = neon8qadd (sval, temp);
511
512             src += (w & 7);
513             dst += (w & 7);
514             w -= (w & 7);
515
516             while (w)
517             {
518                 sval = vld4_u8 ((void *)src);
519                 dval = vld4_u8 ((void *)dst);
520
521                 vst4_u8 ((void *)keep_dst, temp);
522                 keep_dst = dst;
523
524                 sval = neon8mul (sval, mask_alpha);
525                 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
526                 temp = neon8qadd (sval, temp);
527
528                 src += 8;
529                 dst += 8;
530                 w -= 8;
531             }
532             vst4_u8 ((void *)keep_dst, temp);
533 #else
534             asm volatile (
535 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
536                 "vdup.32      d30, %[mask]\n\t"
537                 "vdup.8       d30, d30[3]\n\t"
538
539                 "vld4.8       {d0-d3}, [%[src]]\n\t"
540                 "vld4.8       {d4-d7}, [%[dst]]\n\t"
541                 "mov  %[keep_dst], %[dst]\n\t"
542
543                 "and  ip, %[w], #7\n\t"
544                 "add  %[src], %[src], ip, LSL#2\n\t"
545                 "add  %[dst], %[dst], ip, LSL#2\n\t"
546                 "subs  %[w], %[w], ip\n\t"
547                 "b 9f\n\t"
548 /* LOOP */
549                 "2:\n\t"
550                 "vld4.8       {d0-d3}, [%[src]]!\n\t"
551                 "vld4.8       {d4-d7}, [%[dst]]!\n\t"
552                 "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
553                 "sub  %[keep_dst], %[dst], #8*4\n\t"
554                 "subs  %[w], %[w], #8\n\t"
555
556                 "9:\n\t"
557                 "vmull.u8     q10, d30, d0\n\t"
558                 "vmull.u8     q11, d30, d1\n\t"
559                 "vmull.u8     q12, d30, d2\n\t"
560                 "vmull.u8     q13, d30, d3\n\t"
561                 "vrshr.u16    q8, q10, #8\n\t"
562                 "vrshr.u16    q9, q11, #8\n\t"
563                 "vraddhn.u16  d0, q10, q8\n\t"
564                 "vraddhn.u16  d1, q11, q9\n\t"
565                 "vrshr.u16    q9, q13, #8\n\t"
566                 "vrshr.u16    q8, q12, #8\n\t"
567                 "vraddhn.u16  d3, q13, q9\n\t"
568                 "vraddhn.u16  d2, q12, q8\n\t"
569
570                 "vmvn.8       d31, d3\n\t"
571                 "vmull.u8     q10, d31, d4\n\t"
572                 "vmull.u8     q11, d31, d5\n\t"
573                 "vmull.u8     q12, d31, d6\n\t"
574                 "vmull.u8     q13, d31, d7\n\t"
575                 "vrshr.u16    q8, q10, #8\n\t"
576                 "vrshr.u16    q9, q11, #8\n\t"
577                 "vraddhn.u16  d20, q10, q8\n\t"
578                 "vrshr.u16    q8, q12, #8\n\t"
579                 "vraddhn.u16  d21, q11, q9\n\t"
580                 "vrshr.u16    q9, q13, #8\n\t"
581                 "vraddhn.u16  d22, q12, q8\n\t"
582                 "vraddhn.u16  d23, q13, q9\n\t"
583
584 /* result in d20-d23 */
585                 "vqadd.u8     d20, d0, d20\n\t"
586                 "vqadd.u8     d21, d1, d21\n\t"
587                 "vqadd.u8     d22, d2, d22\n\t"
588                 "vqadd.u8     d23, d3, d23\n\t"
589
590                 "bne  2b\n\t"
591
592                 "1:\n\t"
593                 "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
594
595                 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
596                 : [mask] "r" (mask)
597                 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
598                 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
599                 "d30", "d31"
600                 );
601 #endif
602         }
603     }
604     else
605     {
606         uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));
607
608         /* Handle width < 8 */
609         while (height--)
610         {
611             dst = dst_line;
612             dst_line += dst_stride;
613             src = src_line;
614             src_line += src_stride;
615             w = width;
616
617             while (w >= 2)
618             {
619                 uint8x8_t sval, dval;
620
621                 sval = vreinterpret_u8_u32 (vld1_u32 ((void *)src));
622                 dval = vreinterpret_u8_u32 (vld1_u32 ((void *)dst));
623
624                 /* sval * const alpha_mul */
625                 sval = neon2mul (sval, mask_alpha);
626
627                 /* dval * 255-(src alpha) */
628                 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
629
630                 vst1_u8 ((void *)dst, vqadd_u8 (sval, dval));
631
632                 src += 2;
633                 dst += 2;
634                 w -= 2;
635             }
636
637             if (w)
638             {
639                 uint8x8_t sval, dval;
640
641                 sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)src));
642                 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst));
643
644                 /* sval * const alpha_mul */
645                 sval = neon2mul (sval, mask_alpha);
646
647                 /* dval * 255-(src alpha) */
648                 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
649
650                 vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
651             }
652         }
653     }
654 }
655
656 static void
657 neon_composite_over_n_8_0565 (pixman_implementation_t * impl,
658                               pixman_op_t               op,
659                               pixman_image_t *          src_image,
660                               pixman_image_t *          mask_image,
661                               pixman_image_t *          dst_image,
662                               int32_t                   src_x,
663                               int32_t                   src_y,
664                               int32_t                   mask_x,
665                               int32_t                   mask_y,
666                               int32_t                   dest_x,
667                               int32_t                   dest_y,
668                               int32_t                   width,
669                               int32_t                   height)
670 {
671     uint32_t     src, srca;
672     uint16_t    *dst_line, *dst;
673     uint8_t     *mask_line, *mask;
674     int          dst_stride, mask_stride;
675     uint32_t     w;
676     uint8x8_t    sval2;
677     uint8x8x4_t  sval8;
678
679     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
680
681     srca = src >> 24;
682     if (src == 0)
683         return;
684
685     sval2 = vreinterpret_u8_u32 (vdup_n_u32 (src));
686     sval8.val[0] = vdup_lane_u8 (sval2, 0);
687     sval8.val[1] = vdup_lane_u8 (sval2, 1);
688     sval8.val[2] = vdup_lane_u8 (sval2, 2);
689     sval8.val[3] = vdup_lane_u8 (sval2, 3);
690
691     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
692     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
693
694     if (width >= 8)
695     {
696         /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
697         while (height--)
698         {
699             uint16_t *keep_dst = 0;
700
701             dst = dst_line;
702             dst_line += dst_stride;
703             mask = mask_line;
704             mask_line += mask_stride;
705             w = width;
706
707 #ifndef USE_GCC_INLINE_ASM
708             uint8x8_t alpha;
709             uint16x8_t dval, temp;
710             uint8x8x4_t sval8temp;
711
712             alpha = vld1_u8 ((void *)mask);
713             dval = vld1q_u16 ((void *)dst);
714             keep_dst = dst;
715
716             sval8temp = neon8mul (sval8, alpha);
717             temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
718
719             mask += (w & 7);
720             dst += (w & 7);
721             w -= (w & 7);
722
723             while (w)
724             {
725                 dval = vld1q_u16 ((void *)dst);
726                 alpha = vld1_u8 ((void *)mask);
727
728                 vst1q_u16 ((void *)keep_dst, temp);
729                 keep_dst = dst;
730
731                 sval8temp = neon8mul (sval8, alpha);
732                 temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
733
734                 mask += 8;
735                 dst += 8;
736                 w -= 8;
737             }
738             vst1q_u16 ((void *)keep_dst, temp);
739 #else
740             asm volatile (
741                 "vdup.32      d0, %[src]\n\t"
742                 "vdup.8       d1, d0[1]\n\t"
743                 "vdup.8       d2, d0[2]\n\t"
744                 "vdup.8       d3, d0[3]\n\t"
745                 "vdup.8       d0, d0[0]\n\t"
746
747                 "vld1.8       {q12}, [%[dst]]\n\t"
748                 "vld1.8       {d31}, [%[mask]]\n\t"
749                 "mov  %[keep_dst], %[dst]\n\t"
750
751                 "and  ip, %[w], #7\n\t"
752                 "add  %[mask], %[mask], ip\n\t"
753                 "add  %[dst], %[dst], ip, LSL#1\n\t"
754                 "subs  %[w], %[w], ip\n\t"
755                 "b  9f\n\t"
756 /* LOOP */
757                 "2:\n\t"
758
759                 "vld1.16      {q12}, [%[dst]]!\n\t"
760                 "vld1.8       {d31}, [%[mask]]!\n\t"
761                 "vst1.16      {q10}, [%[keep_dst]]\n\t"
762                 "sub  %[keep_dst], %[dst], #8*2\n\t"
763                 "subs  %[w], %[w], #8\n\t"
764                 "9:\n\t"
765 /* expand 0565 q12 to 8888 {d4-d7} */
766                 "vmovn.u16    d4, q12\t\n"
767                 "vshr.u16     q11, q12, #5\t\n"
768                 "vshr.u16     q10, q12, #6+5\t\n"
769                 "vmovn.u16    d5, q11\t\n"
770                 "vmovn.u16    d6, q10\t\n"
771                 "vshl.u8      d4, d4, #3\t\n"
772                 "vshl.u8      d5, d5, #2\t\n"
773                 "vshl.u8      d6, d6, #3\t\n"
774                 "vsri.u8      d4, d4, #5\t\n"
775                 "vsri.u8      d5, d5, #6\t\n"
776                 "vsri.u8      d6, d6, #5\t\n"
777
778                 "vmull.u8     q10, d31, d0\n\t"
779                 "vmull.u8     q11, d31, d1\n\t"
780                 "vmull.u8     q12, d31, d2\n\t"
781                 "vmull.u8     q13, d31, d3\n\t"
782                 "vrshr.u16    q8, q10, #8\n\t"
783                 "vrshr.u16    q9, q11, #8\n\t"
784                 "vraddhn.u16  d20, q10, q8\n\t"
785                 "vraddhn.u16  d21, q11, q9\n\t"
786                 "vrshr.u16    q9, q13, #8\n\t"
787                 "vrshr.u16    q8, q12, #8\n\t"
788                 "vraddhn.u16  d23, q13, q9\n\t"
789                 "vraddhn.u16  d22, q12, q8\n\t"
790
791 /* this block is duplicated in the 4/2/1 and 8-pixel versions */
792                 "vmvn.8       d30, d23\n\t"
793                 "vmull.u8     q14, d30, d6\n\t"
794                 "vmull.u8     q13, d30, d5\n\t"
795                 "vmull.u8     q12, d30, d4\n\t"
796                 "vrshr.u16    q8, q14, #8\n\t"
797                 "vrshr.u16    q9, q13, #8\n\t"
798                 "vraddhn.u16  d6, q14, q8\n\t"
799                 "vrshr.u16    q8, q12, #8\n\t"
800                 "vraddhn.u16  d5, q13, q9\n\t"
801                 "vqadd.u8     d6, d6, d22\n\t"  /* moved up */
802                 "vraddhn.u16  d4, q12, q8\n\t"
803 /* intentionally don't calculate alpha */
804 /* result in d4-d6 */
805
806 /*              "vqadd.u8     d6, d6, d22\n\t"  ** moved up */
807                 "vqadd.u8     d5, d5, d21\n\t"
808                 "vqadd.u8     d4, d4, d20\n\t"
809
810 /* pack 8888 {d4-d6} to 0565 q10 */
811                 "vshll.u8     q10, d6, #8\n\t"
812                 "vshll.u8     q3, d5, #8\n\t"
813                 "vshll.u8     q2, d4, #8\n\t"
814                 "vsri.u16     q10, q3, #5\t\n"
815                 "vsri.u16     q10, q2, #11\t\n"
816
817                 "bne 2b\n\t"
818
819                 "1:\n\t"
820                 "vst1.16      {q10}, [%[keep_dst]]\n\t"
821
822                 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
823                 : [src] "r" (src)
824                 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
825                   "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
826                   "d30","d31"
827                 );
828 #endif
829         }
830     }
831     else
832     {
833         while (height--)
834         {
835             void *dst4 = 0, *dst2 = 0;
836
837             dst = dst_line;
838             dst_line += dst_stride;
839             mask = mask_line;
840             mask_line += mask_stride;
841             w = width;
842
843
844 #if 1 /* #ifndef USE_GCC_INLINE_ASM */
845             uint8x8_t alpha;
846             uint16x8_t dval, temp;
847             uint8x8x4_t sval8temp;
848
849             if (w & 4)
850             {
851                 alpha = vreinterpret_u8_u32 (vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (alpha), 1));
852                 dval = vreinterpretq_u16_u64 (vld1q_lane_u64 ((void *)dst, vreinterpretq_u64_u16 (dval), 1));
853                 dst4 = dst;
854                 mask += 4;
855                 dst += 4;
856             }
857             if (w & 2)
858             {
859                 alpha = vreinterpret_u8_u16 (vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (alpha), 1));
860                 dval = vreinterpretq_u16_u32 (vld1q_lane_u32 ((void *)dst, vreinterpretq_u32_u16 (dval), 1));
861                 dst2 = dst;
862                 mask += 2;
863                 dst += 2;
864             }
865             if (w & 1)
866             {
867                 alpha = vld1_lane_u8 ((void *)mask, alpha, 1);
868                 dval = vld1q_lane_u16 ((void *)dst, dval, 1);
869             }
870
871             sval8temp = neon8mul (sval8, alpha);
872             temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
873
874             if (w & 1)
875                 vst1q_lane_u16 ((void *)dst, temp, 1);
876             if (w & 2)
877                 vst1q_lane_u32 ((void *)dst2, vreinterpretq_u32_u16 (temp), 1);
878             if (w & 4)
879                 vst1q_lane_u64 ((void *)dst4, vreinterpretq_u64_u16 (temp), 1);
880 #else
881             /* this code has a bug (it does not pass blitters-test) */
882             asm volatile (
883                 "vdup.32      d0, %[src]\n\t"
884                 "vdup.8       d1, d0[1]\n\t"
885                 "vdup.8       d2, d0[2]\n\t"
886                 "vdup.8       d3, d0[3]\n\t"
887                 "vdup.8       d0, d0[0]\n\t"
888
889                 "tst  %[w], #4\t\n"
890                 "beq  skip_load4\t\n"
891
892                 "vld1.64      {d25}, [%[dst]]\n\t"
893                 "vld1.32      {d31[1]}, [%[mask]]\n\t"
894                 "mov  %[dst4], %[dst]\t\n"
895                 "add  %[mask], %[mask], #4\t\n"
896                 "add  %[dst], %[dst], #4*2\t\n"
897
898                 "skip_load4:\t\n"
899                 "tst  %[w], #2\t\n"
900                 "beq  skip_load2\t\n"
901                 "vld1.32      {d24[1]}, [%[dst]]\n\t"
902                 "vld1.16      {d31[1]}, [%[mask]]\n\t"
903                 "mov  %[dst2], %[dst]\t\n"
904                 "add  %[mask], %[mask], #2\t\n"
905                 "add  %[dst], %[dst], #2*2\t\n"
906
907                 "skip_load2:\t\n"
908                 "tst  %[w], #1\t\n"
909                 "beq  skip_load1\t\n"
910                 "vld1.16      {d24[1]}, [%[dst]]\n\t"
911                 "vld1.8       {d31[1]}, [%[mask]]\n\t"
912
913                 "skip_load1:\t\n"
914 /* expand 0565 q12 to 8888 {d4-d7} */
915                 "vmovn.u16    d4, q12\t\n"
916                 "vshr.u16     q11, q12, #5\t\n"
917                 "vshr.u16     q10, q12, #6+5\t\n"
918                 "vmovn.u16    d5, q11\t\n"
919                 "vmovn.u16    d6, q10\t\n"
920                 "vshl.u8      d4, d4, #3\t\n"
921                 "vshl.u8      d5, d5, #2\t\n"
922                 "vshl.u8      d6, d6, #3\t\n"
923                 "vsri.u8      d4, d4, #5\t\n"
924                 "vsri.u8      d5, d5, #6\t\n"
925                 "vsri.u8      d6, d6, #5\t\n"
926
927                 "vmull.u8     q10, d31, d0\n\t"
928                 "vmull.u8     q11, d31, d1\n\t"
929                 "vmull.u8     q12, d31, d2\n\t"
930                 "vmull.u8     q13, d31, d3\n\t"
931                 "vrshr.u16    q8, q10, #8\n\t"
932                 "vrshr.u16    q9, q11, #8\n\t"
933                 "vraddhn.u16  d20, q10, q8\n\t"
934                 "vraddhn.u16  d21, q11, q9\n\t"
935                 "vrshr.u16    q9, q13, #8\n\t"
936                 "vrshr.u16    q8, q12, #8\n\t"
937                 "vraddhn.u16  d23, q13, q9\n\t"
938                 "vraddhn.u16  d22, q12, q8\n\t"
939
940 /* this block is duplicated in the 4/2/1 and 8-pixel versions */
941                 "vmvn.8       d30, d23\n\t"
942                 "vmull.u8     q14, d30, d6\n\t"
943                 "vmull.u8     q13, d30, d5\n\t"
944                 "vmull.u8     q12, d30, d4\n\t"
945                 "vrshr.u16    q8, q14, #8\n\t"
946                 "vrshr.u16    q9, q13, #8\n\t"
947                 "vraddhn.u16  d6, q14, q8\n\t"
948                 "vrshr.u16    q8, q12, #8\n\t"
949                 "vraddhn.u16  d5, q13, q9\n\t"
950                 "vqadd.u8     d6, d6, d22\n\t"  /* moved up */
951                 "vraddhn.u16  d4, q12, q8\n\t"
952 /* intentionally don't calculate alpha */
953 /* result in d4-d6 */
954
955 /*              "vqadd.u8     d6, d6, d22\n\t"  ** moved up */
956                 "vqadd.u8     d5, d5, d21\n\t"
957                 "vqadd.u8     d4, d4, d20\n\t"
958
959 /* pack 8888 {d4-d6} to 0565 q10 */
960                 "vshll.u8     q10, d6, #8\n\t"
961                 "vshll.u8     q3, d5, #8\n\t"
962                 "vshll.u8     q2, d4, #8\n\t"
963                 "vsri.u16     q10, q3, #5\t\n"
964                 "vsri.u16     q10, q2, #11\t\n"
965
966                 "tst  %[w], #1\n\t"
967                 "beq skip_store1\t\n"
968                 "vst1.16      {d20[1]}, [%[dst]]\t\n"
969                 "skip_store1:\t\n"
970                 "tst  %[w], #2\n\t"
971                 "beq  skip_store2\t\n"
972                 "vst1.32      {d20[1]}, [%[dst2]]\t\n"
973                 "skip_store2:\t\n"
974                 "tst  %[w], #4\n\t"
975                 "beq skip_store4\t\n"
976                 "vst1.16      {d21}, [%[dst4]]\t\n"
977                 "skip_store4:\t\n"
978
979                 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [dst4] "+r" (dst4), [dst2] "+r" (dst2)
980                 : [src] "r" (src)
981                 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
982                   "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
983                   "d30","d31"
984                 );
985 #endif
986         }
987     }
988 }
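/* Editor's note: with an r5g6b5 destination the blend itself still happens at
 * 8 bits per channel: each group of destination pixels is widened with
 * unpack0565(), composited (solid source, in'd with the a8 mask, OVER dest)
 * using the same neon8mul/neon8qadd helpers, and narrowed back with
 * pack0565().  Only the load/store format is 16-bit.
 */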
989
990 static void
991 neon_composite_over_n_8_8888 (pixman_implementation_t * impl,
992                               pixman_op_t               op,
993                               pixman_image_t *          src_image,
994                               pixman_image_t *          mask_image,
995                               pixman_image_t *          dst_image,
996                               int32_t                   src_x,
997                               int32_t                   src_y,
998                               int32_t                   mask_x,
999                               int32_t                   mask_y,
1000                               int32_t                   dest_x,
1001                               int32_t                   dest_y,
1002                               int32_t                   width,
1003                               int32_t                   height)
1004 {
1005     uint32_t src, srca;
1006     uint32_t    *dst_line, *dst;
1007     uint8_t     *mask_line, *mask;
1008     int dst_stride, mask_stride;
1009     uint32_t w;
1010     uint8x8_t sval2;
1011     uint8x8x4_t sval8;
1012     uint8x8_t mask_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0101010100000000ULL));
1013     uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));
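    /* Editor's note: mask_selector is a vtbl1_u8 index vector.  After
     * vld1_dup_u16() fills a D register with a pair of mask bytes, it
     * replicates mask byte 0 into lanes 0-3 and mask byte 1 into lanes 4-7,
     * giving a per-channel alpha for two pixels at once.  alpha_selector then
     * plays the same role for the computed source alpha, as in the other
     * "over" routines above.
     */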
1014
1015     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
1016     
1017     /* bail out if fully transparent */
1018     srca = src >> 24;
1019     if (src == 0)
1020         return;
1021
1022     sval2 = vreinterpret_u8_u32 (vdup_n_u32 (src));
1023     sval8.val[0] = vdup_lane_u8 (sval2, 0);
1024     sval8.val[1] = vdup_lane_u8 (sval2, 1);
1025     sval8.val[2] = vdup_lane_u8 (sval2, 2);
1026     sval8.val[3] = vdup_lane_u8 (sval2, 3);
1027
1028     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1029     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1030
1031     if (width >= 8)
1032     {
1033         /* Use overlapping 8-pixel method, modified to avoid
1034          * rewritten dest being reused
1035          */
1036         while (height--)
1037         {
1038             uint32_t *keep_dst = 0;
1039
1040             dst = dst_line;
1041             dst_line += dst_stride;
1042             mask = mask_line;
1043             mask_line += mask_stride;
1044             w = width;
1045
1046 #ifndef USE_GCC_INLINE_ASM
1047             uint8x8_t alpha;
1048             uint8x8x4_t dval, temp;
1049
1050             alpha = vld1_u8 ((void *)mask);
1051             dval = vld4_u8 ((void *)dst);
1052             keep_dst = dst;
1053
1054             temp = neon8mul (sval8, alpha);
1055             dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
1056             temp = neon8qadd (temp, dval);
1057
1058             mask += (w & 7);
1059             dst += (w & 7);
1060             w -= (w & 7);
1061
1062             while (w)
1063             {
1064                 alpha = vld1_u8 ((void *)mask);
1065                 dval = vld4_u8 ((void *)dst);
1066
1067                 vst4_u8 ((void *)keep_dst, temp);
1068                 keep_dst = dst;
1069
1070                 temp = neon8mul (sval8, alpha);
1071                 dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
1072                 temp = neon8qadd (temp, dval);
1073
1074                 mask += 8;
1075                 dst += 8;
1076                 w -= 8;
1077             }
1078             vst4_u8 ((void *)keep_dst, temp);
1079 #else
1080             asm volatile (
1081                 "vdup.32      d0, %[src]\n\t"
1082                 "vdup.8       d1, d0[1]\n\t"
1083                 "vdup.8       d2, d0[2]\n\t"
1084                 "vdup.8       d3, d0[3]\n\t"
1085                 "vdup.8       d0, d0[0]\n\t"
1086
1087                 "vld4.8       {d4-d7}, [%[dst]]\n\t"
1088                 "vld1.8       {d31}, [%[mask]]\n\t"
1089                 "mov  %[keep_dst], %[dst]\n\t"
1090
1091                 "and  ip, %[w], #7\n\t"
1092                 "add  %[mask], %[mask], ip\n\t"
1093                 "add  %[dst], %[dst], ip, LSL#2\n\t"
1094                 "subs  %[w], %[w], ip\n\t"
1095                 "b 9f\n\t"
1096 /* LOOP */
1097                 "2:\n\t"
1098                 "vld4.8       {d4-d7}, [%[dst]]!\n\t"
1099                 "vld1.8       {d31}, [%[mask]]!\n\t"
1100                 "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
1101                 "sub  %[keep_dst], %[dst], #8*4\n\t"
1102                 "subs  %[w], %[w], #8\n\t"
1103                 "9:\n\t"
1104
1105                 "vmull.u8     q10, d31, d0\n\t"
1106                 "vmull.u8     q11, d31, d1\n\t"
1107                 "vmull.u8     q12, d31, d2\n\t"
1108                 "vmull.u8     q13, d31, d3\n\t"
1109                 "vrshr.u16    q8, q10, #8\n\t"
1110                 "vrshr.u16    q9, q11, #8\n\t"
1111                 "vraddhn.u16  d20, q10, q8\n\t"
1112                 "vraddhn.u16  d21, q11, q9\n\t"
1113                 "vrshr.u16    q9, q13, #8\n\t"
1114                 "vrshr.u16    q8, q12, #8\n\t"
1115                 "vraddhn.u16  d23, q13, q9\n\t"
1116                 "vraddhn.u16  d22, q12, q8\n\t"
1117
1118                 "vmvn.8       d30, d23\n\t"
1119                 "vmull.u8     q12, d30, d4\n\t"
1120                 "vmull.u8     q13, d30, d5\n\t"
1121                 "vmull.u8     q14, d30, d6\n\t"
1122                 "vmull.u8     q15, d30, d7\n\t"
1123
1124                 "vrshr.u16    q8, q12, #8\n\t"
1125                 "vrshr.u16    q9, q13, #8\n\t"
1126                 "vraddhn.u16  d4, q12, q8\n\t"
1127                 "vrshr.u16    q8, q14, #8\n\t"
1128                 "vraddhn.u16  d5, q13, q9\n\t"
1129                 "vrshr.u16    q9, q15, #8\n\t"
1130                 "vraddhn.u16  d6, q14, q8\n\t"
1131                 "vraddhn.u16  d7, q15, q9\n\t"
1132 /* result in d4-d7 */
1133
1134                 "vqadd.u8     d20, d4, d20\n\t"
1135                 "vqadd.u8     d21, d5, d21\n\t"
1136                 "vqadd.u8     d22, d6, d22\n\t"
1137                 "vqadd.u8     d23, d7, d23\n\t"
1138
1139                 "bne 2b\n\t"
1140
1141                 "1:\n\t"
1142                 "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
1143
1144                 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
1145                 : [src] "r" (src)
1146                 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1147                 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
1148                 "d30", "d31"
1149                 );
1150 #endif
1151         }
1152     }
1153     else
1154     {
1155         while (height--)
1156         {
1157             uint8x8_t alpha;
1158
1159             dst = dst_line;
1160             dst_line += dst_stride;
1161             mask = mask_line;
1162             mask_line += mask_stride;
1163             w = width;
1164
1165             while (w >= 2)
1166             {
1167                 uint8x8_t dval, temp, res;
1168
1169                 alpha = vtbl1_u8 (
1170                     vreinterpret_u8_u16 (vld1_dup_u16 ((void *)mask)), mask_selector);
1171                 dval = vld1_u8 ((void *)dst);
1172
1173                 temp = neon2mul (sval2, alpha);
1174                 res = vqadd_u8 (
1175                     temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));
1176
1177                 vst1_u8 ((void *)dst, res);
1178
1179                 mask += 2;
1180                 dst += 2;
1181                 w -= 2;
1182             }
1183
1184             if (w)
1185             {
1186                 uint8x8_t dval, temp, res;
1187
1188                 alpha = vtbl1_u8 (vld1_dup_u8 ((void *)mask), mask_selector);
1189                 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst));
1190
1191                 temp = neon2mul (sval2, alpha);
1192                 res = vqadd_u8 (
1193                     temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));
1194
1195                 vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (res), 0);
1196             }
1197         }
1198     }
1199 }
1200
1201 static void
1202 neon_composite_add_n_8_8 (pixman_implementation_t * impl,
1203                           pixman_op_t               op,
1204                           pixman_image_t *          src_image,
1205                           pixman_image_t *          mask_image,
1206                           pixman_image_t *          dst_image,
1207                           int32_t                   src_x,
1208                           int32_t                   src_y,
1209                           int32_t                   mask_x,
1210                           int32_t                   mask_y,
1211                           int32_t                   dest_x,
1212                           int32_t                   dest_y,
1213                           int32_t                   width,
1214                           int32_t                   height)
1215 {
1216     uint8_t     *dst_line, *dst;
1217     uint8_t     *mask_line, *mask;
1218     int dst_stride, mask_stride;
1219     uint32_t w;
1220     uint32_t src;
1221     uint8x8_t sa;
1222
1223     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
1224     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1225     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
1226     sa = vdup_n_u8 ((src) >> 24);
1227
1228     if (width >= 8)
1229     {
1230         /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
1231         while (height--)
1232         {
1233             dst = dst_line;
1234             dst_line += dst_stride;
1235             mask = mask_line;
1236             mask_line += mask_stride;
1237             w = width;
1238
1239             uint8x8_t mval, dval, res;
1240             uint8_t     *keep_dst;
1241
1242             mval = vld1_u8 ((void *)mask);
1243             dval = vld1_u8 ((void *)dst);
1244             keep_dst = dst;
1245
1246             res = vqadd_u8 (neon2mul (mval, sa), dval);
1247
1248             mask += (w & 7);
1249             dst += (w & 7);
1250             w -= w & 7;
1251
1252             while (w)
1253             {
1254                 mval = vld1_u8 ((void *)mask);
1255                 dval = vld1_u8 ((void *)dst);
1256                 vst1_u8 ((void *)keep_dst, res);
1257                 keep_dst = dst;
1258
1259                 res = vqadd_u8 (neon2mul (mval, sa), dval);
1260
1261                 mask += 8;
1262                 dst += 8;
1263                 w -= 8;
1264             }
1265             vst1_u8 ((void *)keep_dst, res);
1266         }
1267     }
1268     else
1269     {
1270         /* Use 4/2/1 load/store method to handle 1-7 pixels */
1271         while (height--)
1272         {
1273             dst = dst_line;
1274             dst_line += dst_stride;
1275             mask = mask_line;
1276             mask_line += mask_stride;
1277             w = width;
1278
1279             uint8x8_t mval = sa, dval = sa, res;
1280             uint8_t *dst4 = 0, *dst2 = 0;
1281
1282             if (w & 4)
1283             {
1284                 mval = vreinterpret_u8_u32 (
1285                     vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (mval), 1));
1286                 dval = vreinterpret_u8_u32 (
1287                     vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1));
1288
1289                 dst4 = dst;
1290                 mask += 4;
1291                 dst += 4;
1292             }
1293
1294             if (w & 2)
1295             {
1296                 mval = vreinterpret_u8_u16 (
1297                     vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (mval), 1));
1298                 dval = vreinterpret_u8_u16 (
1299                     vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1));
1300                 dst2 = dst;
1301                 mask += 2;
1302                 dst += 2;
1303             }
1304
1305             if (w & 1)
1306             {
1307                 mval = vld1_lane_u8 (mask, mval, 1);
1308                 dval = vld1_lane_u8 (dst, dval, 1);
1309             }
1310
1311             res = vqadd_u8 (neon2mul (mval, sa), dval);
1312
1313             if (w & 1)
1314                 vst1_lane_u8 (dst, res, 1);
1315             if (w & 2)
1316                 vst1_lane_u16 ((void *)dst2, vreinterpret_u16_u8 (res), 1);
1317             if (w & 4)
1318                 vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (res), 1);
1319         }
1320     }
1321 }
1322
1323 #ifdef USE_GCC_INLINE_ASM
1324
1325 static void
1326 neon_composite_src_16_16 (pixman_implementation_t * impl,
1327                           pixman_op_t               op,
1328                           pixman_image_t *          src_image,
1329                           pixman_image_t *          mask_image,
1330                           pixman_image_t *          dst_image,
1331                           int32_t                   src_x,
1332                           int32_t                   src_y,
1333                           int32_t                   mask_x,
1334                           int32_t                   mask_y,
1335                           int32_t                   dest_x,
1336                           int32_t                   dest_y,
1337                           int32_t                   width,
1338                           int32_t                   height)
1339 {
1340     uint16_t    *dst_line, *src_line;
1341     uint32_t dst_stride, src_stride;
1342
1343     if (!height || !width)
1344         return;
1345
1346     /* We simply copy 16-bit-aligned pixels from one place to another. */
1347     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
1348     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1349
1350     /* Preload the first input scanline */
1351     {
1352         uint16_t *src_ptr = src_line;
1353         uint32_t count = width;
1354
1355         asm volatile (
1356             "0: @ loop                                                  \n"
1357             "   subs    %[count], %[count], #32                         \n"
1358             "   pld     [%[src]]                                        \n"
1359             "   add     %[src], %[src], #64                             \n"
1360             "   bgt 0b                                                  \n"
1361
1362             /* Clobbered input registers marked as input/outputs */
1363             : [src] "+r" (src_ptr), [count] "+r" (count)
1364             :     /* no unclobbered inputs */
1365             : "cc"
1366             );
1367     }
1368
1369     while (height--)
1370     {
1371         uint16_t *dst_ptr = dst_line;
1372         uint16_t *src_ptr = src_line;
1373         uint32_t count = width;
1374         uint32_t tmp = 0;
1375
1376         /* Uses multi-register access and preloading to maximise bandwidth.
1377          * Each pixel is one halfword, so a quadword contains 8px.
1378          * Preload frequency assumes a 64-byte cache line.
1379          */
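        /* Editor's note: the main loop below moves 64 halfwords (128 bytes)
         * per iteration using four vld1/vst1 pairs of four D registers each;
         * the remainder is then peeled off in power-of-two fragments
         * (32/16/8/4 halfwords via NEON, then 2 via ldr/str and a final one
         * via ldrh/strh), which is what the tst tests on %[count] select.
         */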
1380         asm volatile (
1381             "   cmp       %[count], #64                         \n"
1382             "   blt 1f    @ skip oversized fragments            \n"
1383             "0: @ start with eight quadwords at a time          \n"
1384             /* preload from next scanline */
1385             "   pld       [%[src], %[src_stride], LSL #1]       \n"
1386             "   sub       %[count], %[count], #64               \n"
1387             "   vld1.16   {d16, d17, d18, d19}, [%[src]]!               \n"
1388             "   vld1.16   {d20, d21, d22, d23}, [%[src]]!               \n"
1389             /* preload from next scanline */
1390             "   pld       [%[src], %[src_stride], LSL #1]       \n"
1391             "   vld1.16   {d24, d25, d26, d27}, [%[src]]!               \n"
1392             "   vld1.16   {d28, d29, d30, d31}, [%[src]]!               \n"
1393             "   cmp       %[count], #64                         \n"
1394             "   vst1.16   {d16, d17, d18, d19}, [%[dst]]!               \n"
1395             "   vst1.16   {d20, d21, d22, d23}, [%[dst]]!               \n"
1396             "   vst1.16   {d24, d25, d26, d27}, [%[dst]]!               \n"
1397             "   vst1.16   {d28, d29, d30, d31}, [%[dst]]!               \n"
1398             "   bge 0b                                          \n"
1399             "   cmp       %[count], #0                          \n"
1400             "   beq 7f    @ aligned fastpath                    \n"
1401             "1: @ four quadwords                                \n"
1402             "   tst       %[count], #32                         \n"
1403             "   beq 2f    @ skip oversized fragment             \n"
1404             /* preload from next scanline */
1405             "   pld       [%[src], %[src_stride], LSL #1]       \n"
1406             "   vld1.16   {d16, d17, d18, d19}, [%[src]]!               \n"
1407             "   vld1.16   {d20, d21, d22, d23}, [%[src]]!               \n"
1408             "   vst1.16   {d16, d17, d18, d19}, [%[dst]]!               \n"
1409             "   vst1.16   {d20, d21, d22, d23}, [%[dst]]!               \n"
1410             "2: @ two quadwords                                 \n"
1411             "   tst       %[count], #16                         \n"
1412             "   beq 3f    @ skip oversized fragment             \n"
1413             /* preload from next scanline */
1414             "   pld       [%[src], %[src_stride], LSL #1]       \n"
1415             "   vld1.16   {d16, d17, d18, d19}, [%[src]]!               \n"
1416             "   vst1.16   {d16, d17, d18, d19}, [%[dst]]!               \n"
1417             "3: @ one quadword                                  \n"
1418             "   tst       %[count], #8                          \n"
1419             "   beq 4f    @ skip oversized fragment             \n"
1420             "   vld1.16   {d16, d17}, [%[src]]!                 \n"
1421             "   vst1.16   {d16, d17}, [%[dst]]!                 \n"
1422             "4: @ one doubleword                                \n"
1423             "   tst       %[count], #4                          \n"
1424             "   beq 5f    @ skip oversized fragment             \n"
1425             "   vld1.16   {d16}, [%[src]]!                      \n"
1426             "   vst1.16   {d16}, [%[dst]]!                      \n"
1427             "5: @ one word                                      \n"
1428             "   tst       %[count], #2                          \n"
1429             "   beq 6f    @ skip oversized fragment             \n"
1430             "   ldr       %[tmp], [%[src]], #4                  \n"
1431             "   str       %[tmp], [%[dst]], #4                  \n"
1432             "6: @ one halfword                                  \n"
1433             "   tst       %[count], #1                          \n"
1434             "   beq 7f    @ skip oversized fragment             \n"
1435             "   ldrh      %[tmp], [%[src]]                      \n"
1436             "   strh      %[tmp], [%[dst]]                      \n"
1437             "7: @ end                                           \n"
1438
1439             /* Clobbered input registers marked as input/outputs */
1440             : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr),
1441               [count] "+r" (count), [tmp] "+r" (tmp)
1442
1443               /* Unclobbered input */
1444             : [src_stride] "r" (src_stride)
1445
1446               /* Clobbered vector registers */
1447             : "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
1448               "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc", "memory"
1449             );
1450
1451         src_line += src_stride;
1452         dst_line += dst_stride;
1453     }
1454 }
1455
1456 #endif /* USE_GCC_INLINE_ASM */
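/* For reference only: an illustrative NEON-intrinsics sketch of what the
 * inner copy loop of the asm block above does for one scanline.  It is kept
 * under #if 0 and is not compiled; the real code above also handles the
 * sub-quadword tail sizes and the next-scanline preloads.
 */
#if 0
static void
copy_scanline_0565_sketch (uint16_t *dst, const uint16_t *src, int count)
{
    /* Copy eight 16-bit pixels (one quadword) per iteration */
    while (count >= 8)
    {
        uint16x8_t pixels = vld1q_u16 (src);
        vst1q_u16 (dst, pixels);
        src += 8;
        dst += 8;
        count -= 8;
    }

    /* Scalar tail for any remaining pixels */
    while (count--)
        *dst++ = *src++;
}
#endif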
1457
1458 static void
1459 neon_composite_src_24_16 (pixman_implementation_t * impl,
1460                           pixman_op_t               op,
1461                           pixman_image_t *          src_image,
1462                           pixman_image_t *          mask_image,
1463                           pixman_image_t *          dst_image,
1464                           int32_t                   src_x,
1465                           int32_t                   src_y,
1466                           int32_t                   mask_x,
1467                           int32_t                   mask_y,
1468                           int32_t                   dest_x,
1469                           int32_t                   dest_y,
1470                           int32_t                   width,
1471                           int32_t                   height)
1472 {
1473     uint16_t    *dst_line;
1474     uint32_t    *src_line;
1475     uint32_t dst_stride, src_stride;
1476
1477     if (!width || !height)
1478         return;
1479
1480     /* We convert each x8r8g8b8/a8r8g8b8 pixel to r5g6b5, ignoring the
1481      * source alpha channel (it is assumed to be opaque).
1482      */
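    /* Worked example (illustrative): the x8r8g8b8 pixel 0xff336699 has
     * r = 0x33, g = 0x66, b = 0x99; keeping the top 5/6/5 bits gives
     * r = 0x06, g = 0x19, b = 0x13, which pack into the r5g6b5 value
     * (0x06 << 11) | (0x19 << 5) | 0x13 = 0x3333.
     */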
1483     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1484     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1485
1486     /* Preload the first input scanline */
1487     {
1488         uint8_t *src_ptr = (uint8_t*) src_line;
1489         uint32_t count = (width + 15) / 16;
1490
1491 #ifdef USE_GCC_INLINE_ASM
1492         asm volatile (
1493             "0: @ loop                                          \n"
1494             "   subs    %[count], %[count], #1                  \n"
1495             "   pld     [%[src]]                                \n"
1496             "   add     %[src], %[src], #64                     \n"
1497             "   bgt 0b                                          \n"
1498
1499             /* Clobbered input registers marked as input/outputs */
1500             : [src] "+r" (src_ptr), [count] "+r" (count)
1501             :     /* no unclobbered inputs */
1502             : "cc"
1503             );
1504 #else
1505         do
1506         {
1507             __pld (src_ptr);
1508             src_ptr += 64;
1509         }
1510         while (--count);
1511 #endif
1512     }
1513
1514     while (height--)
1515     {
1516         uint16_t *dst_ptr = dst_line;
1517         uint32_t *src_ptr = src_line;
1518         uint32_t count = width;
1519         const uint32_t rb_mask = 0x1F;
1520         const uint32_t g_mask = 0x3F;
1521
1522         /* If you're going to complain about a goto, take a long hard look
1523          * at the massive blocks of assembler this skips over.  ;-)
1524          */
1525         if (count < 8)
1526             goto small_stuff;
1527
1528 #ifdef USE_GCC_INLINE_ASM
1529
1530         /* This is not as aggressive as the RGB565-source case.
1531          * When the formats differ, the source is generally in cached
1532          * RAM, so we use a preload.
1533          *
1534          * We don't need to blend, so we are not reading from the
1535          * uncached framebuffer.
1536          */
1537         asm volatile (
1538             "   cmp       %[count], #16                         \n"
1539             "   blt 1f    @ skip oversized fragments            \n"
1540             "0: @ start with sixteen pixels at a time           \n"
1541             "   sub       %[count], %[count], #16               \n"
1542             "   pld      [%[src], %[src_stride], lsl #2]        @ preload from next scanline                    \n"
1543             "   vld4.8    {d0, d1, d2, d3}, [%[src]]!           @ d3 is alpha and ignored, d2-0 are rgb.        \n"
1544             "   vld4.8    {d4, d5, d6, d7}, [%[src]]!           @ d7 is alpha and ignored, d6-4 are rgb.        \n"
1545             "   vshll.u8  q8, d2, #8                            @ expand first red for repacking                \n"
1546             "   vshll.u8  q10, d1, #8                           @ expand first green for repacking              \n"
1547             "   vshll.u8  q11, d0, #8                           @ expand first blue for repacking               \n"
1548             "   vshll.u8  q9, d6, #8                            @ expand second red for repacking               \n"
1549             "   vsri.u16  q8, q10, #5                           @ insert first green after red                  \n"
1550             "   vshll.u8  q10, d5, #8                           @ expand second green for repacking             \n"
1551             "   vsri.u16  q8, q11, #11                          @ insert first blue after green                 \n"
1552             "   vshll.u8  q11, d4, #8                           @ expand second blue for repacking              \n"
1553             "   vsri.u16  q9, q10, #5                           @ insert second green after red                 \n"
1554             "   vsri.u16  q9, q11, #11                          @ insert second blue after green                \n"
1555             "   cmp       %[count], #16                         \n"
1556             "   vst1.16   {d16, d17, d18, d19}, [%[dst]]!          @ store 16 pixels                            \n"
1557             "   bge 0b                                          \n"
1558             "1: @ end of main loop                              \n"
1559             "   cmp       %[count], #8                          @ can we still do an 8-pixel block?             \n"
1560             "   blt 2f                                          \n"
1561             "   sub       %[count], %[count], #8                \n"
1562             "   pld      [%[src], %[src_stride], lsl #2]        @ preload from next scanline                    \n"
1563             "   vld4.8    {d0, d1, d2, d3}, [%[src]]!           @ d3 is alpha and ignored, d2-0 are rgb.        \n"
1564             "   vshll.u8  q8, d2, #8                            @ expand first red for repacking                \n"
1565             "   vshll.u8  q10, d1, #8                           @ expand first green for repacking              \n"
1566             "   vshll.u8  q11, d0, #8                           @ expand first blue for repacking               \n"
1567             "   vsri.u16  q8, q10, #5                           @ insert first green after red                  \n"
1568             "   vsri.u16  q8, q11, #11                          @ insert first blue after green                 \n"
1569             "   vst1.16   {d16, d17}, [%[dst]]!          @ store 8 pixels                               \n"
1570             "2: @ end                                           \n"
1571
1572             /* Clobbered input and working registers marked as input/outputs */
1573             : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr), [count] "+r" (count)
1574
1575               /* Unclobbered input */
1576             : [src_stride] "r" (src_stride)
1577
1578               /* Clobbered vector registers */
1579
1580               /* NB: d16-d23 are the double-register aliases
1581                * of the quad registers (q8-q11) used in the asm
1582                */
1583             : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17",
1584               "d18", "d19", "d20", "d21", "d22", "d23", "cc", "memory"
1585             );
1586 #else
1587         /* An equivalent of the above code, written with intrinsics. */
1588         while (count >= 16)
1589         {
1590             uint8x8x4_t pixel_set_a, pixel_set_b;
1591             uint16x8_t red_a, green_a, blue_a;
1592             uint16x8_t red_b, green_b, blue_b;
1593             uint16x8_t dest_pixels_a, dest_pixels_b;
1594
1595             count -= 16;
1596             __pld (src_ptr + src_stride);
1597             pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
1598             pixel_set_b = vld4_u8 ((uint8_t*)(src_ptr + 8));
1599             src_ptr += 16;
1600
1601             red_a   = vshll_n_u8 (pixel_set_a.val[2], 8);
1602             green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
1603             blue_a  = vshll_n_u8 (pixel_set_a.val[0], 8);
1604             
1605             red_b   = vshll_n_u8 (pixel_set_b.val[2], 8);
1606             green_b = vshll_n_u8 (pixel_set_b.val[1], 8);
1607             blue_b  = vshll_n_u8 (pixel_set_b.val[0], 8);
1608             
1609             dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
1610             dest_pixels_b = vsriq_n_u16 (red_b, green_b, 5);
1611             
1612             dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
1613             dest_pixels_b = vsriq_n_u16 (dest_pixels_b, blue_b, 11);
1614
1615             /* There doesn't seem to be an intrinsic for the
1616              * double-quadword variant
1617              */
1618             vst1q_u16 (dst_ptr, dest_pixels_a);
1619             vst1q_u16 (dst_ptr + 8, dest_pixels_b);
1620             dst_ptr += 16;
1621         }
1622
1623         /* 8-pixel loop */
1624         if (count >= 8)
1625         {
1626             uint8x8x4_t pixel_set_a;
1627             uint16x8_t red_a, green_a, blue_a;
1628             uint16x8_t dest_pixels_a;
1629
1630             __pld (src_ptr + src_stride);
1631             count -= 8;
1632             pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
1633             src_ptr += 8;
1634
1635             red_a   = vshll_n_u8 (pixel_set_a.val[2], 8);
1636             green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
1637             blue_a  = vshll_n_u8 (pixel_set_a.val[0], 8);
1638
1639             dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
1640             dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
1641
1642             vst1q_u16 (dst_ptr, dest_pixels_a);
1643             dst_ptr += 8;
1644         }
1645
1646 #endif  /* USE_GCC_INLINE_ASM */
1647
1648     small_stuff:
1649         if (count)
1650             __pld (src_ptr + src_stride);
1651
1652         while (count >= 2)
1653         {
1654             uint32_t src_pixel_a = *src_ptr++;
1655             uint32_t src_pixel_b = *src_ptr++;
1656
1657             /* ARM is really good at shift-then-ALU ops; this should be
1658              * a total of six shift-ANDs and five shift-ORs. */
1659             uint32_t dst_pixels_a;
1660             uint32_t dst_pixels_b;
1661
1662             dst_pixels_a  = ((src_pixel_a >>  3) & rb_mask);
1663             dst_pixels_a |= ((src_pixel_a >> 10) &  g_mask) << 5;
1664             dst_pixels_a |= ((src_pixel_a >> 19) & rb_mask) << 11;
1665
1666             dst_pixels_b  = ((src_pixel_b >>  3) & rb_mask);
1667             dst_pixels_b |= ((src_pixel_b >> 10) &  g_mask) << 5;
1668             dst_pixels_b |= ((src_pixel_b >> 19) & rb_mask) << 11;
1669
1670             /* this combined 32-bit store assumes a little-endian target */
1671             *((uint32_t*) dst_ptr) = dst_pixels_a | (dst_pixels_b << 16);
1672             dst_ptr += 2;
1673             count -= 2;
1674         }
1675
1676         if (count)
1677         {
1678             uint32_t src_pixel = *src_ptr++;
1679
1680             /* ARM is really good at shift-then-ALU ops.
1681              * This block should end up as three shift-ANDs
1682              * and two shift-ORs.
1683              */
1684             uint32_t tmp_blue  = (src_pixel >>  3) & rb_mask;
1685             uint32_t tmp_green = (src_pixel >> 10) & g_mask;
1686             uint32_t tmp_red   = (src_pixel >> 19) & rb_mask;
1687             uint16_t dst_pixel = (tmp_red << 11) | (tmp_green << 5) | tmp_blue;
1688
1689             *dst_ptr++ = dst_pixel;
1690             count--;
1691         }
1692
1693         src_line += src_stride;
1694         dst_line += dst_stride;
1695     }
1696 }
1697
1698 static pixman_bool_t
1699 pixman_fill_neon (uint32_t *bits,
1700                   int       stride,
1701                   int       bpp,
1702                   int       x,
1703                   int       y,
1704                   int       width,
1705                   int       height,
1706                   uint32_t  _xor)
1707 {
1708     uint32_t byte_stride, color;
1709     char *dst;
1710
1711     /* in pixman, stride is always given in 32-bit units */
1712     byte_stride = stride * sizeof(uint32_t);
1713
1714     switch (bpp)
1715     {
1716     case 8:
1717         dst = ((char *) bits) + y * byte_stride + x;
1718         _xor &= 0xff;
1719         color = _xor << 24 | _xor << 16 | _xor << 8 | _xor;
1720         break;
1721
1722     case 16:
1723         dst = ((char *) bits) + y * byte_stride + x * 2;
1724         _xor &= 0xffff;
1725         color = _xor << 16 | _xor;
1726         width *= 2;         /* width to bytes */
1727         break;
1728
1729     case 32:
1730         dst = ((char *) bits) + y * byte_stride + x * 4;
1731         color = _xor;
1732         width *= 4;         /* width to bytes */
1733         break;
1734
1735     default:
1736         return FALSE;
1737     }
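    /* For example, a 16 bpp fill with _xor = 0x001f (pure blue in r5g6b5)
     * gives color = 0x001f001f, so each 32-bit store writes two pixels,
     * and width is scaled to count bytes rather than pixels.
     */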
1738
1739 #ifdef USE_GCC_INLINE_ASM
1740     if (width < 16)
1741     {
1742         /* Special case for widths too small to benefit from wide
1743          * 128-bit stores. We don't waste time trying to align the
1744          * writes, since there are only a few of them per scanline.
1745          */
1746         asm volatile (
1747             "cmp                %[height], #0\n"/* Check if empty fill */
1748             "beq                3f\n"
1749             "vdup.32    d0, %[color]\n"/* Duplicate the fill color into a NEON register */
1750
1751             /* Check whether the width can be handled with a single store
1752              * per scanline. This significantly reduces the number of
1753              * test/branch instructions executed for each scanline.
1754              */
1755             "cmp                %[width], #8\n"
1756             "beq                4f\n"
1757             "cmp                %[width], #4\n"
1758             "beq                5f\n"
1759             "cmp                %[width], #2\n"
1760             "beq                6f\n"
1761
1762             /* Loop starts here for each scanline */
1763             "1:\n"
1764             "mov                r4, %[dst]\n" /* Starting address of the current line */
1765             "tst                %[width], #8\n"
1766             "beq                2f\n"
1767             "vst1.8             {d0}, [r4]!\n"
1768             "2:\n"
1769             "tst                %[width], #4\n"
1770             "beq                2f\n"
1771             "str                %[color], [r4], #4\n"
1772             "2:\n"
1773             "tst                %[width], #2\n"
1774             "beq                2f\n"
1775             "strh               %[color], [r4], #2\n"
1776             "2:\n"
1777             "tst                %[width], #1\n"
1778             "beq                2f\n"
1779             "strb               %[color], [r4], #1\n"
1780             "2:\n"
1781
1782             "subs               %[height], %[height], #1\n"
1783             "add                %[dst], %[dst], %[byte_stride]\n"
1784             "bne                1b\n"
1785             "b          3f\n"
1786
1787             /* Special fillers for widths that can be handled with a single store per scanline */
1788             "4:\n"
1789             "subs               %[height], %[height], #1\n"
1790             "vst1.8             {d0}, [%[dst]]\n"
1791             "add                %[dst], %[dst], %[byte_stride]\n"
1792             "bne                4b\n"
1793             "b          3f\n"
1794
1795             "5:\n"
1796             "subs               %[height], %[height], #1\n"
1797             "str                %[color], [%[dst]]\n"
1798             "add                %[dst], %[dst], %[byte_stride]\n"
1799             "bne                5b\n"
1800             "b          3f\n"
1801
1802             "6:\n"
1803             "subs               %[height], %[height], #1\n"
1804             "strh               %[color], [%[dst]]\n"
1805             "add                %[dst], %[dst], %[byte_stride]\n"
1806             "bne                6b\n"
1807
1808             "3:\n"
1809             : [height] "+r" (height), [dst] "+r" (dst)
1810             : [color] "r" (color), [width] "r" (width),
1811               [byte_stride] "r" (byte_stride)
1812             : "memory", "cc", "d0", "r4");
1813     }
1814     else
1815     {
1816         asm volatile (
1817             "cmp                %[height], #0\n"/* Check if empty fill */
1818             "beq                5f\n"
1819             "vdup.32    q0, %[color]\n"/* Duplicate the fill color into a NEON register */
1820
1821             /* Loop starts here for each scanline */
1822             "1:\n"
1823             "mov                r4, %[dst]\n"/* Starting address of the current line */
1824             "mov                r5, %[width]\n"/* We're going to write this many bytes */
1825             "ands               r6, r4, #15\n"/* Are we at the 128-bit aligned address? */
1826             "beq                2f\n"/* Jump to the best case */
1827
1828             /* We're not 128-bit aligned. However, we know we can reach the
1829                next aligned location, since the fill is at least 16 bytes wide */
1830             "rsb                r6, r6, #16\n" /* We would need to go forward this much */
1831             "sub                r5, r5, r6\n"/* Update bytes left */
1832             "tst                r6, #1\n"
1833             "beq                6f\n"
1834             "vst1.8             {d0[0]}, [r4]!\n"/* Store a byte; now we are 16-bit aligned */
1835             "6:\n"
1836             "tst                r6, #2\n"
1837             "beq                6f\n"
1838             "vst1.16    {d0[0]}, [r4, :16]!\n"/* Store a halfword; now we are 32-bit aligned */
1839             "6:\n"
1840             "tst                r6, #4\n"
1841             "beq                6f\n"
1842             "vst1.32    {d0[0]}, [r4, :32]!\n"/* Store a word; now we are 64-bit aligned */
1843             "6:\n"
1844             "tst                r6, #8\n"
1845             "beq                2f\n"
1846             "vst1.64    {d0}, [r4, :64]!\n"/* Store a doubleword; now we are 128-bit aligned */
1847
1848             /* The good case: We're 128-bit aligned for this scanline */
1849             "2:\n"
1850             "and                r6, r5, #15\n"/* Number of trailing bytes */
1851             "cmp                r5, r6\n"/* Do we have at least one qword to write? */
1852             "beq                6f\n"/* No, we just write the tail */
1853             "lsr                r5, r5, #4\n"/* This many full qwords to write */
1854
1855             /* The main block: Do 128-bit aligned writes */
1856             "3:\n"
1857             "subs               r5, r5, #1\n"
1858             "vst1.64    {d0, d1}, [r4, :128]!\n"
1859             "bne                3b\n"
1860
1861             /* Handle the trailing bytes: do 64, 32, 16 and 8-bit writes as needed.
1862                We know that we're currently at a 128-bit aligned address, so we
1863                can just pick the biggest operations the remaining width allows */
1864             "6:\n"
1865             "cmp                r6, #0\n"
1866             "beq                4f\n"
1867             "tst                r6, #8\n"
1868             "beq                6f\n"
1869             "vst1.64    {d0}, [r4, :64]!\n"
1870             "6:\n"
1871             "tst                r6, #4\n"
1872             "beq                6f\n"
1873             "vst1.32    {d0[0]}, [r4, :32]!\n"
1874             "6:\n"
1875             "tst                r6, #2\n"
1876             "beq                6f\n"
1877             "vst1.16    {d0[0]}, [r4, :16]!\n"
1878             "6:\n"
1879             "tst                r6, #1\n"
1880             "beq                4f\n"
1881             "vst1.8             {d0[0]}, [r4]!\n"
1882             "4:\n"
1883
1884             /* Handle the next scanline */
1885             "subs               %[height], %[height], #1\n"
1886             "add                %[dst], %[dst], %[byte_stride]\n"
1887             "bne                1b\n"
1888             "5:\n"
1889             : [height] "+r" (height), [dst] "+r" (dst)
1890             : [color] "r" (color), [width] "r" (width),
1891               [byte_stride] "r" (byte_stride)
1892             : "memory", "cc", "d0", "d1", "r4", "r5", "r6");
1893     }
1894     return TRUE;
1895
1896 #else
1897
1898     /* TODO: intrinsic version for armcc */
1899     return FALSE;
1900
1901 #endif
1902 }
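/* For reference only: an illustrative NEON-intrinsics sketch of the aligned
 * inner loop of the wide-fill path above, assuming the destination is already
 * 16-byte aligned and the width is a multiple of 16 bytes.  It is kept under
 * #if 0 and is not compiled; the real code also handles leading and trailing
 * misalignment one byte, halfword, word or doubleword at a time.
 */
#if 0
static void
fill_scanline_sketch (uint8_t *dst, uint32_t color, int width_bytes)
{
    /* Replicate the 32-bit color across a 128-bit register */
    uint8x16_t fill = vreinterpretq_u8_u32 (vdupq_n_u32 (color));

    while (width_bytes >= 16)
    {
        vst1q_u8 (dst, fill);   /* one 128-bit store per iteration */
        dst += 16;
        width_bytes -= 16;
    }
}
#endif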
1903
1904 static const pixman_fast_path_t arm_neon_fast_path_array[] =
1905 {
1906     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       neon_composite_add_n_8_8,        0 },
1907     { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null,     PIXMAN_a8,       neon_composite_add_8000_8000,    0 },
1908     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   neon_composite_over_n_8_0565,    0 },
1909     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   neon_composite_over_n_8_0565,    0 },
1910     { PIXMAN_OP_SRC,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_src_24_16,        0 },
1911     { PIXMAN_OP_SRC,  PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_src_24_16,        0 },
1912     { PIXMAN_OP_SRC,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_24_16,        0 },
1913     { PIXMAN_OP_SRC,  PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_24_16,        0 },
1914 #ifdef USE_GCC_INLINE_ASM
1915     { PIXMAN_OP_SRC,  PIXMAN_r5g6b5,   PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_src_16_16,        0 },
1916     { PIXMAN_OP_SRC,  PIXMAN_b5g6r5,   PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_16_16,        0 },
1917 #endif
1918     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, neon_composite_over_8888_8888,   0 },
1919     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, neon_composite_over_8888_8888,   0 },
1920     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, neon_composite_over_8888_8888,   0 },
1921     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, neon_composite_over_8888_8888,   0 },
1922     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
1923     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
1924     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, neon_composite_over_n_8_8888,    0 },
1925     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, neon_composite_over_n_8_8888,    0 },
1926     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, neon_composite_over_n_8_8888,    0 },
1927     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, neon_composite_over_n_8_8888,    0 },
1928     { PIXMAN_OP_NONE },
1929 };
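/* In the table above, each row gives the operator, the source format
 * (PIXMAN_solid for a solid source), the mask format (PIXMAN_null when no
 * mask is used), the destination format, the fast-path routine to call and
 * a flags word; the PIXMAN_OP_NONE entry terminates the table.
 */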
1930
1931 const pixman_fast_path_t *const arm_neon_fast_paths = arm_neon_fast_path_array;
1932
1933 static void
1934 arm_neon_composite (pixman_implementation_t *imp,
1935                     pixman_op_t              op,
1936                     pixman_image_t *         src,
1937                     pixman_image_t *         mask,
1938                     pixman_image_t *         dest,
1939                     int32_t                  src_x,
1940                     int32_t                  src_y,
1941                     int32_t                  mask_x,
1942                     int32_t                  mask_y,
1943                     int32_t                  dest_x,
1944                     int32_t                  dest_y,
1945                     int32_t                  width,
1946                     int32_t                  height)
1947 {
1948     if (_pixman_run_fast_path (arm_neon_fast_paths, imp,
1949                                op, src, mask, dest,
1950                                src_x, src_y,
1951                                mask_x, mask_y,
1952                                dest_x, dest_y,
1953                                width, height))
1954     {
1955         return;
1956     }
1957
1958     _pixman_implementation_composite (imp->delegate, op,
1959                                       src, mask, dest,
1960                                       src_x, src_y,
1961                                       mask_x, mask_y,
1962                                       dest_x, dest_y,
1963                                       width, height);
1964 }
1965
1966 static pixman_bool_t
1967 arm_neon_fill (pixman_implementation_t *imp,
1968                uint32_t *               bits,
1969                int                      stride,
1970                int                      bpp,
1971                int                      x,
1972                int                      y,
1973                int                      width,
1974                int                      height,
1975                uint32_t xor)
1976 {
1977     if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor))
1978         return TRUE;
1979
1980     return _pixman_implementation_fill (
1981         imp->delegate, bits, stride, bpp, x, y, width, height, xor);
1982 }
1983
1984 pixman_implementation_t *
1985 _pixman_implementation_create_arm_neon (void)
1986 {
1987     pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
1988     pixman_implementation_t *imp = _pixman_implementation_create (general);
1989
1990     imp->composite = arm_neon_composite;
1991     imp->fill = arm_neon_fill;
1992
1993     return imp;
1994 }
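/* A minimal usage sketch (not part of this file): a caller that owns CPU
 * detection would create this implementation only when NEON is available,
 * for example with some hypothetical have_neon () feature check:
 *
 *     pixman_implementation_t *imp =
 *         have_neon () ? _pixman_implementation_create_arm_neon ()
 *                      : _pixman_implementation_create_fast_path ();
 */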
1995