ARM: workaround for gcc bug in vshll_n_u8 intrinsic
pixman/pixman-arm-neon.c
1 /*
2  * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
3  *
4  * Permission to use, copy, modify, distribute, and sell this software and its
5  * documentation for any purpose is hereby granted without fee, provided that
6  * the above copyright notice appear in all copies and that both that
7  * copyright notice and this permission notice appear in supporting
8  * documentation, and that the name of ARM Ltd not be used in
9  * advertising or publicity pertaining to distribution of the software without
10  * specific, written prior permission.  ARM Ltd makes no
11  * representations about the suitability of this software for any purpose.  It
12  * is provided "as is" without express or implied warranty.
13  *
14  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
20  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
21  * SOFTWARE.
22  *
23  * Author:  Ian Rickards (ian.rickards@arm.com)
24  * Author:  Jonathan Morton (jonathan.morton@movial.com)
25  * Author:  Markku Vire (markku.vire@movial.com)
26  *
27  */
28
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32
33 #include <arm_neon.h>
34 #include <string.h>
35 #include "pixman-private.h"
36
37 /* RVCT provides a __pld intrinsic; when building with GCC, map it onto __builtin_prefetch */
38 #if !defined(__ARMCC_VERSION) && !defined(__pld)
39 #define __pld(_x) __builtin_prefetch (_x)
40 #endif
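/*
 * Typical use (illustrative only, not required by anything in this file)
 * is to warm the cache ahead of a scanline loop, e.g.
 * __pld (src + src_stride); the prefetch is just a hint and does not
 * change program results.
 */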
41
42 static force_inline uint8x8x4_t
43 unpack0565 (uint16x8_t rgb)
44 {
45     uint16x8_t gb, b;
46     uint8x8x4_t res;
47
48     res.val[3] = vdup_n_u8 (0);
49     gb = vshrq_n_u16 (rgb, 5);
50     b = vshrq_n_u16 (rgb, 5 + 6);
51
52     res.val[0] = vmovn_u16 (rgb);  /* get low 5 bits */
53     res.val[1] = vmovn_u16 (gb);   /* get mid 6 bits */
54     res.val[2] = vmovn_u16 (b);    /* get top 5 bits */
55
56     res.val[0] = vshl_n_u8 (res.val[0], 3); /* shift to top */
57     res.val[1] = vshl_n_u8 (res.val[1], 2); /* shift to top */
58     res.val[2] = vshl_n_u8 (res.val[2], 3); /* shift to top */
59
60     res.val[0] = vsri_n_u8 (res.val[0], res.val[0], 5);
61     res.val[1] = vsri_n_u8 (res.val[1], res.val[1], 6);
62     res.val[2] = vsri_n_u8 (res.val[2], res.val[2], 5);
63
64     return res;
65 }
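/*
 * A scalar sketch of the expansion above (the names c5/c6 are
 * illustrative only): a 5-bit field c5 widens to (c5 << 3) | (c5 >> 2)
 * and a 6-bit field c6 widens to (c6 << 2) | (c6 >> 4).  The vshl moves
 * the field to the top of the byte and the vsri replicates its high bits
 * into the low bits, so e.g. 0x1f expands to 0xff rather than 0xf8.
 */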
66
67 #ifdef USE_GCC_INLINE_ASM
68 /* Some versions of gcc have problems with vshll_n_u8 intrinsic (Bug 23576) */
69 #define vshll_n_u8(a, n) ({ uint16x8_t r; \
70     asm ("vshll.u8 %q0, %P1, %2\n" : "=w" (r) : "w" (a), "i" (n)); r; })
71 #endif
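/*
 * The asm above emits the same vshll.u8 instruction the intrinsic would:
 * every byte lane is widened to 16 bits and shifted left by the
 * immediate, so vshll_n_u8 (a, 8) multiplies each byte by 256.
 */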
72
73 static force_inline uint16x8_t
74 pack0565 (uint8x8x4_t s)
75 {
76     uint16x8_t rgb, val_g, val_r;
77
78     rgb = vshll_n_u8 (s.val[2], 8);
79     val_g = vshll_n_u8 (s.val[1], 8);
80     val_r = vshll_n_u8 (s.val[0], 8);
81     rgb = vsriq_n_u16 (rgb, val_g, 5);
82     rgb = vsriq_n_u16 (rgb, val_r, 5 + 6);
83
84     return rgb;
85 }
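/*
 * Packing is the inverse of unpack0565: vshll_n_u8 (c, 8) places each
 * 8-bit channel in the top byte of a 16-bit lane and the two vsriq
 * instructions shift the remaining channels in below it, giving
 * ((val2 >> 3) << 11) | ((val1 >> 2) << 5) | (val0 >> 3) per pixel.
 */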
86
87 static force_inline uint8x8_t
88 neon2mul (uint8x8_t x,
89           uint8x8_t alpha)
90 {
91     uint16x8_t tmp, tmp2;
92     uint8x8_t res;
93
94     tmp = vmull_u8 (x, alpha);
95     tmp2 = vrshrq_n_u16 (tmp, 8);
96     res = vraddhn_u16 (tmp, tmp2);
97
98     return res;
99 }
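/*
 * neon2mul computes a correctly rounded x * alpha / 255 in every byte
 * lane.  A scalar sketch of the same arithmetic (not compiled in; the
 * helper name is illustrative only):
 */
#if 0
static uint8_t
scalar_mul_div_255 (uint8_t x, uint8_t alpha)
{
    uint16_t t = (uint16_t)x * alpha;

    /* vrshrq_n_u16 (t, 8) == (t + 128) >> 8
     * vraddhn_u16 (t, u)  == (t + u + 128) >> 8 (high half)
     */
    return (uint8_t)((t + ((t + 128) >> 8) + 128) >> 8);
}
#endif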
100
101 static force_inline uint8x8x4_t
102 neon8mul (uint8x8x4_t x,
103           uint8x8_t   alpha)
104 {
105     uint16x8x4_t tmp;
106     uint8x8x4_t res;
107     uint16x8_t qtmp1, qtmp2;
108
109     tmp.val[0] = vmull_u8 (x.val[0], alpha);
110     tmp.val[1] = vmull_u8 (x.val[1], alpha);
111     tmp.val[2] = vmull_u8 (x.val[2], alpha);
112     tmp.val[3] = vmull_u8 (x.val[3], alpha);
113
114     qtmp1 = vrshrq_n_u16 (tmp.val[0], 8);
115     qtmp2 = vrshrq_n_u16 (tmp.val[1], 8);
116     res.val[0] = vraddhn_u16 (tmp.val[0], qtmp1);
117     qtmp1 = vrshrq_n_u16 (tmp.val[2], 8);
118     res.val[1] = vraddhn_u16 (tmp.val[1], qtmp2);
119     qtmp2 = vrshrq_n_u16 (tmp.val[3], 8);
120     res.val[2] = vraddhn_u16 (tmp.val[2], qtmp1);
121     res.val[3] = vraddhn_u16 (tmp.val[3], qtmp2);
122
123     return res;
124 }
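/*
 * neon8mul is the planar form of neon2mul: all four channel vectors are
 * scaled by the same alpha vector, with the shift/add pairs interleaved
 * across channels, presumably to give the pipeline independent work
 * between dependent instructions.
 */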
125
126 static force_inline uint8x8x4_t
127 neon8qadd (uint8x8x4_t x,
128            uint8x8x4_t y)
129 {
130     uint8x8x4_t res;
131
132     res.val[0] = vqadd_u8 (x.val[0], y.val[0]);
133     res.val[1] = vqadd_u8 (x.val[1], y.val[1]);
134     res.val[2] = vqadd_u8 (x.val[2], y.val[2]);
135     res.val[3] = vqadd_u8 (x.val[3], y.val[3]);
136
137     return res;
138 }
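/*
 * With premultiplied alpha the OVER operator is, per channel,
 *     dest = source + dest * (255 - source_alpha) / 255
 * and the composite routines below build it from neon8mul (the scaled
 * dest term) and neon8qadd (the final saturating add).
 */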
139
140 static void
141 neon_composite_add_8000_8000 (pixman_implementation_t * impl,
142                               pixman_op_t               op,
143                               pixman_image_t *          src_image,
144                               pixman_image_t *          mask_image,
145                               pixman_image_t *          dst_image,
146                               int32_t                   src_x,
147                               int32_t                   src_y,
148                               int32_t                   mask_x,
149                               int32_t                   mask_y,
150                               int32_t                   dest_x,
151                               int32_t                   dest_y,
152                               int32_t                   width,
153                               int32_t                   height)
154 {
155     uint8_t     *dst_line, *dst;
156     uint8_t     *src_line, *src;
157     int dst_stride, src_stride;
158     uint16_t w;
159
160     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
161     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
162
163     if (width >= 8)
164     {
165         /* Use overlapping 8-pixel method */
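        /*
         * Widths that are not a multiple of 8 are handled by letting the
         * first vector overlap the next one: the first 8 pixels are
         * computed, the pointers advance by only (w & 7) pixels, and each
         * store is deferred by one iteration (via keep_dst) so that the
         * overlapping loads never read back freshly written destination
         * pixels.
         */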
166         while (height--)
167         {
168             uint8_t *keep_dst = 0;
169             uint8x8_t sval, dval, temp;
170
171             dst = dst_line;
172             dst_line += dst_stride;
173             src = src_line;
174             src_line += src_stride;
175             w = width;
176
177 #ifndef USE_GCC_INLINE_ASM
178             sval = vld1_u8 ((void *)src);
179             dval = vld1_u8 ((void *)dst);
180             keep_dst = dst;
181
182             temp = vqadd_u8 (dval, sval);
183
184             src += (w & 7);
185             dst += (w & 7);
186             w -= (w & 7);
187
188             while (w)
189             {
190                 sval = vld1_u8 ((void *)src);
191                 dval = vld1_u8 ((void *)dst);
192
193                 vst1_u8 ((void *)keep_dst, temp);
194                 keep_dst = dst;
195
196                 temp = vqadd_u8 (dval, sval);
197
198                 src += 8;
199                 dst += 8;
200                 w -= 8;
201             }
202
203             vst1_u8 ((void *)keep_dst, temp);
204 #else
205             asm volatile (
206 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
207                 "vld1.8  {d0}, [%[src]]\n\t"
208                 "vld1.8  {d4}, [%[dst]]\n\t"
209                 "mov     %[keep_dst], %[dst]\n\t"
210
211                 "and ip, %[w], #7\n\t"
212                 "add %[src], %[src], ip\n\t"
213                 "add %[dst], %[dst], ip\n\t"
214                 "subs %[w], %[w], ip\n\t"
215                 "b 9f\n\t"
216 /* LOOP */
217                 "2:\n\t"
218                 "vld1.8  {d0}, [%[src]]!\n\t"
219                 "vld1.8  {d4}, [%[dst]]!\n\t"
220                 "vst1.8  {d20}, [%[keep_dst]]\n\t"
221                 "sub     %[keep_dst], %[dst], #8\n\t"
222                 "subs %[w], %[w], #8\n\t"
223                 "9:\n\t"
224                 "vqadd.u8 d20, d0, d4\n\t"
225
226                 "bne 2b\n\t"
227
228                 "1:\n\t"
229                 "vst1.8  {d20}, [%[keep_dst]]\n\t"
230
231                 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
232                 :
233                 : "ip", "cc", "memory", "d0", "d4",
234                 "d20"
235                 );
236 #endif
237         }
238     }
239     else
240     {
241         const uint8_t nil = 0;
242         const uint8x8_t vnil = vld1_dup_u8 (&nil);
243
244         while (height--)
245         {
246             uint8x8_t sval = vnil, dval = vnil;
247             uint8_t *dst4 = 0, *dst2 = 0;
248
249             dst = dst_line;
250             dst_line += dst_stride;
251             src = src_line;
252             src_line += src_stride;
253             w = width;
254
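            /*
             * 4/2/1 tail handling: the remaining 1-7 pixels are gathered
             * into a single 8-byte register with lane loads (4, then 2,
             * then 1 pixels), processed with one vqadd, and scattered
             * back with matching lane stores.
             */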
255             if (w & 4)
256             {
257                 sval = vreinterpret_u8_u32 (
258                     vld1_lane_u32 ((void *)src, vreinterpret_u32_u8 (sval), 1));
259                 dval = vreinterpret_u8_u32 (
260                     vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1));
261
262                 dst4 = dst;
263                 src += 4;
264                 dst += 4;
265             }
266
267             if (w & 2)
268             {
269                 sval = vreinterpret_u8_u16 (
270                     vld1_lane_u16 ((void *)src, vreinterpret_u16_u8 (sval), 1));
271                 dval = vreinterpret_u8_u16 (
272                     vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1));
273
274                 dst2 = dst;
275                 src += 2;
276                 dst += 2;
277             }
278
279             if (w & 1)
280             {
281                 sval = vld1_lane_u8 (src, sval, 1);
282                 dval = vld1_lane_u8 (dst, dval, 1);
283             }
284
285             dval = vqadd_u8 (dval, sval);
286
287             if (w & 1)
288                 vst1_lane_u8 (dst, dval, 1);
289
290             if (w & 2)
291                 vst1_lane_u16 ((void *)dst2, vreinterpret_u16_u8 (dval), 1);
292
293             if (w & 4)
294                 vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (dval), 1);
295         }
296     }
297 }
298
299 static void
300 neon_composite_over_8888_8888 (pixman_implementation_t * impl,
301                                pixman_op_t               op,
302                                pixman_image_t *          src_image,
303                                pixman_image_t *          mask_image,
304                                pixman_image_t *          dst_image,
305                                int32_t                   src_x,
306                                int32_t                   src_y,
307                                int32_t                   mask_x,
308                                int32_t                   mask_y,
309                                int32_t                   dest_x,
310                                int32_t                   dest_y,
311                                int32_t                   width,
312                                int32_t                   height)
313 {
314     uint32_t    *dst_line, *dst;
315     uint32_t    *src_line, *src;
316     int dst_stride, src_stride;
317     uint32_t w;
318
319     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
320     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
321
322     if (width >= 8)
323     {
324         /* Use overlapping 8-pixel method */
325         while (height--)
326         {
327             uint32_t *keep_dst = 0;
328             uint8x8x4_t sval, dval, temp;
329
330             dst = dst_line;
331             dst_line += dst_stride;
332             src = src_line;
333             src_line += src_stride;
334             w = width;
335
336 #ifndef USE_GCC_INLINE_ASM
337             sval = vld4_u8 ((void *)src);
338             dval = vld4_u8 ((void *)dst);
339             keep_dst = dst;
340
341             temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
342             temp = neon8qadd (sval, temp);
343
344             src += (w & 7);
345             dst += (w & 7);
346             w -= (w & 7);
347
348             while (w)
349             {
350                 sval = vld4_u8 ((void *)src);
351                 dval = vld4_u8 ((void *)dst);
352
353                 vst4_u8 ((void *)keep_dst, temp);
354                 keep_dst = dst;
355
356                 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
357                 temp = neon8qadd (sval, temp);
358
359                 src += 8;
360                 dst += 8;
361                 w -= 8;
362             }
363
364             vst4_u8 ((void *)keep_dst, temp);
365 #else
366             asm volatile (
367 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
368                 "vld4.8  {d0-d3}, [%[src]]\n\t"
369                 "vld4.8  {d4-d7}, [%[dst]]\n\t"
370                 "mov     %[keep_dst], %[dst]\n\t"
371
372                 "and ip, %[w], #7\n\t"
373                 "add %[src], %[src], ip, LSL#2\n\t"
374                 "add %[dst], %[dst], ip, LSL#2\n\t"
375                 "subs %[w], %[w], ip\n\t"
376                 "b 9f\n\t"
377 /* LOOP */
378                 "2:\n\t"
379                 "vld4.8  {d0-d3}, [%[src]]!\n\t"
380                 "vld4.8  {d4-d7}, [%[dst]]!\n\t"
381                 "vst4.8  {d20-d23}, [%[keep_dst]]\n\t"
382                 "sub     %[keep_dst], %[dst], #8*4\n\t"
383                 "subs %[w], %[w], #8\n\t"
384                 "9:\n\t"
385                 "vmvn.8  d31, d3\n\t"
386                 "vmull.u8 q10, d31, d4\n\t"
387                 "vmull.u8 q11, d31, d5\n\t"
388                 "vmull.u8 q12, d31, d6\n\t"
389                 "vmull.u8 q13, d31, d7\n\t"
390                 "vrshr.u16 q8, q10, #8\n\t"
391                 "vrshr.u16 q9, q11, #8\n\t"
392                 "vraddhn.u16 d20, q10, q8\n\t"
393                 "vraddhn.u16 d21, q11, q9\n\t"
394                 "vrshr.u16 q8, q12, #8\n\t"
395                 "vrshr.u16 q9, q13, #8\n\t"
396                 "vraddhn.u16 d22, q12, q8\n\t"
397                 "vraddhn.u16 d23, q13, q9\n\t"
398 /* result in d20-d23 */
399                 "vqadd.u8 d20, d0, d20\n\t"
400                 "vqadd.u8 d21, d1, d21\n\t"
401                 "vqadd.u8 d22, d2, d22\n\t"
402                 "vqadd.u8 d23, d3, d23\n\t"
403
404                 "bne 2b\n\t"
405
406                 "1:\n\t"
407                 "vst4.8  {d20-d23}, [%[keep_dst]]\n\t"
408
409                 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
410                 :
411                 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
412                 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23"
413                 );
414 #endif
415         }
416     }
417     else
418     {
419         uint8x8_t alpha_selector = vreinterpret_u8_u64 (
420             vcreate_u64 (0x0707070703030303ULL));
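        /*
         * vtbl1_u8 with this selector broadcasts byte 3 (the alpha of the
         * first pixel) across lanes 0-3 and byte 7 (the alpha of the
         * second pixel) across lanes 4-7, so each pixel in the two-pixel
         * path below is scaled by its own alpha.
         */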
421
422         /* Handle width < 8 */
423         while (height--)
424         {
425             dst = dst_line;
426             dst_line += dst_stride;
427             src = src_line;
428             src_line += src_stride;
429             w = width;
430
431             while (w >= 2)
432             {
433                 uint8x8_t sval, dval;
434
435                 /* two 32-bit pixels packed into D-reg; ad-hoc vectorization */
436                 sval = vreinterpret_u8_u32 (vld1_u32 ((void *)src));
437                 dval = vreinterpret_u8_u32 (vld1_u32 ((void *)dst));
438                 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
439                 vst1_u8 ((void *)dst, vqadd_u8 (sval, dval));
440
441                 src += 2;
442                 dst += 2;
443                 w -= 2;
444             }
445
446             if (w)
447             {
448                 uint8x8_t sval, dval;
449
450                 /* single 32-bit pixel in lane 0 */
451                 sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)src));  /* only interested in lane 0 */
452                 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst));  /* only interested in lane 0 */
453                 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
454                 vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
455             }
456         }
457     }
458 }
459
460 static void
461 neon_composite_over_8888_n_8888 (pixman_implementation_t * impl,
462                                  pixman_op_t               op,
463                                  pixman_image_t *          src_image,
464                                  pixman_image_t *          mask_image,
465                                  pixman_image_t *          dst_image,
466                                  int32_t                   src_x,
467                                  int32_t                   src_y,
468                                  int32_t                   mask_x,
469                                  int32_t                   mask_y,
470                                  int32_t                   dest_x,
471                                  int32_t                   dest_y,
472                                  int32_t                   width,
473                                  int32_t                   height)
474 {
475     uint32_t    *dst_line, *dst;
476     uint32_t    *src_line, *src;
477     uint32_t mask;
478     int dst_stride, src_stride;
479     uint32_t w;
480     uint8x8_t mask_alpha;
481
482     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
483     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
484
485     mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
486     mask_alpha = vdup_n_u8 ((mask) >> 24);
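    /*
     * Only the alpha byte of the solid mask matters here: the source is
     * scaled by it first and the result is then composited with OVER as
     * usual.
     */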
487
488     if (width >= 8)
489     {
490         /* Use overlapping 8-pixel method */
491         while (height--)
492         {
493             dst = dst_line;
494             dst_line += dst_stride;
495             src = src_line;
496             src_line += src_stride;
497             w = width;
498
499             uint32_t *keep_dst = 0;
500
501 #ifndef USE_GCC_INLINE_ASM
502             uint8x8x4_t sval, dval, temp;
503
504             sval = vld4_u8 ((void *)src);
505             dval = vld4_u8 ((void *)dst);
506             keep_dst = dst;
507
508             sval = neon8mul (sval, mask_alpha);
509             temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
510             temp = neon8qadd (sval, temp);
511
512             src += (w & 7);
513             dst += (w & 7);
514             w -= (w & 7);
515
516             while (w)
517             {
518                 sval = vld4_u8 ((void *)src);
519                 dval = vld4_u8 ((void *)dst);
520
521                 vst4_u8 ((void *)keep_dst, temp);
522                 keep_dst = dst;
523
524                 sval = neon8mul (sval, mask_alpha);
525                 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
526                 temp = neon8qadd (sval, temp);
527
528                 src += 8;
529                 dst += 8;
530                 w -= 8;
531             }
532             vst4_u8 ((void *)keep_dst, temp);
533 #else
534             asm volatile (
535 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
536                 "vdup.32      d30, %[mask]\n\t"
537                 "vdup.8       d30, d30[3]\n\t"
538
539                 "vld4.8       {d0-d3}, [%[src]]\n\t"
540                 "vld4.8       {d4-d7}, [%[dst]]\n\t"
541                 "mov  %[keep_dst], %[dst]\n\t"
542
543                 "and  ip, %[w], #7\n\t"
544                 "add  %[src], %[src], ip, LSL#2\n\t"
545                 "add  %[dst], %[dst], ip, LSL#2\n\t"
546                 "subs  %[w], %[w], ip\n\t"
547                 "b 9f\n\t"
548 /* LOOP */
549                 "2:\n\t"
550                 "vld4.8       {d0-d3}, [%[src]]!\n\t"
551                 "vld4.8       {d4-d7}, [%[dst]]!\n\t"
552                 "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
553                 "sub  %[keep_dst], %[dst], #8*4\n\t"
554                 "subs  %[w], %[w], #8\n\t"
555
556                 "9:\n\t"
557                 "vmull.u8     q10, d30, d0\n\t"
558                 "vmull.u8     q11, d30, d1\n\t"
559                 "vmull.u8     q12, d30, d2\n\t"
560                 "vmull.u8     q13, d30, d3\n\t"
561                 "vrshr.u16    q8, q10, #8\n\t"
562                 "vrshr.u16    q9, q11, #8\n\t"
563                 "vraddhn.u16  d0, q10, q8\n\t"
564                 "vraddhn.u16  d1, q11, q9\n\t"
565                 "vrshr.u16    q9, q13, #8\n\t"
566                 "vrshr.u16    q8, q12, #8\n\t"
567                 "vraddhn.u16  d3, q13, q9\n\t"
568                 "vraddhn.u16  d2, q12, q8\n\t"
569
570                 "vmvn.8       d31, d3\n\t"
571                 "vmull.u8     q10, d31, d4\n\t"
572                 "vmull.u8     q11, d31, d5\n\t"
573                 "vmull.u8     q12, d31, d6\n\t"
574                 "vmull.u8     q13, d31, d7\n\t"
575                 "vrshr.u16    q8, q10, #8\n\t"
576                 "vrshr.u16    q9, q11, #8\n\t"
577                 "vraddhn.u16  d20, q10, q8\n\t"
578                 "vrshr.u16    q8, q12, #8\n\t"
579                 "vraddhn.u16  d21, q11, q9\n\t"
580                 "vrshr.u16    q9, q13, #8\n\t"
581                 "vraddhn.u16  d22, q12, q8\n\t"
582                 "vraddhn.u16  d23, q13, q9\n\t"
583
584 /* result in d20-d23 */
585                 "vqadd.u8     d20, d0, d20\n\t"
586                 "vqadd.u8     d21, d1, d21\n\t"
587                 "vqadd.u8     d22, d2, d22\n\t"
588                 "vqadd.u8     d23, d3, d23\n\t"
589
590                 "bne  2b\n\t"
591
592                 "1:\n\t"
593                 "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
594
595                 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
596                 : [mask] "r" (mask)
597                 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
598                 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
599                 "d30", "d31"
600                 );
601 #endif
602         }
603     }
604     else
605     {
606         uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));
607
608         /* Handle width < 8 */
609         while (height--)
610         {
611             dst = dst_line;
612             dst_line += dst_stride;
613             src = src_line;
614             src_line += src_stride;
615             w = width;
616
617             while (w >= 2)
618             {
619                 uint8x8_t sval, dval;
620
621                 sval = vreinterpret_u8_u32 (vld1_u32 ((void *)src));
622                 dval = vreinterpret_u8_u32 (vld1_u32 ((void *)dst));
623
624                 /* sval * const alpha_mul */
625                 sval = neon2mul (sval, mask_alpha);
626
627                 /* dval * (255 - src alpha) */
628                 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
629
630                 vst1_u8 ((void *)dst, vqadd_u8 (sval, dval));
631
632                 src += 2;
633                 dst += 2;
634                 w -= 2;
635             }
636
637             if (w)
638             {
639                 uint8x8_t sval, dval;
640
641                 sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)src));
642                 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst));
643
644                 /* sval * const alpha_mul */
645                 sval = neon2mul (sval, mask_alpha);
646
647                 /* dval * (255 - src alpha) */
648                 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
649
650                 vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
651             }
652         }
653     }
654 }
655
656 static void
657 neon_composite_over_n_8_0565 (pixman_implementation_t * impl,
658                               pixman_op_t               op,
659                               pixman_image_t *          src_image,
660                               pixman_image_t *          mask_image,
661                               pixman_image_t *          dst_image,
662                               int32_t                   src_x,
663                               int32_t                   src_y,
664                               int32_t                   mask_x,
665                               int32_t                   mask_y,
666                               int32_t                   dest_x,
667                               int32_t                   dest_y,
668                               int32_t                   width,
669                               int32_t                   height)
670 {
671     uint32_t     src, srca;
672     uint16_t    *dst_line, *dst;
673     uint8_t     *mask_line, *mask;
674     int          dst_stride, mask_stride;
675     uint32_t     w;
676     uint8x8_t    sval2;
677     uint8x8x4_t  sval8;
678
679     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
680
681     srca = src >> 24;
682     if (src == 0)
683         return;
684
685     sval2 = vreinterpret_u8_u32 (vdup_n_u32 (src));
686     sval8.val[0] = vdup_lane_u8 (sval2, 0);
687     sval8.val[1] = vdup_lane_u8 (sval2, 1);
688     sval8.val[2] = vdup_lane_u8 (sval2, 2);
689     sval8.val[3] = vdup_lane_u8 (sval2, 3);
690
691     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
692     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
693
694     if (width >= 8)
695     {
696         /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
697         while (height--)
698         {
699             uint16_t *keep_dst = 0;
700
701             dst = dst_line;
702             dst_line += dst_stride;
703             mask = mask_line;
704             mask_line += mask_stride;
705             w = width;
706
707 #ifndef USE_GCC_INLINE_ASM
708             uint8x8_t alpha;
709             uint16x8_t dval, temp;
710             uint8x8x4_t sval8temp;
711
712             alpha = vld1_u8 ((void *)mask);
713             dval = vld1q_u16 ((void *)dst);
714             keep_dst = dst;
715
716             sval8temp = neon8mul (sval8, alpha);
717             temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
718
719             mask += (w & 7);
720             dst += (w & 7);
721             w -= (w & 7);
722
723             while (w)
724             {
725                 dval = vld1q_u16 ((void *)dst);
726                 alpha = vld1_u8 ((void *)mask);
727
728                 vst1q_u16 ((void *)keep_dst, temp);
729                 keep_dst = dst;
730
731                 sval8temp = neon8mul (sval8, alpha);
732                 temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
733
734                 mask += 8;
735                 dst += 8;
736                 w -= 8;
737             }
738             vst1q_u16 ((void *)keep_dst, temp);
739 #else
740             asm volatile (
741                 "vdup.32      d0, %[src]\n\t"
742                 "vdup.8       d1, d0[1]\n\t"
743                 "vdup.8       d2, d0[2]\n\t"
744                 "vdup.8       d3, d0[3]\n\t"
745                 "vdup.8       d0, d0[0]\n\t"
746
747                 "vld1.8       {q12}, [%[dst]]\n\t"
748                 "vld1.8       {d31}, [%[mask]]\n\t"
749                 "mov  %[keep_dst], %[dst]\n\t"
750
751                 "and  ip, %[w], #7\n\t"
752                 "add  %[mask], %[mask], ip\n\t"
753                 "add  %[dst], %[dst], ip, LSL#1\n\t"
754                 "subs  %[w], %[w], ip\n\t"
755                 "b  9f\n\t"
756 /* LOOP */
757                 "2:\n\t"
758
759                 "vld1.16      {q12}, [%[dst]]!\n\t"
760                 "vld1.8       {d31}, [%[mask]]!\n\t"
761                 "vst1.16      {q10}, [%[keep_dst]]\n\t"
762                 "sub  %[keep_dst], %[dst], #8*2\n\t"
763                 "subs  %[w], %[w], #8\n\t"
764                 "9:\n\t"
765 /* expand 0565 q12 to 8888 {d4-d7} */
766                 "vmovn.u16    d4, q12\t\n"
767                 "vshr.u16     q11, q12, #5\t\n"
768                 "vshr.u16     q10, q12, #6+5\t\n"
769                 "vmovn.u16    d5, q11\t\n"
770                 "vmovn.u16    d6, q10\t\n"
771                 "vshl.u8      d4, d4, #3\t\n"
772                 "vshl.u8      d5, d5, #2\t\n"
773                 "vshl.u8      d6, d6, #3\t\n"
774                 "vsri.u8      d4, d4, #5\t\n"
775                 "vsri.u8      d5, d5, #6\t\n"
776                 "vsri.u8      d6, d6, #5\t\n"
777
778                 "vmull.u8     q10, d31, d0\n\t"
779                 "vmull.u8     q11, d31, d1\n\t"
780                 "vmull.u8     q12, d31, d2\n\t"
781                 "vmull.u8     q13, d31, d3\n\t"
782                 "vrshr.u16    q8, q10, #8\n\t"
783                 "vrshr.u16    q9, q11, #8\n\t"
784                 "vraddhn.u16  d20, q10, q8\n\t"
785                 "vraddhn.u16  d21, q11, q9\n\t"
786                 "vrshr.u16    q9, q13, #8\n\t"
787                 "vrshr.u16    q8, q12, #8\n\t"
788                 "vraddhn.u16  d23, q13, q9\n\t"
789                 "vraddhn.u16  d22, q12, q8\n\t"
790
791 /* duplicate in 4/2/1 & 8pix vsns */
792                 "vmvn.8       d30, d23\n\t"
793                 "vmull.u8     q14, d30, d6\n\t"
794                 "vmull.u8     q13, d30, d5\n\t"
795                 "vmull.u8     q12, d30, d4\n\t"
796                 "vrshr.u16    q8, q14, #8\n\t"
797                 "vrshr.u16    q9, q13, #8\n\t"
798                 "vraddhn.u16  d6, q14, q8\n\t"
799                 "vrshr.u16    q8, q12, #8\n\t"
800                 "vraddhn.u16  d5, q13, q9\n\t"
801                 "vqadd.u8     d6, d6, d22\n\t"  /* moved up */
802                 "vraddhn.u16  d4, q12, q8\n\t"
803 /* intentionally don't calculate alpha */
804 /* result in d4-d6 */
805
806 /*              "vqadd.u8     d6, d6, d22\n\t"  ** moved up */
807                 "vqadd.u8     d5, d5, d21\n\t"
808                 "vqadd.u8     d4, d4, d20\n\t"
809
810 /* pack 8888 {d20-d23} to 0565 q10 */
811                 "vshll.u8     q10, d6, #8\n\t"
812                 "vshll.u8     q3, d5, #8\n\t"
813                 "vshll.u8     q2, d4, #8\n\t"
814                 "vsri.u16     q10, q3, #5\t\n"
815                 "vsri.u16     q10, q2, #11\t\n"
816
817                 "bne 2b\n\t"
818
819                 "1:\n\t"
820                 "vst1.16      {q10}, [%[keep_dst]]\n\t"
821
822                 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
823                 : [src] "r" (src)
824                 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
825                   "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
826                   "d30","d31"
827                 );
828 #endif
829         }
830     }
831     else
832     {
833         while (height--)
834         {
835             void *dst4 = 0, *dst2 = 0;
836
837             dst = dst_line;
838             dst_line += dst_stride;
839             mask = mask_line;
840             mask_line += mask_stride;
841             w = width;
842
843
844 #if 1 /* #ifndef USE_GCC_INLINE_ASM */
845             uint8x8_t alpha;
846             uint16x8_t dval, temp;
847             uint8x8x4_t sval8temp;
848
849             if (w & 4)
850             {
851                 alpha = vreinterpret_u8_u32 (vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (alpha), 1));
852                 dval = vreinterpretq_u16_u64 (vld1q_lane_u64 ((void *)dst, vreinterpretq_u64_u16 (dval), 1));
853                 dst4 = dst;
854                 mask += 4;
855                 dst += 4;
856             }
857             if (w & 2)
858             {
859                 alpha = vreinterpret_u8_u16 (vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (alpha), 1));
860                 dval = vreinterpretq_u16_u32 (vld1q_lane_u32 ((void *)dst, vreinterpretq_u32_u16 (dval), 1));
861                 dst2 = dst;
862                 mask += 2;
863                 dst += 2;
864             }
865             if (w & 1)
866             {
867                 alpha = vld1_lane_u8 ((void *)mask, alpha, 1);
868                 dval = vld1q_lane_u16 ((void *)dst, dval, 1);
869             }
870
871             sval8temp = neon8mul (sval8, alpha);
872             temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
873
874             if (w & 1)
875                 vst1q_lane_u16 ((void *)dst, temp, 1);
876             if (w & 2)
877                 vst1q_lane_u32 ((void *)dst2, vreinterpretq_u32_u16 (temp), 1);
878             if (w & 4)
879                 vst1q_lane_u64 ((void *)dst4, vreinterpretq_u64_u16 (temp), 1);
880 #else
881             /* This code has a bug (it does not pass the blitters-test) and is disabled above */
882             asm volatile (
883                 "vdup.32      d0, %[src]\n\t"
884                 "vdup.8       d1, d0[1]\n\t"
885                 "vdup.8       d2, d0[2]\n\t"
886                 "vdup.8       d3, d0[3]\n\t"
887                 "vdup.8       d0, d0[0]\n\t"
888
889                 "tst  %[w], #4\t\n"
890                 "beq  skip_load4\t\n"
891
892                 "vld1.64      {d25}, [%[dst]]\n\t"
893                 "vld1.32      {d31[1]}, [%[mask]]\n\t"
894                 "mov  %[dst4], %[dst]\t\n"
895                 "add  %[mask], %[mask], #4\t\n"
896                 "add  %[dst], %[dst], #4*2\t\n"
897
898                 "skip_load4:\t\n"
899                 "tst  %[w], #2\t\n"
900                 "beq  skip_load2\t\n"
901                 "vld1.32      {d24[1]}, [%[dst]]\n\t"
902                 "vld1.16      {d31[1]}, [%[mask]]\n\t"
903                 "mov  %[dst2], %[dst]\t\n"
904                 "add  %[mask], %[mask], #2\t\n"
905                 "add  %[dst], %[dst], #2*2\t\n"
906
907                 "skip_load2:\t\n"
908                 "tst  %[w], #1\t\n"
909                 "beq  skip_load1\t\n"
910                 "vld1.16      {d24[1]}, [%[dst]]\n\t"
911                 "vld1.8       {d31[1]}, [%[mask]]\n\t"
912
913                 "skip_load1:\t\n"
914 /* expand 0565 q12 to 8888 {d4-d7} */
915                 "vmovn.u16    d4, q12\t\n"
916                 "vshr.u16     q11, q12, #5\t\n"
917                 "vshr.u16     q10, q12, #6+5\t\n"
918                 "vmovn.u16    d5, q11\t\n"
919                 "vmovn.u16    d6, q10\t\n"
920                 "vshl.u8      d4, d4, #3\t\n"
921                 "vshl.u8      d5, d5, #2\t\n"
922                 "vshl.u8      d6, d6, #3\t\n"
923                 "vsri.u8      d4, d4, #5\t\n"
924                 "vsri.u8      d5, d5, #6\t\n"
925                 "vsri.u8      d6, d6, #5\t\n"
926
927                 "vmull.u8     q10, d31, d0\n\t"
928                 "vmull.u8     q11, d31, d1\n\t"
929                 "vmull.u8     q12, d31, d2\n\t"
930                 "vmull.u8     q13, d31, d3\n\t"
931                 "vrshr.u16    q8, q10, #8\n\t"
932                 "vrshr.u16    q9, q11, #8\n\t"
933                 "vraddhn.u16  d20, q10, q8\n\t"
934                 "vraddhn.u16  d21, q11, q9\n\t"
935                 "vrshr.u16    q9, q13, #8\n\t"
936                 "vrshr.u16    q8, q12, #8\n\t"
937                 "vraddhn.u16  d23, q13, q9\n\t"
938                 "vraddhn.u16  d22, q12, q8\n\t"
939
940 /* duplicate in 4/2/1 & 8pix vsns */
941                 "vmvn.8       d30, d23\n\t"
942                 "vmull.u8     q14, d30, d6\n\t"
943                 "vmull.u8     q13, d30, d5\n\t"
944                 "vmull.u8     q12, d30, d4\n\t"
945                 "vrshr.u16    q8, q14, #8\n\t"
946                 "vrshr.u16    q9, q13, #8\n\t"
947                 "vraddhn.u16  d6, q14, q8\n\t"
948                 "vrshr.u16    q8, q12, #8\n\t"
949                 "vraddhn.u16  d5, q13, q9\n\t"
950                 "vqadd.u8     d6, d6, d22\n\t"  /* moved up */
951                 "vraddhn.u16  d4, q12, q8\n\t"
952 /* intentionally don't calculate alpha */
953 /* result in d4-d6 */
954
955 /*              "vqadd.u8     d6, d6, d22\n\t"  ** moved up */
956                 "vqadd.u8     d5, d5, d21\n\t"
957                 "vqadd.u8     d4, d4, d20\n\t"
958
959 /* pack 8888 {d20-d23} to 0565 q10 */
960                 "vshll.u8     q10, d6, #8\n\t"
961                 "vshll.u8     q3, d5, #8\n\t"
962                 "vshll.u8     q2, d4, #8\n\t"
963                 "vsri.u16     q10, q3, #5\t\n"
964                 "vsri.u16     q10, q2, #11\t\n"
965
966                 "tst  %[w], #1\n\t"
967                 "beq skip_store1\t\n"
968                 "vst1.16      {d20[1]}, [%[dst]]\t\n"
969                 "skip_store1:\t\n"
970                 "tst  %[w], #2\n\t"
971                 "beq  skip_store2\t\n"
972                 "vst1.32      {d20[1]}, [%[dst2]]\t\n"
973                 "skip_store2:\t\n"
974                 "tst  %[w], #4\n\t"
975                 "beq skip_store4\t\n"
976                 "vst1.16      {d21}, [%[dst4]]\t\n"
977                 "skip_store4:\t\n"
978
979                 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [dst4] "+r" (dst4), [dst2] "+r" (dst2)
980                 : [src] "r" (src)
981                 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
982                   "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
983                   "d30","d31"
984                 );
985 #endif
986         }
987     }
988 }
989
990 static void
991 neon_composite_over_n_8_8888 (pixman_implementation_t * impl,
992                               pixman_op_t               op,
993                               pixman_image_t *          src_image,
994                               pixman_image_t *          mask_image,
995                               pixman_image_t *          dst_image,
996                               int32_t                   src_x,
997                               int32_t                   src_y,
998                               int32_t                   mask_x,
999                               int32_t                   mask_y,
1000                               int32_t                   dest_x,
1001                               int32_t                   dest_y,
1002                               int32_t                   width,
1003                               int32_t                   height)
1004 {
1005     uint32_t src, srca;
1006     uint32_t    *dst_line, *dst;
1007     uint8_t     *mask_line, *mask;
1008     int dst_stride, mask_stride;
1009     uint32_t w;
1010     uint8x8_t sval2;
1011     uint8x8x4_t sval8;
1012     uint8x8_t mask_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0101010100000000ULL));
1013     uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));
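    /*
     * In the narrow path below, mask_selector spreads the two mask bytes
     * (bytes 0 and 1 of a duplicated 16-bit load) across lanes 0-3 and
     * 4-7 respectively, while alpha_selector broadcasts each pixel's
     * alpha byte (lanes 3 and 7) across its four channels.
     */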
1014
1015     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
1016     
1017     /* bail out if fully transparent */
1018     srca = src >> 24;
1019     if (src == 0)
1020         return;
1021
1022     sval2 = vreinterpret_u8_u32 (vdup_n_u32 (src));
1023     sval8.val[0] = vdup_lane_u8 (sval2, 0);
1024     sval8.val[1] = vdup_lane_u8 (sval2, 1);
1025     sval8.val[2] = vdup_lane_u8 (sval2, 2);
1026     sval8.val[3] = vdup_lane_u8 (sval2, 3);
1027
1028     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1029     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1030
1031     if (width >= 8)
1032     {
1033         /* Use overlapping 8-pixel method, modified to avoid
1034          * rewritten dest being reused
1035          */
1036         while (height--)
1037         {
1038             uint32_t *keep_dst = 0;
1039
1040             dst = dst_line;
1041             dst_line += dst_stride;
1042             mask = mask_line;
1043             mask_line += mask_stride;
1044             w = width;
1045
1046 #ifndef USE_GCC_INLINE_ASM
1047             uint8x8_t alpha;
1048             uint8x8x4_t dval, temp;
1049
1050             alpha = vld1_u8 ((void *)mask);
1051             dval = vld4_u8 ((void *)dst);
1052             keep_dst = dst;
1053
1054             temp = neon8mul (sval8, alpha);
1055             dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
1056             temp = neon8qadd (temp, dval);
1057
1058             mask += (w & 7);
1059             dst += (w & 7);
1060             w -= (w & 7);
1061
1062             while (w)
1063             {
1064                 alpha = vld1_u8 ((void *)mask);
1065                 dval = vld4_u8 ((void *)dst);
1066
1067                 vst4_u8 ((void *)keep_dst, temp);
1068                 keep_dst = dst;
1069
1070                 temp = neon8mul (sval8, alpha);
1071                 dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
1072                 temp = neon8qadd (temp, dval);
1073
1074                 mask += 8;
1075                 dst += 8;
1076                 w -= 8;
1077             }
1078             vst4_u8 ((void *)keep_dst, temp);
1079 #else
1080             asm volatile (
1081                 "vdup.32      d0, %[src]\n\t"
1082                 "vdup.8       d1, d0[1]\n\t"
1083                 "vdup.8       d2, d0[2]\n\t"
1084                 "vdup.8       d3, d0[3]\n\t"
1085                 "vdup.8       d0, d0[0]\n\t"
1086
1087                 "vld4.8       {d4-d7}, [%[dst]]\n\t"
1088                 "vld1.8       {d31}, [%[mask]]\n\t"
1089                 "mov  %[keep_dst], %[dst]\n\t"
1090
1091                 "and  ip, %[w], #7\n\t"
1092                 "add  %[mask], %[mask], ip\n\t"
1093                 "add  %[dst], %[dst], ip, LSL#2\n\t"
1094                 "subs  %[w], %[w], ip\n\t"
1095                 "b 9f\n\t"
1096 /* LOOP */
1097                 "2:\n\t"
1098                 "vld4.8       {d4-d7}, [%[dst]]!\n\t"
1099                 "vld1.8       {d31}, [%[mask]]!\n\t"
1100                 "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
1101                 "sub  %[keep_dst], %[dst], #8*4\n\t"
1102                 "subs  %[w], %[w], #8\n\t"
1103                 "9:\n\t"
1104
1105                 "vmull.u8     q10, d31, d0\n\t"
1106                 "vmull.u8     q11, d31, d1\n\t"
1107                 "vmull.u8     q12, d31, d2\n\t"
1108                 "vmull.u8     q13, d31, d3\n\t"
1109                 "vrshr.u16    q8, q10, #8\n\t"
1110                 "vrshr.u16    q9, q11, #8\n\t"
1111                 "vraddhn.u16  d20, q10, q8\n\t"
1112                 "vraddhn.u16  d21, q11, q9\n\t"
1113                 "vrshr.u16    q9, q13, #8\n\t"
1114                 "vrshr.u16    q8, q12, #8\n\t"
1115                 "vraddhn.u16  d23, q13, q9\n\t"
1116                 "vraddhn.u16  d22, q12, q8\n\t"
1117
1118                 "vmvn.8       d30, d23\n\t"
1119                 "vmull.u8     q12, d30, d4\n\t"
1120                 "vmull.u8     q13, d30, d5\n\t"
1121                 "vmull.u8     q14, d30, d6\n\t"
1122                 "vmull.u8     q15, d30, d7\n\t"
1123
1124                 "vrshr.u16    q8, q12, #8\n\t"
1125                 "vrshr.u16    q9, q13, #8\n\t"
1126                 "vraddhn.u16  d4, q12, q8\n\t"
1127                 "vrshr.u16    q8, q14, #8\n\t"
1128                 "vraddhn.u16  d5, q13, q9\n\t"
1129                 "vrshr.u16    q9, q15, #8\n\t"
1130                 "vraddhn.u16  d6, q14, q8\n\t"
1131                 "vraddhn.u16  d7, q15, q9\n\t"
1132 /* result in d4-d7 */
1133
1134                 "vqadd.u8     d20, d4, d20\n\t"
1135                 "vqadd.u8     d21, d5, d21\n\t"
1136                 "vqadd.u8     d22, d6, d22\n\t"
1137                 "vqadd.u8     d23, d7, d23\n\t"
1138
1139                 "bne 2b\n\t"
1140
1141                 "1:\n\t"
1142                 "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
1143
1144                 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
1145                 : [src] "r" (src)
1146                 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1147                 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
1148                 "d30", "d31"
1149                 );
1150 #endif
1151         }
1152     }
1153     else
1154     {
1155         while (height--)
1156         {
1157             uint8x8_t alpha;
1158
1159             dst = dst_line;
1160             dst_line += dst_stride;
1161             mask = mask_line;
1162             mask_line += mask_stride;
1163             w = width;
1164
1165             while (w >= 2)
1166             {
1167                 uint8x8_t dval, temp, res;
1168
1169                 alpha = vtbl1_u8 (
1170                     vreinterpret_u8_u16 (vld1_dup_u16 ((void *)mask)), mask_selector);
1171                 dval = vld1_u8 ((void *)dst);
1172
1173                 temp = neon2mul (sval2, alpha);
1174                 res = vqadd_u8 (
1175                     temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));
1176
1177                 vst1_u8 ((void *)dst, res);
1178
1179                 mask += 2;
1180                 dst += 2;
1181                 w -= 2;
1182             }
1183
1184             if (w)
1185             {
1186                 uint8x8_t dval, temp, res;
1187
1188                 alpha = vtbl1_u8 (vld1_dup_u8 ((void *)mask), mask_selector);
1189                 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst));
1190
1191                 temp = neon2mul (sval2, alpha);
1192                 res = vqadd_u8 (
1193                     temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));
1194
1195                 vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (res), 0);
1196             }
1197         }
1198     }
1199 }
1200
1201 static void
1202 neon_composite_add_8888_8_8 (pixman_implementation_t * impl,
1203                              pixman_op_t               op,
1204                              pixman_image_t *          src_image,
1205                              pixman_image_t *          mask_image,
1206                              pixman_image_t *          dst_image,
1207                              int32_t                   src_x,
1208                              int32_t                   src_y,
1209                              int32_t                   mask_x,
1210                              int32_t                   mask_y,
1211                              int32_t                   dest_x,
1212                              int32_t                   dest_y,
1213                              int32_t                   width,
1214                              int32_t                   height)
1215 {
1216     uint8_t     *dst_line, *dst;
1217     uint8_t     *mask_line, *mask;
1218     int dst_stride, mask_stride;
1219     uint32_t w;
1220     uint32_t src;
1221     uint8x8_t sa;
1222
1223     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
1224     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1225     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
1226     sa = vdup_n_u8 ((src) >> 24);
1227
1228     if (width >= 8)
1229     {
1230         /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
1231         while (height--)
1232         {
1233             dst = dst_line;
1234             dst_line += dst_stride;
1235             mask = mask_line;
1236             mask_line += mask_stride;
1237             w = width;
1238
1239             uint8x8_t mval, dval, res;
1240             uint8_t     *keep_dst;
1241
1242             mval = vld1_u8 ((void *)mask);
1243             dval = vld1_u8 ((void *)dst);
1244             keep_dst = dst;
1245
1246             res = vqadd_u8 (neon2mul (mval, sa), dval);
1247
1248             mask += (w & 7);
1249             dst += (w & 7);
1250             w -= w & 7;
1251
1252             while (w)
1253             {
1254                 mval = vld1_u8 ((void *)mask);
1255                 dval = vld1_u8 ((void *)dst);
1256                 vst1_u8 ((void *)keep_dst, res);
1257                 keep_dst = dst;
1258
1259                 res = vqadd_u8 (neon2mul (mval, sa), dval);
1260
1261                 mask += 8;
1262                 dst += 8;
1263                 w -= 8;
1264             }
1265             vst1_u8 ((void *)keep_dst, res);
1266         }
1267     }
1268     else
1269     {
1270         /* Use 4/2/1 load/store method to handle 1-7 pixels */
1271         while (height--)
1272         {
1273             dst = dst_line;
1274             dst_line += dst_stride;
1275             mask = mask_line;
1276             mask_line += mask_stride;
1277             w = width;
1278
1279             uint8x8_t mval = sa, dval = sa, res;
1280             uint8_t *dst4 = 0, *dst2 = 0;
1281
1282             if (w & 4)
1283             {
1284                 mval = vreinterpret_u8_u32 (
1285                     vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (mval), 1));
1286                 dval = vreinterpret_u8_u32 (
1287                     vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1));
1288
1289                 dst4 = dst;
1290                 mask += 4;
1291                 dst += 4;
1292             }
1293
1294             if (w & 2)
1295             {
1296                 mval = vreinterpret_u8_u16 (
1297                     vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (mval), 1));
1298                 dval = vreinterpret_u8_u16 (
1299                     vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1));
1300                 dst2 = dst;
1301                 mask += 2;
1302                 dst += 2;
1303             }
1304
1305             if (w & 1)
1306             {
1307                 mval = vld1_lane_u8 (mask, mval, 1);
1308                 dval = vld1_lane_u8 (dst, dval, 1);
1309             }
1310
1311             res = vqadd_u8 (neon2mul (mval, sa), dval);
1312
1313             if (w & 1)
1314                 vst1_lane_u8 (dst, res, 1);
1315             if (w & 2)
1316                 vst1_lane_u16 ((void *)dst2, vreinterpret_u16_u8 (res), 1);
1317             if (w & 4)
1318                 vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (res), 1);
1319         }
1320     }
1321 }
1322
1323 #ifdef USE_GCC_INLINE_ASM
1324
1325 static void
1326 neon_composite_src_16_16 (pixman_implementation_t * impl,
1327                           pixman_op_t               op,
1328                           pixman_image_t *          src_image,
1329                           pixman_image_t *          mask_image,
1330                           pixman_image_t *          dst_image,
1331                           int32_t                   src_x,
1332                           int32_t                   src_y,
1333                           int32_t                   mask_x,
1334                           int32_t                   mask_y,
1335                           int32_t                   dest_x,
1336                           int32_t                   dest_y,
1337                           int32_t                   width,
1338                           int32_t                   height)
1339 {
1340     uint16_t    *dst_line, *src_line;
1341     uint32_t dst_stride, src_stride;
1342
1343     if (!height || !width)
1344         return;
1345
1346     /* We simply copy 16-bit-aligned pixels from one place to another. */
1347     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
1348     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1349
1350     /* Preload the first input scanline */
1351     {
1352         uint16_t *src_ptr = src_line;
1353         uint32_t count = width;
1354
1355         asm volatile (
1356             "0: @ loop                                                  \n"
1357             "   subs    %[count], %[count], #32                         \n"
1358             "   pld     [%[src]]                                        \n"
1359             "   add     %[src], %[src], #64                             \n"
1360             "   bgt 0b                                                  \n"
1361
1362             /* Clobbered input registers marked as input/outputs */
1363             : [src] "+r" (src_ptr), [count] "+r" (count)
1364             :     /* no unclobbered inputs */
1365             : "cc"
1366             );
1367     }
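    /*
     * The loop above issues one pld per 64 bytes: each iteration consumes
     * a count of 32 halfword pixels and advances the pointer by 64 bytes,
     * matching the cache-line size assumed below.
     */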
1368
1369     while (height--)
1370     {
1371         uint16_t *dst_ptr = dst_line;
1372         uint16_t *src_ptr = src_line;
1373         uint32_t count = width;
1374         uint32_t tmp = 0;
1375
1376         /* Uses multi-register access and preloading to maximise bandwidth.
1377          * Each pixel is one halfword, so a quadword contains 8px.
1378          * The preload distance assumes a 64-byte cache line.
1379          */
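        /*
         * The copy decomposes the remaining count by powers of two:
         * 64-pixel blocks in the main loop, then optional 32-, 16-, 8-,
         * 4-, 2- and 1-pixel fragments, each guarded by a tst on the
         * corresponding bit of the count.
         */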
1380         asm volatile (
1381             "   cmp       %[count], #64                         \n"
1382             "   blt 1f    @ skip oversized fragments            \n"
1383             "0: @ start with eight quadwords at a time          \n"
1384             /* preload from next scanline */
1385             "   pld       [%[src], %[src_stride], LSL #1]       \n"
1386             "   sub       %[count], %[count], #64               \n"
1387             "   vld1.16   {d16, d17, d18, d19}, [%[src]]!               \n"
1388             "   vld1.16   {d20, d21, d22, d23}, [%[src]]!               \n"
1389             /* preload from next scanline */
1390             "   pld       [%[src], %[src_stride], LSL #1]       \n"
1391             "   vld1.16   {d24, d25, d26, d27}, [%[src]]!               \n"
1392             "   vld1.16   {d28, d29, d30, d31}, [%[src]]!               \n"
1393             "   cmp       %[count], #64                         \n"
1394             "   vst1.16   {d16, d17, d18, d19}, [%[dst]]!               \n"
1395             "   vst1.16   {d20, d21, d22, d23}, [%[dst]]!               \n"
1396             "   vst1.16   {d24, d25, d26, d27}, [%[dst]]!               \n"
1397             "   vst1.16   {d28, d29, d30, d31}, [%[dst]]!               \n"
1398             "   bge 0b                                          \n"
1399             "   cmp       %[count], #0                          \n"
1400             "   beq 7f    @ aligned fastpath                    \n"
1401             "1: @ four quadwords                                \n"
1402             "   tst       %[count], #32                         \n"
1403             "   beq 2f    @ skip oversized fragment             \n"
1404             /* preload from next scanline */
1405             "   pld       [%[src], %[src_stride], LSL #1]       \n"
1406             "   vld1.16   {d16, d17, d18, d19}, [%[src]]!               \n"
1407             "   vld1.16   {d20, d21, d22, d23}, [%[src]]!               \n"
1408             "   vst1.16   {d16, d17, d18, d19}, [%[dst]]!               \n"
1409             "   vst1.16   {d20, d21, d22, d23}, [%[dst]]!               \n"
1410             "2: @ two quadwords                                 \n"
1411             "   tst       %[count], #16                         \n"
1412             "   beq 3f    @ skip oversized fragment             \n"
1413             /* preload from next scanline */
1414             "   pld       [%[src], %[src_stride], LSL #1]       \n"
1415             "   vld1.16   {d16, d17, d18, d19}, [%[src]]!               \n"
1416             "   vst1.16   {d16, d17, d18, d19}, [%[dst]]!               \n"
1417             "3: @ one quadword                                  \n"
1418             "   tst       %[count], #8                          \n"
1419             "   beq 4f    @ skip oversized fragment             \n"
1420             "   vld1.16   {d16, d17}, [%[src]]!                 \n"
1421             "   vst1.16   {d16, d17}, [%[dst]]!                 \n"
1422             "4: @ one doubleword                                \n"
1423             "   tst       %[count], #4                          \n"
1424             "   beq 5f    @ skip oversized fragment             \n"
1425             "   vld1.16   {d16}, [%[src]]!                      \n"
1426             "   vst1.16   {d16}, [%[dst]]!                      \n"
1427             "5: @ one word                                      \n"
1428             "   tst       %[count], #2                          \n"
1429             "   beq 6f    @ skip oversized fragment             \n"
1430             "   ldr       %[tmp], [%[src]], #4                  \n"
1431             "   str       %[tmp], [%[dst]], #4                  \n"
1432             "6: @ one halfword                                  \n"
1433             "   tst       %[count], #1                          \n"
1434             "   beq 7f    @ skip oversized fragment             \n"
1435             "   ldrh      %[tmp], [%[src]]                      \n"
1436             "   strh      %[tmp], [%[dst]]                      \n"
1437             "7: @ end                                           \n"
1438
1439             /* Clobbered input registers marked as input/outputs */
1440             : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr),
1441               [count] "+r" (count), [tmp] "+r" (tmp)
1442
1443               /* Unclobbered input */
1444             : [src_stride] "r" (src_stride)
1445
1446               /* Clobbered vector registers */
1447             : "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
1448               "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc", "memory"
1449             );
1450
1451         src_line += src_stride;
1452         dst_line += dst_stride;
1453     }
1454 }
1455
1456 #endif /* USE_GCC_INLINE_ASM */
1457
1458 static void
1459 neon_composite_src_24_16 (pixman_implementation_t * impl,
1460                           pixman_op_t               op,
1461                           pixman_image_t *          src_image,
1462                           pixman_image_t *          mask_image,
1463                           pixman_image_t *          dst_image,
1464                           int32_t                   src_x,
1465                           int32_t                   src_y,
1466                           int32_t                   mask_x,
1467                           int32_t                   mask_y,
1468                           int32_t                   dest_x,
1469                           int32_t                   dest_y,
1470                           int32_t                   width,
1471                           int32_t                   height)
1472 {
1473     uint16_t    *dst_line;
1474     uint32_t    *src_line;
1475     uint32_t dst_stride, src_stride;
1476
1477     if (!width || !height)
1478         return;
1479
1480     /* We simply copy pixels from one place to another,
1481      * assuming that the source's alpha is opaque.
1482      */
1483     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1484     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1485
1486     /* Preload the first input scanline */
1487     {
1488         uint8_t *src_ptr = (uint8_t*) src_line;
1489         uint32_t count = (width + 15) / 16;
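        /* One pld per assumed 64-byte cache line: at 32 bpp that is
         * 16 pixels, hence (width + 15) / 16 preloads per scanline.
         */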
1490
1491 #ifdef USE_GCC_INLINE_ASM
1492         asm volatile (
1493             "0: @ loop                                          \n"
1494             "   subs    %[count], %[count], #1                  \n"
1495             "   pld     [%[src]]                                \n"
1496             "   add     %[src], %[src], #64                     \n"
1497             "   bgt 0b                                          \n"
1498
1499             /* Clobbered input registers marked as input/outputs */
1500             : [src] "+r" (src_ptr), [count] "+r" (count)
1501             :     /* no unclobbered inputs */
1502             : "cc"
1503             );
1504 #else
1505         do
1506         {
1507             __pld (src_ptr);
1508             src_ptr += 64;
1509         }
1510         while (--count);
1511 #endif
1512     }
1513
1514     while (height--)
1515     {
1516         uint16_t *dst_ptr = dst_line;
1517         uint32_t *src_ptr = src_line;
1518         uint32_t count = width;
1519         const uint32_t rb_mask = 0x1F;
1520         const uint32_t g_mask = 0x3F;
1521
1522         /* If you're going to complain about a goto, take a long hard look
1523          * at the massive blocks of assembler this skips over.  ;-)
1524          */
1525         if (count < 8)
1526             goto small_stuff;
1527
1528 #ifdef USE_GCC_INLINE_ASM
1529
1530         /* This is not as aggressive as the RGB565-source case.
1531          * Generally the source is in cached RAM when the formats are
1532          * different, so we use preload.
1533          * 
1534          * We don't need to blend, so we are not reading from the
1535          * uncached framebuffer.
1536          */
1537         asm volatile (
1538             "   cmp       %[count], #16                         \n"
1539             "   blt 1f    @ skip oversized fragments            \n"
1540             "0: @ start with sixteen pixels at a time           \n"
1541             "   sub       %[count], %[count], #16               \n"
1542             "   pld      [%[src], %[src_stride], lsl #2]        @ preload from next scanline                    \n"
1543             "   vld4.8    {d0, d1, d2, d3}, [%[src]]!           @ d3 is alpha and ignored, d2-0 are rgb.        \n"
1544             "   vld4.8    {d4, d5, d6, d7}, [%[src]]!           @ d7 is alpha and ignored, d6-4 are rgb.        \n"
1545             "   vshll.u8  q8, d2, #8                            @ expand first red for repacking                \n"
1546             "   vshll.u8  q10, d1, #8                           @ expand first green for repacking              \n"
1547             "   vshll.u8  q11, d0, #8                           @ expand first blue for repacking               \n"
1548             "   vshll.u8  q9, d6, #8                            @ expand second red for repacking               \n"
1549             "   vsri.u16  q8, q10, #5                           @ insert first green after red                  \n"
1550             "   vshll.u8  q10, d5, #8                           @ expand second green for repacking             \n"
1551             "   vsri.u16  q8, q11, #11                          @ insert first blue after green                 \n"
1552             "   vshll.u8  q11, d4, #8                           @ expand second blue for repacking              \n"
1553             "   vsri.u16  q9, q10, #5                           @ insert second green after red                 \n"
1554             "   vsri.u16  q9, q11, #11                          @ insert second blue after green                \n"
1555             "   cmp       %[count], #16                         \n"
1556             "   vst1.16   {d16, d17, d18, d19}, [%[dst]]!          @ store 16 pixels                            \n"
1557             "   bge 0b                                          \n"
1558             "1: @ end of main loop                              \n"
1559             "   cmp       %[count], #8                          @ can we still do an 8-pixel block?             \n"
1560             "   blt 2f                                          \n"
1561             "   sub       %[count], %[count], #8                \n"
1562             "   pld      [%[src], %[src_stride], lsl #2]        @ preload from next scanline                    \n"
1563             "   vld4.8    {d0, d1, d2, d3}, [%[src]]!           @ d3 is alpha and ignored, d2-0 are rgb.        \n"
1564             "   vshll.u8  q8, d2, #8                            @ expand first red for repacking                \n"
1565             "   vshll.u8  q10, d1, #8                           @ expand first green for repacking              \n"
1566             "   vshll.u8  q11, d0, #8                           @ expand first blue for repacking               \n"
1567             "   vsri.u16  q8, q10, #5                           @ insert first green after red                  \n"
1568             "   vsri.u16  q8, q11, #11                          @ insert first blue after green                 \n"
1569             "   vst1.16   {d16, d17}, [%[dst]]!          @ store 8 pixels                               \n"
1570             "2: @ end                                           \n"
1571
1572             /* Clobbered input and working registers marked as input/outputs */
1573             : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr), [count] "+r" (count)
1574
1575               /* Unclobbered input */
1576             : [src_stride] "r" (src_stride)
1577
1578               /* Clobbered vector registers */
1579
1580               /* NB: these are the quad aliases of the
1581                * double registers used in the asm
1582                */
1583             : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17",
1584               "d18", "d19", "d20", "d21", "d22", "d23", "cc", "memory"
1585             );
1586 #else
1587         /* A copy of the above code, in intrinsics-form. */
1588         while (count >= 16)
1589         {
1590             uint8x8x4_t pixel_set_a, pixel_set_b;
1591             uint16x8_t red_a, green_a, blue_a;
1592             uint16x8_t red_b, green_b, blue_b;
1593             uint16x8_t dest_pixels_a, dest_pixels_b;
1594
1595             count -= 16;
1596             __pld (src_ptr + src_stride);
1597             pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
1598             pixel_set_b = vld4_u8 ((uint8_t*)(src_ptr + 8));
1599             src_ptr += 16;
1600
1601             red_a   = vshll_n_u8 (pixel_set_a.val[2], 8);
1602             green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
1603             blue_a  = vshll_n_u8 (pixel_set_a.val[0], 8);
1604             
1605             red_b   = vshll_n_u8 (pixel_set_b.val[2], 8);
1606             green_b = vshll_n_u8 (pixel_set_b.val[1], 8);
1607             blue_b  = vshll_n_u8 (pixel_set_b.val[0], 8);
1608             
1609             dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
1610             dest_pixels_b = vsriq_n_u16 (red_b, green_b, 5);
1611             
1612             dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
1613             dest_pixels_b = vsriq_n_u16 (dest_pixels_b, blue_b, 11);
1614
1615             /* There doesn't seem to be an intrinsic for the
1616              * double-quadword variant
1617              */
1618             vst1q_u16 (dst_ptr, dest_pixels_a);
1619             vst1q_u16 (dst_ptr + 8, dest_pixels_b);
1620             dst_ptr += 16;
1621         }
1622
1623         /* 8-pixel loop */
1624         if (count >= 8)
1625         {
1626             uint8x8x4_t pixel_set_a;
1627             uint16x8_t red_a, green_a, blue_a;
1628             uint16x8_t dest_pixels_a;
1629
1630             __pld (src_ptr + src_stride);
1631             count -= 8;
1632             pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
1633             src_ptr += 8;
1634
1635             red_a   = vshll_n_u8 (pixel_set_a.val[2], 8);
1636             green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
1637             blue_a  = vshll_n_u8 (pixel_set_a.val[0], 8);
1638
1639             dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
1640             dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
1641
1642             vst1q_u16 (dst_ptr, dest_pixels_a);
1643             dst_ptr += 8;
1644         }
1645
1646 #endif  /* USE_GCC_INLINE_ASM */
1647
1648     small_stuff:
1649         if (count)
1650             __pld (src_ptr + src_stride);
1651
1652         while (count >= 2)
1653         {
1654             uint32_t src_pixel_a = *src_ptr++;
1655             uint32_t src_pixel_b = *src_ptr++;
1656
1657             /* ARM is really good at shift-then-ALU ops. */
1658             /* This should be a total of six shift-ANDs and five shift-ORs. */
1659             uint32_t dst_pixels_a;
1660             uint32_t dst_pixels_b;
1661
1662             dst_pixels_a  = ((src_pixel_a >>  3) & rb_mask);
1663             dst_pixels_a |= ((src_pixel_a >> 10) &  g_mask) << 5;
1664             dst_pixels_a |= ((src_pixel_a >> 19) & rb_mask) << 11;
1665
1666             dst_pixels_b  = ((src_pixel_b >>  3) & rb_mask);
1667             dst_pixels_b |= ((src_pixel_b >> 10) &  g_mask) << 5;
1668             dst_pixels_b |= ((src_pixel_b >> 19) & rb_mask) << 11;
1669
1670             /* little-endian mode only */
1671             *((uint32_t*) dst_ptr) = dst_pixels_a | (dst_pixels_b << 16);
1672             dst_ptr += 2;
1673             count -= 2;
1674         }
1675
1676         if (count)
1677         {
1678             uint32_t src_pixel = *src_ptr++;
1679
1680             /* ARM is really good at shift-then-ALU ops.
1681              * This block should end up as three shift-ANDs
1682              * and two shift-ORs.
1683              */
1684             uint32_t tmp_blue  = (src_pixel >>  3) & rb_mask;
1685             uint32_t tmp_green = (src_pixel >> 10) & g_mask;
1686             uint32_t tmp_red   = (src_pixel >> 19) & rb_mask;
1687             uint16_t dst_pixel = (tmp_red << 11) | (tmp_green << 5) | tmp_blue;
1688
1689             *dst_ptr++ = dst_pixel;
1690             count--;
1691         }
1692
1693         src_line += src_stride;
1694         dst_line += dst_stride;
1695     }
1696 }
1697
1698 static pixman_bool_t
1699 pixman_fill_neon (uint32_t *bits,
1700                   int       stride,
1701                   int       bpp,
1702                   int       x,
1703                   int       y,
1704                   int       width,
1705                   int       height,
1706                   uint32_t  _xor)
1707 {
1708     uint32_t byte_stride, color;
1709     char *dst;
1710
1711     /* stride is always a multiple of 32-bit units in pixman */
1712     byte_stride = stride * sizeof(uint32_t);
1713
1714     switch (bpp)
1715     {
1716     case 8:
1717         dst = ((char *) bits) + y * byte_stride + x;
1718         _xor &= 0xff;
1719         color = _xor << 24 | _xor << 16 | _xor << 8 | _xor;
1720         break;
1721
1722     case 16:
1723         dst = ((char *) bits) + y * byte_stride + x * 2;
1724         _xor &= 0xffff;
1725         color = _xor << 16 | _xor;
1726         width *= 2;         /* width to bytes */
1727         break;
1728
1729     case 32:
1730         dst = ((char *) bits) + y * byte_stride + x * 4;
1731         color = _xor;
1732         width *= 4;         /* width to bytes */
1733         break;
1734
1735     default:
1736         return FALSE;
1737     }
1738
1739 #ifdef USE_GCC_INLINE_ASM
1740     if (width < 16)
1741     {
1742         /* We have a special case for widths that are too small to allow
1743          * wide 128-bit stores anyway. We don't waste time trying to align
1744          * writes, since there are only very few of them per scanline.
1745          */
1746         asm volatile (
1747             "cmp                %[height], #0\n"/* Check if empty fill */
1748             "beq                3f\n"
1749             "vdup.32    d0, %[color]\n"/* Duplicate the colour into a NEON register */
1750
1751             /* Check whether the width can easily be handled by a single
1752              * operation per scanline. This significantly reduces the number
1753              * of test/branch instructions for each scanline.
1754              */
1755             "cmp                %[width], #8\n"
1756             "beq                4f\n"
1757             "cmp                %[width], #4\n"
1758             "beq                5f\n"
1759             "cmp                %[width], #2\n"
1760             "beq                6f\n"
1761
1762             /* Loop starts here for each scanline */
1763             "1:\n"
1764             "mov                r4, %[dst]\n" /* Starting address of the current line */
1765             "tst                %[width], #8\n"
1766             "beq                2f\n"
1767             "vst1.8             {d0}, [r4]!\n"
1768             "2:\n"
1769             "tst                %[width], #4\n"
1770             "beq                2f\n"
1771             "str                %[color], [r4], #4\n"
1772             "2:\n"
1773             "tst                %[width], #2\n"
1774             "beq                2f\n"
1775             "strh               %[color], [r4], #2\n"
1776             "2:\n"
1777             "tst                %[width], #1\n"
1778             "beq                2f\n"
1779             "strb               %[color], [r4], #1\n"
1780             "2:\n"
1781
1782             "subs               %[height], %[height], #1\n"
1783             "add                %[dst], %[dst], %[byte_stride]\n"
1784             "bne                1b\n"
1785             "b          3f\n"
1786
1787             /* Special fillers for those widths that we can do with single operation */
1788             "4:\n"
1789             "subs               %[height], %[height], #1\n"
1790             "vst1.8             {d0}, [%[dst]]\n"
1791             "add                %[dst], %[dst], %[byte_stride]\n"
1792             "bne                4b\n"
1793             "b          3f\n"
1794
1795             "5:\n"
1796             "subs               %[height], %[height], #1\n"
1797             "str                %[color], [%[dst]]\n"
1798             "add                %[dst], %[dst], %[byte_stride]\n"
1799             "bne                5b\n"
1800             "b          3f\n"
1801
1802             "6:\n"
1803             "subs               %[height], %[height], #1\n"
1804             "strh               %[color], [%[dst]]\n"
1805             "add                %[dst], %[dst], %[byte_stride]\n"
1806             "bne                6b\n"
1807
1808             "3:\n"
1809             : [height] "+r" (height), [dst] "+r" (dst)
1810             : [color] "r" (color), [width] "r" (width),
1811               [byte_stride] "r" (byte_stride)
1812             : "memory", "cc", "d0", "r4");
1813     }
1814     else
1815     {
1816         asm volatile (
1817             "cmp                %[height], #0\n"/* Check if empty fill */
1818             "beq                5f\n"
1819             "vdup.32    q0, %[color]\n"/* Duplicate the colour into a NEON register */
1820
1821             /* Loop starts here for each scanline */
1822             "1:\n"
1823             "mov                r4, %[dst]\n"/* Starting address of the current line */
1824             "mov                r5, %[width]\n"/* We're going to write this many bytes */
1825             "ands               r6, r4, #15\n"/* Are we at the 128-bit aligned address? */
1826             "beq                2f\n"/* Jump to the best case */
1827
1828             /* We're not 128-bit aligned. However, we know that we can reach the
1829                next aligned location, since the fill is at least 16 bytes wide */
1830             "rsb                r6, r6, #16\n" /* We would need to go forward this much */
1831             "sub                r5, r5, r6\n"/* Update bytes left */
1832             "tst                r6, #1\n"
1833             "beq                6f\n"
1834             "vst1.8             {d0[0]}, [r4]!\n"/* Store a byte, now we are 16-bit aligned */
1835             "6:\n"
1836             "tst                r6, #2\n"
1837             "beq                6f\n"
1838             "vst1.16    {d0[0]}, [r4, :16]!\n"/* Store a halfword, now we are 32-bit aligned */
1839             "6:\n"
1840             "tst                r6, #4\n"
1841             "beq                6f\n"
1842             "vst1.32    {d0[0]}, [r4, :32]!\n"/* Store a word, now we're 64-bit aligned */
1843             "6:\n"
1844             "tst                r6, #8\n"
1845             "beq                2f\n"
1846             "vst1.64    {d0}, [r4, :64]!\n"/* Store a doubleword, now we're 128-bit aligned */
1847
1848             /* The good case: We're 128-bit aligned for this scanline */
1849             "2:\n"
1850             "and                r6, r5, #15\n"/* Number of trailing bytes */
1851             "cmp                r5, r6\n"/* Do we have at least one qword to write? */
1852             "beq                6f\n"/* No, we just write the tail */
1853             "lsr                r5, r5, #4\n"/* This many full qwords to write */
1854
1855             /* The main block: Do 128-bit aligned writes */
1856             "3:\n"
1857             "subs               r5, r5, #1\n"
1858             "vst1.64    {d0, d1}, [r4, :128]!\n"
1859             "bne                3b\n"
1860
1861             /* Handle the trailing bytes: Do 64, 32, 16 and 8-bit aligned writes as needed.
1862                We know that we're currently at 128-bit aligned address, so we can just
1863                pick the biggest operations that the remaining write width allows */
1864             "6:\n"
1865             "cmp                r6, #0\n"
1866             "beq                4f\n"
1867             "tst                r6, #8\n"
1868             "beq                6f\n"
1869             "vst1.64    {d0}, [r4, :64]!\n"
1870             "6:\n"
1871             "tst                r6, #4\n"
1872             "beq                6f\n"
1873             "vst1.32    {d0[0]}, [r4, :32]!\n"
1874             "6:\n"
1875             "tst                r6, #2\n"
1876             "beq                6f\n"
1877             "vst1.16    {d0[0]}, [r4, :16]!\n"
1878             "6:\n"
1879             "tst                r6, #1\n"
1880             "beq                4f\n"
1881             "vst1.8             {d0[0]}, [r4]!\n"
1882             "4:\n"
1883
1884             /* Handle the next scanline */
1885             "subs               %[height], %[height], #1\n"
1886             "add                %[dst], %[dst], %[byte_stride]\n"
1887             "bne                1b\n"
1888             "5:\n"
1889             : [height] "+r" (height), [dst] "+r" (dst)
1890             : [color] "r" (color), [width] "r" (width),
1891               [byte_stride] "r" (byte_stride)
1892             : "memory", "cc", "d0", "d1", "r4", "r5", "r6");
1893     }
1894     return TRUE;
1895
1896 #else
1897
1898     /* TODO: intrinsic version for armcc */
1899     return FALSE;
1900
1901 #endif
1902 }
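
/* Illustrative sketch only (not part of the original file): a plain C
 * outline of the head/body/tail strategy used by the wide-fill assembly
 * above, and one possible starting point for the "intrinsic version for
 * armcc" TODO. The helper name is hypothetical, the fill is simplified to
 * a single repeated byte, and memset stands in for the 128-bit aligned
 * vst1.64 stores of the real code.
 */
static inline void
fill_scanline_sketch (uint8_t *line, uint32_t width_bytes, uint8_t fill_byte)
{
    /* Head: single-byte stores until the pointer is 16-byte aligned,
     * mirroring the strb/strh/str/vst1 ramp-up in the assembly.
     */
    while (((unsigned long) line & 15) && width_bytes)
    {
        *line++ = fill_byte;
        width_bytes--;
    }

    /* Body: full 16-byte blocks, done with 128-bit aligned stores in the
     * assembly; memset is used here purely for illustration.
     */
    memset (line, fill_byte, width_bytes & ~15u);
    line += width_bytes & ~15u;
    width_bytes &= 15u;

    /* Tail: the assembly picks the largest store that still fits
     * (8, 4, 2, then 1 bytes); plain byte stores stand in for that here.
     */
    while (width_bytes--)
        *line++ = fill_byte;
}

/* A caller would invoke such a helper once per scanline, advancing the
 * pointer by byte_stride, much as the assembly loops above do with %[dst].
 */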
1903
1904 /* TODO: is there a more generic way of doing this being introduced? */
1905 #define NEON_SCANLINE_BUFFER_PIXELS (1024)
1906
1907 static inline void
1908 neon_quadword_copy (void *   dst,
1909                     void *   src,
1910                     uint32_t count,         /* of quadwords */
1911                     uint32_t trailer_count  /* of bytes */)
1912 {
1913     uint8_t *t_dst = dst, *t_src = src;
1914
1915     /* Uses aligned multi-register loads to maximise read bandwidth
1916      * on uncached memory such as framebuffers.
1917      * The accesses do not have the aligned qualifiers, so that the copy
1918      * may convert between aligned-uncached and unaligned-cached memory.
1919      * It is assumed that the CPU can infer alignedness from the address.
1920      */
1921
1922 #ifdef USE_GCC_INLINE_ASM
1923
1924     asm volatile (
1925         "       cmp       %[count], #8                          \n"
1926         "       blt 1f    @ skip oversized fragments            \n"
1927         "0: @ start with eight quadwords at a time              \n"
1928         "       sub       %[count], %[count], #8                \n"
1929         "       vld1.8    {d16, d17, d18, d19}, [%[src]]!               \n"
1930         "       vld1.8    {d20, d21, d22, d23}, [%[src]]!               \n"
1931         "       vld1.8    {d24, d25, d26, d27}, [%[src]]!               \n"
1932         "       vld1.8    {d28, d29, d30, d31}, [%[src]]!               \n"
1933         "       cmp       %[count], #8                          \n"
1934         "       vst1.8    {d16, d17, d18, d19}, [%[dst]]!               \n"
1935         "       vst1.8    {d20, d21, d22, d23}, [%[dst]]!               \n"
1936         "       vst1.8    {d24, d25, d26, d27}, [%[dst]]!               \n"
1937         "       vst1.8    {d28, d29, d30, d31}, [%[dst]]!               \n"
1938         "       bge 0b                                          \n"
1939         "1: @ four quadwords                                    \n"
1940         "       tst       %[count], #4                          \n"
1941         "       beq 2f    @ skip oversized fragment             \n"
1942         "       vld1.8    {d16, d17, d18, d19}, [%[src]]!               \n"
1943         "       vld1.8    {d20, d21, d22, d23}, [%[src]]!               \n"
1944         "       vst1.8    {d16, d17, d18, d19}, [%[dst]]!               \n"
1945         "       vst1.8    {d20, d21, d22, d23}, [%[dst]]!               \n"
1946         "2: @ two quadwords                                     \n"
1947         "       tst       %[count], #2                          \n"
1948         "       beq 3f    @ skip oversized fragment             \n"
1949         "       vld1.8    {d16, d17, d18, d19}, [%[src]]!               \n"
1950         "       vst1.8    {d16, d17, d18, d19}, [%[dst]]!               \n"
1951         "3: @ one quadword                                      \n"
1952         "       tst       %[count], #1                          \n"
1953         "       beq 4f    @ skip oversized fragment             \n"
1954         "       vld1.8    {d16, d17}, [%[src]]!                 \n"
1955         "       vst1.8    {d16, d17}, [%[dst]]!                 \n"
1956         "4: @ end                                               \n"
1957
1958         /* Clobbered input registers marked as input/outputs */
1959         : [dst] "+r" (t_dst), [src] "+r" (t_src), [count] "+r" (count)
1960
1961           /* No unclobbered inputs */
1962         :
1963
1964         /* Clobbered vector registers */
1965         : "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25",
1966           "d26", "d27", "d28", "d29", "d30", "d31", "cc", "memory");
1967
1968 #else
1969
1970     while (count >= 8)
1971     {
1972         uint8x16x4_t t1 = vld4q_u8 (t_src);
1973         uint8x16x4_t t2 = vld4q_u8 (t_src + sizeof(uint8x16x4_t));
1974         
1975         t_src += sizeof(uint8x16x4_t) * 2;
1976         vst4q_u8 (t_dst, t1);
1977         vst4q_u8 (t_dst + sizeof(uint8x16x4_t), t2);
1978         t_dst += sizeof(uint8x16x4_t) * 2;
1979         count -= 8;
1980     }
1981
1982     if (count & 4)
1983     {
1984         uint8x16x4_t t1 = vld4q_u8 (t_src);
1985         
1986         t_src += sizeof(uint8x16x4_t);
1987         vst4q_u8 (t_dst, t1);
1988         t_dst += sizeof(uint8x16x4_t);
1989     }
1990
1991     if (count & 2)
1992     {
1993         uint8x8x4_t t1 = vld4_u8 (t_src);
1994         
1995         t_src += sizeof(uint8x8x4_t);
1996         vst4_u8 (t_dst, t1);
1997         t_dst += sizeof(uint8x8x4_t);
1998     }
1999
2000     if (count & 1)
2001     {
2002         uint8x16_t t1 = vld1q_u8 (t_src);
2003         
2004         t_src += sizeof(uint8x16_t);
2005         vst1q_u8 (t_dst, t1);
2006         t_dst += sizeof(uint8x16_t);
2007     }
2008
2009 #endif  /* !USE_GCC_INLINE_ASM */
2010
2011     if (trailer_count)
2012     {
2013         if (trailer_count & 8)
2014         {
2015             uint8x8_t t1 = vld1_u8 (t_src);
2016             
2017             t_src += sizeof(uint8x8_t);
2018             vst1_u8 (t_dst, t1);
2019             t_dst += sizeof(uint8x8_t);
2020         }
2021
2022         if (trailer_count & 4)
2023         {
2024             *((uint32_t*) t_dst) = *((uint32_t*) t_src);
2025             
2026             t_dst += 4;
2027             t_src += 4;
2028         }
2029
2030         if (trailer_count & 2)
2031         {
2032             *((uint16_t*) t_dst) = *((uint16_t*) t_src);
2033             
2034             t_dst += 2;
2035             t_src += 2;
2036         }
2037
2038         if (trailer_count & 1)
2039         {
2040             *t_dst++ = *t_src++;
2041         }
2042     }
2043 }
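
/* Hypothetical usage sketch (the names below are illustrative only; the real
 * callers appear later in this file): stage one scanline of an uncached
 * framebuffer into a cached on-stack buffer, modify it there, then copy it
 * back.
 *
 *     uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8];
 *     uint32_t n_bytes = width * sizeof (uint16_t);
 *
 *     neon_quadword_copy (scan_line, fb_line, n_bytes >> 4, n_bytes & 0xF);
 *     ... operate on scan_line in cached memory ...
 *     neon_quadword_copy (fb_line, scan_line, n_bytes >> 4, n_bytes & 0xF);
 */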
2044
2045 static inline void
2046 solid_over_565_8_pix_neon (uint32_t  glyph_colour,
2047                            uint16_t *dest,
2048                            uint8_t * in_mask,
2049                            uint32_t  dest_stride,    /* bytes, not elements */
2050                            uint32_t  mask_stride,
2051                            uint32_t  count           /* 8-pixel groups */)
2052 {
2053     /* Inner loop of glyph blitter (solid colour, alpha mask) */
2054
2055 #ifdef USE_GCC_INLINE_ASM
2056
2057     asm volatile (
2058         "       vld4.8 {d20[], d21[], d22[], d23[]}, [%[glyph_colour]]  @ splat solid colour components \n"
2059         "0:     @ loop                                                                                                                                                          \n"
2060         "       vld1.16   {d0, d1}, [%[dest]]         @ load first pixels from framebuffer                      \n"
2061         "       vld1.8    {d17}, [%[in_mask]]         @ load alpha mask of glyph                                                \n"
2062         "       vmull.u8  q9, d17, d23               @ apply glyph colour alpha to mask                         \n"
2063         "       vshrn.u16 d17, q9, #8                @ reformat it to match original mask                       \n"
2064         "       vmvn      d18, d17                   @ we need the inverse mask for the background      \n"
2065         "       vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits                          \n"
2066         "       vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels                       \n"
2067         "       vshrn.u16 d4, q0, #3                 @ unpack green                                                                     \n"
2068         "       vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)                       \n"
2069         "       vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)          \n"
2070         "       vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)                     \n"
2071         "       vmull.u8  q1, d2, d18                @ apply inverse mask to background red...          \n"
2072         "       vmull.u8  q2, d4, d18                @ ...green...                                                                      \n"
2073         "       vmull.u8  q3, d6, d18                @ ...blue                                                                          \n"
2074         "       subs      %[count], %[count], #1     @ decrement/test loop counter                                      \n"
2075         "       vmlal.u8  q1, d17, d22               @ add masked foreground red...                                     \n"
2076         "       vmlal.u8  q2, d17, d21               @ ...green...                                                                      \n"
2077         "       vmlal.u8  q3, d17, d20               @ ...blue                                                                          \n"
2078         "       add %[in_mask], %[in_mask], %[mask_stride] @ advance mask pointer, while we wait                \n"
2079         "       vsri.16   q1, q2, #5                 @ pack green behind red                                            \n"
2080         "       vsri.16   q1, q3, #11                @ pack blue into pixels                                            \n"
2081         "       vst1.16   {d2, d3}, [%[dest]]         @ store composited pixels                                         \n"
2082         "       add %[dest], %[dest], %[dest_stride]  @ advance framebuffer pointer                                     \n"
2083         "       bne 0b                               @ next please                                                                      \n"
2084
2085         /* Clobbered registers marked as input/outputs */
2086         : [dest] "+r" (dest), [in_mask] "+r" (in_mask), [count] "+r" (count)
2087           
2088           /* Inputs */
2089         : [dest_stride] "r" (dest_stride), [mask_stride] "r" (mask_stride), [glyph_colour] "r" (&glyph_colour)
2090
2091           /* Clobbers, including the inputs we modify, and potentially lots of memory */
2092         : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d17", "d18", "d19",
2093           "d20", "d21", "d22", "d23", "d24", "d25", "cc", "memory"
2094         );
2095
2096 #else
2097
2098     uint8x8x4_t solid_colour = vld4_dup_u8 ((uint8_t*) &glyph_colour);
2099
2100     while (count--)
2101     {
2102         uint16x8_t pixels = vld1q_u16 (dest);
2103         uint8x8_t mask = vshrn_n_u16 (vmull_u8 (solid_colour.val[3], vld1_u8 (in_mask)), 8);
2104         uint8x8_t mask_image = vmvn_u8 (mask);
2105
2106         uint8x8_t t_red   = vshrn_n_u16 (pixels, 8);
2107         uint8x8_t t_green = vshrn_n_u16 (pixels, 3);
2108         uint8x8_t t_blue  = vshrn_n_u16 (vsliq_n_u16 (pixels, pixels, 5), 2);
2109
2110         uint16x8_t s_red   = vmull_u8 (vsri_n_u8 (t_red, t_red, 5), mask_image);
2111         uint16x8_t s_green = vmull_u8 (vsri_n_u8 (t_green, t_green, 6), mask_image);
2112         uint16x8_t s_blue  = vmull_u8 (t_blue, mask_image);
2113
2114         s_red   = vmlal_u8 (s_red, mask, solid_colour.val[2]);
2115         s_green = vmlal_u8 (s_green, mask, solid_colour.val[1]);
2116         s_blue  = vmlal_u8 (s_blue, mask, solid_colour.val[0]);
2117
2118         pixels = vsriq_n_u16 (s_red, s_green, 5);
2119         pixels = vsriq_n_u16 (pixels, s_blue, 11);
2120         vst1q_u16 (dest, pixels);
2121
2122         dest = (uint16_t*) ((uint8_t*) dest + dest_stride); /* stride is in bytes */
2123         in_mask += mask_stride;
2124     }
2125
2126 #endif
2127 }
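
/* Illustrative sketch only (not part of the original file): a per-pixel C
 * reference of the blend the 8-pixel NEON kernel above is intended to
 * compute, i.e. OVER of a solid colour through an 8-bit alpha mask onto an
 * r5g6b5 destination. The function name is hypothetical and it is not called
 * anywhere; it assumes the usual little-endian a8r8g8b8 colour layout.
 */
static inline uint16_t
solid_over_565_1_pix_ref (uint32_t colour, uint16_t dest, uint8_t mask)
{
    /* scale the mask by the colour's alpha, as the kernel does with vmull/vshrn */
    uint32_t a  = (((colour >> 24) & 0xff) * mask) >> 8;
    uint32_t na = 255 - a;

    uint32_t dr = (dest >> 11) & 0x1f;
    uint32_t dg = (dest >>  5) & 0x3f;
    uint32_t db =  dest        & 0x1f;
    uint32_t r, g, b;

    /* expand the 565 destination to 888 by bit replication */
    dr = (dr << 3) | (dr >> 2);
    dg = (dg << 2) | (dg >> 4);
    db = (db << 3) | (db >> 2);

    /* masked foreground plus inverse-masked background, per channel */
    r = (((colour >> 16) & 0xff) * a + dr * na) >> 8;
    g = (((colour >>  8) & 0xff) * a + dg * na) >> 8;
    b = (( colour        & 0xff) * a + db * na) >> 8;

    /* repack to r5g6b5 */
    return (uint16_t) (((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}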
2128
2129 #if 0 /* this is broken currently */
2130 static void
2131 neon_composite_over_n_8_0565 (pixman_implementation_t * impl,
2132                               pixman_op_t               op,
2133                               pixman_image_t *          src_image,
2134                               pixman_image_t *          mask_image,
2135                               pixman_image_t *          dst_image,
2136                               int32_t                   src_x,
2137                               int32_t                   src_y,
2138                               int32_t                   mask_x,
2139                               int32_t                   mask_y,
2140                               int32_t                   dest_x,
2141                               int32_t                   dest_y,
2142                               int32_t                   width,
2143                               int32_t                   height)
2144 {
2145     uint32_t  src, srca;
2146     uint16_t *dst_line, *aligned_line;
2147     uint8_t  *mask_line;
2148     uint32_t  dst_stride, mask_stride;
2149     uint32_t  kernel_count, copy_count, copy_tail;
2150     uint8_t   kernel_offset, copy_offset;
2151
2152     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2153
2154     /* bail out if fully transparent or degenerate */
2155     srca = src >> 24;
2156     if (src == 0)
2157         return;
2158
2159     if (width == 0 || height == 0)
2160         return;
2161
2162     if (width > NEON_SCANLINE_BUFFER_PIXELS)
2163     {
2164         /* split the blit, so we can use a fixed-size scanline buffer
2165          * TODO: there must be a more elegant way of doing this.
2166          */
2167         int x;
2168         for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2169         {
2170             neon_composite_over_n_8_0565 (
2171                 impl, op,
2172                 src_image, mask_image, dst_image,
2173                 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2174                 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2175         }
2176
2177         return;
2178     }
2179     
2180     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2181     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2182
2183     /* keep within minimum number of aligned quadwords on width
2184      * while also keeping the minimum number of columns to process
2185      */
2186     {
2187         unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2188         unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2189         unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2190
2191         /* the fast copy should be quadword aligned */
2192         copy_offset = dst_line - ((uint16_t*) aligned_left);
2193         aligned_line = dst_line - copy_offset;
2194         copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
2195         copy_tail = 0;
2196
2197         if (aligned_right - aligned_left > ceiling_length)
2198         {
2199             /* unaligned routine is tightest */
2200             kernel_count = (uint32_t) (ceiling_length >> 4);
2201             kernel_offset = copy_offset;
2202         }
2203         else
2204         {
2205             /* aligned routine is equally tight, so it is safer to align */
2206             kernel_count = copy_count;
2207             kernel_offset = 0;
2208         }
2209
2210         /* We should avoid reading beyond scanline ends for safety */
2211         if (aligned_line < (dst_line - dest_x) ||
2212             (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2213         {
2214             /* switch to precise read */
2215             copy_offset = kernel_offset = 0;
2216             aligned_line = dst_line;
2217             kernel_count = (uint32_t) (ceiling_length >> 4);
2218             copy_count = (width * sizeof(*dst_line)) >> 4;
2219             copy_tail = (width * sizeof(*dst_line)) & 0xF;
2220         }
2221     }
2222
2223     {
2224         uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8];         /* deliberately not initialised */
2225         uint8_t glyph_line[NEON_SCANLINE_BUFFER_PIXELS + 8];
2226         int y = height;
2227
2228         /* row-major order */
2229         /* left edge, middle block, right edge */
2230         for ( ; y--; mask_line += mask_stride, aligned_line += dst_stride, dst_line += dst_stride)
2231         {
2232             /* We don't want to overrun the edges of the glyph,
2233              * so realign the edge data into known buffers
2234              */
2235             neon_quadword_copy (glyph_line + copy_offset, mask_line, width >> 4, width & 0xF);
2236
2237             /* Uncached framebuffer access is really, really slow
2238              * if we do it piecemeal. It should be much faster if we
2239              * grab it all at once. One scanline should easily fit in
2240              * L1 cache, so this should not waste RAM bandwidth.
2241              */
2242             neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2243
2244             /* Apply the actual filter */
2245             solid_over_565_8_pix_neon (
2246                 src, scan_line + kernel_offset,
2247                 glyph_line + kernel_offset, 8 * sizeof(*dst_line),
2248                 8, kernel_count);
2249
2250             /* Copy the modified scanline back */
2251             neon_quadword_copy (dst_line, scan_line + copy_offset,
2252                                 width >> 3, (width & 7) * 2);
2253         }
2254     }
2255 }
2256 #endif
2257
2258 #ifdef USE_GCC_INLINE_ASM
2259
2260 static inline void
2261 plain_over_565_8_pix_neon (uint32_t  colour,
2262                            uint16_t *dest,
2263                            uint32_t  dest_stride,     /* bytes, not elements */
2264                            uint32_t  count            /* 8-pixel groups */)
2265 {
2266     /* Inner loop for plain translucent rects
2267      * (solid colour without alpha mask)
2268      */
2269     asm volatile (
2270         "       vld4.8   {d20[], d21[], d22[], d23[]}, [%[colour]]  @ solid colour load/splat \n"
2271         "       vmull.u8  q12, d23, d22              @ premultiply alpha red   \n"
2272         "       vmull.u8  q13, d23, d21              @ premultiply alpha green \n"
2273         "       vmull.u8  q14, d23, d20              @ premultiply alpha blue  \n"
2274         "       vmvn      d18, d23                   @ inverse alpha for background \n"
2275         "0:     @ loop\n"
2276         "       vld1.16   {d0, d1}, [%[dest]]         @ load first pixels from framebuffer      \n"
2277         "       vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels       \n"
2278         "       vshrn.u16 d4, q0, #3                 @ unpack green                             \n"
2279         "       vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits          \n"
2280         "       vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)       \n"
2281         "       vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)     \n"
2282         "       vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)  \n"
2283         "       vmov      q0, q12                    @ retrieve foreground red   \n"
2284         "       vmlal.u8  q0, d2, d18                @ blend red - my kingdom for a four-operand MLA \n"
2285         "       vmov      q1, q13                    @ retrieve foreground green \n"
2286         "       vmlal.u8  q1, d4, d18                @ blend green               \n"
2287         "       vmov      q2, q14                    @ retrieve foreground blue  \n"
2288         "       vmlal.u8  q2, d6, d18                @ blend blue                \n"
2289         "       subs      %[count], %[count], #1     @ decrement/test loop counter              \n"
2290         "       vsri.16   q0, q1, #5                 @ pack green behind red                    \n"
2291         "       vsri.16   q0, q2, #11                @ pack blue into pixels                    \n"
2292         "       vst1.16   {d0, d1}, [%[dest]]         @ store composited pixels                 \n"
2293         "       add %[dest], %[dest], %[dest_stride]  @ advance framebuffer pointer             \n"
2294         "       bne 0b                               @ next please                              \n"
2295
2296         /* Clobbered registers marked as input/outputs */
2297         : [dest] "+r" (dest), [count] "+r" (count)
2298
2299           /* Inputs */
2300         : [dest_stride] "r" (dest_stride), [colour] "r" (&colour)
2301
2302           /* Clobbers, including the inputs we modify, and
2303            * potentially lots of memory
2304            */
2305         : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d18", "d19",
2306           "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
2307           "cc", "memory"
2308         );
2309 }
2310
2311 static void
2312 neon_composite_over_n_0565 (pixman_implementation_t * impl,
2313                             pixman_op_t               op,
2314                             pixman_image_t *          src_image,
2315                             pixman_image_t *          mask_image,
2316                             pixman_image_t *          dst_image,
2317                             int32_t                   src_x,
2318                             int32_t                   src_y,
2319                             int32_t                   mask_x,
2320                             int32_t                   mask_y,
2321                             int32_t                   dest_x,
2322                             int32_t                   dest_y,
2323                             int32_t                   width,
2324                             int32_t                   height)
2325 {
2326     uint32_t src, srca;
2327     uint16_t    *dst_line, *aligned_line;
2328     uint32_t dst_stride;
2329     uint32_t kernel_count, copy_count, copy_tail;
2330     uint8_t kernel_offset, copy_offset;
2331
2332     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2333
2334     /* bail out if fully transparent */
2335     srca = src >> 24;
2336     if (src == 0)
2337         return;
2338     
2339     if (width == 0 || height == 0)
2340         return;
2341
2342     if (width > NEON_SCANLINE_BUFFER_PIXELS)
2343     {
2344         /* split the blit, so we can use a fixed-size scanline buffer
2345          * TODO: there must be a more elegant way of doing this.
2346          */
2347         int x;
2348         
2349         for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2350         {
2351             neon_composite_over_n_0565 (
2352                 impl, op,
2353                 src_image, mask_image, dst_image,
2354                 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2355                 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2356         }
2357         return;
2358     }
2359
2360     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2361
2362     /* keep within minimum number of aligned quadwords on width
2363      * while also keeping the minimum number of columns to process
2364      */
2365     {
2366         unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2367         unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2368         unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2369
2370         /* the fast copy should be quadword aligned */
2371         copy_offset = dst_line - ((uint16_t*) aligned_left);
2372         aligned_line = dst_line - copy_offset;
2373         copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
2374         copy_tail = 0;
2375
2376         if (aligned_right - aligned_left > ceiling_length)
2377         {
2378             /* unaligned routine is tightest */
2379             kernel_count = (uint32_t) (ceiling_length >> 4);
2380             kernel_offset = copy_offset;
2381         }
2382         else
2383         {
2384             /* aligned routine is equally tight, so it is safer to align */
2385             kernel_count = copy_count;
2386             kernel_offset = 0;
2387         }
2388
2389         /* We should avoid reading beyond scanline ends for safety */
2390         if (aligned_line < (dst_line - dest_x) ||
2391             (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2392         {
2393             /* switch to precise read */
2394             copy_offset = kernel_offset = 0;
2395             aligned_line = dst_line;
2396             kernel_count = (uint32_t) (ceiling_length >> 4);
2397             copy_count = (width * sizeof(*dst_line)) >> 4;
2398             copy_tail = (width * sizeof(*dst_line)) & 0xF;
2399         }
2400     }
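    /* Worked example (illustrative values only): if dst_line were 0x100E and
     * width 100 pixels (200 bytes), then aligned_left is 0x1000 and
     * aligned_right is 0x10E0, giving copy_offset = 7 pixels and
     * copy_count = 14 quadwords; ceiling_length is 208 bytes, which is
     * smaller, so the "unaligned routine is tightest" branch is taken with
     * kernel_count = 13 and kernel_offset = 7.
     */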
2401
2402     {
2403         uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8];  /* deliberately not initialised */
2404
2405         /* row-major order */
2406         /* left edge, middle block, right edge */
2407         for ( ; height--; aligned_line += dst_stride, dst_line += dst_stride)
2408         {
2409             /* Uncached framebuffer access is really, really slow if we do it piecemeal.
2410              * It should be much faster if we grab it all at once.
2411              * One scanline should easily fit in L1 cache, so this should
2412              * not waste RAM bandwidth.
2413              */
2414             neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2415
2416             /* Apply the actual filter */
2417             plain_over_565_8_pix_neon (
2418                 src, scan_line + kernel_offset, 8 * sizeof(*dst_line), kernel_count);
2419
2420             /* Copy the modified scanline back */
2421             neon_quadword_copy (
2422                 dst_line, scan_line + copy_offset, width >> 3, (width & 7) * 2);
2423         }
2424     }
2425 }
2426
2427 static inline void
2428 ARGB8_over_565_8_pix_neon (uint32_t *src,
2429                            uint16_t *dest,
2430                            uint32_t  src_stride,     /* bytes, not elements */
2431                            uint32_t  count           /* 8-pixel groups */)
2432 {
2433     asm volatile (
2434         "0:     @ loop\n"
2435         "       pld   [%[src], %[src_stride]]         @ preload from next scanline      \n"
2436         "       vld1.16   {d0, d1}, [%[dest]]         @ load pixels from framebuffer    \n"
2437         "       vld4.8   {d20, d21, d22, d23},[%[src]]! @ load source image pixels              \n"
2438         "       vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits          \n"
2439         "       vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels       \n"
2440         "       vshrn.u16 d4, q0, #3                 @ unpack green                             \n"
2441         "       vmvn      d18, d23                   @ we need the inverse alpha for the background     \n"
2442         "       vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)       \n"
2443         "       vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)  \n"
2444         "       vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)     \n"
2445         "       vmull.u8  q1, d2, d18                @ apply inverse alpha to background red... \n"
2446         "       vmull.u8  q2, d4, d18                @ ...green...                              \n"
2447         "       vmull.u8  q3, d6, d18                @ ...blue                                  \n"
2448         "       subs      %[count], %[count], #1     @ decrement/test loop counter              \n"
2449         "       vmlal.u8  q1, d23, d22               @ add blended foreground red...            \n"
2450         "       vmlal.u8  q2, d23, d21               @ ...green...                              \n"
2451         "       vmlal.u8  q3, d23, d20               @ ...blue                                  \n"
2452         "       vsri.16   q1, q2, #5                 @ pack green behind red                    \n"
2453         "       vsri.16   q1, q3, #11                @ pack blue into pixels                    \n"
2454         "       vst1.16   {d2, d3}, [%[dest]]!        @ store composited pixels                 \n"
2455         "       bne 0b                               @ next please                              \n"
2456
2457         /* Clobbered registers marked as input/outputs */
2458         : [dest] "+r" (dest), [src] "+r" (src), [count] "+r" (count)
2459
2460           /* Inputs */
2461         : [src_stride] "r" (src_stride)
2462
2463           /* Clobbers, including the inputs we modify, and potentially lots of memory */
2464         : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d17", "d18", "d20",
2465           "d21", "d22", "d23", "cc", "memory"
2466         );
2467 }
2468
2469 static void
2470 neon_composite_over_8888_0565 (pixman_implementation_t * impl,
2471                                pixman_op_t               op,
2472                                pixman_image_t *          src_image,
2473                                pixman_image_t *          mask_image,
2474                                pixman_image_t *          dst_image,
2475                                int32_t                   src_x,
2476                                int32_t                   src_y,
2477                                int32_t                   mask_x,
2478                                int32_t                   mask_y,
2479                                int32_t                   dest_x,
2480                                int32_t                   dest_y,
2481                                int32_t                   width,
2482                                int32_t                   height)
2483 {
2484     uint32_t    *src_line;
2485     uint16_t    *dst_line, *aligned_line;
2486     uint32_t dst_stride, src_stride;
2487     uint32_t kernel_count, copy_count, copy_tail;
2488     uint8_t kernel_offset, copy_offset;
2489
2490     /* we assume mask is opaque 
2491      * so the only alpha to deal with is embedded in src
2492      */
2493     if (width > NEON_SCANLINE_BUFFER_PIXELS)
2494     {
2495         /* split the blit, so we can use a fixed-size scanline buffer */
2496         int x;
2497         for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2498         {
2499             neon_composite_over_8888_0565 (
2500                 impl, op,
2501                 src_image, mask_image, dst_image,
2502                 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2503                 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2504         }
2505         return;
2506     }
2507
2508     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2509     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2510
2511     /* keep within minimum number of aligned quadwords on width
2512      * while also keeping the minimum number of columns to process
2513      */
2514     {
2515         unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2516         unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2517         unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2518
2519         /* the fast copy should be quadword aligned */
2520         copy_offset = dst_line - ((uint16_t*) aligned_left);
2521         aligned_line = dst_line - copy_offset;
2522         copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
2523         copy_tail = 0;
2524
2525         if (aligned_right - aligned_left > ceiling_length)
2526         {
2527             /* unaligned routine is tightest */
2528             kernel_count = (uint32_t) (ceiling_length >> 4);
2529             kernel_offset = copy_offset;
2530         }
2531         else
2532         {
2533             /* aligned routine is equally tight, so it is safer to align */
2534             kernel_count = copy_count;
2535             kernel_offset = 0;
2536         }
2537
2538         /* We should avoid reading beyond scanline ends for safety */
2539         if (aligned_line < (dst_line - dest_x) ||
2540             (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2541         {
2542             /* switch to precise read */
2543             copy_offset = kernel_offset = 0;
2544             aligned_line = dst_line;
2545             kernel_count = (uint32_t) (ceiling_length >> 4);
2546             copy_count = (width * sizeof(*dst_line)) >> 4;
2547             copy_tail = (width * sizeof(*dst_line)) & 0xF;
2548         }
2549     }
2550
2551     /* Preload the first input scanline */
2552     {
2553         uint8_t *src_ptr = (uint8_t*) src_line;
2554         uint32_t count = (width + 15) / 16;
2555
2556 #ifdef USE_GCC_INLINE_ASM
2557         asm volatile (
2558             "0: @ loop                                          \n"
2559             "   subs    %[count], %[count], #1                  \n"
2560             "   pld     [%[src]]                                \n"
2561             "   add     %[src], %[src], #64                     \n"
2562             "   bgt 0b                                          \n"
2563
2564             /* Clobbered input registers marked as input/outputs */
2565             : [src] "+r" (src_ptr), [count] "+r" (count)
2566             :     /* no unclobbered inputs */
2567             : "cc"
2568             );
2569 #else
2570         do
2571         {
2572             __pld (src_ptr);
2573             src_ptr += 64;
2574         }
2575         while (--count);
2576 #endif
2577     }
2578
2579     {
2580         uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */
2581
2582         /* row-major order */
2583         /* left edge, middle block, right edge */
2584         for ( ; height--; src_line += src_stride, aligned_line += dst_stride, dst_line += dst_stride)
2585         {
2586             /* Uncached framebuffer access is really, really slow if we do
2587              * it piecemeal. It should be much faster if we grab it all at
2588              * once. One scanline should easily fit in L1 cache, so this
2589              * should not waste RAM bandwidth.
2590              */
2591             neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2592
2593             /* Apply the actual filter */
2594             ARGB8_over_565_8_pix_neon (
2595                 src_line, scan_line + kernel_offset,
2596                 src_stride * sizeof(*src_line), kernel_count);
2597
2598             /* Copy the modified scanline back */
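            /* The destination is 16 bpp, so the scanline is width * 2 bytes:
             * width >> 3 full quadwords plus a (width & 7) * 2 byte tail. */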
2599             neon_quadword_copy (dst_line,
2600                                 scan_line + copy_offset,
2601                                 width >> 3, (width & 7) * 2);
2602         }
2603     }
2604 }
2605
2606 #endif  /* USE_GCC_INLINE_ASM */
2607
2608 static const pixman_fast_path_t arm_neon_fast_path_array[] =
2609 {
2610     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       neon_composite_add_8888_8_8,     0 },
2611     { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null,     PIXMAN_a8,       neon_composite_add_8000_8000,    0 },
2612     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   neon_composite_over_n_8_0565,    0 },
2613     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   neon_composite_over_n_8_0565,    0 },
2614     { PIXMAN_OP_SRC,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_src_24_16,        0 },
2615     { PIXMAN_OP_SRC,  PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_src_24_16,        0 },
2616     { PIXMAN_OP_SRC,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_24_16,        0 },
2617     { PIXMAN_OP_SRC,  PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_24_16,        0 },
2618 #ifdef USE_GCC_INLINE_ASM
2619     { PIXMAN_OP_SRC,  PIXMAN_r5g6b5,   PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_src_16_16,        0 },
2620     { PIXMAN_OP_SRC,  PIXMAN_b5g6r5,   PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_16_16,        0 },
2621 #if 0 /* this code has some bugs */
2622     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_over_n_0565,      0 },
2623     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_over_n_0565,      0 },
2624     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_over_8888_0565,   0 },
2625     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_over_8888_0565,   0 },
2626 #endif
2627 #endif
2628     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, neon_composite_over_8888_8888,   0 },
2629     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, neon_composite_over_8888_8888,   0 },
2630     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, neon_composite_over_8888_8888,   0 },
2631     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, neon_composite_over_8888_8888,   0 },
2632     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2633     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2634     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, neon_composite_over_n_8_8888,    0 },
2635     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, neon_composite_over_n_8_8888,    0 },
2636     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, neon_composite_over_n_8_8888,    0 },
2637     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, neon_composite_over_n_8_8888,    0 },
2638     { PIXMAN_OP_NONE },
2639 };
2640
2641 const pixman_fast_path_t *const arm_neon_fast_paths = arm_neon_fast_path_array;
2642
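/* Illustrative sketch only (not part of the original file): roughly how a
 * dispatcher such as _pixman_run_fast_path might scan the table above,
 * matching on operator and on source/mask/destination formats and stopping
 * at the PIXMAN_OP_NONE sentinel.  The field names are an assumption about
 * pixman_fast_path_t, and PIXMAN_solid / PIXMAN_null entries receive extra
 * handling in the real matcher; the block is kept under #if 0 so it is
 * never built.
 */
#if 0
static const pixman_fast_path_t *
find_fast_path_sketch (const pixman_fast_path_t *paths,
                       pixman_op_t               op,
                       pixman_format_code_t      src_format,
                       pixman_format_code_t      mask_format,
                       pixman_format_code_t      dest_format)
{
    const pixman_fast_path_t *info;

    for (info = paths; info->op != PIXMAN_OP_NONE; info++)
    {
        if (info->op == op &&
            info->src_format == src_format &&
            info->mask_format == mask_format &&
            info->dest_format == dest_format)
        {
            return info;
        }
    }

    return NULL; /* no entry matched; the caller falls back to the delegate */
}
#endif
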
2643 static void
2644 arm_neon_composite (pixman_implementation_t *imp,
2645                     pixman_op_t              op,
2646                     pixman_image_t *         src,
2647                     pixman_image_t *         mask,
2648                     pixman_image_t *         dest,
2649                     int32_t                  src_x,
2650                     int32_t                  src_y,
2651                     int32_t                  mask_x,
2652                     int32_t                  mask_y,
2653                     int32_t                  dest_x,
2654                     int32_t                  dest_y,
2655                     int32_t                  width,
2656                     int32_t                  height)
2657 {
2658     if (_pixman_run_fast_path (arm_neon_fast_paths, imp,
2659                                op, src, mask, dest,
2660                                src_x, src_y,
2661                                mask_x, mask_y,
2662                                dest_x, dest_y,
2663                                width, height))
2664     {
2665         return;
2666     }
2667
2668     _pixman_implementation_composite (imp->delegate, op,
2669                                       src, mask, dest,
2670                                       src_x, src_y,
2671                                       mask_x, mask_y,
2672                                       dest_x, dest_y,
2673                                       width, height);
2674 }
2675
2676 static pixman_bool_t
2677 pixman_blt_neon (void *src_bits,
2678                  void *dst_bits,
2679                  int   src_stride,
2680                  int   dst_stride,
2681                  int   src_bpp,
2682                  int   dst_bpp,
2683                  int   src_x,
2684                  int   src_y,
2685                  int   dst_x,
2686                  int   dst_y,
2687                  int   width,
2688                  int   height)
2689 {
2690     if (!width || !height)
2691         return TRUE;
2692
2693     /* accelerate only straight copies involving complete bytes */
2694     if (src_bpp != dst_bpp || (src_bpp & 7))
2695         return FALSE;
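    /* (For example, a 1-bpp or 4-bpp blit fails the (src_bpp & 7) test and
     * is left to the delegate implementation.) */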
2696
2697     {
2698         uint32_t bytes_per_pixel = src_bpp >> 3;
2699         uint32_t byte_width = width * bytes_per_pixel;
2700         /* pixman stride parameters are given in 32-bit words, not bytes */
2701         int32_t src_stride_bytes = src_stride * 4;
2702         int32_t dst_stride_bytes = dst_stride * 4;
2703         uint8_t *src_bytes = ((uint8_t*) src_bits) +
2704             src_y * src_stride_bytes + src_x * bytes_per_pixel;
2705         uint8_t *dst_bytes = ((uint8_t*) dst_bits) +
2706             dst_y * dst_stride_bytes + dst_x * bytes_per_pixel;
2707         uint32_t quadword_count = byte_width / 16;
2708         uint32_t offset         = byte_width % 16;
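        /* e.g. a 37-pixel-wide blit at 16 bpp (hypothetical numbers) gives
         * byte_width = 74, so 4 full quadwords plus a 10-byte tail per row. */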
2709
2710         while (height--)
2711         {
2712             neon_quadword_copy (dst_bytes, src_bytes, quadword_count, offset);
2713             src_bytes += src_stride_bytes;
2714             dst_bytes += dst_stride_bytes;
2715         }
2716     }
2717
2718     return TRUE;
2719 }
2720
2721 static pixman_bool_t
2722 arm_neon_blt (pixman_implementation_t *imp,
2723               uint32_t *               src_bits,
2724               uint32_t *               dst_bits,
2725               int                      src_stride,
2726               int                      dst_stride,
2727               int                      src_bpp,
2728               int                      dst_bpp,
2729               int                      src_x,
2730               int                      src_y,
2731               int                      dst_x,
2732               int                      dst_y,
2733               int                      width,
2734               int                      height)
2735 {
2736     if (pixman_blt_neon (
2737             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2738             src_x, src_y, dst_x, dst_y, width, height))
2739     {
2740         return TRUE;
2741     }
2742
2743     return _pixman_implementation_blt (
2744                imp->delegate,
2745                src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2746                src_x, src_y, dst_x, dst_y, width, height);
2747 }
2748
2749 static pixman_bool_t
2750 arm_neon_fill (pixman_implementation_t *imp,
2751                uint32_t *               bits,
2752                int                      stride,
2753                int                      bpp,
2754                int                      x,
2755                int                      y,
2756                int                      width,
2757                int                      height,
2758                uint32_t xor)
2759 {
2760     if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor))
2761         return TRUE;
2762
2763     return _pixman_implementation_fill (
2764         imp->delegate, bits, stride, bpp, x, y, width, height, xor);
2765 }
2766
2767 pixman_implementation_t *
2768 _pixman_implementation_create_arm_neon (void)
2769 {
2770     pixman_implementation_t *simd = _pixman_implementation_create_arm_simd ();
2771     pixman_implementation_t *imp = _pixman_implementation_create (simd);
2772
2773     imp->composite = arm_neon_composite;
2774 #if 0 /* this code has some bugs */
2775     imp->blt = arm_neon_blt;
2776 #endif
2777     imp->fill = arm_neon_fill;
2778
2779     return imp;
2780 }
2781
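/* Usage sketch only (an assumption, not code from this file): higher layers
 * of pixman would normally create this implementation once and let the
 * delegate chain (NEON -> ARM SIMD -> general) absorb whatever the fast
 * paths and arm_neon_fill reject.  Kept under #if 0 so it is never built.
 */
#if 0
static void
composite_with_neon_sketch (pixman_op_t     op,
                            pixman_image_t *src,
                            pixman_image_t *mask,
                            pixman_image_t *dest,
                            int32_t         width,
                            int32_t         height)
{
    pixman_implementation_t *imp = _pixman_implementation_create_arm_neon ();

    /* All origins at (0, 0); arm_neon_composite tries the fast path table
     * first and otherwise defers to imp->delegate. */
    _pixman_implementation_composite (imp, op, src, mask, dest,
                                      0, 0, 0, 0, 0, 0, width, height);
}
#endif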