Change names of add_8888_8_8 fast paths to add_n_8_8
[profile/ivi/pixman.git] / pixman / pixman-arm-neon.c
/*
 * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of ARM Ltd not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  ARM Ltd makes no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Ian Rickards (ian.rickards@arm.com)
 * Author:  Jonathan Morton (jonathan.morton@movial.com)
 * Author:  Markku Vire (markku.vire@movial.com)
 *
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <arm_neon.h>
#include <string.h>
#include "pixman-private.h"

/* Deal with an intrinsic that is defined differently in GCC */
#if !defined(__ARMCC_VERSION) && !defined(__pld)
#define __pld(_x) __builtin_prefetch (_x)
#endif

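/*
 * Helpers for converting between r5g6b5 and planar 8-bit pixels:
 * unpack0565 () pulls the three colour fields of eight 0565 pixels into
 * separate 8-bit planes, replicating the top bits into the low bits so
 * that full intensity becomes 0xff (val[3] is left at zero), and
 * pack0565 () reverses the process with shift-right-and-insert.
 */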
static force_inline uint8x8x4_t
unpack0565 (uint16x8_t rgb)
{
    uint16x8_t gb, b;
    uint8x8x4_t res;

    res.val[3] = vdup_n_u8 (0);
    gb = vshrq_n_u16 (rgb, 5);
    b = vshrq_n_u16 (rgb, 5 + 6);

    res.val[0] = vmovn_u16 (rgb);  /* get low 5 bits */
    res.val[1] = vmovn_u16 (gb);   /* get mid 6 bits */
    res.val[2] = vmovn_u16 (b);    /* get top 5 bits */

    res.val[0] = vshl_n_u8 (res.val[0], 3); /* shift to top */
    res.val[1] = vshl_n_u8 (res.val[1], 2); /* shift to top */
    res.val[2] = vshl_n_u8 (res.val[2], 3); /* shift to top */

    res.val[0] = vsri_n_u8 (res.val[0], res.val[0], 5);
    res.val[1] = vsri_n_u8 (res.val[1], res.val[1], 6);
    res.val[2] = vsri_n_u8 (res.val[2], res.val[2], 5);

    return res;
}

static force_inline uint16x8_t
pack0565 (uint8x8x4_t s)
{
    uint16x8_t rgb, val_g, val_r;

    rgb = vshll_n_u8 (s.val[2], 8);
    val_g = vshll_n_u8 (s.val[1], 8);
    val_r = vshll_n_u8 (s.val[0], 8);
    rgb = vsriq_n_u16 (rgb, val_g, 5);
    rgb = vsriq_n_u16 (rgb, val_r, 5 + 6);

    return rgb;
}

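/*
 * Multiply each byte of x by alpha and divide by 255 with rounding.
 * The vrshr/vraddhn pair below is the usual integer form of that
 * division; roughly, per byte:
 *
 *     t = x * alpha;
 *     res = (t + ((t + 128) >> 8) + 128) >> 8;    (i.e. t / 255, rounded)
 */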
static force_inline uint8x8_t
neon2mul (uint8x8_t x,
          uint8x8_t alpha)
{
    uint16x8_t tmp, tmp2;
    uint8x8_t res;

    tmp = vmull_u8 (x, alpha);
    tmp2 = vrshrq_n_u16 (tmp, 8);
    res = vraddhn_u16 (tmp, tmp2);

    return res;
}

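/*
 * neon8mul () applies the same rounded multiply-by-alpha to four planes
 * at once (one uint8x8_t per channel); neon8qadd () is the matching
 * per-plane saturating add.
 */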
static force_inline uint8x8x4_t
neon8mul (uint8x8x4_t x,
          uint8x8_t   alpha)
{
    uint16x8x4_t tmp;
    uint8x8x4_t res;
    uint16x8_t qtmp1, qtmp2;

    tmp.val[0] = vmull_u8 (x.val[0], alpha);
    tmp.val[1] = vmull_u8 (x.val[1], alpha);
    tmp.val[2] = vmull_u8 (x.val[2], alpha);
    tmp.val[3] = vmull_u8 (x.val[3], alpha);

    qtmp1 = vrshrq_n_u16 (tmp.val[0], 8);
    qtmp2 = vrshrq_n_u16 (tmp.val[1], 8);
    res.val[0] = vraddhn_u16 (tmp.val[0], qtmp1);
    qtmp1 = vrshrq_n_u16 (tmp.val[2], 8);
    res.val[1] = vraddhn_u16 (tmp.val[1], qtmp2);
    qtmp2 = vrshrq_n_u16 (tmp.val[3], 8);
    res.val[2] = vraddhn_u16 (tmp.val[2], qtmp1);
    res.val[3] = vraddhn_u16 (tmp.val[3], qtmp2);

    return res;
}

static force_inline uint8x8x4_t
neon8qadd (uint8x8x4_t x,
           uint8x8x4_t y)
{
    uint8x8x4_t res;

    res.val[0] = vqadd_u8 (x.val[0], y.val[0]);
    res.val[1] = vqadd_u8 (x.val[1], y.val[1]);
    res.val[2] = vqadd_u8 (x.val[2], y.val[2]);
    res.val[3] = vqadd_u8 (x.val[3], y.val[3]);

    return res;
}

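/*
 * ADD operator between two a8 surfaces:
 *
 *     dest = saturate (dest + src)    (per byte)
 */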
static void
neon_composite_add_8000_8000 (pixman_implementation_t * impl,
                              pixman_op_t               op,
                              pixman_image_t *          src_image,
                              pixman_image_t *          mask_image,
                              pixman_image_t *          dst_image,
                              int32_t                   src_x,
                              int32_t                   src_y,
                              int32_t                   mask_x,
                              int32_t                   mask_y,
                              int32_t                   dest_x,
                              int32_t                   dest_y,
                              int32_t                   width,
                              int32_t                   height)
{
    uint8_t     *dst_line, *dst;
    uint8_t     *src_line, *src;
    int dst_stride, src_stride;
    uint16_t w;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    if (width >= 8)
    {
        /* Use overlapping 8-pixel method */
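        /*
         * When the width is not a multiple of 8, the first and second
         * 8-pixel groups overlap: the pointers only advance by (w & 7)
         * after the leading group.  The overlapping pixels are simply
         * computed twice from the same source and destination data,
         * giving the same result both times.  Stores are deferred by one
         * iteration (via keep_dst) so that every load still sees the
         * original destination.
         */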
        while (height--)
        {
            uint8_t *keep_dst = 0;
            uint8x8_t sval, dval, temp;

            dst = dst_line;
            dst_line += dst_stride;
            src = src_line;
            src_line += src_stride;
            w = width;

#ifndef USE_GCC_INLINE_ASM
            sval = vld1_u8 ((void *)src);
            dval = vld1_u8 ((void *)dst);
            keep_dst = dst;

            temp = vqadd_u8 (dval, sval);

            src += (w & 7);
            dst += (w & 7);
            w -= (w & 7);

            while (w)
            {
                sval = vld1_u8 ((void *)src);
                dval = vld1_u8 ((void *)dst);

                vst1_u8 ((void *)keep_dst, temp);
                keep_dst = dst;

                temp = vqadd_u8 (dval, sval);

                src += 8;
                dst += 8;
                w -= 8;
            }

            vst1_u8 ((void *)keep_dst, temp);
#else
            asm volatile (
/* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
                "vld1.8  {d0}, [%[src]]\n\t"
                "vld1.8  {d4}, [%[dst]]\n\t"
                "mov     %[keep_dst], %[dst]\n\t"

                "and ip, %[w], #7\n\t"
                "add %[src], %[src], ip\n\t"
                "add %[dst], %[dst], ip\n\t"
                "subs %[w], %[w], ip\n\t"
                "b 9f\n\t"
/* LOOP */
                "2:\n\t"
                "vld1.8  {d0}, [%[src]]!\n\t"
                "vld1.8  {d4}, [%[dst]]!\n\t"
                "vst1.8  {d20}, [%[keep_dst]]\n\t"
                "sub     %[keep_dst], %[dst], #8\n\t"
                "subs %[w], %[w], #8\n\t"
                "9:\n\t"
                "vqadd.u8 d20, d0, d4\n\t"

                "bne 2b\n\t"

                "1:\n\t"
                "vst1.8  {d20}, [%[keep_dst]]\n\t"

                : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
                :
                : "ip", "cc", "memory", "d0", "d4",
                "d20"
                );
#endif
        }
    }
    else
    {
        const uint8_t nil = 0;
        const uint8x8_t vnil = vld1_dup_u8 (&nil);

        while (height--)
        {
            uint8x8_t sval = vnil, dval = vnil;
            uint8_t *dst4 = 0, *dst2 = 0;

            dst = dst_line;
            dst_line += dst_stride;
            src = src_line;
            src_line += src_stride;
            w = width;

            if (w & 4)
            {
                sval = vreinterpret_u8_u32 (
                    vld1_lane_u32 ((void *)src, vreinterpret_u32_u8 (sval), 1));
                dval = vreinterpret_u8_u32 (
                    vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1));

                dst4 = dst;
                src += 4;
                dst += 4;
            }

            if (w & 2)
            {
                sval = vreinterpret_u8_u16 (
                    vld1_lane_u16 ((void *)src, vreinterpret_u16_u8 (sval), 1));
                dval = vreinterpret_u8_u16 (
                    vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1));

                dst2 = dst;
                src += 2;
                dst += 2;
            }

            if (w & 1)
            {
                sval = vld1_lane_u8 (src, sval, 1);
                dval = vld1_lane_u8 (dst, dval, 1);
            }

            dval = vqadd_u8 (dval, sval);

            if (w & 1)
                vst1_lane_u8 (dst, dval, 1);

            if (w & 2)
                vst1_lane_u16 ((void *)dst2, vreinterpret_u16_u8 (dval), 1);

            if (w & 4)
                vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (dval), 1);
        }
    }
}

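/*
 * OVER operator for premultiplied a8r8g8b8 source and destination:
 *
 *     dest = saturate (src + dest * (255 - src.alpha) / 255)    (per channel)
 *
 * Rows of eight or more pixels use the overlapping 8-pixel method;
 * narrower rows are handled two pixels, then one pixel, at a time.
 */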
static void
neon_composite_over_8888_8888 (pixman_implementation_t * impl,
                               pixman_op_t               op,
                               pixman_image_t *          src_image,
                               pixman_image_t *          mask_image,
                               pixman_image_t *          dst_image,
                               int32_t                   src_x,
                               int32_t                   src_y,
                               int32_t                   mask_x,
                               int32_t                   mask_y,
                               int32_t                   dest_x,
                               int32_t                   dest_y,
                               int32_t                   width,
                               int32_t                   height)
{
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    int dst_stride, src_stride;
    uint32_t w;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    if (width >= 8)
    {
        /* Use overlapping 8-pixel method */
        while (height--)
        {
            uint32_t *keep_dst = 0;
            uint8x8x4_t sval, dval, temp;

            dst = dst_line;
            dst_line += dst_stride;
            src = src_line;
            src_line += src_stride;
            w = width;

#ifndef USE_GCC_INLINE_ASM
            sval = vld4_u8 ((void *)src);
            dval = vld4_u8 ((void *)dst);
            keep_dst = dst;

            temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
            temp = neon8qadd (sval, temp);

            src += (w & 7);
            dst += (w & 7);
            w -= (w & 7);

            while (w)
            {
                sval = vld4_u8 ((void *)src);
                dval = vld4_u8 ((void *)dst);

                vst4_u8 ((void *)keep_dst, temp);
                keep_dst = dst;

                temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
                temp = neon8qadd (sval, temp);

                src += 8;
                dst += 8;
                w -= 8;
            }

            vst4_u8 ((void *)keep_dst, temp);
#else
            asm volatile (
/* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
                "vld4.8  {d0-d3}, [%[src]]\n\t"
                "vld4.8  {d4-d7}, [%[dst]]\n\t"
                "mov     %[keep_dst], %[dst]\n\t"

                "and ip, %[w], #7\n\t"
                "add %[src], %[src], ip, LSL#2\n\t"
                "add %[dst], %[dst], ip, LSL#2\n\t"
                "subs %[w], %[w], ip\n\t"
                "b 9f\n\t"
/* LOOP */
                "2:\n\t"
                "vld4.8  {d0-d3}, [%[src]]!\n\t"
                "vld4.8  {d4-d7}, [%[dst]]!\n\t"
                "vst4.8  {d20-d23}, [%[keep_dst]]\n\t"
                "sub     %[keep_dst], %[dst], #8*4\n\t"
                "subs %[w], %[w], #8\n\t"
                "9:\n\t"
                "vmvn.8  d31, d3\n\t"
                "vmull.u8 q10, d31, d4\n\t"
                "vmull.u8 q11, d31, d5\n\t"
                "vmull.u8 q12, d31, d6\n\t"
                "vmull.u8 q13, d31, d7\n\t"
                "vrshr.u16 q8, q10, #8\n\t"
                "vrshr.u16 q9, q11, #8\n\t"
                "vraddhn.u16 d20, q10, q8\n\t"
                "vraddhn.u16 d21, q11, q9\n\t"
                "vrshr.u16 q8, q12, #8\n\t"
                "vrshr.u16 q9, q13, #8\n\t"
                "vraddhn.u16 d22, q12, q8\n\t"
                "vraddhn.u16 d23, q13, q9\n\t"
/* result in d20-d23 */
                "vqadd.u8 d20, d0, d20\n\t"
                "vqadd.u8 d21, d1, d21\n\t"
                "vqadd.u8 d22, d2, d22\n\t"
                "vqadd.u8 d23, d3, d23\n\t"

                "bne 2b\n\t"

                "1:\n\t"
                "vst4.8  {d20-d23}, [%[keep_dst]]\n\t"

                : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
                :
                : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23"
                );
#endif
        }
    }
    else
    {
        uint8x8_t alpha_selector = vreinterpret_u8_u64 (
            vcreate_u64 (0x0707070703030303ULL));

        /* Handle width < 8 */
        while (height--)
        {
            dst = dst_line;
            dst_line += dst_stride;
            src = src_line;
            src_line += src_stride;
            w = width;

            while (w >= 2)
            {
                uint8x8_t sval, dval;

                /* two 32-bit pixels packed into D-reg; ad-hoc vectorization */
                sval = vreinterpret_u8_u32 (vld1_u32 ((void *)src));
                dval = vreinterpret_u8_u32 (vld1_u32 ((void *)dst));
                dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
                vst1_u8 ((void *)dst, vqadd_u8 (sval, dval));

                src += 2;
                dst += 2;
                w -= 2;
            }

            if (w)
            {
                uint8x8_t sval, dval;

                /* single 32-bit pixel in lane 0 */
                sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)src));  /* only interested in lane 0 */
                dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst));  /* only interested in lane 0 */
                dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
                vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
            }
        }
    }
}

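/*
 * As neon_composite_over_8888_8888, but the source is first multiplied
 * by the alpha of a solid mask:
 *
 *     src' = src * mask.alpha / 255
 *     dest = saturate (src' + dest * (255 - src'.alpha) / 255)
 */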
static void
neon_composite_over_8888_n_8888 (pixman_implementation_t * impl,
                                 pixman_op_t               op,
                                 pixman_image_t *          src_image,
                                 pixman_image_t *          mask_image,
                                 pixman_image_t *          dst_image,
                                 int32_t                   src_x,
                                 int32_t                   src_y,
                                 int32_t                   mask_x,
                                 int32_t                   mask_y,
                                 int32_t                   dest_x,
                                 int32_t                   dest_y,
                                 int32_t                   width,
                                 int32_t                   height)
{
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    uint32_t mask;
    int dst_stride, src_stride;
    uint32_t w;
    uint8x8_t mask_alpha;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
    mask_alpha = vdup_n_u8 ((mask) >> 24);

    if (width >= 8)
    {
        /* Use overlapping 8-pixel method */
        while (height--)
        {
            dst = dst_line;
            dst_line += dst_stride;
            src = src_line;
            src_line += src_stride;
            w = width;

            uint32_t *keep_dst = 0;

#ifndef USE_GCC_INLINE_ASM
            uint8x8x4_t sval, dval, temp;

            sval = vld4_u8 ((void *)src);
            dval = vld4_u8 ((void *)dst);
            keep_dst = dst;

            sval = neon8mul (sval, mask_alpha);
            temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
            temp = neon8qadd (sval, temp);

            src += (w & 7);
            dst += (w & 7);
            w -= (w & 7);

            while (w)
            {
                sval = vld4_u8 ((void *)src);
                dval = vld4_u8 ((void *)dst);

                vst4_u8 ((void *)keep_dst, temp);
                keep_dst = dst;

                sval = neon8mul (sval, mask_alpha);
                temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
                temp = neon8qadd (sval, temp);

                src += 8;
                dst += 8;
                w -= 8;
            }
            vst4_u8 ((void *)keep_dst, temp);
#else
            asm volatile (
/* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
                "vdup.32      d30, %[mask]\n\t"
                "vdup.8       d30, d30[3]\n\t"

                "vld4.8       {d0-d3}, [%[src]]\n\t"
                "vld4.8       {d4-d7}, [%[dst]]\n\t"
                "mov  %[keep_dst], %[dst]\n\t"

                "and  ip, %[w], #7\n\t"
                "add  %[src], %[src], ip, LSL#2\n\t"
                "add  %[dst], %[dst], ip, LSL#2\n\t"
                "subs  %[w], %[w], ip\n\t"
                "b 9f\n\t"
/* LOOP */
                "2:\n\t"
                "vld4.8       {d0-d3}, [%[src]]!\n\t"
                "vld4.8       {d4-d7}, [%[dst]]!\n\t"
                "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
                "sub  %[keep_dst], %[dst], #8*4\n\t"
                "subs  %[w], %[w], #8\n\t"

                "9:\n\t"
                "vmull.u8     q10, d30, d0\n\t"
                "vmull.u8     q11, d30, d1\n\t"
                "vmull.u8     q12, d30, d2\n\t"
                "vmull.u8     q13, d30, d3\n\t"
                "vrshr.u16    q8, q10, #8\n\t"
                "vrshr.u16    q9, q11, #8\n\t"
                "vraddhn.u16  d0, q10, q8\n\t"
                "vraddhn.u16  d1, q11, q9\n\t"
                "vrshr.u16    q9, q13, #8\n\t"
                "vrshr.u16    q8, q12, #8\n\t"
                "vraddhn.u16  d3, q13, q9\n\t"
                "vraddhn.u16  d2, q12, q8\n\t"

                "vmvn.8       d31, d3\n\t"
                "vmull.u8     q10, d31, d4\n\t"
                "vmull.u8     q11, d31, d5\n\t"
                "vmull.u8     q12, d31, d6\n\t"
                "vmull.u8     q13, d31, d7\n\t"
                "vrshr.u16    q8, q10, #8\n\t"
                "vrshr.u16    q9, q11, #8\n\t"
                "vraddhn.u16  d20, q10, q8\n\t"
                "vrshr.u16    q8, q12, #8\n\t"
                "vraddhn.u16  d21, q11, q9\n\t"
                "vrshr.u16    q9, q13, #8\n\t"
                "vraddhn.u16  d22, q12, q8\n\t"
                "vraddhn.u16  d23, q13, q9\n\t"

/* result in d20-d23 */
                "vqadd.u8     d20, d0, d20\n\t"
                "vqadd.u8     d21, d1, d21\n\t"
                "vqadd.u8     d22, d2, d22\n\t"
                "vqadd.u8     d23, d3, d23\n\t"

                "bne  2b\n\t"

                "1:\n\t"
                "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"

                : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
                : [mask] "r" (mask)
                : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
                "d30", "d31"
                );
#endif
        }
    }
    else
    {
        uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));

        /* Handle width < 8 */
        while (height--)
        {
            dst = dst_line;
            dst_line += dst_stride;
            src = src_line;
            src_line += src_stride;
            w = width;

            while (w >= 2)
            {
                uint8x8_t sval, dval;

                sval = vreinterpret_u8_u32 (vld1_u32 ((void *)src));
                dval = vreinterpret_u8_u32 (vld1_u32 ((void *)dst));

                /* sval * const alpha_mul */
                sval = neon2mul (sval, mask_alpha);

                /* dval * 255-(src alpha) */
                dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));

                vst1_u8 ((void *)dst, vqadd_u8 (sval, dval));

                src += 2;
                dst += 2;
                w -= 2;
            }

            if (w)
            {
                uint8x8_t sval, dval;

                sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)src));
                dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst));

                /* sval * const alpha_mul */
                sval = neon2mul (sval, mask_alpha);

                /* dval * 255-(src alpha) */
                dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));

                vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
            }
        }
    }
}

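/*
 * Solid source OVER an a8 mask onto an r5g6b5 destination.  The solid
 * colour is splatted into one register per channel; each destination
 * group is expanded with unpack0565 (), composited in 8888 form and
 * packed back with pack0565 ().
 */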
static void
neon_composite_over_n_8_0565 (pixman_implementation_t * impl,
                              pixman_op_t               op,
                              pixman_image_t *          src_image,
                              pixman_image_t *          mask_image,
                              pixman_image_t *          dst_image,
                              int32_t                   src_x,
                              int32_t                   src_y,
                              int32_t                   mask_x,
                              int32_t                   mask_y,
                              int32_t                   dest_x,
                              int32_t                   dest_y,
                              int32_t                   width,
                              int32_t                   height)
{
    uint32_t     src, srca;
    uint16_t    *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int          dst_stride, mask_stride;
    uint32_t     w;
    uint8x8_t    sval2;
    uint8x8x4_t  sval8;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    srca = src >> 24;
    if (src == 0)
        return;

    sval2 = vreinterpret_u8_u32 (vdup_n_u32 (src));
    sval8.val[0] = vdup_lane_u8 (sval2, 0);
    sval8.val[1] = vdup_lane_u8 (sval2, 1);
    sval8.val[2] = vdup_lane_u8 (sval2, 2);
    sval8.val[3] = vdup_lane_u8 (sval2, 3);

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    if (width >= 8)
    {
        /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
        while (height--)
        {
            uint16_t *keep_dst = 0;

            dst = dst_line;
            dst_line += dst_stride;
            mask = mask_line;
            mask_line += mask_stride;
            w = width;

#ifndef USE_GCC_INLINE_ASM
            uint8x8_t alpha;
            uint16x8_t dval, temp;
            uint8x8x4_t sval8temp;

            alpha = vld1_u8 ((void *)mask);
            dval = vld1q_u16 ((void *)dst);
            keep_dst = dst;

            sval8temp = neon8mul (sval8, alpha);
            temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));

            mask += (w & 7);
            dst += (w & 7);
            w -= (w & 7);

            while (w)
            {
                dval = vld1q_u16 ((void *)dst);
                alpha = vld1_u8 ((void *)mask);

                vst1q_u16 ((void *)keep_dst, temp);
                keep_dst = dst;

                sval8temp = neon8mul (sval8, alpha);
                temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));

                mask += 8;
                dst += 8;
                w -= 8;
            }
            vst1q_u16 ((void *)keep_dst, temp);
#else
            asm volatile (
                "vdup.32      d0, %[src]\n\t"
                "vdup.8       d1, d0[1]\n\t"
                "vdup.8       d2, d0[2]\n\t"
                "vdup.8       d3, d0[3]\n\t"
                "vdup.8       d0, d0[0]\n\t"

                "vld1.8       {q12}, [%[dst]]\n\t"
                "vld1.8       {d31}, [%[mask]]\n\t"
                "mov  %[keep_dst], %[dst]\n\t"

                "and  ip, %[w], #7\n\t"
                "add  %[mask], %[mask], ip\n\t"
                "add  %[dst], %[dst], ip, LSL#1\n\t"
                "subs  %[w], %[w], ip\n\t"
                "b  9f\n\t"
/* LOOP */
                "2:\n\t"

                "vld1.16      {q12}, [%[dst]]!\n\t"
                "vld1.8       {d31}, [%[mask]]!\n\t"
                "vst1.16      {q10}, [%[keep_dst]]\n\t"
                "sub  %[keep_dst], %[dst], #8*2\n\t"
                "subs  %[w], %[w], #8\n\t"
                "9:\n\t"
/* expand 0565 q12 to 8888 {d4-d7} */
                "vmovn.u16    d4, q12\t\n"
                "vshr.u16     q11, q12, #5\t\n"
                "vshr.u16     q10, q12, #6+5\t\n"
                "vmovn.u16    d5, q11\t\n"
                "vmovn.u16    d6, q10\t\n"
                "vshl.u8      d4, d4, #3\t\n"
                "vshl.u8      d5, d5, #2\t\n"
                "vshl.u8      d6, d6, #3\t\n"
                "vsri.u8      d4, d4, #5\t\n"
                "vsri.u8      d5, d5, #6\t\n"
                "vsri.u8      d6, d6, #5\t\n"

                "vmull.u8     q10, d31, d0\n\t"
                "vmull.u8     q11, d31, d1\n\t"
                "vmull.u8     q12, d31, d2\n\t"
                "vmull.u8     q13, d31, d3\n\t"
                "vrshr.u16    q8, q10, #8\n\t"
                "vrshr.u16    q9, q11, #8\n\t"
                "vraddhn.u16  d20, q10, q8\n\t"
                "vraddhn.u16  d21, q11, q9\n\t"
                "vrshr.u16    q9, q13, #8\n\t"
                "vrshr.u16    q8, q12, #8\n\t"
                "vraddhn.u16  d23, q13, q9\n\t"
                "vraddhn.u16  d22, q12, q8\n\t"

/* duplicate in 4/2/1 & 8pix vsns */
                "vmvn.8       d30, d23\n\t"
                "vmull.u8     q14, d30, d6\n\t"
                "vmull.u8     q13, d30, d5\n\t"
                "vmull.u8     q12, d30, d4\n\t"
                "vrshr.u16    q8, q14, #8\n\t"
                "vrshr.u16    q9, q13, #8\n\t"
                "vraddhn.u16  d6, q14, q8\n\t"
                "vrshr.u16    q8, q12, #8\n\t"
                "vraddhn.u16  d5, q13, q9\n\t"
                "vqadd.u8     d6, d6, d22\n\t"  /* moved up */
                "vraddhn.u16  d4, q12, q8\n\t"
/* intentionally don't calculate alpha */
/* result in d4-d6 */

/*              "vqadd.u8     d6, d6, d22\n\t"  ** moved up */
                "vqadd.u8     d5, d5, d21\n\t"
                "vqadd.u8     d4, d4, d20\n\t"

/* pack 8888 {d20-d23} to 0565 q10 */
                "vshll.u8     q10, d6, #8\n\t"
                "vshll.u8     q3, d5, #8\n\t"
                "vshll.u8     q2, d4, #8\n\t"
                "vsri.u16     q10, q3, #5\t\n"
                "vsri.u16     q10, q2, #11\t\n"

                "bne 2b\n\t"

                "1:\n\t"
                "vst1.16      {q10}, [%[keep_dst]]\n\t"

                : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
                : [src] "r" (src)
                : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                  "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                  "d30","d31"
                );
#endif
        }
    }
    else
    {
        while (height--)
        {
            void *dst4 = 0, *dst2 = 0;

            dst = dst_line;
            dst_line += dst_stride;
            mask = mask_line;
            mask_line += mask_stride;
            w = width;


#if 1 /* #ifndef USE_GCC_INLINE_ASM */
            /* Initialise these so that the lanes which are not loaded
             * below hold defined values; only the loaded lanes are ever
             * stored back. */
            uint8x8_t alpha = vdup_n_u8 (0);
            uint16x8_t dval = vdupq_n_u16 (0), temp;
            uint8x8x4_t sval8temp;

            if (w & 4)
            {
                alpha = vreinterpret_u8_u32 (vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (alpha), 1));
                dval = vreinterpretq_u16_u64 (vld1q_lane_u64 ((void *)dst, vreinterpretq_u64_u16 (dval), 1));
                dst4 = dst;
                mask += 4;
                dst += 4;
            }
            if (w & 2)
            {
                alpha = vreinterpret_u8_u16 (vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (alpha), 1));
                dval = vreinterpretq_u16_u32 (vld1q_lane_u32 ((void *)dst, vreinterpretq_u32_u16 (dval), 1));
                dst2 = dst;
                mask += 2;
                dst += 2;
            }
            if (w & 1)
            {
                alpha = vld1_lane_u8 ((void *)mask, alpha, 1);
                dval = vld1q_lane_u16 ((void *)dst, dval, 1);
            }

            sval8temp = neon8mul (sval8, alpha);
            temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));

            if (w & 1)
                vst1q_lane_u16 ((void *)dst, temp, 1);
            if (w & 2)
                vst1q_lane_u32 ((void *)dst2, vreinterpretq_u32_u16 (temp), 1);
            if (w & 4)
                vst1q_lane_u64 ((void *)dst4, vreinterpretq_u64_u16 (temp), 1);
#else
            /* this code has a bug (it does not pass the blitters-test) */
            asm volatile (
                "vdup.32      d0, %[src]\n\t"
                "vdup.8       d1, d0[1]\n\t"
                "vdup.8       d2, d0[2]\n\t"
                "vdup.8       d3, d0[3]\n\t"
                "vdup.8       d0, d0[0]\n\t"

                "tst  %[w], #4\t\n"
                "beq  skip_load4\t\n"

                "vld1.64      {d25}, [%[dst]]\n\t"
                "vld1.32      {d31[1]}, [%[mask]]\n\t"
                "mov  %[dst4], %[dst]\t\n"
                "add  %[mask], %[mask], #4\t\n"
                "add  %[dst], %[dst], #4*2\t\n"

                "skip_load4:\t\n"
                "tst  %[w], #2\t\n"
                "beq  skip_load2\t\n"
                "vld1.32      {d24[1]}, [%[dst]]\n\t"
                "vld1.16      {d31[1]}, [%[mask]]\n\t"
                "mov  %[dst2], %[dst]\t\n"
                "add  %[mask], %[mask], #2\t\n"
                "add  %[dst], %[dst], #2*2\t\n"

                "skip_load2:\t\n"
                "tst  %[w], #1\t\n"
                "beq  skip_load1\t\n"
                "vld1.16      {d24[1]}, [%[dst]]\n\t"
                "vld1.8       {d31[1]}, [%[mask]]\n\t"

                "skip_load1:\t\n"
/* expand 0565 q12 to 8888 {d4-d7} */
                "vmovn.u16    d4, q12\t\n"
                "vshr.u16     q11, q12, #5\t\n"
                "vshr.u16     q10, q12, #6+5\t\n"
                "vmovn.u16    d5, q11\t\n"
                "vmovn.u16    d6, q10\t\n"
                "vshl.u8      d4, d4, #3\t\n"
                "vshl.u8      d5, d5, #2\t\n"
                "vshl.u8      d6, d6, #3\t\n"
                "vsri.u8      d4, d4, #5\t\n"
                "vsri.u8      d5, d5, #6\t\n"
                "vsri.u8      d6, d6, #5\t\n"

                "vmull.u8     q10, d31, d0\n\t"
                "vmull.u8     q11, d31, d1\n\t"
                "vmull.u8     q12, d31, d2\n\t"
                "vmull.u8     q13, d31, d3\n\t"
                "vrshr.u16    q8, q10, #8\n\t"
                "vrshr.u16    q9, q11, #8\n\t"
                "vraddhn.u16  d20, q10, q8\n\t"
                "vraddhn.u16  d21, q11, q9\n\t"
                "vrshr.u16    q9, q13, #8\n\t"
                "vrshr.u16    q8, q12, #8\n\t"
                "vraddhn.u16  d23, q13, q9\n\t"
                "vraddhn.u16  d22, q12, q8\n\t"

/* duplicate in 4/2/1 & 8pix vsns */
                "vmvn.8       d30, d23\n\t"
                "vmull.u8     q14, d30, d6\n\t"
                "vmull.u8     q13, d30, d5\n\t"
                "vmull.u8     q12, d30, d4\n\t"
                "vrshr.u16    q8, q14, #8\n\t"
                "vrshr.u16    q9, q13, #8\n\t"
                "vraddhn.u16  d6, q14, q8\n\t"
                "vrshr.u16    q8, q12, #8\n\t"
                "vraddhn.u16  d5, q13, q9\n\t"
                "vqadd.u8     d6, d6, d22\n\t"  /* moved up */
                "vraddhn.u16  d4, q12, q8\n\t"
/* intentionally don't calculate alpha */
/* result in d4-d6 */

/*              "vqadd.u8     d6, d6, d22\n\t"  ** moved up */
                "vqadd.u8     d5, d5, d21\n\t"
                "vqadd.u8     d4, d4, d20\n\t"

/* pack 8888 {d20-d23} to 0565 q10 */
                "vshll.u8     q10, d6, #8\n\t"
                "vshll.u8     q3, d5, #8\n\t"
                "vshll.u8     q2, d4, #8\n\t"
                "vsri.u16     q10, q3, #5\t\n"
                "vsri.u16     q10, q2, #11\t\n"

                "tst  %[w], #1\n\t"
                "beq skip_store1\t\n"
                "vst1.16      {d20[1]}, [%[dst]]\t\n"
                "skip_store1:\t\n"
                "tst  %[w], #2\n\t"
                "beq  skip_store2\t\n"
                "vst1.32      {d20[1]}, [%[dst2]]\t\n"
                "skip_store2:\t\n"
                "tst  %[w], #4\n\t"
                "beq skip_store4\t\n"
                "vst1.16      {d21}, [%[dst4]]\t\n"
                "skip_store4:\t\n"

                : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [dst4] "+r" (dst4), [dst2] "+r" (dst2)
                : [src] "r" (src)
                : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                  "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                  "d30","d31"
                );
#endif
        }
    }
}

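/*
 * Solid source OVER an a8 mask onto an a8r8g8b8 destination:
 *
 *     src' = src * mask / 255
 *     dest = saturate (src' + dest * (255 - src'.alpha) / 255)
 */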
static void
neon_composite_over_n_8_8888 (pixman_implementation_t * impl,
                              pixman_op_t               op,
                              pixman_image_t *          src_image,
                              pixman_image_t *          mask_image,
                              pixman_image_t *          dst_image,
                              int32_t                   src_x,
                              int32_t                   src_y,
                              int32_t                   mask_x,
                              int32_t                   mask_y,
                              int32_t                   dest_x,
                              int32_t                   dest_y,
                              int32_t                   width,
                              int32_t                   height)
{
    uint32_t src, srca;
    uint32_t    *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    uint32_t w;
    uint8x8_t sval2;
    uint8x8x4_t sval8;
    uint8x8_t mask_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0101010100000000ULL));
    uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    /* bail out if fully transparent */
    srca = src >> 24;
    if (src == 0)
        return;

    sval2 = vreinterpret_u8_u32 (vdup_n_u32 (src));
    sval8.val[0] = vdup_lane_u8 (sval2, 0);
    sval8.val[1] = vdup_lane_u8 (sval2, 1);
    sval8.val[2] = vdup_lane_u8 (sval2, 2);
    sval8.val[3] = vdup_lane_u8 (sval2, 3);

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    if (width >= 8)
    {
        /* Use overlapping 8-pixel method, modified to avoid
         * rewritten dest being reused
         */
        while (height--)
        {
            uint32_t *keep_dst = 0;

            dst = dst_line;
            dst_line += dst_stride;
            mask = mask_line;
            mask_line += mask_stride;
            w = width;

#ifndef USE_GCC_INLINE_ASM
            uint8x8_t alpha;
            uint8x8x4_t dval, temp;

            alpha = vld1_u8 ((void *)mask);
            dval = vld4_u8 ((void *)dst);
            keep_dst = dst;

            temp = neon8mul (sval8, alpha);
            dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
            temp = neon8qadd (temp, dval);

            mask += (w & 7);
            dst += (w & 7);
            w -= (w & 7);

            while (w)
            {
                alpha = vld1_u8 ((void *)mask);
                dval = vld4_u8 ((void *)dst);

                vst4_u8 ((void *)keep_dst, temp);
                keep_dst = dst;

                temp = neon8mul (sval8, alpha);
                dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
                temp = neon8qadd (temp, dval);

                mask += 8;
                dst += 8;
                w -= 8;
            }
            vst4_u8 ((void *)keep_dst, temp);
#else
            asm volatile (
                "vdup.32      d0, %[src]\n\t"
                "vdup.8       d1, d0[1]\n\t"
                "vdup.8       d2, d0[2]\n\t"
                "vdup.8       d3, d0[3]\n\t"
                "vdup.8       d0, d0[0]\n\t"

                "vld4.8       {d4-d7}, [%[dst]]\n\t"
                "vld1.8       {d31}, [%[mask]]\n\t"
                "mov  %[keep_dst], %[dst]\n\t"

                "and  ip, %[w], #7\n\t"
                "add  %[mask], %[mask], ip\n\t"
                "add  %[dst], %[dst], ip, LSL#2\n\t"
                "subs  %[w], %[w], ip\n\t"
                "b 9f\n\t"
/* LOOP */
                "2:\n\t"
                "vld4.8       {d4-d7}, [%[dst]]!\n\t"
                "vld1.8       {d31}, [%[mask]]!\n\t"
                "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
                "sub  %[keep_dst], %[dst], #8*4\n\t"
                "subs  %[w], %[w], #8\n\t"
                "9:\n\t"

                "vmull.u8     q10, d31, d0\n\t"
                "vmull.u8     q11, d31, d1\n\t"
                "vmull.u8     q12, d31, d2\n\t"
                "vmull.u8     q13, d31, d3\n\t"
                "vrshr.u16    q8, q10, #8\n\t"
                "vrshr.u16    q9, q11, #8\n\t"
                "vraddhn.u16  d20, q10, q8\n\t"
                "vraddhn.u16  d21, q11, q9\n\t"
                "vrshr.u16    q9, q13, #8\n\t"
                "vrshr.u16    q8, q12, #8\n\t"
                "vraddhn.u16  d23, q13, q9\n\t"
                "vraddhn.u16  d22, q12, q8\n\t"

                "vmvn.8       d30, d23\n\t"
                "vmull.u8     q12, d30, d4\n\t"
                "vmull.u8     q13, d30, d5\n\t"
                "vmull.u8     q14, d30, d6\n\t"
                "vmull.u8     q15, d30, d7\n\t"

                "vrshr.u16    q8, q12, #8\n\t"
                "vrshr.u16    q9, q13, #8\n\t"
                "vraddhn.u16  d4, q12, q8\n\t"
                "vrshr.u16    q8, q14, #8\n\t"
                "vraddhn.u16  d5, q13, q9\n\t"
                "vrshr.u16    q9, q15, #8\n\t"
                "vraddhn.u16  d6, q14, q8\n\t"
                "vraddhn.u16  d7, q15, q9\n\t"
/* result in d4-d7 */

                "vqadd.u8     d20, d4, d20\n\t"
                "vqadd.u8     d21, d5, d21\n\t"
                "vqadd.u8     d22, d6, d22\n\t"
                "vqadd.u8     d23, d7, d23\n\t"

                "bne 2b\n\t"

                "1:\n\t"
                "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"

                : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
                : [src] "r" (src)
                : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
                "d30", "d31"
                );
#endif
        }
    }
    else
    {
        while (height--)
        {
            uint8x8_t alpha;

            dst = dst_line;
            dst_line += dst_stride;
            mask = mask_line;
            mask_line += mask_stride;
            w = width;

            while (w >= 2)
            {
                uint8x8_t dval, temp, res;

                alpha = vtbl1_u8 (
                    vreinterpret_u8_u16 (vld1_dup_u16 ((void *)mask)), mask_selector);
                dval = vld1_u8 ((void *)dst);

                temp = neon2mul (sval2, alpha);
                res = vqadd_u8 (
                    temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));

                vst1_u8 ((void *)dst, res);

                mask += 2;
                dst += 2;
                w -= 2;
            }

            if (w)
            {
                uint8x8_t dval, temp, res;

                alpha = vtbl1_u8 (vld1_dup_u8 ((void *)mask), mask_selector);
                dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst));

                temp = neon2mul (sval2, alpha);
                res = vqadd_u8 (
                    temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));

                vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (res), 0);
            }
        }
    }
}

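/*
 * Solid source, a8 mask, a8 destination (the fast path renamed from
 * add_8888_8_8 to add_n_8_8):
 *
 *     dest = saturate (dest + mask * src.alpha / 255)
 */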
static void
neon_composite_add_n_8_8 (pixman_implementation_t * impl,
                          pixman_op_t               op,
                          pixman_image_t *          src_image,
                          pixman_image_t *          mask_image,
                          pixman_image_t *          dst_image,
                          int32_t                   src_x,
                          int32_t                   src_y,
                          int32_t                   mask_x,
                          int32_t                   mask_y,
                          int32_t                   dest_x,
                          int32_t                   dest_y,
                          int32_t                   width,
                          int32_t                   height)
{
    uint8_t     *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    uint32_t w;
    uint32_t src;
    uint8x8_t sa;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
    sa = vdup_n_u8 ((src) >> 24);

    if (width >= 8)
    {
        /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
        while (height--)
        {
            dst = dst_line;
            dst_line += dst_stride;
            mask = mask_line;
            mask_line += mask_stride;
            w = width;

            uint8x8_t mval, dval, res;
            uint8_t     *keep_dst;

            mval = vld1_u8 ((void *)mask);
            dval = vld1_u8 ((void *)dst);
            keep_dst = dst;

            res = vqadd_u8 (neon2mul (mval, sa), dval);

            mask += (w & 7);
            dst += (w & 7);
            w -= w & 7;

            while (w)
            {
                mval = vld1_u8 ((void *)mask);
                dval = vld1_u8 ((void *)dst);
                vst1_u8 ((void *)keep_dst, res);
                keep_dst = dst;

                res = vqadd_u8 (neon2mul (mval, sa), dval);

                mask += 8;
                dst += 8;
                w -= 8;
            }
            vst1_u8 ((void *)keep_dst, res);
        }
    }
    else
    {
        /* Use 4/2/1 load/store method to handle 1-7 pixels */
        while (height--)
        {
            dst = dst_line;
            dst_line += dst_stride;
            mask = mask_line;
            mask_line += mask_stride;
            w = width;

            uint8x8_t mval = sa, dval = sa, res;
            uint8_t *dst4 = 0, *dst2 = 0;

            if (w & 4)
            {
                mval = vreinterpret_u8_u32 (
                    vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (mval), 1));
                dval = vreinterpret_u8_u32 (
                    vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1));

                dst4 = dst;
                mask += 4;
                dst += 4;
            }

            if (w & 2)
            {
                mval = vreinterpret_u8_u16 (
                    vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (mval), 1));
                dval = vreinterpret_u8_u16 (
                    vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1));
                dst2 = dst;
                mask += 2;
                dst += 2;
            }

            if (w & 1)
            {
                mval = vld1_lane_u8 (mask, mval, 1);
                dval = vld1_lane_u8 (dst, dval, 1);
            }

            res = vqadd_u8 (neon2mul (mval, sa), dval);

            if (w & 1)
                vst1_lane_u8 (dst, res, 1);
            if (w & 2)
                vst1_lane_u16 ((void *)dst2, vreinterpret_u16_u8 (res), 1);
            if (w & 4)
                vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (res), 1);
        }
    }
}

#ifdef USE_GCC_INLINE_ASM

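/*
 * 16bpp to 16bpp copy (SRC operator).  No per-pixel arithmetic is
 * needed, so each scanline is a straight memory copy: 64 pixels per
 * iteration with PLD prefetches of the next scanline, followed by a
 * 32/16/8/4/2/1-pixel tail.
 */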
1319 static void
1320 neon_composite_src_16_16 (pixman_implementation_t * impl,
1321                           pixman_op_t               op,
1322                           pixman_image_t *          src_image,
1323                           pixman_image_t *          mask_image,
1324                           pixman_image_t *          dst_image,
1325                           int32_t                   src_x,
1326                           int32_t                   src_y,
1327                           int32_t                   mask_x,
1328                           int32_t                   mask_y,
1329                           int32_t                   dest_x,
1330                           int32_t                   dest_y,
1331                           int32_t                   width,
1332                           int32_t                   height)
1333 {
1334     uint16_t    *dst_line, *src_line;
1335     uint32_t dst_stride, src_stride;
1336
1337     if (!height || !width)
1338         return;
1339
1340     /* We simply copy 16-bit-aligned pixels from one place to another. */
1341     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
1342     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1343
1344     /* Preload the first input scanline */
1345     {
1346         uint16_t *src_ptr = src_line;
1347         uint32_t count = width;
1348
1349         asm volatile (
1350             "0: @ loop                                                  \n"
1351             "   subs    %[count], %[count], #32                         \n"
1352             "   pld     [%[src]]                                        \n"
1353             "   add     %[src], %[src], #64                             \n"
1354             "   bgt 0b                                                  \n"
1355
1356             /* Clobbered input registers marked as input/outputs */
1357             : [src] "+r" (src_ptr), [count] "+r" (count)
1358             :     /* no unclobbered inputs */
1359             : "cc"
1360             );
1361     }
1362
1363     while (height--)
1364     {
1365         uint16_t *dst_ptr = dst_line;
1366         uint16_t *src_ptr = src_line;
1367         uint32_t count = width;
1368         uint32_t tmp = 0;
1369
1370         /* Uses multi-register access and preloading to maximise bandwidth.
1371          * Each pixel is one halfword, so a quadword contains 8px.
1372          * Preload frequency assumed a 64-byte cacheline.
1373          */
1374         asm volatile (
1375             "   cmp       %[count], #64                         \n"
1376             "   blt 1f    @ skip oversized fragments            \n"
1377             "0: @ start with eight quadwords at a time          \n"
1378             /* preload from next scanline */
1379             "   pld       [%[src], %[src_stride], LSL #1]       \n"
1380             "   sub       %[count], %[count], #64               \n"
1381             "   vld1.16   {d16, d17, d18, d19}, [%[src]]!               \n"
1382             "   vld1.16   {d20, d21, d22, d23}, [%[src]]!               \n"
1383             /* preload from next scanline */
1384             "   pld       [%[src], %[src_stride], LSL #1]       \n"
1385             "   vld1.16   {d24, d25, d26, d27}, [%[src]]!               \n"
1386             "   vld1.16   {d28, d29, d30, d31}, [%[src]]!               \n"
1387             "   cmp       %[count], #64                         \n"
1388             "   vst1.16   {d16, d17, d18, d19}, [%[dst]]!               \n"
1389             "   vst1.16   {d20, d21, d22, d23}, [%[dst]]!               \n"
1390             "   vst1.16   {d24, d25, d26, d27}, [%[dst]]!               \n"
1391             "   vst1.16   {d28, d29, d30, d31}, [%[dst]]!               \n"
1392             "   bge 0b                                          \n"
1393             "   cmp       %[count], #0                          \n"
1394             "   beq 7f    @ aligned fastpath                    \n"
1395             "1: @ four quadwords                                \n"
1396             "   tst       %[count], #32                         \n"
1397             "   beq 2f    @ skip oversized fragment             \n"
1398             /* preload from next scanline */
1399             "   pld       [%[src], %[src_stride], LSL #1]       \n"
1400             "   vld1.16   {d16, d17, d18, d19}, [%[src]]!               \n"
1401             "   vld1.16   {d20, d21, d22, d23}, [%[src]]!               \n"
1402             "   vst1.16   {d16, d17, d18, d19}, [%[dst]]!               \n"
1403             "   vst1.16   {d20, d21, d22, d23}, [%[dst]]!               \n"
1404             "2: @ two quadwords                                 \n"
1405             "   tst       %[count], #16                         \n"
1406             "   beq 3f    @ skip oversized fragment             \n"
1407             /* preload from next scanline */
1408             "   pld       [%[src], %[src_stride], LSL #1]       \n"
1409             "   vld1.16   {d16, d17, d18, d19}, [%[src]]!               \n"
1410             "   vst1.16   {d16, d17, d18, d19}, [%[dst]]!               \n"
1411             "3: @ one quadword                                  \n"
1412             "   tst       %[count], #8                          \n"
1413             "   beq 4f    @ skip oversized fragment             \n"
1414             "   vld1.16   {d16, d17}, [%[src]]!                 \n"
1415             "   vst1.16   {d16, d17}, [%[dst]]!                 \n"
1416             "4: @ one doubleword                                \n"
1417             "   tst       %[count], #4                          \n"
1418             "   beq 5f    @ skip oversized fragment             \n"
1419             "   vld1.16   {d16}, [%[src]]!                      \n"
1420             "   vst1.16   {d16}, [%[dst]]!                      \n"
1421             "5: @ one word                                      \n"
1422             "   tst       %[count], #2                          \n"
1423             "   beq 6f    @ skip oversized fragment             \n"
1424             "   ldr       %[tmp], [%[src]], #4                  \n"
1425             "   str       %[tmp], [%[dst]], #4                  \n"
1426             "6: @ one halfword                                  \n"
1427             "   tst       %[count], #1                          \n"
1428             "   beq 7f    @ skip oversized fragment             \n"
1429             "   ldrh      %[tmp], [%[src]]                      \n"
1430             "   strh      %[tmp], [%[dst]]                      \n"
1431             "7: @ end                                           \n"
1432
1433             /* Clobbered input registers marked as input/outputs */
1434             : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr),
1435               [count] "+r" (count), [tmp] "+r" (tmp)
1436
1437               /* Unclobbered input */
1438             : [src_stride] "r" (src_stride)
1439
1440               /* Clobbered vector registers */
1441             : "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
1442               "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc", "memory"
1443             );
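        /* The block above peels the remaining pixel count off in powers of
         * two: 64-pixel groups first, then 32, 16, 8, 4, 2 and finally a
         * single trailing pixel, so any scanline width is covered.
         */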
1444
1445         src_line += src_stride;
1446         dst_line += dst_stride;
1447     }
1448 }
1449
1450 #endif /* USE_GCC_INLINE_ASM */
1451
1452 static void
1453 neon_composite_src_24_16 (pixman_implementation_t * impl,
1454                           pixman_op_t               op,
1455                           pixman_image_t *          src_image,
1456                           pixman_image_t *          mask_image,
1457                           pixman_image_t *          dst_image,
1458                           int32_t                   src_x,
1459                           int32_t                   src_y,
1460                           int32_t                   mask_x,
1461                           int32_t                   mask_y,
1462                           int32_t                   dest_x,
1463                           int32_t                   dest_y,
1464                           int32_t                   width,
1465                           int32_t                   height)
1466 {
1467     uint16_t    *dst_line;
1468     uint32_t    *src_line;
1469     uint32_t dst_stride, src_stride;
1470
1471     if (!width || !height)
1472         return;
1473
1474     /* We convert each 32-bit source pixel straight to r5g6b5,
1475      * assuming that the source's alpha is opaque.
1476      */
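    /* In r5g6b5 the red field occupies bits 11-15, green bits 5-10 and blue
     * bits 0-4; each field keeps the top bits of the corresponding 8-bit
     * source channel.
     */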
1477     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1478     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1479
1480     /* Preload the first input scanline */
1481     {
1482         uint8_t *src_ptr = (uint8_t*) src_line;
1483         uint32_t count = (width + 15) / 16;
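        /* Step one 64-byte block (16 source pixels at 32 bpp) per iteration. */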
1484
1485 #ifdef USE_GCC_INLINE_ASM
1486         asm volatile (
1487             "0: @ loop                                          \n"
1488             "   subs    %[count], %[count], #1                  \n"
1489             "   pld     [%[src]]                                \n"
1490             "   add     %[src], %[src], #64                     \n"
1491             "   bgt 0b                                          \n"
1492
1493             /* Clobbered input registers marked as input/outputs */
1494             : [src] "+r" (src_ptr), [count] "+r" (count)
1495             :     /* no unclobbered inputs */
1496             : "cc"
1497             );
1498 #else
1499         do
1500         {
1501             __pld (src_ptr);
1502             src_ptr += 64;
1503         }
1504         while (--count);
1505 #endif
1506     }
1507
1508     while (height--)
1509     {
1510         uint16_t *dst_ptr = dst_line;
1511         uint32_t *src_ptr = src_line;
1512         uint32_t count = width;
1513         const uint32_t rb_mask = 0x1F;
1514         const uint32_t g_mask = 0x3F;
1515
1516         /* If you're going to complain about a goto, take a long hard look
1517          * at the massive blocks of assembler this skips over.  ;-)
1518          */
1519         if (count < 8)
1520             goto small_stuff;
1521
1522 #ifdef USE_GCC_INLINE_ASM
1523
1524         /* This is not as aggressive as the RGB565-source case.
1525          * When the formats differ, the source is generally in cached RAM,
1526          * so a preload from the next scanline is enough.
1527          *
1528          * We don't need to blend, so we never have to read back from the
1529          * uncached framebuffer.
1530          */
1531         asm volatile (
1532             "   cmp       %[count], #16                         \n"
1533             "   blt 1f    @ skip oversized fragments            \n"
1534             "0: @ start with sixteen pixels at a time           \n"
1535             "   sub       %[count], %[count], #16               \n"
1536             "   pld      [%[src], %[src_stride], lsl #2]        @ preload from next scanline                    \n"
1537             "   vld4.8    {d0, d1, d2, d3}, [%[src]]!           @ d3 is alpha and ignored, d2-0 are rgb.        \n"
1538             "   vld4.8    {d4, d5, d6, d7}, [%[src]]!           @ d7 is alpha and ignored, d6-4 are rgb.        \n"
1539             "   vshll.u8  q8, d2, #8                            @ expand first red for repacking                \n"
1540             "   vshll.u8  q10, d1, #8                           @ expand first green for repacking              \n"
1541             "   vshll.u8  q11, d0, #8                           @ expand first blue for repacking               \n"
1542             "   vshll.u8  q9, d6, #8                            @ expand second red for repacking               \n"
1543             "   vsri.u16  q8, q10, #5                           @ insert first green after red                  \n"
1544             "   vshll.u8  q10, d5, #8                           @ expand second green for repacking             \n"
1545             "   vsri.u16  q8, q11, #11                          @ insert first blue after green                 \n"
1546             "   vshll.u8  q11, d4, #8                           @ expand second blue for repacking              \n"
1547             "   vsri.u16  q9, q10, #5                           @ insert second green after red                 \n"
1548             "   vsri.u16  q9, q11, #11                          @ insert second blue after green                \n"
1549             "   cmp       %[count], #16                         \n"
1550             "   vst1.16   {d16, d17, d18, d19}, [%[dst]]!          @ store 16 pixels                            \n"
1551             "   bge 0b                                          \n"
1552             "1: @ end of main loop                              \n"
1553             "   cmp       %[count], #8                          @ can we still do an 8-pixel block?             \n"
1554             "   blt 2f                                          \n"
1555             "   sub       %[count], %[count], #8                \n"
1556             "   pld      [%[src], %[src_stride], lsl #2]        @ preload from next scanline                    \n"
1557             "   vld4.8    {d0, d1, d2, d3}, [%[src]]!           @ d3 is alpha and ignored, d2-0 are rgb.        \n"
1558             "   vshll.u8  q8, d2, #8                            @ expand first red for repacking                \n"
1559             "   vshll.u8  q10, d1, #8                           @ expand first green for repacking              \n"
1560             "   vshll.u8  q11, d0, #8                           @ expand first blue for repacking               \n"
1561             "   vsri.u16  q8, q10, #5                           @ insert first green after red                  \n"
1562             "   vsri.u16  q8, q11, #11                          @ insert first blue after green                 \n"
1563             "   vst1.16   {d16, d17}, [%[dst]]!          @ store 8 pixels                               \n"
1564             "2: @ end                                           \n"
1565
1566             /* Clobbered input and working registers marked as input/outputs */
1567             : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr), [count] "+r" (count)
1568
1569               /* Unclobbered input */
1570             : [src_stride] "r" (src_stride)
1571
1572               /* Clobbered vector registers */
1573
1574               /* NB: these are the quad aliases of the
1575                * double registers used in the asm
1576                */
1577             : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17",
1578               "d18", "d19", "d20", "d21", "d22", "d23", "cc", "memory"
1579             );
1580 #else
1581         /* A copy of the above code, in intrinsics-form. */
1582         while (count >= 16)
1583         {
1584             uint8x8x4_t pixel_set_a, pixel_set_b;
1585             uint16x8_t red_a, green_a, blue_a;
1586             uint16x8_t red_b, green_b, blue_b;
1587             uint16x8_t dest_pixels_a, dest_pixels_b;
1588
1589             count -= 16;
1590             __pld (src_ptr + src_stride);
1591             pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
1592             pixel_set_b = vld4_u8 ((uint8_t*)(src_ptr + 8));
1593             src_ptr += 16;
1594
1595             red_a   = vshll_n_u8 (pixel_set_a.val[2], 8);
1596             green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
1597             blue_a  = vshll_n_u8 (pixel_set_a.val[0], 8);
1598             
1599             red_b   = vshll_n_u8 (pixel_set_b.val[2], 8);
1600             green_b = vshll_n_u8 (pixel_set_b.val[1], 8);
1601             blue_b  = vshll_n_u8 (pixel_set_b.val[0], 8);
1602             
1603             dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
1604             dest_pixels_b = vsriq_n_u16 (red_b, green_b, 5);
1605             
1606             dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
1607             dest_pixels_b = vsriq_n_u16 (dest_pixels_b, blue_b, 11);
1608
1609             /* There doesn't seem to be an intrinsic for the
1610              * double-quadword variant
1611              */
1612             vst1q_u16 (dst_ptr, dest_pixels_a);
1613             vst1q_u16 (dst_ptr + 8, dest_pixels_b);
1614             dst_ptr += 16;
1615         }
1616
1617         /* 8-pixel loop */
1618         if (count >= 8)
1619         {
1620             uint8x8x4_t pixel_set_a;
1621             uint16x8_t red_a, green_a, blue_a;
1622             uint16x8_t dest_pixels_a;
1623
1624             __pld (src_ptr + src_stride);
1625             count -= 8;
1626             pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
1627             src_ptr += 8;
1628
1629             red_a   = vshll_n_u8 (pixel_set_a.val[2], 8);
1630             green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
1631             blue_a  = vshll_n_u8 (pixel_set_a.val[0], 8);
1632
1633             dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
1634             dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
1635
1636             vst1q_u16 (dst_ptr, dest_pixels_a);
1637             dst_ptr += 8;
1638         }
1639
1640 #endif  /* USE_GCC_INLINE_ASM */
1641
1642     small_stuff:
1643         if (count)
1644             __pld (src_ptr + src_stride);
1645
1646         while (count >= 2)
1647         {
1648             uint32_t src_pixel_a = *src_ptr++;
1649             uint32_t src_pixel_b = *src_ptr++;
1650
1651             /* ARM is really good at shift-then-ALU ops. */
1652             /* This should be a total of six shift-ANDs and five shift-ORs. */
1653             uint32_t dst_pixels_a;
1654             uint32_t dst_pixels_b;
1655
1656             dst_pixels_a  = ((src_pixel_a >>  3) & rb_mask);
1657             dst_pixels_a |= ((src_pixel_a >> 10) &  g_mask) << 5;
1658             dst_pixels_a |= ((src_pixel_a >> 19) & rb_mask) << 11;
1659
1660             dst_pixels_b  = ((src_pixel_b >>  3) & rb_mask);
1661             dst_pixels_b |= ((src_pixel_b >> 10) &  g_mask) << 5;
1662             dst_pixels_b |= ((src_pixel_b >> 19) & rb_mask) << 11;
1663
1664             /* little-endian mode only */
1665             *((uint32_t*) dst_ptr) = dst_pixels_a | (dst_pixels_b << 16);
1666             dst_ptr += 2;
1667             count -= 2;
1668         }
1669
1670         if (count)
1671         {
1672             uint32_t src_pixel = *src_ptr++;
1673
1674             /* ARM is really good at shift-then-ALU ops.
1675              * This block should end up as three shift-ANDs
1676              * and two shift-ORs.
1677              */
1678             uint32_t tmp_blue  = (src_pixel >>  3) & rb_mask;
1679             uint32_t tmp_green = (src_pixel >> 10) & g_mask;
1680             uint32_t tmp_red   = (src_pixel >> 19) & rb_mask;
1681             uint16_t dst_pixel = (tmp_red << 11) | (tmp_green << 5) | tmp_blue;
1682
1683             *dst_ptr++ = dst_pixel;
1684             count--;
1685         }
1686
1687         src_line += src_stride;
1688         dst_line += dst_stride;
1689     }
1690 }
1691
1692 static pixman_bool_t
1693 pixman_fill_neon (uint32_t *bits,
1694                   int       stride,
1695                   int       bpp,
1696                   int       x,
1697                   int       y,
1698                   int       width,
1699                   int       height,
1700                   uint32_t  _xor)
1701 {
1702     uint32_t byte_stride, color;
1703     char *dst;
1704
1705     /* stride is always multiple of 32bit units in pixman */
1706     byte_stride = stride * sizeof(uint32_t);
1707
1708     switch (bpp)
1709     {
1710     case 8:
1711         dst = ((char *) bits) + y * byte_stride + x;
1712         _xor &= 0xff;
1713         color = _xor << 24 | _xor << 16 | _xor << 8 | _xor;
1714         break;
1715
1716     case 16:
1717         dst = ((char *) bits) + y * byte_stride + x * 2;
1718         _xor &= 0xffff;
1719         color = _xor << 16 | _xor;
1720         width *= 2;         /* width to bytes */
1721         break;
1722
1723     case 32:
1724         dst = ((char *) bits) + y * byte_stride + x * 4;
1725         color = _xor;
1726         width *= 4;         /* width to bytes */
1727         break;
1728
1729     default:
1730         return FALSE;
1731     }
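    /* Example: for bpp == 16 and _xor == 0x001f (pure blue in r5g6b5), color
     * becomes 0x001f001f, so a single aligned 32-bit store writes two pixels;
     * the wider NEON stores below extend the same replication idea.
     */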
1732
1733 #ifdef USE_GCC_INLINE_ASM
1734     if (width < 16)
1735     {
1736         /* We have a special case for small widths, which don't allow
1737          * us to use wide 128-bit stores anyway. We don't waste time
1738          * trying to align the writes, since there are only a few per scanline.
1739          */
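        /* For reference, a rough C sketch (under the stated assumptions) of
         * the general per-scanline path that the assembler below implements;
         * the dedicated single-store loops for widths of exactly 8, 4 and 2
         * bytes are omitted. Kept disabled: the inline assembler is the code
         * that actually runs.
         */
#if 0
        {
            /* replicate the 32-bit fill pattern into a 64-bit NEON register */
            uint8x8_t color8 = vreinterpret_u8_u32 (vdup_n_u32 (color));
            char *d = dst;
            int h = height;

            while (h--)
            {
                char *line = d;

                if (width & 8)
                {
                    vst1_u8 ((uint8_t *) line, color8);     /* 8 bytes */
                    line += 8;
                }
                if (width & 4)
                {
                    *(uint32_t *) line = color;             /* 4 bytes */
                    line += 4;
                }
                if (width & 2)
                {
                    *(uint16_t *) line = (uint16_t) color;  /* 2 bytes */
                    line += 2;
                }
                if (width & 1)
                    *(uint8_t *) line = (uint8_t) color;    /* 1 byte  */

                d += byte_stride;
            }
        }
#endif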
1740         asm volatile (
1741             "cmp                %[height], #0\n"/* Check if empty fill */
1742             "beq                3f\n"
1743             "vdup.32    d0, %[color]\n"/* Replicate the fill colour into a NEON register */
1744
1745             /* Check whether the width can be handled with a single store
1746              * per scanline. This significantly reduces the number of
1747              * test/branch instructions executed for each scanline.
1748              */
1749             "cmp                %[width], #8\n"
1750             "beq                4f\n"
1751             "cmp                %[width], #4\n"
1752             "beq                5f\n"
1753             "cmp                %[width], #2\n"
1754             "beq                6f\n"
1755
1756             /* Loop starts here for each scanline */
1757             "1:\n"
1758             "mov                r4, %[dst]\n" /* Starting address of the current line */
1759             "tst                %[width], #8\n"
1760             "beq                2f\n"
1761             "vst1.8             {d0}, [r4]!\n"
1762             "2:\n"
1763             "tst                %[width], #4\n"
1764             "beq                2f\n"
1765             "str                %[color], [r4], #4\n"
1766             "2:\n"
1767             "tst                %[width], #2\n"
1768             "beq                2f\n"
1769             "strh               %[color], [r4], #2\n"
1770             "2:\n"
1771             "tst                %[width], #1\n"
1772             "beq                2f\n"
1773             "strb               %[color], [r4], #1\n"
1774             "2:\n"
1775
1776             "subs               %[height], %[height], #1\n"
1777             "add                %[dst], %[dst], %[byte_stride]\n"
1778             "bne                1b\n"
1779             "b          3f\n"
1780
1781             /* Special fillers for those widths that we can do with single operation */
1782             "4:\n"
1783             "subs               %[height], %[height], #1\n"
1784             "vst1.8             {d0}, [%[dst]]\n"
1785             "add                %[dst], %[dst], %[byte_stride]\n"
1786             "bne                4b\n"
1787             "b          3f\n"
1788
1789             "5:\n"
1790             "subs               %[height], %[height], #1\n"
1791             "str                %[color], [%[dst]]\n"
1792             "add                %[dst], %[dst], %[byte_stride]\n"
1793             "bne                5b\n"
1794             "b          3f\n"
1795
1796             "6:\n"
1797             "subs               %[height], %[height], #1\n"
1798             "strh               %[color], [%[dst]]\n"
1799             "add                %[dst], %[dst], %[byte_stride]\n"
1800             "bne                6b\n"
1801
1802             "3:\n"
1803             : [height] "+r" (height), [dst] "+r" (dst)
1804             : [color] "r" (color), [width] "r" (width),
1805               [byte_stride] "r" (byte_stride)
1806             : "memory", "cc", "d0", "r4");
1807     }
1808     else
1809     {
1810         asm volatile (
1811             "cmp                %[height], #0\n"/* Check if empty fill */
1812             "beq                5f\n"
1813             "vdup.32    q0, %[color]\n"/* Replicate the fill colour into a NEON register */
1814
1815             /* Loop starts here for each scanline */
1816             "1:\n"
1817             "mov                r4, %[dst]\n"/* Starting address of the current line */
1818             "mov                r5, %[width]\n"/* We're going to write this many bytes */
1819             "ands               r6, r4, #15\n"/* Are we at the 128-bit aligned address? */
1820             "beq                2f\n"/* Jump to the best case */
1821
1822             /* We're not 128-bit aligned: However, we know that we can get to the
1823                next aligned location, since the fill is at least 16 bytes wide */
1824             "rsb                r6, r6, #16\n" /* We would need to go forward this much */
1825             "sub                r5, r5, r6\n"/* Update bytes left */
1826             "tst                r6, #1\n"
1827             "beq                6f\n"
1828             "vst1.8             {d0[0]}, [r4]!\n"/* Store byte, now we are 16-bit aligned */
1829             "6:\n"
1830             "tst                r6, #2\n"
1831             "beq                6f\n"
1832             "vst1.16    {d0[0]}, [r4, :16]!\n"/* Store halfword, now we are 32-bit aligned */
1833             "6:\n"
1834             "tst                r6, #4\n"
1835             "beq                6f\n"
1836             "vst1.32    {d0[0]}, [r4, :32]!\n"/* Store word, now we're 64-bit aligned */
1837             "6:\n"
1838             "tst                r6, #8\n"
1839             "beq                2f\n"
1840             "vst1.64    {d0}, [r4, :64]!\n"/* Store doubleword, now we're 128-bit aligned */
1841
1842             /* The good case: We're 128-bit aligned for this scanline */
1843             "2:\n"
1844             "and                r6, r5, #15\n"/* Number of trailing bytes */
1845             "cmp                r5, r6\n"/* Do we have at least one qword to write? */
1846             "beq                6f\n"/* No, we just write the tail */
1847             "lsr                r5, r5, #4\n"/* This many full qwords to write */
1848
1849             /* The main block: Do 128-bit aligned writes */
1850             "3:\n"
1851             "subs               r5, r5, #1\n"
1852             "vst1.64    {d0, d1}, [r4, :128]!\n"
1853             "bne                3b\n"
1854
1855             /* Handle the trailing bytes: do 64, 32, 16 and 8-bit aligned writes as needed.
1856                We know that we're currently at a 128-bit aligned address, so we can just
1857                pick the biggest operations that the remaining write width allows */
1858             "6:\n"
1859             "cmp                r6, #0\n"
1860             "beq                4f\n"
1861             "tst                r6, #8\n"
1862             "beq                6f\n"
1863             "vst1.64    {d0}, [r4, :64]!\n"
1864             "6:\n"
1865             "tst                r6, #4\n"
1866             "beq                6f\n"
1867             "vst1.32    {d0[0]}, [r4, :32]!\n"
1868             "6:\n"
1869             "tst                r6, #2\n"
1870             "beq                6f\n"
1871             "vst1.16    {d0[0]}, [r4, :16]!\n"
1872             "6:\n"
1873             "tst                r6, #1\n"
1874             "beq                4f\n"
1875             "vst1.8             {d0[0]}, [r4]!\n"
1876             "4:\n"
1877
1878             /* Handle the next scanline */
1879             "subs               %[height], %[height], #1\n"
1880             "add                %[dst], %[dst], %[byte_stride]\n"
1881             "bne                1b\n"
1882             "5:\n"
1883             : [height] "+r" (height), [dst] "+r" (dst)
1884             : [color] "r" (color), [width] "r" (width),
1885               [byte_stride] "r" (byte_stride)
1886             : "memory", "cc", "d0", "d1", "r4", "r5", "r6");
1887     }
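
    /* Sketch of the bookkeeping used by the wide path above (width >= 16):
     *
     *   head = (dst & 15) ? 16 - (dst & 15) : 0;   bytes written with
     *                                              1/2/4/8-byte stores to
     *                                              reach 16-byte alignment
     *   body = (width - head) >> 4;                full 128-bit aligned stores
     *   tail = (width - head) & 15;                descending 8/4/2/1-byte
     *                                              stores
     */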
1888     return TRUE;
1889
1890 #else
1891
1892     /* TODO: intrinsic version for armcc */
1893     return FALSE;
1894
1895 #endif
1896 }
1897
1898 /* TODO: is there a more generic way of doing this being introduced? */
1899 #define NEON_SCANLINE_BUFFER_PIXELS (1024)
1900
1901 static inline void
1902 neon_quadword_copy (void *   dst,
1903                     void *   src,
1904                     uint32_t count,         /* of quadwords */
1905                     uint32_t trailer_count  /* of bytes */)
1906 {
1907     uint8_t *t_dst = dst, *t_src = src;
1908
1909     /* Uses aligned multi-register loads to maximise read bandwidth
1910      * on uncached memory such as framebuffers.
1911      * The accesses do not have the aligned qualifiers, so that the copy
1912      * may convert between aligned-uncached and unaligned-cached memory.
1913      * It is assumed that the CPU can infer alignedness from the address.
1914      */
1915
1916 #ifdef USE_GCC_INLINE_ASM
1917
1918     asm volatile (
1919         "       cmp       %[count], #8                          \n"
1920         "       blt 1f    @ skip oversized fragments            \n"
1921         "0: @ start with eight quadwords at a time              \n"
1922         "       sub       %[count], %[count], #8                \n"
1923         "       vld1.8    {d16, d17, d18, d19}, [%[src]]!               \n"
1924         "       vld1.8    {d20, d21, d22, d23}, [%[src]]!               \n"
1925         "       vld1.8    {d24, d25, d26, d27}, [%[src]]!               \n"
1926         "       vld1.8    {d28, d29, d30, d31}, [%[src]]!               \n"
1927         "       cmp       %[count], #8                          \n"
1928         "       vst1.8    {d16, d17, d18, d19}, [%[dst]]!               \n"
1929         "       vst1.8    {d20, d21, d22, d23}, [%[dst]]!               \n"
1930         "       vst1.8    {d24, d25, d26, d27}, [%[dst]]!               \n"
1931         "       vst1.8    {d28, d29, d30, d31}, [%[dst]]!               \n"
1932         "       bge 0b                                          \n"
1933         "1: @ four quadwords                                    \n"
1934         "       tst       %[count], #4                          \n"
1935         "       beq 2f    @ skip oversized fragment             \n"
1936         "       vld1.8    {d16, d17, d18, d19}, [%[src]]!               \n"
1937         "       vld1.8    {d20, d21, d22, d23}, [%[src]]!               \n"
1938         "       vst1.8    {d16, d17, d18, d19}, [%[dst]]!               \n"
1939         "       vst1.8    {d20, d21, d22, d23}, [%[dst]]!               \n"
1940         "2: @ two quadwords                                     \n"
1941         "       tst       %[count], #2                          \n"
1942         "       beq 3f    @ skip oversized fragment             \n"
1943         "       vld1.8    {d16, d17, d18, d19}, [%[src]]!               \n"
1944         "       vst1.8    {d16, d17, d18, d19}, [%[dst]]!               \n"
1945         "3: @ one quadword                                      \n"
1946         "       tst       %[count], #1                          \n"
1947         "       beq 4f    @ skip oversized fragment             \n"
1948         "       vld1.8    {d16, d17}, [%[src]]!                 \n"
1949         "       vst1.8    {d16, d17}, [%[dst]]!                 \n"
1950         "4: @ end                                               \n"
1951
1952         /* Clobbered input registers marked as input/outputs */
1953         : [dst] "+r" (t_dst), [src] "+r" (t_src), [count] "+r" (count)
1954
1955           /* No unclobbered inputs */
1956         :
1957
1958         /* Clobbered vector registers */
1959         : "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25",
1960           "d26", "d27", "d28", "d29", "d30", "d31", "cc", "memory");
1961
1962 #else
1963
1964     while (count >= 8)
1965     {
1966         uint8x16x4_t t1 = vld4q_u8 (t_src);
1967         uint8x16x4_t t2 = vld4q_u8 (t_src + sizeof(uint8x16x4_t));
1968         
1969         t_src += sizeof(uint8x16x4_t) * 2;
1970         vst4q_u8 (t_dst, t1);
1971         vst4q_u8 (t_dst + sizeof(uint8x16x4_t), t2);
1972         t_dst += sizeof(uint8x16x4_t) * 2;
1973         count -= 8;
1974     }
1975
1976     if (count & 4)
1977     {
1978         uint8x16x4_t t1 = vld4q_u8 (t_src);
1979         
1980         t_src += sizeof(uint8x16x4_t);
1981         vst4q_u8 (t_dst, t1);
1982         t_dst += sizeof(uint8x16x4_t);
1983     }
1984
1985     if (count & 2)
1986     {
1987         uint8x8x4_t t1 = vld4_u8 (t_src);
1988         
1989         t_src += sizeof(uint8x8x4_t);
1990         vst4_u8 (t_dst, t1);
1991         t_dst += sizeof(uint8x8x4_t);
1992     }
1993
1994     if (count & 1)
1995     {
1996         uint8x16_t t1 = vld1q_u8 (t_src);
1997         
1998         t_src += sizeof(uint8x16_t);
1999         vst1q_u8 (t_dst, t1);
2000         t_dst += sizeof(uint8x16_t);
2001     }
2002
2003 #endif  /* !USE_GCC_INLINE_ASM */
2004
2005     if (trailer_count)
2006     {
2007         if (trailer_count & 8)
2008         {
2009             uint8x8_t t1 = vld1_u8 (t_src);
2010             
2011             t_src += sizeof(uint8x8_t);
2012             vst1_u8 (t_dst, t1);
2013             t_dst += sizeof(uint8x8_t);
2014         }
2015
2016         if (trailer_count & 4)
2017         {
2018             *((uint32_t*) t_dst) = *((uint32_t*) t_src);
2019             
2020             t_dst += 4;
2021             t_src += 4;
2022         }
2023
2024         if (trailer_count & 2)
2025         {
2026             *((uint16_t*) t_dst) = *((uint16_t*) t_src);
2027             
2028             t_dst += 2;
2029             t_src += 2;
2030         }
2031
2032         if (trailer_count & 1)
2033         {
2034             *t_dst++ = *t_src++;
2035         }
2036     }
2037 }
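
/* Typical use: to move n bytes, pass the quadword count and the byte
 * remainder separately, e.g. neon_quadword_copy (dst, src, n >> 4, n & 15).
 */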
2038
2039 static inline void
2040 solid_over_565_8_pix_neon (uint32_t  glyph_colour,
2041                            uint16_t *dest,
2042                            uint8_t * in_mask,
2043                            uint32_t  dest_stride,    /* bytes, not elements */
2044                            uint32_t  mask_stride,
2045                            uint32_t  count           /* 8-pixel groups */)
2046 {
2047     /* Inner loop of glyph blitter (solid colour, alpha mask) */
2048
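    /* Per channel the result is roughly (fg * a + bg * (255 - a)) / 256,
     * where a is the glyph alpha scaled by the colour's alpha: the 5- and
     * 6-bit destination fields are widened to 8 bits by replicating their
     * top bits, blended as 16-bit product-sums, and the high bits are packed
     * straight back into r5g6b5.
     */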
2049 #ifdef USE_GCC_INLINE_ASM
2050
2051     asm volatile (
2052         "       vld4.8 {d20[], d21[], d22[], d23[]}, [%[glyph_colour]]  @ splat solid colour components \n"
2053         "0:     @ loop                                                                                                                                                          \n"
2054         "       vld1.16   {d0, d1}, [%[dest]]         @ load first pixels from framebuffer                      \n"
2055         "       vld1.8    {d17}, [%[in_mask]]         @ load alpha mask of glyph                                                \n"
2056         "       vmull.u8  q9, d17, d23               @ apply glyph colour alpha to mask                         \n"
2057         "       vshrn.u16 d17, q9, #8                @ reformat it to match original mask                       \n"
2058         "       vmvn      d18, d17                   @ we need the inverse mask for the background      \n"
2059         "       vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits                          \n"
2060         "       vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels                       \n"
2061         "       vshrn.u16 d4, q0, #3                 @ unpack green                                                                     \n"
2062         "       vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)                       \n"
2063         "       vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)          \n"
2064         "       vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)                     \n"
2065         "       vmull.u8  q1, d2, d18                @ apply inverse mask to background red...          \n"
2066         "       vmull.u8  q2, d4, d18                @ ...green...                                                                      \n"
2067         "       vmull.u8  q3, d6, d18                @ ...blue                                                                          \n"
2068         "       subs      %[count], %[count], #1     @ decrement/test loop counter                                      \n"
2069         "       vmlal.u8  q1, d17, d22               @ add masked foreground red...                                     \n"
2070         "       vmlal.u8  q2, d17, d21               @ ...green...                                                                      \n"
2071         "       vmlal.u8  q3, d17, d20               @ ...blue                                                                          \n"
2072         "       add %[in_mask], %[in_mask], %[mask_stride] @ advance mask pointer, while we wait                \n"
2073         "       vsri.16   q1, q2, #5                 @ pack green behind red                                            \n"
2074         "       vsri.16   q1, q3, #11                @ pack blue into pixels                                            \n"
2075         "       vst1.16   {d2, d3}, [%[dest]]         @ store composited pixels                                         \n"
2076         "       add %[dest], %[dest], %[dest_stride]  @ advance framebuffer pointer                                     \n"
2077         "       bne 0b                               @ next please                                                                      \n"
2078
2079         /* Clobbered registers marked as input/outputs */
2080         : [dest] "+r" (dest), [in_mask] "+r" (in_mask), [count] "+r" (count)
2081           
2082           /* Inputs */
2083         : [dest_stride] "r" (dest_stride), [mask_stride] "r" (mask_stride), [glyph_colour] "r" (&glyph_colour)
2084
2085           /* Clobbers, including the inputs we modify, and potentially lots of memory */
2086         : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d17", "d18", "d19",
2087           "d20", "d21", "d22", "d23", "d24", "d25", "cc", "memory"
2088         );
2089
2090 #else
2091
2092     uint8x8x4_t solid_colour = vld4_dup_u8 ((uint8_t*) &glyph_colour);
2093
2094     while (count--)
2095     {
2096         uint16x8_t pixels = vld1q_u16 (dest);
2097         uint8x8_t mask = vshrn_n_u16 (vmull_u8 (solid_colour.val[3], vld1_u8 (in_mask)), 8);
2098         uint8x8_t mask_image = vmvn_u8 (mask);
2099
2100         uint8x8_t t_red   = vshrn_n_u16 (pixels, 8);
2101         uint8x8_t t_green = vshrn_n_u16 (pixels, 3);
2102         uint8x8_t t_blue  = vshrn_n_u16 (vsliq_n_u16 (pixels, pixels, 5), 2);
2103
2104         uint16x8_t s_red   = vmull_u8 (vsri_n_u8 (t_red, t_red, 5), mask_image);
2105         uint16x8_t s_green = vmull_u8 (vsri_n_u8 (t_green, t_green, 6), mask_image);
2106         uint16x8_t s_blue  = vmull_u8 (t_blue, mask_image);
2107
2108         s_red   = vmlal_u8 (s_red, mask, solid_colour.val[2]);
2109         s_green = vmlal_u8 (s_green, mask, solid_colour.val[1]);
2110         s_blue  = vmlal_u8 (s_blue, mask, solid_colour.val[0]);
2111
2112         pixels = vsriq_n_u16 (s_red, s_green, 5);
2113         pixels = vsriq_n_u16 (pixels, s_blue, 11);
2114         vst1q_u16 (dest, pixels);
2115
2116         dest = (uint16_t *)((uint8_t *) dest + dest_stride); /* stride is in bytes */
2117         in_mask += mask_stride;
2118     }
2119
2120 #endif
2121 }
2122
2123 #if 0 /* this is broken currently */
2124 static void
2125 neon_composite_over_n_8_0565 (pixman_implementation_t * impl,
2126                               pixman_op_t               op,
2127                               pixman_image_t *          src_image,
2128                               pixman_image_t *          mask_image,
2129                               pixman_image_t *          dst_image,
2130                               int32_t                   src_x,
2131                               int32_t                   src_y,
2132                               int32_t                   mask_x,
2133                               int32_t                   mask_y,
2134                               int32_t                   dest_x,
2135                               int32_t                   dest_y,
2136                               int32_t                   width,
2137                               int32_t                   height)
2138 {
2139     uint32_t  src, srca;
2140     uint16_t *dst_line, *aligned_line;
2141     uint8_t  *mask_line;
2142     uint32_t  dst_stride, mask_stride;
2143     uint32_t  kernel_count, copy_count, copy_tail;
2144     uint8_t   kernel_offset, copy_offset;
2145
2146     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2147
2148     /* bail out if fully transparent or degenerate */
2149     srca = src >> 24;
2150     if (src == 0)
2151         return;
2152
2153     if (width == 0 || height == 0)
2154         return;
2155
2156     if (width > NEON_SCANLINE_BUFFER_PIXELS)
2157     {
2158         /* split the blit, so we can use a fixed-size scanline buffer
2159          * TODO: there must be a more elegant way of doing this.
2160          */
2161         int x;
2162         for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2163         {
2164             neon_composite_over_n_8_0565 (
2165                 impl, op,
2166                 src_image, mask_image, dst_image,
2167                 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2168                 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2169         }
2170
2171         return;
2172     }
2173     
2174     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2175     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2176
2177     /* keep within minimum number of aligned quadwords on width
2178      * while also keeping the minimum number of columns to process
2179      */
2180     {
2181         unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2182         unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2183         unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2184
2185         /* the fast copy should be quadword aligned */
2186         copy_offset = dst_line - ((uint16_t*) aligned_left);
2187         aligned_line = dst_line - copy_offset;
2188         copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
2189         copy_tail = 0;
2190
2191         if (aligned_right - aligned_left > ceiling_length)
2192         {
2193             /* unaligned routine is tightest */
2194             kernel_count = (uint32_t) (ceiling_length >> 4);
2195             kernel_offset = copy_offset;
2196         }
2197         else
2198         {
2199             /* aligned routine is equally tight, so it is safer to align */
2200             kernel_count = copy_count;
2201             kernel_offset = 0;
2202         }
2203
2204         /* We should avoid reading beyond scanline ends for safety */
2205         if (aligned_line < (dst_line - dest_x) ||
2206             (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2207         {
2208             /* switch to precise read */
2209             copy_offset = kernel_offset = 0;
2210             aligned_line = dst_line;
2211             kernel_count = (uint32_t) (ceiling_length >> 4);
2212             copy_count = (width * sizeof(*dst_line)) >> 4;
2213             copy_tail = (width * sizeof(*dst_line)) & 0xF;
2214         }
2215     }
2216
2217     {
2218         uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8];         /* deliberately not initialised */
2219         uint8_t glyph_line[NEON_SCANLINE_BUFFER_PIXELS + 8];
2220         int y = height;
2221
2222         /* row-major order */
2223         /* left edge, middle block, right edge */
2224         for ( ; y--; mask_line += mask_stride, aligned_line += dst_stride, dst_line += dst_stride)
2225         {
2226             /* We don't want to overrun the edges of the glyph,
2227              * so realign the edge data into known buffers
2228              */
2229             neon_quadword_copy (glyph_line + copy_offset, mask_line, width >> 4, width & 0xF);
2230
2231             /* Uncached framebuffer access is really, really slow
2232              * if we do it piecemeal. It should be much faster if we
2233              * grab it all at once. One scanline should easily fit in
2234              * L1 cache, so this should not waste RAM bandwidth.
2235              */
2236             neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2237
2238             /* Apply the actual filter */
2239             solid_over_565_8_pix_neon (
2240                 src, scan_line + kernel_offset,
2241                 glyph_line + kernel_offset, 8 * sizeof(*dst_line),
2242                 8, kernel_count);
2243
2244             /* Copy the modified scanline back */
2245             neon_quadword_copy (dst_line, scan_line + copy_offset,
2246                                 width >> 3, (width & 7) * 2);
2247         }
2248     }
2249 }
2250 #endif
2251
2252 #ifdef USE_GCC_INLINE_ASM
2253
2254 static inline void
2255 plain_over_565_8_pix_neon (uint32_t  colour,
2256                            uint16_t *dest,
2257                            uint32_t  dest_stride,     /* bytes, not elements */
2258                            uint32_t  count            /* 8-pixel groups */)
2259 {
2260     /* Inner loop for plain translucent rects
2261      * (solid colour without alpha mask)
2262      */
2263     asm volatile (
2264         "       vld4.8   {d20[], d21[], d22[], d23[]}, [%[colour]]  @ solid colour load/splat \n"
2265         "       vmull.u8  q12, d23, d22              @ premultiply alpha red   \n"
2266         "       vmull.u8  q13, d23, d21              @ premultiply alpha green \n"
2267         "       vmull.u8  q14, d23, d20              @ premultiply alpha blue  \n"
2268         "       vmvn      d18, d23                   @ inverse alpha for background \n"
2269         "0:     @ loop\n"
2270         "       vld1.16   {d0, d1}, [%[dest]]         @ load first pixels from framebuffer      \n"
2271         "       vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels       \n"
2272         "       vshrn.u16 d4, q0, #3                 @ unpack green                             \n"
2273         "       vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits          \n"
2274         "       vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)       \n"
2275         "       vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)     \n"
2276         "       vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)  \n"
2277         "       vmov      q0, q12                    @ retrieve foreground red   \n"
2278         "       vmlal.u8  q0, d2, d18                @ blend red - my kingdom for a four-operand MLA \n"
2279         "       vmov      q1, q13                    @ retrieve foreground green \n"
2280         "       vmlal.u8  q1, d4, d18                @ blend green               \n"
2281         "       vmov      q2, q14                    @ retrieve foreground blue  \n"
2282         "       vmlal.u8  q2, d6, d18                @ blend blue                \n"
2283         "       subs      %[count], %[count], #1     @ decrement/test loop counter              \n"
2284         "       vsri.16   q0, q1, #5                 @ pack green behind red                    \n"
2285         "       vsri.16   q0, q2, #11                @ pack blue into pixels                    \n"
2286         "       vst1.16   {d0, d1}, [%[dest]]         @ store composited pixels                 \n"
2287         "       add %[dest], %[dest], %[dest_stride]  @ advance framebuffer pointer             \n"
2288         "       bne 0b                               @ next please                              \n"
2289
2290         /* Clobbered registers marked as input/outputs */
2291         : [dest] "+r" (dest), [count] "+r" (count)
2292
2293           /* Inputs */
2294         : [dest_stride] "r" (dest_stride), [colour] "r" (&colour)
2295
2296           /* Clobbers, including the inputs we modify, and
2297            * potentially lots of memory
2298            */
2299         : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d18", "d19",
2300           "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
2301           "cc", "memory"
2302         );
2303 }
2304
2305 static void
2306 neon_composite_over_n_0565 (pixman_implementation_t * impl,
2307                             pixman_op_t               op,
2308                             pixman_image_t *          src_image,
2309                             pixman_image_t *          mask_image,
2310                             pixman_image_t *          dst_image,
2311                             int32_t                   src_x,
2312                             int32_t                   src_y,
2313                             int32_t                   mask_x,
2314                             int32_t                   mask_y,
2315                             int32_t                   dest_x,
2316                             int32_t                   dest_y,
2317                             int32_t                   width,
2318                             int32_t                   height)
2319 {
2320     uint32_t src, srca;
2321     uint16_t    *dst_line, *aligned_line;
2322     uint32_t dst_stride;
2323     uint32_t kernel_count, copy_count, copy_tail;
2324     uint8_t kernel_offset, copy_offset;
2325
2326     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2327
2328     /* bail out if fully transparent */
2329     srca = src >> 24;
2330     if (src == 0)
2331         return;
2332     
2333     if (width == 0 || height == 0)
2334         return;
2335
2336     if (width > NEON_SCANLINE_BUFFER_PIXELS)
2337     {
2338         /* split the blit, so we can use a fixed-size scanline buffer
2339          * TODO: there must be a more elegant way of doing this.
2340          */
2341         int x;
2342         
2343         for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2344         {
2345             neon_composite_over_n_0565 (
2346                 impl, op,
2347                 src_image, mask_image, dst_image,
2348                 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2349                 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2350         }
2351         return;
2352     }
2353
2354     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2355
2356     /* keep within minimum number of aligned quadwords on width
2357      * while also keeping the minimum number of columns to process
2358      */
2359     {
2360         unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2361         unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2362         unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2363
2364         /* the fast copy should be quadword aligned */
2365         copy_offset = dst_line - ((uint16_t*) aligned_left);
2366         aligned_line = dst_line - copy_offset;
2367         copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
2368         copy_tail = 0;
2369
2370         if (aligned_right - aligned_left > ceiling_length)
2371         {
2372             /* unaligned routine is tightest */
2373             kernel_count = (uint32_t) (ceiling_length >> 4);
2374             kernel_offset = copy_offset;
2375         }
2376         else
2377         {
2378             /* aligned routine is equally tight, so it is safer to align */
2379             kernel_count = copy_count;
2380             kernel_offset = 0;
2381         }
2382
2383         /* We should avoid reading beyond scanline ends for safety */
2384         if (aligned_line < (dst_line - dest_x) ||
2385             (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2386         {
2387             /* switch to precise read */
2388             copy_offset = kernel_offset = 0;
2389             aligned_line = dst_line;
2390             kernel_count = (uint32_t) (ceiling_length >> 4);
2391             copy_count = (width * sizeof(*dst_line)) >> 4;
2392             copy_tail = (width * sizeof(*dst_line)) & 0xF;
2393         }
2394     }
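
    /* Worked example (hypothetical addresses): with dst_line at 0x100e and
     * width = 100 pixels, aligned_left = 0x1000, copy_offset = 7 pixels,
     * aligned_right = 0x10e0, copy_count = 14 quadwords and ceiling_length =
     * 0xd0; since 0xe0 > 0xd0, the unaligned kernel is chosen with
     * kernel_count = 13 and kernel_offset = 7.
     */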
2395
2396     {
2397         uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8];  /* deliberately not initialised */
2398
2399         /* row-major order */
2400         /* left edge, middle block, right edge */
2401         for ( ; height--; aligned_line += dst_stride, dst_line += dst_stride)
2402         {
2403             /* Uncached framebuffer access is really, really slow if we do it piecemeal.
2404              * It should be much faster if we grab it all at once.
2405              * One scanline should easily fit in L1 cache, so this should
2406              * not waste RAM bandwidth.
2407              */
2408             neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2409
2410             /* Apply the actual filter */
2411             plain_over_565_8_pix_neon (
2412                 src, scan_line + kernel_offset, 8 * sizeof(*dst_line), kernel_count);
2413
2414             /* Copy the modified scanline back */
2415             neon_quadword_copy (
2416                 dst_line, scan_line + copy_offset, width >> 3, (width & 7) * 2);
2417         }
2418     }
2419 }
2420
2421 static inline void
2422 ARGB8_over_565_8_pix_neon (uint32_t *src,
2423                            uint16_t *dest,
2424                            uint32_t  src_stride,     /* bytes, not elements */
2425                            uint32_t  count           /* 8-pixel groups */)
2426 {
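    /* Inner loop of the OVER blend of a8r8g8b8 source pixels onto an r5g6b5
     * destination (used below by neon_composite_over_8888_0565).
     */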
2427     asm volatile (
2428         "0:     @ loop\n"
2429         "       pld   [%[src], %[src_stride]]         @ preload from next scanline      \n"
2430         "       vld1.16   {d0, d1}, [%[dest]]         @ load pixels from framebuffer    \n"
2431         "       vld4.8   {d20, d21, d22, d23},[%[src]]! @ load source image pixels              \n"
2432         "       vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits          \n"
2433         "       vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels       \n"
2434         "       vshrn.u16 d4, q0, #3                 @ unpack green                             \n"
2435         "       vmvn      d18, d23                   @ we need the inverse alpha for the background     \n"
2436         "       vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)       \n"
2437         "       vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)  \n"
2438         "       vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)     \n"
2439         "       vmull.u8  q1, d2, d18                @ apply inverse alpha to background red... \n"
2440         "       vmull.u8  q2, d4, d18                @ ...green...                              \n"
2441         "       vmull.u8  q3, d6, d18                @ ...blue                                  \n"
2442         "       subs      %[count], %[count], #1     @ decrement/test loop counter              \n"
2443         "       vmlal.u8  q1, d23, d22               @ add blended foreground red...            \n"
2444         "       vmlal.u8  q2, d23, d21               @ ...green...                              \n"
2445         "       vmlal.u8  q3, d23, d20               @ ...blue                                  \n"
2446         "       vsri.16   q1, q2, #5                 @ pack green behind red                    \n"
2447         "       vsri.16   q1, q3, #11                @ pack blue into pixels                    \n"
2448         "       vst1.16   {d2, d3}, [%[dest]]!        @ store composited pixels                 \n"
2449         "       bne 0b                               @ next please                              \n"
2450
2451         /* Clobbered registers marked as input/outputs */
2452         : [dest] "+r" (dest), [src] "+r" (src), [count] "+r" (count)
2453
2454           /* Inputs */
2455         : [src_stride] "r" (src_stride)
2456
2457           /* Clobbers, including the inputs we modify, and potentially lots of memory */
2458         : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d17", "d18", "d20",
2459           "d21", "d22", "d23", "cc", "memory"
2460         );
2461 }
2462
2463 static void
2464 neon_composite_over_8888_0565 (pixman_implementation_t * impl,
2465                                pixman_op_t               op,
2466                                pixman_image_t *          src_image,
2467                                pixman_image_t *          mask_image,
2468                                pixman_image_t *          dst_image,
2469                                int32_t                   src_x,
2470                                int32_t                   src_y,
2471                                int32_t                   mask_x,
2472                                int32_t                   mask_y,
2473                                int32_t                   dest_x,
2474                                int32_t                   dest_y,
2475                                int32_t                   width,
2476                                int32_t                   height)
2477 {
2478     uint32_t    *src_line;
2479     uint16_t    *dst_line, *aligned_line;
2480     uint32_t dst_stride, src_stride;
2481     uint32_t kernel_count, copy_count, copy_tail;
2482     uint8_t kernel_offset, copy_offset;
2483
2484     /* we assume mask is opaque 
2485      * so the only alpha to deal with is embedded in src
2486      */
2487     if (width > NEON_SCANLINE_BUFFER_PIXELS)
2488     {
2489         /* split the blit, so we can use a fixed-size scanline buffer */
2490         int x;
2491         for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2492         {
2493             neon_composite_over_8888_0565 (
2494                 impl, op,
2495                 src_image, mask_image, dst_image,
2496                 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2497                 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2498         }
2499         return;
2500     }
2501
2502     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2503     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2504
2505     /* keep within minimum number of aligned quadwords on width
2506      * while also keeping the minimum number of columns to process
2507      */
2508     {
2509         unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2510         unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2511         unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2512
2513         /* the fast copy should be quadword aligned */
2514         copy_offset = dst_line - ((uint16_t*) aligned_left);
2515         aligned_line = dst_line - copy_offset;
2516         copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
2517         copy_tail = 0;
2518
2519         if (aligned_right - aligned_left > ceiling_length)
2520         {
2521             /* unaligned routine is tightest */
2522             kernel_count = (uint32_t) (ceiling_length >> 4);
2523             kernel_offset = copy_offset;
2524         }
2525         else
2526         {
2527             /* aligned routine is equally tight, so it is safer to align */
2528             kernel_count = copy_count;
2529             kernel_offset = 0;
2530         }
2531
2532         /* We should avoid reading beyond scanline ends for safety */
2533         if (aligned_line < (dst_line - dest_x) ||
2534             (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2535         {
2536             /* switch to precise read */
2537             copy_offset = kernel_offset = 0;
2538             aligned_line = dst_line;
2539             kernel_count = (uint32_t) (ceiling_length >> 4);
2540             copy_count = (width * sizeof(*dst_line)) >> 4;
2541             copy_tail = (width * sizeof(*dst_line)) & 0xF;
2542         }
2543     }
2544
2545     /* Preload the first input scanline */
2546     {
2547         uint8_t *src_ptr = (uint8_t*) src_line;
2548         uint32_t count = (width + 15) / 16;
2549
2550 #ifdef USE_GCC_INLINE_ASM
2551         asm volatile (
2552             "0: @ loop                                          \n"
2553             "   subs    %[count], %[count], #1                  \n"
2554             "   pld     [%[src]]                                \n"
2555             "   add     %[src], %[src], #64                     \n"
2556             "   bgt 0b                                          \n"
2557
2558             /* Clobbered input registers marked as input/outputs */
2559             : [src] "+r" (src_ptr), [count] "+r" (count)
2560             :     /* no unclobbered inputs */
2561             : "cc"
2562             );
2563 #else
2564         do
2565         {
2566             __pld (src_ptr);
2567             src_ptr += 64;
2568         }
2569         while (--count);
2570 #endif
2571     }
2572
2573     {
2574         uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */
2575
2576         /* row-major order */
2577         /* left edge, middle block, right edge */
2578         for ( ; height--; src_line += src_stride, aligned_line += dst_stride)
2579         {
2580             /* Uncached framebuffer access is really, really slow if we do
2581              * it piecemeal. It should be much faster if we grab it all at
2582              * once. One scanline should easily fit in L1 cache, so this
2583              * should not waste RAM bandwidth.
2584              */
2585             neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2586
2587             /* Apply the actual filter */
2588             ARGB8_over_565_8_pix_neon (
2589                 src_line, scan_line + kernel_offset,
2590                 src_stride * sizeof(*src_line), kernel_count);
2591
2592             /* Copy the modified scanline back */
2593             neon_quadword_copy (dst_line,
2594                                 scan_line + copy_offset,
2595                                 width >> 3, (width & 7) * 2);
2596         }
2597     }
2598 }
2599
2600 #endif  /* USE_GCC_INLINE_ASM */
2601
2602 static const pixman_fast_path_t arm_neon_fast_path_array[] =
2603 {
2604     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       neon_composite_add_n_8_8,        0 },
2605     { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null,     PIXMAN_a8,       neon_composite_add_8000_8000,    0 },
2606     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   neon_composite_over_n_8_0565,    0 },
2607     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   neon_composite_over_n_8_0565,    0 },
2608     { PIXMAN_OP_SRC,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_src_24_16,        0 },
2609     { PIXMAN_OP_SRC,  PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_src_24_16,        0 },
2610     { PIXMAN_OP_SRC,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_24_16,        0 },
2611     { PIXMAN_OP_SRC,  PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_24_16,        0 },
2612 #ifdef USE_GCC_INLINE_ASM
2613     { PIXMAN_OP_SRC,  PIXMAN_r5g6b5,   PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_src_16_16,        0 },
2614     { PIXMAN_OP_SRC,  PIXMAN_b5g6r5,   PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_16_16,        0 },
2615 #if 0 /* this code has some bugs */
2616     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_over_n_0565,      0 },
2617     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_over_n_0565,      0 },
2618     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_over_8888_0565,   0 },
2619     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_over_8888_0565,   0 },
2620 #endif
2621 #endif
2622     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, neon_composite_over_8888_8888,   0 },
2623     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, neon_composite_over_8888_8888,   0 },
2624     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, neon_composite_over_8888_8888,   0 },
2625     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, neon_composite_over_8888_8888,   0 },
2626     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2627     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2628     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, neon_composite_over_n_8_8888,    0 },
2629     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, neon_composite_over_n_8_8888,    0 },
2630     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, neon_composite_over_n_8_8888,    0 },
2631     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, neon_composite_over_n_8_8888,    0 },
2632     { PIXMAN_OP_NONE },
2633 };
2634
2635 const pixman_fast_path_t *const arm_neon_fast_paths = arm_neon_fast_path_array;
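
/* Each entry above maps an (operator, source format, mask format,
 * destination format) combination onto a NEON composite routine;
 * _pixman_run_fast_path() is expected to scan the array in order and stop
 * at the PIXMAN_OP_NONE sentinel.  As a sketch, an additional entry would
 * look like this (hypothetical routine name, for illustration only):
 *
 *   { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8,
 *     neon_composite_add_8888_8888, 0 },
 *
 * NEED_SOLID_MASK marks paths that only apply when the mask image is a
 * solid fill, as in the over_8888_n_8888 entries.
 */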
2636
2637 static void
2638 arm_neon_composite (pixman_implementation_t *imp,
2639                     pixman_op_t              op,
2640                     pixman_image_t *         src,
2641                     pixman_image_t *         mask,
2642                     pixman_image_t *         dest,
2643                     int32_t                  src_x,
2644                     int32_t                  src_y,
2645                     int32_t                  mask_x,
2646                     int32_t                  mask_y,
2647                     int32_t                  dest_x,
2648                     int32_t                  dest_y,
2649                     int32_t                  width,
2650                     int32_t                  height)
2651 {
2652     if (_pixman_run_fast_path (arm_neon_fast_paths, imp,
2653                                op, src, mask, dest,
2654                                src_x, src_y,
2655                                mask_x, mask_y,
2656                                dest_x, dest_y,
2657                                width, height))
2658     {
2659         return;
2660     }
2661
2662     _pixman_implementation_composite (imp->delegate, op,
2663                                       src, mask, dest,
2664                                       src_x, src_y,
2665                                       mask_x, mask_y,
2666                                       dest_x, dest_y,
2667                                       width, height);
2668 }
2669
2670 static pixman_bool_t
2671 pixman_blt_neon (void *src_bits,
2672                  void *dst_bits,
2673                  int   src_stride,
2674                  int   dst_stride,
2675                  int   src_bpp,
2676                  int   dst_bpp,
2677                  int   src_x,
2678                  int   src_y,
2679                  int   dst_x,
2680                  int   dst_y,
2681                  int   width,
2682                  int   height)
2683 {
2684     if (!width || !height)
2685         return TRUE;
2686
2687     /* accelerate only straight copies involving complete bytes */
2688     if (src_bpp != dst_bpp || (src_bpp & 7))
2689         return FALSE;
2690
2691     {
2692         uint32_t bytes_per_pixel = src_bpp >> 3;
2693         uint32_t byte_width = width * bytes_per_pixel;
2694         /* stride parameters are given in 32-bit words, so convert to bytes */
2695         int32_t src_stride_bytes = src_stride * 4;
2696         int32_t dst_stride_bytes = dst_stride * 4;
2697         uint8_t *src_bytes = ((uint8_t*) src_bits) +
2698             src_y * src_stride_bytes + src_x * bytes_per_pixel;
2699         uint8_t *dst_bytes = ((uint8_t*) dst_bits) +
2700             dst_y * dst_stride_bytes + dst_x * bytes_per_pixel;
2701         uint32_t quadword_count = byte_width / 16;
2702         uint32_t offset         = byte_width % 16;
2703
2704         while (height--)
2705         {
2706             neon_quadword_copy (dst_bytes, src_bytes, quadword_count, offset);
2707             src_bytes += src_stride_bytes;
2708             dst_bytes += dst_stride_bytes;
2709         }
2710     }
2711
2712     return TRUE;
2713 }
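
/* Example of the unit handling in pixman_blt_neon (illustrative): for a
 * 32 bpp blit, bytes_per_pixel is 4, so a 100-pixel-wide row gives
 * byte_width = 400, i.e. 25 quadwords and no tail, while a 30-pixel-wide
 * row gives byte_width = 120, i.e. 7 quadwords plus an 8-byte tail.  The
 * stride arguments arrive in 32-bit words, as elsewhere in pixman, hence
 * the conversion to bytes with "* 4".
 */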
2714
2715 static pixman_bool_t
2716 arm_neon_blt (pixman_implementation_t *imp,
2717               uint32_t *               src_bits,
2718               uint32_t *               dst_bits,
2719               int                      src_stride,
2720               int                      dst_stride,
2721               int                      src_bpp,
2722               int                      dst_bpp,
2723               int                      src_x,
2724               int                      src_y,
2725               int                      dst_x,
2726               int                      dst_y,
2727               int                      width,
2728               int                      height)
2729 {
2730     if (pixman_blt_neon (
2731             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2732             src_x, src_y, dst_x, dst_y, width, height))
2733     {
2734         return TRUE;
2735     }
2736
2737     return _pixman_implementation_blt (
2738                imp->delegate,
2739                src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2740                src_x, src_y, dst_x, dst_y, width, height);
2741 }
2742
2743 static pixman_bool_t
2744 arm_neon_fill (pixman_implementation_t *imp,
2745                uint32_t *               bits,
2746                int                      stride,
2747                int                      bpp,
2748                int                      x,
2749                int                      y,
2750                int                      width,
2751                int                      height,
2752                uint32_t xor)
2753 {
2754     if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor))
2755         return TRUE;
2756
2757     return _pixman_implementation_fill (
2758         imp->delegate, bits, stride, bpp, x, y, width, height, xor);
2759 }
2760
2761 pixman_implementation_t *
2762 _pixman_implementation_create_arm_neon (void)
2763 {
2764     pixman_implementation_t *simd = _pixman_implementation_create_arm_simd ();
2765     pixman_implementation_t *imp = _pixman_implementation_create (simd);
2766
2767     imp->composite = arm_neon_composite;
2768 #if 0 /* this code has some bugs */
2769     imp->blt = arm_neon_blt;
2770 #endif
2771     imp->fill = arm_neon_fill;
2772
2773     return imp;
2774 }
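
/* Usage sketch (assumption, not shown in this file): the implementation
 * chooser is expected to call this constructor only after a runtime NEON
 * capability check, along the lines of
 *
 *   if (have_neon)    // result of a CPU-feature probe such as pixman_have_arm_neon()
 *       imp = _pixman_implementation_create_arm_neon ();
 *
 * The delegate chain built here is NEON -> ARM SIMD -> the rest of the
 * implementation stack, so any operation without a NEON fast path (or
 * with blt disabled above) falls through to the next implementation.
 */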
2775