ARM: Fixes for the inline assembly constraints in pixman_fill_neon
pixman/pixman-arm-neon.c
1 /*
2  * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
3  *
4  * Permission to use, copy, modify, distribute, and sell this software and its
5  * documentation for any purpose is hereby granted without fee, provided that
6  * the above copyright notice appear in all copies and that both that
7  * copyright notice and this permission notice appear in supporting
8  * documentation, and that the name of ARM Ltd not be used in
9  * advertising or publicity pertaining to distribution of the software without
10  * specific, written prior permission.  ARM Ltd makes no
11  * representations about the suitability of this software for any purpose.  It
12  * is provided "as is" without express or implied warranty.
13  *
14  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
20  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
21  * SOFTWARE.
22  *
23  * Author:  Ian Rickards (ian.rickards@arm.com)
24  * Author:  Jonathan Morton (jonathan.morton@movial.com)
25  * Author:  Markku Vire (markku.vire@movial.com)
26  *
27  */
28
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32
33 #include <arm_neon.h>
34 #include <string.h>
35 #include "pixman-private.h"
36
37 /* GCC does not provide ARM's __pld prefetch intrinsic, so map it to __builtin_prefetch */
38 #if !defined(__ARMCC_VERSION) && !defined(__pld)
39 #define __pld(_x) __builtin_prefetch (_x)
40 #endif
41
42 static force_inline uint8x8x4_t
43 unpack0565 (uint16x8_t rgb)
44 {
45     uint16x8_t gb, b;
46     uint8x8x4_t res;
47
48     res.val[3] = vdup_n_u8 (0);
49     gb = vshrq_n_u16 (rgb, 5);
50     b = vshrq_n_u16 (rgb, 5 + 6);
51
52     res.val[0] = vmovn_u16 (rgb);  /* get low 5 bits */
53     res.val[1] = vmovn_u16 (gb);   /* get mid 6 bits */
54     res.val[2] = vmovn_u16 (b);    /* get top 5 bits */
55
56     res.val[0] = vshl_n_u8 (res.val[0], 3); /* shift to top */
57     res.val[1] = vshl_n_u8 (res.val[1], 2); /* shift to top */
58     res.val[2] = vshl_n_u8 (res.val[2], 3); /* shift to top */
59
60     res.val[0] = vsri_n_u8 (res.val[0], res.val[0], 5);
61     res.val[1] = vsri_n_u8 (res.val[1], res.val[1], 6);
62     res.val[2] = vsri_n_u8 (res.val[2], res.val[2], 5);
63
64     return res;
65 }
66
67 static force_inline uint16x8_t
68 pack0565 (uint8x8x4_t s)
69 {
70     uint16x8_t rgb, val_g, val_r;
71
72     rgb = vshll_n_u8 (s.val[2], 8);
73     val_g = vshll_n_u8 (s.val[1], 8);
74     val_r = vshll_n_u8 (s.val[0], 8);
75     rgb = vsriq_n_u16 (rgb, val_g, 5);
76     rgb = vsriq_n_u16 (rgb, val_r, 5 + 6);
77
78     return rgb;
79 }
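
/*
 * Editorial sketch, not part of the original file: the scalar per-pixel
 * equivalent of unpack0565/pack0565 above.  The vshl/vsri pairs perform bit
 * replication so that, for example, the 5-bit field value 0x1f expands to
 * 0xff rather than 0xf8.  The helper names below are hypothetical.
 */
static force_inline void
scalar_unpack0565 (uint16_t p, uint8_t *c_lo, uint8_t *c_mid, uint8_t *c_hi)
{
    uint8_t lo  = p & 0x1f;          /* low 5 bits */
    uint8_t mid = (p >> 5) & 0x3f;   /* mid 6 bits */
    uint8_t hi  = (p >> 11) & 0x1f;  /* top 5 bits */

    *c_lo  = (uint8_t) ((lo << 3) | (lo >> 2));   /* replicate top bits downwards */
    *c_mid = (uint8_t) ((mid << 2) | (mid >> 4));
    *c_hi  = (uint8_t) ((hi << 3) | (hi >> 2));
}

static force_inline uint16_t
scalar_pack0565 (uint8_t c_lo, uint8_t c_mid, uint8_t c_hi)
{
    return (uint16_t) (((c_hi >> 3) << 11) | ((c_mid >> 2) << 5) | (c_lo >> 3));
}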
80
81 static force_inline uint8x8_t
82 neon2mul (uint8x8_t x,
83           uint8x8_t alpha)
84 {
85     uint16x8_t tmp, tmp2;
86     uint8x8_t res;
87
88     tmp = vmull_u8 (x, alpha);
89     tmp2 = vrshrq_n_u16 (tmp, 8);
90     res = vraddhn_u16 (tmp, tmp2);
91
92     return res;
93 }
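
/*
 * Editorial sketch, not part of the original file: neon2mul computes a
 * correctly rounded x * alpha / 255 for each byte.  vrshrq_n_u16 (t, 8)
 * yields (t + 128) >> 8, and vraddhn_u16 adds that back to t with another
 * rounding constant of 128 and keeps the high byte, i.e. the classic
 * (t + 128 + ((t + 128) >> 8)) >> 8 identity.  Scalar form (hypothetical
 * helper name):
 */
static force_inline uint8_t
scalar_mul_div255 (uint8_t x, uint8_t alpha)
{
    uint16_t t = (uint16_t) (x * alpha) + 128;

    return (uint8_t) ((t + (t >> 8)) >> 8);
}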
94
95 static force_inline uint8x8x4_t
96 neon8mul (uint8x8x4_t x,
97           uint8x8_t   alpha)
98 {
99     uint16x8x4_t tmp;
100     uint8x8x4_t res;
101     uint16x8_t qtmp1, qtmp2;
102
103     tmp.val[0] = vmull_u8 (x.val[0], alpha);
104     tmp.val[1] = vmull_u8 (x.val[1], alpha);
105     tmp.val[2] = vmull_u8 (x.val[2], alpha);
106     tmp.val[3] = vmull_u8 (x.val[3], alpha);
107
108     qtmp1 = vrshrq_n_u16 (tmp.val[0], 8);
109     qtmp2 = vrshrq_n_u16 (tmp.val[1], 8);
110     res.val[0] = vraddhn_u16 (tmp.val[0], qtmp1);
111     qtmp1 = vrshrq_n_u16 (tmp.val[2], 8);
112     res.val[1] = vraddhn_u16 (tmp.val[1], qtmp2);
113     qtmp2 = vrshrq_n_u16 (tmp.val[3], 8);
114     res.val[2] = vraddhn_u16 (tmp.val[2], qtmp1);
115     res.val[3] = vraddhn_u16 (tmp.val[3], qtmp2);
116
117     return res;
118 }
119
120 static force_inline uint8x8x4_t
121 neon8qadd (uint8x8x4_t x,
122            uint8x8x4_t y)
123 {
124     uint8x8x4_t res;
125
126     res.val[0] = vqadd_u8 (x.val[0], y.val[0]);
127     res.val[1] = vqadd_u8 (x.val[1], y.val[1]);
128     res.val[2] = vqadd_u8 (x.val[2], y.val[2]);
129     res.val[3] = vqadd_u8 (x.val[3], y.val[3]);
130
131     return res;
132 }
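
/*
 * Editorial sketch, not part of the original file: the compositing routines
 * below combine these helpers to evaluate OVER on premultiplied data,
 *
 *     dest = src + dest * (255 - src_alpha) / 255   (saturating add)
 *
 * with neon8mul doing the scaled multiply and neon8qadd the saturating add.
 * A one-channel scalar reference, using the hypothetical scalar_mul_div255
 * helper sketched above:
 */
static force_inline uint8_t
scalar_over_channel (uint8_t src, uint8_t src_alpha, uint8_t dest)
{
    uint16_t sum = src + scalar_mul_div255 (dest, (uint8_t) (255 - src_alpha));

    return (uint8_t) (sum > 255 ? 255 : sum);   /* vqadd.u8 saturates */
}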
133
134 static void
135 neon_composite_add_8000_8000 (pixman_implementation_t * impl,
136                               pixman_op_t               op,
137                               pixman_image_t *          src_image,
138                               pixman_image_t *          mask_image,
139                               pixman_image_t *          dst_image,
140                               int32_t                   src_x,
141                               int32_t                   src_y,
142                               int32_t                   mask_x,
143                               int32_t                   mask_y,
144                               int32_t                   dest_x,
145                               int32_t                   dest_y,
146                               int32_t                   width,
147                               int32_t                   height)
148 {
149     uint8_t     *dst_line, *dst;
150     uint8_t     *src_line, *src;
151     int dst_stride, src_stride;
152     uint16_t w;
153
154     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
155     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
156
157     if (width >= 8)
158     {
159         /* Use overlapping 8-pixel method */
160         while (height--)
161         {
162             uint8_t *keep_dst = 0;
163             uint8x8_t sval, dval, temp;
164
165             dst = dst_line;
166             dst_line += dst_stride;
167             src = src_line;
168             src_line += src_stride;
169             w = width;
170
171 #ifndef USE_GCC_INLINE_ASM
172             sval = vld1_u8 ((void*)src);
173             dval = vld1_u8 ((void*)dst);
174             keep_dst = dst;
175
176             temp = vqadd_u8 (dval, sval);
177
178             src += (w & 7);
179             dst += (w & 7);
180             w -= (w & 7);
181
182             while (w)
183             {
184                 sval = vld1_u8 ((void*)src);
185                 dval = vld1_u8 ((void*)dst);
186
187                 vst1_u8 ((void*)keep_dst, temp);
188                 keep_dst = dst;
189
190                 temp = vqadd_u8 (dval, sval);
191
192                 src += 8;
193                 dst += 8;
194                 w -= 8;
195             }
196
197             vst1_u8 ((void*)keep_dst, temp);
198 #else
199             asm volatile (
200 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
201                 "vld1.8  {d0}, [%[src]]\n\t"
202                 "vld1.8  {d4}, [%[dst]]\n\t"
203                 "mov     %[keep_dst], %[dst]\n\t"
204
205                 "and ip, %[w], #7\n\t"
206                 "add %[src], %[src], ip\n\t"
207                 "add %[dst], %[dst], ip\n\t"
208                 "subs %[w], %[w], ip\n\t"
209                 "b 9f\n\t"
210 /* LOOP */
211                 "2:\n\t"
212                 "vld1.8  {d0}, [%[src]]!\n\t"
213                 "vld1.8  {d4}, [%[dst]]!\n\t"
214                 "vst1.8  {d20}, [%[keep_dst]]\n\t"
215                 "sub     %[keep_dst], %[dst], #8\n\t"
216                 "subs %[w], %[w], #8\n\t"
217                 "9:\n\t"
218                 "vqadd.u8 d20, d0, d4\n\t"
219
220                 "bne 2b\n\t"
221
222                 "1:\n\t"
223                 "vst1.8  {d20}, [%[keep_dst]]\n\t"
224
225                 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
226                 :
227                 : "ip", "cc", "memory", "d0", "d4",
228                 "d20"
229                 );
230 #endif
231         }
232     }
233     else
234     {
235         const uint8_t nil = 0;
236         const uint8x8_t vnil = vld1_dup_u8 (&nil);
237
238         while (height--)
239         {
240             uint8x8_t sval = vnil, dval = vnil;
241             uint8_t *dst4 = 0, *dst2 = 0;
242
243             dst = dst_line;
244             dst_line += dst_stride;
245             src = src_line;
246             src_line += src_stride;
247             w = width;
248
249             if (w & 4)
250             {
251                 sval = vreinterpret_u8_u32 (
252                     vld1_lane_u32 ((void*)src, vreinterpret_u32_u8 (sval), 1));
253                 dval = vreinterpret_u8_u32 (
254                     vld1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (dval), 1));
255
256                 dst4 = dst;
257                 src += 4;
258                 dst += 4;
259             }
260
261             if (w & 2)
262             {
263                 sval = vreinterpret_u8_u16 (
264                     vld1_lane_u16 ((void*)src, vreinterpret_u16_u8 (sval), 1));
265                 dval = vreinterpret_u8_u16 (
266                     vld1_lane_u16 ((void*)dst, vreinterpret_u16_u8 (dval), 1));
267
268                 dst2 = dst;
269                 src += 2;
270                 dst += 2;
271             }
272
273             if (w & 1)
274             {
275                 sval = vld1_lane_u8 (src, sval, 1);
276                 dval = vld1_lane_u8 (dst, dval, 1);
277             }
278
279             dval = vqadd_u8 (dval, sval);
280
281             if (w & 1)
282                 vst1_lane_u8 (dst, dval, 1);
283
284             if (w & 2)
285                 vst1_lane_u16 ((void*)dst2, vreinterpret_u16_u8 (dval), 1);
286
287             if (w & 4)
288                 vst1_lane_u32 ((void*)dst4, vreinterpret_u32_u8 (dval), 1);
289         }
290     }
291 }
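
/*
 * Editorial sketch, not part of the original file: the "overlapping 8-pixel
 * method" used above, shown in plain C for the a8 ADD case (width >= 8).
 * The first group covers the leading (width & 7) pixels and overlaps the
 * first aligned group; every store is deferred via keep_dst until after the
 * next group has been loaded, so re-reading the overlapped destination bytes
 * never picks up freshly written (already-added) results.  For width < 8 the
 * code instead assembles the pixels with the 4/2/1-wide lane loads and
 * stores seen in the else branch.
 */
static force_inline void
overlap8_add_sketch (uint8_t *dst, const uint8_t *src, int width)
{
    uint8_t pending[8], next[8], *keep_dst;
    int i, t, w = width;

    for (i = 0; i < 8; i++)                     /* leading, possibly overlapping group */
    {
        t = dst[i] + src[i];
        pending[i] = (uint8_t) (t > 255 ? 255 : t);
    }
    keep_dst = dst;

    src += w & 7;
    dst += w & 7;
    w -= w & 7;

    while (w)
    {
        for (i = 0; i < 8; i++)
        {
            t = dst[i] + src[i];
            next[i] = (uint8_t) (t > 255 ? 255 : t);
        }

        memcpy (keep_dst, pending, 8);          /* deferred store of the previous group */
        keep_dst = dst;
        memcpy (pending, next, 8);

        src += 8;
        dst += 8;
        w -= 8;
    }

    memcpy (keep_dst, pending, 8);              /* final deferred store */
}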
292
293 static void
294 neon_composite_over_8888_8888 (pixman_implementation_t * impl,
295                                pixman_op_t               op,
296                                pixman_image_t *          src_image,
297                                pixman_image_t *          mask_image,
298                                pixman_image_t *          dst_image,
299                                int32_t                   src_x,
300                                int32_t                   src_y,
301                                int32_t                   mask_x,
302                                int32_t                   mask_y,
303                                int32_t                   dest_x,
304                                int32_t                   dest_y,
305                                int32_t                   width,
306                                int32_t                   height)
307 {
308     uint32_t    *dst_line, *dst;
309     uint32_t    *src_line, *src;
310     int dst_stride, src_stride;
311     uint32_t w;
312
313     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
314     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
315
316     if (width >= 8)
317     {
318         /* Use overlapping 8-pixel method */
319         while (height--)
320         {
321             uint32_t *keep_dst = 0;
322             uint8x8x4_t sval, dval, temp;
323
324             dst = dst_line;
325             dst_line += dst_stride;
326             src = src_line;
327             src_line += src_stride;
328             w = width;
329
330 #ifndef USE_GCC_INLINE_ASM
331             sval = vld4_u8 ((void*)src);
332             dval = vld4_u8 ((void*)dst);
333             keep_dst = dst;
334
335             temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
336             temp = neon8qadd (sval, temp);
337
338             src += (w & 7);
339             dst += (w & 7);
340             w -= (w & 7);
341
342             while (w)
343             {
344                 sval = vld4_u8 ((void*)src);
345                 dval = vld4_u8 ((void*)dst);
346
347                 vst4_u8 ((void*)keep_dst, temp);
348                 keep_dst = dst;
349
350                 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
351                 temp = neon8qadd (sval, temp);
352
353                 src += 8;
354                 dst += 8;
355                 w -= 8;
356             }
357
358             vst4_u8 ((void*)keep_dst, temp);
359 #else
360             asm volatile (
361 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
362                 "vld4.8  {d0-d3}, [%[src]]\n\t"
363                 "vld4.8  {d4-d7}, [%[dst]]\n\t"
364                 "mov     %[keep_dst], %[dst]\n\t"
365
366                 "and ip, %[w], #7\n\t"
367                 "add %[src], %[src], ip, LSL#2\n\t"
368                 "add %[dst], %[dst], ip, LSL#2\n\t"
369                 "subs %[w], %[w], ip\n\t"
370                 "b 9f\n\t"
371 /* LOOP */
372                 "2:\n\t"
373                 "vld4.8  {d0-d3}, [%[src]]!\n\t"
374                 "vld4.8  {d4-d7}, [%[dst]]!\n\t"
375                 "vst4.8  {d20-d23}, [%[keep_dst]]\n\t"
376                 "sub     %[keep_dst], %[dst], #8*4\n\t"
377                 "subs %[w], %[w], #8\n\t"
378                 "9:\n\t"
379                 "vmvn.8  d31, d3\n\t"
380                 "vmull.u8 q10, d31, d4\n\t"
381                 "vmull.u8 q11, d31, d5\n\t"
382                 "vmull.u8 q12, d31, d6\n\t"
383                 "vmull.u8 q13, d31, d7\n\t"
384                 "vrshr.u16 q8, q10, #8\n\t"
385                 "vrshr.u16 q9, q11, #8\n\t"
386                 "vraddhn.u16 d20, q10, q8\n\t"
387                 "vraddhn.u16 d21, q11, q9\n\t"
388                 "vrshr.u16 q8, q12, #8\n\t"
389                 "vrshr.u16 q9, q13, #8\n\t"
390                 "vraddhn.u16 d22, q12, q8\n\t"
391                 "vraddhn.u16 d23, q13, q9\n\t"
392 /* result in d20-d23 */
393                 "vqadd.u8 d20, d0, d20\n\t"
394                 "vqadd.u8 d21, d1, d21\n\t"
395                 "vqadd.u8 d22, d2, d22\n\t"
396                 "vqadd.u8 d23, d3, d23\n\t"
397
398                 "bne 2b\n\t"
399
400                 "1:\n\t"
401                 "vst4.8  {d20-d23}, [%[keep_dst]]\n\t"
402
403                 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
404                 :
405                 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
406                 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23"
407                 );
408 #endif
409         }
410     }
411     else
412     {
413         uint8x8_t alpha_selector = vreinterpret_u8_u64 (
414             vcreate_u64 (0x0707070703030303ULL));
415
416         /* Handle width < 8 */
417         while (height--)
418         {
419             dst = dst_line;
420             dst_line += dst_stride;
421             src = src_line;
422             src_line += src_stride;
423             w = width;
424
425             while (w >= 2)
426             {
427                 uint8x8_t sval, dval;
428
429                 /* two 32-bit pixels packed into one D register; ad hoc vectorization */
430                 sval = vreinterpret_u8_u32 (vld1_u32 ((void*)src));
431                 dval = vreinterpret_u8_u32 (vld1_u32 ((void*)dst));
432                 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
433                 vst1_u8 ((void*)dst, vqadd_u8 (sval, dval));
434
435                 src += 2;
436                 dst += 2;
437                 w -= 2;
438             }
439
440             if (w)
441             {
442                 uint8x8_t sval, dval;
443
444                 /* single 32-bit pixel in lane 0 */
445                 sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)src));  /* only interested in lane 0 */
446                 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)dst));  /* only interested in lane 0 */
447                 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
448                 vst1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
449             }
450         }
451     }
452 }
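
/*
 * Editorial sketch, not part of the original file: the narrow path above
 * packs two a8r8g8b8 pixels into one D register (bytes 0-3 and 4-7) and uses
 * alpha_selector, whose bytes are {3,3,3,3,7,7,7,7} when the constant
 * 0x0707070703030303 is stored little-endian, to make vtbl1_u8 replicate
 * each pixel's alpha byte across that pixel's four channel bytes.  Applied
 * to vmvn_u8 (sval) this gives 255 - alpha per channel.  Hypothetical helper
 * showing just the broadcast:
 */
static force_inline uint8x8_t
broadcast_alpha_2px_sketch (uint8x8_t two_pixels)
{
    static const uint8_t sel[8] = { 3, 3, 3, 3, 7, 7, 7, 7 };

    return vtbl1_u8 (two_pixels, vld1_u8 (sel));
}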
453
454 static void
455 neon_composite_over_8888_n_8888 (pixman_implementation_t * impl,
456                                  pixman_op_t               op,
457                                  pixman_image_t *          src_image,
458                                  pixman_image_t *          mask_image,
459                                  pixman_image_t *          dst_image,
460                                  int32_t                   src_x,
461                                  int32_t                   src_y,
462                                  int32_t                   mask_x,
463                                  int32_t                   mask_y,
464                                  int32_t                   dest_x,
465                                  int32_t                   dest_y,
466                                  int32_t                   width,
467                                  int32_t                   height)
468 {
469     uint32_t    *dst_line, *dst;
470     uint32_t    *src_line, *src;
471     uint32_t mask;
472     int dst_stride, src_stride;
473     uint32_t w;
474     uint8x8_t mask_alpha;
475
476     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
477     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
478
479     mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
480     mask_alpha = vdup_n_u8 ((mask) >> 24);
481
482     if (width >= 8)
483     {
484         /* Use overlapping 8-pixel method */
485         while (height--)
486         {
487             dst = dst_line;
488             dst_line += dst_stride;
489             src = src_line;
490             src_line += src_stride;
491             w = width;
492
493             uint32_t *keep_dst = 0;
494
495 #ifndef USE_GCC_INLINE_ASM
496             uint8x8x4_t sval, dval, temp;
497
498             sval = vld4_u8 ((void*)src);
499             dval = vld4_u8 ((void*)dst);
500             keep_dst = dst;
501
502             sval = neon8mul (sval, mask_alpha);
503             temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
504             temp = neon8qadd (sval, temp);
505
506             src += (w & 7);
507             dst += (w & 7);
508             w -= (w & 7);
509
510             while (w)
511             {
512                 sval = vld4_u8 ((void*)src);
513                 dval = vld4_u8 ((void*)dst);
514
515                 vst4_u8 ((void*)keep_dst, temp);
516                 keep_dst = dst;
517
518                 sval = neon8mul (sval, mask_alpha);
519                 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
520                 temp = neon8qadd (sval, temp);
521
522                 src += 8;
523                 dst += 8;
524                 w -= 8;
525             }
526             vst4_u8 ((void*)keep_dst, temp);
527 #else
528             asm volatile (
529 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
530                 "vdup.32      d30, %[mask]\n\t"
531                 "vdup.8       d30, d30[3]\n\t"
532
533                 "vld4.8       {d0-d3}, [%[src]]\n\t"
534                 "vld4.8       {d4-d7}, [%[dst]]\n\t"
535                 "mov  %[keep_dst], %[dst]\n\t"
536
537                 "and  ip, %[w], #7\n\t"
538                 "add  %[src], %[src], ip, LSL#2\n\t"
539                 "add  %[dst], %[dst], ip, LSL#2\n\t"
540                 "subs  %[w], %[w], ip\n\t"
541                 "b 9f\n\t"
542 /* LOOP */
543                 "2:\n\t"
544                 "vld4.8       {d0-d3}, [%[src]]!\n\t"
545                 "vld4.8       {d4-d7}, [%[dst]]!\n\t"
546                 "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
547                 "sub  %[keep_dst], %[dst], #8*4\n\t"
548                 "subs  %[w], %[w], #8\n\t"
549
550                 "9:\n\t"
551                 "vmull.u8     q10, d30, d0\n\t"
552                 "vmull.u8     q11, d30, d1\n\t"
553                 "vmull.u8     q12, d30, d2\n\t"
554                 "vmull.u8     q13, d30, d3\n\t"
555                 "vrshr.u16    q8, q10, #8\n\t"
556                 "vrshr.u16    q9, q11, #8\n\t"
557                 "vraddhn.u16  d0, q10, q8\n\t"
558                 "vraddhn.u16  d1, q11, q9\n\t"
559                 "vrshr.u16    q9, q13, #8\n\t"
560                 "vrshr.u16    q8, q12, #8\n\t"
561                 "vraddhn.u16  d3, q13, q9\n\t"
562                 "vraddhn.u16  d2, q12, q8\n\t"
563
564                 "vmvn.8       d31, d3\n\t"
565                 "vmull.u8     q10, d31, d4\n\t"
566                 "vmull.u8     q11, d31, d5\n\t"
567                 "vmull.u8     q12, d31, d6\n\t"
568                 "vmull.u8     q13, d31, d7\n\t"
569                 "vrshr.u16    q8, q10, #8\n\t"
570                 "vrshr.u16    q9, q11, #8\n\t"
571                 "vraddhn.u16  d20, q10, q8\n\t"
572                 "vrshr.u16    q8, q12, #8\n\t"
573                 "vraddhn.u16  d21, q11, q9\n\t"
574                 "vrshr.u16    q9, q13, #8\n\t"
575                 "vraddhn.u16  d22, q12, q8\n\t"
576                 "vraddhn.u16  d23, q13, q9\n\t"
577
578 /* result in d20-d23 */
579                 "vqadd.u8     d20, d0, d20\n\t"
580                 "vqadd.u8     d21, d1, d21\n\t"
581                 "vqadd.u8     d22, d2, d22\n\t"
582                 "vqadd.u8     d23, d3, d23\n\t"
583
584                 "bne  2b\n\t"
585
586                 "1:\n\t"
587                 "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
588
589                 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
590                 : [mask] "r" (mask)
591                 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
592                 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
593                 "d30", "d31"
594                 );
595 #endif
596         }
597     }
598     else
599     {
600         uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));
601
602         /* Handle width < 8 */
603         while (height--)
604         {
605             dst = dst_line;
606             dst_line += dst_stride;
607             src = src_line;
608             src_line += src_stride;
609             w = width;
610
611             while (w >= 2)
612             {
613                 uint8x8_t sval, dval;
614
615                 sval = vreinterpret_u8_u32 (vld1_u32 ((void*)src));
616                 dval = vreinterpret_u8_u32 (vld1_u32 ((void*)dst));
617
618                 /* sval * const alpha_mul */
619                 sval = neon2mul (sval, mask_alpha);
620
621                 /* dval * 255-(src alpha) */
622                 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
623
624                 vst1_u8 ((void*)dst, vqadd_u8 (sval, dval));
625
626                 src += 2;
627                 dst += 2;
628                 w -= 2;
629             }
630
631             if (w)
632             {
633                 uint8x8_t sval, dval;
634
635                 sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)src));
636                 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)dst));
637
638                 /* sval * const alpha_mul */
639                 sval = neon2mul (sval, mask_alpha);
640
641                 /* dval * 255-(src alpha) */
642                 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
643
644                 vst1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
645             }
646         }
647     }
648 }
649
650 static void
651 neon_composite_over_n_8_0565 (pixman_implementation_t * impl,
652                               pixman_op_t               op,
653                               pixman_image_t *          src_image,
654                               pixman_image_t *          mask_image,
655                               pixman_image_t *          dst_image,
656                               int32_t                   src_x,
657                               int32_t                   src_y,
658                               int32_t                   mask_x,
659                               int32_t                   mask_y,
660                               int32_t                   dest_x,
661                               int32_t                   dest_y,
662                               int32_t                   width,
663                               int32_t                   height)
664 {
665     uint32_t     src, srca;
666     uint16_t    *dst_line, *dst;
667     uint8_t     *mask_line, *mask;
668     int          dst_stride, mask_stride;
669     uint32_t     w;
670     uint8x8_t    sval2;
671     uint8x8x4_t  sval8;
672
673     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
674
675     srca = src >> 24;
676     if (src == 0)
677         return;
678
679     sval2 = vreinterpret_u8_u32 (vdup_n_u32 (src));
680     sval8.val[0] = vdup_lane_u8 (sval2, 0);
681     sval8.val[1] = vdup_lane_u8 (sval2, 1);
682     sval8.val[2] = vdup_lane_u8 (sval2, 2);
683     sval8.val[3] = vdup_lane_u8 (sval2, 3);
684
685     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
686     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
687
688     if (width >= 8)
689     {
690         /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
691         while (height--)
692         {
693             uint16_t *keep_dst = 0;
694
695             dst = dst_line;
696             dst_line += dst_stride;
697             mask = mask_line;
698             mask_line += mask_stride;
699             w = width;
700
701 #ifndef USE_GCC_INLINE_ASM
702             uint8x8_t alpha;
703             uint16x8_t dval, temp;
704             uint8x8x4_t sval8temp;
705
706             alpha = vld1_u8 ((void*)mask);
707             dval = vld1q_u16 ((void*)dst);
708             keep_dst = dst;
709
710             sval8temp = neon8mul (sval8, alpha);
711             temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
712
713             mask += (w & 7);
714             dst += (w & 7);
715             w -= (w & 7);
716
717             while (w)
718             {
719                 dval = vld1q_u16 ((void*)dst);
720                 alpha = vld1_u8 ((void*)mask);
721
722                 vst1q_u16 ((void*)keep_dst, temp);
723                 keep_dst = dst;
724
725                 sval8temp = neon8mul (sval8, alpha);
726                 temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
727
728                 mask += 8;
729                 dst += 8;
730                 w -= 8;
731             }
732             vst1q_u16 ((void*)keep_dst, temp);
733 #else
734             asm volatile (
735                 "vdup.32      d0, %[src]\n\t"
736                 "vdup.8       d1, d0[1]\n\t"
737                 "vdup.8       d2, d0[2]\n\t"
738                 "vdup.8       d3, d0[3]\n\t"
739                 "vdup.8       d0, d0[0]\n\t"
740
741                 "vld1.8       {q12}, [%[dst]]\n\t"
742                 "vld1.8       {d31}, [%[mask]]\n\t"
743                 "mov  %[keep_dst], %[dst]\n\t"
744
745                 "and  ip, %[w], #7\n\t"
746                 "add  %[mask], %[mask], ip\n\t"
747                 "add  %[dst], %[dst], ip, LSL#1\n\t"
748                 "subs  %[w], %[w], ip\n\t"
749                 "b  9f\n\t"
750 /* LOOP */
751                 "2:\n\t"
752
753                 "vld1.16      {q12}, [%[dst]]!\n\t"
754                 "vld1.8       {d31}, [%[mask]]!\n\t"
755                 "vst1.16      {q10}, [%[keep_dst]]\n\t"
756                 "sub  %[keep_dst], %[dst], #8*2\n\t"
757                 "subs  %[w], %[w], #8\n\t"
758                 "9:\n\t"
759 /* expand 0565 q12 to 8888 {d4-d6} */
760                 "vmovn.u16    d4, q12\t\n"
761                 "vshr.u16     q11, q12, #5\t\n"
762                 "vshr.u16     q10, q12, #6+5\t\n"
763                 "vmovn.u16    d5, q11\t\n"
764                 "vmovn.u16    d6, q10\t\n"
765                 "vshl.u8      d4, d4, #3\t\n"
766                 "vshl.u8      d5, d5, #2\t\n"
767                 "vshl.u8      d6, d6, #3\t\n"
768                 "vsri.u8      d4, d4, #5\t\n"
769                 "vsri.u8      d5, d5, #6\t\n"
770                 "vsri.u8      d6, d6, #5\t\n"
771
772                 "vmull.u8     q10, d31, d0\n\t"
773                 "vmull.u8     q11, d31, d1\n\t"
774                 "vmull.u8     q12, d31, d2\n\t"
775                 "vmull.u8     q13, d31, d3\n\t"
776                 "vrshr.u16    q8, q10, #8\n\t"
777                 "vrshr.u16    q9, q11, #8\n\t"
778                 "vraddhn.u16  d20, q10, q8\n\t"
779                 "vraddhn.u16  d21, q11, q9\n\t"
780                 "vrshr.u16    q9, q13, #8\n\t"
781                 "vrshr.u16    q8, q12, #8\n\t"
782                 "vraddhn.u16  d23, q13, q9\n\t"
783                 "vraddhn.u16  d22, q12, q8\n\t"
784
785 /* this block is duplicated in the 4/2/1-pixel and 8-pixel versions */
786                 "vmvn.8       d30, d23\n\t"
787                 "vmull.u8     q14, d30, d6\n\t"
788                 "vmull.u8     q13, d30, d5\n\t"
789                 "vmull.u8     q12, d30, d4\n\t"
790                 "vrshr.u16    q8, q14, #8\n\t"
791                 "vrshr.u16    q9, q13, #8\n\t"
792                 "vraddhn.u16  d6, q14, q8\n\t"
793                 "vrshr.u16    q8, q12, #8\n\t"
794                 "vraddhn.u16  d5, q13, q9\n\t"
795                 "vqadd.u8     d6, d6, d22\n\t"  /* moved up */
796                 "vraddhn.u16  d4, q12, q8\n\t"
797 /* intentionally don't calculate alpha */
798 /* result in d4-d6 */
799
800 /*              "vqadd.u8     d6, d6, d22\n\t"  ** moved up */
801                 "vqadd.u8     d5, d5, d21\n\t"
802                 "vqadd.u8     d4, d4, d20\n\t"
803
804 /* pack the 8888 result in d4-d6 to 0565 in q10 */
805                 "vshll.u8     q10, d6, #8\n\t"
806                 "vshll.u8     q3, d5, #8\n\t"
807                 "vshll.u8     q2, d4, #8\n\t"
808                 "vsri.u16     q10, q3, #5\t\n"
809                 "vsri.u16     q10, q2, #11\t\n"
810
811                 "bne 2b\n\t"
812
813                 "1:\n\t"
814                 "vst1.16      {q10}, [%[keep_dst]]\n\t"
815
816                 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
817                 : [src] "r" (src)
818                 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
819                   "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
820                   "d30","d31"
821                 );
822 #endif
823         }
824     }
825     else
826     {
827         while (height--)
828         {
829             void *dst4 = 0, *dst2 = 0;
830
831             dst = dst_line;
832             dst_line += dst_stride;
833             mask = mask_line;
834             mask_line += mask_stride;
835             w = width;
836
837
838 #ifndef USE_GCC_INLINE_ASM
839             uint8x8_t alpha;
840             uint16x8_t dval, temp;
841             uint8x8x4_t sval8temp;
842
843             if (w & 4)
844             {
845                 alpha = vreinterpret_u8_u32 (vld1_lane_u32 ((void*)mask, vreinterpret_u32_u8 (alpha), 1));
846                 dval = vreinterpretq_u16_u64 (vld1q_lane_u64 ((void*)dst, vreinterpretq_u64_u16 (dval), 1));
847                 dst4 = dst;
848                 mask += 4;
849                 dst += 4;
850             }
851             if (w & 2)
852             {
853                 alpha = vreinterpret_u8_u16 (vld1_lane_u16 ((void*)mask, vreinterpret_u16_u8 (alpha), 1));
854                 dval = vreinterpretq_u16_u32 (vld1q_lane_u32 ((void*)dst, vreinterpretq_u32_u16 (dval), 1));
855                 dst2 = dst;
856                 mask += 2;
857                 dst += 2;
858             }
859             if (w & 1)
860             {
861                 alpha = vld1_lane_u8 ((void*)mask, alpha, 1);
862                 dval = vld1q_lane_u16 ((void*)dst, dval, 1);
863             }
864
865             sval8temp = neon8mul (sval8, alpha);
866             temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
867
868             if (w & 1)
869                 vst1q_lane_u16 ((void*)dst, temp, 1);
870             if (w & 2)
871                 vst1q_lane_u32 ((void*)dst2, vreinterpretq_u32_u16 (temp), 1);
872             if (w & 4)
873                 vst1q_lane_u64 ((void*)dst4, vreinterpretq_u64_u16 (temp), 1);
874 #else
875             asm volatile (
876                 "vdup.32      d0, %[src]\n\t"
877                 "vdup.8       d1, d0[1]\n\t"
878                 "vdup.8       d2, d0[2]\n\t"
879                 "vdup.8       d3, d0[3]\n\t"
880                 "vdup.8       d0, d0[0]\n\t"
881
882                 "tst  %[w], #4\t\n"
883                 "beq  skip_load4\t\n"
884
885                 "vld1.64      {d25}, [%[dst]]\n\t"
886                 "vld1.32      {d31[1]}, [%[mask]]\n\t"
887                 "mov  %[dst4], %[dst]\t\n"
888                 "add  %[mask], %[mask], #4\t\n"
889                 "add  %[dst], %[dst], #4*2\t\n"
890
891                 "skip_load4:\t\n"
892                 "tst  %[w], #2\t\n"
893                 "beq  skip_load2\t\n"
894                 "vld1.32      {d24[1]}, [%[dst]]\n\t"
895                 "vld1.16      {d31[1]}, [%[mask]]\n\t"
896                 "mov  %[dst2], %[dst]\t\n"
897                 "add  %[mask], %[mask], #2\t\n"
898                 "add  %[dst], %[dst], #2*2\t\n"
899
900                 "skip_load2:\t\n"
901                 "tst  %[w], #1\t\n"
902                 "beq  skip_load1\t\n"
903                 "vld1.16      {d24[1]}, [%[dst]]\n\t"
904                 "vld1.8       {d31[1]}, [%[mask]]\n\t"
905
906                 "skip_load1:\t\n"
907 /* expand 0565 q12 to 8888 {d4-d6} */
908                 "vmovn.u16    d4, q12\t\n"
909                 "vshr.u16     q11, q12, #5\t\n"
910                 "vshr.u16     q10, q12, #6+5\t\n"
911                 "vmovn.u16    d5, q11\t\n"
912                 "vmovn.u16    d6, q10\t\n"
913                 "vshl.u8      d4, d4, #3\t\n"
914                 "vshl.u8      d5, d5, #2\t\n"
915                 "vshl.u8      d6, d6, #3\t\n"
916                 "vsri.u8      d4, d4, #5\t\n"
917                 "vsri.u8      d5, d5, #6\t\n"
918                 "vsri.u8      d6, d6, #5\t\n"
919
920                 "vmull.u8     q10, d31, d0\n\t"
921                 "vmull.u8     q11, d31, d1\n\t"
922                 "vmull.u8     q12, d31, d2\n\t"
923                 "vmull.u8     q13, d31, d3\n\t"
924                 "vrshr.u16    q8, q10, #8\n\t"
925                 "vrshr.u16    q9, q11, #8\n\t"
926                 "vraddhn.u16  d20, q10, q8\n\t"
927                 "vraddhn.u16  d21, q11, q9\n\t"
928                 "vrshr.u16    q9, q13, #8\n\t"
929                 "vrshr.u16    q8, q12, #8\n\t"
930                 "vraddhn.u16  d23, q13, q9\n\t"
931                 "vraddhn.u16  d22, q12, q8\n\t"
932
933 /* this block is duplicated in the 4/2/1-pixel and 8-pixel versions */
934                 "vmvn.8       d30, d23\n\t"
935                 "vmull.u8     q14, d30, d6\n\t"
936                 "vmull.u8     q13, d30, d5\n\t"
937                 "vmull.u8     q12, d30, d4\n\t"
938                 "vrshr.u16    q8, q14, #8\n\t"
939                 "vrshr.u16    q9, q13, #8\n\t"
940                 "vraddhn.u16  d6, q14, q8\n\t"
941                 "vrshr.u16    q8, q12, #8\n\t"
942                 "vraddhn.u16  d5, q13, q9\n\t"
943                 "vqadd.u8     d6, d6, d22\n\t"  /* moved up */
944                 "vraddhn.u16  d4, q12, q8\n\t"
945 /* intentionally don't calculate alpha */
946 /* result in d4-d6 */
947
948 /*              "vqadd.u8     d6, d6, d22\n\t"  ** moved up */
949                 "vqadd.u8     d5, d5, d21\n\t"
950                 "vqadd.u8     d4, d4, d20\n\t"
951
952 /* pack the 8888 result in d4-d6 to 0565 in q10 */
953                 "vshll.u8     q10, d6, #8\n\t"
954                 "vshll.u8     q3, d5, #8\n\t"
955                 "vshll.u8     q2, d4, #8\n\t"
956                 "vsri.u16     q10, q3, #5\t\n"
957                 "vsri.u16     q10, q2, #11\t\n"
958
959                 "tst  %[w], #1\n\t"
960                 "beq skip_store1\t\n"
961                 "vst1.16      {d20[1]}, [%[dst]]\t\n"
962                 "skip_store1:\t\n"
963                 "tst  %[w], #2\n\t"
964                 "beq  skip_store2\t\n"
965                 "vst1.32      {d20[1]}, [%[dst2]]\t\n"
966                 "skip_store2:\t\n"
967                 "tst  %[w], #4\n\t"
968                 "beq skip_store4\t\n"
969                 "vst1.16      {d21}, [%[dst4]]\t\n"
970                 "skip_store4:\t\n"
971
972                 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [dst4] "+r" (dst4), [dst2] "+r" (dst2)
973                 : [src] "r" (src)
974                 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
975                   "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
976                   "d30","d31"
977                 );
978 #endif
979         }
980     }
981 }
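
/*
 * Editorial sketch, not part of the original file: the per-pixel operation
 * performed by neon_composite_over_n_8_0565, written with the hypothetical
 * scalar helpers sketched earlier (scalar_mul_div255, scalar_unpack0565,
 * scalar_pack0565, scalar_over_channel).  src is the premultiplied solid
 * a8r8g8b8 colour, m the a8 mask byte and dest the r5g6b5 destination pixel.
 */
static force_inline uint16_t
scalar_over_n_8_0565_sketch (uint32_t src, uint8_t m, uint16_t dest)
{
    uint8_t sb = scalar_mul_div255 ((uint8_t) (src & 0xff), m);
    uint8_t sg = scalar_mul_div255 ((uint8_t) ((src >> 8) & 0xff), m);
    uint8_t sr = scalar_mul_div255 ((uint8_t) ((src >> 16) & 0xff), m);
    uint8_t sa = scalar_mul_div255 ((uint8_t) (src >> 24), m);
    uint8_t db, dg, dr;

    scalar_unpack0565 (dest, &db, &dg, &dr);    /* blue = low, green = mid, red = top */

    db = scalar_over_channel (sb, sa, db);
    dg = scalar_over_channel (sg, sa, dg);
    dr = scalar_over_channel (sr, sa, dr);

    return scalar_pack0565 (db, dg, dr);
}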
982
983 static void
984 neon_composite_over_n_8_8888 (pixman_implementation_t * impl,
985                               pixman_op_t               op,
986                               pixman_image_t *          src_image,
987                               pixman_image_t *          mask_image,
988                               pixman_image_t *          dst_image,
989                               int32_t                   src_x,
990                               int32_t                   src_y,
991                               int32_t                   mask_x,
992                               int32_t                   mask_y,
993                               int32_t                   dest_x,
994                               int32_t                   dest_y,
995                               int32_t                   width,
996                               int32_t                   height)
997 {
998     uint32_t src, srca;
999     uint32_t    *dst_line, *dst;
1000     uint8_t     *mask_line, *mask;
1001     int dst_stride, mask_stride;
1002     uint32_t w;
1003     uint8x8_t sval2;
1004     uint8x8x4_t sval8;
1005     uint8x8_t mask_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0101010100000000ULL));
1006     uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));
1007
1008     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
1009     
1010     /* bail out if fully transparent */
1011     srca = src >> 24;
1012     if (src == 0)
1013         return;
1014
1015     sval2 = vreinterpret_u8_u32 (vdup_n_u32 (src));
1016     sval8.val[0] = vdup_lane_u8 (sval2, 0);
1017     sval8.val[1] = vdup_lane_u8 (sval2, 1);
1018     sval8.val[2] = vdup_lane_u8 (sval2, 2);
1019     sval8.val[3] = vdup_lane_u8 (sval2, 3);
1020
1021     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1022     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1023
1024     if (width >= 8)
1025     {
1026         /* Use overlapping 8-pixel method, modified to avoid
1027          * rewritten dest being reused
1028          */
1029         while (height--)
1030         {
1031             uint32_t *keep_dst = 0;
1032
1033             dst = dst_line;
1034             dst_line += dst_stride;
1035             mask = mask_line;
1036             mask_line += mask_stride;
1037             w = width;
1038
1039 #ifndef USE_GCC_INLINE_ASM
1040             uint8x8_t alpha;
1041             uint8x8x4_t dval, temp;
1042
1043             alpha = vld1_u8 ((void*)mask);
1044             dval = vld4_u8 ((void*)dst);
1045             keep_dst = dst;
1046
1047             temp = neon8mul (sval8, alpha);
1048             dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
1049             temp = neon8qadd (temp, dval);
1050
1051             mask += (w & 7);
1052             dst += (w & 7);
1053             w -= (w & 7);
1054
1055             while (w)
1056             {
1057                 alpha = vld1_u8 ((void*)mask);
1058                 dval = vld4_u8 ((void*)dst);
1059
1060                 vst4_u8 ((void*)keep_dst, temp);
1061                 keep_dst = dst;
1062
1063                 temp = neon8mul (sval8, alpha);
1064                 dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
1065                 temp = neon8qadd (temp, dval);
1066
1067                 mask += 8;
1068                 dst += 8;
1069                 w -= 8;
1070             }
1071             vst4_u8 ((void*)keep_dst, temp);
1072 #else
1073             asm volatile (
1074                 "vdup.32      d0, %[src]\n\t"
1075                 "vdup.8       d1, d0[1]\n\t"
1076                 "vdup.8       d2, d0[2]\n\t"
1077                 "vdup.8       d3, d0[3]\n\t"
1078                 "vdup.8       d0, d0[0]\n\t"
1079
1080                 "vld4.8       {d4-d7}, [%[dst]]\n\t"
1081                 "vld1.8       {d31}, [%[mask]]\n\t"
1082                 "mov  %[keep_dst], %[dst]\n\t"
1083
1084                 "and  ip, %[w], #7\n\t"
1085                 "add  %[mask], %[mask], ip\n\t"
1086                 "add  %[dst], %[dst], ip, LSL#2\n\t"
1087                 "subs  %[w], %[w], ip\n\t"
1088                 "b 9f\n\t"
1089 /* LOOP */
1090                 "2:\n\t"
1091                 "vld4.8       {d4-d7}, [%[dst]]!\n\t"
1092                 "vld1.8       {d31}, [%[mask]]!\n\t"
1093                 "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
1094                 "sub  %[keep_dst], %[dst], #8*4\n\t"
1095                 "subs  %[w], %[w], #8\n\t"
1096                 "9:\n\t"
1097
1098                 "vmull.u8     q10, d31, d0\n\t"
1099                 "vmull.u8     q11, d31, d1\n\t"
1100                 "vmull.u8     q12, d31, d2\n\t"
1101                 "vmull.u8     q13, d31, d3\n\t"
1102                 "vrshr.u16    q8, q10, #8\n\t"
1103                 "vrshr.u16    q9, q11, #8\n\t"
1104                 "vraddhn.u16  d20, q10, q8\n\t"
1105                 "vraddhn.u16  d21, q11, q9\n\t"
1106                 "vrshr.u16    q9, q13, #8\n\t"
1107                 "vrshr.u16    q8, q12, #8\n\t"
1108                 "vraddhn.u16  d23, q13, q9\n\t"
1109                 "vraddhn.u16  d22, q12, q8\n\t"
1110
1111                 "vmvn.8       d30, d23\n\t"
1112                 "vmull.u8     q12, d30, d4\n\t"
1113                 "vmull.u8     q13, d30, d5\n\t"
1114                 "vmull.u8     q14, d30, d6\n\t"
1115                 "vmull.u8     q15, d30, d7\n\t"
1116
1117                 "vrshr.u16    q8, q12, #8\n\t"
1118                 "vrshr.u16    q9, q13, #8\n\t"
1119                 "vraddhn.u16  d4, q12, q8\n\t"
1120                 "vrshr.u16    q8, q14, #8\n\t"
1121                 "vraddhn.u16  d5, q13, q9\n\t"
1122                 "vrshr.u16    q9, q15, #8\n\t"
1123                 "vraddhn.u16  d6, q14, q8\n\t"
1124                 "vraddhn.u16  d7, q15, q9\n\t"
1125 /* result in d4-d7 */
1126
1127                 "vqadd.u8     d20, d4, d20\n\t"
1128                 "vqadd.u8     d21, d5, d21\n\t"
1129                 "vqadd.u8     d22, d6, d22\n\t"
1130                 "vqadd.u8     d23, d7, d23\n\t"
1131
1132                 "bne 2b\n\t"
1133
1134                 "1:\n\t"
1135                 "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
1136
1137                 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
1138                 : [src] "r" (src)
1139                 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1140                 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
1141                 "d30", "d31"
1142                 );
1143 #endif
1144         }
1145     }
1146     else
1147     {
1148         while (height--)
1149         {
1150             uint8x8_t alpha;
1151
1152             dst = dst_line;
1153             dst_line += dst_stride;
1154             mask = mask_line;
1155             mask_line += mask_stride;
1156             w = width;
1157
1158             while (w >= 2)
1159             {
1160                 uint8x8_t dval, temp, res;
1161
1162                 alpha = vtbl1_u8 (
1163                     vreinterpret_u8_u16 (vld1_dup_u16 ((void*)mask)), mask_selector);
1164                 dval = vld1_u8 ((void*)dst);
1165
1166                 temp = neon2mul (sval2, alpha);
1167                 res = vqadd_u8 (
1168                     temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));
1169
1170                 vst1_u8 ((void*)dst, res);
1171
1172                 mask += 2;
1173                 dst += 2;
1174                 w -= 2;
1175             }
1176
1177             if (w)
1178             {
1179                 uint8x8_t dval, temp, res;
1180
1181                 alpha = vtbl1_u8 (vld1_dup_u8 ((void*)mask), mask_selector);
1182                 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)dst));
1183
1184                 temp = neon2mul (sval2, alpha);
1185                 res = vqadd_u8 (
1186                     temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));
1187
1188                 vst1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (res), 0);
1189             }
1190         }
1191     }
1192 }
1193
1194 static void
1195 neon_composite_add_8888_8_8 (pixman_implementation_t * impl,
1196                              pixman_op_t               op,
1197                              pixman_image_t *          src_image,
1198                              pixman_image_t *          mask_image,
1199                              pixman_image_t *          dst_image,
1200                              int32_t                   src_x,
1201                              int32_t                   src_y,
1202                              int32_t                   mask_x,
1203                              int32_t                   mask_y,
1204                              int32_t                   dest_x,
1205                              int32_t                   dest_y,
1206                              int32_t                   width,
1207                              int32_t                   height)
1208 {
1209     uint8_t     *dst_line, *dst;
1210     uint8_t     *mask_line, *mask;
1211     int dst_stride, mask_stride;
1212     uint32_t w;
1213     uint32_t src;
1214     uint8x8_t sa;
1215
1216     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
1217     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1218     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
1219     sa = vdup_n_u8 ((src) >> 24);
1220
1221     if (width >= 8)
1222     {
1223         /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
1224         while (height--)
1225         {
1226             dst = dst_line;
1227             dst_line += dst_stride;
1228             mask = mask_line;
1229             mask_line += mask_stride;
1230             w = width;
1231
1232             uint8x8_t mval, dval, res;
1233             uint8_t     *keep_dst;
1234
1235             mval = vld1_u8 ((void *)mask);
1236             dval = vld1_u8 ((void *)dst);
1237             keep_dst = dst;
1238
1239             res = vqadd_u8 (neon2mul (mval, sa), dval);
1240
1241             mask += (w & 7);
1242             dst += (w & 7);
1243             w -= w & 7;
1244
1245             while (w)
1246             {
1247                 mval = vld1_u8 ((void *)mask);
1248                 dval = vld1_u8 ((void *)dst);
1249                 vst1_u8 ((void *)keep_dst, res);
1250                 keep_dst = dst;
1251
1252                 res = vqadd_u8 (neon2mul (mval, sa), dval);
1253
1254                 mask += 8;
1255                 dst += 8;
1256                 w -= 8;
1257             }
1258             vst1_u8 ((void *)keep_dst, res);
1259         }
1260     }
1261     else
1262     {
1263         /* Use 4/2/1 load/store method to handle 1-7 pixels */
1264         while (height--)
1265         {
1266             dst = dst_line;
1267             dst_line += dst_stride;
1268             mask = mask_line;
1269             mask_line += mask_stride;
1270             w = width;
1271
1272             uint8x8_t mval = sa, dval = sa, res;
1273             uint8_t *dst4 = 0, *dst2 = 0;
1274
1275             if (w & 4)
1276             {
1277                 mval = vreinterpret_u8_u32 (
1278                     vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (mval), 1));
1279                 dval = vreinterpret_u8_u32 (
1280                     vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1));
1281
1282                 dst4 = dst;
1283                 mask += 4;
1284                 dst += 4;
1285             }
1286
1287             if (w & 2)
1288             {
1289                 mval = vreinterpret_u8_u16 (
1290                     vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (mval), 1));
1291                 dval = vreinterpret_u8_u16 (
1292                     vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1));
1293                 dst2 = dst;
1294                 mask += 2;
1295                 dst += 2;
1296             }
1297
1298             if (w & 1)
1299             {
1300                 mval = vld1_lane_u8 (mask, mval, 1);
1301                 dval = vld1_lane_u8 (dst, dval, 1);
1302             }
1303
1304             res = vqadd_u8 (neon2mul (mval, sa), dval);
1305
1306             if (w & 1)
1307                 vst1_lane_u8 (dst, res, 1);
1308             if (w & 2)
1309                 vst1_lane_u16 ((void *)dst2, vreinterpret_u16_u8 (res), 1);
1310             if (w & 4)
1311                 vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (res), 1);
1312         }
1313     }
1314 }
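
/*
 * Editorial sketch, not part of the original file: per pixel, the routine
 * above reduces to a saturating add of the mask scaled by the solid source's
 * alpha (sa = src >> 24), using the hypothetical scalar_mul_div255 helper:
 */
static force_inline uint8_t
scalar_add_8888_8_8_sketch (uint8_t sa, uint8_t m, uint8_t dest)
{
    uint16_t sum = dest + scalar_mul_div255 (m, sa);

    return (uint8_t) (sum > 255 ? 255 : sum);   /* vqadd.u8 saturates */
}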
1315
1316 #ifdef USE_GCC_INLINE_ASM
1317
1318 static void
1319 neon_composite_src_16_16 (pixman_implementation_t * impl,
1320                           pixman_op_t               op,
1321                           pixman_image_t *          src_image,
1322                           pixman_image_t *          mask_image,
1323                           pixman_image_t *          dst_image,
1324                           int32_t                   src_x,
1325                           int32_t                   src_y,
1326                           int32_t                   mask_x,
1327                           int32_t                   mask_y,
1328                           int32_t                   dest_x,
1329                           int32_t                   dest_y,
1330                           int32_t                   width,
1331                           int32_t                   height)
1332 {
1333     uint16_t    *dst_line, *src_line;
1334     uint32_t dst_stride, src_stride;
1335
1336     if (!height || !width)
1337         return;
1338
1339     /* We simply copy 16-bit-aligned pixels from one place to another. */
1340     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
1341     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1342
1343     /* Preload the first input scanline */
1344     {
1345         uint16_t *src_ptr = src_line;
1346         uint32_t count = width;
1347
1348         asm volatile (
1349             "0: @ loop                                                  \n"
1350             "   subs    %[count], %[count], #32                         \n"
1351             "   pld     [%[src]]                                        \n"
1352             "   add     %[src], %[src], #64                             \n"
1353             "   bgt 0b                                                  \n"
1354
1355             /* Clobbered input registers marked as input/outputs */
1356             : [src] "+r" (src_ptr), [count] "+r" (count)
1357             :     /* no unclobbered inputs */
1358             : "cc"
1359             );
1360     }
1361
1362     while (height--)
1363     {
1364         uint16_t *dst_ptr = dst_line;
1365         uint16_t *src_ptr = src_line;
1366         uint32_t count = width;
1367         uint32_t tmp = 0;
1368
1369         /* Uses multi-register access and preloading to maximise bandwidth.
1370          * Each pixel is one halfword, so a quadword contains 8px.
1371          * The preload frequency assumes a 64-byte cache line.
1372          */
1373         asm volatile (
1374             "   cmp       %[count], #64                         \n"
1375             "   blt 1f    @ skip oversized fragments            \n"
1376             "0: @ start with eight quadwords at a time          \n"
1377             /* preload from next scanline */
1378             "   pld       [%[src], %[src_stride], LSL #1]       \n"
1379             "   sub       %[count], %[count], #64               \n"
1380             "   vld1.16   {d16,d17,d18,d19}, [%[src]]!          \n"
1381             "   vld1.16   {d20,d21,d22,d23}, [%[src]]!          \n"
1382             /* preload from next scanline */
1383             "   pld       [%[src], %[src_stride], LSL #1]       \n"
1384             "   vld1.16   {d24,d25,d26,d27}, [%[src]]!          \n"
1385             "   vld1.16   {d28,d29,d30,d31}, [%[src]]!          \n"
1386             "   cmp       %[count], #64                         \n"
1387             "   vst1.16   {d16,d17,d18,d19}, [%[dst]]!          \n"
1388             "   vst1.16   {d20,d21,d22,d23}, [%[dst]]!          \n"
1389             "   vst1.16   {d24,d25,d26,d27}, [%[dst]]!          \n"
1390             "   vst1.16   {d28,d29,d30,d31}, [%[dst]]!          \n"
1391             "   bge 0b                                          \n"
1392             "   cmp       %[count], #0                          \n"
1393             "   beq 7f    @ aligned fastpath                    \n"
1394             "1: @ four quadwords                                \n"
1395             "   tst       %[count], #32                         \n"
1396             "   beq 2f    @ skip oversized fragment             \n"
1397             /* preload from next scanline */
1398             "   pld       [%[src], %[src_stride], LSL #1]       \n"
1399             "   vld1.16   {d16,d17,d18,d19}, [%[src]]!          \n"
1400             "   vld1.16   {d20,d21,d22,d23}, [%[src]]!          \n"
1401             "   vst1.16   {d16,d17,d18,d19}, [%[dst]]!          \n"
1402             "   vst1.16   {d20,d21,d22,d23}, [%[dst]]!          \n"
1403             "2: @ two quadwords                                 \n"
1404             "   tst       %[count], #16                         \n"
1405             "   beq 3f    @ skip oversized fragment             \n"
1406             /* preload from next scanline */
1407             "   pld       [%[src], %[src_stride], LSL #1]       \n"
1408             "   vld1.16   {d16,d17,d18,d19}, [%[src]]!          \n"
1409             "   vst1.16   {d16,d17,d18,d19}, [%[dst]]!          \n"
1410             "3: @ one quadword                                  \n"
1411             "   tst       %[count], #8                          \n"
1412             "   beq 4f    @ skip oversized fragment             \n"
1413             "   vld1.16   {d16,d17}, [%[src]]!                  \n"
1414             "   vst1.16   {d16,d17}, [%[dst]]!                  \n"
1415             "4: @ one doubleword                                \n"
1416             "   tst       %[count], #4                          \n"
1417             "   beq 5f    @ skip oversized fragment             \n"
1418             "   vld1.16   {d16}, [%[src]]!                      \n"
1419             "   vst1.16   {d16}, [%[dst]]!                      \n"
1420             "5: @ one word                                      \n"
1421             "   tst       %[count], #2                          \n"
1422             "   beq 6f    @ skip oversized fragment             \n"
1423             "   ldr       %[tmp], [%[src]], #4                  \n"
1424             "   str       %[tmp], [%[dst]], #4                  \n"
1425             "6: @ one halfword                                  \n"
1426             "   tst       %[count], #1                          \n"
1427             "   beq 7f    @ skip oversized fragment             \n"
1428             "   ldrh      %[tmp], [%[src]]                      \n"
1429             "   strh      %[tmp], [%[dst]]                      \n"
1430             "7: @ end                                           \n"
1431
1432             /* Clobbered input registers marked as input/outputs */
1433             : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr),
1434               [count] "+r" (count), [tmp] "+r" (tmp)
1435
1436               /* Unclobbered input */
1437             : [src_stride] "r" (src_stride)
1438
1439               /* Clobbered vector registers */
1440             : "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
1441               "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc", "memory"
1442             );
1443
1444         src_line += src_stride;
1445         dst_line += dst_stride;
1446     }
1447 }
1448
1449 #endif /* USE_GCC_INLINE_ASM */
1450
1451 static void
1452 neon_composite_src_24_16 (pixman_implementation_t * impl,
1453                           pixman_op_t               op,
1454                           pixman_image_t *          src_image,
1455                           pixman_image_t *          mask_image,
1456                           pixman_image_t *          dst_image,
1457                           int32_t                   src_x,
1458                           int32_t                   src_y,
1459                           int32_t                   mask_x,
1460                           int32_t                   mask_y,
1461                           int32_t                   dest_x,
1462                           int32_t                   dest_y,
1463                           int32_t                   width,
1464                           int32_t                   height)
1465 {
1466     uint16_t    *dst_line;
1467     uint32_t    *src_line;
1468     uint32_t dst_stride, src_stride;
1469
1470     if (!width || !height)
1471         return;
1472
1473     /* We simply copy pixels from one place to another,
1474      * assuming that the source's alpha is opaque.
1475      */
1476     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1477     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1478
1479     /* Preload the first input scanline */
1480     {
1481         uint8_t *src_ptr = (uint8_t*) src_line;
1482         uint32_t count = (width + 15) / 16;
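        /* One pld per 64 bytes: (width + 15) / 16 steps of 64 bytes
         * covers the width * 4 bytes of 32bpp source on this scanline.
         */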
1483
1484 #ifdef USE_GCC_INLINE_ASM
1485         asm volatile (
1486             "0: @ loop                                          \n"
1487             "   subs    %[count], %[count], #1                  \n"
1488             "   pld     [%[src]]                                \n"
1489             "   add     %[src], %[src], #64                     \n"
1490             "   bgt 0b                                          \n"
1491
1492             /* Clobbered input registers marked as input/outputs */
1493             : [src] "+r" (src_ptr), [count] "+r" (count)
1494             :     /* no unclobbered inputs */
1495             : "cc"
1496             );
1497 #else
1498         do
1499         {
1500             __pld (src_ptr);
1501             src_ptr += 64;
1502         }
1503         while (--count);
1504 #endif
1505     }
1506
1507     while (height--)
1508     {
1509         uint16_t *dst_ptr = dst_line;
1510         uint32_t *src_ptr = src_line;
1511         uint32_t count = width;
1512         const uint32_t rb_mask = 0x1F;
1513         const uint32_t g_mask = 0x3F;
1514
1515         /* If you're going to complain about a goto, take a long hard look
1516          * at the massive blocks of assembler this skips over.  ;-)
1517          */
1518         if (count < 8)
1519             goto small_stuff;
1520
1521 #ifdef USE_GCC_INLINE_ASM
1522
1523         /* This is not as aggressive as the RGB565-source case above.
1524          * When the formats differ, the source is generally in cached
1525          * RAM, so a preload is worthwhile.
1526          *
1527          * We don't need to blend, so we never read back from the
1528          * uncached framebuffer.
1529          */
1530         asm volatile (
1531             "   cmp       %[count], #16                         \n"
1532             "   blt 1f    @ skip oversized fragments            \n"
1533             "0: @ start with sixteen pixels at a time           \n"
1534             "   sub       %[count], %[count], #16               \n"
1535             "   pld      [%[src], %[src_stride], lsl #2]        @ preload from next scanline                    \n"
1536             "   vld4.8    {d0,d1,d2,d3}, [%[src]]!              @ d3 is alpha and ignored, d2-0 are rgb.        \n"
1537             "   vld4.8    {d4,d5,d6,d7}, [%[src]]!              @ d7 is alpha and ignored, d6-4 are rgb.        \n"
1538             "   vshll.u8  q8, d2, #8                            @ expand first red for repacking                \n"
1539             "   vshll.u8  q10, d1, #8                           @ expand first green for repacking              \n"
1540             "   vshll.u8  q11, d0, #8                           @ expand first blue for repacking               \n"
1541             "   vshll.u8  q9, d6, #8                            @ expand second red for repacking               \n"
1542             "   vsri.u16  q8, q10, #5                           @ insert first green after red                  \n"
1543             "   vshll.u8  q10, d5, #8                           @ expand second green for repacking             \n"
1544             "   vsri.u16  q8, q11, #11                          @ insert first blue after green                 \n"
1545             "   vshll.u8  q11, d4, #8                           @ expand second blue for repacking              \n"
1546             "   vsri.u16  q9, q10, #5                           @ insert second green after red                 \n"
1547             "   vsri.u16  q9, q11, #11                          @ insert second blue after green                \n"
1548             "   cmp       %[count], #16                         \n"
1549             "   vst1.16   {d16,d17,d18,d19}, [%[dst]]!          @ store 16 pixels                               \n"
1550             "   bge 0b                                          \n"
1551             "1: @ end of main loop                              \n"
1552             "   cmp       %[count], #8                          @ can we still do an 8-pixel block?             \n"
1553             "   blt 2f                                          \n"
1554             "   sub       %[count], %[count], #8                \n"
1555             "   pld      [%[src], %[src_stride], lsl #2]        @ preload from next scanline                    \n"
1556             "   vld4.8    {d0,d1,d2,d3}, [%[src]]!              @ d3 is alpha and ignored, d2-0 are rgb.        \n"
1557             "   vshll.u8  q8, d2, #8                            @ expand first red for repacking                \n"
1558             "   vshll.u8  q10, d1, #8                           @ expand first green for repacking              \n"
1559             "   vshll.u8  q11, d0, #8                           @ expand first blue for repacking               \n"
1560             "   vsri.u16  q8, q10, #5                           @ insert first green after red                  \n"
1561             "   vsri.u16  q8, q11, #11                          @ insert first blue after green                 \n"
1562             "   vst1.16   {d16,d17}, [%[dst]]!          @ store 8 pixels                                \n"
1563             "2: @ end                                           \n"
1564
1565             /* Clobbered input and working registers marked as input/outputs */
1566             : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr), [count] "+r" (count)
1567
1568               /* Unclobbered input */
1569             : [src_stride] "r" (src_stride)
1570
1571               /* Clobbered vector registers */
1572
1573               /* NB: these are the quad aliases of the
1574                * double registers used in the asm
1575                */
1576             : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17",
1577               "d18", "d19", "d20", "d21", "d22", "d23", "cc", "memory"
1578             );
1579 #else
1580         /* A copy of the above code, in intrinsics-form. */
1581         while (count >= 16)
1582         {
1583             uint8x8x4_t pixel_set_a, pixel_set_b;
1584             uint16x8_t red_a, green_a, blue_a;
1585             uint16x8_t red_b, green_b, blue_b;
1586             uint16x8_t dest_pixels_a, dest_pixels_b;
1587
1588             count -= 16;
1589             __pld (src_ptr + src_stride);
1590             pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
1591             pixel_set_b = vld4_u8 ((uint8_t*)(src_ptr + 8));
1592             src_ptr += 16;
1593
1594             red_a   = vshll_n_u8 (pixel_set_a.val[2], 8);
1595             green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
1596             blue_a  = vshll_n_u8 (pixel_set_a.val[0], 8);
1597             
1598             red_b   = vshll_n_u8 (pixel_set_b.val[2], 8);
1599             green_b = vshll_n_u8 (pixel_set_b.val[1], 8);
1600             blue_b  = vshll_n_u8 (pixel_set_b.val[0], 8);
1601             
1602             dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
1603             dest_pixels_b = vsriq_n_u16 (red_b, green_b, 5);
1604             
1605             dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
1606             dest_pixels_b = vsriq_n_u16 (dest_pixels_b, blue_b, 11);
1607
1608             /* There doesn't seem to be an intrinsic for the
1609              * double-quadword variant
1610              */
1611             vst1q_u16 (dst_ptr, dest_pixels_a);
1612             vst1q_u16 (dst_ptr + 8, dest_pixels_b);
1613             dst_ptr += 16;
1614         }
1615
1616         /* 8-pixel loop */
1617         if (count >= 8)
1618         {
1619             uint8x8x4_t pixel_set_a;
1620             uint16x8_t red_a, green_a, blue_a;
1621             uint16x8_t dest_pixels_a;
1622
1623             __pld (src_ptr + src_stride);
1624             count -= 8;
1625             pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
1626             src_ptr += 8;
1627
1628             red_a   = vshll_n_u8 (pixel_set_a.val[2], 8);
1629             green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
1630             blue_a  = vshll_n_u8 (pixel_set_a.val[0], 8);
1631
1632             dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
1633             dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
1634
1635             vst1q_u16 (dst_ptr, dest_pixels_a);
1636             dst_ptr += 8;
1637         }
1638
1639 #endif  /* USE_GCC_INLINE_ASM */
1640
1641     small_stuff:
1642         if (count)
1643             __pld (src_ptr + src_stride);
1644
1645         while (count >= 2)
1646         {
1647             uint32_t src_pixel_a = *src_ptr++;
1648             uint32_t src_pixel_b = *src_ptr++;
1649
1650             /* ARM is really good at shift-then-ALU ops. */
1651             /* This should be a total of six shift-ANDs and five shift-ORs. */
1652             uint32_t dst_pixels_a;
1653             uint32_t dst_pixels_b;
1654
1655             dst_pixels_a  = ((src_pixel_a >>  3) & rb_mask);
1656             dst_pixels_a |= ((src_pixel_a >> 10) &  g_mask) << 5;
1657             dst_pixels_a |= ((src_pixel_a >> 19) & rb_mask) << 11;
1658
1659             dst_pixels_b  = ((src_pixel_b >>  3) & rb_mask);
1660             dst_pixels_b |= ((src_pixel_b >> 10) &  g_mask) << 5;
1661             dst_pixels_b |= ((src_pixel_b >> 19) & rb_mask) << 11;
1662
1663             /* little-endian mode only */
1664             *((uint32_t*) dst_ptr) = dst_pixels_a | (dst_pixels_b << 16);
1665             dst_ptr += 2;
1666             count -= 2;
1667         }
1668
1669         if (count)
1670         {
1671             uint32_t src_pixel = *src_ptr++;
1672
1673             /* ARM is really good at shift-then-ALU ops.
1674              * This block should end up as three shift-ANDs
1675              * and two shift-ORs.
1676              */
1677             uint32_t tmp_blue  = (src_pixel >>  3) & rb_mask;
1678             uint32_t tmp_green = (src_pixel >> 10) & g_mask;
1679             uint32_t tmp_red   = (src_pixel >> 19) & rb_mask;
1680             uint16_t dst_pixel = (tmp_red << 11) | (tmp_green << 5) | tmp_blue;
1681
1682             *dst_ptr++ = dst_pixel;
1683             count--;
1684         }
1685
1686         src_line += src_stride;
1687         dst_line += dst_stride;
1688     }
1689 }
1690
1691 static pixman_bool_t
1692 pixman_fill_neon (uint32_t *bits,
1693                   int       stride,
1694                   int       bpp,
1695                   int       x,
1696                   int       y,
1697                   int       width,
1698                   int       height,
1699                   uint32_t  _xor)
1700 {
1701     uint32_t byte_stride, color;
1702     char *dst;
1703
1704     /* stride is always a multiple of 32-bit units in pixman */
1705     byte_stride = stride * sizeof(uint32_t);
1706
1707     switch (bpp)
1708     {
1709     case 8:
1710         dst = ((char *) bits) + y * byte_stride + x;
1711         _xor &= 0xff;
1712         color = _xor << 24 | _xor << 16 | _xor << 8 | _xor;
1713         break;
1714
1715     case 16:
1716         dst = ((char *) bits) + y * byte_stride + x * 2;
1717         _xor &= 0xffff;
1718         color = _xor << 16 | _xor;
1719         width *= 2;         /* width to bytes */
1720         break;
1721
1722     case 32:
1723         dst = ((char *) bits) + y * byte_stride + x * 4;
1724         color = _xor;
1725         width *= 4;         /* width to bytes */
1726         break;
1727
1728     default:
1729         return FALSE;
1730     }
1731
1732 #ifdef USE_GCC_INLINE_ASM
1733     if (width < 16)
1734     {
1735         /* Special case for widths too small to benefit from wide
1736          * 128-bit stores. We don't waste time trying to align the
1737          * writes, since there are only a few of them per scanline.
1738          */
1739         asm volatile (
1740             "cmp                %[height], #0\n"/* Check if empty fill */
1741             "beq                3f\n"
1742             "vdup.32    d0, %[color]\n"/* Duplicate the fill color into a NEON register */
1743
1744             /* Check if the width is one that can be handled by a single
1745              * store per scanline. This significantly reduces the number
1746              * of test/branch instructions for each scanline.
1747              */
1748             "cmp                %[width], #8\n"
1749             "beq                4f\n"
1750             "cmp                %[width], #4\n"
1751             "beq                5f\n"
1752             "cmp                %[width], #2\n"
1753             "beq                6f\n"
1754
1755             /* Loop starts here for each scanline */
1756             "1:\n"
1757             "mov                r4, %[dst]\n" /* Starting address of the current line */
1758             "tst                %[width], #8\n"
1759             "beq                2f\n"
1760             "vst1.8             {d0}, [r4]!\n"
1761             "2:\n"
1762             "tst                %[width], #4\n"
1763             "beq                2f\n"
1764             "str                %[color], [r4], #4\n"
1765             "2:\n"
1766             "tst                %[width], #2\n"
1767             "beq                2f\n"
1768             "strh               %[color], [r4], #2\n"
1769             "2:\n"
1770             "tst                %[width], #1\n"
1771             "beq                2f\n"
1772             "strb               %[color], [r4], #1\n"
1773             "2:\n"
1774
1775             "subs               %[height], %[height], #1\n"
1776             "add                %[dst], %[dst], %[byte_stride]\n"
1777             "bne                1b\n"
1778             "b          3f\n"
1779
1780             /* Special fillers for those widths that we can do with single operation */
1781             "4:\n"
1782             "subs               %[height], %[height], #1\n"
1783             "vst1.8             {d0}, [%[dst]]\n"
1784             "add                %[dst], %[dst], %[byte_stride]\n"
1785             "bne                4b\n"
1786             "b          3f\n"
1787
1788             "5:\n"
1789             "subs               %[height], %[height], #1\n"
1790             "str                %[color], [%[dst]]\n"
1791             "add                %[dst], %[dst], %[byte_stride]\n"
1792             "bne                5b\n"
1793             "b          3f\n"
1794
1795             "6:\n"
1796             "subs               %[height], %[height], #1\n"
1797             "strh               %[color], [%[dst]]\n"
1798             "add                %[dst], %[dst], %[byte_stride]\n"
1799             "bne                6b\n"
1800
1801             "3:\n"
1802             : [height] "+r" (height), [dst] "+r" (dst)
1803             : [color] "r" (color), [width] "r" (width),
1804               [byte_stride] "r" (byte_stride)
1805             : "memory", "cc", "d0", "r4");
1806     }
1807     else
1808     {
1809         asm volatile (
1810             "cmp                %[height], #0\n"/* Check if empty fill */
1811             "beq                5f\n"
1812             "vdup.32    q0, %[color]\n"/* Duplicate the fill color into a NEON register */
1813
1814             /* Loop starts here for each scanline */
1815             "1:\n"
1816             "mov                r4, %[dst]\n"/* Starting address of the current line */
1817             "mov                r5, %[width]\n"/* We're going to write this many bytes */
1818             "ands               r6, r4, #15\n"/* Are we at the 128-bit aligned address? */
1819             "beq                2f\n"/* Jump to the best case */
1820
1821             /* We're not 128-bit aligned: However, we know that we can get to the
1822                next aligned location, since the fill is at least 16 bytes wide */
1823             "rsb                r6, r6, #16\n" /* We would need to go forward this much */
1824             "sub                r5, r5, r6\n"/* Update bytes left */
1825             "tst                r6, #1\n"
1826             "beq                6f\n"
1827             "vst1.8             {d0[0]}, [r4]!\n"/* Store byte, now we are 16-bit aligned */
1828             "6:\n"
1829             "tst                r6, #2\n"
1830             "beq                6f\n"
1831             "vst1.16    {d0[0]}, [r4, :16]!\n"/* Store halfword, now we are 32-bit aligned */
1832             "6:\n"
1833             "tst                r6, #4\n"
1834             "beq                6f\n"
1835             "vst1.32    {d0[0]}, [r4, :32]!\n"/* Store word, now we're 64-bit aligned */
1836             "6:\n"
1837             "tst                r6, #8\n"
1838             "beq                2f\n"
1839             "vst1.64    {d0}, [r4, :64]!\n"/* Store doubleword, now we're 128-bit aligned */
1840
1841             /* The good case: We're 128-bit aligned for this scanline */
1842             "2:\n"
1843             "and                r6, r5, #15\n"/* Number of trailing bytes */
1844             "cmp                r5, r6\n"/* Do we have at least one qword to write? */
1845             "beq                6f\n"/* No, we just write the tail */
1846             "lsr                r5, r5, #4\n"/* This many full qwords to write */
1847
1848             /* The main block: Do 128-bit aligned writes */
1849             "3:\n"
1850             "subs               r5, r5, #1\n"
1851             "vst1.64    {d0,d1}, [r4, :128]!\n"
1852             "bne                3b\n"
1853
1854             /* Handle the trailing bytes: Do 64, 32, 16 and 8-bit aligned writes as needed.
1855                We know that we're currently at a 128-bit aligned address, so we can just
1856                pick the biggest operations that the remaining write width allows */
1857             "6:\n"
1858             "cmp                r6, #0\n"
1859             "beq                4f\n"
1860             "tst                r6, #8\n"
1861             "beq                6f\n"
1862             "vst1.64    {d0}, [r4, :64]!\n"
1863             "6:\n"
1864             "tst                r6, #4\n"
1865             "beq                6f\n"
1866             "vst1.32    {d0[0]}, [r4, :32]!\n"
1867             "6:\n"
1868             "tst                r6, #2\n"
1869             "beq                6f\n"
1870             "vst1.16    {d0[0]}, [r4, :16]!\n"
1871             "6:\n"
1872             "tst                r6, #1\n"
1873             "beq                4f\n"
1874             "vst1.8             {d0[0]}, [r4]!\n"
1875             "4:\n"
1876
1877             /* Handle the next scanline */
1878             "subs               %[height], %[height], #1\n"
1879             "add                %[dst], %[dst], %[byte_stride]\n"
1880             "bne                1b\n"
1881             "5:\n"
1882             : [height] "+r" (height), [dst] "+r" (dst)
1883             : [color] "r" (color), [width] "r" (width),
1884               [byte_stride] "r" (byte_stride)
1885             : "memory", "cc", "d0", "d1", "r4", "r5", "r6");
1886     }
1887     return TRUE;
1888
1889 #else
1890
1891     /* TODO: intrinsic version for armcc */
1892     return FALSE;
1893
1894 #endif
1895 }
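
/* The armcc build currently has no equivalent of the assembly above (see
 * the TODO below).  The following is a rough, plain-C sketch of what the
 * aligned path does for one scanline: small stores until the destination
 * reaches a 16-byte boundary, aligned 128-bit stores for the bulk of the
 * fill, then descending power-of-two stores for the trailing bytes.  It
 * is illustrative only (hence the #if 0): the function name is made up,
 * it assumes little-endian mode like the rest of this file, and it is
 * not a drop-in replacement for the NEON code.  'color' is the already
 * replicated 32-bit pattern produced by the switch above, and 'width'
 * is in bytes and assumed to be at least 16, as in the asm branch.
 */
#if 0
static void
fill_scanline_sketch (uint8_t *dst, uint32_t color, uint32_t width)
{
    uint64_t color64 = ((uint64_t) color << 32) | color;
    uint32_t head = (16 - ((unsigned long) dst & 15)) & 15;

    width -= head;

    /* Head: the misalignment is always a multiple of the pixel size, so
     * these stores keep the fill pattern in phase, just like the
     * strb/strh/str/vst1.64 sequence in the assembly.
     */
    if (head & 1) { *dst = (uint8_t) color;    dst += 1; }
    if (head & 2) { memcpy (dst, &color, 2);   dst += 2; }
    if (head & 4) { memcpy (dst, &color, 4);   dst += 4; }
    if (head & 8) { memcpy (dst, &color64, 8); dst += 8; }

    /* Body: what the vst1.64 {d0,d1}, [r4, :128]! loop does */
    while (width >= 16)
    {
        memcpy (dst, &color64, 8);
        memcpy (dst + 8, &color64, 8);
        dst += 16;
        width -= 16;
    }

    /* Tail: biggest stores first, mirroring the "6:" labels above */
    if (width & 8) { memcpy (dst, &color64, 8); dst += 8; }
    if (width & 4) { memcpy (dst, &color, 4);   dst += 4; }
    if (width & 2) { memcpy (dst, &color, 2);   dst += 2; }
    if (width & 1) { *dst = (uint8_t) color; }
}
#endif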
1896
1897 /* TODO: is a more generic way of doing this being introduced elsewhere? */
1898 #define NEON_SCANLINE_BUFFER_PIXELS (1024)
1899
1900 static inline void
1901 neon_quadword_copy (void*    dst,
1902                     void*    src,
1903                     uint32_t count,         /* of quadwords */
1904                     uint32_t trailer_count  /* of bytes */)
1905 {
1906     uint8_t *t_dst = dst, *t_src = src;
1907
1908     /* Uses aligned multi-register loads to maximise read bandwidth
1909      * on uncached memory such as framebuffers.
1910      * The accesses do not have the aligned qualifiers, so that the copy
1911      * may convert between aligned-uncached and unaligned-cached memory.
1912      * It is assumed that the CPU can infer alignment from the address.
1913      */
1914
1915 #ifdef USE_GCC_INLINE_ASM
1916
1917     asm volatile (
1918         "       cmp       %[count], #8                          \n"
1919         "       blt 1f    @ skip oversized fragments            \n"
1920         "0: @ start with eight quadwords at a time              \n"
1921         "       sub       %[count], %[count], #8                \n"
1922         "       vld1.8    {d16,d17,d18,d19}, [%[src]]!          \n"
1923         "       vld1.8    {d20,d21,d22,d23}, [%[src]]!          \n"
1924         "       vld1.8    {d24,d25,d26,d27}, [%[src]]!          \n"
1925         "       vld1.8    {d28,d29,d30,d31}, [%[src]]!          \n"
1926         "       cmp       %[count], #8                          \n"
1927         "       vst1.8    {d16,d17,d18,d19}, [%[dst]]!          \n"
1928         "       vst1.8    {d20,d21,d22,d23}, [%[dst]]!          \n"
1929         "       vst1.8    {d24,d25,d26,d27}, [%[dst]]!          \n"
1930         "       vst1.8    {d28,d29,d30,d31}, [%[dst]]!          \n"
1931         "       bge 0b                                          \n"
1932         "1: @ four quadwords                                    \n"
1933         "       tst       %[count], #4                          \n"
1934         "       beq 2f    @ skip oversized fragment             \n"
1935         "       vld1.8    {d16,d17,d18,d19}, [%[src]]!          \n"
1936         "       vld1.8    {d20,d21,d22,d23}, [%[src]]!          \n"
1937         "       vst1.8    {d16,d17,d18,d19}, [%[dst]]!          \n"
1938         "       vst1.8    {d20,d21,d22,d23}, [%[dst]]!          \n"
1939         "2: @ two quadwords                                     \n"
1940         "       tst       %[count], #2                          \n"
1941         "       beq 3f    @ skip oversized fragment             \n"
1942         "       vld1.8    {d16,d17,d18,d19}, [%[src]]!          \n"
1943         "       vst1.8    {d16,d17,d18,d19}, [%[dst]]!          \n"
1944         "3: @ one quadword                                      \n"
1945         "       tst       %[count], #1                          \n"
1946         "       beq 4f    @ skip oversized fragment             \n"
1947         "       vld1.8    {d16,d17}, [%[src]]!                  \n"
1948         "       vst1.8    {d16,d17}, [%[dst]]!                  \n"
1949         "4: @ end                                               \n"
1950
1951         /* Clobbered input registers marked as input/outputs */
1952         : [dst] "+r" (t_dst), [src] "+r" (t_src), [count] "+r" (count)
1953
1954           /* No unclobbered inputs */
1955         :
1956
1957         /* Clobbered vector registers */
1958         : "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25",
1959           "d26", "d27", "d28", "d29", "d30", "d31", "cc", "memory");
1960
1961 #else
1962
1963     while (count >= 8)
1964     {
1965         uint8x16x4_t t1 = vld4q_u8 (t_src);
1966         uint8x16x4_t t2 = vld4q_u8 (t_src + sizeof(uint8x16x4_t));
1967         
1968         t_src += sizeof(uint8x16x4_t) * 2;
1969         vst4q_u8 (t_dst, t1);
1970         vst4q_u8 (t_dst + sizeof(uint8x16x4_t), t2);
1971         t_dst += sizeof(uint8x16x4_t) * 2;
1972         count -= 8;
1973     }
1974
1975     if (count & 4)
1976     {
1977         uint8x16x4_t t1 = vld4q_u8 (t_src);
1978         
1979         t_src += sizeof(uint8x16x4_t);
1980         vst4q_u8 (t_dst, t1);
1981         t_dst += sizeof(uint8x16x4_t);
1982     }
1983
1984     if (count & 2)
1985     {
1986         uint8x8x4_t t1 = vld4_u8 (t_src);
1987         
1988         t_src += sizeof(uint8x8x4_t);
1989         vst4_u8 (t_dst, t1);
1990         t_dst += sizeof(uint8x8x4_t);
1991     }
1992
1993     if (count & 1)
1994     {
1995         uint8x16_t t1 = vld1q_u8 (t_src);
1996         
1997         t_src += sizeof(uint8x16_t);
1998         vst1q_u8 (t_dst, t1);
1999         t_dst += sizeof(uint8x16_t);
2000     }
2001
2002 #endif  /* !USE_GCC_INLINE_ASM */
2003
2004     if (trailer_count)
2005     {
2006         if (trailer_count & 8)
2007         {
2008             uint8x8_t t1 = vld1_u8 (t_src);
2009             
2010             t_src += sizeof(uint8x8_t);
2011             vst1_u8 (t_dst, t1);
2012             t_dst += sizeof(uint8x8_t);
2013         }
2014
2015         if (trailer_count & 4)
2016         {
2017             *((uint32_t*) t_dst) = *((uint32_t*) t_src);
2018             
2019             t_dst += 4;
2020             t_src += 4;
2021         }
2022
2023         if (trailer_count & 2)
2024         {
2025             *((uint16_t*) t_dst) = *((uint16_t*) t_src);
2026             
2027             t_dst += 2;
2028             t_src += 2;
2029         }
2030
2031         if (trailer_count & 1)
2032         {
2033             *t_dst++ = *t_src++;
2034         }
2035     }
2036 }
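
/* Callers split a byte count into whole quadwords plus a byte trailer.
 * For example, to copy 'len' bytes:
 *
 *     neon_quadword_copy (dst, src, len >> 4, len & 15);
 *
 * and to copy 'width' uint16_t pixels (width * 2 bytes), as the 0565
 * compositing routines below do:
 *
 *     neon_quadword_copy (dst, src, width >> 3, (width & 7) * 2);
 */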
2037
2038 static inline void
2039 solid_over_565_8_pix_neon (uint32_t  glyph_colour,
2040                            uint16_t *dest,
2041                            uint8_t * in_mask,
2042                            uint32_t  dest_stride,    /* bytes, not elements */
2043                            uint32_t  mask_stride,
2044                            uint32_t  count           /* 8-pixel groups */)
2045 {
2046     /* Inner loop of glyph blitter (solid colour, alpha mask) */
2047
2048 #ifdef USE_GCC_INLINE_ASM
2049
2050     asm volatile (
2051         "       vld4.8 {d20[],d21[],d22[],d23[]}, [%[glyph_colour]]  @ splat solid colour components    \n"
2052         "0:     @ loop                                                                                                                                                          \n"
2053         "       vld1.16   {d0,d1}, [%[dest]]         @ load first pixels from framebuffer                       \n"
2054         "       vld1.8    {d17}, [%[in_mask]]         @ load alpha mask of glyph                                                \n"
2055         "       vmull.u8  q9, d17, d23               @ apply glyph colour alpha to mask                         \n"
2056         "       vshrn.u16 d17, q9, #8                @ reformat it to match original mask                       \n"
2057         "       vmvn      d18, d17                   @ we need the inverse mask for the background      \n"
2058         "       vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits                          \n"
2059         "       vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels                       \n"
2060         "       vshrn.u16 d4, q0, #3                 @ unpack green                                                                     \n"
2061         "       vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)                       \n"
2062         "       vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)          \n"
2063         "       vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)                     \n"
2064         "       vmull.u8  q1, d2, d18                @ apply inverse mask to background red...          \n"
2065         "       vmull.u8  q2, d4, d18                @ ...green...                                                                      \n"
2066         "       vmull.u8  q3, d6, d18                @ ...blue                                                                          \n"
2067         "       subs      %[count], %[count], #1     @ decrement/test loop counter                                      \n"
2068         "       vmlal.u8  q1, d17, d22               @ add masked foreground red...                                     \n"
2069         "       vmlal.u8  q2, d17, d21               @ ...green...                                                                      \n"
2070         "       vmlal.u8  q3, d17, d20               @ ...blue                                                                          \n"
2071         "       add %[in_mask], %[in_mask], %[mask_stride] @ advance mask pointer, while we wait                \n"
2072         "       vsri.16   q1, q2, #5                 @ pack green behind red                                            \n"
2073         "       vsri.16   q1, q3, #11                @ pack blue into pixels                                            \n"
2074         "       vst1.16   {d2,d3}, [%[dest]]         @ store composited pixels                                          \n"
2075         "       add %[dest], %[dest], %[dest_stride]  @ advance framebuffer pointer                                     \n"
2076         "       bne 0b                               @ next please                                                                      \n"
2077
2078         /* Clobbered registers marked as input/outputs */
2079         : [dest] "+r" (dest), [in_mask] "+r" (in_mask), [count] "+r" (count)
2080           
2081           /* Inputs */
2082         : [dest_stride] "r" (dest_stride), [mask_stride] "r" (mask_stride), [glyph_colour] "r" (&glyph_colour)
2083
2084           /* Clobbers, including the inputs we modify, and potentially lots of memory */
2085         : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d17", "d18", "d19",
2086           "d20", "d21", "d22", "d23", "d24", "d25", "cc", "memory"
2087         );
2088
2089 #else
2090
2091     uint8x8x4_t solid_colour = vld4_dup_u8 ((uint8_t*) &glyph_colour);
2092
2093     while (count--)
2094     {
2095         uint16x8_t pixels = vld1q_u16 (dest);
2096         uint8x8_t mask = vshrn_n_u16 (vmull_u8 (solid_colour.val[3], vld1_u8 (in_mask)), 8);
2097         uint8x8_t mask_image = vmvn_u8 (mask);
2098
2099         uint8x8_t t_red   = vshrn_n_u16 (pixels, 8);
2100         uint8x8_t t_green = vshrn_n_u16 (pixels, 3);
2101         uint8x8_t t_blue  = vshrn_n_u16 (vsliq_n_u16 (pixels, pixels, 5), 2);
2102
2103         uint16x8_t s_red   = vmull_u8 (vsri_n_u8 (t_red, t_red, 5), mask_image);
2104         uint16x8_t s_green = vmull_u8 (vsri_n_u8 (t_green, t_green, 6), mask_image);
2105         uint16x8_t s_blue  = vmull_u8 (t_blue, mask_image);
2106
2107         s_red   = vmlal_u8 (s_red, mask, solid_colour.val[2]);
2108         s_green = vmlal_u8 (s_green, mask, solid_colour.val[1]);
2109         s_blue  = vmlal_u8 (s_blue, mask, solid_colour.val[0]);
2110
2111         pixels = vsri_n_u16 (s_red, s_green, 5);
2112         pixels = vsri_n_u16 (pixels, s_blue, 11);
2113         vst1q_u16 (dest, pixels);
2114
2115         dest = (uint16_t *)((uint8_t *) dest + dest_stride);  /* stride is in bytes */
2116         in_mask += mask_stride;
2117     }
2118
2119 #endif
2120 }
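
/* A per-pixel scalar reference for the 8-pixel kernel above, kept only
 * to document the unpack/blend/repack steps (it is not compiled and the
 * function name is made up).  It uses the same truncating >>8
 * approximation as the NEON code rather than an exact divide by 255.
 */
#if 0
static uint16_t
solid_over_565_one_pixel_sketch (uint16_t dst, uint32_t colour, uint8_t glyph_alpha)
{
    /* a8r8g8b8 solid colour: A in bits 31:24, R in 23:16, G in 15:8, B in 7:0 */
    uint32_t fg_b = (colour >>  0) & 0xff;
    uint32_t fg_g = (colour >>  8) & 0xff;
    uint32_t fg_r = (colour >> 16) & 0xff;
    uint32_t fg_a = (colour >> 24) & 0xff;

    /* Combined coverage: glyph alpha scaled by the colour's own alpha
     * (the vmull.u8 / vshrn.u16 #8 pair above)
     */
    uint32_t mask = (glyph_alpha * fg_a) >> 8;
    uint32_t inv  = 255 - mask;

    uint32_t r5 = (dst >> 11) & 0x1f;
    uint32_t g6 = (dst >>  5) & 0x3f;
    uint32_t b5 =  dst        & 0x1f;

    /* Expand the 565 background to 888 by bit replication
     * (the vshrn/vsli/vsri unpacking above)
     */
    uint32_t bg_r = (r5 << 3) | (r5 >> 2);
    uint32_t bg_g = (g6 << 2) | (g6 >> 4);
    uint32_t bg_b = (b5 << 3) | (b5 >> 2);

    /* Blend (vmull/vmlal), keeping the high byte of each 16-bit product */
    uint32_t out_r = (bg_r * inv + fg_r * mask) >> 8;
    uint32_t out_g = (bg_g * inv + fg_g * mask) >> 8;
    uint32_t out_b = (bg_b * inv + fg_b * mask) >> 8;

    /* Repack to r5g6b5 (the vsri.16 #5 / #11 step above) */
    return (uint16_t) (((out_r >> 3) << 11) | ((out_g >> 2) << 5) | (out_b >> 3));
}
#endif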
2121
2122 #if 0 /* this is broken currently */
2123 static void
2124 neon_composite_over_n_8_0565 (pixman_implementation_t * impl,
2125                               pixman_op_t               op,
2126                               pixman_image_t *          src_image,
2127                               pixman_image_t *          mask_image,
2128                               pixman_image_t *          dst_image,
2129                               int32_t                   src_x,
2130                               int32_t                   src_y,
2131                               int32_t                   mask_x,
2132                               int32_t                   mask_y,
2133                               int32_t                   dest_x,
2134                               int32_t                   dest_y,
2135                               int32_t                   width,
2136                               int32_t                   height)
2137 {
2138     uint32_t  src, srca;
2139     uint16_t *dst_line, *aligned_line;
2140     uint8_t  *mask_line;
2141     uint32_t  dst_stride, mask_stride;
2142     uint32_t  kernel_count, copy_count, copy_tail;
2143     uint8_t   kernel_offset, copy_offset;
2144
2145     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2146
2147     /* bail out if fully transparent or degenerate */
2148     srca = src >> 24;
2149     if (src == 0)
2150         return;
2151
2152     if (width == 0 || height == 0)
2153         return;
2154
2155     if (width > NEON_SCANLINE_BUFFER_PIXELS)
2156     {
2157         /* split the blit, so we can use a fixed-size scanline buffer
2158          * TODO: there must be a more elegant way of doing this.
2159          */
2160         int x;
2161         for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2162         {
2163             neon_composite_over_n_8_0565 (
2164                 impl, op,
2165                 src_image, mask_image, dst_image,
2166                 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2167                 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2168         }
2169
2170         return;
2171     }
2172     
2173     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2174     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2175
2176     /* Round the blit out to whole aligned quadwords on each scanline,
2177      * while keeping the number of extra columns processed to a minimum.
2178      */
2179     {
2180         unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2181         unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2182         unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2183
2184         /* the fast copy should be quadword aligned */
2185         copy_offset = dst_line - ((uint16_t*) aligned_left);
2186         aligned_line = dst_line - copy_offset;
2187         copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
2188         copy_tail = 0;
2189
2190         if (aligned_right - aligned_left > ceiling_length)
2191         {
2192             /* unaligned routine is tightest */
2193             kernel_count = (uint32_t) (ceiling_length >> 4);
2194             kernel_offset = copy_offset;
2195         }
2196         else
2197         {
2198             /* aligned routine is equally tight, so it is safer to align */
2199             kernel_count = copy_count;
2200             kernel_offset = 0;
2201         }
2202
2203         /* We should avoid reading beyond scanline ends for safety */
2204         if (aligned_line < (dst_line - dest_x) ||
2205             (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2206         {
2207             /* switch to precise read */
2208             copy_offset = kernel_offset = 0;
2209             aligned_line = dst_line;
2210             kernel_count = (uint32_t) (ceiling_length >> 4);
2211             copy_count = (width * sizeof(*dst_line)) >> 4;
2212             copy_tail = (width * sizeof(*dst_line)) & 0xF;
2213         }
2214     }
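
    /* Worked example (hypothetical numbers): with dst_line == 0x1006 and
     * width == 100 pixels (200 bytes):
     *     aligned_left   = 0x1000
     *     aligned_right  = (0x10ce + 0xf) & ~0xf = 0x10d0
     *     ceiling_length = (200 + 0xf) & ~0xf    = 0xd0 (208 bytes)
     *     copy_offset    = 3 pixels, so aligned_line = dst_line - 3
     *     copy_count     = 0xd0 >> 4 = 13 quadwords, copy_tail = 0
     * aligned_right - aligned_left equals ceiling_length here, so the
     * aligned routine is chosen: kernel_count = 13, kernel_offset = 0.
     * The same computation is repeated in the two routines further down.
     */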
2215
2216     {
2217         uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8];         /* deliberately not initialised */
2218         uint8_t glyph_line[NEON_SCANLINE_BUFFER_PIXELS + 8];
2219         int y = height;
2220
2221         /* row-major order */
2222         /* left edge, middle block, right edge */
2223         for ( ; y--; mask_line += mask_stride, aligned_line += dst_stride, dst_line += dst_stride)
2224         {
2225             /* We don't want to overrun the edges of the glyph,
2226              * so realign the edge data into known buffers
2227              */
2228             neon_quadword_copy (glyph_line + copy_offset, mask_line, width >> 4, width & 0xF);
2229
2230             /* Uncached framebuffer access is really, really slow
2231              * if we do it piecemeal. It should be much faster if we
2232              * grab it all at once. One scanline should easily fit in
2233              * L1 cache, so this should not waste RAM bandwidth.
2234              */
2235             neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2236
2237             /* Apply the actual filter */
2238             solid_over_565_8_pix_neon (
2239                 src, scan_line + kernel_offset,
2240                 glyph_line + kernel_offset, 8 * sizeof(*dst_line),
2241                 8, kernel_count);
2242
2243             /* Copy the modified scanline back */
2244             neon_quadword_copy (dst_line, scan_line + copy_offset,
2245                                 width >> 3, (width & 7) * 2);
2246         }
2247     }
2248 }
2249 #endif
2250
2251 #ifdef USE_GCC_INLINE_ASM
2252
2253 static inline void
2254 plain_over_565_8_pix_neon (uint32_t  colour,
2255                            uint16_t *dest,
2256                            uint32_t  dest_stride,     /* bytes, not elements */
2257                            uint32_t  count            /* 8-pixel groups */)
2258 {
2259     /* Inner loop for plain translucent rects
2260      * (solid colour without alpha mask)
2261      */
2262     asm volatile (
2263         "       vld4.8   {d20[],d21[],d22[],d23[]}, [%[colour]]  @ solid colour load/splat \n"
2264         "       vmull.u8  q12, d23, d22              @ premultiply alpha red   \n"
2265         "       vmull.u8  q13, d23, d21              @ premultiply alpha green \n"
2266         "       vmull.u8  q14, d23, d20              @ premultiply alpha blue  \n"
2267         "       vmvn      d18, d23                   @ inverse alpha for background \n"
2268         "0:     @ loop\n"
2269         "       vld1.16   {d0,d1}, [%[dest]]         @ load first pixels from framebuffer       \n"
2270         "       vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels       \n"
2271         "       vshrn.u16 d4, q0, #3                 @ unpack green                             \n"
2272         "       vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits          \n"
2273         "       vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)       \n"
2274         "       vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)     \n"
2275         "       vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)  \n"
2276         "       vmov      q0, q12                    @ retrieve foreground red   \n"
2277         "       vmlal.u8  q0, d2, d18                @ blend red - my kingdom for a four-operand MLA \n"
2278         "       vmov      q1, q13                    @ retrieve foreground green \n"
2279         "       vmlal.u8  q1, d4, d18                @ blend green               \n"
2280         "       vmov      q2, q14                    @ retrieve foreground blue  \n"
2281         "       vmlal.u8  q2, d6, d18                @ blend blue                \n"
2282         "       subs      %[count], %[count], #1     @ decrement/test loop counter              \n"
2283         "       vsri.16   q0, q1, #5                 @ pack green behind red                    \n"
2284         "       vsri.16   q0, q2, #11                @ pack blue into pixels                    \n"
2285         "       vst1.16   {d0,d1}, [%[dest]]         @ store composited pixels                  \n"
2286         "       add %[dest], %[dest], %[dest_stride]  @ advance framebuffer pointer             \n"
2287         "       bne 0b                               @ next please                              \n"
2288
2289         /* Clobbered registers marked as input/outputs */
2290         : [dest] "+r" (dest), [count] "+r" (count)
2291
2292           /* Inputs */
2293         : [dest_stride] "r" (dest_stride), [colour] "r" (&colour)
2294
2295           /* Clobbers, including the inputs we modify, and
2296            * potentially lots of memory
2297            */
2298         : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d18", "d19",
2299           "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
2300           "cc", "memory"
2301         );
2302 }
2303
2304 static void
2305 neon_composite_over_n_0565 (pixman_implementation_t * impl,
2306                             pixman_op_t               op,
2307                             pixman_image_t *          src_image,
2308                             pixman_image_t *          mask_image,
2309                             pixman_image_t *          dst_image,
2310                             int32_t                   src_x,
2311                             int32_t                   src_y,
2312                             int32_t                   mask_x,
2313                             int32_t                   mask_y,
2314                             int32_t                   dest_x,
2315                             int32_t                   dest_y,
2316                             int32_t                   width,
2317                             int32_t                   height)
2318 {
2319     uint32_t src, srca;
2320     uint16_t    *dst_line, *aligned_line;
2321     uint32_t dst_stride;
2322     uint32_t kernel_count, copy_count, copy_tail;
2323     uint8_t kernel_offset, copy_offset;
2324
2325     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2326
2327     /* bail out if fully transparent */
2328     srca = src >> 24;
2329     if (src == 0)
2330         return;
2331     
2332     if (width == 0 || height == 0)
2333         return;
2334
2335     if (width > NEON_SCANLINE_BUFFER_PIXELS)
2336     {
2337         /* split the blit, so we can use a fixed-size scanline buffer
2338          * TODO: there must be a more elegant way of doing this.
2339          */
2340         int x;
2341         
2342         for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2343         {
2344             neon_composite_over_n_0565 (
2345                 impl, op,
2346                 src_image, mask_image, dst_image,
2347                 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2348                 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2349         }
2350         return;
2351     }
2352
2353     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2354
2355     /* Round the blit out to whole aligned quadwords on each scanline,
2356      * while keeping the number of extra columns processed to a minimum.
2357      */
2358     {
2359         unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2360         unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2361         unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2362
2363         /* the fast copy should be quadword aligned */
2364         copy_offset = dst_line - ((uint16_t*) aligned_left);
2365         aligned_line = dst_line - copy_offset;
2366         copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
2367         copy_tail = 0;
2368
2369         if (aligned_right - aligned_left > ceiling_length)
2370         {
2371             /* unaligned routine is tightest */
2372             kernel_count = (uint32_t) (ceiling_length >> 4);
2373             kernel_offset = copy_offset;
2374         }
2375         else
2376         {
2377             /* aligned routine is equally tight, so it is safer to align */
2378             kernel_count = copy_count;
2379             kernel_offset = 0;
2380         }
2381
2382         /* We should avoid reading beyond scanline ends for safety */
2383         if (aligned_line < (dst_line - dest_x) ||
2384             (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2385         {
2386             /* switch to precise read */
2387             copy_offset = kernel_offset = 0;
2388             aligned_line = dst_line;
2389             kernel_count = (uint32_t) (ceiling_length >> 4);
2390             copy_count = (width * sizeof(*dst_line)) >> 4;
2391             copy_tail = (width * sizeof(*dst_line)) & 0xF;
2392         }
2393     }
2394
2395     {
2396         uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8];  /* deliberately not initialised */
2397
2398         /* row-major order */
2399         /* left edge, middle block, right edge */
2400         for ( ; height--; aligned_line += dst_stride, dst_line += dst_stride)
2401         {
2402             /* Uncached framebuffer access is really, really slow if we do it piecemeal.
2403              * It should be much faster if we grab it all at once.
2404              * One scanline should easily fit in L1 cache, so this should
2405              * not waste RAM bandwidth.
2406              */
2407             neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2408
2409             /* Apply the actual filter */
2410             plain_over_565_8_pix_neon (
2411                 src, scan_line + kernel_offset, 8 * sizeof(*dst_line), kernel_count);
2412
2413             /* Copy the modified scanline back */
2414             neon_quadword_copy (
2415                 dst_line, scan_line + copy_offset, width >> 3, (width & 7) * 2);
2416         }
2417     }
2418 }
2419
2420 static inline void
2421 ARGB8_over_565_8_pix_neon (uint32_t *src,
2422                            uint16_t *dest,
2423                            uint32_t  src_stride,     /* bytes, not elements */
2424                            uint32_t  count           /* 8-pixel groups */)
2425 {
2426     asm volatile (
2427         "0:     @ loop\n"
2428         "       pld   [%[src], %[src_stride]]         @ preload from next scanline      \n"
2429         "       vld1.16   {d0,d1}, [%[dest]]         @ load pixels from framebuffer     \n"
2430         "       vld4.8   {d20,d21,d22,d23},[%[src]]! @ load source image pixels         \n"
2431         "       vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits          \n"
2432         "       vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels       \n"
2433         "       vshrn.u16 d4, q0, #3                 @ unpack green                             \n"
2434         "       vmvn      d18, d23                   @ we need the inverse alpha for the background     \n"
2435         "       vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)       \n"
2436         "       vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)  \n"
2437         "       vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)     \n"
2438         "       vmull.u8  q1, d2, d18                @ apply inverse alpha to background red... \n"
2439         "       vmull.u8  q2, d4, d18                @ ...green...                              \n"
2440         "       vmull.u8  q3, d6, d18                @ ...blue                                  \n"
2441         "       subs      %[count], %[count], #1     @ decrement/test loop counter              \n"
2442         "       vmlal.u8  q1, d23, d22               @ add blended foreground red...            \n"
2443         "       vmlal.u8  q2, d23, d21               @ ...green...                              \n"
2444         "       vmlal.u8  q3, d23, d20               @ ...blue                                  \n"
2445         "       vsri.16   q1, q2, #5                 @ pack green behind red                    \n"
2446         "       vsri.16   q1, q3, #11                @ pack blue into pixels                    \n"
2447         "       vst1.16   {d2,d3}, [%[dest]]!        @ store composited pixels                  \n"
2448         "       bne 0b                               @ next please                              \n"
2449
2450         /* Clobbered registers marked as input/outputs */
2451         : [dest] "+r" (dest), [src] "+r" (src), [count] "+r" (count)
2452
2453           /* Inputs */
2454         : [src_stride] "r" (src_stride)
2455
2456           /* Clobbers, including the inputs we modify, and potentially lots of memory */
2457         : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d17", "d18", "d20",
2458           "d21", "d22", "d23", "cc", "memory"
2459         );
2460 }
2461
2462 static void
2463 neon_composite_over_8888_0565 (pixman_implementation_t * impl,
2464                                pixman_op_t               op,
2465                                pixman_image_t *          src_image,
2466                                pixman_image_t *          mask_image,
2467                                pixman_image_t *          dst_image,
2468                                int32_t                   src_x,
2469                                int32_t                   src_y,
2470                                int32_t                   mask_x,
2471                                int32_t                   mask_y,
2472                                int32_t                   dest_x,
2473                                int32_t                   dest_y,
2474                                int32_t                   width,
2475                                int32_t                   height)
2476 {
2477     uint32_t    *src_line;
2478     uint16_t    *dst_line, *aligned_line;
2479     uint32_t dst_stride, src_stride;
2480     uint32_t kernel_count, copy_count, copy_tail;
2481     uint8_t kernel_offset, copy_offset;
2482
2483     /* we assume mask is opaque 
2484      * so the only alpha to deal with is embedded in src
2485      */
2486     if (width > NEON_SCANLINE_BUFFER_PIXELS)
2487     {
2488         /* split the blit, so we can use a fixed-size scanline buffer */
2489         int x;
2490         for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2491         {
2492             neon_composite_over_8888_0565 (
2493                 impl, op,
2494                 src_image, mask_image, dst_image,
2495                 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2496                 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2497         }
2498         return;
2499     }
2500
2501     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2502     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2503
2504     /* Round the blit out to whole aligned quadwords on each scanline,
2505      * while keeping the number of extra columns processed to a minimum.
2506      */
2507     {
2508         unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2509         unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2510         unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2511
2512         /* the fast copy should be quadword aligned */
2513         copy_offset = dst_line - ((uint16_t*) aligned_left);
2514         aligned_line = dst_line - copy_offset;
2515         copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
2516         copy_tail = 0;
2517
2518         if (aligned_right - aligned_left > ceiling_length)
2519         {
2520             /* unaligned routine is tightest */
2521             kernel_count = (uint32_t) (ceiling_length >> 4);
2522             kernel_offset = copy_offset;
2523         }
2524         else
2525         {
2526             /* aligned routine is equally tight, so it is safer to align */
2527             kernel_count = copy_count;
2528             kernel_offset = 0;
2529         }
2530
2531         /* We should avoid reading beyond scanline ends for safety */
2532         if (aligned_line < (dst_line - dest_x) ||
2533             (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2534         {
2535             /* switch to precise read */
2536             copy_offset = kernel_offset = 0;
2537             aligned_line = dst_line;
2538             kernel_count = (uint32_t) (ceiling_length >> 4);
2539             copy_count = (width * sizeof(*dst_line)) >> 4;
2540             copy_tail = (width * sizeof(*dst_line)) & 0xF;
2541         }
2542     }
2543
2544     /* Preload the first input scanline */
2545     {
2546         uint8_t *src_ptr = (uint8_t*) src_line;
2547         uint32_t count = (width + 15) / 16;
2548
2549 #ifdef USE_GCC_INLINE_ASM
2550         asm volatile (
2551             "0: @ loop                                          \n"
2552             "   subs    %[count], %[count], #1                  \n"
2553             "   pld     [%[src]]                                \n"
2554             "   add     %[src], %[src], #64                     \n"
2555             "   bgt 0b                                          \n"
2556
2557             /* Clobbered input registers marked as input/outputs */
2558             : [src] "+r" (src_ptr), [count] "+r" (count)
2559             :     /* no unclobbered inputs */
2560             : "cc"
2561             );
2562 #else
2563         do
2564         {
2565             __pld (src_ptr);
2566             src_ptr += 64;
2567         }
2568         while (--count);
2569 #endif
2570     }
2571
2572     {
2573         uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */
2574
2575         /* row-major order */
2576         /* left edge, middle block, right edge */
2577         for ( ; height--; src_line += src_stride, aligned_line += dst_stride)
2578         {
2579             /* Uncached framebuffer access is really, really slow if we do
2580              * it piecemeal. It should be much faster if we grab it all at
2581              * once. One scanline should easily fit in L1 cache, so this
2582              * should not waste RAM bandwidth.
2583              */
2584             neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2585
2586             /* Apply the actual filter */
2587             ARGB8_over_565_8_pix_neon (
2588                 src_line, scan_line + kernel_offset,
2589                 src_stride * sizeof(*src_line), kernel_count);
2590
2591             /* Copy the modified scanline back */
2592             neon_quadword_copy (dst_line,
2593                                 scan_line + copy_offset,
2594                                 width >> 3, (width & 7) * 2);
2595         }
2596     }
2597 }
2598
2599 #endif  /* USE_GCC_INLINE_ASM */
2600
2601 static const pixman_fast_path_t arm_neon_fast_path_array[] =
2602 {
2603     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       neon_composite_add_8888_8_8,     0 },
2604     { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null,     PIXMAN_a8,       neon_composite_add_8000_8000,    0 },
2605     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   neon_composite_over_n_8_0565,    0 },
2606     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   neon_composite_over_n_8_0565,    0 },
2607     { PIXMAN_OP_SRC,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_src_24_16,        0 },
2608     { PIXMAN_OP_SRC,  PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_src_24_16,        0 },
2609     { PIXMAN_OP_SRC,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_24_16,        0 },
2610     { PIXMAN_OP_SRC,  PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_24_16,        0 },
2611 #ifdef USE_GCC_INLINE_ASM
2612     { PIXMAN_OP_SRC,  PIXMAN_r5g6b5,   PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_src_16_16,        0 },
2613     { PIXMAN_OP_SRC,  PIXMAN_b5g6r5,   PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_16_16,        0 },
2614 #if 0 /* this code has some bugs */
2615     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_over_n_0565,      0 },
2616     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_over_n_0565,      0 },
2617     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_over_8888_0565,   0 },
2618     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_over_8888_0565,   0 },
2619 #endif
2620 #endif
2621     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, neon_composite_over_8888_8888,   0 },
2622     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, neon_composite_over_8888_8888,   0 },
2623     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, neon_composite_over_8888_8888,   0 },
2624     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, neon_composite_over_8888_8888,   0 },
2625     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2626     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2627     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, neon_composite_over_n_8_8888,    0 },
2628     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, neon_composite_over_n_8_8888,    0 },
2629     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, neon_composite_over_n_8_8888,    0 },
2630     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, neon_composite_over_n_8_8888,    0 },
2631     { PIXMAN_OP_NONE },
2632 };
2633
2634 const pixman_fast_path_t *const arm_neon_fast_paths = arm_neon_fast_path_array;
2635
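     /* Composite entry point: try the NEON fast paths first and, if none matches
      * the requested operator and formats, hand the request to the delegate
      * implementation further down the chain.
      */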
2636 static void
2637 arm_neon_composite (pixman_implementation_t *imp,
2638                     pixman_op_t              op,
2639                     pixman_image_t *         src,
2640                     pixman_image_t *         mask,
2641                     pixman_image_t *         dest,
2642                     int32_t                  src_x,
2643                     int32_t                  src_y,
2644                     int32_t                  mask_x,
2645                     int32_t                  mask_y,
2646                     int32_t                  dest_x,
2647                     int32_t                  dest_y,
2648                     int32_t                  width,
2649                     int32_t                  height)
2650 {
2651     if (_pixman_run_fast_path (arm_neon_fast_paths, imp,
2652                                op, src, mask, dest,
2653                                src_x, src_y,
2654                                mask_x, mask_y,
2655                                dest_x, dest_y,
2656                                width, height))
2657     {
2658         return;
2659     }
2660
2661     _pixman_implementation_composite (imp->delegate, op,
2662                                       src, mask, dest,
2663                                       src_x, src_y,
2664                                       mask_x, mask_y,
2665                                       dest_x, dest_y,
2666                                       width, height);
2667 }
2668
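     /* Low-level blit helper: returns TRUE when the copy has been handled here,
      * FALSE when the caller should fall back to another implementation. Only
      * straight, same-depth copies with whole-byte pixel sizes are accelerated.
      */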
2669 static pixman_bool_t
2670 pixman_blt_neon (void *src_bits,
2671                  void *dst_bits,
2672                  int   src_stride,
2673                  int   dst_stride,
2674                  int   src_bpp,
2675                  int   dst_bpp,
2676                  int   src_x,
2677                  int   src_y,
2678                  int   dst_x,
2679                  int   dst_y,
2680                  int   width,
2681                  int   height)
2682 {
2683     if (!width || !height)
2684         return TRUE;
2685
2686     /* accelerate only straight copies involving complete bytes */
2687     if (src_bpp != dst_bpp || (src_bpp & 7))
2688         return FALSE;
2689
2690     {
2691         uint32_t bytes_per_pixel = src_bpp >> 3;
2692         uint32_t byte_width = width * bytes_per_pixel;
2693         /* stride parameters are in 32-bit words (uint32_t units), so convert to bytes */
2694         int32_t src_stride_bytes = src_stride * 4;
2695         int32_t dst_stride_bytes = dst_stride * 4;
2696         uint8_t *src_bytes = ((uint8_t*) src_bits) +
2697             src_y * src_stride_bytes + src_x * bytes_per_pixel;
2698         uint8_t *dst_bytes = ((uint8_t*) dst_bits) +
2699             dst_y * dst_stride_bytes + dst_x * bytes_per_pixel;
2700         uint32_t quadword_count = byte_width / 16;
2701         uint32_t offset         = byte_width % 16;
2702
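             /* Each row is copied as quadword_count 16-byte blocks plus an
              * 'offset'-byte tail, the same split used for the scanline staging
              * above; e.g. a 100 pixel wide 32 bpp blit is 400 bytes per row,
              * i.e. 25 quadwords and no tail.
              */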
2703         while (height--)
2704         {
2705             neon_quadword_copy (dst_bytes, src_bytes, quadword_count, offset);
2706             src_bytes += src_stride_bytes;
2707             dst_bytes += dst_stride_bytes;
2708         }
2709     }
2710
2711     return TRUE;
2712 }
2713
2714 static pixman_bool_t
2715 arm_neon_blt (pixman_implementation_t *imp,
2716               uint32_t *               src_bits,
2717               uint32_t *               dst_bits,
2718               int                      src_stride,
2719               int                      dst_stride,
2720               int                      src_bpp,
2721               int                      dst_bpp,
2722               int                      src_x,
2723               int                      src_y,
2724               int                      dst_x,
2725               int                      dst_y,
2726               int                      width,
2727               int                      height)
2728 {
2729     if (pixman_blt_neon (
2730             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2731             src_x, src_y, dst_x, dst_y, width, height))
2732     {
2733         return TRUE;
2734     }
2735
2736     return _pixman_implementation_blt (
2737                imp->delegate,
2738                src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2739                src_x, src_y, dst_x, dst_y, width, height);
2740 }
2741
2742 static pixman_bool_t
2743 arm_neon_fill (pixman_implementation_t *imp,
2744                uint32_t *               bits,
2745                int                      stride,
2746                int                      bpp,
2747                int                      x,
2748                int                      y,
2749                int                      width,
2750                int                      height,
2751                uint32_t xor)
2752 {
2753     if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor))
2754         return TRUE;
2755
2756     return _pixman_implementation_fill (
2757         imp->delegate, bits, stride, bpp, x, y, width, height, xor);
2758 }
2759
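     /* Build the NEON implementation on top of the ARM SIMD one, so that any
      * operation without a NEON fast path falls through to the SIMD code and,
      * from there, further down the delegate chain.
      */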
2760 pixman_implementation_t *
2761 _pixman_implementation_create_arm_neon (void)
2762 {
2763     pixman_implementation_t *simd = _pixman_implementation_create_arm_simd ();
2764     pixman_implementation_t *imp = _pixman_implementation_create (simd);
2765
2766     imp->composite = arm_neon_composite;
2767 #if 0 /* this code has some bugs */
2768     imp->blt = arm_neon_blt;
2769 #endif
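         /* arm_neon_blt stays compiled but is left unhooked while the blt path
          * above is known to be buggy; the fill hook below is enabled.
          */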
2770     imp->fill = arm_neon_fill;
2771
2772     return imp;
2773 }
2774