2 * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
4 * Permission to use, copy, modify, distribute, and sell this software and its
5 * documentation for any purpose is hereby granted without fee, provided that
6 * the above copyright notice appear in all copies and that both that
7 * copyright notice and this permission notice appear in supporting
8 * documentation, and that the name of ARM Ltd not be used in
9 * advertising or publicity pertaining to distribution of the software without
10 * specific, written prior permission. ARM Ltd makes no
11 * representations about the suitability of this software for any purpose. It
12 * is provided "as is" without express or implied warranty.
14 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
23 * Author: Ian Rickards (ian.rickards@arm.com)
24 * Author: Jonathan Morton (jonathan.morton@movial.com)
25 * Author: Markku Vire (markku.vire@movial.com)
#include <arm_neon.h>	/* NEON intrinsics used throughout this file */
#include "pixman-private.h"
37 /* Deal with an intrinsic that is defined differently in GCC */
#if !defined(__ARMCC_VERSION) && !defined(__pld)
#define __pld(_x) __builtin_prefetch (_x)
#endif
static force_inline uint8x8x4_t
unpack0565 (uint16x8_t rgb)
{
    uint16x8_t gb, b;
    uint8x8x4_t res;

    res.val[3] = vdup_n_u8 (0);
    gb = vshrq_n_u16 (rgb, 5);
    b = vshrq_n_u16 (rgb, 5 + 6);

    res.val[0] = vmovn_u16 (rgb);  /* get low 5 bits */
    res.val[1] = vmovn_u16 (gb);   /* get mid 6 bits */
    res.val[2] = vmovn_u16 (b);    /* get top 5 bits */

    res.val[0] = vshl_n_u8 (res.val[0], 3); /* shift to top */
    res.val[1] = vshl_n_u8 (res.val[1], 2); /* shift to top */
    res.val[2] = vshl_n_u8 (res.val[2], 3); /* shift to top */

    /* duplicate the high bits into the freshly cleared low bits */
    res.val[0] = vsri_n_u8 (res.val[0], res.val[0], 5);
    res.val[1] = vsri_n_u8 (res.val[1], res.val[1], 6);
    res.val[2] = vsri_n_u8 (res.val[2], res.val[2], 5);

    return res;
}
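/* Note on the unpacking above: a 5- or 6-bit field is widened to 8 bits by
 * shifting it to the top of the byte and then replicating its high bits into
 * the low bits (the vsri step), i.e. for a 5-bit value v: v8 = (v << 3) | (v >> 2),
 * so 0 stays 0 and 31 becomes 255.
 */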
static force_inline uint16x8_t
pack0565 (uint8x8x4_t s)
{
    uint16x8_t rgb, val_g, val_r;

    rgb = vshll_n_u8 (s.val[2], 8);
    val_g = vshll_n_u8 (s.val[1], 8);
    val_r = vshll_n_u8 (s.val[0], 8);

    rgb = vsriq_n_u16 (rgb, val_g, 5);
    rgb = vsriq_n_u16 (rgb, val_r, 5 + 6);

    return rgb;
}
static force_inline uint8x8_t
neon2mul (uint8x8_t x, uint8x8_t alpha)
{
    uint16x8_t tmp, tmp2;
    uint8x8_t res;
    tmp = vmull_u8 (x, alpha);
    tmp2 = vrshrq_n_u16 (tmp, 8);
    res = vraddhn_u16 (tmp, tmp2);
    return res;
}
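/* For reference, neon2mul computes a rounded (x * alpha) / 255 per byte:
 * t = x * alpha, then vraddhn_u16 (t, vrshrq_n_u16 (t, 8)) is the high byte
 * of (t + 128) + ((t + 128) >> 8). A minimal scalar sketch of the same
 * approximation (the function name is illustrative only, not part of pixman):
 */
static force_inline uint8_t
scalar_mul_div_255 (uint8_t x, uint8_t alpha)
{
    uint16_t t = (uint16_t)x * alpha;

    /* effectively (x * alpha) / 255, rounded to nearest */
    return (uint8_t)((t + 128 + ((t + 128) >> 8)) >> 8);
}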
static force_inline uint8x8x4_t
neon8mul (uint8x8x4_t x,
          uint8x8_t   alpha)
{
    uint16x8x4_t tmp;
    uint8x8x4_t res;
    uint16x8_t qtmp1, qtmp2;

    tmp.val[0] = vmull_u8 (x.val[0], alpha);
    tmp.val[1] = vmull_u8 (x.val[1], alpha);
    tmp.val[2] = vmull_u8 (x.val[2], alpha);
    tmp.val[3] = vmull_u8 (x.val[3], alpha);

    qtmp1 = vrshrq_n_u16 (tmp.val[0], 8);
    qtmp2 = vrshrq_n_u16 (tmp.val[1], 8);
    res.val[0] = vraddhn_u16 (tmp.val[0], qtmp1);
    qtmp1 = vrshrq_n_u16 (tmp.val[2], 8);
    res.val[1] = vraddhn_u16 (tmp.val[1], qtmp2);
    qtmp2 = vrshrq_n_u16 (tmp.val[3], 8);
    res.val[2] = vraddhn_u16 (tmp.val[2], qtmp1);
    res.val[3] = vraddhn_u16 (tmp.val[3], qtmp2);

    return res;
}
static force_inline uint8x8x4_t
neon8qadd (uint8x8x4_t x, uint8x8x4_t y)
{
    uint8x8x4_t res;
    res.val[0] = vqadd_u8 (x.val[0], y.val[0]);
    res.val[1] = vqadd_u8 (x.val[1], y.val[1]);
    res.val[2] = vqadd_u8 (x.val[2], y.val[2]);
    res.val[3] = vqadd_u8 (x.val[3], y.val[3]);
    return res;
}
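/* Together these helpers implement the premultiplied OVER operator used by
 * the compositing routines below: per channel, dest = src + dest * (255 -
 * src_alpha) / 255, with the final addition saturating (vqadd). A scalar
 * sketch for one a8r8g8b8 pixel, for reference only (the function name is
 * illustrative, not part of pixman):
 */
static force_inline uint32_t
scalar_over_8888 (uint32_t src, uint32_t dst)
{
    uint32_t sa = src >> 24;
    uint32_t res = 0;
    int c;

    for (c = 0; c < 32; c += 8)
    {
	uint32_t s = (src >> c) & 0xff;
	uint32_t d = (dst >> c) & 0xff;
	uint32_t t = d * (255 - sa);

	d = (t + 128 + ((t + 128) >> 8)) >> 8;	/* d * (255 - sa) / 255 */
	d += s;
	if (d > 255)
	    d = 255;				/* saturate, like vqadd.u8 */
	res |= d << c;
    }

    return res;
}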
135 neon_composite_add_8000_8000 (pixman_implementation_t * impl,
137 pixman_image_t * src_image,
138 pixman_image_t * mask_image,
139 pixman_image_t * dst_image,
149 uint8_t *dst_line, *dst;
150 uint8_t *src_line, *src;
151 int dst_stride, src_stride;
154 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
155 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
159 /* Use overlapping 8-pixel method */
162 uint8_t *keep_dst = 0;
163 uint8x8_t sval, dval, temp;
166 dst_line += dst_stride;
168 src_line += src_stride;
171 #ifndef USE_GCC_INLINE_ASM
172 sval = vld1_u8 ((void*)src);
173 dval = vld1_u8 ((void*)dst);
176 temp = vqadd_u8 (dval, sval);
184 sval = vld1_u8 ((void*)src);
185 dval = vld1_u8 ((void*)dst);
187 vst1_u8 ((void*)keep_dst, temp);
190 temp = vqadd_u8 (dval, sval);
197 vst1_u8 ((void*)keep_dst, temp);
200 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
201 "vld1.8 {d0}, [%[src]]\n\t"
202 "vld1.8 {d4}, [%[dst]]\n\t"
203 "mov %[keep_dst], %[dst]\n\t"
205 "and ip, %[w], #7\n\t"
206 "add %[src], %[src], ip\n\t"
207 "add %[dst], %[dst], ip\n\t"
208 "subs %[w], %[w], ip\n\t"
212 "vld1.8 {d0}, [%[src]]!\n\t"
213 "vld1.8 {d4}, [%[dst]]!\n\t"
214 "vst1.8 {d20}, [%[keep_dst]]\n\t"
215 "sub %[keep_dst], %[dst], #8\n\t"
216 "subs %[w], %[w], #8\n\t"
218 "vqadd.u8 d20, d0, d4\n\t"
223 "vst1.8 {d20}, [%[keep_dst]]\n\t"
225 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
227 : "ip", "cc", "memory", "d0", "d4",
235 const uint8_t nil = 0;
236 const uint8x8_t vnil = vld1_dup_u8 (&nil);
240 uint8x8_t sval = vnil, dval = vnil;
241 uint8_t *dst4 = 0, *dst2 = 0;
244 dst_line += dst_stride;
246 src_line += src_stride;
251 sval = vreinterpret_u8_u32 (
252 vld1_lane_u32 ((void*)src, vreinterpret_u32_u8 (sval), 1));
253 dval = vreinterpret_u8_u32 (
254 vld1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (dval), 1));
263 sval = vreinterpret_u8_u16 (
264 vld1_lane_u16 ((void*)src, vreinterpret_u16_u8 (sval), 1));
265 dval = vreinterpret_u8_u16 (
266 vld1_lane_u16 ((void*)dst, vreinterpret_u16_u8 (dval), 1));
275 sval = vld1_lane_u8 (src, sval, 1);
276 dval = vld1_lane_u8 (dst, dval, 1);
279 dval = vqadd_u8 (dval, sval);
282 vst1_lane_u8 (dst, dval, 1);
285 vst1_lane_u16 ((void*)dst2, vreinterpret_u16_u8 (dval), 1);
288 vst1_lane_u32 ((void*)dst4, vreinterpret_u32_u8 (dval), 1);
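/* The loops above show the pattern shared by all the wide paths in this
 * file: the first 8 pixels are loaded and composited, the pointers are then
 * advanced by (width & 7) so the rest of the scanline falls into whole
 * 8-pixel groups (the second group may overlap the first by up to 7 pixels),
 * and each group's result is written out one iteration late, via 'keep_dst',
 * after the next group's source and dest have already been loaded, so the
 * overlapping store never clobbers destination data that still has to be
 * read. Scanlines narrower than 8 pixels instead use the 4/2/1-pixel lane
 * loads and stores of the second loop.
 */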
294 neon_composite_over_8888_8888 (pixman_implementation_t * impl,
296 pixman_image_t * src_image,
297 pixman_image_t * mask_image,
298 pixman_image_t * dst_image,
308 uint32_t *dst_line, *dst;
309 uint32_t *src_line, *src;
310 int dst_stride, src_stride;
313 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
314 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
318 /* Use overlapping 8-pixel method */
321 uint32_t *keep_dst = 0;
322 uint8x8x4_t sval, dval, temp;
325 dst_line += dst_stride;
327 src_line += src_stride;
330 #ifndef USE_GCC_INLINE_ASM
331 sval = vld4_u8 ((void*)src);
332 dval = vld4_u8 ((void*)dst);
335 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
336 temp = neon8qadd (sval, temp);
344 sval = vld4_u8 ((void*)src);
345 dval = vld4_u8 ((void*)dst);
347 vst4_u8 ((void*)keep_dst, temp);
350 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
351 temp = neon8qadd (sval, temp);
358 vst4_u8 ((void*)keep_dst, temp);
361 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
362 "vld4.8 {d0-d3}, [%[src]]\n\t"
363 "vld4.8 {d4-d7}, [%[dst]]\n\t"
364 "mov %[keep_dst], %[dst]\n\t"
366 "and ip, %[w], #7\n\t"
367 "add %[src], %[src], ip, LSL#2\n\t"
368 "add %[dst], %[dst], ip, LSL#2\n\t"
369 "subs %[w], %[w], ip\n\t"
373 "vld4.8 {d0-d3}, [%[src]]!\n\t"
374 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
375 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
376 "sub %[keep_dst], %[dst], #8*4\n\t"
377 "subs %[w], %[w], #8\n\t"
380 "vmull.u8 q10, d31, d4\n\t"
381 "vmull.u8 q11, d31, d5\n\t"
382 "vmull.u8 q12, d31, d6\n\t"
383 "vmull.u8 q13, d31, d7\n\t"
384 "vrshr.u16 q8, q10, #8\n\t"
385 "vrshr.u16 q9, q11, #8\n\t"
386 "vraddhn.u16 d20, q10, q8\n\t"
387 "vraddhn.u16 d21, q11, q9\n\t"
388 "vrshr.u16 q8, q12, #8\n\t"
389 "vrshr.u16 q9, q13, #8\n\t"
390 "vraddhn.u16 d22, q12, q8\n\t"
391 "vraddhn.u16 d23, q13, q9\n\t"
392 /* result in d20-d23 */
393 "vqadd.u8 d20, d0, d20\n\t"
394 "vqadd.u8 d21, d1, d21\n\t"
395 "vqadd.u8 d22, d2, d22\n\t"
396 "vqadd.u8 d23, d3, d23\n\t"
401 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
403 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
405 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
406 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23"
413 uint8x8_t alpha_selector = vreinterpret_u8_u64 (
414 vcreate_u64 (0x0707070703030303ULL));
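/* vtbl1_u8 with this selector picks byte 3 for lanes 0-3 and byte 7 for
 * lanes 4-7, i.e. (on a little-endian target) it broadcasts each of the two
 * loaded a8r8g8b8 pixels' alpha across that pixel's four channel bytes.
 */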
416 /* Handle width < 8 */
420 dst_line += dst_stride;
422 src_line += src_stride;
427 uint8x8_t sval, dval;
429 /* two 32-bit pixels packed into D-reg; ad-hoc vectorization */
430 sval = vreinterpret_u8_u32 (vld1_u32 ((void*)src));
431 dval = vreinterpret_u8_u32 (vld1_u32 ((void*)dst));
432 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
433 vst1_u8 ((void*)dst, vqadd_u8 (sval, dval));
442 uint8x8_t sval, dval;
444 /* single 32-bit pixel in lane 0 */
445 sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)src)); /* only interested in lane 0 */
446 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)dst)); /* only interested in lane 0 */
447 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
448 vst1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
455 neon_composite_over_8888_n_8888 (pixman_implementation_t * impl,
457 pixman_image_t * src_image,
458 pixman_image_t * mask_image,
459 pixman_image_t * dst_image,
469 uint32_t *dst_line, *dst;
470 uint32_t *src_line, *src;
472 int dst_stride, src_stride;
474 uint8x8_t mask_alpha;
476 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
477 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
479 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
480 mask_alpha = vdup_n_u8 ((mask) >> 24);
484 /* Use overlapping 8-pixel method */
488 dst_line += dst_stride;
490 src_line += src_stride;
493 uint32_t *keep_dst = 0;
495 #ifndef USE_GCC_INLINE_ASM
496 uint8x8x4_t sval, dval, temp;
498 sval = vld4_u8 ((void*)src);
499 dval = vld4_u8 ((void*)dst);
502 sval = neon8mul (sval, mask_alpha);
503 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
504 temp = neon8qadd (sval, temp);
512 sval = vld4_u8 ((void*)src);
513 dval = vld4_u8 ((void*)dst);
515 vst4_u8 ((void*)keep_dst, temp);
518 sval = neon8mul (sval, mask_alpha);
519 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
520 temp = neon8qadd (sval, temp);
526 vst4_u8 ((void*)keep_dst, temp);
529 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
530 "vdup.32 d30, %[mask]\n\t"
531 "vdup.8 d30, d30[3]\n\t"
533 "vld4.8 {d0-d3}, [%[src]]\n\t"
534 "vld4.8 {d4-d7}, [%[dst]]\n\t"
535 "mov %[keep_dst], %[dst]\n\t"
537 "and ip, %[w], #7\n\t"
538 "add %[src], %[src], ip, LSL#2\n\t"
539 "add %[dst], %[dst], ip, LSL#2\n\t"
540 "subs %[w], %[w], ip\n\t"
544 "vld4.8 {d0-d3}, [%[src]]!\n\t"
545 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
546 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
547 "sub %[keep_dst], %[dst], #8*4\n\t"
548 "subs %[w], %[w], #8\n\t"
551 "vmull.u8 q10, d30, d0\n\t"
552 "vmull.u8 q11, d30, d1\n\t"
553 "vmull.u8 q12, d30, d2\n\t"
554 "vmull.u8 q13, d30, d3\n\t"
555 "vrshr.u16 q8, q10, #8\n\t"
556 "vrshr.u16 q9, q11, #8\n\t"
557 "vraddhn.u16 d0, q10, q8\n\t"
558 "vraddhn.u16 d1, q11, q9\n\t"
559 "vrshr.u16 q9, q13, #8\n\t"
560 "vrshr.u16 q8, q12, #8\n\t"
561 "vraddhn.u16 d3, q13, q9\n\t"
562 "vraddhn.u16 d2, q12, q8\n\t"
565 "vmull.u8 q10, d31, d4\n\t"
566 "vmull.u8 q11, d31, d5\n\t"
567 "vmull.u8 q12, d31, d6\n\t"
568 "vmull.u8 q13, d31, d7\n\t"
569 "vrshr.u16 q8, q10, #8\n\t"
570 "vrshr.u16 q9, q11, #8\n\t"
571 "vraddhn.u16 d20, q10, q8\n\t"
572 "vrshr.u16 q8, q12, #8\n\t"
573 "vraddhn.u16 d21, q11, q9\n\t"
574 "vrshr.u16 q9, q13, #8\n\t"
575 "vraddhn.u16 d22, q12, q8\n\t"
576 "vraddhn.u16 d23, q13, q9\n\t"
578 /* result in d20-d23 */
579 "vqadd.u8 d20, d0, d20\n\t"
580 "vqadd.u8 d21, d1, d21\n\t"
581 "vqadd.u8 d22, d2, d22\n\t"
582 "vqadd.u8 d23, d3, d23\n\t"
587 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
589 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
591 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
592 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
600 uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));
602 /* Handle width < 8 */
606 dst_line += dst_stride;
608 src_line += src_stride;
613 uint8x8_t sval, dval;
615 sval = vreinterpret_u8_u32 (vld1_u32 ((void*)src));
616 dval = vreinterpret_u8_u32 (vld1_u32 ((void*)dst));
618 /* sval * const alpha_mul */
619 sval = neon2mul (sval, mask_alpha);
621 /* dval * 255-(src alpha) */
622 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
624 vst1_u8 ((void*)dst, vqadd_u8 (sval, dval));
633 uint8x8_t sval, dval;
635 sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)src));
636 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)dst));
638 /* sval * const alpha_mul */
639 sval = neon2mul (sval, mask_alpha);
641 /* dval * 255-(src alpha) */
642 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
644 vst1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
651 neon_composite_over_n_8_0565 (pixman_implementation_t * impl,
653 pixman_image_t * src_image,
654 pixman_image_t * mask_image,
655 pixman_image_t * dst_image,
666 uint16_t *dst_line, *dst;
667 uint8_t *mask_line, *mask;
668 int dst_stride, mask_stride;
673 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
    sval2 = vreinterpret_u8_u32 (vdup_n_u32 (src));
    sval8.val[0] = vdup_lane_u8 (sval2, 0);
    sval8.val[1] = vdup_lane_u8 (sval2, 1);
    sval8.val[2] = vdup_lane_u8 (sval2, 2);
    sval8.val[3] = vdup_lane_u8 (sval2, 3);
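    /* sval2 holds the solid source colour replicated across the register;
     * sval8.val[0..3] are its four bytes (blue, green, red, alpha on a
     * little-endian target) each broadcast across a whole D register,
     * matching the channel order produced by unpack0565 above.
     */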
685 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
686 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
690 /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
693 uint16_t *keep_dst=0;
696 dst_line += dst_stride;
698 mask_line += mask_stride;
701 #ifndef USE_GCC_INLINE_ASM
703 uint16x8_t dval, temp;
704 uint8x8x4_t sval8temp;
706 alpha = vld1_u8 ((void*)mask);
707 dval = vld1q_u16 ((void*)dst);
710 sval8temp = neon8mul (sval8,alpha);
711 temp = pack0565 (neon8qadd (sval8temp,neon8mul (unpack0565 (dval),vmvn_u8 (sval8temp.val[3]))));
719 dval = vld1q_u16 ((void*)dst);
720 alpha = vld1_u8 ((void*)mask);
722 vst1q_u16 ((void*)keep_dst,temp);
725 sval8temp = neon8mul (sval8,alpha);
726 temp = pack0565 (neon8qadd (sval8temp,neon8mul (unpack0565 (dval),vmvn_u8 (sval8temp.val[3]))));
732 vst1q_u16 ((void*)keep_dst,temp);
735 "vdup.32 d0, %[src]\n\t"
736 "vdup.8 d1, d0[1]\n\t"
737 "vdup.8 d2, d0[2]\n\t"
738 "vdup.8 d3, d0[3]\n\t"
739 "vdup.8 d0, d0[0]\n\t"
741 "vld1.8 {q12}, [%[dst]]\n\t"
742 "vld1.8 {d31}, [%[mask]]\n\t"
743 "mov %[keep_dst], %[dst]\n\t"
745 "and ip, %[w], #7\n\t"
746 "add %[mask], %[mask], ip\n\t"
747 "add %[dst], %[dst], ip, LSL#1\n\t"
748 "subs %[w], %[w], ip\n\t"
753 "vld1.16 {q12}, [%[dst]]!\n\t"
754 "vld1.8 {d31}, [%[mask]]!\n\t"
755 "vst1.16 {q10}, [%[keep_dst]]\n\t"
756 "sub %[keep_dst], %[dst], #8*2\n\t"
757 "subs %[w], %[w], #8\n\t"
/* expand 0565 q12 to 8888 {d4-d6} (alpha is not needed) */
760 "vmovn.u16 d4, q12\t\n"
761 "vshr.u16 q11, q12, #5\t\n"
762 "vshr.u16 q10, q12, #6+5\t\n"
763 "vmovn.u16 d5, q11\t\n"
764 "vmovn.u16 d6, q10\t\n"
765 "vshl.u8 d4, d4, #3\t\n"
766 "vshl.u8 d5, d5, #2\t\n"
767 "vshl.u8 d6, d6, #3\t\n"
768 "vsri.u8 d4, d4, #5\t\n"
769 "vsri.u8 d5, d5, #6\t\n"
770 "vsri.u8 d6, d6, #5\t\n"
772 "vmull.u8 q10, d31, d0\n\t"
773 "vmull.u8 q11, d31, d1\n\t"
774 "vmull.u8 q12, d31, d2\n\t"
775 "vmull.u8 q13, d31, d3\n\t"
776 "vrshr.u16 q8, q10, #8\n\t"
777 "vrshr.u16 q9, q11, #8\n\t"
778 "vraddhn.u16 d20, q10, q8\n\t"
779 "vraddhn.u16 d21, q11, q9\n\t"
780 "vrshr.u16 q9, q13, #8\n\t"
781 "vrshr.u16 q8, q12, #8\n\t"
782 "vraddhn.u16 d23, q13, q9\n\t"
783 "vraddhn.u16 d22, q12, q8\n\t"
/* multiply dest by inverse alpha; this block is duplicated in the 4/2/1- and 8-pixel versions */
786 "vmvn.8 d30, d23\n\t"
787 "vmull.u8 q14, d30, d6\n\t"
788 "vmull.u8 q13, d30, d5\n\t"
789 "vmull.u8 q12, d30, d4\n\t"
790 "vrshr.u16 q8, q14, #8\n\t"
791 "vrshr.u16 q9, q13, #8\n\t"
792 "vraddhn.u16 d6, q14, q8\n\t"
793 "vrshr.u16 q8, q12, #8\n\t"
794 "vraddhn.u16 d5, q13, q9\n\t"
795 "vqadd.u8 d6, d6, d22\n\t" /* moved up */
796 "vraddhn.u16 d4, q12, q8\n\t"
797 /* intentionally don't calculate alpha */
798 /* result in d4-d6 */
800 /* "vqadd.u8 d6, d6, d22\n\t" ** moved up */
801 "vqadd.u8 d5, d5, d21\n\t"
802 "vqadd.u8 d4, d4, d20\n\t"
/* pack the 8888 result in {d4-d6} to 0565 in q10 */
805 "vshll.u8 q10, d6, #8\n\t"
806 "vshll.u8 q3, d5, #8\n\t"
807 "vshll.u8 q2, d4, #8\n\t"
808 "vsri.u16 q10, q3, #5\t\n"
809 "vsri.u16 q10, q2, #11\t\n"
814 "vst1.16 {q10}, [%[keep_dst]]\n\t"
816 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
818 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
819 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
829 void *dst4=0, *dst2=0;
832 dst_line += dst_stride;
834 mask_line += mask_stride;
838 #ifndef USE_GCC_INLINE_ASM
840 uint16x8_t dval, temp;
841 uint8x8x4_t sval8temp;
845 alpha = vreinterpret_u8_u32 (vld1_lane_u32 ((void*)mask,vreinterpret_u32_u8 (alpha),1));
846 dval = vreinterpretq_u16_u64 (vld1q_lane_u64 ((void*)dst,vreinterpretq_u64_u16 (dval),1));
853 alpha = vreinterpret_u8_u16 (vld1_lane_u16 ((void*)mask,vreinterpret_u16_u8 (alpha),1));
854 dval = vreinterpretq_u16_u32 (vld1q_lane_u32 ((void*)dst,vreinterpretq_u32_u16 (dval),1));
861 alpha = vld1_lane_u8 ((void*)mask,alpha,1);
862 dval = vld1q_lane_u16 ((void*)dst,dval,1);
865 sval8temp = neon8mul (sval8,alpha);
866 temp = pack0565 (neon8qadd (sval8temp,neon8mul (unpack0565 (dval),vmvn_u8 (sval8temp.val[3]))));
869 vst1q_lane_u16 ((void*)dst,temp,1);
871 vst1q_lane_u32 ((void*)dst2,vreinterpretq_u32_u16 (temp),1);
873 vst1q_lane_u64 ((void*)dst4,vreinterpretq_u64_u16 (temp),1);
876 "vdup.32 d0, %[src]\n\t"
877 "vdup.8 d1, d0[1]\n\t"
878 "vdup.8 d2, d0[2]\n\t"
879 "vdup.8 d3, d0[3]\n\t"
880 "vdup.8 d0, d0[0]\n\t"
885 "vld1.64 {d25}, [%[dst]]\n\t"
886 "vld1.32 {d31[1]}, [%[mask]]\n\t"
887 "mov %[dst4], %[dst]\t\n"
888 "add %[mask], %[mask], #4\t\n"
889 "add %[dst], %[dst], #4*2\t\n"
894 "vld1.32 {d24[1]}, [%[dst]]\n\t"
895 "vld1.16 {d31[1]}, [%[mask]]\n\t"
896 "mov %[dst2], %[dst]\t\n"
897 "add %[mask], %[mask], #2\t\n"
898 "add %[dst], %[dst], #2*2\t\n"
903 "vld1.16 {d24[1]}, [%[dst]]\n\t"
904 "vld1.8 {d31[1]}, [%[mask]]\n\t"
/* expand 0565 q12 to 8888 {d4-d6} (alpha is not needed) */
908 "vmovn.u16 d4, q12\t\n"
909 "vshr.u16 q11, q12, #5\t\n"
910 "vshr.u16 q10, q12, #6+5\t\n"
911 "vmovn.u16 d5, q11\t\n"
912 "vmovn.u16 d6, q10\t\n"
913 "vshl.u8 d4, d4, #3\t\n"
914 "vshl.u8 d5, d5, #2\t\n"
915 "vshl.u8 d6, d6, #3\t\n"
916 "vsri.u8 d4, d4, #5\t\n"
917 "vsri.u8 d5, d5, #6\t\n"
918 "vsri.u8 d6, d6, #5\t\n"
920 "vmull.u8 q10, d31, d0\n\t"
921 "vmull.u8 q11, d31, d1\n\t"
922 "vmull.u8 q12, d31, d2\n\t"
923 "vmull.u8 q13, d31, d3\n\t"
924 "vrshr.u16 q8, q10, #8\n\t"
925 "vrshr.u16 q9, q11, #8\n\t"
926 "vraddhn.u16 d20, q10, q8\n\t"
927 "vraddhn.u16 d21, q11, q9\n\t"
928 "vrshr.u16 q9, q13, #8\n\t"
929 "vrshr.u16 q8, q12, #8\n\t"
930 "vraddhn.u16 d23, q13, q9\n\t"
931 "vraddhn.u16 d22, q12, q8\n\t"
/* multiply dest by inverse alpha; this block is duplicated in the 4/2/1- and 8-pixel versions */
934 "vmvn.8 d30, d23\n\t"
935 "vmull.u8 q14, d30, d6\n\t"
936 "vmull.u8 q13, d30, d5\n\t"
937 "vmull.u8 q12, d30, d4\n\t"
938 "vrshr.u16 q8, q14, #8\n\t"
939 "vrshr.u16 q9, q13, #8\n\t"
940 "vraddhn.u16 d6, q14, q8\n\t"
941 "vrshr.u16 q8, q12, #8\n\t"
942 "vraddhn.u16 d5, q13, q9\n\t"
943 "vqadd.u8 d6, d6, d22\n\t" /* moved up */
944 "vraddhn.u16 d4, q12, q8\n\t"
945 /* intentionally don't calculate alpha */
946 /* result in d4-d6 */
948 /* "vqadd.u8 d6, d6, d22\n\t" ** moved up */
949 "vqadd.u8 d5, d5, d21\n\t"
950 "vqadd.u8 d4, d4, d20\n\t"
/* pack the 8888 result in {d4-d6} to 0565 in q10 */
953 "vshll.u8 q10, d6, #8\n\t"
954 "vshll.u8 q3, d5, #8\n\t"
955 "vshll.u8 q2, d4, #8\n\t"
956 "vsri.u16 q10, q3, #5\t\n"
957 "vsri.u16 q10, q2, #11\t\n"
960 "beq skip_store1\t\n"
961 "vst1.16 {d20[1]}, [%[dst]]\t\n"
964 "beq skip_store2\t\n"
965 "vst1.32 {d20[1]}, [%[dst2]]\t\n"
968 "beq skip_store4\t\n"
969 "vst1.16 {d21}, [%[dst4]]\t\n"
972 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [dst4] "+r" (dst4), [dst2] "+r" (dst2)
974 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
975 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
984 neon_composite_over_n_8_8888 (pixman_implementation_t * impl,
986 pixman_image_t * src_image,
987 pixman_image_t * mask_image,
988 pixman_image_t * dst_image,
999 uint32_t *dst_line, *dst;
1000 uint8_t *mask_line, *mask;
1001 int dst_stride, mask_stride;
1005 uint8x8_t mask_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0101010100000000ULL));
1006 uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));
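/* mask_selector expands the two mask bytes loaded as one u16 into
 * per-channel coverage: lanes 0-3 take mask byte 0 and lanes 4-7 take mask
 * byte 1. alpha_selector then broadcasts each pixel's own alpha byte (3 and
 * 7) across its four channels, as in the other narrow-width paths.
 */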
1008 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
1010 /* bail out if fully transparent */
1015 sval2 = vreinterpret_u8_u32 (vdup_n_u32 (src));
1016 sval8.val[0] = vdup_lane_u8 (sval2, 0);
1017 sval8.val[1] = vdup_lane_u8 (sval2, 1);
1018 sval8.val[2] = vdup_lane_u8 (sval2, 2);
1019 sval8.val[3] = vdup_lane_u8 (sval2, 3);
1021 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1022 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1026 /* Use overlapping 8-pixel method, modified to avoid
1027 * rewritten dest being reused
1031 uint32_t *keep_dst = 0;
1034 dst_line += dst_stride;
1036 mask_line += mask_stride;
1039 #ifndef USE_GCC_INLINE_ASM
1041 uint8x8x4_t dval, temp;
1043 alpha = vld1_u8 ((void*)mask);
1044 dval = vld4_u8 ((void*)dst);
1047 temp = neon8mul (sval8, alpha);
1048 dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
1049 temp = neon8qadd (temp, dval);
1057 alpha = vld1_u8 ((void*)mask);
1058 dval = vld4_u8 ((void*)dst);
1060 vst4_u8 ((void*)keep_dst, temp);
1063 temp = neon8mul (sval8, alpha);
1064 dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
1065 temp = neon8qadd (temp, dval);
1071 vst4_u8 ((void*)keep_dst, temp);
1074 "vdup.32 d0, %[src]\n\t"
1075 "vdup.8 d1, d0[1]\n\t"
1076 "vdup.8 d2, d0[2]\n\t"
1077 "vdup.8 d3, d0[3]\n\t"
1078 "vdup.8 d0, d0[0]\n\t"
1080 "vld4.8 {d4-d7}, [%[dst]]\n\t"
1081 "vld1.8 {d31}, [%[mask]]\n\t"
1082 "mov %[keep_dst], %[dst]\n\t"
1084 "and ip, %[w], #7\n\t"
1085 "add %[mask], %[mask], ip\n\t"
1086 "add %[dst], %[dst], ip, LSL#2\n\t"
1087 "subs %[w], %[w], ip\n\t"
1091 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
1092 "vld1.8 {d31}, [%[mask]]!\n\t"
1093 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
1094 "sub %[keep_dst], %[dst], #8*4\n\t"
1095 "subs %[w], %[w], #8\n\t"
1098 "vmull.u8 q10, d31, d0\n\t"
1099 "vmull.u8 q11, d31, d1\n\t"
1100 "vmull.u8 q12, d31, d2\n\t"
1101 "vmull.u8 q13, d31, d3\n\t"
1102 "vrshr.u16 q8, q10, #8\n\t"
1103 "vrshr.u16 q9, q11, #8\n\t"
1104 "vraddhn.u16 d20, q10, q8\n\t"
1105 "vraddhn.u16 d21, q11, q9\n\t"
1106 "vrshr.u16 q9, q13, #8\n\t"
1107 "vrshr.u16 q8, q12, #8\n\t"
1108 "vraddhn.u16 d23, q13, q9\n\t"
1109 "vraddhn.u16 d22, q12, q8\n\t"
1111 "vmvn.8 d30, d23\n\t"
1112 "vmull.u8 q12, d30, d4\n\t"
1113 "vmull.u8 q13, d30, d5\n\t"
1114 "vmull.u8 q14, d30, d6\n\t"
1115 "vmull.u8 q15, d30, d7\n\t"
1117 "vrshr.u16 q8, q12, #8\n\t"
1118 "vrshr.u16 q9, q13, #8\n\t"
1119 "vraddhn.u16 d4, q12, q8\n\t"
1120 "vrshr.u16 q8, q14, #8\n\t"
1121 "vraddhn.u16 d5, q13, q9\n\t"
1122 "vrshr.u16 q9, q15, #8\n\t"
1123 "vraddhn.u16 d6, q14, q8\n\t"
1124 "vraddhn.u16 d7, q15, q9\n\t"
1125 /* result in d4-d7 */
1127 "vqadd.u8 d20, d4, d20\n\t"
1128 "vqadd.u8 d21, d5, d21\n\t"
1129 "vqadd.u8 d22, d6, d22\n\t"
1130 "vqadd.u8 d23, d7, d23\n\t"
1135 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
1137 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
1139 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1140 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
1153 dst_line += dst_stride;
1155 mask_line += mask_stride;
1160 uint8x8_t dval, temp, res;
1163 vreinterpret_u8_u16 (vld1_dup_u16 ((void*)mask)), mask_selector);
1164 dval = vld1_u8 ((void*)dst);
1166 temp = neon2mul (sval2, alpha);
1168 temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));
1170 vst1_u8 ((void*)dst, res);
1179 uint8x8_t dval, temp, res;
1181 alpha = vtbl1_u8 (vld1_dup_u8 ((void*)mask), mask_selector);
1182 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)dst));
1184 temp = neon2mul (sval2, alpha);
1186 temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));
1188 vst1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (res), 0);
1195 neon_composite_add_8888_8_8 (pixman_implementation_t * impl,
1197 pixman_image_t * src_image,
1198 pixman_image_t * mask_image,
1199 pixman_image_t * dst_image,
1209 uint8_t *dst_line, *dst;
1210 uint8_t *mask_line, *mask;
1211 int dst_stride, mask_stride;
1216 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
1217 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1218 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
1219 sa = vdup_n_u8 ((src) >> 24);
1223 /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
1227 dst_line += dst_stride;
1229 mask_line += mask_stride;
1232 uint8x8_t mval, dval, res;
1235 mval = vld1_u8 ((void *)mask);
1236 dval = vld1_u8 ((void *)dst);
1239 res = vqadd_u8 (neon2mul (mval, sa), dval);
1247 mval = vld1_u8 ((void *)mask);
1248 dval = vld1_u8 ((void *)dst);
1249 vst1_u8 ((void *)keep_dst, res);
1252 res = vqadd_u8 (neon2mul (mval, sa), dval);
1258 vst1_u8 ((void *)keep_dst, res);
1263 /* Use 4/2/1 load/store method to handle 1-7 pixels */
1267 dst_line += dst_stride;
1269 mask_line += mask_stride;
1272 uint8x8_t mval = sa, dval = sa, res;
1273 uint8_t *dst4 = 0, *dst2 = 0;
1277 mval = vreinterpret_u8_u32 (
1278 vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (mval), 1));
1279 dval = vreinterpret_u8_u32 (
1280 vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1));
1289 mval = vreinterpret_u8_u16 (
1290 vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (mval), 1));
1291 dval = vreinterpret_u8_u16 (
1292 vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1));
1300 mval = vld1_lane_u8 (mask, mval, 1);
1301 dval = vld1_lane_u8 (dst, dval, 1);
1304 res = vqadd_u8 (neon2mul (mval, sa), dval);
1307 vst1_lane_u8 (dst, res, 1);
1309 vst1_lane_u16 ((void *)dst2, vreinterpret_u16_u8 (res), 1);
1311 vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (res), 1);
1316 #ifdef USE_GCC_INLINE_ASM
1319 neon_composite_src_16_16 (pixman_implementation_t * impl,
1321 pixman_image_t * src_image,
1322 pixman_image_t * mask_image,
1323 pixman_image_t * dst_image,
1333 uint16_t *dst_line, *src_line;
1334 uint32_t dst_stride, src_stride;
1336 if (!height || !width)
1339 /* We simply copy 16-bit-aligned pixels from one place to another. */
1340 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
1341 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1343 /* Preload the first input scanline */
1345 uint16_t *src_ptr = src_line;
1346 uint32_t count = width;
1350 " subs %[count], %[count], #32 \n"
1352 " add %[src], %[src], #64 \n"
1355 /* Clobbered input registers marked as input/outputs */
1356 : [src] "+r" (src_ptr), [count] "+r" (count)
1357 : /* no unclobbered inputs */
1364 uint16_t *dst_ptr = dst_line;
1365 uint16_t *src_ptr = src_line;
1366 uint32_t count = width;
/* Uses multi-register access and preloading to maximise bandwidth.
 * Each pixel is one halfword, so a quadword contains 8px.
 * Preload frequency assumes a 64-byte cache line.
 */
1374 " cmp %[count], #64 \n"
1375 " blt 1f @ skip oversized fragments \n"
1376 "0: @ start with eight quadwords at a time \n"
1377 /* preload from next scanline */
1378 " pld [%[src], %[src_stride], LSL #1] \n"
1379 " sub %[count], %[count], #64 \n"
1380 " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
1381 " vld1.16 {d20,d21,d22,d23}, [%[src]]! \n"
1382 /* preload from next scanline */
1383 " pld [%[src], %[src_stride], LSL #1] \n"
1384 " vld1.16 {d24,d25,d26,d27}, [%[src]]! \n"
1385 " vld1.16 {d28,d29,d30,d31}, [%[src]]! \n"
1386 " cmp %[count], #64 \n"
1387 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
1388 " vst1.16 {d20,d21,d22,d23}, [%[dst]]! \n"
1389 " vst1.16 {d24,d25,d26,d27}, [%[dst]]! \n"
1390 " vst1.16 {d28,d29,d30,d31}, [%[dst]]! \n"
1392 " cmp %[count], #0 \n"
1393 " beq 7f @ aligned fastpath \n"
1394 "1: @ four quadwords \n"
1395 " tst %[count], #32 \n"
1396 " beq 2f @ skip oversized fragment \n"
1397 /* preload from next scanline */
1398 " pld [%[src], %[src_stride], LSL #1] \n"
1399 " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
1400 " vld1.16 {d20,d21,d22,d23}, [%[src]]! \n"
1401 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
1402 " vst1.16 {d20,d21,d22,d23}, [%[dst]]! \n"
1403 "2: @ two quadwords \n"
1404 " tst %[count], #16 \n"
1405 " beq 3f @ skip oversized fragment \n"
1406 /* preload from next scanline */
1407 " pld [%[src], %[src_stride], LSL #1] \n"
1408 " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
1409 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
1410 "3: @ one quadword \n"
1411 " tst %[count], #8 \n"
1412 " beq 4f @ skip oversized fragment \n"
1413 " vld1.16 {d16,d17}, [%[src]]! \n"
1414 " vst1.16 {d16,d17}, [%[dst]]! \n"
1415 "4: @ one doubleword \n"
1416 " tst %[count], #4 \n"
1417 " beq 5f @ skip oversized fragment \n"
1418 " vld1.16 {d16}, [%[src]]! \n"
1419 " vst1.16 {d16}, [%[dst]]! \n"
1421 " tst %[count], #2 \n"
1422 " beq 6f @ skip oversized fragment \n"
1423 " ldr %[tmp], [%[src]], #4 \n"
1424 " str %[tmp], [%[dst]], #4 \n"
1425 "6: @ one halfword \n"
1426 " tst %[count], #1 \n"
1427 " beq 7f @ skip oversized fragment \n"
1428 " ldrh %[tmp], [%[src]] \n"
1429 " strh %[tmp], [%[dst]] \n"
1432 /* Clobbered input registers marked as input/outputs */
1433 : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr),
1434 [count] "+r" (count), [tmp] "+r" (tmp)
1436 /* Unclobbered input */
1437 : [src_stride] "r" (src_stride)
1439 /* Clobbered vector registers */
1440 : "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
1441 "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc", "memory"
1444 src_line += src_stride;
1445 dst_line += dst_stride;
1449 #endif /* USE_GCC_INLINE_ASM */
1452 neon_composite_src_24_16 (pixman_implementation_t * impl,
1454 pixman_image_t * src_image,
1455 pixman_image_t * mask_image,
1456 pixman_image_t * dst_image,
1468 uint32_t dst_stride, src_stride;
1470 if (!width || !height)
1473 /* We simply copy pixels from one place to another,
1474 * assuming that the source's alpha is opaque.
1476 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1477 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1479 /* Preload the first input scanline */
1481 uint8_t *src_ptr = (uint8_t*) src_line;
1482 uint32_t count = (width + 15) / 16;
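    /* The preload loop below issues one pld per 64 bytes, i.e. one per 16
     * source pixels at 4 bytes each, hence the rounded-up count above (the
     * 64-byte cache-line size is the same assumption made in the 16bpp copy).
     */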
1484 #ifdef USE_GCC_INLINE_ASM
1487 " subs %[count], %[count], #1 \n"
1489 " add %[src], %[src], #64 \n"
1492 /* Clobbered input registers marked as input/outputs */
1493 : [src] "+r" (src_ptr), [count] "+r" (count)
1494 : /* no unclobbered inputs */
1509 uint16_t *dst_ptr = dst_line;
1510 uint32_t *src_ptr = src_line;
1511 uint32_t count = width;
1512 const uint32_t rb_mask = 0x1F;
1513 const uint32_t g_mask = 0x3F;
1515 /* If you're going to complain about a goto, take a long hard look
1516 * at the massive blocks of assembler this skips over. ;-)
1521 #ifdef USE_GCC_INLINE_ASM
1523 /* This is not as aggressive as the RGB565-source case.
1524 * Generally the source is in cached RAM when the formats are
1525 * different, so we use preload.
1527 * We don't need to blend, so we are not reading from the
1528 * uncached framebuffer.
1531 " cmp %[count], #16 \n"
1532 " blt 1f @ skip oversized fragments \n"
1533 "0: @ start with sixteen pixels at a time \n"
1534 " sub %[count], %[count], #16 \n"
1535 " pld [%[src], %[src_stride], lsl #2] @ preload from next scanline \n"
1536 " vld4.8 {d0,d1,d2,d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
1537 " vld4.8 {d4,d5,d6,d7}, [%[src]]! @ d7 is alpha and ignored, d6-4 are rgb. \n"
1538 " vshll.u8 q8, d2, #8 @ expand first red for repacking \n"
1539 " vshll.u8 q10, d1, #8 @ expand first green for repacking \n"
1540 " vshll.u8 q11, d0, #8 @ expand first blue for repacking \n"
1541 " vshll.u8 q9, d6, #8 @ expand second red for repacking \n"
1542 " vsri.u16 q8, q10, #5 @ insert first green after red \n"
1543 " vshll.u8 q10, d5, #8 @ expand second green for repacking \n"
1544 " vsri.u16 q8, q11, #11 @ insert first blue after green \n"
1545 " vshll.u8 q11, d4, #8 @ expand second blue for repacking \n"
1546 " vsri.u16 q9, q10, #5 @ insert second green after red \n"
1547 " vsri.u16 q9, q11, #11 @ insert second blue after green \n"
1548 " cmp %[count], #16 \n"
1549 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! @ store 16 pixels \n"
1551 "1: @ end of main loop \n"
1552 " cmp %[count], #8 @ can we still do an 8-pixel block? \n"
1554 " sub %[count], %[count], #8 \n"
1555 " pld [%[src], %[src_stride], lsl #2] @ preload from next scanline \n"
1556 " vld4.8 {d0,d1,d2,d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
1557 " vshll.u8 q8, d2, #8 @ expand first red for repacking \n"
1558 " vshll.u8 q10, d1, #8 @ expand first green for repacking \n"
1559 " vshll.u8 q11, d0, #8 @ expand first blue for repacking \n"
1560 " vsri.u16 q8, q10, #5 @ insert first green after red \n"
1561 " vsri.u16 q8, q11, #11 @ insert first blue after green \n"
1562 " vst1.16 {d16,d17}, [%[dst]]! @ store 8 pixels \n"
1565 /* Clobbered input and working registers marked as input/outputs */
1566 : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr), [count] "+r" (count)
1568 /* Unclobbered input */
1569 : [src_stride] "r" (src_stride)
1571 /* Clobbered vector registers */
1573 /* NB: these are the quad aliases of the
1574 * double registers used in the asm
1576 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17",
1577 "d18", "d19", "d20", "d21", "d22", "d23", "cc", "memory"
/* A copy of the above code, in intrinsics form. */
1583 uint8x8x4_t pixel_set_a, pixel_set_b;
1584 uint16x8_t red_a, green_a, blue_a;
1585 uint16x8_t red_b, green_b, blue_b;
1586 uint16x8_t dest_pixels_a, dest_pixels_b;
1589 __pld (src_ptr + src_stride);
1590 pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
1591 pixel_set_b = vld4_u8 ((uint8_t*)(src_ptr + 8));
1594 red_a = vshll_n_u8 (pixel_set_a.val[2], 8);
1595 green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
1596 blue_a = vshll_n_u8 (pixel_set_a.val[0], 8);
1598 red_b = vshll_n_u8 (pixel_set_b.val[2], 8);
1599 green_b = vshll_n_u8 (pixel_set_b.val[1], 8);
1600 blue_b = vshll_n_u8 (pixel_set_b.val[0], 8);
1602 dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
1603 dest_pixels_b = vsriq_n_u16 (red_b, green_b, 5);
1605 dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
1606 dest_pixels_b = vsriq_n_u16 (dest_pixels_b, blue_b, 11);
1608 /* There doesn't seem to be an intrinsic for the
1609 * double-quadword variant
1611 vst1q_u16 (dst_ptr, dest_pixels_a);
1612 vst1q_u16 (dst_ptr + 8, dest_pixels_b);
1619 uint8x8x4_t pixel_set_a;
1620 uint16x8_t red_a, green_a, blue_a;
1621 uint16x8_t dest_pixels_a;
1623 __pld (src_ptr + src_stride);
1625 pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
1628 red_a = vshll_n_u8 (pixel_set_a.val[2], 8);
1629 green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
1630 blue_a = vshll_n_u8 (pixel_set_a.val[0], 8);
1632 dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
1633 dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
1635 vst1q_u16 (dst_ptr, dest_pixels_a);
1639 #endif /* USE_GCC_INLINE_ASM */
1643 __pld (src_ptr + src_stride);
1647 uint32_t src_pixel_a = *src_ptr++;
1648 uint32_t src_pixel_b = *src_ptr++;
1650 /* ARM is really good at shift-then-ALU ops. */
1651 /* This should be a total of six shift-ANDs and five shift-ORs. */
1652 uint32_t dst_pixels_a;
1653 uint32_t dst_pixels_b;
1655 dst_pixels_a = ((src_pixel_a >> 3) & rb_mask);
1656 dst_pixels_a |= ((src_pixel_a >> 10) & g_mask) << 5;
1657 dst_pixels_a |= ((src_pixel_a >> 19) & rb_mask) << 11;
1659 dst_pixels_b = ((src_pixel_b >> 3) & rb_mask);
1660 dst_pixels_b |= ((src_pixel_b >> 10) & g_mask) << 5;
1661 dst_pixels_b |= ((src_pixel_b >> 19) & rb_mask) << 11;
1663 /* little-endian mode only */
1664 *((uint32_t*) dst_ptr) = dst_pixels_a | (dst_pixels_b << 16);
1671 uint32_t src_pixel = *src_ptr++;
1673 /* ARM is really good at shift-then-ALU ops.
1674 * This block should end up as three shift-ANDs
1675 * and two shift-ORs.
1677 uint32_t tmp_blue = (src_pixel >> 3) & rb_mask;
1678 uint32_t tmp_green = (src_pixel >> 10) & g_mask;
1679 uint32_t tmp_red = (src_pixel >> 19) & rb_mask;
1680 uint16_t dst_pixel = (tmp_red << 11) | (tmp_green << 5) | tmp_blue;
1682 *dst_ptr++ = dst_pixel;
1686 src_line += src_stride;
1687 dst_line += dst_stride;
1691 static pixman_bool_t
1692 pixman_fill_neon (uint32_t *bits,
1701 uint32_t byte_stride, color;
/* stride is always a multiple of 32-bit units in pixman */
1705 byte_stride = stride * sizeof(uint32_t);
1710 dst = ((char *) bits) + y * byte_stride + x;
1712 color = _xor << 24 | _xor << 16 | _xor << 8 | _xor;
1716 dst = ((char *) bits) + y * byte_stride + x * 2;
1718 color = _xor << 16 | _xor;
1719 width *= 2; /* width to bytes */
1723 dst = ((char *) bits) + y * byte_stride + x * 4;
1725 width *= 4; /* width to bytes */
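    /* For 8 and 16 bpp the pixel value is replicated so that the single
     * 32-bit 'color' (and the d0/q0 splat in the fillers below) covers every
     * depth, and 'width' has been converted from pixels to bytes for the
     * byte-oriented store loops.
     */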
1732 #ifdef USE_GCC_INLINE_ASM
/* We have a special case for small widths that don't allow
 * us to use wide 128-bit stores anyway. We don't waste time
 * trying to align writes, since there are only a few of them.
 */
1740 "cmp %[height], #0\n"/* Check if empty fill */
1742 "vdup.32 d0, %[color]\n"/* Fill the color to neon req */
/* Check if the width can be handled with a single store operation per
 * scanline. This significantly reduces the number of test/branch
 * instructions executed for each scanline.
 */
1748 "cmp %[width], #8\n"
1750 "cmp %[width], #4\n"
1752 "cmp %[width], #2\n"
1755 /* Loop starts here for each scanline */
1757 "mov r4, %[dst]\n" /* Starting address of the current line */
1758 "tst %[width], #8\n"
1760 "vst1.8 {d0}, [r4]!\n"
1762 "tst %[width], #4\n"
1764 "str %[color], [r4], #4\n"
1766 "tst %[width], #2\n"
1768 "strh %[color], [r4], #2\n"
1770 "tst %[width], #1\n"
1772 "strb %[color], [r4], #1\n"
1775 "subs %[height], %[height], #1\n"
1776 "add %[dst], %[dst], %[byte_stride]\n"
1780 /* Special fillers for those widths that we can do with single operation */
1782 "subs %[height], %[height], #1\n"
1783 "vst1.8 {d0}, [%[dst]]\n"
1784 "add %[dst], %[dst], %[byte_stride]\n"
1789 "subs %[height], %[height], #1\n"
1790 "str %[color], [%[dst]]\n"
1791 "add %[dst], %[dst], %[byte_stride]\n"
1796 "subs %[height], %[height], #1\n"
1797 "strh %[color], [%[dst]]\n"
1798 "add %[dst], %[dst], %[byte_stride]\n"
1803 : /* No output members */
1804 : [color] "r" (color), [height] "r" (height), [width] "r" (width),
1805 [dst] "r" (dst), [byte_stride] "r" (byte_stride)
1806 : "memory", "cc", "d0", "r4", "r5");
1811 "cmp %[height], #0\n"/* Check if empty fill */
1813 "vdup.32 q0, %[color]\n"/* Fill the color to neon req */
1815 /* Loop starts here for each scanline */
1817 "mov r4, %[dst]\n"/* Starting address of the current line */
1818 "mov r5, %[width]\n"/* We're going to write this many bytes */
1819 "ands r6, r4, #15\n"/* Are we at the 128-bit aligned address? */
1820 "beq 2f\n"/* Jump to the best case */
/* We're not 128-bit aligned. However, we know that we can get to the
 * next aligned location, since the fill is at least 16 bytes wide */
1824 "rsb r6, r6, #16\n" /* We would need to go forward this much */
1825 "sub r5, r5, r6\n"/* Update bytes left */
1828 "vst1.8 {d0[0]}, [r4]!\n"/* Store byte, now we are word aligned */
1832 "vst1.16 {d0[0]}, [r4, :16]!\n"/* Store half word, now we are 16-bit aligned */
1836 "vst1.32 {d0[0]}, [r4, :32]!\n"/* Store word, now we're 32-bit aligned */
1840 "vst1.64 {d0}, [r4, :64]!\n"/* Store qword now we're 64-bit aligned */
1842 /* The good case: We're 128-bit aligned for this scanline */
1844 "and r6, r5, #15\n"/* Number of tailing bytes */
1845 "cmp r5, r6\n"/* Do we have at least one qword to write? */
1846 "beq 6f\n"/* No, we just write the tail */
1847 "lsr r5, r5, #4\n"/* This many full qwords to write */
1849 /* The main block: Do 128-bit aligned writes */
1852 "vst1.64 {d0,d1}, [r4, :128]!\n"
/* Handle the trailing bytes: do 64, 32, 16 and 8-bit aligned writes as needed.
 * We know that we're currently at a 128-bit aligned address, so we can just
 * pick the biggest operations that the remaining write width allows */
1863 "vst1.64 {d0}, [r4, :64]!\n"
1867 "vst1.32 {d0[0]}, [r4, :32]!\n"
1871 "vst1.16 {d0[0]}, [r4, :16]!\n"
1875 "vst1.8 {d0[0]}, [r4]!\n"
1878 /* Handle the next scanline */
1879 "subs %[height], %[height], #1\n"
1880 "add %[dst], %[dst], %[byte_stride]\n"
1883 : /* No output members */
1884 : [color] "r" (color), [height] "r" (height), [width] "r" (width),
1885 [dst] "r" (dst), [byte_stride] "r" (byte_stride)
1886 : "memory", "cc", "q0", "d0", "d1", "r4", "r5", "r6");
1892 /* TODO: intrinsic version for armcc */
/* TODO: is a more generic way of doing this being introduced? */
1899 #define NEON_SCANLINE_BUFFER_PIXELS (1024)
1902 neon_quadword_copy (void* dst,
1904 uint32_t count, /* of quadwords */
1905 uint32_t trailer_count /* of bytes */)
1907 uint8_t *t_dst = dst, *t_src = src;
/* Uses aligned multi-register loads to maximise read bandwidth
 * on uncached memory such as framebuffers.
 * The accesses do not have the aligned qualifiers, so that the copy
 * may convert between aligned-uncached and unaligned-cached memory.
 * It is assumed that the CPU can infer alignment from the address.
 */
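/* Callers split a byte length 'len' as
 *     neon_quadword_copy (dst, src, len >> 4, len & 15);
 * i.e. 'count' whole quadwords followed by 'trailer_count' leftover bytes,
 * which is how the scanline-buffer paths below invoke it.
 */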
1916 #ifdef USE_GCC_INLINE_ASM
1919 " cmp %[count], #8 \n"
1920 " blt 1f @ skip oversized fragments \n"
1921 "0: @ start with eight quadwords at a time \n"
1922 " sub %[count], %[count], #8 \n"
1923 " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
1924 " vld1.8 {d20,d21,d22,d23}, [%[src]]! \n"
1925 " vld1.8 {d24,d25,d26,d27}, [%[src]]! \n"
1926 " vld1.8 {d28,d29,d30,d31}, [%[src]]! \n"
1927 " cmp %[count], #8 \n"
1928 " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
1929 " vst1.8 {d20,d21,d22,d23}, [%[dst]]! \n"
1930 " vst1.8 {d24,d25,d26,d27}, [%[dst]]! \n"
1931 " vst1.8 {d28,d29,d30,d31}, [%[dst]]! \n"
1933 "1: @ four quadwords \n"
1934 " tst %[count], #4 \n"
1935 " beq 2f @ skip oversized fragment \n"
1936 " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
1937 " vld1.8 {d20,d21,d22,d23}, [%[src]]! \n"
1938 " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
1939 " vst1.8 {d20,d21,d22,d23}, [%[dst]]! \n"
1940 "2: @ two quadwords \n"
1941 " tst %[count], #2 \n"
1942 " beq 3f @ skip oversized fragment \n"
1943 " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
1944 " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
1945 "3: @ one quadword \n"
1946 " tst %[count], #1 \n"
1947 " beq 4f @ skip oversized fragment \n"
1948 " vld1.8 {d16,d17}, [%[src]]! \n"
1949 " vst1.8 {d16,d17}, [%[dst]]! \n"
1952 /* Clobbered input registers marked as input/outputs */
1953 : [dst] "+r" (t_dst), [src] "+r" (t_src), [count] "+r" (count)
1955 /* No unclobbered inputs */
1958 /* Clobbered vector registers */
1959 : "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25",
1960 "d26", "d27", "d28", "d29", "d30", "d31", "cc", "memory");
1966 uint8x16x4_t t1 = vld4q_u8 (t_src);
1967 uint8x16x4_t t2 = vld4q_u8 (t_src + sizeof(uint8x16x4_t));
1969 t_src += sizeof(uint8x16x4_t) * 2;
1970 vst4q_u8 (t_dst, t1);
1971 vst4q_u8 (t_dst + sizeof(uint8x16x4_t), t2);
1972 t_dst += sizeof(uint8x16x4_t) * 2;
1978 uint8x16x4_t t1 = vld4q_u8 (t_src);
1980 t_src += sizeof(uint8x16x4_t);
1981 vst4q_u8 (t_dst, t1);
1982 t_dst += sizeof(uint8x16x4_t);
1987 uint8x8x4_t t1 = vld4_u8 (t_src);
1989 t_src += sizeof(uint8x8x4_t);
1990 vst4_u8 (t_dst, t1);
1991 t_dst += sizeof(uint8x8x4_t);
1996 uint8x16_t t1 = vld1q_u8 (t_src);
1998 t_src += sizeof(uint8x16_t);
1999 vst1q_u8 (t_dst, t1);
2000 t_dst += sizeof(uint8x16_t);
2003 #endif /* !USE_GCC_INLINE_ASM */
2007 if (trailer_count & 8)
2009 uint8x8_t t1 = vld1_u8 (t_src);
2011 t_src += sizeof(uint8x8_t);
2012 vst1_u8 (t_dst, t1);
2013 t_dst += sizeof(uint8x8_t);
2016 if (trailer_count & 4)
2018 *((uint32_t*) t_dst) = *((uint32_t*) t_src);
2024 if (trailer_count & 2)
2026 *((uint16_t*) t_dst) = *((uint16_t*) t_src);
2032 if (trailer_count & 1)
2034 *t_dst++ = *t_src++;
2040 solid_over_565_8_pix_neon (uint32_t glyph_colour,
2043 uint32_t dest_stride, /* bytes, not elements */
2044 uint32_t mask_stride,
2045 uint32_t count /* 8-pixel groups */)
2047 /* Inner loop of glyph blitter (solid colour, alpha mask) */
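/* Per channel this computes, approximately,
 *     result = (glyph_channel * m + dest_channel * (255 - m)) / 255
 * where m = mask_byte * glyph_alpha / 255, with the r5g6b5 destination
 * expanded to 8 bits per channel on load and repacked on store (the divides
 * are implemented as the usual shift-by-8 approximations).
 */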
2049 #ifdef USE_GCC_INLINE_ASM
2052 " vld4.8 {d20[],d21[],d22[],d23[]}, [%[glyph_colour]] @ splat solid colour components \n"
2054 " vld1.16 {d0,d1}, [%[dest]] @ load first pixels from framebuffer \n"
2055 " vld1.8 {d17}, [%[in_mask]] @ load alpha mask of glyph \n"
2056 " vmull.u8 q9, d17, d23 @ apply glyph colour alpha to mask \n"
2057 " vshrn.u16 d17, q9, #8 @ reformat it to match original mask \n"
2058 " vmvn d18, d17 @ we need the inverse mask for the background \n"
2059 " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
2060 " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
2061 " vshrn.u16 d4, q0, #3 @ unpack green \n"
2062 " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n"
2063 " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n"
2064 " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n"
2065 " vmull.u8 q1, d2, d18 @ apply inverse mask to background red... \n"
2066 " vmull.u8 q2, d4, d18 @ ...green... \n"
2067 " vmull.u8 q3, d6, d18 @ ...blue \n"
2068 " subs %[count], %[count], #1 @ decrement/test loop counter \n"
2069 " vmlal.u8 q1, d17, d22 @ add masked foreground red... \n"
2070 " vmlal.u8 q2, d17, d21 @ ...green... \n"
2071 " vmlal.u8 q3, d17, d20 @ ...blue \n"
2072 " add %[in_mask], %[in_mask], %[mask_stride] @ advance mask pointer, while we wait \n"
2073 " vsri.16 q1, q2, #5 @ pack green behind red \n"
2074 " vsri.16 q1, q3, #11 @ pack blue into pixels \n"
2075 " vst1.16 {d2,d3}, [%[dest]] @ store composited pixels \n"
2076 " add %[dest], %[dest], %[dest_stride] @ advance framebuffer pointer \n"
2077 " bne 0b @ next please \n"
2079 /* Clobbered registers marked as input/outputs */
2080 : [dest] "+r" (dest), [in_mask] "+r" (in_mask), [count] "+r" (count)
2083 : [dest_stride] "r" (dest_stride), [mask_stride] "r" (mask_stride), [glyph_colour] "r" (&glyph_colour)
2085 /* Clobbers, including the inputs we modify, and potentially lots of memory */
2086 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d17", "d18", "d19",
2087 "d20", "d21", "d22", "d23", "d24", "d25", "cc", "memory"
2092 uint8x8x4_t solid_colour = vld4_dup_u8 ((uint8_t*) &glyph_colour);
2096 uint16x8_t pixels = vld1q_u16 (dest);
2097 uint8x8_t mask = vshrn_n_u16 (vmull_u8 (solid_colour.val[3], vld1_u8 (in_mask)), 8);
2098 uint8x8_t mask_image = vmvn_u8 (mask);
2100 uint8x8_t t_red = vshrn_n_u16 (pixels, 8);
2101 uint8x8_t t_green = vshrn_n_u16 (pixels, 3);
	uint8x8_t t_blue  = vshrn_n_u16 (vsliq_n_u16 (pixels, pixels, 5), 2); /* q-form: 'pixels' is a uint16x8_t */
2104 uint16x8_t s_red = vmull_u8 (vsri_n_u8 (t_red, t_red, 5), mask_image);
2105 uint16x8_t s_green = vmull_u8 (vsri_n_u8 (t_green, t_green, 6), mask_image);
2106 uint16x8_t s_blue = vmull_u8 (t_blue, mask_image);
	s_red   = vmlal_u8 (s_red, mask, solid_colour.val[2]);
	s_green = vmlal_u8 (s_green, mask, solid_colour.val[1]);
	s_blue  = vmlal_u8 (s_blue, mask, solid_colour.val[0]);
2112 pixels = vsri_n_u16 (s_red, s_green, 5);
2113 pixels = vsri_n_u16 (pixels, s_blue, 11);
2114 vst1q_u16 (dest, pixels);
	dest = (uint16_t *)((uint8_t *)dest + dest_stride); /* dest_stride is in bytes */
	in_mask += mask_stride;
2123 #if 0 /* this is broken currently */
2125 neon_composite_over_n_8_0565 (pixman_implementation_t * impl,
2127 pixman_image_t * src_image,
2128 pixman_image_t * mask_image,
2129 pixman_image_t * dst_image,
2140 uint16_t *dst_line, *aligned_line;
2142 uint32_t dst_stride, mask_stride;
2143 uint32_t kernel_count, copy_count, copy_tail;
2144 uint8_t kernel_offset, copy_offset;
2146 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2148 /* bail out if fully transparent or degenerate */
2153 if (width == 0 || height == 0)
2156 if (width > NEON_SCANLINE_BUFFER_PIXELS)
2158 /* split the blit, so we can use a fixed-size scanline buffer
2159 * TODO: there must be a more elegant way of doing this.
2162 for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2164 neon_composite_over_n_8_0565 (
2166 src_image, mask_image, dst_image,
2167 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2168 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2174 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2175 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2177 /* keep within minimum number of aligned quadwords on width
2178 * while also keeping the minimum number of columns to process
2181 unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2182 unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2183 unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2185 /* the fast copy should be quadword aligned */
2186 copy_offset = dst_line - ((uint16_t*) aligned_left);
2187 aligned_line = dst_line - copy_offset;
2188 copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
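    /* Illustrative example (hypothetical addresses): with dst_line == 0x1002
     * and width == 100, aligned_left == 0x1000, aligned_right == 0x10d0 and
     * ceiling_length == 208, so copy_offset == 1 pixel, aligned_line ==
     * 0x1000 and copy_count == 13 quadwords; the copy then covers the
     * scanline plus at most a few pixels of slack at each end.
     */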
2191 if (aligned_right - aligned_left > ceiling_length)
2193 /* unaligned routine is tightest */
2194 kernel_count = (uint32_t) (ceiling_length >> 4);
2195 kernel_offset = copy_offset;
2199 /* aligned routine is equally tight, so it is safer to align */
2200 kernel_count = copy_count;
2204 /* We should avoid reading beyond scanline ends for safety */
2205 if (aligned_line < (dst_line - dest_x) ||
2206 (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2208 /* switch to precise read */
2209 copy_offset = kernel_offset = 0;
2210 aligned_line = dst_line;
2211 kernel_count = (uint32_t) (ceiling_length >> 4);
2212 copy_count = (width * sizeof(*dst_line)) >> 4;
2213 copy_tail = (width * sizeof(*dst_line)) & 0xF;
2218 uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */
2219 uint8_t glyph_line[NEON_SCANLINE_BUFFER_PIXELS + 8];
2222 /* row-major order */
2223 /* left edge, middle block, right edge */
2224 for ( ; y--; mask_line += mask_stride, aligned_line += dst_stride, dst_line += dst_stride)
2226 /* We don't want to overrun the edges of the glyph,
2227 * so realign the edge data into known buffers
2229 neon_quadword_copy (glyph_line + copy_offset, mask_line, width >> 4, width & 0xF);
2231 /* Uncached framebuffer access is really, really slow
2232 * if we do it piecemeal. It should be much faster if we
2233 * grab it all at once. One scanline should easily fit in
2234 * L1 cache, so this should not waste RAM bandwidth.
2236 neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2238 /* Apply the actual filter */
2239 solid_over_565_8_pix_neon (
2240 src, scan_line + kernel_offset,
2241 glyph_line + kernel_offset, 8 * sizeof(*dst_line),
2244 /* Copy the modified scanline back */
2245 neon_quadword_copy (dst_line, scan_line + copy_offset,
2246 width >> 3, (width & 7) * 2);
2252 #ifdef USE_GCC_INLINE_ASM
2255 plain_over_565_8_pix_neon (uint32_t colour,
2257 uint32_t dest_stride, /* bytes, not elements */
2258 uint32_t count /* 8-pixel groups */)
2260 /* Inner loop for plain translucent rects
2261 * (solid colour without alpha mask)
2264 " vld4.8 {d20[],d21[],d22[],d23[]}, [%[colour]] @ solid colour load/splat \n"
2265 " vmull.u8 q12, d23, d22 @ premultiply alpha red \n"
2266 " vmull.u8 q13, d23, d21 @ premultiply alpha green \n"
2267 " vmull.u8 q14, d23, d20 @ premultiply alpha blue \n"
2268 " vmvn d18, d23 @ inverse alpha for background \n"
2270 " vld1.16 {d0,d1}, [%[dest]] @ load first pixels from framebuffer \n"
2271 " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
2272 " vshrn.u16 d4, q0, #3 @ unpack green \n"
2273 " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
2274 " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n"
2275 " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n"
2276 " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n"
2277 " vmov q0, q12 @ retrieve foreground red \n"
2278 " vmlal.u8 q0, d2, d18 @ blend red - my kingdom for a four-operand MLA \n"
2279 " vmov q1, q13 @ retrieve foreground green \n"
2280 " vmlal.u8 q1, d4, d18 @ blend green \n"
2281 " vmov q2, q14 @ retrieve foreground blue \n"
2282 " vmlal.u8 q2, d6, d18 @ blend blue \n"
2283 " subs %[count], %[count], #1 @ decrement/test loop counter \n"
2284 " vsri.16 q0, q1, #5 @ pack green behind red \n"
2285 " vsri.16 q0, q2, #11 @ pack blue into pixels \n"
2286 " vst1.16 {d0,d1}, [%[dest]] @ store composited pixels \n"
2287 " add %[dest], %[dest], %[dest_stride] @ advance framebuffer pointer \n"
2288 " bne 0b @ next please \n"
2290 /* Clobbered registers marked as input/outputs */
2291 : [dest] "+r" (dest), [count] "+r" (count)
2294 : [dest_stride] "r" (dest_stride), [colour] "r" (&colour)
2296 /* Clobbers, including the inputs we modify, and
2297 * potentially lots of memory
2299 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d18", "d19",
2300 "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
2306 neon_composite_over_n_0565 (pixman_implementation_t * impl,
2308 pixman_image_t * src_image,
2309 pixman_image_t * mask_image,
2310 pixman_image_t * dst_image,
2321 uint16_t *dst_line, *aligned_line;
2322 uint32_t dst_stride;
2323 uint32_t kernel_count, copy_count, copy_tail;
2324 uint8_t kernel_offset, copy_offset;
2326 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2328 /* bail out if fully transparent */
2333 if (width == 0 || height == 0)
2336 if (width > NEON_SCANLINE_BUFFER_PIXELS)
2338 /* split the blit, so we can use a fixed-size scanline buffer *
2339 * TODO: there must be a more elegant way of doing this.
2343 for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2345 neon_composite_over_n_0565 (
2347 src_image, mask_image, dst_image,
2348 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2349 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2354 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2356 /* keep within minimum number of aligned quadwords on width
2357 * while also keeping the minimum number of columns to process
2360 unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2361 unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2362 unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2364 /* the fast copy should be quadword aligned */
2365 copy_offset = dst_line - ((uint16_t*) aligned_left);
2366 aligned_line = dst_line - copy_offset;
2367 copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
2370 if (aligned_right - aligned_left > ceiling_length)
2372 /* unaligned routine is tightest */
2373 kernel_count = (uint32_t) (ceiling_length >> 4);
2374 kernel_offset = copy_offset;
2378 /* aligned routine is equally tight, so it is safer to align */
2379 kernel_count = copy_count;
2383 /* We should avoid reading beyond scanline ends for safety */
2384 if (aligned_line < (dst_line - dest_x) ||
2385 (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2387 /* switch to precise read */
2388 copy_offset = kernel_offset = 0;
2389 aligned_line = dst_line;
2390 kernel_count = (uint32_t) (ceiling_length >> 4);
2391 copy_count = (width * sizeof(*dst_line)) >> 4;
2392 copy_tail = (width * sizeof(*dst_line)) & 0xF;
2397 uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */
2399 /* row-major order */
2400 /* left edge, middle block, right edge */
2401 for ( ; height--; aligned_line += dst_stride, dst_line += dst_stride)
2403 /* Uncached framebuffer access is really, really slow if we do it piecemeal.
2404 * It should be much faster if we grab it all at once.
2405 * One scanline should easily fit in L1 cache, so this should
2406 * not waste RAM bandwidth.
2408 neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2410 /* Apply the actual filter */
2411 plain_over_565_8_pix_neon (
2412 src, scan_line + kernel_offset, 8 * sizeof(*dst_line), kernel_count);
2414 /* Copy the modified scanline back */
2415 neon_quadword_copy (
2416 dst_line, scan_line + copy_offset, width >> 3, (width & 7) * 2);
2422 ARGB8_over_565_8_pix_neon (uint32_t *src,
2424 uint32_t src_stride, /* bytes, not elements */
2425 uint32_t count /* 8-pixel groups */)
2429 " pld [%[src], %[src_stride]] @ preload from next scanline \n"
2430 " vld1.16 {d0,d1}, [%[dest]] @ load pixels from framebuffer \n"
2431 " vld4.8 {d20,d21,d22,d23},[%[src]]! @ load source image pixels \n"
2432 " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
2433 " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
2434 " vshrn.u16 d4, q0, #3 @ unpack green \n"
2435 " vmvn d18, d23 @ we need the inverse alpha for the background \n"
2436 " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n"
2437 " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n"
2438 " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n"
2439 " vmull.u8 q1, d2, d18 @ apply inverse alpha to background red... \n"
2440 " vmull.u8 q2, d4, d18 @ ...green... \n"
2441 " vmull.u8 q3, d6, d18 @ ...blue \n"
2442 " subs %[count], %[count], #1 @ decrement/test loop counter \n"
2443 " vmlal.u8 q1, d23, d22 @ add blended foreground red... \n"
2444 " vmlal.u8 q2, d23, d21 @ ...green... \n"
2445 " vmlal.u8 q3, d23, d20 @ ...blue \n"
2446 " vsri.16 q1, q2, #5 @ pack green behind red \n"
2447 " vsri.16 q1, q3, #11 @ pack blue into pixels \n"
2448 " vst1.16 {d2,d3}, [%[dest]]! @ store composited pixels \n"
2449 " bne 0b @ next please \n"
2451 /* Clobbered registers marked as input/outputs */
2452 : [dest] "+r" (dest), [src] "+r" (src), [count] "+r" (count)
2455 : [src_stride] "r" (src_stride)
2457 /* Clobbers, including the inputs we modify, and potentially lots of memory */
2458 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d17", "d18", "d20",
2459 "d21", "d22", "d23", "cc", "memory"
2464 neon_composite_over_8888_0565 (pixman_implementation_t * impl,
2466 pixman_image_t * src_image,
2467 pixman_image_t * mask_image,
2468 pixman_image_t * dst_image,
2479 uint16_t *dst_line, *aligned_line;
2480 uint32_t dst_stride, src_stride;
2481 uint32_t kernel_count, copy_count, copy_tail;
2482 uint8_t kernel_offset, copy_offset;
2484 /* we assume mask is opaque
2485 * so the only alpha to deal with is embedded in src */
2487 if (width > NEON_SCANLINE_BUFFER_PIXELS)
2489 /* split the blit, so we can use a fixed-size scanline buffer */
2491 for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2493 neon_composite_over_8888_0565 (
2495 src_image, mask_image, dst_image,
2496 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2497 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2502 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2503 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2505 /* keep within minimum number of aligned quadwords on width
2506 * while also keeping the minimum number of columns to process */
2509 unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2510 unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2511 unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2513 /* the fast copy should be quadword aligned */
2514 copy_offset = dst_line - ((uint16_t*) aligned_left);
2515 aligned_line = dst_line - copy_offset;
2516 copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
2519 if (aligned_right - aligned_left > ceiling_length)
2521 /* unaligned routine is tightest */
2522 kernel_count = (uint32_t) (ceiling_length >> 4);
2523 kernel_offset = copy_offset;
2527 /* aligned routine is equally tight, so it is safer to align */
2528 kernel_count = copy_count;
2532 /* We should avoid reading beyond scanline ends for safety */
2533 if (aligned_line < (dst_line - dest_x) ||
2534 (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2536 /* switch to precise read */
2537 copy_offset = kernel_offset = 0;
2538 aligned_line = dst_line;
2539 kernel_count = (uint32_t) (ceiling_length >> 4);
2540 copy_count = (width * sizeof(*dst_line)) >> 4;
2541 copy_tail = (width * sizeof(*dst_line)) & 0xF;
2545 /* Preload the first input scanline */
2547 uint8_t *src_ptr = (uint8_t*) src_line;
2548 uint32_t count = (width + 15) / 16;
2550 #ifdef USE_GCC_INLINE_ASM
2553 " subs %[count], %[count], #1 \n"
2555 " add %[src], %[src], #64 \n"
2558 /* Clobbered input registers marked as input/outputs */
2559 : [src] "+r" (src_ptr), [count] "+r" (count)
2560 : /* no unclobbered inputs */
2574 uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */
2576 /* row-major order */
2577 /* left edge, middle block, right edge */
2578 for ( ; height--; src_line += src_stride, aligned_line += dst_stride)
2580 /* Uncached framebuffer access is really, really slow if we do
2581 * it piecemeal. It should be much faster if we grab it all at
2582 * once. One scanline should easily fit in L1 cache, so this
2583 * should not waste RAM bandwidth. */
2585 neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2587 /* Apply the actual filter */
2588 ARGB8_over_565_8_pix_neon (
2589 src_line, scan_line + kernel_offset,
2590 src_stride * sizeof(*src_line), kernel_count);
2592 /* Copy the modified scanline back */
2593 neon_quadword_copy (dst_line,
2594 scan_line + copy_offset,
2595 width >> 3, (width & 7) * 2);
2600 #endif /* USE_GCC_INLINE_ASM */
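/* Each entry below pairs a compositing operator with source, mask and
 * destination formats, the routine that implements that combination, and a
 * flags word; arm_neon_composite () below searches this table with
 * _pixman_run_fast_path () and delegates when nothing matches. */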
2602 static const pixman_fast_path_t arm_neon_fast_path_array[] =
2604 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, neon_composite_add_8888_8_8, 0 },
2605 { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, neon_composite_add_8000_8000, 0 },
2606 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, neon_composite_over_n_8_0565, 0 },
2607 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, neon_composite_over_n_8_0565, 0 },
2608 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_24_16, 0 },
2609 { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_24_16, 0 },
2610 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_24_16, 0 },
2611 { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_24_16, 0 },
2612 #ifdef USE_GCC_INLINE_ASM
2613 { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_16_16, 0 },
2614 { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_16_16, 0 },
2615 #if 0 /* this code has some bugs */
2616 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_over_n_0565, 0 },
2617 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_over_n_0565, 0 },
2618 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_over_8888_0565, 0 },
2619 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_over_8888_0565, 0 },
2622 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, neon_composite_over_8888_8888, 0 },
2623 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, neon_composite_over_8888_8888, 0 },
2624 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, neon_composite_over_8888_8888, 0 },
2625 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_over_8888_8888, 0 },
2626 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2627 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2628 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_composite_over_n_8_8888, 0 },
2629 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, neon_composite_over_n_8_8888, 0 },
2630 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, neon_composite_over_n_8_8888, 0 },
2631 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, neon_composite_over_n_8_8888, 0 },
2635 const pixman_fast_path_t *const arm_neon_fast_paths = arm_neon_fast_path_array;
2638 arm_neon_composite (pixman_implementation_t *imp,
2640 pixman_image_t * src,
2641 pixman_image_t * mask,
2642 pixman_image_t * dest,
2652 if (_pixman_run_fast_path (arm_neon_fast_paths, imp,
2653 op, src, mask, dest,
2662 _pixman_implementation_composite (imp->delegate, op,
2670 static pixman_bool_t
2671 pixman_blt_neon (void *src_bits,
2684 if (!width || !height)
2687 /* accelerate only straight copies involving complete bytes */
2688 if (src_bpp != dst_bpp || (src_bpp & 7))
2692 uint32_t bytes_per_pixel = src_bpp >> 3;
2693 uint32_t byte_width = width * bytes_per_pixel;
2694 /* stride parameters are in 32-bit words (pixman's rowstride unit), not bytes */
2695 int32_t src_stride_bytes = src_stride * 4;
2696 int32_t dst_stride_bytes = dst_stride * 4;
2697 uint8_t *src_bytes = ((uint8_t*) src_bits) +
2698 src_y * src_stride_bytes + src_x * bytes_per_pixel;
2699 uint8_t *dst_bytes = ((uint8_t*) dst_bits) +
2700 dst_y * dst_stride_bytes + dst_x * bytes_per_pixel;
2701 uint32_t quadword_count = byte_width / 16;
2702 uint32_t offset = byte_width % 16;
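/* Worked example (illustrative): a 50-pixel-wide blit at 16 bpp has
 * byte_width == 100, so each scanline is moved as quadword_count == 6
 * full quadwords plus a 4-byte tail. In plain C the per-scanline copy is
 * equivalent to (a sketch of what neon_quadword_copy achieves):
 *     memcpy (dst_bytes, src_bytes, quadword_count * 16 + offset);
 */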
2706 neon_quadword_copy (dst_bytes, src_bytes, quadword_count, offset);
2707 src_bytes += src_stride_bytes;
2708 dst_bytes += dst_stride_bytes;
2715 static pixman_bool_t
2716 arm_neon_blt (pixman_implementation_t *imp,
2717 uint32_t * src_bits,
2718 uint32_t * dst_bits,
2730 if (pixman_blt_neon (
2731 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2732 src_x, src_y, dst_x, dst_y, width, height))
2737 return _pixman_implementation_blt (
2739 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2740 src_x, src_y, dst_x, dst_y, width, height);
2743 static pixman_bool_t
2744 arm_neon_fill (pixman_implementation_t *imp,
2754 if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor))
2757 return _pixman_implementation_fill (
2758 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
2761 pixman_implementation_t *
2762 _pixman_implementation_create_arm_neon (void)
2764 pixman_implementation_t *simd = _pixman_implementation_create_arm_simd ();
2765 pixman_implementation_t *imp = _pixman_implementation_create (simd);
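/* Whatever this NEON implementation does not override, or any composite
 * request for which no fast path matches, falls through to the plain ARM
 * SIMD implementation created above via imp->delegate. */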
2767 imp->composite = arm_neon_composite;
2768 #if 0 /* this code has some bugs */
2769 imp->blt = arm_neon_blt;
2771 imp->fill = arm_neon_fill;