/*
 * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of ARM Ltd not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  ARM Ltd makes no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Ian Rickards (ian.rickards@arm.com)
 * Author:  Jonathan Morton (jonathan.morton@movial.com)
 * Author:  Markku Vire (markku.vire@movial.com)
 *
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <arm_neon.h>
#include <string.h>
#include "pixman-private.h"

/* Deal with an intrinsic that is defined differently in GCC */
#if !defined(__ARMCC_VERSION) && !defined(__pld)
#define __pld(_x) __builtin_prefetch (_x)
#endif

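/*
 * unpack0565 expands eight packed r5g6b5 pixels into planar 8-bit
 * channels (val[0] = blue, val[1] = green, val[2] = red, val[3] = 0).
 * Each field is shifted up to the top of its byte and its own high
 * bits are then re-inserted below it with vsri, so that e.g. a 5-bit
 * 0x1f expands to 0xff rather than 0xf8.
 */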
static force_inline uint8x8x4_t
unpack0565 (uint16x8_t rgb)
{
    uint16x8_t gb, b;
    uint8x8x4_t res;

    res.val[3] = vdup_n_u8 (0);
    gb = vshrq_n_u16 (rgb, 5);
    b = vshrq_n_u16 (rgb, 5 + 6);

    res.val[0] = vmovn_u16 (rgb);  /* get low 5 bits */
    res.val[1] = vmovn_u16 (gb);   /* get mid 6 bits */
    res.val[2] = vmovn_u16 (b);    /* get top 5 bits */

    res.val[0] = vshl_n_u8 (res.val[0], 3); /* shift to top */
    res.val[1] = vshl_n_u8 (res.val[1], 2); /* shift to top */
    res.val[2] = vshl_n_u8 (res.val[2], 3); /* shift to top */

    res.val[0] = vsri_n_u8 (res.val[0], res.val[0], 5);
    res.val[1] = vsri_n_u8 (res.val[1], res.val[1], 6);
    res.val[2] = vsri_n_u8 (res.val[2], res.val[2], 5);

    return res;
}

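/*
 * pack0565 is the inverse of unpack0565: the planar blue, green and
 * red channels are widened into the top byte of each 16-bit lane and
 * then shifted into place with vsri, dropping the low bits of each
 * channel to form packed r5g6b5.
 */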
static force_inline uint16x8_t
pack0565 (uint8x8x4_t s)
{
    uint16x8_t rgb, val_g, val_r;

    rgb = vshll_n_u8 (s.val[2], 8);
    val_g = vshll_n_u8 (s.val[1], 8);
    val_r = vshll_n_u8 (s.val[0], 8);
    rgb = vsriq_n_u16 (rgb, val_g, 5);
    rgb = vsriq_n_u16 (rgb, val_r, 5 + 6);

    return rgb;
}

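/*
 * neon2mul multiplies two 8-bit channels and scales the 16-bit product
 * back to 8 bits, dividing by 255 with rounding instead of a plain
 * shift by 8.  The vrshr/vraddhn pair evaluates roughly
 *
 *     res = (t + ((t + 128) >> 8) + 128) >> 8,   t = x * alpha
 *
 * which is the usual "add the high byte back in" approximation of
 * t / 255.
 */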
static force_inline uint8x8_t
neon2mul (uint8x8_t x,
          uint8x8_t alpha)
{
    uint16x8_t tmp, tmp2;
    uint8x8_t res;

    tmp = vmull_u8 (x, alpha);
    tmp2 = vrshrq_n_u16 (tmp, 8);
    res = vraddhn_u16 (tmp, tmp2);

    return res;
}

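/*
 * neon8mul is the four-channel form of neon2mul: each of the b/g/r/a
 * planes in x is multiplied by the same alpha vector and renormalised.
 * The shifts and narrowing adds are interleaved across channels to
 * help hide instruction latency.
 */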
static force_inline uint8x8x4_t
neon8mul (uint8x8x4_t x,
          uint8x8_t   alpha)
{
    uint16x8x4_t tmp;
    uint8x8x4_t res;
    uint16x8_t qtmp1, qtmp2;

    tmp.val[0] = vmull_u8 (x.val[0], alpha);
    tmp.val[1] = vmull_u8 (x.val[1], alpha);
    tmp.val[2] = vmull_u8 (x.val[2], alpha);
    tmp.val[3] = vmull_u8 (x.val[3], alpha);

    qtmp1 = vrshrq_n_u16 (tmp.val[0], 8);
    qtmp2 = vrshrq_n_u16 (tmp.val[1], 8);
    res.val[0] = vraddhn_u16 (tmp.val[0], qtmp1);
    qtmp1 = vrshrq_n_u16 (tmp.val[2], 8);
    res.val[1] = vraddhn_u16 (tmp.val[1], qtmp2);
    qtmp2 = vrshrq_n_u16 (tmp.val[3], 8);
    res.val[2] = vraddhn_u16 (tmp.val[2], qtmp1);
    res.val[3] = vraddhn_u16 (tmp.val[3], qtmp2);

    return res;
}

static force_inline uint8x8x4_t
neon8qadd (uint8x8x4_t x,
           uint8x8x4_t y)
{
    uint8x8x4_t res;

    res.val[0] = vqadd_u8 (x.val[0], y.val[0]);
    res.val[1] = vqadd_u8 (x.val[1], y.val[1]);
    res.val[2] = vqadd_u8 (x.val[2], y.val[2]);
    res.val[3] = vqadd_u8 (x.val[3], y.val[3]);

    return res;
}

static void
neon_composite_add_8000_8000 (pixman_implementation_t * impl,
                              pixman_op_t               op,
                              pixman_image_t *          src_image,
                              pixman_image_t *          mask_image,
                              pixman_image_t *          dst_image,
                              int32_t                   src_x,
                              int32_t                   src_y,
                              int32_t                   mask_x,
                              int32_t                   mask_y,
                              int32_t                   dest_x,
                              int32_t                   dest_y,
                              int32_t                   width,
                              int32_t                   height)
{
    uint8_t     *dst_line, *dst;
    uint8_t     *src_line, *src;
    int dst_stride, src_stride;
    uint16_t w;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    if (width >= 8)
    {
        /* Use overlapping 8-pixel method */
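        /* The first 8-pixel block is loaded and combined but its store
         * is deferred; src/dst then advance by (width & 7) so that this
         * block and the next overlap.  Each iteration loads the next
         * eight pixels before storing the previous result, so every
         * load still sees unmodified destination data and the
         * overlapping pixels are simply computed twice with the same
         * result.
         */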
        while (height--)
        {
            uint8_t *keep_dst = 0;
            uint8x8_t sval, dval, temp;

            dst = dst_line;
            dst_line += dst_stride;
            src = src_line;
            src_line += src_stride;
            w = width;

#ifndef USE_GCC_INLINE_ASM
            sval = vld1_u8 ((void*)src);
            dval = vld1_u8 ((void*)dst);
            keep_dst = dst;

            temp = vqadd_u8 (dval, sval);

            src += (w & 7);
            dst += (w & 7);
            w -= (w & 7);

            while (w)
            {
                sval = vld1_u8 ((void*)src);
                dval = vld1_u8 ((void*)dst);

                vst1_u8 ((void*)keep_dst, temp);
                keep_dst = dst;

                temp = vqadd_u8 (dval, sval);

                src += 8;
                dst += 8;
                w -= 8;
            }

            vst1_u8 ((void*)keep_dst, temp);
#else
            asm volatile (
/* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
                "vld1.8  {d0}, [%[src]]\n\t"
                "vld1.8  {d4}, [%[dst]]\n\t"
                "mov     %[keep_dst], %[dst]\n\t"

                "and ip, %[w], #7\n\t"
                "add %[src], %[src], ip\n\t"
                "add %[dst], %[dst], ip\n\t"
                "subs %[w], %[w], ip\n\t"
                "b 9f\n\t"
/* LOOP */
                "2:\n\t"
                "vld1.8  {d0}, [%[src]]!\n\t"
                "vld1.8  {d4}, [%[dst]]!\n\t"
                "vst1.8  {d20}, [%[keep_dst]]\n\t"
                "sub     %[keep_dst], %[dst], #8\n\t"
                "subs %[w], %[w], #8\n\t"
                "9:\n\t"
                "vqadd.u8 d20, d0, d4\n\t"

                "bne 2b\n\t"

                "1:\n\t"
                "vst1.8  {d20}, [%[keep_dst]]\n\t"

                : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
                :
                : "ip", "cc", "memory", "d0", "d4",
                "d20"
                );
#endif
        }
    }
    else
    {
        const uint8_t nil = 0;
        const uint8x8_t vnil = vld1_dup_u8 (&nil);

        while (height--)
        {
            uint8x8_t sval = vnil, dval = vnil;
            uint8_t *dst4 = 0, *dst2 = 0;

            dst = dst_line;
            dst_line += dst_stride;
            src = src_line;
            src_line += src_stride;
            w = width;

            if (w & 4)
            {
                sval = vreinterpret_u8_u32 (
                    vld1_lane_u32 ((void*)src, vreinterpret_u32_u8 (sval), 1));
                dval = vreinterpret_u8_u32 (
                    vld1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (dval), 1));

                dst4 = dst;
                src += 4;
                dst += 4;
            }

            if (w & 2)
            {
                sval = vreinterpret_u8_u16 (
                    vld1_lane_u16 ((void*)src, vreinterpret_u16_u8 (sval), 1));
                dval = vreinterpret_u8_u16 (
                    vld1_lane_u16 ((void*)dst, vreinterpret_u16_u8 (dval), 1));

                dst2 = dst;
                src += 2;
                dst += 2;
            }

            if (w & 1)
            {
                sval = vld1_lane_u8 (src, sval, 1);
                dval = vld1_lane_u8 (dst, dval, 1);
            }

            dval = vqadd_u8 (dval, sval);

            if (w & 1)
                vst1_lane_u8 (dst, dval, 1);

            if (w & 2)
                vst1_lane_u16 ((void*)dst2, vreinterpret_u16_u8 (dval), 1);

            if (w & 4)
                vst1_lane_u32 ((void*)dst4, vreinterpret_u32_u8 (dval), 1);
        }
    }
}

static void
neon_composite_over_8888_8888 (pixman_implementation_t * impl,
                               pixman_op_t               op,
                               pixman_image_t *          src_image,
                               pixman_image_t *          mask_image,
                               pixman_image_t *          dst_image,
                               int32_t                   src_x,
                               int32_t                   src_y,
                               int32_t                   mask_x,
                               int32_t                   mask_y,
                               int32_t                   dest_x,
                               int32_t                   dest_y,
                               int32_t                   width,
                               int32_t                   height)
{
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    int dst_stride, src_stride;
    uint32_t w;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    if (width >= 8)
    {
        /* Use overlapping 8-pixel method */
        while (height--)
        {
            uint32_t *keep_dst = 0;
            uint8x8x4_t sval, dval, temp;

            dst = dst_line;
            dst_line += dst_stride;
            src = src_line;
            src_line += src_stride;
            w = width;

#ifndef USE_GCC_INLINE_ASM
            sval = vld4_u8 ((void*)src);
            dval = vld4_u8 ((void*)dst);
            keep_dst = dst;

            temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
            temp = neon8qadd (sval, temp);

            src += (w & 7);
            dst += (w & 7);
            w -= (w & 7);

            while (w)
            {
                sval = vld4_u8 ((void*)src);
                dval = vld4_u8 ((void*)dst);

                vst4_u8 ((void*)keep_dst, temp);
                keep_dst = dst;

                temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
                temp = neon8qadd (sval, temp);

                src += 8;
                dst += 8;
                w -= 8;
            }

            vst4_u8 ((void*)keep_dst, temp);
#else
            asm volatile (
/* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
                "vld4.8  {d0-d3}, [%[src]]\n\t"
                "vld4.8  {d4-d7}, [%[dst]]\n\t"
                "mov     %[keep_dst], %[dst]\n\t"

                "and ip, %[w], #7\n\t"
                "add %[src], %[src], ip, LSL#2\n\t"
                "add %[dst], %[dst], ip, LSL#2\n\t"
                "subs %[w], %[w], ip\n\t"
                "b 9f\n\t"
/* LOOP */
                "2:\n\t"
                "vld4.8  {d0-d3}, [%[src]]!\n\t"
                "vld4.8  {d4-d7}, [%[dst]]!\n\t"
                "vst4.8  {d20-d23}, [%[keep_dst]]\n\t"
                "sub     %[keep_dst], %[dst], #8*4\n\t"
                "subs %[w], %[w], #8\n\t"
                "9:\n\t"
                "vmvn.8  d31, d3\n\t"
                "vmull.u8 q10, d31, d4\n\t"
                "vmull.u8 q11, d31, d5\n\t"
                "vmull.u8 q12, d31, d6\n\t"
                "vmull.u8 q13, d31, d7\n\t"
                "vrshr.u16 q8, q10, #8\n\t"
                "vrshr.u16 q9, q11, #8\n\t"
                "vraddhn.u16 d20, q10, q8\n\t"
                "vraddhn.u16 d21, q11, q9\n\t"
                "vrshr.u16 q8, q12, #8\n\t"
                "vrshr.u16 q9, q13, #8\n\t"
                "vraddhn.u16 d22, q12, q8\n\t"
                "vraddhn.u16 d23, q13, q9\n\t"
/* result in d20-d23 */
                "vqadd.u8 d20, d0, d20\n\t"
                "vqadd.u8 d21, d1, d21\n\t"
                "vqadd.u8 d22, d2, d22\n\t"
                "vqadd.u8 d23, d3, d23\n\t"

                "bne 2b\n\t"

                "1:\n\t"
                "vst4.8  {d20-d23}, [%[keep_dst]]\n\t"

                : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
                :
                : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23"
                );
#endif
        }
    }
    else
    {
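        /* vtbl1 index vector (3,3,3,3,7,7,7,7): applied to a pair of
         * packed a8r8g8b8 pixels it broadcasts each pixel's alpha byte
         * across that pixel's four channels, yielding the per-channel
         * (255 - alpha) factor once the source has been inverted with
         * vmvn.
         */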
        uint8x8_t alpha_selector = vreinterpret_u8_u64 (
            vcreate_u64 (0x0707070703030303ULL));

        /* Handle width < 8 */
        while (height--)
        {
            dst = dst_line;
            dst_line += dst_stride;
            src = src_line;
            src_line += src_stride;
            w = width;

            while (w >= 2)
            {
                uint8x8_t sval, dval;

                /* two 32-bit pixels packed into D-reg; ad-hoc vectorization */
                sval = vreinterpret_u8_u32 (vld1_u32 ((void*)src));
                dval = vreinterpret_u8_u32 (vld1_u32 ((void*)dst));
                dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
                vst1_u8 ((void*)dst, vqadd_u8 (sval, dval));

                src += 2;
                dst += 2;
                w -= 2;
            }

            if (w)
            {
                uint8x8_t sval, dval;

                /* single 32-bit pixel in lane 0 */
                sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)src));  /* only interested in lane 0 */
                dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)dst));  /* only interested in lane 0 */
                dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
                vst1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
            }
        }
    }
}

static void
neon_composite_over_8888_n_8888 (pixman_implementation_t * impl,
                                 pixman_op_t               op,
                                 pixman_image_t *          src_image,
                                 pixman_image_t *          mask_image,
                                 pixman_image_t *          dst_image,
                                 int32_t                   src_x,
                                 int32_t                   src_y,
                                 int32_t                   mask_x,
                                 int32_t                   mask_y,
                                 int32_t                   dest_x,
                                 int32_t                   dest_y,
                                 int32_t                   width,
                                 int32_t                   height)
{
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    uint32_t mask;
    int dst_stride, src_stride;
    uint32_t w;
    uint8x8_t mask_alpha;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
    mask_alpha = vdup_n_u8 ((mask) >> 24);

    if (width >= 8)
    {
        /* Use overlapping 8-pixel method */
        while (height--)
        {
            dst = dst_line;
            dst_line += dst_stride;
            src = src_line;
            src_line += src_stride;
            w = width;

            uint32_t *keep_dst = 0;

#ifndef USE_GCC_INLINE_ASM
            uint8x8x4_t sval, dval, temp;

            sval = vld4_u8 ((void*)src);
            dval = vld4_u8 ((void*)dst);
            keep_dst = dst;

            sval = neon8mul (sval, mask_alpha);
            temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
            temp = neon8qadd (sval, temp);

            src += (w & 7);
            dst += (w & 7);
            w -= (w & 7);

            while (w)
            {
                sval = vld4_u8 ((void*)src);
                dval = vld4_u8 ((void*)dst);

                vst4_u8 ((void*)keep_dst, temp);
                keep_dst = dst;

                sval = neon8mul (sval, mask_alpha);
                temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
                temp = neon8qadd (sval, temp);

                src += 8;
                dst += 8;
                w -= 8;
            }
            vst4_u8 ((void*)keep_dst, temp);
#else
            asm volatile (
/* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
                "vdup.32      d30, %[mask]\n\t"
                "vdup.8       d30, d30[3]\n\t"

                "vld4.8       {d0-d3}, [%[src]]\n\t"
                "vld4.8       {d4-d7}, [%[dst]]\n\t"
                "mov  %[keep_dst], %[dst]\n\t"

                "and  ip, %[w], #7\n\t"
                "add  %[src], %[src], ip, LSL#2\n\t"
                "add  %[dst], %[dst], ip, LSL#2\n\t"
                "subs  %[w], %[w], ip\n\t"
                "b 9f\n\t"
/* LOOP */
                "2:\n\t"
                "vld4.8       {d0-d3}, [%[src]]!\n\t"
                "vld4.8       {d4-d7}, [%[dst]]!\n\t"
                "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
                "sub  %[keep_dst], %[dst], #8*4\n\t"
                "subs  %[w], %[w], #8\n\t"

                "9:\n\t"
                "vmull.u8     q10, d30, d0\n\t"
                "vmull.u8     q11, d30, d1\n\t"
                "vmull.u8     q12, d30, d2\n\t"
                "vmull.u8     q13, d30, d3\n\t"
                "vrshr.u16    q8, q10, #8\n\t"
                "vrshr.u16    q9, q11, #8\n\t"
                "vraddhn.u16  d0, q10, q8\n\t"
                "vraddhn.u16  d1, q11, q9\n\t"
                "vrshr.u16    q9, q13, #8\n\t"
                "vrshr.u16    q8, q12, #8\n\t"
                "vraddhn.u16  d3, q13, q9\n\t"
                "vraddhn.u16  d2, q12, q8\n\t"

                "vmvn.8       d31, d3\n\t"
                "vmull.u8     q10, d31, d4\n\t"
                "vmull.u8     q11, d31, d5\n\t"
                "vmull.u8     q12, d31, d6\n\t"
                "vmull.u8     q13, d31, d7\n\t"
                "vrshr.u16    q8, q10, #8\n\t"
                "vrshr.u16    q9, q11, #8\n\t"
                "vraddhn.u16  d20, q10, q8\n\t"
                "vrshr.u16    q8, q12, #8\n\t"
                "vraddhn.u16  d21, q11, q9\n\t"
                "vrshr.u16    q9, q13, #8\n\t"
                "vraddhn.u16  d22, q12, q8\n\t"
                "vraddhn.u16  d23, q13, q9\n\t"

/* result in d20-d23 */
                "vqadd.u8     d20, d0, d20\n\t"
                "vqadd.u8     d21, d1, d21\n\t"
                "vqadd.u8     d22, d2, d22\n\t"
                "vqadd.u8     d23, d3, d23\n\t"

                "bne  2b\n\t"

                "1:\n\t"
                "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"

                : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
                : [mask] "r" (mask)
                : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
                "d30", "d31"
                );
#endif
        }
    }
    else
    {
        uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));

        /* Handle width < 8 */
        while (height--)
        {
            dst = dst_line;
            dst_line += dst_stride;
            src = src_line;
            src_line += src_stride;
            w = width;

            while (w >= 2)
            {
                uint8x8_t sval, dval;

                sval = vreinterpret_u8_u32 (vld1_u32 ((void*)src));
                dval = vreinterpret_u8_u32 (vld1_u32 ((void*)dst));

                /* sval * const alpha_mul */
                sval = neon2mul (sval, mask_alpha);

                /* dval * 255-(src alpha) */
                dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));

                vst1_u8 ((void*)dst, vqadd_u8 (sval, dval));

                src += 2;
                dst += 2;
                w -= 2;
            }

            if (w)
            {
                uint8x8_t sval, dval;

                sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)src));
                dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)dst));

                /* sval * const alpha_mul */
                sval = neon2mul (sval, mask_alpha);

                /* dval * 255-(src alpha) */
                dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));

                vst1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
            }
        }
    }
}

static void
neon_composite_over_n_8_8888 (pixman_implementation_t * impl,
                              pixman_op_t               op,
                              pixman_image_t *          src_image,
                              pixman_image_t *          mask_image,
                              pixman_image_t *          dst_image,
                              int32_t                   src_x,
                              int32_t                   src_y,
                              int32_t                   mask_x,
                              int32_t                   mask_y,
                              int32_t                   dest_x,
                              int32_t                   dest_y,
                              int32_t                   width,
                              int32_t                   height)
{
    uint32_t src, srca;
    uint32_t    *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    uint32_t w;
    uint8x8_t sval2;
    uint8x8x4_t sval8;
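    /* vtbl1 index vectors: mask_selector (0,0,0,0,1,1,1,1) broadcasts
     * two 8-bit mask values across the channels of two pixels, and
     * alpha_selector (3,3,3,3,7,7,7,7) broadcasts each pixel's alpha
     * byte across its own four channels.
     */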
    uint8x8_t mask_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0101010100000000ULL));
    uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    /* bail out if fully transparent */
    srca = src >> 24;
    if (src == 0)
        return;

    sval2 = vreinterpret_u8_u32 (vdup_n_u32 (src));
    sval8.val[0] = vdup_lane_u8 (sval2, 0);
    sval8.val[1] = vdup_lane_u8 (sval2, 1);
    sval8.val[2] = vdup_lane_u8 (sval2, 2);
    sval8.val[3] = vdup_lane_u8 (sval2, 3);

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    if (width >= 8)
    {
        /* Use overlapping 8-pixel method, modified to avoid
         * rewritten dest being reused
         */
        while (height--)
        {
            uint32_t *keep_dst = 0;

            dst = dst_line;
            dst_line += dst_stride;
            mask = mask_line;
            mask_line += mask_stride;
            w = width;

#ifndef USE_GCC_INLINE_ASM
            uint8x8_t alpha;
            uint8x8x4_t dval, temp;

            alpha = vld1_u8 ((void*)mask);
            dval = vld4_u8 ((void*)dst);
            keep_dst = dst;

            temp = neon8mul (sval8, alpha);
            dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
            temp = neon8qadd (temp, dval);

            mask += (w & 7);
            dst += (w & 7);
            w -= (w & 7);

            while (w)
            {
                alpha = vld1_u8 ((void*)mask);
                dval = vld4_u8 ((void*)dst);

                vst4_u8 ((void*)keep_dst, temp);
                keep_dst = dst;

                temp = neon8mul (sval8, alpha);
                dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
                temp = neon8qadd (temp, dval);

                mask += 8;
                dst += 8;
                w -= 8;
            }
            vst4_u8 ((void*)keep_dst, temp);
#else
            asm volatile (
                "vdup.32      d0, %[src]\n\t"
                "vdup.8       d1, d0[1]\n\t"
                "vdup.8       d2, d0[2]\n\t"
                "vdup.8       d3, d0[3]\n\t"
                "vdup.8       d0, d0[0]\n\t"

                "vld4.8       {d4-d7}, [%[dst]]\n\t"
                "vld1.8       {d31}, [%[mask]]\n\t"
                "mov  %[keep_dst], %[dst]\n\t"

                "and  ip, %[w], #7\n\t"
                "add  %[mask], %[mask], ip\n\t"
                "add  %[dst], %[dst], ip, LSL#2\n\t"
                "subs  %[w], %[w], ip\n\t"
                "b 9f\n\t"
/* LOOP */
                "2:\n\t"
                "vld4.8       {d4-d7}, [%[dst]]!\n\t"
                "vld1.8       {d31}, [%[mask]]!\n\t"
                "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
                "sub  %[keep_dst], %[dst], #8*4\n\t"
                "subs  %[w], %[w], #8\n\t"
                "9:\n\t"

                "vmull.u8     q10, d31, d0\n\t"
                "vmull.u8     q11, d31, d1\n\t"
                "vmull.u8     q12, d31, d2\n\t"
                "vmull.u8     q13, d31, d3\n\t"
                "vrshr.u16    q8, q10, #8\n\t"
                "vrshr.u16    q9, q11, #8\n\t"
                "vraddhn.u16  d20, q10, q8\n\t"
                "vraddhn.u16  d21, q11, q9\n\t"
                "vrshr.u16    q9, q13, #8\n\t"
                "vrshr.u16    q8, q12, #8\n\t"
                "vraddhn.u16  d23, q13, q9\n\t"
                "vraddhn.u16  d22, q12, q8\n\t"

                "vmvn.8       d30, d23\n\t"
                "vmull.u8     q12, d30, d4\n\t"
                "vmull.u8     q13, d30, d5\n\t"
                "vmull.u8     q14, d30, d6\n\t"
                "vmull.u8     q15, d30, d7\n\t"

                "vrshr.u16    q8, q12, #8\n\t"
                "vrshr.u16    q9, q13, #8\n\t"
                "vraddhn.u16  d4, q12, q8\n\t"
                "vrshr.u16    q8, q14, #8\n\t"
                "vraddhn.u16  d5, q13, q9\n\t"
                "vrshr.u16    q9, q15, #8\n\t"
                "vraddhn.u16  d6, q14, q8\n\t"
                "vraddhn.u16  d7, q15, q9\n\t"
/* result in d4-d7 */

                "vqadd.u8     d20, d4, d20\n\t"
                "vqadd.u8     d21, d5, d21\n\t"
                "vqadd.u8     d22, d6, d22\n\t"
                "vqadd.u8     d23, d7, d23\n\t"

                "bne 2b\n\t"

                "1:\n\t"
                "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"

                : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
                : [src] "r" (src)
                : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
                "d30", "d31"
                );
#endif
        }
    }
    else
    {
        while (height--)
        {
            uint8x8_t alpha;

            dst = dst_line;
            dst_line += dst_stride;
            mask = mask_line;
            mask_line += mask_stride;
            w = width;

            while (w >= 2)
            {
                uint8x8_t dval, temp, res;

                alpha = vtbl1_u8 (
                    vreinterpret_u8_u16 (vld1_dup_u16 ((void*)mask)), mask_selector);
                dval = vld1_u8 ((void*)dst);

                temp = neon2mul (sval2, alpha);
                res = vqadd_u8 (
                    temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));

                vst1_u8 ((void*)dst, res);

                mask += 2;
                dst += 2;
                w -= 2;
            }

            if (w)
            {
                uint8x8_t dval, temp, res;

                alpha = vtbl1_u8 (vld1_dup_u8 ((void*)mask), mask_selector);
                dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)dst));

                temp = neon2mul (sval2, alpha);
                res = vqadd_u8 (
                    temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));

                vst1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (res), 0);
            }
        }
    }
}

static void
neon_composite_add_8888_8_8 (pixman_implementation_t * impl,
                             pixman_op_t               op,
                             pixman_image_t *          src_image,
                             pixman_image_t *          mask_image,
                             pixman_image_t *          dst_image,
                             int32_t                   src_x,
                             int32_t                   src_y,
                             int32_t                   mask_x,
                             int32_t                   mask_y,
                             int32_t                   dest_x,
                             int32_t                   dest_y,
                             int32_t                   width,
                             int32_t                   height)
{
    uint8_t     *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    uint32_t w;
    uint32_t src;
    uint8x8_t sa;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
    sa = vdup_n_u8 ((src) >> 24);

    if (width >= 8)
    {
        /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
        while (height--)
        {
            dst = dst_line;
            dst_line += dst_stride;
            mask = mask_line;
            mask_line += mask_stride;
            w = width;

            uint8x8_t mval, dval, res;
            uint8_t     *keep_dst;

            mval = vld1_u8 ((void *)mask);
            dval = vld1_u8 ((void *)dst);
            keep_dst = dst;

            res = vqadd_u8 (neon2mul (mval, sa), dval);

            mask += (w & 7);
            dst += (w & 7);
            w -= w & 7;

            while (w)
            {
                mval = vld1_u8 ((void *)mask);
                dval = vld1_u8 ((void *)dst);
                vst1_u8 ((void *)keep_dst, res);
                keep_dst = dst;

                res = vqadd_u8 (neon2mul (mval, sa), dval);

                mask += 8;
                dst += 8;
                w -= 8;
            }
            vst1_u8 ((void *)keep_dst, res);
        }
    }
    else
    {
        /* Use 4/2/1 load/store method to handle 1-7 pixels */
        while (height--)
        {
            dst = dst_line;
            dst_line += dst_stride;
            mask = mask_line;
            mask_line += mask_stride;
            w = width;

            uint8x8_t mval = sa, dval = sa, res;
            uint8_t *dst4 = 0, *dst2 = 0;

            if (w & 4)
            {
                mval = vreinterpret_u8_u32 (
                    vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (mval), 1));
                dval = vreinterpret_u8_u32 (
                    vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1));

                dst4 = dst;
                mask += 4;
                dst += 4;
            }

            if (w & 2)
            {
                mval = vreinterpret_u8_u16 (
                    vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (mval), 1));
                dval = vreinterpret_u8_u16 (
                    vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1));
                dst2 = dst;
                mask += 2;
                dst += 2;
            }

            if (w & 1)
            {
                mval = vld1_lane_u8 (mask, mval, 1);
                dval = vld1_lane_u8 (dst, dval, 1);
            }

            res = vqadd_u8 (neon2mul (mval, sa), dval);

            if (w & 1)
                vst1_lane_u8 (dst, res, 1);
            if (w & 2)
                vst1_lane_u16 ((void *)dst2, vreinterpret_u16_u8 (res), 1);
            if (w & 4)
                vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (res), 1);
        }
    }
}

#ifdef USE_GCC_INLINE_ASM

static void
neon_composite_src_16_16 (pixman_implementation_t * impl,
                          pixman_op_t               op,
                          pixman_image_t *          src_image,
                          pixman_image_t *          mask_image,
                          pixman_image_t *          dst_image,
                          int32_t                   src_x,
                          int32_t                   src_y,
                          int32_t                   mask_x,
                          int32_t                   mask_y,
                          int32_t                   dest_x,
                          int32_t                   dest_y,
                          int32_t                   width,
                          int32_t                   height)
{
    uint16_t    *dst_line, *src_line;
    uint32_t dst_stride, src_stride;

    if (!height || !width)
        return;

    /* We simply copy 16-bit-aligned pixels from one place to another. */
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    /* Preload the first input scanline */
    {
        uint16_t *src_ptr = src_line;
        uint32_t count = width;

        asm volatile (
            "0: @ loop                                                  \n"
            "   subs    %[count], %[count], #32                         \n"
            "   pld     [%[src]]                                        \n"
            "   add     %[src], %[src], #64                             \n"
            "   bgt 0b                                                  \n"

            /* Clobbered input registers marked as input/outputs */
            : [src] "+r" (src_ptr), [count] "+r" (count)
            :     /* no unclobbered inputs */
            : "cc"
            );
    }

    while (height--)
    {
        uint16_t *dst_ptr = dst_line;
        uint16_t *src_ptr = src_line;
        uint32_t count = width;
        uint32_t tmp = 0;

        /* Uses multi-register access and preloading to maximise bandwidth.
         * Each pixel is one halfword, so a quadword contains 8px.
         * The preload frequency assumes a 64-byte cacheline.
         */
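        /* The copy proceeds in descending power-of-two fragments: 64
         * pixels per iteration of the main loop, then optional 32-,
         * 16-, 8-, 4-, 2- and 1-pixel tails selected by testing the
         * remaining count, so any width is handled without a scalar
         * inner loop.
         */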
        asm volatile (
            "   cmp       %[count], #64                         \n"
            "   blt 1f    @ skip oversized fragments            \n"
            "0: @ start with eight quadwords at a time          \n"
            /* preload from next scanline */
            "   pld       [%[src], %[src_stride], LSL #1]       \n"
            "   sub       %[count], %[count], #64               \n"
            "   vld1.16   {d16,d17,d18,d19}, [%[src]]!          \n"
            "   vld1.16   {d20,d21,d22,d23}, [%[src]]!          \n"
            /* preload from next scanline */
            "   pld       [%[src], %[src_stride], LSL #1]       \n"
            "   vld1.16   {d24,d25,d26,d27}, [%[src]]!          \n"
            "   vld1.16   {d28,d29,d30,d31}, [%[src]]!          \n"
            "   cmp       %[count], #64                         \n"
            "   vst1.16   {d16,d17,d18,d19}, [%[dst]]!          \n"
            "   vst1.16   {d20,d21,d22,d23}, [%[dst]]!          \n"
            "   vst1.16   {d24,d25,d26,d27}, [%[dst]]!          \n"
            "   vst1.16   {d28,d29,d30,d31}, [%[dst]]!          \n"
            "   bge 0b                                          \n"
            "   cmp       %[count], #0                          \n"
            "   beq 7f    @ aligned fastpath                    \n"
            "1: @ four quadwords                                \n"
            "   tst       %[count], #32                         \n"
            "   beq 2f    @ skip oversized fragment             \n"
            /* preload from next scanline */
            "   pld       [%[src], %[src_stride], LSL #1]       \n"
            "   vld1.16   {d16,d17,d18,d19}, [%[src]]!          \n"
            "   vld1.16   {d20,d21,d22,d23}, [%[src]]!          \n"
            "   vst1.16   {d16,d17,d18,d19}, [%[dst]]!          \n"
            "   vst1.16   {d20,d21,d22,d23}, [%[dst]]!          \n"
            "2: @ two quadwords                                 \n"
            "   tst       %[count], #16                         \n"
            "   beq 3f    @ skip oversized fragment             \n"
            /* preload from next scanline */
            "   pld       [%[src], %[src_stride], LSL #1]       \n"
            "   vld1.16   {d16,d17,d18,d19}, [%[src]]!          \n"
            "   vst1.16   {d16,d17,d18,d19}, [%[dst]]!          \n"
            "3: @ one quadword                                  \n"
            "   tst       %[count], #8                          \n"
            "   beq 4f    @ skip oversized fragment             \n"
            "   vld1.16   {d16,d17}, [%[src]]!                  \n"
            "   vst1.16   {d16,d17}, [%[dst]]!                  \n"
            "4: @ one doubleword                                \n"
            "   tst       %[count], #4                          \n"
            "   beq 5f    @ skip oversized fragment             \n"
            "   vld1.16   {d16}, [%[src]]!                      \n"
            "   vst1.16   {d16}, [%[dst]]!                      \n"
            "5: @ one word                                      \n"
            "   tst       %[count], #2                          \n"
            "   beq 6f    @ skip oversized fragment             \n"
            "   ldr       %[tmp], [%[src]], #4                  \n"
            "   str       %[tmp], [%[dst]], #4                  \n"
            "6: @ one halfword                                  \n"
            "   tst       %[count], #1                          \n"
            "   beq 7f    @ skip oversized fragment             \n"
            "   ldrh      %[tmp], [%[src]]                      \n"
            "   strh      %[tmp], [%[dst]]                      \n"
            "7: @ end                                           \n"

            /* Clobbered input registers marked as input/outputs */
            : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr),
              [count] "+r" (count), [tmp] "+r" (tmp)

              /* Unclobbered input */
            : [src_stride] "r" (src_stride)

              /* Clobbered vector registers */

              /* NB: these are the quad aliases of the double
               * registers used in the asm
               */
            : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
            );

        src_line += src_stride;
        dst_line += dst_stride;
    }
}

#endif /* USE_GCC_INLINE_ASM */

static void
neon_composite_src_24_16 (pixman_implementation_t * impl,
                          pixman_op_t               op,
                          pixman_image_t *          src_image,
                          pixman_image_t *          mask_image,
                          pixman_image_t *          dst_image,
                          int32_t                   src_x,
                          int32_t                   src_y,
                          int32_t                   mask_x,
                          int32_t                   mask_y,
                          int32_t                   dest_x,
                          int32_t                   dest_y,
                          int32_t                   width,
                          int32_t                   height)
{
    uint16_t    *dst_line;
    uint32_t    *src_line;
    uint32_t dst_stride, src_stride;

    if (!width || !height)
        return;

    /* We simply copy pixels from one place to another,
     * assuming that the source's alpha is opaque.
     */
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    /* Preload the first input scanline */
    {
        uint8_t *src_ptr = (uint8_t*) src_line;
        uint32_t count = (width + 15) / 16;
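        /* One preload step per 64-byte cache line, i.e. 16 32-bit source pixels. */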

#ifdef USE_GCC_INLINE_ASM
        asm volatile (
            "0: @ loop                                          \n"
            "   subs    %[count], %[count], #1                  \n"
            "   pld     [%[src]]                                \n"
            "   add     %[src], %[src], #64                     \n"
            "   bgt 0b                                          \n"

            /* Clobbered input registers marked as input/outputs */
            : [src] "+r" (src_ptr), [count] "+r" (count)
            :     /* no unclobbered inputs */
            : "cc"
            );
#else
        do
        {
            __pld (src_ptr);
            src_ptr += 64;
        }
        while (--count);
#endif
    }

    while (height--)
    {
        uint16_t *dst_ptr = dst_line;
        uint32_t *src_ptr = src_line;
        uint32_t count = width;
        const uint32_t rb_mask = 0x1F;
        const uint32_t g_mask = 0x3F;

        /* If you're going to complain about a goto, take a long hard look
         * at the massive blocks of assembler this skips over.  ;-)
         */
        if (count < 8)
            goto small_stuff;

#ifdef USE_GCC_INLINE_ASM

        /* This is not as aggressive as the RGB565-source case.
         * Generally the source is in cached RAM when the formats are
         * different, so we use preload.
         *
         * We don't need to blend, so we are not reading from the
         * uncached framebuffer.
         */
        asm volatile (
            "   cmp       %[count], #16                         \n"
            "   blt 1f    @ skip oversized fragments            \n"
            "0: @ start with sixteen pixels at a time           \n"
            "   sub       %[count], %[count], #16               \n"
            "   pld      [%[src], %[src_stride], lsl #2]        @ preload from next scanline                    \n"
            "   vld4.8    {d0,d1,d2,d3}, [%[src]]!              @ d3 is alpha and ignored, d2-0 are rgb.        \n"
            "   vld4.8    {d4,d5,d6,d7}, [%[src]]!              @ d7 is alpha and ignored, d6-4 are rgb.        \n"
            "   vshll.u8  q8, d2, #8                            @ expand first red for repacking                \n"
            "   vshll.u8  q10, d1, #8                           @ expand first green for repacking              \n"
            "   vshll.u8  q11, d0, #8                           @ expand first blue for repacking               \n"
            "   vshll.u8  q9, d6, #8                            @ expand second red for repacking               \n"
            "   vsri.u16  q8, q10, #5                           @ insert first green after red                  \n"
            "   vshll.u8  q10, d5, #8                           @ expand second green for repacking             \n"
            "   vsri.u16  q8, q11, #11                          @ insert first blue after green                 \n"
            "   vshll.u8  q11, d4, #8                           @ expand second blue for repacking              \n"
            "   vsri.u16  q9, q10, #5                           @ insert second green after red                 \n"
            "   vsri.u16  q9, q11, #11                          @ insert second blue after green                \n"
            "   cmp       %[count], #16                         \n"
            "   vst1.16   {d16,d17,d18,d19}, [%[dst]]!          @ store 16 pixels                               \n"
            "   bge 0b                                          \n"
            "1: @ end of main loop                              \n"
            "   cmp       %[count], #8                          @ can we still do an 8-pixel block?             \n"
            "   blt 2f                                          \n"
            "   sub       %[count], %[count], #8                \n"
            "   pld      [%[src], %[src_stride], lsl #2]        @ preload from next scanline                    \n"
            "   vld4.8    {d0,d1,d2,d3}, [%[src]]!              @ d3 is alpha and ignored, d2-0 are rgb.        \n"
            "   vshll.u8  q8, d2, #8                            @ expand first red for repacking                \n"
            "   vshll.u8  q10, d1, #8                           @ expand first green for repacking              \n"
            "   vshll.u8  q11, d0, #8                           @ expand first blue for repacking               \n"
            "   vsri.u16  q8, q10, #5                           @ insert first green after red                  \n"
            "   vsri.u16  q8, q11, #11                          @ insert first blue after green                 \n"
            "   vst1.16   {d16,d17}, [%[dst]]!          @ store 8 pixels                                \n"
            "2: @ end                                           \n"

            /* Clobbered input and working registers marked as input/outputs */
            : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr), [count] "+r" (count)

              /* Unclobbered input */
            : [src_stride] "r" (src_stride)

              /* Clobbered vector registers */

              /* NB: these are the quad aliases of the
               * double registers used in the asm
               */
            : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "cc", "memory"
            );
#else
        /* A copy of the above code, in intrinsics-form. */
        while (count >= 16)
        {
            uint8x8x4_t pixel_set_a, pixel_set_b;
            uint16x8_t red_a, green_a, blue_a;
            uint16x8_t red_b, green_b, blue_b;
            uint16x8_t dest_pixels_a, dest_pixels_b;

            count -= 16;
            __pld (src_ptr + src_stride);
            pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
            pixel_set_b = vld4_u8 ((uint8_t*)(src_ptr + 8));
            src_ptr += 16;

            red_a   = vshll_n_u8 (pixel_set_a.val[2], 8);
            green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
            blue_a  = vshll_n_u8 (pixel_set_a.val[0], 8);

            red_b   = vshll_n_u8 (pixel_set_b.val[2], 8);
            green_b = vshll_n_u8 (pixel_set_b.val[1], 8);
            blue_b  = vshll_n_u8 (pixel_set_b.val[0], 8);

            dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
            dest_pixels_b = vsriq_n_u16 (red_b, green_b, 5);

            dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
            dest_pixels_b = vsriq_n_u16 (dest_pixels_b, blue_b, 11);

            /* There doesn't seem to be an intrinsic for the
             * double-quadword variant
             */
            vst1q_u16 (dst_ptr, dest_pixels_a);
            vst1q_u16 (dst_ptr + 8, dest_pixels_b);
            dst_ptr += 16;
        }

        /* 8-pixel loop */
        if (count >= 8)
        {
            uint8x8x4_t pixel_set_a;
            uint16x8_t red_a, green_a, blue_a;
            uint16x8_t dest_pixels_a;

            __pld (src_ptr + src_stride);
            count -= 8;
            pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
            src_ptr += 8;

            red_a   = vshll_n_u8 (pixel_set_a.val[2], 8);
            green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
            blue_a  = vshll_n_u8 (pixel_set_a.val[0], 8);

            dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
            dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);

            vst1q_u16 (dst_ptr, dest_pixels_a);
            dst_ptr += 8;
        }

#endif  /* USE_GCC_INLINE_ASM */

    small_stuff:
        if (count)
            __pld (src_ptr + src_stride);

        while (count >= 2)
        {
            uint32_t src_pixel_a = *src_ptr++;
            uint32_t src_pixel_b = *src_ptr++;

            /* ARM is really good at shift-then-ALU ops. */
            /* This should be a total of six shift-ANDs and five shift-ORs. */
            uint32_t dst_pixels_a;
            uint32_t dst_pixels_b;

            dst_pixels_a  = ((src_pixel_a >>  3) & rb_mask);
            dst_pixels_a |= ((src_pixel_a >> 10) &  g_mask) << 5;
            dst_pixels_a |= ((src_pixel_a >> 19) & rb_mask) << 11;

            dst_pixels_b  = ((src_pixel_b >>  3) & rb_mask);
            dst_pixels_b |= ((src_pixel_b >> 10) &  g_mask) << 5;
            dst_pixels_b |= ((src_pixel_b >> 19) & rb_mask) << 11;

            /* little-endian mode only */
            *((uint32_t*) dst_ptr) = dst_pixels_a | (dst_pixels_b << 16);
            dst_ptr += 2;
            count -= 2;
        }

        if (count)
        {
            uint32_t src_pixel = *src_ptr++;

            /* ARM is really good at shift-then-ALU ops.
             * This block should end up as three shift-ANDs
             * and two shift-ORs.
             */
            uint32_t tmp_blue  = (src_pixel >>  3) & rb_mask;
            uint32_t tmp_green = (src_pixel >> 10) & g_mask;
            uint32_t tmp_red   = (src_pixel >> 19) & rb_mask;
            uint16_t dst_pixel = (tmp_red << 11) | (tmp_green << 5) | tmp_blue;

            *dst_ptr++ = dst_pixel;
            count--;
        }

        src_line += src_stride;
        dst_line += dst_stride;
    }
}

static pixman_bool_t
pixman_fill_neon (uint32_t *bits,
                  int       stride,
                  int       bpp,
                  int       x,
                  int       y,
                  int       width,
                  int       height,
                  uint32_t  _xor)
{
    uint32_t byte_stride, color;
    char *dst;
    /* stride is always a multiple of 32-bit units in pixman */
    byte_stride = stride * sizeof(uint32_t);

    switch (bpp)
    {
    case 8:
        dst = ((char *) bits) + y * byte_stride + x;
        _xor &= 0xff;
        color = _xor << 24 | _xor << 16 | _xor << 8 | _xor;
        break;

    case 16:
        dst = ((char *) bits) + y * byte_stride + x * 2;
        _xor &= 0xffff;
        color = _xor << 16 | _xor;
        width *= 2;         /* width to bytes */
        break;

    case 32:
        dst = ((char *) bits) + y * byte_stride + x * 4;
        color = _xor;
        width *= 4;         /* width to bytes */
        break;

    default:
        return FALSE;
    }

#ifdef USE_GCC_INLINE_ASM
    if (width < 16)
    {
1404         /* Special case for small widths, which are too narrow for wide
1405          * 128-bit stores anyway. We don't waste time trying to align
1406          * writes, since there are only a few of them per scanline.
1407          */
1408         asm volatile (
1409             "cmp                %[height], #0\n"/* Check if empty fill */
1410             "beq                3f\n"
1411             "vdup.32    d0, %[color]\n" /* Fill the color into a NEON register */
1412
1413             /* Check whether the width can be handled with a single store
1414              * per scanline. This significantly reduces the number of
1415              * test/branch instructions for each scanline.
1416              */
1417             "cmp                %[width], #8\n"
1418             "beq                4f\n"
1419             "cmp                %[width], #4\n"
1420             "beq                5f\n"
1421             "cmp                %[width], #2\n"
1422             "beq                6f\n"
1423
1424             /* Loop starts here for each scanline */
1425             "1:\n"
1426             "mov                r4, %[dst]\n" /* Starting address of the current line */
1427             "tst                %[width], #8\n"
1428             "beq                2f\n"
1429             "vst1.8             {d0}, [r4]!\n"
1430             "2:\n"
1431             "tst                %[width], #4\n"
1432             "beq                2f\n"
1433             "str                %[color], [r4], #4\n"
1434             "2:\n"
1435             "tst                %[width], #2\n"
1436             "beq                2f\n"
1437             "strh               %[color], [r4], #2\n"
1438             "2:\n"
1439             "tst                %[width], #1\n"
1440             "beq                2f\n"
1441             "strb               %[color], [r4], #1\n"
1442             "2:\n"
1443
1444             "subs               %[height], %[height], #1\n"
1445             "add                %[dst], %[dst], %[byte_stride]\n"
1446             "bne                1b\n"
1447             "b          3f\n"
1448
1449             /* Special fillers for widths that we can do with a single store per scanline */
1450             "4:\n"
1451             "subs               %[height], %[height], #1\n"
1452             "vst1.8             {d0}, [%[dst]]\n"
1453             "add                %[dst], %[dst], %[byte_stride]\n"
1454             "bne                4b\n"
1455             "b          3f\n"
1456
1457             "5:\n"
1458             "subs               %[height], %[height], #1\n"
1459             "str                %[color], [%[dst]]\n"
1460             "add                %[dst], %[dst], %[byte_stride]\n"
1461             "bne                5b\n"
1462             "b          3f\n"
1463
1464             "6:\n"
1465             "subs               %[height], %[height], #1\n"
1466             "strh               %[color], [%[dst]]\n"
1467             "add                %[dst], %[dst], %[byte_stride]\n"
1468             "bne                6b\n"
1469
1470             "3:\n"
1471
1472             /* The asm modifies %[height] and %[dst], so they are read/write operands */
1473             : [height] "+r" (height), [dst] "+r" (dst)
1474             : [color] "r" (color), [width] "r" (width), [byte_stride] "r" (byte_stride)
1475             : "memory", "cc", "d0", "r4", "r5");
1476     }
1477     else
1478     {
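        /* General case: align the start of each scanline to a 16-byte
         * boundary with byte/halfword/word/doubleword stores, write the
         * bulk of the line as 128-bit aligned stores, then finish the
         * trailing bytes with progressively smaller stores.
         */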
1479         asm volatile (
1480             "cmp                %[height], #0\n"/* Check if empty fill */
1481             "beq                5f\n"
1482             "vdup.32    q0, %[color]\n" /* Fill the color into a NEON register */
1483
1484             /* Loop starts here for each scanline */
1485             "1:\n"
1486             "mov                r4, %[dst]\n"/* Starting address of the current line */
1487             "mov                r5, %[width]\n"/* We're going to write this many bytes */
1488             "ands               r6, r4, #15\n"/* Are we at a 128-bit aligned address? */
1489             "beq                2f\n"/* Jump to the best case */
1490
1491             /* We're not 128-bit aligned. However, we know that we can get to the
1492                next aligned location, since the fill is at least 16 bytes wide */
1493             "rsb                r6, r6, #16\n" /* We would need to go forward this much */
1494             "sub                r5, r5, r6\n"/* Update bytes left */
1495             "tst                r6, #1\n"
1496             "beq                6f\n"
1497             "vst1.8             {d0[0]}, [r4]!\n"/* Store a byte; now we are halfword aligned */
1498             "6:\n"
1499             "tst                r6, #2\n"
1500             "beq                6f\n"
1501             "vst1.16    {d0[0]}, [r4, :16]!\n"/* Store a halfword; now we are word aligned */
1502             "6:\n"
1503             "tst                r6, #4\n"
1504             "beq                6f\n"
1505             "vst1.32    {d0[0]}, [r4, :32]!\n"/* Store a word; now we are doubleword (64-bit) aligned */
1506             "6:\n"
1507             "tst                r6, #8\n"
1508             "beq                2f\n"
1509             "vst1.64    {d0}, [r4, :64]!\n"/* Store a doubleword; now we are quadword (128-bit) aligned */
1510
1511             /* The good case: We're 128-bit aligned for this scanline */
1512             "2:\n"
1513             "and                r6, r5, #15\n"/* Number of trailing bytes */
1514             "cmp                r5, r6\n"/* Do we have at least one qword to write? */
1515             "beq                6f\n"/* No, we just write the tail */
1516             "lsr                r5, r5, #4\n"/* This many full qwords to write */
1517
1518             /* The main block: Do 128-bit aligned writes */
1519             "3:\n"
1520             "subs               r5, r5, #1\n"
1521             "vst1.64    {d0,d1}, [r4, :128]!\n"
1522             "bne                3b\n"
1523
1524             /* Handle the trailing bytes: do 64, 32, 16 and 8-bit aligned writes as needed.
1525                We know that we're currently at a 128-bit aligned address, so we can just
1526                pick the biggest operations that the remaining write width allows */
1527             "6:\n"
1528             "cmp                r6, #0\n"
1529             "beq                4f\n"
1530             "tst                r6, #8\n"
1531             "beq                6f\n"
1532             "vst1.64    {d0}, [r4, :64]!\n"
1533             "6:\n"
1534             "tst                r6, #4\n"
1535             "beq                6f\n"
1536             "vst1.32    {d0[0]}, [r4, :32]!\n"
1537             "6:\n"
1538             "tst                r6, #2\n"
1539             "beq                6f\n"
1540             "vst1.16    {d0[0]}, [r4, :16]!\n"
1541             "6:\n"
1542             "tst                r6, #1\n"
1543             "beq                4f\n"
1544             "vst1.8             {d0[0]}, [r4]!\n"
1545             "4:\n"
1546
1547             /* Handle the next scanline */
1548             "subs               %[height], %[height], #1\n"
1549             "add                %[dst], %[dst], %[byte_stride]\n"
1550             "bne                1b\n"
1551             "5:\n"
1552             /* The asm modifies %[height] and %[dst], so they are read/write operands */
1553             : [height] "+r" (height), [dst] "+r" (dst)
1554             : [color] "r" (color), [width] "r" (width), [byte_stride] "r" (byte_stride)
1555             : "memory", "cc", "q0", "d0", "d1", "r4", "r5", "r6");
1556     }
1557     return TRUE;
1558
1559 #else
1560
1561     /* TODO: intrinsic version for armcc */
1562     return FALSE;
1563
1564 #endif
1565 }
1566
1567 /* TODO: is there a more generic way of doing this being introduced? */
1568 #define NEON_SCANLINE_BUFFER_PIXELS (1024)
1569
1570 static inline void
1571 neon_quadword_copy (void*    dst,
1572                     void*    src,
1573                     uint32_t count,         /* of quadwords */
1574                     uint32_t trailer_count  /* of bytes */)
1575 {
1576     uint8_t *t_dst = dst, *t_src = src;
1577
1578     /* Uses aligned multi-register loads to maximise read bandwidth
1579      * on uncached memory such as framebuffers.
1580      * The accesses do not have the aligned qualifiers, so that the copy
1581      * may convert between aligned-uncached and unaligned-cached memory.
1582      * It is assumed that the CPU can infer alignment from the address.
1583      */
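    /* For example, to copy "len" bytes between byte pointers one would call
     * neon_quadword_copy (dst, src, len >> 4, len & 15), as the callers
     * below do.
     */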
1584
1585 #ifdef USE_GCC_INLINE_ASM
1586
1587     asm volatile (
1588         "       cmp       %[count], #8                          \n"
1589         "       blt 1f    @ skip oversized fragments            \n"
1590         "0: @ start with eight quadwords at a time              \n"
1591         "       sub       %[count], %[count], #8                \n"
1592         "       vld1.8    {d16,d17,d18,d19}, [%[src]]!          \n"
1593         "       vld1.8    {d20,d21,d22,d23}, [%[src]]!          \n"
1594         "       vld1.8    {d24,d25,d26,d27}, [%[src]]!          \n"
1595         "       vld1.8    {d28,d29,d30,d31}, [%[src]]!          \n"
1596         "       cmp       %[count], #8                          \n"
1597         "       vst1.8    {d16,d17,d18,d19}, [%[dst]]!          \n"
1598         "       vst1.8    {d20,d21,d22,d23}, [%[dst]]!          \n"
1599         "       vst1.8    {d24,d25,d26,d27}, [%[dst]]!          \n"
1600         "       vst1.8    {d28,d29,d30,d31}, [%[dst]]!          \n"
1601         "       bge 0b                                          \n"
1602         "1: @ four quadwords                                    \n"
1603         "       tst       %[count], #4                          \n"
1604         "       beq 2f    @ skip oversized fragment             \n"
1605         "       vld1.8    {d16,d17,d18,d19}, [%[src]]!          \n"
1606         "       vld1.8    {d20,d21,d22,d23}, [%[src]]!          \n"
1607         "       vst1.8    {d16,d17,d18,d19}, [%[dst]]!          \n"
1608         "       vst1.8    {d20,d21,d22,d23}, [%[dst]]!          \n"
1609         "2: @ two quadwords                                     \n"
1610         "       tst       %[count], #2                          \n"
1611         "       beq 3f    @ skip oversized fragment             \n"
1612         "       vld1.8    {d16,d17,d18,d19}, [%[src]]!          \n"
1613         "       vst1.8    {d16,d17,d18,d19}, [%[dst]]!          \n"
1614         "3: @ one quadword                                      \n"
1615         "       tst       %[count], #1                          \n"
1616         "       beq 4f    @ skip oversized fragment             \n"
1617         "       vld1.8    {d16,d17}, [%[src]]!                  \n"
1618         "       vst1.8    {d16,d17}, [%[dst]]!                  \n"
1619         "4: @ end                                               \n"
1620
1621         /* Clobbered input registers marked as input/outputs */
1622         : [dst] "+r" (t_dst), [src] "+r" (t_src), [count] "+r" (count)
1623
1624           /* No unclobbered inputs */
1625         :
1626
1627         /* Clobbered vector registers */
1628         /* NB: these are the quad aliases of the double
1629          * registers used in the asm
1630          */
1631         : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory");
1632
1633 #else
1634
1635     while (count >= 8)
1636     {
1637         uint8x16x4_t t1 = vld4q_u8 (t_src);
1638         uint8x16x4_t t2 = vld4q_u8 (t_src + sizeof(uint8x16x4_t));
1639         
1640         t_src += sizeof(uint8x16x4_t) * 2;
1641         vst4q_u8 (t_dst, t1);
1642         vst4q_u8 (t_dst + sizeof(uint8x16x4_t), t2);
1643         t_dst += sizeof(uint8x16x4_t) * 2;
1644         count -= 8;
1645     }
1646
1647     if (count & 4)
1648     {
1649         uint8x16x4_t t1 = vld4q_u8 (t_src);
1650         
1651         t_src += sizeof(uint8x16x4_t);
1652         vst4q_u8 (t_dst, t1);
1653         t_dst += sizeof(uint8x16x4_t);
1654     }
1655
1656     if (count & 2)
1657     {
1658         uint8x8x4_t t1 = vld4_u8 (t_src);
1659         
1660         t_src += sizeof(uint8x8x4_t);
1661         vst4_u8 (t_dst, t1);
1662         t_dst += sizeof(uint8x8x4_t);
1663     }
1664
1665     if (count & 1)
1666     {
1667         uint8x16_t t1 = vld1q_u8 (t_src);
1668         
1669         t_src += sizeof(uint8x16_t);
1670         vst1q_u8 (t_dst, t1);
1671         t_dst += sizeof(uint8x16_t);
1672     }
1673
1674 #endif  /* !USE_GCC_INLINE_ASM */
1675
1676     if (trailer_count)
1677     {
1678         if (trailer_count & 8)
1679         {
1680             uint8x8_t t1 = vld1_u8 (t_src);
1681             
1682             t_src += sizeof(uint8x8_t);
1683             vst1_u8 (t_dst, t1);
1684             t_dst += sizeof(uint8x8_t);
1685         }
1686
1687         if (trailer_count & 4)
1688         {
1689             *((uint32_t*) t_dst) = *((uint32_t*) t_src);
1690             
1691             t_dst += 4;
1692             t_src += 4;
1693         }
1694
1695         if (trailer_count & 2)
1696         {
1697             *((uint16_t*) t_dst) = *((uint16_t*) t_src);
1698             
1699             t_dst += 2;
1700             t_src += 2;
1701         }
1702
1703         if (trailer_count & 1)
1704         {
1705             *t_dst++ = *t_src++;
1706         }
1707     }
1708 }
1709
1710 static inline void
1711 solid_over_565_8_pix_neon (uint32_t  glyph_colour,
1712                            uint16_t *dest,
1713                            uint8_t * in_mask,
1714                            uint32_t  dest_stride,    /* bytes, not elements */
1715                            uint32_t  mask_stride,
1716                            uint32_t  count           /* 8-pixel groups */)
1717 {
1718     /* Inner loop of glyph blitter (solid colour, alpha mask) */
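    /* Per 8-pixel group: expand the r5g6b5 destination to 8-bit channels,
     * scale the glyph alpha by the colour alpha, multiply the background by
     * the inverse of that mask, add the masked foreground colour, and repack
     * the result to r5g6b5.
     */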
1719
1720 #ifdef USE_GCC_INLINE_ASM
1721
1722     asm volatile (
1723         "       vld4.8 {d20[],d21[],d22[],d23[]}, [%[glyph_colour]]  @ splat solid colour components    \n"
1724         "0:     @ loop                                                                                                                                                          \n"
1725         "       vld1.16   {d0,d1}, [%[dest]]         @ load first pixels from framebuffer                       \n"
1726         "       vld1.8    {d17}, [%[in_mask]]         @ load alpha mask of glyph                                                \n"
1727         "       vmull.u8  q9, d17, d23               @ apply glyph colour alpha to mask                         \n"
1728         "       vshrn.u16 d17, q9, #8                @ reformat it to match original mask                       \n"
1729         "       vmvn      d18, d17                   @ we need the inverse mask for the background      \n"
1730         "       vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits                          \n"
1731         "       vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels                       \n"
1732         "       vshrn.u16 d4, q0, #3                 @ unpack green                                                                     \n"
1733         "       vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)                       \n"
1734         "       vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)          \n"
1735         "       vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)                     \n"
1736         "       vmull.u8  q1, d2, d18                @ apply inverse mask to background red...          \n"
1737         "       vmull.u8  q2, d4, d18                @ ...green...                                                                      \n"
1738         "       vmull.u8  q3, d6, d18                @ ...blue                                                                          \n"
1739         "       subs      %[count], %[count], #1     @ decrement/test loop counter                                      \n"
1740         "       vmlal.u8  q1, d17, d22               @ add masked foreground red...                                     \n"
1741         "       vmlal.u8  q2, d17, d21               @ ...green...                                                                      \n"
1742         "       vmlal.u8  q3, d17, d20               @ ...blue                                                                          \n"
1743         "       add %[in_mask], %[in_mask], %[mask_stride] @ advance mask pointer, while we wait                \n"
1744         "       vsri.16   q1, q2, #5                 @ pack green behind red                                            \n"
1745         "       vsri.16   q1, q3, #11                @ pack blue into pixels                                            \n"
1746         "       vst1.16   {d2,d3}, [%[dest]]         @ store composited pixels                                          \n"
1747         "       add %[dest], %[dest], %[dest_stride]  @ advance framebuffer pointer                                     \n"
1748         "       bne 0b                               @ next please                                                                      \n"
1749
1750         /* Clobbered registers marked as input/outputs */
1751         : [dest] "+r" (dest), [in_mask] "+r" (in_mask), [count] "+r" (count)
1752           
1753           /* Inputs */
1754         : [dest_stride] "r" (dest_stride), [mask_stride] "r" (mask_stride), [glyph_colour] "r" (&glyph_colour)
1755
1756           /* Clobbers, including the inputs we modify, and potentially lots of memory */
1757         : "q0", "q1", "q2", "q3", "d17", "q9", "q10", "q11", "q12", "cc", "memory"
1758         );
1759
1760 #else
1761
1762     uint8x8x4_t solid_colour = vld4_dup_u8 ((uint8_t*) &glyph_colour);
1763
1764     while (count--)
1765     {
1766         uint16x8_t pixels = vld1q_u16 (dest);
1767         uint8x8_t mask = vshrn_n_u16 (vmull_u8 (solid_colour.val[3], vld1_u8 (in_mask)), 8);
1768         uint8x8_t mask_image = vmvn_u8 (mask);
1769
1770         uint8x8_t t_red   = vshrn_n_u16 (pixels, 8);
1771         uint8x8_t t_green = vshrn_n_u16 (pixels, 3);
1772         uint8x8_t t_blue  = vshrn_n_u16 (vsliq_n_u16 (pixels, pixels, 5), 2);
1773
1774         uint16x8_t s_red   = vmull_u8 (vsri_n_u8 (t_red, t_red, 5), mask_image);
1775         uint16x8_t s_green = vmull_u8 (vsri_n_u8 (t_green, t_green, 6), mask_image);
1776         uint16x8_t s_blue  = vmull_u8 (t_blue, mask_image);
1777
1778         s_red   = vmlal_u8 (s_red, mask, solid_colour.val[2]);
1779         s_green = vmlal_u8 (s_green, mask, solid_colour.val[1]);
1780         s_blue  = vmlal_u8 (s_blue, mask, solid_colour.val[0]);
1781
1782         pixels = vsri_n_u16 (s_red, s_green, 5);
1783         pixels = vsri_n_u16 (pixels, s_blue, 11);
1784         vst1q_u16 (dest, pixels);
1785
1786         dest += dest_stride / sizeof(*dest);   /* dest_stride is in bytes */
1787         in_mask += mask_stride;
1788     }
1789
1790 #endif
1791 }
1792
1793 static void
1794 neon_composite_over_n_8_0565 (pixman_implementation_t * impl,
1795                               pixman_op_t               op,
1796                               pixman_image_t *          src_image,
1797                               pixman_image_t *          mask_image,
1798                               pixman_image_t *          dst_image,
1799                               int32_t                   src_x,
1800                               int32_t                   src_y,
1801                               int32_t                   mask_x,
1802                               int32_t                   mask_y,
1803                               int32_t                   dest_x,
1804                               int32_t                   dest_y,
1805                               int32_t                   width,
1806                               int32_t                   height)
1807 {
1808     uint32_t  src, srca;
1809     uint16_t *dst_line, *aligned_line;
1810     uint8_t  *mask_line;
1811     uint32_t  dst_stride, mask_stride;
1812     uint32_t  kernel_count, copy_count, copy_tail;
1813     uint8_t   kernel_offset, copy_offset;
1814
1815     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
1816
1817     /* bail out if fully transparent or degenerate */
1818     srca = src >> 24;
1819     if (src == 0)
1820         return;
1821
1822     if (width == 0 || height == 0)
1823         return;
1824
1825     if (width > NEON_SCANLINE_BUFFER_PIXELS)
1826     {
1827         /* split the blit, so we can use a fixed-size scanline buffer
1828          * TODO: there must be a more elegant way of doing this.
1829          */
1830         int x;
1831         for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
1832         {
1833             neon_composite_over_n_8_0565 (
1834                 impl, op,
1835                 src_image, mask_image, dst_image,
1836                 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
1837                 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
1838         }
1839
1840         return;
1841     }
1842     
1843     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1844     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1845
1846     /* Choose an aligned window around the destination span, keeping the
1847      * number of quadwords copied and the number of columns processed to a minimum.
1848      */
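    /* Illustrative example (addresses are made up): with dst_line at 0x1002
     * and width 7 (14 bytes), aligned_left is 0x1000 and aligned_right is
     * 0x1010, so copy_offset is 1 pixel and copy_count is 1 quadword; the
     * aligned and unaligned routes cover the same single quadword, so the
     * aligned one is used (kernel_offset = 0, kernel_count = 1).
     */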
1849     {
1850         unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
1851         unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
1852         unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
1853
1854         /* the fast copy should be quadword aligned */
1855         copy_offset = dst_line - ((uint16_t*) aligned_left);
1856         aligned_line = dst_line - copy_offset;
1857         copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
1858         copy_tail = 0;
1859
1860         if (aligned_right - aligned_left > ceiling_length)
1861         {
1862             /* unaligned routine is tightest */
1863             kernel_count = (uint32_t) (ceiling_length >> 4);
1864             kernel_offset = copy_offset;
1865         }
1866         else
1867         {
1868             /* aligned routine is equally tight, so it is safer to align */
1869             kernel_count = copy_count;
1870             kernel_offset = 0;
1871         }
1872
1873         /* We should avoid reading beyond scanline ends for safety */
1874         if (aligned_line < (dst_line - dest_x) ||
1875             (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
1876         {
1877             /* switch to precise read */
1878             copy_offset = kernel_offset = 0;
1879             aligned_line = dst_line;
1880             kernel_count = (uint32_t) (ceiling_length >> 4);
1881             copy_count = (width * sizeof(*dst_line)) >> 4;
1882             copy_tail = (width * sizeof(*dst_line)) & 0xF;
1883         }
1884     }
1885
1886     {
1887         uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8];         /* deliberately not initialised */
1888         uint8_t glyph_line[NEON_SCANLINE_BUFFER_PIXELS + 8];
1889         int y = height;
1890
1891         /* row-major order */
1892         /* left edge, middle block, right edge */
1893         for ( ; y--; mask_line += mask_stride, aligned_line += dst_stride, dst_line += dst_stride)
1894         {
1895             /* We don't want to overrun the edges of the glyph,
1896              * so realign the edge data into known buffers
1897              */
1898             neon_quadword_copy (glyph_line + copy_offset, mask_line, width >> 4, width & 0xF);
1899
1900             /* Uncached framebuffer access is really, really slow
1901              * if we do it piecemeal. It should be much faster if we
1902              * grab it all at once. One scanline should easily fit in
1903              * L1 cache, so this should not waste RAM bandwidth.
1904              */
1905             neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
1906
1907             /* Apply the actual filter */
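            /* Note: the "stride" arguments here advance 8 pixels (16 bytes of
             * dest, 8 bytes of mask) per iteration along the buffered
             * scanline, rather than between scanlines.
             */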
1908             solid_over_565_8_pix_neon (
1909                 src, scan_line + kernel_offset,
1910                 glyph_line + kernel_offset, 8 * sizeof(*dst_line),
1911                 8, kernel_count);
1912
1913             /* Copy the modified scanline back */
1914             neon_quadword_copy (dst_line, scan_line + copy_offset,
1915                                 width >> 3, (width & 7) * 2);
1916         }
1917     }
1918 }
1919
1920 #ifdef USE_GCC_INLINE_ASM
1921
1922 static inline void
1923 plain_over_565_8_pix_neon (uint32_t  colour,
1924                            uint16_t *dest,
1925                            uint32_t  dest_stride,     /* bytes, not elements */
1926                            uint32_t  count            /* 8-pixel groups */)
1927 {
1928     /* Inner loop for plain translucent rects
1929      * (solid colour without alpha mask)
1930      */
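    /* Per 8-pixel group this computes, per channel,
     * dest = colour * alpha + dest * (255 - alpha) in 8-bit fixed point:
     * the colour is scaled by its alpha once up front, then each destination
     * pixel is unpacked to 8-bit channels, blended, and repacked to r5g6b5.
     */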
1931     asm volatile (
1932         "       vld4.8   {d20[],d21[],d22[],d23[]}, [%[colour]]  @ solid colour load/splat \n"
1933         "       vmull.u8  q12, d23, d22              @ premultiply alpha red   \n"
1934         "       vmull.u8  q13, d23, d21              @ premultiply alpha green \n"
1935         "       vmull.u8  q14, d23, d20              @ premultiply alpha blue  \n"
1936         "       vmvn      d18, d23                   @ inverse alpha for background \n"
1937         "0:     @ loop\n"
1938         "       vld1.16   {d0,d1}, [%[dest]]         @ load first pixels from framebuffer       \n"
1939         "       vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels       \n"
1940         "       vshrn.u16 d4, q0, #3                 @ unpack green                             \n"
1941         "       vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits          \n"
1942         "       vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)       \n"
1943         "       vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)     \n"
1944         "       vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)  \n"
1945         "       vmov      q0, q12                    @ retrieve foreground red   \n"
1946         "       vmlal.u8  q0, d2, d18                @ blend red - my kingdom for a four-operand MLA \n"
1947         "       vmov      q1, q13                    @ retrieve foreground green \n"
1948         "       vmlal.u8  q1, d4, d18                @ blend green               \n"
1949         "       vmov      q2, q14                    @ retrieve foreground blue  \n"
1950         "       vmlal.u8  q2, d6, d18                @ blend blue                \n"
1951         "       subs      %[count], %[count], #1     @ decrement/test loop counter              \n"
1952         "       vsri.16   q0, q1, #5                 @ pack green behind red                    \n"
1953         "       vsri.16   q0, q2, #11                @ pack blue into pixels                    \n"
1954         "       vst1.16   {d0,d1}, [%[dest]]         @ store composited pixels                  \n"
1955         "       add %[dest], %[dest], %[dest_stride]  @ advance framebuffer pointer             \n"
1956         "       bne 0b                               @ next please                              \n"
1957
1958         /* Clobbered registers marked as input/outputs */
1959         : [dest] "+r" (dest), [count] "+r" (count)
1960
1961           /* Inputs */
1962         : [dest_stride] "r" (dest_stride), [colour] "r" (&colour)
1963
1964           /* Clobbers, including the inputs we modify, and
1965            * potentially lots of memory
1966            */
1967         : "q0", "q1", "q2", "q3", "q9",
1968           "q10", "q11", "q12", "q13", "q14",
1969           "cc", "memory"
1970         );
1971 }
1972
1973 static void
1974 neon_composite_over_n_0565 (pixman_implementation_t * impl,
1975                             pixman_op_t               op,
1976                             pixman_image_t *          src_image,
1977                             pixman_image_t *          mask_image,
1978                             pixman_image_t *          dst_image,
1979                             int32_t                   src_x,
1980                             int32_t                   src_y,
1981                             int32_t                   mask_x,
1982                             int32_t                   mask_y,
1983                             int32_t                   dest_x,
1984                             int32_t                   dest_y,
1985                             int32_t                   width,
1986                             int32_t                   height)
1987 {
1988     uint32_t src, srca;
1989     uint16_t    *dst_line, *aligned_line;
1990     uint32_t dst_stride;
1991     uint32_t kernel_count, copy_count, copy_tail;
1992     uint8_t kernel_offset, copy_offset;
1993
1994     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
1995
1996     /* bail out if fully transparent */
1997     srca = src >> 24;
1998     if (src == 0)
1999         return;
2000     
2001     if (width == 0 || height == 0)
2002         return;
2003
2004     if (width > NEON_SCANLINE_BUFFER_PIXELS)
2005     {
2006         /* split the blit, so we can use a fixed-size scanline buffer
2007          * TODO: there must be a more elegant way of doing this.
2008          */
2009         int x;
2010         
2011         for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2012         {
2013             neon_composite_over_n_0565 (
2014                 impl, op,
2015                 src_image, mask_image, dst_image,
2016                 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2017                 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2018         }
2019         return;
2020     }
2021
2022     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2023
2024     /* Choose an aligned window around the destination span, keeping the
2025      * number of quadwords copied and the number of columns processed to a minimum.
2026      */
2027     {
2028         unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2029         unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2030         unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2031
2032         /* the fast copy should be quadword aligned */
2033         copy_offset = dst_line - ((uint16_t*) aligned_left);
2034         aligned_line = dst_line - copy_offset;
2035         copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
2036         copy_tail = 0;
2037
2038         if (aligned_right - aligned_left > ceiling_length)
2039         {
2040             /* unaligned routine is tightest */
2041             kernel_count = (uint32_t) (ceiling_length >> 4);
2042             kernel_offset = copy_offset;
2043         }
2044         else
2045         {
2046             /* aligned routine is equally tight, so it is safer to align */
2047             kernel_count = copy_count;
2048             kernel_offset = 0;
2049         }
2050
2051         /* We should avoid reading beyond scanline ends for safety */
2052         if (aligned_line < (dst_line - dest_x) ||
2053             (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2054         {
2055             /* switch to precise read */
2056             copy_offset = kernel_offset = 0;
2057             aligned_line = dst_line;
2058             kernel_count = (uint32_t) (ceiling_length >> 4);
2059             copy_count = (width * sizeof(*dst_line)) >> 4;
2060             copy_tail = (width * sizeof(*dst_line)) & 0xF;
2061         }
2062     }
2063
2064     {
2065         uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8];  /* deliberately not initialised */
2066
2067         /* row-major order */
2068         /* left edge, middle block, right edge */
2069         for ( ; height--; aligned_line += dst_stride, dst_line += dst_stride)
2070         {
2071             /* Uncached framebuffer access is really, really slow if we do it piecemeal.
2072              * It should be much faster if we grab it all at once.
2073              * One scanline should easily fit in L1 cache, so this should
2074              * not waste RAM bandwidth.
2075              */
2076             neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2077
2078             /* Apply the actual filter */
2079             plain_over_565_8_pix_neon (
2080                 src, scan_line + kernel_offset, 8 * sizeof(*dst_line), kernel_count);
2081
2082             /* Copy the modified scanline back */
2083             neon_quadword_copy (
2084                 dst_line, scan_line + copy_offset, width >> 3, (width & 7) * 2);
2085         }
2086     }
2087 }
2088
2089 static inline void
2090 ARGB8_over_565_8_pix_neon (uint32_t *src,
2091                            uint16_t *dest,
2092                            uint32_t  src_stride,     /* bytes, not elements */
2093                            uint32_t  count           /* 8-pixel groups */)
2094 {
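    /* Inner loop for OVER of a8r8g8b8 source onto r5g6b5 destination:
     * per 8-pixel group, each destination channel is scaled by the inverse
     * source alpha, the alpha-scaled source channel is added, and the result
     * is repacked to r5g6b5. The next source scanline is prefetched as we go.
     */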
2095     asm volatile (
2096         "0:     @ loop\n"
2097         "       pld   [%[src], %[src_stride]]         @ preload from next scanline      \n"
2098         "       vld1.16   {d0,d1}, [%[dest]]         @ load pixels from framebuffer     \n"
2099         "       vld4.8   {d20,d21,d22,d23},[%[src]]! @ load source image pixels         \n"
2100         "       vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits          \n"
2101         "       vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels       \n"
2102         "       vshrn.u16 d4, q0, #3                 @ unpack green                             \n"
2103         "       vmvn      d18, d23                   @ we need the inverse alpha for the background     \n"
2104         "       vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)       \n"
2105         "       vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)  \n"
2106         "       vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)     \n"
2107         "       vmull.u8  q1, d2, d18                @ apply inverse alpha to background red... \n"
2108         "       vmull.u8  q2, d4, d18                @ ...green...                              \n"
2109         "       vmull.u8  q3, d6, d18                @ ...blue                                  \n"
2110         "       subs      %[count], %[count], #1     @ decrement/test loop counter              \n"
2111         "       vmlal.u8  q1, d23, d22               @ add blended foreground red...            \n"
2112         "       vmlal.u8  q2, d23, d21               @ ...green...                              \n"
2113         "       vmlal.u8  q3, d23, d20               @ ...blue                                  \n"
2114         "       vsri.16   q1, q2, #5                 @ pack green behind red                    \n"
2115         "       vsri.16   q1, q3, #11                @ pack blue into pixels                    \n"
2116         "       vst1.16   {d2,d3}, [%[dest]]!        @ store composited pixels                  \n"
2117         "       bne 0b                               @ next please                              \n"
2118
2119         /* Clobbered registers marked as input/outputs */
2120         : [dest] "+r" (dest), [src] "+r" (src), [count] "+r" (count)
2121
2122           /* Inputs */
2123         : [src_stride] "r" (src_stride)
2124
2125           /* Clobbers, including the inputs we modify, and potentially lots of memory */
2126         : "q0", "q1", "q2", "q3", "d17", "d18", "q10", "q11", "cc", "memory"
2127         );
2128 }
2129
2130 static void
2131 neon_composite_over_8888_0565 (pixman_implementation_t * impl,
2132                                pixman_op_t               op,
2133                                pixman_image_t *          src_image,
2134                                pixman_image_t *          mask_image,
2135                                pixman_image_t *          dst_image,
2136                                int32_t                   src_x,
2137                                int32_t                   src_y,
2138                                int32_t                   mask_x,
2139                                int32_t                   mask_y,
2140                                int32_t                   dest_x,
2141                                int32_t                   dest_y,
2142                                int32_t                   width,
2143                                int32_t                   height)
2144 {
2145     uint32_t    *src_line;
2146     uint16_t    *dst_line, *aligned_line;
2147     uint32_t dst_stride, src_stride;
2148     uint32_t kernel_count, copy_count, copy_tail;
2149     uint8_t kernel_offset, copy_offset;
2150
2151     /* We assume the mask is opaque,
2152      * so the only alpha to deal with is embedded in src.
2153      */
2154     if (width > NEON_SCANLINE_BUFFER_PIXELS)
2155     {
2156         /* split the blit, so we can use a fixed-size scanline buffer */
2157         int x;
2158         for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2159         {
2160             neon_composite_over_8888_0565 (
2161                 impl, op,
2162                 src_image, mask_image, dst_image,
2163                 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2164                 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2165         }
2166         return;
2167     }
2168
2169     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2170     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2171
2172     /* Choose an aligned window around the destination span, keeping the
2173      * number of quadwords copied and the number of columns processed to a minimum.
2174      */
2175     {
2176         unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2177         unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2178         unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2179
2180         /* the fast copy should be quadword aligned */
2181         copy_offset = dst_line - ((uint16_t*) aligned_left);
2182         aligned_line = dst_line - copy_offset;
2183         copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
2184         copy_tail = 0;
2185
2186         if (aligned_right - aligned_left > ceiling_length)
2187         {
2188             /* unaligned routine is tightest */
2189             kernel_count = (uint32_t) (ceiling_length >> 4);
2190             kernel_offset = copy_offset;
2191         }
2192         else
2193         {
2194             /* aligned routine is equally tight, so it is safer to align */
2195             kernel_count = copy_count;
2196             kernel_offset = 0;
2197         }
2198
2199         /* We should avoid reading beyond scanline ends for safety */
2200         if (aligned_line < (dst_line - dest_x) ||
2201             (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2202         {
2203             /* switch to precise read */
2204             copy_offset = kernel_offset = 0;
2205             aligned_line = dst_line;
2206             kernel_count = (uint32_t) (ceiling_length >> 4);
2207             copy_count = (width * sizeof(*dst_line)) >> 4;
2208             copy_tail = (width * sizeof(*dst_line)) & 0xF;
2209         }
2210     }
2211
2212     /* Preload the first input scanline */
2213     {
2214         uint8_t *src_ptr = (uint8_t*) src_line;
2215         uint32_t count = (width + 15) / 16;
2216
2217 #ifdef USE_GCC_INLINE_ASM
2218         asm volatile (
2219             "0: @ loop                                          \n"
2220             "   subs    %[count], %[count], #1                  \n"
2221             "   pld     [%[src]]                                \n"
2222             "   add     %[src], %[src], #64                     \n"
2223             "   bgt 0b                                          \n"
2224
2225             /* Clobbered input registers marked as input/outputs */
2226             : [src] "+r" (src_ptr), [count] "+r" (count)
2227             :     /* no unclobbered inputs */
2228             : "cc"
2229             );
2230 #else
2231         do
2232         {
2233             __pld (src_ptr);
2234             src_ptr += 64;
2235         }
2236         while (--count);
2237 #endif
2238     }
2239
2240     {
2241         uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */
2242
2243         /* row-major order */
2244         /* left edge, middle block, right edge */
2245         for ( ; height--; src_line += src_stride, aligned_line += dst_stride)
2246         {
2247             /* Uncached framebuffer access is really, really slow if we do
2248              * it piecemeal. It should be much faster if we grab it all at
2249              * once. One scanline should easily fit in L1 cache, so this
2250              * should not waste RAM bandwidth.
2251              */
2252             neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2253
2254             /* Apply the actual filter */
2255             ARGB8_over_565_8_pix_neon (
2256                 src_line, scan_line + kernel_offset,
2257                 src_stride * sizeof(*src_line), kernel_count);
2258
2259             /* Copy the modified scanline back */
2260             neon_quadword_copy (dst_line,
2261                                 scan_line + copy_offset,
2262                                 width >> 3, (width & 7) * 2);
2263         }
2264     }
2265 }
2266
2267 #endif  /* USE_GCC_INLINE_ASM */
2268
2269 static const pixman_fast_path_t arm_neon_fast_path_array[] =
2270 {
2271     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       neon_composite_add_8888_8_8,     0 },
2272     { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null,     PIXMAN_a8,       neon_composite_add_8000_8000,    0 },
2273     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   neon_composite_over_n_8_0565,    0 },
2274     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   neon_composite_over_n_8_0565,    0 },
2275     { PIXMAN_OP_SRC,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_src_24_16,        0 },
2276     { PIXMAN_OP_SRC,  PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_src_24_16,        0 },
2277     { PIXMAN_OP_SRC,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_24_16,        0 },
2278     { PIXMAN_OP_SRC,  PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_24_16,        0 },
2279 #ifdef USE_GCC_INLINE_ASM
2280     { PIXMAN_OP_SRC,  PIXMAN_r5g6b5,   PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_src_16_16,        0 },
2281     { PIXMAN_OP_SRC,  PIXMAN_b5g6r5,   PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_16_16,        0 },
2282     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_over_n_0565,      0 },
2283     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_over_n_0565,      0 },
2284     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_over_8888_0565,   0 },
2285     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_over_8888_0565,   0 },
2286 #endif
2287     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, neon_composite_over_8888_8888,   0 },
2288     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, neon_composite_over_8888_8888,   0 },
2289     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, neon_composite_over_8888_8888,   0 },
2290     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, neon_composite_over_8888_8888,   0 },
2291     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2292     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2293     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, neon_composite_over_n_8_8888,    0 },
2294     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, neon_composite_over_n_8_8888,    0 },
2295     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, neon_composite_over_n_8_8888,    0 },
2296     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, neon_composite_over_n_8_8888,    0 },
2297     { PIXMAN_OP_NONE },
2298 };
2299
2300 const pixman_fast_path_t *const arm_neon_fast_paths = arm_neon_fast_path_array;
2301
2302 static void
2303 arm_neon_composite (pixman_implementation_t *imp,
2304                     pixman_op_t              op,
2305                     pixman_image_t *         src,
2306                     pixman_image_t *         mask,
2307                     pixman_image_t *         dest,
2308                     int32_t                  src_x,
2309                     int32_t                  src_y,
2310                     int32_t                  mask_x,
2311                     int32_t                  mask_y,
2312                     int32_t                  dest_x,
2313                     int32_t                  dest_y,
2314                     int32_t                  width,
2315                     int32_t                  height)
2316 {
2317     if (_pixman_run_fast_path (arm_neon_fast_paths, imp,
2318                                op, src, mask, dest,
2319                                src_x, src_y,
2320                                mask_x, mask_y,
2321                                dest_x, dest_y,
2322                                width, height))
2323     {
2324         return;
2325     }
2326
2327     _pixman_implementation_composite (imp->delegate, op,
2328                                       src, mask, dest,
2329                                       src_x, src_y,
2330                                       mask_x, mask_y,
2331                                       dest_x, dest_y,
2332                                       width, height);
2333 }
2334
2335 static pixman_bool_t
2336 pixman_blt_neon (void *src_bits,
2337                  void *dst_bits,
2338                  int   src_stride,
2339                  int   dst_stride,
2340                  int   src_bpp,
2341                  int   dst_bpp,
2342                  int   src_x,
2343                  int   src_y,
2344                  int   dst_x,
2345                  int   dst_y,
2346                  int   width,
2347                  int   height)
2348 {
2349     if (!width || !height)
2350         return TRUE;
2351
2352     /* accelerate only straight copies involving complete bytes */
2353     if (src_bpp != dst_bpp || (src_bpp & 7))
2354         return FALSE;
2355
2356     {
2357         uint32_t bytes_per_pixel = src_bpp >> 3;
2358         uint32_t byte_width = width * bytes_per_pixel;
2359         /* parameter is in words for some reason */
2360         int32_t src_stride_bytes = src_stride * 4;
2361         int32_t dst_stride_bytes = dst_stride * 4;
2362         uint8_t *src_bytes = ((uint8_t*) src_bits) +
2363             src_y * src_stride_bytes + src_x * bytes_per_pixel;
2364         uint8_t *dst_bytes = ((uint8_t*) dst_bits) +
2365             dst_y * dst_stride_bytes + dst_x * bytes_per_pixel;
2366         uint32_t quadword_count = byte_width / 16;
2367         uint32_t offset         = byte_width % 16;
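        /* each scanline is copied as whole quadwords plus a byte-sized tail,
         * e.g. a 100-pixel-wide 32 bpp blit is 400 bytes = 25 quadwords with
         * no tail
         */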
2368
2369         while (height--)
2370         {
2371             neon_quadword_copy (dst_bytes, src_bytes, quadword_count, offset);
2372             src_bytes += src_stride_bytes;
2373             dst_bytes += dst_stride_bytes;
2374         }
2375     }
2376
2377     return TRUE;
2378 }
2379
2380 static pixman_bool_t
2381 arm_neon_blt (pixman_implementation_t *imp,
2382               uint32_t *               src_bits,
2383               uint32_t *               dst_bits,
2384               int                      src_stride,
2385               int                      dst_stride,
2386               int                      src_bpp,
2387               int                      dst_bpp,
2388               int                      src_x,
2389               int                      src_y,
2390               int                      dst_x,
2391               int                      dst_y,
2392               int                      width,
2393               int                      height)
2394 {
2395     if (pixman_blt_neon (
2396             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2397             src_x, src_y, dst_x, dst_y, width, height))
2398     {
2399         return TRUE;
2400     }
2401
2402     return _pixman_implementation_blt (
2403                imp->delegate,
2404                src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2405                src_x, src_y, dst_x, dst_y, width, height);
2406 }
2407
2408 static pixman_bool_t
2409 arm_neon_fill (pixman_implementation_t *imp,
2410                uint32_t *               bits,
2411                int                      stride,
2412                int                      bpp,
2413                int                      x,
2414                int                      y,
2415                int                      width,
2416                int                      height,
2417                uint32_t xor)
2418 {
2419     if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor))
2420         return TRUE;
2421
2422     return _pixman_implementation_fill (
2423         imp->delegate, bits, stride, bpp, x, y, width, height, xor);
2424 }
2425
2426 pixman_implementation_t *
2427 _pixman_implementation_create_arm_neon (void)
2428 {
2429     pixman_implementation_t *simd = _pixman_implementation_create_arm_simd ();
2430     pixman_implementation_t *imp = _pixman_implementation_create (simd);
2431
2432     imp->composite = arm_neon_composite;
2433     imp->blt = arm_neon_blt;
2434     imp->fill = arm_neon_fill;
2435
2436     return imp;
2437 }
2438