pixman/pixman-arm-neon.c
1 /*
2  * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
3  *
4  * Permission to use, copy, modify, distribute, and sell this software and its
5  * documentation for any purpose is hereby granted without fee, provided that
6  * the above copyright notice appear in all copies and that both that
7  * copyright notice and this permission notice appear in supporting
8  * documentation, and that the name of ARM Ltd not be used in
9  * advertising or publicity pertaining to distribution of the software without
10  * specific, written prior permission.  ARM Ltd makes no
11  * representations about the suitability of this software for any purpose.  It
12  * is provided "as is" without express or implied warranty.
13  *
14  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
20  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
21  * SOFTWARE.
22  *
23  * Author:  Ian Rickards (ian.rickards@arm.com) 
24  * Author:  Jonathan Morton (jonathan.morton@movial.com)
25  * Author:  Markku Vire (markku.vire@movial.com)
26  *
27  */
28
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32
33 #include "pixman-arm-neon.h"
34
35 #include <arm_neon.h>
36 #include <string.h>
37
38 // __pld() is an RVCT/armcc intrinsic; map it onto GCC's __builtin_prefetch elsewhere
39 #if !defined(__ARMCC_VERSION) && !defined(__pld)
40 #define __pld(_x) __builtin_prefetch(_x)
41 #endif
42
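/*
 * Expand packed r5g6b5 pixels to one byte per channel, replicating the top
 * bits of each field into the low bits so that 0x1f maps to 0xff and 0x00
 * stays 0x00.  Roughly, per channel (scalar sketch, not part of the
 * original code):
 *   b8 = (b5 << 3) | (b5 >> 2);
 *   g8 = (g6 << 2) | (g6 >> 4);
 *   r8 = (r5 << 3) | (r5 >> 2);
 */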
43 static force_inline uint8x8x4_t unpack0565(uint16x8_t rgb)
44 {
45     uint16x8_t gb, b;
46     uint8x8x4_t res;
47
48     res.val[3] = vdup_n_u8(0);
49     gb = vshrq_n_u16(rgb, 5);
50     b = vshrq_n_u16(rgb, 5+6);
51     res.val[0] = vmovn_u16(rgb);  // get low 5 bits
52     res.val[1] = vmovn_u16(gb);   // get mid 6 bits
53     res.val[2] = vmovn_u16(b);    // get top 5 bits
54
55     res.val[0] = vshl_n_u8(res.val[0], 3); // shift to top
56     res.val[1] = vshl_n_u8(res.val[1], 2); // shift to top
57     res.val[2] = vshl_n_u8(res.val[2], 3); // shift to top
58
59     res.val[0] = vsri_n_u8(res.val[0], res.val[0], 5); 
60     res.val[1] = vsri_n_u8(res.val[1], res.val[1], 6);
61     res.val[2] = vsri_n_u8(res.val[2], res.val[2], 5);
62
63     return res;
64 }
65
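/*
 * Pack 8-bit-per-channel pixels back into r5g6b5, keeping the top bits of
 * each channel.  Scalar sketch (illustrative only, assuming val[2]=red,
 * val[1]=green, val[0]=blue as produced by vld4/unpack0565):
 *   rgb16 = ((r8 & 0xf8) << 8) | ((g8 & 0xfc) << 3) | (b8 >> 3);
 */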
66 static force_inline uint16x8_t pack0565(uint8x8x4_t s)
67 {
68     uint16x8_t rgb, val_g, val_r;
69
70     rgb = vshll_n_u8(s.val[2],8);
71     val_g = vshll_n_u8(s.val[1],8);
72     val_r = vshll_n_u8(s.val[0],8);
73     rgb = vsriq_n_u16(rgb, val_g, 5);
74     rgb = vsriq_n_u16(rgb, val_r, 5+6);
75
76     return rgb;
77 }
78
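/*
 * Multiply each 8-bit channel by an 8-bit alpha and divide by 255 with
 * rounding.  Per lane, the vmull/vrshr/vraddhn sequence computes (scalar
 * sketch):  t = x * alpha;  res = (t + ((t + 128) >> 8) + 128) >> 8,
 * which is the correctly rounded t/255 for all 8-bit inputs.
 */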
79 static force_inline uint8x8_t neon2mul(uint8x8_t x, uint8x8_t alpha)
80 {
81     uint16x8_t tmp,tmp2;
82     uint8x8_t res;
83
84     tmp = vmull_u8(x,alpha);
85     tmp2 = vrshrq_n_u16(tmp,8);
86     res = vraddhn_u16(tmp,tmp2);
87
88     return res;
89 }
90
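/*
 * neon2mul applied to all four planes of eight pixels at once: scales the
 * b, g, r and a planes (as loaded by vld4) by the same 8-bit alpha vector.
 */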
91 static force_inline uint8x8x4_t neon8mul(uint8x8x4_t x, uint8x8_t alpha)
92 {
93     uint16x8x4_t tmp;
94     uint8x8x4_t res;
95     uint16x8_t qtmp1,qtmp2;
96
97     tmp.val[0] = vmull_u8(x.val[0],alpha);
98     tmp.val[1] = vmull_u8(x.val[1],alpha);
99     tmp.val[2] = vmull_u8(x.val[2],alpha);
100     tmp.val[3] = vmull_u8(x.val[3],alpha);
101
102     qtmp1 = vrshrq_n_u16(tmp.val[0],8);
103     qtmp2 = vrshrq_n_u16(tmp.val[1],8);
104     res.val[0] = vraddhn_u16(tmp.val[0],qtmp1);
105     qtmp1 = vrshrq_n_u16(tmp.val[2],8);
106     res.val[1] = vraddhn_u16(tmp.val[1],qtmp2);
107     qtmp2 = vrshrq_n_u16(tmp.val[3],8);
108     res.val[2] = vraddhn_u16(tmp.val[2],qtmp1);
109     res.val[3] = vraddhn_u16(tmp.val[3],qtmp2);
110
111     return res;
112 }
113
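/* Per-channel saturating add of two sets of eight pixels in planar form. */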
114 static force_inline uint8x8x4_t neon8qadd(uint8x8x4_t x, uint8x8x4_t y)
115 {
116     uint8x8x4_t res;
117
118     res.val[0] = vqadd_u8(x.val[0],y.val[0]);
119     res.val[1] = vqadd_u8(x.val[1],y.val[1]);
120     res.val[2] = vqadd_u8(x.val[2],y.val[2]);
121     res.val[3] = vqadd_u8(x.val[3],y.val[3]);
122
123     return res;
124 }
125
126
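/*
 * ADD operator for a8 source and destination.  The "overlapping 8-pixel
 * method" used below handles the leading (width & 7) pixels with a full
 * 8-wide load/compute, then steps through the rest in aligned groups of 8.
 * Each store is deferred by one iteration (keep_dst), so the loads of an
 * overlapping block always see the original destination data; the overlap
 * is simply computed twice with identical inputs.
 */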
127 void
128 fbCompositeSrcAdd_8000x8000neon (
129                             pixman_implementation_t * impl,
130                             pixman_op_t op,
131                                 pixman_image_t * pSrc,
132                                 pixman_image_t * pMask,
133                                 pixman_image_t * pDst,
134                                 int32_t      xSrc,
135                                 int32_t      ySrc,
136                                 int32_t      xMask,
137                                 int32_t      yMask,
138                                 int32_t      xDst,
139                                 int32_t      yDst,
140                                 int32_t      width,
141                                 int32_t      height)
142 {
143     uint8_t     *dstLine, *dst;
144     uint8_t     *srcLine, *src;
145     int dstStride, srcStride;
146     uint16_t    w;
147
148     fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
149     fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
150
151     if (width>=8)
152     {
153         // Use overlapping 8-pixel method
154         while (height--)
155         {
156             dst = dstLine;
157             dstLine += dstStride;
158             src = srcLine;
159             srcLine += srcStride;
160             w = width;
161
162             uint8_t *keep_dst=0;
163
164 #ifndef USE_GCC_INLINE_ASM
165             uint8x8_t sval,dval,temp;
166
167             sval = vld1_u8((void*)src);
168             dval = vld1_u8((void*)dst);
169             keep_dst = dst;
170
171             temp = vqadd_u8(dval,sval);
172
173             src += (w & 7);
174             dst += (w & 7);
175             w -= (w & 7);
176
177             while (w)
178             {
179                 sval = vld1_u8((void*)src);
180                 dval = vld1_u8((void*)dst);
181
182                 vst1_u8((void*)keep_dst,temp);
183                 keep_dst = dst;
184
185                 temp = vqadd_u8(dval,sval);
186
187                 src+=8;
188                 dst+=8;
189                 w-=8;
190             }
191             vst1_u8((void*)keep_dst,temp);
192 #else
193             asm volatile (
194 // avoid using d8-d15 (q4-q7) aapcs callee-save registers
195                         "vld1.8  {d0}, [%[src]]\n\t"
196                         "vld1.8  {d4}, [%[dst]]\n\t"
197                         "mov     %[keep_dst], %[dst]\n\t"
198
199                         "and ip, %[w], #7\n\t"
200                         "add %[src], %[src], ip\n\t"
201                         "add %[dst], %[dst], ip\n\t"
202                         "subs %[w], %[w], ip\n\t"
203                         "b 9f\n\t"
204 // LOOP
205                         "2:\n\t"
206                         "vld1.8  {d0}, [%[src]]!\n\t"
207                         "vld1.8  {d4}, [%[dst]]!\n\t"
208                         "vst1.8  {d20}, [%[keep_dst]]\n\t"
209                         "sub     %[keep_dst], %[dst], #8\n\t"
210                         "subs %[w], %[w], #8\n\t"
211                         "9:\n\t"
212                         "vqadd.u8 d20, d0, d4\n\t"
213
214                         "bne 2b\n\t"
215
216                         "1:\n\t"
217                         "vst1.8  {d20}, [%[keep_dst]]\n\t"
218
219                         : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
220                         :
221                         : "ip", "cc", "memory", "d0","d4",
222                           "d20"
223                         );
224 #endif
225         }
226     }
227     else
228     {
229         const uint8_t nil = 0;
230         const uint8x8_t vnil = vld1_dup_u8(&nil);
231
232         while (height--)
233         {
234             dst = dstLine;
235             dstLine += dstStride;
236             src = srcLine;
237             srcLine += srcStride;
238             w = width;
239             uint8x8_t sval=vnil, dval=vnil;
240             uint8_t *dst4=0, *dst2=0;
241
242             if (w&4)
243             {
244                 sval = vreinterpret_u8_u32(vld1_lane_u32((void*)src,vreinterpret_u32_u8(sval),1));
245                 dval = vreinterpret_u8_u32(vld1_lane_u32((void*)dst,vreinterpret_u32_u8(dval),1));
246                 dst4=dst;
247                 src+=4;
248                 dst+=4;
249             }
250             if (w&2)
251             {
252                 sval = vreinterpret_u8_u16(vld1_lane_u16((void*)src,vreinterpret_u16_u8(sval),1));
253                 dval = vreinterpret_u8_u16(vld1_lane_u16((void*)dst,vreinterpret_u16_u8(dval),1));
254                 dst2=dst;
255                 src+=2;
256                 dst+=2;
257             }
258             if (w&1)
259             {
260                 sval = vld1_lane_u8(src,sval,1);
261                 dval = vld1_lane_u8(dst,dval,1);
262             }
263
264             dval = vqadd_u8(dval,sval);
265
266             if (w&1)
267                 vst1_lane_u8(dst,dval,1);
268             if (w&2)
269                 vst1_lane_u16((void*)dst2,vreinterpret_u16_u8(dval),1);
270             if (w&4)
271                 vst1_lane_u32((void*)dst4,vreinterpret_u32_u8(dval),1);
272         }
273     }
274 }
275
276
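/*
 * OVER operator for premultiplied a8r8g8b8 source and destination:
 * dest = src + dest * (255 - src.alpha) / 255, using the same overlapping
 * 8-pixel scheme as above for width >= 8 and a 2/1-pixel tail path otherwise.
 */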
277 void
278 fbCompositeSrc_8888x8888neon (
279                             pixman_implementation_t * impl,
280                             pixman_op_t op,
281                          pixman_image_t * pSrc,
282                          pixman_image_t * pMask,
283                          pixman_image_t * pDst,
284                          int32_t      xSrc,
285                          int32_t      ySrc,
286                          int32_t      xMask,
287                          int32_t      yMask,
288                          int32_t      xDst,
289                          int32_t      yDst,
290                          int32_t      width,
291                          int32_t      height)
292 {
293     uint32_t    *dstLine, *dst;
294     uint32_t    *srcLine, *src;
295     int dstStride, srcStride;
296     uint32_t    w;
297
298     fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
299     fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
300
301     if (width>=8)
302     {
303         // Use overlapping 8-pixel method  
304         while (height--)
305         {
306             dst = dstLine;
307             dstLine += dstStride;
308             src = srcLine;
309             srcLine += srcStride;
310             w = width;
311
312             uint32_t *keep_dst=0;
313
314 #ifndef USE_GCC_INLINE_ASM
315             uint8x8x4_t sval,dval,temp;
316
317             sval = vld4_u8((void*)src);
318             dval = vld4_u8((void*)dst);
319             keep_dst = dst;
320
321             temp = neon8mul(dval,vmvn_u8(sval.val[3]));
322             temp = neon8qadd(sval,temp);
323
324             src += (w & 7);
325             dst += (w & 7);
326             w -= (w & 7);
327
328             while (w)
329             {
330                 sval = vld4_u8((void*)src);
331                 dval = vld4_u8((void*)dst);
332
333                 vst4_u8((void*)keep_dst,temp);
334                 keep_dst = dst;
335
336                 temp = neon8mul(dval,vmvn_u8(sval.val[3]));
337                 temp = neon8qadd(sval,temp);
338
339                 src+=8;
340                 dst+=8;
341                 w-=8;
342             }
343             vst4_u8((void*)keep_dst,temp);
344 #else
345             asm volatile (
346 // avoid using d8-d15 (q4-q7) aapcs callee-save registers
347                         "vld4.8  {d0-d3}, [%[src]]\n\t"
348                         "vld4.8  {d4-d7}, [%[dst]]\n\t"
349                         "mov     %[keep_dst], %[dst]\n\t"
350
351                         "and ip, %[w], #7\n\t"
352                         "add %[src], %[src], ip, LSL#2\n\t"
353                         "add %[dst], %[dst], ip, LSL#2\n\t"
354                         "subs %[w], %[w], ip\n\t"
355                         "b 9f\n\t"
356 // LOOP
357                         "2:\n\t"
358                         "vld4.8  {d0-d3}, [%[src]]!\n\t"
359                         "vld4.8  {d4-d7}, [%[dst]]!\n\t"
360                         "vst4.8  {d20-d23}, [%[keep_dst]]\n\t"
361                         "sub     %[keep_dst], %[dst], #8*4\n\t"
362                         "subs %[w], %[w], #8\n\t"
363                         "9:\n\t"
364                         "vmvn.8  d31, d3\n\t"
365                         "vmull.u8 q10, d31, d4\n\t"
366                         "vmull.u8 q11, d31, d5\n\t"
367                         "vmull.u8 q12, d31, d6\n\t"
368                         "vmull.u8 q13, d31, d7\n\t"
369                         "vrshr.u16 q8, q10, #8\n\t"
370                         "vrshr.u16 q9, q11, #8\n\t"
371                         "vraddhn.u16 d20, q10, q8\n\t"
372                         "vraddhn.u16 d21, q11, q9\n\t"
373                         "vrshr.u16 q8, q12, #8\n\t"
374                         "vrshr.u16 q9, q13, #8\n\t"
375                         "vraddhn.u16 d22, q12, q8\n\t"
376                         "vraddhn.u16 d23, q13, q9\n\t"
377 // result in d20-d23
378                         "vqadd.u8 d20, d0, d20\n\t"
379                         "vqadd.u8 d21, d1, d21\n\t"
380                         "vqadd.u8 d22, d2, d22\n\t"
381                         "vqadd.u8 d23, d3, d23\n\t"
382
383                         "bne 2b\n\t"
384
385                         "1:\n\t"
386                         "vst4.8  {d20-d23}, [%[keep_dst]]\n\t"
387
388                         : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
389                         : 
390                         : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
391                           "d16","d17","d18","d19","d20","d21","d22","d23"
392                         );
393 #endif
394         }
395     }
396     else
397     {
398         uint8x8_t    alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL));
399
400         // Handle width<8
401         while (height--)
402         {
403             dst = dstLine;
404             dstLine += dstStride;
405             src = srcLine;
406             srcLine += srcStride;
407             w = width;
408
409             while (w>=2)
410             {
411                 uint8x8_t sval,dval;
412
413                 /* two 32-bit pixels packed into D-reg; ad-hoc vectorization */
414                 sval = vreinterpret_u8_u32(vld1_u32((void*)src));
415                 dval = vreinterpret_u8_u32(vld1_u32((void*)dst));
416                 dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval),alpha_selector));
417                 vst1_u8((void*)dst,vqadd_u8(sval,dval));
418
419                 src+=2;
420                 dst+=2;
421                 w-=2;
422             }
423
424             if (w)
425             {
426                 uint8x8_t sval,dval;
427
428                 /* single 32-bit pixel in lane 0 */
429                 sval = vreinterpret_u8_u32(vld1_dup_u32((void*)src));  // only interested in lane 0
430                 dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst));  // only interested in lane 0
431                 dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval),alpha_selector));
432                 vst1_lane_u32((void*)dst,vreinterpret_u32_u8(vqadd_u8(sval,dval)),0);
433             }
434         }
435     }
436 }
437
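/*
 * OVER with a solid (single-value) mask: the source is first scaled by the
 * mask's alpha, then composited as
 * dest = src' + dest * (255 - src'.alpha) / 255.
 */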
438 void
439 fbCompositeSrc_8888x8x8888neon (
440                                pixman_implementation_t * impl,
441                                pixman_op_t op,
442                                pixman_image_t * pSrc,
443                                pixman_image_t * pMask,
444                                pixman_image_t * pDst,
445                                int32_t  xSrc,
446                                int32_t  ySrc,
447                                int32_t      xMask,
448                                int32_t      yMask,
449                                int32_t      xDst,
450                                int32_t      yDst,
451                                int32_t      width,
452                                int32_t      height)
453 {
454     uint32_t    *dstLine, *dst;
455     uint32_t    *srcLine, *src;
456     uint32_t    mask;
457     int dstStride, srcStride;
458     uint32_t    w;
459     uint8x8_t mask_alpha;
460
461     fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
462     fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
463
464     _pixman_image_get_solid (pMask, mask, pDst->bits.format);
465     mask_alpha = vdup_n_u8((mask) >> 24);
466
467     if (width>=8)
468     {
469         // Use overlapping 8-pixel method
470         while (height--)
471         {
472             dst = dstLine;
473             dstLine += dstStride;
474             src = srcLine;
475             srcLine += srcStride;
476             w = width;
477
478             uint32_t *keep_dst=0;
479
480 #ifndef USE_GCC_INLINE_ASM
481             uint8x8x4_t sval,dval,temp;
482
483             sval = vld4_u8((void*)src);
484             dval = vld4_u8((void*)dst);
485             keep_dst = dst;
486
487             sval = neon8mul(sval,mask_alpha);
488             temp = neon8mul(dval,vmvn_u8(sval.val[3]));
489             temp = neon8qadd(sval,temp);
490
491             src += (w & 7);
492             dst += (w & 7);
493             w -= (w & 7);
494
495             while (w)
496             {
497                 sval = vld4_u8((void*)src);
498                 dval = vld4_u8((void*)dst);
499
500                 vst4_u8((void*)keep_dst,temp);
501                 keep_dst = dst;
502
503                 sval = neon8mul(sval,mask_alpha);
504                 temp = neon8mul(dval,vmvn_u8(sval.val[3]));
505                 temp = neon8qadd(sval,temp);
506
507                 src+=8;
508                 dst+=8;
509                 w-=8;
510             }
511             vst4_u8((void*)keep_dst,temp);
512 #else
513             asm volatile (
514 // avoid using d8-d15 (q4-q7) aapcs callee-save registers
515                         "vdup.32      d30, %[mask]\n\t"
516                         "vdup.8       d30, d30[3]\n\t"
517
518                         "vld4.8       {d0-d3}, [%[src]]\n\t"
519                         "vld4.8       {d4-d7}, [%[dst]]\n\t"
520                         "mov  %[keep_dst], %[dst]\n\t"
521
522                         "and  ip, %[w], #7\n\t"
523                         "add  %[src], %[src], ip, LSL#2\n\t"
524                         "add  %[dst], %[dst], ip, LSL#2\n\t"
525                         "subs  %[w], %[w], ip\n\t"
526                         "b 9f\n\t"
527 // LOOP
528                         "2:\n\t"
529                         "vld4.8       {d0-d3}, [%[src]]!\n\t"
530                         "vld4.8       {d4-d7}, [%[dst]]!\n\t"
531                         "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
532                         "sub  %[keep_dst], %[dst], #8*4\n\t"
533                         "subs  %[w], %[w], #8\n\t"
534
535                         "9:\n\t"
536                         "vmull.u8     q10, d30, d0\n\t"
537                         "vmull.u8     q11, d30, d1\n\t"
538                         "vmull.u8     q12, d30, d2\n\t"
539                         "vmull.u8     q13, d30, d3\n\t"
540                         "vrshr.u16    q8, q10, #8\n\t"
541                         "vrshr.u16    q9, q11, #8\n\t"
542                         "vraddhn.u16  d0, q10, q8\n\t"
543                         "vraddhn.u16  d1, q11, q9\n\t"
544                         "vrshr.u16    q9, q13, #8\n\t"
545                         "vrshr.u16    q8, q12, #8\n\t"
546                         "vraddhn.u16  d3, q13, q9\n\t"
547                         "vraddhn.u16  d2, q12, q8\n\t"
548
549                         "vmvn.8       d31, d3\n\t"
550                         "vmull.u8     q10, d31, d4\n\t"
551                         "vmull.u8     q11, d31, d5\n\t"
552                         "vmull.u8     q12, d31, d6\n\t"
553                         "vmull.u8     q13, d31, d7\n\t"
554                         "vrshr.u16    q8, q10, #8\n\t"
555                         "vrshr.u16    q9, q11, #8\n\t"
556                         "vraddhn.u16  d20, q10, q8\n\t"
557                         "vrshr.u16    q8, q12, #8\n\t"
558                         "vraddhn.u16  d21, q11, q9\n\t"
559                         "vrshr.u16    q9, q13, #8\n\t"
560                         "vraddhn.u16  d22, q12, q8\n\t"
561                         "vraddhn.u16  d23, q13, q9\n\t"
562 // result in d20-d23
563                         "vqadd.u8     d20, d0, d20\n\t"
564                         "vqadd.u8     d21, d1, d21\n\t"
565                         "vqadd.u8     d22, d2, d22\n\t"
566                         "vqadd.u8     d23, d3, d23\n\t"
567
568                         "bne  2b\n\t"
569
570                         "1:\n\t"
571                         "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
572
573                         : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
574                         : [mask] "r" (mask)
575                         : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
576                           "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27",
577                           "d30","d31"
578                         );
579 #endif
580         }
581     }
582     else
583     {
584         uint8x8_t    alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL));
585
586         // Handle width<8
587         while (height--)
588         {
589             dst = dstLine;
590             dstLine += dstStride;
591             src = srcLine;
592             srcLine += srcStride;
593             w = width;
594
595             while (w>=2)
596             {
597                 uint8x8_t sval,dval;
598
599                 sval = vreinterpret_u8_u32(vld1_u32((void*)src));
600                 dval = vreinterpret_u8_u32(vld1_u32((void*)dst));
601
602                 /* sval * const alpha_mul */
603                 sval = neon2mul(sval,mask_alpha);
604
605                 /* dval * 255-(src alpha) */
606                 dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval), alpha_selector));
607
608                 vst1_u8((void*)dst,vqadd_u8(sval,dval));
609
610                 src+=2;
611                 dst+=2;
612                 w-=2;
613             }
614
615             if (w)
616             {
617                 uint8x8_t sval,dval;
618
619                 sval = vreinterpret_u8_u32(vld1_dup_u32((void*)src));
620                 dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst));
621
622                 /* sval * const alpha_mul */
623                 sval = neon2mul(sval,mask_alpha);
624
625                 /* dval * 255-(src alpha) */
626                 dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval), alpha_selector));
627
628                 vst1_lane_u32((void*)dst,vreinterpret_u32_u8(vqadd_u8(sval,dval)),0);
629             }
630         }
631     }
632 }
633
634
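/*
 * OVER with a solid source and an a8 mask: each mask byte scales the solid
 * colour, which is then composited over the destination, i.e. (sketch)
 * dest = src*m/255 + dest * (255 - (src.alpha*m)/255) / 255.
 */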
635 void
636 fbCompositeSolidMask_nx8x8888neon (
637                             pixman_implementation_t * impl,
638                             pixman_op_t      op,
639                                pixman_image_t * pSrc,
640                                pixman_image_t * pMask,
641                                pixman_image_t * pDst,
642                                int32_t      xSrc,
643                                int32_t      ySrc,
644                                int32_t      xMask,
645                                int32_t      yMask,
646                                int32_t      xDst,
647                                int32_t      yDst,
648                                int32_t      width,
649                                int32_t      height)
650 {
651     uint32_t     src, srca;
652     uint32_t    *dstLine, *dst;
653     uint8_t     *maskLine, *mask;
654     int          dstStride, maskStride;
655     uint32_t     w;
656     uint8x8_t    sval2;
657     uint8x8x4_t  sval8;
658     uint8x8_t    mask_selector=vreinterpret_u8_u64(vcreate_u64(0x0101010100000000ULL));
659     uint8x8_t    alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL));
660
661     _pixman_image_get_solid(pSrc, src, pDst->bits.format);
662
663     srca = src >> 24;
664     if (src == 0)
665         return;
666
667     sval2=vreinterpret_u8_u32(vdup_n_u32(src));
668     sval8.val[0]=vdup_lane_u8(sval2,0);
669     sval8.val[1]=vdup_lane_u8(sval2,1);
670     sval8.val[2]=vdup_lane_u8(sval2,2);
671     sval8.val[3]=vdup_lane_u8(sval2,3);
672
673     fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
674     fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
675
676     if (width>=8)
677     {
678         // Use overlapping 8-pixel method, modified to avoid rewritten dest being reused
679         while (height--)
680         {
681             uint32_t *keep_dst=0;
682
683             dst = dstLine;
684             dstLine += dstStride;
685             mask = maskLine;
686             maskLine += maskStride;
687             w = width;
688
689 #ifndef USE_GCC_INLINE_ASM
690             uint8x8_t alpha;
691             uint8x8x4_t dval, temp;
692
693             alpha = vld1_u8((void*)mask);
694             dval = vld4_u8((void*)dst);
695             keep_dst = dst;
696
697             temp = neon8mul(sval8,alpha);
698             dval = neon8mul(dval,vmvn_u8(temp.val[3]));
699             temp = neon8qadd(temp,dval);
700
701             mask += (w & 7);
702             dst += (w & 7);
703             w -= (w & 7);
704
705             while (w)
706             {
707                 alpha = vld1_u8((void*)mask);
708                 dval = vld4_u8((void*)dst);
709
710                 vst4_u8((void*)keep_dst,temp);
711                 keep_dst = dst;
712
713                 temp = neon8mul(sval8,alpha);
714                 dval = neon8mul(dval,vmvn_u8(temp.val[3]));
715                 temp = neon8qadd(temp,dval);
716
717                 mask+=8;
718                 dst+=8;
719                 w-=8;
720             }
721             vst4_u8((void*)keep_dst,temp);
722 #else
723         asm volatile (
724                         "vdup.32      d0, %[src]\n\t"
725                         "vdup.8       d1, d0[1]\n\t"
726                         "vdup.8       d2, d0[2]\n\t"
727                         "vdup.8       d3, d0[3]\n\t"
728                         "vdup.8       d0, d0[0]\n\t"
729
730                         "vld4.8       {d4-d7}, [%[dst]]\n\t"
731                         "vld1.8       {d31}, [%[mask]]\n\t"
732                         "mov  %[keep_dst], %[dst]\n\t"
733
734                         "and  ip, %[w], #7\n\t"
735                         "add  %[mask], %[mask], ip\n\t"
736                         "add  %[dst], %[dst], ip, LSL#2\n\t"
737                         "subs  %[w], %[w], ip\n\t"
738                         "b 9f\n\t"
739 // LOOP
740                         "2:\n\t" 
741                         "vld4.8       {d4-d7}, [%[dst]]!\n\t"
742                         "vld1.8       {d31}, [%[mask]]!\n\t"
743                         "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
744                         "sub  %[keep_dst], %[dst], #8*4\n\t"
745                         "subs  %[w], %[w], #8\n\t"
746                         "9:\n\t"
747
748                         "vmull.u8     q10, d31, d0\n\t"
749                         "vmull.u8     q11, d31, d1\n\t"
750                         "vmull.u8     q12, d31, d2\n\t"
751                         "vmull.u8     q13, d31, d3\n\t"
752                         "vrshr.u16    q8, q10, #8\n\t"
753                         "vrshr.u16    q9, q11, #8\n\t"
754                         "vraddhn.u16  d20, q10, q8\n\t"
755                         "vraddhn.u16  d21, q11, q9\n\t"
756                         "vrshr.u16    q9, q13, #8\n\t"
757                         "vrshr.u16    q8, q12, #8\n\t"
758                         "vraddhn.u16  d23, q13, q9\n\t"
759                         "vraddhn.u16  d22, q12, q8\n\t"
760
761                         "vmvn.8       d30, d23\n\t"
762                         "vmull.u8     q12, d30, d4\n\t"
763                         "vmull.u8     q13, d30, d5\n\t"
764                         "vmull.u8     q14, d30, d6\n\t"
765                         "vmull.u8     q15, d30, d7\n\t"
766
767                         "vrshr.u16    q8, q12, #8\n\t"
768                         "vrshr.u16    q9, q13, #8\n\t"
769                         "vraddhn.u16  d4, q12, q8\n\t"
770                         "vrshr.u16    q8, q14, #8\n\t"
771                         "vraddhn.u16  d5, q13, q9\n\t"
772                         "vrshr.u16    q9, q15, #8\n\t"
773                         "vraddhn.u16  d6, q14, q8\n\t"
774                         "vraddhn.u16  d7, q15, q9\n\t"
775 // result in d4-d7
776
777                         "vqadd.u8     d20, d4, d20\n\t"
778                         "vqadd.u8     d21, d5, d21\n\t"
779                         "vqadd.u8     d22, d6, d22\n\t"
780                         "vqadd.u8     d23, d7, d23\n\t"
781
782                         "bne 2b\n\t"
783
784                         "1:\n\t"
785                         "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
786
787                         : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
788                         : [src] "r" (src) 
789                         : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
790                           "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
791                           "d30","d31"
792                         );
793 #endif
794         }
795     }
796     else
797     {
798         while (height--)
799         {
800             uint8x8_t alpha;
801
802             dst = dstLine;
803             dstLine += dstStride;
804             mask = maskLine;
805             maskLine += maskStride;
806             w = width;
807
808             while (w>=2)
809             {
810                 uint8x8_t dval, temp, res;
811
812                 alpha = vtbl1_u8(vreinterpret_u8_u16(vld1_dup_u16((void*)mask)), mask_selector);
813                 dval = vld1_u8((void*)dst);
814
815                 temp = neon2mul(sval2,alpha);
816                 res = vqadd_u8(temp,neon2mul(dval,vtbl1_u8(vmvn_u8(temp), alpha_selector)));
817
818                 vst1_u8((void*)dst,res);
819
820                 mask+=2;
821                 dst+=2;
822                 w-=2;
823             }
824             if (w)
825             {
826                 uint8x8_t dval, temp, res;
827
828                 alpha = vtbl1_u8(vld1_dup_u8((void*)mask), mask_selector);
829                 dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst));
830
831                 temp = neon2mul(sval2,alpha);
832                 res = vqadd_u8(temp,neon2mul(dval,vtbl1_u8(vmvn_u8(temp), alpha_selector)));
833
834                 vst1_lane_u32((void*)dst,vreinterpret_u32_u8(res),0);
835             }
836         }
837     }
838 }
839
840
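/*
 * ADD with a solid source and an a8 mask onto an a8 destination:
 * dest = saturate(dest + mask * src.alpha / 255).  Uses the overlapping
 * 8-pixel method for width >= 8 and 4/2/1 lane loads for narrow spans.
 */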
841 void
842 fbCompositeSrcAdd_8888x8x8neon (
843                             pixman_implementation_t * impl,
844                             pixman_op_t op,
845                             pixman_image_t * pSrc,
846                             pixman_image_t * pMask,
847                             pixman_image_t * pDst,
848                             int32_t      xSrc,
849                             int32_t      ySrc,
850                             int32_t      xMask,
851                             int32_t      yMask,
852                             int32_t      xDst,
853                             int32_t      yDst,
854                             int32_t      width,
855                             int32_t      height)
856 {
857     uint8_t     *dstLine, *dst;
858     uint8_t     *maskLine, *mask;
859     int dstStride, maskStride;
860     uint32_t    w;
861     uint32_t    src;
862     uint8x8_t   sa;
863
864     fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
865     fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
866     _pixman_image_get_solid (pSrc, src, pDst->bits.format);
867     sa = vdup_n_u8((src) >> 24);
868
869     if (width>=8)
870     {
871         // Use overlapping 8-pixel method, modified to avoid rewritten dest being reused
872         while (height--)
873         {
874             dst = dstLine;
875             dstLine += dstStride;
876             mask = maskLine;
877             maskLine += maskStride;
878             w = width;
879
880             uint8x8_t mval, dval, res;
881             uint8_t     *keep_dst;
882
883             mval = vld1_u8((void *)mask);
884             dval = vld1_u8((void *)dst);
885             keep_dst = dst;
886
887             res = vqadd_u8(neon2mul(mval,sa),dval);
888
889             mask += (w & 7);
890             dst += (w & 7);
891             w -= w & 7;
892
893             while (w)
894             {
895                 mval = vld1_u8((void *)mask);
896                 dval = vld1_u8((void *)dst);
897                 vst1_u8((void *)keep_dst, res);
898                 keep_dst = dst;
899
900                 res = vqadd_u8(neon2mul(mval,sa),dval);
901
902                 mask += 8;
903                 dst += 8;
904                 w -= 8;
905             }
906             vst1_u8((void *)keep_dst, res);
907         }
908     }
909     else
910     {
911         // Use 4/2/1 load/store method to handle 1-7 pixels
912         while (height--)
913         {
914             dst = dstLine;
915             dstLine += dstStride;
916             mask = maskLine;
917             maskLine += maskStride;
918             w = width;
919
920             uint8x8_t mval=sa, dval=sa, res;
921             uint8_t *dst4=0, *dst2=0;
922
923             if (w&4)
924             {
925                 mval = vreinterpret_u8_u32(vld1_lane_u32((void *)mask, vreinterpret_u32_u8(mval), 1));
926                 dval = vreinterpret_u8_u32(vld1_lane_u32((void *)dst, vreinterpret_u32_u8(dval), 1));
927
928                 dst4 = dst;
929                 mask += 4;
930                 dst += 4;
931             }
932             if (w&2)
933             {
934                 mval = vreinterpret_u8_u16(vld1_lane_u16((void *)mask, vreinterpret_u16_u8(mval), 1));
935                 dval = vreinterpret_u8_u16(vld1_lane_u16((void *)dst, vreinterpret_u16_u8(dval), 1));
936                 dst2 = dst;
937                 mask += 2;
938                 dst += 2;
939             }
940             if (w&1)
941             {
942                 mval = vld1_lane_u8(mask, mval, 1);
943                 dval = vld1_lane_u8(dst, dval, 1);
944             }
945
946             res = vqadd_u8(neon2mul(mval,sa),dval);
947
948             if (w&1)
949                 vst1_lane_u8(dst, res, 1);
950             if (w&2)
951                 vst1_lane_u16((void *)dst2, vreinterpret_u16_u8(res), 1);
952             if (w&4)
953                 vst1_lane_u32((void *)dst4, vreinterpret_u32_u8(res), 1);
954         }
955     }
956 }
957
958 #ifdef USE_GCC_INLINE_ASM
959
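/*
 * SRC copy between r5g6b5 buffers.  No conversion is needed, so the loop is
 * purely a block copy tuned with pld and multi-register vld1/vst1 transfers.
 */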
960 void
961 fbCompositeSrc_16x16neon (
962         pixman_implementation_t * impl,
963         pixman_op_t op,
964         pixman_image_t * pSrc,
965         pixman_image_t * pMask,
966         pixman_image_t * pDst,
967         int32_t      xSrc,
968         int32_t      ySrc,
969         int32_t      xMask,
970         int32_t      yMask,
971         int32_t      xDst,
972         int32_t      yDst,
973         int32_t      width,
974         int32_t      height)
975 {
976         uint16_t    *dstLine, *srcLine;
977         uint32_t     dstStride, srcStride;
978
979         if(!height || !width)
980                 return;
981
982         /* We simply copy 16-bit-aligned pixels from one place to another. */
983         fbComposeGetStart (pSrc, xSrc, ySrc, uint16_t, srcStride, srcLine, 1);
984         fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
985
986         /* Preload the first input scanline */
987         {
988                 uint16_t *srcPtr = srcLine;
989                 uint32_t count = width;
990
991                 asm volatile (
992                 "0: @ loop                                                      \n"
993                 "       subs    %[count], %[count], #32                         \n"
994                 "       pld     [%[src]]                                        \n"
995                 "       add     %[src], %[src], #64                             \n"
996                 "       bgt 0b                                                  \n"
997
998                 // Clobbered input registers marked as input/outputs
999                 : [src] "+r" (srcPtr), [count] "+r" (count)
1000                 : // no unclobbered inputs
1001                 : "cc"
1002                 );
1003         }
1004
1005         while(height--) {
1006                 uint16_t *dstPtr = dstLine;
1007                 uint16_t *srcPtr = srcLine;
1008                 uint32_t count = width;
1009                 uint32_t tmp = 0;
1010
1011                 // Uses multi-register access and preloading to maximise bandwidth.
1012                 // Each pixel is one halfword, so a quadword contains 8px.
1013                 // The preload frequency assumes a 64-byte cache line.
1014                 asm volatile (
1015                 "       cmp       %[count], #64                         \n"
1016                 "       blt 1f    @ skip oversized fragments            \n"
1017                 "0: @ start with eight quadwords at a time              \n"
1018                 "       pld       [%[src], %[srcStride], LSL #1]        \n" // preload from next scanline
1019                 "       sub       %[count], %[count], #64               \n"
1020                 "       vld1.16   {d16,d17,d18,d19}, [%[src]]!          \n"
1021                 "       vld1.16   {d20,d21,d22,d23}, [%[src]]!          \n"
1022                 "       pld       [%[src], %[srcStride], LSL #1]        \n" // preload from next scanline
1023                 "       vld1.16   {d24,d25,d26,d27}, [%[src]]!          \n"
1024                 "       vld1.16   {d28,d29,d30,d31}, [%[src]]!          \n"
1025                 "       cmp       %[count], #64                         \n"
1026                 "       vst1.16   {d16,d17,d18,d19}, [%[dst]]!          \n"
1027                 "       vst1.16   {d20,d21,d22,d23}, [%[dst]]!          \n"
1028                 "       vst1.16   {d24,d25,d26,d27}, [%[dst]]!          \n"
1029                 "       vst1.16   {d28,d29,d30,d31}, [%[dst]]!          \n"
1030                 "       bge 0b                                          \n"
1031                 "       cmp       %[count], #0                          \n"
1032                 "       beq 7f    @ aligned fastpath                    \n"
1033                 "1: @ four quadwords                                    \n"
1034                 "       tst       %[count], #32                         \n"
1035                 "       beq 2f    @ skip oversized fragment             \n"
1036                 "       pld       [%[src], %[srcStride], LSL #1]        \n" // preload from next scanline
1037                 "       vld1.16   {d16,d17,d18,d19}, [%[src]]!          \n"
1038                 "       vld1.16   {d20,d21,d22,d23}, [%[src]]!          \n"
1039                 "       vst1.16   {d16,d17,d18,d19}, [%[dst]]!          \n"
1040                 "       vst1.16   {d20,d21,d22,d23}, [%[dst]]!          \n"
1041                 "2: @ two quadwords                                     \n"
1042                 "       tst       %[count], #16                         \n"
1043                 "       beq 3f    @ skip oversized fragment             \n"
1044                 "       pld       [%[src], %[srcStride], LSL #1]        \n" // preload from next scanline
1045                 "       vld1.16   {d16,d17,d18,d19}, [%[src]]!          \n"
1046                 "       vst1.16   {d16,d17,d18,d19}, [%[dst]]!          \n"
1047                 "3: @ one quadword                                      \n"
1048                 "       tst       %[count], #8                          \n"
1049                 "       beq 4f    @ skip oversized fragment             \n"
1050                 "       vld1.16   {d16,d17}, [%[src]]!                  \n"
1051                 "       vst1.16   {d16,d17}, [%[dst]]!                  \n"
1052                 "4: @ one doubleword                                    \n"
1053                 "       tst       %[count], #4                          \n"
1054                 "       beq 5f    @ skip oversized fragment             \n"
1055                 "       vld1.16   {d16}, [%[src]]!                      \n"
1056                 "       vst1.16   {d16}, [%[dst]]!                      \n"
1057                 "5: @ one word                                          \n"
1058                 "       tst       %[count], #2                          \n"
1059                 "       beq 6f    @ skip oversized fragment             \n"
1060                 "       ldr       %[tmp], [%[src]], #4                  \n"
1061                 "       str       %[tmp], [%[dst]], #4                  \n"
1062                 "6: @ one halfword                                      \n"
1063                 "       tst       %[count], #1                          \n"
1064                 "       beq 7f    @ skip oversized fragment             \n"
1065                 "       ldrh      %[tmp], [%[src]]                      \n"
1066                 "       strh      %[tmp], [%[dst]]                      \n"
1067                 "7: @ end                                               \n"
1068
1069                 // Clobbered input registers marked as input/outputs
1070                 : [dst] "+r" (dstPtr), [src] "+r" (srcPtr), [count] "+r" (count), [tmp] "+r" (tmp)
1071
1072                 // Unclobbered input
1073                 : [srcStride] "r" (srcStride)
1074
1075                 // Clobbered vector registers
1076                 // NB: these are the quad aliases of the double registers used in the asm
1077                 : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
1078                 );
1079
1080                 srcLine += srcStride;
1081                 dstLine += dstStride;
1082         }
1083 }
1084
1085 #endif /* USE_GCC_INLINE_ASM */
1086
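/*
 * SRC conversion from x8r8g8b8 to r5g6b5.  The NEON paths deinterleave with
 * vld4, shift each channel to the top of a 16-bit lane with vshll, and merge
 * with vsri (the same packing as pack0565); leftover pixels fall back to the
 * scalar shift/mask code after the smallStuff label.
 */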
1087 void
1088 fbCompositeSrc_24x16neon (
1089         pixman_implementation_t * impl,
1090         pixman_op_t op,
1091         pixman_image_t * pSrc,
1092         pixman_image_t * pMask,
1093         pixman_image_t * pDst,
1094         int32_t      xSrc,
1095         int32_t      ySrc,
1096         int32_t      xMask,
1097         int32_t      yMask,
1098         int32_t      xDst,
1099         int32_t      yDst,
1100         int32_t      width,
1101         int32_t      height)
1102 {
1103         uint16_t    *dstLine;
1104         uint32_t    *srcLine;
1105         uint32_t     dstStride, srcStride;
1106
1107         if(!width || !height)
1108                 return;
1109
1110         /* We convert x8r8g8b8 pixels to r5g6b5, assuming that the source's alpha is opaque. */
1111         fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
1112         fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
1113
1114         /* Preload the first input scanline */
1115         {
1116                 uint8_t *srcPtr = (uint8_t*) srcLine;
1117                 uint32_t count = (width + 15) / 16;
1118
1119 #ifdef USE_GCC_INLINE_ASM
1120                 asm volatile (
1121                 "0: @ loop                                              \n"
1122                 "       subs    %[count], %[count], #1                  \n"
1123                 "       pld     [%[src]]                                \n"
1124                 "       add     %[src], %[src], #64                     \n"
1125                 "       bgt 0b                                          \n"
1126
1127                 // Clobbered input registers marked as input/outputs
1128                 : [src] "+r" (srcPtr), [count] "+r" (count)
1129                 : // no unclobbered inputs
1130                 : "cc"
1131                 );
1132 #else
1133                 do {
1134                         __pld(srcPtr);
1135                         srcPtr += 64;
1136                 } while(--count);
1137 #endif
1138         }
1139
1140         while(height--) {
1141                 uint16_t *dstPtr = dstLine;
1142                 uint32_t *srcPtr = srcLine;
1143                 uint32_t count = width;
1144                 const uint32_t RBmask = 0x1F;
1145                 const uint32_t Gmask = 0x3F;
1146
1147                 // If you're going to complain about a goto, take a long hard look
1148                 // at the massive blocks of assembler this skips over.  ;-)
1149                 if(count < 8)
1150                         goto smallStuff;
1151
1152 #ifdef USE_GCC_INLINE_ASM
1153
1154                 // This is not as aggressive as the RGB565-source case.
1155                 // Generally the source is in cached RAM when the formats are different, so we use preload.
1156                 // We don't need to blend, so we are not reading from the uncached framebuffer.
1157                 asm volatile (
1158                 "       cmp       %[count], #16                                                                         \n"
1159                 "       blt 1f    @ skip oversized fragments                                                            \n"
1160                 "0: @ start with sixteen pixels at a time                                                               \n"
1161                 "       sub       %[count], %[count], #16                                                               \n"
1162                 "       pld      [%[src], %[srcStride], lsl #2]         @ preload from next scanline                    \n"
1163                 "       vld4.8    {d0,d1,d2,d3}, [%[src]]!              @ d3 is alpha and ignored, d2-0 are rgb.        \n"
1164                 "       vld4.8    {d4,d5,d6,d7}, [%[src]]!              @ d7 is alpha and ignored, d6-4 are rgb.        \n"
1165                 "       vshll.u8  q8, d2, #8                            @ expand first red for repacking                \n"
1166                 "       vshll.u8  q10, d1, #8                           @ expand first green for repacking              \n"
1167                 "       vshll.u8  q11, d0, #8                           @ expand first blue for repacking               \n"
1168                 "       vshll.u8  q9, d6, #8                            @ expand second red for repacking               \n"
1169                 "       vsri.u16  q8, q10, #5                           @ insert first green after red                  \n"
1170                 "       vshll.u8  q10, d5, #8                           @ expand second green for repacking             \n"
1171                 "       vsri.u16  q8, q11, #11                          @ insert first blue after green                 \n"
1172                 "       vshll.u8  q11, d4, #8                           @ expand second blue for repacking              \n"
1173                 "       vsri.u16  q9, q10, #5                           @ insert second green after red                 \n"
1174                 "       vsri.u16  q9, q11, #11                          @ insert second blue after green                \n"
1175                 "       cmp       %[count], #16                                                                         \n"
1176                 "       vst1.16   {d16,d17,d18,d19}, [%[dst]]!          @ store 16 pixels                               \n"
1177                 "       bge 0b                                                                                          \n"
1178                 "1: @ end of main loop  \n"
1179                 "       cmp       %[count], #8                          @ can we still do an 8-pixel block?             \n"
1180                 "       blt 2f                                                                                          \n"
1181                 "       sub       %[count], %[count], #8        \n"
1182                 "       pld      [%[src], %[srcStride], lsl #2]         @ preload from next scanline                    \n"
1183                 "       vld4.8    {d0,d1,d2,d3}, [%[src]]!              @ d3 is alpha and ignored, d2-0 are rgb.        \n"
1184                 "       vshll.u8  q8, d2, #8                            @ expand first red for repacking                \n"
1185                 "       vshll.u8  q10, d1, #8                           @ expand first green for repacking              \n"
1186                 "       vshll.u8  q11, d0, #8                           @ expand first blue for repacking               \n"
1187                 "       vsri.u16  q8, q10, #5                           @ insert first green after red                  \n"
1188                 "       vsri.u16  q8, q11, #11                          @ insert first blue after green                 \n"
1189                 "       vst1.16   {d16,d17}, [%[dst]]!          @ store 8 pixels                                \n"
1190                 "2: @ end                                                                                               \n"
1191
1192                 // Clobbered input and working registers marked as input/outputs
1193                 : [dst] "+r" (dstPtr), [src] "+r" (srcPtr), [count] "+r" (count)
1194
1195                 // Unclobbered input
1196                 : [srcStride] "r" (srcStride)
1197
1198                 // Clobbered vector registers
1199                 // NB: these are the quad aliases of the double registers used in the asm
1200                 : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "cc", "memory"
1201                 );
1202 #else
1203                 // A copy of the above code, in intrinsics-form.
1204                 // This should be pretty self-documenting...
1205                 while(count >= 16) {
1206                         uint8x8x4_t pixelSetA, pixelSetB;
1207                         uint16x8_t redA, greenA, blueA;
1208                         uint16x8_t redB, greenB, blueB;
1209                         uint16x8_t destPixelsA, destPixelsB;
1210
1211                         count -= 16;
1212                         __pld(srcPtr + srcStride);
1213                         pixelSetA = vld4_u8((uint8_t*)(srcPtr));
1214                         pixelSetB = vld4_u8((uint8_t*)(srcPtr+8));
1215                         srcPtr += 16;
1216
1217                         redA   = vshll_n_u8(pixelSetA.val[2], 8);
1218                         greenA = vshll_n_u8(pixelSetA.val[1], 8);
1219                         blueA  = vshll_n_u8(pixelSetA.val[0], 8);
1220                         redB   = vshll_n_u8(pixelSetB.val[2], 8);
1221                         greenB = vshll_n_u8(pixelSetB.val[1], 8);
1222                         blueB  = vshll_n_u8(pixelSetB.val[0], 8);
1223                         destPixelsA = vsriq_n_u16(redA, greenA, 5);
1224                         destPixelsB = vsriq_n_u16(redB, greenB, 5);
1225                         destPixelsA = vsriq_n_u16(destPixelsA, blueA, 11);
1226                         destPixelsB = vsriq_n_u16(destPixelsB, blueB, 11);
1227
1228                         // There doesn't seem to be an intrinsic for the double-quadword variant
1229                         vst1q_u16(dstPtr  , destPixelsA);
1230                         vst1q_u16(dstPtr+8, destPixelsB);
1231                         dstPtr += 16;
1232                 }
1233
1234                 // 8-pixel loop
1235                 if(count >= 8) {
1236                         uint8x8x4_t pixelSetA;
1237                         uint16x8_t redA, greenA, blueA;
1238                         uint16x8_t destPixelsA;
1239
1240                         __pld(srcPtr + srcStride);
1241                         count -= 8;
1242                         pixelSetA = vld4_u8((uint8_t*)(srcPtr));
1243                         srcPtr += 8;
1244
1245                         redA   = vshll_n_u8(pixelSetA.val[2], 8);
1246                         greenA = vshll_n_u8(pixelSetA.val[1], 8);
1247                         blueA  = vshll_n_u8(pixelSetA.val[0], 8);
1248                         destPixelsA = vsriq_n_u16(redA, greenA, 5);
1249                         destPixelsA = vsriq_n_u16(destPixelsA, blueA, 11);
1250
1251                         vst1q_u16(dstPtr  , destPixelsA);
1252                         dstPtr += 8;
1253                 }
1254
1255 #endif  // USE_GCC_INLINE_ASM
1256
1257         smallStuff:
1258
1259                 if(count)
1260                         __pld(srcPtr + srcStride);
1261
1262                 while(count >= 2) {
1263                         uint32_t srcPixelA = *srcPtr++;
1264                         uint32_t srcPixelB = *srcPtr++;
1265
1266                         // ARM is really good at shift-then-ALU ops.
1267                         // This should be a total of six shift-ANDs and five shift-ORs.
1268                         uint32_t dstPixelsA;
1269                         uint32_t dstPixelsB;
1270
1271                         dstPixelsA  = ((srcPixelA >>  3) & RBmask);
1272                         dstPixelsA |= ((srcPixelA >> 10) &  Gmask) << 5;
1273                         dstPixelsA |= ((srcPixelA >> 19) & RBmask) << 11;
1274
1275                         dstPixelsB  = ((srcPixelB >>  3) & RBmask);
1276                         dstPixelsB |= ((srcPixelB >> 10) &  Gmask) << 5;
1277                         dstPixelsB |= ((srcPixelB >> 19) & RBmask) << 11;
1278
1279                         // little-endian mode only
1280                         *((uint32_t*) dstPtr) = dstPixelsA | (dstPixelsB << 16);
1281                         dstPtr += 2;
1282                         count -= 2;
1283                 }
1284
1285                 if(count) {
1286                         uint32_t srcPixel = *srcPtr++;
1287
1288                         // ARM is really good at shift-then-ALU ops.
1289                         // This block should end up as three shift-ANDs and two shift-ORs.
1290                         uint32_t tmpBlue  = (srcPixel >>  3) & RBmask;
1291                         uint32_t tmpGreen = (srcPixel >> 10) & Gmask;
1292                         uint32_t tmpRed   = (srcPixel >> 19) & RBmask;
1293                         uint16_t dstPixel = (tmpRed << 11) | (tmpGreen << 5) | tmpBlue;
1294
1295                         *dstPtr++ = dstPixel;
1296                         count--;
1297                 }
1298
1299                 srcLine += srcStride;
1300                 dstLine += dstStride;
1301         }
1302 }
1303
1304
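/*
 * Solid fill at 8, 16 or 32 bpp.  The colour is replicated into a 32-bit
 * pattern, width is converted to bytes, and narrow fills (< 16 bytes per
 * scanline) take a separate path that avoids 128-bit stores entirely.
 */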
1305 pixman_bool_t
1306 pixman_fill_neon (uint32_t *bits,
1307                   int stride,
1308                   int bpp,
1309                   int x,
1310                   int y,
1311                   int width,
1312                   int height,
1313                   uint32_t _xor)
1314 {
1315     uint32_t byte_stride, color;
1316     char *dst;
1317
1318     /* stride is always a multiple of 32-bit units in pixman */
1319     byte_stride = stride * sizeof(uint32_t);
1320
1321     switch (bpp)
1322     {
1323         case 8:
1324             dst = ((char *) bits) + y * byte_stride + x;
1325             _xor &= 0xff;
1326             color = _xor << 24 | _xor << 16 | _xor << 8 | _xor;
1327             break;
1328         case 16:
1329             dst = ((char *) bits) + y * byte_stride + x * 2;
1330             _xor &= 0xffff;
1331             color = _xor << 16 | _xor;
1332             width *= 2;     /* width to bytes */
1333             break;
1334         case 32:
1335             dst = ((char *) bits) + y * byte_stride + x * 4;
1336             color = _xor;
1337             width *= 4;     /* width to bytes */
1338             break;
1339         default:
1340             return FALSE;
1341     }
1342
1343 #ifdef USE_GCC_INLINE_ASM
1344     if (width < 16)
1345         /* Special case for small widths that cannot use wide 128-bit
1346            stores anyway. We don't waste time trying to align writes,
1347            since there are only a few of them */
1348         asm volatile (
1349         "cmp            %[height], #0\n" /* Check if empty fill */
1350         "beq            3f\n"
1351         "vdup.32        d0, %[color]\n"  /* Replicate the color into a NEON register */
1352
1353         /* Check whether the width can be handled with a single operation per
1354            scanline. This significantly reduces the number of test/branch
1355            instructions for each scanline */
1356         "cmp            %[width], #8\n"
1357         "beq            4f\n"
1358         "cmp            %[width], #4\n"
1359         "beq            5f\n"
1360         "cmp            %[width], #2\n"
1361         "beq            6f\n"
1362
1363         /* Loop starts here for each scanline */
1364         "1:\n"
1365         "mov            r4, %[dst]\n"    /* Starting address of the current line */
1366         "tst            %[width], #8\n"
1367         "beq            2f\n"
1368         "vst1.8         {d0}, [r4]!\n"
1369         "2:\n"
1370         "tst            %[width], #4\n"
1371         "beq            2f\n"
1372         "str            %[color], [r4], #4\n"
1373         "2:\n"
1374         "tst            %[width], #2\n"
1375         "beq            2f\n"
1376         "strh           %[color], [r4], #2\n"
1377         "2:\n"
1378         "tst            %[width], #1\n"
1379         "beq            2f\n"
1380         "strb           %[color], [r4], #1\n"
1381         "2:\n"
1382
1383         "subs           %[height], %[height], #1\n"
1384         "add            %[dst], %[dst], %[byte_stride]\n"
1385         "bne            1b\n"
1386         "b              3f\n"
1387
1388         /* Special fillers for widths that can be done with a single operation */
1389         "4:\n"
1390         "subs           %[height], %[height], #1\n"
1391         "vst1.8         {d0}, [%[dst]]\n"
1392         "add            %[dst], %[dst], %[byte_stride]\n"
1393         "bne            4b\n"
1394         "b              3f\n"
1395
1396         "5:\n"
1397         "subs           %[height], %[height], #1\n"
1398         "str            %[color], [%[dst]]\n"
1399         "add            %[dst], %[dst], %[byte_stride]\n"
1400         "bne            5b\n"
1401         "b              3f\n"
1402
1403         "6:\n"
1404         "subs           %[height], %[height], #1\n"
1405         "strh           %[color], [%[dst]]\n"
1406         "add            %[dst], %[dst], %[byte_stride]\n"
1407         "bne            6b\n"
1408
1409         "3:\n"
1410         : /* Clobbered input registers marked as input/outputs */
1411           [height] "+r" (height), [dst] "+r" (dst)
1412         : [color] "r" (color), [width] "r" (width), [byte_stride] "r" (byte_stride)
1413         : "memory", "cc", "d0", "r4");
1414     else
1415         asm volatile (
1416         "cmp            %[height], #0\n" /* Check if empty fill */
1417         "beq            5f\n"
1418         "vdup.32        q0, %[color]\n"  /* Replicate the color into a NEON register */
1419
1420         /* Loop starts here for each scanline */
1421         "1:\n"
1422         "mov            r4, %[dst]\n"    /* Starting address of the current line */
1423         "mov            r5, %[width]\n"  /* We're going to write this many bytes */
1424         "ands           r6, r4, #15\n"   /* Are we at the 128-bit aligned address? */
1425         "beq            2f\n"            /* Jump to the best case */
1426
1427         /* We're not 128-bit aligned, but we know that we can get to the
1428            next aligned location, since the fill is at least 16 bytes wide */
1429         "rsb            r6, r6, #16\n"   /* We would need to go forward this much */
1430         "sub            r5, r5, r6\n"    /* Update bytes left */
1431         "tst            r6, #1\n"
1432         "beq            6f\n"
1433         "vst1.8         {d0[0]}, [r4]!\n"/* Store byte, now we're 16-bit aligned */
1434         "6:\n"
1435         "tst            r6, #2\n"
1436         "beq            6f\n"
1437         "vst1.16        {d0[0]}, [r4, :16]!\n"/* Store halfword, now we're 32-bit aligned */
1438         "6:\n"
1439         "tst            r6, #4\n"
1440         "beq            6f\n"
1441         "vst1.32        {d0[0]}, [r4, :32]!\n"/* Store word, now we're 64-bit aligned */
1442         "6:\n"
1443         "tst            r6, #8\n"
1444         "beq            2f\n"
1445         "vst1.64        {d0}, [r4, :64]!\n"    /* Store doubleword, now we're 128-bit aligned */
1446
1447         /* The good case: We're 128-bit aligned for this scanline */
1448         "2:\n"
1449         "and            r6, r5, #15\n"        /* Number of trailing bytes */
1450         "cmp            r5, r6\n"             /* Do we have at least one qword to write? */
1451         "beq            6f\n"                 /* No, we just write the tail */
1452         "lsr            r5, r5, #4\n"         /* This many full qwords to write */
1453
1454         /* The main block: Do 128-bit aligned writes */
1455         "3:\n"
1456         "subs           r5, r5, #1\n"
1457         "vst1.64        {d0,d1}, [r4, :128]!\n"
1458         "bne            3b\n"
1459
1460         /* Handle the trailing bytes: do 64, 32, 16 and 8-bit aligned writes as needed.
1461             We know that we're currently at a 128-bit aligned address, so we can just
1462             pick the biggest operations that the remaining write width allows */
1463         "6:\n"
1464         "cmp            r6, #0\n"
1465         "beq            4f\n"
1466         "tst            r6, #8\n"
1467         "beq            6f\n"
1468         "vst1.64        {d0}, [r4, :64]!\n"
1469         "6:\n"
1470         "tst            r6, #4\n"
1471         "beq            6f\n"
1472         "vst1.32        {d0[0]}, [r4, :32]!\n"
1473         "6:\n"
1474         "tst            r6, #2\n"
1475         "beq            6f\n"
1476         "vst1.16        {d0[0]}, [r4, :16]!\n"
1477         "6:\n"
1478         "tst            r6, #1\n"
1479         "beq            4f\n"
1480         "vst1.8         {d0[0]}, [r4]!\n"
1481         "4:\n"
1482
1483         /* Handle the next scanline */
1484         "subs           %[height], %[height], #1\n"
1485         "add            %[dst], %[dst], %[byte_stride]\n"
1486         "bne            1b\n"
1487         "5:\n"
1488         : /* Clobbered input registers marked as input/outputs */
1489           [height] "+r" (height), [dst] "+r" (dst)
1490         : [color] "r" (color), [width] "r" (width), [byte_stride] "r" (byte_stride)
1491         : "memory", "cc", "q0", "d0", "d1", "r4", "r5", "r6");
1492
1493     return TRUE;
1494
1495 #else
1496
1497     // TODO: intrinsic version for armcc
1498     return FALSE;
1499
1500 #endif
1501 }
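
/* Illustrative sketch only, not part of the original code: a possible starting
   point for the "intrinsic version for armcc" TODO above.  It assumes the same
   byte-oriented dst/width/color values that pixman_fill_neon computes (width
   already converted to bytes, little-endian byte order) and makes no attempt to
   align the destination, unlike the inline-assembly path.  The function name is
   hypothetical and nothing calls it. */
static inline void fill_scanlines_intrinsics_sketch (char *dst, uint32_t byte_stride,
                                                     uint32_t width, int height,
                                                     uint32_t color)
{
    uint8x16_t wide = vreinterpretq_u8_u32 (vdupq_n_u32 (color));

    while (height--)
    {
        char    *ptr  = dst;
        uint32_t left = width;

        /* unaligned 16-byte stores of the replicated colour */
        while (left >= 16)
        {
            vst1q_u8 ((uint8_t*) ptr, wide);
            ptr  += 16;
            left -= 16;
        }

        /* byte-sized tail: the colour pattern repeats every 4 bytes */
        while (left--)
        {
            *ptr = (char) (color >> (((ptr - dst) & 3) * 8));
            ptr++;
        }

        dst += byte_stride;
    }
}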
1502
1503
1504 // TODO: is a more generic way of doing this being introduced?
1505 #define NEON_SCANLINE_BUFFER_PIXELS (1024)
1506
1507 static inline void QuadwordCopy_neon(
1508         void* dst,
1509         void* src,
1510         uint32_t count,       // of quadwords
1511         uint32_t trailerCount // of bytes
1512 )
1513 {
1514         uint8_t *tDst = dst, *tSrc = src;
1515
1516         // Uses aligned multi-register loads to maximise read bandwidth
1517         // on uncached memory such as framebuffers.
1518         // The accesses do not have the aligned qualifiers, so that the copy
1519         // may convert between aligned-uncached and unaligned-cached memory.
1520         // It is assumed that the CPU can infer alignment from the address.
1521
1522 #ifdef USE_GCC_INLINE_ASM
1523
1524         asm volatile (
1525         "       cmp       %[count], #8                                          \n"
1526         "       blt 1f    @ skip oversized fragments            \n"
1527         "0: @ start with eight quadwords at a time              \n"
1528         "       sub       %[count], %[count], #8                        \n"
1529         "       vld1.8    {d16,d17,d18,d19}, [%[src]]!          \n"
1530         "       vld1.8    {d20,d21,d22,d23}, [%[src]]!          \n"
1531         "       vld1.8    {d24,d25,d26,d27}, [%[src]]!          \n"
1532         "       vld1.8    {d28,d29,d30,d31}, [%[src]]!          \n"
1533         "       cmp       %[count], #8                                          \n"
1534         "       vst1.8    {d16,d17,d18,d19}, [%[dst]]!          \n"
1535         "       vst1.8    {d20,d21,d22,d23}, [%[dst]]!          \n"
1536         "       vst1.8    {d24,d25,d26,d27}, [%[dst]]!          \n"
1537         "       vst1.8    {d28,d29,d30,d31}, [%[dst]]!          \n"
1538         "       bge 0b                                                                          \n"
1539         "1: @ four quadwords                                                    \n"
1540         "       tst       %[count], #4                                          \n"
1541         "       beq 2f    @ skip oversized fragment                     \n"
1542         "       vld1.8    {d16,d17,d18,d19}, [%[src]]!          \n"
1543         "       vld1.8    {d20,d21,d22,d23}, [%[src]]!          \n"
1544         "       vst1.8    {d16,d17,d18,d19}, [%[dst]]!          \n"
1545         "       vst1.8    {d20,d21,d22,d23}, [%[dst]]!          \n"
1546         "2: @ two quadwords                                                             \n"
1547         "       tst       %[count], #2                                          \n"
1548         "       beq 3f    @ skip oversized fragment                     \n"
1549         "       vld1.8    {d16,d17,d18,d19}, [%[src]]!          \n"
1550         "       vst1.8    {d16,d17,d18,d19}, [%[dst]]!          \n"
1551         "3: @ one quadword                                                              \n"
1552         "       tst       %[count], #1                                          \n"
1553         "       beq 4f    @ skip oversized fragment                     \n"
1554         "       vld1.8    {d16,d17}, [%[src]]!                          \n"
1555         "       vst1.8    {d16,d17}, [%[dst]]!                          \n"
1556         "4: @ end                                                                               \n"
1557
1558         // Clobbered input registers marked as input/outputs
1559         : [dst] "+r" (tDst), [src] "+r" (tSrc), [count] "+r" (count)
1560
1561         // No unclobbered inputs
1562         :
1563
1564         // Clobbered vector registers
1565         // NB: these are the quad aliases of the double registers used in the asm
1566         : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
1567         );
1568
1569 #else
1570
1571         while(count >= 8) {
1572                 uint8x16x4_t t1 = vld4q_u8(tSrc);
1573                 uint8x16x4_t t2 = vld4q_u8(tSrc + sizeof(uint8x16x4_t));
1574                 tSrc += sizeof(uint8x16x4_t) * 2;
1575                 vst4q_u8(tDst, t1);
1576                 vst4q_u8(tDst + sizeof(uint8x16x4_t), t2);
1577                 tDst += sizeof(uint8x16x4_t) * 2;
1578                 count -= 8;
1579         }
1580
1581         if(count & 4) {
1582                 uint8x16x4_t t1 = vld4q_u8(tSrc);
1583                 tSrc += sizeof(uint8x16x4_t);
1584                 vst4q_u8(tDst, t1);
1585                 tDst += sizeof(uint8x16x4_t);
1586         }
1587
1588         if(count & 2) {
1589                 uint8x8x4_t t1 = vld4_u8(tSrc);
1590                 tSrc += sizeof(uint8x8x4_t);
1591                 vst4_u8(tDst, t1);
1592                 tDst += sizeof(uint8x8x4_t);
1593         }
1594
1595         if(count & 1) {
1596                 uint8x16_t t1 = vld1q_u8(tSrc);
1597                 tSrc += sizeof(uint8x16_t);
1598                 vst1q_u8(tDst, t1);
1599                 tDst += sizeof(uint8x16_t);
1600         }
1601
1602 #endif  // !USE_GCC_INLINE_ASM
1603
1604         if(trailerCount) {
1605                 if(trailerCount & 8) {
1606                         uint8x8_t t1 = vld1_u8(tSrc);
1607                         tSrc += sizeof(uint8x8_t);
1608                         vst1_u8(tDst, t1);
1609                         tDst += sizeof(uint8x8_t);
1610                 }
1611
1612                 if(trailerCount & 4) {
1613                         *((uint32_t*) tDst) = *((uint32_t*) tSrc);
1614                         tDst += 4;
1615                         tSrc += 4;
1616                 }
1617
1618                 if(trailerCount & 2) {
1619                         *((uint16_t*) tDst) = *((uint16_t*) tSrc);
1620                         tDst += 2;
1621                         tSrc += 2;
1622                 }
1623
1624                 if(trailerCount & 1) {
1625                         *tDst++ = *tSrc++;
1626                 }
1627         }
1628 }
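
/* Usage sketch (illustrative, not called anywhere): QuadwordCopy_neon takes a
   whole-quadword count plus a byte trailer, so copying an arbitrary number of
   bytes means splitting it the way the callers below do with expressions such
   as "width >> 4, width & 0xF". */
static inline void CopyBytes_neon_sketch(void* dst, void* src, uint32_t numBytes)
{
        QuadwordCopy_neon(dst, src, numBytes >> 4, numBytes & 0xF);
}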
1629
1630 static inline void SolidOver565_8pix_neon(
1631         uint32_t  glyphColour,
1632         uint16_t *dest,
1633         uint8_t  *inMask,
1634         uint32_t  destStride,  // bytes, not elements
1635         uint32_t  maskStride,
1636         uint32_t  count        // 8-pixel groups
1637 )
1638 {
1639         // Inner loop of glyph blitter (solid colour, alpha mask)
1640
1641 #ifdef USE_GCC_INLINE_ASM
1642
1643         asm volatile (
1644         "       vld4.8 {d20[],d21[],d22[],d23[]}, [%[glyphColour]]  @ splat solid colour components     \n"
1645         "0:     @ loop                                                                                                                                                          \n"
1646         "       vld1.16   {d0,d1}, [%[dest]]         @ load first pixels from framebuffer                       \n"
1647         "       vld1.8    {d17}, [%[inMask]]         @ load alpha mask of glyph                                         \n"
1648         "       vmull.u8  q9, d17, d23               @ apply glyph colour alpha to mask                         \n"
1649         "       vshrn.u16 d17, q9, #8                @ reformat it to match original mask                       \n"
1650         "       vmvn      d18, d17                   @ we need the inverse mask for the background      \n"
1651         "       vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits                          \n"
1652         "       vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels                       \n"
1653         "       vshrn.u16 d4, q0, #3                 @ unpack green                                                                     \n"
1654         "       vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)                       \n"
1655         "       vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)          \n"
1656         "       vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)                     \n"
1657         "       vmull.u8  q1, d2, d18                @ apply inverse mask to background red...          \n"
1658         "       vmull.u8  q2, d4, d18                @ ...green...                                                                      \n"
1659         "       vmull.u8  q3, d6, d18                @ ...blue                                                                          \n"
1660         "       subs      %[count], %[count], #1     @ decrement/test loop counter                                      \n"
1661         "       vmlal.u8  q1, d17, d22               @ add masked foreground red...                                     \n"
1662         "       vmlal.u8  q2, d17, d21               @ ...green...                                                                      \n"
1663         "       vmlal.u8  q3, d17, d20               @ ...blue                                                                          \n"
1664         "       add %[inMask], %[inMask], %[maskStride] @ advance mask pointer, while we wait           \n"
1665         "       vsri.16   q1, q2, #5                 @ pack green behind red                                            \n"
1666         "       vsri.16   q1, q3, #11                @ pack blue into pixels                                            \n"
1667         "       vst1.16   {d2,d3}, [%[dest]]         @ store composited pixels                                          \n"
1668         "       add %[dest], %[dest], %[destStride]  @ advance framebuffer pointer                                      \n"
1669         "       bne 0b                               @ next please                                                                      \n"
1670
1671         // Clobbered registers marked as input/outputs
1672         : [dest] "+r" (dest), [inMask] "+r" (inMask), [count] "+r" (count)
1673
1674         // Inputs
1675         : [destStride] "r" (destStride), [maskStride] "r" (maskStride), [glyphColour] "r" (&glyphColour)
1676
1677         // Clobbers, including the inputs we modify, and potentially lots of memory
1678         : "q0", "q1", "q2", "q3", "d17", "q9", "q10", "q11", "q12", "cc", "memory"
1679         );
1680
1681 #else
1682
1683         uint8x8x4_t solidColour = vld4_dup_u8((uint8_t*) &glyphColour);
1684
1685         while(count--)
1686         {
1687                 uint16x8_t  pixels = vld1q_u16(dest);
1688                 uint8x8_t   mask = vshrn_n_u16(vmull_u8(solidColour.val[3], vld1_u8(inMask)), 8);
1689                 uint8x8_t  iMask = vmvn_u8(mask);
1690
1691                 uint8x8_t  tRed   = vshrn_n_u16(pixels, 8);
1692                 uint8x8_t  tGreen = vshrn_n_u16(pixels, 3);
1693                 uint8x8_t  tBlue  = vshrn_n_u16(vsliq_n_u16(pixels, pixels, 5), 2);
1694
1695                 uint16x8_t sRed   = vmull_u8(vsri_n_u8(tRed  , tRed  , 5), iMask);
1696                 uint16x8_t sGreen = vmull_u8(vsri_n_u8(tGreen, tGreen, 6), iMask);
1697                 uint16x8_t sBlue  = vmull_u8(          tBlue             , iMask);
1698
1699                 sRed   = vmlal_u8(sRed  , mask, solidColour.val[2]);
1700                 sGreen = vmlal_u8(sGreen, mask, solidColour.val[1]);
1701                 sBlue  = vmlal_u8(sBlue , mask, solidColour.val[0]);
1702
1703                 pixels = vsriq_n_u16(sRed, sGreen, 5);
1704                 pixels = vsriq_n_u16(pixels, sBlue, 11);
1705                 vst1q_u16(dest, pixels);
1706
1707                 dest = (uint16_t*) ((uint8_t*) dest + destStride); // destStride is in bytes
1708                 inMask += maskStride;
1709         }
1710
1711 #endif
1712 }
1713
1714 void
1715 fbCompositeSolidMask_nx8x0565neon (
1716         pixman_implementation_t * impl,
1717         pixman_op_t op,
1718         pixman_image_t * pSrc,
1719         pixman_image_t * pMask,
1720         pixman_image_t * pDst,
1721         int32_t      xSrc,
1722         int32_t      ySrc,
1723         int32_t      xMask,
1724         int32_t      yMask,
1725         int32_t      xDst,
1726         int32_t      yDst,
1727         int32_t      width,
1728         int32_t      height)
1729 {
1730         uint32_t     src, srca;
1731         uint16_t    *dstLine, *alignedLine;
1732         uint8_t     *maskLine;
1733         uint32_t     dstStride, maskStride;
1734         uint32_t     kernelCount, copyCount;
1735         uint8_t      kernelOffset, copyOffset;
1736
1737         _pixman_image_get_solid(pSrc, src, pDst->bits.format);
1738
1739         // bail out if fully transparent or degenerate
1740         srca = src >> 24;
1741         if(srca == 0)
1742                 return;
1743         if(width == 0 || height == 0)
1744                 return;
1745
1746         if(width > NEON_SCANLINE_BUFFER_PIXELS) {
1747                 // split the blit, so we can use a fixed-size scanline buffer
1748                 // TODO: there must be a more elegant way of doing this.
1749                 int x;
1750                 for(x=0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) {
1751                         fbCompositeSolidMask_nx8x0565neon(impl, op, pSrc, pMask, pDst, xSrc+x, ySrc, xMask+x, yMask, xDst+x, yDst,
1752                                                                                           (x+NEON_SCANLINE_BUFFER_PIXELS > width) ? width-x : NEON_SCANLINE_BUFFER_PIXELS, height);
1753                 }
1754                 return;
1755         }
1756
1757         fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
1758         fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
1759
1760         // cover the width with the minimum number of aligned quadwords,
1761         // while also keeping the number of columns to process to a minimum
1762         {
1763                 unsigned long alignedLeft = (unsigned long)(dstLine) & ~0xF;
1764                 unsigned long alignedRight = (((unsigned long)(dstLine + width)) + 0xF) & ~0xF;
1765                 unsigned long ceilingLength = (((unsigned long) width) * sizeof(*dstLine) + 0xF) & ~0xF;
1766
1767                 // the fast copy must always be quadword aligned
1768                 copyOffset = dstLine - ((uint16_t*) alignedLeft);
1769                 alignedLine = dstLine - copyOffset;
1770                 copyCount = (uint32_t) ((alignedRight - alignedLeft) >> 4);
1771
1772                 if(alignedRight - alignedLeft > ceilingLength) {
1773                         // unaligned routine is tightest, and will not overrun
1774                         kernelCount = (uint32_t) (ceilingLength >> 4);
1775                         kernelOffset = copyOffset;
1776                 } else {
1777                         // aligned routine is equally tight, so it is safer to align
1778                         kernelCount = copyCount;
1779                         kernelOffset = 0;
1780                 }
1781         }
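        /* Worked example (illustrative): if dstLine is at ...0x1006 and width is 10,
           then alignedLeft = ...0x1000, alignedRight = ...0x1020 and ceilingLength =
           0x20 (20 bytes rounded up).  That gives copyOffset = 3 pixels and
           copyCount = 2 quadwords; the aligned window is not larger than
           ceilingLength, so the aligned kernel is chosen: kernelCount = 2,
           kernelOffset = 0. */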
1782
1783         {
1784                 uint16_t scanLine[NEON_SCANLINE_BUFFER_PIXELS + 8]; // deliberately not initialised
1785                 uint8_t glyphLine[NEON_SCANLINE_BUFFER_PIXELS + 8];
1786                 int y = height;
1787
1788                 // row-major order
1789                 // left edge, middle block, right edge
1790                 for( ; y--; maskLine += maskStride, alignedLine += dstStride, dstLine += dstStride) {
1791                         // We don't want to overrun the edges of the glyph, so realign the edge data into known buffers
1792                         QuadwordCopy_neon(glyphLine + copyOffset, maskLine, width >> 4, width & 0xF);
1793
1794                         // Uncached framebuffer access is really, really slow if we do it piecemeal.
1795                         // It should be much faster if we grab it all at once.
1796                         // One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth.
1797                         QuadwordCopy_neon(scanLine, alignedLine, copyCount, 0);
1798
1799                         // Apply the actual filter
1800                         SolidOver565_8pix_neon(src, scanLine + kernelOffset, glyphLine + kernelOffset, 8 * sizeof(*dstLine), 8, kernelCount);
1801
1802                         // Copy the modified scanline back
1803                         QuadwordCopy_neon(dstLine, scanLine + copyOffset, width >> 3, (width & 7) * 2);
1804                 }
1805         }
1806 }
1807
1808 #ifdef USE_GCC_INLINE_ASM
1809
1810 static inline void PlainOver565_8pix_neon(
1811         uint32_t  colour,
1812         uint16_t *dest,
1813         uint32_t  destStride,  // bytes, not elements
1814         uint32_t  count        // 8-pixel groups
1815 )
1816 {
1817         // Inner loop for plain translucent rects (solid colour without alpha mask)
1818         asm volatile (
1819         "       vld4.8   {d20[],d21[],d22[],d23[]}, [%[colour]]  @ solid colour load/splat \n"
1820         "       vmull.u8  q12, d23, d22              @ premultiply alpha red   \n"
1821         "       vmull.u8  q13, d23, d21              @ premultiply alpha green \n"
1822         "       vmull.u8  q14, d23, d20              @ premultiply alpha blue  \n"
1823         "       vmvn      d18, d23                   @ inverse alpha for background \n"
1824         "0:     @ loop\n"
1825         "       vld1.16   {d0,d1}, [%[dest]]         @ load first pixels from framebuffer       \n"
1826         "       vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels       \n"
1827         "       vshrn.u16 d4, q0, #3                 @ unpack green                             \n"
1828         "       vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits          \n"
1829         "       vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)       \n"
1830         "       vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)     \n"
1831         "       vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)  \n"
1832         "       vmov      q0, q12                    @ retrieve foreground red   \n"
1833         "       vmlal.u8  q0, d2, d18                @ blend red - my kingdom for a four-operand MLA \n"
1834         "       vmov      q1, q13                    @ retrieve foreground green \n"
1835         "       vmlal.u8  q1, d4, d18                @ blend green               \n"
1836         "       vmov      q2, q14                    @ retrieve foreground blue  \n"
1837         "       vmlal.u8  q2, d6, d18                @ blend blue                \n"
1838         "       subs      %[count], %[count], #1     @ decrement/test loop counter              \n"
1839         "       vsri.16   q0, q1, #5                 @ pack green behind red                    \n"
1840         "       vsri.16   q0, q2, #11                @ pack blue into pixels                    \n"
1841         "       vst1.16   {d0,d1}, [%[dest]]         @ store composited pixels                  \n"
1842         "       add %[dest], %[dest], %[destStride]  @ advance framebuffer pointer              \n"
1843         "       bne 0b                               @ next please                              \n"
1844
1845         // Clobbered registers marked as input/outputs
1846         : [dest] "+r" (dest), [count] "+r" (count)
1847
1848         // Inputs
1849         : [destStride] "r" (destStride), [colour] "r" (&colour)
1850
1851         // Clobbers, including the inputs we modify, and potentially lots of memory
1852         : "q0", "q1", "q2", "q3", "q9", "q10", "q11", "q12", "q13", "q14", "cc", "memory"
1853         );
1854 }
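
/* Scalar reference (illustrative only, nothing calls it): the per-pixel math that
   the NEON loop above performs on 8 pixels at a time -- expand the 565 background
   to 8 bits per channel, blend fg*alpha + bg*(255-alpha), and keep the top bits of
   each 16-bit product when repacking.  It assumes a little-endian a8r8g8b8 colour,
   ignores the exact bit-insertion rounding of the vsri-based blue expansion, and
   the function name is hypothetical. */
static inline uint16_t PlainOver565_1pix_scalar_sketch(uint32_t colour, uint16_t bg)
{
        uint32_t a  = colour >> 24;
        uint32_t na = 255 - a;

        /* expand r5g6b5 background to roughly 8 bits per channel */
        uint32_t bgR = (bg >> 11) & 0x1F;  bgR = (bgR << 3) | (bgR >> 2);
        uint32_t bgG = (bg >>  5) & 0x3F;  bgG = (bgG << 2) | (bgG >> 4);
        uint32_t bgB =  bg        & 0x1F;  bgB = (bgB << 3) | (bgB >> 2);

        /* 16-bit blended products, as in the vmull/vmlal pairs above */
        uint32_t r = ((colour >> 16) & 0xFF) * a + bgR * na;
        uint32_t g = ((colour >>  8) & 0xFF) * a + bgG * na;
        uint32_t b = ( colour        & 0xFF) * a + bgB * na;

        /* keep the top 5/6/5 bits of each 16-bit product, as the vsri.16 packing does */
        return (uint16_t) (((r >> 11) << 11) | ((g >> 10) << 5) | (b >> 11));
}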
1855
1856 void
1857 fbCompositeSolid_nx0565neon (
1858         pixman_implementation_t * impl,
1859         pixman_op_t op,
1860         pixman_image_t * pSrc,
1861         pixman_image_t * pMask,
1862         pixman_image_t * pDst,
1863         int32_t      xSrc,
1864         int32_t      ySrc,
1865         int32_t      xMask,
1866         int32_t      yMask,
1867         int32_t      xDst,
1868         int32_t      yDst,
1869         int32_t      width,
1870         int32_t      height)
1871 {
1872         uint32_t     src, srca;
1873         uint16_t    *dstLine, *alignedLine;
1874         uint32_t     dstStride;
1875         uint32_t     kernelCount, copyCount;
1876         uint8_t      kernelOffset, copyOffset;
1877
1878         _pixman_image_get_solid(pSrc, src, pDst->bits.format);
1879
1880         // bail out if fully transparent
1881         srca = src >> 24;
1882         if(srca == 0)
1883                 return;
1884         if(width == 0 || height == 0)
1885                 return;
1886
1887         if(width > NEON_SCANLINE_BUFFER_PIXELS) {
1888                 // split the blit, so we can use a fixed-size scanline buffer
1889                 // TODO: there must be a more elegant way of doing this.
1890                 int x;
1891                 for(x=0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) {
1892                         fbCompositeSolid_nx0565neon(impl, op, pSrc, pMask, pDst, xSrc+x, ySrc, xMask+x, yMask, xDst+x, yDst,
1893                                                                                 (x+NEON_SCANLINE_BUFFER_PIXELS > width) ? width-x : NEON_SCANLINE_BUFFER_PIXELS, height);
1894                 }
1895                 return;
1896         }
1897
1898         fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
1899
1900         // cover the width with the minimum number of aligned quadwords,
1901         // while also keeping the number of columns to process to a minimum
1902         {
1903                 unsigned long alignedLeft = (unsigned long)(dstLine) & ~0xF;
1904                 unsigned long alignedRight = (((unsigned long)(dstLine + width)) + 0xF) & ~0xF;
1905                 unsigned long ceilingLength = (((unsigned long) width) * sizeof(*dstLine) + 0xF) & ~0xF;
1906
1907                 // the fast copy must always be quadword aligned
1908                 copyOffset = dstLine - ((uint16_t*) alignedLeft);
1909                 alignedLine = dstLine - copyOffset;
1910                 copyCount = (uint32_t) ((alignedRight - alignedLeft) >> 4);
1911
1912                 if(alignedRight - alignedLeft > ceilingLength) {
1913                         // unaligned routine is tightest, and will not overrun
1914                         kernelCount = (uint32_t) (ceilingLength >> 4);
1915                         kernelOffset = copyOffset;
1916                 } else {
1917                         // aligned routine is equally tight, so it is safer to align
1918                         kernelCount = copyCount;
1919                         kernelOffset = 0;
1920                 }
1921         }
1922
1923         {
1924                 uint16_t scanLine[NEON_SCANLINE_BUFFER_PIXELS + 8]; // deliberately not initialised
1925
1926                 // row-major order
1927                 // left edge, middle block, right edge
1928                 for( ; height--; alignedLine += dstStride, dstLine += dstStride) {
1929
1930                         // Uncached framebuffer access is really, really slow if we do it piecemeal.
1931                         // It should be much faster if we grab it all at once.
1932                         // One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth.
1933                         QuadwordCopy_neon(scanLine, alignedLine, copyCount, 0);
1934
1935                         // Apply the actual filter
1936                         PlainOver565_8pix_neon(src, scanLine + kernelOffset, 8 * sizeof(*dstLine), kernelCount);
1937
1938                         // Copy the modified scanline back
1939                         QuadwordCopy_neon(dstLine, scanLine + copyOffset, width >> 3, (width & 7) * 2);
1940                 }
1941         }
1942 }
1943
1944 static inline void ARGB8_Over565_8pix_neon(
1945         uint32_t *src,
1946         uint16_t *dest,
1947         uint32_t  srcStride,  // bytes, not elements
1948         uint32_t  count        // 8-pixel groups
1949 )
1950 {
1951         asm volatile (
1952         "0:     @ loop\n"
1953         "       pld   [%[src], %[srcStride]]         @ preload from next scanline       \n"
1954         "       vld1.16   {d0,d1}, [%[dest]]         @ load pixels from framebuffer     \n"
1955         "       vld4.8   {d20,d21,d22,d23},[%[src]]! @ load source image pixels         \n"
1956         "       vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits          \n"
1957         "       vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels       \n"
1958         "       vshrn.u16 d4, q0, #3                 @ unpack green                             \n"
1959         "       vmvn      d18, d23                   @ we need the inverse alpha for the background     \n"
1960         "       vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)       \n"
1961         "       vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)  \n"
1962         "       vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)     \n"
1963         "       vmull.u8  q1, d2, d18                @ apply inverse alpha to background red... \n"
1964         "       vmull.u8  q2, d4, d18                @ ...green...                              \n"
1965         "       vmull.u8  q3, d6, d18                @ ...blue                                  \n"
1966         "       subs      %[count], %[count], #1     @ decrement/test loop counter              \n"
1967         "       vmlal.u8  q1, d23, d22               @ add blended foreground red...            \n"
1968         "       vmlal.u8  q2, d23, d21               @ ...green...                              \n"
1969         "       vmlal.u8  q3, d23, d20               @ ...blue                                  \n"
1970         "       vsri.16   q1, q2, #5                 @ pack green behind red                    \n"
1971         "       vsri.16   q1, q3, #11                @ pack blue into pixels                    \n"
1972         "       vst1.16   {d2,d3}, [%[dest]]!        @ store composited pixels                  \n"
1973         "       bne 0b                               @ next please                              \n"
1974
1975         // Clobbered registers marked as input/outputs
1976         : [dest] "+r" (dest), [src] "+r" (src), [count] "+r" (count)
1977
1978         // Inputs
1979         : [srcStride] "r" (srcStride)
1980
1981         // Clobbers, including the inputs we modify, and potentially lots of memory
1982         : "q0", "q1", "q2", "q3", "d17", "d18", "q10", "q11", "cc", "memory"
1983         );
1984 }
1985
1986 void
1987 fbCompositeOver_8888x0565neon (
1988         pixman_implementation_t * impl,
1989         pixman_op_t op,
1990         pixman_image_t * pSrc,
1991         pixman_image_t * pMask,
1992         pixman_image_t * pDst,
1993         int32_t      xSrc,
1994         int32_t      ySrc,
1995         int32_t      xMask,
1996         int32_t      yMask,
1997         int32_t      xDst,
1998         int32_t      yDst,
1999         int32_t      width,
2000         int32_t      height)
2001 {
2002         uint32_t    *srcLine;
2003         uint16_t    *dstLine, *alignedLine;
2004         uint32_t     dstStride, srcStride;
2005         uint32_t     kernelCount, copyCount;
2006         uint8_t      kernelOffset, copyOffset;
2007
2008         // we assume mask is opaque
2009         // so the only alpha to deal with is embedded in src
2010
2011         if(width > NEON_SCANLINE_BUFFER_PIXELS) {
2012                 // split the blit, so we can use a fixed-size scanline buffer
2013                 int x;
2014                 for(x=0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) {
2015                         fbCompositeOver_8888x0565neon(impl, op, pSrc, pMask, pDst, xSrc+x, ySrc, xMask+x, yMask, xDst+x, yDst,
2016                                                                                   (x+NEON_SCANLINE_BUFFER_PIXELS > width) ? width-x : NEON_SCANLINE_BUFFER_PIXELS, height);
2017                 }
2018                 return;
2019         }
2020
2021         fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
2022         fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
2023
2024         // cover the width with the minimum number of aligned quadwords,
2025         // while also keeping the number of columns to process to a minimum
2026         {
2027                 unsigned long alignedLeft = (unsigned long)(dstLine) & ~0xF;
2028                 unsigned long alignedRight = (((unsigned long)(dstLine + width)) + 0xF) & ~0xF;
2029                 unsigned long ceilingLength = (((unsigned long) width) * sizeof(*dstLine) + 0xF) & ~0xF;
2030
2031                 // the fast copy must always be quadword aligned
2032                 copyOffset = dstLine - ((uint16_t*) alignedLeft);
2033                 alignedLine = dstLine - copyOffset;
2034                 copyCount = (uint32_t) ((alignedRight - alignedLeft) >> 4);
2035
2036                 if(alignedRight - alignedLeft > ceilingLength) {
2037                         // unaligned routine is tightest, and will not overrun
2038                         kernelCount = (uint32_t) (ceilingLength >> 4);
2039                         kernelOffset = copyOffset;
2040                 } else {
2041                         // aligned routine is equally tight, so it is safer to align
2042                         kernelCount = copyCount;
2043                         kernelOffset = 0;
2044                 }
2045         }
2046
2047         /* Preload the first input scanline */
2048         {
2049                 uint8_t *srcPtr = (uint8_t*) srcLine;
2050                 uint32_t count = (width + 15) / 16;
2051
2052 #ifdef USE_GCC_INLINE_ASM
2053                 asm volatile (
2054                 "0: @ loop                                              \n"
2055                 "       subs    %[count], %[count], #1                  \n"
2056                 "       pld     [%[src]]                                \n"
2057                 "       add     %[src], %[src], #64                     \n"
2058                 "       bgt 0b                                          \n"
2059
2060                 // Clobbered input registers marked as input/outputs
2061                 : [src] "+r" (srcPtr), [count] "+r" (count)
2062                 : // no unclobbered inputs
2063                 : "cc"
2064                 );
2065 #else
2066                 do {
2067                         __pld(srcPtr);
2068                         srcPtr += 64;
2069                 } while(--count);
2070 #endif
2071         }
2072
2073         {
2074                 uint16_t scanLine[NEON_SCANLINE_BUFFER_PIXELS + 8]; // deliberately not initialised
2075
2076                 // row-major order
2077                 // left edge, middle block, right edge
2078                 for( ; height--; srcLine += srcStride, alignedLine += dstStride) {
2079                         // Uncached framebuffer access is really, really slow if we do it piecemeal.
2080                         // It should be much faster if we grab it all at once.
2081                         // One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth.
2082                         QuadwordCopy_neon(scanLine, alignedLine, copyCount, 0);
2083
2084                         // Apply the actual filter
2085                         ARGB8_Over565_8pix_neon(srcLine, scanLine + kernelOffset, srcStride * sizeof(*srcLine), kernelCount);
2086
2087                         // Copy the modified scanline back
2088                         QuadwordCopy_neon(dstLine, scanLine + copyOffset, width >> 3, (width & 7) * 2);
2089                 }
2090         }
2091 }
2092
2093 #endif  // USE_GCC_INLINE_ASM
2094
2095 static const pixman_fast_path_t arm_neon_fast_path_array[] = 
2096 {
2097     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       fbCompositeSrcAdd_8888x8x8neon,        0 },
2098     { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null,     PIXMAN_a8,       fbCompositeSrcAdd_8000x8000neon,       0 },
2099     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   fbCompositeSolidMask_nx8x0565neon,     0 },
2100     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   fbCompositeSolidMask_nx8x0565neon,     0 },
2101     { PIXMAN_OP_SRC,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   fbCompositeSrc_24x16neon,              0 },
2102     { PIXMAN_OP_SRC,  PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   fbCompositeSrc_24x16neon,              0 },
2103     { PIXMAN_OP_SRC,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   fbCompositeSrc_24x16neon,              0 },
2104     { PIXMAN_OP_SRC,  PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   fbCompositeSrc_24x16neon,              0 },
2105 #ifdef USE_GCC_INLINE_ASM
2106     { PIXMAN_OP_SRC,  PIXMAN_r5g6b5,   PIXMAN_null,     PIXMAN_r5g6b5,   fbCompositeSrc_16x16neon,              0 },
2107     { PIXMAN_OP_SRC,  PIXMAN_b5g6r5,   PIXMAN_null,     PIXMAN_b5g6r5,   fbCompositeSrc_16x16neon,              0 },
2108     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_r5g6b5,   fbCompositeSolid_nx0565neon,           0 },
2109     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_b5g6r5,   fbCompositeSolid_nx0565neon,           0 },
2110     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   fbCompositeOver_8888x0565neon,         0 },
2111     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   fbCompositeOver_8888x0565neon,         0 },
2112 #endif
2113     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888neon,          0 },
2114     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888neon,          0 },
2115     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888neon,          0 },
2116     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888neon,          0 },
2117     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888neon,        NEED_SOLID_MASK },
2118     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888neon,        NEED_SOLID_MASK },
2119     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888neon,     0 },
2120     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888neon,     0 },
2121     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888neon,     0 },
2122     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8x8888neon,     0 },
2123     { PIXMAN_OP_NONE },
2124 };
2125
2126 const pixman_fast_path_t *const arm_neon_fast_paths = arm_neon_fast_path_array;
2127
2128 static void
2129 arm_neon_composite (pixman_implementation_t *imp,
2130                 pixman_op_t     op,
2131                 pixman_image_t *src,
2132                 pixman_image_t *mask,
2133                 pixman_image_t *dest,
2134                 int32_t         src_x,
2135                 int32_t         src_y,
2136                 int32_t         mask_x,
2137                 int32_t         mask_y,
2138                 int32_t         dest_x,
2139                 int32_t         dest_y,
2140                 int32_t        width,
2141                 int32_t        height)
2142 {
2143         if (_pixman_run_fast_path (arm_neon_fast_paths, imp,
2144                                op, src, mask, dest,
2145                                src_x, src_y,
2146                                mask_x, mask_y,
2147                                dest_x, dest_y,
2148                                width, height))
2149         {
2150                 return;
2151         }
2152
2153         _pixman_implementation_composite (imp->delegate, op,
2154                                       src, mask, dest,
2155                                       src_x, src_y,
2156                                       mask_x, mask_y,
2157                                       dest_x, dest_y,
2158                                       width, height);
2159 }
2160
2161 pixman_bool_t
2162 pixman_blt_neon (
2163         void *src_bits,
2164         void *dst_bits,
2165         int src_stride,
2166         int dst_stride,
2167         int src_bpp,
2168         int dst_bpp,
2169         int src_x, int src_y,
2170         int dst_x, int dst_y,
2171         int width, int height)
2172 {
2173         if(!width || !height)
2174                 return TRUE;
2175
2176         // accelerate only straight copies involving complete bytes
2177         if(src_bpp != dst_bpp || (src_bpp & 7))
2178                 return FALSE;
2179
2180         {
2181                 uint32_t bytes_per_pixel = src_bpp >> 3;
2182                 uint32_t byte_width = width * bytes_per_pixel;
2183                 int32_t src_stride_bytes = src_stride * 4; // stride parameters are in 32-bit words
2184                 int32_t dst_stride_bytes = dst_stride * 4;
2185                 uint8_t *src_bytes = ((uint8_t*) src_bits) + src_y * src_stride_bytes + src_x * bytes_per_pixel;
2186                 uint8_t *dst_bytes = ((uint8_t*) dst_bits) + dst_y * dst_stride_bytes + dst_x * bytes_per_pixel;
2187                 uint32_t quadword_count = byte_width / 16;
2188                 uint32_t offset         = byte_width % 16;
2189
2190                 while(height--) {
2191                         QuadwordCopy_neon(dst_bytes, src_bytes, quadword_count, offset);
2192                         src_bytes += src_stride_bytes;
2193                         dst_bytes += dst_stride_bytes;
2194                 }
2195         }
2196
2197         return TRUE;
2198 }
2199
2200 static pixman_bool_t
2201 arm_neon_blt (pixman_implementation_t *imp,
2202           uint32_t *src_bits,
2203           uint32_t *dst_bits,
2204           int src_stride,
2205           int dst_stride,
2206           int src_bpp,
2207           int dst_bpp,
2208           int src_x, int src_y,
2209           int dst_x, int dst_y,
2210           int width, int height)
2211 {
2212         if (pixman_blt_neon (
2213                         src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2214                         src_x, src_y, dst_x, dst_y, width, height))
2215                 return TRUE;
2216
2217         return _pixman_implementation_blt (
2218                         imp->delegate,
2219                         src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2220                         src_x, src_y, dst_x, dst_y, width, height);
2221 }
2222
2223 static pixman_bool_t
2224 arm_neon_fill (pixman_implementation_t *imp,
2225            uint32_t *bits,
2226            int stride,
2227            int bpp,
2228            int x,
2229            int y,
2230            int width,
2231            int height,
2232            uint32_t xor)
2233 {
2234         if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor))
2235                 return TRUE;
2236
2237         return _pixman_implementation_fill (
2238                         imp->delegate, bits, stride, bpp, x, y, width, height, xor);
2239 }
2240
2241 pixman_implementation_t *
2242 _pixman_implementation_create_arm_neon (void)
2243 {
2244         pixman_implementation_t *simd = _pixman_implementation_create_arm_simd();
2245         pixman_implementation_t *imp  = _pixman_implementation_create (simd);
2246
2247         imp->composite = arm_neon_composite;
2248         imp->blt = arm_neon_blt;
2249         imp->fill = arm_neon_fill;
2250
2251         return imp;
2252 }