2 * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
4 * Permission to use, copy, modify, distribute, and sell this software and its
5 * documentation for any purpose is hereby granted without fee, provided that
6 * the above copyright notice appear in all copies and that both that
7 * copyright notice and this permission notice appear in supporting
8 * documentation, and that the name of ARM Ltd not be used in
9 * advertising or publicity pertaining to distribution of the software without
10 * specific, written prior permission. ARM Ltd makes no
11 * representations about the suitability of this software for any purpose. It
12 * is provided "as is" without express or implied warranty.
14 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
20 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
23 * Author: Ian Rickards (ian.rickards@arm.com)
24 * Author: Jonathan Morton (jonathan.morton@movial.com)
25 * Author: Markku Vire (markku.vire@movial.com)
35 #include "pixman-private.h"
37 // Deal with an intrinsic that is defined differently in GCC
38 #if !defined(__ARMCC_VERSION) && !defined(__pld)
39 #define __pld(_x) __builtin_prefetch(_x)
42 static force_inline uint8x8x4_t unpack0565(uint16x8_t rgb)
47 res.val[3] = vdup_n_u8(0);
48 gb = vshrq_n_u16(rgb, 5);
49 b = vshrq_n_u16(rgb, 5+6);
50 res.val[0] = vmovn_u16(rgb); // get low 5 bits
51 res.val[1] = vmovn_u16(gb); // get mid 6 bits
52 res.val[2] = vmovn_u16(b); // get top 5 bits
54 res.val[0] = vshl_n_u8(res.val[0], 3); // shift to top
55 res.val[1] = vshl_n_u8(res.val[1], 2); // shift to top
56 res.val[2] = vshl_n_u8(res.val[2], 3); // shift to top
58 res.val[0] = vsri_n_u8(res.val[0], res.val[0], 5);
59 res.val[1] = vsri_n_u8(res.val[1], res.val[1], 6);
60 res.val[2] = vsri_n_u8(res.val[2], res.val[2], 5);
65 static force_inline uint16x8_t pack0565(uint8x8x4_t s)
67 uint16x8_t rgb, val_g, val_r;
69 rgb = vshll_n_u8(s.val[2],8);
70 val_g = vshll_n_u8(s.val[1],8);
71 val_r = vshll_n_u8(s.val[0],8);
72 rgb = vsriq_n_u16(rgb, val_g, 5);
73 rgb = vsriq_n_u16(rgb, val_r, 5+6);
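/*
 * A scalar sketch of the per-lane channel expansion done by unpack0565
 * above (illustrative only; the helper name is ours and nothing calls it):
 * a 5- or 6-bit channel is shifted to the top of the byte and its leading
 * bits are replicated into the vacated low bits, which is what each
 * vshl/vsri pair does.  pack0565 is the reverse, using vsri on 16-bit
 * lanes to insert green below red and blue below green.
 */
static force_inline uint8_t expand_565_channel(uint8_t bits, int width)
{
    uint8_t top = (uint8_t)(bits << (8 - width)); /* shift channel to the top   */
    return top | (top >> width);                  /* replicate the leading bits */
}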
78 static force_inline uint8x8_t neon2mul(uint8x8_t x, uint8x8_t alpha)
83 tmp = vmull_u8(x,alpha);
84 tmp2 = vrshrq_n_u16(tmp,8);
85 res = vraddhn_u16(tmp,tmp2);
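/*
 * Scalar sketch of what neon2mul (and neon8mul below) compute per lane
 * (illustrative only; not called anywhere): x * alpha is divided by 255
 * with rounding via (t + ((t + 128) >> 8) + 128) >> 8, which is exactly
 * what the vrshr.u16 #8 / vraddhn.u16 pair evaluates.
 */
static force_inline uint8_t scalar_mul_div_255(uint8_t x, uint8_t alpha)
{
    uint16_t t = (uint16_t)x * alpha;                     /* 16-bit product */
    return (uint8_t)((t + ((t + 128) >> 8) + 128) >> 8);  /* rounded /255   */
}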
90 static force_inline uint8x8x4_t neon8mul(uint8x8x4_t x, uint8x8_t alpha)
94 uint16x8_t qtmp1,qtmp2;
96 tmp.val[0] = vmull_u8(x.val[0],alpha);
97 tmp.val[1] = vmull_u8(x.val[1],alpha);
98 tmp.val[2] = vmull_u8(x.val[2],alpha);
99 tmp.val[3] = vmull_u8(x.val[3],alpha);
101 qtmp1 = vrshrq_n_u16(tmp.val[0],8);
102 qtmp2 = vrshrq_n_u16(tmp.val[1],8);
103 res.val[0] = vraddhn_u16(tmp.val[0],qtmp1);
104 qtmp1 = vrshrq_n_u16(tmp.val[2],8);
105 res.val[1] = vraddhn_u16(tmp.val[1],qtmp2);
106 qtmp2 = vrshrq_n_u16(tmp.val[3],8);
107 res.val[2] = vraddhn_u16(tmp.val[2],qtmp1);
108 res.val[3] = vraddhn_u16(tmp.val[3],qtmp2);
113 static force_inline uint8x8x4_t neon8qadd(uint8x8x4_t x, uint8x8x4_t y)
117 res.val[0] = vqadd_u8(x.val[0],y.val[0]);
118 res.val[1] = vqadd_u8(x.val[1],y.val[1]);
119 res.val[2] = vqadd_u8(x.val[2],y.val[2]);
120 res.val[3] = vqadd_u8(x.val[3],y.val[3]);
127 neon_composite_add_8000_8000 (
128 pixman_implementation_t * impl,
130 pixman_image_t * src_image,
131 pixman_image_t * mask_image,
132 pixman_image_t * dst_image,
142 uint8_t *dst_line, *dst;
143 uint8_t *src_line, *src;
144 int dst_stride, src_stride;
147 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
148 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
152 // Use overlapping 8-pixel method
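// The idea, as the code below implements it: pixels are handled eight at a
// time; when the width is not a multiple of eight, the pointers are advanced
// by width & 7 after the first block so the second block overlaps the first,
// and each result is held back in keep_dst for one iteration so that the
// overlapping re-read never sees freshly written destination data.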
156 dst_line += dst_stride;
158 src_line += src_stride;
163 #ifndef USE_GCC_INLINE_ASM
164 uint8x8_t sval,dval,temp;
166 sval = vld1_u8((void*)src);
167 dval = vld1_u8((void*)dst);
170 temp = vqadd_u8(dval,sval);
178 sval = vld1_u8((void*)src);
179 dval = vld1_u8((void*)dst);
181 vst1_u8((void*)keep_dst,temp);
184 temp = vqadd_u8(dval,sval);
190 vst1_u8((void*)keep_dst,temp);
193 // avoid using d8-d15 (q4-q7), the AAPCS callee-saved registers
194 "vld1.8 {d0}, [%[src]]\n\t"
195 "vld1.8 {d4}, [%[dst]]\n\t"
196 "mov %[keep_dst], %[dst]\n\t"
198 "and ip, %[w], #7\n\t"
199 "add %[src], %[src], ip\n\t"
200 "add %[dst], %[dst], ip\n\t"
201 "subs %[w], %[w], ip\n\t"
205 "vld1.8 {d0}, [%[src]]!\n\t"
206 "vld1.8 {d4}, [%[dst]]!\n\t"
207 "vst1.8 {d20}, [%[keep_dst]]\n\t"
208 "sub %[keep_dst], %[dst], #8\n\t"
209 "subs %[w], %[w], #8\n\t"
211 "vqadd.u8 d20, d0, d4\n\t"
216 "vst1.8 {d20}, [%[keep_dst]]\n\t"
218 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
220 : "ip", "cc", "memory", "d0","d4",
228 const uint8_t nil = 0;
229 const uint8x8_t vnil = vld1_dup_u8(&nil);
234 dst_line += dst_stride;
236 src_line += src_stride;
238 uint8x8_t sval=vnil, dval=vnil;
239 uint8_t *dst4=0, *dst2=0;
243 sval = vreinterpret_u8_u32(vld1_lane_u32((void*)src,vreinterpret_u32_u8(sval),1));
244 dval = vreinterpret_u8_u32(vld1_lane_u32((void*)dst,vreinterpret_u32_u8(dval),1));
251 sval = vreinterpret_u8_u16(vld1_lane_u16((void*)src,vreinterpret_u16_u8(sval),1));
252 dval = vreinterpret_u8_u16(vld1_lane_u16((void*)dst,vreinterpret_u16_u8(dval),1));
259 sval = vld1_lane_u8(src,sval,1);
260 dval = vld1_lane_u8(dst,dval,1);
263 dval = vqadd_u8(dval,sval);
266 vst1_lane_u8(dst,dval,1);
268 vst1_lane_u16((void*)dst2,vreinterpret_u16_u8(dval),1);
270 vst1_lane_u32((void*)dst4,vreinterpret_u32_u8(dval),1);
277 neon_composite_over_8888_8888 (
278 pixman_implementation_t * impl,
280 pixman_image_t * src_image,
281 pixman_image_t * mask_image,
282 pixman_image_t * dst_image,
292 uint32_t *dst_line, *dst;
293 uint32_t *src_line, *src;
294 int dst_stride, src_stride;
297 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
298 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
302 // Use overlapping 8-pixel method
306 dst_line += dst_stride;
308 src_line += src_stride;
311 uint32_t *keep_dst=0;
313 #ifndef USE_GCC_INLINE_ASM
314 uint8x8x4_t sval,dval,temp;
316 sval = vld4_u8((void*)src);
317 dval = vld4_u8((void*)dst);
320 temp = neon8mul(dval,vmvn_u8(sval.val[3]));
321 temp = neon8qadd(sval,temp);
329 sval = vld4_u8((void*)src);
330 dval = vld4_u8((void*)dst);
332 vst4_u8((void*)keep_dst,temp);
335 temp = neon8mul(dval,vmvn_u8(sval.val[3]));
336 temp = neon8qadd(sval,temp);
342 vst4_u8((void*)keep_dst,temp);
345 // avoid using d8-d15 (q4-q7), the AAPCS callee-saved registers
346 "vld4.8 {d0-d3}, [%[src]]\n\t"
347 "vld4.8 {d4-d7}, [%[dst]]\n\t"
348 "mov %[keep_dst], %[dst]\n\t"
350 "and ip, %[w], #7\n\t"
351 "add %[src], %[src], ip, LSL#2\n\t"
352 "add %[dst], %[dst], ip, LSL#2\n\t"
353 "subs %[w], %[w], ip\n\t"
357 "vld4.8 {d0-d3}, [%[src]]!\n\t"
358 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
359 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
360 "sub %[keep_dst], %[dst], #8*4\n\t"
361 "subs %[w], %[w], #8\n\t"
364 "vmull.u8 q10, d31, d4\n\t"
365 "vmull.u8 q11, d31, d5\n\t"
366 "vmull.u8 q12, d31, d6\n\t"
367 "vmull.u8 q13, d31, d7\n\t"
368 "vrshr.u16 q8, q10, #8\n\t"
369 "vrshr.u16 q9, q11, #8\n\t"
370 "vraddhn.u16 d20, q10, q8\n\t"
371 "vraddhn.u16 d21, q11, q9\n\t"
372 "vrshr.u16 q8, q12, #8\n\t"
373 "vrshr.u16 q9, q13, #8\n\t"
374 "vraddhn.u16 d22, q12, q8\n\t"
375 "vraddhn.u16 d23, q13, q9\n\t"
377 "vqadd.u8 d20, d0, d20\n\t"
378 "vqadd.u8 d21, d1, d21\n\t"
379 "vqadd.u8 d22, d2, d22\n\t"
380 "vqadd.u8 d23, d3, d23\n\t"
385 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
387 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
389 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
390 "d16","d17","d18","d19","d20","d21","d22","d23"
397 uint8x8_t alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL));
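// vtbl index pattern: lanes 0-3 select byte 3 (alpha of the first packed
// pixel) and lanes 4-7 select byte 7 (alpha of the second), so
// vtbl1_u8(..., alpha_selector) broadcasts each pixel's alpha across all
// four of its channel bytes.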
403 dst_line += dst_stride;
405 src_line += src_stride;
412 /* two 32-bit pixels packed into D-reg; ad-hoc vectorization */
413 sval = vreinterpret_u8_u32(vld1_u32((void*)src));
414 dval = vreinterpret_u8_u32(vld1_u32((void*)dst));
415 dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval),alpha_selector));
416 vst1_u8((void*)dst,vqadd_u8(sval,dval));
427 /* single 32-bit pixel in lane 0 */
428 sval = vreinterpret_u8_u32(vld1_dup_u32((void*)src)); // only interested in lane 0
429 dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst)); // only interested in lane 0
430 dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval),alpha_selector));
431 vst1_lane_u32((void*)dst,vreinterpret_u32_u8(vqadd_u8(sval,dval)),0);
438 neon_composite_over_8888_n_8888 (
439 pixman_implementation_t * impl,
441 pixman_image_t * src_image,
442 pixman_image_t * mask_image,
443 pixman_image_t * dst_image,
453 uint32_t *dst_line, *dst;
454 uint32_t *src_line, *src;
456 int dst_stride, src_stride;
458 uint8x8_t mask_alpha;
460 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
461 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
463 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
464 mask_alpha = vdup_n_u8((mask) >> 24);
468 // Use overlapping 8-pixel method
472 dst_line += dst_stride;
474 src_line += src_stride;
477 uint32_t *keep_dst=0;
479 #ifndef USE_GCC_INLINE_ASM
480 uint8x8x4_t sval,dval,temp;
482 sval = vld4_u8((void*)src);
483 dval = vld4_u8((void*)dst);
486 sval = neon8mul(sval,mask_alpha);
487 temp = neon8mul(dval,vmvn_u8(sval.val[3]));
488 temp = neon8qadd(sval,temp);
496 sval = vld4_u8((void*)src);
497 dval = vld4_u8((void*)dst);
499 vst4_u8((void*)keep_dst,temp);
502 sval = neon8mul(sval,mask_alpha);
503 temp = neon8mul(dval,vmvn_u8(sval.val[3]));
504 temp = neon8qadd(sval,temp);
510 vst4_u8((void*)keep_dst,temp);
513 // avoid using d8-d15 (q4-q7), the AAPCS callee-saved registers
514 "vdup.32 d30, %[mask]\n\t"
515 "vdup.8 d30, d30[3]\n\t"
517 "vld4.8 {d0-d3}, [%[src]]\n\t"
518 "vld4.8 {d4-d7}, [%[dst]]\n\t"
519 "mov %[keep_dst], %[dst]\n\t"
521 "and ip, %[w], #7\n\t"
522 "add %[src], %[src], ip, LSL#2\n\t"
523 "add %[dst], %[dst], ip, LSL#2\n\t"
524 "subs %[w], %[w], ip\n\t"
528 "vld4.8 {d0-d3}, [%[src]]!\n\t"
529 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
530 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
531 "sub %[keep_dst], %[dst], #8*4\n\t"
532 "subs %[w], %[w], #8\n\t"
535 "vmull.u8 q10, d30, d0\n\t"
536 "vmull.u8 q11, d30, d1\n\t"
537 "vmull.u8 q12, d30, d2\n\t"
538 "vmull.u8 q13, d30, d3\n\t"
539 "vrshr.u16 q8, q10, #8\n\t"
540 "vrshr.u16 q9, q11, #8\n\t"
541 "vraddhn.u16 d0, q10, q8\n\t"
542 "vraddhn.u16 d1, q11, q9\n\t"
543 "vrshr.u16 q9, q13, #8\n\t"
544 "vrshr.u16 q8, q12, #8\n\t"
545 "vraddhn.u16 d3, q13, q9\n\t"
546 "vraddhn.u16 d2, q12, q8\n\t"
549 "vmull.u8 q10, d31, d4\n\t"
550 "vmull.u8 q11, d31, d5\n\t"
551 "vmull.u8 q12, d31, d6\n\t"
552 "vmull.u8 q13, d31, d7\n\t"
553 "vrshr.u16 q8, q10, #8\n\t"
554 "vrshr.u16 q9, q11, #8\n\t"
555 "vraddhn.u16 d20, q10, q8\n\t"
556 "vrshr.u16 q8, q12, #8\n\t"
557 "vraddhn.u16 d21, q11, q9\n\t"
558 "vrshr.u16 q9, q13, #8\n\t"
559 "vraddhn.u16 d22, q12, q8\n\t"
560 "vraddhn.u16 d23, q13, q9\n\t"
562 "vqadd.u8 d20, d0, d20\n\t"
563 "vqadd.u8 d21, d1, d21\n\t"
564 "vqadd.u8 d22, d2, d22\n\t"
565 "vqadd.u8 d23, d3, d23\n\t"
570 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
572 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
574 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
575 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27",
583 uint8x8_t alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL));
589 dst_line += dst_stride;
591 src_line += src_stride;
598 sval = vreinterpret_u8_u32(vld1_u32((void*)src));
599 dval = vreinterpret_u8_u32(vld1_u32((void*)dst));
601 /* sval * const alpha_mul */
602 sval = neon2mul(sval,mask_alpha);
604 /* dval * 255-(src alpha) */
605 dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval), alpha_selector));
607 vst1_u8((void*)dst,vqadd_u8(sval,dval));
618 sval = vreinterpret_u8_u32(vld1_dup_u32((void*)src));
619 dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst));
621 /* sval * const alpha_mul */
622 sval = neon2mul(sval,mask_alpha);
624 /* dval * 255-(src alpha) */
625 dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval), alpha_selector));
627 vst1_lane_u32((void*)dst,vreinterpret_u32_u8(vqadd_u8(sval,dval)),0);
635 neon_composite_over_n_8_8888 (
636 pixman_implementation_t * impl,
638 pixman_image_t * src_image,
639 pixman_image_t * mask_image,
640 pixman_image_t * dst_image,
651 uint32_t *dst_line, *dst;
652 uint8_t *mask_line, *mask;
653 int dst_stride, mask_stride;
657 uint8x8_t mask_selector=vreinterpret_u8_u64(vcreate_u64(0x0101010100000000ULL));
658 uint8x8_t alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL));
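// mask_selector is the analogous vtbl pattern for the a8 mask: lanes 0-3
// pick mask byte 0 and lanes 4-7 pick mask byte 1, replicating each mask
// value across the four channel bytes of its pixel; alpha_selector is as
// described for the 8888-over-8888 path above.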
660 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
662 // bail out if fully transparent
667 sval2=vreinterpret_u8_u32(vdup_n_u32(src));
668 sval8.val[0]=vdup_lane_u8(sval2,0);
669 sval8.val[1]=vdup_lane_u8(sval2,1);
670 sval8.val[2]=vdup_lane_u8(sval2,2);
671 sval8.val[3]=vdup_lane_u8(sval2,3);
673 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
674 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
678 // Use overlapping 8-pixel method, modified to avoid re-reading destination pixels that have already been rewritten
681 uint32_t *keep_dst=0;
684 dst_line += dst_stride;
686 mask_line += mask_stride;
689 #ifndef USE_GCC_INLINE_ASM
691 uint8x8x4_t dval, temp;
693 alpha = vld1_u8((void*)mask);
694 dval = vld4_u8((void*)dst);
697 temp = neon8mul(sval8,alpha);
698 dval = neon8mul(dval,vmvn_u8(temp.val[3]));
699 temp = neon8qadd(temp,dval);
707 alpha = vld1_u8((void*)mask);
708 dval = vld4_u8((void*)dst);
710 vst4_u8((void*)keep_dst,temp);
713 temp = neon8mul(sval8,alpha);
714 dval = neon8mul(dval,vmvn_u8(temp.val[3]));
715 temp = neon8qadd(temp,dval);
721 vst4_u8((void*)keep_dst,temp);
724 "vdup.32 d0, %[src]\n\t"
725 "vdup.8 d1, d0[1]\n\t"
726 "vdup.8 d2, d0[2]\n\t"
727 "vdup.8 d3, d0[3]\n\t"
728 "vdup.8 d0, d0[0]\n\t"
730 "vld4.8 {d4-d7}, [%[dst]]\n\t"
731 "vld1.8 {d31}, [%[mask]]\n\t"
732 "mov %[keep_dst], %[dst]\n\t"
734 "and ip, %[w], #7\n\t"
735 "add %[mask], %[mask], ip\n\t"
736 "add %[dst], %[dst], ip, LSL#2\n\t"
737 "subs %[w], %[w], ip\n\t"
741 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
742 "vld1.8 {d31}, [%[mask]]!\n\t"
743 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
744 "sub %[keep_dst], %[dst], #8*4\n\t"
745 "subs %[w], %[w], #8\n\t"
748 "vmull.u8 q10, d31, d0\n\t"
749 "vmull.u8 q11, d31, d1\n\t"
750 "vmull.u8 q12, d31, d2\n\t"
751 "vmull.u8 q13, d31, d3\n\t"
752 "vrshr.u16 q8, q10, #8\n\t"
753 "vrshr.u16 q9, q11, #8\n\t"
754 "vraddhn.u16 d20, q10, q8\n\t"
755 "vraddhn.u16 d21, q11, q9\n\t"
756 "vrshr.u16 q9, q13, #8\n\t"
757 "vrshr.u16 q8, q12, #8\n\t"
758 "vraddhn.u16 d23, q13, q9\n\t"
759 "vraddhn.u16 d22, q12, q8\n\t"
761 "vmvn.8 d30, d23\n\t"
762 "vmull.u8 q12, d30, d4\n\t"
763 "vmull.u8 q13, d30, d5\n\t"
764 "vmull.u8 q14, d30, d6\n\t"
765 "vmull.u8 q15, d30, d7\n\t"
767 "vrshr.u16 q8, q12, #8\n\t"
768 "vrshr.u16 q9, q13, #8\n\t"
769 "vraddhn.u16 d4, q12, q8\n\t"
770 "vrshr.u16 q8, q14, #8\n\t"
771 "vraddhn.u16 d5, q13, q9\n\t"
772 "vrshr.u16 q9, q15, #8\n\t"
773 "vraddhn.u16 d6, q14, q8\n\t"
774 "vraddhn.u16 d7, q15, q9\n\t"
777 "vqadd.u8 d20, d4, d20\n\t"
778 "vqadd.u8 d21, d5, d21\n\t"
779 "vqadd.u8 d22, d6, d22\n\t"
780 "vqadd.u8 d23, d7, d23\n\t"
785 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
787 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
789 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
790 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
803 dst_line += dst_stride;
805 mask_line += mask_stride;
810 uint8x8_t dval, temp, res;
812 alpha = vtbl1_u8(vreinterpret_u8_u16(vld1_dup_u16((void*)mask)), mask_selector);
813 dval = vld1_u8((void*)dst);
815 temp = neon2mul(sval2,alpha);
816 res = vqadd_u8(temp,neon2mul(dval,vtbl1_u8(vmvn_u8(temp), alpha_selector)));
818 vst1_u8((void*)dst,res);
826 uint8x8_t dval, temp, res;
828 alpha = vtbl1_u8(vld1_dup_u8((void*)mask), mask_selector);
829 dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst));
831 temp = neon2mul(sval2,alpha);
832 res = vqadd_u8(temp,neon2mul(dval,vtbl1_u8(vmvn_u8(temp), alpha_selector)));
834 vst1_lane_u32((void*)dst,vreinterpret_u32_u8(res),0);
842 neon_composite_add_8888_8_8 (
843 pixman_implementation_t * impl,
845 pixman_image_t * src_image,
846 pixman_image_t * mask_image,
847 pixman_image_t * dst_image,
857 uint8_t *dst_line, *dst;
858 uint8_t *mask_line, *mask;
859 int dst_stride, mask_stride;
864 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
865 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
866 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
867 sa = vdup_n_u8((src) >> 24);
871 // Use overlapping 8-pixel method, modified to avoid re-reading destination pixels that have already been rewritten
875 dst_line += dst_stride;
877 mask_line += mask_stride;
880 uint8x8_t mval, dval, res;
883 mval = vld1_u8((void *)mask);
884 dval = vld1_u8((void *)dst);
887 res = vqadd_u8(neon2mul(mval,sa),dval);
895 mval = vld1_u8((void *)mask);
896 dval = vld1_u8((void *)dst);
897 vst1_u8((void *)keep_dst, res);
900 res = vqadd_u8(neon2mul(mval,sa),dval);
906 vst1_u8((void *)keep_dst, res);
911 // Use 4/2/1 load/store method to handle 1-7 pixels
915 dst_line += dst_stride;
917 mask_line += mask_stride;
920 uint8x8_t mval=sa, dval=sa, res;
921 uint8_t *dst4=0, *dst2=0;
925 mval = vreinterpret_u8_u32(vld1_lane_u32((void *)mask, vreinterpret_u32_u8(mval), 1));
926 dval = vreinterpret_u8_u32(vld1_lane_u32((void *)dst, vreinterpret_u32_u8(dval), 1));
934 mval = vreinterpret_u8_u16(vld1_lane_u16((void *)mask, vreinterpret_u16_u8(mval), 1));
935 dval = vreinterpret_u8_u16(vld1_lane_u16((void *)dst, vreinterpret_u16_u8(dval), 1));
942 mval = vld1_lane_u8(mask, mval, 1);
943 dval = vld1_lane_u8(dst, dval, 1);
946 res = vqadd_u8(neon2mul(mval,sa),dval);
949 vst1_lane_u8(dst, res, 1);
951 vst1_lane_u16((void *)dst2, vreinterpret_u16_u8(res), 1);
953 vst1_lane_u32((void *)dst4, vreinterpret_u32_u8(res), 1);
958 #ifdef USE_GCC_INLINE_ASM
961 neon_composite_src_16_16 (
962 pixman_implementation_t * impl,
964 pixman_image_t * src_image,
965 pixman_image_t * mask_image,
966 pixman_image_t * dst_image,
976 uint16_t *dst_line, *src_line;
977 uint32_t dst_stride, src_stride;
979 if(!height || !width)
982 /* We simply copy 16-bit-aligned pixels from one place to another. */
983 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
984 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
986 /* Preload the first input scanline */
988 uint16_t *src_ptr = src_line;
989 uint32_t count = width;
993 " subs %[count], %[count], #32 \n"
995 " add %[src], %[src], #64 \n"
998 // Clobbered input registers marked as input/outputs
999 : [src] "+r" (src_ptr), [count] "+r" (count)
1000 : // no unclobbered inputs
1006 uint16_t *dst_ptr = dst_line;
1007 uint16_t *src_ptr = src_line;
1008 uint32_t count = width;
1011 // Uses multi-register access and preloading to maximise bandwidth.
1012 // Each pixel is one halfword, so a quadword contains 8px.
1013 // Preload frequency assumes a 64-byte cache line.
1015 " cmp %[count], #64 \n"
1016 " blt 1f @ skip oversized fragments \n"
1017 "0: @ start with eight quadwords at a time \n"
1018 " pld [%[src], %[src_stride], LSL #1] \n" // preload from next scanline
1019 " sub %[count], %[count], #64 \n"
1020 " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
1021 " vld1.16 {d20,d21,d22,d23}, [%[src]]! \n"
1022 " pld [%[src], %[src_stride], LSL #1] \n" // preload from next scanline
1023 " vld1.16 {d24,d25,d26,d27}, [%[src]]! \n"
1024 " vld1.16 {d28,d29,d30,d31}, [%[src]]! \n"
1025 " cmp %[count], #64 \n"
1026 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
1027 " vst1.16 {d20,d21,d22,d23}, [%[dst]]! \n"
1028 " vst1.16 {d24,d25,d26,d27}, [%[dst]]! \n"
1029 " vst1.16 {d28,d29,d30,d31}, [%[dst]]! \n"
1031 " cmp %[count], #0 \n"
1032 " beq 7f @ aligned fastpath \n"
1033 "1: @ four quadwords \n"
1034 " tst %[count], #32 \n"
1035 " beq 2f @ skip oversized fragment \n"
1036 " pld [%[src], %[src_stride], LSL #1] \n" // preload from next scanline
1037 " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
1038 " vld1.16 {d20,d21,d22,d23}, [%[src]]! \n"
1039 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
1040 " vst1.16 {d20,d21,d22,d23}, [%[dst]]! \n"
1041 "2: @ two quadwords \n"
1042 " tst %[count], #16 \n"
1043 " beq 3f @ skip oversized fragment \n"
1044 " pld [%[src], %[src_stride], LSL #1] \n" // preload from next scanline
1045 " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
1046 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
1047 "3: @ one quadword \n"
1048 " tst %[count], #8 \n"
1049 " beq 4f @ skip oversized fragment \n"
1050 " vld1.16 {d16,d17}, [%[src]]! \n"
1051 " vst1.16 {d16,d17}, [%[dst]]! \n"
1052 "4: @ one doubleword \n"
1053 " tst %[count], #4 \n"
1054 " beq 5f @ skip oversized fragment \n"
1055 " vld1.16 {d16}, [%[src]]! \n"
1056 " vst1.16 {d16}, [%[dst]]! \n"
1058 " tst %[count], #2 \n"
1059 " beq 6f @ skip oversized fragment \n"
1060 " ldr %[tmp], [%[src]], #4 \n"
1061 " str %[tmp], [%[dst]], #4 \n"
1062 "6: @ one halfword \n"
1063 " tst %[count], #1 \n"
1064 " beq 7f @ skip oversized fragment \n"
1065 " ldrh %[tmp], [%[src]] \n"
1066 " strh %[tmp], [%[dst]] \n"
1069 // Clobbered input registers marked as input/outputs
1070 : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr), [count] "+r" (count), [tmp] "+r" (tmp)
1072 // Unclobbered input
1073 : [src_stride] "r" (src_stride)
1075 // Clobbered vector registers
1076 // NB: these are the quad aliases of the double registers used in the asm
1077 : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
1080 src_line += src_stride;
1081 dst_line += dst_stride;
1085 #endif /* USE_GCC_INLINE_ASM */
1088 neon_composite_src_24_16 (
1089 pixman_implementation_t * impl,
1091 pixman_image_t * src_image,
1092 pixman_image_t * mask_image,
1093 pixman_image_t * dst_image,
1105 uint32_t dst_stride, src_stride;
1107 if(!width || !height)
1110 /* We simply copy pixels from one place to another, assuming that the source's alpha is opaque. */
1111 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1112 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1114 /* Preload the first input scanline */
1116 uint8_t *src_ptr = (uint8_t*) src_line;
1117 uint32_t count = (width + 15) / 16;
1119 #ifdef USE_GCC_INLINE_ASM
1122 " subs %[count], %[count], #1 \n"
1124 " add %[src], %[src], #64 \n"
1127 // Clobbered input registers marked as input/outputs
1128 : [src] "+r" (src_ptr), [count] "+r" (count)
1129 : // no unclobbered inputs
1141 uint16_t *dst_ptr = dst_line;
1142 uint32_t *src_ptr = src_line;
1143 uint32_t count = width;
1144 const uint32_t rb_mask = 0x1F;
1145 const uint32_t g_mask = 0x3F;
1147 // If you're going to complain about a goto, take a long hard look
1148 // at the massive blocks of assembler this skips over. ;-)
1152 #ifdef USE_GCC_INLINE_ASM
1154 // This is not as aggressive as the RGB565-source case.
1155 // Generally the source is in cached RAM when the formats are different, so we use preload.
1156 // We don't need to blend, so we are not reading from the uncached framebuffer.
1158 " cmp %[count], #16 \n"
1159 " blt 1f @ skip oversized fragments \n"
1160 "0: @ start with sixteen pixels at a time \n"
1161 " sub %[count], %[count], #16 \n"
1162 " pld [%[src], %[src_stride], lsl #2] @ preload from next scanline \n"
1163 " vld4.8 {d0,d1,d2,d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
1164 " vld4.8 {d4,d5,d6,d7}, [%[src]]! @ d7 is alpha and ignored, d6-4 are rgb. \n"
1165 " vshll.u8 q8, d2, #8 @ expand first red for repacking \n"
1166 " vshll.u8 q10, d1, #8 @ expand first green for repacking \n"
1167 " vshll.u8 q11, d0, #8 @ expand first blue for repacking \n"
1168 " vshll.u8 q9, d6, #8 @ expand second red for repacking \n"
1169 " vsri.u16 q8, q10, #5 @ insert first green after red \n"
1170 " vshll.u8 q10, d5, #8 @ expand second green for repacking \n"
1171 " vsri.u16 q8, q11, #11 @ insert first blue after green \n"
1172 " vshll.u8 q11, d4, #8 @ expand second blue for repacking \n"
1173 " vsri.u16 q9, q10, #5 @ insert second green after red \n"
1174 " vsri.u16 q9, q11, #11 @ insert second blue after green \n"
1175 " cmp %[count], #16 \n"
1176 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! @ store 16 pixels \n"
1178 "1: @ end of main loop \n"
1179 " cmp %[count], #8 @ can we still do an 8-pixel block? \n"
1181 " sub %[count], %[count], #8 \n"
1182 " pld [%[src], %[src_stride], lsl #2] @ preload from next scanline \n"
1183 " vld4.8 {d0,d1,d2,d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
1184 " vshll.u8 q8, d2, #8 @ expand first red for repacking \n"
1185 " vshll.u8 q10, d1, #8 @ expand first green for repacking \n"
1186 " vshll.u8 q11, d0, #8 @ expand first blue for repacking \n"
1187 " vsri.u16 q8, q10, #5 @ insert first green after red \n"
1188 " vsri.u16 q8, q11, #11 @ insert first blue after green \n"
1189 " vst1.16 {d16,d17}, [%[dst]]! @ store 8 pixels \n"
1192 // Clobbered input and working registers marked as input/outputs
1193 : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr), [count] "+r" (count)
1195 // Unclobbered input
1196 : [src_stride] "r" (src_stride)
1198 // Clobbered vector registers
1199 // NB: these are the quad aliases of the double registers used in the asm
1200 : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "cc", "memory"
1203 // A copy of the above code, in intrinsics-form.
1204 // This should be pretty self-documenting...
1205 while(count >= 16) {
1206 uint8x8x4_t pixel_set_a, pixel_set_b;
1207 uint16x8_t red_a, green_a, blue_a;
1208 uint16x8_t red_b, green_b, blue_b;
1209 uint16x8_t dest_pixels_a, dest_pixels_b;
1212 __pld(src_ptr + src_stride);
1213 pixel_set_a = vld4_u8((uint8_t*)(src_ptr));
1214 pixel_set_b = vld4_u8((uint8_t*)(src_ptr+8));
1217 red_a = vshll_n_u8(pixel_set_a.val[2], 8);
1218 green_a = vshll_n_u8(pixel_set_a.val[1], 8);
1219 blue_a = vshll_n_u8(pixel_set_a.val[0], 8);
1220 red_b = vshll_n_u8(pixel_set_b.val[2], 8);
1221 green_b = vshll_n_u8(pixel_set_b.val[1], 8);
1222 blue_b = vshll_n_u8(pixel_set_b.val[0], 8);
1223 dest_pixels_a = vsriq_n_u16(red_a, green_a, 5);
1224 dest_pixels_b = vsriq_n_u16(red_b, green_b, 5);
1225 dest_pixels_a = vsriq_n_u16(dest_pixels_a, blue_a, 11);
1226 dest_pixels_b = vsriq_n_u16(dest_pixels_b, blue_b, 11);
1228 // There doesn't seem to be an intrinsic for the double-quadword variant
1229 vst1q_u16(dst_ptr , dest_pixels_a);
1230 vst1q_u16(dst_ptr+8, dest_pixels_b);
1236 uint8x8x4_t pixel_set_a;
1237 uint16x8_t red_a, green_a, blue_a;
1238 uint16x8_t dest_pixels_a;
1240 __pld(src_ptr + src_stride);
1242 pixel_set_a = vld4_u8((uint8_t*)(src_ptr));
1245 red_a = vshll_n_u8(pixel_set_a.val[2], 8);
1246 green_a = vshll_n_u8(pixel_set_a.val[1], 8);
1247 blue_a = vshll_n_u8(pixel_set_a.val[0], 8);
1248 dest_pixels_a = vsriq_n_u16(red_a, green_a, 5);
1249 dest_pixels_a = vsriq_n_u16(dest_pixels_a, blue_a, 11);
1251 vst1q_u16(dst_ptr , dest_pixels_a);
1255 #endif // USE_GCC_INLINE_ASM
1260 __pld(src_ptr + src_stride);
1263 uint32_t src_pixel_a = *src_ptr++;
1264 uint32_t src_pixel_b = *src_ptr++;
1266 // ARM is really good at shift-then-ALU ops.
1267 // This should be a total of six shift-ANDs and five shift-ORs.
1268 uint32_t dst_pixels_a;
1269 uint32_t dst_pixels_b;
1271 dst_pixels_a = ((src_pixel_a >> 3) & rb_mask);
1272 dst_pixels_a |= ((src_pixel_a >> 10) & g_mask) << 5;
1273 dst_pixels_a |= ((src_pixel_a >> 19) & rb_mask) << 11;
1275 dst_pixels_b = ((src_pixel_b >> 3) & rb_mask);
1276 dst_pixels_b |= ((src_pixel_b >> 10) & g_mask) << 5;
1277 dst_pixels_b |= ((src_pixel_b >> 19) & rb_mask) << 11;
1279 // little-endian mode only
1280 *((uint32_t*) dst_ptr) = dst_pixels_a | (dst_pixels_b << 16);
1286 uint32_t src_pixel = *src_ptr++;
1288 // ARM is really good at shift-then-ALU ops.
1289 // This block should end up as three shift-ANDs and two shift-ORs.
1290 uint32_t tmp_blue = (src_pixel >> 3) & rb_mask;
1291 uint32_t tmp_green = (src_pixel >> 10) & g_mask;
1292 uint32_t tmp_red = (src_pixel >> 19) & rb_mask;
1293 uint16_t dst_pixel = (tmp_red << 11) | (tmp_green << 5) | tmp_blue;
1295 *dst_ptr++ = dst_pixel;
1299 src_line += src_stride;
1300 dst_line += dst_stride;
1305 static pixman_bool_t
1306 pixman_fill_neon (uint32_t *bits,
1315 uint32_t byte_stride, color;
1318 /* stride is always a multiple of 32-bit units in pixman */
1319 byte_stride = stride * sizeof(uint32_t);
1324 dst = ((char *) bits) + y * byte_stride + x;
1326 color = _xor << 24 | _xor << 16 | _xor << 8 | _xor;
1329 dst = ((char *) bits) + y * byte_stride + x * 2;
1331 color = _xor << 16 | _xor;
1332 width *= 2; /* width to bytes */
1335 dst = ((char *) bits) + y * byte_stride + x * 4;
1337 width *= 4; /* width to bytes */
1343 #ifdef USE_GCC_INLINE_ASM
1345 /* We have a special case for widths so small that wide 128-bit
1346 stores could not be used anyway. We don't waste time trying to
1347 align the writes, since there are only a few of them per scanline */
1349 "cmp %[height], #0\n" /* Check if empty fill */
1351 "vdup.32 d0, %[color]\n" /* Duplicate the colour into a NEON register */
1353 /* Check whether the width can be handled with a single store
1354 operation per scanline. This significantly reduces the number
1355 of test/branch instructions executed for each scanline */
1356 "cmp %[width], #8\n"
1358 "cmp %[width], #4\n"
1360 "cmp %[width], #2\n"
1363 /* Loop starts here for each scanline */
1365 "mov r4, %[dst]\n" /* Starting address of the current line */
1366 "tst %[width], #8\n"
1368 "vst1.8 {d0}, [r4]!\n"
1370 "tst %[width], #4\n"
1372 "str %[color], [r4], #4\n"
1374 "tst %[width], #2\n"
1376 "strh %[color], [r4], #2\n"
1378 "tst %[width], #1\n"
1380 "strb %[color], [r4], #1\n"
1383 "subs %[height], %[height], #1\n"
1384 "add %[dst], %[dst], %[byte_stride]\n"
1388 /* Special fillers for those widths that we can do with single operation */
1390 "subs %[height], %[height], #1\n"
1391 "vst1.8 {d0}, [%[dst]]\n"
1392 "add %[dst], %[dst], %[byte_stride]\n"
1397 "subs %[height], %[height], #1\n"
1398 "str %[color], [%[dst]]\n"
1399 "add %[dst], %[dst], %[byte_stride]\n"
1404 "subs %[height], %[height], #1\n"
1405 "strh %[color], [%[dst]]\n"
1406 "add %[dst], %[dst], %[byte_stride]\n"
1410 : /* No output members */
1411 : [color] "r" (color), [height] "r" (height), [width] "r" (width),
1412 [dst] "r" (dst) , [byte_stride] "r" (byte_stride)
1413 : "memory", "cc", "d0", "r4", "r5");
1416 "cmp %[height], #0\n" /* Check if empty fill */
1418 "vdup.32 q0, %[color]\n" /* Duplicate the colour into a NEON register */
1420 /* Loop starts here for each scanline */
1422 "mov r4, %[dst]\n" /* Starting address of the current line */
1423 "mov r5, %[width]\n" /* We're going to write this many bytes */
1424 "ands r6, r4, #15\n" /* Are we at the 128-bit aligned address? */
1425 "beq 2f\n" /* Jump to the best case */
1427 /* We're not 128-bit aligned: However, we know that we can get to the
1428 next aligned location, since the fill is at least 16 bytes wide */
1429 "rsb r6, r6, #16\n" /* We would need to go forward this much */
1430 "sub r5, r5, r6\n" /* Update bytes left */
1433 "vst1.8 {d0[0]}, [r4]!\n"/* Store byte, now we are 16-bit aligned */
1437 "vst1.16 {d0[0]}, [r4, :16]!\n"/* Store halfword, now we are 32-bit aligned */
1441 "vst1.32 {d0[0]}, [r4, :32]!\n"/* Store word, now we're 64-bit aligned */
1445 "vst1.64 {d0}, [r4, :64]!\n" /* Store doubleword, now we're 128-bit aligned */
1447 /* The good case: We're 128-bit aligned for this scanline */
1449 "and r6, r5, #15\n" /* Number of tailing bytes */
1450 "cmp r5, r6\n" /* Do we have at least one qword to write? */
1451 "beq 6f\n" /* No, we just write the tail */
1452 "lsr r5, r5, #4\n" /* This many full qwords to write */
1454 /* The main block: Do 128-bit aligned writes */
1457 "vst1.64 {d0,d1}, [r4, :128]!\n"
1460 /* Handle the trailing bytes: do 64, 32, 16 and 8-bit writes as needed.
1461 We know that we're currently at a 128-bit aligned address, so we can just
1462 pick the biggest operation that the remaining write width allows */
1468 "vst1.64 {d0}, [r4, :64]!\n"
1472 "vst1.32 {d0[0]}, [r4, :32]!\n"
1476 "vst1.16 {d0[0]}, [r4, :16]!\n"
1480 "vst1.8 {d0[0]}, [r4]!\n"
1483 /* Handle the next scanline */
1484 "subs %[height], %[height], #1\n"
1485 "add %[dst], %[dst], %[byte_stride]\n"
1488 : /* No output members */
1489 : [color] "r" (color), [height] "r" (height), [width] "r" (width),
1490 [dst] "r" (dst) , [byte_stride] "r" (byte_stride)
1491 : "memory", "cc", "q0", "d0", "d1", "r4", "r5", "r6");
1497 // TODO: intrinsic version for armcc
1504 // TODO: is a more generic way of doing this being introduced?
1505 #define NEON_SCANLINE_BUFFER_PIXELS (1024)
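// Wider blits are split into chunks of at most this many pixels (see the
// over_*_0565 routines below), keeping the fixed-size scanline staging
// buffers small enough to stay cache-resident.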
1507 static inline void neon_quadword_copy(
1510 uint32_t count, // of quadwords
1511 uint32_t trailer_count // of bytes
1514 uint8_t *t_dst = dst, *t_src = src;
1516 // Uses aligned multi-register loads to maximise read bandwidth
1517 // on uncached memory such as framebuffers
1518 // The accesses do not have the aligned qualifiers, so that the copy
1519 // may convert between aligned-uncached and unaligned-cached memory.
1520 // It is assumed that the CPU can infer alignment from the address.
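// Usage sketch (illustrative): to copy n bytes,
//     neon_quadword_copy(dst, src, n / 16, n % 16);
// which is how the callers below split their byte widths.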
1522 #ifdef USE_GCC_INLINE_ASM
1525 " cmp %[count], #8 \n"
1526 " blt 1f @ skip oversized fragments \n"
1527 "0: @ start with eight quadwords at a time \n"
1528 " sub %[count], %[count], #8 \n"
1529 " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
1530 " vld1.8 {d20,d21,d22,d23}, [%[src]]! \n"
1531 " vld1.8 {d24,d25,d26,d27}, [%[src]]! \n"
1532 " vld1.8 {d28,d29,d30,d31}, [%[src]]! \n"
1533 " cmp %[count], #8 \n"
1534 " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
1535 " vst1.8 {d20,d21,d22,d23}, [%[dst]]! \n"
1536 " vst1.8 {d24,d25,d26,d27}, [%[dst]]! \n"
1537 " vst1.8 {d28,d29,d30,d31}, [%[dst]]! \n"
1539 "1: @ four quadwords \n"
1540 " tst %[count], #4 \n"
1541 " beq 2f @ skip oversized fragment \n"
1542 " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
1543 " vld1.8 {d20,d21,d22,d23}, [%[src]]! \n"
1544 " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
1545 " vst1.8 {d20,d21,d22,d23}, [%[dst]]! \n"
1546 "2: @ two quadwords \n"
1547 " tst %[count], #2 \n"
1548 " beq 3f @ skip oversized fragment \n"
1549 " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
1550 " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
1551 "3: @ one quadword \n"
1552 " tst %[count], #1 \n"
1553 " beq 4f @ skip oversized fragment \n"
1554 " vld1.8 {d16,d17}, [%[src]]! \n"
1555 " vst1.8 {d16,d17}, [%[dst]]! \n"
1558 // Clobbered input registers marked as input/outputs
1559 : [dst] "+r" (t_dst), [src] "+r" (t_src), [count] "+r" (count)
1561 // No unclobbered inputs
1564 // Clobbered vector registers
1565 // NB: these are the quad aliases of the double registers used in the asm
1566 : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
1572 uint8x16x4_t t1 = vld4q_u8(t_src);
1573 uint8x16x4_t t2 = vld4q_u8(t_src + sizeof(uint8x16x4_t));
1574 t_src += sizeof(uint8x16x4_t) * 2;
1575 vst4q_u8(t_dst, t1);
1576 vst4q_u8(t_dst + sizeof(uint8x16x4_t), t2);
1577 t_dst += sizeof(uint8x16x4_t) * 2;
1582 uint8x16x4_t t1 = vld4q_u8(t_src);
1583 t_src += sizeof(uint8x16x4_t);
1584 vst4q_u8(t_dst, t1);
1585 t_dst += sizeof(uint8x16x4_t);
1589 uint8x8x4_t t1 = vld4_u8(t_src);
1590 t_src += sizeof(uint8x8x4_t);
1592 t_dst += sizeof(uint8x8x4_t);
1596 uint8x16_t t1 = vld1q_u8(t_src);
1597 t_src += sizeof(uint8x16_t);
1598 vst1q_u8(t_dst, t1);
1599 t_dst += sizeof(uint8x16_t);
1602 #endif // !USE_GCC_INLINE_ASM
1605 if(trailer_count & 8) {
1606 uint8x8_t t1 = vld1_u8(t_src);
1607 t_src += sizeof(uint8x8_t);
1609 t_dst += sizeof(uint8x8_t);
1612 if(trailer_count & 4) {
1613 *((uint32_t*) t_dst) = *((uint32_t*) t_src);
1618 if(trailer_count & 2) {
1619 *((uint16_t*) t_dst) = *((uint16_t*) t_src);
1624 if(trailer_count & 1) {
1625 *t_dst++ = *t_src++;
1630 static inline void solid_over_565_8_pix_neon(
1631 uint32_t glyph_colour,
1634 uint32_t dest_stride, // bytes, not elements
1635 uint32_t mask_stride,
1636 uint32_t count // 8-pixel groups
1639 // Inner loop of glyph blitter (solid colour, alpha mask)
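// Roughly, per channel of each 8-pixel group, with m = (mask * colour_alpha) >> 8:
//     dest = colour_channel * m + dest_channel * (255 - m)
// where the 565 destination is expanded to 8 bits per channel on load and
// repacked on store using the same bit-replication trick as unpack0565.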
1641 #ifdef USE_GCC_INLINE_ASM
1644 " vld4.8 {d20[],d21[],d22[],d23[]}, [%[glyph_colour]] @ splat solid colour components \n"
1646 " vld1.16 {d0,d1}, [%[dest]] @ load first pixels from framebuffer \n"
1647 " vld1.8 {d17}, [%[in_mask]] @ load alpha mask of glyph \n"
1648 " vmull.u8 q9, d17, d23 @ apply glyph colour alpha to mask \n"
1649 " vshrn.u16 d17, q9, #8 @ reformat it to match original mask \n"
1650 " vmvn d18, d17 @ we need the inverse mask for the background \n"
1651 " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
1652 " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
1653 " vshrn.u16 d4, q0, #3 @ unpack green \n"
1654 " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n"
1655 " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n"
1656 " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n"
1657 " vmull.u8 q1, d2, d18 @ apply inverse mask to background red... \n"
1658 " vmull.u8 q2, d4, d18 @ ...green... \n"
1659 " vmull.u8 q3, d6, d18 @ ...blue \n"
1660 " subs %[count], %[count], #1 @ decrement/test loop counter \n"
1661 " vmlal.u8 q1, d17, d22 @ add masked foreground red... \n"
1662 " vmlal.u8 q2, d17, d21 @ ...green... \n"
1663 " vmlal.u8 q3, d17, d20 @ ...blue \n"
1664 " add %[in_mask], %[in_mask], %[mask_stride] @ advance mask pointer, while we wait \n"
1665 " vsri.16 q1, q2, #5 @ pack green behind red \n"
1666 " vsri.16 q1, q3, #11 @ pack blue into pixels \n"
1667 " vst1.16 {d2,d3}, [%[dest]] @ store composited pixels \n"
1668 " add %[dest], %[dest], %[dest_stride] @ advance framebuffer pointer \n"
1669 " bne 0b @ next please \n"
1671 // Clobbered registers marked as input/outputs
1672 : [dest] "+r" (dest), [in_mask] "+r" (in_mask), [count] "+r" (count)
1675 : [dest_stride] "r" (dest_stride), [mask_stride] "r" (mask_stride), [glyph_colour] "r" (&glyph_colour)
1677 // Clobbers, including the inputs we modify, and potentially lots of memory
1678 : "q0", "q1", "q2", "q3", "d17", "q9", "q10", "q11", "q12", "cc", "memory"
1683 uint8x8x4_t solid_colour = vld4_dup_u8((uint8_t*) &glyph_colour);
1687 uint16x8_t pixels = vld1q_u16(dest);
1688 uint8x8_t mask = vshrn_n_u16(vmull_u8(solid_colour.val[3], vld1_u8(in_mask)), 8);
1689 uint8x8_t mask_image = vmvn_u8(mask);
1691 uint8x8_t t_red = vshrn_n_u16(pixels, 8);
1692 uint8x8_t t_green = vshrn_n_u16(pixels, 3);
1693 uint8x8_t t_blue = vshrn_n_u16(vsliq_n_u16(pixels, pixels, 5), 2);
1695 uint16x8_t s_red = vmull_u8(vsri_n_u8(t_red , t_red , 5), mask_image);
1696 uint16x8_t s_green = vmull_u8(vsri_n_u8(t_green, t_green, 6), mask_image);
1697 uint16x8_t s_blue = vmull_u8( t_blue , mask_image);
1699 s_red = vmlal_u8(s_red , mask, solid_colour.val[2]);
1700 s_green = vmlal_u8(s_green, mask, solid_colour.val[1]);
1701 s_blue = vmlal_u8(s_blue , mask, solid_colour.val[0]);
1703 pixels = vsriq_n_u16(s_red, s_green, 5);
1704 pixels = vsriq_n_u16(pixels, s_blue, 11);
1705 vst1q_u16(dest, pixels);
1707 dest = (uint16_t*) ((uint8_t*) dest + dest_stride); /* dest_stride is in bytes */
1708 in_mask += mask_stride;
1715 neon_composite_over_n_8_0565 (
1716 pixman_implementation_t * impl,
1718 pixman_image_t * src_image,
1719 pixman_image_t * mask_image,
1720 pixman_image_t * dst_image,
1731 uint16_t *dst_line, *aligned_line;
1733 uint32_t dst_stride, mask_stride;
1734 uint32_t kernel_count, copy_count, copy_tail;
1735 uint8_t kernel_offset, copy_offset;
1737 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
1739 // bail out if fully transparent or degenerate
1743 if(width == 0 || height == 0)
1746 if(width > NEON_SCANLINE_BUFFER_PIXELS) {
1747 // split the blit, so we can use a fixed-size scanline buffer
1748 // TODO: there must be a more elegant way of doing this.
1750 for(x=0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) {
1751 neon_composite_over_n_8_0565(impl, op, src_image, mask_image, dst_image, src_x+x, src_y, mask_x+x, mask_y, dest_x+x, dest_y,
1752 (x+NEON_SCANLINE_BUFFER_PIXELS > width) ? width-x : NEON_SCANLINE_BUFFER_PIXELS, height);
1757 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1758 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1760 // cover the width with the minimum number of aligned quadwords
1761 // while also keeping the number of columns to process to a minimum
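// Below: aligned_left/aligned_right bound the scanline's quadword-aligned
// footprint, ceiling_length is the scanline's own byte length rounded up to
// a whole quadword, copy_offset is how many pixels the aligned start
// precedes the true left edge by, and copy_count/kernel_count are the
// quadwords handled by the staging copy and the compositing kernel.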
1763 unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
1764 unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
1765 unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
1767 // the fast copy should be quadword aligned
1768 copy_offset = dst_line - ((uint16_t*) aligned_left);
1769 aligned_line = dst_line - copy_offset;
1770 copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
1773 if(aligned_right - aligned_left > ceiling_length) {
1774 // unaligned routine is tightest
1775 kernel_count = (uint32_t) (ceiling_length >> 4);
1776 kernel_offset = copy_offset;
1778 // aligned routine is equally tight, so it is safer to align
1779 kernel_count = copy_count;
1783 // We should avoid reading beyond scanline ends for safety
1784 if(aligned_line < (dst_line - dest_x) ||
1785 (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
1787 // switch to precise read
1788 copy_offset = kernel_offset = 0;
1789 aligned_line = dst_line;
1790 kernel_count = (uint32_t) (ceiling_length >> 4);
1791 copy_count = (width * sizeof(*dst_line)) >> 4;
1792 copy_tail = (width * sizeof(*dst_line)) & 0xF;
1797 uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; // deliberately not initialised
1798 uint8_t glyph_line[NEON_SCANLINE_BUFFER_PIXELS + 8];
1802 // left edge, middle block, right edge
1803 for( ; y--; mask_line += mask_stride, aligned_line += dst_stride, dst_line += dst_stride) {
1804 // We don't want to overrun the edges of the glyph, so realign the edge data into known buffers
1805 neon_quadword_copy(glyph_line + copy_offset, mask_line, width >> 4, width & 0xF);
1807 // Uncached framebuffer access is really, really slow if we do it piecemeal.
1808 // It should be much faster if we grab it all at once.
1809 // One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth.
1810 neon_quadword_copy(scan_line, aligned_line, copy_count, copy_tail);
1812 // Apply the actual filter
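// Within the staging buffers successive 8-pixel groups are contiguous, so
// the "strides" passed here are simply 16 bytes of pixels and 8 mask bytes.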
1813 solid_over_565_8_pix_neon(src, scan_line + kernel_offset, glyph_line + kernel_offset, 8 * sizeof(*dst_line), 8, kernel_count);
1815 // Copy the modified scanline back
1816 neon_quadword_copy(dst_line, scan_line + copy_offset, width >> 3, (width & 7) * 2);
1821 #ifdef USE_GCC_INLINE_ASM
1823 static inline void plain_over_565_8_pix_neon(
1826 uint32_t dest_stride, // bytes, not elements
1827 uint32_t count // 8-pixel groups
1830 // Inner loop for plain translucent rects (solid colour without alpha mask)
1832 " vld4.8 {d20[],d21[],d22[],d23[]}, [%[colour]] @ solid colour load/splat \n"
1833 " vmull.u8 q12, d23, d22 @ premultiply alpha red \n"
1834 " vmull.u8 q13, d23, d21 @ premultiply alpha green \n"
1835 " vmull.u8 q14, d23, d20 @ premultiply alpha blue \n"
1836 " vmvn d18, d23 @ inverse alpha for background \n"
1838 " vld1.16 {d0,d1}, [%[dest]] @ load first pixels from framebuffer \n"
1839 " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
1840 " vshrn.u16 d4, q0, #3 @ unpack green \n"
1841 " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
1842 " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n"
1843 " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n"
1844 " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n"
1845 " vmov q0, q12 @ retrieve foreground red \n"
1846 " vmlal.u8 q0, d2, d18 @ blend red - my kingdom for a four-operand MLA \n"
1847 " vmov q1, q13 @ retrieve foreground green \n"
1848 " vmlal.u8 q1, d4, d18 @ blend green \n"
1849 " vmov q2, q14 @ retrieve foreground blue \n"
1850 " vmlal.u8 q2, d6, d18 @ blend blue \n"
1851 " subs %[count], %[count], #1 @ decrement/test loop counter \n"
1852 " vsri.16 q0, q1, #5 @ pack green behind red \n"
1853 " vsri.16 q0, q2, #11 @ pack blue into pixels \n"
1854 " vst1.16 {d0,d1}, [%[dest]] @ store composited pixels \n"
1855 " add %[dest], %[dest], %[dest_stride] @ advance framebuffer pointer \n"
1856 " bne 0b @ next please \n"
1858 // Clobbered registers marked as input/outputs
1859 : [dest] "+r" (dest), [count] "+r" (count)
1862 : [dest_stride] "r" (dest_stride), [colour] "r" (&colour)
1864 // Clobbers, including the inputs we modify, and potentially lots of memory
1865 : "q0", "q1", "q2", "q3", "q9", "q10", "q11", "q12", "q13", "q14", "cc", "memory"
1870 neon_composite_over_n_0565 (
1871 pixman_implementation_t * impl,
1873 pixman_image_t * src_image,
1874 pixman_image_t * mask_image,
1875 pixman_image_t * dst_image,
1886 uint16_t *dst_line, *aligned_line;
1887 uint32_t dst_stride;
1888 uint32_t kernel_count, copy_count, copy_tail;
1889 uint8_t kernel_offset, copy_offset;
1891 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
1893 // bail out if fully transparent
1897 if(width == 0 || height == 0)
1900 if(width > NEON_SCANLINE_BUFFER_PIXELS) {
1901 // split the blit, so we can use a fixed-size scanline buffer
1902 // TODO: there must be a more elegant way of doing this.
1904 for(x=0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) {
1905 neon_composite_over_n_0565(impl, op, src_image, mask_image, dst_image, src_x+x, src_y, mask_x+x, mask_y, dest_x+x, dest_y,
1906 (x+NEON_SCANLINE_BUFFER_PIXELS > width) ? width-x : NEON_SCANLINE_BUFFER_PIXELS, height);
1911 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1913 // cover the width with the minimum number of aligned quadwords
1914 // while also keeping the number of columns to process to a minimum
1916 unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
1917 unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
1918 unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
1920 // the fast copy should be quadword aligned
1921 copy_offset = dst_line - ((uint16_t*) aligned_left);
1922 aligned_line = dst_line - copy_offset;
1923 copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
1926 if(aligned_right - aligned_left > ceiling_length) {
1927 // unaligned routine is tightest
1928 kernel_count = (uint32_t) (ceiling_length >> 4);
1929 kernel_offset = copy_offset;
1931 // aligned routine is equally tight, so it is safer to align
1932 kernel_count = copy_count;
1936 // We should avoid reading beyond scanline ends for safety
1937 if(aligned_line < (dst_line - dest_x) ||
1938 (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
1940 // switch to precise read
1941 copy_offset = kernel_offset = 0;
1942 aligned_line = dst_line;
1943 kernel_count = (uint32_t) (ceiling_length >> 4);
1944 copy_count = (width * sizeof(*dst_line)) >> 4;
1945 copy_tail = (width * sizeof(*dst_line)) & 0xF;
1950 uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; // deliberately not initialised
1953 // left edge, middle block, right edge
1954 for( ; height--; aligned_line += dst_stride, dst_line += dst_stride) {
1956 // Uncached framebuffer access is really, really slow if we do it piecemeal.
1957 // It should be much faster if we grab it all at once.
1958 // One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth.
1959 neon_quadword_copy(scan_line, aligned_line, copy_count, copy_tail);
1961 // Apply the actual filter
1962 plain_over_565_8_pix_neon(src, scan_line + kernel_offset, 8 * sizeof(*dst_line), kernel_count);
1964 // Copy the modified scanline back
1965 neon_quadword_copy(dst_line, scan_line + copy_offset, width >> 3, (width & 7) * 2);
1970 static inline void ARGB8_over_565_8_pix_neon(
1973 uint32_t src_stride, // bytes, not elements
1974 uint32_t count // 8-pixel groups
1979 " pld [%[src], %[src_stride]] @ preload from next scanline \n"
1980 " vld1.16 {d0,d1}, [%[dest]] @ load pixels from framebuffer \n"
1981 " vld4.8 {d20,d21,d22,d23},[%[src]]! @ load source image pixels \n"
1982 " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
1983 " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
1984 " vshrn.u16 d4, q0, #3 @ unpack green \n"
1985 " vmvn d18, d23 @ we need the inverse alpha for the background \n"
1986 " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n"
1987 " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n"
1988 " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n"
1989 " vmull.u8 q1, d2, d18 @ apply inverse alpha to background red... \n"
1990 " vmull.u8 q2, d4, d18 @ ...green... \n"
1991 " vmull.u8 q3, d6, d18 @ ...blue \n"
1992 " subs %[count], %[count], #1 @ decrement/test loop counter \n"
1993 " vmlal.u8 q1, d23, d22 @ add blended foreground red... \n"
1994 " vmlal.u8 q2, d23, d21 @ ...green... \n"
1995 " vmlal.u8 q3, d23, d20 @ ...blue \n"
1996 " vsri.16 q1, q2, #5 @ pack green behind red \n"
1997 " vsri.16 q1, q3, #11 @ pack blue into pixels \n"
1998 " vst1.16 {d2,d3}, [%[dest]]! @ store composited pixels \n"
1999 " bne 0b @ next please \n"
2001 // Clobbered registers marked as input/outputs
2002 : [dest] "+r" (dest), [src] "+r" (src), [count] "+r" (count)
2005 : [src_stride] "r" (src_stride)
2007 // Clobbers, including the inputs we modify, and potentially lots of memory
2008 : "q0", "q1", "q2", "q3", "d17", "d18", "q10", "q11", "cc", "memory"
2013 neon_composite_over_8888_0565 (
2014 pixman_implementation_t * impl,
2016 pixman_image_t * src_image,
2017 pixman_image_t * mask_image,
2018 pixman_image_t * dst_image,
2029 uint16_t *dst_line, *aligned_line;
2030 uint32_t dst_stride, src_stride;
2031 uint32_t kernel_count, copy_count, copy_tail;
2032 uint8_t kernel_offset, copy_offset;
2034 // we assume mask is opaque
2035 // so the only alpha to deal with is embedded in src
2037 if(width > NEON_SCANLINE_BUFFER_PIXELS) {
2038 // split the blit, so we can use a fixed-size scanline buffer
2040 for(x=0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) {
2041 neon_composite_over_8888_0565(impl, op, src_image, mask_image, dst_image, src_x+x, src_y, mask_x+x, mask_y, dest_x+x, dest_y,
2042 (x+NEON_SCANLINE_BUFFER_PIXELS > width) ? width-x : NEON_SCANLINE_BUFFER_PIXELS, height);
2047 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2048 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2050 // cover the width with the minimum number of aligned quadwords
2051 // while also keeping the number of columns to process to a minimum
2053 unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2054 unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2055 unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2057 // the fast copy should be quadword aligned
2058 copy_offset = dst_line - ((uint16_t*) aligned_left);
2059 aligned_line = dst_line - copy_offset;
2060 copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
2063 if(aligned_right - aligned_left > ceiling_length) {
2064 // unaligned routine is tightest
2065 kernel_count = (uint32_t) (ceiling_length >> 4);
2066 kernel_offset = copy_offset;
2068 // aligned routine is equally tight, so it is safer to align
2069 kernel_count = copy_count;
2073 // We should avoid reading beyond scanline ends for safety
2074 if(aligned_line < (dst_line - dest_x) ||
2075 (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2077 // switch to precise read
2078 copy_offset = kernel_offset = 0;
2079 aligned_line = dst_line;
2080 kernel_count = (uint32_t) (ceiling_length >> 4);
2081 copy_count = (width * sizeof(*dst_line)) >> 4;
2082 copy_tail = (width * sizeof(*dst_line)) & 0xF;
2086 /* Preload the first input scanline */
2088 uint8_t *src_ptr = (uint8_t*) src_line;
2089 uint32_t count = (width + 15) / 16;
2091 #ifdef USE_GCC_INLINE_ASM
2094 " subs %[count], %[count], #1 \n"
2096 " add %[src], %[src], #64 \n"
2099 // Clobbered input registers marked as input/outputs
2100 : [src] "+r" (src_ptr), [count] "+r" (count)
2101 : // no unclobbered inputs
2113 uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; // deliberately not initialised
2116 // left edge, middle block, right edge
2117 for( ; height--; src_line += src_stride, aligned_line += dst_stride) {
2118 // Uncached framebuffer access is really, really slow if we do it piecemeal.
2119 // It should be much faster if we grab it all at once.
2120 // One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth.
2121 neon_quadword_copy(scan_line, aligned_line, copy_count, copy_tail);
2123 // Apply the actual filter
2124 ARGB8_over_565_8_pix_neon(src_line, scan_line + kernel_offset, src_stride * sizeof(*src_line), kernel_count);
2126 // Copy the modified scanline back
2127 neon_quadword_copy(dst_line, scan_line + copy_offset, width >> 3, (width & 7) * 2);
2132 #endif // USE_GCC_INLINE_ASM
2134 static const pixman_fast_path_t arm_neon_fast_path_array[] =
2136 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, neon_composite_add_8888_8_8, 0 },
2137 { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, neon_composite_add_8000_8000, 0 },
2138 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, neon_composite_over_n_8_0565, 0 },
2139 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, neon_composite_over_n_8_0565, 0 },
2140 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_24_16, 0 },
2141 { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_24_16, 0 },
2142 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_24_16, 0 },
2143 { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_24_16, 0 },
2144 #ifdef USE_GCC_INLINE_ASM
2145 { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_16_16, 0 },
2146 { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_16_16, 0 },
2147 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_over_n_0565, 0 },
2148 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_over_n_0565, 0 },
2149 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_over_8888_0565, 0 },
2150 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_over_8888_0565, 0 },
2152 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, neon_composite_over_8888_8888, 0 },
2153 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, neon_composite_over_8888_8888, 0 },
2154 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, neon_composite_over_8888_8888, 0 },
2155 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_over_8888_8888, 0 },
2156 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2157 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2158 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_composite_over_n_8_8888, 0 },
2159 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, neon_composite_over_n_8_8888, 0 },
2160 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, neon_composite_over_n_8_8888, 0 },
2161 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, neon_composite_over_n_8_8888, 0 },
2165 const pixman_fast_path_t *const arm_neon_fast_paths = arm_neon_fast_path_array;
2168 arm_neon_composite (pixman_implementation_t *imp,
2170 pixman_image_t *src,
2171 pixman_image_t *mask,
2172 pixman_image_t *dest,
2182 if (_pixman_run_fast_path (arm_neon_fast_paths, imp,
2183 op, src, mask, dest,
2192 _pixman_implementation_composite (imp->delegate, op,
2200 static pixman_bool_t
2208 int src_x, int src_y,
2209 int dst_x, int dst_y,
2210 int width, int height)
2212 if(!width || !height)
2215 // accelerate only straight copies involving complete bytes
2216 if(src_bpp != dst_bpp || (src_bpp & 7))
2220 uint32_t bytes_per_pixel = src_bpp >> 3;
2221 uint32_t byte_width = width * bytes_per_pixel;
2222 int32_t src_stride_bytes = src_stride * 4; // parameter is in words for some reason
2223 int32_t dst_stride_bytes = dst_stride * 4;
2224 uint8_t *src_bytes = ((uint8_t*) src_bits) + src_y * src_stride_bytes + src_x * bytes_per_pixel;
2225 uint8_t *dst_bytes = ((uint8_t*) dst_bits) + dst_y * dst_stride_bytes + dst_x * bytes_per_pixel;
2226 uint32_t quadword_count = byte_width / 16;
2227 uint32_t offset = byte_width % 16;
2230 neon_quadword_copy(dst_bytes, src_bytes, quadword_count, offset);
2231 src_bytes += src_stride_bytes;
2232 dst_bytes += dst_stride_bytes;
2239 static pixman_bool_t
2240 arm_neon_blt (pixman_implementation_t *imp,
2247 int src_x, int src_y,
2248 int dst_x, int dst_y,
2249 int width, int height)
2251 if (pixman_blt_neon (
2252 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2253 src_x, src_y, dst_x, dst_y, width, height))
2256 return _pixman_implementation_blt (
2258 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2259 src_x, src_y, dst_x, dst_y, width, height);
2262 static pixman_bool_t
2263 arm_neon_fill (pixman_implementation_t *imp,
2273 if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor))
2276 return _pixman_implementation_fill (
2277 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
2280 pixman_implementation_t *
2281 _pixman_implementation_create_arm_neon (void)
2283 pixman_implementation_t *simd = _pixman_implementation_create_arm_simd();
2284 pixman_implementation_t *imp = _pixman_implementation_create (simd);
2286 imp->composite = arm_neon_composite;
2287 imp->blt = arm_neon_blt;
2288 imp->fill = arm_neon_fill;