2 * Copyright 2012 The Android Open Source Project
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
8 #include "SkBlitRow_opts_arm_neon.h"
10 #include "SkBlitMask.h"
11 #include "SkBlitRow.h"
12 #include "SkColorPriv.h"
14 #include "SkMathPriv.h"
17 #include "SkColor_opts_neon.h"
// Loads 8 de-interleaved BGRA/RGBA pixels (32 bytes) from *src using the
// AArch64 ld4 instruction and advances the caller's src pointer by 8 pixels
// (post-increment #32 on a by-reference pointer argument).
// Only three of the four channel lanes (v0..v2) are copied out; v3 is loaded
// but discarded — this variant is for callers that ignore the alpha channel.
// NOTE(review): lines declaring the uint8x8x4_t result and closing the asm
// statement are not visible in this view; behavior beyond what is shown is
// unverified here.
21 static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
23 uint8x8_t vsrc_0, vsrc_1, vsrc_2;
// ld4 de-interleaves: v0=channel0, v1=channel1, v2=channel2, v3=channel3.
26 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
27 "mov %[vsrc0].8b, v0.8b \t\n"
28 "mov %[vsrc1].8b, v1.8b \t\n"
29 "mov %[vsrc2].8b, v2.8b \t\n"
// src is "+&r": read-write and early-clobber, since it is post-incremented.
30 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
31 [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
// v0-v3 are clobbered scratch registers for the ld4 destination.
32 : : "v0", "v1", "v2", "v3"
// Same as sk_vld4_u8_arm64_3, but copies out all four channel lanes
// (including alpha) for callers that need per-pixel alpha, and advances the
// caller's src pointer by 8 pixels.
// NOTE(review): the result-struct assembly lines and the closing of the asm
// statement fall in gaps of this view.
42 static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
44 uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;
// ld4 de-interleaves 8 pixels into four 8-byte channel vectors.
47 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
48 "mov %[vsrc0].8b, v0.8b \t\n"
49 "mov %[vsrc1].8b, v1.8b \t\n"
50 "mov %[vsrc2].8b, v2.8b \t\n"
51 "mov %[vsrc3].8b, v3.8b \t\n"
52 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
53 [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
55 : : "v0", "v1", "v2", "v3"
// Converts a row of opaque 32-bit premultiplied pixels to RGB565.
// alpha must be 255 (asserted); x/y are unused (no dithering here).
// Vector path handles 8 pixels at a time via SkPixel32ToPixel16_neon8;
// the scalar tail converts one pixel at a time.
// NOTE(review): the loop headers and count bookkeeping are not visible in
// this view — only the per-iteration bodies are shown.
67 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
68 const SkPMColor* SK_RESTRICT src, int count,
69 U8CPU alpha, int /*x*/, int /*y*/) {
70 SkASSERT(255 == alpha);
// ARM64 path: helper advances src by 8 pixels as a side effect.
78 vsrc = sk_vld4_u8_arm64_3(src);
// Non-ARM64 path: plain vld4 intrinsic.
80 vsrc = vld4_u8((uint8_t*)src);
// Pack 8 pixels of 8888 down to 565 in one shot.
85 vdst = SkPixel32ToPixel16_neon8(vsrc);
90 // Prepare next iteration
// Scalar tail: per-pixel 32->16 conversion.
99 *dst = SkPixel32ToPixel16_ToU16(c);
// Blends a row of 32-bit premultiplied source pixels onto an RGB565
// destination with a constant global alpha (0 < alpha < 255, asserted).
// Per channel: dst' = dst + ((src565 - dst) * scale) >> 8, where scale is
// SkAlpha255To256(alpha). Vector path does 8 pixels/iteration; a scalar
// do/while handles the tail.
105 void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
106 const SkPMColor* SK_RESTRICT src, int count,
107 U8CPU alpha, int /*x*/, int /*y*/) {
108 SkASSERT(255 > alpha);
110 uint16x8_t vmask_blue, vscale;
// scale in [1,256]; 0x1F masks the low 5 blue bits of a 565 pixel.
113 vscale = vdupq_n_u16(SkAlpha255To256(alpha));
114 vmask_blue = vmovq_n_u16(0x1F);
118 uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
119 uint16x8_t vres_r, vres_g, vres_b;
// ARM64: helper loads 8 pixels and advances src.
123 vsrc = sk_vld4_u8_arm64_3(src);
// ARM32: force the vld4 destination into d0-d3 via explicit register vars.
126 register uint8x8_t d0 asm("d0");
127 register uint8x8_t d1 asm("d1");
128 register uint8x8_t d2 asm("d2");
129 register uint8x8_t d3 asm("d3");
132 "vld4.8 {d0-d3},[%[src]]!"
133 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
142 // Load and unpack dst
143 vdst = vld1q_u16(dst);
144 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes
145 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
146 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red
147 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green
149 // Shift src to 565 range
150 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
151 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
152 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);
// Signed-difference trick in unsigned lanes: (src - dst) may wrap, but the
// subsequent multiply/shift yields the correct blended result modulo 2^16.
155 vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
156 vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
157 vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;
159 vres_r = vshrq_n_u16(vres_r * vscale, 8);
160 vres_g = vshrq_n_u16(vres_g * vscale, 8);
161 vres_b = vshrq_n_u16(vres_b * vscale, 8);
// NOTE(review): the "add dst back" step expected between the scale and the
// 565 repack falls in a gap of this view.
168 vres_b = vsliq_n_u16(vres_b, vres_g, 5); // insert green into blue
169 vres_b = vsliq_n_u16(vres_b, vres_r, 6+5); // insert red into green/blue
172 vst1q_u16(dst, vres_b);
// Scalar tail: classic per-channel SkAlphaBlend in 565 space.
177 int scale = SkAlpha255To256(alpha);
179 SkPMColor c = *src++;
182 *dst++ = SkPackRGB16(
183 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
184 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
185 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
186 } while (--count != 0);
// ARM32 (A32/Thumb-2) version: source-over blends a row of premultiplied
// 8888 pixels onto an RGB565 destination. alpha must be 255 (per-pixel
// alpha from src is used instead). Entirely hand-written inline assembly:
// a first asm block handles count%8 leading pixels plus the main 8-pixel
// loop, and a second asm block handles a residual count < 8 with lane loads.
// NOTE(review): several asm lines (labels, branches, 'it' instructions,
// closing braces) fall in gaps of this view — the control flow documented
// here is inferred only from the visible instructions.
191 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
192 const SkPMColor* SK_RESTRICT src, int count,
193 U8CPU alpha, int /*x*/, int /*y*/) {
194 SkASSERT(255 == alpha);
// keep_dst trails dst by one iteration so stores happen after the loads of
// the next batch (software pipelining).
197 uint16_t* SK_RESTRICT keep_dst = 0;
// ip = count & 7: number of pixels in the ragged head.
200 "ands ip, %[count], #7 \n\t"
// d31 = 0x80 per lane: rounding bias for the >>8 divide-by-255 approximation.
201 "vmov.u8 d31, #1<<7 \n\t"
202 "vld1.16 {q12}, [%[dst]] \n\t"
203 "vld4.8 {d0-d3}, [%[src]] \n\t"
204 // Thumb does not support the standard ARM conditional
205 // instructions but instead requires the 'it' instruction
206 // to signal conditional execution
209 "mov %[keep_dst], %[dst] \n\t"
// Skip the ragged head: advance src by ip pixels (x4 bytes) and dst (x2).
211 "add %[src], %[src], ip, LSL#2 \n\t"
212 "add %[dst], %[dst], ip, LSL#1 \n\t"
213 "subs %[count], %[count], ip \n\t"
// Main loop: load next dst/src batch, store previous result via keep_dst.
218 "vld1.16 {q12}, [%[dst]]! \n\t"
219 "vld4.8 {d0-d3}, [%[src]]! \n\t"
220 "vst1.16 {q10}, [%[keep_dst]] \n\t"
221 "sub %[keep_dst], %[dst], #8*2 \n\t"
222 "subs %[count], %[count], #8 \n\t"
224 "pld [%[dst],#32] \n\t"
225 // expand 0565 q12 to 8888 {d4-d7}
226 "vmovn.u16 d4, q12 \n\t"
227 "vshr.u16 q11, q12, #5 \n\t"
228 "vshr.u16 q10, q12, #6+5 \n\t"
229 "vmovn.u16 d5, q11 \n\t"
230 "vmovn.u16 d6, q10 \n\t"
231 "vshl.u8 d4, d4, #3 \n\t"
232 "vshl.u8 d5, d5, #2 \n\t"
233 "vshl.u8 d6, d6, #3 \n\t"
// Seed the three accumulators with the 0x80 rounding bias.
235 "vmovl.u8 q14, d31 \n\t"
236 "vmovl.u8 q13, d31 \n\t"
237 "vmovl.u8 q12, d31 \n\t"
239 // duplicate in 4/2/1 & 8pix vsns
// d30 = 255 - src_alpha; accumulate dst*(255-a) then ((x + x>>8) >> 8)
// approximates division by 255 (the vaddhn pairs below).
240 "vmvn.8 d30, d3 \n\t"
241 "vmlal.u8 q14, d30, d6 \n\t"
242 "vmlal.u8 q13, d30, d5 \n\t"
243 "vmlal.u8 q12, d30, d4 \n\t"
244 "vshr.u16 q8, q14, #5 \n\t"
245 "vshr.u16 q9, q13, #6 \n\t"
246 "vaddhn.u16 d6, q14, q8 \n\t"
247 "vshr.u16 q8, q12, #5 \n\t"
248 "vaddhn.u16 d5, q13, q9 \n\t"
249 "vqadd.u8 d6, d6, d0 \n\t" // moved up
250 "vaddhn.u16 d4, q12, q8 \n\t"
251 // intentionally don't calculate alpha
// Saturating add of premultiplied src channels completes src-over.
254 "vqadd.u8 d5, d5, d1 \n\t"
255 "vqadd.u8 d4, d4, d2 \n\t"
257 // pack 8888 {d4-d6} to 0565 q10
258 "vshll.u8 q10, d6, #8 \n\t"
259 "vshll.u8 q3, d5, #8 \n\t"
260 "vshll.u8 q2, d4, #8 \n\t"
261 "vsri.u16 q10, q3, #5 \n\t"
262 "vsri.u16 q10, q2, #11 \n\t"
// Drain the pipelined final store.
267 "vst1.16 {q10}, [%[keep_dst]] \n\t"
268 : [count] "+r" (count)
269 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
270 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
271 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
276 { // handle count < 8
277 uint16_t* SK_RESTRICT keep_dst = 0;
280 "vmov.u8 d31, #1<<7 \n\t"
281 "mov %[keep_dst], %[dst] \n\t"
// Conditionally load 4 / 2 / 1 pixels into specific lanes based on the bits
// of count; together they assemble up to 7 pixels in q0/q1 and d24/d25.
283 "tst %[count], #4 \n\t"
285 "vld1.16 {d25}, [%[dst]]! \n\t"
286 "vld1.32 {q1}, [%[src]]! \n\t"
289 "tst %[count], #2 \n\t"
291 "vld1.32 {d24[1]}, [%[dst]]! \n\t"
292 "vld1.32 {d1}, [%[src]]! \n\t"
295 "tst %[count], #1 \n\t"
297 "vld1.16 {d24[1]}, [%[dst]]! \n\t"
298 "vld1.32 {d0[1]}, [%[src]]! \n\t"
301 // unzips achieve the same as a vld4 operation
302 "vuzp.u16 q0, q1 \n\t"
303 "vuzp.u8 d0, d1 \n\t"
304 "vuzp.u8 d2, d3 \n\t"
305 // expand 0565 q12 to 8888 {d4-d7}
306 "vmovn.u16 d4, q12 \n\t"
307 "vshr.u16 q11, q12, #5 \n\t"
308 "vshr.u16 q10, q12, #6+5 \n\t"
309 "vmovn.u16 d5, q11 \n\t"
310 "vmovn.u16 d6, q10 \n\t"
311 "vshl.u8 d4, d4, #3 \n\t"
312 "vshl.u8 d5, d5, #2 \n\t"
313 "vshl.u8 d6, d6, #3 \n\t"
315 "vmovl.u8 q14, d31 \n\t"
316 "vmovl.u8 q13, d31 \n\t"
317 "vmovl.u8 q12, d31 \n\t"
319 // duplicate in 4/2/1 & 8pix vsns
// Same blend math as the main loop above.
320 "vmvn.8 d30, d3 \n\t"
321 "vmlal.u8 q14, d30, d6 \n\t"
322 "vmlal.u8 q13, d30, d5 \n\t"
323 "vmlal.u8 q12, d30, d4 \n\t"
324 "vshr.u16 q8, q14, #5 \n\t"
325 "vshr.u16 q9, q13, #6 \n\t"
326 "vaddhn.u16 d6, q14, q8 \n\t"
327 "vshr.u16 q8, q12, #5 \n\t"
328 "vaddhn.u16 d5, q13, q9 \n\t"
329 "vqadd.u8 d6, d6, d0 \n\t" // moved up
330 "vaddhn.u16 d4, q12, q8 \n\t"
331 // intentionally don't calculate alpha
334 "vqadd.u8 d5, d5, d1 \n\t"
335 "vqadd.u8 d4, d4, d2 \n\t"
337 // pack 8888 {d4-d6} to 0565 q10
338 "vshll.u8 q10, d6, #8 \n\t"
339 "vshll.u8 q3, d5, #8 \n\t"
340 "vshll.u8 q2, d4, #8 \n\t"
341 "vsri.u16 q10, q3, #5 \n\t"
342 "vsri.u16 q10, q2, #11 \n\t"
// Conditionally store back 4 / 2 / 1 results, mirroring the lane loads.
345 "tst %[count], #4 \n\t"
347 "vst1.16 {d21}, [%[keep_dst]]! \n\t"
350 "tst %[count], #2 \n\t"
352 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t"
355 "tst %[count], #1 \n\t"
357 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t"
360 : [count] "+r" (count)
361 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
362 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
363 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
369 #else // #ifdef SK_CPU_ARM32
// AArch64 version: source-over blends premultiplied 8888 src onto RGB565
// dst, 16 pixels per iteration of the inline-asm loop. alpha must be 255
// (per-pixel src alpha drives the blend). A scalar do/while handles the
// remainder via SkSrcOver32To16.
// NOTE(review): loop labels/branches, the #endif for the byte-order check,
// and the closing of the asm/loop fall in gaps of this view.
371 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
372 const SkPMColor* SK_RESTRICT src, int count,
373 U8CPU alpha, int /*x*/, int /*y*/) {
374 SkASSERT(255 == alpha);
// v4 = 0x80 per 16-bit lane: rounding bias for the /255 approximation.
378 "movi v4.8h, #0x80 \t\n"
381 "sub %[count], %[count], #16 \t\n"
// Load 16 dst 565 pixels and 16 de-interleaved src pixels.
382 "ld1 {v16.8h-v17.8h}, [%[dst]] \t\n"
383 "ld4 {v0.16b-v3.16b}, [%[src]], #64 \t\n"
384 "prfm pldl1keep, [%[src],#512] \t\n"
385 "prfm pldl1keep, [%[dst],#256] \t\n"
// Expand dst 565 -> 888: green (>>5 then <<2), red (>>11 then <<3),
// blue (low bits then <<3).
386 "ushr v20.8h, v17.8h, #5 \t\n"
387 "ushr v31.8h, v16.8h, #5 \t\n"
388 "xtn v6.8b, v31.8h \t\n"
389 "xtn2 v6.16b, v20.8h \t\n"
390 "ushr v20.8h, v17.8h, #11 \t\n"
391 "shl v19.16b, v6.16b, #2 \t\n"
392 "ushr v31.8h, v16.8h, #11 \t\n"
393 "xtn v22.8b, v31.8h \t\n"
394 "xtn2 v22.16b, v20.8h \t\n"
395 "shl v18.16b, v22.16b, #3 \t\n"
// v3 = 255 - src_alpha (inverted alpha for the dst term).
396 "mvn v3.16b, v3.16b \t\n"
397 "xtn v16.8b, v16.8h \t\n"
398 "mov v7.16b, v4.16b \t\n"
399 "xtn2 v16.16b, v17.8h \t\n"
// umlal/umlal2 accumulate dst_channel * (255-a) + 0x80; the ushr/addhn
// pairs then compute ((x + x>>8) >> 8), the standard /255 approximation
// (shift counts differ per channel because of 5/6/5 bit widths).
400 "umlal v7.8h, v3.8b, v19.8b \t\n"
401 "shl v16.16b, v16.16b, #3 \t\n"
402 "mov v22.16b, v4.16b \t\n"
403 "ushr v24.8h, v7.8h, #6 \t\n"
404 "umlal v22.8h, v3.8b, v18.8b \t\n"
405 "ushr v20.8h, v22.8h, #5 \t\n"
406 "addhn v20.8b, v22.8h, v20.8h \t\n"
407 "cmp %[count], #16 \t\n"
408 "mov v6.16b, v4.16b \t\n"
409 "mov v5.16b, v4.16b \t\n"
410 "umlal v6.8h, v3.8b, v16.8b \t\n"
411 "umlal2 v5.8h, v3.16b, v19.16b \t\n"
412 "mov v17.16b, v4.16b \t\n"
413 "ushr v19.8h, v6.8h, #5 \t\n"
414 "umlal2 v17.8h, v3.16b, v18.16b \t\n"
415 "addhn v7.8b, v7.8h, v24.8h \t\n"
416 "ushr v18.8h, v5.8h, #6 \t\n"
417 "ushr v21.8h, v17.8h, #5 \t\n"
418 "addhn2 v7.16b, v5.8h, v18.8h \t\n"
419 "addhn2 v20.16b, v17.8h, v21.8h \t\n"
420 "mov v22.16b, v4.16b \t\n"
421 "addhn v6.8b, v6.8h, v19.8h \t\n"
422 "umlal2 v22.8h, v3.16b, v16.16b \t\n"
423 "ushr v5.8h, v22.8h, #5 \t\n"
424 "addhn2 v6.16b, v22.8h, v5.8h \t\n"
// Saturating add of the premultiplied src channels completes src-over;
// which src register is R vs B depends on the pixel byte order.
425 "uqadd v7.16b, v1.16b, v7.16b \t\n"
426 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
427 "uqadd v20.16b, v2.16b, v20.16b \t\n"
428 "uqadd v6.16b, v0.16b, v6.16b \t\n"
429 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
430 "uqadd v20.16b, v0.16b, v20.16b \t\n"
431 "uqadd v6.16b, v2.16b, v6.16b \t\n"
433 #error "This function only supports BGRA and RGBA."
// Repack 888 -> 565 with shll (widen<<8) + sri (shift-right-insert).
435 "shll v22.8h, v20.8b, #8 \t\n"
436 "shll v5.8h, v7.8b, #8 \t\n"
437 "sri v22.8h, v5.8h, #5 \t\n"
438 "shll v17.8h, v6.8b, #8 \t\n"
439 "shll2 v23.8h, v20.16b, #8 \t\n"
440 "shll2 v7.8h, v7.16b, #8 \t\n"
441 "sri v22.8h, v17.8h, #11 \t\n"
442 "sri v23.8h, v7.8h, #5 \t\n"
443 "shll2 v6.8h, v6.16b, #8 \t\n"
444 "st1 {v22.8h}, [%[dst]], #16 \t\n"
445 "sri v23.8h, v6.8h, #11 \t\n"
446 "st1 {v23.8h}, [%[dst]], #16 \t\n"
448 : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
449 :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
450 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
// Scalar tail for the remaining (count % 16) pixels.
457 SkPMColor c = *src++;
460 *dst = SkSrcOver32To16(c, *dst);
463 } while (--count != 0);
466 #endif // #ifdef SK_CPU_ARM32
// Rounded division by 255 of eight 16-bit lanes:
// (prod + 128 + ((prod + 128) >> 8)) >> 8 == (prod + 127.5...) / 255,
// the classic exact-for-16-bit approximation of SkDiv255Round.
468 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
469 prod += vdupq_n_u16(128);
470 prod += vshrq_n_u16(prod, 8);
471 return vshrq_n_u16(prod, 8);
// Blends premultiplied 8888 src onto RGB565 dst with both a per-pixel src
// alpha and a constant global alpha (< 255, asserted).
// dst_scale = 255 - (src_alpha * alpha)/255; result = src*alpha + dst*dst_scale,
// approximated with rounding shifts unless S32A_D565_BLEND_EXACT is defined.
474 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
475 const SkPMColor* SK_RESTRICT src, int count,
476 U8CPU alpha, int /*x*/, int /*y*/) {
477 SkASSERT(255 > alpha);
479 /* This code implements a Neon version of S32A_D565_Blend. The results have
480 * a few mismatches compared to the original code. These mismatches never
485 uint16x8_t valpha_max, vmask_blue;
489 valpha_max = vmovq_n_u16(255);
490 valpha = vdup_n_u8(alpha);
491 vmask_blue = vmovq_n_u16(SK_B16_MASK);
494 uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
495 uint16x8_t vres_a, vres_r, vres_g, vres_b;
499 vdst = vld1q_u16(dst);
// ARM64 path: helper loads all four channels and advances src.
501 vsrc = sk_vld4_u8_arm64_4(src);
// GCC > 4.6 accepts the %h operand modifier for a d-register quad.
503 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
505 "vld4.u8 %h[vsrc], [%[src]]!"
506 : [vsrc] "=w" (vsrc), [src] "+&r" (src)
// Older compilers: pin the destination to d0-d3 explicitly.
510 register uint8x8_t d0 asm("d0");
511 register uint8x8_t d1 asm("d1");
512 register uint8x8_t d2 asm("d2");
513 register uint8x8_t d3 asm("d3");
516 "vld4.u8 {d0-d3},[%[src]]!;"
517 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
526 #endif // #ifdef SK_CPU_ARM64
// Unpack dst 565 into separate 16-bit channel vectors.
530 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes
531 vdst_b = vdst & vmask_blue; // extract blue
532 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red
533 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
// Reduce src channels to 565 bit widths before scaling.
536 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
537 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
538 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
540 // calc src * src_scale
541 vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
542 vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
543 vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
544 vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
// dst_scale = 255 - round(sa * alpha / 255).
547 vres_a = SkDiv255Round_neon8(vres_a);
548 vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
550 // add dst * dst_scale to previous result
551 vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
552 vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
553 vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
555 #ifdef S32A_D565_BLEND_EXACT
556 // It is possible to get exact results with this but it is slow,
557 // even slower than C code in some cases
558 vres_r = SkDiv255Round_neon8(vres_r);
559 vres_g = SkDiv255Round_neon8(vres_g);
560 vres_b = SkDiv255Round_neon8(vres_b);
// Fast path: rounding shift by 8 approximates /255 (source of the
// documented small mismatches vs. the scalar code).
562 vres_r = vrshrq_n_u16(vres_r, 8);
563 vres_g = vrshrq_n_u16(vres_g, 8);
564 vres_b = vrshrq_n_u16(vres_b, 8);
// Repack channels into 565 with shift-left-insert.
567 vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
568 vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
571 vst1q_u16(dst, vres_b);
574 } while (count >= 8);
// Scalar tail: exact SkDiv255Round math, one pixel at a time.
578 while (count-- > 0) {
579 SkPMColor sc = *src++;
582 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
583 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
584 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
585 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
586 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
592 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
593 * each dither value is spaced out into byte lanes, and repeated
594 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
// 4 rows x 12 bytes: each 4x4 dither row repeated 3x so any (x&3) offset
// still yields 8 contiguous in-pattern bytes for vld1_u8.
597 static const uint8_t gDitherMatrix_Neon[48] = {
598 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
599 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
600 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
601 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
// Blends opaque 8888 src onto RGB565 dst with a constant alpha (< 255,
// asserted) and ordered dithering selected by (x, y). Vector path processes
// 8 pixels per iteration; scalar tail uses DITHER_VALUE/SkDITHER_*To565.
605 void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
606 int count, U8CPU alpha, int x, int y)
609 SkASSERT(255 > alpha);
611 // rescale alpha to range 1 - 256
612 int scale = SkAlpha255To256(alpha);
615 /* select row and offset for dither array */
616 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
618 uint8x8_t vdither = vld1_u8(dstart); // load dither values
619 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
621 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg
622 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask
627 uint8x8_t vsrc_r, vsrc_g, vsrc_b;
628 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
629 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
630 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
632 uint16x8_t vdst_r, vdst_g, vdst_b;
633 int16x8_t vres_r, vres_g, vres_b;
634 int8x8_t vres8_r, vres8_g, vres8_b;
636 // Load source and add dither
// ARM64: helper loads 8 pixels and advances src.
638 vsrc = sk_vld4_u8_arm64_3(src);
// ARM32: explicit d0-d3 register pinning for the vld4.
641 register uint8x8_t d0 asm("d0");
642 register uint8x8_t d1 asm("d1");
643 register uint8x8_t d2 asm("d2");
644 register uint8x8_t d3 asm("d3");
647 "vld4.8 {d0-d3},[%[src]]! "
648 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
656 vsrc_r = vsrc.val[NEON_R];
657 vsrc_g = vsrc.val[NEON_G];
658 vsrc_b = vsrc.val[NEON_B];
// Dither formula per channel: (c + dither - (c >> hi_bits)) >> drop_bits,
// matching SkDITHER_*32To565.
660 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
661 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
662 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
664 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
665 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen
666 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen
668 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result
669 vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result
670 vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue from result
672 vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
673 vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
674 vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
676 // Load dst and unpack
677 vdst = vld1q_u16(dst);
678 vdst_g = vshrq_n_u16(vdst, 5); // shift down to get green
679 vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
680 vdst_b = vandq_u16(vdst, vmask_b); // mask to get blue
682 // subtract dst from src and widen
683 vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
684 vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
685 vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
687 // multiply diffs by scale and shift
// Signed math so negative (src-dst) diffs scale correctly.
688 vres_r = vmulq_s16(vres_r, vscale);
689 vres_g = vmulq_s16(vres_g, vscale);
690 vres_b = vmulq_s16(vres_b, vscale);
692 vres8_r = vshrn_n_s16(vres_r, 8);
693 vres8_g = vshrn_n_s16(vres_g, 8);
694 vres8_b = vshrn_n_s16(vres_b, 8);
// dst + ((src - dst) * scale >> 8): the standard lerp formulation.
697 vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
698 vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
699 vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
701 // put result into 565 format
702 vres_b = vsliq_n_s16(vres_b, vres_g, 5); // shift up green and insert into blue
703 vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
706 vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
712 } while (count >= 8);
// Scalar tail with the same dither + blend math.
717 int scale = SkAlpha255To256(alpha);
720 SkPMColor c = *src++;
723 int dither = DITHER_VALUE(x);
724 int sr = SkGetPackedR32(c);
725 int sg = SkGetPackedG32(c);
726 int sb = SkGetPackedB32(c);
727 sr = SkDITHER_R32To565(sr, dither);
728 sg = SkDITHER_G32To565(sg, dither);
729 sb = SkDITHER_B32To565(sb, dither);
732 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
733 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
734 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
736 } while (--count != 0);
// Source-over blends a row of premultiplied 8888 pixels onto an 8888 dst.
// alpha must be 255 (per-pixel src alpha is used). The NEON loop is
// unrolled by UNROLL (presumably 4 pixels: two 2-pixel d-register halves —
// TODO confirm UNROLL's definition, which is outside this view); the scalar
// tail uses SkPMSrcOver.
740 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
741 const SkPMColor* SK_RESTRICT src,
742 int count, U8CPU alpha) {
744 SkASSERT(255 == alpha);
// vtbl mask broadcasting byte 3 / byte 7 (the alpha bytes of two pixels)
// across each pixel's four lanes.
748 uint8x8_t alpha_mask;
750 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
751 alpha_mask = vld1_u8(alpha_mask_setup);
753 /* do the NEON unrolled code */
755 while (count >= UNROLL) {
756 uint8x8_t src_raw, dst_raw, dst_final;
757 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
759 /* The two prefetches below may make the code slighlty
760 * slower for small values of count but are worth having
761 * in the general case.
763 __builtin_prefetch(src+32);
764 __builtin_prefetch(dst+32);
// First pair of pixels, then second pair.
767 src_raw = vreinterpret_u8_u32(vld1_u32(src));
769 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
772 /* get and hold the dst too */
773 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
775 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
778 /* 1st and 2nd bits of the unrolling */
780 uint8x8_t dst_cooked;
782 uint8x8_t alpha_narrow;
783 uint16x8_t alpha_wide;
785 /* get the alphas spread out properly */
786 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
// 256 - a collapses SkAlpha255To256(255 - a); >>8 below completes the scale.
787 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
789 /* spread the dest */
790 dst_wide = vmovl_u8(dst_raw);
792 /* alpha mul the dest */
793 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
794 dst_cooked = vshrn_n_u16(dst_wide, 8);
796 /* sum -- ignoring any byte lane overflows */
797 dst_final = vadd_u8(src_raw, dst_cooked);
801 /* the 3rd and 4th bits of our unrolling */
803 uint8x8_t dst_cooked;
805 uint8x8_t alpha_narrow;
806 uint16x8_t alpha_wide;
808 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
809 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
811 /* spread the dest */
812 dst_wide = vmovl_u8(dst_raw_2);
814 /* alpha mul the dest */
815 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
816 dst_cooked = vshrn_n_u16(dst_wide, 8);
818 /* sum -- ignoring any byte lane overflows */
819 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
823 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
825 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
834 /* do any residual iterations */
835 while (--count >= 0) {
836 *dst = SkPMSrcOver(*src, *dst);
// Variant of S32A_Opaque_BlitRow32 that adaptively switches between three
// modes based on src alpha runs: fully-transparent pixels (skip), fully-
// opaque pixels (plain copy), and the general NEON blend. The mode
// transitions are goto-based; the goto labels themselves fall in gaps of
// this view, so the exact jump targets documented here are inferred.
843 void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
844 const SkPMColor* SK_RESTRICT src,
845 int count, U8CPU alpha) {
846 SkASSERT(255 == alpha);
851 /* Use these to check if src is transparent or opaque */
// a pixel is opaque iff p >= 0xFF000000, transparent iff p <= 0x00FFFFFF
// (alpha is the top byte in SkPMColor's packed representation).
852 const unsigned int ALPHA_OPAQ = 0xFF000000;
853 const unsigned int ALPHA_TRANS = 0x00FFFFFF;
// Stop UNROLL+1 pixels early so lookahead reads below stay in bounds.
856 const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
857 const SkPMColor* SK_RESTRICT src_temp = src;
859 /* set up the NEON variables */
860 uint8x8_t alpha_mask;
861 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
862 alpha_mask = vld1_u8(alpha_mask_setup);
864 uint8x8_t src_raw, dst_raw, dst_final;
865 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
866 uint8x8_t dst_cooked;
868 uint8x8_t alpha_narrow;
869 uint16x8_t alpha_wide;
871 /* choose the first processing type */
874 if(*src <= ALPHA_TRANS)
876 if(*src >= ALPHA_OPAQ)
// --- general blend mode: 4 pixels per pass (two 2-pixel halves) ---
884 src_raw = vreinterpret_u8_u32(vld1_u32(src));
885 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
887 /* get and hold the dst too */
888 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
889 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
892 /* get the alphas spread out properly */
893 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
894 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
895 /* we collapsed (255-a)+1 ... */
896 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
898 /* spread the dest */
899 dst_wide = vmovl_u8(dst_raw);
901 /* alpha mul the dest */
902 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
903 dst_cooked = vshrn_n_u16(dst_wide, 8);
905 /* sum -- ignoring any byte lane overflows */
906 dst_final = vadd_u8(src_raw, dst_cooked);
908 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
909 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
910 /* we collapsed (255-a)+1 ... */
911 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
913 /* spread the dest */
914 dst_wide = vmovl_u8(dst_raw_2);
916 /* alpha mul the dest */
917 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
918 dst_cooked = vshrn_n_u16(dst_wide, 8);
920 /* sum -- ignoring any byte lane overflows */
921 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
923 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
924 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
929 /* if 2 of the next pixels aren't between 1 and 254
930 it might make sense to go to the optimized loops */
931 if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
934 } while(src < src_end);
939 if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
// --- transparent-run mode ---
946 /*In this state, we know the current alpha is 0 and
947 we optimize for the next alpha also being zero. */
948 src_temp = src; //so we don't have to increment dst every time
// 4x unrolled scan: break out at the first non-transparent pixel.
950 if(*(++src) > ALPHA_TRANS)
952 if(*(++src) > ALPHA_TRANS)
954 if(*(++src) > ALPHA_TRANS)
956 if(*(++src) > ALPHA_TRANS)
958 } while(src < src_end);
// Catch dst up by however many pixels were skipped.
960 dst += (src - src_temp);
962 /* no longer alpha 0, so determine where to go next. */
965 if(*src >= ALPHA_OPAQ)
// --- opaque-run mode: straight copies while 4 consecutive pixels opaque ---
// AND-ing four pixels >= 0xFF000000 iff all four alpha bytes are 0xFF.
971 while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
983 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
984 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
985 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
991 if(*src <= ALPHA_TRANS)
997 /* do any residual iterations */
998 src_end += UNROLL + 1; //goto the real end
999 while(src != src_end) {
1001 if( *src >= ALPHA_OPAQ ) {
1005 *dst = SkPMSrcOver(*src, *dst);
1016 /* Neon version of S32_Blend_BlitRow32()
1017 * portable version is in src/core/SkBlitRow_D32.cpp
// Blends 8888 src onto 8888 dst with a constant global alpha (<= 255):
// result = (src*src_scale + dst*dst_scale) >> 8 per byte lane.
// Main loop handles 2 pixels (one d-register) at a time; a lane-load path
// handles the final odd pixel.
1019 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
1020 const SkPMColor* SK_RESTRICT src,
1021 int count, U8CPU alpha) {
1022 SkASSERT(alpha <= 255);
// scales sum to 256, so the per-lane result fits after >>8.
1028 uint16_t src_scale = SkAlpha255To256(alpha);
1029 uint16_t dst_scale = 256 - src_scale;
1031 while (count >= 2) {
1032 uint8x8_t vsrc, vdst, vres;
1033 uint16x8_t vsrc_wide, vdst_wide;
1035 /* These commented prefetches are a big win for count
1036 * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
1037 * They also hurt a little (<5%) on an A15
1039 //__builtin_prefetch(src+32);
1040 //__builtin_prefetch(dst+32);
1043 vsrc = vreinterpret_u8_u32(vld1_u32(src));
1044 vdst = vreinterpret_u8_u32(vld1_u32(dst));
1047 vsrc_wide = vmovl_u8(vsrc);
1048 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
// vmull does widen+multiply in one step; dst_scale <= 255 fits in a u8 here.
1051 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
1054 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1057 vst1_u32(dst, vreinterpret_u32_u8(vres));
// Odd trailing pixel: lane 0 load/store so no out-of-bounds access occurs.
1065 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
1066 uint16x8_t vsrc_wide, vdst_wide;
1069 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
1070 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
1073 vsrc_wide = vmovl_u8(vsrc);
1074 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
1075 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
1076 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1079 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
// Blends premultiplied 8888 src onto 8888 dst with both per-pixel src alpha
// and a constant global alpha (<= 255):
// dst_scale = 256 - (src_alpha * alpha256) >> 8, per pixel.
// Odd pixels are handled first via lane loads, then the main loop does
// 2 pixels per iteration.
1084 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
1085 const SkPMColor* SK_RESTRICT src,
1086 int count, U8CPU alpha) {
1088 SkASSERT(255 >= alpha);
1094 unsigned alpha256 = SkAlpha255To256(alpha);
1096 // First deal with odd counts
1098 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
1099 uint16x8_t vdst_wide, vsrc_wide;
1103 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
1104 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
// Scalar dst_scale for the single pixel: byte 3 is the alpha lane.
1107 dst_scale = vget_lane_u8(vsrc, 3);
1108 dst_scale *= alpha256;
1110 dst_scale = 256 - dst_scale;
1113 vsrc_wide = vmovl_u8(vsrc);
1114 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
1117 vdst_wide = vmovl_u8(vdst);
1118 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
1121 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1123 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
// vtbl mask that broadcasts each pixel's alpha byte across its 4 lanes.
1130 uint8x8_t alpha_mask;
1131 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
1132 alpha_mask = vld1_u8(alpha_mask_setup);
1136 uint8x8_t vsrc, vdst, vres, vsrc_alphas;
1137 uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
1139 __builtin_prefetch(src+32);
1140 __builtin_prefetch(dst+32);
1143 vsrc = vreinterpret_u8_u32(vld1_u32(src));
1144 vdst = vreinterpret_u8_u32(vld1_u32(dst));
1146 // Prepare src_scale
1147 vsrc_scale = vdupq_n_u16(alpha256);
// Vector dst_scale: 256 - (a * alpha256) >> 8, per lane.
1150 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
1151 vdst_scale = vmovl_u8(vsrc_alphas);
1152 vdst_scale *= vsrc_scale;
1153 vdst_scale = vshrq_n_u16(vdst_scale, 8);
1154 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
1157 vsrc_wide = vmovl_u8(vsrc);
1158 vsrc_wide *= vsrc_scale;
1161 vdst_wide = vmovl_u8(vdst);
1162 vdst_wide *= vdst_scale;
1165 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1167 vst1_u32(dst, vreinterpret_u32_u8(vres));
1176 ///////////////////////////////////////////////////////////////////////////////
1178 #undef DEBUG_OPAQUE_DITHER
1180 #if defined(DEBUG_OPAQUE_DITHER)
// Debug-only helper (DEBUG_OPAQUE_DITHER): hex-dumps len bytes at p with a
// label. Uses a static buffer, so not reentrant — fine for its single-
// threaded debug use.
1181 static void showme8(char *str, void *p, int len)
1183 static char buf[256];
1186 char *pc = (char*) p;
1187 sprintf(buf,"%8s:", str);
1188 for(i=0;i<len;i++) {
1189 sprintf(tbuf, " %02x", pc[i]);
1192 SkDebugf("%s\n", buf);
// Debug-only helper: like showme8 but dumps 16-bit words; len is passed in
// bytes and converted to a word count. Shares showme8's static-buffer
// non-reentrancy caveat.
1194 static void showme16(char *str, void *p, int len)
1196 static char buf[256];
1199 uint16_t *pc = (uint16_t*) p;
1200 sprintf(buf,"%8s:", str);
1201 len = (len / sizeof(uint16_t)); /* passed as bytes */
1202 for(i=0;i<len;i++) {
1203 sprintf(tbuf, " %04x", pc[i]);
1206 SkDebugf("%s\n", buf);
1209 #endif // #ifdef SK_CPU_ARM32
// Source-over blends premultiplied 8888 src onto RGB565 dst with ordered
// dithering, UNROLL pixels per NEON iteration (8, given the 8-lane vectors
// and vld1q_u16 of dst — TODO confirm UNROLL's definition, outside this
// view). The dither amount is scaled by each pixel's alpha. An optional
// DEBUG_OPAQUE_DITHER path recomputes each batch in scalar code and
// compares. Scalar tail mirrors the portable implementation.
1211 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
1212 const SkPMColor* SK_RESTRICT src,
1213 int count, U8CPU alpha, int x, int y) {
1214 SkASSERT(255 == alpha);
1218 if (count >= UNROLL) {
1220 #if defined(DEBUG_OPAQUE_DITHER)
1221 uint16_t tmpbuf[UNROLL];
1226 uint16_t in_dst[UNROLL];
// Select the dither row for this y and offset for this x.
1232 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1233 dbase = vld1_u8(dstart);
1237 uint8x8_t sr, sg, sb, sa, d;
1238 uint16x8_t dst8, scale8, alpha8;
1239 uint16x8_t dst_r, dst_g, dst_b;
1241 #if defined(DEBUG_OPAQUE_DITHER)
1242 // calculate 8 elements worth into a temp buffer
1246 SkPMColor* my_src = (SkPMColor*)src;
1247 uint16_t* my_dst = dst;
1250 DITHER_565_SCAN(my_y);
1251 for(i = 0; i < UNROLL; i++) {
1252 SkPMColor c = *my_src++;
1255 unsigned a = SkGetPackedA32(c);
// Reference scalar math the NEON results are checked against below.
1257 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
1258 tdv[i] = DITHER_VALUE(my_x);
1260 tap[i] = SkAlpha255To256(a);
1263 unsigned sr = SkGetPackedR32(c);
1264 unsigned sg = SkGetPackedG32(c);
1265 unsigned sb = SkGetPackedB32(c);
1266 sr = SkDITHER_R32_FOR_565(sr, d);
1267 sg = SkDITHER_G32_FOR_565(sg, d);
1268 sb = SkDITHER_B32_FOR_565(sb, d);
1270 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1271 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
1272 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1273 // now src and dst expanded are in g:11 r:10 x:1 b:10
1274 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1277 tmpbuf[i] = *my_dst;
1278 ta[i] = tdv[i] = td[i] = 0xbeef;
1280 in_dst[i] = *my_dst;
// ARM64: helper loads 8 pixels (all four channels) and advances src.
1288 vsrc = sk_vld4_u8_arm64_4(src);
// ARM32: pin vld4 destination to d0-d3.
1291 register uint8x8_t d0 asm("d0");
1292 register uint8x8_t d1 asm("d1");
1293 register uint8x8_t d2 asm("d2");
1294 register uint8x8_t d3 asm("d3");
1296 asm ("vld4.8 {d0-d3},[%[src]]! "
1297 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
1306 sa = vsrc.val[NEON_A];
1307 sr = vsrc.val[NEON_R];
1308 sg = vsrc.val[NEON_G];
1309 sb = vsrc.val[NEON_B];
1311 /* calculate 'd', which will be 0..7
1312 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
// d = (dbase * (sa + 1)) >> 8, i.e. dither scaled by SkAlpha255To256(sa).
1314 alpha8 = vmovl_u8(dbase);
1315 alpha8 = vmlal_u8(alpha8, sa, dbase);
1316 d = vshrn_n_u16(alpha8, 8); // narrowing too
1318 // sr = sr - (sr>>5) + d
1319 /* watching for 8-bit overflow. d is 0..7; risky range of
1320 * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1321 * safe as long as we do ((sr-sr>>5) + d)
1323 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1324 sr = vadd_u8(sr, d);
1326 // sb = sb - (sb>>5) + d
1327 sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1328 sb = vadd_u8(sb, d);
1330 // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1331 sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1332 sg = vadd_u8(sg, vshr_n_u8(d,1));
1334 // need to pick up 8 dst's -- at 16 bits each, 128 bits
1335 dst8 = vld1q_u16(dst);
1336 dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
1337 dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
1338 dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits
// scale = (256 - sa) >> 3: dst weight in the expanded fixed-point domain.
1341 scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1343 // combine the addq and mul, save 3 insns
1344 scale8 = vshrq_n_u16(scale8, 3);
1345 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1346 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1347 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
// Collapse the expanded channels back to packed 565.
1350 dst8 = vshrq_n_u16(dst_b, 5);
1351 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1352 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1354 vst1q_u16(dst, dst8);
1356 #if defined(DEBUG_OPAQUE_DITHER)
1357 // verify my 8 elements match the temp buffer
1360 static int invocation;
1362 for (i = 0; i < UNROLL; i++) {
1363 if (tmpbuf[i] != dst[i]) {
1368 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1369 invocation, offset);
1370 SkDebugf(" alpha 0x%x\n", alpha);
1371 for (i = 0; i < UNROLL; i++)
1372 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1373 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
1374 in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);
1376 showme16("alpha8", &alpha8, sizeof(alpha8));
1377 showme16("scale8", &scale8, sizeof(scale8));
1378 showme8("d", &d, sizeof(d));
1379 showme16("dst8", &dst8, sizeof(dst8));
1380 showme16("dst_b", &dst_b, sizeof(dst_b));
1381 showme16("dst_g", &dst_g, sizeof(dst_g));
1382 showme16("dst_r", &dst_r, sizeof(dst_r));
1383 showme8("sb", &sb, sizeof(sb));
1384 showme8("sg", &sg, sizeof(sg));
1385 showme8("sr", &sr, sizeof(sr));
1395 // skip x += UNROLL, since it's unchanged mod-4
1396 } while (count >= UNROLL);
// Scalar tail: same expand/blend/compact math as the debug reference above.
1404 SkPMColor c = *src++;
1407 unsigned a = SkGetPackedA32(c);
1409 // dither and alpha are just temporary variables to work-around
1411 unsigned dither = DITHER_VALUE(x);
1412 unsigned alpha = SkAlpha255To256(a);
1413 int d = SkAlphaMul(dither, alpha);
1415 unsigned sr = SkGetPackedR32(c);
1416 unsigned sg = SkGetPackedG32(c);
1417 unsigned sb = SkGetPackedB32(c);
1418 sr = SkDITHER_R32_FOR_565(sr, d);
1419 sg = SkDITHER_G32_FOR_565(sg, d);
1420 sb = SkDITHER_B32_FOR_565(sb, d);
1422 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1423 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1424 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1425 // now src and dst expanded are in g:11 r:10 x:1 b:10
1426 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1430 } while (--count != 0);
1434 ///////////////////////////////////////////////////////////////////////////////
1436 #undef DEBUG_S32_OPAQUE_DITHER
// Blit 'count' opaque 32-bit premultiplied (SkPMColor) pixels into an RGB565
// destination, applying an ordered dither.  'alpha' must be 255 (asserted);
// (x, y) select the starting cell of the dither matrix.  Eight pixels are
// processed per NEON iteration, with a scalar loop for the remainder.
// NOTE(review): this view elides several lines (declarations of d/vsrc/dst8,
// loop braces, the #if/#else arms around the loads) -- confirm structure
// against the full file before editing.
1438 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1439 const SkPMColor* SK_RESTRICT src,
1440 int count, U8CPU alpha, int x, int y) {
1441 SkASSERT(255 == alpha);
1444 if (count >= UNROLL) {
// Row stride into gDitherMatrix_Neon is 12 -- presumably the 4-entry dither
// row replicated so an (x&3) offset still yields 8 in-bounds values for the
// 8-pixel unroll; confirm against the matrix definition.
1446 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1447 d = vld1_u8(dstart);
1449 while (count >= UNROLL) {
1450 uint8x8_t sr, sg, sb;
1451 uint16x8_t dr, dg, db;
// Deinterleave 8 ARGB pixels into per-channel 8x8-bit vectors:
// arm64 builds use the sk_vld4 helper, arm32 uses inline asm with the
// outputs pinned to d0-d3 to match the vld4.8 destination registers.
1456 vsrc = sk_vld4_u8_arm64_3(src);
1459 register uint8x8_t d0 asm("d0");
1460 register uint8x8_t d1 asm("d1");
1461 register uint8x8_t d2 asm("d2");
1462 register uint8x8_t d3 asm("d3");
// The '!' writeback post-increments src by the 32 bytes (8 pixels) read.
1465 "vld4.8 {d0-d3},[%[src]]! "
1466 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1474 sr = vsrc.val[NEON_R];
1475 sg = vsrc.val[NEON_G];
1476 sb = vsrc.val[NEON_B];
1478 /* XXX: if we want to prefetch, hide it in the above asm()
1479 * using the gcc __builtin_prefetch(), the prefetch will
1480 * fall to the bottom of the loop -- it won't stick up
1481 * at the top of the loop, just after the vld4.
// Dither each channel as v - (v>>top_bits) + d, widening with vaddl_u8 so
// the add lands in 16 bits and cannot wrap before the 565 pack below.
1484 // sr = sr - (sr>>5) + d
1485 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1486 dr = vaddl_u8(sr, d);
1488 // sb = sb - (sb>>5) + d
1489 sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1490 db = vaddl_u8(sb, d);
1492 // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1493 sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1494 dg = vaddl_u8(sg, vshr_n_u8(d, 1));
1496 // pack high bits of each into 565 format (rgb, b is lsb)
// vsli inserts the shifted green/red fields at bit offsets 5 and 11
// without disturbing the lower fields already in dst8.
1497 dst8 = vshrq_n_u16(db, 3);
1498 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1499 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
1502 vst1q_u16(dst, dst8);
// Debug-only self check: recompute the 8 pixels with the scalar dither
// path (SkDitherRGB32To565) and log any mismatch.
1504 #if defined(DEBUG_S32_OPAQUE_DITHER)
1505 // always good to know if we generated good results
1507 int i, myx = x, myy = y;
1508 DITHER_565_SCAN(myy);
1509 for (i=0;i<UNROLL;i++) {
1510 // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
1511 SkPMColor c = src[i-8];
1512 unsigned dither = DITHER_VALUE(myx);
1513 uint16_t val = SkDitherRGB32To565(c, dither);
1514 if (val != dst[i]) {
1515 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
1516 c, dither, val, dst[i], dstart[i]);
1524 // we don't need to increment src as the asm above has already done it
1526 x += UNROLL; // probably superfluous
// Scalar tail: dither and convert the remaining (count % UNROLL) pixels
// one at a time.
1535 SkPMColor c = *src++;
1537 SkASSERT(SkGetPackedA32(c) == 255);
1539 unsigned dither = DITHER_VALUE(x);
1540 *dst++ = SkDitherRGB32To565(c, dither);
1542 } while (--count != 0);
// Blend a constant color over 'count' src pixels into dst:
//   dst[i] = color + (src[i] * (256 - SkAlpha255To256(alpha(color)))) >> 8
// with fast paths for a fully transparent and a fully opaque color.
// NOTE(review): the 'color' parameter line and several guards/braces are
// elided in this view (the memcpy below is presumably behind a
// '0 == color' check) -- confirm against the full file.
1546 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
// Transparent color: result is just the source pixels.
1554 memcpy(dst, src, count * sizeof(SkPMColor));
1559 unsigned colorA = SkGetPackedA32(color);
// Opaque color: result is the color itself, regardless of src.
1560 if (255 == colorA) {
1561 sk_memset32(dst, color, count);
// General case: per-pixel scale of src by (256 - alpha), plus the color.
1565 unsigned scale = 256 - SkAlpha255To256(colorA);
1571 vcolor = vdupq_n_u32(color);
1573 // scale numerical interval [0-255], so load as 8 bits
1574 vscale = vdup_n_u8(scale);
1577 // load src color, 8 pixels, 4 64 bit registers
1578 // (and increment src).
// Older arm32 GCC needs inline asm for the post-incrementing 4-register
// load; everything else uses plain vld1_u32 intrinsics.
1580 #if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
1582 "vld1.32 %h[vsrc], [%[src]]!"
1583 : [vsrc] "=w" (vsrc), [src] "+r" (src)
1586 #else // 64bit targets and Clang
1587 vsrc.val[0] = vld1_u32(src);
1588 vsrc.val[1] = vld1_u32(src+2);
1589 vsrc.val[2] = vld1_u32(src+4);
1590 vsrc.val[3] = vld1_u32(src+6);
1594 // multiply long by scale, 64 bits at a time,
1595 // destination into a 128 bit register.
// Treat each 64-bit lane of pixels as 8 bytes; vmull_u8 widens the
// byte * scale products to 16 bits.
1597 vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);
1598 vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);
1599 vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);
1600 vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);
1602 // shift the 128 bit registers, containing the 16
1603 // bit scaled values back to 8 bits, narrowing the
1604 // results to 64 bit registers.
1606 vres.val[0] = vcombine_u8(
1607 vshrn_n_u16(vtmp.val[0], 8),
1608 vshrn_n_u16(vtmp.val[1], 8));
1609 vres.val[1] = vcombine_u8(
1610 vshrn_n_u16(vtmp.val[2], 8),
1611 vshrn_n_u16(vtmp.val[3], 8));
1613 // adding back the color, using 128 bit registers.
// Per-byte add of the color cannot carry across channels here because the
// scaled src is premultiplied -- NOTE(review): relies on premultiplied
// invariants established outside this view; confirm.
1615 vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +
1616 vreinterpretq_u8_u32(vcolor));
1617 vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +
1618 vreinterpretq_u8_u32(vcolor));
1620 // store back the 8 calculated pixels (2 128 bit
1621 // registers), and increment dst.
1622 #if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
1624 "vst1.32 %h[vdst], [%[dst]]!"
1629 #else // 64bit targets and Clang
1630 vst1q_u32(dst, vdst.val[0]);
1631 vst1q_u32(dst+4, vdst.val[1]);
1636 } while (count >= 8);
// Scalar tail for the remaining (count % 8) pixels.
1640 *dst = color + SkAlphaMulQ(*src, scale);
1647 ///////////////////////////////////////////////////////////////////////////////
// Dispatch table of NEON 565 blitters.  NOTE(review): entry order is defined
// by SkBlitRow's flag combinations (blend/src-alpha/dither) -- confirm against
// SkBlitRow.h.  NULL entries deliberately fall back to the portable code; the
// linked Skia issues explain why those slots were disabled.
1649 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
1651 S32_D565_Opaque_neon,
1652 S32_D565_Blend_neon,
1653 S32A_D565_Opaque_neon,
1655 S32A_D565_Blend_neon,
1657 NULL, // https://code.google.com/p/skia/issues/detail?id=2845
1658 // https://code.google.com/p/skia/issues/detail?id=2797
1662 S32_D565_Opaque_Dither_neon,
1663 S32_D565_Blend_Dither_neon,
1664 S32A_D565_Opaque_Dither_neon,
1665 NULL, // S32A_D565_Blend_Dither
1668 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1669 NULL, // S32_Opaque,
1670 S32_Blend_BlitRow32_neon, // S32_Blend,
1672 * We have two choices for S32A_Opaque procs. The one reads the src alpha
1673 * value and attempts to optimize accordingly. The optimization is
1674 * sensitive to the source content and is not a win in all cases. For
1675 * example, if there are a lot of transitions between the alpha states,
1676 * the performance will almost certainly be worse. However, for many
1677 * common cases the performance is equivalent or better than the standard
1678 * case where we do not inspect the src alpha.
1680 #if SK_A32_SHIFT == 24
1681 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
1682 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1684 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1687 S32A_Blend_BlitRow32_neon // S32A_Blend