src/third_party/skia/src/opts/SkBlitRow_opts_arm_neon.cpp

   1 /*
   2  * Copyright 2012 The Android Open Source Project
   3  *
   4  * Use of this source code is governed by a BSD-style license that can be
   5  * found in the LICENSE file.
   6  */
   7
   8 #include "SkBlitRow_opts_arm_neon.h"
   9
  10 #include "SkBlitMask.h"
  11 #include "SkBlitRow.h"
  12 #include "SkColorPriv.h"
  13 #include "SkDither.h"
  14 #include "SkMathPriv.h"
  15 #include "SkUtils.h"
  16
  17 #include "SkColor_opts_neon.h"
  18 #include <arm_neon.h>
  19
  20 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
  21                            const SkPMColor* SK_RESTRICT src, int count,
  22                            U8CPU alpha, int /*x*/, int /*y*/) {
  23     SkASSERT(255 == alpha);
  24
  25     while (count >= 8) {
  26         uint8x8x4_t vsrc;
  27         uint16x8_t vdst;
  28
  29         // Load
  30         vsrc = vld4_u8((uint8_t*)src);
  31
  32         // Convert src to 565
  33         vdst = SkPixel32ToPixel16_neon8(vsrc);
  34
  35         // Store
  36         vst1q_u16(dst, vdst);
  37
  38         // Prepare next iteration
  39         dst += 8;
  40         src += 8;
  41         count -= 8;
  42     };
  43
  44     // Leftovers
  45     while (count > 0) {
  46         SkPMColor c = *src++;
  47         SkPMColorAssert(c);
  48         *dst = SkPixel32ToPixel16_ToU16(c);
  49         dst++;
  50         count--;
  51     };
  52 }
  53
  54 void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
  55                           const SkPMColor* SK_RESTRICT src, int count,
  56                           U8CPU alpha, int /*x*/, int /*y*/) {
  57     SkASSERT(255 > alpha);
  58
  59     uint16x8_t vmask_blue, vscale;
  60
  61     // prepare constants
  62     vscale = vdupq_n_u16(SkAlpha255To256(alpha));
  63     vmask_blue = vmovq_n_u16(0x1F);
  64
  65     while (count >= 8) {
  66         uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
  67         uint16x8_t vres_r, vres_g, vres_b;
  68         uint8x8_t vsrc_r, vsrc_g, vsrc_b;
  69
  70         // Load src
  71         {
  72         register uint8x8_t d0 asm("d0");
  73         register uint8x8_t d1 asm("d1");
  74         register uint8x8_t d2 asm("d2");
  75         register uint8x8_t d3 asm("d3");
  76
  77         asm (
  78             "vld4.8    {d0-d3},[%[src]]!"
  79             : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
  80             :
  81         );
  82         vsrc_g = d1;
  83 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
  84         vsrc_r = d2; vsrc_b = d0;
  85 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
  86         vsrc_r = d0; vsrc_b = d2;
  87 #endif
  88         }
  89
  90         // Load and unpack dst
  91         vdst = vld1q_u16(dst);
  92         vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
  93         vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
  94         vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
  95         vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green
  96
  97         // Shift src to 565
  98         vsrc_r = vshr_n_u8(vsrc_r, 3);    // shift red to 565 range
  99         vsrc_g = vshr_n_u8(vsrc_g, 2);    // shift green to 565 range
 100         vsrc_b = vshr_n_u8(vsrc_b, 3);    // shift blue to 565 range
 101
 102         // Scale src - dst
 103         vres_r = vmovl_u8(vsrc_r) - vdst_r;
 104         vres_g = vmovl_u8(vsrc_g) - vdst_g;
 105         vres_b = vmovl_u8(vsrc_b) - vdst_b;
 106
 107         vres_r = vshrq_n_u16(vres_r * vscale, 8);
 108         vres_g = vshrq_n_u16(vres_g * vscale, 8);
 109         vres_b = vshrq_n_u16(vres_b * vscale, 8);
 110
 111         vres_r += vdst_r;
 112         vres_g += vdst_g;
 113         vres_b += vdst_b;
 114
 115         // Combine
 116         vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
 117         vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue
 118
 119         // Store
 120         vst1q_u16(dst, vres_b);
 121         dst += 8;
 122         count -= 8;
 123     }
 124     if (count > 0) {
 125         int scale = SkAlpha255To256(alpha);
 126         do {
 127             SkPMColor c = *src++;
 128             SkPMColorAssert(c);
 129             uint16_t d = *dst;
 130             *dst++ = SkPackRGB16(
 131                     SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
 132                     SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
 133                     SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
 134         } while (--count != 0);
 135     }
 136 }
 137
 138 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
 139                            const SkPMColor* SK_RESTRICT src, int count,
 140                            U8CPU alpha, int /*x*/, int /*y*/) {
 141     SkASSERT(255 == alpha);
 142
 143     if (count >= 8) {
 144         uint16_t* SK_RESTRICT keep_dst = 0;
 145
 146         asm volatile (
 147                       "ands       ip, %[count], #7            \n\t"
 148                       "vmov.u8    d31, #1<<7                  \n\t"
 149                       "vld1.16    {q12}, [%[dst]]             \n\t"
 150                       "vld4.8     {d0-d3}, [%[src]]           \n\t"
 151                       // Thumb does not support the standard ARM conditional
 152                       // instructions but instead requires the 'it' instruction
 153                       // to signal conditional execution
 154                       "it eq                                  \n\t"
 155                       "moveq      ip, #8                      \n\t"
 156                       "mov        %[keep_dst], %[dst]         \n\t"
 157
 158                       "add        %[src], %[src], ip, LSL#2   \n\t"
 159                       "add        %[dst], %[dst], ip, LSL#1   \n\t"
 160                       "subs       %[count], %[count], ip      \n\t"
 161                       "b          9f                          \n\t"
 162                       // LOOP
 163                       "2:                                         \n\t"
 164
 165                       "vld1.16    {q12}, [%[dst]]!            \n\t"
 166                       "vld4.8     {d0-d3}, [%[src]]!          \n\t"
 167                       "vst1.16    {q10}, [%[keep_dst]]        \n\t"
 168                       "sub        %[keep_dst], %[dst], #8*2   \n\t"
 169                       "subs       %[count], %[count], #8      \n\t"
 170                       "9:                                         \n\t"
 171                       "pld        [%[dst],#32]                \n\t"
 172                       // expand 0565 q12 to 8888 {d4-d7}
 173                       "vmovn.u16  d4, q12                     \n\t"
 174                       "vshr.u16   q11, q12, #5                \n\t"
 175                       "vshr.u16   q10, q12, #6+5              \n\t"
 176                       "vmovn.u16  d5, q11                     \n\t"
 177                       "vmovn.u16  d6, q10                     \n\t"
 178                       "vshl.u8    d4, d4, #3                  \n\t"
 179                       "vshl.u8    d5, d5, #2                  \n\t"
 180                       "vshl.u8    d6, d6, #3                  \n\t"
 181
 182                       "vmovl.u8   q14, d31                    \n\t"
 183                       "vmovl.u8   q13, d31                    \n\t"
 184                       "vmovl.u8   q12, d31                    \n\t"
 185
 186                       // duplicate in 4/2/1 & 8pix vsns
 187                       "vmvn.8     d30, d3                     \n\t"
 188                       "vmlal.u8   q14, d30, d6                \n\t"
 189                       "vmlal.u8   q13, d30, d5                \n\t"
 190                       "vmlal.u8   q12, d30, d4                \n\t"
 191                       "vshr.u16   q8, q14, #5                 \n\t"
 192                       "vshr.u16   q9, q13, #6                 \n\t"
 193                       "vaddhn.u16 d6, q14, q8                 \n\t"
 194                       "vshr.u16   q8, q12, #5                 \n\t"
 195                       "vaddhn.u16 d5, q13, q9                 \n\t"
 196                       "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
 197                       "vaddhn.u16 d4, q12, q8                 \n\t"
 198                       // intentionally don't calculate alpha
 199                       // result in d4-d6
 200
 201                       "vqadd.u8   d5, d5, d1                  \n\t"
 202                       "vqadd.u8   d4, d4, d2                  \n\t"
 203
 204                       // pack 8888 {d4-d6} to 0565 q10
 205                       "vshll.u8   q10, d6, #8                 \n\t"
 206                       "vshll.u8   q3, d5, #8                  \n\t"
 207                       "vshll.u8   q2, d4, #8                  \n\t"
 208                       "vsri.u16   q10, q3, #5                 \n\t"
 209                       "vsri.u16   q10, q2, #11                \n\t"
 210
 211                       "bne        2b                          \n\t"
 212
 213                       "1:                                         \n\t"
 214                       "vst1.16      {q10}, [%[keep_dst]]      \n\t"
 215                       : [count] "+r" (count)
 216                       : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
 217                       : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
 218                       "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
 219                       "d30","d31"
 220                       );
 221     }
 222     else
 223     {   // handle count < 8
 224         uint16_t* SK_RESTRICT keep_dst = 0;
 225
 226         asm volatile (
 227                       "vmov.u8    d31, #1<<7                  \n\t"
 228                       "mov        %[keep_dst], %[dst]         \n\t"
 229
 230                       "tst        %[count], #4                \n\t"
 231                       "beq        14f                         \n\t"
 232                       "vld1.16    {d25}, [%[dst]]!            \n\t"
 233                       "vld1.32    {q1}, [%[src]]!             \n\t"
 234
 235                       "14:                                        \n\t"
 236                       "tst        %[count], #2                \n\t"
 237                       "beq        12f                         \n\t"
 238                       "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
 239                       "vld1.32    {d1}, [%[src]]!             \n\t"
 240
 241                       "12:                                        \n\t"
 242                       "tst        %[count], #1                \n\t"
 243                       "beq        11f                         \n\t"
 244                       "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
 245                       "vld1.32    {d0[1]}, [%[src]]!          \n\t"
 246
 247                       "11:                                        \n\t"
 248                       // unzips achieve the same as a vld4 operation
 249                       "vuzpq.u16  q0, q1                      \n\t"
 250                       "vuzp.u8    d0, d1                      \n\t"
 251                       "vuzp.u8    d2, d3                      \n\t"
 252                       // expand 0565 q12 to 8888 {d4-d7}
 253                       "vmovn.u16  d4, q12                     \n\t"
 254                       "vshr.u16   q11, q12, #5                \n\t"
 255                       "vshr.u16   q10, q12, #6+5              \n\t"
 256                       "vmovn.u16  d5, q11                     \n\t"
 257                       "vmovn.u16  d6, q10                     \n\t"
 258                       "vshl.u8    d4, d4, #3                  \n\t"
 259                       "vshl.u8    d5, d5, #2                  \n\t"
 260                       "vshl.u8    d6, d6, #3                  \n\t"
 261
 262                       "vmovl.u8   q14, d31                    \n\t"
 263                       "vmovl.u8   q13, d31                    \n\t"
 264                       "vmovl.u8   q12, d31                    \n\t"
 265
 266                       // duplicate in 4/2/1 & 8pix vsns
 267                       "vmvn.8     d30, d3                     \n\t"
 268                       "vmlal.u8   q14, d30, d6                \n\t"
 269                       "vmlal.u8   q13, d30, d5                \n\t"
 270                       "vmlal.u8   q12, d30, d4                \n\t"
 271                       "vshr.u16   q8, q14, #5                 \n\t"
 272                       "vshr.u16   q9, q13, #6                 \n\t"
 273                       "vaddhn.u16 d6, q14, q8                 \n\t"
 274                       "vshr.u16   q8, q12, #5                 \n\t"
 275                       "vaddhn.u16 d5, q13, q9                 \n\t"
 276                       "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
 277                       "vaddhn.u16 d4, q12, q8                 \n\t"
 278                       // intentionally don't calculate alpha
 279                       // result in d4-d6
 280
 281                       "vqadd.u8   d5, d5, d1                  \n\t"
 282                       "vqadd.u8   d4, d4, d2                  \n\t"
 283
 284                       // pack 8888 {d4-d6} to 0565 q10
 285                       "vshll.u8   q10, d6, #8                 \n\t"
 286                       "vshll.u8   q3, d5, #8                  \n\t"
 287                       "vshll.u8   q2, d4, #8                  \n\t"
 288                       "vsri.u16   q10, q3, #5                 \n\t"
 289                       "vsri.u16   q10, q2, #11                \n\t"
 290
 291                       // store
 292                       "tst        %[count], #4                \n\t"
 293                       "beq        24f                         \n\t"
 294                       "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
 295
 296                       "24:                                        \n\t"
 297                       "tst        %[count], #2                \n\t"
 298                       "beq        22f                         \n\t"
 299                       "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
 300
 301                       "22:                                        \n\t"
 302                       "tst        %[count], #1                \n\t"
 303                       "beq        21f                         \n\t"
 304                       "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
 305
 306                       "21:                                        \n\t"
 307                       : [count] "+r" (count)
 308                       : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
 309                       : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
 310                       "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
 311                       "d30","d31"
 312                       );
 313     }
 314 }
 315
 316 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
 317     prod += vdupq_n_u16(128);
 318     prod += vshrq_n_u16(prod, 8);
 319     return vshrq_n_u16(prod, 8);
 320 }
 321
 322 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
 323                           const SkPMColor* SK_RESTRICT src, int count,
 324                           U8CPU alpha, int /*x*/, int /*y*/) {
 325    SkASSERT(255 > alpha);
 326
 327     /* This code implements a Neon version of S32A_D565_Blend. The results have
 328      * a few mismatches compared to the original code. These mismatches never
 329      * exceed 1.
 330      */
 331
 332     if (count >= 8) {
 333         uint16x8_t valpha_max, vmask_blue;
 334         uint8x8_t valpha;
 335
 336         // prepare constants
 337         valpha_max = vmovq_n_u16(255);
 338         valpha = vdup_n_u8(alpha);
 339         vmask_blue = vmovq_n_u16(SK_B16_MASK);
 340
 341         do {
 342             uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
 343             uint16x8_t vres_a, vres_r, vres_g, vres_b;
 344             uint8x8x4_t vsrc;
 345
 346             // load pixels
 347             vdst = vld1q_u16(dst);
 348 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
 349             asm (
 350                 "vld4.u8 %h[vsrc], [%[src]]!"
 351                 : [vsrc] "=w" (vsrc), [src] "+&r" (src)
 352                 : :
 353             );
 354 #else
 355             register uint8x8_t d0 asm("d0");
 356             register uint8x8_t d1 asm("d1");
 357             register uint8x8_t d2 asm("d2");
 358             register uint8x8_t d3 asm("d3");
 359
 360             asm volatile (
 361                 "vld4.u8    {d0-d3},[%[src]]!;"
 362                 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
 363                   [src] "+&r" (src)
 364                 : :
 365             );
 366             vsrc.val[0] = d0;
 367             vsrc.val[1] = d1;
 368             vsrc.val[2] = d2;
 369             vsrc.val[3] = d3;
 370 #endif
 371
 372
 373             // deinterleave dst
 374             vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
 375             vdst_b = vdst & vmask_blue;                     // extract blue
 376             vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
 377             vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
 378
 379             // shift src to 565
 380             vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
 381             vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
 382             vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
 383
 384             // calc src * src_scale
 385             vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
 386             vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
 387             vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
 388             vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
 389
 390             // prepare dst_scale
 391             vres_a = SkDiv255Round_neon8(vres_a);
 392             vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
 393
 394             // add dst * dst_scale to previous result
 395             vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
 396             vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
 397             vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
 398
 399 #ifdef S32A_D565_BLEND_EXACT
 400             // It is possible to get exact results with this but it is slow,
 401             // even slower than C code in some cases
 402             vres_r = SkDiv255Round_neon8(vres_r);
 403             vres_g = SkDiv255Round_neon8(vres_g);
 404             vres_b = SkDiv255Round_neon8(vres_b);
 405 #else
 406             vres_r = vrshrq_n_u16(vres_r, 8);
 407             vres_g = vrshrq_n_u16(vres_g, 8);
 408             vres_b = vrshrq_n_u16(vres_b, 8);
 409 #endif
 410             // pack result
 411             vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
 412             vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
 413
 414             // store
 415             vst1q_u16(dst, vres_b);
 416             dst += 8;
 417             count -= 8;
 418         } while (count >= 8);
 419     }
 420
 421     // leftovers
 422     while (count-- > 0) {
 423         SkPMColor sc = *src++;
 424         if (sc) {
 425             uint16_t dc = *dst;
 426             unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
 427             unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
 428             unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
 429             unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
 430             *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
 431         }
 432         dst += 1;
 433     }
 434 }
 435
 436 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 437  * each dither value is spaced out into byte lanes, and repeated
 438  * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 439  * start of each row.
 440  */
 441 static const uint8_t gDitherMatrix_Neon[48] = {
 442     0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
 443     6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
 444     1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
 445     7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
 446
 447 };
 448
 449 void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
 450                                 int count, U8CPU alpha, int x, int y)
 451 {
 452
 453     SkASSERT(255 > alpha);
 454
 455     // rescale alpha to range 1 - 256
 456     int scale = SkAlpha255To256(alpha);
 457
 458     if (count >= 8) {
 459         /* select row and offset for dither array */
 460         const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
 461
 462         uint8x8_t vdither = vld1_u8(dstart);         // load dither values
 463         uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
 464
 465         int16x8_t vscale = vdupq_n_s16(scale);        // duplicate scale into neon reg
 466         uint16x8_t vmask_b = vdupq_n_u16(0x1F);         // set up blue mask
 467
 468         do {
 469
 470             uint8x8_t vsrc_r, vsrc_g, vsrc_b;
 471             uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
 472             uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
 473             uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
 474             uint16x8_t vdst;
 475             uint16x8_t vdst_r, vdst_g, vdst_b;
 476             int16x8_t vres_r, vres_g, vres_b;
 477             int8x8_t vres8_r, vres8_g, vres8_b;
 478
 479             // Load source and add dither
 480             {
 481             register uint8x8_t d0 asm("d0");
 482             register uint8x8_t d1 asm("d1");
 483             register uint8x8_t d2 asm("d2");
 484             register uint8x8_t d3 asm("d3");
 485
 486             asm (
 487                 "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
 488                 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
 489                 :
 490             );
 491             vsrc_g = d1;
 492 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
 493             vsrc_r = d2; vsrc_b = d0;
 494 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
 495             vsrc_r = d0; vsrc_b = d2;
 496 #endif
 497             }
 498
 499             vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
 500             vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
 501             vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
 502
 503             vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
 504             vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
 505             vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen
 506
 507             vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
 508             vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
 509             vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result
 510
 511             vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
 512             vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
 513             vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
 514
 515             // Load dst and unpack
 516             vdst = vld1q_u16(dst);
 517             vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
 518             vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
 519             vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue
 520
 521             // subtract dst from src and widen
 522             vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
 523             vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
 524             vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
 525
 526             // multiply diffs by scale and shift
 527             vres_r = vmulq_s16(vres_r, vscale);
 528             vres_g = vmulq_s16(vres_g, vscale);
 529             vres_b = vmulq_s16(vres_b, vscale);
 530
 531             vres8_r = vshrn_n_s16(vres_r, 8);
 532             vres8_g = vshrn_n_s16(vres_g, 8);
 533             vres8_b = vshrn_n_s16(vres_b, 8);
 534
 535             // add dst to result
 536             vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
 537             vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
 538             vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
 539
 540             // put result into 565 format
 541             vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
 542             vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
 543
 544             // Store result
 545             vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
 546
 547             // Next iteration
 548             dst += 8;
 549             count -= 8;
 550
 551         } while (count >= 8);
 552     }
 553
 554     // Leftovers
 555     if (count > 0) {
 556         int scale = SkAlpha255To256(alpha);
 557         DITHER_565_SCAN(y);
 558         do {
 559             SkPMColor c = *src++;
 560             SkPMColorAssert(c);
 561
 562             int dither = DITHER_VALUE(x);
 563             int sr = SkGetPackedR32(c);
 564             int sg = SkGetPackedG32(c);
 565             int sb = SkGetPackedB32(c);
 566             sr = SkDITHER_R32To565(sr, dither);
 567             sg = SkDITHER_G32To565(sg, dither);
 568             sb = SkDITHER_B32To565(sb, dither);
 569
 570             uint16_t d = *dst;
 571             *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
 572                                  SkAlphaBlend(sg, SkGetPackedG16(d), scale),
 573                                  SkAlphaBlend(sb, SkGetPackedB16(d), scale));
 574             DITHER_INC_X(x);
 575         } while (--count != 0);
 576     }
 577 }
 578
 579 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
 580                                 const SkPMColor* SK_RESTRICT src,
 581                                 int count, U8CPU alpha) {
 582
 583     SkASSERT(255 == alpha);
 584     if (count > 0) {
 585
 586
 587     uint8x8_t alpha_mask;
 588
 589     static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
 590     alpha_mask = vld1_u8(alpha_mask_setup);
 591
 592     /* do the NEON unrolled code */
 593 #define    UNROLL    4
 594     while (count >= UNROLL) {
 595         uint8x8_t src_raw, dst_raw, dst_final;
 596         uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
 597
 598         /* The two prefetches below may make the code slighlty
 599          * slower for small values of count but are worth having
 600          * in the general case.
 601          */
 602         __builtin_prefetch(src+32);
 603         __builtin_prefetch(dst+32);
 604
 605         /* get the source */
 606         src_raw = vreinterpret_u8_u32(vld1_u32(src));
 607 #if    UNROLL > 2
 608         src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
 609 #endif
 610
 611         /* get and hold the dst too */
 612         dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
 613 #if    UNROLL > 2
 614         dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
 615 #endif
 616
 617     /* 1st and 2nd bits of the unrolling */
 618     {
 619         uint8x8_t dst_cooked;
 620         uint16x8_t dst_wide;
 621         uint8x8_t alpha_narrow;
 622         uint16x8_t alpha_wide;
 623
 624         /* get the alphas spread out properly */
 625         alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
 626         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
 627
 628         /* spread the dest */
 629         dst_wide = vmovl_u8(dst_raw);
 630
 631         /* alpha mul the dest */
 632         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
 633         dst_cooked = vshrn_n_u16(dst_wide, 8);
 634
 635         /* sum -- ignoring any byte lane overflows */
 636         dst_final = vadd_u8(src_raw, dst_cooked);
 637     }
 638
 639 #if    UNROLL > 2
 640     /* the 3rd and 4th bits of our unrolling */
 641     {
 642         uint8x8_t dst_cooked;
 643         uint16x8_t dst_wide;
 644         uint8x8_t alpha_narrow;
 645         uint16x8_t alpha_wide;
 646
 647         alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
 648         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
 649
 650         /* spread the dest */
 651         dst_wide = vmovl_u8(dst_raw_2);
 652
 653         /* alpha mul the dest */
 654         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
 655         dst_cooked = vshrn_n_u16(dst_wide, 8);
 656
 657         /* sum -- ignoring any byte lane overflows */
 658         dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
 659     }
 660 #endif
 661
 662         vst1_u32(dst, vreinterpret_u32_u8(dst_final));
 663 #if    UNROLL > 2
 664         vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
 665 #endif
 666
 667         src += UNROLL;
 668         dst += UNROLL;
 669         count -= UNROLL;
 670     }
 671 #undef    UNROLL
 672
 673     /* do any residual iterations */
 674         while (--count >= 0) {
 675             *dst = SkPMSrcOver(*src, *dst);
 676             src += 1;
 677             dst += 1;
 678         }
 679     }
 680 }
 681
 682 void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
 683                                 const SkPMColor* SK_RESTRICT src,
 684                                 int count, U8CPU alpha) {
 685     SkASSERT(255 == alpha);
 686
 687     if (count <= 0)
 688     return;
 689
 690     /* Use these to check if src is transparent or opaque */
 691     const unsigned int ALPHA_OPAQ  = 0xFF000000;
 692     const unsigned int ALPHA_TRANS = 0x00FFFFFF;
 693
 694 #define UNROLL  4
 695     const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
 696     const SkPMColor* SK_RESTRICT src_temp = src;
 697
 698     /* set up the NEON variables */
 699     uint8x8_t alpha_mask;
 700     static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
 701     alpha_mask = vld1_u8(alpha_mask_setup);
 702
 703     uint8x8_t src_raw, dst_raw, dst_final;
 704     uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
 705     uint8x8_t dst_cooked;
 706     uint16x8_t dst_wide;
 707     uint8x8_t alpha_narrow;
 708     uint16x8_t alpha_wide;
 709
 710     /* choose the first processing type */
 711     if( src >= src_end)
 712         goto TAIL;
 713     if(*src <= ALPHA_TRANS)
 714         goto ALPHA_0;
 715     if(*src >= ALPHA_OPAQ)
 716         goto ALPHA_255;
 717     /* fall-thru */
 718
 719 ALPHA_1_TO_254:
 720     do {
 721
 722         /* get the source */
 723         src_raw = vreinterpret_u8_u32(vld1_u32(src));
 724         src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
 725
 726         /* get and hold the dst too */
 727         dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
 728         dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
 729
 730
 731         /* get the alphas spread out properly */
 732         alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
 733         /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
 734         /* we collapsed (255-a)+1 ... */
 735         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
 736
 737         /* spread the dest */
 738         dst_wide = vmovl_u8(dst_raw);
 739
 740         /* alpha mul the dest */
 741         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
 742         dst_cooked = vshrn_n_u16(dst_wide, 8);
 743
 744         /* sum -- ignoring any byte lane overflows */
 745         dst_final = vadd_u8(src_raw, dst_cooked);
 746
 747         alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
 748         /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
 749         /* we collapsed (255-a)+1 ... */
 750         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
 751
 752         /* spread the dest */
 753         dst_wide = vmovl_u8(dst_raw_2);
 754
 755         /* alpha mul the dest */
 756         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
 757         dst_cooked = vshrn_n_u16(dst_wide, 8);
 758
 759         /* sum -- ignoring any byte lane overflows */
 760         dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
 761
 762         vst1_u32(dst, vreinterpret_u32_u8(dst_final));
 763         vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
 764
 765         src += UNROLL;
 766         dst += UNROLL;
 767
 768         /* if 2 of the next pixels aren't between 1 and 254
 769         it might make sense to go to the optimized loops */
 770         if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
 771             break;
 772
 773     } while(src < src_end);
 774
 775     if (src >= src_end)
 776         goto TAIL;
 777
 778     if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
 779         goto ALPHA_255;
 780
 781     /*fall-thru*/
 782
 783 ALPHA_0:
 784
 785     /*In this state, we know the current alpha is 0 and
 786      we optimize for the next alpha also being zero. */
 787     src_temp = src;  //so we don't have to increment dst every time
 788     do {
 789         if(*(++src) > ALPHA_TRANS)
 790             break;
 791         if(*(++src) > ALPHA_TRANS)
 792             break;
 793         if(*(++src) > ALPHA_TRANS)
 794             break;
 795         if(*(++src) > ALPHA_TRANS)
 796             break;
 797     } while(src < src_end);
 798
 799     dst += (src - src_temp);
 800
 801     /* no longer alpha 0, so determine where to go next. */
 802     if( src >= src_end)
 803         goto TAIL;
 804     if(*src >= ALPHA_OPAQ)
 805         goto ALPHA_255;
 806     else
 807         goto ALPHA_1_TO_254;
 808
 809 ALPHA_255:
 810     while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
 811         dst[0]=src[0];
 812         dst[1]=src[1];
 813         dst[2]=src[2];
 814         dst[3]=src[3];
 815         src+=UNROLL;
 816         dst+=UNROLL;
 817         if(src >= src_end)
 818             goto TAIL;
 819     }
 820
 821     //Handle remainder.
 822     if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
 823         if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
 824             if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
 825         }
 826     }
 827
 828     if( src >= src_end)
 829         goto TAIL;
 830     if(*src <= ALPHA_TRANS)
 831         goto ALPHA_0;
 832     else
 833         goto ALPHA_1_TO_254;
 834
 835 TAIL:
 836     /* do any residual iterations */
 837     src_end += UNROLL + 1;  //goto the real end
 838     while(src != src_end) {
 839         if( *src != 0 ) {
 840             if( *src >= ALPHA_OPAQ ) {
 841                 *dst = *src;
 842             }
 843             else {
 844                 *dst = SkPMSrcOver(*src, *dst);
 845             }
 846         }
 847         src++;
 848         dst++;
 849     }
 850
 851 #undef    UNROLL
 852     return;
 853 }
 854
 855 /* Neon version of S32_Blend_BlitRow32()
 856  * portable version is in src/core/SkBlitRow_D32.cpp
 857  */
 858 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
 859                               const SkPMColor* SK_RESTRICT src,
 860                               int count, U8CPU alpha) {
 861     SkASSERT(alpha <= 255);
 862
 863     if (count <= 0) {
 864         return;
 865     }
 866
 867     uint16_t src_scale = SkAlpha255To256(alpha);
 868     uint16_t dst_scale = 256 - src_scale;
 869
 870     while (count >= 2) {
 871         uint8x8_t vsrc, vdst, vres;
 872         uint16x8_t vsrc_wide, vdst_wide;
 873
 874         /* These commented prefetches are a big win for count
 875          * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
 876          * They also hurt a little (<5%) on an A15
 877          */
 878         //__builtin_prefetch(src+32);
 879         //__builtin_prefetch(dst+32);
 880
 881         // Load
 882         vsrc = vreinterpret_u8_u32(vld1_u32(src));
 883         vdst = vreinterpret_u8_u32(vld1_u32(dst));
 884
 885         // Process src
 886         vsrc_wide = vmovl_u8(vsrc);
 887         vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
 888
 889         // Process dst
 890         vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
 891
 892         // Combine
 893         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
 894
 895         // Store
 896         vst1_u32(dst, vreinterpret_u32_u8(vres));
 897
 898         src += 2;
 899         dst += 2;
 900         count -= 2;
 901     }
 902
 903     if (count == 1) {
 904         uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
 905         uint16x8_t vsrc_wide, vdst_wide;
 906
 907         // Load
 908         vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
 909         vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
 910
 911         // Process
 912         vsrc_wide = vmovl_u8(vsrc);
 913         vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
 914         vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
 915         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
 916
 917         // Store
 918         vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
 919     }
 920 }
 921
 922 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
 923                          const SkPMColor* SK_RESTRICT src,
 924                          int count, U8CPU alpha) {
 925
 926     SkASSERT(255 >= alpha);
 927
 928     if (count <= 0) {
 929         return;
 930     }
 931
 932     unsigned alpha256 = SkAlpha255To256(alpha);
 933
 934     // First deal with odd counts
 935     if (count & 1) {
 936         uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
 937         uint16x8_t vdst_wide, vsrc_wide;
 938         unsigned dst_scale;
 939
 940         // Load
 941         vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
 942         vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
 943
 944         // Calc dst_scale
 945         dst_scale = vget_lane_u8(vsrc, 3);
 946         dst_scale *= alpha256;
 947         dst_scale >>= 8;
 948         dst_scale = 256 - dst_scale;
 949
 950         // Process src
 951         vsrc_wide = vmovl_u8(vsrc);
 952         vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
 953
 954         // Process dst
 955         vdst_wide = vmovl_u8(vdst);
 956         vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
 957
 958         // Combine
 959         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
 960
 961         vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
 962         dst++;
 963         src++;
 964         count--;
 965     }
 966
 967     if (count) {
 968         uint8x8_t alpha_mask;
 969         static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
 970         alpha_mask = vld1_u8(alpha_mask_setup);
 971
 972         do {
 973
 974             uint8x8_t vsrc, vdst, vres, vsrc_alphas;
 975             uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
 976
 977             __builtin_prefetch(src+32);
 978             __builtin_prefetch(dst+32);
 979
 980             // Load
 981             vsrc = vreinterpret_u8_u32(vld1_u32(src));
 982             vdst = vreinterpret_u8_u32(vld1_u32(dst));
 983
 984             // Prepare src_scale
 985             vsrc_scale = vdupq_n_u16(alpha256);
 986
 987             // Calc dst_scale
 988             vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
 989             vdst_scale = vmovl_u8(vsrc_alphas);
 990             vdst_scale *= vsrc_scale;
 991             vdst_scale = vshrq_n_u16(vdst_scale, 8);
 992             vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
 993
 994             // Process src
 995             vsrc_wide = vmovl_u8(vsrc);
 996             vsrc_wide *= vsrc_scale;
 997
 998             // Process dst
 999             vdst_wide = vmovl_u8(vdst);
1000             vdst_wide *= vdst_scale;
1001
1002             // Combine
1003             vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1004
1005             vst1_u32(dst, vreinterpret_u32_u8(vres));
1006
1007             src += 2;
1008             dst += 2;
1009             count -= 2;
1010         } while(count);
1011     }
1012 }
1013
1014 ///////////////////////////////////////////////////////////////////////////////
1015
#undef    DEBUG_OPAQUE_DITHER

#if    defined(DEBUG_OPAQUE_DITHER)
/* Debug helper: prints "<str>:" followed by the first len bytes at p as
 * two-digit hex values, on one SkDebugf() line.
 *
 * Bytes are read as uint8_t so values >= 0x80 are not sign-extended, and all
 * formatting is bounded by the size of the static line buffer (overlong
 * output is truncated rather than overflowing it). Uses a static buffer.
 */
static void showme8(const char *str, const void *p, int len)
{
    static char buf[256];
    const uint8_t *pc = (const uint8_t*) p;
    size_t used;
    int n;
    int i;

    buf[0] = '\0';
    n = snprintf(buf, sizeof(buf), "%8s:", str);
    if (n < 0) {
        n = 0;
    }
    /* snprintf reports the untruncated length; clamp to what was stored */
    used = ((size_t)n < sizeof(buf)) ? (size_t)n : sizeof(buf) - 1;

    for (i = 0; i < len && used < sizeof(buf) - 1; i++) {
        n = snprintf(buf + used, sizeof(buf) - used, "   %02x", pc[i]);
        if (n < 0) {
            buf[used] = '\0';
            break;
        }
        used += ((size_t)n < sizeof(buf) - used) ? (size_t)n
                                                 : sizeof(buf) - used - 1;
    }
    SkDebugf("%s\n", buf);
}

/* Debug helper: like showme8(), but prints the data at p as four-digit hex
 * 16-bit values. len is the size of the data in BYTES, not in elements.
 */
static void showme16(const char *str, const void *p, int len)
{
    static char buf[256];
    const uint16_t *pc = (const uint16_t*) p;
    const int elems = len / (int) sizeof(uint16_t);    /* passed as bytes */
    size_t used;
    int n;
    int i;

    buf[0] = '\0';
    n = snprintf(buf, sizeof(buf), "%8s:", str);
    if (n < 0) {
        n = 0;
    }
    /* snprintf reports the untruncated length; clamp to what was stored */
    used = ((size_t)n < sizeof(buf)) ? (size_t)n : sizeof(buf) - 1;

    for (i = 0; i < elems && used < sizeof(buf) - 1; i++) {
        n = snprintf(buf + used, sizeof(buf) - used, " %04x", pc[i]);
        if (n < 0) {
            buf[used] = '\0';
            break;
        }
        used += ((size_t)n < sizeof(buf) - used) ? (size_t)n
                                                 : sizeof(buf) - used - 1;
    }
    SkDebugf("%s\n", buf);
}
#endif
1047
1048 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
1049                                    const SkPMColor* SK_RESTRICT src,
1050                                    int count, U8CPU alpha, int x, int y) {
1051     SkASSERT(255 == alpha);
1052
1053 #define    UNROLL    8
1054
1055     if (count >= UNROLL) {
1056
1057 #if defined(DEBUG_OPAQUE_DITHER)
1058     uint16_t tmpbuf[UNROLL];
1059     int td[UNROLL];
1060     int tdv[UNROLL];
1061     int ta[UNROLL];
1062     int tap[UNROLL];
1063     uint16_t in_dst[UNROLL];
1064     int offset = 0;
1065     int noisy = 0;
1066 #endif
1067
1068     uint8x8_t dbase;
1069     const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1070     dbase = vld1_u8(dstart);
1071
1072         do {
1073         uint8x8_t sr, sg, sb, sa, d;
1074         uint16x8_t dst8, scale8, alpha8;
1075         uint16x8_t dst_r, dst_g, dst_b;
1076
1077 #if defined(DEBUG_OPAQUE_DITHER)
1078         // calculate 8 elements worth into a temp buffer
1079         {
1080         int my_y = y;
1081         int my_x = x;
1082         SkPMColor* my_src = (SkPMColor*)src;
1083         uint16_t* my_dst = dst;
1084         int i;
1085
1086         DITHER_565_SCAN(my_y);
1087         for(i = 0; i < UNROLL; i++) {
1088             SkPMColor c = *my_src++;
1089             SkPMColorAssert(c);
1090             if (c) {
1091                 unsigned a = SkGetPackedA32(c);
1092
1093                 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
1094                 tdv[i] = DITHER_VALUE(my_x);
1095                 ta[i] = a;
1096                 tap[i] = SkAlpha255To256(a);
1097                 td[i] = d;
1098
1099                 unsigned sr = SkGetPackedR32(c);
1100                 unsigned sg = SkGetPackedG32(c);
1101                 unsigned sb = SkGetPackedB32(c);
1102                 sr = SkDITHER_R32_FOR_565(sr, d);
1103                 sg = SkDITHER_G32_FOR_565(sg, d);
1104                 sb = SkDITHER_B32_FOR_565(sb, d);
1105
1106                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1107                 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
1108                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1109                 // now src and dst expanded are in g:11 r:10 x:1 b:10
1110                 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1111                 td[i] = d;
1112             } else {
1113                 tmpbuf[i] = *my_dst;
1114                 ta[i] = tdv[i] = td[i] = 0xbeef;
1115             }
1116             in_dst[i] = *my_dst;
1117             my_dst += 1;
1118             DITHER_INC_X(my_x);
1119         }
1120         }
1121 #endif
1122
1123
1124         {
1125         register uint8x8_t d0 asm("d0");
1126         register uint8x8_t d1 asm("d1");
1127         register uint8x8_t d2 asm("d2");
1128         register uint8x8_t d3 asm("d3");
1129
1130         asm ("vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1131             : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
1132             :
1133         );
1134 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
1135             sr = d2; sg = d1; sb = d0; sa = d3;
1136 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
1137             sr = d0; sg = d1; sb = d2; sa = d3;
1138 #endif
1139         }
1140
1141         /* calculate 'd', which will be 0..7
1142          * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
1143          */
1144         alpha8 = vmovl_u8(dbase);
1145         alpha8 = vmlal_u8(alpha8, sa, dbase);
1146         d = vshrn_n_u16(alpha8, 8);    // narrowing too
1147
1148         // sr = sr - (sr>>5) + d
1149         /* watching for 8-bit overflow.  d is 0..7; risky range of
1150          * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1151          * safe  as long as we do ((sr-sr>>5) + d)
1152          */
1153         sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1154         sr = vadd_u8(sr, d);
1155
1156         // sb = sb - (sb>>5) + d
1157         sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1158         sb = vadd_u8(sb, d);
1159
1160         // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1161         sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1162         sg = vadd_u8(sg, vshr_n_u8(d,1));
1163
1164         // need to pick up 8 dst's -- at 16 bits each, 128 bits
1165         dst8 = vld1q_u16(dst);
1166         dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
1167         dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
1168         dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits
1169
1170         // blend
1171         scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1172
1173         // combine the addq and mul, save 3 insns
1174         scale8 = vshrq_n_u16(scale8, 3);
1175         dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1176         dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1177         dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1178
1179         // repack to store
1180         dst8 = vshrq_n_u16(dst_b, 5);
1181         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1182         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1183
1184         vst1q_u16(dst, dst8);
1185
1186 #if defined(DEBUG_OPAQUE_DITHER)
1187         // verify my 8 elements match the temp buffer
1188         {
1189         int i, bad=0;
1190         static int invocation;
1191
1192         for (i = 0; i < UNROLL; i++) {
1193             if (tmpbuf[i] != dst[i]) {
1194                 bad=1;
1195             }
1196         }
1197         if (bad) {
1198             SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1199                      invocation, offset);
1200             SkDebugf("  alpha 0x%x\n", alpha);
1201             for (i = 0; i < UNROLL; i++)
1202                 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1203                          i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
1204                          in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);
1205
1206             showme16("alpha8", &alpha8, sizeof(alpha8));
1207             showme16("scale8", &scale8, sizeof(scale8));
1208             showme8("d", &d, sizeof(d));
1209             showme16("dst8", &dst8, sizeof(dst8));
1210             showme16("dst_b", &dst_b, sizeof(dst_b));
1211             showme16("dst_g", &dst_g, sizeof(dst_g));
1212             showme16("dst_r", &dst_r, sizeof(dst_r));
1213             showme8("sb", &sb, sizeof(sb));
1214             showme8("sg", &sg, sizeof(sg));
1215             showme8("sr", &sr, sizeof(sr));
1216
1217             return;
1218         }
1219         offset += UNROLL;
1220         invocation++;
1221         }
1222 #endif
1223         dst += UNROLL;
1224         count -= UNROLL;
1225         // skip x += UNROLL, since it's unchanged mod-4
1226         } while (count >= UNROLL);
1227     }
1228 #undef    UNROLL
1229
1230     // residuals
1231     if (count > 0) {
1232         DITHER_565_SCAN(y);
1233         do {
1234             SkPMColor c = *src++;
1235             SkPMColorAssert(c);
1236             if (c) {
1237                 unsigned a = SkGetPackedA32(c);
1238
1239                 // dither and alpha are just temporary variables to work-around
1240                 // an ICE in debug.
1241                 unsigned dither = DITHER_VALUE(x);
1242                 unsigned alpha = SkAlpha255To256(a);
1243                 int d = SkAlphaMul(dither, alpha);
1244
1245                 unsigned sr = SkGetPackedR32(c);
1246                 unsigned sg = SkGetPackedG32(c);
1247                 unsigned sb = SkGetPackedB32(c);
1248                 sr = SkDITHER_R32_FOR_565(sr, d);
1249                 sg = SkDITHER_G32_FOR_565(sg, d);
1250                 sb = SkDITHER_B32_FOR_565(sb, d);
1251
1252                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1253                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1254                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1255                 // now src and dst expanded are in g:11 r:10 x:1 b:10
1256                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1257             }
1258             dst += 1;
1259             DITHER_INC_X(x);
1260         } while (--count != 0);
1261     }
1262 }
1263
1264 ///////////////////////////////////////////////////////////////////////////////
1265
1266 #undef    DEBUG_S32_OPAQUE_DITHER
1267
1268 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1269                                  const SkPMColor* SK_RESTRICT src,
1270                                  int count, U8CPU alpha, int x, int y) {
1271     SkASSERT(255 == alpha);
1272
1273 #define    UNROLL    8
1274     if (count >= UNROLL) {
1275     uint8x8_t d;
1276     const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1277     d = vld1_u8(dstart);
1278
1279     while (count >= UNROLL) {
1280         uint8x8_t sr, sg, sb;
1281         uint16x8_t dr, dg, db;
1282         uint16x8_t dst8;
1283
1284         {
1285         register uint8x8_t d0 asm("d0");
1286         register uint8x8_t d1 asm("d1");
1287         register uint8x8_t d2 asm("d2");
1288         register uint8x8_t d3 asm("d3");
1289
1290         asm (
1291             "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1292             : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1293             :
1294         );
1295         sg = d1;
1296 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
1297         sr = d2; sb = d0;
1298 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
1299         sr = d0; sb = d2;
1300 #endif
1301         }
1302         /* XXX: if we want to prefetch, hide it in the above asm()
1303          * using the gcc __builtin_prefetch(), the prefetch will
1304          * fall to the bottom of the loop -- it won't stick up
1305          * at the top of the loop, just after the vld4.
1306          */
1307
1308         // sr = sr - (sr>>5) + d
1309         sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1310         dr = vaddl_u8(sr, d);
1311
1312         // sb = sb - (sb>>5) + d
1313         sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1314         db = vaddl_u8(sb, d);
1315
1316         // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1317         sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1318         dg = vaddl_u8(sg, vshr_n_u8(d, 1));
1319
1320         // pack high bits of each into 565 format  (rgb, b is lsb)
1321         dst8 = vshrq_n_u16(db, 3);
1322         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1323         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
1324
1325         // store it
1326         vst1q_u16(dst, dst8);
1327
1328 #if    defined(DEBUG_S32_OPAQUE_DITHER)
1329         // always good to know if we generated good results
1330         {
1331         int i, myx = x, myy = y;
1332         DITHER_565_SCAN(myy);
1333         for (i=0;i<UNROLL;i++) {
1334             // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
1335             SkPMColor c = src[i-8];
1336             unsigned dither = DITHER_VALUE(myx);
1337             uint16_t val = SkDitherRGB32To565(c, dither);
1338             if (val != dst[i]) {
1339             SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
1340                 c, dither, val, dst[i], dstart[i]);
1341             }
1342             DITHER_INC_X(myx);
1343         }
1344         }
1345 #endif
1346
1347         dst += UNROLL;
1348         // we don't need to increment src as the asm above has already done it
1349         count -= UNROLL;
1350         x += UNROLL;        // probably superfluous
1351     }
1352     }
1353 #undef    UNROLL
1354
1355     // residuals
1356     if (count > 0) {
1357         DITHER_565_SCAN(y);
1358         do {
1359             SkPMColor c = *src++;
1360             SkPMColorAssert(c);
1361             SkASSERT(SkGetPackedA32(c) == 255);
1362
1363             unsigned dither = DITHER_VALUE(x);
1364             *dst++ = SkDitherRGB32To565(c, dither);
1365             DITHER_INC_X(x);
1366         } while (--count != 0);
1367     }
1368 }
1369
1370 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
1371                       SkPMColor color) {
1372     if (count <= 0) {
1373         return;
1374     }
1375
1376     if (0 == color) {
1377         if (src != dst) {
1378             memcpy(dst, src, count * sizeof(SkPMColor));
1379         }
1380         return;
1381     }
1382
1383     unsigned colorA = SkGetPackedA32(color);
1384     if (255 == colorA) {
1385         sk_memset32(dst, color, count);
1386         return;
1387     }
1388
1389     unsigned scale = 256 - SkAlpha255To256(colorA);
1390
1391     if (count >= 8) {
1392         uint32x4_t vcolor;
1393         uint8x8_t vscale;
1394
1395         vcolor = vdupq_n_u32(color);
1396
1397         // scale numerical interval [0-255], so load as 8 bits
1398         vscale = vdup_n_u8(scale);
1399
1400         do {
1401             // load src color, 8 pixels, 4 64 bit registers
1402             // (and increment src).
1403             uint32x2x4_t vsrc;
1404 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
1405             asm (
1406                 "vld1.32    %h[vsrc], [%[src]]!"
1407                 : [vsrc] "=w" (vsrc), [src] "+r" (src)
1408                 : :
1409             );
1410 #else // (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
1411             vsrc.val[0] = vld1_u32(src);
1412             vsrc.val[1] = vld1_u32(src+2);
1413             vsrc.val[2] = vld1_u32(src+4);
1414             vsrc.val[3] = vld1_u32(src+6);
1415             src += 8;
1416 #endif
1417
1418             // multiply long by scale, 64 bits at a time,
1419             // destination into a 128 bit register.
1420             uint16x8x4_t vtmp;
1421             vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);
1422             vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);
1423             vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);
1424             vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);
1425
1426             // shift the 128 bit registers, containing the 16
1427             // bit scaled values back to 8 bits, narrowing the
1428             // results to 64 bit registers.
1429             uint8x16x2_t vres;
1430             vres.val[0] = vcombine_u8(
1431                             vshrn_n_u16(vtmp.val[0], 8),
1432                             vshrn_n_u16(vtmp.val[1], 8));
1433             vres.val[1] = vcombine_u8(
1434                             vshrn_n_u16(vtmp.val[2], 8),
1435                             vshrn_n_u16(vtmp.val[3], 8));
1436
1437             // adding back the color, using 128 bit registers.
1438             uint32x4x2_t vdst;
1439             vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +
1440                                                vreinterpretq_u8_u32(vcolor));
1441             vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +
1442                                                vreinterpretq_u8_u32(vcolor));
1443
1444             // store back the 8 calculated pixels (2 128 bit
1445             // registers), and increment dst.
1446 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
1447             asm (
1448                 "vst1.32    %h[vdst], [%[dst]]!"
1449                 : [dst] "+r" (dst)
1450                 : [vdst] "w" (vdst)
1451                 : "memory"
1452             );
1453 #else // (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
1454             vst1q_u32(dst, vdst.val[0]);
1455             vst1q_u32(dst+4, vdst.val[1]);
1456             dst += 8;
1457 #endif
1458             count -= 8;
1459
1460         } while (count >= 8);
1461     }
1462
1463     while (count > 0) {
1464         *dst = color + SkAlphaMulQ(*src, scale);
1465         src += 1;
1466         dst += 1;
1467         count--;
1468     }
1469 }
1470
1471 ///////////////////////////////////////////////////////////////////////////////
1472
1473 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
1474     // no dither
1475     S32_D565_Opaque_neon,
1476     S32_D565_Blend_neon,
1477     S32A_D565_Opaque_neon,
1478     S32A_D565_Blend_neon,
1479
1480     // dither
1481     S32_D565_Opaque_Dither_neon,
1482     S32_D565_Blend_Dither_neon,
1483     S32A_D565_Opaque_Dither_neon,
1484     NULL,   // S32A_D565_Blend_Dither
1485 };
1486
1487 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1488     NULL,   // S32_Opaque,
1489     S32_Blend_BlitRow32_neon,        // S32_Blend,
1490     /*
1491      * We have two choices for S32A_Opaque procs. The one reads the src alpha
1492      * value and attempts to optimize accordingly.  The optimization is
1493      * sensitive to the source content and is not a win in all cases. For
1494      * example, if there are a lot of transitions between the alpha states,
1495      * the performance will almost certainly be worse.  However, for many
1496      * common cases the performance is equivalent or better than the standard
1497      * case where we do not inspect the src alpha.
1498      */
1499 #if SK_A32_SHIFT == 24
1500     // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
1501     S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
1502 #else
1503     S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
1504 #endif
1505     S32A_Blend_BlitRow32_neon        // S32A_Blend
1506 };