src/third_party/skia/src/opts/SkBlitRow_opts_arm_neon.cpp

   1 /*
   2  * Copyright 2012 The Android Open Source Project
   3  *
   4  * Use of this source code is governed by a BSD-style license that can be
   5  * found in the LICENSE file.
   6  */
   7
   8 #include "SkBlitRow_opts_arm_neon.h"
   9
  10 #include "SkBlitMask.h"
  11 #include "SkBlitRow.h"
  12 #include "SkColorPriv.h"
  13 #include "SkDither.h"
  14 #include "SkMathPriv.h"
  15 #include "SkUtils.h"
  16
  17 #include "SkCachePreload_arm.h"
  18 #include "SkColor_opts_neon.h"
  19 #include <arm_neon.h>
  20
  21 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
  22                            const SkPMColor* SK_RESTRICT src, int count,
  23                            U8CPU alpha, int /*x*/, int /*y*/) {
  24     SkASSERT(255 == alpha);
  25
  26     while (count >= 8) {
  27         uint8x8x4_t vsrc;
  28         uint16x8_t vdst;
  29
  30         // Load
  31         vsrc = vld4_u8((uint8_t*)src);
  32
  33         // Convert src to 565
  34         vdst = SkPixel32ToPixel16_neon8(vsrc);
  35
  36         // Store
  37         vst1q_u16(dst, vdst);
  38
  39         // Prepare next iteration
  40         dst += 8;
  41         src += 8;
  42         count -= 8;
  43     };
  44
  45     // Leftovers
  46     while (count > 0) {
  47         SkPMColor c = *src++;
  48         SkPMColorAssert(c);
  49         *dst = SkPixel32ToPixel16_ToU16(c);
  50         dst++;
  51         count--;
  52     };
  53 }
  54
  55 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
  56                            const SkPMColor* SK_RESTRICT src, int count,
  57                            U8CPU alpha, int /*x*/, int /*y*/) {
  58     SkASSERT(255 == alpha);
  59
  60     if (count >= 8) {
  61         uint16_t* SK_RESTRICT keep_dst = 0;
  62
  63         asm volatile (
  64                       "ands       ip, %[count], #7            \n\t"
  65                       "vmov.u8    d31, #1<<7                  \n\t"
  66                       "vld1.16    {q12}, [%[dst]]             \n\t"
  67                       "vld4.8     {d0-d3}, [%[src]]           \n\t"
  68                       // Thumb does not support the standard ARM conditional
  69                       // instructions but instead requires the 'it' instruction
  70                       // to signal conditional execution
  71                       "it eq                                  \n\t"
  72                       "moveq      ip, #8                      \n\t"
  73                       "mov        %[keep_dst], %[dst]         \n\t"
  74
  75                       "add        %[src], %[src], ip, LSL#2   \n\t"
  76                       "add        %[dst], %[dst], ip, LSL#1   \n\t"
  77                       "subs       %[count], %[count], ip      \n\t"
  78                       "b          9f                          \n\t"
  79                       // LOOP
  80                       "2:                                         \n\t"
  81
  82                       "vld1.16    {q12}, [%[dst]]!            \n\t"
  83                       "vld4.8     {d0-d3}, [%[src]]!          \n\t"
  84                       "vst1.16    {q10}, [%[keep_dst]]        \n\t"
  85                       "sub        %[keep_dst], %[dst], #8*2   \n\t"
  86                       "subs       %[count], %[count], #8      \n\t"
  87                       "9:                                         \n\t"
  88                       "pld        [%[dst],#32]                \n\t"
  89                       // expand 0565 q12 to 8888 {d4-d7}
  90                       "vmovn.u16  d4, q12                     \n\t"
  91                       "vshr.u16   q11, q12, #5                \n\t"
  92                       "vshr.u16   q10, q12, #6+5              \n\t"
  93                       "vmovn.u16  d5, q11                     \n\t"
  94                       "vmovn.u16  d6, q10                     \n\t"
  95                       "vshl.u8    d4, d4, #3                  \n\t"
  96                       "vshl.u8    d5, d5, #2                  \n\t"
  97                       "vshl.u8    d6, d6, #3                  \n\t"
  98
  99                       "vmovl.u8   q14, d31                    \n\t"
 100                       "vmovl.u8   q13, d31                    \n\t"
 101                       "vmovl.u8   q12, d31                    \n\t"
 102
 103                       // duplicate in 4/2/1 & 8pix vsns
 104                       "vmvn.8     d30, d3                     \n\t"
 105                       "vmlal.u8   q14, d30, d6                \n\t"
 106                       "vmlal.u8   q13, d30, d5                \n\t"
 107                       "vmlal.u8   q12, d30, d4                \n\t"
 108                       "vshr.u16   q8, q14, #5                 \n\t"
 109                       "vshr.u16   q9, q13, #6                 \n\t"
 110                       "vaddhn.u16 d6, q14, q8                 \n\t"
 111                       "vshr.u16   q8, q12, #5                 \n\t"
 112                       "vaddhn.u16 d5, q13, q9                 \n\t"
 113                       "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
 114                       "vaddhn.u16 d4, q12, q8                 \n\t"
 115                       // intentionally don't calculate alpha
 116                       // result in d4-d6
 117
 118                       "vqadd.u8   d5, d5, d1                  \n\t"
 119                       "vqadd.u8   d4, d4, d2                  \n\t"
 120
 121                       // pack 8888 {d4-d6} to 0565 q10
 122                       "vshll.u8   q10, d6, #8                 \n\t"
 123                       "vshll.u8   q3, d5, #8                  \n\t"
 124                       "vshll.u8   q2, d4, #8                  \n\t"
 125                       "vsri.u16   q10, q3, #5                 \n\t"
 126                       "vsri.u16   q10, q2, #11                \n\t"
 127
 128                       "bne        2b                          \n\t"
 129
 130                       "1:                                         \n\t"
 131                       "vst1.16      {q10}, [%[keep_dst]]      \n\t"
 132                       : [count] "+r" (count)
 133                       : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
 134                       : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
 135                       "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
 136                       "d30","d31"
 137                       );
 138     }
 139     else
 140     {   // handle count < 8
 141         uint16_t* SK_RESTRICT keep_dst = 0;
 142
 143         asm volatile (
 144                       "vmov.u8    d31, #1<<7                  \n\t"
 145                       "mov        %[keep_dst], %[dst]         \n\t"
 146
 147                       "tst        %[count], #4                \n\t"
 148                       "beq        14f                         \n\t"
 149                       "vld1.16    {d25}, [%[dst]]!            \n\t"
 150                       "vld1.32    {q1}, [%[src]]!             \n\t"
 151
 152                       "14:                                        \n\t"
 153                       "tst        %[count], #2                \n\t"
 154                       "beq        12f                         \n\t"
 155                       "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
 156                       "vld1.32    {d1}, [%[src]]!             \n\t"
 157
 158                       "12:                                        \n\t"
 159                       "tst        %[count], #1                \n\t"
 160                       "beq        11f                         \n\t"
 161                       "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
 162                       "vld1.32    {d0[1]}, [%[src]]!          \n\t"
 163
 164                       "11:                                        \n\t"
 165                       // unzips achieve the same as a vld4 operation
 166                       "vuzpq.u16  q0, q1                      \n\t"
 167                       "vuzp.u8    d0, d1                      \n\t"
 168                       "vuzp.u8    d2, d3                      \n\t"
 169                       // expand 0565 q12 to 8888 {d4-d7}
 170                       "vmovn.u16  d4, q12                     \n\t"
 171                       "vshr.u16   q11, q12, #5                \n\t"
 172                       "vshr.u16   q10, q12, #6+5              \n\t"
 173                       "vmovn.u16  d5, q11                     \n\t"
 174                       "vmovn.u16  d6, q10                     \n\t"
 175                       "vshl.u8    d4, d4, #3                  \n\t"
 176                       "vshl.u8    d5, d5, #2                  \n\t"
 177                       "vshl.u8    d6, d6, #3                  \n\t"
 178
 179                       "vmovl.u8   q14, d31                    \n\t"
 180                       "vmovl.u8   q13, d31                    \n\t"
 181                       "vmovl.u8   q12, d31                    \n\t"
 182
 183                       // duplicate in 4/2/1 & 8pix vsns
 184                       "vmvn.8     d30, d3                     \n\t"
 185                       "vmlal.u8   q14, d30, d6                \n\t"
 186                       "vmlal.u8   q13, d30, d5                \n\t"
 187                       "vmlal.u8   q12, d30, d4                \n\t"
 188                       "vshr.u16   q8, q14, #5                 \n\t"
 189                       "vshr.u16   q9, q13, #6                 \n\t"
 190                       "vaddhn.u16 d6, q14, q8                 \n\t"
 191                       "vshr.u16   q8, q12, #5                 \n\t"
 192                       "vaddhn.u16 d5, q13, q9                 \n\t"
 193                       "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
 194                       "vaddhn.u16 d4, q12, q8                 \n\t"
 195                       // intentionally don't calculate alpha
 196                       // result in d4-d6
 197
 198                       "vqadd.u8   d5, d5, d1                  \n\t"
 199                       "vqadd.u8   d4, d4, d2                  \n\t"
 200
 201                       // pack 8888 {d4-d6} to 0565 q10
 202                       "vshll.u8   q10, d6, #8                 \n\t"
 203                       "vshll.u8   q3, d5, #8                  \n\t"
 204                       "vshll.u8   q2, d4, #8                  \n\t"
 205                       "vsri.u16   q10, q3, #5                 \n\t"
 206                       "vsri.u16   q10, q2, #11                \n\t"
 207
 208                       // store
 209                       "tst        %[count], #4                \n\t"
 210                       "beq        24f                         \n\t"
 211                       "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
 212
 213                       "24:                                        \n\t"
 214                       "tst        %[count], #2                \n\t"
 215                       "beq        22f                         \n\t"
 216                       "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
 217
 218                       "22:                                        \n\t"
 219                       "tst        %[count], #1                \n\t"
 220                       "beq        21f                         \n\t"
 221                       "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
 222
 223                       "21:                                        \n\t"
 224                       : [count] "+r" (count)
 225                       : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
 226                       : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
 227                       "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
 228                       "d30","d31"
 229                       );
 230     }
 231 }
 232
 233 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
 234     prod += vdupq_n_u16(128);
 235     prod += vshrq_n_u16(prod, 8);
 236     return vshrq_n_u16(prod, 8);
 237 }
 238
 239 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
 240                           const SkPMColor* SK_RESTRICT src, int count,
 241                           U8CPU alpha, int /*x*/, int /*y*/) {
 242    SkASSERT(255 > alpha);
 243
 244     /* This code implements a Neon version of S32A_D565_Blend. The results have
 245      * a few mismatches compared to the original code. These mismatches never
 246      * exceed 1.
 247      */
 248
 249     if (count >= 8) {
 250         uint16x8_t valpha_max, vmask_blue;
 251         uint8x8_t valpha;
 252
 253         // prepare constants
 254         valpha_max = vmovq_n_u16(255);
 255         valpha = vdup_n_u8(alpha);
 256         vmask_blue = vmovq_n_u16(SK_B16_MASK);
 257
 258         do {
 259             uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
 260             uint16x8_t vres_a, vres_r, vres_g, vres_b;
 261             uint8x8x4_t vsrc;
 262
 263             // load pixels
 264             vdst = vld1q_u16(dst);
 265 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
 266             asm (
 267                 "vld4.u8 %h[vsrc], [%[src]]!"
 268                 : [vsrc] "=w" (vsrc), [src] "+&r" (src)
 269                 : :
 270             );
 271 #else
 272             register uint8x8_t d0 asm("d0");
 273             register uint8x8_t d1 asm("d1");
 274             register uint8x8_t d2 asm("d2");
 275             register uint8x8_t d3 asm("d3");
 276
 277             asm volatile (
 278                 "vld4.u8    {d0-d3},[%[src]]!;"
 279                 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
 280                   [src] "+&r" (src)
 281                 : :
 282             );
 283             vsrc.val[0] = d0;
 284             vsrc.val[1] = d1;
 285             vsrc.val[2] = d2;
 286             vsrc.val[3] = d3;
 287 #endif
 288
 289
 290             // deinterleave dst
 291             vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
 292             vdst_b = vdst & vmask_blue;                     // extract blue
 293             vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
 294             vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
 295
 296             // shift src to 565
 297             vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
 298             vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
 299             vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
 300
 301             // calc src * src_scale
 302             vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
 303             vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
 304             vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
 305             vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
 306
 307             // prepare dst_scale
 308             vres_a = SkDiv255Round_neon8(vres_a);
 309             vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
 310
 311             // add dst * dst_scale to previous result
 312             vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
 313             vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
 314             vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
 315
 316 #ifdef S32A_D565_BLEND_EXACT
 317             // It is possible to get exact results with this but it is slow,
 318             // even slower than C code in some cases
 319             vres_r = SkDiv255Round_neon8(vres_r);
 320             vres_g = SkDiv255Round_neon8(vres_g);
 321             vres_b = SkDiv255Round_neon8(vres_b);
 322 #else
 323             vres_r = vrshrq_n_u16(vres_r, 8);
 324             vres_g = vrshrq_n_u16(vres_g, 8);
 325             vres_b = vrshrq_n_u16(vres_b, 8);
 326 #endif
 327             // pack result
 328             vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
 329             vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
 330
 331             // store
 332             vst1q_u16(dst, vres_b);
 333             dst += 8;
 334             count -= 8;
 335         } while (count >= 8);
 336     }
 337
 338     // leftovers
 339     while (count-- > 0) {
 340         SkPMColor sc = *src++;
 341         if (sc) {
 342             uint16_t dc = *dst;
 343             unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
 344             unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
 345             unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
 346             unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
 347             *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
 348         }
 349         dst += 1;
 350     }
 351 }
 352
 353 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 354  * each dither value is spaced out into byte lanes, and repeated
 355  * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 356  * start of each row.
 357  */
 358 static const uint8_t gDitherMatrix_Neon[48] = {
 359     0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
 360     6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
 361     1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
 362     7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
 363
 364 };
 365
 366 void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
 367                                 int count, U8CPU alpha, int x, int y)
 368 {
 369
 370     SkASSERT(255 > alpha);
 371
 372     // rescale alpha to range 1 - 256
 373     int scale = SkAlpha255To256(alpha);
 374
 375     if (count >= 8) {
 376         /* select row and offset for dither array */
 377         const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
 378
 379         uint8x8_t vdither = vld1_u8(dstart);         // load dither values
 380         uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
 381
 382         int16x8_t vscale = vdupq_n_s16(scale);        // duplicate scale into neon reg
 383         uint16x8_t vmask_b = vdupq_n_u16(0x1F);         // set up blue mask
 384
 385         do {
 386
 387             uint8x8_t vsrc_r, vsrc_g, vsrc_b;
 388             uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
 389             uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
 390             uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
 391             uint16x8_t vdst;
 392             uint16x8_t vdst_r, vdst_g, vdst_b;
 393             int16x8_t vres_r, vres_g, vres_b;
 394             int8x8_t vres8_r, vres8_g, vres8_b;
 395
 396             // Load source and add dither
 397             {
 398             register uint8x8_t d0 asm("d0");
 399             register uint8x8_t d1 asm("d1");
 400             register uint8x8_t d2 asm("d2");
 401             register uint8x8_t d3 asm("d3");
 402
 403             asm (
 404                 "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
 405                 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
 406                 :
 407             );
 408             vsrc_g = d1;
 409 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
 410             vsrc_r = d2; vsrc_b = d0;
 411 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
 412             vsrc_r = d0; vsrc_b = d2;
 413 #endif
 414             }
 415
 416             vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
 417             vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
 418             vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
 419
 420             vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
 421             vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
 422             vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen
 423
 424             vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
 425             vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
 426             vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result
 427
 428             vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
 429             vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
 430             vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
 431
 432             // Load dst and unpack
 433             vdst = vld1q_u16(dst);
 434             vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
 435             vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
 436             vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue
 437
 438             // subtract dst from src and widen
 439             vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
 440             vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
 441             vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
 442
 443             // multiply diffs by scale and shift
 444             vres_r = vmulq_s16(vres_r, vscale);
 445             vres_g = vmulq_s16(vres_g, vscale);
 446             vres_b = vmulq_s16(vres_b, vscale);
 447
 448             vres8_r = vshrn_n_s16(vres_r, 8);
 449             vres8_g = vshrn_n_s16(vres_g, 8);
 450             vres8_b = vshrn_n_s16(vres_b, 8);
 451
 452             // add dst to result
 453             vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
 454             vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
 455             vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
 456
 457             // put result into 565 format
 458             vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
 459             vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
 460
 461             // Store result
 462             vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
 463
 464             // Next iteration
 465             dst += 8;
 466             count -= 8;
 467
 468         } while (count >= 8);
 469     }
 470
 471     // Leftovers
 472     if (count > 0) {
 473         int scale = SkAlpha255To256(alpha);
 474         DITHER_565_SCAN(y);
 475         do {
 476             SkPMColor c = *src++;
 477             SkPMColorAssert(c);
 478
 479             int dither = DITHER_VALUE(x);
 480             int sr = SkGetPackedR32(c);
 481             int sg = SkGetPackedG32(c);
 482             int sb = SkGetPackedB32(c);
 483             sr = SkDITHER_R32To565(sr, dither);
 484             sg = SkDITHER_G32To565(sg, dither);
 485             sb = SkDITHER_B32To565(sb, dither);
 486
 487             uint16_t d = *dst;
 488             *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
 489                                  SkAlphaBlend(sg, SkGetPackedG16(d), scale),
 490                                  SkAlphaBlend(sb, SkGetPackedB16(d), scale));
 491             DITHER_INC_X(x);
 492         } while (--count != 0);
 493     }
 494 }
 495
 496 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
 497                                 const SkPMColor* SK_RESTRICT src,
 498                                 int count, U8CPU alpha) {
 499
 500     SkASSERT(255 == alpha);
 501     if (count > 0) {
 502
 503
 504     uint8x8_t alpha_mask;
 505
 506     static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
 507     alpha_mask = vld1_u8(alpha_mask_setup);
 508
 509     /* do the NEON unrolled code */
 510 #define    UNROLL    4
 511     while (count >= UNROLL) {
 512         uint8x8_t src_raw, dst_raw, dst_final;
 513         uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
 514
 515         /* The two prefetches below may make the code slighlty
 516          * slower for small values of count but are worth having
 517          * in the general case.
 518          */
 519         __builtin_prefetch(src+32);
 520         __builtin_prefetch(dst+32);
 521
 522         /* get the source */
 523         src_raw = vreinterpret_u8_u32(vld1_u32(src));
 524 #if    UNROLL > 2
 525         src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
 526 #endif
 527
 528         /* get and hold the dst too */
 529         dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
 530 #if    UNROLL > 2
 531         dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
 532 #endif
 533
 534     /* 1st and 2nd bits of the unrolling */
 535     {
 536         uint8x8_t dst_cooked;
 537         uint16x8_t dst_wide;
 538         uint8x8_t alpha_narrow;
 539         uint16x8_t alpha_wide;
 540
 541         /* get the alphas spread out properly */
 542         alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
 543         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
 544
 545         /* spread the dest */
 546         dst_wide = vmovl_u8(dst_raw);
 547
 548         /* alpha mul the dest */
 549         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
 550         dst_cooked = vshrn_n_u16(dst_wide, 8);
 551
 552         /* sum -- ignoring any byte lane overflows */
 553         dst_final = vadd_u8(src_raw, dst_cooked);
 554     }
 555
 556 #if    UNROLL > 2
 557     /* the 3rd and 4th bits of our unrolling */
 558     {
 559         uint8x8_t dst_cooked;
 560         uint16x8_t dst_wide;
 561         uint8x8_t alpha_narrow;
 562         uint16x8_t alpha_wide;
 563
 564         alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
 565         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
 566
 567         /* spread the dest */
 568         dst_wide = vmovl_u8(dst_raw_2);
 569
 570         /* alpha mul the dest */
 571         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
 572         dst_cooked = vshrn_n_u16(dst_wide, 8);
 573
 574         /* sum -- ignoring any byte lane overflows */
 575         dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
 576     }
 577 #endif
 578
 579         vst1_u32(dst, vreinterpret_u32_u8(dst_final));
 580 #if    UNROLL > 2
 581         vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
 582 #endif
 583
 584         src += UNROLL;
 585         dst += UNROLL;
 586         count -= UNROLL;
 587     }
 588 #undef    UNROLL
 589
 590     /* do any residual iterations */
 591         while (--count >= 0) {
 592             *dst = SkPMSrcOver(*src, *dst);
 593             src += 1;
 594             dst += 1;
 595         }
 596     }
 597 }
 598
 599 void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
 600                                 const SkPMColor* SK_RESTRICT src,
 601                                 int count, U8CPU alpha) {
 602     SkASSERT(255 == alpha);
 603
 604     if (count <= 0)
 605     return;
 606
 607     /* Use these to check if src is transparent or opaque */
 608     const unsigned int ALPHA_OPAQ  = 0xFF000000;
 609     const unsigned int ALPHA_TRANS = 0x00FFFFFF;
 610
 611 #define UNROLL  4
 612     const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
 613     const SkPMColor* SK_RESTRICT src_temp = src;
 614
 615     /* set up the NEON variables */
 616     uint8x8_t alpha_mask;
 617     static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
 618     alpha_mask = vld1_u8(alpha_mask_setup);
 619
 620     uint8x8_t src_raw, dst_raw, dst_final;
 621     uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
 622     uint8x8_t dst_cooked;
 623     uint16x8_t dst_wide;
 624     uint8x8_t alpha_narrow;
 625     uint16x8_t alpha_wide;
 626
 627     /* choose the first processing type */
 628     if( src >= src_end)
 629         goto TAIL;
 630     if(*src <= ALPHA_TRANS)
 631         goto ALPHA_0;
 632     if(*src >= ALPHA_OPAQ)
 633         goto ALPHA_255;
 634     /* fall-thru */
 635
 636 ALPHA_1_TO_254:
 637     do {
 638
 639         /* get the source */
 640         src_raw = vreinterpret_u8_u32(vld1_u32(src));
 641         src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
 642
 643         /* get and hold the dst too */
 644         dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
 645         dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
 646
 647
 648         /* get the alphas spread out properly */
 649         alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
 650         /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
 651         /* we collapsed (255-a)+1 ... */
 652         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
 653
 654         /* spread the dest */
 655         dst_wide = vmovl_u8(dst_raw);
 656
 657         /* alpha mul the dest */
 658         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
 659         dst_cooked = vshrn_n_u16(dst_wide, 8);
 660
 661         /* sum -- ignoring any byte lane overflows */
 662         dst_final = vadd_u8(src_raw, dst_cooked);
 663
 664         alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
 665         /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
 666         /* we collapsed (255-a)+1 ... */
 667         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
 668
 669         /* spread the dest */
 670         dst_wide = vmovl_u8(dst_raw_2);
 671
 672         /* alpha mul the dest */
 673         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
 674         dst_cooked = vshrn_n_u16(dst_wide, 8);
 675
 676         /* sum -- ignoring any byte lane overflows */
 677         dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
 678
 679         vst1_u32(dst, vreinterpret_u32_u8(dst_final));
 680         vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
 681
 682         src += UNROLL;
 683         dst += UNROLL;
 684
 685         /* if 2 of the next pixels aren't between 1 and 254
 686         it might make sense to go to the optimized loops */
 687         if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
 688             break;
 689
 690     } while(src < src_end);
 691
 692     if (src >= src_end)
 693         goto TAIL;
 694
 695     if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
 696         goto ALPHA_255;
 697
 698     /*fall-thru*/
 699
 700 ALPHA_0:
 701
 702     /*In this state, we know the current alpha is 0 and
 703      we optimize for the next alpha also being zero. */
 704     src_temp = src;  //so we don't have to increment dst every time
 705     do {
 706         if(*(++src) > ALPHA_TRANS)
 707             break;
 708         if(*(++src) > ALPHA_TRANS)
 709             break;
 710         if(*(++src) > ALPHA_TRANS)
 711             break;
 712         if(*(++src) > ALPHA_TRANS)
 713             break;
 714     } while(src < src_end);
 715
 716     dst += (src - src_temp);
 717
 718     /* no longer alpha 0, so determine where to go next. */
 719     if( src >= src_end)
 720         goto TAIL;
 721     if(*src >= ALPHA_OPAQ)
 722         goto ALPHA_255;
 723     else
 724         goto ALPHA_1_TO_254;
 725
 726 ALPHA_255:
 727     while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
 728         dst[0]=src[0];
 729         dst[1]=src[1];
 730         dst[2]=src[2];
 731         dst[3]=src[3];
 732         src+=UNROLL;
 733         dst+=UNROLL;
 734         if(src >= src_end)
 735             goto TAIL;
 736     }
 737
 738     //Handle remainder.
 739     if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
 740         if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
 741             if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
 742         }
 743     }
 744
 745     if( src >= src_end)
 746         goto TAIL;
 747     if(*src <= ALPHA_TRANS)
 748         goto ALPHA_0;
 749     else
 750         goto ALPHA_1_TO_254;
 751
 752 TAIL:
 753     /* do any residual iterations */
 754     src_end += UNROLL + 1;  //goto the real end
 755     while(src != src_end) {
 756         if( *src != 0 ) {
 757             if( *src >= ALPHA_OPAQ ) {
 758                 *dst = *src;
 759             }
 760             else {
 761                 *dst = SkPMSrcOver(*src, *dst);
 762             }
 763         }
 764         src++;
 765         dst++;
 766     }
 767
 768 #undef    UNROLL
 769     return;
 770 }
 771
 772 /* Neon version of S32_Blend_BlitRow32()
 773  * portable version is in src/core/SkBlitRow_D32.cpp
 774  */
 775 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
 776                               const SkPMColor* SK_RESTRICT src,
 777                               int count, U8CPU alpha) {
 778     SkASSERT(alpha <= 255);
 779     if (count > 0) {
 780         uint16_t src_scale = SkAlpha255To256(alpha);
 781         uint16_t dst_scale = 256 - src_scale;
 782
 783     /* run them N at a time through the NEON unit */
 784     /* note that each 1 is 4 bytes, each treated exactly the same,
 785      * so we can work under that guise. We *do* know that the src&dst
 786      * will be 32-bit aligned quantities, so we can specify that on
 787      * the load/store ops and do a neon 'reinterpret' to get us to
 788      * byte-sized (pun intended) pieces that we widen/multiply/shift
 789      * we're limited at 128 bits in the wide ops, which is 8x16bits
 790      * or a pair of 32 bit src/dsts.
 791      */
 792     /* we *could* manually unroll this loop so that we load 128 bits
 793      * (as a pair of 64s) from each of src and dst, processing them
 794      * in pieces. This might give us a little better management of
 795      * the memory latency, but my initial attempts here did not
 796      * produce an instruction stream that looked all that nice.
 797      */
 798 #define    UNROLL    2
 799     while (count >= UNROLL) {
 800         uint8x8_t  src_raw, dst_raw, dst_final;
 801         uint16x8_t  src_wide, dst_wide;
 802
 803         /* get 64 bits of src, widen it, multiply by src_scale */
 804         src_raw = vreinterpret_u8_u32(vld1_u32(src));
 805         src_wide = vmovl_u8(src_raw);
 806         /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
 807         src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
 808
 809         /* ditto with dst */
 810         dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
 811         dst_wide = vmovl_u8(dst_raw);
 812
 813         /* combine add with dst multiply into mul-accumulate */
 814         dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));
 815
 816         dst_final = vshrn_n_u16(dst_wide, 8);
 817         vst1_u32(dst, vreinterpret_u32_u8(dst_final));
 818
 819         src += UNROLL;
 820         dst += UNROLL;
 821         count -= UNROLL;
 822     }
 823     /* RBE: well, i don't like how gcc manages src/dst across the above
 824      * loop it's constantly calculating src+bias, dst+bias and it only
 825      * adjusts the real ones when we leave the loop. Not sure why
 826      * it's "hoisting down" (hoisting implies above in my lexicon ;))
 827      * the adjustments to src/dst/count, but it does...
 828      * (might be SSA-style internal logic...
 829      */
 830
 831 #if    UNROLL == 2
 832     if (count == 1) {
 833             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
 834     }
 835 #else
 836     if (count > 0) {
 837             do {
 838                 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
 839                 src += 1;
 840                 dst += 1;
 841             } while (--count > 0);
 842     }
 843 #endif
 844
 845 #undef    UNROLL
 846     }
 847 }
 848
 849 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
 850                          const SkPMColor* SK_RESTRICT src,
 851                          int count, U8CPU alpha) {
 852
 853     SkASSERT(255 >= alpha);
 854
 855     if (count <= 0) {
 856         return;
 857     }
 858
 859     unsigned alpha256 = SkAlpha255To256(alpha);
 860
 861     // First deal with odd counts
 862     if (count & 1) {
 863         uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
 864         uint16x8_t vdst_wide, vsrc_wide;
 865         unsigned dst_scale;
 866
 867         // Load
 868         vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
 869         vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
 870
 871         // Calc dst_scale
 872         dst_scale = vget_lane_u8(vsrc, 3);
 873         dst_scale *= alpha256;
 874         dst_scale >>= 8;
 875         dst_scale = 256 - dst_scale;
 876
 877         // Process src
 878         vsrc_wide = vmovl_u8(vsrc);
 879         vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
 880
 881         // Process dst
 882         vdst_wide = vmovl_u8(vdst);
 883         vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
 884
 885         // Combine
 886         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
 887
 888         vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
 889         dst++;
 890         src++;
 891         count--;
 892     }
 893
 894     if (count) {
 895         uint8x8_t alpha_mask;
 896         static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
 897         alpha_mask = vld1_u8(alpha_mask_setup);
 898
 899         do {
 900
 901             uint8x8_t vsrc, vdst, vres, vsrc_alphas;
 902             uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
 903
 904             __builtin_prefetch(src+32);
 905             __builtin_prefetch(dst+32);
 906
 907             // Load
 908             vsrc = vreinterpret_u8_u32(vld1_u32(src));
 909             vdst = vreinterpret_u8_u32(vld1_u32(dst));
 910
 911             // Prepare src_scale
 912             vsrc_scale = vdupq_n_u16(alpha256);
 913
 914             // Calc dst_scale
 915             vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
 916             vdst_scale = vmovl_u8(vsrc_alphas);
 917             vdst_scale *= vsrc_scale;
 918             vdst_scale = vshrq_n_u16(vdst_scale, 8);
 919             vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
 920
 921             // Process src
 922             vsrc_wide = vmovl_u8(vsrc);
 923             vsrc_wide *= vsrc_scale;
 924
 925             // Process dst
 926             vdst_wide = vmovl_u8(vdst);
 927             vdst_wide *= vdst_scale;
 928
 929             // Combine
 930             vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
 931
 932             vst1_u32(dst, vreinterpret_u32_u8(vres));
 933
 934             src += 2;
 935             dst += 2;
 936             count -= 2;
 937         } while(count);
 938     }
 939 }
 940
 941 ///////////////////////////////////////////////////////////////////////////////
 942
 943 #undef    DEBUG_OPAQUE_DITHER
 944
 945 #if    defined(DEBUG_OPAQUE_DITHER)
 946 static void showme8(char *str, void *p, int len)
 947 {
 948     static char buf[256];
 949     char tbuf[32];
 950     int i;
 951     char *pc = (char*) p;
 952     sprintf(buf,"%8s:", str);
 953     for(i=0;i<len;i++) {
 954         sprintf(tbuf, "   %02x", pc[i]);
 955         strcat(buf, tbuf);
 956     }
 957     SkDebugf("%s\n", buf);
 958 }
 959 static void showme16(char *str, void *p, int len)
 960 {
 961     static char buf[256];
 962     char tbuf[32];
 963     int i;
 964     uint16_t *pc = (uint16_t*) p;
 965     sprintf(buf,"%8s:", str);
 966     len = (len / sizeof(uint16_t));    /* passed as bytes */
 967     for(i=0;i<len;i++) {
 968         sprintf(tbuf, " %04x", pc[i]);
 969         strcat(buf, tbuf);
 970     }
 971     SkDebugf("%s\n", buf);
 972 }
 973 #endif
 974
 975 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
 976                                    const SkPMColor* SK_RESTRICT src,
 977                                    int count, U8CPU alpha, int x, int y) {
 978     SkASSERT(255 == alpha);
 979
 980 #define    UNROLL    8
 981
 982     if (count >= UNROLL) {
 983     uint8x8_t dbase;
 984
 985 #if    defined(DEBUG_OPAQUE_DITHER)
 986     uint16_t tmpbuf[UNROLL];
 987     int td[UNROLL];
 988     int tdv[UNROLL];
 989     int ta[UNROLL];
 990     int tap[UNROLL];
 991     uint16_t in_dst[UNROLL];
 992     int offset = 0;
 993     int noisy = 0;
 994 #endif
 995
 996     const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
 997     dbase = vld1_u8(dstart);
 998
 999         do {
1000         uint8x8_t sr, sg, sb, sa, d;
1001         uint16x8_t dst8, scale8, alpha8;
1002         uint16x8_t dst_r, dst_g, dst_b;
1003
1004 #if    defined(DEBUG_OPAQUE_DITHER)
1005     /* calculate 8 elements worth into a temp buffer */
1006     {
1007       int my_y = y;
1008       int my_x = x;
1009       SkPMColor* my_src = (SkPMColor*)src;
1010       uint16_t* my_dst = dst;
1011       int i;
1012
1013           DITHER_565_SCAN(my_y);
1014           for(i=0;i<UNROLL;i++) {
1015             SkPMColor c = *my_src++;
1016             SkPMColorAssert(c);
1017             if (c) {
1018                 unsigned a = SkGetPackedA32(c);
1019
1020                 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
1021         tdv[i] = DITHER_VALUE(my_x);
1022         ta[i] = a;
1023         tap[i] = SkAlpha255To256(a);
1024         td[i] = d;
1025
1026                 unsigned sr = SkGetPackedR32(c);
1027                 unsigned sg = SkGetPackedG32(c);
1028                 unsigned sb = SkGetPackedB32(c);
1029                 sr = SkDITHER_R32_FOR_565(sr, d);
1030                 sg = SkDITHER_G32_FOR_565(sg, d);
1031                 sb = SkDITHER_B32_FOR_565(sb, d);
1032
1033                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1034                 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
1035                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1036                 // now src and dst expanded are in g:11 r:10 x:1 b:10
1037                 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1038         td[i] = d;
1039
1040             } else {
1041         tmpbuf[i] = *my_dst;
1042         ta[i] = tdv[i] = td[i] = 0xbeef;
1043         }
1044         in_dst[i] = *my_dst;
1045             my_dst += 1;
1046             DITHER_INC_X(my_x);
1047           }
1048     }
1049 #endif
1050
1051         /* source is in ABGR */
1052         {
1053         register uint8x8_t d0 asm("d0");
1054         register uint8x8_t d1 asm("d1");
1055         register uint8x8_t d2 asm("d2");
1056         register uint8x8_t d3 asm("d3");
1057
1058         asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1059             : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
1060             : "r" (src)
1061                     );
1062             sr = d0; sg = d1; sb = d2; sa = d3;
1063         }
1064
1065         /* calculate 'd', which will be 0..7 */
1066         /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
1067 #if defined(SK_BUILD_FOR_ANDROID)
1068         /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1069         alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
1070 #else
1071         alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
1072 #endif
1073         alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
1074         d = vshrn_n_u16(alpha8, 8);    /* narrowing too */
1075
1076         /* sr = sr - (sr>>5) + d */
1077         /* watching for 8-bit overflow.  d is 0..7; risky range of
1078          * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1079          * safe  as long as we do ((sr-sr>>5) + d) */
1080         sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1081         sr = vadd_u8(sr, d);
1082
1083         /* sb = sb - (sb>>5) + d */
1084         sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1085         sb = vadd_u8(sb, d);
1086
1087         /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
1088         sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1089         sg = vadd_u8(sg, vshr_n_u8(d,1));
1090
1091         /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
1092         dst8 = vld1q_u16(dst);
1093         dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
1094         dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
1095         dst_r = vshrq_n_u16(dst8,11);    /* clearing hi bits */
1096
1097         /* blend */
1098 #if 1
1099         /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1100         /* originally 255-sa + 1 */
1101         scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1102 #else
1103         scale8 = vsubw_u8(vdupq_n_u16(255), sa);
1104         scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
1105 #endif
1106
1107 #if 1
1108         /* combine the addq and mul, save 3 insns */
1109         scale8 = vshrq_n_u16(scale8, 3);
1110         dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1111         dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1112         dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1113 #else
1114         /* known correct, but +3 insns over above */
1115         scale8 = vshrq_n_u16(scale8, 3);
1116         dst_b = vmulq_u16(dst_b, scale8);
1117         dst_g = vmulq_u16(dst_g, scale8);
1118         dst_r = vmulq_u16(dst_r, scale8);
1119
1120         /* combine */
1121         /* NB: vshll widens, need to preserve those bits */
1122         dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
1123         dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
1124         dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
1125 #endif
1126
1127         /* repack to store */
1128         dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
1129         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1130         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1131
1132         vst1q_u16(dst, dst8);
1133
1134 #if    defined(DEBUG_OPAQUE_DITHER)
1135         /* verify my 8 elements match the temp buffer */
1136     {
1137        int i, bad=0;
1138        static int invocation;
1139
1140        for (i=0;i<UNROLL;i++)
1141         if (tmpbuf[i] != dst[i]) bad=1;
1142        if (bad) {
1143         SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1144             invocation, offset);
1145         SkDebugf("  alpha 0x%x\n", alpha);
1146         for (i=0;i<UNROLL;i++)
1147             SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1148             i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
1149             dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);
1150
1151         showme16("alpha8", &alpha8, sizeof(alpha8));
1152         showme16("scale8", &scale8, sizeof(scale8));
1153         showme8("d", &d, sizeof(d));
1154         showme16("dst8", &dst8, sizeof(dst8));
1155         showme16("dst_b", &dst_b, sizeof(dst_b));
1156         showme16("dst_g", &dst_g, sizeof(dst_g));
1157         showme16("dst_r", &dst_r, sizeof(dst_r));
1158         showme8("sb", &sb, sizeof(sb));
1159         showme8("sg", &sg, sizeof(sg));
1160         showme8("sr", &sr, sizeof(sr));
1161
1162         /* cop out */
1163         return;
1164        }
1165        offset += UNROLL;
1166        invocation++;
1167     }
1168 #endif
1169
1170             dst += UNROLL;
1171         src += UNROLL;
1172         count -= UNROLL;
1173         /* skip x += UNROLL, since it's unchanged mod-4 */
1174         } while (count >= UNROLL);
1175     }
1176 #undef    UNROLL
1177
1178     /* residuals */
1179     if (count > 0) {
1180         DITHER_565_SCAN(y);
1181         do {
1182             SkPMColor c = *src++;
1183             SkPMColorAssert(c);
1184             if (c) {
1185                 unsigned a = SkGetPackedA32(c);
1186
1187                 // dither and alpha are just temporary variables to work-around
1188                 // an ICE in debug.
1189                 unsigned dither = DITHER_VALUE(x);
1190                 unsigned alpha = SkAlpha255To256(a);
1191                 int d = SkAlphaMul(dither, alpha);
1192
1193                 unsigned sr = SkGetPackedR32(c);
1194                 unsigned sg = SkGetPackedG32(c);
1195                 unsigned sb = SkGetPackedB32(c);
1196                 sr = SkDITHER_R32_FOR_565(sr, d);
1197                 sg = SkDITHER_G32_FOR_565(sg, d);
1198                 sb = SkDITHER_B32_FOR_565(sb, d);
1199
1200                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1201                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1202                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1203                 // now src and dst expanded are in g:11 r:10 x:1 b:10
1204                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1205             }
1206             dst += 1;
1207             DITHER_INC_X(x);
1208         } while (--count != 0);
1209     }
1210 }
1211
1212 ///////////////////////////////////////////////////////////////////////////////
1213
1214 #undef    DEBUG_S32_OPAQUE_DITHER
1215
1216 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1217                                  const SkPMColor* SK_RESTRICT src,
1218                                  int count, U8CPU alpha, int x, int y) {
1219     SkASSERT(255 == alpha);
1220
1221 #define    UNROLL    8
1222     if (count >= UNROLL) {
1223     uint8x8_t d;
1224     const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1225     d = vld1_u8(dstart);
1226
1227     while (count >= UNROLL) {
1228         uint8x8_t sr, sg, sb;
1229         uint16x8_t dr, dg, db;
1230         uint16x8_t dst8;
1231
1232         {
1233         register uint8x8_t d0 asm("d0");
1234         register uint8x8_t d1 asm("d1");
1235         register uint8x8_t d2 asm("d2");
1236         register uint8x8_t d3 asm("d3");
1237
1238         asm (
1239             "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1240             : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1241             :
1242         );
1243         sg = d1;
1244 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
1245         sr = d2; sb = d0;
1246 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
1247         sr = d0; sb = d2;
1248 #endif
1249         }
1250         /* XXX: if we want to prefetch, hide it in the above asm()
1251          * using the gcc __builtin_prefetch(), the prefetch will
1252          * fall to the bottom of the loop -- it won't stick up
1253          * at the top of the loop, just after the vld4.
1254          */
1255
1256         // sr = sr - (sr>>5) + d
1257         sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1258         dr = vaddl_u8(sr, d);
1259
1260         // sb = sb - (sb>>5) + d
1261         sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1262         db = vaddl_u8(sb, d);
1263
1264         // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1265         sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1266         dg = vaddl_u8(sg, vshr_n_u8(d, 1));
1267
1268         // pack high bits of each into 565 format  (rgb, b is lsb)
1269         dst8 = vshrq_n_u16(db, 3);
1270         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1271         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
1272
1273         // store it
1274         vst1q_u16(dst, dst8);
1275
1276 #if    defined(DEBUG_S32_OPAQUE_DITHER)
1277         // always good to know if we generated good results
1278         {
1279         int i, myx = x, myy = y;
1280         DITHER_565_SCAN(myy);
1281         for (i=0;i<UNROLL;i++) {
1282             // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
1283             SkPMColor c = src[i-8];
1284             unsigned dither = DITHER_VALUE(myx);
1285             uint16_t val = SkDitherRGB32To565(c, dither);
1286             if (val != dst[i]) {
1287             SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
1288                 c, dither, val, dst[i], dstart[i]);
1289             }
1290             DITHER_INC_X(myx);
1291         }
1292         }
1293 #endif
1294
1295         dst += UNROLL;
1296         // we don't need to increment src as the asm above has already done it
1297         count -= UNROLL;
1298         x += UNROLL;        // probably superfluous
1299     }
1300     }
1301 #undef    UNROLL
1302
1303     // residuals
1304     if (count > 0) {
1305         DITHER_565_SCAN(y);
1306         do {
1307             SkPMColor c = *src++;
1308             SkPMColorAssert(c);
1309             SkASSERT(SkGetPackedA32(c) == 255);
1310
1311             unsigned dither = DITHER_VALUE(x);
1312             *dst++ = SkDitherRGB32To565(c, dither);
1313             DITHER_INC_X(x);
1314         } while (--count != 0);
1315     }
1316 }
1317
1318 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
1319                       SkPMColor color) {
1320     if (count <= 0) {
1321         return;
1322     }
1323
1324     if (0 == color) {
1325         if (src != dst) {
1326             memcpy(dst, src, count * sizeof(SkPMColor));
1327         }
1328         return;
1329     }
1330
1331     unsigned colorA = SkGetPackedA32(color);
1332     if (255 == colorA) {
1333         sk_memset32(dst, color, count);
1334     } else {
1335         unsigned scale = 256 - SkAlpha255To256(colorA);
1336
1337         if (count >= 8) {
1338             // at the end of this assembly, count will have been decremented
1339             // to a negative value. That is, if count mod 8 = x, it will be
1340             // -8 +x coming out.
1341             asm volatile (
1342                 PLD128(src, 0)
1343
1344                 "vdup.32    q0, %[color]                \n\t"
1345
1346                 PLD128(src, 128)
1347
1348                 // scale numerical interval [0-255], so load as 8 bits
1349                 "vdup.8     d2, %[scale]                \n\t"
1350
1351                 PLD128(src, 256)
1352
1353                 "subs       %[count], %[count], #8      \n\t"
1354
1355                 PLD128(src, 384)
1356
1357                 "Loop_Color32:                          \n\t"
1358
1359                 // load src color, 8 pixels, 4 64 bit registers
1360                 // (and increment src).
1361                 "vld1.32    {d4-d7}, [%[src]]!          \n\t"
1362
1363                 PLD128(src, 384)
1364
1365                 // multiply long by scale, 64 bits at a time,
1366                 // destination into a 128 bit register.
1367                 "vmull.u8   q4, d4, d2                  \n\t"
1368                 "vmull.u8   q5, d5, d2                  \n\t"
1369                 "vmull.u8   q6, d6, d2                  \n\t"
1370                 "vmull.u8   q7, d7, d2                  \n\t"
1371
1372                 // shift the 128 bit registers, containing the 16
1373                 // bit scaled values back to 8 bits, narrowing the
1374                 // results to 64 bit registers.
1375                 "vshrn.i16  d8, q4, #8                  \n\t"
1376                 "vshrn.i16  d9, q5, #8                  \n\t"
1377                 "vshrn.i16  d10, q6, #8                 \n\t"
1378                 "vshrn.i16  d11, q7, #8                 \n\t"
1379
1380                 // adding back the color, using 128 bit registers.
1381                 "vadd.i8    q6, q4, q0                  \n\t"
1382                 "vadd.i8    q7, q5, q0                  \n\t"
1383
1384                 // store back the 8 calculated pixels (2 128 bit
1385                 // registers), and increment dst.
1386                 "vst1.32    {d12-d15}, [%[dst]]!        \n\t"
1387
1388                 "subs       %[count], %[count], #8      \n\t"
1389                 "bge        Loop_Color32                \n\t"
1390                 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
1391                 : [color] "r" (color), [scale] "r" (scale)
1392                 : "cc", "memory",
1393                   "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1394                   "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
1395                           );
1396             // At this point, if we went through the inline assembly, count is
1397             // a negative value:
1398             // if the value is -8, there is no pixel left to process.
1399             // if the value is -7, there is one pixel left to process
1400             // ...
1401             // And'ing it with 7 will give us the number of pixels
1402             // left to process.
1403             count = count & 0x7;
1404         }
1405
1406         while (count > 0) {
1407             *dst = color + SkAlphaMulQ(*src, scale);
1408             src += 1;
1409             dst += 1;
1410             count--;
1411         }
1412     }
1413 }
1414
1415 ///////////////////////////////////////////////////////////////////////////////
1416
// NEON row procs for blitting 32-bit sources into RGB565 destinations.
// The table is positional: four "no dither" entries followed by four
// "dither" entries, each group ordered S32 opaque, S32 blend, S32A opaque,
// S32A blend.
// NOTE(review): presumably a NULL entry means "no NEON version, use the
// portable proc" -- confirm against the code that consumes this table.
const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    // NOTE: For the S32_D565_Blend function below, we don't have a special
    //       version that assumes that each source pixel is opaque. But our
    //       S32A is still faster than the default, so use it.
    S32_D565_Opaque_neon,
    S32A_D565_Blend_neon,   // really S32_D565_Blend
    S32A_D565_Opaque_neon,
    S32A_D565_Blend_neon,

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,   // S32A_D565_Blend_Dither
};
1433
// NEON row procs for 32-bit to 32-bit blits. The table is positional, in the
// order S32 opaque, S32 blend, S32A opaque, S32A blend.
// NOTE(review): presumably a NULL entry means "no NEON version, use the
// portable proc" -- confirm against the code that consumes this table.
const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,        // S32_Blend,
    /*
     * We have two choices for S32A_Opaque procs. One of them reads the src
     * alpha value and attempts to optimize accordingly.  The optimization is
     * sensitive to the source content and is not a win in all cases. For
     * example, if there are a lot of transitions between the alpha states,
     * the performance will almost certainly be worse.  However, for many
     * common cases the performance is equivalent or better than the standard
     * case where we do not inspect the src alpha.
     */
#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
#else
    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
#endif
    S32A_Blend_BlitRow32_neon        // S32A_Blend
};