src/third_party/skia/src/opts/SkBlitRow_opts_SSE2.cpp

   1 /*
   2  * Copyright 2012 The Android Open Source Project
   3  *
   4  * Use of this source code is governed by a BSD-style license that can be
   5  * found in the LICENSE file.
   6  */
   7
   8 #include <emmintrin.h>
   9 #include "SkBitmapProcState_opts_SSE2.h"
  10 #include "SkBlitRow_opts_SSE2.h"
  11 #include "SkColorPriv.h"
  12 #include "SkColor_opts_SSE2.h"
  13 #include "SkDither.h"
  14 #include "SkUtils.h"
  15
  16 /* SSE2 version of S32_Blend_BlitRow32()
  17  * portable version is in core/SkBlitRow_D32.cpp
  18  */
  19 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
  20                               const SkPMColor* SK_RESTRICT src,
  21                               int count, U8CPU alpha) {
  22     SkASSERT(alpha <= 255);
  23     if (count <= 0) {
  24         return;
  25     }
  26
  27     uint32_t src_scale = SkAlpha255To256(alpha);
  28     uint32_t dst_scale = 256 - src_scale;
  29
  30     if (count >= 4) {
  31         SkASSERT(((size_t)dst & 0x03) == 0);
  32         while (((size_t)dst & 0x0F) != 0) {
  33             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
  34             src++;
  35             dst++;
  36             count--;
  37         }
  38
  39         const __m128i *s = reinterpret_cast<const __m128i*>(src);
  40         __m128i *d = reinterpret_cast<__m128i*>(dst);
  41         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
  42         __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
  43
  44         // Move scale factors to upper byte of word
  45         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
  46         __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
  47         while (count >= 4) {
  48             // Load 4 pixels each of src and dest.
  49             __m128i src_pixel = _mm_loadu_si128(s);
  50             __m128i dst_pixel = _mm_load_si128(d);
  51
  52             // Interleave Atom port 0/1 operations based on the execution port
  53             // constraints that multiply can only be executed on port 0 (while
  54             // boolean operations can be executed on either port 0 or port 1)
  55             // because GCC currently doesn't do a good job scheduling
  56             // instructions based on these constraints.
  57
  58             // Get red and blue pixels into lower byte of each word.
  59             // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
  60             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
  61
  62             // Multiply by scale.
  63             // (4 x (0, rs.h, 0, bs.h))
  64             // where rs.h stands for the higher byte of r * scale, and
  65             // bs.h the higher byte of b * scale.
  66             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
  67
  68             // Get alpha and green pixels into higher byte of each word.
  69             // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
  70             __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
  71
  72             // Multiply by scale.
  73             // (4 x (as.h, as.l, gs.h, gs.l))
  74             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
  75
  76             // Clear the lower byte of the a*scale and g*scale results
  77             // (4 x (as.h, 0, gs.h, 0))
  78             src_ag = _mm_and_si128(src_ag, ag_mask);
  79
  80             // Operations the destination pixels are the same as on the
  81             // source pixels. See the comments above.
  82             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
  83             dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
  84             __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
  85             dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
  86             dst_ag = _mm_and_si128(dst_ag, ag_mask);
  87
  88             // Combine back into RGBA.
  89             // (4 x (as.h, rs.h, gs.h, bs.h))
  90             src_pixel = _mm_or_si128(src_rb, src_ag);
  91             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
  92
  93             // Add result
  94             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
  95             _mm_store_si128(d, result);
  96             s++;
  97             d++;
  98             count -= 4;
  99         }
 100         src = reinterpret_cast<const SkPMColor*>(s);
 101         dst = reinterpret_cast<SkPMColor*>(d);
 102     }
 103
 104     while (count > 0) {
 105         *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
 106         src++;
 107         dst++;
 108         count--;
 109     }
 110 }
 111
 112 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
 113                                 const SkPMColor* SK_RESTRICT src,
 114                                 int count, U8CPU alpha) {
 115     SkASSERT(alpha == 255);
 116     if (count <= 0) {
 117         return;
 118     }
 119
 120     if (count >= 4) {
 121         SkASSERT(((size_t)dst & 0x03) == 0);
 122         while (((size_t)dst & 0x0F) != 0) {
 123             *dst = SkPMSrcOver(*src, *dst);
 124             src++;
 125             dst++;
 126             count--;
 127         }
 128
 129         const __m128i *s = reinterpret_cast<const __m128i*>(src);
 130         __m128i *d = reinterpret_cast<__m128i*>(dst);
 131 #ifdef SK_USE_ACCURATE_BLENDING
 132         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
 133         __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
 134         __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
 135         while (count >= 4) {
 136             // Load 4 pixels
 137             __m128i src_pixel = _mm_loadu_si128(s);
 138             __m128i dst_pixel = _mm_load_si128(d);
 139
 140             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
 141             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
 142             // Shift alphas down to lower 8 bits of each quad.
 143             __m128i alpha = _mm_srli_epi32(src_pixel, 24);
 144
 145             // Copy alpha to upper 3rd byte of each quad
 146             alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
 147
 148             // Subtract alphas from 255, to get 0..255
 149             alpha = _mm_sub_epi16(c_255, alpha);
 150
 151             // Multiply by red and blue by src alpha.
 152             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
 153             // Multiply by alpha and green by src alpha.
 154             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
 155
 156             // dst_rb_low = (dst_rb >> 8)
 157             __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
 158             __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
 159
 160             // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
 161             dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
 162             dst_rb = _mm_add_epi16(dst_rb, c_128);
 163             dst_rb = _mm_srli_epi16(dst_rb, 8);
 164
 165             // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
 166             dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
 167             dst_ag = _mm_add_epi16(dst_ag, c_128);
 168             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
 169
 170             // Combine back into RGBA.
 171             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
 172
 173             // Add result
 174             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
 175             _mm_store_si128(d, result);
 176             s++;
 177             d++;
 178             count -= 4;
 179         }
 180 #else
 181         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
 182         __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
 183         while (count >= 4) {
 184             // Load 4 pixels
 185             __m128i src_pixel = _mm_loadu_si128(s);
 186             __m128i dst_pixel = _mm_load_si128(d);
 187
 188             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
 189             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
 190
 191             // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
 192             __m128i alpha = _mm_srli_epi16(src_pixel, 8);
 193
 194             // (a0, a0, a1, a1, a2, g2, a3, g3)
 195             alpha = _mm_shufflehi_epi16(alpha, 0xF5);
 196
 197             // (a0, a0, a1, a1, a2, a2, a3, a3)
 198             alpha = _mm_shufflelo_epi16(alpha, 0xF5);
 199
 200             // Subtract alphas from 256, to get 1..256
 201             alpha = _mm_sub_epi16(c_256, alpha);
 202
 203             // Multiply by red and blue by src alpha.
 204             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
 205             // Multiply by alpha and green by src alpha.
 206             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
 207
 208             // Divide by 256.
 209             dst_rb = _mm_srli_epi16(dst_rb, 8);
 210
 211             // Mask out high bits (already in the right place)
 212             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
 213
 214             // Combine back into RGBA.
 215             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
 216
 217             // Add result
 218             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
 219             _mm_store_si128(d, result);
 220             s++;
 221             d++;
 222             count -= 4;
 223         }
 224 #endif
 225         src = reinterpret_cast<const SkPMColor*>(s);
 226         dst = reinterpret_cast<SkPMColor*>(d);
 227     }
 228
 229     while (count > 0) {
 230         *dst = SkPMSrcOver(*src, *dst);
 231         src++;
 232         dst++;
 233         count--;
 234     }
 235 }
 236
 237 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
 238                                const SkPMColor* SK_RESTRICT src,
 239                                int count, U8CPU alpha) {
 240     SkASSERT(alpha <= 255);
 241     if (count <= 0) {
 242         return;
 243     }
 244
 245     if (count >= 4) {
 246         while (((size_t)dst & 0x0F) != 0) {
 247             *dst = SkBlendARGB32(*src, *dst, alpha);
 248             src++;
 249             dst++;
 250             count--;
 251         }
 252
 253         uint32_t src_scale = SkAlpha255To256(alpha);
 254
 255         const __m128i *s = reinterpret_cast<const __m128i*>(src);
 256         __m128i *d = reinterpret_cast<__m128i*>(dst);
 257         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
 258         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
 259         __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
 260         while (count >= 4) {
 261             // Load 4 pixels each of src and dest.
 262             __m128i src_pixel = _mm_loadu_si128(s);
 263             __m128i dst_pixel = _mm_load_si128(d);
 264
 265             // Get red and blue pixels into lower byte of each word.
 266             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
 267             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
 268
 269             // Get alpha and green into lower byte of each word.
 270             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
 271             __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
 272
 273             // Put per-pixel alpha in low byte of each word.
 274             // After the following two statements, the dst_alpha looks like
 275             // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
 276             __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
 277             dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
 278
 279             // dst_alpha = dst_alpha * src_scale
 280             // Because src_scales are in the higher byte of each word and
 281             // we use mulhi here, the resulting alpha values are already
 282             // in the right place and don't need to be divided by 256.
 283             // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
 284             dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
 285
 286             // Subtract alphas from 256, to get 1..256
 287             dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
 288
 289             // Multiply red and blue by dst pixel alpha.
 290             dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
 291             // Multiply alpha and green by dst pixel alpha.
 292             dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
 293
 294             // Multiply red and blue by global alpha.
 295             // (4 x (0, rs.h, 0, bs.h))
 296             // where rs.h stands for the higher byte of r * src_scale,
 297             // and bs.h the higher byte of b * src_scale.
 298             // Again, because we use mulhi, the resuling red and blue
 299             // values are already in the right place and don't need to
 300             // be divided by 256.
 301             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
 302             // Multiply alpha and green by global alpha.
 303             // (4 x (0, as.h, 0, gs.h))
 304             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
 305
 306             // Divide by 256.
 307             dst_rb = _mm_srli_epi16(dst_rb, 8);
 308
 309             // Mask out low bits (goodies already in the right place; no need to divide)
 310             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
 311             // Shift alpha and green to higher byte of each word.
 312             // (4 x (as.h, 0, gs.h, 0))
 313             src_ag = _mm_slli_epi16(src_ag, 8);
 314
 315             // Combine back into RGBA.
 316             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
 317             src_pixel = _mm_or_si128(src_rb, src_ag);
 318
 319             // Add two pixels into result.
 320             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
 321             _mm_store_si128(d, result);
 322             s++;
 323             d++;
 324             count -= 4;
 325         }
 326         src = reinterpret_cast<const SkPMColor*>(s);
 327         dst = reinterpret_cast<SkPMColor*>(d);
 328     }
 329
 330     while (count > 0) {
 331         *dst = SkBlendARGB32(*src, *dst, alpha);
 332         src++;
 333         dst++;
 334         count--;
 335     }
 336 }
 337
 338 /* SSE2 version of Color32()
 339  * portable version is in core/SkBlitRow_D32.cpp
 340  */
 341 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
 342                   SkPMColor color) {
 343     if (count <= 0) {
 344         return;
 345     }
 346
 347     if (0 == color) {
 348         if (src != dst) {
 349             memcpy(dst, src, count * sizeof(SkPMColor));
 350         }
 351         return;
 352     }
 353
 354     unsigned colorA = SkGetPackedA32(color);
 355     if (255 == colorA) {
 356         sk_memset32(dst, color, count);
 357     } else {
 358         unsigned scale = 256 - SkAlpha255To256(colorA);
 359
 360         if (count >= 4) {
 361             SkASSERT(((size_t)dst & 0x03) == 0);
 362             while (((size_t)dst & 0x0F) != 0) {
 363                 *dst = color + SkAlphaMulQ(*src, scale);
 364                 src++;
 365                 dst++;
 366                 count--;
 367             }
 368
 369             const __m128i *s = reinterpret_cast<const __m128i*>(src);
 370             __m128i *d = reinterpret_cast<__m128i*>(dst);
 371             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
 372             __m128i src_scale_wide = _mm_set1_epi16(scale);
 373             __m128i color_wide = _mm_set1_epi32(color);
 374             while (count >= 4) {
 375                 // Load 4 pixels each of src and dest.
 376                 __m128i src_pixel = _mm_loadu_si128(s);
 377
 378                 // Get red and blue pixels into lower byte of each word.
 379                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
 380
 381                 // Get alpha and green into lower byte of each word.
 382                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
 383
 384                 // Multiply by scale.
 385                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
 386                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
 387
 388                 // Divide by 256.
 389                 src_rb = _mm_srli_epi16(src_rb, 8);
 390                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
 391
 392                 // Combine back into RGBA.
 393                 src_pixel = _mm_or_si128(src_rb, src_ag);
 394
 395                 // Add color to result.
 396                 __m128i result = _mm_add_epi8(color_wide, src_pixel);
 397
 398                 // Store result.
 399                 _mm_store_si128(d, result);
 400                 s++;
 401                 d++;
 402                 count -= 4;
 403             }
 404             src = reinterpret_cast<const SkPMColor*>(s);
 405             dst = reinterpret_cast<SkPMColor*>(d);
 406         }
 407
 408         while (count > 0) {
 409             *dst = color + SkAlphaMulQ(*src, scale);
 410             src += 1;
 411             dst += 1;
 412             count--;
 413         }
 414     }
 415 }
 416
 417 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
 418                                size_t maskRB, SkColor origColor,
 419                                int width, int height) {
 420     SkPMColor color = SkPreMultiplyColor(origColor);
 421     size_t dstOffset = dstRB - (width << 2);
 422     size_t maskOffset = maskRB - width;
 423     SkPMColor* dst = (SkPMColor *)device;
 424     const uint8_t* mask = (const uint8_t*)maskPtr;
 425     do {
 426         int count = width;
 427         if (count >= 4) {
 428             while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
 429                 *dst = SkBlendARGB32(color, *dst, *mask);
 430                 mask++;
 431                 dst++;
 432                 count--;
 433             }
 434             __m128i *d = reinterpret_cast<__m128i*>(dst);
 435             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
 436             __m128i c_256 = _mm_set1_epi16(256);
 437             __m128i c_1 = _mm_set1_epi16(1);
 438             __m128i src_pixel = _mm_set1_epi32(color);
 439             while (count >= 4) {
 440                 // Load 4 pixels each of src and dest.
 441                 __m128i dst_pixel = _mm_load_si128(d);
 442
 443                 //set the aphla value
 444                 __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
 445                                 0, *(mask+3),0, \
 446                                 *(mask+2),0, *(mask+2),\
 447                                 0,*(mask+1), 0,*(mask+1),\
 448                                 0, *mask,0,*mask);
 449
 450                 //call SkAlpha255To256()
 451                 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
 452
 453                 // Get red and blue pixels into lower byte of each word.
 454                 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
 455                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
 456
 457                 // Get alpha and green into lower byte of each word.
 458                 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
 459                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
 460
 461                 // Put per-pixel alpha in low byte of each word.
 462                 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
 463                 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
 464
 465                 // dst_alpha = dst_alpha * src_scale
 466                 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
 467
 468                 // Divide by 256.
 469                 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
 470
 471                 // Subtract alphas from 256, to get 1..256
 472                 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
 473                 // Multiply red and blue by dst pixel alpha.
 474                 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
 475                 // Multiply alpha and green by dst pixel alpha.
 476                 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
 477
 478                 // Multiply red and blue by global alpha.
 479                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
 480                 // Multiply alpha and green by global alpha.
 481                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
 482                 // Divide by 256.
 483                 dst_rb = _mm_srli_epi16(dst_rb, 8);
 484                 src_rb = _mm_srli_epi16(src_rb, 8);
 485
 486                 // Mask out low bits (goodies already in the right place; no need to divide)
 487                 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
 488                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
 489
 490                 // Combine back into RGBA.
 491                 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
 492                 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
 493
 494                 // Add two pixels into result.
 495                 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
 496                 _mm_store_si128(d, result);
 497                 // load the next 4 pixel
 498                 mask = mask + 4;
 499                 d++;
 500                 count -= 4;
 501             }
 502             dst = reinterpret_cast<SkPMColor *>(d);
 503         }
 504         while (count > 0) {
 505             *dst= SkBlendARGB32(color, *dst, *mask);
 506             dst += 1;
 507             mask++;
 508             count --;
 509         }
 510         dst = (SkPMColor *)((char*)dst + dstOffset);
 511         mask += maskOffset;
 512     } while (--height != 0);
 513 }
 514
 515 // The following (left) shifts cause the top 5 bits of the mask components to
 516 // line up with the corresponding components in an SkPMColor.
 517 // Note that the mask's RGB16 order may differ from the SkPMColor order.
 518 #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
 519 #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
 520 #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
 521
 522 #if SK_R16x5_R32x5_SHIFT == 0
 523     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
 524 #elif SK_R16x5_R32x5_SHIFT > 0
 525     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
 526 #else
 527     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
 528 #endif
 529
 530 #if SK_G16x5_G32x5_SHIFT == 0
 531     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
 532 #elif SK_G16x5_G32x5_SHIFT > 0
 533     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
 534 #else
 535     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
 536 #endif
 537
 538 #if SK_B16x5_B32x5_SHIFT == 0
 539     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
 540 #elif SK_B16x5_B32x5_SHIFT > 0
 541     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
 542 #else
 543     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
 544 #endif
 545
 546 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
 547                                  __m128i &mask, __m128i &srcA) {
 548     // In the following comments, the components of src, dst and mask are
 549     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
 550     // by an R, G, B, or A suffix. Components of one of the four pixels that
 551     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
 552     // example is the blue channel of the second destination pixel. Memory
 553     // layout is shown for an ARGB byte order in a color value.
 554
 555     // src and srcA store 8-bit values interleaved with zeros.
 556     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
 557     // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
 558     //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
 559     // mask stores 16-bit values (compressed three channels) interleaved with zeros.
 560     // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
 561     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
 562     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
 563
 564     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
 565     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
 566     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
 567                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
 568
 569     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
 570     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
 571                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
 572
 573     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
 574     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
 575                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
 576
 577     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
 578     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
 579     // 8-bit position
 580     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
 581     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
 582     mask = _mm_or_si128(_mm_or_si128(r, g), b);
 583
 584     // Interleave R,G,B into the lower byte of word.
 585     // i.e. split the sixteen 8-bit values from mask into two sets of eight
 586     // 16-bit values, padded by zero.
 587     __m128i maskLo, maskHi;
 588     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
 589     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
 590     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
 591     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
 592
 593     // Upscale from 0..31 to 0..32
 594     // (allows to replace division by left-shift further down)
 595     // Left-shift each component by 4 and add the result back to that component,
 596     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
 597     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
 598     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
 599
 600     // Multiply each component of maskLo and maskHi by srcA
 601     maskLo = _mm_mullo_epi16(maskLo, srcA);
 602     maskHi = _mm_mullo_epi16(maskHi, srcA);
 603
 604     // Left shift mask components by 8 (divide by 256)
 605     maskLo = _mm_srli_epi16(maskLo, 8);
 606     maskHi = _mm_srli_epi16(maskHi, 8);
 607
 608     // Interleave R,G,B into the lower byte of the word
 609     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
 610     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
 611     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
 612     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
 613
 614     // mask = (src - dst) * mask
 615     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
 616     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
 617
 618     // mask = (src - dst) * mask >> 5
 619     maskLo = _mm_srai_epi16(maskLo, 5);
 620     maskHi = _mm_srai_epi16(maskHi, 5);
 621
 622     // Add two pixels into result.
 623     // result = dst + ((src - dst) * mask >> 5)
 624     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
 625     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
 626
 627     // Pack into 4 32bit dst pixels.
 628     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
 629     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
 630     // clamping to 255 if necessary.
 631     return _mm_packus_epi16(resultLo, resultHi);
 632 }
 633
 634 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
 635                                        __m128i &mask) {
 636     // In the following comments, the components of src, dst and mask are
 637     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
 638     // by an R, G, B, or A suffix. Components of one of the four pixels that
 639     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
 640     // example is the blue channel of the second destination pixel. Memory
 641     // layout is shown for an ARGB byte order in a color value.
 642
 643     // src and srcA store 8-bit values interleaved with zeros.
 644     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
 645     // mask stores 16-bit values (shown as high and low bytes) interleaved with
 646     // zeros
 647     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
 648     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
 649
 650     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
 651     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
 652     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
 653                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
 654
 655     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
 656     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
 657                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
 658
 659     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
 660     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
 661                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
 662
 663     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
 664     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
 665     // 8-bit position
 666     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
 667     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
 668     mask = _mm_or_si128(_mm_or_si128(r, g), b);
 669
 670     // Interleave R,G,B into the lower byte of word.
 671     // i.e. split the sixteen 8-bit values from mask into two sets of eight
 672     // 16-bit values, padded by zero.
 673     __m128i maskLo, maskHi;
 674     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
 675     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
 676     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
 677     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
 678
 679     // Upscale from 0..31 to 0..32
 680     // (allows to replace division by left-shift further down)
 681     // Left-shift each component by 4 and add the result back to that component,
 682     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
 683     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
 684     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
 685
 686     // Interleave R,G,B into the lower byte of the word
 687     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
 688     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
 689     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
 690     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
 691
 692     // mask = (src - dst) * mask
 693     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
 694     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
 695
 696     // mask = (src - dst) * mask >> 5
 697     maskLo = _mm_srai_epi16(maskLo, 5);
 698     maskHi = _mm_srai_epi16(maskHi, 5);
 699
 700     // Add two pixels into result.
 701     // result = dst + ((src - dst) * mask >> 5)
 702     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
 703     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
 704
 705     // Pack into 4 32bit dst pixels and force opaque.
 706     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
 707     // Merge into one SSE register with sixteen 8-bit values (four pixels),
 708     // clamping to 255 if necessary. Set alpha components to 0xFF.
 709     return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
 710                         _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
 711 }
 712
 713 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
 714                          SkColor src, int width, SkPMColor) {
 715     if (width <= 0) {
 716         return;
 717     }
 718
 719     int srcA = SkColorGetA(src);
 720     int srcR = SkColorGetR(src);
 721     int srcG = SkColorGetG(src);
 722     int srcB = SkColorGetB(src);
 723
 724     srcA = SkAlpha255To256(srcA);
 725
 726     if (width >= 4) {
 727         SkASSERT(((size_t)dst & 0x03) == 0);
 728         while (((size_t)dst & 0x0F) != 0) {
 729             *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
 730             mask++;
 731             dst++;
 732             width--;
 733         }
 734
 735         __m128i *d = reinterpret_cast<__m128i*>(dst);
 736         // Set alpha to 0xFF and replicate source four times in SSE register.
 737         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
 738         // Interleave with zeros to get two sets of four 16-bit values.
 739         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
 740         // Set srcA_sse to contain eight copies of srcA, padded with zero.
 741         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
 742         __m128i srcA_sse = _mm_set1_epi16(srcA);
 743         while (width >= 4) {
 744             // Load four destination pixels into dst_sse.
 745             __m128i dst_sse = _mm_load_si128(d);
 746             // Load four 16-bit masks into lower half of mask_sse.
 747             __m128i mask_sse = _mm_loadl_epi64(
 748                                    reinterpret_cast<const __m128i*>(mask));
 749
 750             // Check whether masks are equal to 0 and get the highest bit
 751             // of each byte of result, if masks are all zero, we will get
 752             // pack_cmp to 0xFFFF
 753             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
 754                                              _mm_setzero_si128()));
 755
 756             // if mask pixels are not all zero, we will blend the dst pixels
 757             if (pack_cmp != 0xFFFF) {
 758                 // Unpack 4 16bit mask pixels to
 759                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
 760                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
 761                 mask_sse = _mm_unpacklo_epi16(mask_sse,
 762                                               _mm_setzero_si128());
 763
 764                 // Process 4 32bit dst pixels
 765                 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
 766                                                    mask_sse, srcA_sse);
 767                 _mm_store_si128(d, result);
 768             }
 769
 770             d++;
 771             mask += 4;
 772             width -= 4;
 773         }
 774
 775         dst = reinterpret_cast<SkPMColor*>(d);
 776     }
 777
 778     while (width > 0) {
 779         *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
 780         mask++;
 781         dst++;
 782         width--;
 783     }
 784 }
 785
 786 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
 787                                SkColor src, int width, SkPMColor opaqueDst) {
 788     if (width <= 0) {
 789         return;
 790     }
 791
 792     int srcR = SkColorGetR(src);
 793     int srcG = SkColorGetG(src);
 794     int srcB = SkColorGetB(src);
 795
 796     if (width >= 4) {
 797         SkASSERT(((size_t)dst & 0x03) == 0);
 798         while (((size_t)dst & 0x0F) != 0) {
 799             *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
 800             mask++;
 801             dst++;
 802             width--;
 803         }
 804
 805         __m128i *d = reinterpret_cast<__m128i*>(dst);
 806         // Set alpha to 0xFF and replicate source four times in SSE register.
 807         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
 808         // Set srcA_sse to contain eight copies of srcA, padded with zero.
 809         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
 810         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
 811         while (width >= 4) {
 812             // Load four destination pixels into dst_sse.
 813             __m128i dst_sse = _mm_load_si128(d);
 814             // Load four 16-bit masks into lower half of mask_sse.
 815             __m128i mask_sse = _mm_loadl_epi64(
 816                                    reinterpret_cast<const __m128i*>(mask));
 817
 818             // Check whether masks are equal to 0 and get the highest bit
 819             // of each byte of result, if masks are all zero, we will get
 820             // pack_cmp to 0xFFFF
 821             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
 822                                              _mm_setzero_si128()));
 823
 824             // if mask pixels are not all zero, we will blend the dst pixels
 825             if (pack_cmp != 0xFFFF) {
 826                 // Unpack 4 16bit mask pixels to
 827                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
 828                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
 829                 mask_sse = _mm_unpacklo_epi16(mask_sse,
 830                                               _mm_setzero_si128());
 831
 832                 // Process 4 32bit dst pixels
 833                 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
 834                                                          mask_sse);
 835                 _mm_store_si128(d, result);
 836             }
 837
 838             d++;
 839             mask += 4;
 840             width -= 4;
 841         }
 842
 843         dst = reinterpret_cast<SkPMColor*>(d);
 844     }
 845
 846     while (width > 0) {
 847         *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
 848         mask++;
 849         dst++;
 850         width--;
 851     }
 852 }
 853
 854 /* SSE2 version of S32_D565_Opaque()
 855  * portable version is in core/SkBlitRow_D16.cpp
 856  */
 857 void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
 858                           const SkPMColor* SK_RESTRICT src, int count,
 859                           U8CPU alpha, int /*x*/, int /*y*/) {
 860     SkASSERT(255 == alpha);
 861
 862     if (count <= 0) {
 863         return;
 864     }
 865
 866     if (count >= 8) {
 867         while (((size_t)dst & 0x0F) != 0) {
 868             SkPMColor c = *src++;
 869             SkPMColorAssert(c);
 870
 871             *dst++ = SkPixel32ToPixel16_ToU16(c);
 872             count--;
 873         }
 874
 875         const __m128i* s = reinterpret_cast<const __m128i*>(src);
 876         __m128i* d = reinterpret_cast<__m128i*>(dst);
 877         __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);
 878         __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);
 879         __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);
 880
 881         while (count >= 8) {
 882             // Load 8 pixels of src.
 883             __m128i src_pixel1 = _mm_loadu_si128(s++);
 884             __m128i src_pixel2 = _mm_loadu_si128(s++);
 885
 886             // Calculate result r.
 887             __m128i r1 = _mm_srli_epi32(src_pixel1,
 888                                         SK_R32_SHIFT + (8 - SK_R16_BITS));
 889             r1 = _mm_and_si128(r1, r16_mask);
 890             __m128i r2 = _mm_srli_epi32(src_pixel2,
 891                                         SK_R32_SHIFT + (8 - SK_R16_BITS));
 892             r2 = _mm_and_si128(r2, r16_mask);
 893             __m128i r = _mm_packs_epi32(r1, r2);
 894
 895             // Calculate result g.
 896             __m128i g1 = _mm_srli_epi32(src_pixel1,
 897                                         SK_G32_SHIFT + (8 - SK_G16_BITS));
 898             g1 = _mm_and_si128(g1, g16_mask);
 899             __m128i g2 = _mm_srli_epi32(src_pixel2,
 900                                         SK_G32_SHIFT + (8 - SK_G16_BITS));
 901             g2 = _mm_and_si128(g2, g16_mask);
 902             __m128i g = _mm_packs_epi32(g1, g2);
 903
 904             // Calculate result b.
 905             __m128i b1 = _mm_srli_epi32(src_pixel1,
 906                                         SK_B32_SHIFT + (8 - SK_B16_BITS));
 907             b1 = _mm_and_si128(b1, b16_mask);
 908             __m128i b2 = _mm_srli_epi32(src_pixel2,
 909                                         SK_B32_SHIFT + (8 - SK_B16_BITS));
 910             b2 = _mm_and_si128(b2, b16_mask);
 911             __m128i b = _mm_packs_epi32(b1, b2);
 912
 913             // Store 8 16-bit colors in dst.
 914             __m128i d_pixel = SkPackRGB16_SSE2(r, g, b);
 915             _mm_store_si128(d++, d_pixel);
 916             count -= 8;
 917         }
 918         src = reinterpret_cast<const SkPMColor*>(s);
 919         dst = reinterpret_cast<uint16_t*>(d);
 920     }
 921
 922     if (count > 0) {
 923         do {
 924             SkPMColor c = *src++;
 925             SkPMColorAssert(c);
 926             *dst++ = SkPixel32ToPixel16_ToU16(c);
 927         } while (--count != 0);
 928     }
 929 }
 930
 931 /* SSE2 version of S32A_D565_Opaque()
 932  * portable version is in core/SkBlitRow_D16.cpp
 933  */
 934 void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
 935                            const SkPMColor* SK_RESTRICT src,
 936                            int count, U8CPU alpha, int /*x*/, int /*y*/) {
 937     SkASSERT(255 == alpha);
 938
 939     if (count <= 0) {
 940         return;
 941     }
 942
 943     if (count >= 8) {
 944         // Make dst 16 bytes alignment
 945         while (((size_t)dst & 0x0F) != 0) {
 946             SkPMColor c = *src++;
 947             if (c) {
 948               *dst = SkSrcOver32To16(c, *dst);
 949             }
 950             dst += 1;
 951             count--;
 952         }
 953
 954         const __m128i* s = reinterpret_cast<const __m128i*>(src);
 955         __m128i* d = reinterpret_cast<__m128i*>(dst);
 956         __m128i var255 = _mm_set1_epi16(255);
 957         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
 958         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
 959         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
 960
 961         while (count >= 8) {
 962             // Load 8 pixels of src.
 963             __m128i src_pixel1 = _mm_loadu_si128(s++);
 964             __m128i src_pixel2 = _mm_loadu_si128(s++);
 965
 966             // Check whether src pixels are equal to 0 and get the highest bit
 967             // of each byte of result, if src pixels are all zero, src_cmp1 and
 968             // src_cmp2 will be 0xFFFF.
 969             int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
 970                                              _mm_setzero_si128()));
 971             int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
 972                                              _mm_setzero_si128()));
 973             if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
 974                 d++;
 975                 count -= 8;
 976                 continue;
 977             }
 978
 979             // Load 8 pixels of dst.
 980             __m128i dst_pixel = _mm_load_si128(d);
 981
 982             // Extract A from src.
 983             __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
 984             sa1 = _mm_srli_epi32(sa1, 24);
 985             __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
 986             sa2 = _mm_srli_epi32(sa2, 24);
 987             __m128i sa = _mm_packs_epi32(sa1, sa2);
 988
 989             // Extract R from src.
 990             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
 991             sr1 = _mm_srli_epi32(sr1, 24);
 992             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
 993             sr2 = _mm_srli_epi32(sr2, 24);
 994             __m128i sr = _mm_packs_epi32(sr1, sr2);
 995
 996             // Extract G from src.
 997             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
 998             sg1 = _mm_srli_epi32(sg1, 24);
 999             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1000             sg2 = _mm_srli_epi32(sg2, 24);
1001             __m128i sg = _mm_packs_epi32(sg1, sg2);
1002
1003             // Extract B from src.
1004             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1005             sb1 = _mm_srli_epi32(sb1, 24);
1006             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1007             sb2 = _mm_srli_epi32(sb2, 24);
1008             __m128i sb = _mm_packs_epi32(sb1, sb2);
1009
1010             // Extract R G B from dst.
1011             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
1012             dr = _mm_and_si128(dr, r16_mask);
1013             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
1014             dg = _mm_and_si128(dg, g16_mask);
1015             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
1016             db = _mm_and_si128(db, b16_mask);
1017
1018             __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
1019
1020             // Calculate R G B of result.
1021             // Original algorithm is in SkSrcOver32To16().
1022             dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
1023             dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
1024             dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
1025             dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
1026             db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
1027             db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
1028
1029             // Pack R G B into 16-bit color.
1030             __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
1031
1032             // Store 8 16-bit colors in dst.
1033             _mm_store_si128(d++, d_pixel);
1034             count -= 8;
1035         }
1036
1037         src = reinterpret_cast<const SkPMColor*>(s);
1038         dst = reinterpret_cast<uint16_t*>(d);
1039     }
1040
1041     if (count > 0) {
1042         do {
1043             SkPMColor c = *src++;
1044             SkPMColorAssert(c);
1045             if (c) {
1046                 *dst = SkSrcOver32To16(c, *dst);
1047             }
1048             dst += 1;
1049         } while (--count != 0);
1050     }
1051 }
1052
1053 void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
1054                                  const SkPMColor* SK_RESTRICT src,
1055                                  int count, U8CPU alpha, int x, int y) {
1056     SkASSERT(255 == alpha);
1057
1058     if (count <= 0) {
1059         return;
1060     }
1061
1062     if (count >= 8) {
1063         while (((size_t)dst & 0x0F) != 0) {
1064             DITHER_565_SCAN(y);
1065             SkPMColor c = *src++;
1066             SkPMColorAssert(c);
1067
1068             unsigned dither = DITHER_VALUE(x);
1069             *dst++ = SkDitherRGB32To565(c, dither);
1070             DITHER_INC_X(x);
1071             count--;
1072         }
1073
1074         unsigned short dither_value[8];
1075         __m128i dither;
1076 #ifdef ENABLE_DITHER_MATRIX_4X4
1077         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
1078         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
1079         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
1080         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
1081         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
1082 #else
1083         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
1084         dither_value[0] = dither_value[4] = (dither_scan
1085                                              >> (((x) & 3) << 2)) & 0xF;
1086         dither_value[1] = dither_value[5] = (dither_scan
1087                                              >> (((x + 1) & 3) << 2)) & 0xF;
1088         dither_value[2] = dither_value[6] = (dither_scan
1089                                              >> (((x + 2) & 3) << 2)) & 0xF;
1090         dither_value[3] = dither_value[7] = (dither_scan
1091                                              >> (((x + 3) & 3) << 2)) & 0xF;
1092 #endif
1093         dither = _mm_loadu_si128((__m128i*) dither_value);
1094
1095         const __m128i* s = reinterpret_cast<const __m128i*>(src);
1096         __m128i* d = reinterpret_cast<__m128i*>(dst);
1097
1098         while (count >= 8) {
1099             // Load 8 pixels of src.
1100             __m128i src_pixel1 = _mm_loadu_si128(s++);
1101             __m128i src_pixel2 = _mm_loadu_si128(s++);
1102
1103             // Extract R from src.
1104             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
1105             sr1 = _mm_srli_epi32(sr1, 24);
1106             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
1107             sr2 = _mm_srli_epi32(sr2, 24);
1108             __m128i sr = _mm_packs_epi32(sr1, sr2);
1109
1110             // SkDITHER_R32To565(sr, dither)
1111             __m128i sr_offset = _mm_srli_epi16(sr, 5);
1112             sr = _mm_add_epi16(sr, dither);
1113             sr = _mm_sub_epi16(sr, sr_offset);
1114             sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
1115
1116             // Extract G from src.
1117             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
1118             sg1 = _mm_srli_epi32(sg1, 24);
1119             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1120             sg2 = _mm_srli_epi32(sg2, 24);
1121             __m128i sg = _mm_packs_epi32(sg1, sg2);
1122
1123             // SkDITHER_R32To565(sg, dither)
1124             __m128i sg_offset = _mm_srli_epi16(sg, 6);
1125             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
1126             sg = _mm_sub_epi16(sg, sg_offset);
1127             sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
1128
1129             // Extract B from src.
1130             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1131             sb1 = _mm_srli_epi32(sb1, 24);
1132             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1133             sb2 = _mm_srli_epi32(sb2, 24);
1134             __m128i sb = _mm_packs_epi32(sb1, sb2);
1135
1136             // SkDITHER_R32To565(sb, dither)
1137             __m128i sb_offset = _mm_srli_epi16(sb, 5);
1138             sb = _mm_add_epi16(sb, dither);
1139             sb = _mm_sub_epi16(sb, sb_offset);
1140             sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
1141
1142             // Pack and store 16-bit dst pixel.
1143             __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
1144             _mm_store_si128(d++, d_pixel);
1145
1146             count -= 8;
1147             x += 8;
1148         }
1149
1150         src = reinterpret_cast<const SkPMColor*>(s);
1151         dst = reinterpret_cast<uint16_t*>(d);
1152     }
1153
1154     if (count > 0) {
1155         DITHER_565_SCAN(y);
1156         do {
1157             SkPMColor c = *src++;
1158             SkPMColorAssert(c);
1159
1160             unsigned dither = DITHER_VALUE(x);
1161             *dst++ = SkDitherRGB32To565(c, dither);
1162             DITHER_INC_X(x);
1163         } while (--count != 0);
1164     }
1165 }
1166
1167 /* SSE2 version of S32A_D565_Opaque_Dither()
1168  * portable version is in core/SkBlitRow_D16.cpp
1169  */
1170 void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
1171                                   const SkPMColor* SK_RESTRICT src,
1172                                   int count, U8CPU alpha, int x, int y) {
1173     SkASSERT(255 == alpha);
1174
1175     if (count <= 0) {
1176         return;
1177     }
1178
1179     if (count >= 8) {
1180         while (((size_t)dst & 0x0F) != 0) {
1181             DITHER_565_SCAN(y);
1182             SkPMColor c = *src++;
1183             SkPMColorAssert(c);
1184             if (c) {
1185                 unsigned a = SkGetPackedA32(c);
1186
1187                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
1188
1189                 unsigned sr = SkGetPackedR32(c);
1190                 unsigned sg = SkGetPackedG32(c);
1191                 unsigned sb = SkGetPackedB32(c);
1192                 sr = SkDITHER_R32_FOR_565(sr, d);
1193                 sg = SkDITHER_G32_FOR_565(sg, d);
1194                 sb = SkDITHER_B32_FOR_565(sb, d);
1195
1196                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1197                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1198                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1199                 // now src and dst expanded are in g:11 r:10 x:1 b:10
1200                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1201             }
1202             dst += 1;
1203             DITHER_INC_X(x);
1204             count--;
1205         }
1206
1207         unsigned short dither_value[8];
1208         __m128i dither, dither_cur;
1209 #ifdef ENABLE_DITHER_MATRIX_4X4
1210         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
1211         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
1212         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
1213         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
1214         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
1215 #else
1216         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
1217         dither_value[0] = dither_value[4] = (dither_scan
1218                                              >> (((x) & 3) << 2)) & 0xF;
1219         dither_value[1] = dither_value[5] = (dither_scan
1220                                              >> (((x + 1) & 3) << 2)) & 0xF;
1221         dither_value[2] = dither_value[6] = (dither_scan
1222                                              >> (((x + 2) & 3) << 2)) & 0xF;
1223         dither_value[3] = dither_value[7] = (dither_scan
1224                                              >> (((x + 3) & 3) << 2)) & 0xF;
1225 #endif
1226         dither = _mm_loadu_si128((__m128i*) dither_value);
1227
1228         const __m128i* s = reinterpret_cast<const __m128i*>(src);
1229         __m128i* d = reinterpret_cast<__m128i*>(dst);
1230         __m128i var256 = _mm_set1_epi16(256);
1231         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
1232         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
1233         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
1234
1235         while (count >= 8) {
1236             // Load 8 pixels of src and dst.
1237             __m128i src_pixel1 = _mm_loadu_si128(s++);
1238             __m128i src_pixel2 = _mm_loadu_si128(s++);
1239             __m128i dst_pixel = _mm_load_si128(d);
1240
1241             // Extract A from src.
1242             __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
1243             sa1 = _mm_srli_epi32(sa1, 24);
1244             __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
1245             sa2 = _mm_srli_epi32(sa2, 24);
1246             __m128i sa = _mm_packs_epi32(sa1, sa2);
1247
1248             // Calculate current dither value.
1249             dither_cur = _mm_mullo_epi16(dither,
1250                                          _mm_add_epi16(sa, _mm_set1_epi16(1)));
1251             dither_cur = _mm_srli_epi16(dither_cur, 8);
1252
1253             // Extract R from src.
1254             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
1255             sr1 = _mm_srli_epi32(sr1, 24);
1256             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
1257             sr2 = _mm_srli_epi32(sr2, 24);
1258             __m128i sr = _mm_packs_epi32(sr1, sr2);
1259
1260             // SkDITHER_R32_FOR_565(sr, d)
1261             __m128i sr_offset = _mm_srli_epi16(sr, 5);
1262             sr = _mm_add_epi16(sr, dither_cur);
1263             sr = _mm_sub_epi16(sr, sr_offset);
1264
1265             // Expand sr.
1266             sr = _mm_slli_epi16(sr, 2);
1267
1268             // Extract G from src.
1269             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
1270             sg1 = _mm_srli_epi32(sg1, 24);
1271             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1272             sg2 = _mm_srli_epi32(sg2, 24);
1273             __m128i sg = _mm_packs_epi32(sg1, sg2);
1274
1275             // sg = SkDITHER_G32_FOR_565(sg, d).
1276             __m128i sg_offset = _mm_srli_epi16(sg, 6);
1277             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
1278             sg = _mm_sub_epi16(sg, sg_offset);
1279
1280             // Expand sg.
1281             sg = _mm_slli_epi16(sg, 3);
1282
1283             // Extract B from src.
1284             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1285             sb1 = _mm_srli_epi32(sb1, 24);
1286             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1287             sb2 = _mm_srli_epi32(sb2, 24);
1288             __m128i sb = _mm_packs_epi32(sb1, sb2);
1289
1290             // sb = SkDITHER_B32_FOR_565(sb, d).
1291             __m128i sb_offset = _mm_srli_epi16(sb, 5);
1292             sb = _mm_add_epi16(sb, dither_cur);
1293             sb = _mm_sub_epi16(sb, sb_offset);
1294
1295             // Expand sb.
1296             sb = _mm_slli_epi16(sb, 2);
1297
1298             // Extract R G B from dst.
1299             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
1300             dr = _mm_and_si128(dr, r16_mask);
1301             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
1302             dg = _mm_and_si128(dg, g16_mask);
1303             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
1304             db = _mm_and_si128(db, b16_mask);
1305
1306             // SkAlpha255To256(255 - a) >> 3
1307             __m128i isa = _mm_sub_epi16(var256, sa);
1308             isa = _mm_srli_epi16(isa, 3);
1309
1310             dr = _mm_mullo_epi16(dr, isa);
1311             dr = _mm_add_epi16(dr, sr);
1312             dr = _mm_srli_epi16(dr, 5);
1313
1314             dg = _mm_mullo_epi16(dg, isa);
1315             dg = _mm_add_epi16(dg, sg);
1316             dg = _mm_srli_epi16(dg, 5);
1317
1318             db = _mm_mullo_epi16(db, isa);
1319             db = _mm_add_epi16(db, sb);
1320             db = _mm_srli_epi16(db, 5);
1321
1322             // Package and store dst pixel.
1323             __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
1324             _mm_store_si128(d++, d_pixel);
1325
1326             count -= 8;
1327             x += 8;
1328         }
1329
1330         src = reinterpret_cast<const SkPMColor*>(s);
1331         dst = reinterpret_cast<uint16_t*>(d);
1332     }
1333
1334     if (count > 0) {
1335         DITHER_565_SCAN(y);
1336         do {
1337             SkPMColor c = *src++;
1338             SkPMColorAssert(c);
1339             if (c) {
1340                 unsigned a = SkGetPackedA32(c);
1341
1342                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
1343
1344                 unsigned sr = SkGetPackedR32(c);
1345                 unsigned sg = SkGetPackedG32(c);
1346                 unsigned sb = SkGetPackedB32(c);
1347                 sr = SkDITHER_R32_FOR_565(sr, d);
1348                 sg = SkDITHER_G32_FOR_565(sg, d);
1349                 sb = SkDITHER_B32_FOR_565(sb, d);
1350
1351                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1352                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1353                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1354                 // now src and dst expanded are in g:11 r:10 x:1 b:10
1355                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1356             }
1357             dst += 1;
1358             DITHER_INC_X(x);
1359         } while (--count != 0);
1360     }
1361 }