SSE2 implementation of S32_D565_Opaque
author commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>
Mon, 24 Feb 2014 04:23:39 +0000 (04:23 +0000)
committer commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>
Mon, 24 Feb 2014 04:23:39 +0000 (04:23 +0000)
Benchmarks hitting this path can benefit from this patch.
Here are the data:
                                    before      after
        gradient_radial2_mirror   10885.52   10849.48   0.33%
 gradient_radial2_clamp_hicolor   11819.69   11644.83   1.48%
         gradient_radial2_clamp   11816.10   11649.91   1.41%
     bitmaprect_FF_filter_trans       6.27       4.88  22.17%
   bitmaprect_FF_nofilter_trans       6.27       4.88  22.17%
  bitmaprect_FF_filter_identity       6.31       4.86  22.98%
bitmaprect_FF_nofilter_identity       6.25       4.86  22.24%
             bitmap_4444_update       6.26       5.05  19.33%
    bitmap_4444_update_volatile       6.21       5.06  18.52%
                    bitmap_4444       6.22       5.06  18.65%

BUG=
R=mtklein@google.com

Author: qiankun.miao@intel.com

Review URL: https://codereview.chromium.org/172083003

git-svn-id: http://skia.googlecode.com/svn/trunk@13556 2bbb7eff-a529-9590-31e7-b0007b416f81

src/opts/SkBlitRow_opts_SSE2.cpp
src/opts/SkBlitRow_opts_SSE2.h
src/opts/opts_check_SSE2.cpp

index 47651c4..9e99b4b 100644 (file)
@@ -853,6 +853,83 @@ void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
     }
 }
 
+/* SSE2 version of S32_D565_Opaque(): converts count 32-bit SkPMColor pixels from src into
+ * 16-bit pixels in dst (alpha must be 255; x/y unused). Portable version is in core/SkBlitRow_D16.cpp
+ */
+void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
+                          const SkPMColor* SK_RESTRICT src, int count,
+                          U8CPU alpha, int /*x*/, int /*y*/) {
+    SkASSERT(255 == alpha);  // this proc handles only the fully opaque case
+
+    if (count <= 0) {  // nothing to convert
+        return;
+    }
+
+    if (count >= 8) {  // vector path is attempted only when at least 8 pixels remain
+        while (((size_t)dst & 0x0F) != 0) {  // scalar-convert until dst is 16-byte aligned for the aligned store below
+            SkPMColor c = *src++;
+            SkPMColorAssert(c);
+
+            *dst++ = SkPixel32ToPixel16_ToU16(c);
+            count--;
+        }
+
+        const __m128i* s = reinterpret_cast<const __m128i*>(src);  // src is read with unaligned loads, so only dst was aligned
+        __m128i* d = reinterpret_cast<__m128i*>(dst);
+        __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);  // per-channel masks replicated into every 32-bit lane
+        __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);
+        __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);
+
+        while (count >= 8) {  // may run zero times if the alignment loop left fewer than 8 pixels
+            // Load 8 pixels of src as two unaligned 128-bit vectors of four 32-bit pixels each.
+            __m128i src_pixel1 = _mm_loadu_si128(s++);
+            __m128i src_pixel2 = _mm_loadu_si128(s++);
+
+            // Calculate result r: shift each lane so the top SK_R16_BITS bits of red land at bit 0, then mask.
+            __m128i r1 = _mm_srli_epi32(src_pixel1,
+                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
+            r1 = _mm_and_si128(r1, r16_mask);  // drop bits belonging to the other channels
+            __m128i r2 = _mm_srli_epi32(src_pixel2,
+                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
+            r2 = _mm_and_si128(r2, r16_mask);
+            __m128i r = _mm_packs_epi32(r1, r2);  // narrow the eight 32-bit lanes to eight 16-bit lanes
+
+            // Calculate result g: same shift/mask/narrow sequence using the green shift, bit count and mask.
+            __m128i g1 = _mm_srli_epi32(src_pixel1,
+                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
+            g1 = _mm_and_si128(g1, g16_mask);
+            __m128i g2 = _mm_srli_epi32(src_pixel2,
+                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
+            g2 = _mm_and_si128(g2, g16_mask);
+            __m128i g = _mm_packs_epi32(g1, g2);
+
+            // Calculate result b: same shift/mask/narrow sequence using the blue shift, bit count and mask.
+            __m128i b1 = _mm_srli_epi32(src_pixel1,
+                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
+            b1 = _mm_and_si128(b1, b16_mask);
+            __m128i b2 = _mm_srli_epi32(src_pixel2,
+                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
+            b2 = _mm_and_si128(b2, b16_mask);
+            __m128i b = _mm_packs_epi32(b1, b2);
+
+            // Store 8 16-bit colors in dst (SkPackRGB16_SSE is defined elsewhere; presumably merges r/g/b into 565 layout).
+            __m128i d_pixel = SkPackRGB16_SSE(r, g, b);
+            _mm_store_si128(d++, d_pixel);  // aligned store; relies on the alignment loop above
+            count -= 8;
+        }
+        src = reinterpret_cast<const SkPMColor*>(s);  // hand the advanced positions back to the scalar pointers
+        dst = reinterpret_cast<uint16_t*>(d);
+    }
+
+    if (count > 0) {  // scalar tail, or the whole row when fewer than 8 pixels were requested
+        do {
+            SkPMColor c = *src++;
+            SkPMColorAssert(c);
+            *dst++ = SkPixel32ToPixel16_ToU16(c);
+        } while (--count != 0);
+    }
+}
+
 /* SSE2 version of S32A_D565_Opaque()
  * portable version is in core/SkBlitRow_D16.cpp
  */
index 66bc95a..03e6a94 100644 (file)
@@ -29,6 +29,9 @@ void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],
                                SkColor color, int width, SkPMColor opaqueDst);
 
+void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
+                          const SkPMColor* SK_RESTRICT src, int count,
+                          U8CPU alpha, int /*x*/, int /*y*/);  // SSE2 32-bit -> 16-bit row conversion; asserts alpha == 255
 void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
                            const SkPMColor* SK_RESTRICT src,
                            int count, U8CPU alpha, int /*x*/, int /*y*/);
index cc5adf1..c60c008 100644 (file)
@@ -166,7 +166,7 @@ void SkBitmapProcState::platformProcs() {
 }
 
 static SkBlitRow::Proc platform_16_procs[] = {
-    NULL,                               // S32_D565_Opaque
+    S32_D565_Opaque_SSE2,               // S32_D565_Opaque
     NULL,                               // S32_D565_Blend
     S32A_D565_Opaque_SSE2,              // S32A_D565_Opaque
     NULL,                               // S32A_D565_Blend