SSE2 implementation of S32A_D565_Opaque_Dither
author: commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>
Fri, 7 Mar 2014 13:24:42 +0000 (13:24 +0000)
committer: commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>
Fri, 7 Mar 2014 13:24:42 +0000 (13:24 +0000)
When the benchmarks are run with the command-line options
"--forceDither true --forceBlend 1", almost all of the benchmarks that
exercise S32A_D565_Opaque_Dither get about a 20%-70% performance improvement.
Here are the data on an i7-3770 (the last column is the improvement, (before - after) / before):
                                                  before    after
verts                                            4314.81  3627.64  15.93%
constXTile_MM_filter_trans                       1434.22   432.82  69.82%
constXTile_CC_filter_trans_scale                 1440.17   437.00  69.66%
constXTile_RR_filter_trans                       1436.96   431.93  69.94%
constXTile_MM_trans_scale                        1436.33   435.77  69.66%
constXTile_CC_trans                              1433.12   431.36  69.90%
constXTile_RR_trans_scale                        1436.13   436.06  69.64%
constXTile_MM_filter                             1411.55   408.06  71.09%
constXTile_CC_filter_scale                       1416.68   414.18  70.76%
constXTile_RR_filter                             1429.46   409.81  71.33%
constXTile_MM_scale                              1415.00   412.56  70.84%
constXTile_CC                                    1410.32   408.36  71.04%
constXTile_RR_scale                              1413.26   413.16  70.77%
repeatTile_4444_A                                1922.01   879.03  54.27%
repeatTile_4444_A                                1430.68   818.34  42.80%
repeatTile_4444_X                                1817.43   816.63  55.07%
maskshader                                       5911.09  5895.46   0.26%
gradient_create_alpha                               4.41     4.41  -0.15%
gradient_conical_clamp_3color                   35298.71 27574.34  21.88%
gradient_conical_clamp_hicolor                  35262.15 27538.99  21.90%
gradient_conical_clamp                          35276.21 27599.80  21.76%
gradient_radial2_mirror                         20846.74 12969.39  37.79%
gradient_radial2_clamp_hicolor                  21848.12 13967.57  36.07%
gradient_radial2_clamp                          21829.95 13978.57  35.97%
bitmap_4444_A_scale_rotate_bicubic                105.31    87.13  17.26%
bitmap_4444_A_scale_bicubic                        73.69    47.76  35.20%
bitmap_4444_update_scale_rotate_bilerp            125.65    87.86  30.08%
bitmap_4444_update_volatile_scale_rotate_bilerp   125.50    87.65  30.16%
bitmap_4444_scale_rotate_bilerp                   124.46    87.91  29.37%
bitmap_4444_A_scale_rotate_bilerp                 105.09    87.27  16.96%
bitmap_4444_update_scale_bilerp                   106.78    63.28  40.74%
bitmap_4444_update_volatile_scale_bilerp          106.66    63.66  40.32%
bitmap_4444_scale_bilerp                          106.70    63.19  40.78%
bitmap_4444_A_scale_bilerp                         83.05    62.25  25.04%
bitmap_a8                                          98.11    52.76  46.22%
bitmap_a8_A                                        98.24    52.85  46.20%

BUG=
R=mtklein@google.com

Author: qiankun.miao@intel.com

Review URL: https://codereview.chromium.org/179443003

git-svn-id: http://skia.googlecode.com/svn/trunk@13699 2bbb7eff-a529-9590-31e7-b0007b416f81

src/opts/SkBlitRow_opts_SSE2.cpp
src/opts/SkBlitRow_opts_SSE2.h
src/opts/opts_check_SSE2.cpp

index fca42f3b66acbea512fb50e466761432a7b4528a..4aa08e81657a63388535b8d1a454bf129e260b0f 100644 (file)
@@ -1166,3 +1166,199 @@ void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
         } while (--count != 0);
     }
 }
+
+/* SSE2 version of S32A_D565_Opaque_Dither()
+ * portable version is in core/SkBlitRow_D16.cpp
+ *
+ * Blends a row of 32-bit SkPMColor src pixels onto a row of 16-bit 565 dst
+ * pixels, adding a dither value that is scaled by each src pixel's alpha.
+ *
+ * dst    destination row; read and rewritten in place.
+ * src    source row of count pixels.
+ * count  number of pixels; nothing is done when count <= 0.
+ * alpha  must be 255 (asserted); it is not used otherwise.
+ * x, y   pixel position used to pick the dither value. In the SSE2 path
+ *        (y & 3) selects the dither-matrix row and (x & 3) the entry in it;
+ *        the scalar paths go through DITHER_565_SCAN / DITHER_VALUE.
+ *
+ * Layout: when count >= 8, a scalar loop runs until dst is 16-byte aligned,
+ * then the SSE2 loop handles 8 pixels per iteration (aligned dst load/store,
+ * unaligned src loads). Whatever is left (fewer than 8 pixels, or the whole
+ * row when count < 8) goes through the scalar loop at the end.
+ *
+ * NOTE(review): the alignment loop does not re-check count. It presumably
+ * relies on dst being 2-byte aligned so that it stops within 7 pixels;
+ * confirm that callers guarantee this.
+ */
+void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
+                                  const SkPMColor* SK_RESTRICT src,
+                                  int count, U8CPU alpha, int x, int y) {
+    SkASSERT(255 == alpha);
+
+    if (count <= 0) {
+        return;
+    }
+
+    if (count >= 8) {
+        while (((size_t)dst & 0x0F) != 0) {  // scalar head: run until dst is 16-byte aligned
+            DITHER_565_SCAN(y);
+            SkPMColor c = *src++;
+            SkPMColorAssert(c);
+            if (c) {  // c == 0 leaves *dst untouched
+                unsigned a = SkGetPackedA32(c);
+
+                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
+
+                unsigned sr = SkGetPackedR32(c);
+                unsigned sg = SkGetPackedG32(c);
+                unsigned sb = SkGetPackedB32(c);
+                sr = SkDITHER_R32_FOR_565(sr, d);
+                sg = SkDITHER_G32_FOR_565(sg, d);
+                sb = SkDITHER_B32_FOR_565(sb, d);
+
+                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
+                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
+                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
+                // now src and dst expanded are in g:11 r:10 x:1 b:10
+                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
+            }
+            dst += 1;
+            DITHER_INC_X(x);
+            count--;
+        }
+
+        unsigned short dither_value[8];  // 4 dither entries, stored twice (lanes 0-3 and 4-7)
+        __m128i dither, dither_cur;
+#ifdef ENABLE_DITHER_MATRIX_4X4
+        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
+        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
+        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
+        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
+        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
+#else
+        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];  // four 4-bit entries packed in 16 bits
+        dither_value[0] = dither_value[4] = (dither_scan
+                                             >> (((x) & 3) << 2)) & 0xF;
+        dither_value[1] = dither_value[5] = (dither_scan
+                                             >> (((x + 1) & 3) << 2)) & 0xF;
+        dither_value[2] = dither_value[6] = (dither_scan
+                                             >> (((x + 2) & 3) << 2)) & 0xF;
+        dither_value[3] = dither_value[7] = (dither_scan
+                                             >> (((x + 3) & 3) << 2)) & 0xF;
+#endif
+        dither = _mm_loadu_si128((__m128i*) dither_value);
+
+        const __m128i* s = reinterpret_cast<const __m128i*>(src);
+        __m128i* d = reinterpret_cast<__m128i*>(dst);
+        __m128i var256 = _mm_set1_epi16(256);
+        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
+        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
+        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
+
+        while (count >= 8) {
+            // Load 8 pixels: src with two unaligned loads, dst with one aligned load.
+            __m128i src_pixel1 = _mm_loadu_si128(s++);
+            __m128i src_pixel2 = _mm_loadu_si128(s++);
+            __m128i dst_pixel = _mm_load_si128(d);
+
+            // Extract A from src: move it to the top byte, shift back down, pack to eight 16-bit lanes.
+            __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT));
+            sa1 = _mm_srli_epi32(sa1, 24);
+            __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT));
+            sa2 = _mm_srli_epi32(sa2, 24);
+            __m128i sa = _mm_packs_epi32(sa1, sa2);
+
+            // Per-pixel dither: (dither * (sa + 1)) >> 8; intended to mirror the scalar SkAlphaMul() line.
+            dither_cur = _mm_mullo_epi16(dither,
+                                         _mm_add_epi16(sa, _mm_set1_epi16(1)));
+            dither_cur = _mm_srli_epi16(dither_cur, 8);
+
+            // Extract R from src.
+            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
+            sr1 = _mm_srli_epi32(sr1, 24);
+            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
+            sr2 = _mm_srli_epi32(sr2, 24);
+            __m128i sr = _mm_packs_epi32(sr1, sr2);
+
+            // SkDITHER_R32_FOR_565(sr, d), computed here as sr + dither_cur - (sr >> 5).
+            __m128i sr_offset = _mm_srli_epi16(sr, 5);
+            sr = _mm_add_epi16(sr, dither_cur);
+            sr = _mm_sub_epi16(sr, sr_offset);
+
+            // Expand sr (<< 2) so it can be added to dr * isa before the final >> 5.
+            sr = _mm_slli_epi16(sr, 2);
+
+            // Extract G from src.
+            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
+            sg1 = _mm_srli_epi32(sg1, 24);
+            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
+            sg2 = _mm_srli_epi32(sg2, 24);
+            __m128i sg = _mm_packs_epi32(sg1, sg2);
+
+            // sg = SkDITHER_G32_FOR_565(sg, d), computed here as sg + (dither_cur >> 1) - (sg >> 6).
+            __m128i sg_offset = _mm_srli_epi16(sg, 6);
+            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
+            sg = _mm_sub_epi16(sg, sg_offset);
+
+            // Expand sg (<< 3) so it can be added to dg * isa before the final >> 5.
+            sg = _mm_slli_epi16(sg, 3);
+
+            // Extract B from src.
+            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
+            sb1 = _mm_srli_epi32(sb1, 24);
+            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
+            sb2 = _mm_srli_epi32(sb2, 24);
+            __m128i sb = _mm_packs_epi32(sb1, sb2);
+
+            // sb = SkDITHER_B32_FOR_565(sb, d), computed here as sb + dither_cur - (sb >> 5).
+            __m128i sb_offset = _mm_srli_epi16(sb, 5);
+            sb = _mm_add_epi16(sb, dither_cur);
+            sb = _mm_sub_epi16(sb, sb_offset);
+
+            // Expand sb (<< 2) so it can be added to db * isa before the final >> 5.
+            sb = _mm_slli_epi16(sb, 2);
+
+            // Extract R G B from dst, one channel per 16-bit lane.
+            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
+            dr = _mm_and_si128(dr, r16_mask);
+            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
+            dg = _mm_and_si128(dg, g16_mask);
+            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
+            db = _mm_and_si128(db, b16_mask);
+
+            // isa = (256 - sa) >> 3, i.e. SkAlpha255To256(255 - a) >> 3 from the scalar path.
+            __m128i isa = _mm_sub_epi16(var256, sa);
+            isa = _mm_srli_epi16(isa, 3);
+
+            dr = _mm_mullo_epi16(dr, isa);
+            dr = _mm_add_epi16(dr, sr);
+            dr = _mm_srli_epi16(dr, 5);
+
+            dg = _mm_mullo_epi16(dg, isa);
+            dg = _mm_add_epi16(dg, sg);
+            dg = _mm_srli_epi16(dg, 5);
+
+            db = _mm_mullo_epi16(db, isa);
+            db = _mm_add_epi16(db, sb);
+            db = _mm_srli_epi16(db, 5);
+
+            // Repack to 565 and write 8 pixels with an aligned store (dst was aligned by the head loop).
+            __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db);
+            _mm_store_si128(d++, d_pixel);
+
+            count -= 8;
+            x += 8;  // multiple of 4: (x & 3) is unchanged, so `dither` stays valid
+        }
+
+        src = reinterpret_cast<const SkPMColor*>(s);
+        dst = reinterpret_cast<uint16_t*>(d);
+    }
+
+    if (count > 0) {  // scalar tail: pixels the SSE2 loop did not cover
+        DITHER_565_SCAN(y);
+        do {
+            SkPMColor c = *src++;
+            SkPMColorAssert(c);
+            if (c) {  // c == 0 leaves *dst untouched
+                unsigned a = SkGetPackedA32(c);
+
+                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
+
+                unsigned sr = SkGetPackedR32(c);
+                unsigned sg = SkGetPackedG32(c);
+                unsigned sb = SkGetPackedB32(c);
+                sr = SkDITHER_R32_FOR_565(sr, d);
+                sg = SkDITHER_G32_FOR_565(sg, d);
+                sb = SkDITHER_B32_FOR_565(sb, d);
+
+                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
+                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
+                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
+                // now src and dst expanded are in g:11 r:10 x:1 b:10
+                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
+            }
+            dst += 1;
+            DITHER_INC_X(x);
+        } while (--count != 0);
+    }
+}
index 42f3356350d06316080ba6fe7a4acf83adf3416c..fcf82d08e578c09523beafc47b92d2be940a255d 100644 (file)
@@ -38,3 +38,6 @@ void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
 void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha, int x, int y);
+void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
+                                  const SkPMColor* SK_RESTRICT src,
+                                  int count, U8CPU alpha, int x, int y);
index d0dd6ece95542167ec6fb086eb05d113cbfb1e12..b63a2ffa0fe623b264db60eb9ff1fda8caba72e0 100644 (file)
@@ -172,7 +172,7 @@ static SkBlitRow::Proc platform_16_procs[] = {
     NULL,                               // S32A_D565_Blend
     S32_D565_Opaque_Dither_SSE2,        // S32_D565_Opaque_Dither
     NULL,                               // S32_D565_Blend_Dither
-    NULL,                               // S32A_D565_Opaque_Dither
+    S32A_D565_Opaque_Dither_SSE2,       // S32A_D565_Opaque_Dither
     NULL,                               // S32A_D565_Blend_Dither
 };