Add SSSE3 acceleration for S32_D16_filter_DXDY
author qiankun.miao <qiankun.miao@intel.com>
Thu, 4 Dec 2014 14:27:03 +0000 (06:27 -0800)
committer Commit bot <commit-bot@chromium.org>
Thu, 4 Dec 2014 14:27:03 +0000 (06:27 -0800)
With this CL, the related nanobench results improve for the 565 config:
                bitmap_BGRA_8888_scale_rotate_bilerp     115us -> 70.5us       0.61x
bitmap_BGRA_8888_update_volatile_scale_rotate_bilerp     115us -> 70.5us       0.61x
         bitmap_BGRA_8888_update_scale_rotate_bilerp     112us ->   68us       0.6x

BUG=skia:

Committed: https://skia.googlesource.com/skia/+/45a05780867a06b9f8a8d5240cf6c5d5a2c15a35

Review URL: https://codereview.chromium.org/773753002

src/core/SkBitmapProcState.h
src/opts/SkBitmapProcState_opts_SSSE3.cpp
src/opts/SkBitmapProcState_opts_SSSE3.h
src/opts/opts_check_x86.cpp

index dd1f0bf..add5bf4 100644 (file)
@@ -204,6 +204,8 @@ void ClampX_ClampY_nofilter_affine(const SkBitmapProcState& s,
                                    uint32_t xy[], int count, int x, int y);
 void S32_D16_filter_DX(const SkBitmapProcState& s,
                        const uint32_t* xy, int count, uint16_t* colors);
+void S32_D16_filter_DXDY(const SkBitmapProcState& s,
+                         const uint32_t* xy, int count, uint16_t* colors);
 
 void highQualityFilter32(const SkBitmapProcState &s, int x, int y,
                          SkPMColor *SK_RESTRICT colors, int count);
index 165f1f5..99bc192 100644 (file)
@@ -6,6 +6,7 @@
  */
 
 #include "SkBitmapProcState_opts_SSSE3.h"
+#include "SkColorPriv.h"
 #include "SkPaint.h"
 #include "SkUtils.h"
 
@@ -720,17 +721,28 @@ void S32_alpha_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
 }
 
 void S32_opaque_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
-                                    const uint32_t* xy,
-                                    int count, uint32_t* colors) {
+                                      const uint32_t* xy,
+                                      int count, uint32_t* colors) {
     S32_generic_D32_filter_DXDY_SSSE3<false>(s, xy, count, colors);
 }
 
 void S32_alpha_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
-                                   const uint32_t* xy,
-                                   int count, uint32_t* colors) {
+                                     const uint32_t* xy,
+                                     int count, uint32_t* colors) {
     S32_generic_D32_filter_DXDY_SSSE3<true>(s, xy, count, colors);
 }
 
+void S32_D16_filter_DXDY_SSSE3(const SkBitmapProcState& s,
+                               const uint32_t* xy,
+                               int count, uint16_t* colors) {
+    SkASSERT(64 >= count);
+    SkAutoSTMalloc<64, uint32_t> colors32(count);
+    S32_generic_D32_filter_DXDY_SSSE3<false>(s, xy, count, colors32);
+    for(int i = 0; i < count; i++) {
+        *colors++ = SkPixel32ToPixel16(colors32[i]);
+    }
+}
+
 #else // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
 
 void S32_opaque_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
@@ -746,14 +758,20 @@ void S32_alpha_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
 }
 
 void S32_opaque_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
-                                    const uint32_t* xy,
-                                    int count, uint32_t* colors) {
+                                      const uint32_t* xy,
+                                      int count, uint32_t* colors) {
     sk_throw();
 }
 
 void S32_alpha_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
-                                   const uint32_t* xy,
-                                   int count, uint32_t* colors) {
+                                     const uint32_t* xy,
+                                     int count, uint32_t* colors) {
+    sk_throw();
+}
+
+void S32_D16_filter_DXDY_SSSE3(const SkBitmapProcState& s,
+                               const uint32_t* xy,
+                               int count, uint16_t* colors) {
     sk_throw();
 }
 
index 9fd074a..74504d8 100644 (file)
@@ -23,4 +23,8 @@ void S32_alpha_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
                                    const uint32_t* xy,
                                    int count, uint32_t* colors);
 
+void S32_D16_filter_DXDY_SSSE3(const SkBitmapProcState& s,
+                               const uint32_t* xy,
+                               int count, uint16_t* colors);
+
 #endif
index 8fec2ba..34aae92 100644 (file)
@@ -149,26 +149,27 @@ void SkBitmapProcState::platformProcs() {
     if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
         return;
     }
+    const bool ssse3 = supports_simd(SK_CPU_SSE_LEVEL_SSSE3);
 
     /* Check fSampleProc32 */
     if (fSampleProc32 == S32_opaque_D32_filter_DX) {
-        if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
+        if (ssse3) {
             fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3;
         } else {
             fSampleProc32 = S32_opaque_D32_filter_DX_SSE2;
         }
     } else if (fSampleProc32 == S32_opaque_D32_filter_DXDY) {
-        if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
+        if (ssse3) {
             fSampleProc32 = S32_opaque_D32_filter_DXDY_SSSE3;
         }
     } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
-        if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
+        if (ssse3) {
             fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
         } else {
             fSampleProc32 = S32_alpha_D32_filter_DX_SSE2;
         }
     } else if (fSampleProc32 == S32_alpha_D32_filter_DXDY) {
-        if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
+        if (ssse3) {
             fSampleProc32 = S32_alpha_D32_filter_DXDY_SSSE3;
         }
     }
@@ -176,6 +177,8 @@ void SkBitmapProcState::platformProcs() {
     /* Check fSampleProc16 */
     if (fSampleProc16 == S32_D16_filter_DX) {
         fSampleProc16 = S32_D16_filter_DX_SSE2;
+    } else if (ssse3 && fSampleProc16 == S32_D16_filter_DXDY) {
+        fSampleProc16 = S32_D16_filter_DXDY_SSSE3;
     }
 
     /* Check fMatrixProc */