SSE2 version of ClampX_ClampY_{no}filter_scale; yields 10-20% speedup in

author tomhudson@google.com <tomhudson@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>

Wed, 22 Feb 2012 18:30:43 +0000 (18:30 +0000)

committer tomhudson@google.com <tomhudson@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>

Wed, 22 Feb 2012 18:30:43 +0000 (18:30 +0000)
author tomhudson@google.com <tomhudson@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>
Wed, 22 Feb 2012 18:30:43 +0000 (18:30 +0000)
committer tomhudson@google.com <tomhudson@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>
Wed, 22 Feb 2012 18:30:43 +0000 (18:30 +0000)
diff --git a/src/core/SkBitmapProcState.h b/src/core/SkBitmapProcState.h

index 98c8782..fb4957e 100644 (file)
--- a/src/core/SkBitmapProcState.h
+++ b/src/core/SkBitmapProcState.h
@@ -136,5 +136,9 @@ void S32_opaque_D32_filter_DX(const SkBitmapProcState& s, const uint32_t xy[],
                                int count, SkPMColor colors[]);
  void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, const uint32_t xy[],
                               int count, SkPMColor colors[]);
+void ClampX_ClampY_filter_scale(const SkBitmapProcState& s, uint32_t xy[],
+                                int count, int x, int y);  // filtering matrix proc; platformProcs() compares fMatrixProc against it
+void ClampX_ClampY_nofilter_scale(const SkBitmapProcState& s, uint32_t xy[],
+                                  int count, int x, int y);  // non-filtering matrix proc; platformProcs() compares fMatrixProc against it
  
  #endif
diff --git a/src/opts/SkBitmapProcState_opts_SSE2.cpp b/src/opts/SkBitmapProcState_opts_SSE2.cpp

index 9a0a013..10abd17 100644 (file)
--- a/src/opts/SkBitmapProcState_opts_SSE2.cpp
+++ b/src/opts/SkBitmapProcState_opts_SSE2.cpp
@@ -232,3 +232,254 @@ void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
          *colors++ = _mm_cvtsi128_si32(sum);
      } while (--count > 0);
  }
+
+// Packs one filtered coordinate into a 32-bit word:
+//   bits 18 and up : integer part of f, clamped to [0, max]
+//   bits 14..17    : the four fraction bits of f just below the integer part
+//   low bits       : integer part of (f + one), clamped to [0, max]
+// No masking is applied to the low field.
+static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
+                                                 SkFixed one) {
+    const unsigned first = SkClampMax(f >> 16, max);
+    const unsigned subpixel = (f >> 12) & 0xF;
+    const unsigned second = SkClampMax((f + one) >> 16, max);
+    return (((first << 4) | subpixel) << 14) | second;
+}
+
+/*  SSE2 version of ClampX_ClampY_filter_scale()
+ *  portable version is in core/SkBitmapProcState_matrix.h
+ *
+ *  Output layout: xy[0] receives the packed pair of Y coordinates, followed
+ *  by `count` packed X entries, one 32-bit word each (same packing as
+ *  ClampX_ClampY_pack_filter()).
+ *
+ *  s     - proc state; only translate/scale inverse matrices are accepted
+ *          (asserted below)
+ *  xy    - output buffer; count + 1 32-bit words are written
+ *  count - number of X entries to generate
+ *  x, y  - device coordinate of the first sample; the pixel center
+ *          (x + 0.5, y + 0.5) is mapped through the inverse matrix
+ */
+void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
+                                     int count, int x, int y) {
+    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+                             SkMatrix::kScale_Mask)) == 0);
+    SkASSERT(s.fInvKy == 0);
+
+    const unsigned maxX = s.fBitmap->width() - 1;
+    const SkFixed one = s.fFilterOneX;
+    const SkFixed dx = s.fInvSx;
+    SkFixed fx;
+
+    SkPoint pt;
+    s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
+                                SkIntToScalar(y) + SK_ScalarHalf, &pt);
+    const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
+    const unsigned maxY = s.fBitmap->height() - 1;
+    // compute our two Y values up front
+    *xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);
+    // now initialize fx
+    fx = SkScalarToFixed(pt.fX) - (one >> 1);
+
+    // test if we don't need to apply the tile proc
+    if (dx > 0 && (unsigned)(fx >> 16) <= maxX &&
+        (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) {
+        if (count >= 4) {
+            // SSE version of decal_filter_scale
+            // Emit scalar entries until xy is 16-byte aligned, which
+            // _mm_store_si128 requires.
+            while ((size_t(xy) & 0x0F) != 0) {
+                SkASSERT((fx >> (16 + 14)) == 0);
+                *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
+                fx += dx;
+                count--;
+            }
+
+            __m128i wide_1    = _mm_set1_epi32(1);
+            __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
+            // lanes hold fx, fx + dx, fx + 2dx, fx + 3dx (lowest lane first)
+            __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
+                                              fx + dx, fx);
+
+            while (count >= 4) {
+                __m128i wide_out;
+
+                // (fx >> 12 << 14) | ((fx >> 16) + 1), four lanes at a time
+                wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);
+                wide_out = _mm_or_si128(wide_out, _mm_add_epi32(
+                                        _mm_srai_epi32(wide_fx, 16), wide_1));
+
+                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);
+
+                xy += 4;
+                // keep the scalar fx in step for the tail loop below
+                fx += dx * 4;
+                wide_fx  = _mm_add_epi32(wide_fx, wide_dx4);
+                count -= 4;
+            } // while count >= 4
+        } // if count >= 4
+
+        // scalar tail (and the whole job when count < 4)
+        while (count-- > 0) {
+            SkASSERT((fx >> (16 + 14)) == 0);
+            *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
+            fx += dx;
+        }
+    } else {
+        // _mm_max_epi16 / _mm_min_epi16 compare *signed* 16-bit lanes, so the
+        // vector clamp is only correct while maxX (the bitmap's width - 1)
+        // fits in a signed 16-bit integer. With a larger maxX the low lane of
+        // wide_maxX would look negative and every index would clamp to it.
+        // Such wide bitmaps should be rare; they take the scalar loop below.
+        if ((count >= 4) && (maxX <= 0x7FFF)) {
+            // scalar entries until xy is 16-byte aligned for _mm_store_si128
+            while (((size_t)xy & 0x0F) != 0) {
+                *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
+                fx += dx;
+                count--;
+            }
+
+            __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
+                                              fx + dx, fx);
+            __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
+            __m128i wide_one  = _mm_set1_epi32(one);
+            __m128i wide_maxX = _mm_set1_epi32(maxX);
+            __m128i wide_mask = _mm_set1_epi32(0xF);
+
+            while (count >= 4) {
+                __m128i wide_i;
+                __m128i wide_lo;
+                __m128i wide_fx1;
+
+                // i = SkClampMax(f>>16,maxX)
+                // After the logical shift the low 16-bit lane holds the
+                // integer part of fx; read as signed it is negative exactly
+                // when fx is, so max-with-zero clamps negatives to 0.
+                wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
+                                       _mm_setzero_si128());
+                wide_i = _mm_min_epi16(wide_i, wide_maxX);
+
+                // i<<4 | TILEX_LOW_BITS(fx)
+                wide_lo = _mm_srli_epi32(wide_fx, 12);
+                wide_lo = _mm_and_si128(wide_lo, wide_mask);
+                wide_i  = _mm_slli_epi32(wide_i, 4);
+                wide_i  = _mm_or_si128(wide_i, wide_lo);
+
+                // i<<14
+                wide_i = _mm_slli_epi32(wide_i, 14);
+
+                // SkClampMax(((f+one))>>16,max)
+                wide_fx1 = _mm_add_epi32(wide_fx, wide_one);
+                wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16),
+                                                        _mm_setzero_si128());
+                wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);
+
+                // final combination
+                wide_i = _mm_or_si128(wide_i, wide_fx1);
+                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
+
+                wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
+                // keep the scalar fx in step for the tail loop below
+                fx += dx * 4;
+                xy += 4;
+                count -= 4;
+            } // while count >= 4
+        } // if count >= 4
+
+        // scalar tail, also the full path when the vector loop was skipped
+        while (count-- > 0) {
+            *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
+            fx += dx;
+        }
+    }
+}
+
+/*  SSE2 version of ClampX_ClampY_nofilter_scale()
+ *  portable version is in core/SkBitmapProcState_matrix.h
+ *
+ *  Output layout: xy[0] receives the clamped Y coordinate as a 32-bit word,
+ *  followed by `count` X coordinates stored as consecutive 16-bit values.
+ *
+ *  s     - proc state; only translate/scale inverse matrices are accepted
+ *          (asserted below)
+ *  xy    - output buffer
+ *  count - number of X coordinates to generate
+ *  x, y  - device coordinate of the first sample; the pixel center
+ *          (x + 0.5, y + 0.5) is mapped through the inverse matrix
+ */
+void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
+                                    uint32_t xy[], int count, int x, int y) {
+    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+                             SkMatrix::kScale_Mask)) == 0);
+
+    // we store y, x, x, x, x, x
+    const unsigned maxX = s.fBitmap->width() - 1;
+    SkFixed fx;
+    SkPoint pt;
+    s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
+                                SkIntToScalar(y) + SK_ScalarHalf, &pt);
+    fx = SkScalarToFixed(pt.fY);
+    const unsigned maxY = s.fBitmap->height() - 1;
+    *xy++ = SkClampMax(fx >> 16, maxY);
+    fx = SkScalarToFixed(pt.fX);
+
+    if (0 == maxX) {
+        // all of the following X values must be 0
+        memset(xy, 0, count * sizeof(uint16_t));
+        return;
+    }
+
+    const SkFixed dx = s.fInvSx;
+
+    // test if we don't need to apply the tile proc
+    if ((unsigned)(fx >> 16) <= maxX &&
+        (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
+        // SSE version of decal_nofilter_scale
+        if (count >= 8) {
+            // Emit pairs of 16-bit coordinates (one 32-bit word per pair)
+            // until xy is 16-byte aligned for _mm_store_si128.
+            while (((size_t)xy & 0x0F) != 0) {
+                *xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
+                fx += 2 * dx;
+                count -= 2;
+            }
+
+            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
+            __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
+
+            // wide_low covers samples 0..3, wide_high samples 4..7
+            __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
+                                             fx + dx, fx);
+            __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
+
+            while (count >= 8) {
+                __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
+                __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
+
+                // _mm_packs_epi32 saturates to signed 16 bits; the integer
+                // part of a non-negative SkFixed never exceeds 0x7FFF, so
+                // nothing is altered by the pack on this path.
+                __m128i wide_result = _mm_packs_epi32(wide_out_low,
+                                                      wide_out_high);
+                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
+
+                wide_low = _mm_add_epi32(wide_low, wide_dx8);
+                wide_high = _mm_add_epi32(wide_high, wide_dx8);
+
+                // eight 16-bit coordinates == four 32-bit words
+                xy += 4;
+                fx += dx * 8;
+                count -= 8;
+            }
+        } // if count >= 8
+
+        // scalar tail, written one 16-bit coordinate at a time
+        uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
+        while (count-- > 0) {
+            *xx++ = SkToU16(fx >> 16);
+            fx += dx;
+        }
+    } else {
+        // _mm_max_epi16 / _mm_min_epi16 compare *signed* 16-bit lanes and
+        // _mm_packs_epi32 saturates to signed 16 bits, so the vector clamp is
+        // only correct while maxX (the bitmap's width - 1) fits in a signed
+        // 16-bit integer. Wider bitmaps should be rare; they take the scalar
+        // loop below.
+        if ((count >= 8) && (maxX <= 0x7FFF)) {
+            // Emit clamped pairs until xy is 16-byte aligned. The pair has to
+            // be packed as two separate 16-bit fields, first sample first,
+            // exactly like the decal path above; OR-ing the two values
+            // together without a shift would corrupt both coordinates.
+            while (((size_t)xy & 0x0F) != 0) {
+                *xy++ = pack_two_shorts(SkClampMax(fx >> 16, maxX),
+                                        SkClampMax((fx + dx) >> 16, maxX));
+                fx += 2 * dx;
+                count -= 2;
+            }
+
+            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
+            __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
+
+            // wide_low covers samples 0..3, wide_high samples 4..7
+            __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
+                                             fx + dx, fx);
+            __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
+            __m128i wide_maxX = _mm_set1_epi32(maxX);
+
+            while (count >= 8) {
+                __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
+                __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
+
+                // SkClampMax(fx >> 16, maxX): the low 16-bit lane is negative
+                // exactly when fx is, so max-with-zero clamps it to 0.
+                wide_out_low  = _mm_max_epi16(wide_out_low,
+                                              _mm_setzero_si128());
+                wide_out_low  = _mm_min_epi16(wide_out_low, wide_maxX);
+                wide_out_high = _mm_max_epi16(wide_out_high,
+                                              _mm_setzero_si128());
+                wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);
+
+                __m128i wide_result = _mm_packs_epi32(wide_out_low,
+                                                      wide_out_high);
+                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
+
+                wide_low  = _mm_add_epi32(wide_low, wide_dx8);
+                wide_high = _mm_add_epi32(wide_high, wide_dx8);
+
+                // eight 16-bit coordinates == four 32-bit words
+                xy += 4;
+                fx += dx * 8;
+                count -= 8;
+            }
+        } // if count >= 8
+
+        // scalar tail, also the full path when the vector loop was skipped
+        uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
+        while (count-- > 0) {
+            *xx++ = SkClampMax(fx >> 16, maxX);
+            fx += dx;
+        }
+    }
+}
diff --git a/src/opts/SkBitmapProcState_opts_SSE2.h b/src/opts/SkBitmapProcState_opts_SSE2.h

index 9e56642..0f276b8 100644 (file)
--- a/src/opts/SkBitmapProcState_opts_SSE2.h
+++ b/src/opts/SkBitmapProcState_opts_SSE2.h
@@ -17,3 +17,7 @@ void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
                                    int count, uint32_t* colors);
  void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
                    SkPMColor color);
+void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
+                                     int count, int x, int y);  // SSE2 version of ClampX_ClampY_filter_scale
+void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
+                                       uint32_t xy[], int count, int x, int y);  // SSE2 version of ClampX_ClampY_nofilter_scale
diff --git a/src/opts/opts_check_SSE2.cpp b/src/opts/opts_check_SSE2.cpp

index 3003d78..db5e4e8 100644 (file)
--- a/src/opts/opts_check_SSE2.cpp
+++ b/src/opts/opts_check_SSE2.cpp
@@ -85,19 +85,27 @@ static bool cachedHasSSSE3() {
  }
  
  void SkBitmapProcState::platformProcs() {
-  if (cachedHasSSSE3()) {
-      if (fSampleProc32 == S32_opaque_D32_filter_DX) {
-          fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3;
-      } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
-          fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
-      }
-  } else if (cachedHasSSE2()) {
+    if (cachedHasSSSE3()) {  // sample procs: SSSE3 variants are checked first
+        if (fSampleProc32 == S32_opaque_D32_filter_DX) {
+            fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3;
+        } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
+            fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
+        }
+    } else if (cachedHasSSE2()) {  // otherwise use the SSE2 sample procs
          if (fSampleProc32 == S32_opaque_D32_filter_DX) {
              fSampleProc32 = S32_opaque_D32_filter_DX_SSE2;
          } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
              fSampleProc32 = S32_alpha_D32_filter_DX_SSE2;
          }
      }
+
+    if (cachedHasSSSE3() || cachedHasSSE2()) {  // matrix procs: the _SSE2 variants are installed in either case
+        if (fMatrixProc == ClampX_ClampY_filter_scale) {  // only the two clamp/scale procs are replaced
+            fMatrixProc = ClampX_ClampY_filter_scale_SSE2;
+        } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) {
+            fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2;
+        }
+    }
  }
  
  static SkBlitRow::Proc32 platform_32_procs[] = {
author	tomhudson@google.com <tomhudson@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>
	Wed, 22 Feb 2012 18:30:43 +0000 (18:30 +0000)
committer	tomhudson@google.com <tomhudson@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>
	Wed, 22 Feb 2012 18:30:43 +0000 (18:30 +0000)
src/core/SkBitmapProcState.h		patch \| blob \| history
src/opts/SkBitmapProcState_opts_SSE2.cpp		patch \| blob \| history
src/opts/SkBitmapProcState_opts_SSE2.h		patch \| blob \| history
src/opts/opts_check_SSE2.cpp		patch \| blob \| history