SSE2 version of ClampX_ClampY_{no}filter_scale; yields 10-20% speedup in
author tomhudson@google.com <tomhudson@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>
Wed, 22 Feb 2012 18:30:43 +0000 (18:30 +0000)
committer tomhudson@google.com <tomhudson@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>
Wed, 22 Feb 2012 18:30:43 +0000 (18:30 +0000)
bitmap_8888 benchmarks on top of last week's SSSE3 patch.
Thanks to Jin Yang.

http://codereview.appspot.com/5685055/

git-svn-id: http://skia.googlecode.com/svn/trunk@3227 2bbb7eff-a529-9590-31e7-b0007b416f81

src/core/SkBitmapProcState.h
src/opts/SkBitmapProcState_opts_SSE2.cpp
src/opts/SkBitmapProcState_opts_SSE2.h
src/opts/opts_check_SSE2.cpp

index 98c8782..fb4957e 100644 (file)
@@ -136,5 +136,9 @@ void S32_opaque_D32_filter_DX(const SkBitmapProcState& s, const uint32_t xy[],
                               int count, SkPMColor colors[]);
 void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, const uint32_t xy[],
                              int count, SkPMColor colors[]);
+void ClampX_ClampY_filter_scale(const SkBitmapProcState& s, uint32_t xy[],
+                                int count, int x, int y);
+void ClampX_ClampY_nofilter_scale(const SkBitmapProcState& s, uint32_t xy[],
+                                  int count, int x, int y);
 
 #endif
index 9a0a013..10abd17 100644 (file)
@@ -232,3 +232,254 @@ void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
         *colors++ = _mm_cvtsi128_si32(sum);
     } while (--count > 0);
 }
+
+static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
+                                                 SkFixed one) {
+    const unsigned first = SkClampMax(f >> 16, max);         // ends up at bit 18+
+    const unsigned next = SkClampMax((f + one) >> 16, max);  // stays in low bits
+    return (((first << 4) | ((f >> 12) & 0xF)) << 14) | next;  // 4 frac bits between
+}
+
+/*  SSE2 version of ClampX_ClampY_filter_scale(); the portable version is in
+ *  core/SkBitmapProcState_matrix.h. Stores one packed Y entry followed by
+ *  count packed X entries in xy[]. */
+void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
+                                     int count, int x, int y) {
+    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+                             SkMatrix::kScale_Mask)) == 0);
+    SkASSERT(s.fInvKy == 0);
+
+    const unsigned maxX = s.fBitmap->width() - 1;
+    const SkFixed one = s.fFilterOneX;
+    const SkFixed dx = s.fInvSx;
+    SkFixed fx;
+
+    SkPoint pt;
+    s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
+                                SkIntToScalar(y) + SK_ScalarHalf, &pt);
+    const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
+    const unsigned maxY = s.fBitmap->height() - 1;
+    // compute our two Y values up front
+    *xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);
+    // now initialize fx
+    fx = SkScalarToFixed(pt.fX) - (one >> 1);
+
+    // test if we don't need to apply the tile proc
+    if (dx > 0 && (unsigned)(fx >> 16) <= maxX &&
+        (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) {
+        if (count >= 4) {
+            // SSE decal_filter_scale: go scalar until xy is 16-byte aligned
+            while ((size_t(xy) & 0x0F) != 0) {
+                SkASSERT((fx >> (16 + 14)) == 0);
+                *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
+                fx += dx;
+                count--;
+            }
+            // lanes hold fx for four consecutive pixels; lane 0 is the first
+            __m128i wide_1    = _mm_set1_epi32(1);
+            __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
+            __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
+                                              fx + dx, fx);
+
+            while (count >= 4) {
+                __m128i wide_out;
+                // same packing as the scalar loops, four entries at a time
+                wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);
+                wide_out = _mm_or_si128(wide_out, _mm_add_epi32(
+                                        _mm_srai_epi32(wide_fx, 16), wide_1));
+
+                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);
+
+                xy += 4;
+                fx += dx * 4;  // keep the scalar fx in step for the tail loop
+                wide_fx  = _mm_add_epi32(wide_fx, wide_dx4);
+                count -= 4;
+            } // while count >= 4
+        } // if count >= 4
+
+        while (count-- > 0) {
+            SkASSERT((fx >> (16 + 14)) == 0);
+            *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
+            fx += dx;
+        }
+    } else {
+        // SSE2 only has signed 16-bit integer max & min, so the vector path is
+        // limited to maxX <= 0x7FFF; above that the low half of wide_maxX
+        // would compare as negative. maxX is the bitmap's width - 1, and
+        // wider bitmaps simply take the scalar loop below.
+        if ((count >= 4) && (maxX <= 0x7FFF)) {
+            while (((size_t)xy & 0x0F) != 0) {
+                *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
+                fx += dx;
+                count--;
+            }
+
+            __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
+                                              fx + dx, fx);
+            __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
+            __m128i wide_one  = _mm_set1_epi32(one);
+            __m128i wide_maxX = _mm_set1_epi32(maxX);
+            __m128i wide_mask = _mm_set1_epi32(0xF);
+
+            while (count >= 4) {
+                __m128i wide_i;
+                __m128i wide_lo;
+                __m128i wide_fx1;
+
+                // i = SkClampMax(f>>16,maxX)
+                wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
+                                       _mm_setzero_si128());
+                wide_i = _mm_min_epi16(wide_i, wide_maxX);
+
+                // i<<4 | TILEX_LOW_BITS(fx)
+                wide_lo = _mm_srli_epi32(wide_fx, 12);
+                wide_lo = _mm_and_si128(wide_lo, wide_mask);
+                wide_i  = _mm_slli_epi32(wide_i, 4);
+                wide_i  = _mm_or_si128(wide_i, wide_lo);
+
+                // i<<14
+                wide_i = _mm_slli_epi32(wide_i, 14);
+
+                // SkClampMax(((f+one))>>16,max)
+                wide_fx1 = _mm_add_epi32(wide_fx, wide_one);
+                wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16),
+                                                        _mm_setzero_si128());
+                wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);
+
+                // final combination
+                wide_i = _mm_or_si128(wide_i, wide_fx1);
+                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
+
+                wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
+                fx += dx * 4;
+                xy += 4;
+                count -= 4;
+            } // while count >= 4
+        } // if count >= 4
+
+        while (count-- > 0) {
+            *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
+            fx += dx;
+        }
+    }
+}
+
+/*  SSE2 version of ClampX_ClampY_nofilter_scale(); the portable version is in
+ *  core/SkBitmapProcState_matrix.h. Stores one 32-bit Y entry in xy[], then
+ *  count X coordinates as consecutive 16-bit values. */
+void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
+                                    uint32_t xy[], int count, int x, int y) {
+    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+                             SkMatrix::kScale_Mask)) == 0);
+
+    // we store y (32 bits), then x, x, x, x, x (16 bits each)
+    const unsigned maxX = s.fBitmap->width() - 1;
+    SkFixed fx;
+    SkPoint pt;
+    s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
+                                SkIntToScalar(y) + SK_ScalarHalf, &pt);
+    fx = SkScalarToFixed(pt.fY);
+    const unsigned maxY = s.fBitmap->height() - 1;
+    *xy++ = SkClampMax(fx >> 16, maxY);
+    fx = SkScalarToFixed(pt.fX);
+
+    if (0 == maxX) {
+        // all of the following X values must be 0
+        memset(xy, 0, count * sizeof(uint16_t));
+        return;
+    }
+
+    const SkFixed dx = s.fInvSx;
+
+    // test if we don't need to apply the tile proc
+    if ((unsigned)(fx >> 16) <= maxX &&
+        (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
+        // SSE version of decal_nofilter_scale
+        if (count >= 8) {
+            while (((size_t)xy & 0x0F) != 0) {
+                *xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
+                fx += 2 * dx;
+                count -= 2;
+            }
+            // 8 X values per pass: two vectors of 4, packed down to 16 bits
+            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
+            __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
+
+            __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
+                                             fx + dx, fx);
+            __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
+
+            while (count >= 8) {
+                __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
+                __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
+
+                __m128i wide_result = _mm_packs_epi32(wide_out_low,
+                                                      wide_out_high);
+                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
+
+                wide_low = _mm_add_epi32(wide_low, wide_dx8);
+                wide_high = _mm_add_epi32(wide_high, wide_dx8);
+
+                xy += 4;
+                fx += dx * 8;
+                count -= 8;
+            }
+        } // if count >= 8
+
+        uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
+        while (count-- > 0) {
+            *xx++ = SkToU16(fx >> 16);
+            fx += dx;
+        }
+    } else {
+        // SSE2 only has signed 16-bit integer max & min, so the vector path is
+        // limited to maxX <= 0x7FFF (maxX is the bitmap's width - 1); wider
+        // bitmaps take the scalar loop at the bottom. Every path must store
+        // clamped X values as 16-bit entries in increasing address order.
+        if ((count >= 8) && (maxX <= 0x7FFF)) {
+            while (((size_t)xy & 0x0F) != 0) {
+                *xy++ = pack_two_shorts(SkClampMax(fx >> 16, maxX),
+                                        SkClampMax((fx + dx) >> 16, maxX));
+                fx += 2 * dx;
+                count -= 2;
+            }
+
+            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
+            __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
+
+            __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
+                                             fx + dx, fx);
+            __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
+            __m128i wide_maxX = _mm_set1_epi32(maxX);
+
+            while (count >= 8) {
+                __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
+                __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
+                // clamp each lane to [0, maxX] using signed 16-bit max/min
+                wide_out_low  = _mm_max_epi16(wide_out_low,
+                                              _mm_setzero_si128());
+                wide_out_low  = _mm_min_epi16(wide_out_low, wide_maxX);
+                wide_out_high = _mm_max_epi16(wide_out_high,
+                                              _mm_setzero_si128());
+                wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);
+
+                __m128i wide_result = _mm_packs_epi32(wide_out_low,
+                                                      wide_out_high);
+                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
+
+                wide_low  = _mm_add_epi32(wide_low, wide_dx8);
+                wide_high = _mm_add_epi32(wide_high, wide_dx8);
+
+                xy += 4;
+                fx += dx * 8;
+                count -= 8;
+            }
+        } // if count >= 8
+
+        uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
+        while (count-- > 0) {
+            *xx++ = SkClampMax(fx >> 16, maxX);
+            fx += dx;
+        }
+    }
+}
index 9e56642..0f276b8 100644 (file)
@@ -17,3 +17,7 @@ void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
                                   int count, uint32_t* colors);
 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
                   SkPMColor color);
+void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
+                                     int count, int x, int y);
+void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
+                                       uint32_t xy[], int count, int x, int y);
index 3003d78..db5e4e8 100644 (file)
@@ -85,19 +85,27 @@ static bool cachedHasSSSE3() {
 }
 
 void SkBitmapProcState::platformProcs() {
-  if (cachedHasSSSE3()) {
-      if (fSampleProc32 == S32_opaque_D32_filter_DX) {
-          fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3;
-      } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
-          fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
-      }
-  } else if (cachedHasSSE2()) {
+    if (cachedHasSSSE3()) {  // swap in SSSE3 sample procs when available
+        if (fSampleProc32 == S32_opaque_D32_filter_DX) {
+            fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3;
+        } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
+            fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
+        }
+    } else if (cachedHasSSE2()) {
         if (fSampleProc32 == S32_opaque_D32_filter_DX) {
             fSampleProc32 = S32_opaque_D32_filter_DX_SSE2;
         } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
             fSampleProc32 = S32_alpha_D32_filter_DX_SSE2;
         }
     }
+
+    if (cachedHasSSSE3() || cachedHasSSE2()) {  // matrix procs: SSE2 variants
+        if (fMatrixProc == ClampX_ClampY_filter_scale) {
+            fMatrixProc = ClampX_ClampY_filter_scale_SSE2;
+        } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) {
+            fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2;
+        }
+    }
 }
 
 static SkBlitRow::Proc32 platform_32_procs[] = {