From fbdb93ec790a5ef140c9371a8a51278c12649dda Mon Sep 17 00:00:00 2001
From: Victoria Zhislina
Date: Fri, 10 Feb 2012 06:05:04 +0000
Subject: [PATCH] CV_ENABLE_UNROLLED

---
 modules/core/include/opencv2/core/internal.hpp   |  6 +++
 modules/core/include/opencv2/core/operations.hpp | 31 +++++++++----
 modules/core/src/arithm.cpp                      | 45 +++++++++++-------
 modules/core/src/convert.cpp                     | 59 +++++++++++++++++++-----
 modules/core/src/copy.cpp                        |  2 +
 modules/core/src/lapack.cpp                      |  6 ++-
 modules/core/src/matmul.cpp                      | 48 +++++++++++++------
 modules/core/src/matrix.cpp                      | 27 +++++++----
 modules/core/src/stat.cpp                        | 27 +++++++----
 9 files changed, 178 insertions(+), 73 deletions(-)

diff --git a/modules/core/include/opencv2/core/internal.hpp b/modules/core/include/opencv2/core/internal.hpp
index b0a68db..e9af13c 100644
--- a/modules/core/include/opencv2/core/internal.hpp
+++ b/modules/core/include/opencv2/core/internal.hpp
@@ -131,6 +131,12 @@ CV_INLINE IppiSize ippiSize(int width, int height)
     #define CPU_HAS_NEON_FEATURE (false)
 #endif
 
+#ifdef CV_ICC
+#define CV_ENABLE_UNROLLED 0
+#else
+#define CV_ENABLE_UNROLLED 1
+#endif
+
 #ifndef IPPI_CALL
 #define IPPI_CALL(func) CV_Assert((func) >= 0)
 #endif
diff --git a/modules/core/include/opencv2/core/operations.hpp b/modules/core/include/opencv2/core/operations.hpp
index 4e71816..5da73f0 100644
--- a/modules/core/include/opencv2/core/operations.hpp
+++ b/modules/core/include/opencv2/core/operations.hpp
@@ -48,6 +48,7 @@
 #include <limits.h>
 #endif // SKIP_INCLUDES
 
+
 #ifdef __cplusplus
 
 /////// exchange-add operation for atomic operations on reference counters ///////
@@ -903,12 +904,14 @@ template<typename _Tp, typename _AccTp> static inline
 _AccTp normL2Sqr(const _Tp* a, int n)
 {
     _AccTp s = 0;
-    int i;
-    for( i = 0; i <= n - 4; i += 4 )
+    int i=0;
+    #if CV_ENABLE_UNROLLED
+    for( ; i <= n - 4; i += 4 )
     {
         _AccTp v0 = a[i], v1 = a[i+1], v2 = a[i+2], v3 = a[i+3];
         s += v0*v0 + v1*v1 + v2*v2 + v3*v3;
     }
+#endif
     for( ; i < n; i++ )
     {
         _AccTp v = a[i];
@@ -922,12 +925,14 @@ template<typename _Tp, typename _AccTp> static inline
 _AccTp normL1(const _Tp* a, int n)
 {
     _AccTp s = 0;
-    int i;
-    for( i = 0; i <= n - 4; i += 4 )
+    int i = 0;
+#if CV_ENABLE_UNROLLED
+    for(; i <= n - 4; i += 4 )
     {
         s += (_AccTp)fast_abs(a[i]) + (_AccTp)fast_abs(a[i+1]) +
             (_AccTp)fast_abs(a[i+2]) + (_AccTp)fast_abs(a[i+3]);
     }
+#endif
     for( ; i < n; i++ )
         s += fast_abs(a[i]);
     return s;
@@ -948,12 +953,14 @@ template<typename _Tp, typename _AccTp> static inline
 _AccTp normL2Sqr(const _Tp* a, const _Tp* b, int n)
 {
     _AccTp s = 0;
-    int i;
-    for( i = 0; i <= n - 4; i += 4 )
+    int i= 0;
+    #if CV_ENABLE_UNROLLED
+    for(; i <= n - 4; i += 4 )
     {
         _AccTp v0 = a[i] - b[i], v1 = a[i+1] - b[i+1], v2 = a[i+2] - b[i+2], v3 = a[i+3] - b[i+3];
         s += v0*v0 + v1*v1 + v2*v2 + v3*v3;
     }
+#endif
     for( ; i < n; i++ )
     {
         _AccTp v = a[i] - b[i];
@@ -986,12 +993,14 @@ template<typename _Tp, typename _AccTp> static inline
 _AccTp normL1(const _Tp* a, const _Tp* b, int n)
 {
     _AccTp s = 0;
-    int i;
-    for( i = 0; i <= n - 4; i += 4 )
+    int i= 0;
+    #if CV_ENABLE_UNROLLED
+    for(; i <= n - 4; i += 4 )
     {
         _AccTp v0 = a[i] - b[i], v1 = a[i+1] - b[i+1], v2 = a[i+2] - b[i+2], v3 = a[i+3] - b[i+3];
         s += std::abs(v0) + std::abs(v1) + std::abs(v2) + std::abs(v3);
     }
+#endif
     for( ; i < n; i++ )
     {
         _AccTp v = a[i] - b[i];
@@ -2422,14 +2431,16 @@ template<typename _Tp> inline typename DataType<_Tp>::work_type
 dot(const Vector<_Tp>& v1, const Vector<_Tp>& v2)
 {
     typedef typename DataType<_Tp>::work_type _Tw;
-    size_t i, n = v1.size();
+    size_t i = 0, n = v1.size();
     assert(v1.size() == v2.size());
 
     _Tw s = 0;
     const _Tp *ptr1 = &v1[0], *ptr2 = &v2[0];
-    for( i = 0; i <= n - 4; i += 4 )
+    #if CV_ENABLE_UNROLLED
+    for(; i <= n - 4; i += 4 )
         s += (_Tw)ptr1[i]*ptr2[i] + (_Tw)ptr1[i+1]*ptr2[i+1] +
             (_Tw)ptr1[i+2]*ptr2[i+2] + (_Tw)ptr1[i+3]*ptr2[i+3];
+#endif
     for( ; i < n; i++ )
         s += (_Tw)ptr1[i]*ptr2[i];
     return s;
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index e187958..9c46cfa 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -99,7 +99,7 @@ void vBinOp8(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, s
         }
     }
 #endif
-
+#if CV_ENABLE_UNROLLED
         for( ; x <= sz.width - 4; x += 4 )
         {
             T v0 = op(src1[x], src2[x]);
@@ -109,7 +109,7 @@ void vBinOp8(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, s
             v1 = op(src1[x+3], src2[x+3]);
             dst[x+2] = v0; dst[x+3] = v1;
         }
-
+#endif
         for( ; x < sz.width; x++ )
             dst[x] = op(src1[x], src2[x]);
     }
@@ -208,7 +208,7 @@ void vBinOp32s(const int* src1, size_t step1, const int* src2, size_t step2,
         }
     }
 #endif
-
+#if CV_ENABLE_UNROLLED
         for( ; x <= sz.width - 4; x += 4 )
         {
             int v0 = op(src1[x], src2[x]);
@@ -218,7 +218,7 @@ void vBinOp32s(const int* src1, size_t step1, const int* src2, size_t step2,
             v1 = op(src1[x+3], src2[x+3]);
             dst[x+2] = v0; dst[x+3] = v1;
         }
-
+#endif
         for( ; x < sz.width; x++ )
             dst[x] = op(src1[x], src2[x]);
     }
@@ -265,6 +265,7 @@ void vBinOp32f(const float* src1, size_t step1, const float* src2, size_t step2,
         }
     }
 #endif
+#if CV_ENABLE_UNROLLED
         for( ; x <= sz.width - 4; x += 4 )
         {
             float v0 = op(src1[x], src2[x]);
@@ -274,7 +275,7 @@ void vBinOp32f(const float* src1, size_t step1, const float* src2, size_t step2,
             v1 = op(src1[x+3], src2[x+3]);
             dst[x+2] = v0; dst[x+3] = v1;
         }
-
+#endif
         for( ; x < sz.width; x++ )
             dst[x] = op(src1[x], src2[x]);
     }
@@ -1508,8 +1509,9 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2,
     {
         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
         {
-            int i;
-            for( i = 0; i <= size.width - 4; i += 4 )
+            int i=0;
+            #if CV_ENABLE_UNROLLED
+            for(; i <= size.width - 4; i += 4 )
             {
                 T t0;
                 T t1;
@@ -1523,7 +1525,7 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2,
                 dst[i+2] = t0;
                 dst[i+3] = t1;
             }
-
+            #endif
             for( ; i < size.width; i++ )
                 dst[i] = saturate_cast<T>(src1[i] * src2[i]);
         }
@@ -1532,8 +1534,9 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2,
     {
         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
         {
-            int i;
-            for( i = 0; i <= size.width - 4; i += 4 )
+            int i = 0;
+            #if CV_ENABLE_UNROLLED
+            for(; i <= size.width - 4; i += 4 )
             {
                 T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
                 T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
@@ -1543,7 +1546,7 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2,
                 t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
                 dst[i+2] = t0; dst[i+3] = t1;
             }
-
+            #endif
             for( ; i < size.width; i++ )
                 dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
         }
@@ -1561,6 +1564,7 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2,
     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
     {
         int i = 0;
+        #if CV_ENABLE_UNROLLED
        for( ; i <= size.width - 4; i += 4 )
        {
             if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
@@ -1590,7 +1594,7 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2,
                 dst[i+2] = z2; dst[i+3] = z3;
             }
         }
-
+        #endif
         for( ; i < size.width; i++ )
             dst[i] = src2[i] != 0 ? saturate_cast<T>(src1[i]*scale/src2[i]) : 0;
     }
@@ -1606,6 +1610,7 @@ recip_( const T*, size_t, const T* src2, size_t step2,
     for( ; size.height--; src2 += step2, dst += step )
     {
         int i = 0;
+        #if CV_ENABLE_UNROLLED
         for( ; i <= size.width - 4; i += 4 )
         {
             if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
@@ -1635,7 +1640,7 @@ recip_( const T*, size_t, const T* src2, size_t step2,
                 dst[i+2] = z2; dst[i+3] = z3;
             }
         }
-
+        #endif
         for( ; i < size.width; i++ )
             dst[i] = src2[i] != 0 ? saturate_cast<T>(scale/src2[i]) : 0;
     }
@@ -1834,6 +1839,7 @@ addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
     {
         int x = 0;
+        #if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             T t0 = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
@@ -1844,7 +1850,7 @@ addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
             t1 = saturate_cast<T>(src1[x+3]*alpha + src2[x+3]*beta + gamma);
             dst[x+2] = t0; dst[x+3] = t1;
         }
-
+        #endif
         for( ; x < size.width; x++ )
             dst[x] = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
     }
@@ -1891,6 +1897,7 @@ addWeighted8u( const uchar* src1, size_t step1,
             }
         }
 #endif
+        #if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             float t0, t1;
@@ -1906,6 +1913,7 @@ addWeighted8u( const uchar* src1, size_t step1,
             dst[x+2] = saturate_cast<uchar>(t0);
             dst[x+3] = saturate_cast<uchar>(t1);
         }
+        #endif
 
         for( ; x < size.width; x++ )
         {
@@ -1994,6 +2002,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
         {
             int x = 0;
+            #if CV_ENABLE_UNROLLED
             for( ; x <= size.width - 4; x += 4 )
             {
                 int t0, t1;
@@ -2004,7 +2013,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
                 t1 = -(src1[x+3] > src2[x+3]) ^ m;
                 dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
             }
-
+            #endif
             for( ; x < size.width; x++ )
                 dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
         }
@@ -2015,6 +2024,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
         {
             int x = 0;
+            #if CV_ENABLE_UNROLLED
             for( ; x <= size.width - 4; x += 4 )
             {
                 int t0, t1;
@@ -2025,7 +2035,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
                 t1 = -(src1[x+3] == src2[x+3]) ^ m;
                 dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
             }
-
+            #endif
             for( ; x < size.width; x++ )
                 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
         }
@@ -2382,6 +2392,7 @@ inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
     for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step )
     {
         int x = 0;
+        #if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             int t0, t1;
@@ -2392,7 +2403,7 @@ inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
             t1 = src2[x+3] <= src1[x+3] && src1[x+3] <= src3[x+3];
             dst[x+2] = (uchar)-t0; dst[x+3] = (uchar)-t1;
         }
-
+        #endif
         for( ; x < size.width; x++ )
             dst[x] = (uchar)-(src2[x] <= src1[x] && src1[x] <= src3[x]);
     }
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index aaf0f08..945b2fe 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -578,7 +578,8 @@ cvtScaleAbs_( const T* src, size_t sstep,
             dst[x] = saturate_cast<DT>(std::abs(src[x]*scale + shift));
     }
 }
-
+
+
 template<typename T, typename DT, typename WT> static void
 cvtScale_( const T* src, size_t sstep,
            DT* dst, size_t dstep, Size size,
@@ -590,6 +591,7 @@ cvtScale_( const T* src, size_t sstep,
     for( ; size.height--; src += sstep, dst += dstep )
     {
         int x = 0;
+#if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             DT t0, t1;
@@ -600,6 +602,7 @@ cvtScale_( const T* src, size_t sstep,
             t1 = saturate_cast<DT>(src[x+3]*scale + shift);
             dst[x+2] = t0; dst[x+3] = t1;
         }
+#endif
         for( ; x < size.width; x++ )
             dst[x] = saturate_cast<DT>(src[x]*scale + shift);
@@ -655,22 +658,54 @@ cvt_( const T* src, size_t sstep,
     for( ; size.height--; src += sstep, dst += dstep )
     {
         int x = 0;
-        for( ; x <= size.width - 4; x += 4 )
-        {
-            DT t0, t1;
-            t0 = saturate_cast<DT>(src[x]);
-            t1 = saturate_cast<DT>(src[x+1]);
-            dst[x] = t0; dst[x+1] = t1;
-            t0 = saturate_cast<DT>(src[x+2]);
-            t1 = saturate_cast<DT>(src[x+3]);
-            dst[x+2] = t0; dst[x+3] = t1;
-        }
-
+        for( ; x <= size.width - 4; x += 4 )
+        {
+            DT t0, t1;
+            t0 = saturate_cast<DT>(src[x]);
+            t1 = saturate_cast<DT>(src[x+1]);
+            dst[x] = t0; dst[x+1] = t1;
+            t0 = saturate_cast<DT>(src[x+2]);
+            t1 = saturate_cast<DT>(src[x+3]);
+            dst[x+2] = t0; dst[x+3] = t1;
+        }
         for( ; x < size.width; x++ )
             dst[x] = saturate_cast<DT>(src[x]);
     }
 }
 
+//vz optimized template specialization, test Core_ConvertScale/ElemWiseTest
+template<> static void
+cvt_<float, short>( const float* src, size_t sstep,
+     short* dst, size_t dstep, Size size )
+{
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        int x = 0;
+        #if CV_SSE2
+        if(USE_SSE2){
+            for( ; x <= size.width - 8; x += 8 )
+            {
+                __m128 src128 = _mm_loadu_ps (src + x);
+                __m128i src_int128 = _mm_cvtps_epi32 (src128);
+
+                src128 = _mm_loadu_ps (src + x + 4);
+                __m128i src1_int128 = _mm_cvtps_epi32 (src128);
+
+                src1_int128 = _mm_packs_epi32(src_int128, src1_int128);
+                _mm_storeu_si128((__m128i*)(dst + x),src1_int128);
+            }
+        }
+        #endif
+        for( ; x < size.width; x++ )
+            dst[x] = saturate_cast<short>(src[x]);
+    }
+
+}
+
 template<typename T> static void
 cpy_( const T* src, size_t sstep, T* dst, size_t dstep, Size size )
 {
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 43f601c..c0d0501 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -59,6 +59,7 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, ucha
         const T* src = (const T*)_src;
         T* dst = (T*)_dst;
         int x = 0;
+        #if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             if( mask[x] )
@@ -70,6 +71,7 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, ucha
             if( mask[x+3] )
                 dst[x+3] = src[x+3];
         }
+        #endif
         for( ; x < size.width; x++ )
             if( mask[x] )
                 dst[x] = src[x];
diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp
index 95c7529..4122b82 100644
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -741,7 +741,9 @@ MatrAXPY( int m, int n, const T1* x, int dx,
     for( i = 0; i < m; i++, x += dx, y += dy )
     {
         T2 s = a[i*inca];
-        for( j = 0; j <= n - 4; j += 4 )
+        j=0;
+        #if CV_ENABLE_UNROLLED
+        for(; j <= n - 4; j += 4 )
         {
             T3 t0 = (T3)(y[j] + s*x[j]);
             T3 t1 = (T3)(y[j+1] + s*x[j+1]);
@@ -752,7 +754,7 @@ MatrAXPY( int m, int n, const T1* x, int dx,
             y[j+2] = t0;
             y[j+3] = t1;
         }
-
+        #endif
         for( ; j < n; j++ )
             y[j] = (T3)(y[j] + s*x[j]);
     }
diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp
index 409bbd4..fc655b6 100644
--- a/modules/core/src/matmul.cpp
+++ b/modules/core/src/matmul.cpp
@@ -63,7 +63,9 @@ GEMM_CopyBlock( const uchar* src, size_t src_step,
 
     for( ; size.height--; src += src_step, dst += dst_step )
     {
-        for( j = 0; j <= size.width - 4; j += 4 )
+        j=0;
+        #if CV_ENABLE_UNROLLED
+        for( ; j <= size.width - 4; j += 4 )
         {
             int t0 = ((const int*)src)[j];
             int t1 = ((const int*)src)[j+1];
@@ -74,7 +76,7 @@ GEMM_CopyBlock( const uchar* src, size_t src_step,
             ((int*)dst)[j+2] = t0;
             ((int*)dst)[j+3] = t1;
         }
-
+        #endif
         for( ; j < size.width; j++ )
             ((int*)dst)[j] = ((const int*)src)[j];
     }
@@ -237,15 +239,16 @@ GEMMSingleMul( const T* a_data, size_t a_step,
                  c_data += c_step1 )
             {
                 WT s0(0), s1(0), s2(0), s3(0);
-
-                for( k = 0; k <= n - 4; k += 4 )
+                k = 0;
+                #if CV_ENABLE_UNROLLED
+                for( ; k <= n - 4; k += 4 )
                 {
                     s0 += WT(a_data[k])*WT(b_data[k]);
                     s1 += WT(a_data[k+1])*WT(b_data[k+1]);
                     s2 += WT(a_data[k+2])*WT(b_data[k+2]);
                     s3 += WT(a_data[k+3])*WT(b_data[k+3]);
                 }
-
+                #endif
                 for( ; k < n; k++ )
                     s0 += WT(a_data[k])*WT(b_data[k]);
                 s0 = (s0+s1+s2+s3)*alpha;
@@ -342,8 +345,9 @@ GEMMSingleMul( const T* a_data, size_t a_step,
         for( k = 0; k < n; k++, b_data += b_step )
         {
             WT al(a_data[k]);
-
-            for( j = 0; j <= m - 4; j += 4 )
+            j=0;
+            #if CV_ENABLE_UNROLLED
+            for(; j <= m - 4; j += 4 )
             {
                 WT t0 = d_buf[j] + WT(b_data[j])*al;
                 WT t1 = d_buf[j+1] + WT(b_data[j+1])*al;
@@ -354,7 +358,7 @@ GEMMSingleMul( const T* a_data, size_t a_step,
                 d_buf[j+2] = t0;
                 d_buf[j+3] = t1;
             }
-
+            #endif
             for( ; j < m; j++ )
                 d_buf[j] += WT(b_data[j])*al;
         }
@@ -509,7 +513,9 @@ GEMMStore( const T* c_data, size_t c_step,
         if( _c_data )
         {
             c_data = _c_data;
-            for( j = 0; j <= d_size.width - 4; j += 4, c_data += 4*c_step1 )
+            j=0;
+            #if CV_ENABLE_UNROLLED
+            for(; j <= d_size.width - 4; j += 4, c_data += 4*c_step1 )
             {
                 WT t0 = alpha*d_buf[j];
                 WT t1 = alpha*d_buf[j+1];
@@ -524,6 +530,7 @@ GEMMStore( const T* c_data, size_t c_step,
                 d_data[j+2] = T(t0);
                 d_data[j+3] = T(t1);
             }
+            #endif
             for( ; j < d_size.width; j++, c_data += c_step1 )
             {
                 WT t0 = alpha*d_buf[j];
@@ -532,7 +539,9 @@ GEMMStore( const T* c_data, size_t c_step,
         }
         else
         {
-            for( j = 0; j <= d_size.width - 4; j += 4 )
+            j = 0;
+            #if CV_ENABLE_UNROLLED
+            for( ; j <= d_size.width - 4; j += 4 )
             {
                 WT t0 = alpha*d_buf[j];
                 WT t1 = alpha*d_buf[j+1];
@@ -543,6 +552,7 @@ GEMMStore( const T* c_data, size_t c_step,
                 d_data[j+2] = T(t0);
                 d_data[j+3] = T(t1);
             }
+            #endif
             for( ; j < d_size.width; j++ )
                 d_data[j] = T(alpha*d_buf[j]);
         }
@@ -1987,6 +1997,7 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
     }
     else
 #endif
+    //vz why do we need unroll here?
     for( ; i <= len - 4; i += 4 )
     {
         float t0, t1;
@@ -1997,7 +2008,7 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
         t1 = src1[i+3]*alpha + src2[i+3];
         dst[i+2] = t0; dst[i+3] = t1;
     }
-    for( ; i < len; i++ )
+    for(; i < len; i++ )
         dst[i] = src1[i]*alpha + src2[i];
 }
@@ -2024,6 +2035,7 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
     }
     else
 #endif
+    //vz why do we need unroll here?
     for( ; i <= len - 4; i += 4 )
     {
         double t0, t1;
@@ -2034,7 +2046,7 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
         t1 = src1[i+3]*alpha + src2[i+3];
         dst[i+2] = t0; dst[i+3] = t1;
     }
-    for( ; i < len; i++ )
+    for(; i < len; i++ )
         dst[i] = src1[i]*alpha + src2[i];
 }
@@ -2198,9 +2210,12 @@ double cv::Mahalanobis( InputArray _v1, InputArray _v2, InputArray _icovar )
         for( i = 0; i < len; i++, mat += matstep )
         {
             double row_sum = 0;
-            for( j = 0; j <= len - 4; j += 4 )
+            j = 0;
+            #if CV_ENABLE_UNROLLED
+            for(; j <= len - 4; j += 4 )
                 row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] +
                            diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3];
+            #endif
             for( ; j < len; j++ )
                 row_sum += diff[j]*mat[j];
             result += row_sum * diff[i];
@@ -2226,9 +2241,12 @@ double cv::Mahalanobis( InputArray _v1, InputArray _v2, InputArray _icovar )
         for( i = 0; i < len; i++, mat += matstep )
         {
             double row_sum = 0;
-            for( j = 0; j <= len - 4; j += 4 )
+            j = 0;
+            #if CV_ENABLE_UNROLLED
+            for(; j <= len - 4; j += 4 )
                 row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] +
                            diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3];
+            #endif
             for( ; j < len; j++ )
                 row_sum += diff[j]*mat[j];
             result += row_sum * diff[i];
@@ -2574,9 +2592,11 @@ dotProd_(const T* src1, const T* src2, int len)
 {
     int i = 0;
     double result = 0;
+    #if CV_ENABLE_UNROLLED
     for( ; i <= len - 4; i += 4 )
         result += (double)src1[i]*src2[i] + (double)src1[i+1]*src2[i+1] +
             (double)src1[i+2]*src2[i+2] + (double)src1[i+3]*src2[i+3];
+    #endif
     for( ; i < len; i++ )
         result += (double)src1[i]*src2[i];
 
diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp
index 6c53e64..cf1bbfe 100644
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -1657,9 +1657,10 @@ namespace cv
 template<typename T> static void
 transpose_( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz )
 {
-    int i, j, m = sz.width, n = sz.height;
-
-    for( i = 0; i <= m - 4; i += 4 )
+    int i=0, j, m = sz.width, n = sz.height;
+
+    #if CV_ENABLE_UNROLLED
+    for(; i <= m - 4; i += 4 )
     {
         T* d0 = (T*)(dst + dstep*i);
         T* d1 = (T*)(dst + dstep*(i+1));
@@ -1685,12 +1686,13 @@ transpose_( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz )
             d0[j] = s0[0]; d1[j] = s0[1]; d2[j] = s0[2]; d3[j] = s0[3];
         }
     }
-
+    #endif
     for( ; i < m; i++ )
     {
         T* d0 = (T*)(dst + dstep*i);
-
-        for( j = 0; j <= n - 4; j += 4 )
+        j = 0;
+        #if CV_ENABLE_UNROLLED
+        for(; j <= n - 4; j += 4 )
        {
             const T* s0 = (const T*)(src + i*sizeof(T) + sstep*j);
             const T* s1 = (const T*)(src + i*sizeof(T) + sstep*(j+1));
@@ -1699,7 +1701,7 @@ transpose_( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz )
             d0[j] = s0[0]; d0[j+1] = s1[0]; d0[j+2] = s2[0]; d0[j+3] = s3[0];
         }
-
+        #endif
         for( ; j < n; j++ )
         {
             const T* s0 = (const T*)(src + i*sizeof(T) + j*sstep);
@@ -1878,7 +1880,9 @@ reduceR_( const Mat& srcmat, Mat& dstmat )
     for( ; --size.height; )
     {
         src += srcstep;
-        for( i = 0; i <= size.width - 4; i += 4 )
+        i = 0;
+        #if CV_ENABLE_UNROLLED
+        for(; i <= size.width - 4; i += 4 )
         {
             WT s0, s1;
             s0 = op(buf[i], (WT)src[i]);
@@ -1889,7 +1893,7 @@ reduceR_( const Mat& srcmat, Mat& dstmat )
             s1 = op(buf[i+3], (WT)src[i+3]);
             buf[i+2] = s0; buf[i+3] = s1;
         }
-
+        #endif
         for( ; i < size.width; i++ )
             buf[i] = op(buf[i], (WT)src[i]);
     }
@@ -2467,7 +2471,9 @@ double cv::kmeans( InputArray _data, int K,
             sample = data.ptr<float>(i);
             k = labels[i];
             float* center = centers.ptr<float>(k);
-            for( j = 0; j <= dims - 4; j += 4 )
+            j=0;
+            #if CV_ENABLE_UNROLLED
+            for(; j <= dims - 4; j += 4 )
             {
                 float t0 = center[j] + sample[j];
                 float t1 = center[j+1] + sample[j+1];
@@ -2481,6 +2487,7 @@ double cv::kmeans( InputArray _data, int K,
                 center[j+2] = t0;
                 center[j+3] = t1;
             }
+            #endif
             for( ; j < dims; j++ )
                 center[j] += sample[j];
             counters[k]++;
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index 48f8e5a..3d6348b 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -65,13 +65,16 @@ static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
     const T* src = src0;
     if( !mask )
     {
-        int i;
+        int i=0;
         int k = cn % 4;
         if( k == 1 )
         {
             ST s0 = dst[0];
-            for( i = 0; i <= len - 4; i += 4, src += cn*4 )
+
+            #if CV_ENABLE_UNROLLED
+            for(; i <= len - 4; i += 4, src += cn*4 )
                 s0 += src[0] + src[cn] + src[cn*2] + src[cn*3];
+            #endif
             for( ; i < len; i++, src += cn )
                 s0 += src[0];
             dst[0] = s0;
@@ -151,6 +154,7 @@ static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
             if( mask[i] )
             {
                 int k = 0;
+                #if CV_ENABLE_UNROLLED
                 for( ; k <= cn - 4; k += 4 )
                 {
                     ST s0, s1;
@@ -161,6 +165,7 @@ static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
                     s1 = dst[k+3] + src[k+3];
                     dst[k+2] = s0; dst[k+3] = s1;
                 }
+                #endif
                 for( ; k < cn; k++ )
                     dst[k] += src[k];
                 nzm++;
@@ -205,9 +210,11 @@ static SumFunc sumTab[] =
 template<typename T> static int countNonZero_(const T* src, int len )
 {
-    int i, nz = 0;
-    for( i = 0; i <= len - 4; i += 4 )
+    int i=0, nz = 0;
+    #if CV_ENABLE_UNROLLED
+    for(; i <= len - 4; i += 4 )
         nz += (src[i] != 0) + (src[i+1] != 0) + (src[i+2] != 0) + (src[i+3] != 0);
+    #endif
     for( ; i < len; i++ )
         nz += src[i] != 0;
     return nz;
@@ -826,14 +833,15 @@ float normL2Sqr_(const float* a, const float* b, int n)
     }
     else
 #endif
-    {
+    //vz why do we need unroll here? no sse = no need to unroll
+    {
         for( ; j <= n - 4; j += 4 )
         {
             float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
             d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
         }
     }
-
+
     for( ; j < n; j++ )
     {
         float t = a[j] - b[j];
@@ -866,6 +874,7 @@ float normL1_(const float* a, const float* b, int n)
     }
     else
 #endif
+    //vz no need to unroll here - if no sse
     {
         for( ; j <= n - 4; j += 4 )
         {
             d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
@@ -873,7 +882,7 @@ float normL1_(const float* a, const float* b, int n)
                     std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
         }
     }
-
+
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
     return d;
@@ -906,6 +915,7 @@ int normL1_(const uchar* a, const uchar* b, int n)
     }
     else
 #endif
+    //vz why do we need unroll here? no sse = no unroll
     {
         for( ; j <= n - 4; j += 4 )
         {
             d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
@@ -913,7 +923,6 @@ int normL1_(const uchar* a, const uchar* b, int n)
                     std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
         }
     }
-
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
     return d;
@@ -997,9 +1006,11 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
     else
         CV_Error( CV_StsBadSize, "bad cell size (not 1, 2 or 4) in normHamming" );
     int i = 0, result = 0;
+    #if CV_ENABLE_UNROLLED
     for( ; i <= n - 4; i += 4 )
        result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
                tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
+    #endif
     for( ; i < n; i++ )
        result += tab[a[i] ^ b[i]];
    return result;
-- 
2.7.4
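
Note on the pattern used throughout this patch: every hot loop in modules/core
is manually unrolled by a factor of four, and the commit fences each unrolled
body with #if CV_ENABLE_UNROLLED ... #endif. internal.hpp turns the macro off
for the Intel compiler (CV_ICC), which vectorizes the plain loops well on its
own, and leaves it on elsewhere. Because the induction variable is now
initialized before the fenced block (int i = 0; instead of for( i = 0; ... )),
the scalar tail loop by itself still covers the whole range when the unrolled
loop is compiled out. A minimal self-contained sketch of the same shape
(illustrative only; the function name and the __INTEL_COMPILER test stand in
for OpenCV's internals):

    // Stand-in for the internal.hpp logic: ICC auto-vectorizes simple
    // loops well on its own, so manual unrolling is disabled there.
    #ifdef __INTEL_COMPILER
    #  define CV_ENABLE_UNROLLED 0
    #else
    #  define CV_ENABLE_UNROLLED 1
    #endif

    // Hypothetical free function with the same structure as normL2Sqr()
    // and dot() in operations.hpp: 4x-unrolled main loop plus scalar tail.
    static double sumOfSquares(const float* a, int n)
    {
        double s = 0;
        int i = 0;   // hoisted out of the for-statement on purpose
    #if CV_ENABLE_UNROLLED
        // Four independent multiply-adds per iteration expose more
        // instruction-level parallelism to the CPU.
        for( ; i <= n - 4; i += 4 )
        {
            double v0 = a[i], v1 = a[i+1], v2 = a[i+2], v3 = a[i+3];
            s += v0*v0 + v1*v1 + v2*v2 + v3*v3;
        }
    #endif
        // Tail loop: the n % 4 leftovers, or the entire range when the
        // unrolled loop is compiled out.
        for( ; i < n; i++ )
            s += (double)a[i]*a[i];
        return s;
    }

The convert.cpp hunk additionally adds an SSE2 specialization for
float -> short conversion. A sketch of that idea, assuming only SSE2
(cvtFloatToShort is a made-up name; OpenCV's real tail uses
saturate_cast<short>, which also rounds, whereas the clamp-and-cast
below truncates):

    #include <emmintrin.h>  // SSE2 intrinsics

    static void cvtFloatToShort(const float* src, short* dst, int n)
    {
        int x = 0;
        // 8 floats per iteration: two 4-wide converts, then a signed
        // saturating pack down to 16-bit, as in the patch.
        for( ; x <= n - 8; x += 8 )
        {
            __m128i a = _mm_cvtps_epi32(_mm_loadu_ps(src + x));
            __m128i b = _mm_cvtps_epi32(_mm_loadu_ps(src + x + 4));
            _mm_storeu_si128((__m128i*)(dst + x), _mm_packs_epi32(a, b));
        }
        // Scalar tail with explicit clamping so out-of-range values
        // saturate instead of invoking undefined behavior.
        for( ; x < n; x++ )
        {
            float v = src[x];
            v = v < -32768.f ? -32768.f : (v > 32767.f ? 32767.f : v);
            dst[x] = (short)(int)v;
        }
    }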