# else
# define CV_SSSE3 0
# endif
-#else
+# if defined __SSE4_1__ || (defined _MSC_VER && _MSC_VER >= 1600)
+# include <smmintrin.h>
+# define CV_SSE4_1 1
+# endif
+# if defined __SSE4_2__ || (defined _MSC_VER && _MSC_VER >= 1600)
+# include <nmmintrin.h>
+# define CV_SSE4_2 1
+# endif
+# if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600)
+# include <immintrin.h>
+# define CV_AVX 1
+# endif
+# else
# define CV_SSE 0
# define CV_SSE2 0
# define CV_SSE3 0
# define CV_SSSE3 0
-#endif
+# define CV_SSE4_1 0
+# define CV_SSE4_2 0
+# define CV_AVX 0
+# endif
#if defined ANDROID && defined __ARM_NEON__
# include "arm_neon.h"
#endif //__cplusplus
-#endif // __OPENCV_CORE_INTERNAL_HPP__
+#endif // __OPENCV_CORE_INTERNAL_HPP__
\ No newline at end of file
}
}
+template<> void
+cvtScale_<short, int, float>( const short* src, size_t sstep,
+ int* dst, size_t dstep, Size size,
+ float scale, float shift )
+{
+ sstep /= sizeof(src[0]);
+ dstep /= sizeof(dst[0]);
+
+ for( ; size.height--; src += sstep, dst += dstep )
+ {
+ int x = 0;
+
+ #if CV_SSE2
+ if(USE_SSE2) // ~5x faster than the scalar tail below
+ {
+ __m128 scale128 = _mm_set1_ps (scale);
+ __m128 shift128 = _mm_set1_ps (shift);
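+ // process 8 shorts per iteration: widen to two groups of 4 ints,
+ // convert to float, apply scale and shift, convert back with rounding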
+ for(; x <= size.width - 8; x += 8 )
+ {
+ __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
+ __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
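+ // sign-extend each 16-bit lane to 32 bits: duplicate the lane
+ // (unpacklo with itself), then arithmetic-shift right by 16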
+ __m128 rf0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
+ __m128 rf1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
+ rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
+ rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
+ r0 = _mm_cvtps_epi32(rf0);
+ r1 = _mm_cvtps_epi32(rf1);
+
+ _mm_storeu_si128((__m128i*)(dst + x), r0);
+ _mm_storeu_si128((__m128i*)(dst + x + 4), r1);
+ }
+ }
+ #endif
+
+ // Disabled until AVX2 (Haswell): AVX1 has no 256-bit integer ops, so this
+ // variant manages only ~2x and loses to the SSE2 path above
+ // (an AVX2 sketch follows below).
+ /*
+ #if CV_AVX
+ if(USE_AVX) // ~2x only - bad variant
+ {
+ // TODO: revisit when AVX2 integer instructions are available
+ __m256 scale256 = _mm256_set1_ps (scale);
+ __m256 shift256 = _mm256_set1_ps (shift);
+ for(; x <= size.width - 8; x += 8 )
+ {
+ __m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x)));
+ __m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256);
+ __m256i res = _mm256_cvtps_epi32(r0);
+ _mm256_storeu_si256 ((__m256i*)(dst+x), res);
+ }
+ }
+ #endif*/
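+
+ // A possible AVX2 replacement once Haswell hardware is available.
+ // Sketch only, untested; assumes hypothetical CV_AVX2/USE_AVX2 guards
+ // analogous to CV_AVX/USE_AVX above, so it stays commented out too.
+ /*
+ #if CV_AVX2
+ if(USE_AVX2)
+ {
+ __m256 scale256 = _mm256_set1_ps(scale);
+ __m256 shift256 = _mm256_set1_ps(shift);
+ for( ; x <= size.width - 8; x += 8 )
+ {
+ // sign-extend 8 shorts straight to 8 ints (AVX2 vpmovsxwd)
+ __m256i r = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)(src + x)));
+ __m256 rf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(r), scale256), shift256);
+ _mm256_storeu_si256((__m256i*)(dst + x), _mm256_cvtps_epi32(rf));
+ }
+ }
+ #endif
+ */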
+
+ for(; x < size.width; x++ )
+ dst[x] = saturate_cast<int>(src[x]*scale + shift);
+ }
+}
template<typename T, typename DT> static void
cvt_( const T* src, size_t sstep,
cv::normalize( src, dst, a, b, norm_type, dst.type(), mask );
}
-/* End of file. */
+/* End of file. */
\ No newline at end of file
}
}
+template<> void
+copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
+{
+ for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
+ {
+ const uchar* src = (const uchar*)_src;
+ uchar* dst = (uchar*)_dst;
+ int x = 0;
+ #if CV_SSE4_2
+ if(USE_SSE4_2) // needs SSE4.1's _mm_blendv_epi8; gated by the stricter SSE4.2 flag
+ {
+ __m128i zero = _mm_setzero_si128 ();
+
+ for( ; x <= size.width - 16; x += 16 )
+ {
+ const __m128i rSrc = _mm_lddqu_si128((const __m128i*)(src+x));
+ __m128i _mask = _mm_lddqu_si128((const __m128i*)(mask+x));
+ __m128i rDst = _mm_lddqu_si128((const __m128i*)(dst+x));
+ __m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
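+ // blendv selects rDst bytes where _negMask's MSB is set (mask[x] == 0)
+ // and rSrc bytes elsewhere, i.e. dst = mask ? src : dst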
+ rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
+ _mm_storeu_si128((__m128i*)(dst + x), rDst);
+ }
+ }
+ #endif
+ for( ; x < size.width; x++ )
+ if( mask[x] )
+ dst[x] = src[x];
+ }
+}
+
+template<> void
+copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
+{
+ for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
+ {
+ const ushort* src = (const ushort*)_src;
+ ushort* dst = (ushort*)_dst;
+ int x = 0;
+ #if CV_SSE4_2
+ if(USE_SSE4_2) // needs SSE4.1's _mm_blendv_epi8; gated by the stricter SSE4.2 flag
+ {
+ __m128i zero = _mm_setzero_si128 ();
+ for( ; x <= size.width - 8; x += 8 )
+ {
+ const __m128i rSrc = _mm_lddqu_si128((const __m128i*)(src+x));
+ __m128i _mask = _mm_loadl_epi64((const __m128i*)(mask+x));
+ _mask = _mm_unpacklo_epi8(_mask, _mask);
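+ // duplicating each mask byte makes the 8-bit mask cover both bytes
+ // of the corresponding 16-bit lane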
+ __m128i rDst = _mm_lddqu_si128((const __m128i*)(dst+x));
+ __m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
+ rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
+ _mm_storeu_si128((__m128i*)(dst + x), rDst);
+ }
+ }
+ #endif
+ for( ; x < size.width; x++ )
+ if( mask[x] )
+ dst[x] = src[x];
+ }
+}
+
static void
copyMaskGeneric(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size, void* _esz)
{
cv::repeat(src, dst.rows/src.rows, dst.cols/src.cols, dst);
}
-/* End of file. */
+/* End of file. */
\ No newline at end of file
if( type == CV_32FC1 )
{
double d = det2(Sf);
+ #if CV_SSE4_2
+ if(USE_SSE4_2 && d != 0.) // check the determinant before inverting
+ {
+ __m128 zero = _mm_setzero_ps();
+ __m128 t0 = _mm_loadl_pi(zero, (const __m64*)srcdata); //t0 = sf(0,0) sf(0,1)
+ __m128 t1 = _mm_loadh_pi(zero,(const __m64*)((const float*)(srcdata+srcstep))); //t1 = sf(1,0) sf(1,1)
+ __m128 s0 = _mm_blend_ps(t0,t1,12);
+ d = 1./d;
+ result = true;
+ __m128 det = _mm_set1_ps((float)d);
+ s0 = _mm_mul_ps(s0, det);
+ const uchar CV_DECL_ALIGNED(16) inv[16] = {0,0,0,0,0,0,0,0x80,0,0,0,0x80,0,0,0,0};
+ __m128 pattern = _mm_load_ps((const float*)inv);
+ s0 = _mm_xor_ps(s0, pattern);//==-1*s0
+ s0 = _mm_shuffle_ps(s0, s0, _MM_SHUFFLE(0,2,1,3));
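+ // s0 now holds (Sf(1,1), -Sf(0,1), -Sf(1,0), Sf(0,0))*d - the two rows of inv(S)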
+ _mm_storel_pi((__m64*)dstdata, s0);
+ _mm_storeh_pi((__m64*)((float*)(dstdata+dststep)), s0);
+ }
+ else
+ #endif
if( d != 0. )
{
double t0, t1;
t0 = -Sf(0,1)*d;
t1 = -Sf(1,0)*d;
Df(0,1) = (float)t0;
- Df(1,0) = (float)t1;
+ Df(1,0) = (float)t1;
}
}
else
{
- double d = det2(Sd);
+ double d = det2(Sd);
+ #if CV_SSE2
+ if(USE_SSE2 && d != 0.) // check the determinant before inverting
+ {
+ __m128d s0 = _mm_loadu_pd((const double*)srcdata); //s0 = sf(0,0) sf(0,1)
+ __m128d s1 = _mm_loadu_pd ((const double*)(srcdata+srcstep));//s1 = sf(1,0) sf(1,1)
+ __m128d sm = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(1,0)); //sm = sf(0,0) sf(1,1) - main diagonal
+ __m128d ss = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(0,1)); //ss = sf(0,1) sf(1,0) - secondary diagonal
+ result = true;
+ d = 1./d;
+ __m128d det = _mm_load1_pd((const double*)&d);
+ sm = _mm_mul_pd(sm, det);
+ static const uchar CV_DECL_ALIGNED(16) inv[8] = {0,0,0,0,0,0,0,0x80};
+ __m128d pattern = _mm_load1_pd((const double*)inv); // both lanes = -0.0
+ ss = _mm_mul_pd(ss, det);
+ ss = _mm_xor_pd(ss, pattern);//==-1*ss
+ s0 = _mm_shuffle_pd(sm, ss, _MM_SHUFFLE2(0,1));
+ s1 = _mm_shuffle_pd(ss, sm, _MM_SHUFFLE2(0,1));
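+ // s0 = (Sd(1,1), -Sd(0,1))*d and s1 = (-Sd(1,0), Sd(0,0))*d:
+ // the top and bottom rows of the 2x2 inverse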
+ // dst rows are not guaranteed 16-byte aligned (the loads above are
+ // unaligned too), so use unaligned stores
+ _mm_storeu_pd((double*)dstdata, s0);
+ _mm_storeu_pd((double*)(dstdata+dststep), s1);
+ }
+ else
+ #endif
if( d != 0. )
{
double t0, t1;
Dd(0,1) = t0;
Dd(1,0) = t1;
}
}
}
else if( n == 3 )
return result;
}
+
/****************************************************************************************\
* Solving a linear system *
\****************************************************************************************/
cv::SVD::backSubst(w, u, v, rhs, dst);
CV_Assert( dst.data == dst0.data );
-}
+}
\ No newline at end of file
};
extern volatile bool USE_SSE2;
+extern volatile bool USE_SSE4_2;
+extern volatile bool USE_AVX;
enum { BLOCK_SIZE = 1024 };
return nz;
}
+template<>
+int countNonZero_<uchar>(const uchar* src, int len)
+{
+ int i=0, nz = 0;
+ #if CV_SSE4_2
+ if(USE_SSE4_2) // ~5-6x faster than the scalar loop
+ {
+ __m128i pattern = _mm_setzero_si128 ();
+ __m128i inv = _mm_set1_epi8((char)1);
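+ // cmpeq marks zero bytes with 0xFF; adding 1 turns that into 0x00
+ // per zero byte and 0x01 per nonzero byte, so a popcount of the
+ // 16 result bytes counts the nonzero elements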
+ unsigned CV_DECL_ALIGNED(16) buf[4];
+ for (; i<=len-16; i+=16)
+ {
+ __m128i r0 = _mm_lddqu_si128((const __m128i*)(src+i));
+ __m128i res = _mm_cmpeq_epi8(r0, pattern);
+ res = _mm_add_epi8(res, inv);//11111111+1=00000000, 00000000+1=00000001
+ _mm_store_si128 ((__m128i*)buf, res);
+
+ // _mm_popcnt_u64 exists only on 64-bit targets (and __int64 is
+ // MSVC-only); 32-bit popcounts keep this portable across compilers
+ // and across x86/x64 builds
+ nz += (int)(_mm_popcnt_u32(buf[0]) + _mm_popcnt_u32(buf[1]) +
+ _mm_popcnt_u32(buf[2]) + _mm_popcnt_u32(buf[3]));
+ }
+ }
+ #endif
+ for( ; i < len; i++ )
+ nz += src[i] != 0;
+ return nz;
+}
+
static int countNonZero8u( const uchar* src, int len )
{ return countNonZero_(src, len); }
cv::extractImageCOI(imgB, b);
return !maskarr ? cv::norm(a, b, normType) : cv::norm(a, b, normType, mask);
-}
+}
\ No newline at end of file
#endif
volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2];
+volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2];
+volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX];
void setUseOptimized( bool flag )
{
}
#endif
-/* End of file. */
+/* End of file. */
\ No newline at end of file