r0 = op16(r0,_mm_loadu_si128((const __m128i*)(src2 + x)));
r1 = op16(r1,_mm_loadu_si128((const __m128i*)(src2 + x + 8)));
_mm_storeu_si128((__m128i*)(dst + x), r0);
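+ // r1 was computed from src2 + x + 8 (eight 16-bit lanes per vector), so it stores to dst + x + 8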
- _mm_storeu_si128((__m128i*)(dst + x + 16), r1);
+ _mm_storeu_si128((__m128i*)(dst + x + 8), r1);
}
for( ; x <= sz.width - 8; x += 8 )
{
r0 = op32(r0,_mm_load_si128((const __m128i*)(src2 + x)));
r1 = op32(r1,_mm_load_si128((const __m128i*)(src2 + x + 4)));
_mm_store_si128((__m128i*)(dst + x), r0);
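+ // four 32-bit lanes per vector: the second store belongs at dst + x + 4, not x + 16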
- _mm_store_si128((__m128i*)(dst + x + 16), r1);
+ _mm_store_si128((__m128i*)(dst + x + 4), r1);
}
else
for( ; x <= sz.width - 8; x += 8 )
{
r0 = op32(r0,_mm_loadu_si128((const __m128i*)(src2 + x)));
r1 = op32(r1,_mm_loadu_si128((const __m128i*)(src2 + x + 4)));
_mm_storeu_si128((__m128i*)(dst + x), r0);
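+ // same second-store offset fix for the unaligned path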
- _mm_storeu_si128((__m128i*)(dst + x + 16), r1);
+ _mm_storeu_si128((__m128i*)(dst + x + 4), r1);
}
}
#endif
struct _VAnd8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_and_si128(a,b); }};
struct _VOr8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_or_si128(a,b); }};
struct _VXor8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_xor_si128(a,b); }};
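+// _mm_andnot_si128(zero, a) evaluates (~zero & a) == a, i.e. a no-op; bitwise NOT is a XOR all-ones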
-struct _VNot8u { __m128i operator()(const __m128i& a, const __m128i&) const { return _mm_andnot_si128(_mm_setzero_si128(),a); }};
+struct _VNot8u { __m128i operator()(const __m128i& a, const __m128i&) const { return _mm_xor_si128(_mm_set1_epi32(-1),a); }};
#endif
for( i = 0; i < n; i++ )
{
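+ // sd is the double accumulator (renamed from s, presumably to avoid a clash with another local named s)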
- for( k = 0, s = 0; k < m; k++ )
+ for( k = 0, sd = 0; k < m; k++ )
{
_Tp t = At[i*astep + k];
- s += (double)t*t;
+ sd += (double)t*t;
}
- W[i] = s;
+ W[i] = sd;
if( Vt )
{
#if CV_SSE2
if( USE_SSE2 )
{
- int j, len0 = len & -4, blockSize0 = (1 << 15), blockSize;
+ int j, len0 = len & -4, blockSize0 = (1 << 13), blockSize;
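+ // presumably lowered from 2^15 so the packed 32-bit accumulators stay well clear of overflow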
__m128i z = _mm_setzero_si128();
while( i < len0 )
{
blockSize = std::min(len0 - i, blockSize0);
__m128i s = _mm_setzero_si128();
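+ // j's initialization is hoisted out of the header so any scalar tail loop after this one can resume at j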
- for( j = 0; j <= blockSize - 16; j += 16 )
+ j = 0;
+ for( ; j <= blockSize - 16; j += 16 )
{
__m128i b0 = _mm_loadu_si128((const __m128i*)(src1 + j));
__m128i b1 = _mm_loadu_si128((const __m128i*)(src2 + j));
__m128i s0, s1, s2, s3;
s0 = _mm_unpacklo_epi8(b0, z);
s2 = _mm_unpackhi_epi8(b0, z);
s1 = _mm_unpacklo_epi8(b1, z);
s3 = _mm_unpackhi_epi8(b1, z);
s0 = _mm_madd_epi16(s0, s1);
- s1 = _mm_madd_epi16(s2, s3);
+ s2 = _mm_madd_epi16(s2, s3);
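+ // the high-half product must land in s2 (accumulated below); the old code clobbered s1 and summed the raw s2 instead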
s = _mm_add_epi32(s, s0);
s = _mm_add_epi32(s, s2);
}
if( USE_SSE2 )
{
float CV_DECL_ALIGNED(16) buf[4];
- static const float CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
+ static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
__m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
- __m128 absmask = _mm_load_ps(absbuf);
+ __m128 absmask = _mm_load_ps((const float*)absbuf);
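+ // 0x7fffffff as a float initializer stores (float)INT_MAX, whose bit pattern is not the
+ // sign-clearing mask; the int array plus pointer cast keeps the exact 0x7fffffff bits in each lane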
for( ; j <= n - 8; j += 8 )
{
CvMat _w = w, _wdb = wdb;
// use exactly the same threshold as in icvSVD...,
// so the changes in the library and here should be synchronized.
- double threshold = cv::sum(w)[0]*(is_float ? FLT_EPSILON*10 : DBL_EPSILON*2);
+ double threshold = cv::sum(w)[0]*(DBL_EPSILON*2);//(is_float ? FLT_EPSILON*10 : DBL_EPSILON*2);
wdb = Scalar::all(0);
for( i = 0; i < min_size; i++ )