static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
uchar* dst, size_t step, Size size, void* _cmpop)
{
- cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
+ //vz optimized cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
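+ // Only CMP_GT/CMP_LE and CMP_EQ/CMP_NE are handled directly below;
+ // CMP_GE and CMP_LT are reduced to them by swapping the source operands.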
+ int code = *(int*)_cmpop;
+ step1 /= sizeof(src1[0]);
+ step2 /= sizeof(src2[0]);
+ if( code == CMP_GE || code == CMP_LT )
+ {
+ std::swap(src1, src2);
+ std::swap(step1, step2);
+ code = code == CMP_GE ? CMP_LE : CMP_GT;
+ }
+
+ if( code == CMP_GT || code == CMP_LE )
+ {
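+ // m selects the predicate or its negation: XOR with 0 keeps the compare
+ // result, XOR with 255 (all ones) inverts it, so CMP_GT and CMP_LE share one loop.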
+ int m = code == CMP_GT ? 0 : 255;
+ #if CV_SSE2
+ __m128i m128, c128;
+ if( USE_SSE2 ){
+ m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (0xff);
+ c128 = _mm_set1_epi8 (-128);
+ }
+ #endif
+ for( ; size.height--; src1 += step1, src2 += step2, dst += step )
+ {
+ int x = 0;
+ #if CV_SSE2
+ if( USE_SSE2 ){
+ for( ; x <= size.width - 16; x += 16 )
+ {
+ __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
+ __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
+ // SSE2 has no unsigned 8-bit compare; flip both operands into signed range
+ // (subtract -128, i.e. flip the sign bit) so the signed compare orders them correctly
+ r00 = _mm_sub_epi8(r00,c128);
+ r10 = _mm_sub_epi8(r10,c128);
+
+ r00 =_mm_xor_si128(_mm_cmpgt_epi8(r00, r10), m128);
+ _mm_storeu_si128((__m128i*)(dst + x),r00);
+
+ }
+ }
+ #endif
+
+ for( ; x < size.width; x++ ){
+ dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
+ }
+ }
+ }
+ else if( code == CMP_EQ || code == CMP_NE )
+ {
+ int m = code == CMP_EQ ? 0 : 255;
+ #if CV_SSE2
+ __m128i m128;
+ if( USE_SSE2 ){
+ m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8 (0xff);
+ }
+ #endif
+ for( ; size.height--; src1 += step1, src2 += step2, dst += step )
+ {
+ int x = 0;
+ #if CV_SSE2
+ if( USE_SSE2 ){
+ for( ; x <= size.width - 16; x += 16 )
+ {
+ __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
+ __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
+ r00 = _mm_xor_si128 ( _mm_cmpeq_epi8 (r00, r10), m128);
+ _mm_storeu_si128((__m128i*)(dst + x), r00);
+ }
+ }
+ #endif
+ for( ; x < size.width; x++ )
+ dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
+ }
+ }
}
static void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2,
static void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2,
uchar* dst, size_t step, Size size, void* _cmpop)
{
- cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
+ //vz optimized cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
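+ // Same reduction as in cmp8u: GE/LT become LE/GT by swapping the operands.
+ // The SSE2 path compares 16-bit lanes and packs pairs of result vectors to
+ // bytes with _mm_packs_epi16 (each lane is 0 or -1, so saturation yields 0 or 255).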
+
+ int code = *(int*)_cmpop;
+ step1 /= sizeof(src1[0]);
+ step2 /= sizeof(src2[0]);
+ if( code == CMP_GE || code == CMP_LT )
+ {
+ std::swap(src1, src2);
+ std::swap(step1, step2);
+ code = code == CMP_GE ? CMP_LE : CMP_GT;
+ }
+
+ if( code == CMP_GT || code == CMP_LE )
+ {
+ int m = code == CMP_GT ? 0 : 255;
+ #if CV_SSE2
+ __m128i m128;
+ if( USE_SSE2 ){
+ m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (0xffff);
+ }
+ #endif
+ for( ; size.height--; src1 += step1, src2 += step2, dst += step )
+ {
+ int x = 0;
+ #if CV_SSE2
+ if( USE_SSE2 ){
+ for( ; x <= size.width - 16; x += 16 )
+ {
+ __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
+ __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
+ r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128);
+ __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
+ __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
+ r01 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r01, r11), m128);
+ r11 = _mm_packs_epi16(r00, r01);
+ _mm_storeu_si128((__m128i*)(dst + x), r11);
+ }
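+ // process up to 8 remaining pixels with a single compare, packing the
+ // eight 16-bit lane masks into 8 output bytes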
+ if( x <= size.width-8)
+ {
+ __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
+ __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
+ r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128);
+ r10 = _mm_packs_epi16(r00, r00);
+ _mm_storel_epi64((__m128i*)(dst + x), r10);
+
+ x += 8;
+ }
+ }
+ #endif
+
+ for( ; x < size.width; x++ ){
+ dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
+ }
+ }
+ }
+ else if( code == CMP_EQ || code == CMP_NE )
+ {
+ int m = code == CMP_EQ ? 0 : 255;
+ #if CV_SSE2
+ __m128i m128;
+ if( USE_SSE2 ){
+ m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi16 (0xffff);
+ }
+ #endif
+ for( ; size.height--; src1 += step1, src2 += step2, dst += step )
+ {
+ int x = 0;
+ #if CV_SSE2
+ if( USE_SSE2 ){
+ for( ; x <= size.width - 16; x += 16 )
+ {
+ __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
+ __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
+ r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128);
+ __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
+ __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
+ r01 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r01, r11), m128);
+ r11 = _mm_packs_epi16(r00, r01);
+ _mm_storeu_si128((__m128i*)(dst + x), r11);
+ }
+ if( x <= size.width - 8)
+ {
+ __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
+ __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
+ r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128);
+ r10 = _mm_packs_epi16(r00, r00);
+ _mm_storel_epi64((__m128i*)(dst + x), r10);
+
+ x += 8;
+ }
+ }
+ #endif
+ for( ; x < size.width; x++ )
+ dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
+ }
+ }
}
static void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2,
}
}
+//vz optimized template specialization
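+// Specialization for the scaled 16S -> 16S conversion: each SSE2 iteration loads
+// 8 shorts, sign-extends them to 32-bit, converts to float, applies scale and shift,
+// rounds back to int32 and packs to shorts with signed saturation, matching the
+// scalar saturate_cast<short> tail loop.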
+template<> void
+cvtScale_<short, short, float>( const short* src, size_t sstep,
+ short* dst, size_t dstep, Size size,
+ float scale, float shift )
+{
+ sstep /= sizeof(src[0]);
+ dstep /= sizeof(dst[0]);
+
+ #if CV_SSE2
+ __m128 scale128, shift128;
+ if(USE_SSE2){
+ scale128 = _mm_set1_ps (scale);
+ shift128 = _mm_set1_ps (shift);
+ }
+ #endif
+
+ for( ; size.height--; src += sstep, dst += dstep )
+ {
+ int x = 0;
+ #if CV_SSE2
+ if(USE_SSE2)
+ {
+ for(; x <= size.width - 8; x += 8 )
+ {
+ __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
+ __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
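+ // sign-extend the four 16-bit values in each half to 32-bit: interleave each
+ // word with itself, then arithmetic-shift the resulting dwords right by 16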
+ __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
+ __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
+ rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
+ rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
+ r0 = _mm_cvtps_epi32(rf0);
+ r1 = _mm_cvtps_epi32(rf1);
+ r0 = _mm_packs_epi32(r0, r1);
+ _mm_storeu_si128((__m128i*)(dst + x), r0);
+ }
+ }
+ #endif
+
+ for(; x < size.width; x++ )
+ dst[x] = saturate_cast<short>(src[x]*scale + shift);
+ }
+}
+
template<typename T, typename DT> static void
cvt_( const T* src, size_t sstep,