i += v_uint32::nlanes;
}
}
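+ // vx_cleanup() resets wide-SIMD state before returning to scalar code (a no-op on most targets, zeroupper/zeroall on AVX builds)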
+ vx_cleanup();
return i;
}
}
}
+ vx_cleanup();
return i;
}
int operator()(const uchar** _src, uchar* dst, int width) const
{
int _ksize = kernel.rows + kernel.cols - 1;
+ if( _ksize == 1 )
+ return 0;
int ksize2 = _ksize/2;
const float* ky = kernel.ptr<float>() + ksize2;
int i = 0, k;
v_float32 d4 = vx_setall_f32(delta);
if( symmetrical )
{
- if (_ksize == 1)
- return 0;
v_float32 f0 = vx_setall_f32(ky[0]);
+ v_float32 f1 = vx_setall_f32(ky[1]);
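+ // the ky[1] tap is applied once outside the k-loop, so the inner loops below start at k = 2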
for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
{
const int* S = src[0] + i;
v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4);
v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4);
v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S + 2*v_int32::nlanes)), f0, d4);
v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S + 3*v_int32::nlanes)), f0, d4);
- for( k = 1; k <= ksize2; k++ )
+ const int* S0 = src[1] + i;
+ const int* S1 = src[-1] + i;
+ s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0);
+ s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1);
+ s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2 * v_int32::nlanes) + vx_load(S1 + 2 * v_int32::nlanes)), f1, s2);
+ s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3 * v_int32::nlanes) + vx_load(S1 + 3 * v_int32::nlanes)), f1, s3);
+ for( k = 2; k <= ksize2; k++ )
{
v_float32 f = vx_setall_f32(ky[k]);
- const int* S0 = src[k] + i;
- const int* S1 = src[-k] + i;
+ S0 = src[k] + i;
+ S1 = src[-k] + i;
s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0);
s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1);
s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) + vx_load(S1 + 2*v_int32::nlanes)), f, s2);
const int* S = src[0] + i;
v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4);
v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4);
- for( k = 1; k <= ksize2; k++ )
+ const int* S0 = src[1] + i;
+ const int* S1 = src[-1] + i;
+ s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0);
+ s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1);
+ for( k = 2; k <= ksize2; k++ )
{
v_float32 f = vx_setall_f32(ky[k]);
- const int* S0 = src[k] + i;
- const int* S1 = src[-k] + i;
+ S0 = src[k] + i;
+ S1 = src[-k] + i;
s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0);
s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1);
}
#endif
{
v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[0] + i)), v_setall_f32(ky[0]), v_setall_f32(delta));
- for( k = 1; k <= ksize2; k++ )
+ s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) + v_load(src[-1] + i)), v_setall_f32(ky[1]), s0);
+ for( k = 2; k <= ksize2; k++ )
s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) + v_load(src[-k] + i)), v_setall_f32(ky[k]), s0);
v_int32x4 s32 = v_round(s0);
v_int16x8 s16 = v_pack(s32, s32);
}
else
{
+ v_float32 f1 = vx_setall_f32(ky[1]);
for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
{
- v_float32 s0 = d4;
- v_float32 s1 = d4;
- v_float32 s2 = d4;
- v_float32 s3 = d4;
- for ( k = 1; k <= ksize2; k++ )
+ const int* S0 = src[1] + i;
+ const int* S1 = src[-1] + i;
+ v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f1, d4);
+ v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f1, d4);
+ v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2 * v_int32::nlanes) - vx_load(S1 + 2 * v_int32::nlanes)), f1, d4);
+ v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3 * v_int32::nlanes) - vx_load(S1 + 3 * v_int32::nlanes)), f1, d4);
+ for ( k = 2; k <= ksize2; k++ )
{
v_float32 f = vx_setall_f32(ky[k]);
- const int* S0 = src[k] + i;
- const int* S1 = src[-k] + i;
+ S0 = src[k] + i;
+ S1 = src[-k] + i;
s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0);
s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1);
s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) - vx_load(S1 + 2*v_int32::nlanes)), f, s2);
}
if( i <= width - v_uint16::nlanes )
{
- v_float32 s0 = d4;
- v_float32 s1 = d4;
- for ( k = 1; k <= ksize2; k++ )
+ const int* S0 = src[1] + i;
+ const int* S1 = src[-1] + i;
+ v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f1, d4);
+ v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f1, d4);
+ for ( k = 2; k <= ksize2; k++ )
{
v_float32 f = vx_setall_f32(ky[k]);
- const int* S0 = src[k] + i;
- const int* S1 = src[-k] + i;
+ S0 = src[k] + i;
+ S1 = src[-k] + i;
s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0);
s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1);
}
if( i <= width - v_int32x4::nlanes )
#endif
{
- v_float32x4 s0 = v_setall_f32(delta);
- for (k = 1; k <= ksize2; k++)
+ v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) - v_load(src[-1] + i)), v_setall_f32(ky[1]), v_setall_f32(delta));
+ for (k = 2; k <= ksize2; k++)
s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) - v_load(src[-k] + i)), v_setall_f32(ky[k]), s0);
v_int32x4 s32 = v_round(s0);
v_int16x8 s16 = v_pack(s32, s32);
}
}
+ vx_cleanup();
return i;
}
short* dst = (short*)_dst;
v_float32 df4 = vx_setall_f32(delta);
- v_int32 d4 = v_round(df4);
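+ // delta is rounded once: packed 16-bit results add the int16 constant d8, the int32 tails use vx_setall_s32(d)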
+ int d = cvRound(delta);
+ v_int16 d8 = vx_setall_s16((short)d);
if( symmetrical )
{
if( ky[0] == 2 && ky[1] == 1 )
{
- for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
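+ // main loop handles 2*v_int16::nlanes pixels per iteration; single-width and int32 tails follow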
+ for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
+ {
+ v_int32 s0 = vx_load(S1 + i);
+ v_int32 s1 = vx_load(S1 + i + v_int32::nlanes);
+ v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes);
+ v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes);
+ v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (s0 + s0), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (s1 + s1)) + d8);
+ v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) + (s2 + s2),
+ vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) + (s3 + s3)) + d8);
+ }
+ if( i <= width - v_int16::nlanes )
{
v_int32 sl = vx_load(S1 + i);
v_int32 sh = vx_load(S1 + i + v_int32::nlanes);
- v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 + (sh + sh)));
+ v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (sh + sh)) + d8);
+ i += v_int16::nlanes;
}
if( i <= width - v_int32::nlanes )
{
v_int32 s = vx_load(S1 + i);
- v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (s + s));
+ v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) + (s + s));
i += v_int32::nlanes;
}
}
else if( ky[0] == -2 && ky[1] == 1 )
{
- for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+ for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
+ {
+ v_int32 s0 = vx_load(S1 + i);
+ v_int32 s1 = vx_load(S1 + i + v_int32::nlanes);
+ v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes);
+ v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes);
+ v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (s0 + s0),
+ vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (s1 + s1)) + d8);
+ v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) - (s2 + s2),
+ vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) - (s3 + s3)) + d8);
+ }
+ if( i <= width - v_int16::nlanes )
{
v_int32 sl = vx_load(S1 + i);
v_int32 sh = vx_load(S1 + i + v_int32::nlanes);
- v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 - (sh + sh)));
+ v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (sh + sh)) + d8);
+ i += v_int16::nlanes;
}
if( i <= width - v_int32::nlanes )
{
v_int32 s = vx_load(S1 + i);
- v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (s + s));
+ v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) - (s + s));
i += v_int32::nlanes;
}
}
+#if CV_NEON
else if( ky[0] == (float)((int)ky[0]) && ky[1] == (float)((int)ky[1]) )
{
v_int32 k0 = vx_setall_s32((int)ky[0]), k1 = vx_setall_s32((int)ky[1]);
- for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+ v_int32 d4 = vx_setall_s32(d);
+ for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
+ {
v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)),
v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4))));
+ v_store(dst + i + v_int16::nlanes, v_pack(v_muladd(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 2*v_int32::nlanes), k0, d4)),
+ v_muladd(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 3*v_int32::nlanes), k0, d4))));
+ }
+ if( i <= width - v_int16::nlanes )
+ {
+ v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)),
+ v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4))));
+ i += v_int16::nlanes;
+ }
if( i <= width - v_int32::nlanes )
{
v_pack_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)));
i += v_int32::nlanes;
}
}
+#endif
else
{
v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]);
- for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+ for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
+ {
+ v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))),
+ v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4)))));
+ v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 2*v_int32::nlanes)), k0, df4))),
+ v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 3*v_int32::nlanes)), k0, df4)))));
+ }
+ if( i <= width - v_int16::nlanes )
+ {
v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))),
v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4)))));
+ i += v_int16::nlanes;
+ }
if( i <= width - v_int32::nlanes )
{
v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))));
{
if( ky[1] < 0 )
std::swap(S0, S2);
- for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
- v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i) + d4, vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes) + d4));
+ for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
+ {
+ v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8);
+ v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes), vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)) + d8);
+ }
+ if( i <= width - v_int16::nlanes )
+ {
+ v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8);
+ i += v_int16::nlanes;
+ }
if( i <= width - v_int32::nlanes )
{
- v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4);
+ v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + vx_setall_s32(d));
i += v_int32::nlanes;
}
}
else
{
v_float32 k1 = vx_setall_f32(ky[1]);
- for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+ for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
+ {
+ v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)),
+ v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4))));
+ v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes)), k1, df4)),
+ v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)), k1, df4))));
+ }
+ if( i <= width - v_int16::nlanes )
+ {
v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)),
v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4))));
+ i += v_int16::nlanes;
+ }
if( i <= width - v_int32::nlanes )
{
v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)));
}
}
+ vx_cleanup();
return i;
}
const float* _kx = kernel.ptr<float>();
width *= cn;
- for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
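+ // 2x unroll: each iteration expands 2*v_int16::nlanes shorts into four float32 accumulators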
+ for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
+ {
+ const short* src = (const short*)_src + i;
+ v_float32 s0 = vx_setzero_f32();
+ v_float32 s1 = vx_setzero_f32();
+ v_float32 s2 = vx_setzero_f32();
+ v_float32 s3 = vx_setzero_f32();
+ for( k = 0; k < _ksize; k++, src += cn )
+ {
+ v_float32 f = vx_setall_f32(_kx[k]);
+ v_int16 xl = vx_load(src);
+ v_int16 xh = vx_load(src + v_int16::nlanes);
+ s0 = v_muladd(v_cvt_f32(v_expand_low(xl)), f, s0);
+ s1 = v_muladd(v_cvt_f32(v_expand_high(xl)), f, s1);
+ s2 = v_muladd(v_cvt_f32(v_expand_low(xh)), f, s2);
+ s3 = v_muladd(v_cvt_f32(v_expand_high(xh)), f, s3);
+ }
+ v_store(dst + i, s0);
+ v_store(dst + i + v_float32::nlanes, s1);
+ v_store(dst + i + 2*v_float32::nlanes, s2);
+ v_store(dst + i + 3*v_float32::nlanes, s3);
+ }
+ if( i <= width - v_int16::nlanes )
{
const short* src = (const short*)_src + i;
v_float32 s0 = vx_setzero_f32();
v_float32 s1 = vx_setzero_f32();
for( k = 0; k < _ksize; k++, src += cn )
{
+ v_float32 f = vx_setall_f32(_kx[k]);
v_int16 x = vx_load(src);
- s0 = v_muladd(v_cvt_f32(v_expand_low(x)), vx_setall_f32(_kx[k]), s0);
- s1 = v_muladd(v_cvt_f32(v_expand_high(x)), vx_setall_f32(_kx[k]), s1);
+ s0 = v_muladd(v_cvt_f32(v_expand_low(x)), f, s0);
+ s1 = v_muladd(v_cvt_f32(v_expand_high(x)), f, s1);
}
v_store(dst + i, s0);
v_store(dst + i + v_float32::nlanes, s1);
+ i += v_int16::nlanes;
}
if( i <= width - v_float32::nlanes )
{
v_store(dst + i, s0);
i += v_float32::nlanes;
}
+ vx_cleanup();
return i;
}
int operator()(const uchar** _src, uchar* _dst, int width) const
{
int _ksize = kernel.rows + kernel.cols - 1;
+ if( _ksize == 1 )
+ return 0;
int ksize2 = _ksize / 2;
const float* ky = kernel.ptr<float>() + ksize2;
int i = 0, k;
v_float32 d4 = vx_setall_f32(delta);
if( symmetrical )
{
- if (_ksize == 1)
- return 0;
v_float32 k0 = vx_setall_f32(ky[0]);
- for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+ v_float32 k1 = vx_setall_f32(ky[1]);
+ for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
{
v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4);
- for( k = 1; k <= ksize2; k++ )
+ v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4);
+ v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4);
+ s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0);
+ s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1);
+ s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) + vx_load(src[-1] + i + 2*v_float32::nlanes), k1, s2);
+ s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) + vx_load(src[-1] + i + 3*v_float32::nlanes), k1, s3);
+ for( k = 2; k <= ksize2; k++ )
{
- v_float32 k1 = vx_setall_f32(ky[k]);
- s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0);
- s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1);
+ v_float32 k2 = vx_setall_f32(ky[k]);
+ s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k2, s0);
+ s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1);
+ s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2);
+ s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3);
+ }
+ v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+ v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3)));
+ }
+ if( i <= width - v_int16::nlanes )
+ {
+ v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
+ v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4);
+ s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0);
+ s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1);
+ for( k = 2; k <= ksize2; k++ )
+ {
+ v_float32 k2 = vx_setall_f32(ky[k]);
+ s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k2, s0);
+ s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1);
}
v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+ i += v_int16::nlanes;
}
if( i <= width - v_float32::nlanes )
{
v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
- for( k = 1; k <= ksize2; k++ )
+ s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0);
+ for( k = 2; k <= ksize2; k++ )
s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
v_pack_store(dst + i, v_round(s0));
i += v_float32::nlanes;
}
else
{
- for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+ v_float32 k1 = vx_setall_f32(ky[1]);
+ for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
{
- v_float32 s0 = d4;
- v_float32 s1 = d4;
- for( k = 1; k <= ksize2; k++ )
+ v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4);
+ v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4);
+ v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4);
+ v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4);
+ for( k = 2; k <= ksize2; k++ )
{
- v_float32 k1 = vx_setall_f32(ky[k]);
- s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k1, s0);
- s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k1, s1);
+ v_float32 k2 = vx_setall_f32(ky[k]);
+ s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0);
+ s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1);
+ s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2);
+ s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3);
+ }
+ v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+ v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3)));
+ }
+ if( i <= width - v_int16::nlanes )
+ {
+ v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4);
+ v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4);
+ for( k = 2; k <= ksize2; k++ )
+ {
+ v_float32 k2 = vx_setall_f32(ky[k]);
+ s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0);
+ s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1);
}
v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+ i += v_int16::nlanes;
}
if( i <= width - v_float32::nlanes )
{
- v_float32 s0 = d4;
- for( k = 1; k <= ksize2; k++ )
+ v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4);
+ for( k = 2; k <= ksize2; k++ )
s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
v_pack_store(dst + i, v_round(s0));
i += v_float32::nlanes;
}
}
+ vx_cleanup();
return i;
}
}
#endif
int _ksize = kernel.rows + kernel.cols - 1;
+ CV_DbgAssert(_ksize > 0);
const float* src0 = (const float*)_src;
float* dst = (float*)_dst;
const float* _kx = kernel.ptr<float>();
if (haveAVX2)
return RowVec_32f_AVX(src0, _kx, dst, width, cn, _ksize);
#endif
- for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+ v_float32 k0 = vx_setall_f32(_kx[0]);
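+ // the first tap (_kx[0]) is a plain multiply; remaining taps accumulate with v_muladd (4x, 2x and 1x width tails below)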
+ for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes )
{
const float* src = src0 + i;
- v_float32 s0 = vx_setzero_f32();
- for( k = 0; k < _ksize; k++, src += cn )
+ v_float32 s0 = vx_load(src) * k0;
+ v_float32 s1 = vx_load(src + v_float32::nlanes) * k0;
+ v_float32 s2 = vx_load(src + 2*v_float32::nlanes) * k0;
+ v_float32 s3 = vx_load(src + 3*v_float32::nlanes) * k0;
+ src += cn;
+ for( k = 1; k < _ksize; k++, src += cn )
+ {
+ v_float32 k1 = vx_setall_f32(_kx[k]);
+ s0 = v_muladd(vx_load(src), k1, s0);
+ s1 = v_muladd(vx_load(src + v_float32::nlanes), k1, s1);
+ s2 = v_muladd(vx_load(src + 2*v_float32::nlanes), k1, s2);
+ s3 = v_muladd(vx_load(src + 3*v_float32::nlanes), k1, s3);
+ }
+ v_store(dst + i, s0);
+ v_store(dst + i + v_float32::nlanes, s1);
+ v_store(dst + i + 2*v_float32::nlanes, s2);
+ v_store(dst + i + 3*v_float32::nlanes, s3);
+ }
+ if( i <= width - 2*v_float32::nlanes )
+ {
+ const float* src = src0 + i;
+ v_float32 s0 = vx_load(src) * k0;
+ v_float32 s1 = vx_load(src + v_float32::nlanes) * k0;
+ src += cn;
+ for( k = 1; k < _ksize; k++, src += cn )
+ {
+ v_float32 k1 = vx_setall_f32(_kx[k]);
+ s0 = v_muladd(vx_load(src), k1, s0);
+ s1 = v_muladd(vx_load(src + v_float32::nlanes), k1, s1);
+ }
+ v_store(dst + i, s0);
+ v_store(dst + i + v_float32::nlanes, s1);
+ i += 2*v_float32::nlanes;
+ }
+ if( i <= width - v_float32::nlanes )
+ {
+ const float* src = src0 + i;
+ v_float32 s0 = vx_load(src) * k0;
+ src += cn;
+ for( k = 1; k < _ksize; k++, src += cn )
s0 = v_muladd(vx_load(src), vx_setall_f32(_kx[k]), s0);
v_store(dst + i, s0);
+ i += v_float32::nlanes;
}
+ vx_cleanup();
return i;
}
int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
{
int i = 0, _ksize = kernel.rows + kernel.cols - 1;
+ if( _ksize == 1 )
+ return 0;
float* dst = (float*)_dst;
const float* src = (const float*)_src + (_ksize/2)*cn;
bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
if( symmetrical )
{
- if( _ksize == 1 )
- return 0;
if( _ksize == 3 )
{
if( fabs(kx[0]) == 2 && kx[1] == 1 )
{
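+ // |kx[0]| == 2, kx[1] == 1: with FMA use v_muladd, otherwise replace the multiply by (x + x)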
+#if CV_FMA3 || CV_AVX2
v_float32 k0 = vx_setall_f32(kx[0]);
for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - cn) + vx_load(src + cn)));
+#else
+ if( kx[0] > 0 )
+ for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+ {
+ v_float32 x = vx_load(src);
+ v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) + (x + x));
+ }
+ else
+ for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+ {
+ v_float32 x = vx_load(src);
+ v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) - (x + x));
+ }
+#endif
}
else
{
{
if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
{
+#if CV_FMA3 || CV_AVX2
v_float32 k0 = vx_setall_f32(-2);
for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - 2*cn) + vx_load(src + 2*cn)));
+#else
+ for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+ {
+ v_float32 x = vx_load(src);
+ v_store(dst + i, vx_load(src - 2*cn) + vx_load(src + 2*cn) - (x + x));
+ }
+#endif
}
else
{
}
}
+ vx_cleanup();
return i;
}
return SymmColumnVec_32f_Symm_AVX(src, ky, dst, delta, width, ksize2);
#endif
const v_float32 d4 = vx_setall_f32(delta);
- for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+ const v_float32 k0 = vx_setall_f32(ky[0]);
+ for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes )
+ {
+ v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
+ v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4);
+ v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4);
+ v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4);
+ for( k = 1; k <= ksize2; k++ )
+ {
+ v_float32 k1 = vx_setall_f32(ky[k]);
+ s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0);
+ s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1);
+ s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k1, s2);
+ s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 3*v_float32::nlanes), k1, s3);
+ }
+ v_store(dst + i, s0);
+ v_store(dst + i + v_float32::nlanes, s1);
+ v_store(dst + i + 2*v_float32::nlanes, s2);
+ v_store(dst + i + 3*v_float32::nlanes, s3);
+ }
+ if( i <= width - 2*v_float32::nlanes )
+ {
+ v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
+ v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4);
+ for( k = 1; k <= ksize2; k++ )
+ {
+ v_float32 k1 = vx_setall_f32(ky[k]);
+ s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0);
+ s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1);
+ }
+ v_store(dst + i, s0);
+ v_store(dst + i + v_float32::nlanes, s1);
+ i += 2*v_float32::nlanes;
+ }
+ if( i <= width - v_float32::nlanes )
{
- v_float32 s0 = v_muladd(vx_load(src[0] + i), vx_setall_f32(ky[0]), d4);
+ v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
for( k = 1; k <= ksize2; k++ )
s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
v_store(dst + i, s0);
+ i += v_float32::nlanes;
}
}
else
if (haveAVX2)
return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2);
#endif
+ CV_DbgAssert(ksize2 > 0);
const v_float32 d4 = vx_setall_f32(delta);
- for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+ const v_float32 k1 = vx_setall_f32(ky[1]);
+ for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes )
{
- v_float32 s0 = d4;
- for( k = 1; k <= ksize2; k++ )
+ v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4);
+ v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4);
+ v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4);
+ v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4);
+ for( k = 2; k <= ksize2; k++ )
+ {
+ v_float32 k2 = vx_setall_f32(ky[k]);
+ s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0);
+ s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1);
+ s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2);
+ s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3);
+ }
+ v_store(dst + i, s0);
+ v_store(dst + i + v_float32::nlanes, s1);
+ v_store(dst + i + 2*v_float32::nlanes, s2);
+ v_store(dst + i + 3*v_float32::nlanes, s3);
+ }
+ if( i <= width - 2*v_float32::nlanes )
+ {
+ v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4);
+ v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4);
+ for( k = 2; k <= ksize2; k++ )
+ {
+ v_float32 k2 = vx_setall_f32(ky[k]);
+ s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0);
+ s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1);
+ }
+ v_store(dst + i, s0);
+ v_store(dst + i + v_float32::nlanes, s1);
+ i += 2*v_float32::nlanes;
+ }
+ if( i <= width - v_float32::nlanes )
+ {
+ v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4);
+ for( k = 2; k <= ksize2; k++ )
s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
v_store(dst + i, s0);
+ i += v_float32::nlanes;
}
}
+ vx_cleanup();
return i;
}
{
if( fabs(ky[0]) == 2 && ky[1] == 1 )
{
+#if CV_FMA3 || CV_AVX2
v_float32 k0 = vx_setall_f32(ky[0]);
- for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+ for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
v_store(dst + i, v_muladd(vx_load(S1 + i), k0, vx_load(S0 + i) + vx_load(S2 + i) + d4));
+#else
+ if(ky[0] > 0)
+ for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+ {
+ v_float32 x = vx_load(S1 + i);
+ v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (x + x));
+ }
+ else
+ for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+ {
+ v_float32 x = vx_load(S1 + i);
+ v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (x + x));
+ }
+#endif
}
else
{
}
}
+ vx_cleanup();
return i;
}
int operator()(const uchar** src, uchar* dst, int width) const
{
+ CV_DbgAssert(_nz > 0);
const float* kf = (const float*)&coeffs[0];
int i = 0, k, nz = _nz;
v_float32 d4 = vx_setall_f32(delta);
+ v_float32 f0 = vx_setall_f32(kf[0]);
for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
{
- v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
- for( k = 0; k < nz; k++ )
+ v_uint16 xl, xh;
+ v_expand(vx_load(src[0] + i), xl, xh);
+ v_uint32 x0, x1, x2, x3;
+ v_expand(xl, x0, x1);
+ v_expand(xh, x2, x3);
+ v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f0, d4);
+ v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f0, d4);
+ v_float32 s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x2)), f0, d4);
+ v_float32 s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x3)), f0, d4);
+ for( k = 1; k < nz; k++ )
{
v_float32 f = vx_setall_f32(kf[k]);
- v_uint16 xl, xh;
v_expand(vx_load(src[k] + i), xl, xh);
- v_uint32 x0, x1, x2, x3;
v_expand(xl, x0, x1);
v_expand(xh, x2, x3);
s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0);
}
if( i <= width - v_uint16::nlanes )
{
- v_float32 s0 = d4, s1 = d4;
- for( k = 0; k < nz; k++ )
+ v_uint32 x0, x1;
+ v_expand(vx_load_expand(src[0] + i), x0, x1);
+ v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f0, d4);
+ v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f0, d4);
+ for( k = 1; k < nz; k++ )
{
v_float32 f = vx_setall_f32(kf[k]);
- v_uint32 x0, x1;
v_expand(vx_load_expand(src[k] + i), x0, x1);
s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0);
s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1);
if( i <= width - v_int32x4::nlanes )
#endif
{
- v_float32x4 s0 = v_setall_f32(delta);
- for( k = 0; k < nz; k++ )
+ v_float32x4 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[0] + i))), v_setall_f32(kf[0]), v_setall_f32(delta));
+ for( k = 1; k < nz; k++ )
s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[k] + i))), v_setall_f32(kf[k]), s0);
v_int32x4 s32 = v_round(s0);
v_int16x8 s16 = v_pack(s32, s32);
i += v_int32x4::nlanes;
}
+ vx_cleanup();
return i;
}
int operator()(const uchar** src, uchar* _dst, int width) const
{
+ CV_DbgAssert(_nz > 0);
const float* kf = (const float*)&coeffs[0];
short* dst = (short*)_dst;
int i = 0, k, nz = _nz;
v_float32 d4 = vx_setall_f32(delta);
+ v_float32 f0 = vx_setall_f32(kf[0]);
for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
{
- v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
- for( k = 0; k < nz; k++ )
+ v_uint16 xl, xh;
+ v_expand(vx_load(src[0] + i), xl, xh);
+ v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f0, d4);
+ v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f0, d4);
+ v_float32 s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xh))), f0, d4);
+ v_float32 s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xh))), f0, d4);
+ for( k = 1; k < nz; k++ )
{
v_float32 f = vx_setall_f32(kf[k]);
- v_uint16 xl, xh;
v_expand(vx_load(src[k] + i), xl, xh);
s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f, s0);
s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f, s1);
}
if( i <= width - v_uint16::nlanes )
{
- v_float32 s0 = d4, s1 = d4;
- for( k = 0; k < nz; k++ )
+ v_uint16 x = vx_load_expand(src[0] + i);
+ v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f0, d4);
+ v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f0, d4);
+ for( k = 1; k < nz; k++ )
{
v_float32 f = vx_setall_f32(kf[k]);
- v_uint16 x = vx_load_expand(src[k] + i);
+ x = vx_load_expand(src[k] + i);
s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f, s0);
s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f, s1);
}
}
if( i <= width - v_int32::nlanes )
{
- v_float32 s0 = d4;
- for( k = 0; k < nz; k++ )
+ v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[0] + i))), f0, d4);
+ for( k = 1; k < nz; k++ )
s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0);
v_pack_store(dst + i, v_round(s0));
i += v_int32::nlanes;
}
+ vx_cleanup();
return i;
}
int i = 0, k, nz = _nz;
v_float32 d4 = vx_setall_f32(delta);
- for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+ v_float32 f0 = vx_setall_f32(kf[0]);
+ for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes )
{
- v_float32 s0 = d4;
- for( k = 0; k < nz; k++ )
+ v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4);
+ v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4);
+ v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), f0, d4);
+ v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), f0, d4);
+ for( k = 1; k < nz; k++ )
+ {
+ v_float32 f1 = vx_setall_f32(kf[k]);
+ s0 = v_muladd(vx_load(src[k] + i), f1, s0);
+ s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1);
+ s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes), f1, s2);
+ s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes), f1, s3);
+ }
+ v_store(dst + i, s0);
+ v_store(dst + i + v_float32::nlanes, s1);
+ v_store(dst + i + 2*v_float32::nlanes, s2);
+ v_store(dst + i + 3*v_float32::nlanes, s3);
+ }
+ if( i <= width - 2*v_float32::nlanes )
+ {
+ v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4);
+ v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4);
+ for( k = 1; k < nz; k++ )
+ {
+ v_float32 f1 = vx_setall_f32(kf[k]);
+ s0 = v_muladd(vx_load(src[k] + i), f1, s0);
+ s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1);
+ }
+ v_store(dst + i, s0);
+ v_store(dst + i + v_float32::nlanes, s1);
+ i += 2*v_float32::nlanes;
+ }
+ if( i <= width - v_float32::nlanes )
+ {
+ v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4);
+ for( k = 1; k < nz; k++ )
s0 = v_muladd(vx_load(src[k] + i), vx_setall_f32(kf[k]), s0);
v_store(dst + i, s0);
+ i += v_float32::nlanes;
}
+ vx_cleanup();
return i;
}
{
CV_FilterBaseTest::get_test_array_types_and_sizes( test_case_idx, sizes, types );
RNG& rng = ts->get_rng();
- int depth = cvtest::randInt(rng)%3;
+ int depth = cvtest::randInt(rng)%4;
int cn = CV_MAT_CN(types[INPUT][0]);
- depth = depth == 0 ? CV_8U : depth == 1 ? CV_16U : CV_32F;
+ depth = depth == 0 ? CV_8U : depth == 1 ? CV_16U : depth == 2 ? CV_16S : CV_32F;
types[INPUT][0] = types[OUTPUT][0] = types[REF_OUTPUT][0] = CV_MAKETYPE(depth, cn);
}
{
RNG& rng = ts->get_rng();
CV_FilterBaseTest::get_test_array_types_and_sizes( test_case_idx, sizes, types );
- int depth = cvtest::randInt(rng) % 2;
- depth = depth == 0 ? CV_8U : CV_32F;
+ int depth = cvtest::randInt(rng) % 4;
+ depth = depth == 0 ? CV_8U : depth == 1 ? CV_16U : depth == 2 ? CV_16S : CV_32F;
types[INPUT][0] = CV_MAKETYPE(depth,1);
- types[OUTPUT][0] = types[REF_OUTPUT][0] = CV_MAKETYPE(depth==CV_8U?CV_16S:CV_32F,1);
+ int sameDepth = cvtest::randInt(rng) % 2;
+ types[OUTPUT][0] = types[REF_OUTPUT][0] = sameDepth ? depth : CV_MAKETYPE(depth==CV_8U?CV_16S:CV_32F,1);
_aperture_size = (cvtest::randInt(rng)%5)*2 - 1;
sizes[INPUT][1] = aperture_size = cvSize(_aperture_size, _aperture_size);
}
ASSERT_EQ(0.0, cvtest::norm(dst_hires(Rect(516, 516, 1016, 1016)), dst_ref(Rect(4, 4, 1016, 1016)), NORM_INF));
}
+
+TEST(Imgproc_Sobel, s16_regression_13506)
+{
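+ // 8x16 CV_16S input and its expected Sobel dy response (ksize = 5)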
+ Mat src = (Mat_<short>(8, 16) << 127, 138, 130, 102, 118, 97, 76, 84, 124, 90, 146, 63, 130, 87, 212, 85,
+ 164, 3, 51, 124, 151, 89, 154, 117, 36, 88, 116, 117, 180, 112, 147, 124,
+ 63, 50, 115, 103, 83, 148, 106, 79, 213, 106, 135, 53, 79, 106, 122, 112,
+ 218, 107, 81, 126, 78, 138, 85, 142, 151, 108, 104, 158, 155, 81, 112, 178,
+ 184, 96, 187, 148, 150, 112, 138, 162, 222, 146, 128, 49, 124, 46, 165, 104,
+ 119, 164, 77, 144, 186, 98, 106, 148, 155, 157, 160, 151, 156, 149, 43, 122,
+ 106, 155, 120, 132, 159, 115, 126, 188, 44, 79, 164, 201, 153, 97, 139, 133,
+ 133, 98, 111, 165, 66, 106, 131, 85, 176, 156, 67, 108, 142, 91, 74, 137);
+ Mat ref = (Mat_<short>(8, 16) << 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ -1020, -796, -489, -469, -247, 317, 760, 1429, 1983, 1384, 254, -459, -899, -1197, -1172, -1058,
+ 2552, 2340, 1617, 591, 9, 96, 722, 1985, 2746, 1916, 676, 9, -635, -1115, -779, -380,
+ 3546, 3349, 2838, 2206, 1388, 669, 938, 1880, 2252, 1785, 1083, 606, 180, -298, -464, -418,
+ 816, 966, 1255, 1652, 1619, 924, 535, 288, 5, 601, 1581, 1870, 1520, 625, -627, -1260,
+ -782, -610, -395, -267, -122, -42, -317, -1378, -2293, -1451, 596, 1870, 1679, 763, -69, -394,
+ -882, -681, -463, -818, -1167, -732, -463, -1042, -1604, -1592, -1047, -334, -104, -117, 229, 512,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ Mat dst;
+ Sobel(src, dst, CV_16S, 0, 1, 5);
+ ASSERT_EQ(0.0, cvtest::norm(dst, ref, NORM_INF));
+}
}} // namespace