#include "opencv2/core/opencl/ocl_defs.hpp"
#include "opencl_kernels_imgproc.hpp"
#include "hal_replacement.hpp"
+ #include "opencv2/core/hal/intrin.hpp"
#include "filter.hpp"
};
- #if CV_SSE2
+ #if CV_SIMD
///////////////////////////////////// 8u-16s & 8u-8u //////////////////////////////////
int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
{
- if( !checkHardwareSupport(CV_CPU_SSE2) )
- return 0;
-
int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
int* dst = (int*)_dst;
const int* _kx = kernel.ptr<int>();
if( smallValues )
{
- __m128i z = _mm_setzero_si128();
- for( ; i <= width - 8; i += 8 )
+ for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
{
const uchar* src = _src + i;
- __m128i s0 = z, s1 = z;
-
- for( k = 0; k < _ksize; k++, src += cn )
+ v_int32 s0 = vx_setzero_s32();
+ v_int32 s1 = vx_setzero_s32();
+ v_int32 s2 = vx_setzero_s32();
+ v_int32 s3 = vx_setzero_s32();
+ k = 0;
+ for (; k <= _ksize - 2; k += 2, src += 2 * cn)
+ {
+ v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16));
+ v_uint8 x0, x1;
+ v_zip(vx_load(src), vx_load(src + cn), x0, x1);
+ s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f));
+ s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f));
+ s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f));
+ s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f));
+ }
+ if (k < _ksize)
+ {
+ v_int32 f = vx_setall_s32(_kx[k]);
+ v_uint16 x0, x1;
+ v_expand(vx_load(src), x0, x1);
+ s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f));
+ s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f));
+ s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f));
+ s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f));
+ }
+ v_store(dst + i, s0);
+ v_store(dst + i + v_int32::nlanes, s1);
+ v_store(dst + i + 2*v_int32::nlanes, s2);
+ v_store(dst + i + 3*v_int32::nlanes, s3);
+ }
+ if( i <= width - v_uint16::nlanes )
+ {
+ const uchar* src = _src + i;
+ v_int32 s0 = vx_setzero_s32();
+ v_int32 s1 = vx_setzero_s32();
+ k = 0;
+ for( ; k <= _ksize - 2; k += 2, src += 2*cn )
{
- __m128i f = _mm_cvtsi32_si128(_kx[k]);
- f = _mm_shuffle_epi32(f, 0);
-
- __m128i x0 = _mm_loadl_epi64((const __m128i*)src);
- x0 = _mm_unpacklo_epi8(x0, z);
-
- __m128i x1 = _mm_unpackhi_epi16(x0, z);
- x0 = _mm_unpacklo_epi16(x0, z);
-
- x0 = _mm_madd_epi16(x0, f);
- x1 = _mm_madd_epi16(x1, f);
-
- s0 = _mm_add_epi32(s0, x0);
- s1 = _mm_add_epi32(s1, x1);
+ v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16));
+ v_uint16 x0, x1;
+ v_zip(vx_load_expand(src), vx_load_expand(src + cn), x0, x1);
+ s0 += v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f));
+ s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f));
}
-
- _mm_store_si128((__m128i*)(dst + i), s0);
- _mm_store_si128((__m128i*)(dst + i + 4), s1);
+ if( k < _ksize )
+ {
+ v_int32 f = vx_setall_s32(_kx[k]);
+ v_uint32 x0, x1;
+ v_expand(vx_load_expand(src), x0, x1);
+ s0 += v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f));
+ s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f));
+ }
+ v_store(dst + i, s0);
+ v_store(dst + i + v_int32::nlanes, s1);
+ i += v_uint16::nlanes;
}
-
- if( i <= width - 4 )
+ if( i <= width - v_uint32::nlanes )
{
+ v_int32 d = vx_setzero_s32();
+ k = 0;
const uchar* src = _src + i;
- __m128i s0 = z;
-
- for( k = 0; k < _ksize; k++, src += cn )
+ for (; k <= _ksize - 2; k += 2, src += 2*cn)
{
- __m128i f = _mm_cvtsi32_si128(_kx[k]);
- f = _mm_shuffle_epi32(f, 0);
-
- __m128i x0 = _mm_cvtsi32_si128(*(const int*)src);
- x0 = _mm_unpacklo_epi8(x0, z);
- x0 = _mm_unpacklo_epi16(x0, z);
- x0 = _mm_madd_epi16(x0, f);
- s0 = _mm_add_epi32(s0, x0);
+ v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16));
+ v_uint32 x0, x1;
+ v_zip(vx_load_expand_q(src), vx_load_expand_q(src + cn), x0, x1);
+ d += v_dotprod(v_pack(v_reinterpret_as_s32(x0), v_reinterpret_as_s32(x1)), v_reinterpret_as_s16(f));
}
- _mm_store_si128((__m128i*)(dst + i), s0);
- i += 4;
+ if (k < _ksize)
+ d += v_dotprod(v_reinterpret_as_s16(vx_load_expand_q(src)), v_reinterpret_as_s16(vx_setall_s32(_kx[k])));
+ v_store(dst + i, d);
+ i += v_uint32::nlanes;
}
}
return i;
int operator()(const uchar* src, uchar* _dst, int width, int cn) const
{
- if( !checkHardwareSupport(CV_CPU_SSE2) )
- return 0;
-
int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1;
int* dst = (int*)_dst;
bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
src += (_ksize/2)*cn;
width *= cn;
- __m128i z = _mm_setzero_si128();
if( symmetrical )
{
if( _ksize == 1 )
if( _ksize == 3 )
{
if( kx[0] == 2 && kx[1] == 1 )
- for( ; i <= width - 16; i += 16, src += 16 )
+ {
+ for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+ {
+ v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
+ v_expand(vx_load(src - cn), x0l, x0h);
+ v_expand(vx_load(src), x1l, x1h);
+ v_expand(vx_load(src + cn), x2l, x2h);
+ x1l = v_add_wrap(v_add_wrap(x1l, x1l), v_add_wrap(x0l, x2l));
+ x1h = v_add_wrap(v_add_wrap(x1h, x1h), v_add_wrap(x0h, x2h));
+ v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x1l)));
+ v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1l)));
+ v_store(dst + i + 2*v_int32::nlanes, v_reinterpret_as_s32(v_expand_low(x1h)));
+ v_store(dst + i + 3*v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1h)));
+ }
+ if( i <= width - v_uint16::nlanes )
+ {
+ v_uint16 x = vx_load_expand(src);
+ x = v_add_wrap(v_add_wrap(x, x), v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn)));
+ v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x)));
+ v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x)));
+ i += v_uint16::nlanes; src += v_uint16::nlanes;
+ }
+ if( i <= width - v_uint32::nlanes )
{
- __m128i x0, x1, x2, y0, y1, y2;
- x0 = _mm_loadu_si128((__m128i*)(src - cn));
- x1 = _mm_loadu_si128((__m128i*)src);
- x2 = _mm_loadu_si128((__m128i*)(src + cn));
- y0 = _mm_unpackhi_epi8(x0, z);
- x0 = _mm_unpacklo_epi8(x0, z);
- y1 = _mm_unpackhi_epi8(x1, z);
- x1 = _mm_unpacklo_epi8(x1, z);
- y2 = _mm_unpackhi_epi8(x2, z);
- x2 = _mm_unpacklo_epi8(x2, z);
- x0 = _mm_add_epi16(x0, _mm_add_epi16(_mm_add_epi16(x1, x1), x2));
- y0 = _mm_add_epi16(y0, _mm_add_epi16(_mm_add_epi16(y1, y1), y2));
- _mm_store_si128((__m128i*)(dst + i), _mm_unpacklo_epi16(x0, z));
- _mm_store_si128((__m128i*)(dst + i + 4), _mm_unpackhi_epi16(x0, z));
- _mm_store_si128((__m128i*)(dst + i + 8), _mm_unpacklo_epi16(y0, z));
- _mm_store_si128((__m128i*)(dst + i + 12), _mm_unpackhi_epi16(y0, z));
+ v_uint32 x = vx_load_expand_q(src);
+ x = (x + x) + vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn);
+ v_store(dst + i, v_reinterpret_as_s32(x));
+ i += v_uint32::nlanes;
}
+ }
else if( kx[0] == -2 && kx[1] == 1 )
- for( ; i <= width - 16; i += 16, src += 16 )
+ {
+ for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+ {
+ v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
+ v_expand(vx_load(src - cn), x0l, x0h);
+ v_expand(vx_load(src), x1l, x1h);
+ v_expand(vx_load(src + cn), x2l, x2h);
+ x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l));
+ x1h = v_sub_wrap(v_add_wrap(x0h, x2h), v_add_wrap(x1h, x1h));
+ v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x1l)));
+ v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l)));
+ v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h)));
+ v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h)));
+ }
+ if( i <= width - v_uint16::nlanes )
+ {
+ v_uint16 x = vx_load_expand(src);
+ x = v_sub_wrap(v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn)), v_add_wrap(x, x));
+ v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x)));
+ v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x)));
+ i += v_uint16::nlanes; src += v_uint16::nlanes;
+ }
+ if( i <= width - v_uint32::nlanes )
{
- __m128i x0, x1, x2, y0, y1, y2;
- x0 = _mm_loadu_si128((__m128i*)(src - cn));
- x1 = _mm_loadu_si128((__m128i*)src);
- x2 = _mm_loadu_si128((__m128i*)(src + cn));
- y0 = _mm_unpackhi_epi8(x0, z);
- x0 = _mm_unpacklo_epi8(x0, z);
- y1 = _mm_unpackhi_epi8(x1, z);
- x1 = _mm_unpacklo_epi8(x1, z);
- y2 = _mm_unpackhi_epi8(x2, z);
- x2 = _mm_unpacklo_epi8(x2, z);
- x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
- y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
- _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
- _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
- _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
- _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
+ v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src));
+ x = v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) - (x + x);
+ v_store(dst + i, x);
+ i += v_uint32::nlanes;
}
+ }
else
{
- __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
- k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
- k1 = _mm_packs_epi32(k1, k1);
-
- for( ; i <= width - 8; i += 8, src += 8 )
+ v_int16 k0 = vx_setall_s16((short)kx[0]);
+ v_int16 k1 = vx_setall_s16((short)kx[1]);
+ for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+ {
+ v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
+ v_expand(vx_load(src - cn), x0l, x0h);
+ v_expand(vx_load(src), x1l, x1h);
+ v_expand(vx_load(src + cn), x2l, x2h);
+
+ v_int32 dl, dh;
+ v_int16 x0, x1;
+ v_mul_expand(v_reinterpret_as_s16(x1l), k0, dl, dh);
+ v_zip(v_reinterpret_as_s16(x0l), v_reinterpret_as_s16(x2l), x0, x1);
+ dl += v_dotprod(x0, k1);
+ dh += v_dotprod(x1, k1);
+ v_store(dst + i, dl);
+ v_store(dst + i + v_int32::nlanes, dh);
+
+ v_mul_expand(v_reinterpret_as_s16(x1h), k0, dl, dh);
+ v_zip(v_reinterpret_as_s16(x0h), v_reinterpret_as_s16(x2h), x0, x1);
+ dl += v_dotprod(x0, k1);
+ dh += v_dotprod(x1, k1);
+ v_store(dst + i + 2*v_int32::nlanes, dl);
+ v_store(dst + i + 3*v_int32::nlanes, dh);
+ }
+ if ( i <= width - v_uint16::nlanes )
+ {
+ v_int32 dl, dh;
+ v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, dl, dh);
+ v_int16 x0, x1;
+ v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn)), v_reinterpret_as_s16(vx_load_expand(src + cn)), x0, x1);
+ dl += v_dotprod(x0, k1);
+ dh += v_dotprod(x1, k1);
+ v_store(dst + i, dl);
+ v_store(dst + i + v_int32::nlanes, dh);
+ i += v_uint16::nlanes; src += v_uint16::nlanes;
+ }
+ if ( i <= width - v_uint32::nlanes )
{
- __m128i x0 = _mm_loadl_epi64((__m128i*)(src - cn));
- __m128i x1 = _mm_loadl_epi64((__m128i*)src);
- __m128i x2 = _mm_loadl_epi64((__m128i*)(src + cn));
-
- x0 = _mm_unpacklo_epi8(x0, z);
- x1 = _mm_unpacklo_epi8(x1, z);
- x2 = _mm_unpacklo_epi8(x2, z);
- __m128i x3 = _mm_unpacklo_epi16(x0, x2);
- __m128i x4 = _mm_unpackhi_epi16(x0, x2);
- __m128i x5 = _mm_unpacklo_epi16(x1, z);
- __m128i x6 = _mm_unpackhi_epi16(x1, z);
- x3 = _mm_madd_epi16(x3, k1);
- x4 = _mm_madd_epi16(x4, k1);
- x5 = _mm_madd_epi16(x5, k0);
- x6 = _mm_madd_epi16(x6, k0);
- x3 = _mm_add_epi32(x3, x5);
- x4 = _mm_add_epi32(x4, x6);
-
- _mm_store_si128((__m128i*)(dst + i), x3);
- _mm_store_si128((__m128i*)(dst + i + 4), x4);
+ v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]), v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) * vx_setall_s32(kx[1])));
+ i += v_uint32::nlanes;
}
}
}
else if( _ksize == 5 )
{
if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
- for( ; i <= width - 16; i += 16, src += 16 )
+ {
+ for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+ {
+ v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
+ v_expand(vx_load(src - 2*cn), x0l, x0h);
+ v_expand(vx_load(src), x1l, x1h);
+ v_expand(vx_load(src + 2*cn), x2l, x2h);
+ x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l));
+ x1h = v_sub_wrap(v_add_wrap(x0h, x2h), v_add_wrap(x1h, x1h));
+ v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x1l)));
+ v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l)));
+ v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h)));
+ v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h)));
+ }
+ if( i <= width - v_uint16::nlanes )
{
- __m128i x0, x1, x2, y0, y1, y2;
- x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
- x1 = _mm_loadu_si128((__m128i*)src);
- x2 = _mm_loadu_si128((__m128i*)(src + cn*2));
- y0 = _mm_unpackhi_epi8(x0, z);
- x0 = _mm_unpacklo_epi8(x0, z);
- y1 = _mm_unpackhi_epi8(x1, z);
- x1 = _mm_unpacklo_epi8(x1, z);
- y2 = _mm_unpackhi_epi8(x2, z);
- x2 = _mm_unpacklo_epi8(x2, z);
- x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
- y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
- _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
- _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
- _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
- _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
+ v_uint16 x = vx_load_expand(src);
+ x = v_sub_wrap(v_add_wrap(vx_load_expand(src - 2*cn), vx_load_expand(src + 2*cn)), v_add_wrap(x, x));
+ v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x)));
+ v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x)));
+ i += v_uint16::nlanes; src += v_uint16::nlanes;
}
+ if( i <= width - v_uint32::nlanes )
+ {
+ v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src));
+ x = v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) - (x + x);
+ v_store(dst + i, x);
+ i += v_uint32::nlanes;
+ }
+ }
else
{
- __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
- k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
- k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
- k1 = _mm_packs_epi32(k1, k1);
- k2 = _mm_packs_epi32(k2, k2);
+ v_int16 k0 = vx_setall_s16((short)(kx[0]));
+ v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16)));
+ for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+ {
+ v_int32 x0, x1, x2, x3;
+ v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h;
+ v_int16 xl, xh;
+
+ v_expand(vx_load(src), x0l, x0h);
+ v_mul_expand(v_reinterpret_as_s16(x0l), k0, x0, x1);
+ v_mul_expand(v_reinterpret_as_s16(x0h), k0, x2, x3);
+
+ v_expand(vx_load(src - cn), x0l, x0h);
+ v_expand(vx_load(src + cn), x1l, x1h);
+ v_expand(vx_load(src - 2*cn), x2l, x2h);
+ v_expand(vx_load(src + 2*cn), x3l, x3h);
+ v_zip(v_reinterpret_as_s16(x0l + x1l), v_reinterpret_as_s16(x2l + x3l), xl, xh);
+ x0 += v_dotprod(xl, k12);
+ x1 += v_dotprod(xh, k12);
+ v_zip(v_reinterpret_as_s16(x0h + x1h), v_reinterpret_as_s16(x2h + x3h), xl, xh);
+ x2 += v_dotprod(xl, k12);
+ x3 += v_dotprod(xh, k12);
+
+ v_store(dst + i, x0);
+ v_store(dst + i + v_int32::nlanes, x1);
+ v_store(dst + i + 2*v_int32::nlanes, x2);
+ v_store(dst + i + 3*v_int32::nlanes, x3);
+ }
+ if( i <= width - v_uint16::nlanes )
+ {
+ v_int32 x1, x2;
+ v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, x1, x2);
- for( ; i <= width - 8; i += 8, src += 8 )
+ v_int16 xl, xh;
+ v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn) + vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - 2*cn) + vx_load_expand(src + 2*cn)), xl, xh);
+ x1 += v_dotprod(xl, k12);
+ x2 += v_dotprod(xh, k12);
+
+ v_store(dst + i, x1);
+ v_store(dst + i + v_int32::nlanes, x2);
+ i += v_uint16::nlanes, src += v_uint16::nlanes;
+ }
+ if( i <= width - v_uint32::nlanes )
+ {
+ v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]),
+ v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]),
+ v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) * vx_setall_s32(kx[2]))));
+ i += v_uint32::nlanes;
+ }
+ }
+ }
+ else
+ {
+ v_int16 k0 = vx_setall_s16((short)(kx[0]));
+ for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+ {
+ v_uint8 v_src = vx_load(src);
+ v_int32 s0, s1, s2, s3;
+ v_mul_expand(v_reinterpret_as_s16(v_expand_low(v_src)), k0, s0, s1);
+ v_mul_expand(v_reinterpret_as_s16(v_expand_high(v_src)), k0, s2, s3);
+ for (k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn)
+ {
+ v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16)));
+
+ v_uint8 v_src0 = vx_load(src - j);
+ v_uint8 v_src1 = vx_load(src - j - cn);
+ v_uint8 v_src2 = vx_load(src + j);
+ v_uint8 v_src3 = vx_load(src + j + cn);
+
+ v_int16 xl, xh;
+ v_zip(v_reinterpret_as_s16(v_expand_low(v_src0) + v_expand_low(v_src2)), v_reinterpret_as_s16(v_expand_low(v_src1) + v_expand_low(v_src3)), xl, xh);
+ s0 += v_dotprod(xl, k12);
+ s1 += v_dotprod(xh, k12);
+ v_zip(v_reinterpret_as_s16(v_expand_high(v_src0) + v_expand_high(v_src2)), v_reinterpret_as_s16(v_expand_high(v_src1) + v_expand_high(v_src3)), xl, xh);
+ s2 += v_dotprod(xl, k12);
+ s3 += v_dotprod(xh, k12);
+ }
+ if( k < _ksize / 2 + 1 )
+ {
+ v_int16 k1 = vx_setall_s16((short)(kx[k]));
+
+ v_uint8 v_src0 = vx_load(src - j);
+ v_uint8 v_src1 = vx_load(src + j);
+
+ v_int16 xl, xh;
+ v_zip(v_reinterpret_as_s16(v_expand_low(v_src0)), v_reinterpret_as_s16(v_expand_low(v_src1)), xl, xh);
+ s0 += v_dotprod(xl, k1);
+ s1 += v_dotprod(xh, k1);
+ v_zip(v_reinterpret_as_s16(v_expand_high(v_src0)), v_reinterpret_as_s16(v_expand_high(v_src1)), xl, xh);
+ s2 += v_dotprod(xl, k1);
+ s3 += v_dotprod(xh, k1);
+ }
+ v_store(dst + i, s0);
+ v_store(dst + i + v_int32::nlanes, s1);
+ v_store(dst + i + 2*v_int32::nlanes, s2);
+ v_store(dst + i + 3*v_int32::nlanes, s3);
+ }
+ if( i <= width - v_uint16::nlanes )
+ {
+ v_int32 s0, s1;
+ v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1);
+ for (k = 1, j = cn; k <= _ksize / 2 - 1; k+=2, j += 2*cn)
+ {
+ v_int16 xl, xh;
+ v_zip(v_reinterpret_as_s16(vx_load_expand(src - j) + vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j - cn) + vx_load_expand(src + j + cn)), xl, xh);
+ v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k+1] << 16)));
+ s0 += v_dotprod(xl, k12);
+ s1 += v_dotprod(xh, k12);
+ }
+ if ( k < _ksize / 2 + 1 )
{
- __m128i x0 = _mm_loadl_epi64((__m128i*)src);
-
- x0 = _mm_unpacklo_epi8(x0, z);
- __m128i x1 = _mm_unpacklo_epi16(x0, z);
- __m128i x2 = _mm_unpackhi_epi16(x0, z);
- x1 = _mm_madd_epi16(x1, k0);
- x2 = _mm_madd_epi16(x2, k0);
-
- __m128i x3 = _mm_loadl_epi64((__m128i*)(src - cn));
- __m128i x4 = _mm_loadl_epi64((__m128i*)(src + cn));
-
- x3 = _mm_unpacklo_epi8(x3, z);
- x4 = _mm_unpacklo_epi8(x4, z);
- __m128i x5 = _mm_unpacklo_epi16(x3, x4);
- __m128i x6 = _mm_unpackhi_epi16(x3, x4);
- x5 = _mm_madd_epi16(x5, k1);
- x6 = _mm_madd_epi16(x6, k1);
- x1 = _mm_add_epi32(x1, x5);
- x2 = _mm_add_epi32(x2, x6);
-
- x3 = _mm_loadl_epi64((__m128i*)(src - cn*2));
- x4 = _mm_loadl_epi64((__m128i*)(src + cn*2));
-
- x3 = _mm_unpacklo_epi8(x3, z);
- x4 = _mm_unpacklo_epi8(x4, z);
- x5 = _mm_unpacklo_epi16(x3, x4);
- x6 = _mm_unpackhi_epi16(x3, x4);
- x5 = _mm_madd_epi16(x5, k2);
- x6 = _mm_madd_epi16(x6, k2);
- x1 = _mm_add_epi32(x1, x5);
- x2 = _mm_add_epi32(x2, x6);
-
- _mm_store_si128((__m128i*)(dst + i), x1);
- _mm_store_si128((__m128i*)(dst + i + 4), x2);
+ v_int16 xl, xh;
+ v_zip(v_reinterpret_as_s16(vx_load_expand(src - j)), v_reinterpret_as_s16(vx_load_expand(src + j)), xl, xh);
+ v_int16 k1 = vx_setall_s16((short)(kx[k]));
+ s0 += v_dotprod(xl, k1);
+ s1 += v_dotprod(xh, k1);
}
+ v_store(dst + i, s0);
+ v_store(dst + i + v_int32::nlanes, s1);
+ i += v_uint16::nlanes; src += v_uint16::nlanes;
+ }
+ if( i <= width - v_uint32::nlanes )
+ {
+ v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]);
+ for( k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn )
+ s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - j) + vx_load_expand_q(src + j)), vx_setall_s32(kx[k]), s0);
+ v_store(dst + i, s0);
+ i += v_uint32::nlanes;
}
}
}
if( _ksize == 3 )
{
if( kx[0] == 0 && kx[1] == 1 )
- for( ; i <= width - 16; i += 16, src += 16 )
+ {
+ for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+ {
+ v_uint16 x0l, x0h, x2l, x2h;
+ v_expand(vx_load(src - cn), x0l, x0h);
+ v_expand(vx_load(src + cn), x2l, x2h);
+ v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(x2l, x0l));
+ v_int16 dh = v_reinterpret_as_s16(v_sub_wrap(x2h, x0h));
+ v_store(dst + i, v_expand_low(dl));
+ v_store(dst + i + v_int32::nlanes, v_expand_high(dl));
+ v_store(dst + i + 2*v_int32::nlanes, v_expand_low(dh));
+ v_store(dst + i + 3*v_int32::nlanes, v_expand_high(dh));
+ }
+ if( i <= width - v_uint16::nlanes )
{
- __m128i x0, x1, y0;
- x0 = _mm_loadu_si128((__m128i*)(src + cn));
- x1 = _mm_loadu_si128((__m128i*)(src - cn));
- y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
- x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
- _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
- _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
- _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
- _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
+ v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn)));
+ v_store(dst + i, v_expand_low(dl));
+ v_store(dst + i + v_int32::nlanes, v_expand_high(dl));
+ i += v_uint16::nlanes; src += v_uint16::nlanes;
}
+ if (i <= width - v_uint32::nlanes)
+ {
+ v_store(dst + i, v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn)));
+ i += v_uint32::nlanes;
+ }
+ }
else
{
- __m128i k0 = _mm_set_epi32(-kx[1], kx[1], -kx[1], kx[1]);
- k0 = _mm_packs_epi32(k0, k0);
-
- for( ; i <= width - 16; i += 16, src += 16 )
+ v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (-kx[1] << 16)));
+ for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+ {
+ v_uint16 x0l, x0h, x2l, x2h;
+ v_expand(vx_load(src - cn), x0l, x0h);
+ v_expand(vx_load(src + cn), x2l, x2h);
+ v_int16 xl, xh;
+ v_zip(v_reinterpret_as_s16(x2l), v_reinterpret_as_s16(x0l), xl, xh);
+ v_store(dst + i, v_dotprod(xl, k0));
+ v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0));
+ v_zip(v_reinterpret_as_s16(x2h), v_reinterpret_as_s16(x0h), xl, xh);
+ v_store(dst + i + 2*v_int32::nlanes, v_dotprod(xl, k0));
+ v_store(dst + i + 3*v_int32::nlanes, v_dotprod(xh, k0));
+ }
+ if( i <= width - v_uint16::nlanes )
+ {
+ v_int16 xl, xh;
+ v_zip(v_reinterpret_as_s16(vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - cn)), xl, xh);
+ v_store(dst + i, v_dotprod(xl, k0));
+ v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0));
+ i += v_uint16::nlanes; src += v_uint16::nlanes;
+ }
+ if (i <= width - v_uint32::nlanes)
{
- __m128i x0 = _mm_loadu_si128((__m128i*)(src + cn));
- __m128i x1 = _mm_loadu_si128((__m128i*)(src - cn));
-
- __m128i x2 = _mm_unpacklo_epi8(x0, z);
- __m128i x3 = _mm_unpacklo_epi8(x1, z);
- __m128i x4 = _mm_unpackhi_epi8(x0, z);
- __m128i x5 = _mm_unpackhi_epi8(x1, z);
- __m128i x6 = _mm_unpacklo_epi16(x2, x3);
- __m128i x7 = _mm_unpacklo_epi16(x4, x5);
- __m128i x8 = _mm_unpackhi_epi16(x2, x3);
- __m128i x9 = _mm_unpackhi_epi16(x4, x5);
- x6 = _mm_madd_epi16(x6, k0);
- x7 = _mm_madd_epi16(x7, k0);
- x8 = _mm_madd_epi16(x8, k0);
- x9 = _mm_madd_epi16(x9, k0);
-
- _mm_store_si128((__m128i*)(dst + i), x6);
- _mm_store_si128((__m128i*)(dst + i + 4), x8);
- _mm_store_si128((__m128i*)(dst + i + 8), x7);
- _mm_store_si128((__m128i*)(dst + i + 12), x9);
+ v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]), v_reinterpret_as_s32(vx_load_expand_q(src - cn)) * vx_setall_s32(-kx[1])));
+ i += v_uint32::nlanes;
}
}
}
else if( _ksize == 5 )
{
- __m128i k0 = _mm_loadl_epi64((__m128i*)(kx + 1));
- k0 = _mm_unpacklo_epi64(k0, k0);
- k0 = _mm_packs_epi32(k0, k0);
-
- for( ; i <= width - 16; i += 16, src += 16 )
+ v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16)));
+ for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+ {
+ v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h;
+ v_expand(vx_load(src - cn), x0l, x0h);
+ v_expand(vx_load(src - 2*cn), x1l, x1h);
+ v_expand(vx_load(src + cn), x2l, x2h);
+ v_expand(vx_load(src + 2*cn), x3l, x3h);
+ v_int16 x0, x1;
+ v_zip(v_reinterpret_as_s16(v_sub_wrap(x2l, x0l)), v_reinterpret_as_s16(v_sub_wrap(x3l, x1l)), x0, x1);
+ v_store(dst + i, v_dotprod(x0, k0));
+ v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0));
+ v_zip(v_reinterpret_as_s16(v_sub_wrap(x2h, x0h)), v_reinterpret_as_s16(v_sub_wrap(x3h, x1h)), x0, x1);
+ v_store(dst + i + 2*v_int32::nlanes, v_dotprod(x0, k0));
+ v_store(dst + i + 3*v_int32::nlanes, v_dotprod(x1, k0));
+ }
+ if( i <= width - v_uint16::nlanes )
+ {
+ v_int16 x0, x1;
+ v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn))),
+ v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + 2*cn), vx_load_expand(src - 2*cn))), x0, x1);
+ v_store(dst + i, v_dotprod(x0, k0));
+ v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0));
+ i += v_uint16::nlanes; src += v_uint16::nlanes;
+ }
+ if( i <= width - v_uint32::nlanes )
{
- __m128i x0 = _mm_loadu_si128((__m128i*)(src + cn));
- __m128i x1 = _mm_loadu_si128((__m128i*)(src - cn));
-
- __m128i x2 = _mm_unpackhi_epi8(x0, z);
- __m128i x3 = _mm_unpackhi_epi8(x1, z);
- x0 = _mm_unpacklo_epi8(x0, z);
- x1 = _mm_unpacklo_epi8(x1, z);
- __m128i x5 = _mm_sub_epi16(x2, x3);
- __m128i x4 = _mm_sub_epi16(x0, x1);
-
- __m128i x6 = _mm_loadu_si128((__m128i*)(src + cn * 2));
- __m128i x7 = _mm_loadu_si128((__m128i*)(src - cn * 2));
-
- __m128i x8 = _mm_unpackhi_epi8(x6, z);
- __m128i x9 = _mm_unpackhi_epi8(x7, z);
- x6 = _mm_unpacklo_epi8(x6, z);
- x7 = _mm_unpacklo_epi8(x7, z);
- __m128i x11 = _mm_sub_epi16(x8, x9);
- __m128i x10 = _mm_sub_epi16(x6, x7);
-
- __m128i x13 = _mm_unpackhi_epi16(x5, x11);
- __m128i x12 = _mm_unpackhi_epi16(x4, x10);
- x5 = _mm_unpacklo_epi16(x5, x11);
- x4 = _mm_unpacklo_epi16(x4, x10);
- x5 = _mm_madd_epi16(x5, k0);
- x4 = _mm_madd_epi16(x4, k0);
- x13 = _mm_madd_epi16(x13, k0);
- x12 = _mm_madd_epi16(x12, k0);
-
- _mm_store_si128((__m128i*)(dst + i), x4);
- _mm_store_si128((__m128i*)(dst + i + 4), x12);
- _mm_store_si128((__m128i*)(dst + i + 8), x5);
- _mm_store_si128((__m128i*)(dst + i + 12), x13);
+ v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn)), vx_setall_s32(kx[1]),
+ (v_reinterpret_as_s32(vx_load_expand_q(src + 2*cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn))) * vx_setall_s32(kx[2])));
+ i += v_uint32::nlanes;
}
}
- }
-
- src -= (_ksize/2)*cn;
- kx -= _ksize/2;
- for( ; i <= width - 4; i += 4, src += 4 )
- {
- __m128i s0 = z;
-
- for( k = j = 0; k < _ksize; k++, j += cn )
+ else
{
- __m128i f = _mm_cvtsi32_si128(kx[k]);
- f = _mm_shuffle_epi32(f, 0);
-
- __m128i x0 = _mm_cvtsi32_si128(*(const int*)(src + j));
- x0 = _mm_unpacklo_epi8(x0, z);
- x0 = _mm_unpacklo_epi16(x0, z);
- x0 = _mm_madd_epi16(x0, f);
- s0 = _mm_add_epi32(s0, x0);
+ v_int16 k0 = vx_setall_s16((short)(kx[0]));
+ for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+ {
+ v_uint8 v_src = vx_load(src);
+ v_int32 s0, s1, s2, s3;
+ v_mul_expand(v_reinterpret_as_s16(v_expand_low(v_src)), k0, s0, s1);
+ v_mul_expand(v_reinterpret_as_s16(v_expand_high(v_src)), k0, s2, s3);
+ for( k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn )
+ {
+ v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16)));
+
+ v_uint8 v_src0 = vx_load(src - j);
+ v_uint8 v_src1 = vx_load(src - j - cn);
+ v_uint8 v_src2 = vx_load(src + j);
+ v_uint8 v_src3 = vx_load(src + j + cn);
+
+ v_int16 xl, xh;
+ v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src2), v_expand_low(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src3), v_expand_low(v_src1))), xl, xh);
+ s0 += v_dotprod(xl, k12);
+ s1 += v_dotprod(xh, k12);
+ v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src2), v_expand_high(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src3), v_expand_high(v_src1))), xl, xh);
+ s2 += v_dotprod(xl, k12);
+ s3 += v_dotprod(xh, k12);
+ }
+ if( k < _ksize / 2 + 1 )
+ {
+ v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (-kx[k] << 16)));
+ v_uint8 v_src0 = vx_load(src - j);
+ v_uint8 v_src1 = vx_load(src + j);
+
+ v_int16 xl, xh;
+ v_zip(v_reinterpret_as_s16(v_expand_low(v_src1)), v_reinterpret_as_s16(v_expand_low(v_src0)), xl, xh);
+ s0 += v_dotprod(xl, k12);
+ s1 += v_dotprod(xh, k12);
+ v_zip(v_reinterpret_as_s16(v_expand_high(v_src1)), v_reinterpret_as_s16(v_expand_high(v_src0)), xl, xh);
+ s2 += v_dotprod(xl, k12);
+ s3 += v_dotprod(xh, k12);
+ }
+ v_store(dst + i, s0);
+ v_store(dst + i + v_int32::nlanes, s1);
+ v_store(dst + i + 2*v_int32::nlanes, s2);
+ v_store(dst + i + 3*v_int32::nlanes, s3);
+ }
+ if( i <= width - v_uint16::nlanes )
+ {
+ v_int32 s0, s1;
+ v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1);
+ for( k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn )
+ {
+ v_int16 xl, xh;
+ v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j), vx_load_expand(src - j))), v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j + cn), vx_load_expand(src - j - cn))), xl, xh);
+ v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16)));
+ s0 += v_dotprod(xl, k12);
+ s1 += v_dotprod(xh, k12);
+ }
+ if( k < _ksize / 2 + 1 )
+ {
+ v_int16 k1 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (-kx[k] << 16)));
+ v_int16 xl, xh;
+ v_zip(v_reinterpret_as_s16(vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j)), xl, xh);
+ s0 += v_dotprod(xl, k1);
+ s1 += v_dotprod(xh, k1);
+ }
+ v_store(dst + i, s0);
+ v_store(dst + i + v_int32::nlanes, s1);
+ i += v_uint16::nlanes; src += v_uint16::nlanes;
+ }
+ if( i <= width - v_uint32::nlanes )
+ {
+ v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]);
+ for (k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn)
+ s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + j)) - v_reinterpret_as_s32(vx_load_expand_q(src - j)), vx_setall_s32(kx[k]), s0);
+ v_store(dst + i, s0);
+ i += v_uint32::nlanes;
+ }
}
- _mm_store_si128((__m128i*)(dst + i), s0);
}
return i;
int operator()(const uchar** _src, uchar* dst, int width) const
{
- if( !checkHardwareSupport(CV_CPU_SSE2) )
- return 0;
-
- int ksize2 = (kernel.rows + kernel.cols - 1)/2;
+ int _ksize = kernel.rows + kernel.cols - 1;
+ int ksize2 = _ksize/2;
const float* ky = kernel.ptr<float>() + ksize2;
int i = 0, k;
bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
const int** src = (const int**)_src;
- const __m128i *S, *S2;
- __m128 d4 = _mm_set1_ps(delta);
+ v_float32 d4 = vx_setall_f32(delta);
if( symmetrical )
{
- for( ; i <= width - 16; i += 16 )
- {
- __m128 f = _mm_load_ss(ky);
- f = _mm_shuffle_ps(f, f, 0);
- __m128 s0, s1, s2, s3;
- __m128i x0, x1;
- S = (const __m128i*)(src[0] + i);
- s0 = _mm_cvtepi32_ps(_mm_load_si128(S));
- s1 = _mm_cvtepi32_ps(_mm_load_si128(S+1));
- s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
- s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
- s2 = _mm_cvtepi32_ps(_mm_load_si128(S+2));
- s3 = _mm_cvtepi32_ps(_mm_load_si128(S+3));
- s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
- s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
-
+ if (_ksize == 1)
+ return 0;
+ v_float32 f0 = vx_setall_f32(ky[0]);
+ for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
+ {
+ const int* S = src[0] + i;
+ v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4);
+ v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4);
+ v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S + 2*v_int32::nlanes)), f0, d4);
+ v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S + 3*v_int32::nlanes)), f0, d4);
for( k = 1; k <= ksize2; k++ )
{
- S = (const __m128i*)(src[k] + i);
- S2 = (const __m128i*)(src[-k] + i);
- f = _mm_load_ss(ky+k);
- f = _mm_shuffle_ps(f, f, 0);
- x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
- x1 = _mm_add_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
- s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
- s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
- x0 = _mm_add_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
- x1 = _mm_add_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
- s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
- s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
+ v_float32 f = vx_setall_f32(ky[k]);
+ const int* S0 = src[k] + i;
+ const int* S1 = src[-k] + i;
+ s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0);
+ s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1);
+ s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) + vx_load(S1 + 2*v_int32::nlanes)), f, s2);
+ s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) + vx_load(S1 + 3*v_int32::nlanes)), f, s3);
}
-
- x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
- x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
- x0 = _mm_packus_epi16(x0, x1);
- _mm_storeu_si128((__m128i*)(dst + i), x0);
+ v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3))));
}
-
- for( ; i <= width - 4; i += 4 )
+ if( i <= width - v_uint16::nlanes )
{
- __m128 f = _mm_load_ss(ky);
- f = _mm_shuffle_ps(f, f, 0);
- __m128i x0;
- __m128 s0 = _mm_cvtepi32_ps(_mm_load_si128((const __m128i*)(src[0] + i)));
- s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
-
+ const int* S = src[0] + i;
+ v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4);
+ v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4);
for( k = 1; k <= ksize2; k++ )
{
- S = (const __m128i*)(src[k] + i);
- S2 = (const __m128i*)(src[-k] + i);
- f = _mm_load_ss(ky+k);
- f = _mm_shuffle_ps(f, f, 0);
- x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
- s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
+ v_float32 f = vx_setall_f32(ky[k]);
+ const int* S0 = src[k] + i;
+ const int* S1 = src[-k] + i;
+ s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0);
+ s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1);
}
-
- x0 = _mm_cvtps_epi32(s0);
- x0 = _mm_packs_epi32(x0, x0);
- x0 = _mm_packus_epi16(x0, x0);
- *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
+ v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+ i += v_uint16::nlanes;
+ }
+ #if CV_SIMD_WIDTH > 16
+ while( i <= width - v_int32x4::nlanes )
+ #else
+ if( i <= width - v_int32x4::nlanes )
+ #endif
+ {
+ v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[0] + i)), v_setall_f32(ky[0]), v_setall_f32(delta));
+ for( k = 1; k <= ksize2; k++ )
+ s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) + v_load(src[-k] + i)), v_setall_f32(ky[k]), s0);
+ v_int32x4 s32 = v_round(s0);
+ v_int16x8 s16 = v_pack(s32, s32);
+ *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0();
+ i += v_int32x4::nlanes;
}
}
else
{
- for( ; i <= width - 16; i += 16 )
+ for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
{
- __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
- __m128i x0, x1;
-
- for( k = 1; k <= ksize2; k++ )
+ v_float32 s0 = d4;
+ v_float32 s1 = d4;
+ v_float32 s2 = d4;
+ v_float32 s3 = d4;
+ for ( k = 1; k <= ksize2; k++ )
{
- S = (const __m128i*)(src[k] + i);
- S2 = (const __m128i*)(src[-k] + i);
- f = _mm_load_ss(ky+k);
- f = _mm_shuffle_ps(f, f, 0);
- x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
- x1 = _mm_sub_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
- s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
- s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
- x0 = _mm_sub_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
- x1 = _mm_sub_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
- s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
- s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
+ v_float32 f = vx_setall_f32(ky[k]);
+ const int* S0 = src[k] + i;
+ const int* S1 = src[-k] + i;
+ s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0);
+ s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1);
+ s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) - vx_load(S1 + 2*v_int32::nlanes)), f, s2);
+ s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) - vx_load(S1 + 3*v_int32::nlanes)), f, s3);
}
-
- x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
- x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
- x0 = _mm_packus_epi16(x0, x1);
- _mm_storeu_si128((__m128i*)(dst + i), x0);
+ v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3))));
}
-
- for( ; i <= width - 4; i += 4 )
+ if( i <= width - v_uint16::nlanes )
{
- __m128 f, s0 = d4;
- __m128i x0;
-
- for( k = 1; k <= ksize2; k++ )
+ v_float32 s0 = d4;
+ v_float32 s1 = d4;
+ for ( k = 1; k <= ksize2; k++ )
{
- S = (const __m128i*)(src[k] + i);
- S2 = (const __m128i*)(src[-k] + i);
- f = _mm_load_ss(ky+k);
- f = _mm_shuffle_ps(f, f, 0);
- x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
- s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
+ v_float32 f = vx_setall_f32(ky[k]);
+ const int* S0 = src[k] + i;
+ const int* S1 = src[-k] + i;
+ s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0);
+ s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1);
}
-
- x0 = _mm_cvtps_epi32(s0);
- x0 = _mm_packs_epi32(x0, x0);
- x0 = _mm_packus_epi16(x0, x0);
- *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
+ v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+ i += v_uint16::nlanes;
+ }
+ #if CV_SIMD_WIDTH > 16
+ while( i <= width - v_int32x4::nlanes )
+ #else
+ if( i <= width - v_int32x4::nlanes )
+ #endif
+ {
+ v_float32x4 s0 = v_setall_f32(delta);
+ for (k = 1; k <= ksize2; k++)
+ s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) - v_load(src[-k] + i)), v_setall_f32(ky[k]), s0);
+ v_int32x4 s32 = v_round(s0);
+ v_int16x8 s16 = v_pack(s32, s32);
+ *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0();
+ i += v_int32x4::nlanes;
}
}
int operator()(const uchar** _src, uchar* _dst, int width) const
{
- if( !checkHardwareSupport(CV_CPU_SSE2) )
- return 0;
-
int ksize2 = (kernel.rows + kernel.cols - 1)/2;
const float* ky = kernel.ptr<float>() + ksize2;
int i = 0;
const int** src = (const int**)_src;
const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
short* dst = (short*)_dst;
- __m128 df4 = _mm_set1_ps(delta);
- __m128i d4 = _mm_cvtps_epi32(df4);
+ v_float32 df4 = vx_setall_f32(delta);
+ v_int32 d4 = v_round(df4);
if( symmetrical )
{
if( ky[0] == 2 && ky[1] == 1 )
{
- for( ; i <= width - 8; i += 8 )
+ for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+ {
+ v_int32 sl = vx_load(S1 + i);
+ v_int32 sh = vx_load(S1 + i + v_int32::nlanes);
+ v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 + (sh + sh)));
+ }
+ if( i <= width - v_int32::nlanes )
{
- __m128i s0, s1, s2, s3, s4, s5;
- s0 = _mm_load_si128((__m128i*)(S0 + i));
- s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
- s2 = _mm_load_si128((__m128i*)(S1 + i));
- s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
- s4 = _mm_load_si128((__m128i*)(S2 + i));
- s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
- s0 = _mm_add_epi32(s0, _mm_add_epi32(s4, _mm_add_epi32(s2, s2)));
- s1 = _mm_add_epi32(s1, _mm_add_epi32(s5, _mm_add_epi32(s3, s3)));
- s0 = _mm_add_epi32(s0, d4);
- s1 = _mm_add_epi32(s1, d4);
- _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
+ v_int32 s = vx_load(S1 + i);
+ v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (s + s));
+ i += v_int32::nlanes;
}
}
else if( ky[0] == -2 && ky[1] == 1 )
{
- for( ; i <= width - 8; i += 8 )
+ for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+ {
+ v_int32 sl = vx_load(S1 + i);
+ v_int32 sh = vx_load(S1 + i + v_int32::nlanes);
+ v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 - (sh + sh)));
+ }
+ if( i <= width - v_int32::nlanes )
+ {
+ v_int32 s = vx_load(S1 + i);
+ v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (s + s));
+ i += v_int32::nlanes;
+ }
+ }
+ else if( ky[0] == (float)((int)ky[0]) && ky[1] == (float)((int)ky[1]) )
+ {
+ v_int32 k0 = vx_setall_s32((int)ky[0]), k1 = vx_setall_s32((int)ky[1]);
+ for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+ v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)),
+ v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4))));
+ if( i <= width - v_int32::nlanes )
{
- __m128i s0, s1, s2, s3, s4, s5;
- s0 = _mm_load_si128((__m128i*)(S0 + i));
- s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
- s2 = _mm_load_si128((__m128i*)(S1 + i));
- s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
- s4 = _mm_load_si128((__m128i*)(S2 + i));
- s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
- s0 = _mm_add_epi32(s0, _mm_sub_epi32(s4, _mm_add_epi32(s2, s2)));
- s1 = _mm_add_epi32(s1, _mm_sub_epi32(s5, _mm_add_epi32(s3, s3)));
- s0 = _mm_add_epi32(s0, d4);
- s1 = _mm_add_epi32(s1, d4);
- _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
+ v_pack_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)));
+ i += v_int32::nlanes;
}
}
else
{
- __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
- for( ; i <= width - 8; i += 8 )
+ v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]);
+ for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+ v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))),
+ v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4)))));
+ if( i <= width - v_int32::nlanes )
{
- __m128 s0, s1;
- s0 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i)));
- s1 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i + 4)));
- s0 = _mm_add_ps(_mm_mul_ps(s0, k0), df4);
- s1 = _mm_add_ps(_mm_mul_ps(s1, k0), df4);
- __m128i x0, x1;
- x0 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i)),
- _mm_load_si128((__m128i*)(S2 + i)));
- x1 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
- _mm_load_si128((__m128i*)(S2 + i + 4)));
- s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
- s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
- x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
- _mm_storeu_si128((__m128i*)(dst + i), x0);
+ v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))));
+ i += v_int32::nlanes;
}
}
}
{
if( ky[1] < 0 )
std::swap(S0, S2);
- for( ; i <= width - 8; i += 8 )
+ for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+ v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i) + d4, vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes) + d4));
+ if( i <= width - v_int32::nlanes )
{
- __m128i s0, s1, s2, s3;
- s0 = _mm_load_si128((__m128i*)(S2 + i));
- s1 = _mm_load_si128((__m128i*)(S2 + i + 4));
- s2 = _mm_load_si128((__m128i*)(S0 + i));
- s3 = _mm_load_si128((__m128i*)(S0 + i + 4));
- s0 = _mm_add_epi32(_mm_sub_epi32(s0, s2), d4);
- s1 = _mm_add_epi32(_mm_sub_epi32(s1, s3), d4);
- _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
+ v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4);
+ i += v_int32::nlanes;
}
}
else
{
- __m128 k1 = _mm_set1_ps(ky[1]);
- for( ; i <= width - 8; i += 8 )
+ v_float32 k1 = vx_setall_f32(ky[1]);
+ for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+ v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)),
+ v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4))));
+ if( i <= width - v_int32::nlanes )
{
- __m128 s0 = df4, s1 = df4;
- __m128i x0, x1;
- x0 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i)),
- _mm_load_si128((__m128i*)(S0 + i)));
- x1 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i + 4)),
- _mm_load_si128((__m128i*)(S0 + i + 4)));
- s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
- s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
- x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
- _mm_storeu_si128((__m128i*)(dst + i), x0);
+ v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)));
+ i += v_int32::nlanes;
}
}
}
struct RowVec_16s32f
{
- RowVec_16s32f() { sse2_supported = false; }
+ RowVec_16s32f() {}
RowVec_16s32f( const Mat& _kernel )
{
kernel = _kernel;
- sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
}
int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
{
- if( !sse2_supported )
- return 0;
-
int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
float* dst = (float*)_dst;
const float* _kx = kernel.ptr<float>();
width *= cn;
- for( ; i <= width - 8; i += 8 )
+ for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
{
const short* src = (const short*)_src + i;
- __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
+ v_float32 s0 = vx_setzero_f32();
+ v_float32 s1 = vx_setzero_f32();
for( k = 0; k < _ksize; k++, src += cn )
{
- f = _mm_load_ss(_kx+k);
- f = _mm_shuffle_ps(f, f, 0);
-
- __m128i x0i = _mm_loadu_si128((const __m128i*)src);
- __m128i x1i = _mm_srai_epi32(_mm_unpackhi_epi16(x0i, x0i), 16);
- x0i = _mm_srai_epi32(_mm_unpacklo_epi16(x0i, x0i), 16);
- x0 = _mm_cvtepi32_ps(x0i);
- x1 = _mm_cvtepi32_ps(x1i);
- s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
- s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
+ v_int16 x = vx_load(src);
+ s0 = v_muladd(v_cvt_f32(v_expand_low(x)), vx_setall_f32(_kx[k]), s0);
+ s1 = v_muladd(v_cvt_f32(v_expand_high(x)), vx_setall_f32(_kx[k]), s1);
}
- _mm_store_ps(dst + i, s0);
- _mm_store_ps(dst + i + 4, s1);
+ v_store(dst + i, s0);
+ v_store(dst + i + v_float32::nlanes, s1);
+ }
+ if( i <= width - v_float32::nlanes )
+ {
+ const short* src = (const short*)_src + i;
+ v_float32 s0 = vx_setzero_f32();
+ for( k = 0; k < _ksize; k++, src += cn )
+ s0 = v_muladd(v_cvt_f32(vx_load_expand(src)), vx_setall_f32(_kx[k]), s0);
+ v_store(dst + i, s0);
+ i += v_float32::nlanes;
}
return i;
}
Mat kernel;
- bool sse2_supported;
};
struct SymmColumnVec_32f16s
{
- SymmColumnVec_32f16s() { symmetryType=0; delta = 0; sse2_supported = false; }
+ SymmColumnVec_32f16s() { symmetryType=0; delta = 0; }
SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
{
symmetryType = _symmetryType;
kernel = _kernel;
delta = (float)_delta;
CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
- sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
}
int operator()(const uchar** _src, uchar* _dst, int width) const
{
- if( !sse2_supported )
- return 0;
-
- int ksize2 = (kernel.rows + kernel.cols - 1)/2;
+ int _ksize = kernel.rows + kernel.cols - 1;
+ int ksize2 = _ksize / 2;
const float* ky = kernel.ptr<float>() + ksize2;
int i = 0, k;
bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
const float** src = (const float**)_src;
- const float *S, *S2;
short* dst = (short*)_dst;
- __m128 d4 = _mm_set1_ps(delta);
+ v_float32 d4 = vx_setall_f32(delta);
if( symmetrical )
{
- for( ; i <= width - 16; i += 16 )
+ if (_ksize == 1)
+ return 0;
+ v_float32 k0 = vx_setall_f32(ky[0]);
+ for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
{
- __m128 f = _mm_load_ss(ky);
- f = _mm_shuffle_ps(f, f, 0);
- __m128 s0, s1, s2, s3;
- __m128 x0, x1;
- S = src[0] + i;
- s0 = _mm_load_ps(S);
- s1 = _mm_load_ps(S+4);
- s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
- s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
- s2 = _mm_load_ps(S+8);
- s3 = _mm_load_ps(S+12);
- s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
- s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
-
+ v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
+ v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4);
for( k = 1; k <= ksize2; k++ )
{
- S = src[k] + i;
- S2 = src[-k] + i;
- f = _mm_load_ss(ky+k);
- f = _mm_shuffle_ps(f, f, 0);
- x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
- x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
- s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
- s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
- x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
- x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
- s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
- s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
+ v_float32 k1 = vx_setall_f32(ky[k]);
+ s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0);
+ s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1);
}
-
- __m128i s0i = _mm_cvtps_epi32(s0);
- __m128i s1i = _mm_cvtps_epi32(s1);
- __m128i s2i = _mm_cvtps_epi32(s2);
- __m128i s3i = _mm_cvtps_epi32(s3);
-
- _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
- _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
+ v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
}
-
- for( ; i <= width - 4; i += 4 )
+ if( i <= width - v_float32::nlanes )
{
- __m128 f = _mm_load_ss(ky);
- f = _mm_shuffle_ps(f, f, 0);
- __m128 x0, s0 = _mm_load_ps(src[0] + i);
- s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
-
+ v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
for( k = 1; k <= ksize2; k++ )
- {
- f = _mm_load_ss(ky+k);
- f = _mm_shuffle_ps(f, f, 0);
- S = src[k] + i;
- S2 = src[-k] + i;
- x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
- s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
- }
-
- __m128i s0i = _mm_cvtps_epi32(s0);
- _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
+ s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+ v_pack_store(dst + i, v_round(s0));
+ i += v_float32::nlanes;
}
}
else
{
- for( ; i <= width - 16; i += 16 )
+ for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
{
- __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
- __m128 x0, x1;
- S = src[0] + i;
-
+ v_float32 s0 = d4;
+ v_float32 s1 = d4;
for( k = 1; k <= ksize2; k++ )
{
- S = src[k] + i;
- S2 = src[-k] + i;
- f = _mm_load_ss(ky+k);
- f = _mm_shuffle_ps(f, f, 0);
- x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
- x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
- s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
- s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
- x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
- x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
- s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
- s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
+ v_float32 k1 = vx_setall_f32(ky[k]);
+ s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k1, s0);
+ s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k1, s1);
}
-
- __m128i s0i = _mm_cvtps_epi32(s0);
- __m128i s1i = _mm_cvtps_epi32(s1);
- __m128i s2i = _mm_cvtps_epi32(s2);
- __m128i s3i = _mm_cvtps_epi32(s3);
-
- _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
- _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
+ v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
}
-
- for( ; i <= width - 4; i += 4 )
+ if( i <= width - v_float32::nlanes )
{
- __m128 f, x0, s0 = d4;
-
+ v_float32 s0 = d4;
for( k = 1; k <= ksize2; k++ )
- {
- f = _mm_load_ss(ky+k);
- f = _mm_shuffle_ps(f, f, 0);
- x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
- s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
- }
-
- __m128i s0i = _mm_cvtps_epi32(s0);
- _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
+ s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+ v_pack_store(dst + i, v_round(s0));
+ i += v_float32::nlanes;
}
}
int symmetryType;
float delta;
Mat kernel;
- bool sse2_supported;
};
{
RowVec_32f()
{
- haveSSE = checkHardwareSupport(CV_CPU_SSE);
haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
#if defined USE_IPP_SEP_FILTERS
bufsz = -1;
RowVec_32f( const Mat& _kernel )
{
kernel = _kernel;
- haveSSE = checkHardwareSupport(CV_CPU_SSE);
haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
#if defined USE_IPP_SEP_FILTERS
bufsz = -1;
float* dst = (float*)_dst;
const float* _kx = kernel.ptr<float>();
- if( !haveSSE )
- return 0;
-
int i = 0, k;
width *= cn;
if (haveAVX2)
return RowVec_32f_AVX(src0, _kx, dst, width, cn, _ksize);
#endif
- for( ; i <= width - 8; i += 8 )
+ for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
{
const float* src = src0 + i;
- __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
+ v_float32 s0 = vx_setzero_f32();
for( k = 0; k < _ksize; k++, src += cn )
- {
- f = _mm_set1_ps(_kx[k]);
-
- x0 = _mm_loadu_ps(src);
- x1 = _mm_loadu_ps(src + 4);
- s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
- s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
- }
- _mm_store_ps(dst + i, s0);
- _mm_store_ps(dst + i + 4, s1);
+ s0 = v_muladd(vx_load(src), vx_setall_f32(_kx[k]), s0);
+ v_store(dst + i, s0);
}
return i;
}
Mat kernel;
- bool haveSSE;
bool haveAVX2;
#if defined USE_IPP_SEP_FILTERS
private:
int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
{
- if( !checkHardwareSupport(CV_CPU_SSE) )
- return 0;
-
int i = 0, _ksize = kernel.rows + kernel.cols - 1;
float* dst = (float*)_dst;
const float* src = (const float*)_src + (_ksize/2)*cn;
return 0;
if( _ksize == 3 )
{
- if( kx[0] == 2 && kx[1] == 1 )
- for( ; i <= width - 8; i += 8, src += 8 )
- {
- __m128 x0, x1, x2, y0, y1, y2;
- x0 = _mm_loadu_ps(src - cn);
- x1 = _mm_loadu_ps(src);
- x2 = _mm_loadu_ps(src + cn);
- y0 = _mm_loadu_ps(src - cn + 4);
- y1 = _mm_loadu_ps(src + 4);
- y2 = _mm_loadu_ps(src + cn + 4);
- x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2));
- y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2));
- _mm_store_ps(dst + i, x0);
- _mm_store_ps(dst + i + 4, y0);
- }
- else if( kx[0] == -2 && kx[1] == 1 )
- for( ; i <= width - 8; i += 8, src += 8 )
- {
- __m128 x0, x1, x2, y0, y1, y2;
- x0 = _mm_loadu_ps(src - cn);
- x1 = _mm_loadu_ps(src);
- x2 = _mm_loadu_ps(src + cn);
- y0 = _mm_loadu_ps(src - cn + 4);
- y1 = _mm_loadu_ps(src + 4);
- y2 = _mm_loadu_ps(src + cn + 4);
- x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
- y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
- _mm_store_ps(dst + i, x0);
- _mm_store_ps(dst + i + 4, y0);
- }
+ if( fabs(kx[0]) == 2 && kx[1] == 1 )
+ {
+ v_float32 k0 = vx_setall_f32(kx[0]);
+ for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+ v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - cn) + vx_load(src + cn)));
+ }
else
{
- __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]);
- for( ; i <= width - 8; i += 8, src += 8 )
- {
- __m128 x0, x1, x2, y0, y1, y2;
- x0 = _mm_loadu_ps(src - cn);
- x1 = _mm_loadu_ps(src);
- x2 = _mm_loadu_ps(src + cn);
- y0 = _mm_loadu_ps(src - cn + 4);
- y1 = _mm_loadu_ps(src + 4);
- y2 = _mm_loadu_ps(src + cn + 4);
-
- x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
- y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
- x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
- y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
- _mm_store_ps(dst + i, x0);
- _mm_store_ps(dst + i + 4, y0);
- }
+ v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]);
+ for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+ v_store(dst + i, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1));
}
}
else if( _ksize == 5 )
{
if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
- for( ; i <= width - 8; i += 8, src += 8 )
- {
- __m128 x0, x1, x2, y0, y1, y2;
- x0 = _mm_loadu_ps(src - cn*2);
- x1 = _mm_loadu_ps(src);
- x2 = _mm_loadu_ps(src + cn*2);
- y0 = _mm_loadu_ps(src - cn*2 + 4);
- y1 = _mm_loadu_ps(src + 4);
- y2 = _mm_loadu_ps(src + cn*2 + 4);
- x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
- y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
- _mm_store_ps(dst + i, x0);
- _mm_store_ps(dst + i + 4, y0);
- }
+ {
+ v_float32 k0 = vx_setall_f32(-2);
+ for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+ v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - 2*cn) + vx_load(src + 2*cn)));
+ }
else
{
- __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
- for( ; i <= width - 8; i += 8, src += 8 )
- {
- __m128 x0, x1, x2, y0, y1, y2;
- x0 = _mm_loadu_ps(src - cn);
- x1 = _mm_loadu_ps(src);
- x2 = _mm_loadu_ps(src + cn);
- y0 = _mm_loadu_ps(src - cn + 4);
- y1 = _mm_loadu_ps(src + 4);
- y2 = _mm_loadu_ps(src + cn + 4);
-
- x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
- y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
- x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
- y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
-
- x2 = _mm_add_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
- y2 = _mm_add_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
- x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
- y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
-
- _mm_store_ps(dst + i, x0);
- _mm_store_ps(dst + i + 4, y0);
- }
+ v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]);
+ for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+ v_store(dst + i, v_muladd(vx_load(src + 2*cn) + vx_load(src - 2*cn), k2, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1)));
}
}
}
if( _ksize == 3 )
{
if( kx[0] == 0 && kx[1] == 1 )
- for( ; i <= width - 8; i += 8, src += 8 )
- {
- __m128 x0, x2, y0, y2;
- x0 = _mm_loadu_ps(src + cn);
- x2 = _mm_loadu_ps(src - cn);
- y0 = _mm_loadu_ps(src + cn + 4);
- y2 = _mm_loadu_ps(src - cn + 4);
- x0 = _mm_sub_ps(x0, x2);
- y0 = _mm_sub_ps(y0, y2);
- _mm_store_ps(dst + i, x0);
- _mm_store_ps(dst + i + 4, y0);
- }
+ for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+ v_store(dst + i, vx_load(src + cn) - vx_load(src - cn));
else
{
- __m128 k1 = _mm_set1_ps(kx[1]);
- for( ; i <= width - 8; i += 8, src += 8 )
- {
- __m128 x0, x2, y0, y2;
- x0 = _mm_loadu_ps(src + cn);
- x2 = _mm_loadu_ps(src - cn);
- y0 = _mm_loadu_ps(src + cn + 4);
- y2 = _mm_loadu_ps(src - cn + 4);
-
- x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
- y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
- _mm_store_ps(dst + i, x0);
- _mm_store_ps(dst + i + 4, y0);
- }
+ v_float32 k1 = vx_setall_f32(kx[1]);
+ for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+ v_store(dst + i, (vx_load(src + cn) - vx_load(src - cn)) * k1);
}
}
else if( _ksize == 5 )
{
- __m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
- for( ; i <= width - 8; i += 8, src += 8 )
- {
- __m128 x0, x2, y0, y2;
- x0 = _mm_loadu_ps(src + cn);
- x2 = _mm_loadu_ps(src - cn);
- y0 = _mm_loadu_ps(src + cn + 4);
- y2 = _mm_loadu_ps(src - cn + 4);
-
- x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
- y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
-
- x2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
- y2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
- x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
- y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
-
- _mm_store_ps(dst + i, x0);
- _mm_store_ps(dst + i + 4, y0);
- }
+ v_float32 k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]);
+ for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+ v_store(dst + i, v_muladd(vx_load(src + 2*cn) - vx_load(src - 2*cn), k2, (vx_load(src + cn) - vx_load(src - cn)) * k1));
}
}
{
SymmColumnVec_32f() {
symmetryType=0;
- haveSSE = checkHardwareSupport(CV_CPU_SSE);
haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
delta = 0;
}
symmetryType = _symmetryType;
kernel = _kernel;
delta = (float)_delta;
- haveSSE = checkHardwareSupport(CV_CPU_SSE);
haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
}
int operator()(const uchar** _src, uchar* _dst, int width) const
{
- if( !haveSSE )
- return 0;
-
int ksize2 = (kernel.rows + kernel.cols - 1)/2;
const float* ky = kernel.ptr<float>() + ksize2;
int i = 0, k;
bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
const float** src = (const float**)_src;
- const float *S, *S2;
float* dst = (float*)_dst;
if( symmetrical )
if (haveAVX2)
return SymmColumnVec_32f_Symm_AVX(src, ky, dst, delta, width, ksize2);
#endif
- const __m128 d4 = _mm_set1_ps(delta);
- for( ; i <= width - 16; i += 16 )
- {
- __m128 f = _mm_set1_ps(ky[0]);
- __m128 s0, s1, s2, s3;
- __m128 x0, x1;
- S = src[0] + i;
- s0 = _mm_load_ps(S);
- s1 = _mm_load_ps(S+4);
- s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
- s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
- s2 = _mm_load_ps(S+8);
- s3 = _mm_load_ps(S+12);
- s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
- s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
-
- for( k = 1; k <= ksize2; k++ )
- {
- S = src[k] + i;
- S2 = src[-k] + i;
- f = _mm_set1_ps(ky[k]);
- x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
- x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
- s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
- s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
- x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
- x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
- s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
- s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
- }
-
- _mm_storeu_ps(dst + i, s0);
- _mm_storeu_ps(dst + i + 4, s1);
- _mm_storeu_ps(dst + i + 8, s2);
- _mm_storeu_ps(dst + i + 12, s3);
- }
-
- for( ; i <= width - 4; i += 4 )
- {
- __m128 f = _mm_set1_ps(ky[0]);
- __m128 x0, s0 = _mm_load_ps(src[0] + i);
- s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
-
- for( k = 1; k <= ksize2; k++ )
- {
- f = _mm_set1_ps(ky[k]);
- S = src[k] + i;
- S2 = src[-k] + i;
- x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
- s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
- }
-
- _mm_storeu_ps(dst + i, s0);
- }
- }
- else
- {
- #if CV_TRY_AVX2
- if (haveAVX2)
- return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2);
- #endif
- const __m128 d4 = _mm_set1_ps(delta);
- for( ; i <= width - 16; i += 16 )
- {
- __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
- __m128 x0, x1;
- S = src[0] + i;
-
- for( k = 1; k <= ksize2; k++ )
- {
- S = src[k] + i;
- S2 = src[-k] + i;
- f = _mm_set1_ps(ky[k]);
- x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
- x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
- s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
- s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
- x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
- x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
- s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
- s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
- }
-
- _mm_storeu_ps(dst + i, s0);
- _mm_storeu_ps(dst + i + 4, s1);
- _mm_storeu_ps(dst + i + 8, s2);
- _mm_storeu_ps(dst + i + 12, s3);
- }
-
- for( ; i <= width - 4; i += 4 )
+ const v_float32 d4 = vx_setall_f32(delta);
+ for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
{
- __m128 f, x0, s0 = d4;
-
+ v_float32 s0 = v_muladd(vx_load(src[0] + i), vx_setall_f32(ky[0]), d4);
for( k = 1; k <= ksize2; k++ )
- {
- f = _mm_set1_ps(ky[k]);
- x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
- s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
- }
-
- _mm_storeu_ps(dst + i, s0);
+ s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+ v_store(dst + i, s0);
}
}
-
- return i;
- }
-
- int symmetryType;
- float delta;
- Mat kernel;
- bool haveSSE;
- bool haveAVX2;
- };
-
-
- struct SymmColumnSmallVec_32f
- {
- SymmColumnSmallVec_32f() { symmetryType=0; delta = 0; }
- SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
- {
- symmetryType = _symmetryType;
- kernel = _kernel;
- delta = (float)_delta;
- CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
- }
-
- int operator()(const uchar** _src, uchar* _dst, int width) const
- {
- if( !checkHardwareSupport(CV_CPU_SSE) )
- return 0;
-
- int ksize2 = (kernel.rows + kernel.cols - 1)/2;
- const float* ky = kernel.ptr<float>() + ksize2;
- int i = 0;
- bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
- const float** src = (const float**)_src;
- const float *S0 = src[-1], *S1 = src[0], *S2 = src[1];
- float* dst = (float*)_dst;
- __m128 d4 = _mm_set1_ps(delta);
-
- if( symmetrical )
- {
- if( ky[0] == 2 && ky[1] == 1 )
- {
- for( ; i <= width - 8; i += 8 )
- {
- __m128 s0, s1, s2, s3, s4, s5;
- s0 = _mm_load_ps(S0 + i);
- s1 = _mm_load_ps(S0 + i + 4);
- s2 = _mm_load_ps(S1 + i);
- s3 = _mm_load_ps(S1 + i + 4);
- s4 = _mm_load_ps(S2 + i);
- s5 = _mm_load_ps(S2 + i + 4);
- s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2)));
- s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3)));
- s0 = _mm_add_ps(s0, d4);
- s1 = _mm_add_ps(s1, d4);
- _mm_storeu_ps(dst + i, s0);
- _mm_storeu_ps(dst + i + 4, s1);
- }
- }
- else if( ky[0] == -2 && ky[1] == 1 )
- {
- for( ; i <= width - 8; i += 8 )
- {
- __m128 s0, s1, s2, s3, s4, s5;
- s0 = _mm_load_ps(S0 + i);
- s1 = _mm_load_ps(S0 + i + 4);
- s2 = _mm_load_ps(S1 + i);
- s3 = _mm_load_ps(S1 + i + 4);
- s4 = _mm_load_ps(S2 + i);
- s5 = _mm_load_ps(S2 + i + 4);
- s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2)));
- s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3)));
- s0 = _mm_add_ps(s0, d4);
- s1 = _mm_add_ps(s1, d4);
- _mm_storeu_ps(dst + i, s0);
- _mm_storeu_ps(dst + i + 4, s1);
- }
- }
- else
- {
- __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
- for( ; i <= width - 8; i += 8 )
- {
- __m128 s0, s1, x0, x1;
- s0 = _mm_load_ps(S1 + i);
- s1 = _mm_load_ps(S1 + i + 4);
- s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4);
- s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4);
- x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i));
- x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4));
- s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
- s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
- _mm_storeu_ps(dst + i, s0);
- _mm_storeu_ps(dst + i + 4, s1);
- }
- }
- }
- else
- {
- if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
- {
- if( ky[1] < 0 )
- std::swap(S0, S2);
- for( ; i <= width - 8; i += 8 )
- {
- __m128 s0, s1, s2, s3;
- s0 = _mm_load_ps(S2 + i);
- s1 = _mm_load_ps(S2 + i + 4);
- s2 = _mm_load_ps(S0 + i);
- s3 = _mm_load_ps(S0 + i + 4);
- s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4);
- s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4);
- _mm_storeu_ps(dst + i, s0);
- _mm_storeu_ps(dst + i + 4, s1);
- }
- }
- else
- {
- __m128 k1 = _mm_set1_ps(ky[1]);
- for( ; i <= width - 8; i += 8 )
- {
- __m128 s0 = d4, s1 = d4, x0, x1;
- x0 = _mm_sub_ps(_mm_load_ps(S2 + i), _mm_load_ps(S0 + i));
- x1 = _mm_sub_ps(_mm_load_ps(S2 + i + 4), _mm_load_ps(S0 + i + 4));
- s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
- s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
- _mm_storeu_ps(dst + i, s0);
- _mm_storeu_ps(dst + i + 4, s1);
- }
- }
- }
-
- return i;
- }
-
- int symmetryType;
- float delta;
- Mat kernel;
- };
-
-
- /////////////////////////////// non-separable filters ///////////////////////////////
-
- ///////////////////////////////// 8u<->8u, 8u<->16s /////////////////////////////////
-
- struct FilterVec_8u
- {
- FilterVec_8u() { delta = 0; _nz = 0; }
- FilterVec_8u(const Mat& _kernel, int _bits, double _delta)
- {
- Mat kernel;
- _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
- delta = (float)(_delta/(1 << _bits));
- std::vector<Point> coords;
- preprocess2DKernel(kernel, coords, coeffs);
- _nz = (int)coords.size();
- }
-
- int operator()(const uchar** src, uchar* dst, int width) const
- {
- if( !checkHardwareSupport(CV_CPU_SSE2) )
- return 0;
-
- const float* kf = (const float*)&coeffs[0];
- int i = 0, k, nz = _nz;
- __m128 d4 = _mm_set1_ps(delta);
-
- for( ; i <= width - 16; i += 16 )
- {
- __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
- __m128i x0, x1, z = _mm_setzero_si128();
-
- for( k = 0; k < nz; k++ )
- {
- __m128 f = _mm_load_ss(kf+k), t0, t1;
- f = _mm_shuffle_ps(f, f, 0);
-
- x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
- x1 = _mm_unpackhi_epi8(x0, z);
- x0 = _mm_unpacklo_epi8(x0, z);
-
- t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
- t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
- s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
- s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
-
- t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
- t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
- s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
- s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
- }
-
- x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
- x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
- x0 = _mm_packus_epi16(x0, x1);
- _mm_storeu_si128((__m128i*)(dst + i), x0);
- }
-
- for( ; i <= width - 4; i += 4 )
- {
- __m128 s0 = d4;
- __m128i x0, z = _mm_setzero_si128();
-
- for( k = 0; k < nz; k++ )
- {
- __m128 f = _mm_load_ss(kf+k), t0;
- f = _mm_shuffle_ps(f, f, 0);
-
- x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
- x0 = _mm_unpacklo_epi8(x0, z);
- t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
- s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
- }
-
- x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
- x0 = _mm_packus_epi16(x0, x0);
- *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
- }
-
- return i;
- }
-
- int _nz;
- std::vector<uchar> coeffs;
- float delta;
- };
-
-
- struct FilterVec_8u16s
- {
- FilterVec_8u16s() { delta = 0; _nz = 0; }
- FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta)
- {
- Mat kernel;
- _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
- delta = (float)(_delta/(1 << _bits));
- std::vector<Point> coords;
- preprocess2DKernel(kernel, coords, coeffs);
- _nz = (int)coords.size();
- }
-
- int operator()(const uchar** src, uchar* _dst, int width) const
- {
- if( !checkHardwareSupport(CV_CPU_SSE2) )
- return 0;
-
- const float* kf = (const float*)&coeffs[0];
- short* dst = (short*)_dst;
- int i = 0, k, nz = _nz;
- __m128 d4 = _mm_set1_ps(delta);
-
- for( ; i <= width - 16; i += 16 )
- {
- __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
- __m128i x0, x1, z = _mm_setzero_si128();
-
- for( k = 0; k < nz; k++ )
- {
- __m128 f = _mm_load_ss(kf+k), t0, t1;
- f = _mm_shuffle_ps(f, f, 0);
-
- x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
- x1 = _mm_unpackhi_epi8(x0, z);
- x0 = _mm_unpacklo_epi8(x0, z);
-
- t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
- t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
- s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
- s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
-
- t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
- t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
- s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
- s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
- }
-
- x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
- x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
- _mm_storeu_si128((__m128i*)(dst + i), x0);
- _mm_storeu_si128((__m128i*)(dst + i + 8), x1);
- }
-
- for( ; i <= width - 4; i += 4 )
- {
- __m128 s0 = d4;
- __m128i x0, z = _mm_setzero_si128();
-
- for( k = 0; k < nz; k++ )
- {
- __m128 f = _mm_load_ss(kf+k), t0;
- f = _mm_shuffle_ps(f, f, 0);
-
- x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
- x0 = _mm_unpacklo_epi8(x0, z);
- t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
- s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
- }
-
- x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
- _mm_storel_epi64((__m128i*)(dst + i), x0);
- }
-
- return i;
- }
-
- int _nz;
- std::vector<uchar> coeffs;
- float delta;
- };
-
-
- struct FilterVec_32f
- {
- FilterVec_32f() { delta = 0; _nz = 0; }
- FilterVec_32f(const Mat& _kernel, int, double _delta)
- {
- delta = (float)_delta;
- std::vector<Point> coords;
- preprocess2DKernel(_kernel, coords, coeffs);
- _nz = (int)coords.size();
- }
-
- int operator()(const uchar** _src, uchar* _dst, int width) const
- {
- if( !checkHardwareSupport(CV_CPU_SSE) )
- return 0;
-
- const float* kf = (const float*)&coeffs[0];
- const float** src = (const float**)_src;
- float* dst = (float*)_dst;
- int i = 0, k, nz = _nz;
- __m128 d4 = _mm_set1_ps(delta);
-
- for( ; i <= width - 16; i += 16 )
- {
- __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
-
- for( k = 0; k < nz; k++ )
- {
- __m128 f = _mm_load_ss(kf+k), t0, t1;
- f = _mm_shuffle_ps(f, f, 0);
- const float* S = src[k] + i;
-
- t0 = _mm_loadu_ps(S);
- t1 = _mm_loadu_ps(S + 4);
- s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
- s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
-
- t0 = _mm_loadu_ps(S + 8);
- t1 = _mm_loadu_ps(S + 12);
- s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
- s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
- }
-
- _mm_storeu_ps(dst + i, s0);
- _mm_storeu_ps(dst + i + 4, s1);
- _mm_storeu_ps(dst + i + 8, s2);
- _mm_storeu_ps(dst + i + 12, s3);
- }
-
- for( ; i <= width - 4; i += 4 )
- {
- __m128 s0 = d4;
-
- for( k = 0; k < nz; k++ )
- {
- __m128 f = _mm_load_ss(kf+k), t0;
- f = _mm_shuffle_ps(f, f, 0);
- t0 = _mm_loadu_ps(src[k] + i);
- s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
- }
- _mm_storeu_ps(dst + i, s0);
- }
-
- return i;
- }
-
- int _nz;
- std::vector<uchar> coeffs;
- float delta;
- };
-
-
- #elif CV_NEON
-
- struct SymmRowSmallVec_8u32s
- {
- SymmRowSmallVec_8u32s() { smallValues = false; }
- SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
- {
- kernel = _kernel;
- symmetryType = _symmetryType;
- smallValues = true;
- int k, ksize = kernel.rows + kernel.cols - 1;
- for( k = 0; k < ksize; k++ )
- {
- int v = kernel.ptr<int>()[k];
- if( v < SHRT_MIN || v > SHRT_MAX )
- {
- smallValues = false;
- break;
- }
- }
- }
-
- int operator()(const uchar* src, uchar* _dst, int width, int cn) const
- {
- if( !checkHardwareSupport(CV_CPU_NEON) )
- return 0;
-
- int i = 0, _ksize = kernel.rows + kernel.cols - 1;
- int* dst = (int*)_dst;
- bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
- const int* kx = kernel.ptr<int>() + _ksize/2;
- if( !smallValues )
- return 0;
-
- src += (_ksize/2)*cn;
- width *= cn;
-
- if( symmetrical )
- {
- if( _ksize == 1 )
- return 0;
- if( _ksize == 3 )
- {
- if( kx[0] == 2 && kx[1] == 1 )
- {
- uint16x8_t zq = vdupq_n_u16(0);
-
- for( ; i <= width - 8; i += 8, src += 8 )
- {
- uint8x8_t x0, x1, x2;
- x0 = vld1_u8( (uint8_t *) (src - cn) );
- x1 = vld1_u8( (uint8_t *) (src) );
- x2 = vld1_u8( (uint8_t *) (src + cn) );
-
- uint16x8_t y0, y1, y2;
- y0 = vaddl_u8(x0, x2);
- y1 = vshll_n_u8(x1, 1);
- y2 = vaddq_u16(y0, y1);
-
- uint16x8x2_t str;
- str.val[0] = y2; str.val[1] = zq;
- vst2q_u16( (uint16_t *) (dst + i), str );
- }
- }
- else if( kx[0] == -2 && kx[1] == 1 )
- return 0;
- else
- {
- int32x4_t k32 = vdupq_n_s32(0);
- k32 = vld1q_lane_s32(kx, k32, 0);
- k32 = vld1q_lane_s32(kx + 1, k32, 1);
-
- int16x4_t k = vqmovn_s32(k32);
-
- uint8x8_t z = vdup_n_u8(0);
-
- for( ; i <= width - 8; i += 8, src += 8 )
- {
- uint8x8_t x0, x1, x2;
- x0 = vld1_u8( (uint8_t *) (src - cn) );
- x1 = vld1_u8( (uint8_t *) (src) );
- x2 = vld1_u8( (uint8_t *) (src + cn) );
-
- int16x8_t y0, y1;
- int32x4_t y2, y3;
- y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
- y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
- y2 = vmull_lane_s16(vget_low_s16(y0), k, 0);
- y2 = vmlal_lane_s16(y2, vget_low_s16(y1), k, 1);
- y3 = vmull_lane_s16(vget_high_s16(y0), k, 0);
- y3 = vmlal_lane_s16(y3, vget_high_s16(y1), k, 1);
-
- vst1q_s32((int32_t *)(dst + i), y2);
- vst1q_s32((int32_t *)(dst + i + 4), y3);
- }
- }
- }
- else if( _ksize == 5 )
- {
- if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
- return 0;
- else
- {
- int32x4_t k32 = vdupq_n_s32(0);
- k32 = vld1q_lane_s32(kx, k32, 0);
- k32 = vld1q_lane_s32(kx + 1, k32, 1);
- k32 = vld1q_lane_s32(kx + 2, k32, 2);
-
- int16x4_t k = vqmovn_s32(k32);
-
- uint8x8_t z = vdup_n_u8(0);
-
- for( ; i <= width - 8; i += 8, src += 8 )
- {
- uint8x8_t x0, x1, x2, x3, x4;
- x0 = vld1_u8( (uint8_t *) (src - cn) );
- x1 = vld1_u8( (uint8_t *) (src) );
- x2 = vld1_u8( (uint8_t *) (src + cn) );
-
- int16x8_t y0, y1;
- int32x4_t accl, acch;
- y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
- y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
- accl = vmull_lane_s16(vget_low_s16(y0), k, 0);
- accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 1);
- acch = vmull_lane_s16(vget_high_s16(y0), k, 0);
- acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 1);
-
- int16x8_t y2;
- x3 = vld1_u8( (uint8_t *) (src - cn*2) );
- x4 = vld1_u8( (uint8_t *) (src + cn*2) );
- y2 = vreinterpretq_s16_u16(vaddl_u8(x3, x4));
- accl = vmlal_lane_s16(accl, vget_low_s16(y2), k, 2);
- acch = vmlal_lane_s16(acch, vget_high_s16(y2), k, 2);
-
- vst1q_s32((int32_t *)(dst + i), accl);
- vst1q_s32((int32_t *)(dst + i + 4), acch);
- }
- }
- }
- }
- else
- {
- if( _ksize == 3 )
- {
- if( kx[0] == 0 && kx[1] == 1 )
- {
- uint8x8_t z = vdup_n_u8(0);
-
- for( ; i <= width - 8; i += 8, src += 8 )
- {
- uint8x8_t x0, x1;
- x0 = vld1_u8( (uint8_t *) (src - cn) );
- x1 = vld1_u8( (uint8_t *) (src + cn) );
-
- int16x8_t y0;
- y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
- vreinterpretq_s16_u16(vaddl_u8(x0, z)));
-
- vst1q_s32((int32_t *)(dst + i), vmovl_s16(vget_low_s16(y0)));
- vst1q_s32((int32_t *)(dst + i + 4), vmovl_s16(vget_high_s16(y0)));
- }
- }
- else
- {
- int32x4_t k32 = vdupq_n_s32(0);
- k32 = vld1q_lane_s32(kx + 1, k32, 1);
-
- int16x4_t k = vqmovn_s32(k32);
-
- uint8x8_t z = vdup_n_u8(0);
-
- for( ; i <= width - 8; i += 8, src += 8 )
- {
- uint8x8_t x0, x1;
- x0 = vld1_u8( (uint8_t *) (src - cn) );
- x1 = vld1_u8( (uint8_t *) (src + cn) );
-
- int16x8_t y0;
- int32x4_t y1, y2;
- y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
- vreinterpretq_s16_u16(vaddl_u8(x0, z)));
- y1 = vmull_lane_s16(vget_low_s16(y0), k, 1);
- y2 = vmull_lane_s16(vget_high_s16(y0), k, 1);
-
- vst1q_s32((int32_t *)(dst + i), y1);
- vst1q_s32((int32_t *)(dst + i + 4), y2);
- }
- }
- }
- else if( _ksize == 5 )
- {
- int32x4_t k32 = vdupq_n_s32(0);
- k32 = vld1q_lane_s32(kx + 1, k32, 1);
- k32 = vld1q_lane_s32(kx + 2, k32, 2);
-
- int16x4_t k = vqmovn_s32(k32);
-
- uint8x8_t z = vdup_n_u8(0);
-
- for( ; i <= width - 8; i += 8, src += 8 )
- {
- uint8x8_t x0, x1;
- x0 = vld1_u8( (uint8_t *) (src - cn) );
- x1 = vld1_u8( (uint8_t *) (src + cn) );
-
- int32x4_t accl, acch;
- int16x8_t y0;
- y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
- vreinterpretq_s16_u16(vaddl_u8(x0, z)));
- accl = vmull_lane_s16(vget_low_s16(y0), k, 1);
- acch = vmull_lane_s16(vget_high_s16(y0), k, 1);
-
- uint8x8_t x2, x3;
- x2 = vld1_u8( (uint8_t *) (src - cn*2) );
- x3 = vld1_u8( (uint8_t *) (src + cn*2) );
-
- int16x8_t y1;
- y1 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x3, z)),
- vreinterpretq_s16_u16(vaddl_u8(x2, z)));
- accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 2);
- acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 2);
-
- vst1q_s32((int32_t *)(dst + i), accl);
- vst1q_s32((int32_t *)(dst + i + 4), acch);
- }
- }
- }
-
- return i;
- }
-
- Mat kernel;
- int symmetryType;
- bool smallValues;
- };
-
-
- struct SymmColumnVec_32s8u
- {
- SymmColumnVec_32s8u() { symmetryType=0; }
- SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
- {
- symmetryType = _symmetryType;
- _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
- delta = (float)(_delta/(1 << _bits));
- CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
- }
-
- int operator()(const uchar** _src, uchar* dst, int width) const
- {
- if( !checkHardwareSupport(CV_CPU_NEON) )
- return 0;
-
- int _ksize = kernel.rows + kernel.cols - 1;
- int ksize2 = _ksize / 2;
- const float* ky = kernel.ptr<float>() + ksize2;
- int i = 0, k;
- bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
- const int** src = (const int**)_src;
- const int *S, *S2;
-
- float32x4_t d4 = vdupq_n_f32(delta);
-
- if( symmetrical )
- {
- if( _ksize == 1 )
- return 0;
-
-
- float32x2_t k32;
- k32 = vdup_n_f32(0);
- k32 = vld1_lane_f32(ky, k32, 0);
- k32 = vld1_lane_f32(ky + 1, k32, 1);
-
- for( ; i <= width - 8; i += 8 )
- {
- float32x4_t accl, acch;
- float32x4_t f0l, f0h, f1l, f1h, f2l, f2h;
-
- S = src[0] + i;
-
- f0l = vcvtq_f32_s32( vld1q_s32(S) );
- f0h = vcvtq_f32_s32( vld1q_s32(S + 4) );
-
- S = src[1] + i;
- S2 = src[-1] + i;
-
- f1l = vcvtq_f32_s32( vld1q_s32(S) );
- f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
- f2l = vcvtq_f32_s32( vld1q_s32(S2) );
- f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
-
- accl = acch = d4;
- accl = vmlaq_lane_f32(accl, f0l, k32, 0);
- acch = vmlaq_lane_f32(acch, f0h, k32, 0);
- accl = vmlaq_lane_f32(accl, vaddq_f32(f1l, f2l), k32, 1);
- acch = vmlaq_lane_f32(acch, vaddq_f32(f1h, f2h), k32, 1);
-
- for( k = 2; k <= ksize2; k++ )
- {
- S = src[k] + i;
- S2 = src[-k] + i;
-
- float32x4_t f3l, f3h, f4l, f4h;
- f3l = vcvtq_f32_s32( vld1q_s32(S) );
- f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
- f4l = vcvtq_f32_s32( vld1q_s32(S2) );
- f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
-
- accl = vmlaq_n_f32(accl, vaddq_f32(f3l, f4l), ky[k]);
- acch = vmlaq_n_f32(acch, vaddq_f32(f3h, f4h), ky[k]);
- }
-
- int32x4_t s32l, s32h;
- s32l = vcvtq_s32_f32(accl);
- s32h = vcvtq_s32_f32(acch);
-
- int16x4_t s16l, s16h;
- s16l = vqmovn_s32(s32l);
- s16h = vqmovn_s32(s32h);
-
- uint8x8_t u8;
- u8 = vqmovun_s16(vcombine_s16(s16l, s16h));
-
- vst1_u8((uint8_t *)(dst + i), u8);
- }
- }
- else
- {
- float32x2_t k32;
- k32 = vdup_n_f32(0);
- k32 = vld1_lane_f32(ky + 1, k32, 1);
-
- for( ; i <= width - 8; i += 8 )
- {
- float32x4_t accl, acch;
- float32x4_t f1l, f1h, f2l, f2h;
-
- S = src[1] + i;
- S2 = src[-1] + i;
-
- f1l = vcvtq_f32_s32( vld1q_s32(S) );
- f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
- f2l = vcvtq_f32_s32( vld1q_s32(S2) );
- f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
-
- accl = acch = d4;
- accl = vmlaq_lane_f32(accl, vsubq_f32(f1l, f2l), k32, 1);
- acch = vmlaq_lane_f32(acch, vsubq_f32(f1h, f2h), k32, 1);
-
- for( k = 2; k <= ksize2; k++ )
- {
- S = src[k] + i;
- S2 = src[-k] + i;
-
- float32x4_t f3l, f3h, f4l, f4h;
- f3l = vcvtq_f32_s32( vld1q_s32(S) );
- f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
- f4l = vcvtq_f32_s32( vld1q_s32(S2) );
- f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
-
- accl = vmlaq_n_f32(accl, vsubq_f32(f3l, f4l), ky[k]);
- acch = vmlaq_n_f32(acch, vsubq_f32(f3h, f4h), ky[k]);
- }
-
- int32x4_t s32l, s32h;
- s32l = vcvtq_s32_f32(accl);
- s32h = vcvtq_s32_f32(acch);
-
- int16x4_t s16l, s16h;
- s16l = vqmovn_s32(s32l);
- s16h = vqmovn_s32(s32h);
-
- uint8x8_t u8;
- u8 = vqmovun_s16(vcombine_s16(s16l, s16h));
-
- vst1_u8((uint8_t *)(dst + i), u8);
+ else
+ {
+ #if CV_TRY_AVX2
+ if (haveAVX2)
+ return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2);
+ #endif
+ const v_float32 d4 = vx_setall_f32(delta);
+ for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+ {
+ v_float32 s0 = d4;
+ for( k = 1; k <= ksize2; k++ )
+ s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+ v_store(dst + i, s0);
}
}
int symmetryType;
float delta;
Mat kernel;
+ bool haveAVX2;
};
- struct SymmColumnSmallVec_32s16s
+ struct SymmColumnSmallVec_32f
{
- SymmColumnSmallVec_32s16s() { symmetryType=0; }
- SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
+ SymmColumnSmallVec_32f() { symmetryType=0; delta = 0; }
+ SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
{
symmetryType = _symmetryType;
- _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
- delta = (float)(_delta/(1 << _bits));
+ kernel = _kernel;
+ delta = (float)_delta;
CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
}
int operator()(const uchar** _src, uchar* _dst, int width) const
{
- if( !checkHardwareSupport(CV_CPU_NEON) )
- return 0;
-
int ksize2 = (kernel.rows + kernel.cols - 1)/2;
const float* ky = kernel.ptr<float>() + ksize2;
int i = 0;
bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
- const int** src = (const int**)_src;
- const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
- short* dst = (short*)_dst;
- float32x4_t df4 = vdupq_n_f32(delta);
- int32x4_t d4 = vcvtq_s32_f32(df4);
+ const float** src = (const float**)_src;
+ const float *S0 = src[-1], *S1 = src[0], *S2 = src[1];
+ float* dst = (float*)_dst;
+ v_float32 d4 = vx_setall_f32(delta);
if( symmetrical )
{
- if( ky[0] == 2 && ky[1] == 1 )
- {
- for( ; i <= width - 4; i += 4 )
- {
- int32x4_t x0, x1, x2;
- x0 = vld1q_s32((int32_t const *)(S0 + i));
- x1 = vld1q_s32((int32_t const *)(S1 + i));
- x2 = vld1q_s32((int32_t const *)(S2 + i));
-
- int32x4_t y0, y1, y2, y3;
- y0 = vaddq_s32(x0, x2);
- y1 = vqshlq_n_s32(x1, 1);
- y2 = vaddq_s32(y0, y1);
- y3 = vaddq_s32(y2, d4);
-
- int16x4_t t;
- t = vqmovn_s32(y3);
-
- vst1_s16((int16_t *)(dst + i), t);
- }
- }
- else if( ky[0] == -2 && ky[1] == 1 )
- {
- for( ; i <= width - 4; i += 4 )
- {
- int32x4_t x0, x1, x2;
- x0 = vld1q_s32((int32_t const *)(S0 + i));
- x1 = vld1q_s32((int32_t const *)(S1 + i));
- x2 = vld1q_s32((int32_t const *)(S2 + i));
-
- int32x4_t y0, y1, y2, y3;
- y0 = vaddq_s32(x0, x2);
- y1 = vqshlq_n_s32(x1, 1);
- y2 = vsubq_s32(y0, y1);
- y3 = vaddq_s32(y2, d4);
-
- int16x4_t t;
- t = vqmovn_s32(y3);
-
- vst1_s16((int16_t *)(dst + i), t);
- }
- }
- else if( ky[0] == 10 && ky[1] == 3 )
+ if( fabs(ky[0]) == 2 && ky[1] == 1 )
{
- for( ; i <= width - 4; i += 4 )
- {
- int32x4_t x0, x1, x2, x3;
- x0 = vld1q_s32((int32_t const *)(S0 + i));
- x1 = vld1q_s32((int32_t const *)(S1 + i));
- x2 = vld1q_s32((int32_t const *)(S2 + i));
-
- x3 = vaddq_s32(x0, x2);
-
- int32x4_t y0;
- y0 = vmlaq_n_s32(d4, x1, 10);
- y0 = vmlaq_n_s32(y0, x3, 3);
-
- int16x4_t t;
- t = vqmovn_s32(y0);
-
- vst1_s16((int16_t *)(dst + i), t);
- }
+ v_float32 k0 = vx_setall_f32(ky[0]);
+ for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+ v_store(dst + i, v_muladd(vx_load(S1 + i), k0, vx_load(S0 + i) + vx_load(S2 + i) + d4));
}
else
{
- float32x2_t k32 = vdup_n_f32(0);
- k32 = vld1_lane_f32(ky, k32, 0);
- k32 = vld1_lane_f32(ky + 1, k32, 1);
-
- for( ; i <= width - 4; i += 4 )
- {
- int32x4_t x0, x1, x2, x3, x4;
- x0 = vld1q_s32((int32_t const *)(S0 + i));
- x1 = vld1q_s32((int32_t const *)(S1 + i));
- x2 = vld1q_s32((int32_t const *)(S2 + i));
-
- x3 = vaddq_s32(x0, x2);
-
- float32x4_t s0, s1, s2;
- s0 = vcvtq_f32_s32(x1);
- s1 = vcvtq_f32_s32(x3);
- s2 = vmlaq_lane_f32(df4, s0, k32, 0);
- s2 = vmlaq_lane_f32(s2, s1, k32, 1);
-
- x4 = vcvtq_s32_f32(s2);
-
- int16x4_t x5;
- x5 = vqmovn_s32(x4);
-
- vst1_s16((int16_t *)(dst + i), x5);
- }
+ v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]);
+ for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+ v_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)));
}
}
else
{
if( ky[1] < 0 )
std::swap(S0, S2);
- for( ; i <= width - 4; i += 4 )
- {
- int32x4_t x0, x1;
- x0 = vld1q_s32((int32_t const *)(S0 + i));
- x1 = vld1q_s32((int32_t const *)(S2 + i));
-
- int32x4_t y0, y1;
- y0 = vsubq_s32(x1, x0);
- y1 = vqaddq_s32(y0, d4);
-
- int16x4_t t;
- t = vqmovn_s32(y1);
-
- vst1_s16((int16_t *)(dst + i), t);
- }
+ for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+ v_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4);
}
else
{
- float32x2_t k32 = vdup_n_f32(0);
- k32 = vld1_lane_f32(ky + 1, k32, 1);
-
- for( ; i <= width - 4; i += 4 )
- {
- int32x4_t x0, x1, x2, x3;
- x0 = vld1q_s32((int32_t const *)(S0 + i));
- x1 = vld1q_s32((int32_t const *)(S2 + i));
-
- x2 = vsubq_s32(x1, x0);
-
- float32x4_t s0, s1;
- s0 = vcvtq_f32_s32(x2);
- s1 = vmlaq_lane_f32(df4, s0, k32, 1);
-
- x3 = vcvtq_s32_f32(s1);
-
- int16x4_t x4;
- x4 = vqmovn_s32(x3);
-
- vst1_s16((int16_t *)(dst + i), x4);
- }
+ v_float32 k1 = vx_setall_f32(ky[1]);
+ for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+ v_store(dst + i, v_muladd(vx_load(S2 + i) - vx_load(S0 + i), k1, d4));
}
}
};
- struct SymmColumnVec_32f16s
+ /////////////////////////////// non-separable filters ///////////////////////////////
+
+ ///////////////////////////////// 8u<->8u, 8u<->16s /////////////////////////////////
+
+ struct FilterVec_8u
{
- SymmColumnVec_32f16s() { symmetryType=0; }
- SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
+ FilterVec_8u() { delta = 0; _nz = 0; }
+ FilterVec_8u(const Mat& _kernel, int _bits, double _delta)
{
- symmetryType = _symmetryType;
- kernel = _kernel;
- delta = (float)_delta;
- CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
- neon_supported = checkHardwareSupport(CV_CPU_NEON);
+ Mat kernel;
+ _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
+ delta = (float)(_delta/(1 << _bits));
+ std::vector<Point> coords;
+ preprocess2DKernel(kernel, coords, coeffs);
+ _nz = (int)coords.size();
}
- int operator()(const uchar** _src, uchar* _dst, int width) const
+ int operator()(const uchar** src, uchar* dst, int width) const
{
- if( !neon_supported )
- return 0;
-
- int _ksize = kernel.rows + kernel.cols - 1;
- int ksize2 = _ksize / 2;
- const float* ky = kernel.ptr<float>() + ksize2;
- int i = 0, k;
- bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
- const float** src = (const float**)_src;
- const float *S, *S2;
- short* dst = (short*)_dst;
-
- float32x4_t d4 = vdupq_n_f32(delta);
+ const float* kf = (const float*)&coeffs[0];
+ int i = 0, k, nz = _nz;
- if( symmetrical )
+ v_float32 d4 = vx_setall_f32(delta);
+ for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
{
- if( _ksize == 1 )
- return 0;
-
-
- float32x2_t k32;
- k32 = vdup_n_f32(0);
- k32 = vld1_lane_f32(ky, k32, 0);
- k32 = vld1_lane_f32(ky + 1, k32, 1);
-
- for( ; i <= width - 8; i += 8 )
+ v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
+ for( k = 0; k < nz; k++ )
{
- float32x4_t x0l, x0h, x1l, x1h, x2l, x2h;
- float32x4_t accl, acch;
-
- S = src[0] + i;
-
- x0l = vld1q_f32(S);
- x0h = vld1q_f32(S + 4);
-
- S = src[1] + i;
- S2 = src[-1] + i;
-
- x1l = vld1q_f32(S);
- x1h = vld1q_f32(S + 4);
- x2l = vld1q_f32(S2);
- x2h = vld1q_f32(S2 + 4);
-
- accl = acch = d4;
- accl = vmlaq_lane_f32(accl, x0l, k32, 0);
- acch = vmlaq_lane_f32(acch, x0h, k32, 0);
- accl = vmlaq_lane_f32(accl, vaddq_f32(x1l, x2l), k32, 1);
- acch = vmlaq_lane_f32(acch, vaddq_f32(x1h, x2h), k32, 1);
-
- for( k = 2; k <= ksize2; k++ )
- {
- S = src[k] + i;
- S2 = src[-k] + i;
-
- float32x4_t x3l, x3h, x4l, x4h;
- x3l = vld1q_f32(S);
- x3h = vld1q_f32(S + 4);
- x4l = vld1q_f32(S2);
- x4h = vld1q_f32(S2 + 4);
-
- accl = vmlaq_n_f32(accl, vaddq_f32(x3l, x4l), ky[k]);
- acch = vmlaq_n_f32(acch, vaddq_f32(x3h, x4h), ky[k]);
- }
-
- int32x4_t s32l, s32h;
- s32l = vcvtq_s32_f32(accl);
- s32h = vcvtq_s32_f32(acch);
-
- int16x4_t s16l, s16h;
- s16l = vqmovn_s32(s32l);
- s16h = vqmovn_s32(s32h);
-
- vst1_s16((int16_t *)(dst + i), s16l);
- vst1_s16((int16_t *)(dst + i + 4), s16h);
- }
+ v_float32 f = vx_setall_f32(kf[k]);
+ v_uint16 xl, xh;
+ v_expand(vx_load(src[k] + i), xl, xh);
+ v_uint32 x0, x1, x2, x3;
+ v_expand(xl, x0, x1);
+ v_expand(xh, x2, x3);
+ s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0);
+ s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1);
+ s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x2)), f, s2);
+ s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x3)), f, s3);
+ }
+ v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3))));
}
- else
+ if( i <= width - v_uint16::nlanes )
{
- float32x2_t k32;
- k32 = vdup_n_f32(0);
- k32 = vld1_lane_f32(ky + 1, k32, 1);
-
- for( ; i <= width - 8; i += 8 )
+ v_float32 s0 = d4, s1 = d4;
+ for( k = 0; k < nz; k++ )
{
- float32x4_t x1l, x1h, x2l, x2h;
- float32x4_t accl, acch;
-
- S = src[1] + i;
- S2 = src[-1] + i;
-
- x1l = vld1q_f32(S);
- x1h = vld1q_f32(S + 4);
- x2l = vld1q_f32(S2);
- x2h = vld1q_f32(S2 + 4);
-
- accl = acch = d4;
- accl = vmlaq_lane_f32(accl, vsubq_f32(x1l, x2l), k32, 1);
- acch = vmlaq_lane_f32(acch, vsubq_f32(x1h, x2h), k32, 1);
-
- for( k = 2; k <= ksize2; k++ )
- {
- S = src[k] + i;
- S2 = src[-k] + i;
-
- float32x4_t x3l, x3h, x4l, x4h;
- x3l = vld1q_f32(S);
- x3h = vld1q_f32(S + 4);
- x4l = vld1q_f32(S2);
- x4h = vld1q_f32(S2 + 4);
-
- accl = vmlaq_n_f32(accl, vsubq_f32(x3l, x4l), ky[k]);
- acch = vmlaq_n_f32(acch, vsubq_f32(x3h, x4h), ky[k]);
- }
-
- int32x4_t s32l, s32h;
- s32l = vcvtq_s32_f32(accl);
- s32h = vcvtq_s32_f32(acch);
-
- int16x4_t s16l, s16h;
- s16l = vqmovn_s32(s32l);
- s16h = vqmovn_s32(s32h);
-
- vst1_s16((int16_t *)(dst + i), s16l);
- vst1_s16((int16_t *)(dst + i + 4), s16h);
+ v_float32 f = vx_setall_f32(kf[k]);
+ v_uint32 x0, x1;
+ v_expand(vx_load_expand(src[k] + i), x0, x1);
+ s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0);
+ s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1);
}
+ v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+ i += v_uint16::nlanes;
+ }
+ #if CV_SIMD_WIDTH > 16
+ while( i <= width - v_int32x4::nlanes )
+ #else
+ if( i <= width - v_int32x4::nlanes )
+ #endif
+ {
+ v_float32x4 s0 = v_setall_f32(delta);
+ for( k = 0; k < nz; k++ )
+ s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[k] + i))), v_setall_f32(kf[k]), s0);
+ v_int32x4 s32 = v_round(s0);
+ v_int16x8 s16 = v_pack(s32, s32);
+ *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0();
+ i += v_int32x4::nlanes;
}
return i;
}
- int symmetryType;
+ int _nz;
+ std::vector<uchar> coeffs;
float delta;
- Mat kernel;
- bool neon_supported;
};
- struct SymmRowSmallVec_32f
+ struct FilterVec_8u16s
{
- SymmRowSmallVec_32f() {}
- SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
+ FilterVec_8u16s() { delta = 0; _nz = 0; }
+ FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta)
{
- kernel = _kernel;
- symmetryType = _symmetryType;
+ Mat kernel;
+ _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
+ delta = (float)(_delta/(1 << _bits));
+ std::vector<Point> coords;
+ preprocess2DKernel(kernel, coords, coeffs);
+ _nz = (int)coords.size();
}
- int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
+ int operator()(const uchar** src, uchar* _dst, int width) const
{
- if( !checkHardwareSupport(CV_CPU_NEON) )
- return 0;
-
- int i = 0, _ksize = kernel.rows + kernel.cols - 1;
- float* dst = (float*)_dst;
- const float* src = (const float*)_src + (_ksize/2)*cn;
- bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
- const float* kx = kernel.ptr<float>() + _ksize/2;
- width *= cn;
+ const float* kf = (const float*)&coeffs[0];
+ short* dst = (short*)_dst;
+ int i = 0, k, nz = _nz;
- if( symmetrical )
+ v_float32 d4 = vx_setall_f32(delta);
+ for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
{
- if( _ksize == 1 )
- return 0;
- if( _ksize == 3 )
- {
- if( kx[0] == 2 && kx[1] == 1 )
- return 0;
- else if( kx[0] == -2 && kx[1] == 1 )
- return 0;
- else
- {
- return 0;
- }
- }
- else if( _ksize == 5 )
+ v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
+ for( k = 0; k < nz; k++ )
{
- if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
- return 0;
- else
- {
- float32x2_t k0, k1;
- k0 = k1 = vdup_n_f32(0);
- k0 = vld1_lane_f32(kx + 0, k0, 0);
- k0 = vld1_lane_f32(kx + 1, k0, 1);
- k1 = vld1_lane_f32(kx + 2, k1, 0);
-
- for( ; i <= width - 4; i += 4, src += 4 )
- {
- float32x4_t x0, x1, x2, x3, x4;
- x0 = vld1q_f32(src);
- x1 = vld1q_f32(src - cn);
- x2 = vld1q_f32(src + cn);
- x3 = vld1q_f32(src - cn*2);
- x4 = vld1q_f32(src + cn*2);
-
- float32x4_t y0;
- y0 = vmulq_lane_f32(x0, k0, 0);
- y0 = vmlaq_lane_f32(y0, vaddq_f32(x1, x2), k0, 1);
- y0 = vmlaq_lane_f32(y0, vaddq_f32(x3, x4), k1, 0);
-
- vst1q_f32(dst + i, y0);
- }
- }
+ v_float32 f = vx_setall_f32(kf[k]);
+ v_uint16 xl, xh;
+ v_expand(vx_load(src[k] + i), xl, xh);
+ s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f, s0);
+ s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f, s1);
+ s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xh))), f, s2);
+ s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xh))), f, s3);
}
+ v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+ v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3)));
}
- else
+ if( i <= width - v_uint16::nlanes )
{
- if( _ksize == 3 )
- {
- if( kx[0] == 0 && kx[1] == 1 )
- return 0;
- else
- {
- return 0;
- }
- }
- else if( _ksize == 5 )
+ v_float32 s0 = d4, s1 = d4;
+ for( k = 0; k < nz; k++ )
{
- float32x2_t k;
- k = vdup_n_f32(0);
- k = vld1_lane_f32(kx + 1, k, 0);
- k = vld1_lane_f32(kx + 2, k, 1);
-
- for( ; i <= width - 4; i += 4, src += 4 )
- {
- float32x4_t x0, x1, x2, x3;
- x0 = vld1q_f32(src - cn);
- x1 = vld1q_f32(src + cn);
- x2 = vld1q_f32(src - cn*2);
- x3 = vld1q_f32(src + cn*2);
-
- float32x4_t y0;
- y0 = vmulq_lane_f32(vsubq_f32(x1, x0), k, 0);
- y0 = vmlaq_lane_f32(y0, vsubq_f32(x3, x2), k, 1);
-
- vst1q_f32(dst + i, y0);
- }
+ v_float32 f = vx_setall_f32(kf[k]);
+ v_uint16 x = vx_load_expand(src[k] + i);
+ s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f, s0);
+ s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f, s1);
}
+ v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+ i += v_uint16::nlanes;
+ }
+ if( i <= width - v_int32::nlanes )
+ {
+ v_float32 s0 = d4;
+ for( k = 0; k < nz; k++ )
+ s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0);
+ v_pack_store(dst + i, v_round(s0));
+ i += v_int32::nlanes;
}
return i;
}
- Mat kernel;
- int symmetryType;
+ int _nz;
+ std::vector<uchar> coeffs;
+ float delta;
};
- typedef RowNoVec RowVec_8u32s;
- typedef RowNoVec RowVec_16s32f;
- typedef RowNoVec RowVec_32f;
- typedef ColumnNoVec SymmColumnVec_32f;
- typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
- typedef FilterNoVec FilterVec_8u;
- typedef FilterNoVec FilterVec_8u16s;
- typedef FilterNoVec FilterVec_32f;
+ struct FilterVec_32f
+ {
+ FilterVec_32f() { delta = 0; _nz = 0; }
+ FilterVec_32f(const Mat& _kernel, int, double _delta)
+ {
+ delta = (float)_delta;
+ std::vector<Point> coords;
+ preprocess2DKernel(_kernel, coords, coeffs);
+ _nz = (int)coords.size();
+ }
+
+ int operator()(const uchar** _src, uchar* _dst, int width) const
+ {
+ const float* kf = (const float*)&coeffs[0];
+ const float** src = (const float**)_src;
+ float* dst = (float*)_dst;
+ int i = 0, k, nz = _nz;
+
+ v_float32 d4 = vx_setall_f32(delta);
+ for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+ {
+ v_float32 s0 = d4;
+ for( k = 0; k < nz; k++ )
+ s0 = v_muladd(vx_load(src[k] + i), vx_setall_f32(kf[k]), s0);
+ v_store(dst + i, s0);
+ }
+
+ return i;
+ }
+ int _nz;
+ std::vector<uchar> coeffs;
+ float delta;
+ };
#else
return success;
}
-#ifdef HAVE_IPP
+#if 0 //defined HAVE_IPP
static bool ippFilter2D(int stype, int dtype, int kernel_type,
uchar * src_data, size_t src_step,
uchar * dst_data, size_t dst_step,
double delta, int borderType)
{
{
- #if CV_SSE2
int sdepth = CV_MAT_DEPTH(stype);
int ddepth = CV_MAT_DEPTH(dtype);
- int dft_filter_size = ((sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) || (sdepth == CV_32F && ddepth == CV_32F)) && checkHardwareSupport(CV_CPU_SSE3) ? 130 : 50;
- #else
- CV_UNUSED(stype);
- CV_UNUSED(dtype);
- int dft_filter_size = 50;
- #endif
+ int dft_filter_size = checkHardwareSupport(CV_CPU_SSE3) && ((sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) || (sdepth == CV_32F && ddepth == CV_32F)) ? 130 : 50;
if (kernel_width * kernel_height < dft_filter_size)
return false;
}
if (res)
return;
- CV_IPP_RUN_FAST(ippFilter2D(stype, dtype, kernel_type,
+ /*CV_IPP_RUN_FAST(ippFilter2D(stype, dtype, kernel_type,
src_data, src_step,
dst_data, dst_step,
width, height,
kernel_data, kernel_step,
kernel_width, kernel_height,
anchor_x, anchor_y,
- delta, borderType, isSubmatrix))
+ delta, borderType, isSubmatrix))*/
res = dftFilter2D(stype, dtype, kernel_type,
src_data, src_step,