\****************************************************************************************/
template<typename T, typename WT, typename MT>
-struct MomentsInTile_SSE
+struct MomentsInTile_SIMD
{
int operator() (const T *, int, WT &, WT &, WT &, MT &)
{
#if CV_SSE2
template <>
-struct MomentsInTile_SSE<uchar, int, int>
+struct MomentsInTile_SIMD<uchar, int, int>
{
- MomentsInTile_SSE()
+ MomentsInTile_SIMD()
{
useSIMD = checkHardwareSupport(CV_CPU_SSE2);
}
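+ // process 8 pixels per iteration; qx holds the 16-bit column indices
+ // and qx0..qx3 accumulate sum(p), sum(p*x), sum(p*x^2), sum(p*x^3)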
for( ; x <= len - 8; x += 8 )
{
__m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z);
- qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
- __m128i px = _mm_mullo_epi16(p, qx);
__m128i sx = _mm_mullo_epi16(qx, qx);
+
+ qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx));
qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx));
- qx3 = _mm_add_epi32(qx3, _mm_madd_epi16(px, sx));
+ qx3 = _mm_add_epi32(qx3, _mm_madd_epi16(_mm_mullo_epi16(p, qx), sx));
qx = _mm_add_epi16(qx, dx);
}
- int CV_DECL_ALIGNED(16) buf[4];
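+ // collapse the four 32-bit lanes of each accumulator into a scalar row sum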
_mm_store_si128((__m128i*)buf, qx0);
x0 = buf[0] + buf[1] + buf[2] + buf[3];
_mm_store_si128((__m128i*)buf, qx1);
return x;
}
+ int CV_DECL_ALIGNED(16) buf[4];
bool useSIMD;
};
+#elif CV_NEON
+
+template <>
+struct MomentsInTile_SIMD<uchar, int, int>
+{
+ MomentsInTile_SIMD()
+ {
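+ // per-lane column offsets start at {0,1,2,3}; v_step advances them by 4 after each half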
+ ushort CV_DECL_ALIGNED(8) init[4] = { 0, 1, 2, 3 };
+ qx_init = vld1_u16(init);
+ v_step = vdup_n_u16(4);
+ }
+
+ int operator() (const uchar * ptr, int len, int & x0, int & x1, int & x2, int & x3)
+ {
+ int x = 0;
+
+ uint32x4_t v_z = vdupq_n_u32(0), v_x0 = v_z, v_x1 = v_z,
+ v_x2 = v_z, v_x3 = v_z;
+ uint16x4_t qx = qx_init;
+
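+ // v_x0..v_x3 accumulate sum(p), sum(p*x), sum(p*x^2), sum(p*x^3);
+ // qx tracks the column indices of the current 4-pixel half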
+ for( ; x <= len - 8; x += 8 )
+ {
+ uint16x8_t v_src = vmovl_u8(vld1_u8(ptr + x));
+
+ // first part
+ uint32x4_t v_qx = vmovl_u16(qx);
+ uint16x4_t v_p = vget_low_u16(v_src);
+ uint32x4_t v_px = vmull_u16(qx, v_p);
+
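+ // v_px = p*x; multiplying by v_qx again yields p*x^2 and then p*x^3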
+ v_x0 = vaddw_u16(v_x0, v_p);
+ v_x1 = vaddq_u32(v_x1, v_px);
+ v_px = vmulq_u32(v_px, v_qx);
+ v_x2 = vaddq_u32(v_x2, v_px);
+ v_x3 = vaddq_u32(v_x3, vmulq_u32(v_px, v_qx));
+ qx = vadd_u16(qx, v_step);
+
+ // second part
+ v_qx = vmovl_u16(qx);
+ v_p = vget_high_u16(v_src);
+ v_px = vmull_u16(qx, v_p);
+
+ v_x0 = vaddw_u16(v_x0, v_p);
+ v_x1 = vaddq_u32(v_x1, v_px);
+ v_px = vmulq_u32(v_px, v_qx);
+ v_x2 = vaddq_u32(v_x2, v_px);
+ v_x3 = vaddq_u32(v_x3, vmulq_u32(v_px, v_qx));
+
+ qx = vadd_u16(qx, v_step);
+ }
+
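+ // reduce each 4-lane accumulator to a scalar sum for this row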
+ vst1q_u32(buf, v_x0);
+ x0 = buf[0] + buf[1] + buf[2] + buf[3];
+ vst1q_u32(buf, v_x1);
+ x1 = buf[0] + buf[1] + buf[2] + buf[3];
+ vst1q_u32(buf, v_x2);
+ x2 = buf[0] + buf[1] + buf[2] + buf[3];
+ vst1q_u32(buf, v_x3);
+ x3 = buf[0] + buf[1] + buf[2] + buf[3];
+
+ return x;
+ }
+
+ uint CV_DECL_ALIGNED(16) buf[4];
+ uint16x4_t qx_init, v_step;
+};
+
#endif
#if CV_SSE4_1
template <>
-struct MomentsInTile_SSE<ushort, int, int64>
+struct MomentsInTile_SIMD<ushort, int, int64>
{
- MomentsInTile_SSE()
+ MomentsInTile_SIMD()
{
useSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
}
v_ix1 = _mm_add_epi32(v_ix1, v_delta);
}
- int CV_DECL_ALIGNED(16) buf[4];
- int64 CV_DECL_ALIGNED(16) buf64[2];
-
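+ // collapse each vector accumulator into a scalar; the int64 result is reduced through buf64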
_mm_store_si128((__m128i*)buf, v_x0);
x0 = buf[0] + buf[1] + buf[2] + buf[3];
_mm_store_si128((__m128i*)buf, v_x1);
return x;
}
+ int CV_DECL_ALIGNED(16) buf[4];
+ int64 CV_DECL_ALIGNED(16) buf64[2];
bool useSIMD;
};
Size size = img.size();
int x, y;
MT mom[10] = {0,0,0,0,0,0,0,0,0,0};
- MomentsInTile_SSE<T, WT, MT> vop;
+ MomentsInTile_SIMD<T, WT, MT> vop;
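+ // vectorized per-row accumulation; the primary template above is the generic (scalar) fallback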
for( y = 0; y < size.height; y++ )
{