+ return x;
+ }
+
+ bool useSIMD;
+};
+
+#endif
+
+#if CV_SSE4_1
+
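+ // SSE4.1 specialization for 16-bit (ushort) pixels. The zeroth-, first-
+ // and second-order row sums fit in 32-bit accumulators, but the third-
+ // order sum is widened to int64, since a running total of i^3 * 65535
+ // products would overflow 32 bits.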
+template <>
+struct MomentsInTile_SSE<ushort, int, int64>
+{
+ MomentsInTile_SSE()
+ {
+ useSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
+ }
+
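+ // Computes x0 = sum(v), x1 = sum(i*v), x2 = sum(i^2*v), x3 = sum(i^3*v)
+ // over one row of len pixels; returns the index of the first pixel left
+ // for the caller's scalar tail loop.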
+ int operator() (const ushort * ptr, int len, int & x0, int & x1, int & x2, int64 & x3)
+ {
+ int x = 0;
+
+ if (useSIMD)
+ {
+ __m128i vx_init0 = _mm_setr_epi32(0, 1, 2, 3), vx_init1 = _mm_setr_epi32(4, 5, 6, 7),
+ v_delta = _mm_set1_epi32(8), v_zero = _mm_setzero_si128(), v_x0 = v_zero,
+ v_x1 = v_zero, v_x2 = v_zero, v_x3 = v_zero, v_ix0 = vx_init0, v_ix1 = vx_init1;
+
+ for( ; x <= len - 8; x += 8 )
+ {
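+ // Load 8 ushort pixels and zero-extend them into two vectors of four
+ // 32-bit lanes (columns x+0..x+3 and x+4..x+7).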
+ __m128i v_src = _mm_loadu_si128((const __m128i *)(ptr + x));
+ __m128i v_src0 = _mm_unpacklo_epi16(v_src, v_zero), v_src1 = _mm_unpackhi_epi16(v_src, v_zero);
+
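+ // x0 += v and x1 += i*v; v_ix0/v_ix1 hold the current column indices.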
+ v_x0 = _mm_add_epi32(v_x0, _mm_add_epi32(v_src0, v_src1));
+ __m128i v_x1_0 = _mm_mullo_epi32(v_src0, v_ix0), v_x1_1 = _mm_mullo_epi32(v_src1, v_ix1);
+ v_x1 = _mm_add_epi32(v_x1, _mm_add_epi32(v_x1_0, v_x1_1));
+
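+ // x2 += i^2 * v.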
+ __m128i v_2ix0 = _mm_mullo_epi32(v_ix0, v_ix0), v_2ix1 = _mm_mullo_epi32(v_ix1, v_ix1);
+ v_x2 = _mm_add_epi32(v_x2, _mm_add_epi32(_mm_mullo_epi32(v_2ix0, v_src0), _mm_mullo_epi32(v_2ix1, v_src1)));
+
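+ // x3 += i^3 * v, reusing the i*v products (t = i^2 * (i*v)). Each lane
+ // of t may wrap the sign bit, but unpacking against zero zero-extends
+ // it, so the 64-bit accumulation stays exact as long as a lane is below
+ // 2^32; that holds for the narrow (up to 32 pixel) tiles this kernel
+ // is applied to.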
+ __m128i t = _mm_add_epi32(_mm_mullo_epi32(v_2ix0, v_x1_0), _mm_mullo_epi32(v_2ix1, v_x1_1));
+ v_x3 = _mm_add_epi64(v_x3, _mm_add_epi64(_mm_unpacklo_epi32(t, v_zero), _mm_unpackhi_epi32(t, v_zero)));
+
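+ // Advance the column indices to the next batch of 8 pixels.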
+ v_ix0 = _mm_add_epi32(v_ix0, v_delta);
+ v_ix1 = _mm_add_epi32(v_ix1, v_delta);
+ }
+
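+ // Horizontal reduction: fold each vector accumulator into its scalar
+ // output.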
+ int CV_DECL_ALIGNED(16) buf[4];
+ int64 CV_DECL_ALIGNED(16) buf64[2];
+
+ _mm_store_si128((__m128i*)buf, v_x0);
+ x0 = buf[0] + buf[1] + buf[2] + buf[3];
+ _mm_store_si128((__m128i*)buf, v_x1);
+ x1 = buf[0] + buf[1] + buf[2] + buf[3];
+ _mm_store_si128((__m128i*)buf, v_x2);
+ x2 = buf[0] + buf[1] + buf[2] + buf[3];
+
+ _mm_store_si128((__m128i*)buf64, v_x3);
+ x3 = buf64[0] + buf64[1];
+ }
+
+ return x;
+ }
+
+ bool useSIMD;
+};
+
+#endif
+
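+ // Accumulates the ten raw spatial moments (orders 0 through 3) of one
+ // tile, delegating the per-row sums to the SIMD helper when available.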
+template<typename T, typename WT, typename MT>
+#if defined __GNUC__ && __GNUC__ == 4 && __GNUC_MINOR__ >= 5 && __GNUC_MINOR__ < 9
+// Workaround for http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60196
+__attribute__((optimize("no-tree-vectorize")))
+#endif
+static void momentsInTile( const Mat& img, double* moments )
+{
+ Size size = img.size();
+ int x, y;
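+ // The ten raw moments of order <= 3 (m00 ... m03).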
+ MT mom[10] = {0,0,0,0,0,0,0,0,0,0};
+ MomentsInTile_SSE<T, WT, MT> vop;
+
+ for( y = 0; y < size.height; y++ )
+ {
+ const T* ptr = img.ptr<T>(y);
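+ // Per-row partial sums: x0 = sum(v), x1 = sum(x*v), x2 = sum(x^2*v),
+ // x3 = sum(x^3*v); x3 uses the wider MT type for the third-order terms.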
+ WT x0 = 0, x1 = 0, x2 = 0;
+ MT x3 = 0;
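+ // SIMD pass over the bulk of the row; x is the first column left for
+ // the scalar tail loop that follows.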
+ x = vop(ptr, size.width, x0, x1, x2, x3);
+