uint8x8_t v_mask;
};
+#elif CV_SSE2
+
+template <>
+struct Cmp_SIMD<schar>
+{
+ explicit Cmp_SIMD(int code_) :
+ code(code_)
+ {
+ CV_Assert(code == CMP_GT || code == CMP_LE ||
+ code == CMP_EQ || code == CMP_NE);
+
+ haveSSE = checkHardwareSupport(CV_CPU_SSE2);
+
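+ // All-ones mask; XORing a comparison result with it inverts the
+ // per-byte mask, turning GT into LE and EQ into NE.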
+ v_mask = _mm_set1_epi8(-1);
+ }
+
+ int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
+ {
+ int x = 0;
+
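+ // Without SSE2 nothing is vectorized here; returning 0 leaves the
+ // whole row to the caller's scalar loop.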
+ if (!haveSSE)
+ return x;
+
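+ // Process 16 signed bytes per iteration; each output byte is
+ // 0xff where the predicate holds and 0x00 elsewhere.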
+ if (code == CMP_GT)
+ for ( ; x <= width - 16; x += 16)
+ _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
+ _mm_loadu_si128((const __m128i *)(src2 + x))));
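+ // LE is computed as NOT(GT) by XORing with the all-ones mask.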
+ else if (code == CMP_LE)
+ for ( ; x <= width - 16; x += 16)
+ {
+ __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
+ _mm_loadu_si128((const __m128i *)(src2 + x)));
+ _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt));
+ }
+ else if (code == CMP_EQ)
+ for ( ; x <= width - 16; x += 16)
+ _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
+ _mm_loadu_si128((const __m128i *)(src2 + x))));
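+ // NE is computed as NOT(EQ) by XORing with the all-ones mask.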
+ else if (code == CMP_NE)
+ for ( ; x <= width - 16; x += 16)
+ {
+ __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
+ _mm_loadu_si128((const __m128i *)(src2 + x)));
+ _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq));
+ }
+
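+ // x is the number of elements handled here; the caller's scalar
+ // loop finishes the remaining width - x elements.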
+ return x;
+ }
+
+ int code;
+ __m128i v_mask;
+ bool haveSSE;
+};
+
+template <>
+struct Cmp_SIMD<int>
+{
+ explicit Cmp_SIMD(int code_) :
+ code(code_)
+ {
+ CV_Assert(code == CMP_GT || code == CMP_LE ||
+ code == CMP_EQ || code == CMP_NE);
+
+ haveSSE = checkHardwareSupport(CV_CPU_SSE2);
+
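+ // All-ones mask, used both to invert results (LE/NE) and as the
+ // dummy high half when packing eight 32-bit masks down to bytes.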
+ v_mask = _mm_set1_epi32(-1);
+ }
+
+ int operator () (const int * src1, const int * src2, uchar * dst, int width) const
+ {
+ int x = 0;
+
+ if (!haveSSE)
+ return x;
+
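+ // Process 8 ints per iteration: compare two 4-lane vectors, then
+ // narrow the 32-bit masks to 8 output bytes.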
+ if (code == CMP_GT)
+ for ( ; x <= width - 8; x += 8)
+ {
+ __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
+ _mm_loadu_si128((const __m128i *)(src2 + x)));
+ __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
+ _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
+
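+ // packs_epi32 saturates each 32-bit mask (0 or -1) to 16 bits,
+ // packs_epi16 to 8 bits; only the low 8 bytes of the result are
+ // meaningful, so store just 64 bits.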
+ _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
+ }
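+ // Same packing as GT above, then XOR with v_mask to invert GT into LE.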
+ else if (code == CMP_LE)
+ for ( ; x <= width - 8; x += 8)
+ {
+ __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
+ _mm_loadu_si128((const __m128i *)(src2 + x)));
+ __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
+ _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
+
+ _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask));
+ }
+ else if (code == CMP_EQ)
+ for ( ; x <= width - 8; x += 8)
+ {
+ __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
+ _mm_loadu_si128((const __m128i *)(src2 + x)));
+ __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
+ _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
+
+ _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
+ }
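+ // Same packing as EQ above, then XOR with v_mask to invert EQ into NE.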
+ else if (code == CMP_NE)
+ for ( ; x <= width - 8; x += 8)
+ {
+ __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
+ _mm_loadu_si128((const __m128i *)(src2 + x)));
+ __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
+ _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
+
+ _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)));
+ }
+
+ return x;
+ }
+
+ int code;
+ __m128i v_mask;
+ bool haveSSE;
+};
+
#endif
template<typename T> static void