}
};
-#if CV_SSE2
+#if CV_SIMD128
+
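+// Each v_load_expand_from_*_f32 helper loads one register of narrow source
+// elements, widens them into two v_float32x4 halves (a = low lanes, b = high
+// lanes) and applies the affine transform shift + scale * x to both halves.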
+static inline void v_load_expand_from_u8_f32(const uchar* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b)
+{
+ v_uint32x4 v_src0, v_src1;
+ v_expand(v_load_expand(src), v_src0, v_src1);
+
+ a = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src0));
+ b = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src1));
+}
+
+static inline void v_load_expand_from_s8_f32(const schar* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b)
+{
+ v_int32x4 v_src0, v_src1;
+ v_expand(v_load_expand(src), v_src0, v_src1);
+
+ a = v_shift + v_scale * v_cvt_f32(v_src0);
+ b = v_shift + v_scale * v_cvt_f32(v_src1);
+}
+
+static inline void v_load_expand_from_u16_f32(const ushort* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b)
+{
+ v_uint32x4 v_src0, v_src1;
+ v_expand(v_load(src), v_src0, v_src1);
+
+ a = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src0));
+ b = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src1));
+}
+
+static inline void v_load_expand_from_s16_f32(const short* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b)
+{
+ v_int32x4 v_src0, v_src1;
+ v_expand(v_load(src), v_src0, v_src1);
+
+ a = v_shift + v_scale * v_cvt_f32(v_src0);
+ b = v_shift + v_scale * v_cvt_f32(v_src1);
+}
+
+static inline void v_load_expand_from_s32_f32(const int* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b)
+{
+ a = v_shift + v_scale * v_cvt_f32(v_load(src));
+ b = v_shift + v_scale * v_cvt_f32(v_load(src + v_int32x4::nlanes));
+}
template <>
struct cvtScaleAbs_SIMD<uchar, uchar, float>
{
int operator () (const uchar * src, uchar * dst, int width,
- float scale, float shift) const
+ float scale, float shift) const
{
int x = 0;
-
- if (USE_SSE2)
+ if (hasSIMD128())
{
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
- v_zero_f = _mm_setzero_ps();
- __m128i v_zero_i = _mm_setzero_si128();
-
- for ( ; x <= width - 16; x += 16)
+ v_float32x4 v_shift = v_setall_f32(shift);
+ v_float32x4 v_scale = v_setall_f32(scale);
+ const int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
{
- __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
- __m128i v_src12 = _mm_unpacklo_epi8(v_src, v_zero_i), v_src_34 = _mm_unpackhi_epi8(v_src, v_zero_i);
- __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src12, v_zero_i)), v_scale), v_shift);
- v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
- __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src12, v_zero_i)), v_scale), v_shift);
- v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
- __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_34, v_zero_i)), v_scale), v_shift);
- v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3);
- __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_34, v_zero_i)), v_scale), v_shift);
- v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4);
-
- __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)),
- _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4)));
- _mm_storeu_si128((__m128i *)(dst + x), v_dst_i);
+ v_float32x4 v_dst_0, v_dst_1, v_dst_2, v_dst_3;
+ v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_dst_0, v_dst_1);
+ v_load_expand_from_u8_f32(src + x + cWidth, v_scale, v_shift, v_dst_2, v_dst_3);
+ v_dst_0 = v_abs(v_dst_0);
+ v_dst_1 = v_abs(v_dst_1);
+ v_dst_2 = v_abs(v_dst_2);
+ v_dst_3 = v_abs(v_dst_3);
+
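+ // round f32 to s32, saturate-pack pairs down to s16, then saturate-pack to u8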
+ v_int16x8 v_dsti_0 = v_pack(v_round(v_dst_0), v_round(v_dst_1));
+ v_int16x8 v_dsti_1 = v_pack(v_round(v_dst_2), v_round(v_dst_3));
+ v_store(dst + x, v_pack_u(v_dsti_0, v_dsti_1));
}
}
-
return x;
}
};
template <>
struct cvtScaleAbs_SIMD<schar, uchar, float>
{
int operator () (const schar * src, uchar * dst, int width,
- float scale, float shift) const
+ float scale, float shift) const
{
int x = 0;
-
- if (USE_SSE2)
+ if (hasSIMD128())
{
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
- v_zero_f = _mm_setzero_ps();
- __m128i v_zero_i = _mm_setzero_si128();
-
- for ( ; x <= width - 16; x += 16)
+ v_float32x4 v_shift = v_setall_f32(shift);
+ v_float32x4 v_scale = v_setall_f32(scale);
+ const int cWidth = v_int16x8::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
{
- __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
- __m128i v_src_12 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero_i, v_src), 8),
- v_src_34 = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero_i, v_src), 8);
- __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
- _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift);
- v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
- __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
- _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift);
- v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
- __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
- _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift);
- v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3);
- __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
- _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift);
- v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4);
-
- __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)),
- _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4)));
- _mm_storeu_si128((__m128i *)(dst + x), v_dst_i);
+ v_float32x4 v_dst_0, v_dst_1, v_dst_2, v_dst_3;
+ v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_dst_0, v_dst_1);
+ v_load_expand_from_s8_f32(src + x + cWidth, v_scale, v_shift, v_dst_2, v_dst_3);
+ v_dst_0 = v_abs(v_dst_0);
+ v_dst_1 = v_abs(v_dst_1);
+ v_dst_2 = v_abs(v_dst_2);
+ v_dst_3 = v_abs(v_dst_3);
+
+ v_uint16x8 v_dsti_0 = v_pack_u(v_round(v_dst_0), v_round(v_dst_1));
+ v_uint16x8 v_dsti_1 = v_pack_u(v_round(v_dst_2), v_round(v_dst_3));
+ v_store(dst + x, v_pack(v_dsti_0, v_dsti_1));
}
}
-
return x;
}
};
template <>
struct cvtScaleAbs_SIMD<ushort, uchar, float>
{
int operator () (const ushort * src, uchar * dst, int width,
- float scale, float shift) const
+ float scale, float shift) const
{
int x = 0;
-
- if (USE_SSE2)
+ if (hasSIMD128())
{
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
- v_zero_f = _mm_setzero_ps();
- __m128i v_zero_i = _mm_setzero_si128();
-
- for ( ; x <= width - 8; x += 8)
+ v_float32x4 v_shift = v_setall_f32(shift);
+ v_float32x4 v_scale = v_setall_f32(scale);
+ const int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
{
- __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
- __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero_i)), v_scale), v_shift);
- v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
- __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero_i)), v_scale), v_shift);
- v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
-
- __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i);
- _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
+ v_float32x4 v_dst0, v_dst1;
+ v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_dst0, v_dst1);
+ v_dst0 = v_abs(v_dst0);
+ v_dst1 = v_abs(v_dst1);
+
+ v_int16x8 v_dst = v_pack(v_round(v_dst0), v_round(v_dst1));
+ v_pack_u_store(dst + x, v_dst);
}
}
-
return x;
}
};
template <>
struct cvtScaleAbs_SIMD<short, uchar, float>
{
int operator () (const short * src, uchar * dst, int width,
- float scale, float shift) const
+ float scale, float shift) const
{
int x = 0;
-
- if (USE_SSE2)
+ if (hasSIMD128())
{
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
- v_zero_f = _mm_setzero_ps();
- __m128i v_zero_i = _mm_setzero_si128();
-
- for ( ; x <= width - 8; x += 8)
+ v_float32x4 v_shift = v_setall_f32(shift);
+ v_float32x4 v_scale = v_setall_f32(scale);
+ const int cWidth = v_int16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
{
- __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
- __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_src, v_src), 16)), v_scale), v_shift);
- v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
- __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_src, v_src), 16)), v_scale), v_shift);
- v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
-
- __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i);
- _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
+ v_float32x4 v_dst0, v_dst1;
+ v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_dst0, v_dst1);
+ v_dst0 = v_abs(v_dst0);
+ v_dst1 = v_abs(v_dst1);
+
+ v_int16x8 v_dst = v_pack(v_round(v_dst0), v_round(v_dst1));
+ v_pack_u_store(dst + x, v_dst);
}
}
-
return x;
}
};
template <>
struct cvtScaleAbs_SIMD<int, uchar, float>
{
int operator () (const int * src, uchar * dst, int width,
- float scale, float shift) const
+ float scale, float shift) const
{
int x = 0;
-
- if (USE_SSE2)
+ v_float32x4 v_shift = v_setall_f32(shift);
+ v_float32x4 v_scale = v_setall_f32(scale);
+ const int cWidth = v_int32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
{
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
- v_zero_f = _mm_setzero_ps();
- __m128i v_zero_i = _mm_setzero_si128();
+ v_float32x4 v_dst_0 = v_cvt_f32(v_load(src + x)) * v_scale;
+ v_dst_0 = v_abs(v_dst_0 + v_shift);
- for ( ; x <= width - 8; x += 4)
- {
- __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
- __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
- v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
+ v_float32x4 v_dst_1 = v_cvt_f32(v_load(src + x + cWidth)) * v_scale;
+ v_dst_1 = v_abs(v_dst_1 + v_shift);
- __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), v_zero_i), v_zero_i);
- _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
- }
+ v_int16x8 v_dst = v_pack(v_round(v_dst_0), v_round(v_dst_1));
+ v_pack_u_store(dst + x, v_dst);
}
return x;
}
};

template <>
struct cvtScaleAbs_SIMD<float, uchar, float>
{
int operator () (const float * src, uchar * dst, int width,
- float scale, float shift) const
+ float scale, float shift) const
{
int x = 0;
-
- if (USE_SSE2)
+ v_float32x4 v_shift = v_setall_f32(shift);
+ v_float32x4 v_scale = v_setall_f32(scale);
+ int cWidth = v_float32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
{
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
- v_zero_f = _mm_setzero_ps();
- __m128i v_zero_i = _mm_setzero_si128();
+ v_float32x4 v_dst_0 = v_load(src + x) * v_scale;
+ v_dst_0 = v_abs(v_dst_0 + v_shift);
- for ( ; x <= width - 8; x += 4)
- {
- __m128 v_dst = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + x), v_scale), v_shift);
- v_dst = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst), v_dst);
+ v_float32x4 v_dst_1 = v_load(src + x + cWidth) * v_scale;
+ v_dst_1 = v_abs(v_dst_1 + v_shift);
- __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst), v_zero_i);
- _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i));
- }
+ v_int16x8 v_dst = v_pack(v_round(v_dst_0), v_round(v_dst_1));
+ v_pack_u_store(dst + x, v_dst);
}
-
return x;
}
};
+#if CV_SIMD128_64F
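+// The double specialization needs v_float64x2, which only exists when CV_SIMD128_64F is set.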
template <>
struct cvtScaleAbs_SIMD<double, uchar, float>
{
int operator () (const double * src, uchar * dst, int width,
- float scale, float shift) const
+ float scale, float shift) const
{
int x = 0;
- if (USE_SSE2)
+ if (hasSIMD128())
{
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
- v_zero_f = _mm_setzero_ps();
- __m128i v_zero_i = _mm_setzero_si128();
-
- for ( ; x <= width - 8; x += 8)
+ v_float32x4 v_scale = v_setall_f32(scale);
+ v_float32x4 v_shift = v_setall_f32(shift);
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 4; x += cWidth * 4)
{
- __m128 v_src1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
- _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
- __m128 v_src2 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
- _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
-
- __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(v_src1, v_scale), v_shift);
- v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
+ v_float32x4 v_src1, v_src2, v_dummy;
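+ // v_cvt_f32 on a v_float64x2 fills only the low two lanes, so v_recombine
+ // gathers the low halves of two conversions into one full vector; v_dummy
+ // receives the unused high halves.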
+ v_recombine(v_cvt_f32(v_load(src + x)), v_cvt_f32(v_load(src + x + cWidth)), v_src1, v_dummy);
+ v_recombine(v_cvt_f32(v_load(src + x + cWidth * 2)), v_cvt_f32(v_load(src + x + cWidth * 3)), v_src2, v_dummy);
- __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(v_src2, v_scale), v_shift);
- v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
+ v_float32x4 v_dst1 = v_abs((v_src1 * v_scale) + v_shift);
+ v_float32x4 v_dst2 = v_abs((v_src2 * v_scale) + v_shift);
- __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1),
- _mm_cvtps_epi32(v_dst2));
-
- _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i));
+ v_int16x8 v_dst_i = v_pack(v_round(v_dst1), v_round(v_dst2));
+ v_pack_u_store(dst + x, v_dst_i);
}
}
return x;
}
};
-
-#elif CV_NEON
-
-template <>
-struct cvtScaleAbs_SIMD<uchar, uchar, float>
-{
- int operator () (const uchar * src, uchar * dst, int width,
- float scale, float shift) const
- {
- int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift);
-
- for ( ; x <= width - 16; x += 16)
- {
- uint8x16_t v_src = vld1q_u8(src + x);
- uint16x8_t v_half = vmovl_u8(vget_low_u8(v_src));
-
- uint32x4_t v_quat = vmovl_u16(vget_low_u16(v_half));
- float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
- v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
-
- v_quat = vmovl_u16(vget_high_u16(v_half));
- float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
- v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
-
- v_half = vmovl_u8(vget_high_u8(v_src));
-
- v_quat = vmovl_u16(vget_low_u16(v_half));
- float32x4_t v_dst_2 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
- v_dst_2 = vabsq_f32(vaddq_f32(v_dst_2, v_shift));
-
- v_quat = vmovl_u16(vget_high_u16(v_half));
- float32x4_t v_dst_3 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
- v_dst_3 = vabsq_f32(vaddq_f32(v_dst_3, v_shift));
-
- uint16x8_t v_dsti_0 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
- vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
- uint16x8_t v_dsti_1 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_2)),
- vqmovn_u32(cv_vrndq_u32_f32(v_dst_3)));
-
- vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_dsti_0), vqmovn_u16(v_dsti_1)));
- }
-
- return x;
- }
-};
-
-template <>
-struct cvtScaleAbs_SIMD<schar, uchar, float>
-{
- int operator () (const schar * src, uchar * dst, int width,
- float scale, float shift) const
- {
- int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift);
-
- for ( ; x <= width - 16; x += 16)
- {
- int8x16_t v_src = vld1q_s8(src + x);
- int16x8_t v_half = vmovl_s8(vget_low_s8(v_src));
-
- int32x4_t v_quat = vmovl_s16(vget_low_s16(v_half));
- float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
- v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
-
- v_quat = vmovl_s16(vget_high_s16(v_half));
- float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
- v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
-
- v_half = vmovl_s8(vget_high_s8(v_src));
-
- v_quat = vmovl_s16(vget_low_s16(v_half));
- float32x4_t v_dst_2 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
- v_dst_2 = vabsq_f32(vaddq_f32(v_dst_2, v_shift));
-
- v_quat = vmovl_s16(vget_high_s16(v_half));
- float32x4_t v_dst_3 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
- v_dst_3 = vabsq_f32(vaddq_f32(v_dst_3, v_shift));
-
- uint16x8_t v_dsti_0 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
- vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
- uint16x8_t v_dsti_1 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_2)),
- vqmovn_u32(cv_vrndq_u32_f32(v_dst_3)));
-
- vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_dsti_0), vqmovn_u16(v_dsti_1)));
- }
-
- return x;
- }
-};
-
-template <>
-struct cvtScaleAbs_SIMD<ushort, uchar, float>
-{
- int operator () (const ushort * src, uchar * dst, int width,
- float scale, float shift) const
- {
- int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift);
-
- for ( ; x <= width - 8; x += 8)
- {
- uint16x8_t v_src = vld1q_u16(src + x);
-
- uint32x4_t v_half = vmovl_u16(vget_low_u16(v_src));
- float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale);
- v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
-
- v_half = vmovl_u16(vget_high_u16(v_src));
- float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale);
- v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
-
- uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
- vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
-
- vst1_u8(dst + x, vqmovn_u16(v_dst));
- }
-
- return x;
- }
-};
-
-template <>
-struct cvtScaleAbs_SIMD<short, uchar, float>
-{
- int operator () (const short * src, uchar * dst, int width,
- float scale, float shift) const
- {
- int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift);
-
- for ( ; x <= width - 8; x += 8)
- {
- int16x8_t v_src = vld1q_s16(src + x);
-
- int32x4_t v_half = vmovl_s16(vget_low_s16(v_src));
- float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale);
- v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
-
- v_half = vmovl_s16(vget_high_s16(v_src));
- float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale);
- v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
-
- uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
- vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
-
- vst1_u8(dst + x, vqmovn_u16(v_dst));
- }
-
- return x;
- }
-};
-
-template <>
-struct cvtScaleAbs_SIMD<int, uchar, float>
-{
- int operator () (const int * src, uchar * dst, int width,
- float scale, float shift) const
- {
- int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift);
-
- for ( ; x <= width - 8; x += 8)
- {
- float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x)), scale);
- v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
- uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0));
-
- float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), scale);
- v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
- uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1));
-
- uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1);
- vst1_u8(dst + x, vqmovn_u16(v_dst));
- }
-
- return x;
- }
-};
-
-template <>
-struct cvtScaleAbs_SIMD<float, uchar, float>
-{
- int operator () (const float * src, uchar * dst, int width,
- float scale, float shift) const
- {
- int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift);
-
- for ( ; x <= width - 8; x += 8)
- {
- float32x4_t v_dst_0 = vmulq_n_f32(vld1q_f32(src + x), scale);
- v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
- uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0));
-
- float32x4_t v_dst_1 = vmulq_n_f32(vld1q_f32(src + x + 4), scale);
- v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
- uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1));
-
- uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1);
- vst1_u8(dst + x, vqmovn_u16(v_dst));
- }
-
- return x;
- }
-};
+#endif // CV_SIMD128_64F
#endif
}
};
-#if CV_SSE2
+#if CV_SIMD128
// from uchar
template <>
struct cvtScale_SIMD<uchar, uchar, float>
{
int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
+ v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2));
+ v_pack_u_store(dst + x, v_dst);
+ }
}
-
return x;
}
};
template <>
struct cvtScale_SIMD<uchar, schar, float>
{
int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_int16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
+ v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2));
+ v_store_low(dst + x, v_pack(v_dst, v_dst));
+ }
}
-
return x;
}
};
-#if CV_TRY_SSE4_1
-
template <>
struct cvtScale_SIMD<uchar, ushort, float>
{
- cvtScale_SIMD()
- {
- haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1;
- }
-
int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const
{
- if (haveSSE)
+ int x = 0;
+#if CV_TRY_SSE4_1
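+ // dispatch to the SSE4.1 kernel when the CPU supports it at run time,
+ // otherwise fall through to the portable universal-intrinsics loop below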
+ if (CV_CPU_HAS_SUPPORT_SSE4_1)
return opt_SSE4_1::cvtScale_SIMD_u8u16f32_SSE41(src, dst, width, scale, shift);
- else
- return 0;
- }
+#endif
+ if (hasSIMD128())
+ {
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- bool haveSSE;
+ v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2));
+ v_store(dst + x, v_dst);
+ }
+ }
+ return x;
+ }
};
-#endif
-
template <>
struct cvtScale_SIMD<uchar, short, float>
{
int operator () (const uchar * src, short * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_int16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+ v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2));
+ v_store(dst + x, v_dst);
+ }
}
-
return x;
}
};
template <>
struct cvtScale_SIMD<uchar, int, float>
{
int operator () (const uchar * src, int * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_int32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
- _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
+ v_store(dst + x, v_round(v_src1));
+ v_store(dst + x + cWidth, v_round(v_src2));
+ }
}
-
return x;
}
};
template <>
struct cvtScale_SIMD<uchar, float, float>
{
int operator () (const uchar * src, float * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
- {
- __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- _mm_storeu_ps(dst + x, v_dst_0);
- _mm_storeu_ps(dst + x + 4, v_dst_1);
- }
-
- return x;
- }
-};
-
-template <>
-struct cvtScale_SIMD<uchar, double, double>
-{
- int operator () (const uchar * src, double * dst, int width, double scale, double shift) const
- {
- int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
-
- __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero);
- __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
- __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
- _mm_storeu_pd(dst + x, v_dst_0);
- _mm_storeu_pd(dst + x + 2, v_dst_1);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_float32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero);
- v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
- v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
- _mm_storeu_pd(dst + x + 4, v_dst_0);
- _mm_storeu_pd(dst + x + 6, v_dst_1);
+ v_store(dst + x, v_src1);
+ v_store(dst + x + cWidth, v_src2);
+ }
}
-
return x;
}
};
template <>
struct cvtScale_SIMD<schar, uchar, float>
{
int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
+ v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2));
+ v_pack_u_store(dst + x, v_dst);
+ }
}
-
return x;
}
};
template <>
struct cvtScale_SIMD<schar, schar, float>
{
int operator () (const schar * src, schar * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_int16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
+ v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2));
+ v_store_low(dst + x, v_pack(v_dst, v_dst));
+ }
}
-
return x;
}
};
-#if CV_TRY_SSE4_1
-
template <>
struct cvtScale_SIMD<schar, ushort, float>
{
- cvtScale_SIMD()
- {
- haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1;
- }
-
int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const
{
- if (haveSSE)
+ int x = 0;
+#if CV_TRY_SSE4_1
+ if (CV_CPU_HAS_SUPPORT_SSE4_1)
return opt_SSE4_1::cvtScale_SIMD_s8u16f32_SSE41(src, dst, width, scale, shift);
- else
- return 0;
- }
+#endif
+ if (hasSIMD128())
+ {
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- bool haveSSE;
+ v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2));
+ v_store(dst + x, v_dst);
+ }
+ }
+ return x;
+ }
};
-#endif
-
template <>
struct cvtScale_SIMD<schar, short, float>
{
int operator () (const schar * src, short * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_int16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+ v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2));
+ v_store(dst + x, v_dst);
+ }
}
-
return x;
}
};
template <>
struct cvtScale_SIMD<schar, int, float>
{
int operator () (const schar * src, int * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_int32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
- _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
+ v_store(dst + x, v_round(v_src1));
+ v_store(dst + x + cWidth, v_round(v_src2));
+ }
}
-
return x;
}
};
template <>
struct cvtScale_SIMD<schar, float, float>
{
int operator () (const schar * src, float * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
- {
- __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- _mm_storeu_ps(dst + x, v_dst_0);
- _mm_storeu_ps(dst + x + 4, v_dst_1);
- }
-
- return x;
- }
-};
-
-template <>
-struct cvtScale_SIMD<schar, double, double>
-{
- int operator () (const schar * src, double * dst, int width, double scale, double shift) const
- {
- int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x)));
- v_src = _mm_srai_epi16(v_src, 8);
-
- __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16);
- __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
- __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
- _mm_storeu_pd(dst + x, v_dst_0);
- _mm_storeu_pd(dst + x + 2, v_dst_1);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_float32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16);
- v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
- v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
- _mm_storeu_pd(dst + x + 4, v_dst_0);
- _mm_storeu_pd(dst + x + 6, v_dst_1);
+ v_store(dst + x, v_src1);
+ v_store(dst + x + cWidth, v_src2);
+ }
}
-
return x;
}
};
template <>
struct cvtScale_SIMD<ushort, uchar, float>
{
int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
+ v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2));
+ v_pack_u_store(dst + x, v_dst);
+ }
}
-
return x;
}
};
template <>
struct cvtScale_SIMD<ushort, schar, float>
{
int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_int16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
+ v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2));
+ v_store_low(dst + x, v_pack(v_dst, v_dst));
+ }
}
-
return x;
}
};
-#if CV_TRY_SSE4_1
-
template <>
struct cvtScale_SIMD<ushort, ushort, float>
{
- cvtScale_SIMD()
- {
- haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1;
- }
-
int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const
{
- if (haveSSE)
+ int x = 0;
+#if CV_TRY_SSE4_1
+ if (CV_CPU_HAS_SUPPORT_SSE4_1)
return opt_SSE4_1::cvtScale_SIMD_u16u16f32_SSE41(src, dst, width, scale, shift);
- else
- return 0;
- }
+#endif
+ if (hasSIMD128())
+ {
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- bool haveSSE;
+ v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2));
+ v_store(dst + x, v_dst);
+ }
+ }
+ return x;
+ }
};
-#endif
-
template <>
struct cvtScale_SIMD<ushort, short, float>
{
int operator () (const ushort * src, short * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_int16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+ v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2));
+ v_store(dst + x, v_dst);
+ }
}
-
return x;
}
};
template <>
struct cvtScale_SIMD<ushort, int, float>
{
int operator () (const ushort * src, int * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_int32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
- _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
+ v_store(dst + x, v_round(v_src1));
+ v_store(dst + x + cWidth, v_round(v_src2));
+ }
}
-
return x;
}
};
template <>
struct cvtScale_SIMD<ushort, float, float>
{
int operator () (const ushort * src, float * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
- {
- __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- _mm_storeu_ps(dst + x, v_dst_0);
- _mm_storeu_ps(dst + x + 4, v_dst_1);
- }
-
- return x;
- }
-};
-
-template <>
-struct cvtScale_SIMD<ushort, double, double>
-{
- int operator () (const ushort * src, double * dst, int width, double scale, double shift) const
- {
- int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
-
- __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero);
- __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
- __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
- _mm_storeu_pd(dst + x, v_dst_0);
- _mm_storeu_pd(dst + x + 2, v_dst_1);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_float32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero);
- v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
- v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
- _mm_storeu_pd(dst + x + 4, v_dst_0);
- _mm_storeu_pd(dst + x + 6, v_dst_1);
+ v_store(dst + x, v_src1);
+ v_store(dst + x + cWidth, v_src2);
+ }
}
-
return x;
}
};
template <>
struct cvtScale_SIMD<short, uchar, float>
{
int operator () (const short * src, uchar * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
+ v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2));
+ v_pack_u_store(dst + x, v_dst);
+ }
}
-
return x;
}
};
template <>
struct cvtScale_SIMD<short, schar, float>
{
int operator () (const short * src, schar * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_int16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
+ v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2));
+ v_store_low(dst + x, v_pack(v_dst, v_dst));
+ }
}
-
return x;
}
};
-#if CV_TRY_SSE4_1
-
template <>
struct cvtScale_SIMD<short, ushort, float>
{
- cvtScale_SIMD()
- {
- haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1;
- }
-
int operator () (const short * src, ushort * dst, int width, float scale, float shift) const
{
- if (haveSSE)
+ int x = 0;
+#if CV_TRY_SSE4_1
+ if (CV_CPU_HAS_SUPPORT_SSE4_1)
return opt_SSE4_1::cvtScale_SIMD_s16u16f32_SSE41(src, dst, width, scale, shift);
- else
- return 0;
- }
-
- bool haveSSE;
-};
-
#endif
-
-template <>
-struct cvtScale_SIMD<short, short, float>
-{
- int operator () (const short * src, short * dst, int width, float scale, float shift) const
- {
- int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+ v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2));
+ v_store(dst + x, v_dst);
+ }
}
-
return x;
}
};
template <>
-struct cvtScale_SIMD<short, int, float>
+struct cvtScale_SIMD<short, short, float>
{
- int operator () (const short * src, int * dst, int width, float scale, float shift) const
+ int operator () (const short * src, short * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
- _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
+ v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2));
+ v_store(dst + x, v_dst);
+ }
}
-
return x;
}
};
template <>
struct cvtScale_SIMD<short, float, float>
{
int operator () (const short * src, float * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
- {
- __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
- __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
-
- _mm_storeu_ps(dst + x, v_dst_0);
- _mm_storeu_ps(dst + x + 4, v_dst_1);
- }
-
- return x;
- }
-};
-
-template <>
-struct cvtScale_SIMD<short, double, double>
-{
- int operator () (const short * src, double * dst, int width, double scale, double shift) const
- {
- int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
-
- __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16);
- __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
- __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
- _mm_storeu_pd(dst + x, v_dst_0);
- _mm_storeu_pd(dst + x + 2, v_dst_1);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_float32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16);
- v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
- v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
- _mm_storeu_pd(dst + x + 4, v_dst_0);
- _mm_storeu_pd(dst + x + 6, v_dst_1);
+ v_store(dst + x, v_src1);
+ v_store(dst + x + cWidth, v_src2);
+ }
}
-
return x;
}
};
template <>
struct cvtScale_SIMD<int, uchar, float>
{
int operator () (const int * src, uchar * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
-
- v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
+ v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2));
+ v_pack_u_store(dst + x, v_dst);
+ }
}
-
return x;
}
};
template <>
struct cvtScale_SIMD<int, schar, float>
{
int operator () (const int * src, schar * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
-
- v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_int16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
+ v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2));
+ v_store_low(dst + x, v_pack(v_dst, v_dst));
+ }
}
-
return x;
}
};
-#if CV_TRY_SSE4_1
-
template <>
struct cvtScale_SIMD<int, ushort, float>
{
- cvtScale_SIMD()
- {
- haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1;
- }
-
int operator () (const int * src, ushort * dst, int width, float scale, float shift) const
{
- if (haveSSE)
+ int x = 0;
+#if CV_TRY_SSE4_1
+ if (CV_CPU_HAS_SUPPORT_SSE4_1)
return opt_SSE4_1::cvtScale_SIMD_s32u16f32_SSE41(src, dst, width, scale, shift);
- else
- return 0;
- }
+#endif
+ if (hasSIMD128())
+ {
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- bool haveSSE;
+ v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2));
+ v_store(dst + x, v_dst);
+ }
+ }
+ return x;
+ }
};
-#endif
-
template <>
struct cvtScale_SIMD<int, short, float>
{
int operator () (const int * src, short * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
-
- v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_int16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_src1, v_src2;
+ v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2);
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+ v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2));
+ v_store(dst + x, v_dst);
+ }
}
-
return x;
}
};
+#if CV_SIMD128_64F
template <>
struct cvtScale_SIMD<int, int, double>
{
int operator () (const int * src, int * dst, int width, double scale, double shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
-
- for ( ; x <= width - 4; x += 4)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
- __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
-
- v_src = _mm_srli_si128(v_src, 8);
- __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
-
- __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_0)),
- _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_1)));
-
- _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst));
+ v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale);
+ int cWidth = v_int32x4::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
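+ // widen through a scalar buffer: convert the four ints to double on the
+ // stack, then reload them as two v_float64x2 before applying scale/shift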
+ double v_srcbuf[] = { (double)src[x], (double)src[x+1], (double)src[x+2], (double)src[x+3] };
+ v_float64x2 v_src1 = v_shift + v_scale * v_load(v_srcbuf);
+ v_float64x2 v_src2 = v_shift + v_scale * v_load(v_srcbuf + 2);
+ v_store(dst + x, v_combine_low(v_round(v_src1), v_round(v_src2)));
+ }
}
-
return x;
}
};
int operator () (const int * src, float * dst, int width, double scale, double shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
-
- for ( ; x <= width - 4; x += 4)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
- __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
-
- v_src = _mm_srli_si128(v_src, 8);
- __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
-
- _mm_storeu_ps(dst + x, _mm_movelh_ps(_mm_cvtpd_ps(v_dst_0),
- _mm_cvtpd_ps(v_dst_1)));
+ v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale);
+ int cWidth = v_int32x4::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ double v_srcbuf[] = { (double)src[x], (double)src[x+1], (double)src[x+2], (double)src[x+3] };
+ v_float64x2 v_src1 = v_shift + v_scale * v_load(v_srcbuf);
+ v_float64x2 v_src2 = v_shift + v_scale * v_load(v_srcbuf + 2);
+ v_store(dst + x, v_combine_low(v_cvt_f32(v_src1), v_cvt_f32(v_src2)));
+ }
}
-
return x;
}
};
+#endif //CV_SIMD128_64F
+
+// from float
template <>
-struct cvtScale_SIMD<int, double, double>
+struct cvtScale_SIMD<float, uchar, float>
{
- int operator () (const int * src, double * dst, int width, double scale, double shift) const
+ int operator () (const float * src, uchar * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
-
- for ( ; x <= width - 4; x += 4)
+ if (hasSIMD128())
{
- __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
- __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
-
- v_src = _mm_srli_si128(v_src, 8);
- __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_float32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x);
+ v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth);
- _mm_storeu_pd(dst + x, v_dst_0);
- _mm_storeu_pd(dst + x + 2, v_dst_1);
+ v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2));
+ v_pack_u_store(dst + x, v_dst);
+ }
}
-
- return x;
- }
-};
-
-// from float
-
-template <>
-struct cvtScale_SIMD<float, uchar, float>
-{
- int operator () (const float * src, uchar * dst, int width, float scale, float shift) const
- {
- int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
- {
- __m128 v_src = _mm_loadu_ps(src + x);
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
-
- v_src = _mm_loadu_ps(src + x + 4);
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
-
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
- }
-
return x;
}
};
int operator () (const float * src, schar * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128 v_src = _mm_loadu_ps(src + x);
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
-
- v_src = _mm_loadu_ps(src + x + 4);
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_float32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x);
+ v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth);
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
+ v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2));
+ v_store_low(dst + x, v_pack(v_dst, v_dst));
+ }
}
-
return x;
}
};
-#if CV_TRY_SSE4_1
-
template <>
struct cvtScale_SIMD<float, ushort, float>
{
- cvtScale_SIMD()
- {
- haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1;
- }
-
int operator () (const float * src, ushort * dst, int width, float scale, float shift) const
{
- if (haveSSE)
+ int x = 0;
+#if CV_TRY_SSE4_1
+ if (CV_CPU_HAS_SUPPORT_SSE4_1)
return opt_SSE4_1::cvtScale_SIMD_f32u16f32_SSE41(src, dst, width, scale, shift);
- else
- return 0;
- }
+#endif
+ if (hasSIMD128())
+ {
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_float32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x);
+ v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth);
- bool haveSSE;
+ v_uint16x8 v_dst = v_pack_u(v_round(v_dst1), v_round(v_dst2));
+ v_store(dst + x, v_dst);
+ }
+ }
+ return x;
+ }
};
-#endif
-
template <>
struct cvtScale_SIMD<float, short, float>
{
int operator () (const float * src, short * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128 v_src = _mm_loadu_ps(src + x);
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
-
- v_src = _mm_loadu_ps(src + x + 4);
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_float32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x);
+ v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth);
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+ v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2));
+ v_store(dst + x, v_dst);
+ }
}
-
return x;
}
};
int operator () (const float * src, int * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128 v_src = _mm_loadu_ps(src + x);
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
-
- v_src = _mm_loadu_ps(src + x + 4);
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
-
- _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
- _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_float32x4::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ v_store(dst + x, v_round(v_load(src + x) * v_scale + v_shift));
}
-
return x;
}
};
int operator () (const float * src, float * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 4; x += 4)
+ if (hasSIMD128())
{
- __m128 v_src = _mm_loadu_ps(src + x);
- __m128 v_dst = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
- _mm_storeu_ps(dst + x, v_dst);
+ v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale);
+ int cWidth = v_float32x4::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ v_store(dst + x, v_load(src + x) * v_scale + v_shift);
}
-
return x;
}
};
-template <>
-struct cvtScale_SIMD<float, double, double>
-{
- int operator () (const float * src, double * dst, int width, double scale, double shift) const
- {
- int x = 0;
+#if CV_SIMD128_64F
- if (!USE_SSE2)
- return x;
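+// helper: load 8 doubles, apply scale/shift in f64, then narrow to two f32 vectors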
+static inline void v_load_scale_shift(const double* src, const v_float64x2& v_scale, const v_float64x2 &v_shift, v_float32x4& v_dst1, v_float32x4 &v_dst2)
+{
+ int cWidth = v_float64x2::nlanes;
+ v_float64x2 v_src1 = v_shift + v_scale * v_load(src);
+ v_float64x2 v_src2 = v_shift + v_scale * v_load(src + cWidth);
+ v_float64x2 v_src3 = v_shift + v_scale * v_load(src + cWidth * 2);
+ v_float64x2 v_src4 = v_shift + v_scale * v_load(src + cWidth * 3);
+ v_dst1 = v_combine_low(v_cvt_f32(v_src1), v_cvt_f32(v_src2));
+ v_dst2 = v_combine_low(v_cvt_f32(v_src3), v_cvt_f32(v_src4));
+}
- __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
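+// helper: widen 8 x s32 to f64, apply scale/shift, store 8 doubles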
+static inline void v_store_scale_shift_s32_to_f64(double *dst, const v_float64x2 &v_scale, const v_float64x2 &v_shift, const v_int32x4 &v1, const v_int32x4 &v2)
+{
+ v_float64x2 v_dst1 = v_shift + v_scale * v_cvt_f64(v1);
+ v_float64x2 v_dst2 = v_shift + v_scale * v_cvt_f64_high(v1);
+ v_float64x2 v_dst3 = v_shift + v_scale * v_cvt_f64(v2);
+ v_float64x2 v_dst4 = v_shift + v_scale * v_cvt_f64_high(v2);
- for ( ; x <= width - 4; x += 4)
- {
- __m128 v_src = _mm_loadu_ps(src + x);
- __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift);
- v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
- __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift);
+ v_store(dst, v_dst1);
+ v_store(dst + v_float64x2::nlanes, v_dst2);
+ v_store(dst + v_float64x2::nlanes * 2, v_dst3);
+ v_store(dst + v_float64x2::nlanes * 3, v_dst4);
+}
- _mm_storeu_pd(dst + x, v_dst_0);
- _mm_storeu_pd(dst + x + 2, v_dst_1);
- }
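+// helper: same as above, but widening from f32 lanes instead of s32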
+static inline void v_store_scale_shift_f32_to_f64(double *dst, const v_float64x2 &v_scale, const v_float64x2 &v_shift, const v_float32x4 &v1, const v_float32x4 &v2)
+{
+ v_float64x2 v_dst1 = v_shift + v_scale * v_cvt_f64(v1);
+ v_float64x2 v_dst2 = v_shift + v_scale * v_cvt_f64_high(v1);
+ v_float64x2 v_dst3 = v_shift + v_scale * v_cvt_f64(v2);
+ v_float64x2 v_dst4 = v_shift + v_scale * v_cvt_f64_high(v2);
- return x;
- }
-};
+ v_store(dst, v_dst1);
+ v_store(dst + v_float64x2::nlanes, v_dst2);
+ v_store(dst + v_float64x2::nlanes * 2, v_dst3);
+ v_store(dst + v_float64x2::nlanes * 3, v_dst4);
+}
// from double
int operator () (const double * src, uchar * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
- _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
-
- v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
- _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
-
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
+ v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale);
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 4; x += cWidth * 4)
+ {
+ v_float32x4 v_dst1, v_dst2;
+ v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2);
+ v_pack_u_store(dst + x, v_pack(v_round(v_dst1), v_round(v_dst2)));
+ }
}
-
return x;
}
};
int operator () (const double * src, schar * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128i v_zero = _mm_setzero_si128();
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
- _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
-
- v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
- _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
-
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
+ v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale);
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 4; x += cWidth * 4)
+ {
+ v_float32x4 v_dst1, v_dst2;
+ v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2);
+ v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2));
+ v_pack_store(dst + x, v_dst);
+ }
}
-
return x;
}
};
-#if CV_TRY_SSE4_1
-
template <>
struct cvtScale_SIMD<double, ushort, float>
{
- cvtScale_SIMD()
- {
- haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1;
- }
-
int operator () (const double * src, ushort * dst, int width, float scale, float shift) const
{
- if (haveSSE)
+ int x = 0;
+#if CV_TRY_SSE4_1
+ if (CV_CPU_HAS_SUPPORT_SSE4_1)
return opt_SSE4_1::cvtScale_SIMD_f64u16f32_SSE41(src, dst, width, scale, shift);
- else
- return 0;
+#endif
+ if (hasSIMD128())
+ {
+ v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale);
+ int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_dst1, v_dst2;
+ v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2);
+ v_uint16x8 v_dst = v_pack_u(v_round(v_dst1), v_round(v_dst2));
+ v_store(dst + x, v_dst);
+ }
+ }
+ return x;
}
-
- bool haveSSE;
};
-#endif
-
template <>
struct cvtScale_SIMD<double, short, float>
{
int operator () (const double * src, short * dst, int width, float scale, float shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
- _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
- __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
-
- v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
- _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
- __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
-
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
- _mm_cvtps_epi32(v_dst_1));
- _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+ v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale);
+ int cWidth = v_int16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_float32x4 v_dst1, v_dst2;
+ v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2);
+ v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2));
+ v_store(dst + x, v_dst);
+ }
}
-
return x;
}
};
int operator () (const double * src, int * dst, int width, double scale, double shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
-
- for ( ; x <= width - 4; x += 4)
+ if (hasSIMD128())
{
- __m128d v_src = _mm_loadu_pd(src + x);
- __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
-
- v_src = _mm_loadu_pd(src + x + 2);
- __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
-
- __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst0)),
- _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst1)));
+ v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale);
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_float64x2 v_src1 = v_shift + v_scale * v_load(src + x);
+ v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth);
- _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst));
+ v_store(dst + x, v_combine_low(v_round(v_src1), v_round(v_src2)));
+ }
}
-
return x;
}
};
int operator () (const double * src, float * dst, int width, double scale, double shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
-
- for ( ; x <= width - 4; x += 4)
+ if (hasSIMD128())
{
- __m128d v_src = _mm_loadu_pd(src + x);
- __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
-
- v_src = _mm_loadu_pd(src + x + 2);
- __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
-
- __m128 v_dst = _mm_movelh_ps(_mm_cvtpd_ps(v_dst0),
- _mm_cvtpd_ps(v_dst1));
+ v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale);
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_float64x2 v_src1 = v_shift + v_scale * v_load(src + x);
+ v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth);
+ v_float32x4 v_dst1 = v_cvt_f32(v_src1);
+ v_float32x4 v_dst2 = v_cvt_f32(v_src2);
- _mm_storeu_ps(dst + x, v_dst);
+ v_store(dst + x, v_combine_low(v_dst1, v_dst2));
+ }
}
-
return x;
}
};
+// to double
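+// narrower sources are widened to s32/f32 lanes and funneled through the store helpers above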
+
template <>
-struct cvtScale_SIMD<double, double, double>
+struct cvtScale_SIMD<uchar, double, double>
{
- int operator () (const double * src, double * dst, int width, double scale, double shift) const
+ int operator () (const uchar * src, double * dst, int width, double scale, double shift) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
-
- for ( ; x <= width - 2; x += 2)
+ if (hasSIMD128())
{
- __m128d v_src = _mm_loadu_pd(src + x);
- __m128d v_dst = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
- _mm_storeu_pd(dst + x, v_dst);
+ v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale);
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 4; x += cWidth * 4)
+ {
+ v_uint32x4 v_src1, v_src2;
+ v_expand(v_load_expand(src + x), v_src1, v_src2);
+ v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift,
+ v_reinterpret_as_s32(v_src1), v_reinterpret_as_s32(v_src2));
+ }
}
-
return x;
}
};
-#elif CV_NEON
-
-// from uchar
-
template <>
-struct cvtScale_SIMD<uchar, uchar, float>
+struct cvtScale_SIMD<schar, double, double>
{
- int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const
+ int operator () (const schar * src, double * dst, int width, double scale, double shift) const
{
int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
-
- uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
- vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
- vst1_u8(dst + x, vqmovn_u16(v_dst));
+ v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale);
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 4; x += cWidth * 4)
+ {
+ v_int32x4 v_src1, v_src2;
+ v_expand(v_load_expand(src + x), v_src1, v_src2);
+ v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2);
+ }
}
-
return x;
}
};
template <>
-struct cvtScale_SIMD<uchar, schar, float>
+struct cvtScale_SIMD<ushort, double, double>
{
- int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const
+ int operator () (const ushort * src, double * dst, int width, double scale, double shift) const
{
int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
-
- int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
- vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
- vst1_s8(dst + x, vqmovn_s16(v_dst));
+ v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale);
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 4; x += cWidth * 4)
+ {
+ v_uint32x4 v_src1, v_src2;
+ v_expand(v_load(src + x), v_src1, v_src2);
+ v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift,
+ v_reinterpret_as_s32(v_src1), v_reinterpret_as_s32(v_src2));
+ }
}
-
return x;
}
};
template <>
-struct cvtScale_SIMD<uchar, ushort, float>
+struct cvtScale_SIMD<short, double, double>
{
- int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const
+ int operator () (const short * src, double * dst, int width, double scale, double shift) const
{
int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
-
- uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
- vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
- vst1q_u16(dst + x, v_dst);
+ v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale);
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 4; x += cWidth * 4)
+ {
+ v_int32x4 v_src1, v_src2;
+ v_expand(v_load(src + x), v_src1, v_src2);
+ v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2);
+ }
}
-
return x;
}
};
template <>
-struct cvtScale_SIMD<uchar, short, float>
+struct cvtScale_SIMD<int, double, double>
{
- int operator () (const uchar * src, short * dst, int width, float scale, float shift) const
+ int operator () (const int * src, double * dst, int width, double scale, double shift) const
{
int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
-
- int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
- vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
- vst1q_s16(dst + x, v_dst);
+ v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale);
+ int cWidth = v_float32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_int32x4 v_src1 = v_load(src + x);
+ v_int32x4 v_src2 = v_load(src + x + cWidth);
+ v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2);
+ }
}
-
return x;
}
};
template <>
-struct cvtScale_SIMD<uchar, int, float>
+struct cvtScale_SIMD<float, double, double>
{
- int operator () (const uchar * src, int * dst, int width, float scale, float shift) const
+ int operator () (const float * src, double * dst, int width, double scale, double shift) const
{
int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
-
- vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1));
- vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2));
+ v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale);
+ int cWidth = v_float32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_float32x4 v_src1 = v_load(src + x);
+ v_float32x4 v_src2 = v_load(src + x + cWidth);
+ v_store_scale_shift_f32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2);
+ }
}
-
return x;
}
};
template <>
-struct cvtScale_SIMD<uchar, float, float>
+struct cvtScale_SIMD<double, double, double>
{
- int operator () (const uchar * src, float * dst, int width, float scale, float shift) const
+ int operator () (const double * src, double * dst, int width, double scale, double shift) const
{
int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
- vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift));
- vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift));
+ v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale);
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_float64x2 v_src1 = v_shift + v_scale * v_load(src + x);
+ v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth);
+ v_store(dst + x, v_src1);
+ v_store(dst + x + cWidth, v_src2);
+ }
}
-
return x;
}
};
+#endif //CV_SIMD128_64F
+#endif //CV_SIMD128
-// from schar
-
-template <>
-struct cvtScale_SIMD<schar, uchar, float>
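+// row driver: the SIMD functor converts a vectorized prefix of each row and returns how far it got; the scalar tail below finishes the rest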
+template<typename T, typename DT, typename WT> static void
+cvtScale_( const T* src, size_t sstep,
+ DT* dst, size_t dstep, Size size,
+ WT scale, WT shift )
{
- int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const
+ sstep /= sizeof(src[0]);
+ dstep /= sizeof(dst[0]);
+
+ cvtScale_SIMD<T, DT, WT> vop;
+
+ for( ; size.height--; src += sstep, dst += dstep )
{
- int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
+ int x = vop(src, dst, size.width, scale, shift);
- for ( ; x <= width - 8; x += 8)
+ #if CV_ENABLE_UNROLLED
+ for( ; x <= size.width - 4; x += 4 )
{
- int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
-
- uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
- vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
- vst1_u8(dst + x, vqmovn_u16(v_dst));
+ DT t0, t1;
+ t0 = saturate_cast<DT>(src[x]*scale + shift);
+ t1 = saturate_cast<DT>(src[x+1]*scale + shift);
+ dst[x] = t0; dst[x+1] = t1;
+ t0 = saturate_cast<DT>(src[x+2]*scale + shift);
+ t1 = saturate_cast<DT>(src[x+3]*scale + shift);
+ dst[x+2] = t0; dst[x+3] = t1;
}
+ #endif
- return x;
+ for( ; x < size.width; x++ )
+ dst[x] = saturate_cast<DT>(src[x]*scale + shift);
}
-};
+}
-template <>
-struct cvtScale_SIMD<schar, schar, float>
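+// hand-written row kernel for the common short -> int case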
+template<> void
+cvtScale_<short, int, float>( const short* src, size_t sstep,
+ int* dst, size_t dstep, Size size,
+ float scale, float shift )
{
- int operator () (const schar * src, schar * dst, int width, float scale, float shift) const
+ sstep /= sizeof(src[0]);
+ dstep /= sizeof(dst[0]);
+
+ for( ; size.height--; src += sstep, dst += dstep )
{
int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
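+ // the AVX2 kernel converts the whole row, then we move on to the next one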
+ #if CV_TRY_AVX2
+ if (CV_CPU_HAS_SUPPORT_AVX2)
{
- int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
-
- int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
- vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
- vst1_s8(dst + x, vqmovn_s16(v_dst));
+ opt_AVX2::cvtScale_s16s32f32Line_AVX2(src, dst, scale, shift, size.width);
+ continue;
}
+ #endif
+ #if CV_SIMD128
+ if (hasSIMD128())
+ {
+ v_float32x4 v_shift = v_setall_f32(shift);
+ v_float32x4 v_scale = v_setall_f32(scale);
+ int cWidth = v_int32x4::nlanes;
+ for (; x <= size.width - cWidth * 2; x += cWidth * 2)
+ {
+ v_int16x8 v_src = v_load(src + x);
+ v_int32x4 v_src1, v_src2;
+ v_expand(v_src, v_src1, v_src2);
+ v_float32x4 v_tmp1 = v_cvt_f32(v_src1);
+ v_float32x4 v_tmp2 = v_cvt_f32(v_src2);
- return x;
- }
-};
-
-template <>
-struct cvtScale_SIMD<schar, ushort, float>
-{
- int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const
- {
- int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
- {
- int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
-
- uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
- vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
- vst1q_u16(dst + x, v_dst);
- }
-
- return x;
- }
-};
-
-template <>
-struct cvtScale_SIMD<schar, short, float>
-{
- int operator () (const schar * src, short * dst, int width, float scale, float shift) const
- {
- int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
- {
- int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
-
- int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
- vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
- vst1q_s16(dst + x, v_dst);
- }
+ v_tmp1 = v_tmp1 * v_scale + v_shift;
+ v_tmp2 = v_tmp2 * v_scale + v_shift;
- return x;
- }
-};
-
-template <>
-struct cvtScale_SIMD<schar, int, float>
-{
- int operator () (const schar * src, int * dst, int width, float scale, float shift) const
- {
- int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
- {
- int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
-
- vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1));
- vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2));
- }
-
- return x;
- }
-};
-
-template <>
-struct cvtScale_SIMD<schar, float, float>
-{
- int operator () (const schar * src, float * dst, int width, float scale, float shift) const
- {
- int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
- {
- int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
- vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift));
- vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift));
- }
-
- return x;
- }
-};
-
-// from ushort
-
-template <>
-struct cvtScale_SIMD<ushort, uchar, float>
-{
- int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const
- {
- int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
- {
- uint16x8_t v_src = vld1q_u16(src + x);
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
-
- uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
- vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
- vst1_u8(dst + x, vqmovn_u16(v_dst));
+ v_store(dst + x, v_round(v_tmp1));
+ v_store(dst + x + cWidth, v_round(v_tmp2));
+ }
}
+ #endif
- return x;
+ for (; x < size.width; x++)
+ dst[x] = saturate_cast<int>(src[x]*scale + shift);
}
-};
+}
-template <>
-struct cvtScale_SIMD<ushort, schar, float>
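+// Cvt_SIMD: plain element conversion (no scale/shift); the generic version reports 0 lanes done so callers fall back to scalar code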
+template <typename T, typename DT>
+struct Cvt_SIMD
{
- int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const
+ int operator() (const T *, DT *, int) const
{
- int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
- {
- uint16x8_t v_src = vld1q_u16(src + x);
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
-
- int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
- vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
- vst1_s8(dst + x, vqmovn_s16(v_dst));
- }
-
- return x;
+ return 0;
}
};
-template <>
-struct cvtScale_SIMD<ushort, ushort, float>
-{
- int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const
- {
- int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
- {
- uint16x8_t v_src = vld1q_u16(src + x);
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
-
- uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
- vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
- vst1q_u16(dst + x, v_dst);
- }
-
- return x;
- }
-};
+#if CV_SIMD128
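+// the narrowing specializations below rely on saturating packs to match saturate_cast<> semantics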
+// from uchar
template <>
-struct cvtScale_SIMD<ushort, short, float>
+struct Cvt_SIMD<uchar, schar>
{
- int operator () (const ushort * src, short * dst, int width, float scale, float shift) const
+ int operator() (const uchar * src, schar * dst, int width) const
{
int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- uint16x8_t v_src = vld1q_u16(src + x);
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
-
- int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
- vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
- vst1q_s16(dst + x, v_dst);
+ int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_int16x8 v_src = v_reinterpret_as_s16(v_load_expand(src + x));
+ v_store_low(dst + x, v_pack(v_src, v_src));
+ }
}
-
return x;
}
};
template <>
-struct cvtScale_SIMD<ushort, int, float>
+struct Cvt_SIMD<uchar, ushort>
{
- int operator () (const ushort * src, int * dst, int width, float scale, float shift) const
+ int operator() (const uchar * src, ushort * dst, int width) const
{
int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- uint16x8_t v_src = vld1q_u16(src + x);
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
-
- vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1));
- vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2));
+ int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ v_store(dst + x, v_load_expand(src + x));
}
-
return x;
}
};
template <>
-struct cvtScale_SIMD<ushort, float, float>
+struct Cvt_SIMD<uchar, short>
{
- int operator () (const ushort * src, float * dst, int width, float scale, float shift) const
+ int operator() (const uchar * src, short * dst, int width) const
{
int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- uint16x8_t v_src = vld1q_u16(src + x);
- vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift));
- vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift));
+ int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_int16x8 v_src = v_reinterpret_as_s16(v_load_expand(src + x));
+ v_store(dst + x, v_src);
+ }
}
-
return x;
}
};
-// from short
-
template <>
-struct cvtScale_SIMD<short, uchar, float>
+struct Cvt_SIMD<uchar, int>
{
- int operator () (const short * src, uchar * dst, int width, float scale, float shift) const
+ int operator() (const uchar * src, int * dst, int width) const
{
int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- int16x8_t v_src = vld1q_s16(src + x);
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
-
- uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
- vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
- vst1_u8(dst + x, vqmovn_u16(v_dst));
+ int cWidth = v_int32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_uint16x8 v_src = v_load_expand(src + x);
+ v_uint32x4 v_src1, v_src2;
+ v_expand(v_src, v_src1, v_src2);
+ v_store(dst + x, v_reinterpret_as_s32(v_src1));
+ v_store(dst + x + cWidth, v_reinterpret_as_s32(v_src2));
+ }
}
-
return x;
}
};
template <>
-struct cvtScale_SIMD<short, schar, float>
+struct Cvt_SIMD<uchar, float>
{
- int operator () (const short * src, schar * dst, int width, float scale, float shift) const
+ int operator() (const uchar * src, float * dst, int width) const
{
int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- int16x8_t v_src = vld1q_s16(src + x);
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
-
- int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
- vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
- vst1_s8(dst + x, vqmovn_s16(v_dst));
+ int cWidth = v_float32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_uint16x8 v_src = v_load_expand(src + x);
+ v_uint32x4 v_src1, v_src2;
+ v_expand(v_src, v_src1, v_src2);
+ v_store(dst + x, v_cvt_f32(v_reinterpret_as_s32(v_src1)));
+ v_store(dst + x + cWidth, v_cvt_f32(v_reinterpret_as_s32(v_src2)));
+ }
}
-
return x;
}
};
-template <>
-struct cvtScale_SIMD<short, ushort, float>
-{
- int operator () (const short * src, ushort * dst, int width, float scale, float shift) const
- {
- int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
- {
- int16x8_t v_src = vld1q_s16(src + x);
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
-
- uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
- vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
- vst1q_u16(dst + x, v_dst);
- }
-
- return x;
- }
-};
+// from schar
template <>
-struct cvtScale_SIMD<short, float, float>
+struct Cvt_SIMD<schar, uchar>
{
- int operator () (const short * src, float * dst, int width, float scale, float shift) const
+ int operator() (const schar * src, uchar * dst, int width) const
{
int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- int16x8_t v_src = vld1q_s16(src + x);
- vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift));
- vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift));
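+ // widen s8 -> s16, then the unsigned saturating pack clamps negatives to 0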
+ int cWidth = v_int16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ v_pack_u_store(dst + x, v_load_expand(src + x));
}
return x;
}
};
-// from int
-
template <>
-struct cvtScale_SIMD<int, uchar, float>
+struct Cvt_SIMD<schar, short>
{
- int operator () (const int * src, uchar * dst, int width, float scale, float shift) const
+ int operator() (const schar * src, short * dst, int width) const
{
int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
-
- uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
- vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
- vst1_u8(dst + x, vqmovn_u16(v_dst));
+ int cWidth = v_int16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ v_store(dst + x, v_load_expand(src + x));
}
-
return x;
}
};
template <>
-struct cvtScale_SIMD<int, schar, float>
+struct Cvt_SIMD<schar, ushort>
{
- int operator () (const int * src, schar * dst, int width, float scale, float shift) const
+ int operator() (const schar * src, ushort * dst, int width) const
{
int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
-
- int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
- vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
- vst1_s8(dst + x, vqmovn_s16(v_dst));
+ int cWidth = v_int16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_int16x8 v_src = v_load_expand(src + x);
+ v_int32x4 v_src1, v_src2;
+ v_expand(v_src, v_src1, v_src2);
+ v_store(dst + x, v_pack_u(v_src1, v_src2));
+ }
}
-
return x;
}
};
-template <>
-struct cvtScale_SIMD<int, ushort, float>
-{
- int operator () (const int * src, ushort * dst, int width, float scale, float shift) const
- {
- int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
- {
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
-
- uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
- vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
- vst1q_u16(dst + x, v_dst);
- }
-
- return x;
- }
-};
template <>
-struct cvtScale_SIMD<int, short, float>
+struct Cvt_SIMD<schar, int>
{
- int operator () (const int * src, short * dst, int width, float scale, float shift) const
+ int operator() (const schar * src, int * dst, int width) const
{
int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
-
- int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
- vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
- vst1q_s16(dst + x, v_dst);
+ int cWidth = v_int32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_int16x8 v_src = v_load_expand(src + x);
+ v_int32x4 v_src1, v_src2;
+ v_expand(v_src, v_src1, v_src2);
+ v_store(dst + x, v_src1);
+ v_store(dst + x + cWidth, v_src2);
+ }
}
-
return x;
}
};
-// from float
-
template <>
-struct cvtScale_SIMD<float, uchar, float>
+struct Cvt_SIMD<schar, float>
{
- int operator () (const float * src, uchar * dst, int width, float scale, float shift) const
+ int operator() (const schar * src, float * dst, int width) const
{
int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
-
- uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
- vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
- vst1_u8(dst + x, vqmovn_u16(v_dst));
+ int cWidth = v_int32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_int16x8 v_src = v_load_expand(src + x);
+ v_int32x4 v_src1, v_src2;
+ v_expand(v_src, v_src1, v_src2);
+ v_store(dst + x, v_cvt_f32(v_src1));
+ v_store(dst + x + cWidth, v_cvt_f32(v_src2));
+ }
}
-
return x;
}
};
-template <>
-struct cvtScale_SIMD<float, schar, float>
-{
- int operator () (const float * src, schar * dst, int width, float scale, float shift) const
- {
- int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
- {
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
-
- int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
- vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
- vst1_s8(dst + x, vqmovn_s16(v_dst));
- }
-
- return x;
- }
-};
+// from ushort
template <>
-struct cvtScale_SIMD<float, ushort, float>
+struct Cvt_SIMD<ushort, uchar>
{
- int operator () (const float * src, ushort * dst, int width, float scale, float shift) const
+ int operator() (const ushort * src, uchar * dst, int width) const
{
int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
-
- uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
- vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
- vst1q_u16(dst + x, v_dst);
+ int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth);
+ v_store(dst + x, v_pack(v_src1, v_src2));
+ }
}
-
return x;
}
};
template <>
-struct cvtScale_SIMD<float, short, float>
+struct Cvt_SIMD<ushort, schar>
{
- int operator () (const float * src, short * dst, int width, float scale, float shift) const
+ int operator() (const ushort * src, schar * dst, int width) const
{
int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
- float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
-
- int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
- vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
- vst1q_s16(dst + x, v_dst);
+ int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
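+ // u16 -> s32 -> s16 -> s8: two saturating packs clamp the result to [0, 127]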
+ v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth);
+ v_uint32x4 v_dst10, v_dst11, v_dst20, v_dst21;
+ v_expand(v_src1, v_dst10, v_dst11);
+ v_expand(v_src2, v_dst20, v_dst21);
+
+ v_store(dst + x, v_pack(
+ v_pack(v_reinterpret_as_s32(v_dst10), v_reinterpret_as_s32(v_dst11)),
+ v_pack(v_reinterpret_as_s32(v_dst20), v_reinterpret_as_s32(v_dst21))));
+ }
}
-
- return x;
- }
-};
-
-template <>
-struct cvtScale_SIMD<float, int, float>
-{
- int operator () (const float * src, int * dst, int width, float scale, float shift) const
- {
- int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 4; x += 4)
- vst1q_s32(dst + x, cv_vrndq_s32_f32(vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift)));
-
return x;
}
};
template <>
-struct cvtScale_SIMD<float, float, float>
-{
- int operator () (const float * src, float * dst, int width, float scale, float shift) const
- {
- int x = 0;
- float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
-
- for ( ; x <= width - 4; x += 4)
- vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift));
-
- return x;
- }
-};
-
-#endif
-
-template<typename T, typename DT, typename WT> static void
-cvtScale_( const T* src, size_t sstep,
- DT* dst, size_t dstep, Size size,
- WT scale, WT shift )
-{
- sstep /= sizeof(src[0]);
- dstep /= sizeof(dst[0]);
-
- cvtScale_SIMD<T, DT, WT> vop;
-
- for( ; size.height--; src += sstep, dst += dstep )
- {
- int x = vop(src, dst, size.width, scale, shift);
-
- #if CV_ENABLE_UNROLLED
- for( ; x <= size.width - 4; x += 4 )
- {
- DT t0, t1;
- t0 = saturate_cast<DT>(src[x]*scale + shift);
- t1 = saturate_cast<DT>(src[x+1]*scale + shift);
- dst[x] = t0; dst[x+1] = t1;
- t0 = saturate_cast<DT>(src[x+2]*scale + shift);
- t1 = saturate_cast<DT>(src[x+3]*scale + shift);
- dst[x+2] = t0; dst[x+3] = t1;
- }
- #endif
-
- for( ; x < size.width; x++ )
- dst[x] = saturate_cast<DT>(src[x]*scale + shift);
- }
-}
-
-//vz optimized template specialization
-template<> void
-cvtScale_<short, short, float>( const short* src, size_t sstep,
- short* dst, size_t dstep, Size size,
- float scale, float shift )
-{
- sstep /= sizeof(src[0]);
- dstep /= sizeof(dst[0]);
-
- for( ; size.height--; src += sstep, dst += dstep )
- {
- int x = 0;
- #if CV_SSE2
- if(USE_SSE2)
- {
- __m128 scale128 = _mm_set1_ps (scale);
- __m128 shift128 = _mm_set1_ps (shift);
- for(; x <= size.width - 8; x += 8 )
- {
- __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
- __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
- __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
- __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
- rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
- rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
- r0 = _mm_cvtps_epi32(rf0);
- r1 = _mm_cvtps_epi32(rf1);
- r0 = _mm_packs_epi32(r0, r1);
- _mm_storeu_si128((__m128i*)(dst + x), r0);
- }
- }
- #elif CV_NEON
- float32x4_t v_shift = vdupq_n_f32(shift);
- for(; x <= size.width - 8; x += 8 )
- {
- int16x8_t v_src = vld1q_s16(src + x);
- float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)));
- float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)));
-
- v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift);
- v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift);
-
- vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_tmp1)),
- vqmovn_s32(cv_vrndq_s32_f32(v_tmp2))));
- }
- #endif
-
- for(; x < size.width; x++ )
- dst[x] = saturate_cast<short>(src[x]*scale + shift);
- }
-}
-
-template<> void
-cvtScale_<short, int, float>( const short* src, size_t sstep,
- int* dst, size_t dstep, Size size,
- float scale, float shift )
+struct Cvt_SIMD<ushort, short>
{
- sstep /= sizeof(src[0]);
- dstep /= sizeof(dst[0]);
-
- for( ; size.height--; src += sstep, dst += dstep )
+ int operator() (const ushort * src, short * dst, int width) const
{
int x = 0;
-
- #if CV_TRY_AVX2
- if (CV_CPU_HAS_SUPPORT_AVX2)
- {
- opt_AVX2::cvtScale_s16s32f32Line_AVX2(src, dst, scale, shift, size.width);
- continue;
- }
- #endif
- #if CV_SSE2
- if (USE_SSE2)//~5X
+ if (hasSIMD128())
{
- __m128 scale128 = _mm_set1_ps (scale);
- __m128 shift128 = _mm_set1_ps (shift);
- for(; x <= size.width - 8; x += 8 )
+ int cWidth = v_uint16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
{
- __m128i r0 = _mm_loadu_si128((const __m128i*)(src + x));
-
- __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
- __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(r0, r0), 16));
- rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
- rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
-
- _mm_storeu_si128((__m128i*)(dst + x), _mm_cvtps_epi32(rf0));
- _mm_storeu_si128((__m128i*)(dst + x + 4), _mm_cvtps_epi32(rf1));
+ v_uint16x8 v_src = v_load(src + x);
+ v_uint32x4 v_dst0, v_dst1;
+ v_expand(v_src, v_dst0, v_dst1);
+ v_store(dst + x, v_pack(v_reinterpret_as_s32(v_dst0), v_reinterpret_as_s32(v_dst1)));
}
}
- #elif CV_NEON
- float32x4_t v_shift = vdupq_n_f32(shift);
- for(; x <= size.width - 8; x += 8 )
- {
- int16x8_t v_src = vld1q_s16(src + x);
- float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)));
- float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)));
-
- v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift);
- v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift);
-
- vst1q_s32(dst + x, cv_vrndq_s32_f32(v_tmp1));
- vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_tmp2));
- }
- #endif
-
- for(; x < size.width; x++ )
- dst[x] = saturate_cast<int>(src[x]*scale + shift);
- }
-}
-
-template <typename T, typename DT>
-struct Cvt_SIMD
-{
- int operator() (const T *, DT *, int) const
- {
- return 0;
- }
-};
-
-#if CV_SSE2
-
-// from double
-
-template <>
-struct Cvt_SIMD<double, uchar>
-{
- int operator() (const double * src, uchar * dst, int width) const
- {
- int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- for ( ; x <= width - 8; x += 8)
- {
- __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
- __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
- __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
- __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
-
- v_src0 = _mm_movelh_ps(v_src0, v_src1);
- v_src1 = _mm_movelh_ps(v_src2, v_src3);
-
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
- _mm_cvtps_epi32(v_src1));
- _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_dst));
- }
-
return x;
}
};
template <>
-struct Cvt_SIMD<double, schar>
+struct Cvt_SIMD<ushort, int>
{
- int operator() (const double * src, schar * dst, int width) const
+ int operator() (const ushort * src, int * dst, int width) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
- __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
- __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
- __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
-
- v_src0 = _mm_movelh_ps(v_src0, v_src1);
- v_src1 = _mm_movelh_ps(v_src2, v_src3);
-
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
- _mm_cvtps_epi32(v_src1));
- _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_dst));
+ int cWidth = v_int32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_uint16x8 v_src = v_load(src + x);
+ v_uint32x4 v_src1, v_src2;
+ v_expand(v_src, v_src1, v_src2);
+ v_store(dst + x, v_reinterpret_as_s32(v_src1));
+ v_store(dst + x + cWidth, v_reinterpret_as_s32(v_src2));
+ }
}
-
return x;
}
};
-#if CV_TRY_SSE4_1
-
-template <>
-struct Cvt_SIMD<double, ushort>
-{
- bool haveSIMD;
- Cvt_SIMD() { haveSIMD = CV_CPU_HAS_SUPPORT_SSE4_1; }
-
- int operator() (const double * src, ushort * dst, int width) const
- {
- if (haveSIMD)
- return opt_SSE4_1::Cvt_SIMD_f64u16_SSE41(src, dst, width);
- else
- return 0;
- }
-};
-
-#endif // CV_SSE4_1
-
template <>
-struct Cvt_SIMD<double, short>
+struct Cvt_SIMD<ushort, float>
{
- int operator() (const double * src, short * dst, int width) const
+ int operator() (const ushort * src, float * dst, int width) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
- __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
- __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
- __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
-
- v_src0 = _mm_movelh_ps(v_src0, v_src1);
- v_src1 = _mm_movelh_ps(v_src2, v_src3);
-
- __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
- _mm_cvtps_epi32(v_src1));
- _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+ int cWidth = v_int32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_uint16x8 v_src = v_load(src + x);
+ v_uint32x4 v_src1, v_src2;
+ v_expand(v_src, v_src1, v_src2);
+ v_store(dst + x, v_cvt_f32(v_reinterpret_as_s32(v_src1)));
+ v_store(dst + x + cWidth, v_cvt_f32(v_reinterpret_as_s32(v_src2)));
+ }
}
-
return x;
}
};
-template <>
-struct Cvt_SIMD<double, int>
-{
- int operator() (const double * src, int * dst, int width) const
- {
- int x = 0;
-
- if (!USE_SSE2)
- return x;
- for ( ; x <= width - 4; x += 4)
- {
- __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
- __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
- v_src0 = _mm_movelh_ps(v_src0, v_src1);
-
- _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_src0));
- }
-
- return x;
- }
-};
+// from short
template <>
-struct Cvt_SIMD<double, float>
+struct Cvt_SIMD<short, uchar>
{
- int operator() (const double * src, float * dst, int width) const
+ int operator() (const short * src, uchar * dst, int width) const
{
int x = 0;
-
- if (!USE_SSE2)
- return x;
-
- for ( ; x <= width - 4; x += 4)
+ if (hasSIMD128())
{
- __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
- __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
-
- _mm_storeu_ps(dst + x, _mm_movelh_ps(v_src0, v_src1));
+ int cWidth = v_int16x8::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_int16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth);
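+ // v_pack_u saturates: negatives become 0, values above 255 become 255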
+ v_store(dst + x, v_pack_u(v_src1, v_src2));
+ }
}
-
- return x;
- }
-};
-
-
-#elif CV_NEON
-
-// from uchar
-
-template <>
-struct Cvt_SIMD<uchar, schar>
-{
- int operator() (const uchar * src, schar * dst, int width) const
- {
- int x = 0;
-
- for ( ; x <= width - 8; x += 8)
- vst1_s8(dst + x, vqmovn_s16(vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x)))));
-
- return x;
- }
-};
-
-
-template <>
-struct Cvt_SIMD<uchar, ushort>
-{
- int operator() (const uchar * src, ushort * dst, int width) const
- {
- int x = 0;
-
- for ( ; x <= width - 8; x += 8)
- vst1q_u16(dst + x, vmovl_u8(vld1_u8(src + x)));
-
return x;
}
};
template <>
-struct Cvt_SIMD<uchar, short>
-{
- int operator() (const uchar * src, short * dst, int width) const
- {
- int x = 0;
-
- for ( ; x <= width - 8; x += 8)
- vst1q_s16(dst + x, vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x))));
-
- return x;
- }
-};
-
-template <>
-struct Cvt_SIMD<uchar, int>
+struct Cvt_SIMD<short, schar>
{
- int operator() (const uchar * src, int * dst, int width) const
+ int operator() (const short * src, schar * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
- vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src))));
- vst1q_s32(dst + x + 4, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src))));
+ int cWidth = v_int16x8::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_int16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth);
+ v_store(dst + x, v_pack(v_src1, v_src2));
+ }
}
-
return x;
}
};
template <>
-struct Cvt_SIMD<uchar, float>
+struct Cvt_SIMD<short, ushort>
{
- int operator() (const uchar * src, float * dst, int width) const
+ int operator() (const short * src, ushort * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
- vst1q_f32(dst + x, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))));
- vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))));
+ int cWidth = v_int16x8::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ {
+ v_int16x8 v_src = v_load(src + x);
+ v_int32x4 v_dst1, v_dst2;
+ v_expand(v_src, v_dst1, v_dst2);
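+ // widen to s32 first so v_pack_u can clamp negative inputs to 0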
+ v_store(dst + x, v_pack_u(v_dst1, v_dst2));
+ }
}
-
return x;
}
};
-// from schar
-
template <>
-struct Cvt_SIMD<schar, uchar>
+struct Cvt_SIMD<short, int>
{
- int operator() (const schar * src, uchar * dst, int width) const
+ int operator() (const short * src, int * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 8; x += 8)
- vst1_u8(dst + x, vqmovun_s16(vmovl_s8(vld1_s8(src + x))));
-
+ if (hasSIMD128())
+ {
+ int cWidth = v_int32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_int16x8 v_src = v_load(src + x);
+ v_int32x4 v_dst1, v_dst2;
+ v_expand(v_src, v_dst1, v_dst2);
+ v_store(dst + x, v_dst1);
+ v_store(dst + x + cWidth, v_dst2);
+ }
+ }
return x;
}
};
template <>
-struct Cvt_SIMD<schar, short>
+struct Cvt_SIMD<short, float>
{
- int operator() (const schar * src, short * dst, int width) const
+ int operator() (const short * src, float * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 8; x += 8)
- vst1q_s16(dst + x, vmovl_s8(vld1_s8(src + x)));
-
+ if (hasSIMD128())
+ {
+ int cWidth = v_int32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_int16x8 v_src = v_load(src + x);
+ v_int32x4 v_dst1, v_dst2;
+ v_expand(v_src, v_dst1, v_dst2);
+ v_store(dst + x, v_cvt_f32(v_dst1));
+ v_store(dst + x + cWidth, v_cvt_f32(v_dst2));
+ }
+ }
return x;
}
};
+// from int
+
template <>
-struct Cvt_SIMD<schar, ushort>
+struct Cvt_SIMD<int, uchar>
{
- int operator() (const schar * src, ushort * dst, int width) const
+ int operator() (const int * src, uchar * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
- vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(vmovl_s16(vget_low_s16(v_src))),
- vqmovun_s32(vmovl_s16(vget_high_s16(v_src)))));
+ int cWidth = v_int32x4::nlanes;
+ for (; x <= width - cWidth * 4; x += cWidth * 4)
+ {
+ v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth);
+ v_int32x4 v_src3 = v_load(src + x + cWidth * 2), v_src4 = v_load(src + x + cWidth * 3);
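+ // two saturating pack stages narrow s32 -> u16 -> u8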
+ v_uint16x8 v_dst1 = v_pack_u(v_src1, v_src2);
+ v_uint16x8 v_dst2 = v_pack_u(v_src3, v_src4);
+ v_store(dst + x, v_pack(v_dst1, v_dst2));
+ }
}
-
return x;
}
};
-
template <>
-struct Cvt_SIMD<schar, int>
+struct Cvt_SIMD<int, schar>
{
- int operator() (const schar * src, int * dst, int width) const
+ int operator() (const int * src, schar * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
- vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src)));
- vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src)));
+ int cWidth = v_int32x4::nlanes;
+ for (; x <= width - cWidth * 4; x += cWidth * 4)
+ {
+ v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth);
+ v_int32x4 v_src3 = v_load(src + x + cWidth * 2), v_src4 = v_load(src + x + cWidth * 3);
+ v_int16x8 v_dst1 = v_pack(v_src1, v_src2);
+ v_int16x8 v_dst2 = v_pack(v_src3, v_src4);
+ v_store(dst + x, v_pack(v_dst1, v_dst2));
+ }
}
-
return x;
}
};
+
template <>
-struct Cvt_SIMD<schar, float>
+struct Cvt_SIMD<int, ushort>
{
- int operator() (const schar * src, float * dst, int width) const
+ int operator() (const int * src, ushort * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
- vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))));
- vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))));
+ int cWidth = v_int32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth);
+ v_store(dst + x, v_pack_u(v_src1, v_src2));
+ }
}
-
return x;
}
};
-// from ushort
-
template <>
-struct Cvt_SIMD<ushort, uchar>
+struct Cvt_SIMD<int, short>
{
- int operator() (const ushort * src, uchar * dst, int width) const
+ int operator() (const int * src, short * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 16; x += 16)
+ if (hasSIMD128())
{
- uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8);
- vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_src1), vqmovn_u16(v_src2)));
+ int cWidth = v_int32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth);
+ v_store(dst + x, v_pack(v_src1, v_src2));
+ }
}
-
return x;
}
};
template <>
-struct Cvt_SIMD<ushort, schar>
+struct Cvt_SIMD<int, float>
{
- int operator() (const ushort * src, schar * dst, int width) const
+ int operator() (const int * src, float * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 16; x += 16)
+ if (hasSIMD128())
{
- uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8);
- int32x4_t v_dst10 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src1)));
- int32x4_t v_dst11 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src1)));
- int32x4_t v_dst20 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src2)));
- int32x4_t v_dst21 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src2)));
-
- vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst10), vqmovn_s32(v_dst11))),
- vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst20), vqmovn_s32(v_dst21)))));
+ int cWidth = v_int32x4::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ v_store(dst + x, v_cvt_f32(v_load(src + x)));
}
-
return x;
}
};
+// from float
+
template <>
-struct Cvt_SIMD<ushort, short>
+struct Cvt_SIMD<float, uchar>
{
- int operator() (const ushort * src, short * dst, int width) const
+ int operator() (const float * src, uchar * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- uint16x8_t v_src = vld1q_u16(src + x);
- int32x4_t v_dst0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src)));
- int32x4_t v_dst1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src)));
-
- vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1)));
+ int cWidth = v_float32x4::nlanes;
+ for (; x <= width - cWidth * 4; x += cWidth * 4)
+ {
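+ // v_round converts to s32 with rounding to nearest, matching cvRound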
+ v_int32x4 v_src1 = v_round(v_load(src + x));
+ v_int32x4 v_src2 = v_round(v_load(src + x + cWidth));
+ v_int32x4 v_src3 = v_round(v_load(src + x + cWidth * 2));
+ v_int32x4 v_src4 = v_round(v_load(src + x + cWidth * 3));
+ v_uint16x8 v_dst1 = v_pack_u(v_src1, v_src2);
+ v_uint16x8 v_dst2 = v_pack_u(v_src3, v_src4);
+ v_store(dst + x, v_pack(v_dst1, v_dst2));
+ }
}
-
return x;
}
};
template <>
-struct Cvt_SIMD<ushort, int>
+struct Cvt_SIMD<float, schar>
{
- int operator() (const ushort * src, int * dst, int width) const
+ int operator() (const float * src, schar * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- uint16x8_t v_src = vld1q_u16(src + x);
- vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src))));
- vst1q_s32(dst + x + 4, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src))));
+ int cWidth = v_float32x4::nlanes;
+ for (; x <= width - cWidth * 4; x += cWidth * 4)
+ {
+ v_int32x4 v_src1 = v_round(v_load(src + x));
+ v_int32x4 v_src2 = v_round(v_load(src + x + cWidth));
+ v_int32x4 v_src3 = v_round(v_load(src + x + cWidth * 2));
+ v_int32x4 v_src4 = v_round(v_load(src + x + cWidth * 3));
+ v_int16x8 v_dst1 = v_pack(v_src1, v_src2);
+ v_int16x8 v_dst2 = v_pack(v_src3, v_src4);
+ v_store(dst + x, v_pack(v_dst1, v_dst2));
+ }
}
-
return x;
}
};
template <>
-struct Cvt_SIMD<ushort, float>
+struct Cvt_SIMD<float, ushort>
{
- int operator() (const ushort * src, float * dst, int width) const
+ int operator() (const float * src, ushort * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- uint16x8_t v_src = vld1q_u16(src + x);
- vst1q_f32(dst + x, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))));
- vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))));
+ int cWidth = v_float32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_int32x4 v_src1 = v_round(v_load(src + x));
+ v_int32x4 v_src2 = v_round(v_load(src + x + cWidth));
+ v_store(dst + x, v_pack_u(v_src1, v_src2));
+ }
}
-
return x;
}
};
-// from short
-
template <>
-struct Cvt_SIMD<short, uchar>
+struct Cvt_SIMD<float, short>
{
- int operator() (const short * src, uchar * dst, int width) const
+ int operator() (const float * src, short * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 16; x += 16)
+ if (hasSIMD128())
{
- int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8);
- vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_src1), vqmovun_s16(v_src2)));
+ int cWidth = v_float32x4::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_int32x4 v_src1 = v_round(v_load(src + x));
+ v_int32x4 v_src2 = v_round(v_load(src + x + cWidth));
+ v_store(dst + x, v_pack(v_src1, v_src2));
+ }
}
-
return x;
}
};
template <>
-struct Cvt_SIMD<short, schar>
+struct Cvt_SIMD<float, int>
{
- int operator() (const short * src, schar * dst, int width) const
+ int operator() (const float * src, int * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 16; x += 16)
+ if (hasSIMD128())
{
- int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8);
- vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(v_src1), vqmovn_s16(v_src2)));
+ int cWidth = v_float32x4::nlanes;
+ for (; x <= width - cWidth; x += cWidth)
+ v_store(dst + x, v_round(v_load(src + x)));
}
-
return x;
}
};
+#if CV_SIMD128_64F
+// from double
template <>
-struct Cvt_SIMD<short, ushort>
+struct Cvt_SIMD<double, uchar>
{
- int operator() (const short * src, ushort * dst, int width) const
+ int operator() (const double * src, uchar * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- int16x8_t v_src = vld1q_s16(src + x);
- uint16x4_t v_dst1 = vqmovun_s32(vmovl_s16(vget_low_s16(v_src)));
- uint16x4_t v_dst2 = vqmovun_s32(vmovl_s16(vget_high_s16(v_src)));
- vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2));
- }
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 4; x += cWidth * 4)
+ {
+ v_float32x4 v_src0 = v_cvt_f32(v_load(src + x));
+ v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth));
+ v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2));
+ v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3));
+
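+ // v_cvt_f32 fills only the low two lanes, so v_combine_low merges two pairs into one full f32 vector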
+ v_src0 = v_combine_low(v_src0, v_src1);
+ v_src1 = v_combine_low(v_src2, v_src3);
+ v_int16x8 v_dst = v_pack(v_round(v_src0), v_round(v_src1));
+ v_pack_u_store(dst + x, v_dst);
+ }
+ }
return x;
}
};
template <>
-struct Cvt_SIMD<short, int>
+struct Cvt_SIMD<double, schar>
{
- int operator() (const short * src, int * dst, int width) const
+ int operator() (const double * src, schar * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- int16x8_t v_src = vld1q_s16(src + x);
- vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src)));
- vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src)));
- }
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 4; x += cWidth * 4)
+ {
+ v_float32x4 v_src0 = v_cvt_f32(v_load(src + x));
+ v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth));
+ v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2));
+ v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3));
+
+ v_src0 = v_combine_low(v_src0, v_src1);
+ v_src1 = v_combine_low(v_src2, v_src3);
+ v_int16x8 v_dst = v_pack(v_round(v_src0), v_round(v_src1));
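+ // only 8 valid results per iteration: pack the vector against itself and store the low half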
+ v_store_low(dst + x, v_pack(v_dst, v_dst));
+ }
+ }
return x;
}
};
template <>
-struct Cvt_SIMD<short, float>
+struct Cvt_SIMD<double, ushort>
{
- int operator() (const short * src, float * dst, int width) const
+ int operator() (const double * src, ushort * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 8; x += 8)
+#if CV_TRY_SSE4_1
+ if (CV_CPU_HAS_SUPPORT_SSE4_1)
+ return opt_SSE4_1::Cvt_SIMD_f64u16_SSE41(src, dst, width);
+#endif
+ if (hasSIMD128())
{
- int16x8_t v_src = vld1q_s16(src + x);
- vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))));
- vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))));
- }
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 4; x += cWidth * 4)
+ {
+ v_float32x4 v_src0 = v_cvt_f32(v_load(src + x));
+ v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth));
+ v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2));
+ v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3));
+ v_src0 = v_combine_low(v_src0, v_src1);
+ v_src1 = v_combine_low(v_src2, v_src3);
+
+ v_uint16x8 v_dst = v_pack_u(v_round(v_src0), v_round(v_src1));
+ v_store(dst + x, v_dst);
+ }
+ }
return x;
}
};
-// from int
-
template <>
-struct Cvt_SIMD<int, uchar>
+struct Cvt_SIMD<double, short>
{
- int operator() (const int * src, uchar * dst, int width) const
+ int operator() (const double * src, short * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 16; x += 16)
+ if (hasSIMD128())
{
- int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
- int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12);
- uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2)));
- uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src3), vqmovun_s32(v_src4)));
- vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2));
- }
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 4; x += cWidth * 4)
+ {
+ v_float32x4 v_src0 = v_cvt_f32(v_load(src + x));
+ v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth));
+ v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2));
+ v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3));
+ v_src0 = v_combine_low(v_src0, v_src1);
+ v_src1 = v_combine_low(v_src2, v_src3);
+
+ v_int16x8 v_dst = v_pack(v_round(v_src0), v_round(v_src1));
+ v_store(dst + x, v_dst);
+ }
+ }
return x;
}
};
template <>
-struct Cvt_SIMD<int, schar>
+struct Cvt_SIMD<double, int>
{
- int operator() (const int * src, schar * dst, int width) const
+ int operator() (const double * src, int * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 16; x += 16)
+ if (hasSIMD128())
{
- int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
- int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12);
- int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2)));
- int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4)));
- vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2));
- }
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_float32x4 v_src0 = v_cvt_f32(v_load(src + x));
+ v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth));
+ v_store(dst + x, v_round(v_combine_low(v_src0, v_src1)));
+ }
+ }
return x;
}
};
-
template <>
-struct Cvt_SIMD<int, ushort>
+struct Cvt_SIMD<double, float>
{
- int operator() (const int * src, ushort * dst, int width) const
+ int operator() (const double * src, float * dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
- vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2)));
- }
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_float32x4 v_src0 = v_cvt_f32(v_load(src + x));
+ v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth));
+ v_store(dst + x, v_combine_low(v_src0, v_src1));
+ }
+ }
return x;
}
};
+// to double
+
template <>
-struct Cvt_SIMD<int, short>
+struct Cvt_SIMD<uchar, double>
{
- int operator() (const int * src, short * dst, int width) const
+ int operator() (const uchar* src, double* dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
- vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2)));
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 4; x += cWidth * 4)
+ {
+ v_uint16x8 v_src = v_load_expand(src + x);
+ v_uint32x4 v_src1, v_src2;
+ v_expand(v_src, v_src1, v_src2);
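+ // expand u8 -> u16 -> u32, then v_cvt_f64 / v_cvt_f64_high convert the low and high s32 lanes to f64 pairs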
+ v_store(dst + x, v_cvt_f64(v_reinterpret_as_s32(v_src1)));
+ v_store(dst + x + cWidth, v_cvt_f64_high(v_reinterpret_as_s32(v_src1)));
+ v_store(dst + x + cWidth * 2, v_cvt_f64(v_reinterpret_as_s32(v_src2)));
+ v_store(dst + x + cWidth * 3, v_cvt_f64_high(v_reinterpret_as_s32(v_src2)));
+ }
}
-
return x;
}
};
template <>
-struct Cvt_SIMD<int, float>
+struct Cvt_SIMD<schar, double>
{
- int operator() (const int * src, float * dst, int width) const
+ int operator() (const schar* src, double* dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 4; x += 4)
- vst1q_f32(dst + x, vcvtq_f32_s32(vld1q_s32(src + x)));
-
+ if (hasSIMD128())
+ {
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 4; x += cWidth * 4)
+ {
+ v_int16x8 v_src = v_load_expand(src + x);
+ v_int32x4 v_src1, v_src2;
+ v_expand(v_src, v_src1, v_src2);
+ v_store(dst + x, v_cvt_f64(v_src1));
+ v_store(dst + x + cWidth, v_cvt_f64_high(v_src1));
+ v_store(dst + x + cWidth * 2, v_cvt_f64(v_src2));
+ v_store(dst + x + cWidth * 3, v_cvt_f64_high(v_src2));
+ }
+ }
return x;
}
};
-// from float
-
template <>
-struct Cvt_SIMD<float, uchar>
+struct Cvt_SIMD<ushort, double>
{
- int operator() (const float * src, uchar * dst, int width) const
+ int operator() (const ushort* src, double* dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 16; x += 16)
+ if (hasSIMD128())
{
- uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x));
- uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4));
- uint32x4_t v_src3 = cv_vrndq_u32_f32(vld1q_f32(src + x + 8));
- uint32x4_t v_src4 = cv_vrndq_u32_f32(vld1q_f32(src + x + 12));
- uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2)));
- uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src3), vqmovn_u32(v_src4)));
- vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2));
- }
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_uint32x4 v_src = v_load_expand(src + x);
+ v_store(dst + x, v_cvt_f64(v_reinterpret_as_s32(v_src)));
+ v_store(dst + x + cWidth, v_cvt_f64_high(v_reinterpret_as_s32(v_src)));
+ }
+ }
return x;
}
};
template <>
-struct Cvt_SIMD<float, schar>
+struct Cvt_SIMD<short, double>
{
- int operator() (const float * src, schar * dst, int width) const
+ int operator() (const short* src, double* dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 16; x += 16)
+ if (hasSIMD128())
{
- int32x4_t v_src1 = cv_vrndq_s32_f32(vld1q_f32(src + x));
- int32x4_t v_src2 = cv_vrndq_s32_f32(vld1q_f32(src + x + 4));
- int32x4_t v_src3 = cv_vrndq_s32_f32(vld1q_f32(src + x + 8));
- int32x4_t v_src4 = cv_vrndq_s32_f32(vld1q_f32(src + x + 12));
- int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2)));
- int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4)));
- vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2));
- }
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_int32x4 v_src = v_load_expand(src + x);
+ v_store(dst + x, v_cvt_f64(v_src));
+ v_store(dst + x + cWidth, v_cvt_f64_high(v_src));
+ }
+ }
return x;
}
};
-
template <>
-struct Cvt_SIMD<float, ushort>
+struct Cvt_SIMD<int, double>
{
- int operator() (const float * src, ushort * dst, int width) const
+ int operator() (const int* src, double* dst, int width) const
{
int x = 0;
-
- for ( ; x <= width - 8; x += 8)
+ if (hasSIMD128())
{
- uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x));
- uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4));
- vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2)));
- }
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_int32x4 v_src = v_load(src + x);
+ v_store(dst + x, v_cvt_f64(v_src));
+ v_store(dst + x + cWidth, v_cvt_f64_high(v_src));
+ }
+ }
return x;
}
};
template <>
-struct Cvt_SIMD<float, int>
+struct Cvt_SIMD<float, double>
{
- int operator() (const float * src, int * dst, int width) const
+ int operator() (const float* src, double* dst, int width) const
{
int x = 0;
- for ( ; x <= width - 4; x += 4)
- vst1q_s32(dst + x, cv_vrndq_s32_f32(vld1q_f32(src + x)));
-
+ if (hasSIMD128())
+ {
+ int cWidth = v_float64x2::nlanes;
+ for (; x <= width - cWidth * 2; x += cWidth * 2)
+ {
+ v_float32x4 v_src = v_load(src + x);
+ v_store(dst + x, v_cvt_f64(v_src));
+ v_store(dst + x + cWidth, v_cvt_f64_high(v_src));
+ }
+ }
return x;
}
};
-
-#endif
+#endif // CV_SIMD128_64F
+#endif // CV_SIMD128
// template for FP16 HW conversion function
template<typename T, typename DT> static void
}
}
-//vz optimized template specialization, test Core_ConvertScale/ElemWiseTest
-template<> void
-cvt_<float, short>( const float* src, size_t sstep,
- short* dst, size_t dstep, Size size )
-{
- sstep /= sizeof(src[0]);
- dstep /= sizeof(dst[0]);
-
- for( ; size.height--; src += sstep, dst += dstep )
- {
- int x = 0;
- #if CV_SSE2
- if(USE_SSE2)
- {
- for( ; x <= size.width - 8; x += 8 )
- {
- __m128 src128 = _mm_loadu_ps (src + x);
- __m128i src_int128 = _mm_cvtps_epi32 (src128);
-
- src128 = _mm_loadu_ps (src + x + 4);
- __m128i src1_int128 = _mm_cvtps_epi32 (src128);
-
- src1_int128 = _mm_packs_epi32(src_int128, src1_int128);
- _mm_storeu_si128((__m128i*)(dst + x),src1_int128);
- }
- }
- #elif CV_NEON
- for( ; x <= size.width - 8; x += 8 )
- {
- float32x4_t v_src1 = vld1q_f32(src + x), v_src2 = vld1q_f32(src + x + 4);
- int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_src1)),
- vqmovn_s32(cv_vrndq_s32_f32(v_src2)));
- vst1q_s16(dst + x, v_dst);
- }
- #endif
- for( ; x < size.width; x++ )
- dst[x] = saturate_cast<short>(src[x]);
- }
-
-}
-
-
template<typename T> static void
cpy_( const T* src, size_t sstep, T* dst, size_t dstep, Size size )
{