From ca1a0a1108d9f4e894bf747cd5f0ee2b36cfc25c Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima
Date: Fri, 15 Dec 2017 06:00:35 +0900
Subject: [PATCH] core: remove raw SSE2/NEON implementation from convert.cpp (#9831)

* remove raw SSE2/NEON implementation from convert.cpp
* remove raw implementation from Cvt_SIMD
* remove raw implementation from cvtScale_SIMD
* remove raw implementation from cvtScaleAbs_SIMD
* remove duplicated implementation cvt_
* remove duplicated implementation cvtScale_
* add "from double" version of Cvt_SIMD
* modify the condition of test ConvertScaleAbs
* Update convert.cpp
  fixed crash in cvtScaleAbs(8s=>8u)
* fixed compile error on Win32
* fixed several test failures because of accuracy loss in cvtScale(int=>int)
* fixed NEON implementation of v_cvt_f64(int=>double) intrinsic
* another attempt to fix test failures
* keep trying to fix the test failures and just introduced compile warnings
* fixed one remaining test (subtractScalar)
---
 modules/core/perf/perf_arithm.cpp |   24 +-
 modules/core/src/convert.cpp      | 3613 +++++++++++++------------------
 modules/core/test/test_arithm.cpp |    5 +
 3 files changed, 1297 insertions(+), 2345 deletions(-)

diff --git a/modules/core/perf/perf_arithm.cpp b/modules/core/perf/perf_arithm.cpp
index c6c2a1b..310cc0b 100644
--- a/modules/core/perf/perf_arithm.cpp
+++ b/modules/core/perf/perf_arithm.cpp
@@ -80,16 +80,18 @@ PERF_TEST_P(Size_MatType, absdiff, TYPICAL_MATS_CORE_ARITHM)
     declare.in(a, b, WARMUP_RNG).out(c);
+    double eps = 1e-8;
     if (CV_MAT_DEPTH(type) == CV_32S)
     {
         //see ticket 1529: absdiff can be without saturation on 32S
         a /= 2;
         b /= 2;
+        eps = 1;
     }
     TEST_CYCLE() absdiff(a, b, c);
-    SANITY_CHECK(c, 1e-8);
+    SANITY_CHECK(c, eps);
 }
 PERF_TEST_P(Size_MatType, absdiffScalar, TYPICAL_MATS_CORE_ARITHM)
@@ -102,16 +104,18 @@ PERF_TEST_P(Size_MatType, absdiffScalar, TYPICAL_MATS_CORE_ARITHM)
     declare.in(a, b, WARMUP_RNG).out(c);
+    double eps = 1e-8;
     if (CV_MAT_DEPTH(type) == CV_32S)
     {
         //see ticket 1529: absdiff can be without saturation on 32S
         a /= 2;
         b /= 2;
+        eps = 1;
     }
     TEST_CYCLE() absdiff(a, b, c);
-    SANITY_CHECK(c, 1e-8);
+    SANITY_CHECK(c, eps);
 }
 PERF_TEST_P(Size_MatType, add, TYPICAL_MATS_CORE_ARITHM)
@@ -125,16 +129,18 @@ PERF_TEST_P(Size_MatType, add, TYPICAL_MATS_CORE_ARITHM)
     declare.in(a, b, WARMUP_RNG).out(c);
     declare.time(50);
+    double eps = 1e-8;
     if (CV_MAT_DEPTH(type) == CV_32S)
     {
         //see ticket 1529: add can be without saturation on 32S
         a /= 2;
         b /= 2;
+        eps = 1;
     }
     TEST_CYCLE() add(a, b, c);
-    SANITY_CHECK(c, 1e-8);
+    SANITY_CHECK(c, eps);
 }
 PERF_TEST_P(Size_MatType, addScalar, TYPICAL_MATS_CORE_ARITHM)
@@ -147,16 +153,18 @@ PERF_TEST_P(Size_MatType, addScalar, TYPICAL_MATS_CORE_ARITHM)
     declare.in(a, b, WARMUP_RNG).out(c);
+    double eps = 1e-8;
     if (CV_MAT_DEPTH(type) == CV_32S)
     {
         //see ticket 1529: add can be without saturation on 32S
         a /= 2;
         b /= 2;
+        eps = 1;
     }
     TEST_CYCLE() add(a, b, c);
-    SANITY_CHECK(c, 1e-8);
+    SANITY_CHECK(c, eps);
 }
 PERF_TEST_P(Size_MatType, subtract, TYPICAL_MATS_CORE_ARITHM)
@@ -169,16 +177,18 @@ PERF_TEST_P(Size_MatType, subtract, TYPICAL_MATS_CORE_ARITHM)
     declare.in(a, b, WARMUP_RNG).out(c);
+    double eps = 1e-8;
     if (CV_MAT_DEPTH(type) == CV_32S)
     {
         //see ticket 1529: subtract can be without saturation on 32S
         a /= 2;
         b /= 2;
+        eps = 1;
     }
     TEST_CYCLE() subtract(a, b, c);
-    SANITY_CHECK(c, 1e-8);
+    SANITY_CHECK(c, eps);
 }
 PERF_TEST_P(Size_MatType, subtractScalar, TYPICAL_MATS_CORE_ARITHM)
@@ -191,16 +201,18 @@ PERF_TEST_P(Size_MatType, subtractScalar, TYPICAL_MATS_CORE_ARITHM)
     declare.in(a, b,
WARMUP_RNG).out(c); + double eps = 1e-8; if (CV_MAT_DEPTH(type) == CV_32S) { //see ticket 1529: subtract can be without saturation on 32S a /= 2; b /= 2; + eps = 1; } TEST_CYCLE() subtract(a, b, c); - SANITY_CHECK(c, 1e-8); + SANITY_CHECK(c, eps); } PERF_TEST_P(Size_MatType, multiply, TYPICAL_MATS_CORE_ARITHM) diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index dfc2f2d..0a2b158 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -960,41 +960,77 @@ struct cvtScaleAbs_SIMD } }; -#if CV_SSE2 +#if CV_SIMD128 + +static inline void v_load_expand_from_u8_f32(const uchar* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) +{ + v_uint32x4 v_src0, v_src1; + v_expand(v_load_expand(src), v_src0, v_src1); + + a = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src0)); + b = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src1)); +} + +static inline void v_load_expand_from_s8_f32(const schar* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) +{ + v_int32x4 v_src0, v_src1; + v_expand(v_load_expand(src), v_src0, v_src1); + + a = v_shift + v_scale * v_cvt_f32(v_src0); + b = v_shift + v_scale * v_cvt_f32(v_src1); +} + +static inline void v_load_expand_from_u16_f32(const ushort* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) +{ + v_uint32x4 v_src0, v_src1; + v_expand(v_load(src), v_src0, v_src1); + + a = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src0)); + b = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src1)); +} + +static inline void v_load_expand_from_s16_f32(const short* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) +{ + v_int32x4 v_src0, v_src1; + v_expand(v_load(src), v_src0, v_src1); + + a = v_shift + v_scale * v_cvt_f32(v_src0); + b = v_shift + v_scale * v_cvt_f32(v_src1); +} + +static inline void v_load_expand_from_s32_f32(const int* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) +{ + a = v_shift + v_scale * v_cvt_f32(v_load(src)); + b = v_shift + v_scale * v_cvt_f32(v_load(src + v_int32x4::nlanes)); +} template <> struct cvtScaleAbs_SIMD { int operator () (const uchar * src, uchar * dst, int width, - float scale, float shift) const + float scale, float shift) const { int x = 0; - - if (USE_SSE2) + if (hasSIMD128()) { - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), - v_zero_f = _mm_setzero_ps(); - __m128i v_zero_i = _mm_setzero_si128(); - - for ( ; x <= width - 16; x += 16) + v_float32x4 v_shift = v_setall_f32(shift); + v_float32x4 v_scale = v_setall_f32(scale); + const int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) { - __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x)); - __m128i v_src12 = _mm_unpacklo_epi8(v_src, v_zero_i), v_src_34 = _mm_unpackhi_epi8(v_src, v_zero_i); - __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src12, v_zero_i)), v_scale), v_shift); - v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); - __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src12, v_zero_i)), v_scale), v_shift); - v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2); - __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_34, v_zero_i)), v_scale), v_shift); - v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3); - __m128 v_dst4 = 
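(Illustrative aside, not part of the patch.) The new v_load_expand_from_{u8,s8,u16,s16,s32}_f32 helpers above widen eight narrow source values into two v_float32x4 registers and fold the scale/shift into the load. For the uchar case they behave like this scalar sketch (the function name is hypothetical):

    static void scale_shift_u8_scalar(const uchar* src, float scale, float shift,
                                      float a[4], float b[4])
    {
        for (int i = 0; i < 4; i++)
            a[i] = shift + scale * (float)src[i];      // lanes of the first register
        for (int i = 0; i < 4; i++)
            b[i] = shift + scale * (float)src[i + 4];  // lanes of the second register
    }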
_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_34, v_zero_i)), v_scale), v_shift); - v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4); - - __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), - _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4))); - _mm_storeu_si128((__m128i *)(dst + x), v_dst_i); + v_float32x4 v_dst_0, v_dst_1, v_dst_2, v_dst_3; + v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_dst_0, v_dst_1); + v_load_expand_from_u8_f32(src + x + cWidth, v_scale, v_shift, v_dst_2, v_dst_3); + v_dst_0 = v_abs(v_dst_0); + v_dst_1 = v_abs(v_dst_1); + v_dst_2 = v_abs(v_dst_2); + v_dst_3 = v_abs(v_dst_3); + + v_int16x8 v_dsti_0 = v_pack(v_round(v_dst_0), v_round(v_dst_1)); + v_int16x8 v_dsti_1 = v_pack(v_round(v_dst_2), v_round(v_dst_3)); + v_store(dst + x, v_pack_u(v_dsti_0, v_dsti_1)); } } - return x; } }; @@ -1003,40 +1039,29 @@ template <> struct cvtScaleAbs_SIMD { int operator () (const schar * src, uchar * dst, int width, - float scale, float shift) const + float scale, float shift) const { int x = 0; - - if (USE_SSE2) + if (hasSIMD128()) { - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), - v_zero_f = _mm_setzero_ps(); - __m128i v_zero_i = _mm_setzero_si128(); - - for ( ; x <= width - 16; x += 16) + v_float32x4 v_shift = v_setall_f32(shift); + v_float32x4 v_scale = v_setall_f32(scale); + const int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth*2; x += cWidth*2) { - __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x)); - __m128i v_src_12 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero_i, v_src), 8), - v_src_34 = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero_i, v_src), 8); - __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( - _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift); - v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); - __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( - _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift); - v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2); - __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( - _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift); - v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3); - __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( - _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift); - v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4); - - __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), - _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4))); - _mm_storeu_si128((__m128i *)(dst + x), v_dst_i); + v_float32x4 v_dst_0, v_dst_1, v_dst_2, v_dst_3; + v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_dst_0, v_dst_1); + v_load_expand_from_s8_f32(src + x + cWidth, v_scale, v_shift, v_dst_2, v_dst_3); + v_dst_0 = v_abs(v_dst_0); + v_dst_1 = v_abs(v_dst_1); + v_dst_2 = v_abs(v_dst_2); + v_dst_3 = v_abs(v_dst_3); + + v_uint16x8 v_dsti_0 = v_pack_u(v_round(v_dst_0), v_round(v_dst_1)); + v_uint16x8 v_dsti_1 = v_pack_u(v_round(v_dst_2), v_round(v_dst_3)); + v_store(dst + x, v_pack(v_dsti_0, v_dsti_1)); } } - return x; } }; @@ -1045,29 +1070,25 @@ template <> struct cvtScaleAbs_SIMD { int operator () (const ushort * src, uchar * dst, int width, - float scale, float shift) const + float scale, float shift) const { int x = 0; - - if (USE_SSE2) + if (hasSIMD128()) { - __m128 v_scale = 
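(Illustrative aside.) The cvtScaleAbs_SIMD kernels are the vectorized core used by cv::convertScaleAbs, i.e. dst(i) = saturate_cast<uchar>(|alpha*src(i) + beta|). A minimal usage sketch, with values chosen only to show the saturation:

    cv::Mat src = (cv::Mat_<short>(1, 4) << -300, -1, 0, 100);
    cv::Mat dst;
    cv::convertScaleAbs(src, dst, /*alpha=*/1.0, /*beta=*/0.0);
    // dst is CV_8U and holds {255, 1, 0, 100}; |-300| saturates to 255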
_mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), - v_zero_f = _mm_setzero_ps(); - __m128i v_zero_i = _mm_setzero_si128(); - - for ( ; x <= width - 8; x += 8) + v_float32x4 v_shift = v_setall_f32(shift); + v_float32x4 v_scale = v_setall_f32(scale); + const int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) { - __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x)); - __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero_i)), v_scale), v_shift); - v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); - __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero_i)), v_scale), v_shift); - v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2); - - __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i); - _mm_storel_epi64((__m128i *)(dst + x), v_dst_i); + v_float32x4 v_dst0, v_dst1; + v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_dst0, v_dst1); + v_dst0 = v_abs(v_dst0); + v_dst1 = v_abs(v_dst1); + + v_int16x8 v_dst = v_pack(v_round(v_dst0), v_round(v_dst1)); + v_pack_u_store(dst + x, v_dst); } } - return x; } }; @@ -1076,29 +1097,25 @@ template <> struct cvtScaleAbs_SIMD { int operator () (const short * src, uchar * dst, int width, - float scale, float shift) const + float scale, float shift) const { int x = 0; - - if (USE_SSE2) + if (hasSIMD128()) { - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), - v_zero_f = _mm_setzero_ps(); - __m128i v_zero_i = _mm_setzero_si128(); - - for ( ; x <= width - 8; x += 8) + v_float32x4 v_shift = v_setall_f32(shift); + v_float32x4 v_scale = v_setall_f32(scale); + const int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) { - __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x)); - __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_src, v_src), 16)), v_scale), v_shift); - v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); - __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_src, v_src), 16)), v_scale), v_shift); - v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2); - - __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i); - _mm_storel_epi64((__m128i *)(dst + x), v_dst_i); + v_float32x4 v_dst0, v_dst1; + v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_dst0, v_dst1); + v_dst0 = v_abs(v_dst0); + v_dst1 = v_abs(v_dst1); + + v_int16x8 v_dst = v_pack(v_round(v_dst0), v_round(v_dst1)); + v_pack_u_store(dst + x, v_dst); } } - return x; } }; @@ -1107,25 +1124,22 @@ template <> struct cvtScaleAbs_SIMD { int operator () (const int * src, uchar * dst, int width, - float scale, float shift) const + float scale, float shift) const { int x = 0; - - if (USE_SSE2) + v_float32x4 v_shift = v_setall_f32(shift); + v_float32x4 v_scale = v_setall_f32(scale); + const int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) { - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), - v_zero_f = _mm_setzero_ps(); - __m128i v_zero_i = _mm_setzero_si128(); + v_float32x4 v_dst_0 = v_cvt_f32(v_load(src + x)) * v_scale; + v_dst_0 = v_abs(v_dst_0 + v_shift); - for ( ; x <= width - 8; x += 4) - { - __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x)); - __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); - v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, 
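(Illustrative aside.) All of the packing intrinsics these kernels rely on saturate rather than truncate; a small self-contained check of that behavior (requires opencv2/core/hal/intrin.hpp, values are arbitrary):

    v_int32x4  a = v_setall_s32(70000), b = v_setall_s32(-70000);
    v_int16x8  p  = v_pack(a, b);    // signed saturation:   32767 and -32768
    v_uint16x8 pu = v_pack_u(a, b);  // unsigned saturation: 65535 and 0
    uchar out8[8];
    v_pack_u_store(out8, p);         // 16-bit -> 8-bit unsigned: 255 and 0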
v_dst1), v_dst1); + v_float32x4 v_dst_1 = v_cvt_f32(v_load(src + x + cWidth)) * v_scale; + v_dst_1 = v_abs(v_dst_1 + v_shift); - __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), v_zero_i), v_zero_i); - _mm_storel_epi64((__m128i *)(dst + x), v_dst_i); - } + v_int16x8 v_dst = v_pack(v_round(v_dst_0), v_round(v_dst_1)); + v_pack_u_store(dst + x, v_dst); } return x; @@ -1136,273 +1150,59 @@ template <> struct cvtScaleAbs_SIMD { int operator () (const float * src, uchar * dst, int width, - float scale, float shift) const + float scale, float shift) const { int x = 0; - - if (USE_SSE2) + v_float32x4 v_shift = v_setall_f32(shift); + v_float32x4 v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) { - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), - v_zero_f = _mm_setzero_ps(); - __m128i v_zero_i = _mm_setzero_si128(); + v_float32x4 v_dst_0 = v_load(src + x) * v_scale; + v_dst_0 = v_abs(v_dst_0 + v_shift); - for ( ; x <= width - 8; x += 4) - { - __m128 v_dst = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + x), v_scale), v_shift); - v_dst = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst), v_dst); + v_float32x4 v_dst_1 = v_load(src + x + cWidth) * v_scale; + v_dst_1 = v_abs(v_dst_1 + v_shift); - __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst), v_zero_i); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i)); - } + v_int16x8 v_dst = v_pack(v_round(v_dst_0), v_round(v_dst_1)); + v_pack_u_store(dst + x, v_dst); } - return x; } }; +#if CV_SIMD128_64F template <> struct cvtScaleAbs_SIMD { int operator () (const double * src, uchar * dst, int width, - float scale, float shift) const + float scale, float shift) const { int x = 0; - if (USE_SSE2) + if (hasSIMD128()) { - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), - v_zero_f = _mm_setzero_ps(); - __m128i v_zero_i = _mm_setzero_si128(); - - for ( ; x <= width - 8; x += 8) + v_float32x4 v_scale = v_setall_f32(scale); + v_float32x4 v_shift = v_setall_f32(shift); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) { - __m128 v_src1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); - __m128 v_src2 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); - - __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(v_src1, v_scale), v_shift); - v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); + v_float32x4 v_src1, v_src2, v_dummy; + v_recombine(v_cvt_f32(v_load(src + x)), v_cvt_f32(v_load(src + x + cWidth)), v_src1, v_dummy); + v_recombine(v_cvt_f32(v_load(src + x + cWidth * 2)), v_cvt_f32(v_load(src + x + cWidth * 3)), v_src2, v_dummy); - __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(v_src2, v_scale), v_shift); - v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2); + v_float32x4 v_dst1 = v_abs((v_src1 * v_scale) + v_shift); + v_float32x4 v_dst2 = v_abs((v_src2 * v_scale) + v_shift); - __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), - _mm_cvtps_epi32(v_dst2)); - - _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i)); + v_int16x8 v_dst_i = v_pack(v_round(v_dst1), v_round(v_dst2)); + v_pack_u_store(dst + x, v_dst_i); } } return x; } }; - -#elif CV_NEON - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const uchar * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift); - - for ( ; x <= width 
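(Illustrative aside.) On targets with CV_SIMD128_64F, v_cvt_f32 of a v_float64x2 fills only the two low lanes of the result, which is why the double input path uses v_recombine to gather four converted values into one v_float32x4 before scaling. One iteration of that kernel matches this scalar sketch (reusing the kernel's own src, dst, x, scale and shift):

    for (int i = 0; i < 8; i++)           // cWidth * 4 == 8 doubles per iteration
    {
        float t = (float)src[x + i];      // narrow to f32 first, as the vector code does
        dst[x + i] = cv::saturate_cast<uchar>(std::abs(t * scale + shift));
    }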
- 16; x += 16) - { - uint8x16_t v_src = vld1q_u8(src + x); - uint16x8_t v_half = vmovl_u8(vget_low_u8(v_src)); - - uint32x4_t v_quat = vmovl_u16(vget_low_u16(v_half)); - float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale); - v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); - - v_quat = vmovl_u16(vget_high_u16(v_half)); - float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale); - v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); - - v_half = vmovl_u8(vget_high_u8(v_src)); - - v_quat = vmovl_u16(vget_low_u16(v_half)); - float32x4_t v_dst_2 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale); - v_dst_2 = vabsq_f32(vaddq_f32(v_dst_2, v_shift)); - - v_quat = vmovl_u16(vget_high_u16(v_half)); - float32x4_t v_dst_3 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale); - v_dst_3 = vabsq_f32(vaddq_f32(v_dst_3, v_shift)); - - uint16x8_t v_dsti_0 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst_1))); - uint16x8_t v_dsti_1 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_2)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst_3))); - - vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_dsti_0), vqmovn_u16(v_dsti_1))); - } - - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const schar * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift); - - for ( ; x <= width - 16; x += 16) - { - int8x16_t v_src = vld1q_s8(src + x); - int16x8_t v_half = vmovl_s8(vget_low_s8(v_src)); - - int32x4_t v_quat = vmovl_s16(vget_low_s16(v_half)); - float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale); - v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); - - v_quat = vmovl_s16(vget_high_s16(v_half)); - float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale); - v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); - - v_half = vmovl_s8(vget_high_s8(v_src)); - - v_quat = vmovl_s16(vget_low_s16(v_half)); - float32x4_t v_dst_2 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale); - v_dst_2 = vabsq_f32(vaddq_f32(v_dst_2, v_shift)); - - v_quat = vmovl_s16(vget_high_s16(v_half)); - float32x4_t v_dst_3 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale); - v_dst_3 = vabsq_f32(vaddq_f32(v_dst_3, v_shift)); - - uint16x8_t v_dsti_0 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst_1))); - uint16x8_t v_dsti_1 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_2)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst_3))); - - vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_dsti_0), vqmovn_u16(v_dsti_1))); - } - - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const ushort * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift); - - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src = vld1q_u16(src + x); - - uint32x4_t v_half = vmovl_u16(vget_low_u16(v_src)); - float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale); - v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); - - v_half = vmovl_u16(vget_high_u16(v_src)); - float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale); - v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst_1))); - - vst1_u8(dst + x, vqmovn_u16(v_dst)); - } - - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const short * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = 
vdupq_n_f32(shift); - - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src = vld1q_s16(src + x); - - int32x4_t v_half = vmovl_s16(vget_low_s16(v_src)); - float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale); - v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); - - v_half = vmovl_s16(vget_high_s16(v_src)); - float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale); - v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst_1))); - - vst1_u8(dst + x, vqmovn_u16(v_dst)); - } - - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const int * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift); - - for ( ; x <= width - 8; x += 8) - { - float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x)), scale); - v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); - uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)); - - float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), scale); - v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); - uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)); - - uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1); - vst1_u8(dst + x, vqmovn_u16(v_dst)); - } - - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const float * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift); - - for ( ; x <= width - 8; x += 8) - { - float32x4_t v_dst_0 = vmulq_n_f32(vld1q_f32(src + x), scale); - v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); - uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)); - - float32x4_t v_dst_1 = vmulq_n_f32(vld1q_f32(src + x + 4), scale); - v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); - uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)); - - uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1); - vst1_u8(dst + x, vqmovn_u16(v_dst)); - } - - return x; - } -}; +#endif // CV_SIMD128_64F #endif @@ -1445,7 +1245,7 @@ struct cvtScale_SIMD } }; -#if CV_SSE2 +#if CV_SIMD128 // from uchar @@ -1455,27 +1255,19 @@ struct cvtScale_SIMD int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_pack_u_store(dst + x, v_dst); + } } - return x; } }; @@ -1486,81 +1278,69 @@ struct cvtScale_SIMD int operator () (const 
uchar * src, schar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store_low(dst + x, v_pack(v_dst, v_dst)); + } } - return x; } }; -#if CV_TRY_SSE4_1 - template <> struct cvtScale_SIMD { - cvtScale_SIMD() - { - haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1; - } - int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const { - if (haveSSE) + int x = 0; +#if CV_TRY_SSE4_1 + if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_u8u16f32_SSE41(src, dst, width, scale, shift); - else - return 0; - } +#endif + if (hasSIMD128()) + { + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - bool haveSSE; + v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } + } + return x; + } }; -#endif - template <> struct cvtScale_SIMD { int operator () (const uchar * src, short * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } } - return x; } }; @@ -1571,26 +1351,19 @@ struct cvtScale_SIMD int operator () (const uchar * src, int * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for 
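(Illustrative aside.) The cvtScale_SIMD specializations are the inner loops behind cv::Mat::convertTo when a scale and offset are supplied, i.e. dst(i) = saturate_cast<DT>(alpha*src(i) + beta). A minimal usage sketch with illustrative values:

    cv::Mat src(1, 8, CV_8UC1, cv::Scalar(200));
    cv::Mat dst;
    src.convertTo(dst, CV_16S, /*alpha=*/2.0, /*beta=*/-100.0);
    // every element becomes saturate_cast<short>(200*2.0 - 100.0) == 300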
( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); - _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + v_store(dst + x, v_round(v_src1)); + v_store(dst + x + cWidth, v_round(v_src2)); + } } - return x; } }; @@ -1601,60 +1374,19 @@ struct cvtScale_SIMD int operator () (const uchar * src, float * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - _mm_storeu_ps(dst + x, v_dst_0); - _mm_storeu_ps(dst + x + 4, v_dst_1); - } - - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); - - __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero); - __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); - __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); - _mm_storeu_pd(dst + x, v_dst_0); - _mm_storeu_pd(dst + x + 2, v_dst_1); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero); - v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); - v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); - _mm_storeu_pd(dst + x + 4, v_dst_0); - _mm_storeu_pd(dst + x + 6, v_dst_1); + v_store(dst + x, v_src1); + v_store(dst + x + cWidth, v_src2); + } } - return x; } }; @@ -1667,27 +1399,19 @@ struct cvtScale_SIMD int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = 
_mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_pack_u_store(dst + x, v_dst); + } } - return x; } }; @@ -1698,81 +1422,69 @@ struct cvtScale_SIMD int operator () (const schar * src, schar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store_low(dst + x, v_pack(v_dst, v_dst)); + } } - return x; } }; -#if CV_TRY_SSE4_1 - template <> struct cvtScale_SIMD { - cvtScale_SIMD() - { - haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1; - } - int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const { - if (haveSSE) + int x = 0; +#if CV_TRY_SSE4_1 + if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_s8u16f32_SSE41(src, dst, width, scale, shift); - else - return 0; - } +#endif + if (hasSIMD128()) + { + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - bool haveSSE; + v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } + } + return x; + } }; -#endif - template <> struct cvtScale_SIMD { int operator () (const schar * src, short * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src 
+ x))), 8); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } } - return x; } }; @@ -1783,26 +1495,19 @@ struct cvtScale_SIMD int operator () (const schar * src, int * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); - _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + v_store(dst + x, v_round(v_src1)); + v_store(dst + x + cWidth, v_round(v_src2)); + } } - return x; } }; @@ -1813,61 +1518,19 @@ struct cvtScale_SIMD int operator () (const schar * src, float * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - _mm_storeu_ps(dst + x, v_dst_0); - _mm_storeu_ps(dst + x + 4, v_dst_1); - } - - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))); - v_src = _mm_srai_epi16(v_src, 8); - - __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16); - __m128d 
v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); - __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); - _mm_storeu_pd(dst + x, v_dst_0); - _mm_storeu_pd(dst + x + 2, v_dst_1); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16); - v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); - v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); - _mm_storeu_pd(dst + x + 4, v_dst_0); - _mm_storeu_pd(dst + x + 6, v_dst_1); + v_store(dst + x, v_src1); + v_store(dst + x + cWidth, v_src2); + } } - return x; } }; @@ -1880,27 +1543,19 @@ struct cvtScale_SIMD int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_pack_u_store(dst + x, v_dst); + } } - return x; } }; @@ -1911,81 +1566,69 @@ struct cvtScale_SIMD int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store_low(dst + x, v_pack(v_dst, v_dst)); + } } - return x; } }; -#if CV_TRY_SSE4_1 - template <> struct cvtScale_SIMD { - cvtScale_SIMD() - { - haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1; - } - 
int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const { - if (haveSSE) + int x = 0; +#if CV_TRY_SSE4_1 + if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_u16u16f32_SSE41(src, dst, width, scale, shift); - else - return 0; - } +#endif + if (hasSIMD128()) + { + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - bool haveSSE; + v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } + } + return x; + } }; -#endif - template <> struct cvtScale_SIMD { int operator () (const ushort * src, short * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } } - return x; } }; @@ -1996,26 +1639,19 @@ struct cvtScale_SIMD int operator () (const ushort * src, int * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); - _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + v_store(dst + x, v_round(v_src1)); + v_store(dst + x + cWidth, v_round(v_src2)); + } } - return x; } }; @@ -2026,60 +1662,19 @@ struct cvtScale_SIMD int operator () (const ushort * src, float * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = 
_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - _mm_storeu_ps(dst + x, v_dst_0); - _mm_storeu_ps(dst + x + 4, v_dst_1); - } - - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - - __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero); - __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); - __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); - _mm_storeu_pd(dst + x, v_dst_0); - _mm_storeu_pd(dst + x + 2, v_dst_1); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero); - v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); - v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); - _mm_storeu_pd(dst + x + 4, v_dst_0); - _mm_storeu_pd(dst + x + 6, v_dst_1); + v_store(dst + x, v_src1); + v_store(dst + x + cWidth, v_src2); + } } - return x; } }; @@ -2092,27 +1687,19 @@ struct cvtScale_SIMD int operator () (const short * src, uchar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_pack_u_store(dst + x, v_dst); + } } - return x; } }; @@ -2123,111 +1710,69 @@ struct cvtScale_SIMD int operator () (const short * src, schar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 
16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store_low(dst + x, v_pack(v_dst, v_dst)); + } } - return x; } }; -#if CV_TRY_SSE4_1 - template <> struct cvtScale_SIMD { - cvtScale_SIMD() - { - haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1; - } - int operator () (const short * src, ushort * dst, int width, float scale, float shift) const { - if (haveSSE) + int x = 0; +#if CV_TRY_SSE4_1 + if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_s16u16f32_SSE41(src, dst, width, scale, shift); - else - return 0; - } - - bool haveSSE; -}; - #endif - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); + v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } } - return x; } }; template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const short * src, int * dst, int width, float scale, float shift) const + int operator () (const short * src, short * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, 
v_src2); - _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); - _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } } - return x; } }; @@ -2238,60 +1783,19 @@ struct cvtScale_SIMD int operator () (const short * src, float * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - _mm_storeu_ps(dst + x, v_dst_0); - _mm_storeu_ps(dst + x + 4, v_dst_1); - } - - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - - __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16); - __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); - __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); - _mm_storeu_pd(dst + x, v_dst_0); - _mm_storeu_pd(dst + x + 2, v_dst_1); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16); - v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); - v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); - _mm_storeu_pd(dst + x + 4, v_dst_0); - _mm_storeu_pd(dst + x + 6, v_dst_1); + v_store(dst + x, v_src1); + v_store(dst + x + cWidth, v_src2); + } } - return x; } }; @@ -2304,26 +1808,19 @@ struct cvtScale_SIMD int operator () (const int * src, uchar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); - - v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i 
*)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_pack_u_store(dst + x, v_dst); + } } - return x; } }; @@ -2334,108 +1831,92 @@ struct cvtScale_SIMD int operator () (const int * src, schar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); - - v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store_low(dst + x, v_pack(v_dst, v_dst)); + } } - return x; } }; -#if CV_TRY_SSE4_1 - template <> struct cvtScale_SIMD { - cvtScale_SIMD() - { - haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1; - } - int operator () (const int * src, ushort * dst, int width, float scale, float shift) const { - if (haveSSE) + int x = 0; +#if CV_TRY_SSE4_1 + if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_s32u16f32_SSE41(src, dst, width, scale, shift); - else - return 0; - } +#endif + if (hasSIMD128()) + { + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - bool haveSSE; + v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } + } + return x; + } }; -#endif - template <> struct cvtScale_SIMD { int operator () (const int * src, short * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); - - v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } } - return x; } }; +#if CV_SIMD128_64F template <> struct cvtScale_SIMD { int operator () (const int * src, int * dst, int width, double scale, double shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128d v_scale = _mm_set1_pd(scale), v_shift = 
_mm_set1_pd(shift); - - for ( ; x <= width - 4; x += 4) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); - - v_src = _mm_srli_si128(v_src, 8); - __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); - - __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_1))); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst)); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + double v_srcbuf[] = { (double)src[x], (double)src[x+1], (double)src[x+2], (double)src[x+3] }; + v_float64x2 v_src1 = v_shift + v_scale * v_load(v_srcbuf); + v_float64x2 v_src2 = v_shift + v_scale * v_load(v_srcbuf + 2); + v_store(dst + x, v_combine_low(v_round(v_src1), v_round(v_src2))); + } } - return x; } }; @@ -2446,84 +1927,44 @@ struct cvtScale_SIMD int operator () (const int * src, float * dst, int width, double scale, double shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - - for ( ; x <= width - 4; x += 4) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); - - v_src = _mm_srli_si128(v_src, 8); - __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); - - _mm_storeu_ps(dst + x, _mm_movelh_ps(_mm_cvtpd_ps(v_dst_0), - _mm_cvtpd_ps(v_dst_1))); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + double v_srcbuf[] = { (double)src[x], (double)src[x+1], (double)src[x+2], (double)src[x+3] }; + v_float64x2 v_src1 = v_shift + v_scale * v_load(v_srcbuf); + v_float64x2 v_src2 = v_shift + v_scale * v_load(v_srcbuf + 2); + v_store(dst + x, v_combine_low(v_cvt_f32(v_src1), v_cvt_f32(v_src2))); + } } - return x; } }; +#endif //CV_SIMD128_64F + +// from float template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const int * src, double * dst, int width, double scale, double shift) const + int operator () (const float * src, uchar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - - for ( ; x <= width - 4; x += 4) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); - - v_src = _mm_srli_si128(v_src, 8); - __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); + v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - _mm_storeu_pd(dst + x, v_dst_0); - _mm_storeu_pd(dst + x + 2, v_dst_1); + v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); + v_pack_u_store(dst + x, v_dst); + } } - - return x; - } -}; - -// from float - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, uchar * dst, int width, float scale, float shift) const - { - 
int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128 v_src = _mm_loadu_ps(src + x); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_loadu_ps(src + x + 4); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); - } - return x; } }; @@ -2534,78 +1975,69 @@ struct cvtScale_SIMD int operator () (const float * src, schar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128 v_src = _mm_loadu_ps(src + x); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_loadu_ps(src + x + 4); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); + v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); + v_store_low(dst + x, v_pack(v_dst, v_dst)); + } } - return x; } }; -#if CV_TRY_SSE4_1 - template <> struct cvtScale_SIMD { - cvtScale_SIMD() - { - haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1; - } - int operator () (const float * src, ushort * dst, int width, float scale, float shift) const { - if (haveSSE) + int x = 0; +#if CV_TRY_SSE4_1 + if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_f32u16f32_SSE41(src, dst, width, scale, shift); - else - return 0; - } +#endif + if (hasSIMD128()) + { + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); + v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - bool haveSSE; + v_uint16x8 v_dst = v_pack_u(v_round(v_dst1), v_round(v_dst2)); + v_store(dst + x, v_dst); + } + } + return x; + } }; -#endif - template <> struct cvtScale_SIMD { int operator () (const float * src, short * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128 v_src = _mm_loadu_ps(src + x); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_loadu_ps(src + x + 4); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); + v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i 
*)(dst + x), v_dst); + v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); + v_store(dst + x, v_dst); + } } - return x; } }; @@ -2616,24 +2048,13 @@ struct cvtScale_SIMD int operator () (const float * src, int * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128 v_src = _mm_loadu_ps(src + x); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_loadu_ps(src + x + 4); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); - _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth; x += cWidth) + v_store(dst + x, v_round(v_load(src + x) * v_scale + v_shift)); } - return x; } }; @@ -2644,49 +2065,55 @@ struct cvtScale_SIMD int operator () (const float * src, float * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 4; x += 4) + if (hasSIMD128()) { - __m128 v_src = _mm_loadu_ps(src + x); - __m128 v_dst = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - _mm_storeu_ps(dst + x, v_dst); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth; x += cWidth) + v_store(dst + x, v_load(src + x) * v_scale + v_shift); } - return x; } }; -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, double * dst, int width, double scale, double shift) const - { - int x = 0; +#if CV_SIMD128_64F - if (!USE_SSE2) - return x; +static inline void v_load_scale_shift(const double* src, const v_float64x2& v_scale, const v_float64x2 &v_shift, v_float32x4& v_dst1, v_float32x4 &v_dst2) +{ + int cWidth = v_float64x2::nlanes; + v_float64x2 v_src1 = v_shift + v_scale * v_load(src); + v_float64x2 v_src2 = v_shift + v_scale * v_load(src + cWidth); + v_float64x2 v_src3 = v_shift + v_scale * v_load(src + cWidth * 2); + v_float64x2 v_src4 = v_shift + v_scale * v_load(src + cWidth * 3); + v_dst1 = v_combine_low(v_cvt_f32(v_src1), v_cvt_f32(v_src2)); + v_dst2 = v_combine_low(v_cvt_f32(v_src3), v_cvt_f32(v_src4)); +} - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); +static inline void v_store_scale_shift_s32_to_f64(double *dst, const v_float64x2 &v_scale, const v_float64x2 &v_shift, const v_int32x4 &v1, const v_int32x4 &v2) +{ + v_float64x2 v_dst1 = v_shift + v_scale * v_cvt_f64(v1); + v_float64x2 v_dst2 = v_shift + v_scale * v_cvt_f64_high(v1); + v_float64x2 v_dst3 = v_shift + v_scale * v_cvt_f64(v2); + v_float64x2 v_dst4 = v_shift + v_scale * v_cvt_f64_high(v2); - for ( ; x <= width - 4; x += 4) - { - __m128 v_src = _mm_loadu_ps(src + x); - __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift); - v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)); - __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift); + v_store(dst, v_dst1); + v_store(dst + v_float64x2::nlanes, v_dst2); + v_store(dst + v_float64x2::nlanes * 2, v_dst3); + v_store(dst + v_float64x2::nlanes * 3, v_dst4); +} - _mm_storeu_pd(dst + x, v_dst_0); - _mm_storeu_pd(dst + x + 2, v_dst_1); - } 
+static inline void v_store_scale_shift_f32_to_f64(double *dst, const v_float64x2 &v_scale, const v_float64x2 &v_shift, const v_float32x4 &v1, const v_float32x4 &v2) +{ + v_float64x2 v_dst1 = v_shift + v_scale * v_cvt_f64(v1); + v_float64x2 v_dst2 = v_shift + v_scale * v_cvt_f64_high(v1); + v_float64x2 v_dst3 = v_shift + v_scale * v_cvt_f64(v2); + v_float64x2 v_dst4 = v_shift + v_scale * v_cvt_f64_high(v2); - return x; - } -}; + v_store(dst, v_dst1); + v_store(dst + v_float64x2::nlanes, v_dst2); + v_store(dst + v_float64x2::nlanes * 2, v_dst3); + v_store(dst + v_float64x2::nlanes * 3, v_dst4); +} // from double @@ -2696,28 +2123,17 @@ struct cvtScale_SIMD int operator () (const double * src, uchar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_float32x4 v_dst1, v_dst2; + v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); + v_pack_u_store(dst + x, v_pack(v_round(v_dst1), v_round(v_dst2))); + } } - return x; } }; @@ -2728,82 +2144,66 @@ struct cvtScale_SIMD int operator () (const double * src, schar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_float32x4 v_dst1, v_dst2; + v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); + v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); + v_pack_store(dst + x, v_dst); + } } - return x; } }; -#if CV_TRY_SSE4_1 - template <> struct cvtScale_SIMD { - cvtScale_SIMD() - { - haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1; - } - int operator () (const double * src, ushort * dst, int width, float scale, float shift) const { - if (haveSSE) + int x = 0; +#if CV_TRY_SSE4_1 + if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_f64u16f32_SSE41(src, dst, width, scale, shift); - else - return 0; +#endif + if 
(hasSIMD128()) + { + v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_dst1, v_dst2; + v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); + v_uint16x8 v_dst = v_pack_u(v_round(v_dst1), v_round(v_dst2)); + v_store(dst + x, v_dst); + } + } + return x; } - - bool haveSSE; }; -#endif - template <> struct cvtScale_SIMD { int operator () (const double * src, short * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); + v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_dst1, v_dst2; + v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); + v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); + v_store(dst + x, v_dst); + } } - return x; } }; @@ -2814,26 +2214,18 @@ struct cvtScale_SIMD int operator () (const double * src, int * dst, int width, double scale, double shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - - for ( ; x <= width - 4; x += 4) + if (hasSIMD128()) { - __m128d v_src = _mm_loadu_pd(src + x); - __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); - - v_src = _mm_loadu_pd(src + x + 2); - __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); - - __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst1))); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float64x2 v_src1 = v_shift + v_scale * v_load(src + x); + v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth); - _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst)); + v_store(dst + x, v_combine_low(v_round(v_src1), v_round(v_src2))); + } } - return x; } }; @@ -2844,1597 +2236,1182 @@ struct cvtScale_SIMD int operator () (const double * src, float * dst, int width, double scale, double shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - - for ( ; x <= width - 4; x += 4) + if (hasSIMD128()) { - __m128d v_src = _mm_loadu_pd(src + x); - __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); - - v_src = _mm_loadu_pd(src + x + 2); - __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); - - __m128 v_dst = _mm_movelh_ps(_mm_cvtpd_ps(v_dst0), - _mm_cvtpd_ps(v_dst1)); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float64x2 v_src1 = v_shift + v_scale * v_load(src + 
x); + v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth); + v_float32x4 v_dst1 = v_cvt_f32(v_src1); + v_float32x4 v_dst2 = v_cvt_f32(v_src2); - _mm_storeu_ps(dst + x, v_dst); + v_store(dst + x, v_combine_low(v_dst1, v_dst2)); + } } - return x; } }; +// to double + template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const double * src, double * dst, int width, double scale, double shift) const + int operator () (const uchar * src, double * dst, int width, double scale, double shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - - for ( ; x <= width - 2; x += 2) + if (hasSIMD128()) { - __m128d v_src = _mm_loadu_pd(src + x); - __m128d v_dst = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); - _mm_storeu_pd(dst + x, v_dst); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_uint32x4 v_src1, v_src2; + v_expand(v_load_expand(src + x), v_src1, v_src2); + v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift + , v_reinterpret_as_s32(v_src1), v_reinterpret_as_s32(v_src2)); + } } - return x; } }; -#elif CV_NEON - -// from uchar - template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const + int operator () (const schar * src, double * dst, int width, double scale, double shift) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_int32x4 v_src1, v_src2; + v_expand(v_load_expand(src + x), v_src1, v_src2); + v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); + } } - return x; } }; template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const + int operator () (const ushort * src, double * dst, int width, double scale, double shift) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_uint32x4 v_src1, v_src2; + v_expand(v_load(src + x), v_src1, v_src2); + v_store_scale_shift_s32_to_f64(dst + x, v_scale, 
v_shift + , v_reinterpret_as_s32(v_src1), v_reinterpret_as_s32(v_src2)); + } } - return x; } }; template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const + int operator () (const short * src, double * dst, int width, double scale, double shift) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_int32x4 v_src1, v_src2; + v_expand(v_load(src + x), v_src1, v_src2); + v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); + } } - return x; } }; template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const uchar * src, short * dst, int width, float scale, float shift) const + int operator () (const int * src, double * dst, int width, double scale, double shift) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int32x4 v_src1 = v_load(src + x); + v_int32x4 v_src2 = v_load(src + x + cWidth); + v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); + } } - return x; } }; template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const uchar * src, int * dst, int width, float scale, float shift) const + int operator () (const float * src, double * dst, int width, double scale, double shift) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1)); - vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2)); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src1 = v_load(src + x); + v_float32x4 v_src2 = v_load(src + x + cWidth); + v_store_scale_shift_f32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); + } } - return x; } }; template <> -struct 
cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const uchar * src, float * dst, int width, float scale, float shift) const + int operator () (const double * src, double * dst, int width, double scale, double shift) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); - vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift)); - vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift)); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float64x2 v_src1 = v_shift + v_scale * v_load(src + x); + v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth); + v_store(dst + x, v_src1); + v_store(dst + x + cWidth, v_src2); + } } - return x; } }; +#endif +#endif -// from schar - -template <> -struct cvtScale_SIMD +template static void +cvtScale_( const T* src, size_t sstep, + DT* dst, size_t dstep, Size size, + WT scale, WT shift ) { - int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const + sstep /= sizeof(src[0]); + dstep /= sizeof(dst[0]); + + cvtScale_SIMD vop; + + for( ; size.height--; src += sstep, dst += dstep ) { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + int x = vop(src, dst, size.width, scale, shift); - for ( ; x <= width - 8; x += 8) + #if CV_ENABLE_UNROLLED + for( ; x <= size.width - 4; x += 4 ) { - int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); + DT t0, t1; + t0 = saturate_cast
<DT>(src[x]*scale + shift); + t1 = saturate_cast
<DT>(src[x+1]*scale + shift); + dst[x] = t0; dst[x+1] = t1; + t0 = saturate_cast
<DT>(src[x+2]*scale + shift); + t1 = saturate_cast
<DT>(src[x+3]*scale + shift); + dst[x+2] = t0; dst[x+3] = t1; + } + #endif - return x; + for( ; x < size.width; x++ ) + dst[x] = saturate_cast<DT>
(src[x]*scale + shift); } -}; +} -template <> -struct cvtScale_SIMD +template<> void +cvtScale_( const short* src, size_t sstep, + int* dst, size_t dstep, Size size, + float scale, float shift ) { - int operator () (const schar * src, schar * dst, int width, float scale, float shift) const + sstep /= sizeof(src[0]); + dstep /= sizeof(dst[0]); + + for( ; size.height--; src += sstep, dst += dstep ) { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + #if CV_TRY_AVX2 + if (CV_CPU_HAS_SUPPORT_AVX2) { - int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); + opt_AVX2::cvtScale_s16s32f32Line_AVX2(src, dst, scale, shift, size.width); + continue; } + #endif + #if CV_SIMD128 + if (hasSIMD128()) + { + v_float32x4 v_shift = v_setall_f32(shift); + v_float32x4 v_scale = v_setall_f32(scale); + int cWidth = v_int32x4::nlanes; + for (; x <= size.width - cWidth * 2; x += cWidth * 2) + { + v_int16x8 v_src = v_load(src + x); + v_int32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_float32x4 v_tmp1 = v_cvt_f32(v_src1); + v_float32x4 v_tmp2 = v_cvt_f32(v_src2); - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); - } - - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); - } + v_tmp1 = v_tmp1 * v_scale + v_shift; + v_tmp2 = v_tmp2 * v_scale + v_shift; - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, int * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - - 
vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1)); - vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2)); - } - - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, float * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); - vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift)); - vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift)); - } - - return x; - } -}; - -// from ushort - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src = vld1q_u16(src + x); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); + v_store(dst + x, v_round(v_tmp1)); + v_store(dst + x + cWidth, v_round(v_tmp2)); + } } + #endif - return x; + for(; x < size.width; x++ ) + dst[x] = saturate_cast(src[x]*scale + shift); } -}; +} -template <> -struct cvtScale_SIMD +template +struct Cvt_SIMD { - int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const + int operator() (const T *, DT *, int) const { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src = vld1q_u16(src + x); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); - } - - return x; + return 0; } }; -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src = vld1q_u16(src + x); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); - } - - return x; - } -}; +#if CV_SIMD128 +// from uchar template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const ushort * src, short * dst, int width, float scale, float shift) const + int operator() (const uchar * src, schar * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vld1q_u16(src + x); - float32x4_t v_dst1 = 
vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_int16x8 v_src = v_reinterpret_as_s16(v_load_expand(src + x)); + v_store_low(dst + x, v_pack(v_src, v_src)); + } } - return x; } }; template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const ushort * src, int * dst, int width, float scale, float shift) const + int operator() (const uchar * src, ushort * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vld1q_u16(src + x); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1)); - vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2)); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + v_store(dst + x, v_load_expand(src + x)); } - return x; } }; template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const ushort * src, float * dst, int width, float scale, float shift) const + int operator() (const uchar * src, short * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vld1q_u16(src + x); - vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift)); - vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift)); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_int16x8 v_src = v_reinterpret_as_s16(v_load_expand(src + x)); + v_store(dst + x, v_src); + } } - return x; } }; -// from short - template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const short * src, uchar * dst, int width, float scale, float shift) const + int operator() (const uchar * src, int * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int16x8_t v_src = vld1q_s16(src + x); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_uint16x8 v_src = v_load_expand(src + x); + v_uint32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_store(dst + x, v_reinterpret_as_s32(v_src1)); + v_store(dst + x + cWidth, v_reinterpret_as_s32(v_src2)); + } } - return x; } }; template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const short * src, schar * dst, int width, float scale, float shift) const + int operator() (const uchar * src, float * 
dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int16x8_t v_src = vld1q_s16(src + x); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_uint16x8 v_src = v_load_expand(src + x); + v_uint32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_store(dst + x, v_cvt_f32(v_reinterpret_as_s32(v_src1))); + v_store(dst + x + cWidth, v_cvt_f32(v_reinterpret_as_s32(v_src2))); + } } - return x; } }; -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src = vld1q_s16(src + x); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); - } - - return x; - } -}; +// from schar template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const short * src, float * dst, int width, float scale, float shift) const + int operator() (const schar * src, uchar * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int16x8_t v_src = vld1q_s16(src + x); - vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift)); - vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift)); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + v_pack_u_store(dst + x, v_load_expand(src + x)); } return x; } }; -// from int - template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const int * src, uchar * dst, int width, float scale, float shift) const + int operator() (const schar * src, short * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + v_store(dst + x, v_load_expand(src + x)); } - return x; } }; template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const int * src, schar * dst, int width, float scale, float shift) const + int operator() (const schar * src, ushort * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), 
v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_int16x8 v_src = v_load_expand(src + x); + v_int32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_store(dst + x, v_pack_u(v_src1, v_src2)); + } } - return x; } }; -template <> -struct cvtScale_SIMD -{ - int operator () (const int * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) - { - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); - } - - return x; - } -}; template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const int * src, short * dst, int width, float scale, float shift) const + int operator() (const schar * src, int * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int16x8 v_src = v_load_expand(src + x); + v_int32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_store(dst + x, v_src1); + v_store(dst + x + cWidth, v_src2); + } } - return x; } }; -// from float - template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const float * src, uchar * dst, int width, float scale, float shift) const + int operator() (const schar * src, float * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int16x8 v_src = v_load_expand(src + x); + v_int32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_store(dst + x, v_cvt_f32(v_src1)); + v_store(dst + x + cWidth, v_cvt_f32(v_src2)); + } } - return x; } }; -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x 
<= width - 8; x += 8) - { - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); - } - - return x; - } -}; +// from ushort template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const float * src, ushort * dst, int width, float scale, float shift) const + int operator() (const ushort * src, uchar * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); + v_store(dst + x, v_pack(v_src1, v_src2)); + } } - return x; } }; template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const float * src, short * dst, int width, float scale, float shift) const + int operator() (const ushort * src, schar * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); + v_uint32x4 v_dst10, v_dst11, v_dst20, v_dst21; + v_expand(v_src1, v_dst10, v_dst11); + v_expand(v_src2, v_dst20, v_dst21); + + v_store(dst + x, v_pack( + v_pack(v_reinterpret_as_s32(v_dst10), v_reinterpret_as_s32(v_dst11)), + v_pack(v_reinterpret_as_s32(v_dst20), v_reinterpret_as_s32(v_dst21)))); + } } - - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, int * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 4; x += 4) - vst1q_s32(dst + x, cv_vrndq_s32_f32(vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift))); - return x; } }; template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, float * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 4; x += 4) - vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift)); - - return x; - } -}; - -#endif - -template static void -cvtScale_( const T* src, size_t sstep, - DT* dst, size_t dstep, Size size, - WT scale, WT shift ) -{ - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - cvtScale_SIMD vop; - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = vop(src, dst, size.width, scale, shift); - - #if CV_ENABLE_UNROLLED - 
for( ; x <= size.width - 4; x += 4 ) - { - DT t0, t1; - t0 = saturate_cast
<DT>(src[x]*scale + shift); - t1 = saturate_cast
<DT>(src[x+1]*scale + shift); - dst[x] = t0; dst[x+1] = t1; - t0 = saturate_cast
<DT>(src[x+2]*scale + shift); - t1 = saturate_cast
<DT>(src[x+3]*scale + shift); - dst[x+2] = t0; dst[x+3] = t1; - } - #endif - - for( ; x < size.width; x++ ) - dst[x] = saturate_cast<DT>
(src[x]*scale + shift); - } -} - -//vz optimized template specialization -template<> void -cvtScale_( const short* src, size_t sstep, - short* dst, size_t dstep, Size size, - float scale, float shift ) -{ - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = 0; - #if CV_SSE2 - if(USE_SSE2) - { - __m128 scale128 = _mm_set1_ps (scale); - __m128 shift128 = _mm_set1_ps (shift); - for(; x <= size.width - 8; x += 8 ) - { - __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x)); - __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4)); - __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16)); - __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16)); - rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128); - rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128); - r0 = _mm_cvtps_epi32(rf0); - r1 = _mm_cvtps_epi32(rf1); - r0 = _mm_packs_epi32(r0, r1); - _mm_storeu_si128((__m128i*)(dst + x), r0); - } - } - #elif CV_NEON - float32x4_t v_shift = vdupq_n_f32(shift); - for(; x <= size.width - 8; x += 8 ) - { - int16x8_t v_src = vld1q_s16(src + x); - float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))); - float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))); - - v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift); - v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_tmp1)), - vqmovn_s32(cv_vrndq_s32_f32(v_tmp2)))); - } - #endif - - for(; x < size.width; x++ ) - dst[x] = saturate_cast(src[x]*scale + shift); - } -} - -template<> void -cvtScale_( const short* src, size_t sstep, - int* dst, size_t dstep, Size size, - float scale, float shift ) +struct Cvt_SIMD { - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) + int operator() (const ushort * src, short * dst, int width) const { int x = 0; - - #if CV_TRY_AVX2 - if (CV_CPU_HAS_SUPPORT_AVX2) - { - opt_AVX2::cvtScale_s16s32f32Line_AVX2(src, dst, scale, shift, size.width); - continue; - } - #endif - #if CV_SSE2 - if (USE_SSE2)//~5X + if (hasSIMD128()) { - __m128 scale128 = _mm_set1_ps (scale); - __m128 shift128 = _mm_set1_ps (shift); - for(; x <= size.width - 8; x += 8 ) + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) { - __m128i r0 = _mm_loadu_si128((const __m128i*)(src + x)); - - __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16)); - __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(r0, r0), 16)); - rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128); - rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128); - - _mm_storeu_si128((__m128i*)(dst + x), _mm_cvtps_epi32(rf0)); - _mm_storeu_si128((__m128i*)(dst + x + 4), _mm_cvtps_epi32(rf1)); + v_uint16x8 v_src = v_load(src + x); + v_uint32x4 v_dst0, v_dst1; + v_expand(v_src, v_dst0, v_dst1); + v_store(dst + x, v_pack(v_reinterpret_as_s32(v_dst0), v_reinterpret_as_s32(v_dst1))); } } - #elif CV_NEON - float32x4_t v_shift = vdupq_n_f32(shift); - for(; x <= size.width - 8; x += 8 ) - { - int16x8_t v_src = vld1q_s16(src + x); - float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))); - float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))); - - v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift); - v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift); - - vst1q_s32(dst + x, cv_vrndq_s32_f32(v_tmp1)); - vst1q_s32(dst + x + 4, 
cv_vrndq_s32_f32(v_tmp2)); - } - #endif - - for(; x < size.width; x++ ) - dst[x] = saturate_cast(src[x]*scale + shift); - } -} - -template -struct Cvt_SIMD -{ - int operator() (const T *, DT *, int) const - { - return 0; - } -}; - -#if CV_SSE2 - -// from double - -template <> -struct Cvt_SIMD -{ - int operator() (const double * src, uchar * dst, int width) const - { - int x = 0; - - if (!USE_SSE2) - return x; - - for ( ; x <= width - 8; x += 8) - { - __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); - __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); - __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); - __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); - - v_src0 = _mm_movelh_ps(v_src0, v_src1); - v_src1 = _mm_movelh_ps(v_src2, v_src3); - - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), - _mm_cvtps_epi32(v_src1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_dst)); - } - return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const double * src, schar * dst, int width) const + int operator() (const ushort * src, int * dst, int width) const { int x = 0; - - if (!USE_SSE2) - return x; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); - __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); - __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); - __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); - - v_src0 = _mm_movelh_ps(v_src0, v_src1); - v_src1 = _mm_movelh_ps(v_src2, v_src3); - - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), - _mm_cvtps_epi32(v_src1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_dst)); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_uint16x8 v_src = v_load(src + x); + v_uint32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_store(dst + x, v_reinterpret_as_s32(v_src1)); + v_store(dst + x + cWidth, v_reinterpret_as_s32(v_src2)); + } } - return x; } }; -#if CV_TRY_SSE4_1 - -template <> -struct Cvt_SIMD -{ - bool haveSIMD; - Cvt_SIMD() { haveSIMD = CV_CPU_HAS_SUPPORT_SSE4_1; } - - int operator() (const double * src, ushort * dst, int width) const - { - if (haveSIMD) - return opt_SSE4_1::Cvt_SIMD_f64u16_SSE41(src, dst, width); - else - return 0; - } -}; - -#endif // CV_SSE4_1 - template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const double * src, short * dst, int width) const + int operator() (const ushort * src, float * dst, int width) const { int x = 0; - - if (!USE_SSE2) - return x; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); - __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); - __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); - __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); - - v_src0 = _mm_movelh_ps(v_src0, v_src1); - v_src1 = _mm_movelh_ps(v_src2, v_src3); - - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), - _mm_cvtps_epi32(v_src1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_uint16x8 v_src = v_load(src + x); + v_uint32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_store(dst + x, v_cvt_f32(v_reinterpret_as_s32(v_src1))); + v_store(dst + x + cWidth, v_cvt_f32(v_reinterpret_as_s32(v_src2))); + } } - return x; } }; -template <> -struct Cvt_SIMD -{ - int operator() (const double * src, int * dst, int width) 
const - { - int x = 0; - - if (!USE_SSE2) - return x; - for ( ; x <= width - 4; x += 4) - { - __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); - __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); - v_src0 = _mm_movelh_ps(v_src0, v_src1); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_src0)); - } - - return x; - } -}; +// from short template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const double * src, float * dst, int width) const + int operator() (const short * src, uchar * dst, int width) const { int x = 0; - - if (!USE_SSE2) - return x; - - for ( ; x <= width - 4; x += 4) + if (hasSIMD128()) { - __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); - __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); - - _mm_storeu_ps(dst + x, _mm_movelh_ps(v_src0, v_src1)); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); + v_store(dst + x, v_pack_u(v_src1, v_src2)); + } } - - return x; - } -}; - - -#elif CV_NEON - -// from uchar - -template <> -struct Cvt_SIMD -{ - int operator() (const uchar * src, schar * dst, int width) const - { - int x = 0; - - for ( ; x <= width - 8; x += 8) - vst1_s8(dst + x, vqmovn_s16(vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x))))); - - return x; - } -}; - - -template <> -struct Cvt_SIMD -{ - int operator() (const uchar * src, ushort * dst, int width) const - { - int x = 0; - - for ( ; x <= width - 8; x += 8) - vst1q_u16(dst + x, vmovl_u8(vld1_u8(src + x))); - return x; } }; template <> -struct Cvt_SIMD -{ - int operator() (const uchar * src, short * dst, int width) const - { - int x = 0; - - for ( ; x <= width - 8; x += 8) - vst1q_s16(dst + x, vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x)))); - - return x; - } -}; - -template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const uchar * src, int * dst, int width) const + int operator() (const short * src, schar * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); - vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src)))); - vst1q_s32(dst + x + 4, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src)))); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); + v_store(dst + x, v_pack(v_src1, v_src2)); + } } - return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const uchar * src, float * dst, int width) const + int operator() (const short * src, ushort * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); - vst1q_f32(dst + x, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src)))); - vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src)))); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_int16x8 v_src = v_load(src + x); + v_int32x4 v_dst1, v_dst2; + v_expand(v_src, v_dst1, v_dst2); + v_store(dst + x, v_pack_u(v_dst1, v_dst2)); + } } - return x; } }; -// from schar - template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const schar * src, uchar * dst, int width) const + int operator() (const short * src, int * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) - vst1_u8(dst + x, vqmovun_s16(vmovl_s8(vld1_s8(src + x)))); - + if (hasSIMD128()) + { + int cWidth = 
v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int16x8 v_src = v_load(src + x); + v_int32x4 v_dst1, v_dst2; + v_expand(v_src, v_dst1, v_dst2); + v_store(dst + x, v_dst1); + v_store(dst + x + cWidth, v_dst2); + } + } return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const schar * src, short * dst, int width) const + int operator() (const short * src, float * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) - vst1q_s16(dst + x, vmovl_s8(vld1_s8(src + x))); - + if (hasSIMD128()) + { + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int16x8 v_src = v_load(src + x); + v_int32x4 v_dst1, v_dst2; + v_expand(v_src, v_dst1, v_dst2); + v_store(dst + x, v_cvt_f32(v_dst1)); + v_store(dst + x + cWidth, v_cvt_f32(v_dst2)); + } + } return x; } }; +// from int + template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const schar * src, ushort * dst, int width) const + int operator() (const int * src, uchar * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); - vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(vmovl_s16(vget_low_s16(v_src))), - vqmovun_s32(vmovl_s16(vget_high_s16(v_src))))); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); + v_int32x4 v_src3 = v_load(src + x + cWidth * 2), v_src4 = v_load(src + x + cWidth * 3); + v_uint16x8 v_dst1 = v_pack_u(v_src1, v_src2); + v_uint16x8 v_dst2 = v_pack_u(v_src3, v_src4); + v_store(dst + x, v_pack(v_dst1, v_dst2)); + } } - return x; } }; - template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const schar * src, int * dst, int width) const + int operator() (const int * src, schar * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); - vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src))); - vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src))); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); + v_int32x4 v_src3 = v_load(src + x + cWidth * 2), v_src4 = v_load(src + x + cWidth * 3); + v_int16x8 v_dst1 = v_pack(v_src1, v_src2); + v_int16x8 v_dst2 = v_pack(v_src3, v_src4); + v_store(dst + x, v_pack(v_dst1, v_dst2)); + } } - return x; } }; + template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const schar * src, float * dst, int width) const + int operator() (const int * src, ushort * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); - vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)))); - vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)))); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); + v_store(dst + x, v_pack_u(v_src1, v_src2)); + } } - return x; } }; -// from ushort - template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const ushort * src, uchar * dst, int width) const + int operator() (const int * src, short * dst, int width) const { int x = 0; - - for ( ; x <= width - 16; x += 16) + if (hasSIMD128()) { - uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = 
vld1q_u16(src + x + 8); - vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_src1), vqmovn_u16(v_src2))); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); + v_store(dst + x, v_pack(v_src1, v_src2)); + } } - return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const ushort * src, schar * dst, int width) const + int operator() (const int * src, float * dst, int width) const { int x = 0; - - for ( ; x <= width - 16; x += 16) + if (hasSIMD128()) { - uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8); - int32x4_t v_dst10 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src1))); - int32x4_t v_dst11 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src1))); - int32x4_t v_dst20 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src2))); - int32x4_t v_dst21 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src2))); - - vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst10), vqmovn_s32(v_dst11))), - vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst20), vqmovn_s32(v_dst21))))); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth; x += cWidth) + v_store(dst + x, v_cvt_f32(v_load(src + x))); } - return x; } }; +// from float + template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const ushort * src, short * dst, int width) const + int operator() (const float * src, uchar * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vld1q_u16(src + x); - int32x4_t v_dst0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src))); - int32x4_t v_dst1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src))); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1))); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_int32x4 v_src1 = v_round(v_load(src + x)); + v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); + v_int32x4 v_src3 = v_round(v_load(src + x + cWidth * 2)); + v_int32x4 v_src4 = v_round(v_load(src + x + cWidth * 3)); + v_uint16x8 v_dst1 = v_pack_u(v_src1, v_src2); + v_uint16x8 v_dst2 = v_pack_u(v_src3, v_src4); + v_store(dst + x, v_pack(v_dst1, v_dst2)); + } } - return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const ushort * src, int * dst, int width) const + int operator() (const float * src, schar * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vld1q_u16(src + x); - vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src)))); - vst1q_s32(dst + x + 4, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src)))); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_int32x4 v_src1 = v_round(v_load(src + x)); + v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); + v_int32x4 v_src3 = v_round(v_load(src + x + cWidth * 2)); + v_int32x4 v_src4 = v_round(v_load(src + x + cWidth * 3)); + v_int16x8 v_dst1 = v_pack(v_src1, v_src2); + v_int16x8 v_dst2 = v_pack(v_src3, v_src4); + v_store(dst + x, v_pack(v_dst1, v_dst2)); + } } - return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const ushort * src, float * dst, int width) const + int operator() (const float * src, ushort * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vld1q_u16(src + x); - vst1q_f32(dst + x, 
vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src)))); - vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src)))); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int32x4 v_src1 = v_round(v_load(src + x)); + v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); + v_store(dst + x, v_pack_u(v_src1, v_src2)); + } } - return x; } }; -// from short - template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const short * src, uchar * dst, int width) const + int operator() (const float * src, short * dst, int width) const { int x = 0; - - for ( ; x <= width - 16; x += 16) + if (hasSIMD128()) { - int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8); - vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_src1), vqmovun_s16(v_src2))); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int32x4 v_src1 = v_round(v_load(src + x)); + v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); + v_store(dst + x, v_pack(v_src1, v_src2)); + } } - return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const short * src, schar * dst, int width) const + int operator() (const float * src, int * dst, int width) const { int x = 0; - - for ( ; x <= width - 16; x += 16) + if (hasSIMD128()) { - int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8); - vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(v_src1), vqmovn_s16(v_src2))); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth; x += cWidth) + v_store(dst + x, v_round(v_load(src + x))); } - return x; } }; +#if CV_SIMD128_64F +// from double template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const short * src, ushort * dst, int width) const + int operator() (const double * src, uchar * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int16x8_t v_src = vld1q_s16(src + x); - uint16x4_t v_dst1 = vqmovun_s32(vmovl_s16(vget_low_s16(v_src))); - uint16x4_t v_dst2 = vqmovun_s32(vmovl_s16(vget_high_s16(v_src))); - vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2)); - } + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); + v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); + v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); + v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); + + v_src0 = v_combine_low(v_src0, v_src1); + v_src1 = v_combine_low(v_src2, v_src3); + v_int16x8 v_dst = v_pack(v_round(v_src0), v_round(v_src1)); + v_pack_u_store(dst + x, v_dst); + } + } return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const short * src, int * dst, int width) const + int operator() (const double * src, schar * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int16x8_t v_src = vld1q_s16(src + x); - vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src))); - vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src))); - } + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); + v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); + v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); + v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); + + v_src0 = v_combine_low(v_src0, v_src1); + v_src1 = v_combine_low(v_src2, v_src3); + v_int16x8 v_dst = v_pack(v_round(v_src0), 
v_round(v_src1)); + v_store_low(dst + x, v_pack(v_dst, v_dst)); + } + } return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const short * src, float * dst, int width) const + int operator() (const double * src, ushort * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) +#if CV_TRY_SSE4_1 + if (CV_CPU_HAS_SUPPORT_SSE4_1) + return opt_SSE4_1::Cvt_SIMD_f64u16_SSE41(src, dst, width); +#endif + if (hasSIMD128()) { - int16x8_t v_src = vld1q_s16(src + x); - vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)))); - vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)))); - } + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); + v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); + v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); + v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); + v_src0 = v_combine_low(v_src0, v_src1); + v_src1 = v_combine_low(v_src2, v_src3); + + v_uint16x8 v_dst = v_pack_u(v_round(v_src0), v_round(v_src1)); + v_store(dst + x, v_dst); + } + } return x; } }; -// from int - template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const int * src, uchar * dst, int width) const + int operator() (const double * src, short * dst, int width) const { int x = 0; - - for ( ; x <= width - 16; x += 16) + if (hasSIMD128()) { - int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4); - int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12); - uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2))); - uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src3), vqmovun_s32(v_src4))); - vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2)); - } + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); + v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); + v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); + v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); + v_src0 = v_combine_low(v_src0, v_src1); + v_src1 = v_combine_low(v_src2, v_src3); + + v_int16x8 v_dst = v_pack(v_round(v_src0), v_round(v_src1)); + v_store(dst + x, v_dst); + } + } return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const int * src, schar * dst, int width) const + int operator() (const double * src, int * dst, int width) const { int x = 0; - - for ( ; x <= width - 16; x += 16) + if (hasSIMD128()) { - int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4); - int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12); - int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2))); - int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4))); - vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2)); - } + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); + v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); + v_store(dst + x, v_round(v_combine_low(v_src0, v_src1))); + } + } return x; } }; - template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const int * src, ushort * dst, int width) const + int operator() (const double * src, float * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int32x4_t v_src1 = 
vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4); - vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2))); - } + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); + v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); + v_store(dst + x, v_combine_low(v_src0, v_src1)); + } + } return x; } }; +// to double + template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const int * src, short * dst, int width) const + int operator() (const uchar* src, double* dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4); - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2))); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_uint16x8 v_src = v_load_expand(src + x); + v_uint32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_store(dst + x, v_cvt_f64(v_reinterpret_as_s32(v_src1))); + v_store(dst + x + cWidth, v_cvt_f64_high(v_reinterpret_as_s32(v_src1))); + v_store(dst + x + cWidth * 2, v_cvt_f64(v_reinterpret_as_s32(v_src2))); + v_store(dst + x + cWidth * 3, v_cvt_f64_high(v_reinterpret_as_s32(v_src2))); + } } - return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const int * src, float * dst, int width) const + int operator() (const schar* src, double* dst, int width) const { int x = 0; - - for ( ; x <= width - 4; x += 4) - vst1q_f32(dst + x, vcvtq_f32_s32(vld1q_s32(src + x))); - + if (hasSIMD128()) + { + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_int16x8 v_src = v_load_expand(src + x); + v_int32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_store(dst + x, v_cvt_f64(v_src1)); + v_store(dst + x + cWidth, v_cvt_f64_high(v_src1)); + v_store(dst + x + cWidth * 2, v_cvt_f64(v_src2)); + v_store(dst + x + cWidth * 3, v_cvt_f64_high(v_src2)); + } + } return x; } }; -// from float - template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const float * src, uchar * dst, int width) const + int operator() (const ushort* src, double* dst, int width) const { int x = 0; - - for ( ; x <= width - 16; x += 16) + if (hasSIMD128()) { - uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x)); - uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4)); - uint32x4_t v_src3 = cv_vrndq_u32_f32(vld1q_f32(src + x + 8)); - uint32x4_t v_src4 = cv_vrndq_u32_f32(vld1q_f32(src + x + 12)); - uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2))); - uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src3), vqmovn_u32(v_src4))); - vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2)); - } + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_uint32x4 v_src = v_load_expand(src + x); + v_store(dst + x, v_cvt_f64(v_reinterpret_as_s32(v_src))); + v_store(dst + x + cWidth, v_cvt_f64_high(v_reinterpret_as_s32(v_src))); + } + } return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const float * src, schar * dst, int width) const + int operator() (const short* src, double* dst, int width) const { int x = 0; - - for ( ; x <= width - 16; x += 16) + if (hasSIMD128()) { - int32x4_t v_src1 = cv_vrndq_s32_f32(vld1q_f32(src + x)); - int32x4_t v_src2 = cv_vrndq_s32_f32(vld1q_f32(src + x + 4)); - int32x4_t v_src3 = cv_vrndq_s32_f32(vld1q_f32(src + x + 
8)); - int32x4_t v_src4 = cv_vrndq_s32_f32(vld1q_f32(src + x + 12)); - int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2))); - int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4))); - vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2)); - } + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int32x4 v_src = v_load_expand(src + x); + v_store(dst + x, v_cvt_f64(v_src)); + v_store(dst + x + cWidth, v_cvt_f64_high(v_src)); + } + } return x; } }; - template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const float * src, ushort * dst, int width) const + int operator() (const int* src, double* dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x)); - uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4)); - vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2))); - } + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int32x4 v_src = v_load(src + x); + v_store(dst + x, v_cvt_f64(v_src)); + v_store(dst + x + cWidth, v_cvt_f64_high(v_src)); + } + } return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const float * src, int * dst, int width) const + int operator() (const float* src, double* dst, int width) const { int x = 0; + if (hasSIMD128()) + { + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src = v_load(src + x); - for ( ; x <= width - 4; x += 4) - vst1q_s32(dst + x, cv_vrndq_s32_f32(vld1q_f32(src + x))); - + v_store(dst + x, v_cvt_f64(v_src)); + v_store(dst + x + cWidth, v_cvt_f64_high(v_src)); + } + } return x; } }; - -#endif +#endif // CV_SIMD128_64F +#endif // CV_SIMD128 // template for FP16 HW conversion function template static void @@ -4618,48 +3595,6 @@ cvt_( const T* src, size_t sstep, } } -//vz optimized template specialization, test Core_ConvertScale/ElemWiseTest -template<> void -cvt_( const float* src, size_t sstep, - short* dst, size_t dstep, Size size ) -{ - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = 0; - #if CV_SSE2 - if(USE_SSE2) - { - for( ; x <= size.width - 8; x += 8 ) - { - __m128 src128 = _mm_loadu_ps (src + x); - __m128i src_int128 = _mm_cvtps_epi32 (src128); - - src128 = _mm_loadu_ps (src + x + 4); - __m128i src1_int128 = _mm_cvtps_epi32 (src128); - - src1_int128 = _mm_packs_epi32(src_int128, src1_int128); - _mm_storeu_si128((__m128i*)(dst + x),src1_int128); - } - } - #elif CV_NEON - for( ; x <= size.width - 8; x += 8 ) - { - float32x4_t v_src1 = vld1q_f32(src + x), v_src2 = vld1q_f32(src + x + 4); - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_src1)), - vqmovn_s32(cv_vrndq_s32_f32(v_src2))); - vst1q_s16(dst + x, v_dst); - } - #endif - for( ; x < size.width; x++ ) - dst[x] = saturate_cast(src[x]); - } - -} - - template static void cpy_( const T* src, size_t sstep, T* dst, size_t dstep, Size size ) { diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp index ae5e4a7..c5b5ede 100644 --- a/modules/core/test/test_arithm.cpp +++ b/modules/core/test/test_arithm.cpp @@ -806,6 +806,11 @@ struct ConvertScaleAbsOp : public BaseElemWiseOp { cvtest::add(src[0], alpha, Mat(), 0, Scalar::all(gamma[0]), dst, CV_8UC(src[0].channels()), true); } + int getRandomType(RNG& rng) + { + return cvtest::randomType(rng, 
_OutputArray::DEPTH_MASK_ALL, 1,
+                                   ninputs > 1 ? ARITHM_MAX_CHANNELS : 4);
+    }
     double getMaxErr(int)
     {
         return 1;
-- 
2.7.4
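
The new Cvt_SIMD specializations above all share one shape: when 128-bit universal intrinsics are available, the loop converts as many whole registers as fit into width and returns x, and the caller's generic scalar loop finishes the tail with saturate_cast. The following sketch restates that pattern for the short -> uchar case as a self-contained function. It is illustrative only: the function name cvt_s16_to_u8_demo and the explicit cv:: qualifications are not part of this patch, and it assumes the universal-intrinsics header opencv2/core/hal/intrin.hpp.

#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

// Demo of the Cvt_SIMD pattern: SIMD body plus scalar tail (saturating s16 -> u8).
static void cvt_s16_to_u8_demo(const short* src, uchar* dst, int width)
{
    int x = 0;
#if CV_SIMD128
    if (cv::hasSIMD128())
    {
        const int cWidth = cv::v_int16x8::nlanes;        // 8 shorts per 128-bit register
        for (; x <= width - cWidth * 2; x += cWidth * 2) // two registers per iteration
        {
            cv::v_int16x8 v_src1 = cv::v_load(src + x);
            cv::v_int16x8 v_src2 = cv::v_load(src + x + cWidth);
            cv::v_store(dst + x, cv::v_pack_u(v_src1, v_src2)); // saturating narrow
        }
    }
#endif
    for (; x < width; x++)                               // scalar tail for the remainder
        dst[x] = cv::saturate_cast<uchar>(src[x]);
}

Because v_pack_u saturates the same way as saturate_cast<uchar>, the SIMD body and the scalar tail produce identical results.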
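
In the CV_SIMD128_64F block, a v_float64x2 holds only two lanes, so the double -> integer specializations above first convert two registers to float32, merge the halves with v_combine_low, and only then round and pack. A minimal sketch of the double -> int case, under the same assumptions as the previous example (demo-only function name, cv:: prefixes added so it compiles on its own):

#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

// Demo of the f64 -> s32 pattern: two v_float64x2 loads become one v_int32x4 store.
static void cvt_f64_to_s32_demo(const double* src, int* dst, int width)
{
    int x = 0;
#if CV_SIMD128 && CV_SIMD128_64F
    if (cv::hasSIMD128())
    {
        const int cWidth = cv::v_float64x2::nlanes;      // 2 doubles per register
        for (; x <= width - cWidth * 2; x += cWidth * 2) // 4 doubles per iteration
        {
            // v_cvt_f32(v_float64x2) fills only the low two float lanes,
            // so the two partial results are glued together with v_combine_low.
            cv::v_float32x4 v_lo = cv::v_cvt_f32(cv::v_load(src + x));
            cv::v_float32x4 v_hi = cv::v_cvt_f32(cv::v_load(src + x + cWidth));
            cv::v_store(dst + x, cv::v_round(cv::v_combine_low(v_lo, v_hi)));
        }
    }
#endif
    for (; x < width; x++)
        dst[x] = cv::saturate_cast<int>(src[x]);
}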
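
Widening to double runs the other way: each v_int32x4 (or expanded smaller type) is split into its low and high pairs with v_cvt_f64 and v_cvt_f64_high and stored as two v_float64x2 results, as in the "to double" specializations above. A sketch of the int -> double case, again illustrative rather than part of the patch:

#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

// Demo of the s32 -> f64 pattern: one v_int32x4 load becomes two v_float64x2 stores.
static void cvt_s32_to_f64_demo(const int* src, double* dst, int width)
{
    int x = 0;
#if CV_SIMD128 && CV_SIMD128_64F
    if (cv::hasSIMD128())
    {
        const int cWidth = cv::v_float64x2::nlanes;      // 2 doubles per register
        for (; x <= width - cWidth * 2; x += cWidth * 2) // one int register per iteration
        {
            cv::v_int32x4 v_src = cv::v_load(src + x);
            cv::v_store(dst + x, cv::v_cvt_f64(v_src));               // lanes 0..1
            cv::v_store(dst + x + cWidth, cv::v_cvt_f64_high(v_src)); // lanes 2..3
        }
    }
#endif
    for (; x < width; x++)
        dst[x] = (double)src[x];
}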