// Resize (bi-linear, 32F)
void calcRowLinear_32F(float *dst[],
- const float *src0[],
- const float *src1[],
- const float alpha[],
- const int mapsx[],
- const float beta[],
- const Size & inSz,
- const Size & outSz,
- int lpi) {
- bool xRatioEq1 = inSz.width == outSz.width;
- bool yRatioEq1 = inSz.height == outSz.height;
-
- if (!xRatioEq1 && !yRatioEq1) {
- for (int l = 0; l < lpi; l++) {
- float beta0 = beta[l];
- float beta1 = 1 - beta0;
-
- int x = 0;
-
- #if CV_SIMD128
- for (; x <= outSz.width - 4; x += 4) {
- v_float32x4 alpha0 = v_load(&alpha[x]);
- // v_float32x4 alpha1 = 1.f - alpha0;
-
- v_int32x4 sx = v_load(&mapsx[x]);
-
- v_float32x4 s0l, s0h, s00, s01;
- v_gather_pairs(src0[l], sx, s0l, s0h);
- v_deinterleave(s0l, s0h, s00, s01);
-
- // v_float32x4 res0 = s00*alpha0 + s01*alpha1;
- v_float32x4 res0 = v_fma(s00 - s01, alpha0, s01);
-
- v_float32x4 s1l, s1h, s10, s11;
- v_gather_pairs(src1[l], sx, s1l, s1h);
- v_deinterleave(s1l, s1h, s10, s11);
-
- // v_float32x4 res1 = s10*alpha0 + s11*alpha1;
- v_float32x4 res1 = v_fma(s10 - s11, alpha0, s11);
-
- // v_float32x4 d = res0*beta0 + res1*beta1;
- v_float32x4 d = v_fma(res0 - res1, beta0, res1);
-
- v_store(&dst[l][x], d);
- }
- #endif
-
- for (; x < outSz.width; x++) {
- float alpha0 = alpha[x];
- float alpha1 = 1 - alpha0;
- int sx0 = mapsx[x];
- int sx1 = sx0 + 1;
- float res0 = src0[l][sx0]*alpha0 + src0[l][sx1]*alpha1;
- float res1 = src1[l][sx0]*alpha0 + src1[l][sx1]*alpha1;
- dst[l][x] = beta0*res0 + beta1*res1;
- }
- }
-
- } else if (!xRatioEq1) {
- GAPI_DbgAssert(yRatioEq1);
-
- for (int l = 0; l < lpi; l++) {
- int x = 0;
-
- #if CV_SIMD128
- for (; x <= outSz.width - 4; x += 4) {
- v_float32x4 alpha0 = v_load(&alpha[x]);
- // v_float32x4 alpha1 = 1.f - alpha0;
-
- v_int32x4 sx = v_load(&mapsx[x]);
-
- v_float32x4 s0l, s0h, s00, s01;
- v_gather_pairs(src0[l], sx, s0l, s0h);
- v_deinterleave(s0l, s0h, s00, s01);
-
- // v_float32x4 d = s00*alpha0 + s01*alpha1;
- v_float32x4 d = v_fma(s00 - s01, alpha0, s01);
-
- v_store(&dst[l][x], d);
- }
- #endif
-
- for (; x < outSz.width; x++) {
- float alpha0 = alpha[x];
- float alpha1 = 1 - alpha0;
- int sx0 = mapsx[x];
- int sx1 = sx0 + 1;
- dst[l][x] = src0[l][sx0]*alpha0 + src0[l][sx1]*alpha1;
- }
- }
-
- } else if (!yRatioEq1) {
- GAPI_DbgAssert(xRatioEq1);
- int length = inSz.width; // == outSz.width
-
- for (int l = 0; l < lpi; l++) {
- float beta0 = beta[l];
- float beta1 = 1 - beta0;
-
- int x = 0;
-
- #if CV_SIMD128
- for (; x <= length - 4; x += 4) {
- v_float32x4 s0 = v_load(&src0[l][x]);
- v_float32x4 s1 = v_load(&src1[l][x]);
-
- // v_float32x4 d = s0*beta0 + s1*beta1;
- v_float32x4 d = v_fma(s0 - s1, beta0, s1);
-
- v_store(&dst[l][x], d);
- }
- #endif
-
- for (; x < length; x++) {
- dst[l][x] = beta0*src0[l][x] + beta1*src1[l][x];
- }
- }
-
- } else {
- GAPI_DbgAssert(xRatioEq1 && yRatioEq1);
- int length = inSz.width; // == outSz.width
- for (int l = 0; l < lpi; l++) {
- memcpy(dst[l], src0[l], length * sizeof(float));
- }
- }
+ const float *src0[],
+ const float *src1[],
+ const float alpha[],
+ const int mapsx[],
+ const float beta[],
+ const Size& inSz,
+ const Size& outSz,
+ int lpi) {
+ // Forward to the single-channel (C1) implementation; the public
+ // signature — and therefore every existing caller — is unchanged.
+ // NOTE(review): calcRowLinear_32FC1 must be declared before this point
+ // in the final file; confirm the ordering once the patch is applied.
+ calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
}
//------------------------------------------------------------------------------
}
}
+// Resize (bi-linear, 32FC1)
+// Bi-linear resize of `lpi` rows of single-channel 32F data.
+// For each output x: horizontally interpolate between the adjacent source
+// pair (mapsx[x], mapsx[x]+1) with weight alpha[x], then vertically blend
+// the two source rows src0/src1 with weight beta[line].
+// Degenerate ratios (input dim == output dim) take cheaper paths, down to
+// a plain memcpy when both dimensions are unchanged.
+// NOTE(review): assumes mapsx[x]+1 stays inside the source row — confirm
+// the map generator clamps the last column.
+static inline void calcRowLinear_32FC1(float *dst[],
+ const float *src0[],
+ const float *src1[],
+ const float alpha[],
+ const int mapsx[],
+ const float beta[],
+ const Size& inSz,
+ const Size& outSz,
+ int lpi) {
+ bool xRatioEq1 = inSz.width == outSz.width;
+ bool yRatioEq1 = inSz.height == outSz.height;
+
+#if CPU_SIMD
+ // Lane count of v_float32 for the enabled SIMD ISA.
+ const int nlanes = v_float32::nlanes;
+#endif
+
+ // Case 1: both dimensions are actually resized.
+ if (!xRatioEq1 && !yRatioEq1) {
+ for (int line = 0; line < lpi; ++line) {
+ float beta0 = beta[line];
+ float beta1 = 1 - beta0;
+
+ int x = 0;
+
+#if CPU_SIMD
+ for (; x <= outSz.width - nlanes; x += nlanes) {
+ v_float32 alpha0 = vx_load(&alpha[x]);
+ // v_float32 alpha1 = 1.f - alpha0;
+
+ v_float32 low1, high1, s00, s01;
+ v_gather_pairs(src0[line], mapsx, x, low1, high1);
+ v_deinterleave(low1, high1, s00, s01);
+
+ // v_float32 res0 = s00*alpha0 + s01*alpha1;
+ v_float32 res0 = v_fma(s00 - s01, alpha0, s01);
+
+ v_float32 low2, high2, s10, s11;
+ v_gather_pairs(src1[line], mapsx, x, low2, high2);
+ v_deinterleave(low2, high2, s10, s11);
+
+ // v_float32 res1 = s10*alpha0 + s11*alpha1;
+ v_float32 res1 = v_fma(s10 - s11, alpha0, s11);
+
+ // v_float32 d = res0*beta0 + res1*beta1;
+ v_float32 d = v_fma(res0 - res1, beta0, res1);
+
+ vx_store(&dst[line][x], d);
+ }
+#endif
+
+ // Scalar tail (and the whole row when CPU_SIMD is off).
+ for (; x < outSz.width; ++x) {
+ float alpha0 = alpha[x];
+ float alpha1 = 1 - alpha0;
+ int sx0 = mapsx[x];
+ int sx1 = sx0 + 1;
+ float res0 = src0[line][sx0] * alpha0 + src0[line][sx1] * alpha1;
+ float res1 = src1[line][sx0] * alpha0 + src1[line][sx1] * alpha1;
+ dst[line][x] = beta0 * res0 + beta1 * res1;
+ }
+ }
+
+ // Case 2: only horizontal resize (height unchanged): no vertical blend.
+ } else if (!xRatioEq1) {
+ GAPI_DbgAssert(yRatioEq1);
+
+ for (int line = 0; line < lpi; ++line) {
+ int x = 0;
+
+#if CPU_SIMD
+ for (; x <= outSz.width - nlanes; x += nlanes) {
+ v_float32 alpha0 = vx_load(&alpha[x]);
+ // v_float32 alpha1 = 1.f - alpha0;
+
+ v_float32 low, high, s00, s01;
+ v_gather_pairs(src0[line], mapsx, x, low, high);
+ v_deinterleave(low, high, s00, s01);
+
+ // v_float32 d = s00*alpha0 + s01*alpha1;
+ v_float32 d = v_fma(s00 - s01, alpha0, s01);
+
+ vx_store(&dst[line][x], d);
+ }
+#endif
+
+ // Scalar tail.
+ for (; x < outSz.width; ++x) {
+ float alpha0 = alpha[x];
+ float alpha1 = 1 - alpha0;
+ int sx0 = mapsx[x];
+ int sx1 = sx0 + 1;
+ dst[line][x] = src0[line][sx0] * alpha0 + src0[line][sx1] * alpha1;
+ }
+ }
+
+ // Case 3: only vertical resize (width unchanged): pure row blend.
+ } else if (!yRatioEq1) {
+ GAPI_DbgAssert(xRatioEq1);
+ int length = inSz.width; // == outSz.width
+
+ for (int line = 0; line < lpi; ++line) {
+ float beta0 = beta[line];
+ float beta1 = 1 - beta0;
+
+ int x = 0;
+
+#if CPU_SIMD
+ for (; x <= length - nlanes; x += nlanes) {
+ v_float32 s0 = vx_load(&src0[line][x]);
+ v_float32 s1 = vx_load(&src1[line][x]);
+
+ // v_float32 d = s0*beta0 + s1*beta1;
+ v_float32 d = v_fma(s0 - s1, beta0, s1);
+
+ vx_store(&dst[line][x], d);
+ }
+#endif
+
+ // Scalar tail.
+ for (; x < length; ++x) {
+ dst[line][x] = beta0 * src0[line][x] + beta1 * src1[line][x];
+ }
+ }
+
+ // Case 4: no resize at all: straight copy.
+ } else {
+ GAPI_DbgAssert(xRatioEq1 && yRatioEq1);
+ int length = inSz.width; // == outSz.width
+ for (int line = 0; line < lpi; ++line) {
+ memcpy(dst[line], src0[line], length * sizeof(float));
+ }
+ }
+}
+
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine
////////// Other math /////////
/** Some frequent operations **/
-#define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix) \
- inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
- { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \
- inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
- { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \
- inline _Tpvec v_sqrt(const _Tpvec& x) \
- { return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \
- inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
- { return v_fma(a, a, b * b); } \
- inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
+#define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix) \
+ static inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
+ { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val));} \
+ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
+ { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \
+ inline _Tpvec v_sqrt(const _Tpvec& x) \
+ { return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \
+ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
+ { return v_fma(a, a, b * b); } \
+ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_sqrt(v_fma(a, a, b*b)); }
OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps)
return v_uint8x32(_mm256_blend_epi16(_mm256_srli_si256(a.val, shift), b.val, mask));
}
+// Blend 32-bit elements of `a` with `b` byte-shifted LEFT by `shift`
+// (note: _mm256_slli_si256 shifts within each 128-bit lane, not across
+// the full 256-bit register); bit i of `mask` selects the shifted-b
+// element for lane i, zero keeps a's element.
+template<int mask, int shift>
+static inline __m256 v_blend_shiftleft(const v_float32x8& a, const v_float32x8& b)
+{
+ return _mm256_castsi256_ps(_mm256_blend_epi32(_mm256_castps_si256(a.val), _mm256_slli_si256(_mm256_castps_si256(b.val), shift), mask));
+}
+
+// Counterpart of v_blend_shiftleft: `a` is byte-shifted RIGHT by `shift`
+// (again per 128-bit lane) before blending with `b` under `mask`.
+template<int mask, int shift>
+static inline __m256 v_blend_shiftright(const v_float32x8& a, const v_float32x8& b)
+{
+ return _mm256_castsi256_ps(_mm256_blend_epi32(_mm256_srli_si256(_mm256_castps_si256(a.val), shift), _mm256_castps_si256(b.val), mask));
+}
+
static inline v_uint8x32 v_setr_s8(char b0, char b1, char b2, char b3, char b4,
char b5, char b6, char b7, char b8, char b9,
char b10, char b11, char b12, char b13, char b14,
{
__m256 tmp0 = _mm256_unpacklo_ps(low.val, high.val);
__m256 tmp1 = _mm256_unpackhi_ps(low.val, high.val);
- even.val = _mm256_unpacklo_ps(tmp0, tmp1);
- odd .val = _mm256_unpackhi_ps(tmp0, tmp1);
+ __m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1);
+ __m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1);
+ even.val = _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(tmp2), 216 /*11011000*/));
+ odd.val = _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(tmp3), 216 /*11011000*/));
}
static inline void v_deinterleave(const v_uint8x32& v0, const v_uint8x32& v1,
return r;
}
+// Gather 8 adjacent float pairs: for each index i = mapsx[x]..mapsx[x+7],
+// load the 64-bit pair (src[i], src[i+1]); the first four pairs fill
+// `low`, the last four fill `high` (consumed by v_deinterleave above).
+// NOTE(review): the reinterpret_cast from const float* to const int64_t*
+// is a strict-aliasing/alignment type-pun; a memcpy of 8 bytes would be
+// strictly conforming — confirm the supported compilers tolerate this.
+static inline void v_gather_pairs(const float src[], const int mapsx[], int x,
+ v_float32x8& low, v_float32x8& high) {
+ low.val = _mm256_castsi256_ps(_mm256_setr_epi64x(*reinterpret_cast<const int64_t*>(&src[mapsx[x + 0]]),
+ *reinterpret_cast<const int64_t*>(&src[mapsx[x + 1]]),
+ *reinterpret_cast<const int64_t*>(&src[mapsx[x + 2]]),
+ *reinterpret_cast<const int64_t*>(&src[mapsx[x + 3]])));
+ high.val = _mm256_castsi256_ps(_mm256_setr_epi64x(*reinterpret_cast<const int64_t*>(&src[mapsx[x + 4]]),
+ *reinterpret_cast<const int64_t*>(&src[mapsx[x + 5]]),
+ *reinterpret_cast<const int64_t*>(&src[mapsx[x + 6]]),
+ *reinterpret_cast<const int64_t*>(&src[mapsx[x + 7]])));
+}
+
namespace {
template<int chanNum>
static inline v_int16x16 v_gather_chan(const uchar src[], const v_int16x16& index, int channel, int pos) {
static inline void v_deinterleave(const v_float32x16& low, const v_float32x16& high,
v_float32x16& even, v_float32x16& odd)
{
- __m512 tmp0 = _mm512_unpacklo_ps(low.val, high.val);
- __m512 tmp1 = _mm512_unpackhi_ps(low.val, high.val);
- even.val = _mm512_unpacklo_ps(tmp0, tmp1);
- odd .val = _mm512_unpackhi_ps(tmp0, tmp1);
-}
-
-static inline void v_deinterleave(const v_uint8x64& i0, const v_uint8x64& i1,
- const v_uint8x64& i2, const v_uint8x64& i3,
- v_uint8x64& o0, v_uint8x64& o1,
- v_uint8x64& o2, v_uint8x64& o3)
-{
- __m512i u0 = i0.val; // a0 b0 c0 d0 a1 b1 c1 d1 ...
- __m512i u1 = i1.val; // a4 b4 c4 d4 ...
- __m512i u2 = i2.val; // a8 b8 c8 d8 ...
- __m512i u3 = i3.val; // a12 b12 c12 d12 ...
-
- __m512i v0 = _mm512_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
- __m512i v1 = _mm512_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
- __m512i v2 = _mm512_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
- __m512i v3 = _mm512_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...
-
- u0 = _mm512_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
- u1 = _mm512_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
- u2 = _mm512_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
- u3 = _mm512_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...
+ __m512i permute_mask1 = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+ __m512i permute_mask2 = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
- v0 = _mm512_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
- v1 = _mm512_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
- v2 = _mm512_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
- v3 = _mm512_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...
-
- o0.val = _mm512_unpacklo_epi8(v0, v1); // a0 a1 a2 a3 ...
- o1.val = _mm512_unpackhi_epi8(v0, v1); // b0 b1 b2 b3 ...
- o2.val = _mm512_unpacklo_epi8(v2, v3); // c0 c1 c2 c3 ...
- o3.val = _mm512_unpackhi_epi8(v2, v3); // d0 d1 d2 d3 ...
+ even.val = _mm512_permutex2var_ps(low.val, permute_mask1, high.val);
+ odd.val = _mm512_permutex2var_ps(low.val, permute_mask2, high.val);
}
static inline v_uint8x64 v_interleave_low(const v_uint8x64& a, const v_uint8x64& b)
static inline v_uint8x64 v_mask_blend_shiftleft(const v_uint8x64& a, const v_uint8x64& b)
{
return v_uint8x64(_mm512_mask_blend_epi16(mask,
- a.val, _mm512_bslli_epi128(b.val, shift)));
+ a.val, _mm512_bslli_epi128(b.val, shift)));
}
template<int mask, int shift>
static inline v_uint8x64 v_mask_blend_shiftright(const v_uint8x64& a, const v_uint8x64& b)
{
return v_uint8x64(_mm512_mask_blend_epi16(mask,
- _mm512_bsrli_epi128(a.val, shift), b.val));
+ _mm512_bsrli_epi128(a.val, shift), b.val));
}
static inline v_uint8x64 v_packus(const v_int16x32& a, const v_int16x32& b)
| ((uint32_t)((uint8_t)(b3)) << 3*8))
static inline v_uint8x64 v_setr_s8(char b0, char b1, char b2, char b3, char b4,
- char b5, char b6, char b7, char b8, char b9,
- char b10, char b11, char b12, char b13, char b14,
- char b15, char b16, char b17, char b18, char b19,
- char b20, char b21, char b22, char b23, char b24,
- char b25, char b26, char b27, char b28, char b29,
- char b30, char b31, char b32, char b33, char b34,
- char b35, char b36, char b37, char b38, char b39,
- char b40, char b41, char b42, char b43, char b44,
- char b45, char b46, char b47, char b48, char b49,
- char b50, char b51, char b52, char b53, char b54,
- char b55, char b56, char b57, char b58, char b59,
- char b60, char b61, char b62, char b63)
+ char b5, char b6, char b7, char b8, char b9,
+ char b10, char b11, char b12, char b13, char b14,
+ char b15, char b16, char b17, char b18, char b19,
+ char b20, char b21, char b22, char b23, char b24,
+ char b25, char b26, char b27, char b28, char b29,
+ char b30, char b31, char b32, char b33, char b34,
+ char b35, char b36, char b37, char b38, char b39,
+ char b40, char b41, char b42, char b43, char b44,
+ char b45, char b46, char b47, char b48, char b49,
+ char b50, char b51, char b52, char b53, char b54,
+ char b55, char b56, char b57, char b58, char b59,
+ char b60, char b61, char b62, char b63)
{
return v_uint8x64(_mm512_setr_epi32(word(b0, b1, b2, b3), word(b4, b5, b6, b7), word(b8, b9, b10, b11),
- word(b12, b13, b14, b15), word(b16, b17, b18, b19), word(b20, b21, b22, b23),
- word(b24, b25, b26, b27), word(b28, b29, b30, b31), word(b32, b33, b34, b35),
- word(b36, b37, b38, b39), word(b40, b41, b42, b43), word(b44, b45, b46, b47),
- word(b48, b49, b50, b51), word(b52, b53, b54, b55), word(b56, b57, b58, b59),
- word(b60, b61, b62, b63)));
+ word(b12, b13, b14, b15), word(b16, b17, b18, b19), word(b20, b21, b22, b23),
+ word(b24, b25, b26, b27), word(b28, b29, b30, b31), word(b32, b33, b34, b35),
+ word(b36, b37, b38, b39), word(b40, b41, b42, b43), word(b44, b45, b46, b47),
+ word(b48, b49, b50, b51), word(b52, b53, b54, b55), word(b56, b57, b58, b59),
+ word(b60, b61, b62, b63)));
}
static inline void v_deinterleave_expand(const v_uint8x64& src, v_int16x32& even, v_int16x32& odd)
-1, 46, -1, 48, -1, 50, -1, 52, -1, 54, -1,
56, -1, 58, -1, 60, -1, 62, -1);
- v_uint8x64 mask_odd = v_setr_s8(1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1,
- 13, -1, 15, -1, 17, -1, 19, -1, 21, -1, 23,
- -1, 25, -1, 27, -1, 29, -1, 31, -1, 33, -1,
- 35, -1, 37, -1, 39, -1, 41, -1, 43, -1, 45,
- -1, 47, -1, 49, -1, 51, -1, 53, -1, 55, -1,
- 57, -1, 59, -1, 61, -1, 63, -1);
+ v_uint8x64 mask_odd = v_setr_s8(1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1,
+ 13, -1, 15, -1, 17, -1, 19, -1, 21, -1, 23,
+ -1, 25, -1, 27, -1, 29, -1, 31, -1, 33, -1,
+ 35, -1, 37, -1, 39, -1, 41, -1, 43, -1, 45,
+ -1, 47, -1, 49, -1, 51, -1, 53, -1, 55, -1,
+ 57, -1, 59, -1, 61, -1, 63, -1);
even.val = _mm512_shuffle_epi8(src.val, mask_even.val);
- odd .val = _mm512_shuffle_epi8(src.val, mask_odd.val);
+ odd.val = _mm512_shuffle_epi8(src.val, mask_odd.val);
}
static inline v_uint64x8 v_set_s64(int b7, int b6, int b5, int b4, int b3, int b2, int b1, int b0)
return v_uint32x16(_mm512_set_epi32(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0));
}
+// Build a v_uint32x16 from 16 ints given in ascending lane order
+// (b1 -> lane 0 ... b16 -> lane 15, i.e. _mm512_setr_epi32 order).
+// Arguments are signed ints stored bit-for-bit into the unsigned vector.
+static inline v_uint32x16 v_setr_s32(int b1, int b2, int b3, int b4, int b5, int b6, int b7, int b8,
+ int b9, int b10, int b11, int b12, int b13, int b14, int b15, int b16)
+{
+ return v_uint32x16(_mm512_setr_epi32(b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15, b16));
+}
+
static inline v_uint8x64 v_shuffle_s8(const v_uint8x64& a, const v_uint8x64& mask)
{
return v_uint8x64(_mm512_shuffle_epi8(a.val, mask.val));
}
+
static inline v_int16x32 v_load_ccache_expand(const uchar* ptr)
{
return v_int16x32(_mm512_cvtepu8_epi16(_mm256_lddqu_si256((const __m256i*)ptr))); \
}
+
static inline __m512i v512_insert_epi16(__m512i& target, const ushort x, const int index)
{
return _mm512_mask_set1_epi16(target, 1UL << index, x);
}
+
static inline __m512i v512_insert_epi32(__m512i& target, const int32_t x, const int index)
{
return _mm512_mask_set1_epi32(target, 1UL << index, x);
}
+// Return a copy of `target` with 64-bit lane `index` (0..7) replaced by x;
+// despite the non-const reference, `target` itself is not modified.
+// Matches the v512_insert_epi16/epi32 siblings above.
+static inline __m512i v512_insert_epi64(__m512i& target, const int64_t x, const int index)
+{
+ return _mm512_mask_set1_epi64(target, 1UL << index, x);
+}
+
static inline void v_gather_channel(v_uint8x64& vec, const uint8_t tmp[], const short mapsx[],
- int chanNum, int c, int x, int shift)
+ int chanNum, int c, int x, int shift)
{
__m256i vec1 = _mm256_setzero_si256();
__m256i vec2 = _mm256_setzero_si256();
template <int index>
static inline int v512_extract_epi16(__m512i target)
{
- return (v512_extract_epi32<index/2>(target) >> (index % 2 ? 16 : 0)) & 0xFFFF;
+ return (v512_extract_epi32<index / 2>(target) >> (index % 2 ? 16 : 0)) & 0xFFFF;
}
static inline v_uint8x64 v_gather_pairs(const uchar src[], const v_int16x32& index) {
return r;
}
+// AVX-512 variant of v_gather_pairs: loads 16 adjacent float pairs
+// (src[mapsx[i]], src[mapsx[i]+1]) as 64-bit lanes — indices x..x+7 fill
+// `low`, x+8..x+15 fill `high`. All 8 lanes of each output are written,
+// so the incoming contents of low/high are irrelevant.
+// NOTE(review): same float* -> int64_t* type-pun caveat as the AVX2
+// version — a memcpy-based load would be strictly conforming.
+static inline void v_gather_pairs(const float src[], const int mapsx[], int x,
+ v_float32x16& low, v_float32x16& high) {
+ __m512i lo = _mm512_castps_si512(low.val);
+ __m512i hi = _mm512_castps_si512(high.val);
+
+ lo = v512_insert_epi64(lo, *reinterpret_cast<const int64_t*>(&src[mapsx[x]]), 0);
+ lo = v512_insert_epi64(lo, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 1]]), 1);
+ lo = v512_insert_epi64(lo, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 2]]), 2);
+ lo = v512_insert_epi64(lo, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 3]]), 3);
+ lo = v512_insert_epi64(lo, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 4]]), 4);
+ lo = v512_insert_epi64(lo, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 5]]), 5);
+ lo = v512_insert_epi64(lo, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 6]]), 6);
+ lo = v512_insert_epi64(lo, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 7]]), 7);
+
+ hi = v512_insert_epi64(hi, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 8]]), 0);
+ hi = v512_insert_epi64(hi, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 9]]), 1);
+ hi = v512_insert_epi64(hi, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 10]]), 2);
+ hi = v512_insert_epi64(hi, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 11]]), 3);
+ hi = v512_insert_epi64(hi, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 12]]), 4);
+ hi = v512_insert_epi64(hi, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 13]]), 5);
+ hi = v512_insert_epi64(hi, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 14]]), 6);
+ hi = v512_insert_epi64(hi, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 15]]), 7);
+
+ low.val = _mm512_castsi512_ps(lo);
+ high.val = _mm512_castsi512_ps(hi);
+}
+
namespace {
template<int chanNum>
static inline v_int16x32 v_gather_chan(const uchar src[], const v_int16x32& index, int channel, int pos) {