}
}
-static inline void insert64(v_uint8& val, const short mapsx[],
- uint8_t tmp[], const int& x, const int& shift) {
- val = v_insert64<0>(val, *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + shift + 0]]));
- val = v_insert64<1>(val, *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + shift + 1]]));
- val = v_insert64<2>(val, *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + shift + 2]]));
- val = v_insert64<3>(val, *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + shift + 3]]));
-}
-
static inline v_uint8 setHorizontalShufMask1() {
return v_setr_s8(0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15,
v_uint8 val_0, val_1, val_2, val_3, res1, res2;
constexpr int shift = 4;
v_uint8 shuf_mask1 = setHorizontalShufMask1();
- v_uint8 shuf_mask2 = setHorizontalShufMask2();;
+ v_uint8 shuf_mask2 = setHorizontalShufMask2();
+
v_uint32 idxs = v_setr_s32(0, 2, 4, 6, 1, 3, 5, 7);
for (int x = 0; x < length; ) {
v_int16 a54 = vx_load(&clone[4 * (x + 8)]);
v_int16 a76 = vx_load(&clone[4 * (x + 12)]);
- insert64(val_0, mapsx, tmp, x, 0);
- insert64(val_1, mapsx, tmp, x, shift);
- insert64(val_2, mapsx, tmp, x, shift*2);
- insert64(val_3, mapsx, tmp, x, shift*3);
-
+ v_setr64(val_0, val_1, val_2, val_3, mapsx, tmp, x, shift);
val_0 = v_permutevar8x32(val_0, idxs);
val_1 = v_permutevar8x32(val_1, idxs);
val_2 = v_permutevar8x32(val_2, idxs);
return v_uint8x32(_mm256_shuffle_epi8(a.val, mask.val));
}
+#if !defined(__GNUC__) || defined(__GNUC__) && defined(__x86_64)
+// Insert a 64-bit value into lane `index` (compile-time constant, 0..3) of
+// the AVX2 register held by `a`. Returns the raw __m256i rather than a
+// v_uint8x32 wrapper. Guarded because _mm256_insert_epi64 is only available
+// on 64-bit x86 targets under GCC (hence !__GNUC__ || __x86_64).
template<int index>
-static inline v_uint8x32 v_insert64(v_uint8x32& a, const int64_t& i)
+static inline __m256i v_insert64(v_uint8x32& a, const int64_t& i)
{
-    return v_uint8x32(_mm256_insert_epi64(a.val, i, index));
+    return _mm256_insert_epi64(a.val, i, index);
+}
+#endif
+
+// Fill val_0..val_3 with gathered pixel data: each register is built from four
+// 64-bit loads taken at byte offsets 4*mapsx[...] into tmp[], using the index
+// windows x+0..3, x+shift..shift+3, x+2*shift.., x+3*shift.. respectively.
+// Replaces the deleted per-lane insert64() helper with single
+// _mm256_setr_epi64x calls per register.
+// NOTE(review): the reinterpret_cast<int64_t*> reads from the uint8_t tmp[]
+// buffer are unaligned and violate strict aliasing; a memcpy into a local
+// int64_t would be the portable form — TODO confirm the supported compilers
+// tolerate this (the pattern is common in this intrinsics code).
+static inline void v_setr64(v_uint8x32& val_0, v_uint8x32& val_1,v_uint8x32& val_2, v_uint8x32& val_3, const short mapsx[],
+                            uint8_t tmp[], const int& x, const int& shift) {
+    val_0.val = _mm256_setr_epi64x(*reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 0]]),
+                                   *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 1]]),
+                                   *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 2]]),
+                                   *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 3]]));
+
+    val_1.val = _mm256_setr_epi64x(*reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + shift + 0]]),
+                                   *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + shift + 1]]),
+                                   *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + shift + 2]]),
+                                   *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + shift + 3]]));
+
+    val_2.val = _mm256_setr_epi64x(*reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 2*shift + 0]]),
+                                   *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 2*shift + 1]]),
+                                   *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 2*shift + 2]]),
+                                   *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 2*shift + 3]]));
+
+    val_3.val = _mm256_setr_epi64x(*reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 3 * shift + 0]]),
+                                   *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 3 * shift + 1]]),
+                                   *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 3 * shift + 2]]),
+                                   *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 3 * shift + 3]]));
}
static inline v_uint8x32 v_permutevar8x32(v_uint8x32& a, v_uint32x8& idxs)