From cc2195887aac0ff2cbfac9ab9e90d61120c1e3b7 Mon Sep 17 00:00:00 2001 From: Benjamin Segovia Date: Mon, 14 May 2012 02:36:38 +0000 Subject: [PATCH] Added support for byte registers in the simulator Added byte scatters / gathers in the simulator Added conversion instructions in the simulator --- backend/src/backend/sim/sim_vector.h | 722 +++++++++++++++++++++++------ backend/src/backend/sim/sim_vector_str.cpp | 722 +++++++++++++++++++++++------ backend/src/backend/sim_context.cpp | 93 +++- backend/src/ocl_stdlib_str.cpp | 8 +- backend/src/utest/utest_vector.cpp | 297 +++++++++++- 5 files changed, 1549 insertions(+), 293 deletions(-) diff --git a/backend/src/backend/sim/sim_vector.h b/backend/src/backend/sim/sim_vector.h index 111ff88..3fa5bbc 100644 --- a/backend/src/backend/sim/sim_vector.h +++ b/backend/src/backend/sim/sim_vector.h @@ -63,7 +63,10 @@ union scalar_dw { INLINE scalar_dw(uint32_t u) { this->u = u; } INLINE scalar_dw(int32_t s) { this->s = s; } INLINE scalar_dw(float f) { this->f = f; } - uint32_t u; int32_t s; float f; + uint32_t u; + int32_t s; + float f; + char data[4]; }; /*! Base structure for scalar word (16 bits) */ @@ -77,7 +80,27 @@ union scalar_w { x.u[1] = 0; return x.f; } - uint16_t u; int16_t s; + uint16_t u; + int16_t s; + char data[2]; +}; + +/*! Base structure for scalar byte (8 bits) */ +union scalar_b { + INLINE scalar_b(void) {} + INLINE scalar_b(uint8_t u) { this->u = u; } + INLINE scalar_b(int8_t s) { this->s = s; } + INLINE float toFloat(void) const { + union {uint8_t u[4]; float f;} x; + x.u[0] = u; + x.u[1] = 0; + x.u[2] = 0; + x.u[3] = 0; + return x.f; + } + uint8_t u; + int8_t s; + char data[1]; }; /*! Base structure for scalar mask */ @@ -115,6 +138,24 @@ struct simd_w { __m128 m[vectorNum]; }; +/*! Base structure for vectors 4 / 8 / 16 / 32 bytes. We do not store 16 bytes + * but only 4. This makes everything much simpler even if it is clearly slower + */ +template +struct simd_b { + INLINE simd_b(void) {} + INLINE simd_b(const scalar_b &s) { + const float f = s.toFloat(); + for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&f); + } + simd_b &operator= (const scalar_b &s) { + const float f = s.toFloat(); + for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&f); + return *this; + } + __m128 m[vectorNum]; +}; + /*! Base structure for 4 / 8 / 16 / 32 booleans (m stands for "mask") */ template struct simd_m { @@ -126,26 +167,17 @@ struct simd_m { }; /*! Select instruction on vectors */ -template -INLINE void select(simd_dw &dst, - const simd_dw &src0, - const simd_dw &src1, - const simd_m &mask) -{ - for (uint32_t i = 0; i < vectorNum; ++i) - dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]); -} -template -INLINE void select(simd_m &dst, - const simd_m &src0, - const simd_m &src1, +template class T> +INLINE void select(T &dst, + const T &src0, + const T &src1, const simd_m &mask) { for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]); } -/*! To cast through memory 32 bits values in sse registers */ +/*! To cast 32 bits values in sse registers through memory */ union cast_dw { INLINE cast_dw(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) { u[0] = u0; u[1] = u1; u[2] = u2; u[3] = u3; @@ -161,13 +193,14 @@ union cast_dw { INLINE cast_dw(void) {} __m128 v; __m128i vi; + char data[16]; uint32_t u[4]; int32_t s[4]; float f[4]; }; static const cast_dw allTrue(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff); -/*! To cast through memory 16 bits values in sse registers */ +/*! 
To cast 16 bits values in sse registers through memory */ union cast_w { INLINE cast_w(int16_t s0, int16_t s1, int16_t s2, int16_t s3) { s[0].v = s0; s[1].v = s1; s[2].v = s2; s[3].v = s3; @@ -182,10 +215,33 @@ union cast_w { INLINE cast_w(void) {} __m128 v; __m128i vi; + char data[16]; struct { uint16_t v; uint16_t pad; } u[4]; struct { int16_t v; int16_t pad; } s[4]; }; +/*! To cast 8 bits values in sse registers through memory */ +union cast_b { + INLINE cast_b(int8_t s0, int8_t s1, int8_t s2, int8_t s3) { + s[0].v = s0; s[1].v = s1; s[2].v = s2; s[3].v = s3; + for (uint32_t i = 0; i < 3; ++i) + s[0].pad[i] = s[1].pad[i] = s[2].pad[i] = s[3].pad[i] = 0; + } + INLINE cast_b(uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3) { + u[0].v = u0; u[1].v = u1; u[2].v = u2; u[3].v = u3; + for (uint32_t i = 0; i < 3; ++i) + s[0].pad[i] = s[1].pad[i] = s[2].pad[i] = s[3].pad[i] = 0; + } + INLINE cast_b(const __m128 &v) : v(v) {} + INLINE cast_b(const __m128i &vi) : vi(vi) {} + INLINE cast_b(void) {} + __m128 v; + __m128i vi; + char data[16]; + struct { uint8_t v; uint8_t pad[3]; } u[4]; + struct { int8_t v; int8_t pad[3]; } s[4]; +}; + /*! Make a mask true */ template INLINE void allTrueMask(simd_m &x) { @@ -203,12 +259,38 @@ typedef simd_w<1> simd4w; typedef simd_w<2> simd8w; typedef simd_w<4> simd16w; typedef simd_w<8> simd32w; +typedef scalar_b simd1b; +typedef simd_b<1> simd4b; +typedef simd_b<2> simd8b; +typedef simd_b<4> simd16b; +typedef simd_b<8> simd32b; typedef scalar_m simd1m; typedef simd_m<1> simd4m; typedef simd_m<2> simd8m; typedef simd_m<4> simd16m; typedef simd_m<8> simd32m; +/* Meta-programming crap to get the vector and the scalar types from the regular + * base C types + */ +template struct SimTypeTrait {}; + +#define DECL_TYPE_TRAIT(CTYPE, VECTOR_TYPE, SCALAR_TYPE, CAST_TYPE) \ +template \ +struct SimTypeTrait { \ + typedef VECTOR_TYPE Vector; \ + typedef SCALAR_TYPE Scalar; \ + typedef CAST_TYPE Cast; \ +}; +DECL_TYPE_TRAIT(uint8_t, simd_b, scalar_b, cast_b) +DECL_TYPE_TRAIT(int8_t, simd_b, scalar_b, cast_b) +DECL_TYPE_TRAIT(uint16_t, simd_w, scalar_w, cast_w) +DECL_TYPE_TRAIT(int16_t, simd_w, scalar_w, cast_w) +DECL_TYPE_TRAIT(uint32_t, simd_dw, scalar_dw, cast_dw) +DECL_TYPE_TRAIT(int32_t, simd_dw, scalar_dw, cast_dw) +DECL_TYPE_TRAIT(float, simd_dw, scalar_dw, cast_dw) +#undef DECL_TYPE_TRAIT + ////////////////////////////////////////////////////////////////////////////// // Vector instructions ////////////////////////////////////////////////////////////////////////////// @@ -218,11 +300,15 @@ INLINE uint32_t elemNum(const simd_dw &x) { return 4*vectorNum; } template -INLINE uint32_t elemNum(const simd_m &x) { +INLINE uint32_t elemNum(const simd_w &x) { return 4*vectorNum; } template -INLINE uint32_t elemNum(const simd_w &x) { +INLINE uint32_t elemNum(const simd_b &x) { + return 4*vectorNum; +} +template +INLINE uint32_t elemNum(const simd_m &x) { return 4*vectorNum; } @@ -255,6 +341,16 @@ INLINE void MOV_S16(simd_w &dst, const scalar_w &x) { const __m128 v = _mm_load1_ps(&f); for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v; } +template +INLINE void MOV_S8(simd_b &dst, const simd_b &v) { + for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v.m[i]; +} +template +INLINE void MOV_S8(simd_b &dst, const scalar_b &x) { + const float f = x.toFloat(); + const __m128 v = _mm_load1_ps(&f); + for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v; +} /* Vector instructions that use sse* */ #define VEC_OP(DST_TYPE, SRC_TYPE, SCALAR_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\ @@ 
-299,6 +395,12 @@ VEC_OP(simd_w, simd_w, scalar_w, SUB_S16, _mm_sub_epi16, VEC_OP(simd_w, simd_w, scalar_w, AND_S16, _mm_and_ps, ID, ID, ID); VEC_OP(simd_w, simd_w, scalar_w, OR_S16, _mm_or_ps, ID, ID, ID); VEC_OP(simd_w, simd_w, scalar_w, XOR_S16, _mm_xor_ps, ID, ID, ID); +VEC_OP(simd_m, simd_b, scalar_b, EQ_S8, _mm_cmpeq_epi32, SI2PS, PS2SI, PS2SI); +VEC_OP(simd_b, simd_b, scalar_b, ADD_S8, _mm_add_epi8, SI2PS, PS2SI, PS2SI); +VEC_OP(simd_b, simd_b, scalar_b, SUB_S8, _mm_sub_epi8, SI2PS, PS2SI, PS2SI); +VEC_OP(simd_b, simd_b, scalar_b, AND_S8, _mm_and_ps, ID, ID, ID); +VEC_OP(simd_b, simd_b, scalar_b, OR_S8, _mm_or_ps, ID, ID, ID); +VEC_OP(simd_b, simd_b, scalar_b, XOR_S8, _mm_xor_ps, ID, ID, ID); VEC_OP(simd_m, simd_m, scalar_m, AND_M, _mm_and_ps, ID, ID, ID); #undef VEC_OP @@ -348,18 +450,27 @@ template \ INLINE void NAME(DST_TYPE &dst, const SCALAR_TYPE &v0, const SCALAR_TYPE &v1) {\ NAME(dst, SRC_TYPE(v0), SRC_TYPE(v1));\ } +VEC_OP(simd_dw, simd_dw, scalar_dw, cast_dw, SHL_U32, <<, s[j]); VEC_OP(simd_dw, simd_dw, scalar_dw, cast_dw, MUL_S32, *, s[j]); VEC_OP(simd_dw, simd_dw, scalar_dw, cast_dw, DIV_S32, /, s[j]); VEC_OP(simd_dw, simd_dw, scalar_dw, cast_dw, REM_S32, %, s[j]); VEC_OP(simd_dw, simd_dw, scalar_dw, cast_dw, MUL_U32, *, u[j]); VEC_OP(simd_dw, simd_dw, scalar_dw, cast_dw, DIV_U32, /, u[j]); VEC_OP(simd_dw, simd_dw, scalar_dw, cast_dw, REM_U32, %, u[j]); +VEC_OP(simd_w, simd_w, scalar_w, cast_w, SHL_U16, <<, s[j].v); VEC_OP(simd_w, simd_w, scalar_w, cast_w, MUL_S16, *, s[j].v); VEC_OP(simd_w, simd_w, scalar_w, cast_w, DIV_S16, /, s[j].v); VEC_OP(simd_w, simd_w, scalar_w, cast_w, REM_S16, %, s[j].v); VEC_OP(simd_w, simd_w, scalar_w, cast_w, MUL_U16, *, u[j].v); VEC_OP(simd_w, simd_w, scalar_w, cast_w, DIV_U16, /, u[j].v); VEC_OP(simd_w, simd_w, scalar_w, cast_w, REM_U16, %, u[j].v); +VEC_OP(simd_b, simd_b, scalar_b, cast_b, SHL_U8, <<, s[j].v); +VEC_OP(simd_b, simd_b, scalar_b, cast_b, MUL_S8, *, s[j].v); +VEC_OP(simd_b, simd_b, scalar_b, cast_b, DIV_S8, /, s[j].v); +VEC_OP(simd_b, simd_b, scalar_b, cast_b, REM_S8, %, s[j].v); +VEC_OP(simd_b, simd_b, scalar_b, cast_b, MUL_U8, *, u[j].v); +VEC_OP(simd_b, simd_b, scalar_b, cast_b, DIV_U8, /, u[j].v); +VEC_OP(simd_b, simd_b, scalar_b, cast_b, REM_U8, %, u[j].v); #undef VEC_OP /* Vector compare vectors that require C */ @@ -398,6 +509,14 @@ VEC_OP(simd_m, simd_w, scalar_w, cast_w, LE_S16, <=, s[ VEC_OP(simd_m, simd_w, scalar_w, cast_w, LT_S16, <, s[j].v); VEC_OP(simd_m, simd_w, scalar_w, cast_w, GE_S16, >=, s[j].v); VEC_OP(simd_m, simd_w, scalar_w, cast_w, GT_S16, >, s[j].v); +VEC_OP(simd_m, simd_b, scalar_b, cast_b, LE_U8, <=, u[j].v); +VEC_OP(simd_m, simd_b, scalar_b, cast_b, LT_U8, <, u[j].v); +VEC_OP(simd_m, simd_b, scalar_b, cast_b, GE_U8, >=, u[j].v); +VEC_OP(simd_m, simd_b, scalar_b, cast_b, GT_U8, >, u[j].v); +VEC_OP(simd_m, simd_b, scalar_b, cast_b, LE_S8, <=, s[j].v); +VEC_OP(simd_m, simd_b, scalar_b, cast_b, LT_S8, <, s[j].v); +VEC_OP(simd_m, simd_b, scalar_b, cast_b, GE_S8, >=, s[j].v); +VEC_OP(simd_m, simd_b, scalar_b, cast_b, GT_S8, >, s[j].v); #undef VEC_OP /* Get NE from EQ */ @@ -452,6 +571,69 @@ INLINE void NE_S16(simd_m &dst, { NE_S16(dst, simd_w(v0), simd_w(v1)); } +template +INLINE void NE_S8(simd_m &dst, + const simd_b &v0, + const simd_b &v1) +{ + for (uint32_t i = 0; i < vectorNum; ++i) + dst.m[i] = _mm_xor_ps(allTrue.v, SI2PS(_mm_cmpeq_epi32(PS2SI(v0.m[i]), PS2SI(v1.m[i])))); +} +template +INLINE void NE_S8(simd_m &dst, + const simd_b &v0, + const scalar_b &v1) +{ + NE_S8(dst, v0, simd_b(v1)); +} 
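/* Illustrative sketch (editor's addition, not part of this patch): the byte
 * "vectors" above keep one byte per 32-bit SSE lane, so plain 32-bit
 * intrinsics can be reused. EQ_S8 maps onto _mm_cmpeq_epi32 and NE_S8 is
 * obtained by XOR-ing the EQ mask with the all-true constant, as the code
 * above does. The standalone SSE2 program below only demonstrates that
 * trick; main() and the lane values are made up for illustration. */
#include <emmintrin.h>   /* SSE2 intrinsics */
#include <cstdint>
#include <cstdio>

int main(void) {
  /* Four byte values, one per 32-bit lane (upper 24 bits stay zero) */
  const __m128i a = _mm_set_epi32(4, 3, 2, 1);
  const __m128i b = _mm_set_epi32(4, 0, 2, 1);
  /* EQ: a full 32-bit compare is correct because only the low byte is used */
  const __m128i eq = _mm_cmpeq_epi32(a, b);
  /* NE: XOR the EQ mask with all ones */
  const __m128i allTrue = _mm_set1_epi32(-1);
  const __m128i ne = _mm_xor_si128(eq, allTrue);
  int32_t lane[4];
  _mm_storeu_si128((__m128i*) lane, ne);
  for (int i = 0; i < 4; ++i)
    std::printf("lane %d: %s\n", i, lane[i] ? "not equal" : "equal");
  return 0;
}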
+template +INLINE void NE_S8(simd_m &dst, + const scalar_b &v0, + const simd_b &v1) +{ + NE_S8(dst, simd_b(v0), v1); +} +template +INLINE void NE_S8(simd_m &dst, + const scalar_b &v0, + const scalar_b &v1) +{ + NE_S8(dst, simd_b(v0), simd_b(v1)); +} + + +template class DstType, + template class SrcType> +INLINE void CVT(DstType &dst, const SrcType &src) +{ + for (uint32_t i = 0; i < vectorNum; ++i) { + const typename SimTypeTrait::Cast srcCast(src.m[i]); + const DstCType x0 = (DstCType) *(const SrcCType*) (srcCast.data + 0); + const DstCType x1 = (DstCType) *(const SrcCType*) (srcCast.data + 4); + const DstCType x2 = (DstCType) *(const SrcCType*) (srcCast.data + 8); + const DstCType x3 = (DstCType) *(const SrcCType*) (srcCast.data + 12); + const typename SimTypeTrait::Cast dstCast(x0, x1, x2, x3); + dst.m[i] = dstCast.v; + } +} + +template class DstType, + class SrcType> +INLINE void CVT(DstType &dst, const SrcType &src) +{ + for (uint32_t i = 0; i < vectorNum; ++i) { + const SrcCType from = *((SrcCType *) src.data); + const DstCType x = (DstCType) from; + const typename SimTypeTrait::Cast dstCast(x,x,x,x); + dst.m[i] = dstCast.v; + } +} /* Load from contiguous double words */ template @@ -492,6 +674,31 @@ INLINE void STORE(const simd_w &src, char *ptr) { } } +/* Load from contiguous bytes */ +template +INLINE void LOAD(simd_b &dst, const char *ptr) { + for (uint32_t i = 0; i < vectorNum; ++i) { + const uint8_t u0 = *((uint8_t*) ptr + 4*i + 0); + const uint8_t u1 = *((uint8_t*) ptr + 4*i + 1); + const uint8_t u2 = *((uint8_t*) ptr + 4*i + 2); + const uint8_t u3 = *((uint8_t*) ptr + 4*i + 3); + const cast_b w(u0,u1,u2,u3); + dst.m[i] = w.v; + } +} + +/* Store to contiguous bytes */ +template +INLINE void STORE(const simd_b &src, char *ptr) { + for (uint32_t i = 0; i < vectorNum; ++i) { + const cast_b w(src.m[i]); + *((uint8_t*) ptr + 4*i + 0) = w.u[0].v; + *((uint8_t*) ptr + 4*i + 1) = w.u[1].v; + *((uint8_t*) ptr + 4*i + 2) = w.u[2].v; + *((uint8_t*) ptr + 4*i + 3) = w.u[3].v; + } +} + /* Load immediates */ template INLINE void LOADI(simd_dw &dst, uint32_t u) { @@ -500,137 +707,328 @@ INLINE void LOADI(simd_dw &dst, uint32_t u) { for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = _mm_load1_ps(&cast.f); } - -/* Scatter */ template -INLINE void SCATTER(const simd_dw &offset, - const simd_dw &value, - char *base_address) { - for (uint32_t i = 0; i < vectorNum; ++i) { - const int v0 = _mm_extract_epi32(PS2SI(value.m[i]), 0); - const int v1 = _mm_extract_epi32(PS2SI(value.m[i]), 1); - const int v2 = _mm_extract_epi32(PS2SI(value.m[i]), 2); - const int v3 = _mm_extract_epi32(PS2SI(value.m[i]), 3); - const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]), 0); - const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1); - const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2); - const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3); - *(int*)(base_address + o0) = v0; - *(int*)(base_address + o1) = v1; - *(int*)(base_address + o2) = v2; - *(int*)(base_address + o3) = v3; - } +INLINE void LOADI(simd_w &dst, uint16_t u) { + union { uint32_t u; float f; } cast; + cast.u = u; + for (uint32_t i = 0; i < vectorNum; ++i) + dst.m[i] = _mm_load1_ps(&cast.f); } template -INLINE void SCATTER(const simd_dw &offset, - const scalar_dw &value, - char *base_address) { - SCATTER(offset, simd_dw(value), base_address); +INLINE void LOADI(simd_b &dst, uint8_t u) { + union { uint32_t u; float f; } cast; + cast.u = u; + for (uint32_t i = 0; i < vectorNum; ++i) + dst.m[i] = _mm_load1_ps(&cast.f); } -template -INLINE void 
SCATTER(const scalar_dw &offset, - const simd_dw &value, - char *base_address) { - SCATTER(simd_dw(offset), value, base_address); + +/* Scatter for bytes, shorts and integers */ +#define DECL_SCATTER(VECTOR_TYPE, SCALAR_TYPE, CTYPE, MASK) \ +template \ +INLINE void SCATTER(const simd_dw &address, \ + const VECTOR_TYPE &value, \ + char *base_address, \ + uint32_t offset = 0) \ +{ \ + for (uint32_t i = 0; i < vectorNum; ++i) { \ + const uint32_t v0 = _mm_extract_epi32(PS2SI(value.m[i]), 0) & MASK; \ + const uint32_t v1 = _mm_extract_epi32(PS2SI(value.m[i]), 1) & MASK; \ + const uint32_t v2 = _mm_extract_epi32(PS2SI(value.m[i]), 2) & MASK; \ + const uint32_t v3 = _mm_extract_epi32(PS2SI(value.m[i]), 3) & MASK; \ + const uint32_t o0 = _mm_extract_epi32(PS2SI(address.m[i]), 0) + offset; \ + const uint32_t o1 = _mm_extract_epi32(PS2SI(address.m[i]), 1) + offset; \ + const uint32_t o2 = _mm_extract_epi32(PS2SI(address.m[i]), 2) + offset; \ + const uint32_t o3 = _mm_extract_epi32(PS2SI(address.m[i]), 3) + offset; \ + *(CTYPE *)(base_address + o0) = v0; \ + *(CTYPE *)(base_address + o1) = v1; \ + *(CTYPE *)(base_address + o2) = v2; \ + *(CTYPE *)(base_address + o3) = v3; \ + } \ +} \ +template \ +INLINE void SCATTER(const simd_dw &address, \ + const SCALAR_TYPE &value, \ + char *base_address, \ + uint32_t offset = 0) \ +{ \ + SCATTER(address, VECTOR_TYPE(value), base_address, offset); \ +} \ +template \ +INLINE void SCATTER(const scalar_dw &address, \ + const VECTOR_TYPE &value, \ + char *base_address, \ + uint32_t offset = 0) \ +{ \ + SCATTER(simd_dw(address), value, base_address, offset); \ +} +DECL_SCATTER(simd_dw, scalar_dw, uint32_t, 0xffffffff) +DECL_SCATTER(simd_w, scalar_w, uint16_t, 0xffff) +DECL_SCATTER(simd_b, scalar_b, uint8_t, 0xff) +#undef DECL_SCATTER + +template +INLINE void SCATTER2(const T &address, + const U &value0, + const V &value1, + char *base_address) +{ + SCATTER(address, value0, base_address, 0); + SCATTER(address, value1, base_address, 4); +} +template +INLINE void SCATTER3(const T &address, + const U &value0, + const V &value1, + const W &value2, + char *base_address) +{ + SCATTER(address, value0, base_address, 0); + SCATTER(address, value1, base_address, 4); + SCATTER(address, value2, base_address, 8); +} +template +INLINE void SCATTER4(const T &address, + const U &value0, + const V &value1, + const W &value2, + const X &value3, + char *base_address) +{ + SCATTER(address, value0, base_address, 0); + SCATTER(address, value1, base_address, 4); + SCATTER(address, value2, base_address, 8); + SCATTER(address, value3, base_address, 12); } /* Masked scatter will only store unmasked lanes */ -template -INLINE void MASKED_SCATTER(const simd_dw &offset, - const simd_dw &value, - char *base_address, - uint32_t mask) +#define DECL_MASKED_SCATTER(VECTOR_TYPE, SCALAR_TYPE, CTYPE, MASK) \ +template \ +INLINE void MASKED_SCATTER(const simd_dw &address, \ + const VECTOR_TYPE &value, \ + char *base_address, \ + uint32_t mask, \ + uint32_t offset = 0) \ +{ \ + for (uint32_t i = 0; i < vectorNum; ++i) { \ + const uint32_t v0 = _mm_extract_epi32(PS2SI(value.m[i]), 0) & MASK; \ + const uint32_t v1 = _mm_extract_epi32(PS2SI(value.m[i]), 1) & MASK; \ + const uint32_t v2 = _mm_extract_epi32(PS2SI(value.m[i]), 2) & MASK; \ + const uint32_t v3 = _mm_extract_epi32(PS2SI(value.m[i]), 3) & MASK; \ + const uint32_t o0 = _mm_extract_epi32(PS2SI(address.m[i]), 0) + offset; \ + const uint32_t o1 = _mm_extract_epi32(PS2SI(address.m[i]), 1) + offset; \ + const uint32_t o2 = 
_mm_extract_epi32(PS2SI(address.m[i]), 2) + offset; \ + const uint32_t o3 = _mm_extract_epi32(PS2SI(address.m[i]), 3) + offset; \ + if (mask & 1) *(CTYPE *)(base_address + o0) = v0; \ + if (mask & 2) *(CTYPE *)(base_address + o1) = v1; \ + if (mask & 4) *(CTYPE *)(base_address + o2) = v2; \ + if (mask & 8) *(CTYPE *)(base_address + o3) = v3; \ + mask = mask >> 4; \ + } \ +} \ +template \ +INLINE void MASKED_SCATTER(const simd_dw &address, \ + const SCALAR_TYPE &value, \ + char *base_address, \ + uint32_t mask, \ + uint32_t offset = 0) \ +{ \ + MASKED_SCATTER(address, VECTOR_TYPE(value), base_address, mask, offset); \ +} \ +template \ +INLINE void MASKED_SCATTER(const scalar_dw &address, \ + const VECTOR_TYPE &value, \ + char *base_address, \ + uint32_t mask, \ + uint32_t offset = 0) \ +{ \ + MASKED_SCATTER(simd_dw(address), value, base_address, mask, offset); \ +} +DECL_MASKED_SCATTER(simd_dw, scalar_dw, uint32_t, 0xffffffff) +DECL_MASKED_SCATTER(simd_w, scalar_w, uint16_t, 0xffff) +DECL_MASKED_SCATTER(simd_b, scalar_b, uint8_t, 0xff) +#undef DECL_MASKED_SCATTER + +template +INLINE void MASKED_SCATTER2(const T &address, + const U &value0, + const V &value1, + char *base_address, + uint32_t mask) { - for (uint32_t i = 0; i < vectorNum; ++i) { - const int v0 = _mm_extract_epi32(PS2SI(value.m[i]), 0); - const int v1 = _mm_extract_epi32(PS2SI(value.m[i]), 1); - const int v2 = _mm_extract_epi32(PS2SI(value.m[i]), 2); - const int v3 = _mm_extract_epi32(PS2SI(value.m[i]), 3); - const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]), 0); - const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1); - const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2); - const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3); - if (mask & 1) *(int*)(base_address + o0) = v0; - if (mask & 2) *(int*)(base_address + o1) = v1; - if (mask & 4) *(int*)(base_address + o2) = v2; - if (mask & 8) *(int*)(base_address + o3) = v3; - mask = mask >> 4; - } + MASKED_SCATTER(address, value0, base_address, mask, 0); + MASKED_SCATTER(address, value1, base_address, mask, 4); } -template -INLINE void MASKED_SCATTER(const simd_dw &offset, - const scalar_dw &value, - char *base_address, - uint32_t mask) +template +INLINE void MASKED_SCATTER3(const T &address, + const U &value0, + const V &value1, + const W &value2, + char *base_address, + uint32_t mask) { - MASKED_SCATTER(offset, simd_dw(value), base_address, mask); + MASKED_SCATTER(address, value0, base_address, mask, 0); + MASKED_SCATTER(address, value1, base_address, mask, 4); + MASKED_SCATTER(address, value2, base_address, mask, 8); } -template -INLINE void MASKED_SCATTER(const scalar_dw &offset, - const simd_dw &value, - char *base_address, - uint32_t mask) +template +INLINE void MASKED_SCATTER4(const T &address, + const U &value0, + const V &value1, + const W &value2, + const X &value3, + char *base_address, + uint32_t mask) { - MASKED_SCATTER(simd_dw(offset), value, base_address, mask); + MASKED_SCATTER(address, value0, base_address, mask, 0); + MASKED_SCATTER(address, value1, base_address, mask, 4); + MASKED_SCATTER(address, value2, base_address, mask, 8); + MASKED_SCATTER(address, value3, base_address, mask, 12); } /* Gather */ -template -INLINE void GATHER(simd_dw &dst, - const simd_dw &offset, - const char *base_address) { - for (uint32_t i = 0; i < vectorNum; ++i) { - const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]) , 0); - const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1); - const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2); - const int o3 = 
_mm_extract_epi32(PS2SI(offset.m[i]), 3); - const int v0 = *(const int*)(base_address + o0); - const int v1 = *(const int*)(base_address + o1); - const int v2 = *(const int*)(base_address + o2); - const int v3 = *(const int*)(base_address + o3); - dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v0, 0)); - dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v1, 1)); - dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v2, 2)); - dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v3, 3)); - } +#define DECL_GATHER(VECTOR_TYPE, SCALAR_TYPE, CTYPE) \ +template \ +INLINE void GATHER(VECTOR_TYPE &dst, \ + const simd_dw &address, \ + const char *base_address, \ + uint32_t offset = 0) \ +{ \ + for (uint32_t i = 0; i < vectorNum; ++i) { \ + const uint32_t o0 = _mm_extract_epi32(PS2SI(address.m[i]), 0) + offset; \ + const uint32_t o1 = _mm_extract_epi32(PS2SI(address.m[i]), 1) + offset; \ + const uint32_t o2 = _mm_extract_epi32(PS2SI(address.m[i]), 2) + offset; \ + const uint32_t o3 = _mm_extract_epi32(PS2SI(address.m[i]), 3) + offset; \ + const CTYPE v0 = *(const CTYPE *)(base_address + o0); \ + const CTYPE v1 = *(const CTYPE *)(base_address + o1); \ + const CTYPE v2 = *(const CTYPE *)(base_address + o2); \ + const CTYPE v3 = *(const CTYPE *)(base_address + o3); \ + dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v0, 0)); \ + dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v1, 1)); \ + dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v2, 2)); \ + dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v3, 3)); \ + } \ +} \ +template \ +INLINE void GATHER(VECTOR_TYPE &dst, \ + const scalar_dw &address, \ + const char *base_address, \ + uint32_t offset = 0) \ +{ \ + GATHER(dst, VECTOR_TYPE(address), base_address, offset); \ } -template -INLINE void GATHER(simd_dw &dst, - const scalar_dw &offset, - const char *base_address) { - GATHER(dst, simd_dw(offset), base_address); +DECL_GATHER(simd_dw, scalar_dw, uint32_t) +DECL_GATHER(simd_w, scalar_w, uint16_t) +DECL_GATHER(simd_b, scalar_b, uint8_t) +#undef DECL_GATHER + +template +INLINE void GATHER2(U &value0, + V &value1, + const T &address, + char *base_address) +{ + GATHER(value0, address, base_address, 0); + GATHER(value1, address, base_address, 4); +} +template +INLINE void GATHER3(U &value0, + V &value1, + W &value2, + const T &address, + char *base_address) +{ + GATHER(value0, address, base_address, 0); + GATHER(value1, address, base_address, 4); + GATHER(value2, address, base_address, 8); +} +template +INLINE void GATHER4(U &value0, + V &value1, + W &value2, + X &value3, + const T &address, + char *base_address) +{ + GATHER(value0, address, base_address, 0); + GATHER(value1, address, base_address, 4); + GATHER(value2, address, base_address, 8); + GATHER(value3, address, base_address, 12); } /* Masked gather will only load activated lanes */ -template -INLINE void MASKED_GATHER(simd_dw &dst, - const simd_dw &offset, - const char *base_address, - uint32_t mask) +#define DECL_MASKED_GATHER(VECTOR_TYPE, SCALAR_TYPE, CTYPE) \ +template \ +INLINE void MASKED_GATHER(VECTOR_TYPE &dst, \ + const simd_dw &address, \ + const char *base_address, \ + uint32_t mask, \ + uint32_t offset = 0) \ +{ \ + for (uint32_t i = 0; i < vectorNum; ++i) { \ + const uint32_t o0 = _mm_extract_epi32(PS2SI(address.m[i]), 0) + offset; \ + const uint32_t o1 = _mm_extract_epi32(PS2SI(address.m[i]), 1) + offset; \ + const uint32_t o2 = _mm_extract_epi32(PS2SI(address.m[i]), 2) + offset; \ + const uint32_t o3 = _mm_extract_epi32(PS2SI(address.m[i]), 3) + offset; \ + const CTYPE v0 
= *(const CTYPE *)(base_address + o0); \ + const CTYPE v1 = *(const CTYPE *)(base_address + o1); \ + const CTYPE v2 = *(const CTYPE *)(base_address + o2); \ + const CTYPE v3 = *(const CTYPE *)(base_address + o3); \ + if (mask & 1) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v0, 0)); \ + if (mask & 2) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v1, 1)); \ + if (mask & 4) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v2, 2)); \ + if (mask & 8) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v3, 3)); \ + mask = mask >> 4; \ + } \ +} \ +template \ +INLINE void MASKED_GATHER(VECTOR_TYPE &dst, \ + const scalar_dw &address, \ + const char *base_address, \ + uint32_t mask, \ + uint32_t offset = 0) \ +{ \ + MASKED_GATHER(dst, simd_dw(address), base_address, mask, offset); \ +} +DECL_MASKED_GATHER(simd_dw, scalar_dw, uint32_t) +DECL_MASKED_GATHER(simd_w, scalar_w, uint16_t) +DECL_MASKED_GATHER(simd_b, scalar_b, uint8_t) +#undef DECL_MASKED_GATHER + +template +INLINE void MASKED_GATHER2(U &value0, + V &value1, + const T &address, + char *base_address, + uint32_t mask) { - for (uint32_t i = 0; i < vectorNum; ++i) { - const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]) , 0); - const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1); - const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2); - const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3); - const int v0 = *(const int*)(base_address + o0); - const int v1 = *(const int*)(base_address + o1); - const int v2 = *(const int*)(base_address + o2); - const int v3 = *(const int*)(base_address + o3); - if (mask & 1) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v0, 0)); - if (mask & 2) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v1, 1)); - if (mask & 4) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v2, 2)); - if (mask & 8) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v3, 3)); - mask = mask >> 4; - } + MASKED_GATHER(value0, address, base_address, mask, 0); + MASKED_GATHER(value1, address, base_address, mask, 4); } -template -INLINE void MASKED_GATHER(simd_dw &dst, - const scalar_dw &offset, - const char *base_address, - uint32_t mask) +template +INLINE void MASKED_GATHER3(U &value0, + V &value1, + W &value2, + const T &address, + char *base_address, + uint32_t mask) { - MASKED_GATHER(dst, simd_dw(offset), base_address, mask); + MASKED_GATHER(value0, address, base_address, mask, 0); + MASKED_GATHER(value1, address, base_address, mask, 4); + MASKED_GATHER(value2, address, base_address, mask, 8); +} +template +INLINE void MASKED_GATHER4(U &value0, + V &value1, + W &value2, + X &value3, + const T &address, + char *base_address, + uint32_t mask) +{ + MASKED_GATHER(value0, address, base_address, mask, 0); + MASKED_GATHER(value1, address, base_address, mask, 4); + MASKED_GATHER(value2, address, base_address, mask, 8); + MASKED_GATHER(value3, address, base_address, mask, 12); } ////////////////////////////////////////////////////////////////////////////// @@ -638,6 +1036,7 @@ INLINE void MASKED_GATHER(simd_dw &dst, ////////////////////////////////////////////////////////////////////////////// INLINE uint32_t elemNum(const scalar_dw &x) { return 1; } INLINE uint32_t elemNum(const scalar_w &x) { return 1; } +INLINE uint32_t elemNum(const scalar_b &x) { return 1; } INLINE uint32_t elemNum(const scalar_m &x) { return 1; } INLINE uint32_t mask(const scalar_m &v) { return v.u ? 
1 : 0; } @@ -654,6 +1053,7 @@ INLINE void GE_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f >= v INLINE void GT_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f > v1.f ? ~0 : 0); } // 32 bit integers +INLINE void SHL_U32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s << v1.s; } INLINE void ADD_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s + v1.s; } INLINE void SUB_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s - v1.s; } INLINE void MUL_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s * v1.s; } @@ -681,7 +1081,8 @@ INLINE void LOADI(scalar_dw &dst, uint32_t u) { dst.u = u; } INLINE void SCATTER(scalar_dw offset, scalar_dw value, char *base) { *(uint32_t*)(base + offset.u) = value.u; } INLINE void GATHER(scalar_dw &dst, scalar_dw offset, const char *base) { dst.u = *(const uint32_t*)(base + offset.u); } -// 16 bit floating points +// 16 bits scalar +INLINE void SHL_U16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.u = v0.u << v1.u; } INLINE void ADD_U16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.u = v0.u + v1.u; } INLINE void SUB_U16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.u = v0.u - v1.u; } INLINE void ADD_S16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.s = v0.s + v1.s; } @@ -708,16 +1109,48 @@ INLINE void GT_U16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.u > v1 INLINE void LOAD(scalar_w &dst, const char *ptr) { dst.u = *(const uint16_t *) ptr; } INLINE void STORE(scalar_w src, char *ptr) { *(uint16_t *) ptr = src.u; } INLINE void LOADI(scalar_w &dst, uint16_t u) { dst.u = u; } -INLINE void SCATTER(scalar_w offset, scalar_w value, char *base) { *(uint16_t*)(base + offset.u) = value.u; } -INLINE void GATHER(scalar_w &dst, scalar_w offset, const char *base) { dst.u = *(const uint16_t*)(base + offset.u); } +INLINE void SCATTER(scalar_dw offset, scalar_w value, char *base) { *(uint16_t*)(base + offset.u) = value.u; } +INLINE void GATHER(scalar_w &dst, scalar_dw offset, const char *base) { dst.u = *(const uint16_t*)(base + offset.u); } + +// 8 bits scalars +INLINE void SHL_U8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.u = v0.u << v1.u; } +INLINE void ADD_U8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.u = v0.u + v1.u; } +INLINE void SUB_U8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.u = v0.u - v1.u; } +INLINE void ADD_S8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.s = v0.s + v1.s; } +INLINE void SUB_S8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.s = v0.s - v1.s; } +INLINE void MUL_S8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.s = v0.s * v1.s; } +INLINE void DIV_S8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.s = v0.s / v1.s; } +INLINE void REM_S8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.s = v0.s % v1.s; } +INLINE void MUL_U8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.u = v0.u * v1.u; } +INLINE void DIV_U8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.u = v0.u / v1.u; } +INLINE void REM_U8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.u = v0.u % v1.u; } +INLINE void EQ_S8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.s == v1.s ? ~0 : 0); } +INLINE void NE_S8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.s != v1.s ? ~0 : 0); } +INLINE void LE_S8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.s <= v1.s ? ~0 : 0); } +INLINE void LT_S8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.s < v1.s ? ~0 : 0); } +INLINE void GE_S8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.s >= v1.s ? 
~0 : 0); } +INLINE void GT_S8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.s > v1.s ? ~0 : 0); } +INLINE void XOR_S8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.s = v0.s ^ v1.s; } +INLINE void OR_S8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.s = v0.s | v1.s; } +INLINE void AND_S8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.s = v0.s & v1.s; } +INLINE void LE_U8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.u <= v1.u ? ~0 : 0); } +INLINE void LT_U8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.u < v1.u ? ~0 : 0); } +INLINE void GE_U8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.u >= v1.u ? ~0 : 0); } +INLINE void GT_U8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.u > v1.u ? ~0 : 0); } +INLINE void LOAD(scalar_b &dst, const char *ptr) { dst.u = *(const uint8_t *) ptr; } +INLINE void STORE(scalar_b src, char *ptr) { *(uint8_t *) ptr = src.u; } +INLINE void LOADI(scalar_b &dst, uint8_t u) { dst.u = u; } +INLINE void SCATTER(scalar_dw offset, scalar_b value, char *base) { *(uint8_t*)(base + offset.u) = value.u; } +INLINE void GATHER(scalar_b &dst, scalar_dw offset, const char *base) { dst.u = *(const uint8_t*)(base + offset.u); } ////////////////////////////////////////////////////////////////////////////// // Identical instructions are forwarded ////////////////////////////////////////////////////////////////////////////// // Forward identical 32 bit instructions -#define NOV_U32 MOV_S32 -#define NOV_F MOV_S32 +#define MOV_U32 MOV_S32 +#define SHL_S32 SHL_U32 +#define MOV_F MOV_S32 #define ADD_U32 ADD_S32 #define SUB_U32 SUB_S32 #define XOR_U32 XOR_S32 @@ -727,7 +1160,8 @@ INLINE void GATHER(scalar_w &dst, scalar_w offset, const char *base) { dst.u = * #define NE_U32 NE_S32 // Forward identical 16 bit instructions -#define NOV_U16 MOV_S16 +#define MOV_U16 MOV_S16 +#define SHL_S16 SHL_U16 #define ADD_U16 ADD_S16 #define SUB_U16 SUB_S16 #define AND_U16 AND_S16 @@ -737,6 +1171,24 @@ INLINE void GATHER(scalar_w &dst, scalar_w offset, const char *base) { dst.u = * #define EQ_U16 EQ_S16 #define NE_U16 NE_S16 +// Forward identical 8 bit instructions +#define MOV_U8 MOV_S8 +#define SHL_S8 SHL_U8 +#define ADD_U8 ADD_S8 +#define SUB_U8 SUB_S8 +#define AND_U8 AND_S8 +#define XOR_U8 XOR_S8 +#define OR_U8 OR_S8 +#define AND_U8 AND_S8 +#define EQ_U8 EQ_S8 +#define NE_U8 NE_S8 + +// More convenient to emit code +#define GATHER1 GATHER +#define SCATTER1 SCATTER +#define MASKED_GATHER1 MASKED_GATHER +#define MASKED_SCATTER1 MASKED_SCATTER + #undef PS2SI #undef SI2PS #undef ID @@ -873,5 +1325,7 @@ void updateMask(simd_m &mask, const simd_w &uipVec, uint16 #undef INLINE +// May be needed for some macro hell +#define COMMA , #endif /* __GBE_SIM_VECTOR_H__ */ diff --git a/backend/src/backend/sim/sim_vector_str.cpp b/backend/src/backend/sim/sim_vector_str.cpp index aa9383a..f9a0a64 100644 --- a/backend/src/backend/sim/sim_vector_str.cpp +++ b/backend/src/backend/sim/sim_vector_str.cpp @@ -89,7 +89,10 @@ std::string sim_vector_str = " INLINE scalar_dw(uint32_t u) { this->u = u; }\n" " INLINE scalar_dw(int32_t s) { this->s = s; }\n" " INLINE scalar_dw(float f) { this->f = f; }\n" -" uint32_t u; int32_t s; float f;\n" +" uint32_t u;\n" +" int32_t s;\n" +" float f;\n" +" char data[4];\n" "};\n" "\n" "/*! Base structure for scalar word (16 bits) */\n" @@ -103,7 +106,27 @@ std::string sim_vector_str = " x.u[1] = 0;\n" " return x.f;\n" " }\n" -" uint16_t u; int16_t s;\n" +" uint16_t u;\n" +" int16_t s;\n" +" char data[2];\n" +"};\n" +"\n" +"/*! 
Base structure for scalar byte (8 bits) */\n" +"union scalar_b {\n" +" INLINE scalar_b(void) {}\n" +" INLINE scalar_b(uint8_t u) { this->u = u; }\n" +" INLINE scalar_b(int8_t s) { this->s = s; }\n" +" INLINE float toFloat(void) const {\n" +" union {uint8_t u[4]; float f;} x;\n" +" x.u[0] = u;\n" +" x.u[1] = 0;\n" +" x.u[2] = 0;\n" +" x.u[3] = 0;\n" +" return x.f;\n" +" }\n" +" uint8_t u;\n" +" int8_t s;\n" +" char data[1];\n" "};\n" "\n" "/*! Base structure for scalar mask */\n" @@ -141,6 +164,24 @@ std::string sim_vector_str = " __m128 m[vectorNum];\n" "};\n" "\n" +"/*! Base structure for vectors 4 / 8 / 16 / 32 bytes. We do not store 16 bytes\n" +" * but only 4. This makes everything much simpler even if it is clearly slower\n" +" */\n" +"template \n" +"struct simd_b {\n" +" INLINE simd_b(void) {}\n" +" INLINE simd_b(const scalar_b &s) {\n" +" const float f = s.toFloat();\n" +" for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&f);\n" +" }\n" +" simd_b &operator= (const scalar_b &s) {\n" +" const float f = s.toFloat();\n" +" for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&f);\n" +" return *this;\n" +" }\n" +" __m128 m[vectorNum];\n" +"};\n" +"\n" "/*! Base structure for 4 / 8 / 16 / 32 booleans (m stands for \"mask\") */\n" "template \n" "struct simd_m {\n" @@ -152,26 +193,17 @@ std::string sim_vector_str = "};\n" "\n" "/*! Select instruction on vectors */\n" -"template \n" -"INLINE void select(simd_dw &dst,\n" -" const simd_dw &src0,\n" -" const simd_dw &src1,\n" -" const simd_m &mask)\n" -"{\n" -" for (uint32_t i = 0; i < vectorNum; ++i)\n" -" dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]);\n" -"}\n" -"template \n" -"INLINE void select(simd_m &dst,\n" -" const simd_m &src0,\n" -" const simd_m &src1,\n" +"template class T>\n" +"INLINE void select(T &dst,\n" +" const T &src0,\n" +" const T &src1,\n" " const simd_m &mask)\n" "{\n" " for (uint32_t i = 0; i < vectorNum; ++i)\n" " dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]);\n" "}\n" "\n" -"/*! To cast through memory 32 bits values in sse registers */\n" +"/*! To cast 32 bits values in sse registers through memory */\n" "union cast_dw {\n" " INLINE cast_dw(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {\n" " u[0] = u0; u[1] = u1; u[2] = u2; u[3] = u3;\n" @@ -187,13 +219,14 @@ std::string sim_vector_str = " INLINE cast_dw(void) {}\n" " __m128 v;\n" " __m128i vi;\n" +" char data[16];\n" " uint32_t u[4];\n" " int32_t s[4];\n" " float f[4];\n" "};\n" "static const cast_dw allTrue(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\n" "\n" -"/*! To cast through memory 16 bits values in sse registers */\n" +"/*! To cast 16 bits values in sse registers through memory */\n" "union cast_w {\n" " INLINE cast_w(int16_t s0, int16_t s1, int16_t s2, int16_t s3) {\n" " s[0].v = s0; s[1].v = s1; s[2].v = s2; s[3].v = s3;\n" @@ -208,10 +241,33 @@ std::string sim_vector_str = " INLINE cast_w(void) {}\n" " __m128 v;\n" " __m128i vi;\n" +" char data[16];\n" " struct { uint16_t v; uint16_t pad; } u[4];\n" " struct { int16_t v; int16_t pad; } s[4];\n" "};\n" "\n" +"/*! 
To cast 8 bits values in sse registers through memory */\n" +"union cast_b {\n" +" INLINE cast_b(int8_t s0, int8_t s1, int8_t s2, int8_t s3) {\n" +" s[0].v = s0; s[1].v = s1; s[2].v = s2; s[3].v = s3;\n" +" for (uint32_t i = 0; i < 3; ++i)\n" +" s[0].pad[i] = s[1].pad[i] = s[2].pad[i] = s[3].pad[i] = 0;\n" +" }\n" +" INLINE cast_b(uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3) {\n" +" u[0].v = u0; u[1].v = u1; u[2].v = u2; u[3].v = u3;\n" +" for (uint32_t i = 0; i < 3; ++i)\n" +" s[0].pad[i] = s[1].pad[i] = s[2].pad[i] = s[3].pad[i] = 0;\n" +" }\n" +" INLINE cast_b(const __m128 &v) : v(v) {}\n" +" INLINE cast_b(const __m128i &vi) : vi(vi) {}\n" +" INLINE cast_b(void) {}\n" +" __m128 v;\n" +" __m128i vi;\n" +" char data[16];\n" +" struct { uint8_t v; uint8_t pad[3]; } u[4];\n" +" struct { int8_t v; int8_t pad[3]; } s[4];\n" +"};\n" +"\n" "/*! Make a mask true */\n" "template \n" "INLINE void allTrueMask(simd_m &x) {\n" @@ -229,12 +285,38 @@ std::string sim_vector_str = "typedef simd_w<2> simd8w;\n" "typedef simd_w<4> simd16w;\n" "typedef simd_w<8> simd32w;\n" +"typedef scalar_b simd1b;\n" +"typedef simd_b<1> simd4b;\n" +"typedef simd_b<2> simd8b;\n" +"typedef simd_b<4> simd16b;\n" +"typedef simd_b<8> simd32b;\n" "typedef scalar_m simd1m;\n" "typedef simd_m<1> simd4m;\n" "typedef simd_m<2> simd8m;\n" "typedef simd_m<4> simd16m;\n" "typedef simd_m<8> simd32m;\n" "\n" +"/* Meta-programming crap to get the vector and the scalar types from the regular\n" +" * base C types\n" +" */\n" +"template struct SimTypeTrait {};\n" +"\n" +"#define DECL_TYPE_TRAIT(CTYPE, VECTOR_TYPE, SCALAR_TYPE, CAST_TYPE) \\\n" +"template \\\n" +"struct SimTypeTrait { \\\n" +" typedef VECTOR_TYPE Vector; \\\n" +" typedef SCALAR_TYPE Scalar; \\\n" +" typedef CAST_TYPE Cast; \\\n" +"};\n" +"DECL_TYPE_TRAIT(uint8_t, simd_b, scalar_b, cast_b)\n" +"DECL_TYPE_TRAIT(int8_t, simd_b, scalar_b, cast_b)\n" +"DECL_TYPE_TRAIT(uint16_t, simd_w, scalar_w, cast_w)\n" +"DECL_TYPE_TRAIT(int16_t, simd_w, scalar_w, cast_w)\n" +"DECL_TYPE_TRAIT(uint32_t, simd_dw, scalar_dw, cast_dw)\n" +"DECL_TYPE_TRAIT(int32_t, simd_dw, scalar_dw, cast_dw)\n" +"DECL_TYPE_TRAIT(float, simd_dw, scalar_dw, cast_dw)\n" +"#undef DECL_TYPE_TRAIT\n" +"\n" "//////////////////////////////////////////////////////////////////////////////\n" "// Vector instructions\n" "//////////////////////////////////////////////////////////////////////////////\n" @@ -244,11 +326,15 @@ std::string sim_vector_str = " return 4*vectorNum;\n" "}\n" "template \n" -"INLINE uint32_t elemNum(const simd_m &x) {\n" +"INLINE uint32_t elemNum(const simd_w &x) {\n" " return 4*vectorNum;\n" "}\n" "template \n" -"INLINE uint32_t elemNum(const simd_w &x) {\n" +"INLINE uint32_t elemNum(const simd_b &x) {\n" +" return 4*vectorNum;\n" +"}\n" +"template \n" +"INLINE uint32_t elemNum(const simd_m &x) {\n" " return 4*vectorNum;\n" "}\n" "\n" @@ -281,6 +367,16 @@ std::string sim_vector_str = " const __m128 v = _mm_load1_ps(&f);\n" " for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v;\n" "}\n" +"template \n" +"INLINE void MOV_S8(simd_b &dst, const simd_b &v) {\n" +" for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v.m[i];\n" +"}\n" +"template \n" +"INLINE void MOV_S8(simd_b &dst, const scalar_b &x) {\n" +" const float f = x.toFloat();\n" +" const __m128 v = _mm_load1_ps(&f);\n" +" for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v;\n" +"}\n" "\n" "/* Vector instructions that use sse* */\n" "#define VEC_OP(DST_TYPE, SRC_TYPE, SCALAR_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\\\n" @@ -325,6 +421,12 @@ 
std::string sim_vector_str = "VEC_OP(simd_w, simd_w, scalar_w, AND_S16, _mm_and_ps, ID, ID, ID);\n" "VEC_OP(simd_w, simd_w, scalar_w, OR_S16, _mm_or_ps, ID, ID, ID);\n" "VEC_OP(simd_w, simd_w, scalar_w, XOR_S16, _mm_xor_ps, ID, ID, ID);\n" +"VEC_OP(simd_m, simd_b, scalar_b, EQ_S8, _mm_cmpeq_epi32, SI2PS, PS2SI, PS2SI);\n" +"VEC_OP(simd_b, simd_b, scalar_b, ADD_S8, _mm_add_epi8, SI2PS, PS2SI, PS2SI);\n" +"VEC_OP(simd_b, simd_b, scalar_b, SUB_S8, _mm_sub_epi8, SI2PS, PS2SI, PS2SI);\n" +"VEC_OP(simd_b, simd_b, scalar_b, AND_S8, _mm_and_ps, ID, ID, ID);\n" +"VEC_OP(simd_b, simd_b, scalar_b, OR_S8, _mm_or_ps, ID, ID, ID);\n" +"VEC_OP(simd_b, simd_b, scalar_b, XOR_S8, _mm_xor_ps, ID, ID, ID);\n" "VEC_OP(simd_m, simd_m, scalar_m, AND_M, _mm_and_ps, ID, ID, ID);\n" "#undef VEC_OP\n" "\n" @@ -374,18 +476,27 @@ std::string sim_vector_str = "INLINE void NAME(DST_TYPE &dst, const SCALAR_TYPE &v0, const SCALAR_TYPE &v1) {\\\n" " NAME(dst, SRC_TYPE(v0), SRC_TYPE(v1));\\\n" "}\n" +"VEC_OP(simd_dw, simd_dw, scalar_dw, cast_dw, SHL_U32, <<, s[j]);\n" "VEC_OP(simd_dw, simd_dw, scalar_dw, cast_dw, MUL_S32, *, s[j]);\n" "VEC_OP(simd_dw, simd_dw, scalar_dw, cast_dw, DIV_S32, /, s[j]);\n" "VEC_OP(simd_dw, simd_dw, scalar_dw, cast_dw, REM_S32, %, s[j]);\n" "VEC_OP(simd_dw, simd_dw, scalar_dw, cast_dw, MUL_U32, *, u[j]);\n" "VEC_OP(simd_dw, simd_dw, scalar_dw, cast_dw, DIV_U32, /, u[j]);\n" "VEC_OP(simd_dw, simd_dw, scalar_dw, cast_dw, REM_U32, %, u[j]);\n" +"VEC_OP(simd_w, simd_w, scalar_w, cast_w, SHL_U16, <<, s[j].v);\n" "VEC_OP(simd_w, simd_w, scalar_w, cast_w, MUL_S16, *, s[j].v);\n" "VEC_OP(simd_w, simd_w, scalar_w, cast_w, DIV_S16, /, s[j].v);\n" "VEC_OP(simd_w, simd_w, scalar_w, cast_w, REM_S16, %, s[j].v);\n" "VEC_OP(simd_w, simd_w, scalar_w, cast_w, MUL_U16, *, u[j].v);\n" "VEC_OP(simd_w, simd_w, scalar_w, cast_w, DIV_U16, /, u[j].v);\n" "VEC_OP(simd_w, simd_w, scalar_w, cast_w, REM_U16, %, u[j].v);\n" +"VEC_OP(simd_b, simd_b, scalar_b, cast_b, SHL_U8, <<, s[j].v);\n" +"VEC_OP(simd_b, simd_b, scalar_b, cast_b, MUL_S8, *, s[j].v);\n" +"VEC_OP(simd_b, simd_b, scalar_b, cast_b, DIV_S8, /, s[j].v);\n" +"VEC_OP(simd_b, simd_b, scalar_b, cast_b, REM_S8, %, s[j].v);\n" +"VEC_OP(simd_b, simd_b, scalar_b, cast_b, MUL_U8, *, u[j].v);\n" +"VEC_OP(simd_b, simd_b, scalar_b, cast_b, DIV_U8, /, u[j].v);\n" +"VEC_OP(simd_b, simd_b, scalar_b, cast_b, REM_U8, %, u[j].v);\n" "#undef VEC_OP\n" "\n" "/* Vector compare vectors that require C */\n" @@ -424,6 +535,14 @@ std::string sim_vector_str = "VEC_OP(simd_m, simd_w, scalar_w, cast_w, LT_S16, <, s[j].v);\n" "VEC_OP(simd_m, simd_w, scalar_w, cast_w, GE_S16, >=, s[j].v);\n" "VEC_OP(simd_m, simd_w, scalar_w, cast_w, GT_S16, >, s[j].v);\n" +"VEC_OP(simd_m, simd_b, scalar_b, cast_b, LE_U8, <=, u[j].v);\n" +"VEC_OP(simd_m, simd_b, scalar_b, cast_b, LT_U8, <, u[j].v);\n" +"VEC_OP(simd_m, simd_b, scalar_b, cast_b, GE_U8, >=, u[j].v);\n" +"VEC_OP(simd_m, simd_b, scalar_b, cast_b, GT_U8, >, u[j].v);\n" +"VEC_OP(simd_m, simd_b, scalar_b, cast_b, LE_S8, <=, s[j].v);\n" +"VEC_OP(simd_m, simd_b, scalar_b, cast_b, LT_S8, <, s[j].v);\n" +"VEC_OP(simd_m, simd_b, scalar_b, cast_b, GE_S8, >=, s[j].v);\n" +"VEC_OP(simd_m, simd_b, scalar_b, cast_b, GT_S8, >, s[j].v);\n" "#undef VEC_OP\n" "\n" "/* Get NE from EQ */\n" @@ -478,6 +597,69 @@ std::string sim_vector_str = "{\n" " NE_S16(dst, simd_w(v0), simd_w(v1));\n" "}\n" +"template \n" +"INLINE void NE_S8(simd_m &dst,\n" +" const simd_b &v0,\n" +" const simd_b &v1)\n" +"{\n" +" for (uint32_t i = 0; i < vectorNum; ++i)\n" +" dst.m[i] = 
_mm_xor_ps(allTrue.v, SI2PS(_mm_cmpeq_epi32(PS2SI(v0.m[i]), PS2SI(v1.m[i]))));\n" +"}\n" +"template \n" +"INLINE void NE_S8(simd_m &dst,\n" +" const simd_b &v0,\n" +" const scalar_b &v1)\n" +"{\n" +" NE_S8(dst, v0, simd_b(v1));\n" +"}\n" +"template \n" +"INLINE void NE_S8(simd_m &dst,\n" +" const scalar_b &v0,\n" +" const simd_b &v1)\n" +"{\n" +" NE_S8(dst, simd_b(v0), v1);\n" +"}\n" +"template \n" +"INLINE void NE_S8(simd_m &dst,\n" +" const scalar_b &v0,\n" +" const scalar_b &v1)\n" +"{\n" +" NE_S8(dst, simd_b(v0), simd_b(v1));\n" +"}\n" +"\n" +"\n" +"template class DstType,\n" +" template class SrcType>\n" +"INLINE void CVT(DstType &dst, const SrcType &src)\n" +"{\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\n" +" const typename SimTypeTrait::Cast srcCast(src.m[i]);\n" +" const DstCType x0 = (DstCType) *(const SrcCType*) (srcCast.data + 0);\n" +" const DstCType x1 = (DstCType) *(const SrcCType*) (srcCast.data + 4);\n" +" const DstCType x2 = (DstCType) *(const SrcCType*) (srcCast.data + 8);\n" +" const DstCType x3 = (DstCType) *(const SrcCType*) (srcCast.data + 12);\n" +" const typename SimTypeTrait::Cast dstCast(x0, x1, x2, x3);\n" +" dst.m[i] = dstCast.v;\n" +" }\n" +"}\n" +"\n" +"template class DstType,\n" +" class SrcType>\n" +"INLINE void CVT(DstType &dst, const SrcType &src)\n" +"{\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\n" +" const SrcCType from = *((SrcCType *) src.data);\n" +" const DstCType x = (DstCType) from;\n" +" const typename SimTypeTrait::Cast dstCast(x,x,x,x);\n" +" dst.m[i] = dstCast.v;\n" +" }\n" +"}\n" "\n" "/* Load from contiguous double words */\n" "template \n" @@ -518,6 +700,31 @@ std::string sim_vector_str = " }\n" "}\n" "\n" +"/* Load from contiguous bytes */\n" +"template \n" +"INLINE void LOAD(simd_b &dst, const char *ptr) {\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\n" +" const uint8_t u0 = *((uint8_t*) ptr + 4*i + 0);\n" +" const uint8_t u1 = *((uint8_t*) ptr + 4*i + 1);\n" +" const uint8_t u2 = *((uint8_t*) ptr + 4*i + 2);\n" +" const uint8_t u3 = *((uint8_t*) ptr + 4*i + 3);\n" +" const cast_b w(u0,u1,u2,u3);\n" +" dst.m[i] = w.v;\n" +" }\n" +"}\n" +"\n" +"/* Store to contiguous bytes */\n" +"template \n" +"INLINE void STORE(const simd_b &src, char *ptr) {\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\n" +" const cast_b w(src.m[i]);\n" +" *((uint8_t*) ptr + 4*i + 0) = w.u[0].v;\n" +" *((uint8_t*) ptr + 4*i + 1) = w.u[1].v;\n" +" *((uint8_t*) ptr + 4*i + 2) = w.u[2].v;\n" +" *((uint8_t*) ptr + 4*i + 3) = w.u[3].v;\n" +" }\n" +"}\n" +"\n" "/* Load immediates */\n" "template \n" "INLINE void LOADI(simd_dw &dst, uint32_t u) {\n" @@ -526,137 +733,328 @@ std::string sim_vector_str = " for (uint32_t i = 0; i < vectorNum; ++i)\n" " dst.m[i] = _mm_load1_ps(&cast.f);\n" "}\n" -"\n" -"/* Scatter */\n" "template \n" -"INLINE void SCATTER(const simd_dw &offset,\n" -" const simd_dw &value,\n" -" char *base_address) {\n" -" for (uint32_t i = 0; i < vectorNum; ++i) {\n" -" const int v0 = _mm_extract_epi32(PS2SI(value.m[i]), 0);\n" -" const int v1 = _mm_extract_epi32(PS2SI(value.m[i]), 1);\n" -" const int v2 = _mm_extract_epi32(PS2SI(value.m[i]), 2);\n" -" const int v3 = _mm_extract_epi32(PS2SI(value.m[i]), 3);\n" -" const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]), 0);\n" -" const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1);\n" -" const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2);\n" -" const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3);\n" -" *(int*)(base_address + o0) = v0;\n" -" *(int*)(base_address + o1) = v1;\n" -" 
*(int*)(base_address + o2) = v2;\n" -" *(int*)(base_address + o3) = v3;\n" -" }\n" +"INLINE void LOADI(simd_w &dst, uint16_t u) {\n" +" union { uint32_t u; float f; } cast;\n" +" cast.u = u;\n" +" for (uint32_t i = 0; i < vectorNum; ++i)\n" +" dst.m[i] = _mm_load1_ps(&cast.f);\n" "}\n" "template \n" -"INLINE void SCATTER(const simd_dw &offset,\n" -" const scalar_dw &value,\n" -" char *base_address) {\n" -" SCATTER(offset, simd_dw(value), base_address);\n" +"INLINE void LOADI(simd_b &dst, uint8_t u) {\n" +" union { uint32_t u; float f; } cast;\n" +" cast.u = u;\n" +" for (uint32_t i = 0; i < vectorNum; ++i)\n" +" dst.m[i] = _mm_load1_ps(&cast.f);\n" "}\n" -"template \n" -"INLINE void SCATTER(const scalar_dw &offset,\n" -" const simd_dw &value,\n" -" char *base_address) {\n" -" SCATTER(simd_dw(offset), value, base_address);\n" +"\n" +"/* Scatter for bytes, shorts and integers */\n" +"#define DECL_SCATTER(VECTOR_TYPE, SCALAR_TYPE, CTYPE, MASK) \\\n" +"template \\\n" +"INLINE void SCATTER(const simd_dw &address, \\\n" +" const VECTOR_TYPE &value, \\\n" +" char *base_address, \\\n" +" uint32_t offset = 0) \\\n" +"{ \\\n" +" for (uint32_t i = 0; i < vectorNum; ++i) { \\\n" +" const uint32_t v0 = _mm_extract_epi32(PS2SI(value.m[i]), 0) & MASK; \\\n" +" const uint32_t v1 = _mm_extract_epi32(PS2SI(value.m[i]), 1) & MASK; \\\n" +" const uint32_t v2 = _mm_extract_epi32(PS2SI(value.m[i]), 2) & MASK; \\\n" +" const uint32_t v3 = _mm_extract_epi32(PS2SI(value.m[i]), 3) & MASK; \\\n" +" const uint32_t o0 = _mm_extract_epi32(PS2SI(address.m[i]), 0) + offset; \\\n" +" const uint32_t o1 = _mm_extract_epi32(PS2SI(address.m[i]), 1) + offset; \\\n" +" const uint32_t o2 = _mm_extract_epi32(PS2SI(address.m[i]), 2) + offset; \\\n" +" const uint32_t o3 = _mm_extract_epi32(PS2SI(address.m[i]), 3) + offset; \\\n" +" *(CTYPE *)(base_address + o0) = v0; \\\n" +" *(CTYPE *)(base_address + o1) = v1; \\\n" +" *(CTYPE *)(base_address + o2) = v2; \\\n" +" *(CTYPE *)(base_address + o3) = v3; \\\n" +" } \\\n" +"} \\\n" +"template \\\n" +"INLINE void SCATTER(const simd_dw &address, \\\n" +" const SCALAR_TYPE &value, \\\n" +" char *base_address, \\\n" +" uint32_t offset = 0) \\\n" +"{ \\\n" +" SCATTER(address, VECTOR_TYPE(value), base_address, offset); \\\n" +"} \\\n" +"template \\\n" +"INLINE void SCATTER(const scalar_dw &address, \\\n" +" const VECTOR_TYPE &value, \\\n" +" char *base_address, \\\n" +" uint32_t offset = 0) \\\n" +"{ \\\n" +" SCATTER(simd_dw(address), value, base_address, offset); \\\n" +"}\n" +"DECL_SCATTER(simd_dw, scalar_dw, uint32_t, 0xffffffff)\n" +"DECL_SCATTER(simd_w, scalar_w, uint16_t, 0xffff)\n" +"DECL_SCATTER(simd_b, scalar_b, uint8_t, 0xff)\n" +"#undef DECL_SCATTER\n" +"\n" +"template \n" +"INLINE void SCATTER2(const T &address,\n" +" const U &value0,\n" +" const V &value1,\n" +" char *base_address)\n" +"{\n" +" SCATTER(address, value0, base_address, 0);\n" +" SCATTER(address, value1, base_address, 4);\n" +"}\n" +"template \n" +"INLINE void SCATTER3(const T &address,\n" +" const U &value0,\n" +" const V &value1,\n" +" const W &value2,\n" +" char *base_address)\n" +"{\n" +" SCATTER(address, value0, base_address, 0);\n" +" SCATTER(address, value1, base_address, 4);\n" +" SCATTER(address, value2, base_address, 8);\n" +"}\n" +"template \n" +"INLINE void SCATTER4(const T &address,\n" +" const U &value0,\n" +" const V &value1,\n" +" const W &value2,\n" +" const X &value3,\n" +" char *base_address)\n" +"{\n" +" SCATTER(address, value0, base_address, 0);\n" +" SCATTER(address, value1, base_address, 4);\n" 
+" SCATTER(address, value2, base_address, 8);\n" +" SCATTER(address, value3, base_address, 12);\n" "}\n" "\n" "/* Masked scatter will only store unmasked lanes */\n" -"template \n" -"INLINE void MASKED_SCATTER(const simd_dw &offset,\n" -" const simd_dw &value,\n" -" char *base_address,\n" -" uint32_t mask)\n" +"#define DECL_MASKED_SCATTER(VECTOR_TYPE, SCALAR_TYPE, CTYPE, MASK) \\\n" +"template \\\n" +"INLINE void MASKED_SCATTER(const simd_dw &address, \\\n" +" const VECTOR_TYPE &value, \\\n" +" char *base_address, \\\n" +" uint32_t mask, \\\n" +" uint32_t offset = 0) \\\n" +"{ \\\n" +" for (uint32_t i = 0; i < vectorNum; ++i) { \\\n" +" const uint32_t v0 = _mm_extract_epi32(PS2SI(value.m[i]), 0) & MASK; \\\n" +" const uint32_t v1 = _mm_extract_epi32(PS2SI(value.m[i]), 1) & MASK; \\\n" +" const uint32_t v2 = _mm_extract_epi32(PS2SI(value.m[i]), 2) & MASK; \\\n" +" const uint32_t v3 = _mm_extract_epi32(PS2SI(value.m[i]), 3) & MASK; \\\n" +" const uint32_t o0 = _mm_extract_epi32(PS2SI(address.m[i]), 0) + offset; \\\n" +" const uint32_t o1 = _mm_extract_epi32(PS2SI(address.m[i]), 1) + offset; \\\n" +" const uint32_t o2 = _mm_extract_epi32(PS2SI(address.m[i]), 2) + offset; \\\n" +" const uint32_t o3 = _mm_extract_epi32(PS2SI(address.m[i]), 3) + offset; \\\n" +" if (mask & 1) *(CTYPE *)(base_address + o0) = v0; \\\n" +" if (mask & 2) *(CTYPE *)(base_address + o1) = v1; \\\n" +" if (mask & 4) *(CTYPE *)(base_address + o2) = v2; \\\n" +" if (mask & 8) *(CTYPE *)(base_address + o3) = v3; \\\n" +" mask = mask >> 4; \\\n" +" } \\\n" +"} \\\n" +"template \\\n" +"INLINE void MASKED_SCATTER(const simd_dw &address, \\\n" +" const SCALAR_TYPE &value, \\\n" +" char *base_address, \\\n" +" uint32_t mask, \\\n" +" uint32_t offset = 0) \\\n" +"{ \\\n" +" MASKED_SCATTER(address, VECTOR_TYPE(value), base_address, mask, offset); \\\n" +"} \\\n" +"template \\\n" +"INLINE void MASKED_SCATTER(const scalar_dw &address, \\\n" +" const VECTOR_TYPE &value, \\\n" +" char *base_address, \\\n" +" uint32_t mask, \\\n" +" uint32_t offset = 0) \\\n" +"{ \\\n" +" MASKED_SCATTER(simd_dw(address), value, base_address, mask, offset); \\\n" +"}\n" +"DECL_MASKED_SCATTER(simd_dw, scalar_dw, uint32_t, 0xffffffff)\n" +"DECL_MASKED_SCATTER(simd_w, scalar_w, uint16_t, 0xffff)\n" +"DECL_MASKED_SCATTER(simd_b, scalar_b, uint8_t, 0xff)\n" +"#undef DECL_MASKED_SCATTER\n" +"\n" +"template \n" +"INLINE void MASKED_SCATTER2(const T &address,\n" +" const U &value0,\n" +" const V &value1,\n" +" char *base_address,\n" +" uint32_t mask)\n" "{\n" -" for (uint32_t i = 0; i < vectorNum; ++i) {\n" -" const int v0 = _mm_extract_epi32(PS2SI(value.m[i]), 0);\n" -" const int v1 = _mm_extract_epi32(PS2SI(value.m[i]), 1);\n" -" const int v2 = _mm_extract_epi32(PS2SI(value.m[i]), 2);\n" -" const int v3 = _mm_extract_epi32(PS2SI(value.m[i]), 3);\n" -" const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]), 0);\n" -" const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1);\n" -" const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2);\n" -" const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3);\n" -" if (mask & 1) *(int*)(base_address + o0) = v0;\n" -" if (mask & 2) *(int*)(base_address + o1) = v1;\n" -" if (mask & 4) *(int*)(base_address + o2) = v2;\n" -" if (mask & 8) *(int*)(base_address + o3) = v3;\n" -" mask = mask >> 4;\n" -" }\n" +" MASKED_SCATTER(address, value0, base_address, mask, 0);\n" +" MASKED_SCATTER(address, value1, base_address, mask, 4);\n" "}\n" -"template \n" -"INLINE void MASKED_SCATTER(const simd_dw &offset,\n" -" const scalar_dw 
&value,\n" -" char *base_address,\n" -" uint32_t mask)\n" +"template \n" +"INLINE void MASKED_SCATTER3(const T &address,\n" +" const U &value0,\n" +" const V &value1,\n" +" const W &value2,\n" +" char *base_address,\n" +" uint32_t mask)\n" "{\n" -" MASKED_SCATTER(offset, simd_dw(value), base_address, mask);\n" +" MASKED_SCATTER(address, value0, base_address, mask, 0);\n" +" MASKED_SCATTER(address, value1, base_address, mask, 4);\n" +" MASKED_SCATTER(address, value2, base_address, mask, 8);\n" "}\n" -"template \n" -"INLINE void MASKED_SCATTER(const scalar_dw &offset,\n" -" const simd_dw &value,\n" -" char *base_address,\n" -" uint32_t mask)\n" +"template \n" +"INLINE void MASKED_SCATTER4(const T &address,\n" +" const U &value0,\n" +" const V &value1,\n" +" const W &value2,\n" +" const X &value3,\n" +" char *base_address,\n" +" uint32_t mask)\n" "{\n" -" MASKED_SCATTER(simd_dw(offset), value, base_address, mask);\n" +" MASKED_SCATTER(address, value0, base_address, mask, 0);\n" +" MASKED_SCATTER(address, value1, base_address, mask, 4);\n" +" MASKED_SCATTER(address, value2, base_address, mask, 8);\n" +" MASKED_SCATTER(address, value3, base_address, mask, 12);\n" "}\n" "\n" "/* Gather */\n" -"template \n" -"INLINE void GATHER(simd_dw &dst,\n" -" const simd_dw &offset,\n" -" const char *base_address) {\n" -" for (uint32_t i = 0; i < vectorNum; ++i) {\n" -" const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]) , 0);\n" -" const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1);\n" -" const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2);\n" -" const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3);\n" -" const int v0 = *(const int*)(base_address + o0);\n" -" const int v1 = *(const int*)(base_address + o1);\n" -" const int v2 = *(const int*)(base_address + o2);\n" -" const int v3 = *(const int*)(base_address + o3);\n" -" dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v0, 0));\n" -" dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v1, 1));\n" -" dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v2, 2));\n" -" dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v3, 3));\n" -" }\n" +"#define DECL_GATHER(VECTOR_TYPE, SCALAR_TYPE, CTYPE) \\\n" +"template \\\n" +"INLINE void GATHER(VECTOR_TYPE &dst, \\\n" +" const simd_dw &address, \\\n" +" const char *base_address, \\\n" +" uint32_t offset = 0) \\\n" +"{ \\\n" +" for (uint32_t i = 0; i < vectorNum; ++i) { \\\n" +" const uint32_t o0 = _mm_extract_epi32(PS2SI(address.m[i]), 0) + offset; \\\n" +" const uint32_t o1 = _mm_extract_epi32(PS2SI(address.m[i]), 1) + offset; \\\n" +" const uint32_t o2 = _mm_extract_epi32(PS2SI(address.m[i]), 2) + offset; \\\n" +" const uint32_t o3 = _mm_extract_epi32(PS2SI(address.m[i]), 3) + offset; \\\n" +" const CTYPE v0 = *(const CTYPE *)(base_address + o0); \\\n" +" const CTYPE v1 = *(const CTYPE *)(base_address + o1); \\\n" +" const CTYPE v2 = *(const CTYPE *)(base_address + o2); \\\n" +" const CTYPE v3 = *(const CTYPE *)(base_address + o3); \\\n" +" dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v0, 0)); \\\n" +" dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v1, 1)); \\\n" +" dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v2, 2)); \\\n" +" dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v3, 3)); \\\n" +" } \\\n" +"} \\\n" +"template \\\n" +"INLINE void GATHER(VECTOR_TYPE &dst, \\\n" +" const scalar_dw &address, \\\n" +" const char *base_address, \\\n" +" uint32_t offset = 0) \\\n" +"{ \\\n" +" GATHER(dst, VECTOR_TYPE(address), base_address, offset); \\\n" "}\n" -"template \n" -"INLINE void 
GATHER(simd_dw &dst,\n" -" const scalar_dw &offset,\n" -" const char *base_address) {\n" -" GATHER(dst, simd_dw(offset), base_address);\n" +"DECL_GATHER(simd_dw, scalar_dw, uint32_t)\n" +"DECL_GATHER(simd_w, scalar_w, uint16_t)\n" +"DECL_GATHER(simd_b, scalar_b, uint8_t)\n" +"#undef DECL_GATHER\n" +"\n" +"template \n" +"INLINE void GATHER2(U &value0,\n" +" V &value1,\n" +" const T &address,\n" +" char *base_address)\n" +"{\n" +" GATHER(value0, address, base_address, 0);\n" +" GATHER(value1, address, base_address, 4);\n" +"}\n" +"template \n" +"INLINE void GATHER3(U &value0,\n" +" V &value1,\n" +" W &value2,\n" +" const T &address,\n" +" char *base_address)\n" +"{\n" +" GATHER(value0, address, base_address, 0);\n" +" GATHER(value1, address, base_address, 4);\n" +" GATHER(value2, address, base_address, 8);\n" +"}\n" +"template \n" +"INLINE void GATHER4(U &value0,\n" +" V &value1,\n" +" W &value2,\n" +" X &value3,\n" +" const T &address,\n" +" char *base_address)\n" +"{\n" +" GATHER(value0, address, base_address, 0);\n" +" GATHER(value1, address, base_address, 4);\n" +" GATHER(value2, address, base_address, 8);\n" +" GATHER(value3, address, base_address, 12);\n" "}\n" "\n" "/* Masked gather will only load activated lanes */\n" -"template \n" -"INLINE void MASKED_GATHER(simd_dw &dst,\n" -" const simd_dw &offset,\n" -" const char *base_address,\n" -" uint32_t mask)\n" +"#define DECL_MASKED_GATHER(VECTOR_TYPE, SCALAR_TYPE, CTYPE) \\\n" +"template \\\n" +"INLINE void MASKED_GATHER(VECTOR_TYPE &dst, \\\n" +" const simd_dw &address, \\\n" +" const char *base_address, \\\n" +" uint32_t mask, \\\n" +" uint32_t offset = 0) \\\n" +"{ \\\n" +" for (uint32_t i = 0; i < vectorNum; ++i) { \\\n" +" const uint32_t o0 = _mm_extract_epi32(PS2SI(address.m[i]), 0) + offset; \\\n" +" const uint32_t o1 = _mm_extract_epi32(PS2SI(address.m[i]), 1) + offset; \\\n" +" const uint32_t o2 = _mm_extract_epi32(PS2SI(address.m[i]), 2) + offset; \\\n" +" const uint32_t o3 = _mm_extract_epi32(PS2SI(address.m[i]), 3) + offset; \\\n" +" const CTYPE v0 = *(const CTYPE *)(base_address + o0); \\\n" +" const CTYPE v1 = *(const CTYPE *)(base_address + o1); \\\n" +" const CTYPE v2 = *(const CTYPE *)(base_address + o2); \\\n" +" const CTYPE v3 = *(const CTYPE *)(base_address + o3); \\\n" +" if (mask & 1) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v0, 0)); \\\n" +" if (mask & 2) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v1, 1)); \\\n" +" if (mask & 4) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v2, 2)); \\\n" +" if (mask & 8) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v3, 3)); \\\n" +" mask = mask >> 4; \\\n" +" } \\\n" +"} \\\n" +"template \\\n" +"INLINE void MASKED_GATHER(VECTOR_TYPE &dst, \\\n" +" const scalar_dw &address, \\\n" +" const char *base_address, \\\n" +" uint32_t mask, \\\n" +" uint32_t offset = 0) \\\n" +"{ \\\n" +" MASKED_GATHER(dst, simd_dw(address), base_address, mask, offset); \\\n" +"}\n" +"DECL_MASKED_GATHER(simd_dw, scalar_dw, uint32_t)\n" +"DECL_MASKED_GATHER(simd_w, scalar_w, uint16_t)\n" +"DECL_MASKED_GATHER(simd_b, scalar_b, uint8_t)\n" +"#undef DECL_MASKED_GATHER\n" +"\n" +"template \n" +"INLINE void MASKED_GATHER2(U &value0,\n" +" V &value1,\n" +" const T &address,\n" +" char *base_address,\n" +" uint32_t mask)\n" "{\n" -" for (uint32_t i = 0; i < vectorNum; ++i) {\n" -" const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]) , 0);\n" -" const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1);\n" -" const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2);\n" -" const int o3 = 
_mm_extract_epi32(PS2SI(offset.m[i]), 3);\n" -" const int v0 = *(const int*)(base_address + o0);\n" -" const int v1 = *(const int*)(base_address + o1);\n" -" const int v2 = *(const int*)(base_address + o2);\n" -" const int v3 = *(const int*)(base_address + o3);\n" -" if (mask & 1) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v0, 0));\n" -" if (mask & 2) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v1, 1));\n" -" if (mask & 4) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v2, 2));\n" -" if (mask & 8) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v3, 3));\n" -" mask = mask >> 4;\n" -" }\n" +" MASKED_GATHER(value0, address, base_address, mask, 0);\n" +" MASKED_GATHER(value1, address, base_address, mask, 4);\n" "}\n" -"template \n" -"INLINE void MASKED_GATHER(simd_dw &dst,\n" -" const scalar_dw &offset,\n" -" const char *base_address,\n" -" uint32_t mask)\n" +"template \n" +"INLINE void MASKED_GATHER3(U &value0,\n" +" V &value1,\n" +" W &value2,\n" +" const T &address,\n" +" char *base_address,\n" +" uint32_t mask)\n" "{\n" -" MASKED_GATHER(dst, simd_dw(offset), base_address, mask);\n" +" MASKED_GATHER(value0, address, base_address, mask, 0);\n" +" MASKED_GATHER(value1, address, base_address, mask, 4);\n" +" MASKED_GATHER(value2, address, base_address, mask, 8);\n" +"}\n" +"template \n" +"INLINE void MASKED_GATHER4(U &value0,\n" +" V &value1,\n" +" W &value2,\n" +" X &value3,\n" +" const T &address,\n" +" char *base_address,\n" +" uint32_t mask)\n" +"{\n" +" MASKED_GATHER(value0, address, base_address, mask, 0);\n" +" MASKED_GATHER(value1, address, base_address, mask, 4);\n" +" MASKED_GATHER(value2, address, base_address, mask, 8);\n" +" MASKED_GATHER(value3, address, base_address, mask, 12);\n" "}\n" "\n" "//////////////////////////////////////////////////////////////////////////////\n" @@ -664,6 +1062,7 @@ std::string sim_vector_str = "//////////////////////////////////////////////////////////////////////////////\n" "INLINE uint32_t elemNum(const scalar_dw &x) { return 1; }\n" "INLINE uint32_t elemNum(const scalar_w &x) { return 1; }\n" +"INLINE uint32_t elemNum(const scalar_b &x) { return 1; }\n" "INLINE uint32_t elemNum(const scalar_m &x) { return 1; }\n" "INLINE uint32_t mask(const scalar_m &v) { return v.u ? 1 : 0; }\n" "\n" @@ -680,6 +1079,7 @@ std::string sim_vector_str = "INLINE void GT_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f > v1.f ? 
~0 : 0); }\n" "\n" "// 32 bit integers\n" +"INLINE void SHL_U32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s << v1.s; }\n" "INLINE void ADD_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s + v1.s; }\n" "INLINE void SUB_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s - v1.s; }\n" "INLINE void MUL_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s * v1.s; }\n" @@ -707,7 +1107,8 @@ std::string sim_vector_str = "INLINE void SCATTER(scalar_dw offset, scalar_dw value, char *base) { *(uint32_t*)(base + offset.u) = value.u; }\n" "INLINE void GATHER(scalar_dw &dst, scalar_dw offset, const char *base) { dst.u = *(const uint32_t*)(base + offset.u); }\n" "\n" -"// 16 bit floating points\n" +"// 16 bits scalar\n" +"INLINE void SHL_U16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.u = v0.u << v1.u; }\n" "INLINE void ADD_U16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.u = v0.u + v1.u; }\n" "INLINE void SUB_U16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.u = v0.u - v1.u; }\n" "INLINE void ADD_S16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.s = v0.s + v1.s; }\n" @@ -734,16 +1135,48 @@ std::string sim_vector_str = "INLINE void LOAD(scalar_w &dst, const char *ptr) { dst.u = *(const uint16_t *) ptr; }\n" "INLINE void STORE(scalar_w src, char *ptr) { *(uint16_t *) ptr = src.u; }\n" "INLINE void LOADI(scalar_w &dst, uint16_t u) { dst.u = u; }\n" -"INLINE void SCATTER(scalar_w offset, scalar_w value, char *base) { *(uint16_t*)(base + offset.u) = value.u; }\n" -"INLINE void GATHER(scalar_w &dst, scalar_w offset, const char *base) { dst.u = *(const uint16_t*)(base + offset.u); }\n" +"INLINE void SCATTER(scalar_dw offset, scalar_w value, char *base) { *(uint16_t*)(base + offset.u) = value.u; }\n" +"INLINE void GATHER(scalar_w &dst, scalar_dw offset, const char *base) { dst.u = *(const uint16_t*)(base + offset.u); }\n" +"\n" +"// 8 bits scalars\n" +"INLINE void SHL_U8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.u = v0.u << v1.u; }\n" +"INLINE void ADD_U8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.u = v0.u + v1.u; }\n" +"INLINE void SUB_U8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.u = v0.u - v1.u; }\n" +"INLINE void ADD_S8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.s = v0.s + v1.s; }\n" +"INLINE void SUB_S8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.s = v0.s - v1.s; }\n" +"INLINE void MUL_S8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.s = v0.s * v1.s; }\n" +"INLINE void DIV_S8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.s = v0.s / v1.s; }\n" +"INLINE void REM_S8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.s = v0.s % v1.s; }\n" +"INLINE void MUL_U8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.u = v0.u * v1.u; }\n" +"INLINE void DIV_U8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.u = v0.u / v1.u; }\n" +"INLINE void REM_U8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.u = v0.u % v1.u; }\n" +"INLINE void EQ_S8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.s == v1.s ? ~0 : 0); }\n" +"INLINE void NE_S8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.s != v1.s ? ~0 : 0); }\n" +"INLINE void LE_S8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.s <= v1.s ? ~0 : 0); }\n" +"INLINE void LT_S8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.s < v1.s ? ~0 : 0); }\n" +"INLINE void GE_S8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.s >= v1.s ? ~0 : 0); }\n" +"INLINE void GT_S8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.s > v1.s ? 
~0 : 0); }\n" +"INLINE void XOR_S8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.s = v0.s ^ v1.s; }\n" +"INLINE void OR_S8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.s = v0.s | v1.s; }\n" +"INLINE void AND_S8(scalar_b &dst, scalar_b v0, scalar_b v1) { dst.s = v0.s & v1.s; }\n" +"INLINE void LE_U8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.u <= v1.u ? ~0 : 0); }\n" +"INLINE void LT_U8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.u < v1.u ? ~0 : 0); }\n" +"INLINE void GE_U8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.u >= v1.u ? ~0 : 0); }\n" +"INLINE void GT_U8(scalar_m &dst, scalar_b v0, scalar_b v1) { dst.u = (v0.u > v1.u ? ~0 : 0); }\n" +"INLINE void LOAD(scalar_b &dst, const char *ptr) { dst.u = *(const uint8_t *) ptr; }\n" +"INLINE void STORE(scalar_b src, char *ptr) { *(uint8_t *) ptr = src.u; }\n" +"INLINE void LOADI(scalar_b &dst, uint8_t u) { dst.u = u; }\n" +"INLINE void SCATTER(scalar_dw offset, scalar_b value, char *base) { *(uint8_t*)(base + offset.u) = value.u; }\n" +"INLINE void GATHER(scalar_b &dst, scalar_dw offset, const char *base) { dst.u = *(const uint8_t*)(base + offset.u); }\n" "\n" "//////////////////////////////////////////////////////////////////////////////\n" "// Identical instructions are forwarded\n" "//////////////////////////////////////////////////////////////////////////////\n" "\n" "// Forward identical 32 bit instructions\n" -"#define NOV_U32 MOV_S32\n" -"#define NOV_F MOV_S32\n" +"#define MOV_U32 MOV_S32\n" +"#define SHL_S32 SHL_U32\n" +"#define MOV_F MOV_S32\n" "#define ADD_U32 ADD_S32\n" "#define SUB_U32 SUB_S32\n" "#define XOR_U32 XOR_S32\n" @@ -753,7 +1186,8 @@ std::string sim_vector_str = "#define NE_U32 NE_S32\n" "\n" "// Forward identical 16 bit instructions\n" -"#define NOV_U16 MOV_S16\n" +"#define MOV_U16 MOV_S16\n" +"#define SHL_S16 SHL_U16\n" "#define ADD_U16 ADD_S16\n" "#define SUB_U16 SUB_S16\n" "#define AND_U16 AND_S16\n" @@ -763,6 +1197,24 @@ std::string sim_vector_str = "#define EQ_U16 EQ_S16\n" "#define NE_U16 NE_S16\n" "\n" +"// Forward identical 8 bit instructions\n" +"#define MOV_U8 MOV_S8\n" +"#define SHL_S8 SHL_U8\n" +"#define ADD_U8 ADD_S8\n" +"#define SUB_U8 SUB_S8\n" +"#define AND_U8 AND_S8\n" +"#define XOR_U8 XOR_S8\n" +"#define OR_U8 OR_S8\n" +"#define AND_U8 AND_S8\n" +"#define EQ_U8 EQ_S8\n" +"#define NE_U8 NE_S8\n" +"\n" +"// More convenient to emit code\n" +"#define GATHER1 GATHER\n" +"#define SCATTER1 SCATTER\n" +"#define MASKED_GATHER1 MASKED_GATHER\n" +"#define MASKED_SCATTER1 MASKED_SCATTER\n" +"\n" "#undef PS2SI\n" "#undef SI2PS\n" "#undef ID\n" @@ -899,6 +1351,8 @@ std::string sim_vector_str = "\n" "#undef INLINE\n" "\n" +"// May be needed for some macro hell\n" +"#define COMMA ,\n" "#endif /* __GBE_SIM_VECTOR_H__ */\n" "\n" ; diff --git a/backend/src/backend/sim_context.cpp b/backend/src/backend/sim_context.cpp index 30b95ab..1507cfb 100644 --- a/backend/src/backend/sim_context.cpp +++ b/backend/src/backend/sim_context.cpp @@ -71,7 +71,17 @@ namespace gbe const ir::RegisterData regData = fn.getRegisterData(reg); switch (regData.family) { case ir::FAMILY_BYTE: + if (isScalarReg(reg) == true) + o << "scalar_b _" << regID << ";\n"; + else + o << "simd" << simdWidth << "b _" << regID << ";\n"; + break; case ir::FAMILY_WORD: + if (isScalarReg(reg) == true) + o << "scalar_w _" << regID << ";\n"; + else + o << "simd" << simdWidth << "w _" << regID << ";\n"; + break; case ir::FAMILY_QWORD: NOT_IMPLEMENTED; break; @@ -154,6 +164,23 @@ namespace gbe }; } + static const char 
*typeCppStr(const ir::Type &type) { + switch (type) { + case ir::TYPE_BOOL: return "bool"; + case ir::TYPE_S8: return "int8_t"; + case ir::TYPE_S16: return "int16_t"; + case ir::TYPE_S32: return "int32_t"; + case ir::TYPE_S64: return "int64_t"; + case ir::TYPE_U8: return "uint8_t"; + case ir::TYPE_U16: return "uint16_t"; + case ir::TYPE_U32: return "uint32_t"; + case ir::TYPE_U64: return "uint64_t"; + case ir::TYPE_FLOAT: return "float"; + case ir::TYPE_DOUBLE: return "double"; + default: NOT_IMPLEMENTED; return NULL; + }; + } + void SimContext::emitMaskingCode(void) { const int32_t blockIPOffset = kernel->getCurbeOffset(GBE_CURBE_BLOCK_IP, 0); GBE_ASSERT(blockIPOffset >= 0); @@ -211,8 +238,11 @@ namespace gbe GBE_ASSERT(JIPs.contains(&bra) == true); const LabelIndex jip = JIPs.find(&bra)->second; if (isPredicated) { + // Now the branch itself. Update the PcIP for the lanes that + // actually take the branch: only lanes that do not take the branch + // will have a PcIP updated to next const Register pred = bra.getPredicateIndex(); - o << "SIM_FWD_BRA_C(uip, emask, " << "_" << pred + o << "SIM_FWD_BRA_C(uip, emask, _" << pred << ", " << uint32_t(jip) << ", " << uint32_t(uip) << ");\n"; } else { @@ -221,6 +251,11 @@ namespace gbe << ");\n"; } } else { // BWD jump + // Set the IP of the next block for all activated lanes + GBE_ASSERT(bb->getNextBlock() != NULL); + const LabelIndex next = bb->getNextBlock()->getLabelIndex(); + o << "updateUIP(uip, emask, " << uint32_t(next) << ");\n"; + if (isPredicated) { const Register pred = bra.getPredicateIndex(); o << "SIM_BWD_BRA_C(uip, emask, _" << pred @@ -234,13 +269,22 @@ namespace gbe return; } -#if GBE_DEBUG - // Extra checks - if (opcode == OP_LOAD) - GBE_ASSERT(cast(insn).getValueNum() == 1); - if (opcode == OP_STORE) - GBE_ASSERT(cast(insn).getValueNum() == 1); -#endif /* GBE_DEBUG */ + uint32_t valueNum = 0; // for loads and stores + if (opcode == OP_LOAD) { + const LoadInstruction &load = cast(insn); + valueNum = load.getValueNum(); + if (load.isAligned() == true) + GBE_ASSERT(valueNum <= 4); + else + GBE_ASSERT(valueNum = 1); + } else if (opcode == OP_STORE) { + const StoreInstruction &store = cast(insn); + valueNum = store.getValueNum(); + if (store.isAligned() == true) + GBE_ASSERT(valueNum <= 4); + else + GBE_ASSERT(valueNum == 1); + } // Regular compute instruction const uint32_t dstNum = insn.getDstNum(); @@ -249,11 +293,11 @@ namespace gbe // These two needs a new instruction. Fortunately, it is just a string // manipulation. MASKED(OP,... just becomes MASKED_OP(...) 
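+    // Illustration (register names are made up, not from a real kernel):
+    // a two-value aligned load at SIMD16 is emitted as something like
+    //   MASKED_LOAD2(_dst0, _dst1, _addr, base, movedMask);
+    // while a conversion carries its explicit template arguments, e.g.
+    //   MASKED1(CVT<int32_t COMMA int16_t COMMA 4>, _dst, _src);
+    // COMMA (defined at the end of sim_vector.h) keeps the "," from being
+    // split by the MASKED* macro expansion.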
if (opcode == OP_STORE || opcode == OP_LOAD) - o << "MASKED_" << opcodeStr << "("; + o << "MASKED_" << opcodeStr << valueNum << "("; else o << "MASKED" << dstNum << "(" << opcodeStr; - // Append type when needed + // Append type when needed or extra information (for templates) if (insn.isMemberOf() == true) o << "_" << typeStr(cast(insn).getType()); else if (insn.isMemberOf() == true) @@ -262,8 +306,15 @@ namespace gbe o << "_" << typeStr(cast(insn).getType()); else if (insn.isMemberOf() == true) o << "_" << typeStr(cast(insn).getType()); - if (opcode != OP_STORE && opcode != OP_LOAD) - o << ", "; + else if (insn.isMemberOf() == true) { + const ConvertInstruction cvt = cast(insn); + const Type dstType = cvt.getDstType(); + const Type srcType = cvt.getSrcType(); + o << "<" << typeCppStr(dstType) << " COMMA " + << typeCppStr(srcType) << " COMMA " + << (this->simdWidth / sizeof(uint32_t)) << ">"; + } + if (opcode != OP_STORE && opcode != OP_LOAD) o << ", "; // Output both destinations and sources in that order for (uint32_t dstID = 0; dstID < dstNum; ++dstID) { @@ -278,10 +329,16 @@ namespace gbe // Append extra stuff for instructions that need it if (opcode == OP_LOADI) { Immediate imm = cast(insn).getImmediate(); - GBE_ASSERT(imm.type == TYPE_S32 || - imm.type == TYPE_U32 || - imm.type == TYPE_FLOAT); - o << ", " << imm.data.u32; + if (imm.type == TYPE_S32 || + imm.type == TYPE_U32 || + imm.type == TYPE_FLOAT) + o << ", " << imm.data.u32; + else if (imm.type == TYPE_S16 || + imm.type == TYPE_U16) + o << ", " << uint32_t(imm.data.u16); + else if (imm.type == TYPE_S8 || + imm.type == TYPE_U8) + o << ", " << uint32_t(imm.data.u8); } else if (opcode == OP_LOAD || opcode == OP_STORE) o << ", base, movedMask"; o << ");\n"; @@ -292,8 +349,8 @@ namespace gbe SVAR(OCL_GCC_SIM_COMPILER, "gcc"); SVAR(OCL_ICC_SIM_COMPILER, "icc"); - //SVAR(OCL_GCC_SIM_COMPILER_OPTIONS, "-Wall -fPIC -shared -msse -msse2 -msse3 -mssse3 -msse4.1 -g -O3"); - SVAR(OCL_GCC_SIM_COMPILER_OPTIONS, "-Wall -fPIC -shared -msse -msse2 -msse3 -mssse3 -msse4.1 -g"); + SVAR(OCL_GCC_SIM_COMPILER_OPTIONS, "-Wall -Wno-unused-label -Wno-strict-aliasing -fPIC -shared -msse -msse2 -msse3 -mssse3 -msse4.1 -g -O3"); + //SVAR(OCL_GCC_SIM_COMPILER_OPTIONS, "-Wall -fPIC -shared -msse -msse2 -msse3 -mssse3 -msse4.1 -g"); SVAR(OCL_ICC_SIM_COMPILER_OPTIONS, "-Wall -ldl -fabi-version=2 -fPIC -shared -O3 -g"); BVAR(OCL_USE_ICC, false); diff --git a/backend/src/ocl_stdlib_str.cpp b/backend/src/ocl_stdlib_str.cpp index 7b37955..be2e5af 100644 --- a/backend/src/ocl_stdlib_str.cpp +++ b/backend/src/ocl_stdlib_str.cpp @@ -111,10 +111,10 @@ std::string ocl_stdlib_str = " DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)\n" "\n" "#define DECL_UNTYPED_RW_ALL(TYPE) \\\n" -"DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \\\n" -"DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \\\n" -"DECL_UNTYPED_RW_ALL_SPACE(TYPE, __constant) \\\n" -"DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)\n" +" DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \\\n" +" DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \\\n" +" DECL_UNTYPED_RW_ALL_SPACE(TYPE, __constant) \\\n" +" DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)\n" "\n" "DECL_UNTYPED_RW_ALL(float)\n" "DECL_UNTYPED_RW_ALL(uint)\n" diff --git a/backend/src/utest/utest_vector.cpp b/backend/src/utest/utest_vector.cpp index bdb86b9..816c34d 100644 --- a/backend/src/utest/utest_vector.cpp +++ b/backend/src/utest/utest_vector.cpp @@ -244,6 +244,90 @@ static void utestUINT16(void) } } +static void utestINT8(void) +{ + simd1b _0, _4, _5; + simd16b _1, _2, _3; + const int8_t 
data[32] = {-1,1,-2,-3,4,-5,6,7,-8,9,10,11,12,13,14,15,8, + 9,10,11,12,-13,14,-15,-1,1,-2,3,4,5,6,7}; + for (uint32_t i = 0; i < 32; ++i) { + const int index0 = rand() % 32; + const int index1 = rand() % 16; + const int index2 = rand() % 16; + const int index4 = rand() % 32; + LOAD(_0, (const char *) (data+index0)); + LOAD(_1, (const char *) (data+index1)); + LOAD(_2, (const char *) (data+index2)); + LOAD(_4, (const char *) (data+index4)); + CHECK_BINARY_OP(int8_t,ADD_S8,+,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_BINARY_OP(int8_t,SUB_S8,-,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_BINARY_OP(int8_t,MUL_S8,*,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_BINARY_OP(int8_t,DIV_S8,/,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_BINARY_OP(int8_t,REM_S8,%,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_BINARY_OP(int8_t,AND_S8,&,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_BINARY_OP(int8_t,XOR_S8,^,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_BINARY_OP(int8_t,OR_S8, |,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_BINARY_OP(int8_t,ADD_S8,+,_3,_2,_0,data[i+index2],data[index0]); + CHECK_BINARY_OP(int8_t,SUB_S8,-,_3,_2,_0,data[i+index2],data[index0]); + CHECK_BINARY_OP(int8_t,MUL_S8,*,_3,_2,_0,data[i+index2],data[index0]); + CHECK_BINARY_OP(int8_t,DIV_S8,/,_3,_2,_0,data[i+index2],data[index0]); + CHECK_BINARY_OP(int8_t,REM_S8,%,_3,_2,_0,data[i+index2],data[index0]); + CHECK_BINARY_OP(int8_t,AND_S8,&,_3,_2,_0,data[i+index2],data[index0]); + CHECK_BINARY_OP(int8_t,XOR_S8,^,_3,_2,_0,data[i+index2],data[index0]); + CHECK_BINARY_OP(int8_t,OR_S8, |,_3,_2,_0,data[i+index2],data[index0]); + CHECK_BINARY_OP(int8_t,ADD_S8,+,_5,_4,_0,data[index4],data[index0]); + CHECK_BINARY_OP(int8_t,SUB_S8,-,_5,_4,_0,data[index4],data[index0]); + CHECK_BINARY_OP(int8_t,MUL_S8,*,_5,_4,_0,data[index4],data[index0]); + CHECK_BINARY_OP(int8_t,DIV_S8,/,_5,_4,_0,data[index4],data[index0]); + CHECK_BINARY_OP(int8_t,REM_S8,%,_5,_4,_0,data[index4],data[index0]); + CHECK_BINARY_OP(int8_t,AND_S8,&,_5,_4,_0,data[index4],data[index0]); + CHECK_BINARY_OP(int8_t,XOR_S8,^,_5,_4,_0,data[index4],data[index0]); + CHECK_BINARY_OP(int8_t,OR_S8, |,_5,_4,_0,data[index4],data[index0]); + } +} + +static void utestUINT8(void) +{ + simd1b _0, _4, _5; + simd16b _1, _2, _3; + const uint8_t data[32] = {1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,8, + 9,10,11,12,13,14,15,1,1,2,3,4,5,6,7}; + for (uint32_t i = 0; i < 32; ++i) { + const int index0 = rand() % 32; + const int index1 = rand() % 16; + const int index2 = rand() % 16; + const int index4 = rand() % 32; + LOAD(_0, (const char *) (data+index0)); + LOAD(_1, (const char *) (data+index1)); + LOAD(_2, (const char *) (data+index2)); + LOAD(_4, (const char *) (data+index4)); + CHECK_BINARY_OP(uint8_t,ADD_U8,+,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_BINARY_OP(uint8_t,SUB_U8,-,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_BINARY_OP(uint8_t,MUL_U8,*,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_BINARY_OP(uint8_t,DIV_U8,/,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_BINARY_OP(uint8_t,REM_U8,%,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_BINARY_OP(uint8_t,AND_U8,&,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_BINARY_OP(uint8_t,XOR_U8,^,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_BINARY_OP(uint8_t,OR_U8, |,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_BINARY_OP(uint8_t,ADD_U8,+,_3,_2,_0,data[i+index2],data[index0]); + CHECK_BINARY_OP(uint8_t,SUB_U8,-,_3,_2,_0,data[i+index2],data[index0]); + 
CHECK_BINARY_OP(uint8_t,MUL_U8,*,_3,_2,_0,data[i+index2],data[index0]); + CHECK_BINARY_OP(uint8_t,DIV_U8,/,_3,_2,_0,data[i+index2],data[index0]); + CHECK_BINARY_OP(uint8_t,REM_U8,%,_3,_2,_0,data[i+index2],data[index0]); + CHECK_BINARY_OP(uint8_t,AND_U8,&,_3,_2,_0,data[i+index2],data[index0]); + CHECK_BINARY_OP(uint8_t,XOR_U8,^,_3,_2,_0,data[i+index2],data[index0]); + CHECK_BINARY_OP(uint8_t,OR_U8, |,_3,_2,_0,data[i+index2],data[index0]); + CHECK_BINARY_OP(uint8_t,ADD_U8,+,_5,_4,_0,data[index4],data[index0]); + CHECK_BINARY_OP(uint8_t,SUB_U8,-,_5,_4,_0,data[index4],data[index0]); + CHECK_BINARY_OP(uint8_t,MUL_U8,*,_5,_4,_0,data[index4],data[index0]); + CHECK_BINARY_OP(uint8_t,DIV_U8,/,_5,_4,_0,data[index4],data[index0]); + CHECK_BINARY_OP(uint8_t,REM_U8,%,_5,_4,_0,data[index4],data[index0]); + CHECK_BINARY_OP(uint8_t,AND_U8,&,_5,_4,_0,data[index4],data[index0]); + CHECK_BINARY_OP(uint8_t,XOR_U8,^,_5,_4,_0,data[index4],data[index0]); + CHECK_BINARY_OP(uint8_t,OR_U8, |,_5,_4,_0,data[index4],data[index0]); + } +} + #undef CHECK_BINARY_OP #define CHECK_CMP_OP(FN,OP,DST,SRC0,SRC1,ELEM0,ELEM1)\ @@ -465,6 +549,111 @@ static void utestINT16Cmp(void) } } +static void utestUINT8Cmp(void) +{ + simd1b _0, _4; + simd16b _1, _2; + simd8b _6, _7; + simd1m _5; + simd16m _3; + simd8m _8; + const uint8_t data[64] = {11,12,13,14,15,8,1,1,2,3,4,5,6,7,8,9,10, + 9,10,11,12,13,14,15,1,1,2,3,4,5,6,7, + 10,11,12,13,14,15,8,1,1,2,3,4,5,6,7,8,9, + 9,10,11,12,13,14,15,1,1,2,3,4,5,6,7}; + for (uint32_t i = 0; i < 32; ++i) { + const int index0 = rand() % 32; + const int index1 = rand() % 16; + const int index2 = rand() % 16; + const int index4 = rand() % 32; + const int index6 = rand() % 16; + const int index7 = rand() % 32; + LOAD(_0, (const char *) (data+index0)); + LOAD(_1, (const char *) (data+index1)); + LOAD(_2, (const char *) (data+index2)); + LOAD(_4, (const char *) (data+index4)); + LOAD(_6, (const char *) (data+index6)); + LOAD(_7, (const char *) (data+index7)); + CHECK_CMP_OP(GE_U8,>=,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_CMP_OP(LE_U8,<=,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_CMP_OP(GT_U8,>,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_CMP_OP(LT_U8,<,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_CMP_OP(EQ_U8,==,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_CMP_OP(NE_U8,!=,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_CMP_OP(GE_U8,>=,_8,_7,_6,data[i+index7],data[i+index6]); + CHECK_CMP_OP(LE_U8,<=,_8,_7,_6,data[i+index7],data[i+index6]); + CHECK_CMP_OP(GT_U8,>,_8,_7,_6,data[i+index7],data[i+index6]); + CHECK_CMP_OP(LT_U8,<,_8,_7,_6,data[i+index7],data[i+index6]); + CHECK_CMP_OP(EQ_U8,==,_8,_7,_6,data[i+index7],data[i+index6]); + CHECK_CMP_OP(NE_U8,!=,_8,_7,_6,data[i+index7],data[i+index6]); + CHECK_CMP_OP(GE_U8,>=,_3,_2,_0,data[i+index2],data[index0]); + CHECK_CMP_OP(LE_U8,<=,_3,_2,_0,data[i+index2],data[index0]); + CHECK_CMP_OP(GT_U8,>,_3,_2,_0,data[i+index2],data[index0]); + CHECK_CMP_OP(LT_U8,<,_3,_2,_0,data[i+index2],data[index0]); + CHECK_CMP_OP(EQ_U8,==,_3,_2,_0,data[i+index2],data[index0]); + CHECK_CMP_OP(NE_U8,!=,_3,_2,_0,data[i+index2],data[index0]); + CHECK_CMP_OP(GE_U8,>=,_5,_4,_0,data[index4],data[index0]); + CHECK_CMP_OP(LE_U8,<=,_5,_4,_0,data[index4],data[index0]); + CHECK_CMP_OP(GT_U8,>,_5,_4,_0,data[index4],data[index0]); + CHECK_CMP_OP(LT_U8,<,_5,_4,_0,data[index4],data[index0]); + CHECK_CMP_OP(EQ_U8,==,_5,_4,_0,data[index4],data[index0]); + CHECK_CMP_OP(NE_U8,!=,_5,_4,_0,data[index4],data[index0]); + } +} + +static void utestINT8Cmp(void) +{ + simd1b _0, _4; + 
simd16b _1, _2; + simd8b _6, _7; + simd1m _5; + simd16m _3; + simd8m _8; + const int8_t data[64] = {-11,-12,13,14,-15,8,-1,-1,2,3,4,5,-6,7,8,9,10, + 9,10,-11,12,-13,14,15,1,1,2,-3,4,-5,6,7, + 10,11,-12,13,14,15,-8,1,1,2,-3,-4,5,-6,7,8,9, + 9,10,11,12,-13,14,15,-1,-1,-2,-3,-4,5,6,7}; + + for (uint32_t i = 0; i < 32; ++i) { + const int index0 = rand() % 32; + const int index1 = rand() % 16; + const int index2 = rand() % 16; + const int index4 = rand() % 32; + const int index6 = rand() % 16; + const int index7 = rand() % 32; + LOAD(_0, (const char *) (data+index0)); + LOAD(_1, (const char *) (data+index1)); + LOAD(_2, (const char *) (data+index2)); + LOAD(_4, (const char *) (data+index4)); + LOAD(_6, (const char *) (data+index6)); + LOAD(_7, (const char *) (data+index7)); + CHECK_CMP_OP(GE_S8,>=,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_CMP_OP(LE_S8,<=,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_CMP_OP(GT_S8,>,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_CMP_OP(LT_S8,<,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_CMP_OP(EQ_S8,==,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_CMP_OP(NE_S8,!=,_3,_2,_1,data[i+index2],data[i+index1]); + CHECK_CMP_OP(GE_S8,>=,_8,_7,_6,data[i+index7],data[i+index6]); + CHECK_CMP_OP(LE_S8,<=,_8,_7,_6,data[i+index7],data[i+index6]); + CHECK_CMP_OP(GT_S8,>,_8,_7,_6,data[i+index7],data[i+index6]); + CHECK_CMP_OP(LT_S8,<,_8,_7,_6,data[i+index7],data[i+index6]); + CHECK_CMP_OP(EQ_S8,==,_8,_7,_6,data[i+index7],data[i+index6]); + CHECK_CMP_OP(NE_S8,!=,_8,_7,_6,data[i+index7],data[i+index6]); + CHECK_CMP_OP(GE_S8,>=,_3,_2,_0,data[i+index2],data[index0]); + CHECK_CMP_OP(LE_S8,<=,_3,_2,_0,data[i+index2],data[index0]); + CHECK_CMP_OP(GT_S8,>,_3,_2,_0,data[i+index2],data[index0]); + CHECK_CMP_OP(LT_S8,<,_3,_2,_0,data[i+index2],data[index0]); + CHECK_CMP_OP(EQ_S8,==,_3,_2,_0,data[i+index2],data[index0]); + CHECK_CMP_OP(NE_S8,!=,_3,_2,_0,data[i+index2],data[index0]); + CHECK_CMP_OP(GE_S8,>=,_5,_4,_0,data[index4],data[index0]); + CHECK_CMP_OP(LE_S8,<=,_5,_4,_0,data[index4],data[index0]); + CHECK_CMP_OP(GT_S8,>,_5,_4,_0,data[index4],data[index0]); + CHECK_CMP_OP(LT_S8,<,_5,_4,_0,data[index4],data[index0]); + CHECK_CMP_OP(EQ_S8,==,_5,_4,_0,data[index4],data[index0]); + CHECK_CMP_OP(NE_S8,!=,_5,_4,_0,data[index4],data[index0]); + } +} + static void utestFPCmp(void) { simd1dw _0, _4; @@ -522,9 +711,10 @@ static void utestFPCmp(void) } #undef CHECK_CMP_OP -static void utestScatterGather(void) +static void utestScatterGatherUINT32(void) { - uint32_t data[64], gatherOffsets[64], scatterOffsets[64], dst[64]; + uint32_t data[64], dst[64]; + uint32_t gatherOffsets[64], scatterOffsets[64]; simd1dw _0, _0s, _0g, _4, _4s, _4g; simd16dw _1, _1s, _1g, _2, _2s, _2g; simd8dw _6, _6s, _6g, _7, _7s, _7g; @@ -564,7 +754,102 @@ static void utestScatterGather(void) CHECK_SCATTER_GATHER_OP(7); } #undef CHECK_SCATTER_GATHER_OP +} + +static void utestScatterGatherUINT16(void) +{ + uint16_t data[64], dst[64]; + uint32_t gatherOffsets[64], scatterOffsets[64]; + simd1dw _0s, _0g, _4s, _4g; + simd1w _0, _4; + simd16dw _1s, _1g, _2s, _2g; + simd16w _1, _2; + simd8dw _6s, _6g, _7s, _7g; + simd8w _6, _7; + + // Create the value and offset arrays + for (uint32_t i = 0; i < 64; ++i) { + data[i] = i; + scatterOffsets[i] = gatherOffsets[i] = i * sizeof(uint16_t); + } + for (uint32_t i = 0; i < 63; ++i) { + const int gatherIndex = rand() % (63-i)+i+1; + const int scatterIndex = rand() % (63-i)+i+1; + std::swap(gatherOffsets[i], gatherOffsets[gatherIndex]); + std::swap(scatterOffsets[i], 
scatterOffsets[scatterIndex]); + } + +#define CHECK_SCATTER_GATHER_OP(INDEX)\ + LOAD(_##INDEX##g, (const char *) (gatherOffsets+index##INDEX));\ + LOAD(_##INDEX##s, (const char *) (scatterOffsets+index##INDEX));\ + GATHER(_##INDEX, _##INDEX##g, (const char *) data);\ + SCATTER(_##INDEX##s, _##INDEX, (char *) dst);\ + for (uint32_t i = 0; i < elemNum(_##INDEX); ++i)\ + GBE_ASSERT(data[gatherOffsets[index##INDEX+i] / sizeof(uint16_t)] ==\ + dst[scatterOffsets[index##INDEX+i] / sizeof(uint16_t)]); + for (uint32_t i = 0; i < 32; ++i) { + const int index0 = rand() % 32; + const int index1 = rand() % 16; + const int index2 = rand() % 16; + const int index4 = rand() % 32; + const int index6 = rand() % 16; + const int index7 = rand() % 32; + CHECK_SCATTER_GATHER_OP(0); + CHECK_SCATTER_GATHER_OP(1); + CHECK_SCATTER_GATHER_OP(2); + CHECK_SCATTER_GATHER_OP(4); + CHECK_SCATTER_GATHER_OP(6); + CHECK_SCATTER_GATHER_OP(7); + } +#undef CHECK_SCATTER_GATHER_OP +} + +static void utestScatterGatherUINT8(void) +{ + uint8_t data[64], dst[64]; + uint32_t gatherOffsets[64], scatterOffsets[64]; + simd1dw _0s, _0g, _4s, _4g; + simd1b _0, _4; + simd16dw _1s, _1g, _2s, _2g; + simd16b _1, _2; + simd8dw _6s, _6g, _7s, _7g; + simd8b _6, _7; + + // Create the value and offset arrays + for (uint32_t i = 0; i < 64; ++i) { + data[i] = i; + scatterOffsets[i] = gatherOffsets[i] = i * sizeof(uint8_t); + } + for (uint32_t i = 0; i < 63; ++i) { + const int gatherIndex = rand() % (63-i)+i+1; + const int scatterIndex = rand() % (63-i)+i+1; + std::swap(gatherOffsets[i], gatherOffsets[gatherIndex]); + std::swap(scatterOffsets[i], scatterOffsets[scatterIndex]); + } +#define CHECK_SCATTER_GATHER_OP(INDEX)\ + LOAD(_##INDEX##g, (const char *) (gatherOffsets+index##INDEX));\ + LOAD(_##INDEX##s, (const char *) (scatterOffsets+index##INDEX));\ + GATHER(_##INDEX, _##INDEX##g, (const char *) data);\ + SCATTER(_##INDEX##s, _##INDEX, (char *) dst);\ + for (uint32_t i = 0; i < elemNum(_##INDEX); ++i)\ + GBE_ASSERT(data[gatherOffsets[index##INDEX+i] / sizeof(uint8_t)] ==\ + dst[scatterOffsets[index##INDEX+i] / sizeof(uint8_t)]); + for (uint32_t i = 0; i < 32; ++i) { + const int index0 = rand() % 32; + const int index1 = rand() % 16; + const int index2 = rand() % 16; + const int index4 = rand() % 32; + const int index6 = rand() % 16; + const int index7 = rand() % 32; + CHECK_SCATTER_GATHER_OP(0); + CHECK_SCATTER_GATHER_OP(1); + CHECK_SCATTER_GATHER_OP(2); + CHECK_SCATTER_GATHER_OP(4); + CHECK_SCATTER_GATHER_OP(6); + CHECK_SCATTER_GATHER_OP(7); + } +#undef CHECK_SCATTER_GATHER_OP } static void utestVector(void) @@ -574,12 +859,18 @@ static void utestVector(void) UTEST_EXPECT_SUCCESS(utestUINT32()); UTEST_EXPECT_SUCCESS(utestINT16()); UTEST_EXPECT_SUCCESS(utestUINT16()); + UTEST_EXPECT_SUCCESS(utestINT8()); + UTEST_EXPECT_SUCCESS(utestUINT8()); UTEST_EXPECT_SUCCESS(utestFPCmp()); UTEST_EXPECT_SUCCESS(utestINT32Cmp()); UTEST_EXPECT_SUCCESS(utestUINT32Cmp()); UTEST_EXPECT_SUCCESS(utestINT16Cmp()); UTEST_EXPECT_SUCCESS(utestUINT16Cmp()); - UTEST_EXPECT_SUCCESS(utestScatterGather()); + UTEST_EXPECT_SUCCESS(utestINT8Cmp()); + UTEST_EXPECT_SUCCESS(utestUINT8Cmp()); + UTEST_EXPECT_SUCCESS(utestScatterGatherUINT32()); + UTEST_EXPECT_SUCCESS(utestScatterGatherUINT16()); + UTEST_EXPECT_SUCCESS(utestScatterGatherUINT8()); } UTEST_REGISTER(utestVector) -- 2.7.4
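For reference, a minimal usage sketch of the new byte gather/scatter paths, mirroring what the unit tests above exercise. The include path, the SIMD16 width and all variable names are illustrative and not taken from the patch:

#include <stdint.h>
#include "sim/sim_vector.h"   // illustrative path to the header patched above

static void byteGatherScatterSketch(void)
{
  uint8_t src[64], dst[64];
  uint32_t offsets[16];
  for (uint32_t i = 0; i < 64; ++i) src[i] = uint8_t(i);
  for (uint32_t i = 0; i < 16; ++i) offsets[i] = i * sizeof(uint8_t);

  simd16dw addr;    // 16 dword byte-offsets relative to the base address
  simd16b values;   // 16 byte lanes (each lane stored in its own dword internally)

  LOAD(addr, (const char *) offsets);          // load the 16 offsets
  GATHER(values, addr, (const char *) src);    // one byte read per lane
  SCATTER(addr, values, (char *) dst);         // write them back at the same offsets

  // Masked variant: only lanes whose bit is set in "mask" are touched; the
  // trailing offset argument is what GATHER2/3/4 use to step to the next value.
  uint32_t mask = 0xffff;                      // all 16 lanes active
  MASKED_GATHER(values, addr, (const char *) src, mask, 0);
}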