From 7792e6429c8fca33f8bdfd2cadb2c874b65f450d Mon Sep 17 00:00:00 2001 From: Benjamin Segovia Date: Tue, 10 Apr 2012 02:48:56 -0700 Subject: [PATCH] Completely redesigned the vector API for the simulator. Now it is properly untyped as the Gen IR is --- backend/src/backend/sim/sim_vector.h | 520 ++++++++++++++++------------- backend/src/backend/sim/sim_vector_str.cpp | 520 ++++++++++++++++------------- backend/src/utest/utest_vector.cpp | 334 +++++++++++++++--- 3 files changed, 874 insertions(+), 500 deletions(-) diff --git a/backend/src/backend/sim/sim_vector.h b/backend/src/backend/sim/sim_vector.h index 44cfdd5..1d09c98 100644 --- a/backend/src/backend/sim/sim_vector.h +++ b/backend/src/backend/sim/sim_vector.h @@ -35,309 +35,373 @@ #include #define INLINE inline __attribute__((always_inline)) +#define ID(X) (X) +#define PS2SI(X) _mm_castps_si128(X) +#define SI2PS(X) _mm_castsi128_ps(X) -/*! Base structure for 1 / 4 / 8 / 16 / 32 floats */ -template -struct genf { __m128 m[vectorNum]; }; -/*! Base structure for 1 / 4 / 8 / 16 / 32 integers */ -template -struct geni { __m128i m[vectorNum]; }; -/*! Base structure for 1 / 4 / 8 / 16 / 32 booleans (m stands for "mask") */ -template -struct genm { __m128 m[vectorNum]; }; - -/*! To cast through memory */ -union CastType { - INLINE CastType(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) { - u[0] = u0; u[1] = u1; u[2] = u2; u[3] = u3; - } - INLINE CastType(float f0, float f1, float f2, float f3) { - f[0] = f0; f[1] = f1; f[2] = f2; f[3] = f3; - } - __m128 v; - __m128i vi; - uint32_t u[4]; - float f[4]; -}; - -typedef genf<1,true> genf1; // contains 3 clobbered values -typedef genf<1,false> genf4; -typedef genf<2,false> genf8; -typedef genf<4,false> genf16; -typedef genf<8,false> genf32; -typedef geni<1,true> geni1; // contains 3 clobbered values -typedef geni<1,false> geni4; -typedef geni<2,false> geni8; -typedef geni<4,false> geni16; -typedef geni<8,false> geni32; -typedef genm<1,true> genm1; // contains 3 clobbered values -typedef genm<1,false> genm4; -typedef genm<2,false> genm8; -typedef genm<4,false> genm16; -typedef genm<8,false> genm32; - -static INLINE uint32_t elemNum(genf1 x) { return 1; } -static INLINE uint32_t elemNum(genf4 x) { return 4; } -static INLINE uint32_t elemNum(genf8 x) { return 8; } -static INLINE uint32_t elemNum(genf16 x) { return 16; } -static INLINE uint32_t elemNum(genf32 x) { return 32; } -static INLINE uint32_t elemNum(geni1 x) { return 1; } -static INLINE uint32_t elemNum(geni4 x) { return 4; } -static INLINE uint32_t elemNum(geni8 x) { return 8; } -static INLINE uint32_t elemNum(geni16 x) { return 16; } -static INLINE uint32_t elemNum(geni32 x) { return 32; } - +/* Some extra SSE functions */ template INLINE const __m128 shuffle(const __m128& b) { return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0))); } - template INLINE __m128 expand(const __m128& b) { return shuffle(b); } - template INLINE const __m128i shuffle(const __m128i& a) { return _mm_shuffle_epi32(a, _MM_SHUFFLE(index_3, index_2, index_1, index_0)); } - template INLINE const __m128i expand(const __m128i& b) { return shuffle(b); } +/*! Base structure for scalar double word */ +union scalar_dw { uint32_t u; int32_t s; float f; }; + +/*! Base structure for scalar mask */ +union scalar_m { uint32_t u; int32_t s; float f; }; + +/*! 
Base structure for vectors 4 / 8 / 16 / 32 double words */ +template +struct simd_dw { + INLINE simd_dw(void) {} + INLINE simd_dw(const scalar_dw &s) { + for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&s.f); + } + simd_dw &operator= (const scalar_dw &s) { + for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&s.f); + return *this; + } + __m128 m[vectorNum]; +}; + +/*! Base structure for 4 / 8 / 16 / 32 booleans (m stands for "mask") */ +template +struct simd_m { + INLINE simd_m(void) {} + INLINE simd_m(scalar_m s) { + for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&s.f); + } + __m128 m[vectorNum]; +}; + +/*! To cast through memory */ +union cast_dw { + INLINE cast_dw(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) { + u[0] = u0; u[1] = u1; u[2] = u2; u[3] = u3; + } + INLINE cast_dw(int32_t s0, int32_t s1, int32_t s2, int32_t s3) { + s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; + } + INLINE cast_dw(float f0, float f1, float f2, float f3) { + f[0] = f0; f[1] = f1; f[2] = f2; f[3] = f3; + } + INLINE cast_dw(const __m128 &v) : v(v) {} + INLINE cast_dw(const __m128i &vi) : vi(vi) {} + INLINE cast_dw(void) {} + __m128 v; + __m128i vi; + uint32_t u[4]; + int32_t s[4]; + float f[4]; +}; +static const cast_dw alltrue(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff); + +/* Some convenient typedefs */ +typedef scalar_dw simd1dw; +typedef simd_dw<1> simd4dw; +typedef simd_dw<2> simd8dw; +typedef simd_dw<4> simd16dw; +typedef simd_dw<8> simd32dw; +typedef scalar_m simd1m; +typedef simd_m<1> simd4m; +typedef simd_m<2> simd8m; +typedef simd_m<4> simd16m; +typedef simd_m<8> simd32m; + +////////////////////////////////////////////////////////////////////////////// +// Vector instructions +////////////////////////////////////////////////////////////////////////////// +/* Simple function to get the number of element per vector */ +template +INLINE uint32_t elemNum(const simd_dw &x) { + return 4 * vectorNum; +} +template +INLINE uint32_t elemNum(const simd_m &x) { + return 4 * vectorNum; +} + /* Build an integer mask from the mask vectors */ template -INLINE uint32_t mask(const genm v) { +INLINE uint32_t mask(const simd_m v) { uint32_t m = _mm_movemask_ps(v.m[0]); for (uint32_t i = 1; i < vectorNum; ++i) - m |= _mm_movemask_ps(v.m[i]) << (4*i); + m |= (_mm_movemask_ps(v.m[i]) << (4*i)); return m; } -INLINE uint32_t mask(const genm1 &v) { return _mm_movemask_ps(v.m[0]) & 1; } -#define ID(X) X +/* Vector instructions that use sse* */ #define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\ template \ -INLINE void NAME(DST_TYPE &dst,\ - const SRC_TYPE &v0,\ - const SRC_TYPE &v1) {\ +INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\ for (uint32_t i = 0; i < vectorNum; ++i)\ dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(v1.m[i])));\ }\ template \ -INLINE void NAME(DST_TYPE &dst,\ - const SRC_TYPE &v0,\ - const SRC_TYPE##1 &v1) {\ - for (uint32_t i = 0; i < vectorNum; ++i)\ - dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(expand<0>(v1.m[0]))));\ +INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\ + NAME(dst, v0, simd_dw(v1));\ }\ template \ -INLINE void NAME(DST_TYPE &dst,\ - const SRC_TYPE##1 &v0,\ - const SRC_TYPE &v1) {\ - for (uint32_t i = 0; i < vectorNum; ++i)\ - dst.m[i] = FN(INTRINSIC_NAME(FN0(expand<0>(v0.m[0])), FN1(v1.m[i])));\ +INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\ + NAME(dst, simd_dw(v0), v1);\ } - -VEC_OP(genf, genf, ADD, _mm_add_ps, ID, ID, ID); -VEC_OP(genf, 
genf, SUB, _mm_sub_ps, ID, ID, ID); -VEC_OP(genf, genf, MUL, _mm_mul_ps, ID, ID, ID); -VEC_OP(genf, genf, DIV, _mm_div_ps, ID, ID, ID); -VEC_OP(genm, genf, EQ, _mm_cmpeq_ps, ID, ID, ID); -VEC_OP(genm, genf, NE, _mm_cmpneq_ps, ID, ID, ID); -VEC_OP(genm, genf, LT, _mm_cmplt_ps, ID, ID, ID); -VEC_OP(genm, genf, LE, _mm_cmple_ps, ID, ID, ID); -VEC_OP(genm, genf, GT, _mm_cmpgt_ps, ID, ID, ID); -VEC_OP(genm, genf, GE, _mm_cmpge_ps, ID, ID, ID); -VEC_OP(geni, geni, ADD, _mm_add_epi32, ID, ID, ID); -VEC_OP(geni, geni, SUB, _mm_sub_epi32, ID, ID, ID); -VEC_OP(genm, geni, EQ, _mm_cmpeq_epi32, ID, ID, ID); -VEC_OP(genm, geni, SLT, _mm_cmplt_epi32, ID, ID, ID); -VEC_OP(genm, geni, SGT, _mm_cmpgt_epi32, ID, ID, ID); -VEC_OP(geni, geni, OR, _mm_or_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps); -VEC_OP(geni, geni, XOR, _mm_xor_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps); -VEC_OP(geni, geni, AND, _mm_and_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps); -VEC_OP(genm, genf, SLT, _mm_cmplt_ps, ID, ID, ID); -VEC_OP(genm, genf, SLE, _mm_cmple_ps, ID, ID, ID); -VEC_OP(genm, genf, SGT, _mm_cmpgt_ps, ID, ID, ID); -VEC_OP(genm, genf, SGE, _mm_cmpge_ps, ID, ID, ID); - +VEC_OP(simd_dw, simd_dw, ADD_F, _mm_add_ps, ID, ID, ID); +VEC_OP(simd_dw, simd_dw, SUB_F, _mm_sub_ps, ID, ID, ID); +VEC_OP(simd_dw, simd_dw, MUL_F, _mm_mul_ps, ID, ID, ID); +VEC_OP(simd_dw, simd_dw, DIV_F, _mm_div_ps, ID, ID, ID); +VEC_OP(simd_m, simd_dw, EQ_F, _mm_cmpeq_ps, ID, ID, ID); +VEC_OP(simd_m, simd_dw, NE_F, _mm_cmpneq_ps, ID, ID, ID); +VEC_OP(simd_m, simd_dw, LT_F, _mm_cmplt_ps, ID, ID, ID); +VEC_OP(simd_m, simd_dw, LE_F, _mm_cmple_ps, ID, ID, ID); +VEC_OP(simd_m, simd_dw, GT_F, _mm_cmpgt_ps, ID, ID, ID); +VEC_OP(simd_m, simd_dw, GE_F, _mm_cmpge_ps, ID, ID, ID); +VEC_OP(simd_dw, simd_dw, ADD_S32, _mm_add_epi32, SI2PS, PS2SI, PS2SI); +VEC_OP(simd_dw, simd_dw, SUB_S32, _mm_sub_epi32, SI2PS, PS2SI, PS2SI); +VEC_OP(simd_m, simd_dw, EQ_S32, _mm_cmpeq_epi32, SI2PS, PS2SI, PS2SI); +VEC_OP(simd_m, simd_dw, LT_S32, _mm_cmplt_epi32, SI2PS, PS2SI, PS2SI); +VEC_OP(simd_m, simd_dw, GT_S32, _mm_cmpgt_epi32, SI2PS, PS2SI, PS2SI); +VEC_OP(simd_dw, simd_dw, OR_S32, _mm_or_ps, ID, ID, ID); +VEC_OP(simd_dw, simd_dw, XOR_S32, _mm_xor_ps, ID, ID, ID); +VEC_OP(simd_dw, simd_dw, AND_S32, _mm_and_ps, ID, ID, ID); #undef VEC_OP -#define ICMP_VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\ +/* Vector integer operations that we can get by switching argument order */ +#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\ template \ -INLINE void NAME(DST_TYPE &dst,\ - const SRC_TYPE &v0,\ - const SRC_TYPE &v1) {\ +INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\ for (uint32_t i = 0; i < vectorNum; ++i)\ - dst.m[i] = FN(INTRINSIC_NAME(FN1(v1.m[i]), FN0(v0.m[i])));\ + dst.m[i] = _mm_xor_ps(FN(INTRINSIC_NAME(FN1(v0.m[i]), FN0(v1.m[i]))), alltrue.v);\ }\ template \ -INLINE void NAME(DST_TYPE &dst,\ - const SRC_TYPE &v0,\ - const SRC_TYPE##1 &v1) {\ - for (uint32_t i = 0; i < vectorNum; ++i)\ - dst.m[i] = FN(INTRINSIC_NAME(FN1(expand<0>(v1.m[0])), FN0(v0.m[i])));\ +INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\ + NAME(dst, v0, simd_dw(v1));\ }\ template \ -INLINE void NAME(DST_TYPE &dst,\ - const SRC_TYPE##1 &v0,\ - const SRC_TYPE &v1) {\ - for (uint32_t i = 0; i < vectorNum; ++i)\ - dst.m[i] = FN(INTRINSIC_NAME(FN1(v1.m[i]), FN0(expand<0>(v0.m[0]))));\ +INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\ + NAME(dst, 
simd_dw(v0), v1);\ +} +VEC_OP(simd_m, simd_dw, GE_S32, _mm_cmplt_epi32, SI2PS, PS2SI, PS2SI); +VEC_OP(simd_m, simd_dw, LE_S32, _mm_cmpgt_epi32, SI2PS, PS2SI, PS2SI); +#undef VEC_OP + +/* Vector binary integer operations that require C */ +#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, OP, FIELD)\ +template \ +INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\ + for (uint32_t i = 0; i < vectorNum; ++i) {\ + cast_dw c0(v0.m[i]), c1(v1.m[i]), d;\ + for (uint32_t j = 0; j < 4; ++j)\ + d.FIELD[j] = c0.FIELD[j] OP c1.FIELD[j];\ + dst.m[i] = d.v;\ + }\ +}\ +template \ +INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\ + NAME(dst, v0, simd_dw(v1));\ +}\ +template \ +INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\ + NAME(dst, simd_dw(v0), v1);\ } -ICMP_VEC_OP(genm, geni, SGE, _mm_cmplt_epi32, ID, ID, ID); -ICMP_VEC_OP(genm, geni, SLE, _mm_cmpgt_epi32, ID, ID, ID); -#undef ICMP_VEC_OP +VEC_OP(simd_dw, simd_dw, MUL_S32, *, s); +VEC_OP(simd_dw, simd_dw, DIV_S32, /, s); +VEC_OP(simd_dw, simd_dw, REM_S32, %, s); +VEC_OP(simd_dw, simd_dw, MUL_U32, *, u); +VEC_OP(simd_dw, simd_dw, DIV_U32, /, u); +VEC_OP(simd_dw, simd_dw, REM_U32, %, u); +#undef VEC_OP -static const CastType alltrue(0xffffffff,0xffffffff,0xffffffff,0xffffffff); +/* Vector compare vectors that require C */ +#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, OP, FIELD)\ +template \ +INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\ + for (uint32_t i = 0; i < vectorNum; ++i) {\ + cast_dw c0(v0.m[i]), c1(v1.m[i]), d;\ + for (uint32_t j = 0; j < 4; ++j)\ + d.u[j] = (c0.FIELD[j] OP c1.FIELD[j]) ? ~0u : 0u;\ + dst.m[i] = d.v;\ + }\ +}\ +template \ +INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\ + for (uint32_t i = 0; i < vectorNum; ++i) {\ + cast_dw c0(v0.m[i]), d;\ + for (uint32_t j = 0; j < 4; ++j)\ + d.u[j] = (c0.FIELD[j] OP v1.FIELD) ? ~0u : 0u;\ + dst.m[i] = d.v;\ + }\ +}\ +template \ +INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\ + for (uint32_t i = 0; i < vectorNum; ++i) {\ + cast_dw c1(v1.m[i]), d;\ + for (uint32_t j = 0; j < 4; ++j)\ + d.u[j] = (v0.FIELD OP c1.FIELD[j]) ? 
~0u : 0u;\ + dst.m[i] = d.v;\ + }\ +} +VEC_OP(simd_m, simd_dw, LE_U32, <=, u); +VEC_OP(simd_m, simd_dw, LT_U32, <, u); +VEC_OP(simd_m, simd_dw, GE_U32, >=, u); +VEC_OP(simd_m, simd_dw, GT_U32, >, u); +#undef VEC_OP template -INLINE void NE(genm &dst, const geni &v0, const geni &v1) { - for (uint32_t i = 0; i < vectorNum; ++i) - dst.m[i] = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[i], v1.m[i])),alltrue.v)); +INLINE void NE_S32(simd_m &dst, + const simd_dw &v0, + const scalar_dw &v1) +{ + NE_S32(dst, v0, simd_dw(v1)); } template -INLINE void NE(genm &dst, const geni &v0, const geni1 &v1) { - for (uint32_t i = 0; i < vectorNum; ++i) - dst.m[i] = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[i], v1.m[0])),alltrue.v)); +INLINE void NE_S32(simd_m &dst, + const scalar_dw &v0, + const simd_dw &v1) +{ + NE_S32(dst, simd_dw(v0), v1); } template -INLINE void NE(genm &dst, const geni1 &v0, const geni &v1) { +INLINE void NE_S32(simd_m &dst, + const simd_dw &v0, + const simd_dw &v1) +{ for (uint32_t i = 0; i < vectorNum; ++i) - dst.m[i] = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[0], v1.m[i])),alltrue.v)); + dst.m[i] = _mm_xor_ps(alltrue.v, SI2PS(_mm_cmpeq_epi32(PS2SI(v0.m[i]), PS2SI(v1.m[i])))); } -#define SCALAR_OP(TYPE, NAME, INTRINSIC_NAME)\ -INLINE void NAME(TYPE &dst, const TYPE &v0, const TYPE &v1) {\ - dst.m[0] = INTRINSIC_NAME(v0.m[0], v1.m[0]);\ -} -SCALAR_OP(genf1, ADD, _mm_add_ss); -SCALAR_OP(genf1, SUB, _mm_sub_ss); -SCALAR_OP(genf1, MUL, _mm_mul_ss); -SCALAR_OP(genf1, DIV, _mm_div_ss); -SCALAR_OP(geni1, ADD, _mm_add_epi32); -SCALAR_OP(geni1, SUB, _mm_sub_epi32); -#undef SCALAR_OP - -/* load from contiguous floats / integers */ +/* Load from contiguous double words */ template -INLINE void LOAD(genf &dst, const char *ptr) { +INLINE void LOAD(simd_dw &dst, const char *ptr) { for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = _mm_loadu_ps((const float*) ptr + 4*i); } -INLINE void LOAD(genf1 &dst, const char *ptr) { - dst.m[0] = _mm_load_ss((const float*) ptr); -} -template -INLINE void LOAD(geni &dst, const char *ptr) { - for (uint32_t i = 0; i < vectorNum; ++i) - dst.m[i] = _mm_castps_si128(_mm_loadu_ps((const float*) ptr + 4*i)); -} -INLINE void LOAD(geni1 &dst, const char *ptr) { - dst.m[0] = _mm_castps_si128(_mm_load_ss((const float*) ptr)); -} -/* store to contiguous floats / integers */ +/* Store to contiguous double words */ template -INLINE void STORE(const genf &src, char *ptr) { +INLINE void STORE(const simd_dw &src, char *ptr) { for (uint32_t i = 0; i < vectorNum; ++i) _mm_storeu_ps((float*) ptr + 4*i, src.m[i]); } -INLINE void STORE(genf1 &src, char *ptr) { - _mm_store_ss((float*) ptr, src.m[0]); -} -template -INLINE void STORE(const geni &src, char *ptr) { - for (uint32_t i = 0; i < vectorNum; ++i) - _mm_storeu_ps((float*) ptr + 4*i, _mm_castsi128_ps(src.m[i])); -} -INLINE void STORE(const geni1 &src, char *ptr) { - _mm_store_ss((float*) ptr, _mm_castsi128_ps(src.m[0])); -} /* Load immediates */ template -INLINE void LOADI(genf &dst, float f) { +INLINE void LOADI(simd_dw &dst, float f) { for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = _mm_load1_ps(&f); } -INLINE void LOADI(genf1 &dst, float f) { dst.m[0] = _mm_load_ss(&f); } -template -INLINE void LOADI(geni &dst, uint32_t u) { - union { float f; uint32_t u; } cast; - cast.u = u; - for (uint32_t i = 0; i < vectorNum; ++i) - dst.m[i] = _mm_castps_si128(_mm_load1_ps(&cast.f)); -} -INLINE void LOADI(geni1 &dst, uint32_t u) { - union { float f; uint32_t u; } 
cast; - cast.u = u; - dst.m[0] = _mm_castps_si128(_mm_load_ss(&cast.f)); -} /* Scatter */ -#define SCATTER_OP(TYPE, FN)\ -template \ -INLINE void SCATTER(const TYPE &value,\ - const geni &offset,\ - char *base_address) {\ - for (uint32_t i = 0; i < vectorNum; ++i) {\ - const int v0 = _mm_extract_epi32(FN(value.m[i]), 0);\ - const int v1 = _mm_extract_epi32(FN(value.m[i]), 1);\ - const int v2 = _mm_extract_epi32(FN(value.m[i]), 2);\ - const int v3 = _mm_extract_epi32(FN(value.m[i]), 3);\ - const int o0 = _mm_extract_epi32(offset.m[i], 0);\ - const int o1 = _mm_extract_epi32(offset.m[i], 1);\ - const int o2 = _mm_extract_epi32(offset.m[i], 2);\ - const int o3 = _mm_extract_epi32(offset.m[i], 3);\ - *(int*)(base_address + o0) = v0;\ - *(int*)(base_address + o1) = v1;\ - *(int*)(base_address + o2) = v2;\ - *(int*)(base_address + o3) = v3;\ - }\ -}\ -INLINE void SCATTER(const TYPE##1 &value, const geni1 &offset, char *base_address) {\ - const int v0 = _mm_extract_epi32(FN(value.m[0]), 0);\ - const int o0 = _mm_extract_epi32(offset.m[0], 0);\ - *(int*)(base_address + o0) = v0;\ +template +INLINE void SCATTER(const simd_dw &value, + const simd_dw &offset, + char *base_address) { + for (uint32_t i = 0; i < vectorNum; ++i) { + const int v0 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 0); + const int v1 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 1); + const int v2 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 2); + const int v3 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 3); + const int o0 = _mm_extract_epi32(offset.m[i], 0); + const int o1 = _mm_extract_epi32(offset.m[i], 1); + const int o2 = _mm_extract_epi32(offset.m[i], 2); + const int o3 = _mm_extract_epi32(offset.m[i], 3); + *(int*)(base_address + o0) = v0; + *(int*)(base_address + o1) = v1; + *(int*)(base_address + o2) = v2; + *(int*)(base_address + o3) = v3; + } } -SCATTER_OP(genf, _mm_castps_si128) -SCATTER_OP(geni, ID) -#undef SCATTER_OP /* Gather */ -#define GATHER_OP(TYPE, FN)\ -template \ -INLINE void GATHER(TYPE &dst,\ - const geni &offset,\ - char *base_address) {\ - for (uint32_t i = 0; i < vectorNum; ++i) {\ - const int o0 = _mm_extract_epi32(offset.m[i], 0);\ - const int o1 = _mm_extract_epi32(offset.m[i], 1);\ - const int o2 = _mm_extract_epi32(offset.m[i], 2);\ - const int o3 = _mm_extract_epi32(offset.m[i], 3);\ - const int v0 = *(int*)(base_address + o0);\ - const int v1 = *(int*)(base_address + o1);\ - const int v2 = *(int*)(base_address + o2);\ - const int v3 = *(int*)(base_address + o3);\ - _mm_insert_epi32(FN(dst.m[i]), v0, 0);\ - _mm_insert_epi32(FN(dst.m[i]), v1, 1);\ - _mm_insert_epi32(FN(dst.m[i]), v2, 2);\ - _mm_insert_epi32(FN(dst.m[i]), v3, 3);\ - }\ -}\ -INLINE void GATHER(TYPE##1 &dst, const geni1 &offset, char *base_address) {\ - const int o0 = _mm_extract_epi32(offset.m[0], 0);\ - const int v0 = *(int*)(base_address + o0);\ - _mm_insert_epi32(FN(dst.m[0]), v0, 0);\ +template +INLINE void GATHER(simd_dw &dst, + const simd_dw &offset, + char *base_address) { + for (uint32_t i = 0; i < vectorNum; ++i) { + const int o0 = _mm_extract_epi32(offset.m[i], 0); + const int o1 = _mm_extract_epi32(offset.m[i], 1); + const int o2 = _mm_extract_epi32(offset.m[i], 2); + const int o3 = _mm_extract_epi32(offset.m[i], 3); + const int v0 = *(int*)(base_address + o0); + const int v1 = *(int*)(base_address + o1); + const int v2 = *(int*)(base_address + o2); + const int v3 = *(int*)(base_address + o3); + _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v0, 0); + _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v1, 1); + 
_mm_insert_epi32(_mm_castps_si128(dst.m[i]), v2, 2); + _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v3, 3); + } } -GATHER_OP(genf, _mm_castps_si128) -GATHER_OP(geni, ID) -#undef GATHER_OP +////////////////////////////////////////////////////////////////////////////// +// Scalar instructions +////////////////////////////////////////////////////////////////////////////// +INLINE uint32_t elemNum(const scalar_dw &x) { return 1; } +INLINE uint32_t elemNum(const scalar_m &x) { return 1; } +INLINE uint32_t mask(const scalar_m &v) { return v.u ? 1 : 0; } +INLINE void ADD_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f + v1.f; } +INLINE void SUB_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f - v1.f; } +INLINE void MUL_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f * v1.f; } +INLINE void DIV_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f / v1.f; } +INLINE void EQ_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f == v1.f ? ~0 : 0); } +INLINE void NE_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f != v1.f ? ~0 : 0); } +INLINE void LE_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f <= v1.f ? ~0 : 0); } +INLINE void LT_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f < v1.f ? ~0 : 0); } +INLINE void GE_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f >= v1.f ? ~0 : 0); } +INLINE void GT_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f > v1.f ? ~0 : 0); } +INLINE void ADD_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s + v1.s; } +INLINE void SUB_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s - v1.s; } +INLINE void MUL_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s * v1.s; } +INLINE void DIV_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s / v1.s; } +INLINE void REM_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s % v1.s; } +INLINE void MUL_U32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.u = v0.u * v1.u; } +INLINE void DIV_U32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.u = v0.u / v1.u; } +INLINE void REM_U32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.u = v0.u % v1.u; } +INLINE void EQ_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s == v1.s ? ~0 : 0); } +INLINE void NE_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s != v1.s ? ~0 : 0); } +INLINE void LE_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s <= v1.s ? ~0 : 0); } +INLINE void LT_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s < v1.s ? ~0 : 0); } +INLINE void GE_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s >= v1.s ? ~0 : 0); } +INLINE void GT_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s > v1.s ? ~0 : 0); } +INLINE void XOR_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s ^ v1.s; } +INLINE void OR_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s | v1.s; } +INLINE void AND_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s & v1.s; } +INLINE void LE_U32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.u <= v1.u ? ~0 : 0); } +INLINE void LT_U32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.u < v1.u ? ~0 : 0); } +INLINE void GE_U32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.u >= v1.u ? ~0 : 0); } +INLINE void GT_U32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.u > v1.u ? 
~0 : 0); } +INLINE void LOAD(scalar_dw &dst, const char *ptr) { dst.u = *(const uint32_t *) ptr; } +INLINE void STORE(scalar_dw src, char *ptr) { *(uint32_t *) ptr = src.u; } +INLINE void LOADI(scalar_dw &dst, uint32_t u) { dst.u = u; } +INLINE void SCATTER(scalar_dw value, scalar_dw offset, char *base) { *(uint32_t*)(base + offset.u) = value.u; } +INLINE void GATHER(scalar_dw &dst, scalar_dw offset, char *base) { dst.u = *(uint32_t*)(base + offset.u); } + +////////////////////////////////////////////////////////////////////////////// +// Identical instructions are forwarded +////////////////////////////////////////////////////////////////////////////// + +#define ADD_U32 ADD_S32 +#define SUB_U32 SUB_S32 +#define XOR_U32 XOR_S32 +#define OR_U32 OR_S32 +#define AND_U32 AND_S32 +#define EQ_U32 EQ_S32 +#define NE_U32 NE_S32 + +#undef PS2SI +#undef SI2PS #undef ID #undef INLINE diff --git a/backend/src/backend/sim/sim_vector_str.cpp b/backend/src/backend/sim/sim_vector_str.cpp index e37d7f2..87ff6de 100644 --- a/backend/src/backend/sim/sim_vector_str.cpp +++ b/backend/src/backend/sim/sim_vector_str.cpp @@ -61,309 +61,373 @@ std::string sim_vector_str = "#include \n" "\n" "#define INLINE inline __attribute__((always_inline))\n" +"#define ID(X) (X)\n" +"#define PS2SI(X) _mm_castps_si128(X)\n" +"#define SI2PS(X) _mm_castsi128_ps(X)\n" "\n" -"/*! Base structure for 1 / 4 / 8 / 16 / 32 floats */\n" -"template \n" -"struct genf { __m128 m[vectorNum]; };\n" -"/*! Base structure for 1 / 4 / 8 / 16 / 32 integers */\n" -"template \n" -"struct geni { __m128i m[vectorNum]; };\n" -"/*! Base structure for 1 / 4 / 8 / 16 / 32 booleans (m stands for \"mask\") */\n" -"template \n" -"struct genm { __m128 m[vectorNum]; };\n" -"\n" -"/*! To cast through memory */\n" -"union CastType {\n" -" INLINE CastType(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {\n" -" u[0] = u0; u[1] = u1; u[2] = u2; u[3] = u3;\n" -" }\n" -" INLINE CastType(float f0, float f1, float f2, float f3) {\n" -" f[0] = f0; f[1] = f1; f[2] = f2; f[3] = f3;\n" -" }\n" -" __m128 v;\n" -" __m128i vi;\n" -" uint32_t u[4];\n" -" float f[4];\n" -"};\n" -"\n" -"typedef genf<1,true> genf1; // contains 3 clobbered values\n" -"typedef genf<1,false> genf4;\n" -"typedef genf<2,false> genf8;\n" -"typedef genf<4,false> genf16;\n" -"typedef genf<8,false> genf32;\n" -"typedef geni<1,true> geni1; // contains 3 clobbered values\n" -"typedef geni<1,false> geni4;\n" -"typedef geni<2,false> geni8;\n" -"typedef geni<4,false> geni16;\n" -"typedef geni<8,false> geni32;\n" -"typedef genm<1,true> genm1; // contains 3 clobbered values\n" -"typedef genm<1,false> genm4;\n" -"typedef genm<2,false> genm8;\n" -"typedef genm<4,false> genm16;\n" -"typedef genm<8,false> genm32;\n" -"\n" -"static INLINE uint32_t elemNum(genf1 x) { return 1; }\n" -"static INLINE uint32_t elemNum(genf4 x) { return 4; }\n" -"static INLINE uint32_t elemNum(genf8 x) { return 8; }\n" -"static INLINE uint32_t elemNum(genf16 x) { return 16; }\n" -"static INLINE uint32_t elemNum(genf32 x) { return 32; }\n" -"static INLINE uint32_t elemNum(geni1 x) { return 1; }\n" -"static INLINE uint32_t elemNum(geni4 x) { return 4; }\n" -"static INLINE uint32_t elemNum(geni8 x) { return 8; }\n" -"static INLINE uint32_t elemNum(geni16 x) { return 16; }\n" -"static INLINE uint32_t elemNum(geni32 x) { return 32; }\n" -"\n" +"/* Some extra SSE functions */\n" "template\n" "INLINE const __m128 shuffle(const __m128& b) {\n" " return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0)));\n" 
"}\n" -"\n" "template INLINE\n" "__m128 expand(const __m128& b) { \n" " return shuffle(b);\n" "}\n" -"\n" "template\n" "INLINE const __m128i shuffle(const __m128i& a) {\n" " return _mm_shuffle_epi32(a, _MM_SHUFFLE(index_3, index_2, index_1, index_0));\n" "}\n" -"\n" "template\n" "INLINE const __m128i expand(const __m128i& b) {\n" " return shuffle(b);\n" "}\n" "\n" +"/*! Base structure for scalar double word */\n" +"union scalar_dw { uint32_t u; int32_t s; float f; };\n" +"\n" +"/*! Base structure for scalar mask */\n" +"union scalar_m { uint32_t u; int32_t s; float f; };\n" +"\n" +"/*! Base structure for vectors 4 / 8 / 16 / 32 double words */\n" +"template \n" +"struct simd_dw {\n" +" INLINE simd_dw(void) {}\n" +" INLINE simd_dw(const scalar_dw &s) {\n" +" for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&s.f);\n" +" }\n" +" simd_dw &operator= (const scalar_dw &s) {\n" +" for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&s.f);\n" +" return *this;\n" +" }\n" +" __m128 m[vectorNum];\n" +"};\n" +"\n" +"/*! Base structure for 4 / 8 / 16 / 32 booleans (m stands for \"mask\") */\n" +"template \n" +"struct simd_m {\n" +" INLINE simd_m(void) {}\n" +" INLINE simd_m(scalar_m s) {\n" +" for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&s.f);\n" +" }\n" +" __m128 m[vectorNum];\n" +"};\n" +"\n" +"/*! To cast through memory */\n" +"union cast_dw {\n" +" INLINE cast_dw(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {\n" +" u[0] = u0; u[1] = u1; u[2] = u2; u[3] = u3;\n" +" }\n" +" INLINE cast_dw(int32_t s0, int32_t s1, int32_t s2, int32_t s3) {\n" +" s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;\n" +" }\n" +" INLINE cast_dw(float f0, float f1, float f2, float f3) {\n" +" f[0] = f0; f[1] = f1; f[2] = f2; f[3] = f3;\n" +" }\n" +" INLINE cast_dw(const __m128 &v) : v(v) {}\n" +" INLINE cast_dw(const __m128i &vi) : vi(vi) {}\n" +" INLINE cast_dw(void) {}\n" +" __m128 v;\n" +" __m128i vi;\n" +" uint32_t u[4];\n" +" int32_t s[4];\n" +" float f[4];\n" +"};\n" +"static const cast_dw alltrue(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\n" +"\n" +"/* Some convenient typedefs */\n" +"typedef scalar_dw simd1dw;\n" +"typedef simd_dw<1> simd4dw;\n" +"typedef simd_dw<2> simd8dw;\n" +"typedef simd_dw<4> simd16dw;\n" +"typedef simd_dw<8> simd32dw;\n" +"typedef scalar_m simd1m;\n" +"typedef simd_m<1> simd4m;\n" +"typedef simd_m<2> simd8m;\n" +"typedef simd_m<4> simd16m;\n" +"typedef simd_m<8> simd32m;\n" +"\n" +"//////////////////////////////////////////////////////////////////////////////\n" +"// Vector instructions\n" +"//////////////////////////////////////////////////////////////////////////////\n" +"/* Simple function to get the number of element per vector */\n" +"template \n" +"INLINE uint32_t elemNum(const simd_dw &x) {\n" +" return 4 * vectorNum;\n" +"}\n" +"template \n" +"INLINE uint32_t elemNum(const simd_m &x) {\n" +" return 4 * vectorNum;\n" +"}\n" +"\n" "/* Build an integer mask from the mask vectors */\n" "template \n" -"INLINE uint32_t mask(const genm v) {\n" +"INLINE uint32_t mask(const simd_m v) {\n" " uint32_t m = _mm_movemask_ps(v.m[0]);\n" " for (uint32_t i = 1; i < vectorNum; ++i)\n" -" m |= _mm_movemask_ps(v.m[i]) << (4*i);\n" +" m |= (_mm_movemask_ps(v.m[i]) << (4*i));\n" " return m;\n" "}\n" -"INLINE uint32_t mask(const genm1 &v) { return _mm_movemask_ps(v.m[0]) & 1; }\n" "\n" -"#define ID(X) X\n" +"/* Vector instructions that use sse* */\n" "#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\\n" "template \\n" -"INLINE void 
NAME(DST_TYPE &dst,\\n" -" const SRC_TYPE &v0,\\n" -" const SRC_TYPE &v1) {\\n" +"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\\n" " for (uint32_t i = 0; i < vectorNum; ++i)\\n" " dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(v1.m[i])));\\n" "}\\n" "template \\n" -"INLINE void NAME(DST_TYPE &dst,\\n" -" const SRC_TYPE &v0,\\n" -" const SRC_TYPE##1 &v1) {\\n" -" for (uint32_t i = 0; i < vectorNum; ++i)\\n" -" dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(expand<0>(v1.m[0]))));\\n" +"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\\n" +" NAME(dst, v0, simd_dw(v1));\\n" "}\\n" "template \\n" -"INLINE void NAME(DST_TYPE &dst,\\n" -" const SRC_TYPE##1 &v0,\\n" -" const SRC_TYPE &v1) {\\n" -" for (uint32_t i = 0; i < vectorNum; ++i)\\n" -" dst.m[i] = FN(INTRINSIC_NAME(FN0(expand<0>(v0.m[0])), FN1(v1.m[i])));\\n" +"INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\\n" +" NAME(dst, simd_dw(v0), v1);\\n" "}\n" -"\n" -"VEC_OP(genf, genf, ADD, _mm_add_ps, ID, ID, ID);\n" -"VEC_OP(genf, genf, SUB, _mm_sub_ps, ID, ID, ID);\n" -"VEC_OP(genf, genf, MUL, _mm_mul_ps, ID, ID, ID);\n" -"VEC_OP(genf, genf, DIV, _mm_div_ps, ID, ID, ID);\n" -"VEC_OP(genm, genf, EQ, _mm_cmpeq_ps, ID, ID, ID);\n" -"VEC_OP(genm, genf, NE, _mm_cmpneq_ps, ID, ID, ID);\n" -"VEC_OP(genm, genf, LT, _mm_cmplt_ps, ID, ID, ID);\n" -"VEC_OP(genm, genf, LE, _mm_cmple_ps, ID, ID, ID);\n" -"VEC_OP(genm, genf, GT, _mm_cmpgt_ps, ID, ID, ID);\n" -"VEC_OP(genm, genf, GE, _mm_cmpge_ps, ID, ID, ID);\n" -"VEC_OP(geni, geni, ADD, _mm_add_epi32, ID, ID, ID);\n" -"VEC_OP(geni, geni, SUB, _mm_sub_epi32, ID, ID, ID);\n" -"VEC_OP(genm, geni, EQ, _mm_cmpeq_epi32, ID, ID, ID);\n" -"VEC_OP(genm, geni, SLT, _mm_cmplt_epi32, ID, ID, ID);\n" -"VEC_OP(genm, geni, SGT, _mm_cmpgt_epi32, ID, ID, ID);\n" -"VEC_OP(geni, geni, OR, _mm_or_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps);\n" -"VEC_OP(geni, geni, XOR, _mm_xor_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps);\n" -"VEC_OP(geni, geni, AND, _mm_and_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps);\n" -"VEC_OP(genm, genf, SLT, _mm_cmplt_ps, ID, ID, ID);\n" -"VEC_OP(genm, genf, SLE, _mm_cmple_ps, ID, ID, ID);\n" -"VEC_OP(genm, genf, SGT, _mm_cmpgt_ps, ID, ID, ID);\n" -"VEC_OP(genm, genf, SGE, _mm_cmpge_ps, ID, ID, ID);\n" -"\n" +"VEC_OP(simd_dw, simd_dw, ADD_F, _mm_add_ps, ID, ID, ID);\n" +"VEC_OP(simd_dw, simd_dw, SUB_F, _mm_sub_ps, ID, ID, ID);\n" +"VEC_OP(simd_dw, simd_dw, MUL_F, _mm_mul_ps, ID, ID, ID);\n" +"VEC_OP(simd_dw, simd_dw, DIV_F, _mm_div_ps, ID, ID, ID);\n" +"VEC_OP(simd_m, simd_dw, EQ_F, _mm_cmpeq_ps, ID, ID, ID);\n" +"VEC_OP(simd_m, simd_dw, NE_F, _mm_cmpneq_ps, ID, ID, ID);\n" +"VEC_OP(simd_m, simd_dw, LT_F, _mm_cmplt_ps, ID, ID, ID);\n" +"VEC_OP(simd_m, simd_dw, LE_F, _mm_cmple_ps, ID, ID, ID);\n" +"VEC_OP(simd_m, simd_dw, GT_F, _mm_cmpgt_ps, ID, ID, ID);\n" +"VEC_OP(simd_m, simd_dw, GE_F, _mm_cmpge_ps, ID, ID, ID);\n" +"VEC_OP(simd_dw, simd_dw, ADD_S32, _mm_add_epi32, SI2PS, PS2SI, PS2SI);\n" +"VEC_OP(simd_dw, simd_dw, SUB_S32, _mm_sub_epi32, SI2PS, PS2SI, PS2SI);\n" +"VEC_OP(simd_m, simd_dw, EQ_S32, _mm_cmpeq_epi32, SI2PS, PS2SI, PS2SI);\n" +"VEC_OP(simd_m, simd_dw, LT_S32, _mm_cmplt_epi32, SI2PS, PS2SI, PS2SI);\n" +"VEC_OP(simd_m, simd_dw, GT_S32, _mm_cmpgt_epi32, SI2PS, PS2SI, PS2SI);\n" +"VEC_OP(simd_dw, simd_dw, OR_S32, _mm_or_ps, ID, ID, ID);\n" +"VEC_OP(simd_dw, simd_dw, XOR_S32, _mm_xor_ps, ID, ID, ID);\n" +"VEC_OP(simd_dw, simd_dw, AND_S32, _mm_and_ps, ID, ID, ID);\n" 
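+"/* A minimal usage sketch (illustration only; x, y, z and s are assumed\n"
+"   locals, not part of the generated IR mapping):\n"
+"     simd16dw x, y, z;\n"
+"     scalar_dw s; s.f = 1.f;\n"
+"     LOADI(x, 2.f);        // broadcast 2.f into the 16 lanes of x\n"
+"     LOADI(y, 3.f);        // broadcast 3.f into the 16 lanes of y\n"
+"     ADD_F(z, x, y);       // every lane of z now holds 5.f\n"
+"     ADD_F(z, z, s);       // a scalar_dw operand is broadcast first\n"
+"     AND_S32(z, x, y);     // same untyped registers reused as integer bits\n"
+"   All functions generated above follow this dst / src0 / src1 convention. */\n"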
"#undef VEC_OP\n" "\n" -"#define ICMP_VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\\n" +"/* Vector integer operations that we can get by switching argument order */\n" +"#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\\n" "template \\n" -"INLINE void NAME(DST_TYPE &dst,\\n" -" const SRC_TYPE &v0,\\n" -" const SRC_TYPE &v1) {\\n" +"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\\n" " for (uint32_t i = 0; i < vectorNum; ++i)\\n" -" dst.m[i] = FN(INTRINSIC_NAME(FN1(v1.m[i]), FN0(v0.m[i])));\\n" +" dst.m[i] = _mm_xor_ps(FN(INTRINSIC_NAME(FN1(v0.m[i]), FN0(v1.m[i]))), alltrue.v);\\n" "}\\n" "template \\n" -"INLINE void NAME(DST_TYPE &dst,\\n" -" const SRC_TYPE &v0,\\n" -" const SRC_TYPE##1 &v1) {\\n" -" for (uint32_t i = 0; i < vectorNum; ++i)\\n" -" dst.m[i] = FN(INTRINSIC_NAME(FN1(expand<0>(v1.m[0])), FN0(v0.m[i])));\\n" +"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\\n" +" NAME(dst, v0, simd_dw(v1));\\n" "}\\n" "template \\n" -"INLINE void NAME(DST_TYPE &dst,\\n" -" const SRC_TYPE##1 &v0,\\n" -" const SRC_TYPE &v1) {\\n" -" for (uint32_t i = 0; i < vectorNum; ++i)\\n" -" dst.m[i] = FN(INTRINSIC_NAME(FN1(v1.m[i]), FN0(expand<0>(v0.m[0]))));\\n" +"INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\\n" +" NAME(dst, simd_dw(v0), v1);\\n" +"}\n" +"VEC_OP(simd_m, simd_dw, GE_S32, _mm_cmplt_epi32, SI2PS, PS2SI, PS2SI);\n" +"VEC_OP(simd_m, simd_dw, LE_S32, _mm_cmpgt_epi32, SI2PS, PS2SI, PS2SI);\n" +"#undef VEC_OP\n" +"\n" +"/* Vector binary integer operations that require C */\n" +"#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, OP, FIELD)\\n" +"template \\n" +"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\\n" +" cast_dw c0(v0.m[i]), c1(v1.m[i]), d;\\n" +" for (uint32_t j = 0; j < 4; ++j)\\n" +" d.FIELD[j] = c0.FIELD[j] OP c1.FIELD[j];\\n" +" dst.m[i] = d.v;\\n" +" }\\n" +"}\\n" +"template \\n" +"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\\n" +" NAME(dst, v0, simd_dw(v1));\\n" +"}\\n" +"template \\n" +"INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\\n" +" NAME(dst, simd_dw(v0), v1);\\n" "}\n" -"ICMP_VEC_OP(genm, geni, SGE, _mm_cmplt_epi32, ID, ID, ID);\n" -"ICMP_VEC_OP(genm, geni, SLE, _mm_cmpgt_epi32, ID, ID, ID);\n" -"#undef ICMP_VEC_OP\n" +"VEC_OP(simd_dw, simd_dw, MUL_S32, *, s);\n" +"VEC_OP(simd_dw, simd_dw, DIV_S32, /, s);\n" +"VEC_OP(simd_dw, simd_dw, REM_S32, %, s);\n" +"VEC_OP(simd_dw, simd_dw, MUL_U32, *, u);\n" +"VEC_OP(simd_dw, simd_dw, DIV_U32, /, u);\n" +"VEC_OP(simd_dw, simd_dw, REM_U32, %, u);\n" +"#undef VEC_OP\n" "\n" -"static const CastType alltrue(0xffffffff,0xffffffff,0xffffffff,0xffffffff);\n" +"/* Vector compare vectors that require C */\n" +"#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, OP, FIELD)\\n" +"template \\n" +"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\\n" +" cast_dw c0(v0.m[i]), c1(v1.m[i]), d;\\n" +" for (uint32_t j = 0; j < 4; ++j)\\n" +" d.u[j] = (c0.FIELD[j] OP c1.FIELD[j]) ? ~0u : 0u;\\n" +" dst.m[i] = d.v;\\n" +" }\\n" +"}\\n" +"template \\n" +"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\\n" +" cast_dw c0(v0.m[i]), d;\\n" +" for (uint32_t j = 0; j < 4; ++j)\\n" +" d.u[j] = (c0.FIELD[j] OP v1.FIELD) ? 
~0u : 0u;\\n" +" dst.m[i] = d.v;\\n" +" }\\n" +"}\\n" +"template \\n" +"INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\\n" +" cast_dw c1(v1.m[i]), d;\\n" +" for (uint32_t j = 0; j < 4; ++j)\\n" +" d.u[j] = (v0.FIELD OP c1.FIELD[j]) ? ~0u : 0u;\\n" +" dst.m[i] = d.v;\\n" +" }\\n" +"}\n" +"VEC_OP(simd_m, simd_dw, LE_U32, <=, u);\n" +"VEC_OP(simd_m, simd_dw, LT_U32, <, u);\n" +"VEC_OP(simd_m, simd_dw, GE_U32, >=, u);\n" +"VEC_OP(simd_m, simd_dw, GT_U32, >, u);\n" +"#undef VEC_OP\n" "\n" "template \n" -"INLINE void NE(genm &dst, const geni &v0, const geni &v1) {\n" -" for (uint32_t i = 0; i < vectorNum; ++i)\n" -" dst.m[i] = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[i], v1.m[i])),alltrue.v));\n" +"INLINE void NE_S32(simd_m &dst,\n" +" const simd_dw &v0,\n" +" const scalar_dw &v1)\n" +"{\n" +" NE_S32(dst, v0, simd_dw(v1));\n" "}\n" "template \n" -"INLINE void NE(genm &dst, const geni &v0, const geni1 &v1) {\n" -" for (uint32_t i = 0; i < vectorNum; ++i)\n" -" dst.m[i] = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[i], v1.m[0])),alltrue.v));\n" +"INLINE void NE_S32(simd_m &dst,\n" +" const scalar_dw &v0,\n" +" const simd_dw &v1)\n" +"{\n" +" NE_S32(dst, simd_dw(v0), v1);\n" "}\n" "template \n" -"INLINE void NE(genm &dst, const geni1 &v0, const geni &v1) {\n" +"INLINE void NE_S32(simd_m &dst,\n" +" const simd_dw &v0,\n" +" const simd_dw &v1)\n" +"{\n" " for (uint32_t i = 0; i < vectorNum; ++i)\n" -" dst.m[i] = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[0], v1.m[i])),alltrue.v));\n" +" dst.m[i] = _mm_xor_ps(alltrue.v, SI2PS(_mm_cmpeq_epi32(PS2SI(v0.m[i]), PS2SI(v1.m[i]))));\n" "}\n" "\n" -"#define SCALAR_OP(TYPE, NAME, INTRINSIC_NAME)\\n" -"INLINE void NAME(TYPE &dst, const TYPE &v0, const TYPE &v1) {\\n" -" dst.m[0] = INTRINSIC_NAME(v0.m[0], v1.m[0]);\\n" -"}\n" -"SCALAR_OP(genf1, ADD, _mm_add_ss);\n" -"SCALAR_OP(genf1, SUB, _mm_sub_ss);\n" -"SCALAR_OP(genf1, MUL, _mm_mul_ss);\n" -"SCALAR_OP(genf1, DIV, _mm_div_ss);\n" -"SCALAR_OP(geni1, ADD, _mm_add_epi32);\n" -"SCALAR_OP(geni1, SUB, _mm_sub_epi32);\n" -"#undef SCALAR_OP\n" -"\n" -"/* load from contiguous floats / integers */\n" +"/* Load from contiguous double words */\n" "template \n" -"INLINE void LOAD(genf &dst, const char *ptr) {\n" +"INLINE void LOAD(simd_dw &dst, const char *ptr) {\n" " for (uint32_t i = 0; i < vectorNum; ++i)\n" " dst.m[i] = _mm_loadu_ps((const float*) ptr + 4*i);\n" "}\n" -"INLINE void LOAD(genf1 &dst, const char *ptr) {\n" -" dst.m[0] = _mm_load_ss((const float*) ptr);\n" -"}\n" -"template \n" -"INLINE void LOAD(geni &dst, const char *ptr) {\n" -" for (uint32_t i = 0; i < vectorNum; ++i)\n" -" dst.m[i] = _mm_castps_si128(_mm_loadu_ps((const float*) ptr + 4*i));\n" -"}\n" -"INLINE void LOAD(geni1 &dst, const char *ptr) {\n" -" dst.m[0] = _mm_castps_si128(_mm_load_ss((const float*) ptr));\n" -"}\n" "\n" -"/* store to contiguous floats / integers */\n" +"/* Store to contiguous double words */\n" "template \n" -"INLINE void STORE(const genf &src, char *ptr) {\n" +"INLINE void STORE(const simd_dw &src, char *ptr) {\n" " for (uint32_t i = 0; i < vectorNum; ++i)\n" " _mm_storeu_ps((float*) ptr + 4*i, src.m[i]);\n" "}\n" -"INLINE void STORE(genf1 &src, char *ptr) {\n" -" _mm_store_ss((float*) ptr, src.m[0]);\n" -"}\n" -"template \n" -"INLINE void STORE(const geni &src, char *ptr) {\n" -" for (uint32_t i = 0; i < vectorNum; ++i)\n" -" _mm_storeu_ps((float*) ptr + 4*i, 
_mm_castsi128_ps(src.m[i]));\n" -"}\n" -"INLINE void STORE(const geni1 &src, char *ptr) {\n" -" _mm_store_ss((float*) ptr, _mm_castsi128_ps(src.m[0]));\n" -"}\n" "\n" "/* Load immediates */\n" "template \n" -"INLINE void LOADI(genf &dst, float f) {\n" +"INLINE void LOADI(simd_dw &dst, float f) {\n" " for (uint32_t i = 0; i < vectorNum; ++i)\n" " dst.m[i] = _mm_load1_ps(&f);\n" "}\n" -"INLINE void LOADI(genf1 &dst, float f) { dst.m[0] = _mm_load_ss(&f); }\n" -"template \n" -"INLINE void LOADI(geni &dst, uint32_t u) {\n" -" union { float f; uint32_t u; } cast;\n" -" cast.u = u;\n" -" for (uint32_t i = 0; i < vectorNum; ++i)\n" -" dst.m[i] = _mm_castps_si128(_mm_load1_ps(&cast.f));\n" -"}\n" -"INLINE void LOADI(geni1 &dst, uint32_t u) {\n" -" union { float f; uint32_t u; } cast;\n" -" cast.u = u;\n" -" dst.m[0] = _mm_castps_si128(_mm_load_ss(&cast.f));\n" -"}\n" "\n" "/* Scatter */\n" -"#define SCATTER_OP(TYPE, FN)\\n" -"template \\n" -"INLINE void SCATTER(const TYPE &value,\\n" -" const geni &offset,\\n" -" char *base_address) {\\n" -" for (uint32_t i = 0; i < vectorNum; ++i) {\\n" -" const int v0 = _mm_extract_epi32(FN(value.m[i]), 0);\\n" -" const int v1 = _mm_extract_epi32(FN(value.m[i]), 1);\\n" -" const int v2 = _mm_extract_epi32(FN(value.m[i]), 2);\\n" -" const int v3 = _mm_extract_epi32(FN(value.m[i]), 3);\\n" -" const int o0 = _mm_extract_epi32(offset.m[i], 0);\\n" -" const int o1 = _mm_extract_epi32(offset.m[i], 1);\\n" -" const int o2 = _mm_extract_epi32(offset.m[i], 2);\\n" -" const int o3 = _mm_extract_epi32(offset.m[i], 3);\\n" -" *(int*)(base_address + o0) = v0;\\n" -" *(int*)(base_address + o1) = v1;\\n" -" *(int*)(base_address + o2) = v2;\\n" -" *(int*)(base_address + o3) = v3;\\n" -" }\\n" -"}\\n" -"INLINE void SCATTER(const TYPE##1 &value, const geni1 &offset, char *base_address) {\\n" -" const int v0 = _mm_extract_epi32(FN(value.m[0]), 0);\\n" -" const int o0 = _mm_extract_epi32(offset.m[0], 0);\\n" -" *(int*)(base_address + o0) = v0;\\n" +"template \n" +"INLINE void SCATTER(const simd_dw &value,\n" +" const simd_dw &offset,\n" +" char *base_address) {\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\n" +" const int v0 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 0);\n" +" const int v1 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 1);\n" +" const int v2 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 2);\n" +" const int v3 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 3);\n" +" const int o0 = _mm_extract_epi32(offset.m[i], 0);\n" +" const int o1 = _mm_extract_epi32(offset.m[i], 1);\n" +" const int o2 = _mm_extract_epi32(offset.m[i], 2);\n" +" const int o3 = _mm_extract_epi32(offset.m[i], 3);\n" +" *(int*)(base_address + o0) = v0;\n" +" *(int*)(base_address + o1) = v1;\n" +" *(int*)(base_address + o2) = v2;\n" +" *(int*)(base_address + o3) = v3;\n" +" }\n" "}\n" -"SCATTER_OP(genf, _mm_castps_si128)\n" -"SCATTER_OP(geni, ID)\n" -"#undef SCATTER_OP\n" "\n" "/* Gather */\n" -"#define GATHER_OP(TYPE, FN)\\n" -"template \\n" -"INLINE void GATHER(TYPE &dst,\\n" -" const geni &offset,\\n" -" char *base_address) {\\n" -" for (uint32_t i = 0; i < vectorNum; ++i) {\\n" -" const int o0 = _mm_extract_epi32(offset.m[i], 0);\\n" -" const int o1 = _mm_extract_epi32(offset.m[i], 1);\\n" -" const int o2 = _mm_extract_epi32(offset.m[i], 2);\\n" -" const int o3 = _mm_extract_epi32(offset.m[i], 3);\\n" -" const int v0 = *(int*)(base_address + o0);\\n" -" const int v1 = *(int*)(base_address + o1);\\n" -" const int v2 = *(int*)(base_address + o2);\\n" -" const int v3 = 
*(int*)(base_address + o3);\\n" -" _mm_insert_epi32(FN(dst.m[i]), v0, 0);\\n" -" _mm_insert_epi32(FN(dst.m[i]), v1, 1);\\n" -" _mm_insert_epi32(FN(dst.m[i]), v2, 2);\\n" -" _mm_insert_epi32(FN(dst.m[i]), v3, 3);\\n" -" }\\n" -"}\\n" -"INLINE void GATHER(TYPE##1 &dst, const geni1 &offset, char *base_address) {\\n" -" const int o0 = _mm_extract_epi32(offset.m[0], 0);\\n" -" const int v0 = *(int*)(base_address + o0);\\n" -" _mm_insert_epi32(FN(dst.m[0]), v0, 0);\\n" +"template \n" +"INLINE void GATHER(simd_dw &dst,\n" +" const simd_dw &offset,\n" +" char *base_address) {\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\n" +" const int o0 = _mm_extract_epi32(offset.m[i], 0);\n" +" const int o1 = _mm_extract_epi32(offset.m[i], 1);\n" +" const int o2 = _mm_extract_epi32(offset.m[i], 2);\n" +" const int o3 = _mm_extract_epi32(offset.m[i], 3);\n" +" const int v0 = *(int*)(base_address + o0);\n" +" const int v1 = *(int*)(base_address + o1);\n" +" const int v2 = *(int*)(base_address + o2);\n" +" const int v3 = *(int*)(base_address + o3);\n" +" _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v0, 0);\n" +" _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v1, 1);\n" +" _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v2, 2);\n" +" _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v3, 3);\n" +" }\n" "}\n" -"GATHER_OP(genf, _mm_castps_si128)\n" -"GATHER_OP(geni, ID)\n" -"#undef GATHER_OP\n" "\n" +"//////////////////////////////////////////////////////////////////////////////\n" +"// Scalar instructions\n" +"//////////////////////////////////////////////////////////////////////////////\n" +"INLINE uint32_t elemNum(const scalar_dw &x) { return 1; }\n" +"INLINE uint32_t elemNum(const scalar_m &x) { return 1; }\n" +"INLINE uint32_t mask(const scalar_m &v) { return v.u ? 1 : 0; }\n" +"INLINE void ADD_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f + v1.f; }\n" +"INLINE void SUB_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f - v1.f; }\n" +"INLINE void MUL_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f * v1.f; }\n" +"INLINE void DIV_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f / v1.f; }\n" +"INLINE void EQ_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f == v1.f ? ~0 : 0); }\n" +"INLINE void NE_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f != v1.f ? ~0 : 0); }\n" +"INLINE void LE_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f <= v1.f ? ~0 : 0); }\n" +"INLINE void LT_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f < v1.f ? ~0 : 0); }\n" +"INLINE void GE_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f >= v1.f ? ~0 : 0); }\n" +"INLINE void GT_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f > v1.f ? 
~0 : 0); }\n" +"INLINE void ADD_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s + v1.s; }\n" +"INLINE void SUB_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s - v1.s; }\n" +"INLINE void MUL_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s * v1.s; }\n" +"INLINE void DIV_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s / v1.s; }\n" +"INLINE void REM_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s % v1.s; }\n" +"INLINE void MUL_U32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.u = v0.u * v1.u; }\n" +"INLINE void DIV_U32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.u = v0.u / v1.u; }\n" +"INLINE void REM_U32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.u = v0.u % v1.u; }\n" +"INLINE void EQ_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s == v1.s ? ~0 : 0); }\n" +"INLINE void NE_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s != v1.s ? ~0 : 0); }\n" +"INLINE void LE_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s <= v1.s ? ~0 : 0); }\n" +"INLINE void LT_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s < v1.s ? ~0 : 0); }\n" +"INLINE void GE_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s >= v1.s ? ~0 : 0); }\n" +"INLINE void GT_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s > v1.s ? ~0 : 0); }\n" +"INLINE void XOR_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s ^ v1.s; }\n" +"INLINE void OR_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s | v1.s; }\n" +"INLINE void AND_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s & v1.s; }\n" +"INLINE void LE_U32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.u <= v1.u ? ~0 : 0); }\n" +"INLINE void LT_U32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.u < v1.u ? ~0 : 0); }\n" +"INLINE void GE_U32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.u >= v1.u ? ~0 : 0); }\n" +"INLINE void GT_U32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.u > v1.u ? 
~0 : 0); }\n" +"INLINE void LOAD(scalar_dw &dst, const char *ptr) { dst.u = *(const uint32_t *) ptr; }\n" +"INLINE void STORE(scalar_dw src, char *ptr) { *(uint32_t *) ptr = src.u; }\n" +"INLINE void LOADI(scalar_dw &dst, uint32_t u) { dst.u = u; }\n" +"INLINE void SCATTER(scalar_dw value, scalar_dw offset, char *base) { *(uint32_t*)(base + offset.u) = value.u; }\n" +"INLINE void GATHER(scalar_dw &dst, scalar_dw offset, char *base) { dst.u = *(uint32_t*)(base + offset.u); }\n" +"\n" +"//////////////////////////////////////////////////////////////////////////////\n" +"// Identical instructions are forwarded\n" +"//////////////////////////////////////////////////////////////////////////////\n" +"\n" +"#define ADD_U32 ADD_S32\n" +"#define SUB_U32 SUB_S32\n" +"#define XOR_U32 XOR_S32\n" +"#define OR_U32 OR_S32\n" +"#define AND_U32 AND_S32\n" +"#define EQ_U32 EQ_S32\n" +"#define NE_U32 NE_S32\n" +"\n" +"#undef PS2SI\n" +"#undef SI2PS\n" "#undef ID\n" "#undef INLINE\n" "\n" diff --git a/backend/src/utest/utest_vector.cpp b/backend/src/utest/utest_vector.cpp index 78bfcc0..e6db38b 100644 --- a/backend/src/utest/utest_vector.cpp +++ b/backend/src/utest/utest_vector.cpp @@ -20,10 +20,12 @@ #include "backend/sim/sim_vector.h" #include "utest/utest.hpp" + static INLINE bool ok(float x, float y) { return fabs(x-y) / (1.f + std::max(fabs(x), fabs(y))) < 1.e-6; } -static INLINE bool ok(int x, int y) { return x == y; } +static INLINE bool ok(int32_t x, int32_t y) { return x == y; } +static INLINE bool ok(uint32_t x, uint32_t y) { return x == y; } #define CHECK_BINARY_OP(TYPE,FN,OP,DST,SRC0,SRC1,ELEM0,ELEM1)\ do {\ @@ -38,60 +40,304 @@ static INLINE bool ok(int x, int y) { return x == y; } static void utestFP(void) { - genf1 _0, _4, _5; - genf16 _1, _2, _3; - const float data[32] = {1.f,1.f,2.f, 3.f, 4.f, 5.f, 6.f, 7.f, + simd1dw _0, _4, _5; + simd16dw _1, _2, _3; + const float data[32] = {1.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f, 8.f,9.f,10.f,11.f,12.f,13.f,14.f,15.f, 8.f,9.f,10.f,11.f,12.f,13.f,14.f,15.f, - 1.f,1.f,2.f, 3.f, 4.f, 5.f, 6.f, 7.f}; - - LOAD(_0, (const char *) (data+4)); - LOAD(_4, (const char *) (data+5)); - LOAD(_1, (const char *) (data)); - LOAD(_2, (const char *) (data)); - CHECK_BINARY_OP(float,MUL,*,_3,_2,_1,data[i],data[i]); - CHECK_BINARY_OP(float,DIV,/,_3,_2,_1,data[i],data[i]); - CHECK_BINARY_OP(float,ADD,+,_3,_2,_1,data[i],data[i]); - CHECK_BINARY_OP(float,SUB,-,_3,_2,_1,data[i],data[i]); - CHECK_BINARY_OP(float,MUL,*,_3,_2,_0,data[i],data[4]); - CHECK_BINARY_OP(float,DIV,/,_3,_2,_0,data[i],data[4]); - CHECK_BINARY_OP(float,ADD,+,_3,_2,_0,data[i],data[4]); - CHECK_BINARY_OP(float,SUB,-,_3,_2,_0,data[i],data[4]); - CHECK_BINARY_OP(float,MUL,*,_3,_2,_0,data[i],data[4]); - CHECK_BINARY_OP(float,DIV,/,_3,_2,_0,data[i],data[4]); - CHECK_BINARY_OP(float,ADD,+,_3,_2,_0,data[i],data[4]); - CHECK_BINARY_OP(float,SUB,-,_3,_2,_0,data[i],data[4]); - CHECK_BINARY_OP(float,MUL,*,_5,_4,_0,data[5],data[4]); - CHECK_BINARY_OP(float,DIV,/,_5,_4,_0,data[5],data[4]); - CHECK_BINARY_OP(float,ADD,+,_5,_4,_0,data[5],data[4]); - CHECK_BINARY_OP(float,SUB,-,_5,_4,_0,data[5],data[4]); + 1.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f}; + for (uint32_t i = 0; i < 32; ++i) { + const int index0 = rand() % 32; + const int index1 = rand() % 16; + const int index2 = rand() % 16; + const int index4 = rand() % 32; + LOAD(_0, (const char *) (data+index0)); + LOAD(_1, (const char *) (data+index1)); + LOAD(_2, (const char *) (data+index2)); + LOAD(_4, (const char *) (data+index4)); + 
+    CHECK_BINARY_OP(float,MUL_F,*,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(float,DIV_F,/,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(float,ADD_F,+,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(float,SUB_F,-,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(float,MUL_F,*,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(float,DIV_F,/,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(float,ADD_F,+,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(float,SUB_F,-,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(float,MUL_F,*,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(float,DIV_F,/,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(float,ADD_F,+,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(float,SUB_F,-,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(float,MUL_F,*,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(float,DIV_F,/,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(float,ADD_F,+,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(float,SUB_F,-,_5,_4,_0,data[index4],data[index0]);
+  }
+}
+
+static void utestINT32(void)
+{
+  simd1dw _0, _4, _5;
+  simd16dw _1, _2, _3;
+  const int32_t data[32] = {-1,1,-2,-3,4,-5,6,7,-8,9,10,11,12,13,14,15,8,
+                            9,10,11,12,-13,14,-15,-1,1,-2,3,4,5,6,7};
+  for (uint32_t i = 0; i < 32; ++i) {
+    const int index0 = rand() % 32;
+    const int index1 = rand() % 16;
+    const int index2 = rand() % 16;
+    const int index4 = rand() % 32;
+    LOAD(_0, (const char *) (data+index0));
+    LOAD(_1, (const char *) (data+index1));
+    LOAD(_2, (const char *) (data+index2));
+    LOAD(_4, (const char *) (data+index4));
+    CHECK_BINARY_OP(int32_t,ADD_S32,+,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(int32_t,SUB_S32,-,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(int32_t,MUL_S32,*,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(int32_t,DIV_S32,/,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(int32_t,REM_S32,%,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(int32_t,AND_S32,&,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(int32_t,XOR_S32,^,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(int32_t,OR_S32, |,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(int32_t,ADD_S32,+,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(int32_t,SUB_S32,-,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(int32_t,MUL_S32,*,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(int32_t,DIV_S32,/,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(int32_t,REM_S32,%,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(int32_t,AND_S32,&,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(int32_t,XOR_S32,^,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(int32_t,OR_S32, |,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(int32_t,ADD_S32,+,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(int32_t,SUB_S32,-,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(int32_t,MUL_S32,*,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(int32_t,DIV_S32,/,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(int32_t,REM_S32,%,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(int32_t,AND_S32,&,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(int32_t,XOR_S32,^,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(int32_t,OR_S32, |,_5,_4,_0,data[index4],data[index0]);
+  }
+}
+
+static void utestUINT32(void)
+{
+  simd1dw _0, _4, _5;
+  simd16dw _1, _2, _3;
+  const uint32_t data[32] = {1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,8,
+                             9,10,11,12,13,14,15,1,1,2,3,4,5,6,7};
+  for (uint32_t i = 0; i < 32; ++i) {
+    const int index0 = rand() % 32;
+    const int index1 = rand() % 16;
+    const int index2 = rand() % 16;
+    const int index4 = rand() % 32;
+    LOAD(_0, (const char *) (data+index0));
+    LOAD(_1, (const char *) (data+index1));
+    LOAD(_2, (const char *) (data+index2));
+    LOAD(_4, (const char *) (data+index4));
+    CHECK_BINARY_OP(uint32_t,ADD_U32,+,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(uint32_t,SUB_U32,-,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(uint32_t,MUL_U32,*,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(uint32_t,DIV_U32,/,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(uint32_t,REM_U32,%,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(uint32_t,AND_U32,&,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(uint32_t,XOR_U32,^,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(uint32_t,OR_U32, |,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(uint32_t,ADD_U32,+,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(uint32_t,SUB_U32,-,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(uint32_t,MUL_U32,*,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(uint32_t,DIV_U32,/,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(uint32_t,REM_U32,%,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(uint32_t,AND_U32,&,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(uint32_t,XOR_U32,^,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(uint32_t,OR_U32, |,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(uint32_t,ADD_U32,+,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(uint32_t,SUB_U32,-,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(uint32_t,MUL_U32,*,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(uint32_t,DIV_U32,/,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(uint32_t,REM_U32,%,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(uint32_t,AND_U32,&,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(uint32_t,XOR_U32,^,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(uint32_t,OR_U32, |,_5,_4,_0,data[index4],data[index0]);
+  }
+}
+#undef CHECK_BINARY_OP
+
+#define CHECK_CMP_OP(FN,OP,DST,SRC0,SRC1,ELEM0,ELEM1)\
+  do {\
+    FN(DST, SRC0, SRC1);\
+    uint32_t m = 0;\
+    for (uint32_t i = 0; i < elemNum(DST); ++i)\
+      m |= (((ELEM0 OP ELEM1) ? 1 : 0) << i);\
+    GBE_ASSERT(m == mask(DST));\
+  } while (0);
+
+static void utestUINT32Cmp(void)
+{
+  simd1dw _0, _4;
+  simd16dw _1, _2;
+  simd8dw _6, _7;
+  simd1m _5;
+  simd16m _3;
+  simd8m _8;
+  const uint32_t data[64] = {11,12,13,14,15,8,1,1,2,3,4,5,6,7,8,9,10,
+                             9,10,11,12,13,14,15,1,1,2,3,4,5,6,7,
+                             10,11,12,13,14,15,8,1,1,2,3,4,5,6,7,8,9,
+                             9,10,11,12,13,14,15,1,1,2,3,4,5,6,7};
+  for (uint32_t i = 0; i < 32; ++i) {
+    const int index0 = rand() % 32;
+    const int index1 = rand() % 16;
+    const int index2 = rand() % 16;
+    const int index4 = rand() % 32;
+    const int index6 = rand() % 16;
+    const int index7 = rand() % 32;
+    LOAD(_0, (const char *) (data+index0));
+    LOAD(_1, (const char *) (data+index1));
+    LOAD(_2, (const char *) (data+index2));
+    LOAD(_4, (const char *) (data+index4));
+    LOAD(_6, (const char *) (data+index6));
+    LOAD(_7, (const char *) (data+index7));
+    CHECK_CMP_OP(GE_U32,>=,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(LE_U32,<=,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(GT_U32,>,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(LT_U32,<,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(EQ_U32,==,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(NE_U32,!=,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(GE_U32,>=,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(LE_U32,<=,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(GT_U32,>,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(LT_U32,<,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(EQ_U32,==,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(NE_U32,!=,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(GE_U32,>=,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(LE_U32,<=,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(GT_U32,>,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(LT_U32,<,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(EQ_U32,==,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(NE_U32,!=,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(GE_U32,>=,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(LE_U32,<=,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(GT_U32,>,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(LT_U32,<,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(EQ_U32,==,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(NE_U32,!=,_5,_4,_0,data[index4],data[index0]);
+  }
+}
+
+static void utestINT32Cmp(void)
+{
+  simd1dw _0, _4;
+  simd16dw _1, _2;
+  simd8dw _6, _7;
+  simd1m _5;
+  simd16m _3;
+  simd8m _8;
+  const int32_t data[64] = {-11,-12,13,14,-15,8,-1,-1,2,3,4,5,-6,7,8,9,10,
+                            9,10,-11,12,-13,14,15,1,1,2,-3,4,-5,6,7,
+                            10,11,-12,13,14,15,-8,1,1,2,-3,-4,5,-6,7,8,9,
+                            9,10,11,12,-13,14,15,-1,-1,-2,-3,-4,5,6,7};
+
+  for (uint32_t i = 0; i < 32; ++i) {
+    const int index0 = rand() % 32;
+    const int index1 = rand() % 16;
+    const int index2 = rand() % 16;
+    const int index4 = rand() % 32;
+    const int index6 = rand() % 16;
+    const int index7 = rand() % 32;
+    LOAD(_0, (const char *) (data+index0));
+    LOAD(_1, (const char *) (data+index1));
+    LOAD(_2, (const char *) (data+index2));
+    LOAD(_4, (const char *) (data+index4));
+    LOAD(_6, (const char *) (data+index6));
+    LOAD(_7, (const char *) (data+index7));
+    CHECK_CMP_OP(GE_S32,>=,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(LE_S32,<=,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(GT_S32,>,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(LT_S32,<,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(EQ_S32,==,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(NE_S32,!=,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(GE_S32,>=,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(LE_S32,<=,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(GT_S32,>,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(LT_S32,<,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(EQ_S32,==,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(NE_S32,!=,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(GE_S32,>=,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(LE_S32,<=,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(GT_S32,>,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(LT_S32,<,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(EQ_S32,==,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(NE_S32,!=,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(GE_S32,>=,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(LE_S32,<=,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(GT_S32,>,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(LT_S32,<,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(EQ_S32,==,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(NE_S32,!=,_5,_4,_0,data[index4],data[index0]);
+  }
 }
 
-static void utestInt(void)
+static void utestFPCmp(void)
 {
-  geni1 _0, _4, _5;
-  geni16 _1, _2, _3;
-  const int data[32] = {1,1,2, 3, 4, 5, 6, 7,
-                        8,9,10,11,12,13,14,15,
-                        8,9,10,11,12,13,14,15,
-                        1,1,2, 3, 4, 5, 6, 7};
-  LOAD(_0, (const char *) (data+4));
-  LOAD(_4, (const char *) (data+5));
-  LOAD(_1, (const char *) (data));
-  LOAD(_2, (const char *) (data));
-  CHECK_BINARY_OP(int,ADD,+,_3,_2,_1,data[i],data[i]);
-  CHECK_BINARY_OP(int,SUB,-,_3,_2,_1,data[i],data[i]);
-  CHECK_BINARY_OP(int,ADD,+,_3,_2,_0,data[i],data[4]);
-  CHECK_BINARY_OP(int,SUB,-,_3,_2,_0,data[i],data[4]);
-  CHECK_BINARY_OP(int,ADD,+,_5,_4,_0,data[5],data[4]);
-  CHECK_BINARY_OP(int,SUB,-,_5,_4,_0,data[5],data[4]);
+  simd1dw _0, _4;
+  simd16dw _1, _2;
+  simd8dw _6, _7;
+  simd1m _5;
+  simd16m _3;
+  simd8m _8;
+  const float data[64] = {1.f,-1.f,2.f,3.f,4.f,5.f,-6.f,7.f,
+                          8.f,9.f,10.f,11.f,12.f,-13.f,14.f,15.f,
+                          -8.f,9.f,-10.f,11.f,-12.f,13.f,-14.f,15.f,
+                          1.f,1.f,2.f,3.f,4.f,5.f,6.f,-7.f,
+                          8.f,9.f,10.f,11.f,12.f,-13.f,14.f,15.f,
+                          -8.f,9.f,-10.f,11.f,-12.f,13.f,-14.f,15.f,
+                          8.f,9.f,10.f,11.f,12.f,-13.f,14.f,15.f};
+  for (uint32_t i = 0; i < 32; ++i) {
+    const int index0 = rand() % 32;
+    const int index1 = rand() % 16;
+    const int index2 = rand() % 16;
+    const int index4 = rand() % 32;
+    const int index6 = rand() % 16;
+    const int index7 = rand() % 32;
+    LOAD(_0, (const char *) (data+index0));
+    LOAD(_1, (const char *) (data+index1));
+    LOAD(_2, (const char *) (data+index2));
+    LOAD(_4, (const char *) (data+index4));
+    LOAD(_6, (const char *) (data+index6));
+    LOAD(_7, (const char *) (data+index7));
+    CHECK_CMP_OP(GE_F,>=,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(LE_F,<=,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(GT_F,>,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(LT_F,<,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(EQ_F,==,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(NE_F,!=,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(GE_F,>=,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(LE_F,<=,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(GT_F,>,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(LT_F,<,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(EQ_F,==,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(NE_F,!=,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(GE_F,>=,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(LE_F,<=,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(GT_F,>,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(LT_F,<,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(EQ_F,==,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(NE_F,!=,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(GE_F,>=,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(LE_F,<=,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(GT_F,>,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(LT_F,<,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(EQ_F,==,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(NE_F,!=,_5,_4,_0,data[index4],data[index0]);
+  }
 }
+#undef CHECK_CMP_OP
+
 static void utestVector(void)
 {
   UTEST_EXPECT_SUCCESS(utestFP());
-  UTEST_EXPECT_SUCCESS(utestInt());
+  UTEST_EXPECT_SUCCESS(utestINT32());
+  UTEST_EXPECT_SUCCESS(utestUINT32());
+  UTEST_EXPECT_SUCCESS(utestFPCmp());
+  UTEST_EXPECT_SUCCESS(utestINT32Cmp());
+  UTEST_EXPECT_SUCCESS(utestUINT32Cmp());
 }
 
 UTEST_REGISTER(utestVector)
-- 
2.7.4
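For reference, a single comparison check such as CHECK_CMP_OP(GE_U32,>=,_3,_2,_1,data[i+index2],data[i+index1]) in the tests above expands to roughly the sketch below once the macro is substituted. This is only an illustrative reading of the test, assuming the LOAD, GE_U32, elemNum and mask helpers behave as they are used in the patch; it is not additional patch content.

  // Sketch of one expanded CHECK_CMP_OP call (16-wide unsigned >=).
  simd16dw _1, _2;                          // two 16-wide double-word vectors
  simd16m  _3;                              // 16-wide mask receiving the result
  LOAD(_1, (const char *) (data+index1));
  LOAD(_2, (const char *) (data+index2));
  GE_U32(_3, _2, _1);                       // per-lane unsigned >= into _3
  uint32_t m = 0;
  for (uint32_t i = 0; i < elemNum(_3); ++i)  // 16 lanes
    m |= ((data[i+index2] >= data[i+index1]) ? 1 : 0) << i;
  GBE_ASSERT(m == mask(_3));                // scalar reference equals SIMD mask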