#include <xmmintrin.h>
#include <emmintrin.h>
#include <pmmintrin.h>
+#include <smmintrin.h>
#include <stdint.h>
#include <cmath>
template <uint32_t vectorNum, bool scalar = false>
struct genm { __m128 m[vectorNum]; };
+/*! To cast through memory */
+union CastType {
+ INLINE CastType(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {
+ u[0] = u0; u[1] = u1; u[2] = u2; u[3] = u3;
+ }
+ INLINE CastType(float f0, float f1, float f2, float f3) {
+ f[0] = f0; f[1] = f1; f[2] = f2; f[3] = f3;
+ }
+ __m128 v;
+ __m128i vi;
+ uint32_t u[4];
+ float f[4];
+};
+
typedef genf<1,true> genf1; // contains 3 clobbered values
typedef genf<1,false> genf4;
typedef genf<2,false> genf8;
static INLINE uint32_t elemNum(genf8 x) { return 8; }
static INLINE uint32_t elemNum(genf16 x) { return 16; }
static INLINE uint32_t elemNum(genf32 x) { return 32; }
+static INLINE uint32_t elemNum(geni1 x) { return 1; }
+static INLINE uint32_t elemNum(geni4 x) { return 4; }
+static INLINE uint32_t elemNum(geni8 x) { return 8; }
+static INLINE uint32_t elemNum(geni16 x) { return 16; }
+static INLINE uint32_t elemNum(geni32 x) { return 32; }
template<size_t i0, size_t i1, size_t i2, size_t i3>
INLINE const __m128 shuffle(const __m128& b) {
return shuffle<index, index, index, index>(b);
}
+/* Build an integer mask (one bit per SIMD lane) from the mask vectors */
+template <uint32_t vectorNum>
+INLINE uint32_t mask(const genm<vectorNum> &v) {
+  /* pass by const reference: avoid copying vectorNum __m128 registers */
+  uint32_t m = _mm_movemask_ps(v.m[0]);
+  for (uint32_t i = 1; i < vectorNum; ++i)
+    /* widen to uint32_t before shifting: 0xF << 28 would overflow int */
+    m |= uint32_t(_mm_movemask_ps(v.m[i])) << (4*i);
+  return m;
+}
+INLINE uint32_t mask(const genm1 &v) { return _mm_movemask_ps(v.m[0]) & 1; }
+
#define ID(X) X
-#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1) \
-template <uint32_t vectorNum> \
-INLINE void NAME(DST_TYPE<vectorNum> &dst, \
- const SRC_TYPE<vectorNum> &v0, \
- const SRC_TYPE<vectorNum> &v1) { \
- for (uint32_t i = 0; i < vectorNum; ++i) \
- dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(v1.m[i]))); \
-} \
-template <uint32_t vectorNum> \
-INLINE void NAME(DST_TYPE<vectorNum> &dst, \
- const SRC_TYPE<vectorNum> &v0, \
- const SRC_TYPE##1 &v1) { \
- for (uint32_t i = 0; i < vectorNum; ++i) \
- dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(expand<0>(v1.m[0])))); \
-} \
-template <uint32_t vectorNum> \
-INLINE void NAME(DST_TYPE<vectorNum> &dst, \
- const SRC_TYPE##1 &v0, \
- const SRC_TYPE<vectorNum> &v1) { \
- for (uint32_t i = 0; i < vectorNum; ++i) \
- dst.m[i] = FN(INTRINSIC_NAME(FN0(expand<0>(v0.m[0])), FN1(v1.m[i]))); \
+/* Define the vector/vector, vector/scalar and scalar/vector forms of a binary
+ * op. FN post-processes the intrinsic result, FN0/FN1 pre-process the two
+ * operands (typically casts between __m128 and __m128i, or ID for none).
+ * The scalar operand is broadcast to all lanes with expand<0>. */
+#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\
+template <uint32_t vectorNum>\
+INLINE void NAME(DST_TYPE<vectorNum> &dst,\
+                 const SRC_TYPE<vectorNum> &v0,\
+                 const SRC_TYPE<vectorNum> &v1) {\
+  for (uint32_t i = 0; i < vectorNum; ++i)\
+    dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(v1.m[i])));\
+}\
+template <uint32_t vectorNum>\
+INLINE void NAME(DST_TYPE<vectorNum> &dst,\
+                 const SRC_TYPE<vectorNum> &v0,\
+                 const SRC_TYPE##1 &v1) {\
+  for (uint32_t i = 0; i < vectorNum; ++i)\
+    dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(expand<0>(v1.m[0]))));\
+}\
+template <uint32_t vectorNum>\
+INLINE void NAME(DST_TYPE<vectorNum> &dst,\
+                 const SRC_TYPE##1 &v0,\
+                 const SRC_TYPE<vectorNum> &v1) {\
+  for (uint32_t i = 0; i < vectorNum; ++i)\
+    dst.m[i] = FN(INTRINSIC_NAME(FN0(expand<0>(v0.m[0])), FN1(v1.m[i])));\
}
VEC_OP(genf, genf, ADD, _mm_add_ps, ID, ID, ID);
VEC_OP(genm, genf, GE, _mm_cmpge_ps, ID, ID, ID);
VEC_OP(geni, geni, ADD, _mm_add_epi32, ID, ID, ID);
VEC_OP(geni, geni, SUB, _mm_sub_epi32, ID, ID, ID);
+/* genm stores __m128 (see struct genm / mask()): the __m128i result of the
+ * integer compares must be cast back to a float vector before assignment */
+VEC_OP(genm, geni, EQ, _mm_cmpeq_epi32, _mm_castsi128_ps, ID, ID);
+VEC_OP(genm, geni, SLT, _mm_cmplt_epi32, _mm_castsi128_ps, ID, ID);
+VEC_OP(genm, geni, SGT, _mm_cmpgt_epi32, _mm_castsi128_ps, ID, ID);
VEC_OP(geni, geni, OR, _mm_or_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps);
VEC_OP(geni, geni, XOR, _mm_xor_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps);
VEC_OP(geni, geni, AND, _mm_and_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps);
#undef VEC_OP
-#define SCALAR_OP(TYPE, NAME, INTRINSIC_NAME) \
-INLINE void NAME(TYPE &dst, const TYPE &v0, const TYPE &v1) { \
- dst.m[0] = INTRINSIC_NAME(v0.m[0], v1.m[0]); \
+/* SSE has no packed signed >= / <= integer compare: synthesize them as
+ * (a > b) | (a == b) and (a < b) | (a == b) so the equality case is kept */
+INLINE __m128i cmpge_epi32(__m128i a, __m128i b) {
+  return _mm_or_si128(_mm_cmpgt_epi32(a, b), _mm_cmpeq_epi32(a, b));
+}
+INLINE __m128i cmple_epi32(__m128i a, __m128i b) {
+  return _mm_or_si128(_mm_cmplt_epi32(a, b), _mm_cmpeq_epi32(a, b));
+}
+/* Same as VEC_OP but the operand order is swapped in the intrinsic call */
+#define ICMP_VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\
+template <uint32_t vectorNum>\
+INLINE void NAME(DST_TYPE<vectorNum> &dst,\
+                 const SRC_TYPE<vectorNum> &v0,\
+                 const SRC_TYPE<vectorNum> &v1) {\
+  for (uint32_t i = 0; i < vectorNum; ++i)\
+    dst.m[i] = FN(INTRINSIC_NAME(FN1(v1.m[i]), FN0(v0.m[i])));\
+}\
+template <uint32_t vectorNum>\
+INLINE void NAME(DST_TYPE<vectorNum> &dst,\
+                 const SRC_TYPE<vectorNum> &v0,\
+                 const SRC_TYPE##1 &v1) {\
+  for (uint32_t i = 0; i < vectorNum; ++i)\
+    dst.m[i] = FN(INTRINSIC_NAME(FN1(expand<0>(v1.m[0])), FN0(v0.m[i])));\
+}\
+template <uint32_t vectorNum>\
+INLINE void NAME(DST_TYPE<vectorNum> &dst,\
+                 const SRC_TYPE##1 &v0,\
+                 const SRC_TYPE<vectorNum> &v1) {\
+  for (uint32_t i = 0; i < vectorNum; ++i)\
+    dst.m[i] = FN(INTRINSIC_NAME(FN1(v1.m[i]), FN0(expand<0>(v0.m[0]))));\
+}
+/* SGE(a,b) == cmple(b,a), SLE(a,b) == cmpge(b,a); the plain cmplt/cmpgt used
+ * before computed strict >/< and dropped the equality case. The cast brings
+ * the __m128i result into genm's __m128 storage. */
+ICMP_VEC_OP(genm, geni, SGE, cmple_epi32, _mm_castsi128_ps, ID, ID);
+ICMP_VEC_OP(genm, geni, SLE, cmpge_epi32, _mm_castsi128_ps, ID, ID);
+#undef ICMP_VEC_OP
+
+static const CastType alltrue(0xffffffff,0xffffffff,0xffffffff,0xffffffff);
+
+/* NE = NOT(EQ), implemented as EQ xor all-ones. genm holds __m128, so the
+ * xor result must NOT be cast back to __m128i (the old _mm_castps_si128
+ * produced a type mismatch). Scalar operands are broadcast with expand<0>
+ * like every other op, since genX1 lanes 1..3 are clobbered. */
+template <uint32_t vectorNum>
+INLINE void NE(genm<vectorNum> &dst, const geni<vectorNum> &v0, const geni<vectorNum> &v1) {
+  for (uint32_t i = 0; i < vectorNum; ++i)
+    dst.m[i] = _mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[i], v1.m[i])), alltrue.v);
+}
+template <uint32_t vectorNum>
+INLINE void NE(genm<vectorNum> &dst, const geni<vectorNum> &v0, const geni1 &v1) {
+  for (uint32_t i = 0; i < vectorNum; ++i)
+    dst.m[i] = _mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[i], expand<0>(v1.m[0]))), alltrue.v);
+}
+template <uint32_t vectorNum>
+INLINE void NE(genm<vectorNum> &dst, const geni1 &v0, const geni<vectorNum> &v1) {
+  for (uint32_t i = 0; i < vectorNum; ++i)
+    dst.m[i] = _mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(expand<0>(v0.m[0]), v1.m[i])), alltrue.v);
+}
+
+#define SCALAR_OP(TYPE, NAME, INTRINSIC_NAME)\
+INLINE void NAME(TYPE &dst, const TYPE &v0, const TYPE &v1) {\
+ dst.m[0] = INTRINSIC_NAME(v0.m[0], v1.m[0]);\
}
SCALAR_OP(genf1, ADD, _mm_add_ss);
SCALAR_OP(genf1, SUB, _mm_sub_ss);
SCALAR_OP(geni1, ADD, _mm_add_epi32);
SCALAR_OP(geni1, SUB, _mm_sub_epi32);
#undef SCALAR_OP
-#undef ID
/* load from contiguous floats / integers */
template <uint32_t vectorNum>
-INLINE void LOAD(genf<vectorNum> &dst, const float *ptr) {
+INLINE void LOAD(genf<vectorNum> &dst, const char *ptr) {
for (uint32_t i = 0; i < vectorNum; ++i)
- dst.m[i] = _mm_loadu_ps(ptr + 4*i);
+ dst.m[i] = _mm_loadu_ps((const float*) ptr + 4*i);
}
-INLINE void LOAD(genf1 &dst, const float *ptr) {
- dst.m[0] = _mm_load_ss(ptr);
+INLINE void LOAD(genf1 &dst, const char *ptr) {
+ dst.m[0] = _mm_load_ss((const float*) ptr);
}
template <uint32_t vectorNum>
-INLINE void LOAD(geni<vectorNum> &dst, const float *ptr) {
+INLINE void LOAD(geni<vectorNum> &dst, const char *ptr) {
for (uint32_t i = 0; i < vectorNum; ++i)
- dst.m[i] = _mm_castps_si128(_mm_loadu_ps(ptr + 4*i));
+ dst.m[i] = _mm_castps_si128(_mm_loadu_ps((const float*) ptr + 4*i));
}
-INLINE void LOAD(geni1 &dst, const float *ptr) {
- dst.m[0] = _mm_castps_si128(_mm_load_ss(ptr));
+INLINE void LOAD(geni1 &dst, const char *ptr) {
+ dst.m[0] = _mm_castps_si128(_mm_load_ss((const float*) ptr));
}
/* store to contiguous floats / integers */
template <uint32_t vectorNum>
-INLINE void STORE(genf<vectorNum> &dst, float *ptr) {
+INLINE void STORE(const genf<vectorNum> &src, char *ptr) {
for (uint32_t i = 0; i < vectorNum; ++i)
- _mm_storeu_ps(ptr + 4*i, dst.m[i]);
+ _mm_storeu_ps((float*) ptr + 4*i, src.m[i]);
}
-INLINE void STORE(genf1 &dst, float *ptr) {
- _mm_store_ss(ptr, dst.m[0]);
+INLINE void STORE(genf1 &src, char *ptr) {
+ _mm_store_ss((float*) ptr, src.m[0]);
}
template <uint32_t vectorNum>
-INLINE void STORE(geni<vectorNum> &dst, float *ptr) {
+INLINE void STORE(const geni<vectorNum> &src, char *ptr) {
for (uint32_t i = 0; i < vectorNum; ++i)
- _mm_storeu_ps(ptr + 4*i, _mm_castsi128_ps(dst.m[i]));
+ _mm_storeu_ps((float*) ptr + 4*i, _mm_castsi128_ps(src.m[i]));
}
-INLINE void STORE(geni1 &dst, float *ptr) {
- _mm_store_ss(ptr, _mm_castsi128_ps(dst.m[0]));
+INLINE void STORE(const geni1 &src, char *ptr) {
+ _mm_store_ss((float*) ptr, _mm_castsi128_ps(src.m[0]));
}
+/* Load immediates */
+/* Splat an immediate into every lane; the scalar (genX1) variants fill lane 0
+ * only, the remaining lanes are clobbered values. */
+template <uint32_t vectorNum>
+INLINE void LOADI(genf<vectorNum> &dst, float f) {
+ for (uint32_t i = 0; i < vectorNum; ++i)
+ dst.m[i] = _mm_load1_ps(&f);
+}
+INLINE void LOADI(genf1 &dst, float f) { dst.m[0] = _mm_load_ss(&f); }
+template <uint32_t vectorNum>
+INLINE void LOADI(geni<vectorNum> &dst, uint32_t u) {
+ /* reinterpret the integer bits as float through a union (no conversion) */
+ union { float f; uint32_t u; } cast;
+ cast.u = u;
+ for (uint32_t i = 0; i < vectorNum; ++i)
+ dst.m[i] = _mm_castps_si128(_mm_load1_ps(&cast.f));
+}
+INLINE void LOADI(geni1 &dst, uint32_t u) {
+ /* same bitwise reinterpretation trick as above, single lane */
+ union { float f; uint32_t u; } cast;
+ cast.u = u;
+ dst.m[0] = _mm_castps_si128(_mm_load_ss(&cast.f));
+}
+
+/* Scatter */
+/* Store each 32-bit lane of "value" to base_address + per-lane byte offset.
+ * NOTE(review): the int stores assume the offsets are 4-byte aligned (or that
+ * the target tolerates unaligned access) — confirm against the callers. */
+#define SCATTER_OP(TYPE, FN)\
+template <uint32_t vectorNum>\
+INLINE void SCATTER(const TYPE<vectorNum> &value,\
+                    const geni<vectorNum> &offset,\
+                    char *base_address) {\
+  for (uint32_t i = 0; i < vectorNum; ++i) {\
+    const int v0 = _mm_extract_epi32(FN(value.m[i]), 0);\
+    const int v1 = _mm_extract_epi32(FN(value.m[i]), 1);\
+    const int v2 = _mm_extract_epi32(FN(value.m[i]), 2);\
+    const int v3 = _mm_extract_epi32(FN(value.m[i]), 3);\
+    const int o0 = _mm_extract_epi32(offset.m[i], 0);\
+    const int o1 = _mm_extract_epi32(offset.m[i], 1);\
+    const int o2 = _mm_extract_epi32(offset.m[i], 2);\
+    const int o3 = _mm_extract_epi32(offset.m[i], 3);\
+    *(int*)(base_address + o0) = v0;\
+    *(int*)(base_address + o1) = v1;\
+    *(int*)(base_address + o2) = v2;\
+    *(int*)(base_address + o3) = v3;\
+  }\
+}\
+INLINE void SCATTER(const TYPE##1 &value, const geni1 &offset, char *base_address) {\
+  const int v0 = _mm_extract_epi32(FN(value.m[0]), 0);\
+  const int o0 = _mm_extract_epi32(offset.m[0], 0);\
+  *(int*)(base_address + o0) = v0;\
+}
+SCATTER_OP(genf, _mm_castps_si128)
+SCATTER_OP(geni, ID)
+#undef SCATTER_OP
+
+/* Gather */
+/* Load one 32-bit lane from base_address + per-lane byte offset into "dst".
+ * Bug fix: _mm_insert_epi32 RETURNS its result, it does not modify its
+ * argument — the previous code discarded every insert and never wrote dst.
+ * We rebuild the vector with _mm_set_epi32 and cast it into the destination
+ * type (FNINV: __m128i -> TYPE lane type). */
+#define GATHER_OP(TYPE, FNINV)\
+template <uint32_t vectorNum>\
+INLINE void GATHER(TYPE<vectorNum> &dst,\
+                   const geni<vectorNum> &offset,\
+                   char *base_address) {\
+  for (uint32_t i = 0; i < vectorNum; ++i) {\
+    const int o0 = _mm_extract_epi32(offset.m[i], 0);\
+    const int o1 = _mm_extract_epi32(offset.m[i], 1);\
+    const int o2 = _mm_extract_epi32(offset.m[i], 2);\
+    const int o3 = _mm_extract_epi32(offset.m[i], 3);\
+    const int v0 = *(int*)(base_address + o0);\
+    const int v1 = *(int*)(base_address + o1);\
+    const int v2 = *(int*)(base_address + o2);\
+    const int v3 = *(int*)(base_address + o3);\
+    dst.m[i] = FNINV(_mm_set_epi32(v3, v2, v1, v0));\
+  }\
+}\
+INLINE void GATHER(TYPE##1 &dst, const geni1 &offset, char *base_address) {\
+  const int o0 = _mm_extract_epi32(offset.m[0], 0);\
+  const int v0 = *(int*)(base_address + o0);\
+  dst.m[0] = FNINV(_mm_set1_epi32(v0));\
+}
+GATHER_OP(genf, _mm_castsi128_ps)
+GATHER_OP(geni, ID)
+#undef GATHER_OP
+
+#undef ID
#undef INLINE
#endif /* __GBE_SIM_VECTOR_H__ */
"#include <xmmintrin.h>\n"
"#include <emmintrin.h>\n"
"#include <pmmintrin.h>\n"
+"#include <smmintrin.h>\n"
"#include <stdint.h>\n"
"#include <cmath>\n"
"\n"
"template <uint32_t vectorNum, bool scalar = false>\n"
"struct genm { __m128 m[vectorNum]; };\n"
"\n"
+"/*! To cast through memory */\n"
+"union CastType {\n"
+" INLINE CastType(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {\n"
+" u[0] = u0; u[1] = u1; u[2] = u2; u[3] = u3;\n"
+" }\n"
+" INLINE CastType(float f0, float f1, float f2, float f3) {\n"
+" f[0] = f0; f[1] = f1; f[2] = f2; f[3] = f3;\n"
+" }\n"
+" __m128 v;\n"
+" __m128i vi;\n"
+" uint32_t u[4];\n"
+" float f[4];\n"
+"};\n"
+"\n"
"typedef genf<1,true> genf1; // contains 3 clobbered values\n"
"typedef genf<1,false> genf4;\n"
"typedef genf<2,false> genf8;\n"
"static INLINE uint32_t elemNum(genf8 x) { return 8; }\n"
"static INLINE uint32_t elemNum(genf16 x) { return 16; }\n"
"static INLINE uint32_t elemNum(genf32 x) { return 32; }\n"
+"static INLINE uint32_t elemNum(geni1 x) { return 1; }\n"
+"static INLINE uint32_t elemNum(geni4 x) { return 4; }\n"
+"static INLINE uint32_t elemNum(geni8 x) { return 8; }\n"
+"static INLINE uint32_t elemNum(geni16 x) { return 16; }\n"
+"static INLINE uint32_t elemNum(geni32 x) { return 32; }\n"
"\n"
"template<size_t i0, size_t i1, size_t i2, size_t i3>\n"
"INLINE const __m128 shuffle(const __m128& b) {\n"
" return shuffle<index, index, index, index>(b);\n"
"}\n"
"\n"
+"/* Build an integer mask (one bit per SIMD lane) from the mask vectors */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE uint32_t mask(const genm<vectorNum> &v) {\n"
+"  uint32_t m = _mm_movemask_ps(v.m[0]);\n"
+"  for (uint32_t i = 1; i < vectorNum; ++i)\n"
+"    m |= uint32_t(_mm_movemask_ps(v.m[i])) << (4*i);\n"
+"  return m;\n"
+"}\n"
+"INLINE uint32_t mask(const genm1 &v) { return _mm_movemask_ps(v.m[0]) & 1; }\n"
+"\n"
"#define ID(X) X\n"
-"#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1) \\n"
-"template <uint32_t vectorNum> \\n"
-"INLINE void NAME(DST_TYPE<vectorNum> &dst, \\n"
-" const SRC_TYPE<vectorNum> &v0, \\n"
-" const SRC_TYPE<vectorNum> &v1) { \\n"
-" for (uint32_t i = 0; i < vectorNum; ++i) \\n"
-" dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(v1.m[i]))); \\n"
-"} \\n"
-"template <uint32_t vectorNum> \\n"
-"INLINE void NAME(DST_TYPE<vectorNum> &dst, \\n"
-" const SRC_TYPE<vectorNum> &v0, \\n"
-" const SRC_TYPE##1 &v1) { \\n"
-" for (uint32_t i = 0; i < vectorNum; ++i) \\n"
-" dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(expand<0>(v1.m[0])))); \\n"
-"} \\n"
-"template <uint32_t vectorNum> \\n"
-"INLINE void NAME(DST_TYPE<vectorNum> &dst, \\n"
-" const SRC_TYPE##1 &v0, \\n"
-" const SRC_TYPE<vectorNum> &v1) { \\n"
-" for (uint32_t i = 0; i < vectorNum; ++i) \\n"
-" dst.m[i] = FN(INTRINSIC_NAME(FN0(expand<0>(v0.m[0])), FN1(v1.m[i]))); \\n"
+"#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\\n"
+"template <uint32_t vectorNum>\\n"
+"INLINE void NAME(DST_TYPE<vectorNum> &dst,\\n"
+" const SRC_TYPE<vectorNum> &v0,\\n"
+" const SRC_TYPE<vectorNum> &v1) {\\n"
+" for (uint32_t i = 0; i < vectorNum; ++i)\\n"
+" dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(v1.m[i])));\\n"
+"}\\n"
+"template <uint32_t vectorNum>\\n"
+"INLINE void NAME(DST_TYPE<vectorNum> &dst,\\n"
+" const SRC_TYPE<vectorNum> &v0,\\n"
+" const SRC_TYPE##1 &v1) {\\n"
+" for (uint32_t i = 0; i < vectorNum; ++i)\\n"
+" dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(expand<0>(v1.m[0]))));\\n"
+"}\\n"
+"template <uint32_t vectorNum>\\n"
+"INLINE void NAME(DST_TYPE<vectorNum> &dst,\\n"
+" const SRC_TYPE##1 &v0,\\n"
+" const SRC_TYPE<vectorNum> &v1) {\\n"
+" for (uint32_t i = 0; i < vectorNum; ++i)\\n"
+" dst.m[i] = FN(INTRINSIC_NAME(FN0(expand<0>(v0.m[0])), FN1(v1.m[i])));\\n"
"}\n"
"\n"
"VEC_OP(genf, genf, ADD, _mm_add_ps, ID, ID, ID);\n"
"VEC_OP(genm, genf, GE, _mm_cmpge_ps, ID, ID, ID);\n"
"VEC_OP(geni, geni, ADD, _mm_add_epi32, ID, ID, ID);\n"
"VEC_OP(geni, geni, SUB, _mm_sub_epi32, ID, ID, ID);\n"
+"/* genm stores __m128: cast the __m128i integer compare results back */\n"
+"VEC_OP(genm, geni, EQ, _mm_cmpeq_epi32, _mm_castsi128_ps, ID, ID);\n"
+"VEC_OP(genm, geni, SLT, _mm_cmplt_epi32, _mm_castsi128_ps, ID, ID);\n"
+"VEC_OP(genm, geni, SGT, _mm_cmpgt_epi32, _mm_castsi128_ps, ID, ID);\n"
"VEC_OP(geni, geni, OR, _mm_or_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps);\n"
"VEC_OP(geni, geni, XOR, _mm_xor_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps);\n"
"VEC_OP(geni, geni, AND, _mm_and_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps);\n"
"\n"
"#undef VEC_OP\n"
"\n"
-"#define SCALAR_OP(TYPE, NAME, INTRINSIC_NAME) \\n"
-"INLINE void NAME(TYPE &dst, const TYPE &v0, const TYPE &v1) { \\n"
-" dst.m[0] = INTRINSIC_NAME(v0.m[0], v1.m[0]); \\n"
+"/* SSE has no packed signed >= / <= compare: synthesize them */\n"
+"INLINE __m128i cmpge_epi32(__m128i a, __m128i b) {\n"
+"  return _mm_or_si128(_mm_cmpgt_epi32(a, b), _mm_cmpeq_epi32(a, b));\n"
+"}\n"
+"INLINE __m128i cmple_epi32(__m128i a, __m128i b) {\n"
+"  return _mm_or_si128(_mm_cmplt_epi32(a, b), _mm_cmpeq_epi32(a, b));\n"
+"}\n"
+"#define ICMP_VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\\n"
+"template <uint32_t vectorNum>\\n"
+"INLINE void NAME(DST_TYPE<vectorNum> &dst,\\n"
+"                 const SRC_TYPE<vectorNum> &v0,\\n"
+"                 const SRC_TYPE<vectorNum> &v1) {\\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i)\\n"
+"    dst.m[i] = FN(INTRINSIC_NAME(FN1(v1.m[i]), FN0(v0.m[i])));\\n"
+"}\\n"
+"template <uint32_t vectorNum>\\n"
+"INLINE void NAME(DST_TYPE<vectorNum> &dst,\\n"
+"                 const SRC_TYPE<vectorNum> &v0,\\n"
+"                 const SRC_TYPE##1 &v1) {\\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i)\\n"
+"    dst.m[i] = FN(INTRINSIC_NAME(FN1(expand<0>(v1.m[0])), FN0(v0.m[i])));\\n"
+"}\\n"
+"template <uint32_t vectorNum>\\n"
+"INLINE void NAME(DST_TYPE<vectorNum> &dst,\\n"
+"                 const SRC_TYPE##1 &v0,\\n"
+"                 const SRC_TYPE<vectorNum> &v1) {\\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i)\\n"
+"    dst.m[i] = FN(INTRINSIC_NAME(FN1(v1.m[i]), FN0(expand<0>(v0.m[0]))));\\n"
+"}\n"
+"/* SGE(a,b) == cmple(b,a), SLE(a,b) == cmpge(b,a) */\n"
+"ICMP_VEC_OP(genm, geni, SGE, cmple_epi32, _mm_castsi128_ps, ID, ID);\n"
+"ICMP_VEC_OP(genm, geni, SLE, cmpge_epi32, _mm_castsi128_ps, ID, ID);\n"
+"#undef ICMP_VEC_OP\n"
+"\n"
+"static const CastType alltrue(0xffffffff,0xffffffff,0xffffffff,0xffffffff);\n"
+"\n"
+"/* NE = NOT(EQ); genm holds __m128 so the xor result is stored directly */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void NE(genm<vectorNum> &dst, const geni<vectorNum> &v0, const geni<vectorNum> &v1) {\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i)\n"
+"    dst.m[i] = _mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[i], v1.m[i])), alltrue.v);\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void NE(genm<vectorNum> &dst, const geni<vectorNum> &v0, const geni1 &v1) {\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i)\n"
+"    dst.m[i] = _mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[i], expand<0>(v1.m[0]))), alltrue.v);\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void NE(genm<vectorNum> &dst, const geni1 &v0, const geni<vectorNum> &v1) {\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i)\n"
+"    dst.m[i] = _mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(expand<0>(v0.m[0]), v1.m[i])), alltrue.v);\n"
+"}\n"
+"\n"
+"#define SCALAR_OP(TYPE, NAME, INTRINSIC_NAME)\\n"
+"INLINE void NAME(TYPE &dst, const TYPE &v0, const TYPE &v1) {\\n"
+" dst.m[0] = INTRINSIC_NAME(v0.m[0], v1.m[0]);\\n"
"}\n"
"SCALAR_OP(genf1, ADD, _mm_add_ss);\n"
"SCALAR_OP(genf1, SUB, _mm_sub_ss);\n"
"SCALAR_OP(geni1, ADD, _mm_add_epi32);\n"
"SCALAR_OP(geni1, SUB, _mm_sub_epi32);\n"
"#undef SCALAR_OP\n"
-"#undef ID\n"
"\n"
"/* load from contiguous floats / integers */\n"
"template <uint32_t vectorNum>\n"
-"INLINE void LOAD(genf<vectorNum> &dst, const float *ptr) {\n"
+"INLINE void LOAD(genf<vectorNum> &dst, const char *ptr) {\n"
" for (uint32_t i = 0; i < vectorNum; ++i)\n"
-" dst.m[i] = _mm_loadu_ps(ptr + 4*i);\n"
+" dst.m[i] = _mm_loadu_ps((const float*) ptr + 4*i);\n"
"}\n"
-"INLINE void LOAD(genf1 &dst, const float *ptr) {\n"
-" dst.m[0] = _mm_load_ss(ptr);\n"
+"INLINE void LOAD(genf1 &dst, const char *ptr) {\n"
+" dst.m[0] = _mm_load_ss((const float*) ptr);\n"
"}\n"
"template <uint32_t vectorNum>\n"
-"INLINE void LOAD(geni<vectorNum> &dst, const float *ptr) {\n"
+"INLINE void LOAD(geni<vectorNum> &dst, const char *ptr) {\n"
" for (uint32_t i = 0; i < vectorNum; ++i)\n"
-" dst.m[i] = _mm_castps_si128(_mm_loadu_ps(ptr + 4*i));\n"
+" dst.m[i] = _mm_castps_si128(_mm_loadu_ps((const float*) ptr + 4*i));\n"
"}\n"
-"INLINE void LOAD(geni1 &dst, const float *ptr) {\n"
-" dst.m[0] = _mm_castps_si128(_mm_load_ss(ptr));\n"
+"INLINE void LOAD(geni1 &dst, const char *ptr) {\n"
+" dst.m[0] = _mm_castps_si128(_mm_load_ss((const float*) ptr));\n"
"}\n"
"\n"
"/* store to contiguous floats / integers */\n"
"template <uint32_t vectorNum>\n"
-"INLINE void STORE(genf<vectorNum> &dst, float *ptr) {\n"
+"INLINE void STORE(const genf<vectorNum> &src, char *ptr) {\n"
" for (uint32_t i = 0; i < vectorNum; ++i)\n"
-" _mm_storeu_ps(ptr + 4*i, dst.m[i]);\n"
+" _mm_storeu_ps((float*) ptr + 4*i, src.m[i]);\n"
"}\n"
-"INLINE void STORE(genf1 &dst, float *ptr) {\n"
-" _mm_store_ss(ptr, dst.m[0]);\n"
+"INLINE void STORE(genf1 &src, char *ptr) {\n"
+" _mm_store_ss((float*) ptr, src.m[0]);\n"
"}\n"
"template <uint32_t vectorNum>\n"
-"INLINE void STORE(geni<vectorNum> &dst, float *ptr) {\n"
+"INLINE void STORE(const geni<vectorNum> &src, char *ptr) {\n"
" for (uint32_t i = 0; i < vectorNum; ++i)\n"
-" _mm_storeu_ps(ptr + 4*i, _mm_castsi128_ps(dst.m[i]));\n"
+" _mm_storeu_ps((float*) ptr + 4*i, _mm_castsi128_ps(src.m[i]));\n"
"}\n"
-"INLINE void STORE(geni1 &dst, float *ptr) {\n"
-" _mm_store_ss(ptr, _mm_castsi128_ps(dst.m[0]));\n"
+"INLINE void STORE(const geni1 &src, char *ptr) {\n"
+" _mm_store_ss((float*) ptr, _mm_castsi128_ps(src.m[0]));\n"
"}\n"
"\n"
+"/* Load immediates */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void LOADI(genf<vectorNum> &dst, float f) {\n"
+" for (uint32_t i = 0; i < vectorNum; ++i)\n"
+" dst.m[i] = _mm_load1_ps(&f);\n"
+"}\n"
+"INLINE void LOADI(genf1 &dst, float f) { dst.m[0] = _mm_load_ss(&f); }\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void LOADI(geni<vectorNum> &dst, uint32_t u) {\n"
+" union { float f; uint32_t u; } cast;\n"
+" cast.u = u;\n"
+" for (uint32_t i = 0; i < vectorNum; ++i)\n"
+" dst.m[i] = _mm_castps_si128(_mm_load1_ps(&cast.f));\n"
+"}\n"
+"INLINE void LOADI(geni1 &dst, uint32_t u) {\n"
+" union { float f; uint32_t u; } cast;\n"
+" cast.u = u;\n"
+" dst.m[0] = _mm_castps_si128(_mm_load_ss(&cast.f));\n"
+"}\n"
+"\n"
+"/* Scatter */\n"
+"#define SCATTER_OP(TYPE, FN)\\n"
+"template <uint32_t vectorNum>\\n"
+"INLINE void SCATTER(const TYPE<vectorNum> &value,\\n"
+" const geni<vectorNum> &offset,\\n"
+" char *base_address) {\\n"
+" for (uint32_t i = 0; i < vectorNum; ++i) {\\n"
+" const int v0 = _mm_extract_epi32(FN(value.m[i]), 0);\\n"
+" const int v1 = _mm_extract_epi32(FN(value.m[i]), 1);\\n"
+" const int v2 = _mm_extract_epi32(FN(value.m[i]), 2);\\n"
+" const int v3 = _mm_extract_epi32(FN(value.m[i]), 3);\\n"
+" const int o0 = _mm_extract_epi32(offset.m[i], 0);\\n"
+" const int o1 = _mm_extract_epi32(offset.m[i], 1);\\n"
+" const int o2 = _mm_extract_epi32(offset.m[i], 2);\\n"
+" const int o3 = _mm_extract_epi32(offset.m[i], 3);\\n"
+" *(int*)(base_address + o0) = v0;\\n"
+" *(int*)(base_address + o1) = v1;\\n"
+" *(int*)(base_address + o2) = v2;\\n"
+" *(int*)(base_address + o3) = v3;\\n"
+" }\\n"
+"}\\n"
+"INLINE void SCATTER(const TYPE##1 &value, const geni1 &offset, char *base_address) {\\n"
+" const int v0 = _mm_extract_epi32(FN(value.m[0]), 0);\\n"
+" const int o0 = _mm_extract_epi32(offset.m[0], 0);\\n"
+" *(int*)(base_address + o0) = v0;\\n"
+"}\n"
+"SCATTER_OP(genf, _mm_castps_si128)\n"
+"SCATTER_OP(geni, ID)\n"
+"#undef SCATTER_OP\n"
+"\n"
+"/* Gather */\n"
+"/* _mm_insert_epi32 returns its result: rebuild and assign the vector */\n"
+"#define GATHER_OP(TYPE, FNINV)\\n"
+"template <uint32_t vectorNum>\\n"
+"INLINE void GATHER(TYPE<vectorNum> &dst,\\n"
+"                   const geni<vectorNum> &offset,\\n"
+"                   char *base_address) {\\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i) {\\n"
+"    const int o0 = _mm_extract_epi32(offset.m[i], 0);\\n"
+"    const int o1 = _mm_extract_epi32(offset.m[i], 1);\\n"
+"    const int o2 = _mm_extract_epi32(offset.m[i], 2);\\n"
+"    const int o3 = _mm_extract_epi32(offset.m[i], 3);\\n"
+"    const int v0 = *(int*)(base_address + o0);\\n"
+"    const int v1 = *(int*)(base_address + o1);\\n"
+"    const int v2 = *(int*)(base_address + o2);\\n"
+"    const int v3 = *(int*)(base_address + o3);\\n"
+"    dst.m[i] = FNINV(_mm_set_epi32(v3, v2, v1, v0));\\n"
+"  }\\n"
+"}\\n"
+"INLINE void GATHER(TYPE##1 &dst, const geni1 &offset, char *base_address) {\\n"
+"  const int o0 = _mm_extract_epi32(offset.m[0], 0);\\n"
+"  const int v0 = *(int*)(base_address + o0);\\n"
+"  dst.m[0] = FNINV(_mm_set1_epi32(v0));\\n"
+"}\n"
+"GATHER_OP(genf, _mm_castsi128_ps)\n"
+"GATHER_OP(geni, ID)\n"
+"#undef GATHER_OP\n"
+"\n"
+"#undef ID\n"
"#undef INLINE\n"
"\n"
"#endif /* __GBE_SIM_VECTOR_H__ */\n"
if (UNLIKELY(simKernel->fn == NULL))
FATAL("Failed to get the symbol from the compiled shared object");
}
-
} /* namespace gbe */
-
static INLINE bool ok(float x, float y) {
return fabs(x-y) / (1.f + std::max(fabs(x), fabs(y))) < 1.e-6;
}
+static INLINE bool ok(int x, int y) { return x == y; }
-NOINLINE void hop(const float *p0, const float *p1, float *p2)
-{
- genf16 _0,_1,_2;
- LOAD(_0, p0);
- LOAD(_1, p1);
- LOAD(_2, p2);
- MUL(_0, _0, _1);
- ADD(_0, _1, _2);
- MUL(_1, _1, _2);
- SUB(_0, _0, _2);
- SUB(_0, _1, _0);
- STORE(_0, p2);
-}
-
-static void utestFP(void)
-{
- genf1 _0, _4, _5;
- genf16 _1, _2, _3;
- const float fdata16[32] = {1.f,1.f,2.f, 3.f, 4.f, 5.f, 6.f, 7.f,
- 8.f,9.f,10.f,11.f,12.f,13.f,14.f,15.f,
- 8.f,9.f,10.f,11.f,12.f,13.f,14.f,15.f,
- 1.f,1.f,2.f, 3.f, 4.f, 5.f, 6.f, 7.f};
-
- LOAD(_0, fdata16+4);
- LOAD(_4, fdata16+5);
- LOAD(_1, fdata16);
- LOAD(_2, fdata16);
-#define CHECK_BIN_FLOAT(FN,OP,DST,SRC0,SRC1,ELEM0,ELEM1)\
+#define CHECK_BINARY_OP(TYPE,FN,OP,DST,SRC0,SRC1,ELEM0,ELEM1)\
do {\
FN(DST, SRC0, SRC1);\
- float tmp[32];\
- STORE(DST, tmp);\
+ TYPE tmp[32];\
+ STORE(DST, (char*) tmp);\
for (uint32_t i = 0; i < elemNum(DST); ++i) {\
- const float verification = ELEM0 OP ELEM1;\
+ const TYPE verification = ELEM0 OP ELEM1;\
GBE_ASSERT(ok(verification, tmp[i]));\
}\
} while (0);
- CHECK_BIN_FLOAT(MUL,*,_3,_2,_1,fdata16[i],fdata16[i]);
- CHECK_BIN_FLOAT(DIV,/,_3,_2,_1,fdata16[i],fdata16[i]);
- CHECK_BIN_FLOAT(ADD,+,_3,_2,_1,fdata16[i],fdata16[i]);
- CHECK_BIN_FLOAT(SUB,-,_3,_2,_1,fdata16[i],fdata16[i]);
- CHECK_BIN_FLOAT(MUL,*,_3,_2,_0,fdata16[i],fdata16[4]);
- CHECK_BIN_FLOAT(DIV,/,_3,_2,_0,fdata16[i],fdata16[4]);
- CHECK_BIN_FLOAT(ADD,+,_3,_2,_0,fdata16[i],fdata16[4]);
- CHECK_BIN_FLOAT(SUB,-,_3,_2,_0,fdata16[i],fdata16[4]);
- CHECK_BIN_FLOAT(MUL,*,_3,_2,_0,fdata16[i],fdata16[4]);
- CHECK_BIN_FLOAT(DIV,/,_3,_2,_0,fdata16[i],fdata16[4]);
- CHECK_BIN_FLOAT(ADD,+,_3,_2,_0,fdata16[i],fdata16[4]);
- CHECK_BIN_FLOAT(SUB,-,_3,_2,_0,fdata16[i],fdata16[4]);
- CHECK_BIN_FLOAT(MUL,*,_5,_4,_0,fdata16[5],fdata16[4]);
- CHECK_BIN_FLOAT(DIV,/,_5,_4,_0,fdata16[5],fdata16[4]);
- CHECK_BIN_FLOAT(ADD,+,_5,_4,_0,fdata16[5],fdata16[4]);
- CHECK_BIN_FLOAT(SUB,-,_5,_4,_0,fdata16[5],fdata16[4]);
-#undef CHECK_BIN_FLOAT
- float t0[16], t1[16], t2[16];
- hop(t0,t1,t2);
+
+static void utestFP(void)
+{
+ genf1 _0, _4, _5;
+ genf16 _1, _2, _3;
+ const float data[32] = {1.f,1.f,2.f, 3.f, 4.f, 5.f, 6.f, 7.f,
+ 8.f,9.f,10.f,11.f,12.f,13.f,14.f,15.f,
+ 8.f,9.f,10.f,11.f,12.f,13.f,14.f,15.f,
+ 1.f,1.f,2.f, 3.f, 4.f, 5.f, 6.f, 7.f};
+
+ LOAD(_0, (const char *) (data+4));
+ LOAD(_4, (const char *) (data+5));
+ LOAD(_1, (const char *) (data));
+ LOAD(_2, (const char *) (data));
+ CHECK_BINARY_OP(float,MUL,*,_3,_2,_1,data[i],data[i]);
+ CHECK_BINARY_OP(float,DIV,/,_3,_2,_1,data[i],data[i]);
+ CHECK_BINARY_OP(float,ADD,+,_3,_2,_1,data[i],data[i]);
+ CHECK_BINARY_OP(float,SUB,-,_3,_2,_1,data[i],data[i]);
+ CHECK_BINARY_OP(float,MUL,*,_3,_2,_0,data[i],data[4]);
+ CHECK_BINARY_OP(float,DIV,/,_3,_2,_0,data[i],data[4]);
+ CHECK_BINARY_OP(float,ADD,+,_3,_2,_0,data[i],data[4]);
+ CHECK_BINARY_OP(float,SUB,-,_3,_2,_0,data[i],data[4]);
+ CHECK_BINARY_OP(float,MUL,*,_3,_2,_0,data[i],data[4]);
+ CHECK_BINARY_OP(float,DIV,/,_3,_2,_0,data[i],data[4]);
+ CHECK_BINARY_OP(float,ADD,+,_3,_2,_0,data[i],data[4]);
+ CHECK_BINARY_OP(float,SUB,-,_3,_2,_0,data[i],data[4]);
+ CHECK_BINARY_OP(float,MUL,*,_5,_4,_0,data[5],data[4]);
+ CHECK_BINARY_OP(float,DIV,/,_5,_4,_0,data[5],data[4]);
+ CHECK_BINARY_OP(float,ADD,+,_5,_4,_0,data[5],data[4]);
+ CHECK_BINARY_OP(float,SUB,-,_5,_4,_0,data[5],data[4]);
+}
+
+static void utestInt(void)
+{
+ geni1 _0, _4, _5;
+ geni16 _1, _2, _3;
+ const int data[32] = {1,1,2, 3, 4, 5, 6, 7,
+ 8,9,10,11,12,13,14,15,
+ 8,9,10,11,12,13,14,15,
+ 1,1,2, 3, 4, 5, 6, 7};
+ LOAD(_0, (const char *) (data+4));
+ LOAD(_4, (const char *) (data+5));
+ LOAD(_1, (const char *) (data));
+ LOAD(_2, (const char *) (data));
+ CHECK_BINARY_OP(int,ADD,+,_3,_2,_1,data[i],data[i]);
+ CHECK_BINARY_OP(int,SUB,-,_3,_2,_1,data[i],data[i]);
+ CHECK_BINARY_OP(int,ADD,+,_3,_2,_0,data[i],data[4]);
+ CHECK_BINARY_OP(int,SUB,-,_3,_2,_0,data[i],data[4]);
+ CHECK_BINARY_OP(int,ADD,+,_5,_4,_0,data[5],data[4]);
+ CHECK_BINARY_OP(int,SUB,-,_5,_4,_0,data[5],data[4]);
+
}
static void utestVector(void)
{
UTEST_EXPECT_SUCCESS(utestFP());
+ UTEST_EXPECT_SUCCESS(utestInt());
}
UTEST_REGISTER(utestVector)