From: Benjamin Segovia
Date: Mon, 9 Apr 2012 18:39:07 +0000 (+0000)
Subject: Added more stuff in the vector library
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=748cd5596d0c8d18e68a65f085239ed3b4743727;p=contrib%2Fbeignet.git

Added more stuff in the vector library
---

diff --git a/backend/src/backend/sim/sim_vector.h b/backend/src/backend/sim/sim_vector.h
index 64385cf..44cfdd5 100644
--- a/backend/src/backend/sim/sim_vector.h
+++ b/backend/src/backend/sim/sim_vector.h
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include <smmintrin.h>
 #include
 #include
 
@@ -45,6 +46,20 @@ struct geni { __m128i m[vectorNum]; };
 template <uint32_t vectorNum>
 struct genm { __m128 m[vectorNum]; };
 
+/*! To cast through memory */
+union CastType {
+  INLINE CastType(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {
+    u[0] = u0; u[1] = u1; u[2] = u2; u[3] = u3;
+  }
+  INLINE CastType(float f0, float f1, float f2, float f3) {
+    f[0] = f0; f[1] = f1; f[2] = f2; f[3] = f3;
+  }
+  __m128 v;
+  __m128i vi;
+  uint32_t u[4];
+  float f[4];
+};
+
 typedef genf<1,true> genf1; // contains 3 clobbered values
 typedef genf<1,false> genf4;
 typedef genf<2,false> genf8;
@@ -66,6 +81,11 @@ static INLINE uint32_t elemNum(genf4 x) { return 4; }
 static INLINE uint32_t elemNum(genf8 x) { return 8; }
 static INLINE uint32_t elemNum(genf16 x) { return 16; }
 static INLINE uint32_t elemNum(genf32 x) { return 32; }
+static INLINE uint32_t elemNum(geni1 x) { return 1; }
+static INLINE uint32_t elemNum(geni4 x) { return 4; }
+static INLINE uint32_t elemNum(geni8 x) { return 8; }
+static INLINE uint32_t elemNum(geni16 x) { return 16; }
+static INLINE uint32_t elemNum(geni32 x) { return 32; }
 
 template
 INLINE const __m128 shuffle(const __m128& b) {
@@ -87,28 +107,38 @@ INLINE const __m128i expand(const __m128i& b) {
   return shuffle(b);
 }
 
+/* Build an integer mask from the mask vectors */
+template <uint32_t vectorNum>
+INLINE uint32_t mask(const genm<vectorNum> v) {
+  uint32_t m = _mm_movemask_ps(v.m[0]);
+  for (uint32_t i = 1; i < vectorNum; ++i)
+    m |= _mm_movemask_ps(v.m[i]) << (4*i);
+  return m;
+}
+INLINE uint32_t mask(const genm1 &v) { return _mm_movemask_ps(v.m[0]) & 1; }
+
 #define ID(X) X
-#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1) \
-template <uint32_t vectorNum> \
-INLINE void NAME(DST_TYPE<vectorNum> &dst, \
-                 const SRC_TYPE<vectorNum> &v0, \
-                 const SRC_TYPE<vectorNum> &v1) { \
-  for (uint32_t i = 0; i < vectorNum; ++i) \
-    dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(v1.m[i]))); \
-} \
-template <uint32_t vectorNum> \
-INLINE void NAME(DST_TYPE<vectorNum> &dst, \
-                 const SRC_TYPE<vectorNum> &v0, \
-                 const SRC_TYPE##1 &v1) { \
-  for (uint32_t i = 0; i < vectorNum; ++i) \
-    dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(expand<0>(v1.m[0])))); \
-} \
-template <uint32_t vectorNum> \
-INLINE void NAME(DST_TYPE<vectorNum> &dst, \
-                 const SRC_TYPE##1 &v0, \
-                 const SRC_TYPE<vectorNum> &v1) { \
-  for (uint32_t i = 0; i < vectorNum; ++i) \
-    dst.m[i] = FN(INTRINSIC_NAME(FN0(expand<0>(v0.m[0])), FN1(v1.m[i]))); \
+#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\
+template <uint32_t vectorNum>\
+INLINE void NAME(DST_TYPE<vectorNum> &dst,\
+                 const SRC_TYPE<vectorNum> &v0,\
+                 const SRC_TYPE<vectorNum> &v1) {\
+  for (uint32_t i = 0; i < vectorNum; ++i)\
+    dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(v1.m[i])));\
+}\
+template <uint32_t vectorNum>\
+INLINE void NAME(DST_TYPE<vectorNum> &dst,\
+                 const SRC_TYPE<vectorNum> &v0,\
+                 const SRC_TYPE##1 &v1) {\
+  for (uint32_t i = 0; i < vectorNum; ++i)\
+    dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(expand<0>(v1.m[0]))));\
+}\
+template <uint32_t vectorNum>\
+INLINE void NAME(DST_TYPE<vectorNum> &dst,\
+                 const SRC_TYPE##1 &v0,\
+                 const SRC_TYPE<vectorNum> &v1) {\
+  for (uint32_t i = 0; i < vectorNum; ++i)\
+    dst.m[i] = FN(INTRINSIC_NAME(FN0(expand<0>(v0.m[0])), FN1(v1.m[i])));\
 }
 
 VEC_OP(genf, genf, ADD, _mm_add_ps, ID, ID, ID);
@@ -123,6 +153,9 @@ VEC_OP(genm, genf, GT, _mm_cmpgt_ps, ID, ID, ID);
 VEC_OP(genm, genf, GE, _mm_cmpge_ps, ID, ID, ID);
 VEC_OP(geni, geni, ADD, _mm_add_epi32, ID, ID, ID);
 VEC_OP(geni, geni, SUB, _mm_sub_epi32, ID, ID, ID);
+VEC_OP(genm, geni, EQ, _mm_cmpeq_epi32, _mm_castsi128_ps, ID, ID);
+VEC_OP(genm, geni, SLT, _mm_cmplt_epi32, _mm_castsi128_ps, ID, ID);
+VEC_OP(genm, geni, SGT, _mm_cmpgt_epi32, _mm_castsi128_ps, ID, ID);
 VEC_OP(geni, geni, OR, _mm_or_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps);
 VEC_OP(geni, geni, XOR, _mm_xor_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps);
 VEC_OP(geni, geni, AND, _mm_and_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps);
@@ -133,9 +166,53 @@ VEC_OP(genm, genf, SGE, _mm_cmpge_ps, ID, ID, ID);
 
 #undef VEC_OP
 
-#define SCALAR_OP(TYPE, NAME, INTRINSIC_NAME) \
-INLINE void NAME(TYPE &dst, const TYPE &v0, const TYPE &v1) { \
-  dst.m[0] = INTRINSIC_NAME(v0.m[0], v1.m[0]); \
+static const CastType alltrue(0xffffffff,0xffffffff,0xffffffff,0xffffffff);
+
+#define ICMP_VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\
+template <uint32_t vectorNum>\
+INLINE void NAME(DST_TYPE<vectorNum> &dst,\
+                 const SRC_TYPE<vectorNum> &v0,\
+                 const SRC_TYPE<vectorNum> &v1) {\
+  for (uint32_t i = 0; i < vectorNum; ++i)\
+    dst.m[i] = _mm_xor_ps(FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(v1.m[i]))), alltrue.v);\
+}\
+template <uint32_t vectorNum>\
+INLINE void NAME(DST_TYPE<vectorNum> &dst,\
+                 const SRC_TYPE<vectorNum> &v0,\
+                 const SRC_TYPE##1 &v1) {\
+  for (uint32_t i = 0; i < vectorNum; ++i)\
+    dst.m[i] = _mm_xor_ps(FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(expand<0>(v1.m[0])))), alltrue.v);\
+}\
+template <uint32_t vectorNum>\
+INLINE void NAME(DST_TYPE<vectorNum> &dst,\
+                 const SRC_TYPE##1 &v0,\
+                 const SRC_TYPE<vectorNum> &v1) {\
+  for (uint32_t i = 0; i < vectorNum; ++i)\
+    dst.m[i] = _mm_xor_ps(FN(INTRINSIC_NAME(FN0(expand<0>(v0.m[0])), FN1(v1.m[i]))), alltrue.v);\
+}
+ICMP_VEC_OP(genm, geni, SGE, _mm_cmplt_epi32, _mm_castsi128_ps, ID, ID);
+ICMP_VEC_OP(genm, geni, SLE, _mm_cmpgt_epi32, _mm_castsi128_ps, ID, ID);
+#undef ICMP_VEC_OP
+
+template <uint32_t vectorNum>
+INLINE void NE(genm<vectorNum> &dst, const geni<vectorNum> &v0, const geni<vectorNum> &v1) {
+  for (uint32_t i = 0; i < vectorNum; ++i)
+    dst.m[i] = _mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[i], v1.m[i])), alltrue.v);
+}
+template <uint32_t vectorNum>
+INLINE void NE(genm<vectorNum> &dst, const geni<vectorNum> &v0, const geni1 &v1) {
+  for (uint32_t i = 0; i < vectorNum; ++i)
+    dst.m[i] = _mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[i], expand<0>(v1.m[0]))), alltrue.v);
+}
+template <uint32_t vectorNum>
+INLINE void NE(genm<vectorNum> &dst, const geni1 &v0, const geni<vectorNum> &v1) {
+  for (uint32_t i = 0; i < vectorNum; ++i)
+    dst.m[i] = _mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(expand<0>(v0.m[0]), v1.m[i])), alltrue.v);
+}
+
+#define SCALAR_OP(TYPE, NAME, INTRINSIC_NAME)\
+INLINE void NAME(TYPE &dst, const TYPE &v0, const TYPE &v1) {\
+  dst.m[0] = INTRINSIC_NAME(v0.m[0], v1.m[0]);\
 }
 SCALAR_OP(genf1, ADD, _mm_add_ss);
 SCALAR_OP(genf1, SUB, _mm_sub_ss);
@@ -144,44 +221,126 @@ SCALAR_OP(genf1, DIV, _mm_div_ss);
 SCALAR_OP(geni1, ADD, _mm_add_epi32);
 SCALAR_OP(geni1, SUB, _mm_sub_epi32);
 #undef SCALAR_OP
-#undef ID
 
 /* load from contiguous floats / integers */
 template <uint32_t vectorNum>
-INLINE void LOAD(genf<vectorNum> &dst, const float *ptr) {
+INLINE void LOAD(genf<vectorNum> &dst, const char *ptr) {
   for (uint32_t i = 0; i < vectorNum; ++i)
-    dst.m[i] = _mm_loadu_ps(ptr + 4*i);
+    dst.m[i] = _mm_loadu_ps((const float*) ptr + 4*i);
 }
-INLINE void LOAD(genf1 &dst, const float *ptr) {
-  dst.m[0] = _mm_load_ss(ptr);
+INLINE void LOAD(genf1 &dst, const char *ptr) {
+  dst.m[0] = _mm_load_ss((const float*) ptr);
 }
 template <uint32_t vectorNum>
-INLINE void LOAD(geni<vectorNum> &dst, const float *ptr) {
+INLINE void LOAD(geni<vectorNum> &dst, const char *ptr) {
   for (uint32_t i = 0; i < vectorNum; ++i)
-    dst.m[i] = _mm_castps_si128(_mm_loadu_ps(ptr + 4*i));
+    dst.m[i] = _mm_castps_si128(_mm_loadu_ps((const float*) ptr + 4*i));
 }
-INLINE void LOAD(geni1 &dst, const float *ptr) {
-  dst.m[0] = _mm_castps_si128(_mm_load_ss(ptr));
+INLINE void LOAD(geni1 &dst, const char *ptr) {
+  dst.m[0] = _mm_castps_si128(_mm_load_ss((const float*) ptr));
 }
 
 /* store to contiguous floats / integers */
 template <uint32_t vectorNum>
-INLINE void STORE(genf<vectorNum> &dst, float *ptr) {
+INLINE void STORE(const genf<vectorNum> &src, char *ptr) {
   for (uint32_t i = 0; i < vectorNum; ++i)
-    _mm_storeu_ps(ptr + 4*i, dst.m[i]);
+    _mm_storeu_ps((float*) ptr + 4*i, src.m[i]);
 }
-INLINE void STORE(genf1 &dst, float *ptr) {
-  _mm_store_ss(ptr, dst.m[0]);
+INLINE void STORE(const genf1 &src, char *ptr) {
+  _mm_store_ss((float*) ptr, src.m[0]);
 }
 template <uint32_t vectorNum>
-INLINE void STORE(geni<vectorNum> &dst, float *ptr) {
+INLINE void STORE(const geni<vectorNum> &src, char *ptr) {
   for (uint32_t i = 0; i < vectorNum; ++i)
-    _mm_storeu_ps(ptr + 4*i, _mm_castsi128_ps(dst.m[i]));
+    _mm_storeu_ps((float*) ptr + 4*i, _mm_castsi128_ps(src.m[i]));
 }
-INLINE void STORE(geni1 &dst, float *ptr) {
-  _mm_store_ss(ptr, _mm_castsi128_ps(dst.m[0]));
+INLINE void STORE(const geni1 &src, char *ptr) {
+  _mm_store_ss((float*) ptr, _mm_castsi128_ps(src.m[0]));
 }
 
+/* Load immediates */
+template <uint32_t vectorNum>
+INLINE void LOADI(genf<vectorNum> &dst, float f) {
+  for (uint32_t i = 0; i < vectorNum; ++i)
+    dst.m[i] = _mm_load1_ps(&f);
+}
+INLINE void LOADI(genf1 &dst, float f) { dst.m[0] = _mm_load_ss(&f); }
+template <uint32_t vectorNum>
+INLINE void LOADI(geni<vectorNum> &dst, uint32_t u) {
+  union { float f; uint32_t u; } cast;
+  cast.u = u;
+  for (uint32_t i = 0; i < vectorNum; ++i)
+    dst.m[i] = _mm_castps_si128(_mm_load1_ps(&cast.f));
+}
+INLINE void LOADI(geni1 &dst, uint32_t u) {
+  union { float f; uint32_t u; } cast;
+  cast.u = u;
+  dst.m[0] = _mm_castps_si128(_mm_load_ss(&cast.f));
+}
+
+/* Scatter */
+#define SCATTER_OP(TYPE, FN)\
+template <uint32_t vectorNum>\
+INLINE void SCATTER(const TYPE<vectorNum> &value,\
+                    const geni<vectorNum> &offset,\
+                    char *base_address) {\
+  for (uint32_t i = 0; i < vectorNum; ++i) {\
+    const int v0 = _mm_extract_epi32(FN(value.m[i]), 0);\
+    const int v1 = _mm_extract_epi32(FN(value.m[i]), 1);\
+    const int v2 = _mm_extract_epi32(FN(value.m[i]), 2);\
+    const int v3 = _mm_extract_epi32(FN(value.m[i]), 3);\
+    const int o0 = _mm_extract_epi32(offset.m[i], 0);\
+    const int o1 = _mm_extract_epi32(offset.m[i], 1);\
+    const int o2 = _mm_extract_epi32(offset.m[i], 2);\
+    const int o3 = _mm_extract_epi32(offset.m[i], 3);\
+    *(int*)(base_address + o0) = v0;\
+    *(int*)(base_address + o1) = v1;\
+    *(int*)(base_address + o2) = v2;\
+    *(int*)(base_address + o3) = v3;\
+  }\
+}\
+INLINE void SCATTER(const TYPE##1 &value, const geni1 &offset, char *base_address) {\
+  const int v0 = _mm_extract_epi32(FN(value.m[0]), 0);\
+  const int o0 = _mm_extract_epi32(offset.m[0], 0);\
+  *(int*)(base_address + o0) = v0;\
+}
+SCATTER_OP(genf, _mm_castps_si128)
+SCATTER_OP(geni, ID)
+#undef SCATTER_OP
+
+/* Gather */
+#define GATHER_OP(TYPE, FN, FN2)\
+template <uint32_t vectorNum>\
+INLINE void GATHER(TYPE<vectorNum> &dst,\
+                   const geni<vectorNum> &offset,\
+                   char *base_address) {\
+  for (uint32_t i = 0; i < vectorNum; ++i) {\
+    const int o0 = _mm_extract_epi32(offset.m[i], 0);\
+    const int o1 = _mm_extract_epi32(offset.m[i], 1);\
+    const int o2 = _mm_extract_epi32(offset.m[i], 2);\
+    const int o3 = _mm_extract_epi32(offset.m[i], 3);\
+    const int v0 = *(int*)(base_address + o0);\
+    const int v1 = *(int*)(base_address + o1);\
+    const int v2 = *(int*)(base_address + o2);\
+    const int v3 = *(int*)(base_address + o3);\
+    __m128i d = FN(dst.m[i]);\
+    d = _mm_insert_epi32(d, v0, 0);\
+    d = _mm_insert_epi32(d, v1, 1);\
+    d = _mm_insert_epi32(d, v2, 2);\
+    d = _mm_insert_epi32(d, v3, 3);\
+    dst.m[i] = FN2(d);\
+  }\
+}\
+INLINE void GATHER(TYPE##1 &dst, const geni1 &offset, char *base_address) {\
+  const int o0 = _mm_extract_epi32(offset.m[0], 0);\
+  const int v0 = *(int*)(base_address + o0);\
+  dst.m[0] = FN2(_mm_insert_epi32(FN(dst.m[0]), v0, 0));\
+}
+GATHER_OP(genf, _mm_castps_si128, _mm_castsi128_ps)
+GATHER_OP(geni, ID, ID)
+#undef GATHER_OP
+
+#undef ID
 #undef INLINE
 
 #endif /* __GBE_SIM_VECTOR_H__ */
diff --git a/backend/src/backend/sim/sim_vector_str.cpp b/backend/src/backend/sim/sim_vector_str.cpp
index 410f6b0..e37d7f2 100644
--- a/backend/src/backend/sim/sim_vector_str.cpp
+++ b/backend/src/backend/sim/sim_vector_str.cpp
@@ -56,6 +56,7 @@ std::string sim_vector_str =
 "#include \n"
 "#include \n"
 "#include \n"
+"#include <smmintrin.h>\n"
 "#include \n"
 "#include \n"
 "\n"
@@ -71,6 +72,20 @@ std::string sim_vector_str =
 "template <uint32_t vectorNum>\n"
 "struct genm { __m128 m[vectorNum]; };\n"
 "\n"
+"/*! To cast through memory */\n"
+"union CastType {\n"
+"  INLINE CastType(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {\n"
+"    u[0] = u0; u[1] = u1; u[2] = u2; u[3] = u3;\n"
+"  }\n"
+"  INLINE CastType(float f0, float f1, float f2, float f3) {\n"
+"    f[0] = f0; f[1] = f1; f[2] = f2; f[3] = f3;\n"
+"  }\n"
+"  __m128 v;\n"
+"  __m128i vi;\n"
+"  uint32_t u[4];\n"
+"  float f[4];\n"
+"};\n"
+"\n"
 "typedef genf<1,true> genf1; // contains 3 clobbered values\n"
 "typedef genf<1,false> genf4;\n"
 "typedef genf<2,false> genf8;\n"
@@ -92,6 +107,11 @@ std::string sim_vector_str =
 "static INLINE uint32_t elemNum(genf8 x) { return 8; }\n"
 "static INLINE uint32_t elemNum(genf16 x) { return 16; }\n"
 "static INLINE uint32_t elemNum(genf32 x) { return 32; }\n"
+"static INLINE uint32_t elemNum(geni1 x) { return 1; }\n"
+"static INLINE uint32_t elemNum(geni4 x) { return 4; }\n"
+"static INLINE uint32_t elemNum(geni8 x) { return 8; }\n"
+"static INLINE uint32_t elemNum(geni16 x) { return 16; }\n"
+"static INLINE uint32_t elemNum(geni32 x) { return 32; }\n"
 "\n"
 "template\n"
 "INLINE const __m128 shuffle(const __m128& b) {\n"
@@ -113,28 +133,38 @@ std::string sim_vector_str =
 "  return shuffle(b);\n"
 "}\n"
 "\n"
+"/* Build an integer mask from the mask vectors */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE uint32_t mask(const genm<vectorNum> v) {\n"
+"  uint32_t m = _mm_movemask_ps(v.m[0]);\n"
+"  for (uint32_t i = 1; i < vectorNum; ++i)\n"
+"    m |= _mm_movemask_ps(v.m[i]) << (4*i);\n"
+"  return m;\n"
+"}\n"
+"INLINE uint32_t mask(const genm1 &v) { return _mm_movemask_ps(v.m[0]) & 1; }\n"
+"\n"
 "#define ID(X) X\n"
-"#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1) \\\n"
-"template <uint32_t vectorNum> \\\n"
-"INLINE void NAME(DST_TYPE<vectorNum> &dst, \\\n"
-"                 const SRC_TYPE<vectorNum> &v0, \\\n"
-"                 const SRC_TYPE<vectorNum> &v1) { \\\n"
-"  for (uint32_t i = 0; i < vectorNum; ++i) \\\n"
-"    dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(v1.m[i]))); \\\n"
-"} \\\n"
-"template <uint32_t vectorNum> \\\n"
-"INLINE void NAME(DST_TYPE<vectorNum> &dst, \\\n"
-"                 const SRC_TYPE<vectorNum> &v0, \\\n"
-"                 const SRC_TYPE##1 &v1) { \\\n"
-"  for (uint32_t i = 0; i < vectorNum; ++i) \\\n"
-"    dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(expand<0>(v1.m[0])))); \\\n"
-"} \\\n"
-"template <uint32_t vectorNum> \\\n"
-"INLINE void NAME(DST_TYPE<vectorNum> &dst, \\\n"
-"                 const SRC_TYPE##1 &v0, \\\n"
-"                 const SRC_TYPE<vectorNum> &v1) { \\\n"
-"  for (uint32_t i = 0; i < vectorNum; ++i) \\\n"
-"    dst.m[i] = FN(INTRINSIC_NAME(FN0(expand<0>(v0.m[0])), FN1(v1.m[i]))); \\\n"
+"#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\\\n"
+"template <uint32_t vectorNum>\\\n"
+"INLINE void NAME(DST_TYPE<vectorNum> &dst,\\\n"
+"                 const SRC_TYPE<vectorNum> &v0,\\\n"
+"                 const SRC_TYPE<vectorNum> &v1) {\\\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i)\\\n"
+"    dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(v1.m[i])));\\\n"
+"}\\\n"
+"template <uint32_t vectorNum>\\\n"
+"INLINE void NAME(DST_TYPE<vectorNum> &dst,\\\n"
+"                 const SRC_TYPE<vectorNum> &v0,\\\n"
+"                 const SRC_TYPE##1 &v1) {\\\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i)\\\n"
+"    dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(expand<0>(v1.m[0]))));\\\n"
+"}\\\n"
+"template <uint32_t vectorNum>\\\n"
+"INLINE void NAME(DST_TYPE<vectorNum> &dst,\\\n"
+"                 const SRC_TYPE##1 &v0,\\\n"
+"                 const SRC_TYPE<vectorNum> &v1) {\\\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i)\\\n"
+"    dst.m[i] = FN(INTRINSIC_NAME(FN0(expand<0>(v0.m[0])), FN1(v1.m[i])));\\\n"
 "}\n"
 "\n"
 "VEC_OP(genf, genf, ADD, _mm_add_ps, ID, ID, ID);\n"
SRC_TYPE &v0,\\n" +" const SRC_TYPE &v1) {\\n" +" for (uint32_t i = 0; i < vectorNum; ++i)\\n" +" dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(v1.m[i])));\\n" +"}\\n" +"template \\n" +"INLINE void NAME(DST_TYPE &dst,\\n" +" const SRC_TYPE &v0,\\n" +" const SRC_TYPE##1 &v1) {\\n" +" for (uint32_t i = 0; i < vectorNum; ++i)\\n" +" dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(expand<0>(v1.m[0]))));\\n" +"}\\n" +"template \\n" +"INLINE void NAME(DST_TYPE &dst,\\n" +" const SRC_TYPE##1 &v0,\\n" +" const SRC_TYPE &v1) {\\n" +" for (uint32_t i = 0; i < vectorNum; ++i)\\n" +" dst.m[i] = FN(INTRINSIC_NAME(FN0(expand<0>(v0.m[0])), FN1(v1.m[i])));\\n" "}\n" "\n" "VEC_OP(genf, genf, ADD, _mm_add_ps, ID, ID, ID);\n" @@ -149,6 +179,9 @@ std::string sim_vector_str = "VEC_OP(genm, genf, GE, _mm_cmpge_ps, ID, ID, ID);\n" "VEC_OP(geni, geni, ADD, _mm_add_epi32, ID, ID, ID);\n" "VEC_OP(geni, geni, SUB, _mm_sub_epi32, ID, ID, ID);\n" +"VEC_OP(genm, geni, EQ, _mm_cmpeq_epi32, ID, ID, ID);\n" +"VEC_OP(genm, geni, SLT, _mm_cmplt_epi32, ID, ID, ID);\n" +"VEC_OP(genm, geni, SGT, _mm_cmpgt_epi32, ID, ID, ID);\n" "VEC_OP(geni, geni, OR, _mm_or_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps);\n" "VEC_OP(geni, geni, XOR, _mm_xor_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps);\n" "VEC_OP(geni, geni, AND, _mm_and_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps);\n" @@ -159,9 +192,53 @@ std::string sim_vector_str = "\n" "#undef VEC_OP\n" "\n" -"#define SCALAR_OP(TYPE, NAME, INTRINSIC_NAME) \\n" -"INLINE void NAME(TYPE &dst, const TYPE &v0, const TYPE &v1) { \\n" -" dst.m[0] = INTRINSIC_NAME(v0.m[0], v1.m[0]); \\n" +"#define ICMP_VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\\n" +"template \\n" +"INLINE void NAME(DST_TYPE &dst,\\n" +" const SRC_TYPE &v0,\\n" +" const SRC_TYPE &v1) {\\n" +" for (uint32_t i = 0; i < vectorNum; ++i)\\n" +" dst.m[i] = FN(INTRINSIC_NAME(FN1(v1.m[i]), FN0(v0.m[i])));\\n" +"}\\n" +"template \\n" +"INLINE void NAME(DST_TYPE &dst,\\n" +" const SRC_TYPE &v0,\\n" +" const SRC_TYPE##1 &v1) {\\n" +" for (uint32_t i = 0; i < vectorNum; ++i)\\n" +" dst.m[i] = FN(INTRINSIC_NAME(FN1(expand<0>(v1.m[0])), FN0(v0.m[i])));\\n" +"}\\n" +"template \\n" +"INLINE void NAME(DST_TYPE &dst,\\n" +" const SRC_TYPE##1 &v0,\\n" +" const SRC_TYPE &v1) {\\n" +" for (uint32_t i = 0; i < vectorNum; ++i)\\n" +" dst.m[i] = FN(INTRINSIC_NAME(FN1(v1.m[i]), FN0(expand<0>(v0.m[0]))));\\n" +"}\n" +"ICMP_VEC_OP(genm, geni, SGE, _mm_cmplt_epi32, ID, ID, ID);\n" +"ICMP_VEC_OP(genm, geni, SLE, _mm_cmpgt_epi32, ID, ID, ID);\n" +"#undef ICMP_VEC_OP\n" +"\n" +"static const CastType alltrue(0xffffffff,0xffffffff,0xffffffff,0xffffffff);\n" +"\n" +"template \n" +"INLINE void NE(genm &dst, const geni &v0, const geni &v1) {\n" +" for (uint32_t i = 0; i < vectorNum; ++i)\n" +" dst.m[i] = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[i], v1.m[i])),alltrue.v));\n" +"}\n" +"template \n" +"INLINE void NE(genm &dst, const geni &v0, const geni1 &v1) {\n" +" for (uint32_t i = 0; i < vectorNum; ++i)\n" +" dst.m[i] = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[i], v1.m[0])),alltrue.v));\n" +"}\n" +"template \n" +"INLINE void NE(genm &dst, const geni1 &v0, const geni &v1) {\n" +" for (uint32_t i = 0; i < vectorNum; ++i)\n" +" dst.m[i] = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[0], v1.m[i])),alltrue.v));\n" +"}\n" +"\n" +"#define SCALAR_OP(TYPE, NAME, INTRINSIC_NAME)\\n" +"INLINE void NAME(TYPE &dst, const TYPE &v0, const TYPE &v1) {\\n" 
+" dst.m[0] = INTRINSIC_NAME(v0.m[0], v1.m[0]);\\n" "}\n" "SCALAR_OP(genf1, ADD, _mm_add_ss);\n" "SCALAR_OP(genf1, SUB, _mm_sub_ss);\n" @@ -170,44 +247,124 @@ std::string sim_vector_str = "SCALAR_OP(geni1, ADD, _mm_add_epi32);\n" "SCALAR_OP(geni1, SUB, _mm_sub_epi32);\n" "#undef SCALAR_OP\n" -"#undef ID\n" "\n" "/* load from contiguous floats / integers */\n" "template \n" -"INLINE void LOAD(genf &dst, const float *ptr) {\n" +"INLINE void LOAD(genf &dst, const char *ptr) {\n" " for (uint32_t i = 0; i < vectorNum; ++i)\n" -" dst.m[i] = _mm_loadu_ps(ptr + 4*i);\n" +" dst.m[i] = _mm_loadu_ps((const float*) ptr + 4*i);\n" "}\n" -"INLINE void LOAD(genf1 &dst, const float *ptr) {\n" -" dst.m[0] = _mm_load_ss(ptr);\n" +"INLINE void LOAD(genf1 &dst, const char *ptr) {\n" +" dst.m[0] = _mm_load_ss((const float*) ptr);\n" "}\n" "template \n" -"INLINE void LOAD(geni &dst, const float *ptr) {\n" +"INLINE void LOAD(geni &dst, const char *ptr) {\n" " for (uint32_t i = 0; i < vectorNum; ++i)\n" -" dst.m[i] = _mm_castps_si128(_mm_loadu_ps(ptr + 4*i));\n" +" dst.m[i] = _mm_castps_si128(_mm_loadu_ps((const float*) ptr + 4*i));\n" "}\n" -"INLINE void LOAD(geni1 &dst, const float *ptr) {\n" -" dst.m[0] = _mm_castps_si128(_mm_load_ss(ptr));\n" +"INLINE void LOAD(geni1 &dst, const char *ptr) {\n" +" dst.m[0] = _mm_castps_si128(_mm_load_ss((const float*) ptr));\n" "}\n" "\n" "/* store to contiguous floats / integers */\n" "template \n" -"INLINE void STORE(genf &dst, float *ptr) {\n" +"INLINE void STORE(const genf &src, char *ptr) {\n" " for (uint32_t i = 0; i < vectorNum; ++i)\n" -" _mm_storeu_ps(ptr + 4*i, dst.m[i]);\n" +" _mm_storeu_ps((float*) ptr + 4*i, src.m[i]);\n" "}\n" -"INLINE void STORE(genf1 &dst, float *ptr) {\n" -" _mm_store_ss(ptr, dst.m[0]);\n" +"INLINE void STORE(genf1 &src, char *ptr) {\n" +" _mm_store_ss((float*) ptr, src.m[0]);\n" "}\n" "template \n" -"INLINE void STORE(geni &dst, float *ptr) {\n" +"INLINE void STORE(const geni &src, char *ptr) {\n" " for (uint32_t i = 0; i < vectorNum; ++i)\n" -" _mm_storeu_ps(ptr + 4*i, _mm_castsi128_ps(dst.m[i]));\n" +" _mm_storeu_ps((float*) ptr + 4*i, _mm_castsi128_ps(src.m[i]));\n" "}\n" -"INLINE void STORE(geni1 &dst, float *ptr) {\n" -" _mm_store_ss(ptr, _mm_castsi128_ps(dst.m[0]));\n" +"INLINE void STORE(const geni1 &src, char *ptr) {\n" +" _mm_store_ss((float*) ptr, _mm_castsi128_ps(src.m[0]));\n" "}\n" "\n" +"/* Load immediates */\n" +"template \n" +"INLINE void LOADI(genf &dst, float f) {\n" +" for (uint32_t i = 0; i < vectorNum; ++i)\n" +" dst.m[i] = _mm_load1_ps(&f);\n" +"}\n" +"INLINE void LOADI(genf1 &dst, float f) { dst.m[0] = _mm_load_ss(&f); }\n" +"template \n" +"INLINE void LOADI(geni &dst, uint32_t u) {\n" +" union { float f; uint32_t u; } cast;\n" +" cast.u = u;\n" +" for (uint32_t i = 0; i < vectorNum; ++i)\n" +" dst.m[i] = _mm_castps_si128(_mm_load1_ps(&cast.f));\n" +"}\n" +"INLINE void LOADI(geni1 &dst, uint32_t u) {\n" +" union { float f; uint32_t u; } cast;\n" +" cast.u = u;\n" +" dst.m[0] = _mm_castps_si128(_mm_load_ss(&cast.f));\n" +"}\n" +"\n" +"/* Scatter */\n" +"#define SCATTER_OP(TYPE, FN)\\n" +"template \\n" +"INLINE void SCATTER(const TYPE &value,\\n" +" const geni &offset,\\n" +" char *base_address) {\\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\\n" +" const int v0 = _mm_extract_epi32(FN(value.m[i]), 0);\\n" +" const int v1 = _mm_extract_epi32(FN(value.m[i]), 1);\\n" +" const int v2 = _mm_extract_epi32(FN(value.m[i]), 2);\\n" +" const int v3 = _mm_extract_epi32(FN(value.m[i]), 3);\\n" +" const int o0 = 
_mm_extract_epi32(offset.m[i], 0);\\n" +" const int o1 = _mm_extract_epi32(offset.m[i], 1);\\n" +" const int o2 = _mm_extract_epi32(offset.m[i], 2);\\n" +" const int o3 = _mm_extract_epi32(offset.m[i], 3);\\n" +" *(int*)(base_address + o0) = v0;\\n" +" *(int*)(base_address + o1) = v1;\\n" +" *(int*)(base_address + o2) = v2;\\n" +" *(int*)(base_address + o3) = v3;\\n" +" }\\n" +"}\\n" +"INLINE void SCATTER(const TYPE##1 &value, const geni1 &offset, char *base_address) {\\n" +" const int v0 = _mm_extract_epi32(FN(value.m[0]), 0);\\n" +" const int o0 = _mm_extract_epi32(offset.m[0], 0);\\n" +" *(int*)(base_address + o0) = v0;\\n" +"}\n" +"SCATTER_OP(genf, _mm_castps_si128)\n" +"SCATTER_OP(geni, ID)\n" +"#undef SCATTER_OP\n" +"\n" +"/* Gather */\n" +"#define GATHER_OP(TYPE, FN)\\n" +"template \\n" +"INLINE void GATHER(TYPE &dst,\\n" +" const geni &offset,\\n" +" char *base_address) {\\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\\n" +" const int o0 = _mm_extract_epi32(offset.m[i], 0);\\n" +" const int o1 = _mm_extract_epi32(offset.m[i], 1);\\n" +" const int o2 = _mm_extract_epi32(offset.m[i], 2);\\n" +" const int o3 = _mm_extract_epi32(offset.m[i], 3);\\n" +" const int v0 = *(int*)(base_address + o0);\\n" +" const int v1 = *(int*)(base_address + o1);\\n" +" const int v2 = *(int*)(base_address + o2);\\n" +" const int v3 = *(int*)(base_address + o3);\\n" +" _mm_insert_epi32(FN(dst.m[i]), v0, 0);\\n" +" _mm_insert_epi32(FN(dst.m[i]), v1, 1);\\n" +" _mm_insert_epi32(FN(dst.m[i]), v2, 2);\\n" +" _mm_insert_epi32(FN(dst.m[i]), v3, 3);\\n" +" }\\n" +"}\\n" +"INLINE void GATHER(TYPE##1 &dst, const geni1 &offset, char *base_address) {\\n" +" const int o0 = _mm_extract_epi32(offset.m[0], 0);\\n" +" const int v0 = *(int*)(base_address + o0);\\n" +" _mm_insert_epi32(FN(dst.m[0]), v0, 0);\\n" +"}\n" +"GATHER_OP(genf, _mm_castps_si128)\n" +"GATHER_OP(geni, ID)\n" +"#undef GATHER_OP\n" +"\n" +"#undef ID\n" "#undef INLINE\n" "\n" "#endif /* __GBE_SIM_VECTOR_H__ */\n" diff --git a/backend/src/backend/sim_context.cpp b/backend/src/backend/sim_context.cpp index 33fc03e..7a5d57c 100644 --- a/backend/src/backend/sim_context.cpp +++ b/backend/src/backend/sim_context.cpp @@ -75,7 +75,5 @@ namespace gbe if (UNLIKELY(simKernel->fn == NULL)) FATAL("Failed to get the symbol from the compiled shared object"); } - } /* namespace gbe */ - diff --git a/backend/src/utest/utest_vector.cpp b/backend/src/utest/utest_vector.cpp index a00637f..78bfcc0 100644 --- a/backend/src/utest/utest_vector.cpp +++ b/backend/src/utest/utest_vector.cpp @@ -23,68 +23,75 @@ static INLINE bool ok(float x, float y) { return fabs(x-y) / (1.f + std::max(fabs(x), fabs(y))) < 1.e-6; } +static INLINE bool ok(int x, int y) { return x == y; } -NOINLINE void hop(const float *p0, const float *p1, float *p2) -{ - genf16 _0,_1,_2; - LOAD(_0, p0); - LOAD(_1, p1); - LOAD(_2, p2); - MUL(_0, _0, _1); - ADD(_0, _1, _2); - MUL(_1, _1, _2); - SUB(_0, _0, _2); - SUB(_0, _1, _0); - STORE(_0, p2); -} - -static void utestFP(void) -{ - genf1 _0, _4, _5; - genf16 _1, _2, _3; - const float fdata16[32] = {1.f,1.f,2.f, 3.f, 4.f, 5.f, 6.f, 7.f, - 8.f,9.f,10.f,11.f,12.f,13.f,14.f,15.f, - 8.f,9.f,10.f,11.f,12.f,13.f,14.f,15.f, - 1.f,1.f,2.f, 3.f, 4.f, 5.f, 6.f, 7.f}; - - LOAD(_0, fdata16+4); - LOAD(_4, fdata16+5); - LOAD(_1, fdata16); - LOAD(_2, fdata16); -#define CHECK_BIN_FLOAT(FN,OP,DST,SRC0,SRC1,ELEM0,ELEM1)\ +#define CHECK_BINARY_OP(TYPE,FN,OP,DST,SRC0,SRC1,ELEM0,ELEM1)\ do {\ FN(DST, SRC0, SRC1);\ - float tmp[32];\ - STORE(DST, tmp);\ + TYPE tmp[32];\ + STORE(DST, 
-  CHECK_BIN_FLOAT(MUL,*,_3,_2,_1,fdata16[i],fdata16[i]);
-  CHECK_BIN_FLOAT(DIV,/,_3,_2,_1,fdata16[i],fdata16[i]);
-  CHECK_BIN_FLOAT(ADD,+,_3,_2,_1,fdata16[i],fdata16[i]);
-  CHECK_BIN_FLOAT(SUB,-,_3,_2,_1,fdata16[i],fdata16[i]);
-  CHECK_BIN_FLOAT(MUL,*,_3,_2,_0,fdata16[i],fdata16[4]);
-  CHECK_BIN_FLOAT(DIV,/,_3,_2,_0,fdata16[i],fdata16[4]);
-  CHECK_BIN_FLOAT(ADD,+,_3,_2,_0,fdata16[i],fdata16[4]);
-  CHECK_BIN_FLOAT(SUB,-,_3,_2,_0,fdata16[i],fdata16[4]);
-  CHECK_BIN_FLOAT(MUL,*,_3,_2,_0,fdata16[i],fdata16[4]);
-  CHECK_BIN_FLOAT(DIV,/,_3,_2,_0,fdata16[i],fdata16[4]);
-  CHECK_BIN_FLOAT(ADD,+,_3,_2,_0,fdata16[i],fdata16[4]);
-  CHECK_BIN_FLOAT(SUB,-,_3,_2,_0,fdata16[i],fdata16[4]);
-  CHECK_BIN_FLOAT(MUL,*,_5,_4,_0,fdata16[5],fdata16[4]);
-  CHECK_BIN_FLOAT(DIV,/,_5,_4,_0,fdata16[5],fdata16[4]);
-  CHECK_BIN_FLOAT(ADD,+,_5,_4,_0,fdata16[5],fdata16[4]);
-  CHECK_BIN_FLOAT(SUB,-,_5,_4,_0,fdata16[5],fdata16[4]);
-#undef CHECK_BIN_FLOAT
-  float t0[16], t1[16], t2[16];
-  hop(t0,t1,t2);
+
+static void utestFP(void)
+{
+  genf1 _0, _4, _5;
+  genf16 _1, _2, _3;
+  const float data[32] = {1.f,1.f,2.f, 3.f, 4.f, 5.f, 6.f, 7.f,
+                          8.f,9.f,10.f,11.f,12.f,13.f,14.f,15.f,
+                          8.f,9.f,10.f,11.f,12.f,13.f,14.f,15.f,
+                          1.f,1.f,2.f, 3.f, 4.f, 5.f, 6.f, 7.f};
+
+  LOAD(_0, (const char *) (data+4));
+  LOAD(_4, (const char *) (data+5));
+  LOAD(_1, (const char *) (data));
+  LOAD(_2, (const char *) (data));
+  CHECK_BINARY_OP(float,MUL,*,_3,_2,_1,data[i],data[i]);
+  CHECK_BINARY_OP(float,DIV,/,_3,_2,_1,data[i],data[i]);
+  CHECK_BINARY_OP(float,ADD,+,_3,_2,_1,data[i],data[i]);
+  CHECK_BINARY_OP(float,SUB,-,_3,_2,_1,data[i],data[i]);
+  CHECK_BINARY_OP(float,MUL,*,_3,_2,_0,data[i],data[4]);
+  CHECK_BINARY_OP(float,DIV,/,_3,_2,_0,data[i],data[4]);
+  CHECK_BINARY_OP(float,ADD,+,_3,_2,_0,data[i],data[4]);
+  CHECK_BINARY_OP(float,SUB,-,_3,_2,_0,data[i],data[4]);
+  CHECK_BINARY_OP(float,MUL,*,_3,_2,_0,data[i],data[4]);
+  CHECK_BINARY_OP(float,DIV,/,_3,_2,_0,data[i],data[4]);
+  CHECK_BINARY_OP(float,ADD,+,_3,_2,_0,data[i],data[4]);
+  CHECK_BINARY_OP(float,SUB,-,_3,_2,_0,data[i],data[4]);
+  CHECK_BINARY_OP(float,MUL,*,_5,_4,_0,data[5],data[4]);
+  CHECK_BINARY_OP(float,DIV,/,_5,_4,_0,data[5],data[4]);
+  CHECK_BINARY_OP(float,ADD,+,_5,_4,_0,data[5],data[4]);
+  CHECK_BINARY_OP(float,SUB,-,_5,_4,_0,data[5],data[4]);
+}
+
+static void utestInt(void)
+{
+  geni1 _0, _4, _5;
+  geni16 _1, _2, _3;
+  const int data[32] = {1,1,2, 3, 4, 5, 6, 7,
+                        8,9,10,11,12,13,14,15,
+                        8,9,10,11,12,13,14,15,
+                        1,1,2, 3, 4, 5, 6, 7};
+  LOAD(_0, (const char *) (data+4));
+  LOAD(_4, (const char *) (data+5));
+  LOAD(_1, (const char *) (data));
+  LOAD(_2, (const char *) (data));
+  CHECK_BINARY_OP(int,ADD,+,_3,_2,_1,data[i],data[i]);
+  CHECK_BINARY_OP(int,SUB,-,_3,_2,_1,data[i],data[i]);
+  CHECK_BINARY_OP(int,ADD,+,_3,_2,_0,data[i],data[4]);
+  CHECK_BINARY_OP(int,SUB,-,_3,_2,_0,data[i],data[4]);
+  CHECK_BINARY_OP(int,ADD,+,_5,_4,_0,data[5],data[4]);
+  CHECK_BINARY_OP(int,SUB,-,_5,_4,_0,data[5],data[4]);
+
 }
 
 static void utestVector(void)
 {
   UTEST_EXPECT_SUCCESS(utestFP());
+  UTEST_EXPECT_SUCCESS(utestInt());
 }
 
 UTEST_REGISTER(utestVector)
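---

Usage sketch (editor's illustration, not part of the commit): the helpers in this patch rest on three SSE idioms -- negating a strict integer comparison by XORing it with an all-ones mask (the ICMP/NE helpers), compressing per-lane comparison results into an integer bit mask with _mm_movemask_ps (mask), and emulating gather with _mm_extract_epi32/_mm_insert_epi32 (GATHER). The standalone snippet below reproduces those idioms without the library types; main, the sample arrays, and the -msse4.1 flag are assumptions made for the demo, not code from the tree.

#include <smmintrin.h> // SSE4.1: _mm_extract_epi32 / _mm_insert_epi32
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int32_t a[4] = {1, 5, 3, 9};
  const int32_t b[4] = {4, 5, 2, 8};
  const __m128i va = _mm_loadu_si128((const __m128i*) a);
  const __m128i vb = _mm_loadu_si128((const __m128i*) b);

  // SGE: a >= b computed as NOT(a < b), i.e. the strict SSE2
  // comparison XORed with all ones.
  const __m128i lt  = _mm_cmplt_epi32(va, vb);
  const __m128i sge = _mm_xor_si128(lt, _mm_set1_epi32(-1));

  // Compress the four lane masks into a 4-bit integer mask.
  const int bits = _mm_movemask_ps(_mm_castsi128_ps(sge));
  printf("a >= b lane mask: 0x%x\n", bits); // 0xe: lanes 1, 2, 3 hold

  // Gather: extract a byte offset per lane, read from the base
  // address and insert each loaded value back into a register.
  const char *base = (const char*) a;
  const __m128i offset = _mm_set_epi32(12, 8, 4, 0); // offsets for lanes 3..0
  __m128i g = _mm_setzero_si128();
  g = _mm_insert_epi32(g, *(const int*)(base + _mm_extract_epi32(offset, 0)), 0);
  g = _mm_insert_epi32(g, *(const int*)(base + _mm_extract_epi32(offset, 1)), 1);
  g = _mm_insert_epi32(g, *(const int*)(base + _mm_extract_epi32(offset, 2)), 2);
  g = _mm_insert_epi32(g, *(const int*)(base + _mm_extract_epi32(offset, 3)), 3);
  int32_t out[4];
  _mm_storeu_si128((__m128i*) out, g);
  printf("gathered: %d %d %d %d\n", out[0], out[1], out[2], out[3]); // 1 5 3 9
  return 0;
}

Compile with something like g++ -msse4.1 sketch.cpp. Note that _mm_insert_epi32 returns the updated register instead of modifying its argument, which is why each result is assigned back -- the same reason GATHER above accumulates into a temporary and stores FN2(d) into dst.m[i].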