From 4f528e65b2360318a3c7c72d6219c8edd055d467 Mon Sep 17 00:00:00 2001 From: Benjamin Segovia Date: Mon, 16 Apr 2012 18:44:44 +0000 Subject: [PATCH] First implementation of branches in the simulator --- backend/src/CMakeLists.txt | 7 +- backend/src/backend/program.cpp | 4 +- backend/src/backend/sim/sim_vector.h | 238 ++++++++++++++++++++- backend/src/backend/sim/sim_vector_str.cpp | 238 ++++++++++++++++++++- backend/src/backend/sim_context.cpp | 78 +++++-- backend/src/backend/sim_context.hpp | 2 + backend/src/llvm/llvm_to_gen.cpp | 18 +- backend/src/{llvm/stdlib.h => ocl_stdlib.h} | 0 .../{llvm/stdlib_str.cpp => ocl_stdlib_str.cpp} | 2 +- 9 files changed, 558 insertions(+), 29 deletions(-) rename backend/src/{llvm/stdlib.h => ocl_stdlib.h} (100%) rename backend/src/{llvm/stdlib_str.cpp => ocl_stdlib_str.cpp} (99%) diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt index 26fd6cd..f66235f 100644 --- a/backend/src/CMakeLists.txt +++ b/backend/src/CMakeLists.txt @@ -22,13 +22,15 @@ endmacro (stringify) set (TO_STRINGIFY_FILES simulator sim_vector) stringify ("${GBE_SOURCE_DIR}/src/backend/sim/" "${TO_STRINGIFY_FILES}") -set (TO_STRINGIFY_FILES stdlib) -stringify ("${GBE_SOURCE_DIR}/src/llvm/" "${TO_STRINGIFY_FILES}") +set (TO_STRINGIFY_FILES ocl_stdlib) +stringify ("${GBE_SOURCE_DIR}/src/" "${TO_STRINGIFY_FILES}") if (GBE_USE_BLOB) set (GBE_SRC blob.cpp) else (GBE_USE_BLOB) set (GBE_SRC + ocl_stdlib.h + ocl_stdlib_str.cpp sys/vector.hpp sys/hash_map.hpp sys/map.hpp @@ -69,7 +71,6 @@ else (GBE_USE_BLOB) ir/function.hpp ir/value.cpp ir/value.hpp - llvm/stdlib_str.cpp backend/context.cpp backend/context.hpp backend/program.cpp diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp index 08dac6e..9ec86f3 100644 --- a/backend/src/backend/program.cpp +++ b/backend/src/backend/program.cpp @@ -88,7 +88,7 @@ namespace gbe { GBE_SAFE_DELETE(program); } - extern std::string stdlib_str; + extern std::string ocl_stdlib_str; static 
gbe_program programNewFromSource(const char *source, size_t stringSize, char *err, @@ -101,7 +101,7 @@ namespace gbe { // Write the source to the cl file FILE *clFile = fopen(clName.c_str(), "w"); FATAL_IF(clFile == NULL, "Failed to open temporary file"); - fwrite(stdlib_str.c_str(), strlen(stdlib_str.c_str()), 1, clFile); + fwrite(ocl_stdlib_str.c_str(), strlen(ocl_stdlib_str.c_str()), 1, clFile); fwrite(source, strlen(source), 1, clFile); fclose(clFile); diff --git a/backend/src/backend/sim/sim_vector.h b/backend/src/backend/sim/sim_vector.h index 34cec34..2baffcc 100644 --- a/backend/src/backend/sim/sim_vector.h +++ b/backend/src/backend/sim/sim_vector.h @@ -58,7 +58,13 @@ INLINE const __m128i expand(const __m128i& b) { } /*! Base structure for scalar double word */ -union scalar_dw { uint32_t u; int32_t s; float f; }; +union scalar_dw { + INLINE scalar_dw(void) {} + INLINE scalar_dw(uint32_t u) { this->u = u; } + INLINE scalar_dw(int32_t s) { this->s = s; } + INLINE scalar_dw(float f) { this->f = f; } + uint32_t u; int32_t s; float f; +}; /*! Base structure for scalar mask */ union scalar_m { uint32_t u; int32_t s; float f; }; @@ -87,6 +93,26 @@ struct simd_m { __m128 m[vectorNum]; }; +/*! Select instruction on vectors */ +template +INLINE void select(simd_dw &dst, + const simd_dw &src0, + const simd_dw &src1, + const simd_m &mask) +{ + for (uint32_t i = 0; i < vectorNum; ++i) + dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]); +} +template +INLINE void select(simd_m &dst, + const simd_m &src0, + const simd_m &src1, + const simd_m &mask) +{ + for (uint32_t i = 0; i < vectorNum; ++i) + dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]); +} + /*! To cast through memory */ union cast_dw { INLINE cast_dw(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) { @@ -109,6 +135,12 @@ union cast_dw { }; static const cast_dw alltrue(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff); +/*! 
Make a mask true */ +template +INLINE void alltrueMask(simd_m &x) { + for (uint32_t i = 0; i < vectorNum; ++i) x.m[i] = alltrue.v; +} + /* Some convenient typedefs */ typedef scalar_dw simd1dw; typedef simd_dw<1> simd4dw; @@ -143,6 +175,17 @@ INLINE uint32_t mask(const simd_m v) { return m; } +/* MOV instruction */ +template +INLINE void MOV_S32(simd_dw &dst, const simd_dw &v) { + for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v.m[i]; +} +template +INLINE void MOV_S32(simd_dw &dst, const scalar_dw &x) { + const __m128 v = _mm_load1_ps(&x.f); + for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v; +} + /* Vector instructions that use sse* */ #define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\ template \ @@ -317,7 +360,6 @@ INLINE void LOADI(simd_dw &dst, uint32_t u) { dst.m[i] = _mm_load1_ps(&cast.f); } -#include /* Scatter */ template INLINE void SCATTER(const simd_dw &offset, @@ -350,7 +392,47 @@ INLINE void SCATTER(const scalar_dw &offset, char *base_address) { SCATTER(simd_dw(offset), value, base_address); } -#include + +/* Masked scatter will only store unmasked lanes */ +template +INLINE void MASKED_SCATTER(const simd_dw &offset, + const simd_dw &value, + char *base_address, + uint32_t mask) +{ + for (uint32_t i = 0; i < vectorNum; ++i) { + const int v0 = _mm_extract_epi32(PS2SI(value.m[i]), 0); + const int v1 = _mm_extract_epi32(PS2SI(value.m[i]), 1); + const int v2 = _mm_extract_epi32(PS2SI(value.m[i]), 2); + const int v3 = _mm_extract_epi32(PS2SI(value.m[i]), 3); + const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]), 0); + const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1); + const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2); + const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3); + if (mask & 1) *(int*)(base_address + o0) = v0; + if (mask & 2) *(int*)(base_address + o1) = v1; + if (mask & 4) *(int*)(base_address + o2) = v2; + if (mask & 8) *(int*)(base_address + o3) = v3; + mask = mask >> 4; + } +} +template +INLINE 
void MASKED_SCATTER(const simd_dw &offset, + const scalar_dw &value, + char *base_address, + uint32_t mask) +{ + MASKED_SCATTER(offset, simd_dw(value), base_address, mask); +} +template +INLINE void MASKED_SCATTER(const scalar_dw &offset, + const simd_dw &value, + char *base_address, + uint32_t mask) +{ + MASKED_SCATTER(simd_dw(offset), value, base_address, mask); +} + /* Gather */ template INLINE void GATHER(simd_dw &dst, @@ -378,6 +460,38 @@ INLINE void GATHER(simd_dw &dst, GATHER(dst, simd_dw(offset), base_address); } +/* Masked gather will only load activated lanes */ +template +INLINE void MASKED_GATHER(simd_dw &dst, + const simd_dw &offset, + const char *base_address, + uint32_t mask) +{ + for (uint32_t i = 0; i < vectorNum; ++i) { + const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]) , 0); + const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1); + const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2); + const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3); + const int v0 = *(const int*)(base_address + o0); + const int v1 = *(const int*)(base_address + o1); + const int v2 = *(const int*)(base_address + o2); + const int v3 = *(const int*)(base_address + o3); + if (mask & 1) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v0, 0)); + if (mask & 2) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v1, 1)); + if (mask & 4) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v2, 2)); + if (mask & 8) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v3, 3)); + mask = mask >> 4; + } +} +template +INLINE void MASKED_GATHER(simd_dw &dst, + const scalar_dw &offset, + const char *base_address, + uint32_t mask) +{ + MASKED_GATHER(dst, simd_dw(offset), base_address, mask); +} + ////////////////////////////////////////////////////////////////////////////// // Scalar instructions ////////////////////////////////////////////////////////////////////////////// @@ -425,6 +539,8 @@ INLINE void GATHER(scalar_dw &dst, scalar_dw offset, const char *base) { dst.u = // 
Identical instructions are forwarded ////////////////////////////////////////////////////////////////////////////// +#define MOV_U32 MOV_S32 +#define MOV_F MOV_S32 #define ADD_U32 ADD_S32 #define SUB_U32 SUB_S32 #define XOR_U32 XOR_S32 @@ -436,6 +552,122 @@ INLINE void GATHER(scalar_dw &dst, scalar_dw offset, const char *base) { dst.u = #undef PS2SI #undef SI2PS #undef ID + +////////////////////////////////////////////////////////////////////////////// +// Goto implementation which is directly inspired by BDW goto and by this +// article "Whole function vectorization" (CGO 2011) +////////////////////////////////////////////////////////////////////////////// + +/*! Update the UIP vector for the lanes alive in mask */ +template +INLINE void updateUIP(simd_dw &uipVec, const simd_m mask, uint32_t uip) { + union { float f; uint32_t u; } x; + x.u = uip; + __m128 v = _mm_load1_ps(&x.f); + for (uint32_t i = 0; i < vectorNum; ++i) + uipVec.m[i] = _mm_blendv_ps(uipVec.m[i], v, mask.m[i]); +} + +/*! Update the execution mask based on block IP and UIP values */ +template +INLINE void updateMask(simd_m &mask, const simd_dw &uipVec, uint32_t ip) { + const simd_dw ipv(ip); + LE_U32(mask, uipVec, ipv); +} + +/*! Jump to the block JIP */ +#define SIM_FWD_BRA(UIPVEC, EMASK, JIP, UIP) \ + do { \ + updateUIP(UIPVEC, EMASK, UIP); \ + goto label##JIP; \ + } while (0) + +/*! Based on the condition jump to block JIP */ +#define SIM_FWD_BRA_C(UIPVEC, EMASK, COND, JIP, UIP) \ + do { \ + updateUIP(UIPVEC, COND, UIP); \ + typeof(COND) jumpCond; \ + scalar_dw jipScalar(uint32_t(JIP)); \ + LT_U32(jumpCond, UIPVEC, JIP); \ + uint32_t jumpMask = mask(jumpCond); \ + if (!jumpMask) goto label##JIP; \ + } while (0) + +/*! Backward jump is always taken */ +#define SIM_BWD_BRA(UIPVEC, EMASK, JIP) \ + do { \ + updateUIP(UIPVEC, EMASK, JIP); \ + goto label##JIP; \ + } while (0) + +/*! 
Conditional backward jump is taken if the condition is non-null */ +#define SIM_BWD_BRA_C(UIPVEC, COND, JIP) \ + do { \ + updateUIP(UIPVEC, COND, JIP); \ + if (mask(COND) != 0) goto label##JIP; \ + } while (0) + +/*! JOIN: reactivates lanes */ +#define SIM_JOIN(UIPVEC, MASK, IP) \ + do { \ + updateMask(MASK, UIPVEC, IP); \ + movedMask = mask(MASK); \ + } while (0) + +/*! JOIN_JUMP: reactivate lanes and jump to JIP if none is activated */ +#define SIM_JOIN_JUMP(UIPVEC, EMASK, IP, JIP) \ + do { \ + SIM_JOIN(UIPVEC, EMASK, IP); \ + const uint32_t execMask = mask(EMASK); \ + if (execMask == 0) goto label##JIP; \ + } while (0) + +/* Macro to apply masking on destinations (from zero to four destinations) */ +#define MASKED0(OP, ...) \ + do { \ + OP(__VA_ARGS__); \ + } while (0) + +#define MASKED1(OP, ARG0, ...) \ + do { \ + typeof(ARG0) ARG0##__; \ + OP(ARG0##__, __VA_ARGS__); \ + select(ARG0, ARG0, ARG0##__, emask); \ + } while (0) + +#define MASKED2(OP, ARG0, ARG1, ...) \ + do { \ + typeof(ARG0) ARG0##__; \ + typeof(ARG1) ARG1##__; \ + OP(ARG0##__, ARG1##__, __VA_ARGS__); \ + select(ARG0, ARG0, ARG0##__, emask); \ + select(ARG1, ARG1, ARG1##__, emask); \ + } while (0) + +#define MASKED3(OP, ARG0, ARG1, ARG2, ...) \ + do { \ + typeof(ARG0) ARG0##__; \ + typeof(ARG1) ARG1##__; \ + typeof(ARG2) ARG2##__; \ + OP(ARG0##__, ARG1##__, ARG2##__, __VA_ARGS__); \ + select(ARG0, ARG0, ARG0##__, emask); \ + select(ARG1, ARG1, ARG1##__, emask); \ + select(ARG2, ARG2, ARG2##__, emask); \ + } while (0) + +#define MASKED4(OP, ARG0, ARG1, ARG2, ARG3, ...) 
\ + do { \ + typeof(ARG0) ARG0##__; \ + typeof(ARG1) ARG1##__; \ + typeof(ARG2) ARG2##__; \ + typeof(ARG3) ARG3##__; \ + OP(ARG0##__, ARG1##__, ARG2##__, ARG3##__, __VA_ARGS__); \ + select(ARG0, ARG0, ARG0##__, emask); \ + select(ARG1, ARG1, ARG1##__, emask); \ + select(ARG2, ARG2, ARG2##__, emask); \ + select(ARG3, ARG3, ARG3##__, emask); \ + } while (0) + #undef INLINE #endif /* __GBE_SIM_VECTOR_H__ */ diff --git a/backend/src/backend/sim/sim_vector_str.cpp b/backend/src/backend/sim/sim_vector_str.cpp index f900c3f..d3cc938 100644 --- a/backend/src/backend/sim/sim_vector_str.cpp +++ b/backend/src/backend/sim/sim_vector_str.cpp @@ -84,7 +84,13 @@ std::string sim_vector_str = "}\n" "\n" "/*! Base structure for scalar double word */\n" -"union scalar_dw { uint32_t u; int32_t s; float f; };\n" +"union scalar_dw {\n" +" INLINE scalar_dw(void) {}\n" +" INLINE scalar_dw(uint32_t u) { this->u = u; }\n" +" INLINE scalar_dw(int32_t s) { this->s = s; }\n" +" INLINE scalar_dw(float f) { this->f = f; }\n" +" uint32_t u; int32_t s; float f;\n" +"};\n" "\n" "/*! Base structure for scalar mask */\n" "union scalar_m { uint32_t u; int32_t s; float f; };\n" @@ -113,6 +119,26 @@ std::string sim_vector_str = " __m128 m[vectorNum];\n" "};\n" "\n" +"/*! Select instruction on vectors */\n" +"template \n" +"INLINE void select(simd_dw &dst,\n" +" const simd_dw &src0,\n" +" const simd_dw &src1,\n" +" const simd_m &mask)\n" +"{\n" +" for (uint32_t i = 0; i < vectorNum; ++i)\n" +" dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]);\n" +"}\n" +"template \n" +"INLINE void select(simd_m &dst,\n" +" const simd_m &src0,\n" +" const simd_m &src1,\n" +" const simd_m &mask)\n" +"{\n" +" for (uint32_t i = 0; i < vectorNum; ++i)\n" +" dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]);\n" +"}\n" +"\n" "/*! 
To cast through memory */\n" "union cast_dw {\n" " INLINE cast_dw(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {\n" @@ -135,6 +161,12 @@ std::string sim_vector_str = "};\n" "static const cast_dw alltrue(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\n" "\n" +"/*! Make a mask true */\n" +"template \n" +"INLINE void alltrueMask(simd_m &x) {\n" +" for (uint32_t i = 0; i < vectorNum; ++i) x.m[i] = alltrue.v;\n" +"}\n" +"\n" "/* Some convenient typedefs */\n" "typedef scalar_dw simd1dw;\n" "typedef simd_dw<1> simd4dw;\n" @@ -169,6 +201,17 @@ std::string sim_vector_str = " return m;\n" "}\n" "\n" +"/* MOV instruction */\n" +"template \n" +"INLINE void MOV_S32(simd_dw &dst, const simd_dw &v) {\n" +" for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v.m[i];\n" +"}\n" +"template \n" +"INLINE void MOV_S32(simd_dw &dst, const scalar_dw &x) {\n" +" const __m128 v = _mm_load1_ps(&x.f);\n" +" for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v;\n" +"}\n" +"\n" "/* Vector instructions that use sse* */\n" "#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\\\n" "template \\\n" @@ -343,7 +386,6 @@ std::string sim_vector_str = " dst.m[i] = _mm_load1_ps(&cast.f);\n" "}\n" "\n" -"#include \n" "/* Scatter */\n" "template \n" "INLINE void SCATTER(const simd_dw &offset,\n" @@ -376,7 +418,47 @@ std::string sim_vector_str = " char *base_address) {\n" " SCATTER(simd_dw(offset), value, base_address);\n" "}\n" -"#include \n" +"\n" +"/* Masked scatter will only store unmasked lanes */\n" +"template \n" +"INLINE void MASKED_SCATTER(const simd_dw &offset,\n" +" const simd_dw &value,\n" +" char *base_address,\n" +" uint32_t mask)\n" +"{\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\n" +" const int v0 = _mm_extract_epi32(PS2SI(value.m[i]), 0);\n" +" const int v1 = _mm_extract_epi32(PS2SI(value.m[i]), 1);\n" +" const int v2 = _mm_extract_epi32(PS2SI(value.m[i]), 2);\n" +" const int v3 = _mm_extract_epi32(PS2SI(value.m[i]), 3);\n" +" const int o0 = 
_mm_extract_epi32(PS2SI(offset.m[i]), 0);\n" +" const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1);\n" +" const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2);\n" +" const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3);\n" +" if (mask & 1) *(int*)(base_address + o0) = v0;\n" +" if (mask & 2) *(int*)(base_address + o1) = v1;\n" +" if (mask & 4) *(int*)(base_address + o2) = v2;\n" +" if (mask & 8) *(int*)(base_address + o3) = v3;\n" +" mask = mask >> 4;\n" +" }\n" +"}\n" +"template \n" +"INLINE void MASKED_SCATTER(const simd_dw &offset,\n" +" const scalar_dw &value,\n" +" char *base_address,\n" +" uint32_t mask)\n" +"{\n" +" MASKED_SCATTER(offset, simd_dw(value), base_address, mask);\n" +"}\n" +"template \n" +"INLINE void MASKED_SCATTER(const scalar_dw &offset,\n" +" const simd_dw &value,\n" +" char *base_address,\n" +" uint32_t mask)\n" +"{\n" +" MASKED_SCATTER(simd_dw(offset), value, base_address, mask);\n" +"}\n" +"\n" "/* Gather */\n" "template \n" "INLINE void GATHER(simd_dw &dst,\n" @@ -404,6 +486,38 @@ std::string sim_vector_str = " GATHER(dst, simd_dw(offset), base_address);\n" "}\n" "\n" +"/* Masked gather will only load activated lanes */\n" +"template \n" +"INLINE void MASKED_GATHER(simd_dw &dst,\n" +" const simd_dw &offset,\n" +" const char *base_address,\n" +" uint32_t mask)\n" +"{\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\n" +" const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]) , 0);\n" +" const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1);\n" +" const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2);\n" +" const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3);\n" +" const int v0 = *(const int*)(base_address + o0);\n" +" const int v1 = *(const int*)(base_address + o1);\n" +" const int v2 = *(const int*)(base_address + o2);\n" +" const int v3 = *(const int*)(base_address + o3);\n" +" if (mask & 1) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v0, 0));\n" +" if (mask & 2) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), 
v1, 1));\n" +" if (mask & 4) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v2, 2));\n" +" if (mask & 8) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v3, 3));\n" +" mask = mask >> 4;\n" +" }\n" +"}\n" +"template \n" +"INLINE void MASKED_GATHER(simd_dw &dst,\n" +" const scalar_dw &offset,\n" +" const char *base_address,\n" +" uint32_t mask)\n" +"{\n" +" MASKED_GATHER(dst, simd_dw(offset), base_address, mask);\n" +"}\n" +"\n" "//////////////////////////////////////////////////////////////////////////////\n" "// Scalar instructions\n" "//////////////////////////////////////////////////////////////////////////////\n" @@ -451,6 +565,8 @@ std::string sim_vector_str = "// Identical instructions are forwarded\n" "//////////////////////////////////////////////////////////////////////////////\n" "\n" +"#define MOV_U32 MOV_S32\n" +"#define MOV_F MOV_S32\n" "#define ADD_U32 ADD_S32\n" "#define SUB_U32 SUB_S32\n" "#define XOR_U32 XOR_S32\n" @@ -462,6 +578,122 @@ std::string sim_vector_str = "#undef PS2SI\n" "#undef SI2PS\n" "#undef ID\n" +"\n" +"//////////////////////////////////////////////////////////////////////////////\n" +"// Goto implementation which is directly inspired by BDW goto and by this\n" +"// article \"Whole function vectorization\" (CGO 2011)\n" +"//////////////////////////////////////////////////////////////////////////////\n" +"\n" +"/*! Update the UIP vector according for the lanes alive in mask */\n" +"template \n" +"INLINE void updateUIP(simd_dw &uipVec, const simd_m mask, uint32_t uip) {\n" +" union { float f; uint32_t u; } x;\n" +" x.u = uip;\n" +" __m128 v = _mm_load1_ps(&x.f);\n" +" for (uint32_t i = 0; i < vectorNum; ++i)\n" +" uipVec.m[i] = _mm_blendv_ps(uipVec.m[i], v, mask.m[i]);\n" +"}\n" +"\n" +"/*! Update the execution mask based on block IP and UIP values */\n" +"template \n" +"INLINE void updateMask(simd_m &mask, const simd_dw &uipVec, uint32_t ip) {\n" +" const simd_dw ipv(ip);\n" +" LE_U32(mask, uipVec, ipv);\n" +"}\n" +"\n" +"/*! 
Jump to the block JIP */\n" +"#define SIM_FWD_BRA(UIPVEC, EMASK, JIP, UIP) \\\n" +" do { \\\n" +" updateUIP(UIPVEC, EMASK, UIP); \\\n" +" goto label##JIP; \\\n" +" } while (0)\n" +"\n" +"/*! Based on the condition jump to block JIP */\n" +"#define SIM_FWD_BRA_C(UIPVEC, EMASK, COND, JIP, UIP) \\\n" +" do { \\\n" +" updateUIP(UIPVEC, COND, UIP); \\\n" +" typeof(COND) jumpCond; \\\n" +" scalar_dw jipScalar(uint32_t(JIP)); \\\n" +" LT_U32(jumpCond, UIPVEC, JIP); \\\n" +" uint32_t jumpMask = mask(jumpCond); \\\n" +" if (!jumpMask) goto label##JIP; \\\n" +" } while (0)\n" +"\n" +"/*! Backward jump is always taken */\n" +"#define SIM_BWD_BRA(UIPVEC, EMASK, JIP) \\\n" +" do { \\\n" +" updateUIP(UIPVEC, EMASK, JIP); \\\n" +" goto label##JIP; \\\n" +" } while (0)\n" +"\n" +"/*! Conditional backward jump is taken if the condition is non-null */\n" +"#define SIM_BWD_BRA_C(UIPVEC, COND, JIP) \\\n" +" do { \\\n" +" updateUIP(UIPVEC, COND, JIP); \\\n" +" if (mask(COND) != 0) goto label##JIP; \\\n" +" } while (0)\n" +"\n" +"/*! JOIN: reactivates lanes */\n" +"#define SIM_JOIN(UIPVEC, MASK, IP) \\\n" +" do { \\\n" +" updateMask(MASK, UIPVEC, IP); \\\n" +" movedMask = mask(MASK); \\\n" +" } while (0)\n" +"\n" +"/*! JOIN_JUMP: ractivate lanes and jump to JIP if none is activated */\n" +"#define SIM_JOIN_JUMP(UIPVEC, EMASK, IP, JIP) \\\n" +" do { \\\n" +" SIM_JOIN(UIPVEC, EMASK, IP); \\\n" +" const uint32_t execMask = mask(EMASK); \\\n" +" if (execMask == 0) goto label##JIP; \\\n" +" } while (0)\n" +"\n" +"/* Macro to apply masking on destinations (from zero to four destinations) */\n" +"#define MASKED0(OP, ...) \\\n" +" do { \\\n" +" OP(__VA_ARGS__); \\\n" +" } while (0)\n" +"\n" +"#define MASKED1(OP, ARG0, ...) \\\n" +" do { \\\n" +" typeof(ARG0) ARG0##__; \\\n" +" OP(ARG0##__, __VA_ARGS__); \\\n" +" select(ARG0, ARG0, ARG0##__, emask); \\\n" +" } while (0)\n" +"\n" +"#define MASKED2(OP, ARG0, ARG1, ...) 
\\\n" +" do { \\\n" +" typeof(ARG0) ARG0##__; \\\n" +" typeof(ARG1) ARG1##__; \\\n" +" OP(ARG0##__, ARG1##__, __VA_ARGS__); \\\n" +" select(ARG0, ARG0, ARG0##__, emask); \\\n" +" select(ARG1, ARG1, ARG1##__, emask); \\\n" +" } while (0)\n" +"\n" +"#define MASKED3(OP, ARG0, ARG1, ARG2, ...) \\\n" +" do { \\\n" +" typeof(ARG0) ARG0##__; \\\n" +" typeof(ARG1) ARG1##__; \\\n" +" typeof(ARG2) ARG2##__; \\\n" +" OP(ARG0##__, ARG1##__, ARG2##__, __VA_ARGS__); \\\n" +" select(ARG0, ARG0, ARG0##__, emask); \\\n" +" select(ARG1, ARG1, ARG1##__, emask); \\\n" +" select(ARG2, ARG2, ARG2##__, emask); \\\n" +" } while (0)\n" +"\n" +"#define MASKED4(OP, ARG0, ARG1, ARG2, ARG3, ...) \\\n" +" do { \\\n" +" typeof(ARG0) ARG0##__; \\\n" +" typeof(ARG1) ARG1##__; \\\n" +" typeof(ARG2) ARG2##__; \\\n" +" typeof(ARG3) ARG3##__; \\\n" +" OP(ARG0##__, ARG1##__, ARG2##__, ARG3##__, __VA_ARGS__); \\\n" +" select(ARG0, ARG0, ARG0##__, emask); \\\n" +" select(ARG1, ARG1, ARG1##__, emask); \\\n" +" select(ARG2, ARG2, ARG2##__, emask); \\\n" +" select(ARG3, ARG3, ARG3##__, emask); \\\n" +" } while (0)\n" +"\n" "#undef INLINE\n" "\n" "#endif /* __GBE_SIM_VECTOR_H__ */\n" diff --git a/backend/src/backend/sim_context.cpp b/backend/src/backend/sim_context.cpp index b01ca7b..04176f5 100644 --- a/backend/src/backend/sim_context.cpp +++ b/backend/src/backend/sim_context.cpp @@ -69,12 +69,17 @@ namespace gbe if (reg == ir::ocl::lid2) lid2 = true; const ir::RegisterData regData = fn.getRegisterData(reg); switch (regData.family) { - case ir::FAMILY_BOOL: case ir::FAMILY_BYTE: case ir::FAMILY_WORD: case ir::FAMILY_QWORD: NOT_IMPLEMENTED; break; + case ir::FAMILY_BOOL: + if (isScalarReg(reg) == true) + o << "scalar_m _" << regID << ";\n"; + else + o << "simd" << simdWidth << "m _" << regID << ";\n"; + break; case ir::FAMILY_DWORD: if (isScalarReg(reg) == true) o << "scalar_dw _" << regID << ";\n"; @@ -90,9 +95,9 @@ namespace gbe if (lid2 == false) o << "scalar_dw _" << uint32_t(ir::ocl::lid2) << ";\n"; } 
-#define LOAD_SPECIAL_REG(CURBE, REG) do { \ - const int32_t offset = kernel->getCurbeOffset(CURBE, 0); \ - if (offset >= 0) \ +#define LOAD_SPECIAL_REG(CURBE, REG) do { \ + const int32_t offset = kernel->getCurbeOffset(CURBE, 0); \ + if (offset >= 0) \ o << "LOAD(_" << uint32_t(REG) << ", curbe + " << offset << ");\n"; \ } while (0) @@ -148,16 +153,23 @@ namespace gbe }; } + void SimContext::emitMaskingCode(void) { + o << "simd" << simdWidth << "m " << "emask;\n" + << "simd" << simdWidth << "dw " << "uip(scalar_dw(0u));\n" + << "alltrueMask(emask);\n" + << "uint32_t movedMask = ~0x0u;\n"; + } + void SimContext::emitInstructionStream(void) { using namespace ir; fn.foreachInstruction([&](const Instruction &insn) { const char *opcodeStr = NULL; const Opcode opcode = insn.getOpcode(); -#define DECL_INSN(OPCODE, FAMILY) \ - case OP_##OPCODE: \ - if (opcode == OP_LOAD) opcodeStr = "GATHER"; \ +#define DECL_INSN(OPCODE, FAMILY) \ + case OP_##OPCODE: \ + if (opcode == OP_LOAD) opcodeStr = "GATHER"; \ else if (opcode == OP_STORE) opcodeStr = "SCATTER"; \ - else opcodeStr = #OPCODE; \ + else opcodeStr = #OPCODE; \ break; switch (opcode) { #include "ir/instruction.hxx" @@ -167,11 +179,40 @@ namespace gbe if (opcode == OP_LABEL) { const LabelInstruction labelInsn = cast(insn); const LabelIndex index = labelInsn.getLabelIndex(); - if (usedLabels.contains(index) == true) - o << "label" << index << ":\n"; + o << "\n"; + if (usedLabels.contains(index) == false) o << "// "; + o << "label" << index << ":\n"; + o << "SIM_JOIN(uip, emask, " << uint32_t(index) << ");\n"; return; } else if (opcode == OP_BRA) { - NOT_IMPLEMENTED; + // Get the label of the block + const BranchInstruction bra = cast(insn); + const BasicBlock *bb = insn.getParent(); + const Instruction *label = bb->getFirstInstruction(); + GBE_ASSERT(label->isMemberOf() == true); + const LabelIndex srcIndex = cast(label)->getLabelIndex(); + const LabelIndex dstIndex = bra.getLabelIndex(); + const bool isPredicated = 
bra.isPredicated(); + + if (uint32_t(dstIndex) > uint32_t(srcIndex)) { // FWD jump here + if (isPredicated) { + const Register pred = bra.getPredicateIndex(); + o << "SIM_FWD_BRA_C(uip, emask, " << "_" << pred + << ", " << uint32_t(dstIndex) << ", " << uint32_t(dstIndex) + << ");\n"; + } else { + o << "SIM_FWD_BRA(uip, emask, " + << uint32_t(dstIndex) << ", " << uint32_t(dstIndex) + << ");\n"; + } + } else { // BWD jump + if (isPredicated) { + const Register pred = bra.getPredicateIndex(); + o << "SIM_BWD_BRA_C(uip, _" << pred + << ", " << uint32_t(dstIndex) << ");\n"; + } else + o << "SIM_BWD_BRA(uip, emask, " << uint32_t(dstIndex) << ");\n"; + } return; } else if (opcode == OP_RET) { o << "return;\n"; @@ -189,7 +230,13 @@ namespace gbe // Regular compute instruction const uint32_t dstNum = insn.getDstNum(); const uint32_t srcNum = insn.getSrcNum(); - o << opcodeStr; + + // These two needs a new instruction. Fortunately, it is just a string + // manipulation. MASKED(OP,... just becomes MASKED_OP(...) 
+ if (opcode == OP_STORE || opcode == OP_LOAD) + o << "MASKED_" << opcodeStr << "("; + else + o << "MASKED" << dstNum << "(" << opcodeStr; // Append type when needed if (insn.isMemberOf() == true) @@ -200,7 +247,8 @@ namespace gbe o << "_" << typeStr(cast(insn).getType()); else if (insn.isMemberOf() == true) o << "_" << typeStr(cast(insn).getType()); - o << "("; + if (opcode != OP_STORE && opcode != OP_LOAD) + o << ", "; // Output both destinations and sources in that order for (uint32_t dstID = 0; dstID < dstNum; ++dstID) { @@ -220,7 +268,7 @@ namespace gbe imm.type == TYPE_FLOAT); o << ", " << imm.data.u32; } else if (opcode == OP_LOAD || opcode == OP_STORE) - o << ", base"; + o << ", base, movedMask"; o << ");\n"; }); } @@ -250,7 +298,9 @@ namespace gbe << "const size_t curbe_sz = sim->get_curbe_size(sim);\n" << "const char *curbe = (const char*) sim->get_curbe_address(sim) + curbe_sz * tid;\n" << "char *base = (char*) sim->get_base_address(sim);\n"; + this->emitRegisters(); + this->emitMaskingCode(); this->emitCurbeLoad(); this->emitInstructionStream(); o << "}\n"; diff --git a/backend/src/backend/sim_context.hpp b/backend/src/backend/sim_context.hpp index 9021adb..a52d439 100644 --- a/backend/src/backend/sim_context.hpp +++ b/backend/src/backend/sim_context.hpp @@ -51,6 +51,8 @@ namespace gbe void emitRegisters(void); /*! Load the curbe data into the registers */ void emitCurbeLoad(void); + /*! Emit the masking code (mask / UIP) */ + void emitMaskingCode(void); /*! Emit the instructions */ void emitInstructionStream(void); /*! 
Implements base class */ diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp index 9cdbbec..0759f46 100644 --- a/backend/src/llvm/llvm_to_gen.cpp +++ b/backend/src/llvm/llvm_to_gen.cpp @@ -36,6 +36,10 @@ #include "sys/cvar.hpp" #include "sys/platform.hpp" +#include +#include +#include + namespace gbe { BVAR(OCL_OUTPUT_LLVM, false); @@ -44,10 +48,11 @@ namespace gbe bool llvmToGen(ir::Unit &unit, const char *fileName) { using namespace llvm; + // Get the global LLVM context llvm::LLVMContext& c = llvm::getGlobalContext(); std::string errInfo; - llvm::raw_fd_ostream o("-", errInfo); + auto *o = new llvm::raw_fd_ostream("-", errInfo); // Get the module from its file SMDiagnostic Err; @@ -60,7 +65,7 @@ namespace gbe // Print the code before further optimizations if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS) - passes.add(createPrintModulePass(&o)); + passes.add(createPrintModulePass(o)); passes.add(createScalarReplAggregatesPass()); // Break up allocas passes.add(createRemoveGEPPass(unit)); passes.add(createConstantPropagationPass()); @@ -72,8 +77,15 @@ namespace gbe // Print the code extra optimization passes if (OCL_OUTPUT_LLVM) - passes.add(createPrintModulePass(&o)); + passes.add(createPrintModulePass(o)); passes.run(mod); + + // raw_fd_ostream closes stdout. 
We must reopen it + delete o; + int fd; + fd = open("/dev/tty", O_WRONLY); + stdout = fdopen(fd, "w"); + return true; } } /* namespace gbe */ diff --git a/backend/src/llvm/stdlib.h b/backend/src/ocl_stdlib.h similarity index 100% rename from backend/src/llvm/stdlib.h rename to backend/src/ocl_stdlib.h diff --git a/backend/src/llvm/stdlib_str.cpp b/backend/src/ocl_stdlib_str.cpp similarity index 99% rename from backend/src/llvm/stdlib_str.cpp rename to backend/src/ocl_stdlib_str.cpp index 41ce7fe..fb262c2 100644 --- a/backend/src/llvm/stdlib_str.cpp +++ b/backend/src/ocl_stdlib_str.cpp @@ -19,7 +19,7 @@ #include "string" namespace gbe { -std::string stdlib_str = +std::string ocl_stdlib_str = "#define DECL_INTERNAL_WORK_ITEM_FN(NAME) \\\n" "__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##0(void); \\\n" "__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##1(void); \\\n" -- 2.7.4