set (TO_STRINGIFY_FILES simulator sim_vector)
stringify ("${GBE_SOURCE_DIR}/src/backend/sim/" "${TO_STRINGIFY_FILES}")
-set (TO_STRINGIFY_FILES stdlib)
-stringify ("${GBE_SOURCE_DIR}/src/llvm/" "${TO_STRINGIFY_FILES}")
+set (TO_STRINGIFY_FILES ocl_stdlib)
+stringify ("${GBE_SOURCE_DIR}/src/" "${TO_STRINGIFY_FILES}")
if (GBE_USE_BLOB)
set (GBE_SRC blob.cpp)
else (GBE_USE_BLOB)
set (GBE_SRC
+ ocl_stdlib.h
+ ocl_stdlib_str.cpp
sys/vector.hpp
sys/hash_map.hpp
sys/map.hpp
ir/function.hpp
ir/value.cpp
ir/value.hpp
- llvm/stdlib_str.cpp
backend/context.cpp
backend/context.hpp
backend/program.cpp
GBE_SAFE_DELETE(program);
}
- extern std::string stdlib_str;
+ extern std::string ocl_stdlib_str;
static gbe_program programNewFromSource(const char *source,
size_t stringSize,
char *err,
// Write the source to the cl file
FILE *clFile = fopen(clName.c_str(), "w");
FATAL_IF(clFile == NULL, "Failed to open temporary file");
- fwrite(stdlib_str.c_str(), strlen(stdlib_str.c_str()), 1, clFile);
+ fwrite(ocl_stdlib_str.c_str(), strlen(ocl_stdlib_str.c_str()), 1, clFile);
fwrite(source, strlen(source), 1, clFile);
fclose(clFile);
}
/*! Base structure for scalar double word */
-union scalar_dw { uint32_t u; int32_t s; float f; };
+union scalar_dw {
+  INLINE scalar_dw(void) {} // leaves the value uninitialized
+  INLINE scalar_dw(uint32_t u) { this->u = u; }
+  INLINE scalar_dw(int32_t s) { this->s = s; }
+  INLINE scalar_dw(float f) { this->f = f; }
+  uint32_t u; int32_t s; float f; // three views of the same 32 bit value
+};
/*! Base structure for scalar mask */
union scalar_m { uint32_t u; int32_t s; float f; };
__m128 m[vectorNum];
};
+/*! Select instruction on vectors: lane-wise dst = mask ? src1 : src0
+    (_mm_blendv_ps takes src1 where the mask lane sign bit is set) */
+template <uint32_t vectorNum>
+INLINE void select(simd_dw<vectorNum> &dst,
+                   const simd_dw<vectorNum> &src0,
+                   const simd_dw<vectorNum> &src1,
+                   const simd_m<vectorNum> &mask)
+{
+  for (uint32_t i = 0; i < vectorNum; ++i)
+    dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]);
+}
+/*! Same lane-wise select for mask vectors */
+template <uint32_t vectorNum>
+INLINE void select(simd_m<vectorNum> &dst,
+                   const simd_m<vectorNum> &src0,
+                   const simd_m<vectorNum> &src1,
+                   const simd_m<vectorNum> &mask)
+{
+  for (uint32_t i = 0; i < vectorNum; ++i)
+    dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]);
+}
+
/*! To cast through memory */
union cast_dw {
INLINE cast_dw(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {
};
static const cast_dw alltrue(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
+/*! Make a mask true */
+template <uint32_t vectorNum>
+INLINE void alltrueMask(simd_m<vectorNum> &x) {
+ for (uint32_t i = 0; i < vectorNum; ++i) x.m[i] = alltrue.v;
+}
+
/* Some convenient typedefs */
typedef scalar_dw simd1dw;
typedef simd_dw<1> simd4dw;
return m;
}
+/* MOV instruction (bitwise copy, so shared by all 32 bit types) */
+template <uint32_t vectorNum>
+INLINE void MOV_S32(simd_dw<vectorNum> &dst, const simd_dw<vectorNum> &v) {
+  for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v.m[i];
+}
+template <uint32_t vectorNum>
+INLINE void MOV_S32(simd_dw<vectorNum> &dst, const scalar_dw &x) {
+  // Broadcast through the float view of the union: the raw 32 bits are
+  // splat to all lanes, whatever type they actually hold
+  const __m128 v = _mm_load1_ps(&x.f);
+  for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v;
+}
+
/* Vector instructions that use sse* */
#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\
template <uint32_t vectorNum>\
dst.m[i] = _mm_load1_ps(&cast.f);
}
-#include <cstdio>
/* Scatter */
template <uint32_t vectorNum>
INLINE void SCATTER(const simd_dw<vectorNum> &offset,
char *base_address) {
SCATTER(simd_dw<vectorNum>(offset), value, base_address);
}
-#include <cstdio>
+
+/* Masked scatter only stores the active lanes (mask bit set) */
+template <uint32_t vectorNum>
+INLINE void MASKED_SCATTER(const simd_dw<vectorNum> &offset,
+                           const simd_dw<vectorNum> &value,
+                           char *base_address,
+                           uint32_t mask)
+{
+  for (uint32_t i = 0; i < vectorNum; ++i) {
+    // Pull out the four lanes of the value and of the byte offset
+    const int v0 = _mm_extract_epi32(PS2SI(value.m[i]), 0);
+    const int v1 = _mm_extract_epi32(PS2SI(value.m[i]), 1);
+    const int v2 = _mm_extract_epi32(PS2SI(value.m[i]), 2);
+    const int v3 = _mm_extract_epi32(PS2SI(value.m[i]), 3);
+    const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]), 0);
+    const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1);
+    const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2);
+    const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3);
+    // One scalar store per active lane
+    if (mask & 1) *(int*)(base_address + o0) = v0;
+    if (mask & 2) *(int*)(base_address + o1) = v1;
+    if (mask & 4) *(int*)(base_address + o2) = v2;
+    if (mask & 8) *(int*)(base_address + o3) = v3;
+    mask = mask >> 4; // next SSE register handles the next four lanes
+  }
+}
+/* Splat the scalar value before scattering it */
+template <uint32_t vectorNum>
+INLINE void MASKED_SCATTER(const simd_dw<vectorNum> &offset,
+                           const scalar_dw &value,
+                           char *base_address,
+                           uint32_t mask)
+{
+  MASKED_SCATTER(offset, simd_dw<vectorNum>(value), base_address, mask);
+}
+/* Splat the scalar offset before scattering to it */
+template <uint32_t vectorNum>
+INLINE void MASKED_SCATTER(const scalar_dw &offset,
+                           const simd_dw<vectorNum> &value,
+                           char *base_address,
+                           uint32_t mask)
+{
+  MASKED_SCATTER(simd_dw<vectorNum>(offset), value, base_address, mask);
+}
+
/* Gather */
template <uint32_t vectorNum>
INLINE void GATHER(simd_dw<vectorNum> &dst,
GATHER(dst, simd_dw<vectorNum>(offset), base_address);
}
+/* Masked gather only loads the active lanes (mask bit set) */
+template <uint32_t vectorNum>
+INLINE void MASKED_GATHER(simd_dw<vectorNum> &dst,
+                          const simd_dw<vectorNum> &offset,
+                          const char *base_address,
+                          uint32_t mask)
+{
+  for (uint32_t i = 0; i < vectorNum; ++i) {
+    // Pull out the four byte offsets of this SSE register
+    const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]), 0);
+    const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1);
+    const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2);
+    const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3);
+    /* Only dereference the addresses of active lanes: a masked-off lane may
+       hold a garbage offset that is not safe to read (the scatter side is
+       already guarded the same way) */
+    if (mask & 1) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o0), 0));
+    if (mask & 2) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o1), 1));
+    if (mask & 4) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o2), 2));
+    if (mask & 8) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o3), 3));
+    mask = mask >> 4; // next SSE register handles the next four lanes
+  }
+}
+/* Splat the scalar offset before gathering from it */
+template <uint32_t vectorNum>
+INLINE void MASKED_GATHER(simd_dw<vectorNum> &dst,
+                          const scalar_dw &offset,
+                          const char *base_address,
+                          uint32_t mask)
+{
+  MASKED_GATHER(dst, simd_dw<vectorNum>(offset), base_address, mask);
+}
+
//////////////////////////////////////////////////////////////////////////////
// Scalar instructions
//////////////////////////////////////////////////////////////////////////////
// Identical instructions are forwarded
//////////////////////////////////////////////////////////////////////////////
+/* MOV is bitwise, so the unsigned and float moves forward to the s32 one
+   (fix: these were misspelled NOV_U32 / NOV_F, leaving the MOV_U32 / MOV_F
+   names emitted by the code generator undefined) */
+#define MOV_U32 MOV_S32
+#define MOV_F MOV_S32
#define ADD_U32 ADD_S32
#define SUB_U32 SUB_S32
#define XOR_U32 XOR_S32
#undef PS2SI
#undef SI2PS
#undef ID
+
+//////////////////////////////////////////////////////////////////////////////
+// Goto implementation which is directly inspired by BDW goto and by this
+// article "Whole function vectorization" (CGO 2011)
+//////////////////////////////////////////////////////////////////////////////
+
+/*! Update the UIP vector for the lanes alive in mask */
+template <uint32_t vectorNum>
+INLINE void updateUIP(simd_dw<vectorNum> &uipVec, const simd_m<vectorNum> &mask, uint32_t uip) {
+  // Splat the raw uip bits through the float view of a small union
+  union { float f; uint32_t u; } x;
+  x.u = uip;
+  const __m128 v = _mm_load1_ps(&x.f);
+  // mask is now taken by const reference: passing it by value copied
+  // vectorNum SSE registers on every branch macro expansion
+  for (uint32_t i = 0; i < vectorNum; ++i)
+    uipVec.m[i] = _mm_blendv_ps(uipVec.m[i], v, mask.m[i]);
+}
+
+/*! Update the execution mask based on block IP and UIP values */
+template <uint32_t vectorNum>
+INLINE void updateMask(simd_m<vectorNum> &mask, const simd_dw<vectorNum> &uipVec, uint32_t ip) {
+ const simd_dw<vectorNum> ipv(ip);
+ LE_U32(mask, uipVec, ipv);
+}
+
+/*! Jump to the block JIP */
+#define SIM_FWD_BRA(UIPVEC, EMASK, JIP, UIP) \
+ do { \
+ updateUIP(UIPVEC, EMASK, UIP); \
+ goto label##JIP; \
+ } while (0)
+
+/*! Based on the condition jump to block JIP */
+#define SIM_FWD_BRA_C(UIPVEC, EMASK, COND, JIP, UIP) \
+  do { \
+    updateUIP(UIPVEC, COND, UIP); \
+    typeof(COND) jumpCond; \
+    scalar_dw jipScalar(uint32_t(JIP)); \
+    /* Fix: compare against the splatted scalar; jipScalar was built but \
+       the raw JIP token was passed instead and the scalar went unused */ \
+    LT_U32(jumpCond, UIPVEC, jipScalar); \
+    uint32_t jumpMask = mask(jumpCond); \
+    if (!jumpMask) goto label##JIP; \
+  } while (0)
+
+/*! Backward jump is always taken */
+#define SIM_BWD_BRA(UIPVEC, EMASK, JIP) \
+ do { \
+ updateUIP(UIPVEC, EMASK, JIP); \
+ goto label##JIP; \
+ } while (0)
+
+/*! Conditional backward jump is taken if the condition is non-null */
+#define SIM_BWD_BRA_C(UIPVEC, COND, JIP) \
+ do { \
+ updateUIP(UIPVEC, COND, JIP); \
+ if (mask(COND) != 0) goto label##JIP; \
+ } while (0)
+
+/*! JOIN: reactivates lanes */
+#define SIM_JOIN(UIPVEC, MASK, IP) \
+ do { \
+ updateMask(MASK, UIPVEC, IP); \
+ movedMask = mask(MASK); \
+ } while (0)
+
+/*! JOIN_JUMP: reactivate lanes and jump to JIP if none is activated */
+#define SIM_JOIN_JUMP(UIPVEC, EMASK, IP, JIP) \
+ do { \
+ SIM_JOIN(UIPVEC, EMASK, IP); \
+ const uint32_t execMask = mask(EMASK); \
+ if (execMask == 0) goto label##JIP; \
+ } while (0)
+
+/* Macros to apply masking on destinations (from zero to four destinations).
+   They rely on the GNU typeof extension and expect the SIMD mask `emask`
+   and the select() helper to be visible at the expansion site: the operation
+   writes fresh temporaries, then select() merges them into the real
+   destinations for the active lanes only */
+#define MASKED0(OP, ...) \
+ do { \
+ OP(__VA_ARGS__); \
+ } while (0)
+
+#define MASKED1(OP, ARG0, ...) \
+ do { \
+ typeof(ARG0) ARG0##__; \
+ OP(ARG0##__, __VA_ARGS__); \
+ select(ARG0, ARG0, ARG0##__, emask); \
+ } while (0)
+
+#define MASKED2(OP, ARG0, ARG1, ...) \
+ do { \
+ typeof(ARG0) ARG0##__; \
+ typeof(ARG1) ARG1##__; \
+ OP(ARG0##__, ARG1##__, __VA_ARGS__); \
+ select(ARG0, ARG0, ARG0##__, emask); \
+ select(ARG1, ARG1, ARG1##__, emask); \
+ } while (0)
+
+#define MASKED3(OP, ARG0, ARG1, ARG2, ...) \
+ do { \
+ typeof(ARG0) ARG0##__; \
+ typeof(ARG1) ARG1##__; \
+ typeof(ARG2) ARG2##__; \
+ OP(ARG0##__, ARG1##__, ARG2##__, __VA_ARGS__); \
+ select(ARG0, ARG0, ARG0##__, emask); \
+ select(ARG1, ARG1, ARG1##__, emask); \
+ select(ARG2, ARG2, ARG2##__, emask); \
+ } while (0)
+
+#define MASKED4(OP, ARG0, ARG1, ARG2, ARG3, ...) \
+ do { \
+ typeof(ARG0) ARG0##__; \
+ typeof(ARG1) ARG1##__; \
+ typeof(ARG2) ARG2##__; \
+ typeof(ARG3) ARG3##__; \
+ OP(ARG0##__, ARG1##__, ARG2##__, ARG3##__, __VA_ARGS__); \
+ select(ARG0, ARG0, ARG0##__, emask); \
+ select(ARG1, ARG1, ARG1##__, emask); \
+ select(ARG2, ARG2, ARG2##__, emask); \
+ select(ARG3, ARG3, ARG3##__, emask); \
+ } while (0)
+
#undef INLINE
#endif /* __GBE_SIM_VECTOR_H__ */
"}\n"
"\n"
"/*! Base structure for scalar double word */\n"
-"union scalar_dw { uint32_t u; int32_t s; float f; };\n"
+"union scalar_dw {\n"
+" INLINE scalar_dw(void) {}\n"
+" INLINE scalar_dw(uint32_t u) { this->u = u; }\n"
+" INLINE scalar_dw(int32_t s) { this->s = s; }\n"
+" INLINE scalar_dw(float f) { this->f = f; }\n"
+" uint32_t u; int32_t s; float f;\n"
+"};\n"
"\n"
"/*! Base structure for scalar mask */\n"
"union scalar_m { uint32_t u; int32_t s; float f; };\n"
" __m128 m[vectorNum];\n"
"};\n"
"\n"
+"/*! Select instruction on vectors */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void select(simd_dw<vectorNum> &dst,\n"
+" const simd_dw<vectorNum> &src0,\n"
+" const simd_dw<vectorNum> &src1,\n"
+" const simd_m<vectorNum> &mask)\n"
+"{\n"
+" for (uint32_t i = 0; i < vectorNum; ++i)\n"
+" dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]);\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void select(simd_m<vectorNum> &dst,\n"
+" const simd_m<vectorNum> &src0,\n"
+" const simd_m<vectorNum> &src1,\n"
+" const simd_m<vectorNum> &mask)\n"
+"{\n"
+" for (uint32_t i = 0; i < vectorNum; ++i)\n"
+" dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]);\n"
+"}\n"
+"\n"
"/*! To cast through memory */\n"
"union cast_dw {\n"
" INLINE cast_dw(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {\n"
"};\n"
"static const cast_dw alltrue(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\n"
"\n"
+"/*! Make a mask true */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void alltrueMask(simd_m<vectorNum> &x) {\n"
+" for (uint32_t i = 0; i < vectorNum; ++i) x.m[i] = alltrue.v;\n"
+"}\n"
+"\n"
"/* Some convenient typedefs */\n"
"typedef scalar_dw simd1dw;\n"
"typedef simd_dw<1> simd4dw;\n"
" return m;\n"
"}\n"
"\n"
+"/* MOV instruction */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MOV_S32(simd_dw<vectorNum> &dst, const simd_dw<vectorNum> &v) {\n"
+" for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v.m[i];\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MOV_S32(simd_dw<vectorNum> &dst, const scalar_dw &x) {\n"
+" const __m128 v = _mm_load1_ps(&x.f);\n"
+" for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v;\n"
+"}\n"
+"\n"
"/* Vector instructions that use sse* */\n"
"#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\\\n"
"template <uint32_t vectorNum>\\\n"
" dst.m[i] = _mm_load1_ps(&cast.f);\n"
"}\n"
"\n"
-"#include <cstdio>\n"
"/* Scatter */\n"
"template <uint32_t vectorNum>\n"
"INLINE void SCATTER(const simd_dw<vectorNum> &offset,\n"
" char *base_address) {\n"
" SCATTER(simd_dw<vectorNum>(offset), value, base_address);\n"
"}\n"
-"#include <cstdio>\n"
+"\n"
+"/* Masked scatter will only store unmasked lanes */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MASKED_SCATTER(const simd_dw<vectorNum> &offset,\n"
+" const simd_dw<vectorNum> &value,\n"
+" char *base_address,\n"
+" uint32_t mask)\n"
+"{\n"
+" for (uint32_t i = 0; i < vectorNum; ++i) {\n"
+" const int v0 = _mm_extract_epi32(PS2SI(value.m[i]), 0);\n"
+" const int v1 = _mm_extract_epi32(PS2SI(value.m[i]), 1);\n"
+" const int v2 = _mm_extract_epi32(PS2SI(value.m[i]), 2);\n"
+" const int v3 = _mm_extract_epi32(PS2SI(value.m[i]), 3);\n"
+" const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]), 0);\n"
+" const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1);\n"
+" const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2);\n"
+" const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3);\n"
+" if (mask & 1) *(int*)(base_address + o0) = v0;\n"
+" if (mask & 2) *(int*)(base_address + o1) = v1;\n"
+" if (mask & 4) *(int*)(base_address + o2) = v2;\n"
+" if (mask & 8) *(int*)(base_address + o3) = v3;\n"
+" mask = mask >> 4;\n"
+" }\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MASKED_SCATTER(const simd_dw<vectorNum> &offset,\n"
+" const scalar_dw &value,\n"
+" char *base_address,\n"
+" uint32_t mask)\n"
+"{\n"
+" MASKED_SCATTER(offset, simd_dw<vectorNum>(value), base_address, mask);\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MASKED_SCATTER(const scalar_dw &offset,\n"
+" const simd_dw<vectorNum> &value,\n"
+" char *base_address,\n"
+" uint32_t mask)\n"
+"{\n"
+" MASKED_SCATTER(simd_dw<vectorNum>(offset), value, base_address, mask);\n"
+"}\n"
+"\n"
"/* Gather */\n"
"template <uint32_t vectorNum>\n"
"INLINE void GATHER(simd_dw<vectorNum> &dst,\n"
" GATHER(dst, simd_dw<vectorNum>(offset), base_address);\n"
"}\n"
"\n"
+"/* Masked gather only loads the active lanes (mask bit set) */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MASKED_GATHER(simd_dw<vectorNum> &dst,\n"
+"                          const simd_dw<vectorNum> &offset,\n"
+"                          const char *base_address,\n"
+"                          uint32_t mask)\n"
+"{\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i) {\n"
+"    const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]), 0);\n"
+"    const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1);\n"
+"    const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2);\n"
+"    const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3);\n"
+"    /* Only dereference the addresses of active lanes */\n"
+"    if (mask & 1) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o0), 0));\n"
+"    if (mask & 2) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o1), 1));\n"
+"    if (mask & 4) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o2), 2));\n"
+"    if (mask & 8) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o3), 3));\n"
+"    mask = mask >> 4;\n"
+"  }\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MASKED_GATHER(simd_dw<vectorNum> &dst,\n"
+"                          const scalar_dw &offset,\n"
+"                          const char *base_address,\n"
+"                          uint32_t mask)\n"
+"{\n"
+"  MASKED_GATHER(dst, simd_dw<vectorNum>(offset), base_address, mask);\n"
+"}\n"
+"\n"
"//////////////////////////////////////////////////////////////////////////////\n"
"// Scalar instructions\n"
"//////////////////////////////////////////////////////////////////////////////\n"
"// Identical instructions are forwarded\n"
"//////////////////////////////////////////////////////////////////////////////\n"
"\n"
+"/* MOV is bitwise: the unsigned and float moves forward to the s32 one */\n"
+"#define MOV_U32 MOV_S32\n"
+"#define MOV_F MOV_S32\n"
"#define ADD_U32 ADD_S32\n"
"#define SUB_U32 SUB_S32\n"
"#define XOR_U32 XOR_S32\n"
"#undef PS2SI\n"
"#undef SI2PS\n"
"#undef ID\n"
+"\n"
+"//////////////////////////////////////////////////////////////////////////////\n"
+"// Goto implementation which is directly inspired by BDW goto and by this\n"
+"// article \"Whole function vectorization\" (CGO 2011)\n"
+"//////////////////////////////////////////////////////////////////////////////\n"
+"\n"
+"/*! Update the UIP vector for the lanes alive in mask */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void updateUIP(simd_dw<vectorNum> &uipVec, const simd_m<vectorNum> &mask, uint32_t uip) {\n"
+"  union { float f; uint32_t u; } x;\n"
+"  x.u = uip;\n"
+"  const __m128 v = _mm_load1_ps(&x.f);\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i)\n"
+"    uipVec.m[i] = _mm_blendv_ps(uipVec.m[i], v, mask.m[i]);\n"
+"}\n"
+"\n"
+"/*! Update the execution mask based on block IP and UIP values */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void updateMask(simd_m<vectorNum> &mask, const simd_dw<vectorNum> &uipVec, uint32_t ip) {\n"
+" const simd_dw<vectorNum> ipv(ip);\n"
+" LE_U32(mask, uipVec, ipv);\n"
+"}\n"
+"\n"
+"/*! Jump to the block JIP */\n"
+"#define SIM_FWD_BRA(UIPVEC, EMASK, JIP, UIP) \\\n"
+" do { \\\n"
+" updateUIP(UIPVEC, EMASK, UIP); \\\n"
+" goto label##JIP; \\\n"
+" } while (0)\n"
+"\n"
+"/*! Based on the condition jump to block JIP */\n"
+"#define SIM_FWD_BRA_C(UIPVEC, EMASK, COND, JIP, UIP) \\\n"
+"  do { \\\n"
+"    updateUIP(UIPVEC, COND, UIP); \\\n"
+"    typeof(COND) jumpCond; \\\n"
+"    scalar_dw jipScalar(uint32_t(JIP)); \\\n"
+"    LT_U32(jumpCond, UIPVEC, jipScalar); \\\n"
+"    uint32_t jumpMask = mask(jumpCond); \\\n"
+"    if (!jumpMask) goto label##JIP; \\\n"
+"  } while (0)\n"
+"\n"
+"/*! Backward jump is always taken */\n"
+"#define SIM_BWD_BRA(UIPVEC, EMASK, JIP) \\\n"
+" do { \\\n"
+" updateUIP(UIPVEC, EMASK, JIP); \\\n"
+" goto label##JIP; \\\n"
+" } while (0)\n"
+"\n"
+"/*! Conditional backward jump is taken if the condition is non-null */\n"
+"#define SIM_BWD_BRA_C(UIPVEC, COND, JIP) \\\n"
+" do { \\\n"
+" updateUIP(UIPVEC, COND, JIP); \\\n"
+" if (mask(COND) != 0) goto label##JIP; \\\n"
+" } while (0)\n"
+"\n"
+"/*! JOIN: reactivates lanes */\n"
+"#define SIM_JOIN(UIPVEC, MASK, IP) \\\n"
+" do { \\\n"
+" updateMask(MASK, UIPVEC, IP); \\\n"
+" movedMask = mask(MASK); \\\n"
+" } while (0)\n"
+"\n"
+"/*! JOIN_JUMP: reactivate lanes and jump to JIP if none is activated */\n"
+"#define SIM_JOIN_JUMP(UIPVEC, EMASK, IP, JIP) \\\n"
+" do { \\\n"
+" SIM_JOIN(UIPVEC, EMASK, IP); \\\n"
+" const uint32_t execMask = mask(EMASK); \\\n"
+" if (execMask == 0) goto label##JIP; \\\n"
+" } while (0)\n"
+"\n"
+"/* Macro to apply masking on destinations (from zero to four destinations) */\n"
+"#define MASKED0(OP, ...) \\\n"
+" do { \\\n"
+" OP(__VA_ARGS__); \\\n"
+" } while (0)\n"
+"\n"
+"#define MASKED1(OP, ARG0, ...) \\\n"
+" do { \\\n"
+" typeof(ARG0) ARG0##__; \\\n"
+" OP(ARG0##__, __VA_ARGS__); \\\n"
+" select(ARG0, ARG0, ARG0##__, emask); \\\n"
+" } while (0)\n"
+"\n"
+"#define MASKED2(OP, ARG0, ARG1, ...) \\\n"
+" do { \\\n"
+" typeof(ARG0) ARG0##__; \\\n"
+" typeof(ARG1) ARG1##__; \\\n"
+" OP(ARG0##__, ARG1##__, __VA_ARGS__); \\\n"
+" select(ARG0, ARG0, ARG0##__, emask); \\\n"
+" select(ARG1, ARG1, ARG1##__, emask); \\\n"
+" } while (0)\n"
+"\n"
+"#define MASKED3(OP, ARG0, ARG1, ARG2, ...) \\\n"
+" do { \\\n"
+" typeof(ARG0) ARG0##__; \\\n"
+" typeof(ARG1) ARG1##__; \\\n"
+" typeof(ARG2) ARG2##__; \\\n"
+" OP(ARG0##__, ARG1##__, ARG2##__, __VA_ARGS__); \\\n"
+" select(ARG0, ARG0, ARG0##__, emask); \\\n"
+" select(ARG1, ARG1, ARG1##__, emask); \\\n"
+" select(ARG2, ARG2, ARG2##__, emask); \\\n"
+" } while (0)\n"
+"\n"
+"#define MASKED4(OP, ARG0, ARG1, ARG2, ARG3, ...) \\\n"
+" do { \\\n"
+" typeof(ARG0) ARG0##__; \\\n"
+" typeof(ARG1) ARG1##__; \\\n"
+" typeof(ARG2) ARG2##__; \\\n"
+" typeof(ARG3) ARG3##__; \\\n"
+" OP(ARG0##__, ARG1##__, ARG2##__, ARG3##__, __VA_ARGS__); \\\n"
+" select(ARG0, ARG0, ARG0##__, emask); \\\n"
+" select(ARG1, ARG1, ARG1##__, emask); \\\n"
+" select(ARG2, ARG2, ARG2##__, emask); \\\n"
+" select(ARG3, ARG3, ARG3##__, emask); \\\n"
+" } while (0)\n"
+"\n"
"#undef INLINE\n"
"\n"
"#endif /* __GBE_SIM_VECTOR_H__ */\n"
if (reg == ir::ocl::lid2) lid2 = true;
const ir::RegisterData regData = fn.getRegisterData(reg);
switch (regData.family) {
- case ir::FAMILY_BOOL:
case ir::FAMILY_BYTE:
case ir::FAMILY_WORD:
case ir::FAMILY_QWORD:
NOT_IMPLEMENTED;
break;
+ case ir::FAMILY_BOOL:
+ if (isScalarReg(reg) == true)
+ o << "scalar_m _" << regID << ";\n";
+ else
+ o << "simd" << simdWidth << "m _" << regID << ";\n";
+ break;
case ir::FAMILY_DWORD:
if (isScalarReg(reg) == true)
o << "scalar_dw _" << regID << ";\n";
if (lid2 == false) o << "scalar_dw _" << uint32_t(ir::ocl::lid2) << ";\n";
}
-#define LOAD_SPECIAL_REG(CURBE, REG) do { \
- const int32_t offset = kernel->getCurbeOffset(CURBE, 0); \
- if (offset >= 0) \
+#define LOAD_SPECIAL_REG(CURBE, REG) do { \
+ const int32_t offset = kernel->getCurbeOffset(CURBE, 0); \
+ if (offset >= 0) \
o << "LOAD(_" << uint32_t(REG) << ", curbe + " << offset << ");\n"; \
} while (0)
};
}
+  void SimContext::emitMaskingCode(void) {
+    // Declare the per-work-item SIMD control-flow state: the execution mask
+    // (made all-true on entry via alltrueMask), the UIP vector the branch
+    // macros update, and movedMask, the scalar mask refreshed by SIM_JOIN
+    // and consumed by the masked load/store instructions
+    o << "simd" << simdWidth << "m " << "emask;\n"
+      << "simd" << simdWidth << "dw " << "uip(scalar_dw(0u));\n"
+      << "alltrueMask(emask);\n"
+      << "uint32_t movedMask = ~0x0u;\n";
+  }
+
void SimContext::emitInstructionStream(void) {
using namespace ir;
fn.foreachInstruction([&](const Instruction &insn) {
const char *opcodeStr = NULL;
const Opcode opcode = insn.getOpcode();
-#define DECL_INSN(OPCODE, FAMILY) \
- case OP_##OPCODE: \
- if (opcode == OP_LOAD) opcodeStr = "GATHER"; \
+#define DECL_INSN(OPCODE, FAMILY) \
+ case OP_##OPCODE: \
+ if (opcode == OP_LOAD) opcodeStr = "GATHER"; \
else if (opcode == OP_STORE) opcodeStr = "SCATTER"; \
- else opcodeStr = #OPCODE; \
+ else opcodeStr = #OPCODE; \
break;
switch (opcode) {
#include "ir/instruction.hxx"
if (opcode == OP_LABEL) {
const LabelInstruction labelInsn = cast<LabelInstruction>(insn);
const LabelIndex index = labelInsn.getLabelIndex();
- if (usedLabels.contains(index) == true)
- o << "label" << index << ":\n";
+ o << "\n";
+ if (usedLabels.contains(index) == false) o << "// ";
+ o << "label" << index << ":\n";
+ o << "SIM_JOIN(uip, emask, " << uint32_t(index) << ");\n";
return;
} else if (opcode == OP_BRA) {
- NOT_IMPLEMENTED;
+ // Get the label of the block
+ const BranchInstruction bra = cast<BranchInstruction>(insn);
+ const BasicBlock *bb = insn.getParent();
+ const Instruction *label = bb->getFirstInstruction();
+ GBE_ASSERT(label->isMemberOf<LabelInstruction>() == true);
+ const LabelIndex srcIndex = cast<LabelInstruction>(label)->getLabelIndex();
+ const LabelIndex dstIndex = bra.getLabelIndex();
+ const bool isPredicated = bra.isPredicated();
+
+ if (uint32_t(dstIndex) > uint32_t(srcIndex)) { // FWD jump here
+ if (isPredicated) {
+ const Register pred = bra.getPredicateIndex();
+ o << "SIM_FWD_BRA_C(uip, emask, " << "_" << pred
+ << ", " << uint32_t(dstIndex) << ", " << uint32_t(dstIndex)
+ << ");\n";
+ } else {
+ o << "SIM_FWD_BRA(uip, emask, "
+ << uint32_t(dstIndex) << ", " << uint32_t(dstIndex)
+ << ");\n";
+ }
+ } else { // BWD jump
+ if (isPredicated) {
+ const Register pred = bra.getPredicateIndex();
+ o << "SIM_BWD_BRA_C(uip, _" << pred
+ << ", " << uint32_t(dstIndex) << ");\n";
+ } else
+ o << "SIM_BWD_BRA(uip, emask, " << uint32_t(dstIndex) << ");\n";
+ }
return;
} else if (opcode == OP_RET) {
o << "return;\n";
// Regular compute instruction
const uint32_t dstNum = insn.getDstNum();
const uint32_t srcNum = insn.getSrcNum();
- o << opcodeStr;
+
+ // These two needs a new instruction. Fortunately, it is just a string
+ // manipulation. MASKED(OP,... just becomes MASKED_OP(...)
+ if (opcode == OP_STORE || opcode == OP_LOAD)
+ o << "MASKED_" << opcodeStr << "(";
+ else
+ o << "MASKED" << dstNum << "(" << opcodeStr;
// Append type when needed
if (insn.isMemberOf<UnaryInstruction>() == true)
o << "_" << typeStr(cast<BinaryInstruction>(insn).getType());
else if (insn.isMemberOf<CompareInstruction>() == true)
o << "_" << typeStr(cast<CompareInstruction>(insn).getType());
- o << "(";
+ if (opcode != OP_STORE && opcode != OP_LOAD)
+ o << ", ";
// Output both destinations and sources in that order
for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
imm.type == TYPE_FLOAT);
o << ", " << imm.data.u32;
} else if (opcode == OP_LOAD || opcode == OP_STORE)
- o << ", base";
+ o << ", base, movedMask";
o << ");\n";
});
}
<< "const size_t curbe_sz = sim->get_curbe_size(sim);\n"
<< "const char *curbe = (const char*) sim->get_curbe_address(sim) + curbe_sz * tid;\n"
<< "char *base = (char*) sim->get_base_address(sim);\n";
+
this->emitRegisters();
+ this->emitMaskingCode();
this->emitCurbeLoad();
this->emitInstructionStream();
o << "}\n";
void emitRegisters(void);
/*! Load the curbe data into the registers */
void emitCurbeLoad(void);
+ /*! Emit the masking code (mask / UIP) */
+ void emitMaskingCode(void);
/*! Emit the instructions */
void emitInstructionStream(void);
/*! Implements base class */
#include "sys/cvar.hpp"
#include "sys/platform.hpp"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
namespace gbe
{
BVAR(OCL_OUTPUT_LLVM, false);
bool llvmToGen(ir::Unit &unit, const char *fileName)
{
using namespace llvm;
+
// Get the global LLVM context
llvm::LLVMContext& c = llvm::getGlobalContext();
std::string errInfo;
- llvm::raw_fd_ostream o("-", errInfo);
+ auto *o = new llvm::raw_fd_ostream("-", errInfo);
// Get the module from its file
SMDiagnostic Err;
// Print the code before further optimizations
if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
- passes.add(createPrintModulePass(&o));
+ passes.add(createPrintModulePass(o));
passes.add(createScalarReplAggregatesPass()); // Break up allocas
passes.add(createRemoveGEPPass(unit));
passes.add(createConstantPropagationPass());
// Print the code extra optimization passes
if (OCL_OUTPUT_LLVM)
- passes.add(createPrintModulePass(&o));
+ passes.add(createPrintModulePass(o));
passes.run(mod);
+
+ // raw_fd_ostream closes stdout. We must reopen it
+ delete o;
+ int fd;
+ fd = open("/dev/tty", O_WRONLY);
+ stdout = fdopen(fd, "w");
+
return true;
}
} /* namespace gbe */
#include "string"
namespace gbe {
-std::string stdlib_str =
+std::string ocl_stdlib_str =
"#define DECL_INTERNAL_WORK_ITEM_FN(NAME) \\\n"
"__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##0(void); \\\n"
"__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##1(void); \\\n"