From 4f528e65b2360318a3c7c72d6219c8edd055d467 Mon Sep 17 00:00:00 2001
From: Benjamin Segovia <segovia.benjamin@gmail.com>
Date: Mon, 16 Apr 2012 18:44:44 +0000
Subject: [PATCH] First implementation of branches in the simulator

---
 backend/src/CMakeLists.txt                         |   7 +-
 backend/src/backend/program.cpp                    |   4 +-
 backend/src/backend/sim/sim_vector.h               | 238 ++++++++++++++++++++-
 backend/src/backend/sim/sim_vector_str.cpp         | 238 ++++++++++++++++++++-
 backend/src/backend/sim_context.cpp                |  78 +++++--
 backend/src/backend/sim_context.hpp                |   2 +
 backend/src/llvm/llvm_to_gen.cpp                   |  18 +-
 backend/src/{llvm/stdlib.h => ocl_stdlib.h}        |   0
 .../{llvm/stdlib_str.cpp => ocl_stdlib_str.cpp}    |   2 +-
 9 files changed, 558 insertions(+), 29 deletions(-)
 rename backend/src/{llvm/stdlib.h => ocl_stdlib.h} (100%)
 rename backend/src/{llvm/stdlib_str.cpp => ocl_stdlib_str.cpp} (99%)
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index 26fd6cd..f66235f 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -22,13 +22,15 @@ endmacro (stringify)
 
 set (TO_STRINGIFY_FILES simulator sim_vector)
 stringify ("${GBE_SOURCE_DIR}/src/backend/sim/" "${TO_STRINGIFY_FILES}")
-set (TO_STRINGIFY_FILES stdlib)
-stringify ("${GBE_SOURCE_DIR}/src/llvm/" "${TO_STRINGIFY_FILES}")
+set (TO_STRINGIFY_FILES ocl_stdlib)
+stringify ("${GBE_SOURCE_DIR}/src/" "${TO_STRINGIFY_FILES}")
 
 if (GBE_USE_BLOB)
   set (GBE_SRC blob.cpp)
 else (GBE_USE_BLOB)
   set (GBE_SRC
+    ocl_stdlib.h
+    ocl_stdlib_str.cpp
     sys/vector.hpp
     sys/hash_map.hpp
     sys/map.hpp
@@ -69,7 +71,6 @@ else (GBE_USE_BLOB)
     ir/function.hpp
     ir/value.cpp
     ir/value.hpp
-    llvm/stdlib_str.cpp
     backend/context.cpp
     backend/context.hpp
     backend/program.cpp
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 08dac6e..9ec86f3 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -88,7 +88,7 @@ namespace gbe {
     GBE_SAFE_DELETE(program);
   }
 
-  extern std::string stdlib_str;
+  extern std::string ocl_stdlib_str;
   static gbe_program programNewFromSource(const char *source,
                                           size_t stringSize,
                                           char *err,
@@ -101,7 +101,7 @@ namespace gbe {
     // Write the source to the cl file
     FILE *clFile = fopen(clName.c_str(), "w");
     FATAL_IF(clFile == NULL, "Failed to open temporary file");
-    fwrite(stdlib_str.c_str(), strlen(stdlib_str.c_str()), 1, clFile);
+    fwrite(ocl_stdlib_str.c_str(), strlen(ocl_stdlib_str.c_str()), 1, clFile);
     fwrite(source, strlen(source), 1, clFile);
     fclose(clFile);
 
diff --git a/backend/src/backend/sim/sim_vector.h b/backend/src/backend/sim/sim_vector.h
index 34cec34..2baffcc 100644
--- a/backend/src/backend/sim/sim_vector.h
+++ b/backend/src/backend/sim/sim_vector.h
@@ -58,7 +58,13 @@ INLINE const __m128i expand(const __m128i& b) {
 }
 
 /*! Base structure for scalar double word */
-union scalar_dw { uint32_t u; int32_t s; float f; };
+union scalar_dw {
+  INLINE scalar_dw(void) {}
+  INLINE scalar_dw(uint32_t u) { this->u = u; }
+  INLINE scalar_dw(int32_t s) { this->s = s; }
+  INLINE scalar_dw(float f) { this->f = f; }
+  uint32_t u; int32_t s; float f;
+};
 
 /*! Base structure for scalar mask */
 union scalar_m { uint32_t u; int32_t s; float f; };
@@ -87,6 +93,26 @@ struct simd_m {
   __m128 m[vectorNum];
 };
 
+/*! Select instruction on vectors */
+template <uint32_t vectorNum>
+INLINE void select(simd_dw<vectorNum> &dst,
+                   const simd_dw<vectorNum> &src0,
+                   const simd_dw<vectorNum> &src1,
+                   const simd_m<vectorNum> &mask)
+{
+  for (uint32_t i = 0; i < vectorNum; ++i)
+    dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]);
+}
+template <uint32_t vectorNum>
+INLINE void select(simd_m<vectorNum> &dst,
+                   const simd_m<vectorNum> &src0,
+                   const simd_m<vectorNum> &src1,
+                   const simd_m<vectorNum> &mask)
+{
+  for (uint32_t i = 0; i < vectorNum; ++i)
+    dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]);
+}
+
 /*! To cast through memory */
 union cast_dw {
   INLINE cast_dw(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {
@@ -109,6 +135,12 @@ union cast_dw {
 };
 static const cast_dw alltrue(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
 
+/*! Make a mask true */
+template <uint32_t vectorNum>
+INLINE void alltrueMask(simd_m<vectorNum> &x) {
+  for (uint32_t i = 0; i < vectorNum; ++i) x.m[i] = alltrue.v;
+}
+
 /* Some convenient typedefs */
 typedef scalar_dw  simd1dw;
 typedef simd_dw<1> simd4dw;
@@ -143,6 +175,17 @@ INLINE uint32_t mask(const simd_m<vectorNum> v) {
   return m;
 }
 
+/* MOV instruction */
+template <uint32_t vectorNum>
+INLINE void MOV_S32(simd_dw<vectorNum> &dst, const simd_dw<vectorNum> &v) {
+  for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v.m[i];
+}
+template <uint32_t vectorNum>
+INLINE void MOV_S32(simd_dw<vectorNum> &dst, const scalar_dw &x) {
+  const __m128 v = _mm_load1_ps(&x.f);
+  for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v;
+}
+
 /* Vector instructions that use sse* */
 #define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\
 template <uint32_t vectorNum>\
@@ -317,7 +360,6 @@ INLINE void LOADI(simd_dw<vectorNum> &dst, uint32_t u) {
     dst.m[i] = _mm_load1_ps(&cast.f);
 }
 
-#include <cstdio>
 /* Scatter */
 template <uint32_t vectorNum>
 INLINE void SCATTER(const simd_dw<vectorNum> &offset,
@@ -350,7 +392,47 @@ INLINE void SCATTER(const scalar_dw &offset,
                     char *base_address) {
   SCATTER(simd_dw<vectorNum>(offset), value, base_address);
 }
-#include <cstdio>
+
+/* Masked scatter will only store unmasked lanes */
+template <uint32_t vectorNum>
+INLINE void MASKED_SCATTER(const simd_dw<vectorNum> &offset,
+                           const simd_dw<vectorNum> &value,
+                           char *base_address,
+                           uint32_t mask)
+{
+  for (uint32_t i = 0; i < vectorNum; ++i) {
+    const int v0 = _mm_extract_epi32(PS2SI(value.m[i]), 0);
+    const int v1 = _mm_extract_epi32(PS2SI(value.m[i]), 1);
+    const int v2 = _mm_extract_epi32(PS2SI(value.m[i]), 2);
+    const int v3 = _mm_extract_epi32(PS2SI(value.m[i]), 3);
+    const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]), 0);
+    const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1);
+    const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2);
+    const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3);
+    if (mask & 1) *(int*)(base_address + o0) = v0;
+    if (mask & 2) *(int*)(base_address + o1) = v1;
+    if (mask & 4) *(int*)(base_address + o2) = v2;
+    if (mask & 8) *(int*)(base_address + o3) = v3;
+    mask = mask >> 4;
+  }
+}
+template <uint32_t vectorNum>
+INLINE void MASKED_SCATTER(const simd_dw<vectorNum> &offset,
+                           const scalar_dw &value,
+                           char *base_address,
+                           uint32_t mask)
+{
+  MASKED_SCATTER(offset, simd_dw<vectorNum>(value), base_address, mask);
+}
+template <uint32_t vectorNum>
+INLINE void MASKED_SCATTER(const scalar_dw &offset,
+                           const simd_dw<vectorNum> &value,
+                           char *base_address,
+                           uint32_t mask)
+{
+  MASKED_SCATTER(simd_dw<vectorNum>(offset), value, base_address, mask);
+}
+
 /* Gather */
 template <uint32_t vectorNum>
 INLINE void GATHER(simd_dw<vectorNum> &dst,
@@ -378,6 +460,38 @@ INLINE void GATHER(simd_dw<vectorNum> &dst,
   GATHER(dst, simd_dw<vectorNum>(offset), base_address);
 }
 
+/* Masked gather will only load activated lanes (mask bit 4*i+k drives lane k of dst.m[i]) */
+template <uint32_t vectorNum>
+INLINE void MASKED_GATHER(simd_dw<vectorNum> &dst,
+                          const simd_dw<vectorNum> &offset,
+                          const char *base_address,
+                          uint32_t mask)
+{
+  for (uint32_t i = 0; i < vectorNum; ++i) {
+    const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]), 0);
+    const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1);
+    const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2);
+    const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3);
+    /* Read memory only under the lane test: the offset of a disabled lane
+     * may be stale or out of range and must not be dereferenced. Disabled
+     * lanes keep the value they already have in dst */
+    if (mask & 1) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o0), 0));
+    if (mask & 2) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o1), 1));
+    if (mask & 4) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o2), 2));
+    if (mask & 8) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o3), 3));
+    /* The next four mask bits belong to the next 4-wide vector */
+    mask = mask >> 4;
+  }
+}
+template <uint32_t vectorNum>
+INLINE void MASKED_GATHER(simd_dw<vectorNum> &dst,
+                          const scalar_dw &offset,
+                          const char *base_address,
+                          uint32_t mask)
+{
+  MASKED_GATHER(dst, simd_dw<vectorNum>(offset), base_address, mask);
+}
+
 //////////////////////////////////////////////////////////////////////////////
 // Scalar instructions
 //////////////////////////////////////////////////////////////////////////////
@@ -425,6 +539,8 @@ INLINE void GATHER(scalar_dw &dst, scalar_dw offset, const char *base) { dst.u =
 // Identical instructions are forwarded
 //////////////////////////////////////////////////////////////////////////////
 
+#define MOV_U32 MOV_S32 /* moves copy raw 32-bit lanes whatever the type */
+#define MOV_F MOV_S32
 #define ADD_U32 ADD_S32
 #define SUB_U32 SUB_S32
 #define XOR_U32 XOR_S32
@@ -436,6 +552,122 @@ INLINE void GATHER(scalar_dw &dst, scalar_dw offset, const char *base) { dst.u =
 #undef PS2SI
 #undef SI2PS
 #undef ID
+
+//////////////////////////////////////////////////////////////////////////////
+// Goto implementation which is directly inspired by BDW goto and by this
+// article "Whole function vectorization" (CGO 2011)
+//////////////////////////////////////////////////////////////////////////////
+
+/*! Update the UIP vector for the lanes alive in mask */
+template <uint32_t vectorNum>
+INLINE void updateUIP(simd_dw<vectorNum> &uipVec, const simd_m<vectorNum> mask, uint32_t uip) {
+  union { float f; uint32_t u; } x;
+  x.u = uip;
+  __m128 v = _mm_load1_ps(&x.f);
+  for (uint32_t i = 0; i < vectorNum; ++i)
+    uipVec.m[i] = _mm_blendv_ps(uipVec.m[i], v, mask.m[i]);
+}
+
+/*! Update the execution mask based on block IP and UIP values */
+template <uint32_t vectorNum>
+INLINE void updateMask(simd_m<vectorNum> &mask, const simd_dw<vectorNum> &uipVec, uint32_t ip) {
+  const simd_dw<vectorNum> ipv(ip);
+  LE_U32(mask, uipVec, ipv);
+}
+
+/*! Jump to the block JIP */
+#define SIM_FWD_BRA(UIPVEC, EMASK, JIP, UIP) \
+  do { \
+    updateUIP(UIPVEC, EMASK, UIP); \
+    goto label##JIP; \
+  } while (0)
+
+/*! Based on the condition jump to block JIP (goto only when LT_U32(UIPVEC, JIP) gives an empty mask) */
+#define SIM_FWD_BRA_C(UIPVEC, EMASK, COND, JIP, UIP) \
+  do { \
+    updateUIP(UIPVEC, COND, UIP); \
+    typeof(COND) jumpCond; \
+    scalar_dw jipScalar(uint32_t(JIP)); \
+    LT_U32(jumpCond, UIPVEC, jipScalar); \
+    uint32_t jumpMask = mask(jumpCond); \
+    if (!jumpMask) goto label##JIP; \
+  } while (0)
+
+/*! Backward jump is always taken */
+#define SIM_BWD_BRA(UIPVEC, EMASK, JIP) \
+  do { \
+    updateUIP(UIPVEC, EMASK, JIP); \
+    goto label##JIP; \
+  } while (0)
+
+/*! Conditional backward jump is taken if the condition is non-null */
+#define SIM_BWD_BRA_C(UIPVEC, COND, JIP) \
+  do { \
+    updateUIP(UIPVEC, COND, JIP); \
+    if (mask(COND) != 0) goto label##JIP; \
+  } while (0)
+
+/*! JOIN: reactivates lanes */
+#define SIM_JOIN(UIPVEC, MASK, IP) \
+  do { \
+    updateMask(MASK, UIPVEC, IP); \
+    movedMask = mask(MASK); \
+  } while (0)
+
+/*! JOIN_JUMP: reactivate lanes and jump to JIP if none is activated */
+#define SIM_JOIN_JUMP(UIPVEC, EMASK, IP, JIP) \
+  do { \
+    SIM_JOIN(UIPVEC, EMASK, IP); \
+    const uint32_t execMask = mask(EMASK); \
+    if (execMask == 0) goto label##JIP; \
+  } while (0)
+
+/* Macro to apply masking on destinations (from zero to four destinations) */
+#define MASKED0(OP, ...) \
+  do { \
+    OP(__VA_ARGS__); \
+  } while (0)
+
+#define MASKED1(OP, ARG0, ...) \
+  do { \
+    typeof(ARG0) ARG0##__; \
+    OP(ARG0##__, __VA_ARGS__); \
+    select(ARG0, ARG0, ARG0##__, emask); \
+  } while (0)
+
+#define MASKED2(OP, ARG0, ARG1, ...) \
+  do { \
+    typeof(ARG0) ARG0##__; \
+    typeof(ARG1) ARG1##__; \
+    OP(ARG0##__, ARG1##__, __VA_ARGS__); \
+    select(ARG0, ARG0, ARG0##__, emask); \
+    select(ARG1, ARG1, ARG1##__, emask); \
+  } while (0)
+
+#define MASKED3(OP, ARG0, ARG1, ARG2, ...) \
+  do { \
+    typeof(ARG0) ARG0##__; \
+    typeof(ARG1) ARG1##__; \
+    typeof(ARG2) ARG2##__; \
+    OP(ARG0##__, ARG1##__, ARG2##__, __VA_ARGS__); \
+    select(ARG0, ARG0, ARG0##__, emask); \
+    select(ARG1, ARG1, ARG1##__, emask); \
+    select(ARG2, ARG2, ARG2##__, emask); \
+  } while (0)
+
+#define MASKED4(OP, ARG0, ARG1, ARG2, ARG3, ...) \
+  do { \
+    typeof(ARG0) ARG0##__; \
+    typeof(ARG1) ARG1##__; \
+    typeof(ARG2) ARG2##__; \
+    typeof(ARG3) ARG3##__; \
+    OP(ARG0##__, ARG1##__, ARG2##__, ARG3##__, __VA_ARGS__); \
+    select(ARG0, ARG0, ARG0##__, emask); \
+    select(ARG1, ARG1, ARG1##__, emask); \
+    select(ARG2, ARG2, ARG2##__, emask); \
+    select(ARG3, ARG3, ARG3##__, emask); \
+  } while (0)
+
 #undef INLINE
 
 #endif /* __GBE_SIM_VECTOR_H__ */
diff --git a/backend/src/backend/sim/sim_vector_str.cpp b/backend/src/backend/sim/sim_vector_str.cpp
index f900c3f..d3cc938 100644
--- a/backend/src/backend/sim/sim_vector_str.cpp
+++ b/backend/src/backend/sim/sim_vector_str.cpp
@@ -84,7 +84,13 @@ std::string sim_vector_str =
 "}\n"
 "\n"
 "/*! Base structure for scalar double word */\n"
-"union scalar_dw { uint32_t u; int32_t s; float f; };\n"
+"union scalar_dw {\n"
+"  INLINE scalar_dw(void) {}\n"
+"  INLINE scalar_dw(uint32_t u) { this->u = u; }\n"
+"  INLINE scalar_dw(int32_t s) { this->s = s; }\n"
+"  INLINE scalar_dw(float f) { this->f = f; }\n"
+"  uint32_t u; int32_t s; float f;\n"
+"};\n"
 "\n"
 "/*! Base structure for scalar mask */\n"
 "union scalar_m { uint32_t u; int32_t s; float f; };\n"
@@ -113,6 +119,26 @@ std::string sim_vector_str =
 "  __m128 m[vectorNum];\n"
 "};\n"
 "\n"
+"/*! Select instruction on vectors */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void select(simd_dw<vectorNum> &dst,\n"
+"                   const simd_dw<vectorNum> &src0,\n"
+"                   const simd_dw<vectorNum> &src1,\n"
+"                   const simd_m<vectorNum> &mask)\n"
+"{\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i)\n"
+"    dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]);\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void select(simd_m<vectorNum> &dst,\n"
+"                   const simd_m<vectorNum> &src0,\n"
+"                   const simd_m<vectorNum> &src1,\n"
+"                   const simd_m<vectorNum> &mask)\n"
+"{\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i)\n"
+"    dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]);\n"
+"}\n"
+"\n"
 "/*! To cast through memory */\n"
 "union cast_dw {\n"
 "  INLINE cast_dw(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {\n"
@@ -135,6 +161,12 @@ std::string sim_vector_str =
 "};\n"
 "static const cast_dw alltrue(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\n"
 "\n"
+"/*! Make a mask true */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void alltrueMask(simd_m<vectorNum> &x) {\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i) x.m[i] = alltrue.v;\n"
+"}\n"
+"\n"
 "/* Some convenient typedefs */\n"
 "typedef scalar_dw  simd1dw;\n"
 "typedef simd_dw<1> simd4dw;\n"
@@ -169,6 +201,17 @@ std::string sim_vector_str =
 "  return m;\n"
 "}\n"
 "\n"
+"/* MOV instruction */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MOV_S32(simd_dw<vectorNum> &dst, const simd_dw<vectorNum> &v) {\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v.m[i];\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MOV_S32(simd_dw<vectorNum> &dst, const scalar_dw &x) {\n"
+"  const __m128 v = _mm_load1_ps(&x.f);\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v;\n"
+"}\n"
+"\n"
 "/* Vector instructions that use sse* */\n"
 "#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\\\n"
 "template <uint32_t vectorNum>\\\n"
@@ -343,7 +386,6 @@ std::string sim_vector_str =
 "    dst.m[i] = _mm_load1_ps(&cast.f);\n"
 "}\n"
 "\n"
-"#include <cstdio>\n"
 "/* Scatter */\n"
 "template <uint32_t vectorNum>\n"
 "INLINE void SCATTER(const simd_dw<vectorNum> &offset,\n"
@@ -376,7 +418,47 @@ std::string sim_vector_str =
 "                    char *base_address) {\n"
 "  SCATTER(simd_dw<vectorNum>(offset), value, base_address);\n"
 "}\n"
-"#include <cstdio>\n"
+"\n"
+"/* Masked scatter will only store unmasked lanes */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MASKED_SCATTER(const simd_dw<vectorNum> &offset,\n"
+"                           const simd_dw<vectorNum> &value,\n"
+"                           char *base_address,\n"
+"                           uint32_t mask)\n"
+"{\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i) {\n"
+"    const int v0 = _mm_extract_epi32(PS2SI(value.m[i]), 0);\n"
+"    const int v1 = _mm_extract_epi32(PS2SI(value.m[i]), 1);\n"
+"    const int v2 = _mm_extract_epi32(PS2SI(value.m[i]), 2);\n"
+"    const int v3 = _mm_extract_epi32(PS2SI(value.m[i]), 3);\n"
+"    const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]), 0);\n"
+"    const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1);\n"
+"    const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2);\n"
+"    const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3);\n"
+"    if (mask & 1) *(int*)(base_address + o0) = v0;\n"
+"    if (mask & 2) *(int*)(base_address + o1) = v1;\n"
+"    if (mask & 4) *(int*)(base_address + o2) = v2;\n"
+"    if (mask & 8) *(int*)(base_address + o3) = v3;\n"
+"    mask = mask >> 4;\n"
+"  }\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MASKED_SCATTER(const simd_dw<vectorNum> &offset,\n"
+"                           const scalar_dw &value,\n"
+"                           char *base_address,\n"
+"                           uint32_t mask)\n"
+"{\n"
+"  MASKED_SCATTER(offset, simd_dw<vectorNum>(value), base_address, mask);\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MASKED_SCATTER(const scalar_dw &offset,\n"
+"                           const simd_dw<vectorNum> &value,\n"
+"                           char *base_address,\n"
+"                           uint32_t mask)\n"
+"{\n"
+"  MASKED_SCATTER(simd_dw<vectorNum>(offset), value, base_address, mask);\n"
+"}\n"
+"\n"
 "/* Gather */\n"
 "template <uint32_t vectorNum>\n"
 "INLINE void GATHER(simd_dw<vectorNum> &dst,\n"
@@ -404,6 +486,38 @@ std::string sim_vector_str =
 "  GATHER(dst, simd_dw<vectorNum>(offset), base_address);\n"
 "}\n"
 "\n"
+"/* Masked gather will only load activated lanes (mask bit 4*i+k drives lane k of dst.m[i]) */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MASKED_GATHER(simd_dw<vectorNum> &dst,\n"
+"                          const simd_dw<vectorNum> &offset,\n"
+"                          const char *base_address,\n"
+"                          uint32_t mask)\n"
+"{\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i) {\n"
+"    const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]), 0);\n"
+"    const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1);\n"
+"    const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2);\n"
+"    const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3);\n"
+"    /* Read memory only under the lane test: the offset of a disabled lane\n"
+"     * may be stale or out of range and must not be dereferenced. Disabled\n"
+"     * lanes keep the value they already have in dst */\n"
+"    if (mask & 1) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o0), 0));\n"
+"    if (mask & 2) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o1), 1));\n"
+"    if (mask & 4) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o2), 2));\n"
+"    if (mask & 8) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o3), 3));\n"
+"    /* The next four mask bits belong to the next 4-wide vector */\n"
+"    mask = mask >> 4;\n"
+"  }\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MASKED_GATHER(simd_dw<vectorNum> &dst,\n"
+"                          const scalar_dw &offset,\n"
+"                          const char *base_address,\n"
+"                          uint32_t mask)\n"
+"{\n"
+"  MASKED_GATHER(dst, simd_dw<vectorNum>(offset), base_address, mask);\n"
+"}\n"
+"\n"
 "//////////////////////////////////////////////////////////////////////////////\n"
 "// Scalar instructions\n"
 "//////////////////////////////////////////////////////////////////////////////\n"
@@ -451,6 +565,8 @@ std::string sim_vector_str =
 "// Identical instructions are forwarded\n"
 "//////////////////////////////////////////////////////////////////////////////\n"
 "\n"
+"#define MOV_U32 MOV_S32 /* moves copy raw 32-bit lanes whatever the type */\n"
+"#define MOV_F MOV_S32\n"
 "#define ADD_U32 ADD_S32\n"
 "#define SUB_U32 SUB_S32\n"
 "#define XOR_U32 XOR_S32\n"
@@ -462,6 +578,122 @@ std::string sim_vector_str =
 "#undef PS2SI\n"
 "#undef SI2PS\n"
 "#undef ID\n"
+"\n"
+"//////////////////////////////////////////////////////////////////////////////\n"
+"// Goto implementation which is directly inspired by BDW goto and by this\n"
+"// article \"Whole function vectorization\" (CGO 2011)\n"
+"//////////////////////////////////////////////////////////////////////////////\n"
+"\n"
+"/*! Update the UIP vector for the lanes alive in mask */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void updateUIP(simd_dw<vectorNum> &uipVec, const simd_m<vectorNum> mask, uint32_t uip) {\n"
+"  union { float f; uint32_t u; } x;\n"
+"  x.u = uip;\n"
+"  __m128 v = _mm_load1_ps(&x.f);\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i)\n"
+"    uipVec.m[i] = _mm_blendv_ps(uipVec.m[i], v, mask.m[i]);\n"
+"}\n"
+"\n"
+"/*! Update the execution mask based on block IP and UIP values */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void updateMask(simd_m<vectorNum> &mask, const simd_dw<vectorNum> &uipVec, uint32_t ip) {\n"
+"  const simd_dw<vectorNum> ipv(ip);\n"
+"  LE_U32(mask, uipVec, ipv);\n"
+"}\n"
+"\n"
+"/*! Jump to the block JIP */\n"
+"#define SIM_FWD_BRA(UIPVEC, EMASK, JIP, UIP) \\\n"
+"  do { \\\n"
+"    updateUIP(UIPVEC, EMASK, UIP); \\\n"
+"    goto label##JIP; \\\n"
+"  } while (0)\n"
+"\n"
+"/*! Based on the condition jump to block JIP (goto only when LT_U32(UIPVEC, JIP) gives an empty mask) */\n"
+"#define SIM_FWD_BRA_C(UIPVEC, EMASK, COND, JIP, UIP) \\\n"
+"  do { \\\n"
+"    updateUIP(UIPVEC, COND, UIP); \\\n"
+"    typeof(COND) jumpCond; \\\n"
+"    scalar_dw jipScalar(uint32_t(JIP)); \\\n"
+"    LT_U32(jumpCond, UIPVEC, jipScalar); \\\n"
+"    uint32_t jumpMask = mask(jumpCond); \\\n"
+"    if (!jumpMask) goto label##JIP; \\\n"
+"  } while (0)\n"
+"\n"
+"/*! Backward jump is always taken */\n"
+"#define SIM_BWD_BRA(UIPVEC, EMASK, JIP) \\\n"
+"  do { \\\n"
+"    updateUIP(UIPVEC, EMASK, JIP); \\\n"
+"    goto label##JIP; \\\n"
+"  } while (0)\n"
+"\n"
+"/*! Conditional backward jump is taken if the condition is non-null */\n"
+"#define SIM_BWD_BRA_C(UIPVEC, COND, JIP) \\\n"
+"  do { \\\n"
+"    updateUIP(UIPVEC, COND, JIP); \\\n"
+"    if (mask(COND) != 0) goto label##JIP; \\\n"
+"  } while (0)\n"
+"\n"
+"/*! JOIN: reactivates lanes */\n"
+"#define SIM_JOIN(UIPVEC, MASK, IP) \\\n"
+"  do { \\\n"
+"    updateMask(MASK, UIPVEC, IP); \\\n"
+"    movedMask = mask(MASK); \\\n"
+"  } while (0)\n"
+"\n"
+"/*! JOIN_JUMP: reactivate lanes and jump to JIP if none is activated */\n"
+"#define SIM_JOIN_JUMP(UIPVEC, EMASK, IP, JIP) \\\n"
+"  do { \\\n"
+"    SIM_JOIN(UIPVEC, EMASK, IP); \\\n"
+"    const uint32_t execMask = mask(EMASK); \\\n"
+"    if (execMask == 0) goto label##JIP; \\\n"
+"  } while (0)\n"
+"\n"
+"/* Macro to apply masking on destinations (from zero to four destinations) */\n"
+"#define MASKED0(OP, ...) \\\n"
+"  do { \\\n"
+"    OP(__VA_ARGS__); \\\n"
+"  } while (0)\n"
+"\n"
+"#define MASKED1(OP, ARG0, ...) \\\n"
+"  do { \\\n"
+"    typeof(ARG0) ARG0##__; \\\n"
+"    OP(ARG0##__, __VA_ARGS__); \\\n"
+"    select(ARG0, ARG0, ARG0##__, emask); \\\n"
+"  } while (0)\n"
+"\n"
+"#define MASKED2(OP, ARG0, ARG1, ...) \\\n"
+"  do { \\\n"
+"    typeof(ARG0) ARG0##__; \\\n"
+"    typeof(ARG1) ARG1##__; \\\n"
+"    OP(ARG0##__, ARG1##__, __VA_ARGS__); \\\n"
+"    select(ARG0, ARG0, ARG0##__, emask); \\\n"
+"    select(ARG1, ARG1, ARG1##__, emask); \\\n"
+"  } while (0)\n"
+"\n"
+"#define MASKED3(OP, ARG0, ARG1, ARG2, ...) \\\n"
+"  do { \\\n"
+"    typeof(ARG0) ARG0##__; \\\n"
+"    typeof(ARG1) ARG1##__; \\\n"
+"    typeof(ARG2) ARG2##__; \\\n"
+"    OP(ARG0##__, ARG1##__, ARG2##__, __VA_ARGS__); \\\n"
+"    select(ARG0, ARG0, ARG0##__, emask); \\\n"
+"    select(ARG1, ARG1, ARG1##__, emask); \\\n"
+"    select(ARG2, ARG2, ARG2##__, emask); \\\n"
+"  } while (0)\n"
+"\n"
+"#define MASKED4(OP, ARG0, ARG1, ARG2, ARG3, ...) \\\n"
+"  do { \\\n"
+"    typeof(ARG0) ARG0##__; \\\n"
+"    typeof(ARG1) ARG1##__; \\\n"
+"    typeof(ARG2) ARG2##__; \\\n"
+"    typeof(ARG3) ARG3##__; \\\n"
+"    OP(ARG0##__, ARG1##__, ARG2##__, ARG3##__, __VA_ARGS__); \\\n"
+"    select(ARG0, ARG0, ARG0##__, emask); \\\n"
+"    select(ARG1, ARG1, ARG1##__, emask); \\\n"
+"    select(ARG2, ARG2, ARG2##__, emask); \\\n"
+"    select(ARG3, ARG3, ARG3##__, emask); \\\n"
+"  } while (0)\n"
+"\n"
 "#undef INLINE\n"
 "\n"
 "#endif /* __GBE_SIM_VECTOR_H__ */\n"
diff --git a/backend/src/backend/sim_context.cpp b/backend/src/backend/sim_context.cpp
index b01ca7b..04176f5 100644
--- a/backend/src/backend/sim_context.cpp
+++ b/backend/src/backend/sim_context.cpp
@@ -69,12 +69,17 @@ namespace gbe
       if (reg == ir::ocl::lid2) lid2 = true;
       const ir::RegisterData regData = fn.getRegisterData(reg);
       switch (regData.family) {
-        case ir::FAMILY_BOOL:
         case ir::FAMILY_BYTE:
         case ir::FAMILY_WORD:
         case ir::FAMILY_QWORD:
           NOT_IMPLEMENTED;
         break;
+        case ir::FAMILY_BOOL:
+          if (isScalarReg(reg) == true)
+            o << "scalar_m _" << regID << ";\n";
+          else
+            o << "simd" << simdWidth << "m _" << regID << ";\n";
+        break;
         case ir::FAMILY_DWORD:
           if (isScalarReg(reg) == true)
             o << "scalar_dw _" << regID << ";\n";
@@ -90,9 +95,9 @@ namespace gbe
     if (lid2 == false) o << "scalar_dw _" << uint32_t(ir::ocl::lid2) << ";\n";
   }
 
-#define LOAD_SPECIAL_REG(CURBE, REG) do {                                 \
-    const int32_t offset = kernel->getCurbeOffset(CURBE, 0);              \
-    if (offset >= 0)                                                      \
+#define LOAD_SPECIAL_REG(CURBE, REG) do { \
+    const int32_t offset = kernel->getCurbeOffset(CURBE, 0); \
+    if (offset >= 0) \
       o << "LOAD(_" << uint32_t(REG) << ", curbe + " << offset << ");\n"; \
   } while (0)
 
@@ -148,16 +153,23 @@ namespace gbe
     };
   }
 
+  void SimContext::emitMaskingCode(void) {
+    o << "simd" << simdWidth << "m " << "emask;\n"
+      << "simd" << simdWidth << "dw " << "uip(scalar_dw(0u));\n"
+      << "alltrueMask(emask);\n"
+      << "uint32_t movedMask = ~0x0u;\n";
+  }
+
   void SimContext::emitInstructionStream(void) {
     using namespace ir;
     fn.foreachInstruction([&](const Instruction &insn) {
       const char *opcodeStr = NULL;
       const Opcode opcode = insn.getOpcode();
-#define DECL_INSN(OPCODE, FAMILY)                         \
-      case OP_##OPCODE:                                   \
-      if (opcode == OP_LOAD) opcodeStr = "GATHER";        \
+#define DECL_INSN(OPCODE, FAMILY) \
+      case OP_##OPCODE: \
+      if (opcode == OP_LOAD) opcodeStr = "GATHER"; \
       else if (opcode == OP_STORE) opcodeStr = "SCATTER"; \
-      else opcodeStr = #OPCODE;                           \
+      else opcodeStr = #OPCODE; \
       break;
       switch (opcode) {
         #include "ir/instruction.hxx"
@@ -167,11 +179,40 @@ namespace gbe
       if (opcode == OP_LABEL) {
         const LabelInstruction labelInsn = cast<LabelInstruction>(insn);
         const LabelIndex index = labelInsn.getLabelIndex();
-        if (usedLabels.contains(index) == true)
-          o << "label" << index << ":\n";
+        o << "\n";
+        if (usedLabels.contains(index) == false)  o << "// ";
+        o << "label" << index << ":\n";
+        o << "SIM_JOIN(uip, emask, " << uint32_t(index) << ");\n";
         return;
       } else if (opcode == OP_BRA) {
-        NOT_IMPLEMENTED;
+        // Get the label of the block
+        const BranchInstruction bra = cast<BranchInstruction>(insn);
+        const BasicBlock *bb = insn.getParent();
+        const Instruction *label = bb->getFirstInstruction();
+        GBE_ASSERT(label->isMemberOf<LabelInstruction>() == true);
+        const LabelIndex srcIndex = cast<LabelInstruction>(label)->getLabelIndex();
+        const LabelIndex dstIndex = bra.getLabelIndex();
+        const bool isPredicated = bra.isPredicated();
+
+        if (uint32_t(dstIndex) > uint32_t(srcIndex)) { // FWD jump here
+          if (isPredicated) {
+            const Register pred = bra.getPredicateIndex();
+            o << "SIM_FWD_BRA_C(uip, emask, " << "_" << pred
+              << ", " << uint32_t(dstIndex) << ", " << uint32_t(dstIndex)
+              << ");\n";
+          } else {
+            o << "SIM_FWD_BRA(uip, emask, "
+              << uint32_t(dstIndex) << ", " << uint32_t(dstIndex)
+              << ");\n";
+          }
+        } else { // BWD jump
+          if (isPredicated) {
+            const Register pred = bra.getPredicateIndex();
+            o << "SIM_BWD_BRA_C(uip, _" << pred
+              << ", " << uint32_t(dstIndex) << ");\n";
+          } else
+            o << "SIM_BWD_BRA(uip, emask, " << uint32_t(dstIndex) << ");\n";
+        }
         return;
       } else if (opcode == OP_RET) {
         o << "return;\n";
@@ -189,7 +230,13 @@ namespace gbe
       // Regular compute instruction
       const uint32_t dstNum = insn.getDstNum();
       const uint32_t srcNum = insn.getSrcNum();
-      o << opcodeStr;
+
+      // These two need a new instruction. Fortunately, it is just a string
+      // manipulation. MASKED(OP,... just becomes MASKED_OP(...)
+      if (opcode == OP_STORE || opcode == OP_LOAD)
+        o << "MASKED_" << opcodeStr << "(";
+      else
+        o << "MASKED" << dstNum << "(" << opcodeStr;
 
       // Append type when needed
       if (insn.isMemberOf<UnaryInstruction>() == true)
@@ -200,7 +247,8 @@ namespace gbe
        o << "_" << typeStr(cast<BinaryInstruction>(insn).getType());
       else if (insn.isMemberOf<CompareInstruction>() == true)
        o << "_" << typeStr(cast<CompareInstruction>(insn).getType());
-      o << "(";
+      if (opcode != OP_STORE && opcode != OP_LOAD)
+        o << ", ";
 
       // Output both destinations and sources in that order
       for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
@@ -220,7 +268,7 @@ namespace gbe
                    imm.type == TYPE_FLOAT);
         o << ", " << imm.data.u32;
       } else if (opcode == OP_LOAD || opcode == OP_STORE)
-        o << ", base";
+        o << ", base, movedMask";
       o << ");\n";
     });
   }
@@ -250,7 +298,9 @@ namespace gbe
       << "const size_t curbe_sz = sim->get_curbe_size(sim);\n"
       << "const char *curbe = (const char*) sim->get_curbe_address(sim) + curbe_sz * tid;\n"
       << "char *base = (char*) sim->get_base_address(sim);\n";
+
     this->emitRegisters();
+    this->emitMaskingCode();
     this->emitCurbeLoad();
     this->emitInstructionStream();
     o << "}\n";
diff --git a/backend/src/backend/sim_context.hpp b/backend/src/backend/sim_context.hpp
index 9021adb..a52d439 100644
--- a/backend/src/backend/sim_context.hpp
+++ b/backend/src/backend/sim_context.hpp
@@ -51,6 +51,8 @@ namespace gbe
     void emitRegisters(void);
     /*! Load the curbe data into the registers */
     void emitCurbeLoad(void);
+    /*! Emit the masking code (mask / UIP) */
+    void emitMaskingCode(void);
     /*! Emit the instructions */
     void emitInstructionStream(void);
     /*! Implements base class */
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index 9cdbbec..0759f46 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -36,6 +36,10 @@
 #include "sys/cvar.hpp"
 #include "sys/platform.hpp"
 
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
 namespace gbe
 {
   BVAR(OCL_OUTPUT_LLVM, false);
@@ -44,10 +48,11 @@ namespace gbe
   bool llvmToGen(ir::Unit &unit, const char *fileName)
   {
     using namespace llvm;
+
     // Get the global LLVM context
     llvm::LLVMContext& c = llvm::getGlobalContext();
     std::string errInfo;
-    llvm::raw_fd_ostream o("-", errInfo);
+    auto *o = new llvm::raw_fd_ostream("-", errInfo);
 
     // Get the module from its file
     SMDiagnostic Err;
@@ -60,7 +65,7 @@ namespace gbe
 
     // Print the code before further optimizations
     if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
-      passes.add(createPrintModulePass(&o));
+      passes.add(createPrintModulePass(o));
     passes.add(createScalarReplAggregatesPass()); // Break up allocas
     passes.add(createRemoveGEPPass(unit));
     passes.add(createConstantPropagationPass());
@@ -72,8 +77,15 @@ namespace gbe
 
     // Print the code extra optimization passes
     if (OCL_OUTPUT_LLVM)
-      passes.add(createPrintModulePass(&o));
+      passes.add(createPrintModulePass(o));
     passes.run(mod);
+
+    // raw_fd_ostream closes stdout when deleted: reopen it on the terminal.
+    // Keep the old stream when there is no tty so stdout never becomes NULL
+    delete o;
+    const int fd = open("/dev/tty", O_WRONLY);
+    FILE *tty = fd >= 0 ? fdopen(fd, "w") : NULL;
+    if (tty != NULL) stdout = tty;
     return true;
   }
 } /* namespace gbe */
diff --git a/backend/src/llvm/stdlib.h b/backend/src/ocl_stdlib.h
similarity index 100%
rename from backend/src/llvm/stdlib.h
rename to backend/src/ocl_stdlib.h
diff --git a/backend/src/llvm/stdlib_str.cpp b/backend/src/ocl_stdlib_str.cpp
similarity index 99%
rename from backend/src/llvm/stdlib_str.cpp
rename to backend/src/ocl_stdlib_str.cpp
index 41ce7fe..fb262c2 100644
--- a/backend/src/llvm/stdlib_str.cpp
+++ b/backend/src/ocl_stdlib_str.cpp
@@ -19,7 +19,7 @@
 
 #include "string"
 namespace gbe {
-std::string stdlib_str = 
+std::string ocl_stdlib_str = 
 "#define DECL_INTERNAL_WORK_ITEM_FN(NAME)                             \\\n"
 "__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##0(void);  \\\n"
 "__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##1(void);  \\\n"
-- 
2.7.4