First implementation of branches in the simulator

author Benjamin Segovia <segovia.benjamin@gmail.com>

Mon, 16 Apr 2012 18:44:44 +0000 (18:44 +0000)

committer Keith Packard <keithp@keithp.com>

Fri, 10 Aug 2012 23:16:28 +0000 (16:16 -0700)
author Benjamin Segovia <segovia.benjamin@gmail.com>
Mon, 16 Apr 2012 18:44:44 +0000 (18:44 +0000)
committer Keith Packard <keithp@keithp.com>
Fri, 10 Aug 2012 23:16:28 +0000 (16:16 -0700)
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt

index 26fd6cd..f66235f 100644 (file)
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -22,13 +22,15 @@ endmacro (stringify)
  
  set (TO_STRINGIFY_FILES simulator sim_vector)
  stringify ("${GBE_SOURCE_DIR}/src/backend/sim/" "${TO_STRINGIFY_FILES}")
-set (TO_STRINGIFY_FILES stdlib)
-stringify ("${GBE_SOURCE_DIR}/src/llvm/" "${TO_STRINGIFY_FILES}")
+set (TO_STRINGIFY_FILES ocl_stdlib)
+stringify ("${GBE_SOURCE_DIR}/src/" "${TO_STRINGIFY_FILES}")
  
  if (GBE_USE_BLOB)
    set (GBE_SRC blob.cpp)
  else (GBE_USE_BLOB)
    set (GBE_SRC
+    ocl_stdlib.h
+    ocl_stdlib_str.cpp
      sys/vector.hpp
      sys/hash_map.hpp
      sys/map.hpp
@@ -69,7 +71,6 @@ else (GBE_USE_BLOB)
      ir/function.hpp
      ir/value.cpp
      ir/value.hpp
-    llvm/stdlib_str.cpp
      backend/context.cpp
      backend/context.hpp
      backend/program.cpp
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp

index 08dac6e..9ec86f3 100644 (file)
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -88,7 +88,7 @@ namespace gbe {
      GBE_SAFE_DELETE(program);
    }
  
-  extern std::string stdlib_str;
+  extern std::string ocl_stdlib_str;
    static gbe_program programNewFromSource(const char *source,
                                            size_t stringSize,
                                            char *err,
@@ -101,7 +101,7 @@ namespace gbe {
      // Write the source to the cl file
      FILE *clFile = fopen(clName.c_str(), "w");
      FATAL_IF(clFile == NULL, "Failed to open temporary file");
-    fwrite(stdlib_str.c_str(), strlen(stdlib_str.c_str()), 1, clFile);
+    fwrite(ocl_stdlib_str.c_str(), strlen(ocl_stdlib_str.c_str()), 1, clFile);
      fwrite(source, strlen(source), 1, clFile);
      fclose(clFile);
  
diff --git a/backend/src/backend/sim/sim_vector.h b/backend/src/backend/sim/sim_vector.h

index 34cec34..2baffcc 100644 (file)
--- a/backend/src/backend/sim/sim_vector.h
+++ b/backend/src/backend/sim/sim_vector.h
@@ -58,7 +58,13 @@ INLINE const __m128i expand(const __m128i& b) {
  }
  
  /*! Base structure for scalar double word */
-union scalar_dw { uint32_t u; int32_t s; float f; };
+union scalar_dw {
+  uint32_t u; int32_t s; float f; /* three views of the same 32 bits */
+  INLINE scalar_dw(void) {} /* payload is left uninitialized */
+  INLINE scalar_dw(uint32_t bits) : u(bits) {} /* non-explicit: raw values convert implicitly */
+  INLINE scalar_dw(int32_t value) : s(value) {}
+  INLINE scalar_dw(float value) : f(value) {}
+};
  
  /*! Base structure for scalar mask */
  union scalar_m { uint32_t u; int32_t s; float f; };
@@ -87,6 +93,26 @@ struct simd_m {
    __m128 m[vectorNum];
  };
  
+/*! Per-lane select: dst gets src1 where the mask lane's sign bit is set, src0 elsewhere */
+template <uint32_t vectorNum>
+INLINE void select(simd_dw<vectorNum> &dst,
+                   const simd_dw<vectorNum> &src0,
+                   const simd_dw<vectorNum> &src1,
+                   const simd_m<vectorNum> &mask)
+{
+  for (uint32_t vec = 0; vec != vectorNum; ++vec)
+    dst.m[vec] = _mm_blendv_ps(src0.m[vec], src1.m[vec], mask.m[vec]);
+}
+template <uint32_t vectorNum>
+INLINE void select(simd_m<vectorNum> &dst,
+                   const simd_m<vectorNum> &src0,
+                   const simd_m<vectorNum> &src1,
+                   const simd_m<vectorNum> &mask)
+{ // mask-typed overload: identical blend, applied to mask vectors
+  for (uint32_t vec = 0; vec != vectorNum; ++vec)
+    dst.m[vec] = _mm_blendv_ps(src0.m[vec], src1.m[vec], mask.m[vec]);
+}
+
  /*! To cast through memory */
  union cast_dw {
    INLINE cast_dw(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {
@@ -109,6 +135,12 @@ union cast_dw {
  };
  static const cast_dw alltrue(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
  
+/*! Fill every vector of the mask with the all-ones pattern held in `alltrue` */
+template <uint32_t vectorNum>
+INLINE void alltrueMask(simd_m<vectorNum> &x) {
+  for (uint32_t vec = 0; vec != vectorNum; ++vec) x.m[vec] = alltrue.v;
+}
+
  /* Some convenient typedefs */
  typedef scalar_dw  simd1dw;
  typedef simd_dw<1> simd4dw;
@@ -143,6 +175,17 @@ INLINE uint32_t mask(const simd_m<vectorNum> v) {
    return m;
  }
  
+/* MOV instruction: copy a vector register, or splat a scalar into every lane */
+template <uint32_t vectorNum>
+INLINE void MOV_S32(simd_dw<vectorNum> &dst, const simd_dw<vectorNum> &v) {
+  for (uint32_t vec = 0; vec != vectorNum; ++vec) dst.m[vec] = v.m[vec];
+}
+template <uint32_t vectorNum>
+INLINE void MOV_S32(simd_dw<vectorNum> &dst, const scalar_dw &x) {
+  const __m128 splat = _mm_load1_ps(&x.f); // same 32 bits replicated in all four lanes
+  for (uint32_t vec = 0; vec != vectorNum; ++vec) dst.m[vec] = splat;
+}
+
  /* Vector instructions that use sse* */
  #define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\
  template <uint32_t vectorNum>\
@@ -317,7 +360,6 @@ INLINE void LOADI(simd_dw<vectorNum> &dst, uint32_t u) {
      dst.m[i] = _mm_load1_ps(&cast.f);
  }
  
-#include <cstdio>
  /* Scatter */
  template <uint32_t vectorNum>
  INLINE void SCATTER(const simd_dw<vectorNum> &offset,
@@ -350,7 +392,47 @@ INLINE void SCATTER(const scalar_dw &offset,
                      char *base_address) {
    SCATTER(simd_dw<vectorNum>(offset), value, base_address);
  }
-#include <cstdio>
+
+/* Masked scatter: only lanes whose bit is set in mask are written to memory */
+template <uint32_t vectorNum>
+INLINE void MASKED_SCATTER(const simd_dw<vectorNum> &offset,
+                           const simd_dw<vectorNum> &value,
+                           char *base_address,
+                           uint32_t mask)
+{
+  // Bits 4*i .. 4*i+3 of mask guard the four lanes of vector i, so the mask
+  // is shifted down by one nibble after each vector. Offsets are added to a
+  // char pointer, i.e. they are byte offsets from base_address.
+  for (uint32_t vec = 0; vec < vectorNum; ++vec, mask >>= 4) {
+    const __m128i byteOffsets = PS2SI(offset.m[vec]);
+    const __m128i payload = PS2SI(value.m[vec]);
+    if (mask & 1)
+      *(int*)(base_address + _mm_extract_epi32(byteOffsets, 0)) = _mm_extract_epi32(payload, 0);
+    if (mask & 2)
+      *(int*)(base_address + _mm_extract_epi32(byteOffsets, 1)) = _mm_extract_epi32(payload, 1);
+    if (mask & 4)
+      *(int*)(base_address + _mm_extract_epi32(byteOffsets, 2)) = _mm_extract_epi32(payload, 2);
+    if (mask & 8)
+      *(int*)(base_address + _mm_extract_epi32(byteOffsets, 3)) = _mm_extract_epi32(payload, 3);
+  }
+}
+template <uint32_t vectorNum>
+INLINE void MASKED_SCATTER(const simd_dw<vectorNum> &offset,
+                           const scalar_dw &value,
+                           char *base_address, uint32_t mask)
+{ // scalar value: build a vector from it, then reuse the vector/vector version
+  const simd_dw<vectorNum> wideValue(value);
+  MASKED_SCATTER(offset, wideValue, base_address, mask);
+}
+template <uint32_t vectorNum>
+INLINE void MASKED_SCATTER(const scalar_dw &offset,
+                           const simd_dw<vectorNum> &value,
+                           char *base_address, uint32_t mask)
+{ // scalar offset: build a vector from it, then reuse the vector/vector version
+  const simd_dw<vectorNum> wideOffset(offset);
+  MASKED_SCATTER(wideOffset, value, base_address, mask);
+}
+
  /* Gather */
  template <uint32_t vectorNum>
  INLINE void GATHER(simd_dw<vectorNum> &dst,
@@ -378,6 +460,38 @@ INLINE void GATHER(simd_dw<vectorNum> &dst,
    GATHER(dst, simd_dw<vectorNum>(offset), base_address);
  }
  
+/* Masked gather: load a 32-bit value from base_address + offset into each
+ * lane whose bit is set in mask. Other lanes of dst keep their value and
+ * their address is never dereferenced, because an inactive lane may hold a
+ * stale or out-of-range offset. */
+template <uint32_t vectorNum>
+INLINE void MASKED_GATHER(simd_dw<vectorNum> &dst,
+                          const simd_dw<vectorNum> &offset,
+                          const char *base_address,
+                          uint32_t mask)
+{
+  // Bits 4*i .. 4*i+3 of mask guard the four lanes of vector i
+  for (uint32_t i = 0; i < vectorNum; ++i) {
+    const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]), 0);
+    const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1);
+    const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2);
+    const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3);
+    if (mask & 1) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o0), 0));
+    if (mask & 2) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o1), 1));
+    if (mask & 4) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o2), 2));
+    if (mask & 8) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), *(const int*)(base_address + o3), 3));
+    mask = mask >> 4;
+  }
+}
+template <uint32_t vectorNum>
+INLINE void MASKED_GATHER(simd_dw<vectorNum> &dst,
+                          const scalar_dw &offset,
+                          const char *base_address, uint32_t mask)
+{ // scalar offset: build a vector from it, then reuse the vector-offset version
+  const simd_dw<vectorNum> wideOffset(offset);
+  MASKED_GATHER(dst, wideOffset, base_address, mask);
+}
+
  //////////////////////////////////////////////////////////////////////////////
  // Scalar instructions
  //////////////////////////////////////////////////////////////////////////////
@@ -425,6 +539,8 @@ INLINE void GATHER(scalar_dw &dst, scalar_dw offset, const char *base) { dst.u =
  // Identical instructions are forwarded
  //////////////////////////////////////////////////////////////////////////////
  
+#define MOV_U32 MOV_S32 /* a move is a plain bit copy, so every 32-bit type forwards to MOV_S32 */
+#define MOV_F MOV_S32
  #define ADD_U32 ADD_S32
  #define SUB_U32 SUB_S32
  #define XOR_U32 XOR_S32
@@ -436,6 +552,122 @@ INLINE void GATHER(scalar_dw &dst, scalar_dw offset, const char *base) { dst.u =
  #undef PS2SI
  #undef SI2PS
  #undef ID
+
+//////////////////////////////////////////////////////////////////////////////
+// Goto implementation which is directly inspired by BDW goto and by this
+// article "Whole function vectorization" (CGO 2011)
+//////////////////////////////////////////////////////////////////////////////
+
+/*! For every lane whose mask sign bit is set, overwrite its entry of uipVec with uip */
+template <uint32_t vectorNum>
+INLINE void updateUIP(simd_dw<vectorNum> &uipVec, const simd_m<vectorNum> mask, uint32_t uip) {
+  // View the integer as float bits so it can be broadcast without conversion
+  const scalar_dw target(uip);
+  const __m128 broadcast = _mm_load1_ps(&target.f);
+  for (uint32_t vec = 0; vec != vectorNum; ++vec)
+    uipVec.m[vec] = _mm_blendv_ps(uipVec.m[vec], broadcast, mask.m[vec]);
+}
+
+/*! Recompute the execution mask of block `ip`: mask receives the result of LE_U32(uipVec, ip) */
+template <uint32_t vectorNum>
+INLINE void updateMask(simd_m<vectorNum> &mask, const simd_dw<vectorNum> &uipVec, uint32_t ip) {
+  const simd_dw<vectorNum> blockIP(ip); // vector operand built from the block's IP
+  LE_U32(mask, uipVec, blockIP);
+}
+
+/*! Unconditional forward branch: lanes active in EMASK get UIP as their target, then control jumps to label JIP */
+#define SIM_FWD_BRA(UIPVEC, EMASK, JIP, UIP) \
+  do { \
+    updateUIP(UIPVEC, EMASK, UIP); \
+    goto label##JIP; \
+  } while (0)
+
+/*! Conditional forward branch: lanes where COND holds get UIP as their target; jump to label JIP unless LT_U32 flags a lane whose UIP is below JIP */
+#define SIM_FWD_BRA_C(UIPVEC, EMASK, COND, JIP, UIP) \
+  do { \
+    updateUIP(UIPVEC, COND, UIP); \
+    typeof(COND) jumpCond; \
+    const scalar_dw jipScalar = scalar_dw(uint32_t(JIP)); \
+    LT_U32(jumpCond, UIPVEC, jipScalar); \
+    const uint32_t jumpMask = mask(jumpCond); \
+    if (!jumpMask) goto label##JIP; \
+  } while (0)
+
+/*! Unconditional backward branch: lanes active in EMASK get JIP as their UIP, then control always jumps to label JIP */
+#define SIM_BWD_BRA(UIPVEC, EMASK, JIP) \
+  do { \
+    updateUIP(UIPVEC, EMASK, JIP); \
+    goto label##JIP; \
+  } while (0)
+
+/*! Conditional backward branch: lanes where COND holds get JIP as their UIP; the jump is taken when mask(COND) is non-zero */
+#define SIM_BWD_BRA_C(UIPVEC, COND, JIP) \
+  do { \
+    updateUIP(UIPVEC, COND, JIP); \
+    if (mask(COND) != 0) goto label##JIP; \
+  } while (0)
+
+/*! JOIN: reactivates lanes by recomputing MASK from UIPVEC and IP, then stores mask(MASK) in `movedMask`, which must be declared at the expansion site */
+#define SIM_JOIN(UIPVEC, MASK, IP) \
+  do { \
+    updateMask(MASK, UIPVEC, IP); \
+    movedMask = mask(MASK); \
+  } while (0)
+
+/*! JOIN_JUMP: reactivate lanes (see SIM_JOIN) and jump to label JIP if no lane is active afterwards */
+#define SIM_JOIN_JUMP(UIPVEC, EMASK, IP, JIP) \
+  do { \
+    SIM_JOIN(UIPVEC, EMASK, IP); \
+    const uint32_t execMask = mask(EMASK); \
+    if (execMask == 0) goto label##JIP; \
+  } while (0)
+
+/* MASKEDn(OP, dst0..dstn-1, srcs...): run OP into n temporaries, then select() them into the n destinations under `emask`, which must be visible at the expansion site; MASKED0 calls OP unmasked */
+#define MASKED0(OP, ...) \
+  do { \
+    OP(__VA_ARGS__); \
+  } while (0)
+
+#define MASKED1(OP, ARG0, ...) \
+  do { \
+    typeof(ARG0) ARG0##__; \
+    OP(ARG0##__, __VA_ARGS__); \
+    select(ARG0, ARG0, ARG0##__, emask); \
+  } while (0)
+
+#define MASKED2(OP, ARG0, ARG1, ...) \
+  do { \
+    typeof(ARG0) ARG0##__; \
+    typeof(ARG1) ARG1##__; \
+    OP(ARG0##__, ARG1##__, __VA_ARGS__); \
+    select(ARG0, ARG0, ARG0##__, emask); \
+    select(ARG1, ARG1, ARG1##__, emask); \
+  } while (0)
+
+#define MASKED3(OP, ARG0, ARG1, ARG2, ...) \
+  do { \
+    typeof(ARG0) ARG0##__; \
+    typeof(ARG1) ARG1##__; \
+    typeof(ARG2) ARG2##__; \
+    OP(ARG0##__, ARG1##__, ARG2##__, __VA_ARGS__); \
+    select(ARG0, ARG0, ARG0##__, emask); \
+    select(ARG1, ARG1, ARG1##__, emask); \
+    select(ARG2, ARG2, ARG2##__, emask); \
+  } while (0)
+
+#define MASKED4(OP, ARG0, ARG1, ARG2, ARG3, ...) \
+  do { \
+    typeof(ARG0) ARG0##__; \
+    typeof(ARG1) ARG1##__; \
+    typeof(ARG2) ARG2##__; \
+    typeof(ARG3) ARG3##__; \
+    OP(ARG0##__, ARG1##__, ARG2##__, ARG3##__, __VA_ARGS__); \
+    select(ARG0, ARG0, ARG0##__, emask); \
+    select(ARG1, ARG1, ARG1##__, emask); \
+    select(ARG2, ARG2, ARG2##__, emask); \
+    select(ARG3, ARG3, ARG3##__, emask); \
+  } while (0)
+
  #undef INLINE
  
  #endif /* __GBE_SIM_VECTOR_H__ */
diff --git a/backend/src/backend/sim/sim_vector_str.cpp b/backend/src/backend/sim/sim_vector_str.cpp

index f900c3f..d3cc938 100644 (file)
--- a/backend/src/backend/sim/sim_vector_str.cpp
+++ b/backend/src/backend/sim/sim_vector_str.cpp
@@ -84,7 +84,13 @@ std::string sim_vector_str =
  "}\n"
  "\n"
  "/*! Base structure for scalar double word */\n"
-"union scalar_dw { uint32_t u; int32_t s; float f; };\n"
+"union scalar_dw {\n"
+"  INLINE scalar_dw(void) {}\n"
+"  INLINE scalar_dw(uint32_t u) { this->u = u; }\n"
+"  INLINE scalar_dw(int32_t s) { this->s = s; }\n"
+"  INLINE scalar_dw(float f) { this->f = f; }\n"
+"  uint32_t u; int32_t s; float f;\n"
+"};\n"
  "\n"
  "/*! Base structure for scalar mask */\n"
  "union scalar_m { uint32_t u; int32_t s; float f; };\n"
@@ -113,6 +119,26 @@ std::string sim_vector_str =
  "  __m128 m[vectorNum];\n"
  "};\n"
  "\n"
+"/*! Select instruction on vectors */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void select(simd_dw<vectorNum> &dst,\n"
+"                   const simd_dw<vectorNum> &src0,\n"
+"                   const simd_dw<vectorNum> &src1,\n"
+"                   const simd_m<vectorNum> &mask)\n"
+"{\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i)\n"
+"    dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]);\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void select(simd_m<vectorNum> &dst,\n"
+"                   const simd_m<vectorNum> &src0,\n"
+"                   const simd_m<vectorNum> &src1,\n"
+"                   const simd_m<vectorNum> &mask)\n"
+"{\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i)\n"
+"    dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]);\n"
+"}\n"
+"\n"
  "/*! To cast through memory */\n"
  "union cast_dw {\n"
  "  INLINE cast_dw(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {\n"
@@ -135,6 +161,12 @@ std::string sim_vector_str =
  "};\n"
  "static const cast_dw alltrue(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\n"
  "\n"
+"/*! Make a mask true */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void alltrueMask(simd_m<vectorNum> &x) {\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i) x.m[i] = alltrue.v;\n"
+"}\n"
+"\n"
  "/* Some convenient typedefs */\n"
  "typedef scalar_dw  simd1dw;\n"
  "typedef simd_dw<1> simd4dw;\n"
@@ -169,6 +201,17 @@ std::string sim_vector_str =
  "  return m;\n"
  "}\n"
  "\n"
+"/* MOV instruction */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MOV_S32(simd_dw<vectorNum> &dst, const simd_dw<vectorNum> &v) {\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v.m[i];\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MOV_S32(simd_dw<vectorNum> &dst, const scalar_dw &x) {\n"
+"  const __m128 v = _mm_load1_ps(&x.f);\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v;\n"
+"}\n"
+"\n"
  "/* Vector instructions that use sse* */\n"
  "#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\\\n"
  "template <uint32_t vectorNum>\\\n"
@@ -343,7 +386,6 @@ std::string sim_vector_str =
  "    dst.m[i] = _mm_load1_ps(&cast.f);\n"
  "}\n"
  "\n"
-"#include <cstdio>\n"
  "/* Scatter */\n"
  "template <uint32_t vectorNum>\n"
  "INLINE void SCATTER(const simd_dw<vectorNum> &offset,\n"
@@ -376,7 +418,47 @@ std::string sim_vector_str =
  "                    char *base_address) {\n"
  "  SCATTER(simd_dw<vectorNum>(offset), value, base_address);\n"
  "}\n"
-"#include <cstdio>\n"
+"\n"
+"/* Masked scatter will only store unmasked lanes */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MASKED_SCATTER(const simd_dw<vectorNum> &offset,\n"
+"                           const simd_dw<vectorNum> &value,\n"
+"                           char *base_address,\n"
+"                           uint32_t mask)\n"
+"{\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i) {\n"
+"    const int v0 = _mm_extract_epi32(PS2SI(value.m[i]), 0);\n"
+"    const int v1 = _mm_extract_epi32(PS2SI(value.m[i]), 1);\n"
+"    const int v2 = _mm_extract_epi32(PS2SI(value.m[i]), 2);\n"
+"    const int v3 = _mm_extract_epi32(PS2SI(value.m[i]), 3);\n"
+"    const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]), 0);\n"
+"    const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1);\n"
+"    const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2);\n"
+"    const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3);\n"
+"    if (mask & 1) *(int*)(base_address + o0) = v0;\n"
+"    if (mask & 2) *(int*)(base_address + o1) = v1;\n"
+"    if (mask & 4) *(int*)(base_address + o2) = v2;\n"
+"    if (mask & 8) *(int*)(base_address + o3) = v3;\n"
+"    mask = mask >> 4;\n"
+"  }\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MASKED_SCATTER(const simd_dw<vectorNum> &offset,\n"
+"                           const scalar_dw &value,\n"
+"                           char *base_address,\n"
+"                           uint32_t mask)\n"
+"{\n"
+"  MASKED_SCATTER(offset, simd_dw<vectorNum>(value), base_address, mask);\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MASKED_SCATTER(const scalar_dw &offset,\n"
+"                           const simd_dw<vectorNum> &value,\n"
+"                           char *base_address,\n"
+"                           uint32_t mask)\n"
+"{\n"
+"  MASKED_SCATTER(simd_dw<vectorNum>(offset), value, base_address, mask);\n"
+"}\n"
+"\n"
  "/* Gather */\n"
  "template <uint32_t vectorNum>\n"
  "INLINE void GATHER(simd_dw<vectorNum> &dst,\n"
@@ -404,6 +486,38 @@ std::string sim_vector_str =
  "  GATHER(dst, simd_dw<vectorNum>(offset), base_address);\n"
  "}\n"
  "\n"
+"/* Masked gather will only load activated lanes */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MASKED_GATHER(simd_dw<vectorNum> &dst,\n"
+"                          const simd_dw<vectorNum> &offset,\n"
+"                          const char *base_address,\n"
+"                          uint32_t mask)\n"
+"{\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i) {\n"
+"    const int o0 = _mm_extract_epi32(PS2SI(offset.m[i]) , 0);\n"
+"    const int o1 = _mm_extract_epi32(PS2SI(offset.m[i]), 1);\n"
+"    const int o2 = _mm_extract_epi32(PS2SI(offset.m[i]), 2);\n"
+"    const int o3 = _mm_extract_epi32(PS2SI(offset.m[i]), 3);\n"
+"    const int v0 = *(const int*)(base_address + o0);\n"
+"    const int v1 = *(const int*)(base_address + o1);\n"
+"    const int v2 = *(const int*)(base_address + o2);\n"
+"    const int v3 = *(const int*)(base_address + o3);\n"
+"    if (mask & 1) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v0, 0));\n"
+"    if (mask & 2) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v1, 1));\n"
+"    if (mask & 4) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v2, 2));\n"
+"    if (mask & 8) dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v3, 3));\n"
+"    mask = mask >> 4;\n"
+"  }\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MASKED_GATHER(simd_dw<vectorNum> &dst,\n"
+"                          const scalar_dw &offset,\n"
+"                          const char *base_address,\n"
+"                          uint32_t mask)\n"
+"{\n"
+"  MASKED_GATHER(dst, simd_dw<vectorNum>(offset), base_address, mask);\n"
+"}\n"
+"\n"
  "//////////////////////////////////////////////////////////////////////////////\n"
  "// Scalar instructions\n"
  "//////////////////////////////////////////////////////////////////////////////\n"
@@ -451,6 +565,8 @@ std::string sim_vector_str =
  "// Identical instructions are forwarded\n"
  "//////////////////////////////////////////////////////////////////////////////\n"
  "\n"
+"#define NOV_U32 MOV_S32\n"
+"#define NOV_F MOV_S32\n"
  "#define ADD_U32 ADD_S32\n"
  "#define SUB_U32 SUB_S32\n"
  "#define XOR_U32 XOR_S32\n"
@@ -462,6 +578,122 @@ std::string sim_vector_str =
  "#undef PS2SI\n"
  "#undef SI2PS\n"
  "#undef ID\n"
+"\n"
+"//////////////////////////////////////////////////////////////////////////////\n"
+"// Goto implementation which is directly inspired by BDW goto and by this\n"
+"// article \"Whole function vectorization\" (CGO 2011)\n"
+"//////////////////////////////////////////////////////////////////////////////\n"
+"\n"
+"/*! Update the UIP vector according for the lanes alive in mask */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void updateUIP(simd_dw<vectorNum> &uipVec, const simd_m<vectorNum> mask, uint32_t uip) {\n"
+"  union { float f; uint32_t u; } x;\n"
+"  x.u = uip;\n"
+"  __m128 v = _mm_load1_ps(&x.f);\n"
+"  for (uint32_t i = 0; i < vectorNum; ++i)\n"
+"    uipVec.m[i] = _mm_blendv_ps(uipVec.m[i], v, mask.m[i]);\n"
+"}\n"
+"\n"
+"/*! Update the execution mask based on block IP and UIP values */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void updateMask(simd_m<vectorNum> &mask, const simd_dw<vectorNum> &uipVec, uint32_t ip) {\n"
+"  const simd_dw<vectorNum> ipv(ip);\n"
+"  LE_U32(mask, uipVec, ipv);\n"
+"}\n"
+"\n"
+"/*! Jump to the block JIP */\n"
+"#define SIM_FWD_BRA(UIPVEC, EMASK, JIP, UIP) \\\n"
+"  do { \\\n"
+"    updateUIP(UIPVEC, EMASK, UIP); \\\n"
+"    goto label##JIP; \\\n"
+"  } while (0)\n"
+"\n"
+"/*! Based on the condition jump to block JIP */\n"
+"#define SIM_FWD_BRA_C(UIPVEC, EMASK, COND, JIP, UIP) \\\n"
+"  do { \\\n"
+"    updateUIP(UIPVEC, COND, UIP); \\\n"
+"    typeof(COND) jumpCond; \\\n"
+"    scalar_dw jipScalar(uint32_t(JIP)); \\\n"
+"    LT_U32(jumpCond, UIPVEC, JIP); \\\n"
+"    uint32_t jumpMask = mask(jumpCond); \\\n"
+"    if (!jumpMask) goto label##JIP; \\\n"
+"  } while (0)\n"
+"\n"
+"/*! Backward jump is always taken */\n"
+"#define SIM_BWD_BRA(UIPVEC, EMASK, JIP) \\\n"
+"  do { \\\n"
+"    updateUIP(UIPVEC, EMASK, JIP); \\\n"
+"    goto label##JIP; \\\n"
+"  } while (0)\n"
+"\n"
+"/*! Conditional backward jump is taken if the condition is non-null */\n"
+"#define SIM_BWD_BRA_C(UIPVEC, COND, JIP) \\\n"
+"  do { \\\n"
+"    updateUIP(UIPVEC, COND, JIP); \\\n"
+"    if (mask(COND) != 0) goto label##JIP; \\\n"
+"  } while (0)\n"
+"\n"
+"/*! JOIN: reactivates lanes */\n"
+"#define SIM_JOIN(UIPVEC, MASK, IP) \\\n"
+"  do { \\\n"
+"    updateMask(MASK, UIPVEC, IP); \\\n"
+"    movedMask = mask(MASK); \\\n"
+"  } while (0)\n"
+"\n"
+"/*! JOIN_JUMP: ractivate lanes and jump to JIP if none is activated */\n"
+"#define SIM_JOIN_JUMP(UIPVEC, EMASK, IP, JIP) \\\n"
+"  do { \\\n"
+"    SIM_JOIN(UIPVEC, EMASK, IP); \\\n"
+"    const uint32_t execMask = mask(EMASK); \\\n"
+"    if (execMask == 0) goto label##JIP; \\\n"
+"  } while (0)\n"
+"\n"
+"/* Macro to apply masking on destinations (from zero to four destinations) */\n"
+"#define MASKED0(OP, ...) \\\n"
+"  do { \\\n"
+"    OP(__VA_ARGS__); \\\n"
+"  } while (0)\n"
+"\n"
+"#define MASKED1(OP, ARG0, ...) \\\n"
+"  do { \\\n"
+"    typeof(ARG0) ARG0##__; \\\n"
+"    OP(ARG0##__, __VA_ARGS__); \\\n"
+"    select(ARG0, ARG0, ARG0##__, emask); \\\n"
+"  } while (0)\n"
+"\n"
+"#define MASKED2(OP, ARG0, ARG1, ...) \\\n"
+"  do { \\\n"
+"    typeof(ARG0) ARG0##__; \\\n"
+"    typeof(ARG1) ARG1##__; \\\n"
+"    OP(ARG0##__, ARG1##__, __VA_ARGS__); \\\n"
+"    select(ARG0, ARG0, ARG0##__, emask); \\\n"
+"    select(ARG1, ARG1, ARG1##__, emask); \\\n"
+"  } while (0)\n"
+"\n"
+"#define MASKED3(OP, ARG0, ARG1, ARG2, ...) \\\n"
+"  do { \\\n"
+"    typeof(ARG0) ARG0##__; \\\n"
+"    typeof(ARG1) ARG1##__; \\\n"
+"    typeof(ARG2) ARG2##__; \\\n"
+"    OP(ARG0##__, ARG1##__, ARG2##__, __VA_ARGS__); \\\n"
+"    select(ARG0, ARG0, ARG0##__, emask); \\\n"
+"    select(ARG1, ARG1, ARG1##__, emask); \\\n"
+"    select(ARG2, ARG2, ARG2##__, emask); \\\n"
+"  } while (0)\n"
+"\n"
+"#define MASKED4(OP, ARG0, ARG1, ARG2, ARG3, ...) \\\n"
+"  do { \\\n"
+"    typeof(ARG0) ARG0##__; \\\n"
+"    typeof(ARG1) ARG1##__; \\\n"
+"    typeof(ARG2) ARG2##__; \\\n"
+"    typeof(ARG3) ARG3##__; \\\n"
+"    OP(ARG0##__, ARG1##__, ARG2##__, ARG3##__, __VA_ARGS__); \\\n"
+"    select(ARG0, ARG0, ARG0##__, emask); \\\n"
+"    select(ARG1, ARG1, ARG1##__, emask); \\\n"
+"    select(ARG2, ARG2, ARG2##__, emask); \\\n"
+"    select(ARG3, ARG3, ARG3##__, emask); \\\n"
+"  } while (0)\n"
+"\n"
  "#undef INLINE\n"
  "\n"
  "#endif /* __GBE_SIM_VECTOR_H__ */\n"
diff --git a/backend/src/backend/sim_context.cpp b/backend/src/backend/sim_context.cpp

index b01ca7b..04176f5 100644 (file)
--- a/backend/src/backend/sim_context.cpp
+++ b/backend/src/backend/sim_context.cpp
@@ -69,12 +69,17 @@ namespace gbe
        if (reg == ir::ocl::lid2) lid2 = true;
        const ir::RegisterData regData = fn.getRegisterData(reg);
        switch (regData.family) {
-        case ir::FAMILY_BOOL:
          case ir::FAMILY_BYTE:
          case ir::FAMILY_WORD:
          case ir::FAMILY_QWORD:
            NOT_IMPLEMENTED;
          break;
+        case ir::FAMILY_BOOL:
+          if (isScalarReg(reg) == true)
+            o << "scalar_m _" << regID << ";\n";
+          else
+            o << "simd" << simdWidth << "m _" << regID << ";\n";
+        break;
          case ir::FAMILY_DWORD:
            if (isScalarReg(reg) == true)
              o << "scalar_dw _" << regID << ";\n";
@@ -90,9 +95,9 @@ namespace gbe
      if (lid2 == false) o << "scalar_dw _" << uint32_t(ir::ocl::lid2) << ";\n";
    }
  
-#define LOAD_SPECIAL_REG(CURBE, REG) do {                                 \
-    const int32_t offset = kernel->getCurbeOffset(CURBE, 0);              \
-    if (offset >= 0)                                                      \
+#define LOAD_SPECIAL_REG(CURBE, REG) do { \
+    const int32_t offset = kernel->getCurbeOffset(CURBE, 0); \
+    if (offset >= 0) \
        o << "LOAD(_" << uint32_t(REG) << ", curbe + " << offset << ");\n"; \
    } while (0)
  
@@ -148,16 +153,23 @@ namespace gbe
      };
    }
  
+  void SimContext::emitMaskingCode(void) {
+    // Emit the declarations of emask, uip and movedMask that the generated SIM_* / MASKED* code refers to
+    o << "simd" << simdWidth << "m " << "emask;\n";
+    o << "simd" << simdWidth << "dw " << "uip(scalar_dw(0u));\n";
+    o << "alltrueMask(emask);\n" << "uint32_t movedMask = ~0x0u;\n";
+  }
+
    void SimContext::emitInstructionStream(void) {
      using namespace ir;
      fn.foreachInstruction([&](const Instruction &insn) {
        const char *opcodeStr = NULL;
        const Opcode opcode = insn.getOpcode();
-#define DECL_INSN(OPCODE, FAMILY)                         \
-      case OP_##OPCODE:                                   \
-      if (opcode == OP_LOAD) opcodeStr = "GATHER";        \
+#define DECL_INSN(OPCODE, FAMILY) \
+      case OP_##OPCODE: \
+      if (opcode == OP_LOAD) opcodeStr = "GATHER"; \
        else if (opcode == OP_STORE) opcodeStr = "SCATTER"; \
-      else opcodeStr = #OPCODE;                           \
+      else opcodeStr = #OPCODE; \
        break;
        switch (opcode) {
          #include "ir/instruction.hxx"
@@ -167,11 +179,40 @@ namespace gbe
        if (opcode == OP_LABEL) {
          const LabelInstruction labelInsn = cast<LabelInstruction>(insn);
          const LabelIndex index = labelInsn.getLabelIndex();
-        if (usedLabels.contains(index) == true)
-          o << "label" << index << ":\n";
+        o << "\n";
+        if (usedLabels.contains(index) == false)  o << "// ";
+        o << "label" << index << ":\n";
+        o << "SIM_JOIN(uip, emask, " << uint32_t(index) << ");\n";
          return;
        } else if (opcode == OP_BRA) {
-        NOT_IMPLEMENTED;
+        // Get the label of the block
+        const BranchInstruction bra = cast<BranchInstruction>(insn);
+        const BasicBlock *bb = insn.getParent();
+        const Instruction *label = bb->getFirstInstruction();
+        GBE_ASSERT(label->isMemberOf<LabelInstruction>() == true);
+        const LabelIndex srcIndex = cast<LabelInstruction>(label)->getLabelIndex();
+        const LabelIndex dstIndex = bra.getLabelIndex();
+        const bool isPredicated = bra.isPredicated();
+
+        if (uint32_t(dstIndex) > uint32_t(srcIndex)) { // FWD jump here
+          if (isPredicated) {
+            const Register pred = bra.getPredicateIndex();
+            o << "SIM_FWD_BRA_C(uip, emask, " << "_" << pred
+              << ", " << uint32_t(dstIndex) << ", " << uint32_t(dstIndex)
+              << ");\n";
+          } else {
+            o << "SIM_FWD_BRA(uip, emask, "
+              << uint32_t(dstIndex) << ", " << uint32_t(dstIndex)
+              << ");\n";
+          }
+        } else { // BWD jump
+          if (isPredicated) {
+            const Register pred = bra.getPredicateIndex();
+            o << "SIM_BWD_BRA_C(uip, _" << pred
+              << ", " << uint32_t(dstIndex) << ");\n";
+          } else
+            o << "SIM_BWD_BRA(uip, emask, " << uint32_t(dstIndex) << ");\n";
+        }
          return;
        } else if (opcode == OP_RET) {
          o << "return;\n";
@@ -189,7 +230,13 @@ namespace gbe
        // Regular compute instruction
        const uint32_t dstNum = insn.getDstNum();
        const uint32_t srcNum = insn.getSrcNum();
-      o << opcodeStr;
+
+      // Loads and stores need dedicated masked macros. Fortunately, this is just
+      // string manipulation: MASKED(OP, ... simply becomes MASKED_OP(...
+      if (opcode == OP_STORE || opcode == OP_LOAD)
+        o << "MASKED_" << opcodeStr << "(";
+      else
+        o << "MASKED" << dstNum << "(" << opcodeStr;
  
        // Append type when needed
        if (insn.isMemberOf<UnaryInstruction>() == true)
@@ -200,7 +247,8 @@ namespace gbe
         o << "_" << typeStr(cast<BinaryInstruction>(insn).getType());
        else if (insn.isMemberOf<CompareInstruction>() == true)
         o << "_" << typeStr(cast<CompareInstruction>(insn).getType());
-      o << "(";
+      if (opcode != OP_STORE && opcode != OP_LOAD)
+        o << ", ";
  
        // Output both destinations and sources in that order
        for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
@@ -220,7 +268,7 @@ namespace gbe
                     imm.type == TYPE_FLOAT);
          o << ", " << imm.data.u32;
        } else if (opcode == OP_LOAD || opcode == OP_STORE)
-        o << ", base";
+        o << ", base, movedMask";
        o << ");\n";
      });
    }
@@ -250,7 +298,9 @@ namespace gbe
        << "const size_t curbe_sz = sim->get_curbe_size(sim);\n"
        << "const char *curbe = (const char*) sim->get_curbe_address(sim) + curbe_sz * tid;\n"
        << "char *base = (char*) sim->get_base_address(sim);\n";
+
      this->emitRegisters();
+    this->emitMaskingCode();
      this->emitCurbeLoad();
      this->emitInstructionStream();
      o << "}\n";
diff --git a/backend/src/backend/sim_context.hpp b/backend/src/backend/sim_context.hpp

index 9021adb..a52d439 100644 (file)
--- a/backend/src/backend/sim_context.hpp
+++ b/backend/src/backend/sim_context.hpp
@@ -51,6 +51,8 @@ namespace gbe
      void emitRegisters(void);
      /*! Load the curbe data into the registers */
      void emitCurbeLoad(void);
+    /*! Emit the masking code (mask / UIP) */
+    void emitMaskingCode(void);
      /*! Emit the instructions */
      void emitInstructionStream(void);
      /*! Implements base class */
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp

index 9cdbbec..0759f46 100644 (file)
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -36,6 +36,10 @@
  #include "sys/cvar.hpp"
  #include "sys/platform.hpp"
  
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
  namespace gbe
  {
    BVAR(OCL_OUTPUT_LLVM, false);
@@ -44,10 +48,11 @@ namespace gbe
    bool llvmToGen(ir::Unit &unit, const char *fileName)
    {
      using namespace llvm;
+
      // Get the global LLVM context
      llvm::LLVMContext& c = llvm::getGlobalContext();
      std::string errInfo;
-    llvm::raw_fd_ostream o("-", errInfo);
+    auto *o = new llvm::raw_fd_ostream("-", errInfo);
  
      // Get the module from its file
      SMDiagnostic Err;
@@ -60,7 +65,7 @@ namespace gbe
  
      // Print the code before further optimizations
      if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
-      passes.add(createPrintModulePass(&o));
+      passes.add(createPrintModulePass(o));
      passes.add(createScalarReplAggregatesPass()); // Break up allocas
      passes.add(createRemoveGEPPass(unit));
      passes.add(createConstantPropagationPass());
@@ -72,8 +77,15 @@ namespace gbe
  
      // Print the code extra optimization passes
      if (OCL_OUTPUT_LLVM)
-      passes.add(createPrintModulePass(&o));
+      passes.add(createPrintModulePass(o));
      passes.run(mod);
+
+    // raw_fd_ostream closes stdout. We must reopen it
+    delete o;
+    int fd;
+    fd = open("/dev/tty", O_WRONLY);
+    stdout = fdopen(fd, "w");
+
      return true;
    }
  } /* namespace gbe */
diff --git a/backend/src/llvm/stdlib.h b/backend/src/ocl_stdlib.h

similarity index 100%

rename from backend/src/llvm/stdlib.h

rename to backend/src/ocl_stdlib.h
diff --git a/backend/src/llvm/stdlib_str.cpp b/backend/src/ocl_stdlib_str.cpp

similarity index 99%

rename from backend/src/llvm/stdlib_str.cpp

rename to backend/src/ocl_stdlib_str.cpp

index 41ce7fe..fb262c2 100644 (file)
--- a/backend/src/llvm/stdlib_str.cpp
+++ b/backend/src/ocl_stdlib_str.cpp
@@ -19,7 +19,7 @@
  
  #include "string"
  namespace gbe {
-std::string stdlib_str = 
+std::string ocl_stdlib_str = 
  "#define DECL_INTERNAL_WORK_ITEM_FN(NAME)                             \\\n"
  "__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##0(void);  \\\n"
  "__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##1(void);  \\\n"
author	Benjamin Segovia <segovia.benjamin@gmail.com>
	Mon, 16 Apr 2012 18:44:44 +0000 (18:44 +0000)
committer	Keith Packard <keithp@keithp.com>
	Fri, 10 Aug 2012 23:16:28 +0000 (16:16 -0700)
backend/src/CMakeLists.txt		patch \| blob \| history
backend/src/backend/program.cpp		patch \| blob \| history
backend/src/backend/sim/sim_vector.h		patch \| blob \| history
backend/src/backend/sim/sim_vector_str.cpp		patch \| blob \| history
backend/src/backend/sim_context.cpp		patch \| blob \| history
backend/src/backend/sim_context.hpp		patch \| blob \| history
backend/src/llvm/llvm_to_gen.cpp		patch \| blob \| history
backend/src/ocl_stdlib.h	[moved from backend/src/llvm/stdlib.h with 100% similarity]	patch \| blob \| history
backend/src/ocl_stdlib_str.cpp	[moved from backend/src/llvm/stdlib_str.cpp with 99% similarity]	patch \| blob \| history