From 9487593e2ce5268b4b9bfc606fdcd4c6c88401e2 Mon Sep 17 00:00:00 2001
From: Benjamin Segovia <devnull@localhost>
Date: Sun, 13 May 2012 17:58:26 +0000
Subject: [PATCH] Added support for loads and stores of uint2/3/4

---
 backend/src/backend/gen_context.cpp | 119 +++++++++++++++---------
 backend/src/backend/gen_context.hpp |   4 +-
 backend/src/ir/function.cpp         |   1 -
 backend/src/ocl_stdlib.h            | 177 ++++++++++++++++++++++++------------
 backend/src/ocl_stdlib_str.cpp      | 177 ++++++++++++++++++++++++------------
 5 files changed, 320 insertions(+), 158 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 52732f7..5613b18 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -118,18 +118,6 @@ namespace gbe
     }
   }
 
-  uint32_t GenContext::createGenReg(ir::Register reg, uint32_t grfOffset) {
-    using namespace ir;
-    if (fn.isSpecialReg(reg) == true) return grfOffset; // already done
-    if (fn.getInput(reg) != NULL) return grfOffset; // already done
-    const RegisterData regData = fn.getRegisterData(reg);
-    const RegisterFamily family = regData.family;
-    const uint32_t typeSize = familySize[family];
-    const uint32_t regSize = simdWidth*typeSize;
-    grfOffset = ALIGN(grfOffset, regSize);
-    if (grfOffset + regSize <= GEN_GRF_SIZE) {
-      const uint32_t nr = grfOffset / GEN_REG_SIZE;
-      const uint32_t subnr = (grfOffset % GEN_REG_SIZE) / typeSize;
 #define INSERT_REG(SIMD16, SIMD8, SIMD1) \
   if (this->isScalarReg(reg) == true) { \
     RA.insert(std::make_pair(reg, GenReg::SIMD1(nr, subnr))); \
@@ -142,6 +130,19 @@ namespace gbe
     grfOffset += simdWidth * typeSize; \
   } else \
     NOT_SUPPORTED;
+
+  uint32_t GenContext::createGenReg(ir::Register reg, uint32_t grfOffset) {
+    using namespace ir;
+    if (fn.isSpecialReg(reg) == true) return grfOffset; // already done
+    if (fn.getInput(reg) != NULL) return grfOffset; // already done
+    const RegisterData regData = fn.getRegisterData(reg);
+    const RegisterFamily family = regData.family;
+    const uint32_t typeSize = familySize[family];
+    const uint32_t regSize = simdWidth*typeSize;
+    grfOffset = ALIGN(grfOffset, regSize);
+    if (grfOffset + regSize <= GEN_GRF_SIZE) {
+      const uint32_t nr = grfOffset / GEN_REG_SIZE;
+      const uint32_t subnr = (grfOffset % GEN_REG_SIZE) / typeSize;
       switch (family) {
         case FAMILY_BOOL:
         case FAMILY_WORD:
@@ -156,12 +157,13 @@ namespace gbe
         default:
           NOT_SUPPORTED;
       }
-#undef INSERT_REG
     } else
       NOT_SUPPORTED;
     return grfOffset;
   }
 
+#undef INSERT_REG
+
   void GenContext::allocateRegister(void) {
     using namespace ir;
     GBE_ASSERT(fn.getProfile() == PROFILE_OCL);
@@ -466,7 +468,8 @@ namespace gbe
       p->CMP(genCmp, src0, src1);
     p->pop();
 
-    // We emit a very unoptimized code where we store the resulting mask in a GRF
+    // We emit a very unoptimized code where we store the resulting mask in a
+    // GRF
     p->push();
       p->curr.flag = 0;
       p->curr.subFlag = 1;
@@ -549,23 +552,46 @@ namespace gbe
     }
   }
 
-  void GenContext::emitUntypedRead(const ir::LoadInstruction &insn,
-                                   GenReg address,
-                                   GenReg value)
+  void GenContext::emitUntypedRead(const ir::LoadInstruction &insn, GenReg address)
   {
     using namespace ir;
+    const uint32_t valueNum = insn.getValueNum();
+    GenReg src;
     GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL);
-    GBE_ASSERT(insn.getValueNum() == 1);
+
+    // A scalar address register requires to be aligned
     if (isScalarReg(insn.getAddress()) == true) {
+      if (this->simdWidth == 8)
+        src = GenReg::f8grf(112, 0);
+      else
+        src = GenReg::f16grf(112, 0);
+      p->MOV(src, GenReg::retype(address, GEN_TYPE_F));
+    } else
+      src = address;
+
+    // Gather of integer is simpler since we may not need to move the
+    // destination
+    if (valueNum == 1) {
+      const GenReg value = this->genReg(insn.getValue(0));
+      p->UNTYPED_READ(value, src, 0, 1);
+    }
+    // Right now, we just move everything to registers when dealing with
+    // int2/3/4
+    else {
+      p->UNTYPED_READ(GenReg::f8grf(114, 0), src, 0, valueNum);
       if (this->simdWidth == 8) {
-        p->MOV(GenReg::f8grf(112, 0), GenReg::retype(address, GEN_TYPE_F));
-        p->UNTYPED_READ(value, GenReg::f8grf(112, 0), 0, 1);
+        for (uint32_t value = 0; value < valueNum; ++value) {
+          const GenReg dst = this->genReg(insn.getValue(value), TYPE_FLOAT);
+          p->MOV(dst, GenReg::f8grf(114+value, 0));
+        }
       } else if (this->simdWidth == 16) {
-        p->MOV(GenReg::f16grf(112, 0), GenReg::retype(address, GEN_TYPE_F));
-        p->UNTYPED_READ(value, GenReg::f16grf(112, 0), 0, 1);
-      }
-    } else
-      p->UNTYPED_READ(value, address, 0, 1);
+        for (uint32_t value = 0; value < valueNum; ++value) {
+          const GenReg dst = this->genReg(insn.getValue(value), TYPE_FLOAT);
+          p->MOV(dst, GenReg::f16grf(114+2*value, 0));
+        }
+      } else
+        NOT_SUPPORTED;
+    }
   }
 
   INLINE uint32_t getByteScatterGatherSize(ir::Type type) {
@@ -629,31 +655,37 @@ namespace gbe
 
   void GenContext::emitLoadInstruction(const ir::LoadInstruction &insn) {
     using namespace ir;
-    const GenReg value = this->genReg(insn.getValue(0));
     const GenReg address = this->genReg(insn.getAddress());
     GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL);
     GBE_ASSERT(this->isScalarReg(insn.getValue(0)) == false);
     if (insn.isAligned() == true)
-      this->emitUntypedRead(insn, address, value);
+      this->emitUntypedRead(insn, address);
     else {
+      const GenReg value = this->genReg(insn.getValue(0));
       this->emitByteGather(insn, address, value);
     }
   }
 
-  void GenContext::emitUntypedWrite(const ir::StoreInstruction &insn,
-                                    GenReg address,
-                                    GenReg value)
+  void GenContext::emitUntypedWrite(const ir::StoreInstruction &insn)
   {
     using namespace ir;
-    if (this->simdWidth == 8) {
-      p->MOV(GenReg::f8grf(112, 0), GenReg::retype(address, GEN_TYPE_F));
-      p->MOV(GenReg::f8grf(113, 0), GenReg::retype(value, GEN_TYPE_F));
-    } else if (this->simdWidth == 16) {
-      p->MOV(GenReg::f16grf(112, 0), GenReg::retype(address, GEN_TYPE_F));
-      p->MOV(GenReg::f16grf(114, 0), GenReg::retype(value, GEN_TYPE_F));
-    } else
+    const uint32_t srcNum = insn.getSrcNum();
+    const uint32_t valueNum = insn.getValueNum();
+
+    // We do it stupidly right now. We just move everything to temporaries
+    if (this->simdWidth == 8)
+      for (uint32_t src = 0; src < srcNum; ++src) {
+        const GenReg reg = this->genReg(insn.getSrc(src), TYPE_FLOAT);
+        p->MOV(GenReg::f8grf(112+src, 0), reg);
+      }
+    else if (this->simdWidth == 16)
+      for (uint32_t src = 0; src < srcNum; ++src) {
+        const GenReg reg = this->genReg(insn.getSrc(src), TYPE_FLOAT);
+        p->MOV(GenReg::f16grf(112+2*src, 0), reg);
+      }
+    else
       NOT_IMPLEMENTED;
-    p->UNTYPED_WRITE(GenReg::f8grf(112, 0), 0, 1);
+    p->UNTYPED_WRITE(GenReg::f8grf(112, 0), 0, valueNum);
   }
 
   void GenContext::emitByteScatter(const ir::StoreInstruction &insn,
@@ -664,6 +696,7 @@ namespace gbe
     const Type type = insn.getValueType();
     const uint32_t elemSize = getByteScatterGatherSize(type);
 
+    GBE_ASSERT(insn.getValueNum() == 1);
     if (this->simdWidth == 8) {
       p->MOV(GenReg::f8grf(112, 0), GenReg::retype(address, GEN_TYPE_F));
       if (elemSize == GEN_BYTE_SCATTER_DWORD)
@@ -689,13 +722,13 @@ namespace gbe
   void GenContext::emitStoreInstruction(const ir::StoreInstruction &insn) {
     using namespace ir;
     GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL);
-    GBE_ASSERT(insn.getValueNum() == 1);
-    const GenReg address = this->genReg(insn.getAddress());
-    const GenReg value = this->genReg(insn.getValue(0));
     if (insn.isAligned() == true)
-      this->emitUntypedWrite(insn, address, value);
-     else
+      this->emitUntypedWrite(insn);
+    else {
+      const GenReg address = this->genReg(insn.getAddress());
+      const GenReg value = this->genReg(insn.getValue(0));
       this->emitByteScatter(insn, address, value);
+    }
   }
 
   void GenContext::emitFenceInstruction(const ir::FenceInstruction &insn) {}
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index c5b29d6..578086f 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -82,8 +82,8 @@ namespace gbe
     /*! It is not natively suppored on Gen. We implement it here */
     void emitIntMul32x32(const ir::Instruction &insn, GenReg dst, GenReg src0, GenReg src1);
     /*! Use untyped writes and reads for everything aligned on 4 bytes */
-    void emitUntypedRead(const ir::LoadInstruction &insn, GenReg address, GenReg value);
-    void emitUntypedWrite(const ir::StoreInstruction &insn, GenReg address, GenReg value);
+    void emitUntypedRead(const ir::LoadInstruction &insn, GenReg address);
+    void emitUntypedWrite(const ir::StoreInstruction &insn);
     /*! Use byte scatters and gathers for everything not aligned on 4 bytes */
     void emitByteGather(const ir::LoadInstruction &insn, GenReg address, GenReg value);
     void emitByteScatter(const ir::StoreInstruction &insn, GenReg address, GenReg value);
diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp
index a7dafde..bf27f6f 100644
--- a/backend/src/ir/function.cpp
+++ b/backend/src/ir/function.cpp
@@ -75,7 +75,6 @@ namespace ir {
       newBra->replace(&insn);
     });
 
-    std::cout << "blockNum: " << this->blockNum() << std::endl;
     // Reset the label to block mapping
     this->labels.resize(last);
     foreachBlock([&](BasicBlock &bb) {
diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
index 5576f30..6b93263 100644
--- a/backend/src/ocl_stdlib.h
+++ b/backend/src/ocl_stdlib.h
@@ -17,9 +17,50 @@
  * Author: Benjamin Segovia <benjamin.segovia@intel.com>
  */
 
-#define DECL_INTERNAL_WORK_ITEM_FN(NAME)                             \
-__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##0(void);  \
-__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##1(void);  \
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL basic types
+/////////////////////////////////////////////////////////////////////////////
+typedef unsigned int uint;
+typedef unsigned int size_t;
+typedef float float2 __attribute__((ext_vector_type(2)));
+typedef float float3 __attribute__((ext_vector_type(3)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef float float8 __attribute__((ext_vector_type(8)));
+typedef float float16 __attribute__((ext_vector_type(16)));
+typedef int int2 __attribute__((ext_vector_type(2)));
+typedef int int3 __attribute__((ext_vector_type(3)));
+typedef int int4 __attribute__((ext_vector_type(4)));
+typedef int int8 __attribute__((ext_vector_type(8)));
+typedef int int16 __attribute__((ext_vector_type(16)));
+typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
+typedef unsigned uint3 __attribute__((ext_vector_type(3)));
+typedef unsigned uint4 __attribute__((ext_vector_type(4)));
+typedef unsigned uint8 __attribute__((ext_vector_type(8)));
+typedef unsigned uint16 __attribute__((ext_vector_type(16)));
+typedef bool bool2 __attribute__((ext_vector_type(2)));
+typedef bool bool3 __attribute__((ext_vector_type(3)));
+typedef bool bool4 __attribute__((ext_vector_type(4)));
+typedef bool bool8 __attribute__((ext_vector_type(8)));
+typedef bool bool16 __attribute__((ext_vector_type(16)));
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL address space
+/////////////////////////////////////////////////////////////////////////////
+#define __private __attribute__((address_space(0)))
+#define __global __attribute__((address_space(1)))
+#define __constant __attribute__((address_space(2)))
+//#define __local __attribute__((address_space(3)))
+#define global __global
+//#define local __local
+#define constant __constant
+#define private __private
+
+/////////////////////////////////////////////////////////////////////////////
+// Work groups and work items functions
+/////////////////////////////////////////////////////////////////////////////
+#define DECL_INTERNAL_WORK_ITEM_FN(NAME) \
+__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##0(void); \
+__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##1(void); \
 __attribute__((pure,const)) unsigned int __gen_ocl_##NAME##2(void);
 DECL_INTERNAL_WORK_ITEM_FN(get_group_id)
 DECL_INTERNAL_WORK_ITEM_FN(get_local_id)
@@ -28,12 +69,12 @@ DECL_INTERNAL_WORK_ITEM_FN(get_global_size)
 DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
 #undef DECL_INTERNAL_WORK_ITEM_FN
 
-#define DECL_PUBLIC_WORK_ITEM_FN(NAME)              \
-inline unsigned NAME(unsigned int dim) {            \
-  if (dim == 0) return __gen_ocl_##NAME##0();       \
-  else if (dim == 1) return __gen_ocl_##NAME##1();  \
-  else if (dim == 2) return __gen_ocl_##NAME##2();  \
-  else return 0;                                    \
+#define DECL_PUBLIC_WORK_ITEM_FN(NAME) \
+inline unsigned NAME(unsigned int dim) { \
+  if (dim == 0) return __gen_ocl_##NAME##0(); \
+  else if (dim == 1) return __gen_ocl_##NAME##1(); \
+  else if (dim == 2) return __gen_ocl_##NAME##2(); \
+  else return 0; \
 }
 DECL_PUBLIC_WORK_ITEM_FN(get_group_id)
 DECL_PUBLIC_WORK_ITEM_FN(get_local_id)
@@ -42,74 +83,98 @@ DECL_PUBLIC_WORK_ITEM_FN(get_global_size)
 DECL_PUBLIC_WORK_ITEM_FN(get_num_groups)
 #undef DECL_PUBLIC_WORK_ITEM_FN
 
-inline unsigned int get_global_id(unsigned int dim) {
+/////////////////////////////////////////////////////////////////////////////
+// Vector loads and stores
+/////////////////////////////////////////////////////////////////////////////
+
+// These loads and stores will use untyped reads and writes, so we can just
+// cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue.
+// Well we do not care, we do not activate TBAA in the compiler
+#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \
+__attribute__((always_inline, overloadable)) \
+inline TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
+  return *(SPACE TYPE##DIM *) (p + DIM * offset); \
+} \
+__attribute__((always_inline, overloadable)) \
+inline void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \
+  *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \
+}
+
+#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 3, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)
+
+#define DECL_UNTYPED_RW_ALL(TYPE) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __constant) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)
+
+DECL_UNTYPED_RW_ALL(float)
+DECL_UNTYPED_RW_ALL(uint)
+DECL_UNTYPED_RW_ALL(int)
+
+#undef DECL_UNTYPED_RW_ALL
+#undef DECL_UNTYPED_RW_ALL_SPACE
+#undef DECL_UNTYPED_RW_SPACE_N
+
+/////////////////////////////////////////////////////////////////////////////
+// Arithmetic functions
+/////////////////////////////////////////////////////////////////////////////
+__attribute__((always_inline))
+inline uint get_global_id(uint dim) {
   return get_local_id(dim) + get_local_size(dim) * get_group_id(dim);
 }
 
-__attribute__ ((pure,const,overloadable)) float mad(float a, float b, float c);
-__attribute__((overloadable)) inline unsigned select(unsigned src0, unsigned src1, unsigned cond) {
+__attribute__ ((pure, const, overloadable)) float mad(float a, float b, float c);
+__attribute__((overloadable, always_inline))
+inline uint select(uint src0, uint src1, uint cond) {
   return cond ? src0 : src1;
 }
-__attribute__((overloadable)) inline int select(int src0, int src1, int cond) {
+__attribute__((overloadable, always_inline))
+inline int select(int src0, int src1, int cond) {
   return cond ? src0 : src1;
 }
 
-typedef unsigned int uint;
-typedef float float2 __attribute__((ext_vector_type(2)));
-typedef float float3 __attribute__((ext_vector_type(3)));
-typedef float float4 __attribute__((ext_vector_type(4)));
-typedef int int2 __attribute__((ext_vector_type(2)));
-typedef int int3 __attribute__((ext_vector_type(3)));
-typedef int int4 __attribute__((ext_vector_type(4)));
-typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
-typedef unsigned uint3 __attribute__((ext_vector_type(3)));
-typedef unsigned uint4 __attribute__((ext_vector_type(4)));
-typedef bool bool2 __attribute__((ext_vector_type(2)));
-typedef bool bool3 __attribute__((ext_vector_type(3)));
-typedef bool bool4 __attribute__((ext_vector_type(4)));
-
 // This will be optimized out by LLVM and will output LLVM select instructions
-#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK)                       \
-__attribute__((overloadable))                                             \
-inline TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) {            \
-  TYPE4 dst;                                                              \
-  const TYPE x0 = src0.x; /* Fix performance issue with CLANG */          \
-  const TYPE x1 = src1.x;                                                 \
-  const TYPE y0 = src0.y;                                                 \
-  const TYPE y1 = src1.y;                                                 \
-  const TYPE z0 = src0.z;                                                 \
-  const TYPE z1 = src1.z;                                                 \
-  const TYPE w0 = src0.w;                                                 \
-  const TYPE w1 = src1.w;                                                 \
-                                                                          \
-  dst.x = (cond.x & MASK) ? x1 : x0;                                      \
-  dst.y = (cond.y & MASK) ? y1 : y0;                                      \
-  dst.z = (cond.z & MASK) ? z1 : z0;                                      \
-  dst.w = (cond.w & MASK) ? w1 : w0;                                      \
-  return dst;                                                             \
+#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \
+__attribute__((overloadable)) \
+inline TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \
+  TYPE4 dst; \
+  const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \
+  const TYPE x1 = src1.x; \
+  const TYPE y0 = src0.y; \
+  const TYPE y1 = src1.y; \
+  const TYPE z0 = src0.z; \
+  const TYPE z1 = src1.z; \
+  const TYPE w0 = src0.w; \
+  const TYPE w1 = src1.w; \
+  dst.x = (cond.x & MASK) ? x1 : x0; \
+  dst.y = (cond.y & MASK) ? y1 : y0; \
+  dst.z = (cond.z & MASK) ? z1 : z0; \
+  dst.w = (cond.w & MASK) ? w1 : w0; \
+  return dst; \
 }
 DECL_SELECT4(int4, int, int4, 0x80000000)
 DECL_SELECT4(float4, float, int4, 0x80000000)
 #undef DECL_SELECT4
 
-__attribute__((overloadable,always_inline)) inline float2 mad(float2 a, float2 b, float2 c) {
+__attribute__((overloadable,always_inline))
+inline float2 mad(float2 a, float2 b, float2 c) {
   return (float2)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y));
 }
-__attribute__((overloadable,always_inline)) inline float3 mad(float3 a, float3 b, float3 c) {
+__attribute__((overloadable,always_inline))
+inline float3 mad(float3 a, float3 b, float3 c) {
   return (float3)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), mad(a.z,b.z,c.z));
 }
-__attribute__((overloadable,always_inline)) inline float4 mad(float4 a, float4 b, float4 c) {
+__attribute__((overloadable,always_inline))
+inline float4 mad(float4 a, float4 b, float4 c) {
   return (float4)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y),
                   mad(a.z,b.z,c.z), mad(a.w,b.w,c.w));
 }
 
-#define __private __attribute__((address_space(0)))
-#define __global __attribute__((address_space(1)))
-#define __constant __attribute__((address_space(2)))
-//#define __local __attribute__((address_space(3)))
-#define global __global
-//#define local __local
-#define constant __constant
-#define private __private
 
 #define NULL ((void*)0)
diff --git a/backend/src/ocl_stdlib_str.cpp b/backend/src/ocl_stdlib_str.cpp
index 778aa1a..7b37955 100644
--- a/backend/src/ocl_stdlib_str.cpp
+++ b/backend/src/ocl_stdlib_str.cpp
@@ -20,9 +20,50 @@
 #include "string"
 namespace gbe {
 std::string ocl_stdlib_str = 
-"#define DECL_INTERNAL_WORK_ITEM_FN(NAME)                             \\\n"
-"__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##0(void);  \\\n"
-"__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##1(void);  \\\n"
+"/////////////////////////////////////////////////////////////////////////////\n"
+"// OpenCL basic types\n"
+"/////////////////////////////////////////////////////////////////////////////\n"
+"typedef unsigned int uint;\n"
+"typedef unsigned int size_t;\n"
+"typedef float float2 __attribute__((ext_vector_type(2)));\n"
+"typedef float float3 __attribute__((ext_vector_type(3)));\n"
+"typedef float float4 __attribute__((ext_vector_type(4)));\n"
+"typedef float float8 __attribute__((ext_vector_type(8)));\n"
+"typedef float float16 __attribute__((ext_vector_type(16)));\n"
+"typedef int int2 __attribute__((ext_vector_type(2)));\n"
+"typedef int int3 __attribute__((ext_vector_type(3)));\n"
+"typedef int int4 __attribute__((ext_vector_type(4)));\n"
+"typedef int int8 __attribute__((ext_vector_type(8)));\n"
+"typedef int int16 __attribute__((ext_vector_type(16)));\n"
+"typedef unsigned int uint2 __attribute__((ext_vector_type(2)));\n"
+"typedef unsigned uint3 __attribute__((ext_vector_type(3)));\n"
+"typedef unsigned uint4 __attribute__((ext_vector_type(4)));\n"
+"typedef unsigned uint8 __attribute__((ext_vector_type(8)));\n"
+"typedef unsigned uint16 __attribute__((ext_vector_type(16)));\n"
+"typedef bool bool2 __attribute__((ext_vector_type(2)));\n"
+"typedef bool bool3 __attribute__((ext_vector_type(3)));\n"
+"typedef bool bool4 __attribute__((ext_vector_type(4)));\n"
+"typedef bool bool8 __attribute__((ext_vector_type(8)));\n"
+"typedef bool bool16 __attribute__((ext_vector_type(16)));\n"
+"\n"
+"/////////////////////////////////////////////////////////////////////////////\n"
+"// OpenCL address space\n"
+"/////////////////////////////////////////////////////////////////////////////\n"
+"#define __private __attribute__((address_space(0)))\n"
+"#define __global __attribute__((address_space(1)))\n"
+"#define __constant __attribute__((address_space(2)))\n"
+"//#define __local __attribute__((address_space(3)))\n"
+"#define global __global\n"
+"//#define local __local\n"
+"#define constant __constant\n"
+"#define private __private\n"
+"\n"
+"/////////////////////////////////////////////////////////////////////////////\n"
+"// Work groups and work items functions\n"
+"/////////////////////////////////////////////////////////////////////////////\n"
+"#define DECL_INTERNAL_WORK_ITEM_FN(NAME) \\\n"
+"__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##0(void); \\\n"
+"__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##1(void); \\\n"
 "__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##2(void);\n"
 "DECL_INTERNAL_WORK_ITEM_FN(get_group_id)\n"
 "DECL_INTERNAL_WORK_ITEM_FN(get_local_id)\n"
@@ -31,12 +72,12 @@ std::string ocl_stdlib_str =
 "DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)\n"
 "#undef DECL_INTERNAL_WORK_ITEM_FN\n"
 "\n"
-"#define DECL_PUBLIC_WORK_ITEM_FN(NAME)              \\\n"
-"inline unsigned NAME(unsigned int dim) {            \\\n"
-"  if (dim == 0) return __gen_ocl_##NAME##0();       \\\n"
-"  else if (dim == 1) return __gen_ocl_##NAME##1();  \\\n"
-"  else if (dim == 2) return __gen_ocl_##NAME##2();  \\\n"
-"  else return 0;                                    \\\n"
+"#define DECL_PUBLIC_WORK_ITEM_FN(NAME) \\\n"
+"inline unsigned NAME(unsigned int dim) { \\\n"
+"  if (dim == 0) return __gen_ocl_##NAME##0(); \\\n"
+"  else if (dim == 1) return __gen_ocl_##NAME##1(); \\\n"
+"  else if (dim == 2) return __gen_ocl_##NAME##2(); \\\n"
+"  else return 0; \\\n"
 "}\n"
 "DECL_PUBLIC_WORK_ITEM_FN(get_group_id)\n"
 "DECL_PUBLIC_WORK_ITEM_FN(get_local_id)\n"
@@ -45,75 +86,99 @@ std::string ocl_stdlib_str =
 "DECL_PUBLIC_WORK_ITEM_FN(get_num_groups)\n"
 "#undef DECL_PUBLIC_WORK_ITEM_FN\n"
 "\n"
-"inline unsigned int get_global_id(unsigned int dim) {\n"
+"/////////////////////////////////////////////////////////////////////////////\n"
+"// Vector loads and stores\n"
+"/////////////////////////////////////////////////////////////////////////////\n"
+"\n"
+"// These loads and stores will use untyped reads and writes, so we can just\n"
+"// cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue.\n"
+"// Well we do not care, we do not activate TBAA in the compiler\n"
+"#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \\\n"
+"__attribute__((always_inline, overloadable)) \\\n"
+"inline TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \\\n"
+"  return *(SPACE TYPE##DIM *) (p + DIM * offset); \\\n"
+"} \\\n"
+"__attribute__((always_inline, overloadable)) \\\n"
+"inline void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \\\n"
+"  *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \\\n"
+"}\n"
+"\n"
+"#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \\\n"
+"  DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \\\n"
+"  DECL_UNTYPED_RW_SPACE_N(TYPE, 3, SPACE) \\\n"
+"  DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \\\n"
+"  DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \\\n"
+"  DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)\n"
+"\n"
+"#define DECL_UNTYPED_RW_ALL(TYPE) \\\n"
+"DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \\\n"
+"DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \\\n"
+"DECL_UNTYPED_RW_ALL_SPACE(TYPE, __constant) \\\n"
+"DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)\n"
+"\n"
+"DECL_UNTYPED_RW_ALL(float)\n"
+"DECL_UNTYPED_RW_ALL(uint)\n"
+"DECL_UNTYPED_RW_ALL(int)\n"
+"\n"
+"#undef DECL_UNTYPED_RW_ALL\n"
+"#undef DECL_UNTYPED_RW_ALL_SPACE\n"
+"#undef DECL_UNTYPED_RW_SPACE_N\n"
+"\n"
+"/////////////////////////////////////////////////////////////////////////////\n"
+"// Arithmetic functions\n"
+"/////////////////////////////////////////////////////////////////////////////\n"
+"__attribute__((always_inline))\n"
+"inline uint get_global_id(uint dim) {\n"
 "  return get_local_id(dim) + get_local_size(dim) * get_group_id(dim);\n"
 "}\n"
 "\n"
-"__attribute__ ((pure,const,overloadable)) float mad(float a, float b, float c);\n"
-"__attribute__((overloadable)) inline unsigned select(unsigned src0, unsigned src1, unsigned cond) {\n"
+"__attribute__ ((pure, const, overloadable)) float mad(float a, float b, float c);\n"
+"__attribute__((overloadable, always_inline))\n"
+"inline uint select(uint src0, uint src1, uint cond) {\n"
 "  return cond ? src0 : src1;\n"
 "}\n"
-"__attribute__((overloadable)) inline int select(int src0, int src1, int cond) {\n"
+"__attribute__((overloadable, always_inline))\n"
+"inline int select(int src0, int src1, int cond) {\n"
 "  return cond ? src0 : src1;\n"
 "}\n"
 "\n"
-"typedef unsigned int uint;\n"
-"typedef float float2 __attribute__((ext_vector_type(2)));\n"
-"typedef float float3 __attribute__((ext_vector_type(3)));\n"
-"typedef float float4 __attribute__((ext_vector_type(4)));\n"
-"typedef int int2 __attribute__((ext_vector_type(2)));\n"
-"typedef int int3 __attribute__((ext_vector_type(3)));\n"
-"typedef int int4 __attribute__((ext_vector_type(4)));\n"
-"typedef unsigned int uint2 __attribute__((ext_vector_type(2)));\n"
-"typedef unsigned uint3 __attribute__((ext_vector_type(3)));\n"
-"typedef unsigned uint4 __attribute__((ext_vector_type(4)));\n"
-"typedef bool bool2 __attribute__((ext_vector_type(2)));\n"
-"typedef bool bool3 __attribute__((ext_vector_type(3)));\n"
-"typedef bool bool4 __attribute__((ext_vector_type(4)));\n"
-"\n"
 "// This will be optimized out by LLVM and will output LLVM select instructions\n"
-"#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK)                       \\\n"
-"__attribute__((overloadable))                                             \\\n"
-"inline TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) {            \\\n"
-"  TYPE4 dst;                                                              \\\n"
-"  const TYPE x0 = src0.x; /* Fix performance issue with CLANG */          \\\n"
-"  const TYPE x1 = src1.x;                                                 \\\n"
-"  const TYPE y0 = src0.y;                                                 \\\n"
-"  const TYPE y1 = src1.y;                                                 \\\n"
-"  const TYPE z0 = src0.z;                                                 \\\n"
-"  const TYPE z1 = src1.z;                                                 \\\n"
-"  const TYPE w0 = src0.w;                                                 \\\n"
-"  const TYPE w1 = src1.w;                                                 \\\n"
-"                                                                          \\\n"
-"  dst.x = (cond.x & MASK) ? x1 : x0;                                      \\\n"
-"  dst.y = (cond.y & MASK) ? y1 : y0;                                      \\\n"
-"  dst.z = (cond.z & MASK) ? z1 : z0;                                      \\\n"
-"  dst.w = (cond.w & MASK) ? w1 : w0;                                      \\\n"
-"  return dst;                                                             \\\n"
+"#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \\\n"
+"__attribute__((overloadable)) \\\n"
+"inline TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \\\n"
+"  TYPE4 dst; \\\n"
+"  const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \\\n"
+"  const TYPE x1 = src1.x; \\\n"
+"  const TYPE y0 = src0.y; \\\n"
+"  const TYPE y1 = src1.y; \\\n"
+"  const TYPE z0 = src0.z; \\\n"
+"  const TYPE z1 = src1.z; \\\n"
+"  const TYPE w0 = src0.w; \\\n"
+"  const TYPE w1 = src1.w; \\\n"
+"  dst.x = (cond.x & MASK) ? x1 : x0; \\\n"
+"  dst.y = (cond.y & MASK) ? y1 : y0; \\\n"
+"  dst.z = (cond.z & MASK) ? z1 : z0; \\\n"
+"  dst.w = (cond.w & MASK) ? w1 : w0; \\\n"
+"  return dst; \\\n"
 "}\n"
 "DECL_SELECT4(int4, int, int4, 0x80000000)\n"
 "DECL_SELECT4(float4, float, int4, 0x80000000)\n"
 "#undef DECL_SELECT4\n"
 "\n"
-"__attribute__((overloadable,always_inline)) inline float2 mad(float2 a, float2 b, float2 c) {\n"
+"__attribute__((overloadable,always_inline))\n"
+"inline float2 mad(float2 a, float2 b, float2 c) {\n"
 "  return (float2)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y));\n"
 "}\n"
-"__attribute__((overloadable,always_inline)) inline float3 mad(float3 a, float3 b, float3 c) {\n"
+"__attribute__((overloadable,always_inline))\n"
+"inline float3 mad(float3 a, float3 b, float3 c) {\n"
 "  return (float3)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), mad(a.z,b.z,c.z));\n"
 "}\n"
-"__attribute__((overloadable,always_inline)) inline float4 mad(float4 a, float4 b, float4 c) {\n"
+"__attribute__((overloadable,always_inline))\n"
+"inline float4 mad(float4 a, float4 b, float4 c) {\n"
 "  return (float4)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y),\n"
 "                  mad(a.z,b.z,c.z), mad(a.w,b.w,c.w));\n"
 "}\n"
 "\n"
-"#define __private __attribute__((address_space(0)))\n"
-"#define __global __attribute__((address_space(1)))\n"
-"#define __constant __attribute__((address_space(2)))\n"
-"//#define __local __attribute__((address_space(3)))\n"
-"#define global __global\n"
-"//#define local __local\n"
-"#define constant __constant\n"
-"#define private __private\n"
 "\n"
 "#define NULL ((void*)0)\n"
 ;
-- 
2.7.4