From 4c5a0cfb128318adfdbc6d87cf77689584ee5bfb Mon Sep 17 00:00:00 2001
From: Benjamin Segovia <benjamin.segovia@intel.com>
Date: Fri, 2 Nov 2012 17:55:20 -0700
Subject: [PATCH] Made compiler_clod pass. The image is now properly computed.
 I basically added a bunch of new intrinsics and cleaned a bit the ocl std
 library. Well, it is not going to be compliant for a while with the spec
 (mostly due to precision issues and the way denormals and nan are handled).
 But, it should do the job for now. Started to add a more complex test called
 "ribbon"

---
 backend/src/backend/gen_context.cpp        |   7 +-
 backend/src/backend/gen_encoder.cpp        |   3 +-
 backend/src/backend/gen_encoder.hpp        |   3 +-
 backend/src/backend/gen_insn_selection.cpp |  16 +-
 backend/src/backend/gen_insn_selection.hxx |   1 +
 backend/src/backend/program.cpp            |   2 +-
 backend/src/ir/instruction.cpp             |  11 +
 backend/src/ir/instruction.hpp             |  10 +
 backend/src/ir/instruction.hxx             |   5 +
 backend/src/llvm/llvm_gen_backend.cpp      |  24 +-
 backend/src/llvm/llvm_gen_ocl_function.hxx |   5 +
 backend/src/ocl_stdlib.h                   | 350 ++++++++++++++++++++---------
 backend/src/ocl_stdlib_str.cpp             | 350 ++++++++++++++++++++---------
 kernels/compiler_clod.cl                   |  35 ++-
 kernels/compiler_ribbon.cl                 |  87 +++++++
 utests/CMakeLists.txt                      |   4 +-
 utests/compiler_clod.cpp                   |  49 ++++
 utests/compiler_ribbon.cpp                 |  50 +++++
 18 files changed, 777 insertions(+), 235 deletions(-)
 create mode 100644 kernels/compiler_ribbon.cl
 create mode 100644 utests/compiler_clod.cpp
 create mode 100644 utests/compiler_ribbon.cpp

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 366ea0d..a84eec7 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -133,6 +133,10 @@ namespace gbe
     switch (insn.opcode) {
       case SEL_OP_MOV: p->MOV(dst, src); break;
       case SEL_OP_NOT: p->NOT(dst, src); break;
+      case SEL_OP_RNDD: p->RNDD(dst, src); break;
+      case SEL_OP_RNDU: p->RNDU(dst, src); break;
+      case SEL_OP_RNDE: p->RNDE(dst, src); break;
+      case SEL_OP_RNDZ: p->RNDZ(dst, src); break;
       default: NOT_IMPLEMENTED;
     }
   }
@@ -206,8 +210,7 @@ namespace gbe
       p->curr.predicate = GEN_PREDICATE_NONE;
       p->curr.execWidth = 8;
       p->curr.noMask = 1;
-      p->MOV(GenRegister::f8grf(127,0), GenRegister::f8grf(0,0));
-      p->EOT(127);
+      p->EOT(0);
     p->pop();
   }
 
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 4aa3317..633d5f8 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -597,6 +597,8 @@ namespace gbe
   ALU1(MOV)
   ALU1(RNDZ)
   ALU1(RNDE)
+  ALU1(RNDD)
+  ALU1(RNDU)
   ALU2(SEL)
   ALU1(NOT)
   ALU2(AND)
@@ -608,7 +610,6 @@ namespace gbe
   ALU2(RSL)
   ALU2(ASR)
   ALU1(FRC)
-  ALU1(RNDD)
   ALU2(MAC)
   ALU1(LZD)
   ALU2(LINE)
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 5308338..969fa41 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -98,6 +98,8 @@ namespace gbe
     ALU1(MOV)
     ALU1(RNDZ)
     ALU1(RNDE)
+    ALU1(RNDD)
+    ALU1(RNDU)
     ALU2(SEL)
     ALU1(NOT)
     ALU2(AND)
@@ -111,7 +113,6 @@ namespace gbe
     ALU2(ADD)
     ALU2(MUL)
     ALU1(FRC)
-    ALU1(RNDD)
     ALU2(MAC)
     ALU2(MACH)
     ALU1(LZD)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 0fe8e66..6a9aa53 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -318,6 +318,7 @@ namespace gbe
     ALU2(MUL)
     ALU1(FRC)
     ALU1(RNDD)
+    ALU1(RNDU)
     ALU2(MACH)
     ALU1(LZD)
     ALU3(MAD)
@@ -992,6 +993,10 @@ namespace gbe
       const GenRegister src = sel.selReg(insn.getSrc(0));
       switch (opcode) {
         case ir::OP_MOV: sel.MOV(dst, src); break;
+        case ir::OP_RNDD: sel.RNDD(dst, src); break;
+        case ir::OP_RNDE: sel.RNDE(dst, src); break;
+        case ir::OP_RNDU: sel.RNDU(dst, src); break;
+        case ir::OP_RNDZ: sel.RNDZ(dst, src); break;
         case ir::OP_COS: sel.MATH(dst, GEN_MATH_FUNCTION_COS, src); break;
         case ir::OP_SIN: sel.MATH(dst, GEN_MATH_FUNCTION_SIN, src); break;
         case ir::OP_LOG: sel.MATH(dst, GEN_MATH_FUNCTION_LOG, src); break;
@@ -1045,11 +1050,14 @@ namespace gbe
       GenRegister dst  = sel.selReg(insn.getDst(0), type);
 
       // Immediates not supported
-      if (opcode == OP_DIV) {
+      if (opcode == OP_DIV || opcode == OP_POW) {
         GBE_ASSERT(type == TYPE_FLOAT);
-        GenRegister src0 = sel.selReg(insn.getSrc(0), type);
-        GenRegister src1 = sel.selReg(insn.getSrc(1), type);
-        sel.MATH(dst, GEN_MATH_FUNCTION_FDIV, src0, src1);
+        const GenRegister src0 = sel.selReg(insn.getSrc(0), type);
+        const GenRegister src1 = sel.selReg(insn.getSrc(1), type);
+        const uint32_t mathOp = opcode == OP_DIV ?
+                                GEN_MATH_FUNCTION_FDIV :
+                                GEN_MATH_FUNCTION_POW;
+        sel.MATH(dst, mathOp, src0, src1);
         this->markAllChildren(dag);
         return true;
       }
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 418af5e..8dcf2b5 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -5,6 +5,7 @@ DECL_SELECTION_IR(LZD, UnaryInstruction)
 DECL_SELECTION_IR(RNDZ, UnaryInstruction)
 DECL_SELECTION_IR(RNDE, UnaryInstruction)
 DECL_SELECTION_IR(RNDD, UnaryInstruction)
+DECL_SELECTION_IR(RNDU, UnaryInstruction)
 DECL_SELECTION_IR(FRC, UnaryInstruction)
 DECL_SELECTION_IR(SEL, BinaryInstruction)
 DECL_SELECTION_IR(AND, BinaryInstruction)
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 42962b5..adf6314 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -106,7 +106,7 @@ namespace gbe {
 
     // Now compile the code to llvm using clang
     // XXX use popen and stuff instead of that
-    std::string compileCmd = "clang -emit-llvm -O3 -ccc-host-triple ptx32 -c ";
+    std::string compileCmd = "clang -x cl -emit-llvm -O3 -ccc-host-triple ptx32 -c ";
     compileCmd += clName;
     compileCmd += " -o ";
     compileCmd += llName;
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 1c58faf..e1c75e5 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -702,11 +702,17 @@ namespace ir {
         default:
           CHECK_TYPE(this->type, allButBool);
           break;
+        case OP_POW:
         case OP_COS:
         case OP_SIN:
         case OP_RCP:
+        case OP_ABS:
         case OP_RSQ:
         case OP_SQR:
+        case OP_RNDD:
+        case OP_RNDE:
+        case OP_RNDU:
+        case OP_RNDZ:
           const Type fp = TYPE_FLOAT;
           if (UNLIKELY(checkTypeFamily(TYPE_FLOAT, &fp, 1, whyNot)) == false)
             return false;
@@ -1425,6 +1431,10 @@ DECL_MEM_FN(VoteInstruction, VotePredicate, getVotePredicate(void), getVotePredi
   DECL_EMIT_FUNCTION(LOG)
   DECL_EMIT_FUNCTION(SQR)
   DECL_EMIT_FUNCTION(RSQ)
+  DECL_EMIT_FUNCTION(RNDD)
+  DECL_EMIT_FUNCTION(RNDE)
+  DECL_EMIT_FUNCTION(RNDU)
+  DECL_EMIT_FUNCTION(RNDZ)
 
 #undef DECL_EMIT_FUNCTION
 
@@ -1434,6 +1444,7 @@ DECL_MEM_FN(VoteInstruction, VotePredicate, getVotePredicate(void), getVotePredi
     return internal::BinaryInstruction(OP_##NAME, type, dst, src0, src1).convert(); \
   }
 
+  DECL_EMIT_FUNCTION(POW)
   DECL_EMIT_FUNCTION(MUL)
   DECL_EMIT_FUNCTION(ADD)
   DECL_EMIT_FUNCTION(SUB)
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index f17c0f0..c1b8323 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -488,12 +488,22 @@ namespace ir {
   Instruction SIN(Type type, Register dst, Register src);
   /*! tan.type dst src */
   Instruction RCP(Type type, Register dst, Register src);
+  /*! abs.type dst src */
+  Instruction ABS(Type type, Register dst, Register src);
   /*! log.type dst src */
   Instruction LOG(Type type, Register dst, Register src);
   /*! sqr.type dst src */
   Instruction SQR(Type type, Register dst, Register src);
   /*! rsq.type dst src */
   Instruction RSQ(Type type, Register dst, Register src);
+  /*! rndd.type dst src */
+  Instruction RNDD(Type type, Register dst, Register src);
+  /*! rnde.type dst src */
+  Instruction RNDE(Type type, Register dst, Register src);
+  /*! rndu.type dst src */
+  Instruction RNDU(Type type, Register dst, Register src);
+  /*! rndz.type dst src */
+  Instruction RNDZ(Type type, Register dst, Register src);
   /*! pow.type dst src0 src1 */
   Instruction POW(Type type, Register dst, Register src0, Register src1);
   /*! mul.type dst src0 src1 */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index 6704acc..4f154b0 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -32,6 +32,11 @@ DECL_INSN(LOG, UnaryInstruction)
 DECL_INSN(SQR, UnaryInstruction)
 DECL_INSN(RSQ, UnaryInstruction)
 DECL_INSN(RCP, UnaryInstruction)
+DECL_INSN(ABS, UnaryInstruction)
+DECL_INSN(RNDD, UnaryInstruction)
+DECL_INSN(RNDE, UnaryInstruction)
+DECL_INSN(RNDU, UnaryInstruction)
+DECL_INSN(RNDZ, UnaryInstruction)
 DECL_INSN(POW, BinaryInstruction)
 DECL_INSN(MUL, BinaryInstruction)
 DECL_INSN(ADD, BinaryInstruction)
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index ae2400b..c3c3ff4 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -666,7 +666,8 @@ namespace gbe
       uint32_t argID = 1; // Start at one actually
       for (; I != E; ++I, ++argID) {
         Type *type = I->getType();
-        GBE_ASSERT(isScalarType(type) == true);
+        GBE_ASSERTM(isScalarType(type) == true,
+                    "vector type in the function argument is not supported yet");
         const ir::Register reg = regTranslator.newScalar(I);
         if (type->isPointerTy() == false)
           ctx.input(ir::FunctionArgument::VALUE, reg, getTypeByteSize(unit, type));
@@ -1431,6 +1432,11 @@ namespace gbe
       case GEN_OCL_LOG:
       case GEN_OCL_POW:
       case GEN_OCL_RCP:
+      case GEN_OCL_ABS:
+      case GEN_OCL_RNDZ:
+      case GEN_OCL_RNDE:
+      case GEN_OCL_RNDU:
+      case GEN_OCL_RNDD:
         // No structure can be returned
         GBE_ASSERT(I.hasStructRetAttr() == false);
         this->newRegister(&I);
@@ -1613,16 +1619,28 @@ namespace gbe
             ctx.MAD(ir::TYPE_FLOAT, dst, src0, src1, src2);
             break;
           }
+          case GEN_OCL_POW:
+          {
+            const ir::Register src0 = this->getRegister(*AI); ++AI;
+            const ir::Register src1 = this->getRegister(*AI);
+            const ir::Register dst = this->getRegister(&I);
+            ctx.POW(ir::TYPE_FLOAT, dst, src0, src1);
+            break;
+          }
           case GEN_OCL_COS: this->emitUnaryCallInst(I,CS,ir::OP_COS); break;
           case GEN_OCL_SIN: this->emitUnaryCallInst(I,CS,ir::OP_SIN); break;
           case GEN_OCL_LOG: this->emitUnaryCallInst(I,CS,ir::OP_LOG); break;
           case GEN_OCL_SQR: this->emitUnaryCallInst(I,CS,ir::OP_SQR); break;
           case GEN_OCL_RSQ: this->emitUnaryCallInst(I,CS,ir::OP_RSQ); break;
           case GEN_OCL_RCP: this->emitUnaryCallInst(I,CS,ir::OP_RCP); break;
+          case GEN_OCL_ABS: this->emitUnaryCallInst(I,CS,ir::OP_ABS); break;
+          case GEN_OCL_RNDZ: this->emitUnaryCallInst(I,CS,ir::OP_RNDZ); break;
+          case GEN_OCL_RNDE: this->emitUnaryCallInst(I,CS,ir::OP_RNDE); break;
+          case GEN_OCL_RNDU: this->emitUnaryCallInst(I,CS,ir::OP_RNDU); break;
+          case GEN_OCL_RNDD: this->emitUnaryCallInst(I,CS,ir::OP_RNDD); break;
           case GEN_OCL_FORCE_SIMD8: ctx.setSimdWidth(8); break;
           case GEN_OCL_FORCE_SIMD16: ctx.setSimdWidth(16); break;
-          default:
-            break;
+          default: break;
         }
       }
     }
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index f51bb63..3228d57 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -39,6 +39,7 @@ DECL_LLVM_GEN_FUNCTION(RGATHER7, _Z17__gen_ocl_rgathertiiiiiii)
 DECL_LLVM_GEN_FUNCTION(RGATHER8, _Z17__gen_ocl_rgathertiiiiiiii)
 
 // Math function
+DECL_LLVM_GEN_FUNCTION(ABS, __gen_ocl_fabs)
 DECL_LLVM_GEN_FUNCTION(COS, __gen_ocl_cos)
 DECL_LLVM_GEN_FUNCTION(SIN, __gen_ocl_sin)
 DECL_LLVM_GEN_FUNCTION(SQR, __gen_ocl_sqrt)
@@ -46,6 +47,10 @@ DECL_LLVM_GEN_FUNCTION(RSQ, __gen_ocl_rsqrt)
 DECL_LLVM_GEN_FUNCTION(LOG, __gen_ocl_log)
 DECL_LLVM_GEN_FUNCTION(POW, __gen_ocl_pow)
 DECL_LLVM_GEN_FUNCTION(RCP, __gen_ocl_rcp)
+DECL_LLVM_GEN_FUNCTION(RNDZ, __gen_ocl_rndz)
+DECL_LLVM_GEN_FUNCTION(RNDE, __gen_ocl_rnde)
+DECL_LLVM_GEN_FUNCTION(RNDU, __gen_ocl_rndu)
+DECL_LLVM_GEN_FUNCTION(RNDD, __gen_ocl_rndd)
 
 // Uniform conditions
 DECL_LLVM_GEN_FUNCTION(ALL, _Z13__gen_ocl_allt)
diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
index 9b3b454..eda7247 100644
--- a/backend/src/ocl_stdlib.h
+++ b/backend/src/ocl_stdlib.h
@@ -20,8 +20,10 @@ uint* Copyright Â© 2012 Intel Corporation
 #ifndef __GEN_OCL_STDLIB_H__
 #define __GEN_OCL_STDLIB_H__
 
-#define INLINE_OVERLOADABLE __attribute__((overloadable,always_inline)) inline
+#define INLINE __attribute__((always_inline)) inline
 #define OVERLOADABLE __attribute__((overloadable))
+#define PURE __attribute__((pure))
+#define CONST __attribute__((const))
 
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL basic types
@@ -62,12 +64,15 @@ typedef bool bool16 __attribute__((ext_vector_type(16)));
 #define private __private
 
 /////////////////////////////////////////////////////////////////////////////
-// Work groups and work items functions
+// Work Items functions (see 6.11.1 of OCL 1.1 spec)
 /////////////////////////////////////////////////////////////////////////////
+// TODO get_global_offset
+// TODO get_work_dim
+
 #define DECL_INTERNAL_WORK_ITEM_FN(NAME) \
-__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##0(void); \
-__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##1(void); \
-__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##2(void);
+PURE CONST unsigned int __gen_ocl_##NAME##0(void); \
+PURE CONST unsigned int __gen_ocl_##NAME##1(void); \
+PURE CONST unsigned int __gen_ocl_##NAME##2(void);
 DECL_INTERNAL_WORK_ITEM_FN(get_group_id)
 DECL_INTERNAL_WORK_ITEM_FN(get_local_id)
 DECL_INTERNAL_WORK_ITEM_FN(get_local_size)
@@ -89,66 +94,64 @@ DECL_PUBLIC_WORK_ITEM_FN(get_global_size)
 DECL_PUBLIC_WORK_ITEM_FN(get_num_groups)
 #undef DECL_PUBLIC_WORK_ITEM_FN
 
-/////////////////////////////////////////////////////////////////////////////
-// Vector loads and stores
-/////////////////////////////////////////////////////////////////////////////
-
-// These loads and stores will use untyped reads and writes, so we can just
-// cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue.
-// Well we do not care, we do not activate TBAA in the compiler
-#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \
-__attribute__((always_inline, overloadable)) \
-inline TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
-  return *(SPACE TYPE##DIM *) (p + DIM * offset); \
-} \
-__attribute__((always_inline, overloadable)) \
-inline void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \
-  *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \
+INLINE uint get_global_id(uint dim) {
+  return get_local_id(dim) + get_local_size(dim) * get_group_id(dim);
 }
 
-#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
-  DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \
-  DECL_UNTYPED_RW_SPACE_N(TYPE, 3, SPACE) \
-  DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \
-  DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \
-  DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)
-
-#define DECL_UNTYPED_RW_ALL(TYPE) \
-  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \
-  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \
-  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __constant) \
-  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)
-
-DECL_UNTYPED_RW_ALL(float)
-DECL_UNTYPED_RW_ALL(uint)
-DECL_UNTYPED_RW_ALL(int)
-
-#undef DECL_UNTYPED_RW_ALL
-#undef DECL_UNTYPED_RW_ALL_SPACE
-#undef DECL_UNTYPED_RW_SPACE_N
-
 /////////////////////////////////////////////////////////////////////////////
-// Arithmetic functions
+// Math Functions (see 6.11.2 of OCL 1.1 spec)
 /////////////////////////////////////////////////////////////////////////////
-__attribute__((always_inline))
-inline uint get_global_id(uint dim) {
-  return get_local_id(dim) + get_local_size(dim) * get_group_id(dim);
+PURE CONST float __gen_ocl_fabs(float x);
+PURE CONST float __gen_ocl_sin(float x);
+PURE CONST float __gen_ocl_cos(float x);
+PURE CONST float __gen_ocl_sqrt(float x);
+PURE CONST float __gen_ocl_rsqrt(float x);
+PURE CONST float __gen_ocl_log(float x);
+PURE CONST float __gen_ocl_pow(float x, float y);
+PURE CONST float __gen_ocl_rcp(float x);
+PURE CONST float __gen_ocl_rndz(float x);
+PURE CONST float __gen_ocl_rnde(float x);
+PURE CONST float __gen_ocl_rndu(float x);
+PURE CONST float __gen_ocl_rndd(float x);
+INLINE OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
+INLINE OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
+INLINE OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
+INLINE OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
+INLINE OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
+INLINE OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
+INLINE OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
+INLINE OVERLOADABLE float native_tan(float x) {
+  return native_sin(x) / native_cos(x);
 }
 
-__attribute__ ((pure, const, overloadable)) float mad(float a, float b, float c);
-__attribute__((overloadable, always_inline))
-inline uint select(uint src0, uint src1, uint cond) {
+// TODO make them actually compliant precision-wise
+#define cos native_cos   // XXX work-around ptx profile: cos already defined
+#define sin native_sin   // XXX work-around ptr profile: sin already defined
+#define sqrt native_sqrt // XXX work-around ptr profile: sin already defined
+
+INLINE OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
+INLINE OVERLOADABLE float fabs(float x) { return __gen_ocl_fabs(x); }
+INLINE OVERLOADABLE float trunc(float x) { return __gen_ocl_rndz(x); }
+INLINE OVERLOADABLE float round(float x) { return __gen_ocl_rnde(x); }
+INLINE OVERLOADABLE float floor(float x) { return __gen_ocl_rndd(x); }
+INLINE OVERLOADABLE float ceil(float x)  { return __gen_ocl_rndu(x); }
+INLINE OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
+INLINE OVERLOADABLE float fmod(float x, float y) { return x-y*trunc(x/y); }
+
+// Hack pow is already a builtin
+#define pow powr
+
+PURE CONST OVERLOADABLE float mad(float a, float b, float c);
+OVERLOADABLE INLINE uint select(uint src0, uint src1, uint cond) {
   return cond ? src1 : src0;
 }
-__attribute__((overloadable, always_inline))
-inline int select(int src0, int src1, int cond) {
+OVERLOADABLE INLINE int select(int src0, int src1, int cond) {
   return cond ? src1 : src0;
 }
 
 // This will be optimized out by LLVM and will output LLVM select instructions
 #define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \
-__attribute__((overloadable)) \
-inline TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \
+OVERLOADABLE INLINE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \
   TYPE4 dst; \
   const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \
   const TYPE x1 = src1.x; \
@@ -168,22 +171,27 @@ DECL_SELECT4(int4, int, int4, 0x80000000)
 DECL_SELECT4(float4, float, int4, 0x80000000)
 #undef DECL_SELECT4
 
-INLINE_OVERLOADABLE float2 mad(float2 a, float2 b, float2 c) {
+#if 0
+INLINE OVERLOADABLE float2 mad(float2 a, float2 b, float2 c) {
   return (float2)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y));
 }
-INLINE_OVERLOADABLE float3 mad(float3 a, float3 b, float3 c) {
+INLINE OVERLOADABLE float3 mad(float3 a, float3 b, float3 c) {
   return (float3)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), mad(a.z,b.z,c.z));
 }
-INLINE_OVERLOADABLE float4 mad(float4 a, float4 b, float4 c) {
+INLINE OVERLOADABLE float4 mad(float4 a, float4 b, float4 c) {
   return (float4)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y),
                   mad(a.z,b.z,c.z), mad(a.w,b.w,c.w));
 }
+#endif
 
+/////////////////////////////////////////////////////////////////////////////
+// Common Functions (see 6.11.4 of OCL 1.1 spec)
+/////////////////////////////////////////////////////////////////////////////
 #define DECL_MIN_MAX(TYPE) \
-INLINE_OVERLOADABLE TYPE max(TYPE a, TYPE b) { \
+INLINE OVERLOADABLE TYPE max(TYPE a, TYPE b) { \
   return a > b ? a : b; \
 } \
-INLINE_OVERLOADABLE TYPE min(TYPE a, TYPE b) { \
+INLINE OVERLOADABLE TYPE min(TYPE a, TYPE b) { \
   return a < b ? a : b; \
 }
 DECL_MIN_MAX(float)
@@ -195,48 +203,133 @@ DECL_MIN_MAX(unsigned short)
 DECL_MIN_MAX(unsigned char)
 #undef DECL_MIN_MAX
 
+INLINE OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}
+
 /////////////////////////////////////////////////////////////////////////////
-// Math intrinsic functions
+// Geometric functions (see 6.11.5 of OCL 1.1 spec)
 /////////////////////////////////////////////////////////////////////////////
-__attribute__((pure,const)) float __gen_ocl_sin(float x);
-__attribute__((pure,const)) float __gen_ocl_cos(float x);
-__attribute__((pure,const)) float __gen_ocl_sqrt(float x);
-__attribute__((pure,const)) float __gen_ocl_rsqrt(float x);
-__attribute__((pure,const)) float __gen_ocl_log(float x);
-__attribute__((pure,const)) float __gen_ocl_pow(float x, float y);
-__attribute__((pure,const)) float __gen_ocl_rcp(float x);
-
-INLINE_OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
-INLINE_OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
-INLINE_OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
-INLINE_OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
-INLINE_OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
-INLINE_OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
-INLINE_OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
-INLINE_OVERLOADABLE float native_tan(float x) {
-  return native_sin(x) / native_cos(x);
+INLINE OVERLOADABLE float dot(float2 p0, float2 p1) {
+  return mad(p0.x,p1.x,p0.y*p1.y);
+}
+INLINE OVERLOADABLE float dot(float3 p0, float3 p1) {
+  return mad(p0.x,p1.x,mad(p0.z,p1.z,p0.y*p1.y));
+}
+INLINE OVERLOADABLE float dot(float4 p0, float4 p1) {
+  return mad(p0.x,p1.x,mad(p0.w,p1.w,mad(p0.z,p1.z,p0.y*p1.y)));
 }
 
+INLINE OVERLOADABLE float dot(float8 p0, float8 p1) {
+  return mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,
+         mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))));
+}
+INLINE OVERLOADABLE float dot(float16 p0, float16 p1) {
+  return mad(p0.sc,p1.sc,mad(p0.sd,p1.sd,mad(p0.se,p1.se,mad(p0.sf,p1.sf,
+         mad(p0.s8,p1.s8,mad(p0.s9,p1.s9,mad(p0.sa,p1.sa,mad(p0.sb,p1.sb,
+         mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,
+         mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))))))))))));
+}
+
+INLINE OVERLOADABLE float length(float x) { return __gen_ocl_fabs(x); }
+INLINE OVERLOADABLE float length(float2 x) { return sqrt(dot(x,x)); }
+INLINE OVERLOADABLE float length(float3 x) { return sqrt(dot(x,x)); }
+INLINE OVERLOADABLE float length(float4 x) { return sqrt(dot(x,x)); }
+INLINE OVERLOADABLE float length(float8 x) { return sqrt(dot(x,x)); }
+INLINE OVERLOADABLE float length(float16 x) { return sqrt(dot(x,x)); }
+INLINE OVERLOADABLE float distance(float x, float y) { return length(x-y); }
+INLINE OVERLOADABLE float distance(float2 x, float2 y) { return length(x-y); }
+INLINE OVERLOADABLE float distance(float3 x, float3 y) { return length(x-y); }
+INLINE OVERLOADABLE float distance(float4 x, float4 y) { return length(x-y); }
+INLINE OVERLOADABLE float distance(float8 x, float8 y) { return length(x-y); }
+INLINE OVERLOADABLE float distance(float16 x, float16 y) { return length(x-y); }
+INLINE OVERLOADABLE float normalize(float x) { return 1.f; }
+INLINE OVERLOADABLE float2 normalize(float2 x) { return x * rsqrt(dot(x, x)); }
+INLINE OVERLOADABLE float3 normalize(float3 x) { return x * rsqrt(dot(x, x)); }
+INLINE OVERLOADABLE float4 normalize(float4 x) { return x * rsqrt(dot(x, x)); }
+INLINE OVERLOADABLE float8 normalize(float8 x) { return x * rsqrt(dot(x, x)); }
+INLINE OVERLOADABLE float16 normalize(float16 x) { return x * rsqrt(dot(x, x)); }
+
+INLINE OVERLOADABLE float fast_length(float x) { return __gen_ocl_fabs(x); }
+INLINE OVERLOADABLE float fast_length(float2 x) { return sqrt(dot(x,x)); }
+INLINE OVERLOADABLE float fast_length(float3 x) { return sqrt(dot(x,x)); }
+INLINE OVERLOADABLE float fast_length(float4 x) { return sqrt(dot(x,x)); }
+INLINE OVERLOADABLE float fast_length(float8 x) { return sqrt(dot(x,x)); }
+INLINE OVERLOADABLE float fast_length(float16 x) { return sqrt(dot(x,x)); }
+INLINE OVERLOADABLE float fast_distance(float x, float y) { return length(x-y); }
+INLINE OVERLOADABLE float fast_distance(float2 x, float2 y) { return length(x-y); }
+INLINE OVERLOADABLE float fast_distance(float3 x, float3 y) { return length(x-y); }
+INLINE OVERLOADABLE float fast_distance(float4 x, float4 y) { return length(x-y); }
+INLINE OVERLOADABLE float fast_distance(float8 x, float8 y) { return length(x-y); }
+INLINE OVERLOADABLE float fast_distance(float16 x, float16 y) { return length(x-y); }
+INLINE OVERLOADABLE float fast_normalize(float x) { return 1.f; }
+INLINE OVERLOADABLE float2 fast_normalize(float2 x) { return x * rsqrt(dot(x, x)); }
+INLINE OVERLOADABLE float3 fast_normalize(float3 x) { return x * rsqrt(dot(x, x)); }
+INLINE OVERLOADABLE float4 fast_normalize(float4 x) { return x * rsqrt(dot(x, x)); }
+INLINE OVERLOADABLE float8 fast_normalize(float8 x) { return x * rsqrt(dot(x, x)); }
+INLINE OVERLOADABLE float16 fast_normalize(float16 x) { return x * rsqrt(dot(x, x)); }
+
+INLINE OVERLOADABLE float3 cross(float3 v0, float3 v1) {
+   return v0.yzx*v1.zxy-v0.zxy*v1.yzx;
+}
+INLINE OVERLOADABLE float4 cross(float4 v0, float4 v1) {
+   return (float4)(v0.yzx*v1.zxy-v0.zxy*v1.yzx, 0.f);
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Vector loads and stores
+/////////////////////////////////////////////////////////////////////////////
+
+// These loads and stores will use untyped reads and writes, so we can just
+// cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue.
+// Well we do not care, we do not activate TBAA in the compiler
+#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \
+INLINE OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
+  return *(SPACE TYPE##DIM *) (p + DIM * offset); \
+} \
+INLINE OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \
+  *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \
+}
+
+#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 3, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)
+
+#define DECL_UNTYPED_RW_ALL(TYPE) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __constant) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)
+
+DECL_UNTYPED_RW_ALL(float)
+DECL_UNTYPED_RW_ALL(uint)
+DECL_UNTYPED_RW_ALL(int)
+
+#undef DECL_UNTYPED_RW_ALL
+#undef DECL_UNTYPED_RW_ALL_SPACE
+#undef DECL_UNTYPED_RW_SPACE_N
+
 /////////////////////////////////////////////////////////////////////////////
 // Declare functions for vector types which are derived from scalar ones
 /////////////////////////////////////////////////////////////////////////////
 #define DECL_VECTOR_1OP(NAME, TYPE) \
-  INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v) { \
+  INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v) { \
     return (TYPE##2)(NAME(v.x), NAME(v.y)); \
   }\
-  INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v) { \
+  INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v) { \
     return (TYPE##3)(NAME(v.x), NAME(v.y), NAME(v.z)); \
   }\
-  INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v) { \
+  INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v) { \
     return (TYPE##4)(NAME(v.x), NAME(v.y), NAME(v.z), NAME(v.w)); \
   }\
-  INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v) { \
+  INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v) { \
     TYPE##8 dst;\
     dst.s0123 = NAME(v.s0123);\
     dst.s4567 = NAME(v.s4567);\
     return dst;\
   }\
-  INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v) { \
+  INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v) { \
     TYPE##16 dst;\
     dst.s01234567 = NAME(v.s01234567);\
     dst.s89abcdef = NAME(v.s89abcdef);\
@@ -249,31 +342,73 @@ DECL_VECTOR_1OP(native_sqrt, float);
 DECL_VECTOR_1OP(native_rsqrt, float);
 DECL_VECTOR_1OP(native_log2, float);
 DECL_VECTOR_1OP(native_recip, float);
+DECL_VECTOR_1OP(fabs, float);
+DECL_VECTOR_1OP(trunc, float);
+DECL_VECTOR_1OP(round, float);
+DECL_VECTOR_1OP(floor, float);
+DECL_VECTOR_1OP(ceil, float);
 #undef DECL_VECTOR_1OP
 
-/////////////////////////////////////////////////////////////////////////////
-// Geometric functions
-/////////////////////////////////////////////////////////////////////////////
-INLINE_OVERLOADABLE float dot(float2 p0, float2 p1) {
-  return mad(p0.x,p1.x,p0.y*p1.y);
-}
-INLINE_OVERLOADABLE float dot(float3 p0, float3 p1) {
-  return mad(p0.x,p1.x,mad(p0.z,p1.z,p0.y*p1.y));
-}
-INLINE_OVERLOADABLE float dot(float4 p0, float4 p1) {
-  return mad(p0.x,p1.x,mad(p0.w,p1.w,mad(p0.z,p1.z,p0.y*p1.y)));
-}
+#define DECL_VECTOR_2OP(NAME, TYPE) \
+  INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1) { \
+    return (TYPE##2)(NAME(v0.x, v1.x), NAME(v1.y, v1.y)); \
+  }\
+  INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1) { \
+    return (TYPE##3)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z)); \
+  }\
+  INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1) { \
+    return (TYPE##4)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z), NAME(v0.w, v1.w)); \
+  }\
+  INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1) { \
+    TYPE##8 dst;\
+    dst.s0123 = NAME(v0.s0123, v1.s0123);\
+    dst.s4567 = NAME(v0.s4567, v1.s4567);\
+    return dst;\
+  }\
+  INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1) { \
+    TYPE##16 dst;\
+    dst.s01234567 = NAME(v0.s01234567, v1.s01234567);\
+    dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef);\
+    return dst;\
+  }
+DECL_VECTOR_2OP(min, float);
+DECL_VECTOR_2OP(max, float);
+DECL_VECTOR_2OP(fmod, float);
+DECL_VECTOR_2OP(powr, float);
+#undef DECL_VECTOR_2OP
+
+#define DECL_VECTOR_3OP(NAME, TYPE) \
+  INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1, TYPE##2 v2) { \
+    return (TYPE##2)(NAME(v0.x, v1.x, v2.x), NAME(v1.y, v1.y, v2.y)); \
+  }\
+  INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1, TYPE##3 v2) { \
+    return (TYPE##3)(NAME(v0.x, v1.x, v2.x), NAME(v0.y, v1.y, v2.y), NAME(v0.z, v1.z, v2.z)); \
+  }\
+  INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1, TYPE##4 v2) { \
+    return (TYPE##4)(NAME(v0.x, v1.x, v2.x), NAME(v0.y, v1.y, v2.y), NAME(v0.z, v1.z, v2.z), NAME(v0.w, v1.w, v2.w)); \
+  }\
+  INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1, TYPE##8 v2) { \
+    TYPE##8 dst;\
+    dst.s0123 = NAME(v0.s0123, v1.s0123, v2.s0123);\
+    dst.s4567 = NAME(v0.s4567, v1.s4567, v2.s4567);\
+    return dst;\
+  }\
+  INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1, TYPE##16 v2) { \
+    TYPE##16 dst;\
+    dst.s01234567 = NAME(v0.s01234567, v1.s01234567, v2.s01234567);\
+    dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef, v2.s89abcdef);\
+    return dst;\
+  }
+DECL_VECTOR_3OP(mad, float);
+DECL_VECTOR_3OP(mix, float);
+#undef DECL_VECTOR_3OP
 
-INLINE_OVERLOADABLE float dot(float8 p0, float8 p1) {
-  return mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,
-         mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))));
-}
-INLINE_OVERLOADABLE float dot(float16 p0, float16 p1) {
-  return mad(p0.sc,p1.sc,mad(p0.sd,p1.sd,mad(p0.se,p1.se,mad(p0.sf,p1.sf,
-         mad(p0.s8,p1.s8,mad(p0.s9,p1.s9,mad(p0.sa,p1.sa,mad(p0.sb,p1.sb,
-         mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,
-         mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))))))))))));
-}
+// mix requires more variants
+INLINE OVERLOADABLE float2 mix(float2 x, float2 y, float a) { return mix(x,y,(float2)(a));}
+INLINE OVERLOADABLE float3 mix(float3 x, float3 y, float a) { return mix(x,y,(float3)(a));}
+INLINE OVERLOADABLE float4 mix(float4 x, float4 y, float a) { return mix(x,y,(float4)(a));}
+INLINE OVERLOADABLE float8 mix(float8 x, float8 y, float a) { return mix(x,y,(float8)(a));}
+INLINE OVERLOADABLE float16 mix(float16 x, float16 y, float a) { return mix(x,y,(float16)(a));}
 
 /////////////////////////////////////////////////////////////////////////////
 // Extensions to manipulate the register file
@@ -324,12 +459,10 @@ int __gen_ocl_force_simd8(void);
 int __gen_ocl_force_simd16(void);
 
 #define DECL_VOTE(TYPE) \
-__attribute__((overloadable,always_inline)) \
-TYPE __gen_ocl_any(TYPE cond) { \
+INLINE OVERLOADABLE TYPE __gen_ocl_any(TYPE cond) { \
   return (TYPE) __gen_ocl_any((unsigned short) cond); \
 } \
-__attribute__((overloadable,always_inline)) \
-TYPE __gen_ocl_all(TYPE cond) { \
+INLINE OVERLOADABLE TYPE __gen_ocl_all(TYPE cond) { \
   return (TYPE) __gen_ocl_all((unsigned short) cond); \
 }
 DECL_VOTE(unsigned int)
@@ -341,6 +474,9 @@ DECL_VOTE(bool)
 #undef DECL_VOTE
 
 #define NULL ((void*)0)
-#undef INLINE_OVERLOADABLE
+#undef PURE
+#undef CONST
+#undef OVERLOADABLE
+#undef INLINE
 #endif /* __GEN_OCL_STDLIB_H__ */
 
diff --git a/backend/src/ocl_stdlib_str.cpp b/backend/src/ocl_stdlib_str.cpp
index bd4ec87..b47e7ef 100644
--- a/backend/src/ocl_stdlib_str.cpp
+++ b/backend/src/ocl_stdlib_str.cpp
@@ -23,8 +23,10 @@ std::string ocl_stdlib_str =
 "#ifndef __GEN_OCL_STDLIB_H__\n"
 "#define __GEN_OCL_STDLIB_H__\n"
 "\n"
-"#define INLINE_OVERLOADABLE __attribute__((overloadable,always_inline)) inline\n"
+"#define INLINE __attribute__((always_inline)) inline\n"
 "#define OVERLOADABLE __attribute__((overloadable))\n"
+"#define PURE __attribute__((pure))\n"
+"#define CONST __attribute__((const))\n"
 "\n"
 "/////////////////////////////////////////////////////////////////////////////\n"
 "// OpenCL basic types\n"
@@ -65,12 +67,15 @@ std::string ocl_stdlib_str =
 "#define private __private\n"
 "\n"
 "/////////////////////////////////////////////////////////////////////////////\n"
-"// Work groups and work items functions\n"
+"// Work Items functions (see 6.11.1 of OCL 1.1 spec)\n"
 "/////////////////////////////////////////////////////////////////////////////\n"
+"// TODO get_global_offset\n"
+"// TODO get_work_dim\n"
+"\n"
 "#define DECL_INTERNAL_WORK_ITEM_FN(NAME) \\\n"
-"__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##0(void); \\\n"
-"__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##1(void); \\\n"
-"__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##2(void);\n"
+"PURE CONST unsigned int __gen_ocl_##NAME##0(void); \\\n"
+"PURE CONST unsigned int __gen_ocl_##NAME##1(void); \\\n"
+"PURE CONST unsigned int __gen_ocl_##NAME##2(void);\n"
 "DECL_INTERNAL_WORK_ITEM_FN(get_group_id)\n"
 "DECL_INTERNAL_WORK_ITEM_FN(get_local_id)\n"
 "DECL_INTERNAL_WORK_ITEM_FN(get_local_size)\n"
@@ -92,66 +97,64 @@ std::string ocl_stdlib_str =
 "DECL_PUBLIC_WORK_ITEM_FN(get_num_groups)\n"
 "#undef DECL_PUBLIC_WORK_ITEM_FN\n"
 "\n"
-"/////////////////////////////////////////////////////////////////////////////\n"
-"// Vector loads and stores\n"
-"/////////////////////////////////////////////////////////////////////////////\n"
-"\n"
-"// These loads and stores will use untyped reads and writes, so we can just\n"
-"// cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue.\n"
-"// Well we do not care, we do not activate TBAA in the compiler\n"
-"#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \\\n"
-"__attribute__((always_inline, overloadable)) \\\n"
-"inline TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \\\n"
-"  return *(SPACE TYPE##DIM *) (p + DIM * offset); \\\n"
-"} \\\n"
-"__attribute__((always_inline, overloadable)) \\\n"
-"inline void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \\\n"
-"  *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \\\n"
+"INLINE uint get_global_id(uint dim) {\n"
+"  return get_local_id(dim) + get_local_size(dim) * get_group_id(dim);\n"
 "}\n"
 "\n"
-"#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \\\n"
-"  DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \\\n"
-"  DECL_UNTYPED_RW_SPACE_N(TYPE, 3, SPACE) \\\n"
-"  DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \\\n"
-"  DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \\\n"
-"  DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)\n"
-"\n"
-"#define DECL_UNTYPED_RW_ALL(TYPE) \\\n"
-"  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \\\n"
-"  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \\\n"
-"  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __constant) \\\n"
-"  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)\n"
-"\n"
-"DECL_UNTYPED_RW_ALL(float)\n"
-"DECL_UNTYPED_RW_ALL(uint)\n"
-"DECL_UNTYPED_RW_ALL(int)\n"
-"\n"
-"#undef DECL_UNTYPED_RW_ALL\n"
-"#undef DECL_UNTYPED_RW_ALL_SPACE\n"
-"#undef DECL_UNTYPED_RW_SPACE_N\n"
-"\n"
 "/////////////////////////////////////////////////////////////////////////////\n"
-"// Arithmetic functions\n"
+"// Math Functions (see 6.11.2 of OCL 1.1 spec)\n"
 "/////////////////////////////////////////////////////////////////////////////\n"
-"__attribute__((always_inline))\n"
-"inline uint get_global_id(uint dim) {\n"
-"  return get_local_id(dim) + get_local_size(dim) * get_group_id(dim);\n"
+"PURE CONST float __gen_ocl_fabs(float x);\n"
+"PURE CONST float __gen_ocl_sin(float x);\n"
+"PURE CONST float __gen_ocl_cos(float x);\n"
+"PURE CONST float __gen_ocl_sqrt(float x);\n"
+"PURE CONST float __gen_ocl_rsqrt(float x);\n"
+"PURE CONST float __gen_ocl_log(float x);\n"
+"PURE CONST float __gen_ocl_pow(float x, float y);\n"
+"PURE CONST float __gen_ocl_rcp(float x);\n"
+"PURE CONST float __gen_ocl_rndz(float x);\n"
+"PURE CONST float __gen_ocl_rnde(float x);\n"
+"PURE CONST float __gen_ocl_rndu(float x);\n"
+"PURE CONST float __gen_ocl_rndd(float x);\n"
+"INLINE OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }\n"
+"INLINE OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }\n"
+"INLINE OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }\n"
+"INLINE OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }\n"
+"INLINE OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }\n"
+"INLINE OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }\n"
+"INLINE OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }\n"
+"INLINE OVERLOADABLE float native_tan(float x) {\n"
+"  return native_sin(x) / native_cos(x);\n"
 "}\n"
 "\n"
-"__attribute__ ((pure, const, overloadable)) float mad(float a, float b, float c);\n"
-"__attribute__((overloadable, always_inline))\n"
-"inline uint select(uint src0, uint src1, uint cond) {\n"
+"// TODO make them actually compliant precision-wise\n"
+"#define cos native_cos   // XXX work-around ptx profile: cos already defined\n"
+"#define sin native_sin   // XXX work-around ptr profile: sin already defined\n"
+"#define sqrt native_sqrt // XXX work-around ptr profile: sin already defined\n"
+"\n"
+"INLINE OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }\n"
+"INLINE OVERLOADABLE float fabs(float x) { return __gen_ocl_fabs(x); }\n"
+"INLINE OVERLOADABLE float trunc(float x) { return __gen_ocl_rndz(x); }\n"
+"INLINE OVERLOADABLE float round(float x) { return __gen_ocl_rnde(x); }\n"
+"INLINE OVERLOADABLE float floor(float x) { return __gen_ocl_rndd(x); }\n"
+"INLINE OVERLOADABLE float ceil(float x)  { return __gen_ocl_rndu(x); }\n"
+"INLINE OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }\n"
+"INLINE OVERLOADABLE float fmod(float x, float y) { return x-y*trunc(x/y); }\n"
+"\n"
+"// Hack pow is already a builtin\n"
+"#define pow powr\n"
+"\n"
+"PURE CONST OVERLOADABLE float mad(float a, float b, float c);\n"
+"OVERLOADABLE INLINE uint select(uint src0, uint src1, uint cond) {\n"
 "  return cond ? src1 : src0;\n"
 "}\n"
-"__attribute__((overloadable, always_inline))\n"
-"inline int select(int src0, int src1, int cond) {\n"
+"OVERLOADABLE INLINE int select(int src0, int src1, int cond) {\n"
 "  return cond ? src1 : src0;\n"
 "}\n"
 "\n"
 "// This will be optimized out by LLVM and will output LLVM select instructions\n"
 "#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \\\n"
-"__attribute__((overloadable)) \\\n"
-"inline TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \\\n"
+"OVERLOADABLE INLINE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \\\n"
 "  TYPE4 dst; \\\n"
 "  const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \\\n"
 "  const TYPE x1 = src1.x; \\\n"
@@ -171,22 +174,27 @@ std::string ocl_stdlib_str =
 "DECL_SELECT4(float4, float, int4, 0x80000000)\n"
 "#undef DECL_SELECT4\n"
 "\n"
-"INLINE_OVERLOADABLE float2 mad(float2 a, float2 b, float2 c) {\n"
+"#if 0\n"
+"INLINE OVERLOADABLE float2 mad(float2 a, float2 b, float2 c) {\n"
 "  return (float2)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y));\n"
 "}\n"
-"INLINE_OVERLOADABLE float3 mad(float3 a, float3 b, float3 c) {\n"
+"INLINE OVERLOADABLE float3 mad(float3 a, float3 b, float3 c) {\n"
 "  return (float3)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), mad(a.z,b.z,c.z));\n"
 "}\n"
-"INLINE_OVERLOADABLE float4 mad(float4 a, float4 b, float4 c) {\n"
+"INLINE OVERLOADABLE float4 mad(float4 a, float4 b, float4 c) {\n"
 "  return (float4)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y),\n"
 "                  mad(a.z,b.z,c.z), mad(a.w,b.w,c.w));\n"
 "}\n"
+"#endif\n"
 "\n"
+"/////////////////////////////////////////////////////////////////////////////\n"
+"// Common Functions (see 6.11.4 of OCL 1.1 spec)\n"
+"/////////////////////////////////////////////////////////////////////////////\n"
 "#define DECL_MIN_MAX(TYPE) \\\n"
-"INLINE_OVERLOADABLE TYPE max(TYPE a, TYPE b) { \\\n"
+"INLINE OVERLOADABLE TYPE max(TYPE a, TYPE b) { \\\n"
 "  return a > b ? a : b; \\\n"
 "} \\\n"
-"INLINE_OVERLOADABLE TYPE min(TYPE a, TYPE b) { \\\n"
+"INLINE OVERLOADABLE TYPE min(TYPE a, TYPE b) { \\\n"
 "  return a < b ? a : b; \\\n"
 "}\n"
 "DECL_MIN_MAX(float)\n"
@@ -198,48 +206,133 @@ std::string ocl_stdlib_str =
 "DECL_MIN_MAX(unsigned char)\n"
 "#undef DECL_MIN_MAX\n"
 "\n"
+"INLINE OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}\n"
+"\n"
 "/////////////////////////////////////////////////////////////////////////////\n"
-"// Math intrinsic functions\n"
+"// Geometric functions (see 6.11.5 of OCL 1.1 spec)\n"
 "/////////////////////////////////////////////////////////////////////////////\n"
-"__attribute__((pure,const)) float __gen_ocl_sin(float x);\n"
-"__attribute__((pure,const)) float __gen_ocl_cos(float x);\n"
-"__attribute__((pure,const)) float __gen_ocl_sqrt(float x);\n"
-"__attribute__((pure,const)) float __gen_ocl_rsqrt(float x);\n"
-"__attribute__((pure,const)) float __gen_ocl_log(float x);\n"
-"__attribute__((pure,const)) float __gen_ocl_pow(float x, float y);\n"
-"__attribute__((pure,const)) float __gen_ocl_rcp(float x);\n"
-"\n"
-"INLINE_OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }\n"
-"INLINE_OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }\n"
-"INLINE_OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }\n"
-"INLINE_OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }\n"
-"INLINE_OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }\n"
-"INLINE_OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }\n"
-"INLINE_OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }\n"
-"INLINE_OVERLOADABLE float native_tan(float x) {\n"
-"  return native_sin(x) / native_cos(x);\n"
+"INLINE OVERLOADABLE float dot(float2 p0, float2 p1) {\n"
+"  return mad(p0.x,p1.x,p0.y*p1.y);\n"
+"}\n"
+"INLINE OVERLOADABLE float dot(float3 p0, float3 p1) {\n"
+"  return mad(p0.x,p1.x,mad(p0.z,p1.z,p0.y*p1.y));\n"
+"}\n"
+"INLINE OVERLOADABLE float dot(float4 p0, float4 p1) {\n"
+"  return mad(p0.x,p1.x,mad(p0.w,p1.w,mad(p0.z,p1.z,p0.y*p1.y)));\n"
 "}\n"
 "\n"
+"INLINE OVERLOADABLE float dot(float8 p0, float8 p1) {\n"
+"  return mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,\n"
+"         mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))));\n"
+"}\n"
+"INLINE OVERLOADABLE float dot(float16 p0, float16 p1) {\n"
+"  return mad(p0.sc,p1.sc,mad(p0.sd,p1.sd,mad(p0.se,p1.se,mad(p0.sf,p1.sf,\n"
+"         mad(p0.s8,p1.s8,mad(p0.s9,p1.s9,mad(p0.sa,p1.sa,mad(p0.sb,p1.sb,\n"
+"         mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,\n"
+"         mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))))))))))));\n"
+"}\n"
+"\n"
+"INLINE OVERLOADABLE float length(float x) { return __gen_ocl_fabs(x); }\n"
+"INLINE OVERLOADABLE float length(float2 x) { return sqrt(dot(x,x)); }\n"
+"INLINE OVERLOADABLE float length(float3 x) { return sqrt(dot(x,x)); }\n"
+"INLINE OVERLOADABLE float length(float4 x) { return sqrt(dot(x,x)); }\n"
+"INLINE OVERLOADABLE float length(float8 x) { return sqrt(dot(x,x)); }\n"
+"INLINE OVERLOADABLE float length(float16 x) { return sqrt(dot(x,x)); }\n"
+"INLINE OVERLOADABLE float distance(float x, float y) { return length(x-y); }\n"
+"INLINE OVERLOADABLE float distance(float2 x, float2 y) { return length(x-y); }\n"
+"INLINE OVERLOADABLE float distance(float3 x, float3 y) { return length(x-y); }\n"
+"INLINE OVERLOADABLE float distance(float4 x, float4 y) { return length(x-y); }\n"
+"INLINE OVERLOADABLE float distance(float8 x, float8 y) { return length(x-y); }\n"
+"INLINE OVERLOADABLE float distance(float16 x, float16 y) { return length(x-y); }\n"
+"INLINE OVERLOADABLE float normalize(float x) { return 1.f; }\n"
+"INLINE OVERLOADABLE float2 normalize(float2 x) { return x * rsqrt(dot(x, x)); }\n"
+"INLINE OVERLOADABLE float3 normalize(float3 x) { return x * rsqrt(dot(x, x)); }\n"
+"INLINE OVERLOADABLE float4 normalize(float4 x) { return x * rsqrt(dot(x, x)); }\n"
+"INLINE OVERLOADABLE float8 normalize(float8 x) { return x * rsqrt(dot(x, x)); }\n"
+"INLINE OVERLOADABLE float16 normalize(float16 x) { return x * rsqrt(dot(x, x)); }\n"
+"\n"
+"INLINE OVERLOADABLE float fast_length(float x) { return __gen_ocl_fabs(x); }\n"
+"INLINE OVERLOADABLE float fast_length(float2 x) { return sqrt(dot(x,x)); }\n"
+"INLINE OVERLOADABLE float fast_length(float3 x) { return sqrt(dot(x,x)); }\n"
+"INLINE OVERLOADABLE float fast_length(float4 x) { return sqrt(dot(x,x)); }\n"
+"INLINE OVERLOADABLE float fast_length(float8 x) { return sqrt(dot(x,x)); }\n"
+"INLINE OVERLOADABLE float fast_length(float16 x) { return sqrt(dot(x,x)); }\n"
+"INLINE OVERLOADABLE float fast_distance(float x, float y) { return length(x-y); }\n"
+"INLINE OVERLOADABLE float fast_distance(float2 x, float2 y) { return length(x-y); }\n"
+"INLINE OVERLOADABLE float fast_distance(float3 x, float3 y) { return length(x-y); }\n"
+"INLINE OVERLOADABLE float fast_distance(float4 x, float4 y) { return length(x-y); }\n"
+"INLINE OVERLOADABLE float fast_distance(float8 x, float8 y) { return length(x-y); }\n"
+"INLINE OVERLOADABLE float fast_distance(float16 x, float16 y) { return length(x-y); }\n"
+"INLINE OVERLOADABLE float fast_normalize(float x) { return 1.f; }\n"
+"INLINE OVERLOADABLE float2 fast_normalize(float2 x) { return x * rsqrt(dot(x, x)); }\n"
+"INLINE OVERLOADABLE float3 fast_normalize(float3 x) { return x * rsqrt(dot(x, x)); }\n"
+"INLINE OVERLOADABLE float4 fast_normalize(float4 x) { return x * rsqrt(dot(x, x)); }\n"
+"INLINE OVERLOADABLE float8 fast_normalize(float8 x) { return x * rsqrt(dot(x, x)); }\n"
+"INLINE OVERLOADABLE float16 fast_normalize(float16 x) { return x * rsqrt(dot(x, x)); }\n"
+"\n"
+"INLINE OVERLOADABLE float3 cross(float3 v0, float3 v1) {\n"
+"   return v0.yzx*v1.zxy-v0.zxy*v1.yzx;\n"
+"}\n"
+"INLINE OVERLOADABLE float4 cross(float4 v0, float4 v1) {\n"
+"   return (float4)(v0.yzx*v1.zxy-v0.zxy*v1.yzx, 0.f);\n"
+"}\n"
+"\n"
+"/////////////////////////////////////////////////////////////////////////////\n"
+"// Vector loads and stores\n"
+"/////////////////////////////////////////////////////////////////////////////\n"
+"\n"
+"// These loads and stores will use untyped reads and writes, so we can just\n"
+"// cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue.\n"
+"// Well we do not care, we do not activate TBAA in the compiler\n"
+"#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \\\n"
+"INLINE OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \\\n"
+"  return *(SPACE TYPE##DIM *) (p + DIM * offset); \\\n"
+"} \\\n"
+"INLINE OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \\\n"
+"  *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \\\n"
+"}\n"
+"\n"
+"#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \\\n"
+"  DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \\\n"
+"  DECL_UNTYPED_RW_SPACE_N(TYPE, 3, SPACE) \\\n"
+"  DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \\\n"
+"  DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \\\n"
+"  DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)\n"
+"\n"
+"#define DECL_UNTYPED_RW_ALL(TYPE) \\\n"
+"  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \\\n"
+"  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \\\n"
+"  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __constant) \\\n"
+"  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)\n"
+"\n"
+"DECL_UNTYPED_RW_ALL(float)\n"
+"DECL_UNTYPED_RW_ALL(uint)\n"
+"DECL_UNTYPED_RW_ALL(int)\n"
+"\n"
+"#undef DECL_UNTYPED_RW_ALL\n"
+"#undef DECL_UNTYPED_RW_ALL_SPACE\n"
+"#undef DECL_UNTYPED_RW_SPACE_N\n"
+"\n"
 "/////////////////////////////////////////////////////////////////////////////\n"
 "// Declare functions for vector types which are derived from scalar ones\n"
 "/////////////////////////////////////////////////////////////////////////////\n"
 "#define DECL_VECTOR_1OP(NAME, TYPE) \\\n"
-"  INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v) { \\\n"
+"  INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v) { \\\n"
 "    return (TYPE##2)(NAME(v.x), NAME(v.y)); \\\n"
 "  }\\\n"
-"  INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v) { \\\n"
+"  INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v) { \\\n"
 "    return (TYPE##3)(NAME(v.x), NAME(v.y), NAME(v.z)); \\\n"
 "  }\\\n"
-"  INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v) { \\\n"
+"  INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v) { \\\n"
 "    return (TYPE##4)(NAME(v.x), NAME(v.y), NAME(v.z), NAME(v.w)); \\\n"
 "  }\\\n"
-"  INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v) { \\\n"
+"  INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v) { \\\n"
 "    TYPE##8 dst;\\\n"
 "    dst.s0123 = NAME(v.s0123);\\\n"
 "    dst.s4567 = NAME(v.s4567);\\\n"
 "    return dst;\\\n"
 "  }\\\n"
-"  INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v) { \\\n"
+"  INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v) { \\\n"
 "    TYPE##16 dst;\\\n"
 "    dst.s01234567 = NAME(v.s01234567);\\\n"
 "    dst.s89abcdef = NAME(v.s89abcdef);\\\n"
@@ -252,31 +345,73 @@ std::string ocl_stdlib_str =
 "DECL_VECTOR_1OP(native_rsqrt, float);\n"
 "DECL_VECTOR_1OP(native_log2, float);\n"
 "DECL_VECTOR_1OP(native_recip, float);\n"
+"DECL_VECTOR_1OP(fabs, float);\n"
+"DECL_VECTOR_1OP(trunc, float);\n"
+"DECL_VECTOR_1OP(round, float);\n"
+"DECL_VECTOR_1OP(floor, float);\n"
+"DECL_VECTOR_1OP(ceil, float);\n"
 "#undef DECL_VECTOR_1OP\n"
 "\n"
-"/////////////////////////////////////////////////////////////////////////////\n"
-"// Geometric functions\n"
-"/////////////////////////////////////////////////////////////////////////////\n"
-"INLINE_OVERLOADABLE float dot(float2 p0, float2 p1) {\n"
-"  return mad(p0.x,p1.x,p0.y*p1.y);\n"
-"}\n"
-"INLINE_OVERLOADABLE float dot(float3 p0, float3 p1) {\n"
-"  return mad(p0.x,p1.x,mad(p0.z,p1.z,p0.y*p1.y));\n"
-"}\n"
-"INLINE_OVERLOADABLE float dot(float4 p0, float4 p1) {\n"
-"  return mad(p0.x,p1.x,mad(p0.w,p1.w,mad(p0.z,p1.z,p0.y*p1.y)));\n"
-"}\n"
+"#define DECL_VECTOR_2OP(NAME, TYPE) \\\n"
+"  INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1) { \\\n"
+"    return (TYPE##2)(NAME(v0.x, v1.x), NAME(v1.y, v1.y)); \\\n"
+"  }\\\n"
+"  INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1) { \\\n"
+"    return (TYPE##3)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z)); \\\n"
+"  }\\\n"
+"  INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1) { \\\n"
+"    return (TYPE##4)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z), NAME(v0.w, v1.w)); \\\n"
+"  }\\\n"
+"  INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1) { \\\n"
+"    TYPE##8 dst;\\\n"
+"    dst.s0123 = NAME(v0.s0123, v1.s0123);\\\n"
+"    dst.s4567 = NAME(v0.s4567, v1.s4567);\\\n"
+"    return dst;\\\n"
+"  }\\\n"
+"  INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1) { \\\n"
+"    TYPE##16 dst;\\\n"
+"    dst.s01234567 = NAME(v0.s01234567, v1.s01234567);\\\n"
+"    dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef);\\\n"
+"    return dst;\\\n"
+"  }\n"
+"DECL_VECTOR_2OP(min, float);\n"
+"DECL_VECTOR_2OP(max, float);\n"
+"DECL_VECTOR_2OP(fmod, float);\n"
+"DECL_VECTOR_2OP(powr, float);\n"
+"#undef DECL_VECTOR_2OP\n"
+"\n"
+"#define DECL_VECTOR_3OP(NAME, TYPE) \\\n"
+"  INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1, TYPE##2 v2) { \\\n"
+"    return (TYPE##2)(NAME(v0.x, v1.x, v2.x), NAME(v1.y, v1.y, v2.y)); \\\n"
+"  }\\\n"
+"  INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1, TYPE##3 v2) { \\\n"
+"    return (TYPE##3)(NAME(v0.x, v1.x, v2.x), NAME(v0.y, v1.y, v2.y), NAME(v0.z, v1.z, v2.z)); \\\n"
+"  }\\\n"
+"  INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1, TYPE##4 v2) { \\\n"
+"    return (TYPE##4)(NAME(v0.x, v1.x, v2.x), NAME(v0.y, v1.y, v2.y), NAME(v0.z, v1.z, v2.z), NAME(v0.w, v1.w, v2.w)); \\\n"
+"  }\\\n"
+"  INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1, TYPE##8 v2) { \\\n"
+"    TYPE##8 dst;\\\n"
+"    dst.s0123 = NAME(v0.s0123, v1.s0123, v2.s0123);\\\n"
+"    dst.s4567 = NAME(v0.s4567, v1.s4567, v2.s4567);\\\n"
+"    return dst;\\\n"
+"  }\\\n"
+"  INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1, TYPE##16 v2) { \\\n"
+"    TYPE##16 dst;\\\n"
+"    dst.s01234567 = NAME(v0.s01234567, v1.s01234567, v2.s01234567);\\\n"
+"    dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef, v2.s89abcdef);\\\n"
+"    return dst;\\\n"
+"  }\n"
+"DECL_VECTOR_3OP(mad, float);\n"
+"DECL_VECTOR_3OP(mix, float);\n"
+"#undef DECL_VECTOR_3OP\n"
 "\n"
-"INLINE_OVERLOADABLE float dot(float8 p0, float8 p1) {\n"
-"  return mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,\n"
-"         mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))));\n"
-"}\n"
-"INLINE_OVERLOADABLE float dot(float16 p0, float16 p1) {\n"
-"  return mad(p0.sc,p1.sc,mad(p0.sd,p1.sd,mad(p0.se,p1.se,mad(p0.sf,p1.sf,\n"
-"         mad(p0.s8,p1.s8,mad(p0.s9,p1.s9,mad(p0.sa,p1.sa,mad(p0.sb,p1.sb,\n"
-"         mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,\n"
-"         mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))))))))))));\n"
-"}\n"
+"// mix requires more variants\n"
+"INLINE OVERLOADABLE float2 mix(float2 x, float2 y, float a) { return mix(x,y,(float2)(a));}\n"
+"INLINE OVERLOADABLE float3 mix(float3 x, float3 y, float a) { return mix(x,y,(float3)(a));}\n"
+"INLINE OVERLOADABLE float4 mix(float4 x, float4 y, float a) { return mix(x,y,(float4)(a));}\n"
+"INLINE OVERLOADABLE float8 mix(float8 x, float8 y, float a) { return mix(x,y,(float8)(a));}\n"
+"INLINE OVERLOADABLE float16 mix(float16 x, float16 y, float a) { return mix(x,y,(float16)(a));}\n"
 "\n"
 "/////////////////////////////////////////////////////////////////////////////\n"
 "// Extensions to manipulate the register file\n"
@@ -327,12 +462,10 @@ std::string ocl_stdlib_str =
 "int __gen_ocl_force_simd16(void);\n"
 "\n"
 "#define DECL_VOTE(TYPE) \\\n"
-"__attribute__((overloadable,always_inline)) \\\n"
-"TYPE __gen_ocl_any(TYPE cond) { \\\n"
+"INLINE OVERLOADABLE TYPE __gen_ocl_any(TYPE cond) { \\\n"
 "  return (TYPE) __gen_ocl_any((unsigned short) cond); \\\n"
 "} \\\n"
-"__attribute__((overloadable,always_inline)) \\\n"
-"TYPE __gen_ocl_all(TYPE cond) { \\\n"
+"INLINE OVERLOADABLE TYPE __gen_ocl_all(TYPE cond) { \\\n"
 "  return (TYPE) __gen_ocl_all((unsigned short) cond); \\\n"
 "}\n"
 "DECL_VOTE(unsigned int)\n"
@@ -344,7 +477,10 @@ std::string ocl_stdlib_str =
 "#undef DECL_VOTE\n"
 "\n"
 "#define NULL ((void*)0)\n"
-"#undef INLINE_OVERLOADABLE\n"
+"#undef PURE\n"
+"#undef CONST\n"
+"#undef OVERLOADABLE\n"
+"#undef INLINE\n"
 "#endif /* __GEN_OCL_STDLIB_H__ */\n"
 "\n"
 ;
diff --git a/kernels/compiler_clod.cl b/kernels/compiler_clod.cl
index d7c945a..ec9d33e 100644
--- a/kernels/compiler_clod.cl
+++ b/kernels/compiler_clod.cl
@@ -31,12 +31,13 @@ inline uint pack_fp4(float4 u4) {
 
 float f(vec3 o)
 {
-    float a=(sin(o.x)+o.y*.25)*.35f;
+    float a=(sin(o.x)+o.y*.25f)*.35f;
     o=(vec3)(cos(a)*o.x-sin(a)*o.y,sin(a)*o.x+cos(a)*o.y,o.z);
     return dot(cos(o)*cos(o),(vec3)(1.f))-1.2f;
 }
 
-vec3 s(vec3 o,vec3 d)
+// XXX front end does not inline this function
+__attribute((always_inline)) vec3 s(vec3 o,vec3 d)
 {
     float t=0.0f;
     float dt = 0.2f;
@@ -45,27 +46,45 @@ vec3 s(vec3 o,vec3 d)
     for(int i=0;i<50;i++)
     {
         nh = f(o+d*t);
-        if(nh>0.0) { lh=nh; t+=dt; }
+        if(nh>0.0f) { lh=nh; t+=dt; }
     }
 
-    if( nh>0.0 ) return (vec3)(.93f,.94f,.85f);
+    if( nh>0.0f ) return (vec3)(.93f,.94f,.85f);
 
     t = t - dt*nh/(nh-lh);
 
     vec3 e=(vec3)(.1f,0.0f,0.0f);
     vec3 p=o+d*t;
-    vec3 n=-normalize((vec3)(f(p+e),f(p+e.yxy),f(p+e.yyx))+(vec3)((sin(p*75.)))*.01f);
+    vec3 n=-normalize((vec3)(f(p+e),f(p+e.yxy),f(p+e.yyx))+(vec3)((sin(p*75.f)))*.01f);
 
     return (vec3)(mix( ((max(-dot(n,(vec3)(.577f)),0.f) + 0.125f*max(-dot(n,(vec3)(-.707f,-.707f,0.f)),0.f)))*(mod
-    (length(p.xy)*20.f,2.f)<1.0?(vec3)(.71f,.85f,.25f):(vec3)(.79f,.93f,.4f))
+    (length(p.xy)*20.f,2.f)<1.0f?(vec3)(.71f,.85f,.25f):(vec3)(.79f,.93f,.4f))
                            ,(vec3)(.93f,.94f,.85f), (vec3)(pow(t/9.f,5.f)) ) );
 }
 
+#if 0
+// XXX vector type in the function arguments not supported yet
 __kernel void compiler_clod(__global uint *dst, vec2 resolution, int w)
 {
     vec2 gl_FragCoord = (vec2)(get_global_id(0), get_global_id(1));
-    vec2 p = -1.0f + 2.0f * gl_FragCoord.xy / resolution.xy;
-    vec4 gl_FragColor=(vec4)(s((vec3)(sin(time*1.5)*.5f,cos(time)*.5f,time), normalize((vec3)(p.xy,1.0f))),1.0f);
+    //vec2 p = -1.0f + 2.0f * gl_FragCoord.xy / resolution.xy;
+    vec2 p;
+    p.x = -1.0f + 2.0f * gl_FragCoord.x / resolution.x;
+    p.y = -1.0f + 2.0f * gl_FragCoord.y / resolution.y;
+    vec4 gl_FragColor=(vec4)(s((vec3)(sin(time*1.5f)*.5f,cos(time)*.5f,time), normalize((vec3)(p.xy,1.0f))),1.0f);
     OUTPUT;
 }
+#else
+__kernel void compiler_clod(__global uint *dst, float resx, float resy, int w)
+{
+    vec2 gl_FragCoord = (vec2)(get_global_id(0), get_global_id(1));
+    //vec2 p = -1.0f + 2.0f * gl_FragCoord.xy / resolution.xy;
+    vec2 p;
+    p.x = -1.0f + 2.0f * gl_FragCoord.x / resx;
+    p.y = -1.0f + 2.0f * gl_FragCoord.y / resy;
+    vec4 gl_FragColor=(vec4)(s((vec3)(sin(time*1.5f)*.5f,cos(time)*.5f,time), normalize((vec3)(p.xy,1.0f))),1.0f);
+    OUTPUT;
+}
+
+#endif
 
diff --git a/kernels/compiler_ribbon.cl b/kernels/compiler_ribbon.cl
new file mode 100644
index 0000000..f5729e5
--- /dev/null
+++ b/kernels/compiler_ribbon.cl
@@ -0,0 +1,87 @@
+typedef float2 vec2;
+typedef float3 vec3;
+typedef float4 vec4;
+#define sin native_sin
+#define cos native_cos
+#define tan native_tan
+#define normalize fast_normalize
+#define length fast_length
+
+inline vec3 reflect(vec3 I, vec3 N) {
+  return I - 2.0f * dot(N, I) * N;
+}
+
+#define time 1.f
+
+// Object A (tunnel)
+inline float oa(vec3 q) {
+ return cos(q.x)+cos(q.y*1.5f)+cos(q.z)+cos(q.y*20.f)*.05f;
+}
+
+// Object B (ribbon)
+inline float ob(vec3 q) {
+  return length(max(fabs(q-(vec3)(cos(q.z*1.5f)*.3f,-.5f+cos(q.z)*.2f,.0f))-(vec3)(.125f,.02f,time+3.f),(vec3)(.0f)));
+}
+
+// Scene
+inline float o(vec3 q) { return min(oa(q),ob(q)); }
+
+// Get Normal XXX Not inline by LLVM
+__attribute__((always_inline)) vec3 gn(vec3 q) {
+ const vec3 f = (vec3)(.01f, 0.f, 0.f);
+ return normalize((vec3)(o(q+f.xyy),
+                         o(q+f.yxy),
+                         o(q+f.yyx)));
+}
+
+inline uint pack_fp4(float4 u4) {
+  uint u;
+  u = (((uint) u4.x)) |
+      (((uint) u4.y) << 8) |
+      (((uint) u4.z) << 16);
+  return u;
+}
+
+// XXX vector not supported in function argument yet
+__kernel void compiler_ribbon(__global uint *dst, float resx, float resy, int w)
+{
+  vec2 gl_FragCoord = (vec2)(get_global_id(0), get_global_id(1));
+  vec2 p = -1.0f + 2.0f * gl_FragCoord.xy / (vec2)(resx, resy);
+  p.x *= resx/resy;
+
+  vec4 c = (vec4)(1.0f);
+  const vec3 org = (vec3)(sin(time)*.5f,
+                          cos(time*.5f)*.25f+.25f,
+                          time);
+  vec3 dir=normalize((vec3)(p.x*1.6f,p.y,1.0f));
+  vec3 q = org, pp;
+  float d=.0f;
+
+  // First raymarching
+  for(int i=0;i<64;i++) {
+    d=o(q);
+    q+=d*dir;
+  }
+  pp=q;
+  const float f = length(q-org)*0.02f;
+
+  // Second raymarching (reflection)
+  dir=reflect(dir,gn(q));
+  q+=dir;
+  for(int i=0;i<64;i++) {
+    d=o(q);
+    q+=d*dir;
+  }
+  c = max(dot(gn(q), (vec3)(0.1f,0.1f,0.0f)), 0.0f)
+    + (vec4)(0.3f, cos(time*.5f)*.5f+.5f, sin(time*.5f)*.5f+.5f, 1.f) * min(length(q-org)*.04f,1.f);
+
+  // Ribbon Color
+  if(oa(pp)>ob(pp))
+    c = mix(c, (vec4)(cos(time*.3f)*0.5f + 0.5f,cos(time*.2f)*.5f+.5f,sin(time*.3f)*.5f+.5f,1.f),.3f);
+
+  // Final Color
+  const vec4 color = ((c+(vec4)(f))+(1.f-min(pp.y+1.9f,1.f))*(vec4)(1.f,.8f,.7f,1.f))*min(time*.5f,1.f);
+  const vec4 final = 255.f * max(min(color, (vec4)(1.f)), (vec4)(0.f));
+  dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final);
+}
+
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 0624786..e3daf84 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -6,6 +6,8 @@ ADD_LIBRARY(utests SHARED
   utest_error.c
   compiler_mandelbrot.cpp
   compiler_mandelbrot_alternate.cpp
+  compiler_clod.cpp
+# compiler_ribbon.cpp
   compiler_box_blur_float.cpp
   compiler_box_blur.cpp
   compiler_argument_structure.cpp
@@ -57,7 +59,7 @@ ADD_LIBRARY(utests SHARED
 TARGET_LINK_LIBRARIES(utests cl m)
 
 ADD_EXECUTABLE(utest_run utest_run.cpp)
-TARGET_LINK_LIBRARIES(run utests)
+TARGET_LINK_LIBRARIES(utest_run utests)
 
 ADD_EXECUTABLE(flat_address_space runtime_flat_address_space.cpp)
 TARGET_LINK_LIBRARIES(flat_address_space utests)
diff --git a/utests/compiler_clod.cpp b/utests/compiler_clod.cpp
new file mode 100644
index 0000000..360d197
--- /dev/null
+++ b/utests/compiler_clod.cpp
@@ -0,0 +1,49 @@
+/* 
+ * Copyright Â© 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "utest_helper.hpp"
+
+static int *dst = NULL;
+static const int w = 1024;
+static const int h = 1024;
+
+static void compiler_clod(void)
+{
+  const size_t global[2] = {size_t(w), size_t(h)};
+  const size_t local[2] = {16, 1};
+  const size_t sz = w * h * sizeof(char[4]);
+  const float fx = float(w);
+  const float fy = float(h);
+  OCL_CREATE_KERNEL("compiler_clod");
+
+  cl_mem cl_dst = clCreateBuffer(ctx, 0, sz, NULL, NULL);
+  OCL_CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &cl_dst);
+  OCL_CALL (clSetKernelArg, kernel, 1, sizeof(float), &fx);
+  OCL_CALL (clSetKernelArg, kernel, 2, sizeof(float), &fy);
+  OCL_CALL (clSetKernelArg, kernel, 3, sizeof(int), &w);
+  OCL_CALL (clEnqueueNDRangeKernel, queue, kernel, 2, NULL, global, local, 0, NULL, NULL);
+  dst = (int *) clIntelMapBuffer(cl_dst, NULL);
+
+  cl_write_bmp(dst, w, h, "clod.bmp");
+  OCL_CALL (clIntelUnmapBuffer, cl_dst);
+  OCL_CALL (clReleaseMemObject, cl_dst);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_clod);
+
diff --git a/utests/compiler_ribbon.cpp b/utests/compiler_ribbon.cpp
new file mode 100644
index 0000000..733b8ac
--- /dev/null
+++ b/utests/compiler_ribbon.cpp
@@ -0,0 +1,50 @@
+/* 
+ * Copyright Â© 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "utest_helper.hpp"
+
+static int *dst = NULL;
+static const int w = 1024;
+static const int h = 1024;
+
+static void compiler_ribbon(void)
+{
+  const size_t global[2] = {size_t(w), size_t(h)};
+  const size_t local[2] = {16, 1};
+  const size_t sz = w * h * sizeof(char[4]);
+  const float fx = float(w);
+  const float fy = float(h);
+  OCL_CREATE_KERNEL("compiler_ribbon");
+
+  cl_mem cl_dst = clCreateBuffer(ctx, 0, sz, NULL, NULL);
+  OCL_CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &cl_dst);
+  OCL_CALL (clSetKernelArg, kernel, 1, sizeof(float), &fx);
+  OCL_CALL (clSetKernelArg, kernel, 2, sizeof(float), &fy);
+  OCL_CALL (clSetKernelArg, kernel, 3, sizeof(int), &w);
+  OCL_CALL (clEnqueueNDRangeKernel, queue, kernel, 2, NULL, global, local, 0, NULL, NULL);
+  dst = (int *) clIntelMapBuffer(cl_dst, NULL);
+
+  cl_write_bmp(dst, w, h, "ribbon.bmp");
+  OCL_CALL (clIntelUnmapBuffer, cl_dst);
+  OCL_CALL (clReleaseMemObject, cl_dst);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_ribbon);
+
+
-- 
2.7.4