From 4c5a0cfb128318adfdbc6d87cf77689584ee5bfb Mon Sep 17 00:00:00 2001 From: Benjamin Segovia Date: Fri, 2 Nov 2012 17:55:20 -0700 Subject: [PATCH] Made compiler_clod pass. The image is now properly computed. I basically added a bunch of new intrinsics and cleaned a bit the ocl std library. Well, it is not going to be compliant for a while with the spec (mostly due to precision issues and the way denormals and nan are handled). But, it should do the job for now. Started to add a more complex test called "ribbon" --- backend/src/backend/gen_context.cpp | 7 +- backend/src/backend/gen_encoder.cpp | 3 +- backend/src/backend/gen_encoder.hpp | 3 +- backend/src/backend/gen_insn_selection.cpp | 16 +- backend/src/backend/gen_insn_selection.hxx | 1 + backend/src/backend/program.cpp | 2 +- backend/src/ir/instruction.cpp | 11 + backend/src/ir/instruction.hpp | 10 + backend/src/ir/instruction.hxx | 5 + backend/src/llvm/llvm_gen_backend.cpp | 24 +- backend/src/llvm/llvm_gen_ocl_function.hxx | 5 + backend/src/ocl_stdlib.h | 350 ++++++++++++++++++++--------- backend/src/ocl_stdlib_str.cpp | 350 ++++++++++++++++++++--------- kernels/compiler_clod.cl | 35 ++- kernels/compiler_ribbon.cl | 87 +++++++ utests/CMakeLists.txt | 4 +- utests/compiler_clod.cpp | 49 ++++ utests/compiler_ribbon.cpp | 50 +++++ 18 files changed, 777 insertions(+), 235 deletions(-) create mode 100644 kernels/compiler_ribbon.cl create mode 100644 utests/compiler_clod.cpp create mode 100644 utests/compiler_ribbon.cpp diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 366ea0d..a84eec7 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -133,6 +133,10 @@ namespace gbe switch (insn.opcode) { case SEL_OP_MOV: p->MOV(dst, src); break; case SEL_OP_NOT: p->NOT(dst, src); break; + case SEL_OP_RNDD: p->RNDD(dst, src); break; + case SEL_OP_RNDU: p->RNDU(dst, src); break; + case SEL_OP_RNDE: p->RNDE(dst, src); break; + case SEL_OP_RNDZ: p->RNDZ(dst, src); break; default: NOT_IMPLEMENTED; } } @@ -206,8 +210,7 @@ namespace gbe p->curr.predicate = GEN_PREDICATE_NONE; p->curr.execWidth = 8; p->curr.noMask = 1; - p->MOV(GenRegister::f8grf(127,0), GenRegister::f8grf(0,0)); - p->EOT(127); + p->EOT(0); p->pop(); } diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp index 4aa3317..633d5f8 100644 --- a/backend/src/backend/gen_encoder.cpp +++ b/backend/src/backend/gen_encoder.cpp @@ -597,6 +597,8 @@ namespace gbe ALU1(MOV) ALU1(RNDZ) ALU1(RNDE) + ALU1(RNDD) + ALU1(RNDU) ALU2(SEL) ALU1(NOT) ALU2(AND) @@ -608,7 +610,6 @@ namespace gbe ALU2(RSL) ALU2(ASR) ALU1(FRC) - ALU1(RNDD) ALU2(MAC) ALU1(LZD) ALU2(LINE) diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp index 5308338..969fa41 100644 --- a/backend/src/backend/gen_encoder.hpp +++ b/backend/src/backend/gen_encoder.hpp @@ -98,6 +98,8 @@ namespace gbe ALU1(MOV) ALU1(RNDZ) ALU1(RNDE) + ALU1(RNDD) + ALU1(RNDU) ALU2(SEL) ALU1(NOT) ALU2(AND) @@ -111,7 +113,6 @@ namespace gbe ALU2(ADD) ALU2(MUL) ALU1(FRC) - ALU1(RNDD) ALU2(MAC) ALU2(MACH) ALU1(LZD) diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 0fe8e66..6a9aa53 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -318,6 +318,7 @@ namespace gbe ALU2(MUL) ALU1(FRC) ALU1(RNDD) + ALU1(RNDU) ALU2(MACH) ALU1(LZD) ALU3(MAD) @@ -992,6 +993,10 @@ namespace gbe const GenRegister src = sel.selReg(insn.getSrc(0)); switch (opcode) { case ir::OP_MOV: sel.MOV(dst, src); break; + case ir::OP_RNDD: sel.RNDD(dst, src); break; + case ir::OP_RNDE: sel.RNDE(dst, src); break; + case ir::OP_RNDU: sel.RNDU(dst, src); break; + case ir::OP_RNDZ: sel.RNDZ(dst, src); break; case ir::OP_COS: sel.MATH(dst, GEN_MATH_FUNCTION_COS, src); break; case ir::OP_SIN: sel.MATH(dst, GEN_MATH_FUNCTION_SIN, src); break; case ir::OP_LOG: sel.MATH(dst, GEN_MATH_FUNCTION_LOG, src); break; @@ -1045,11 +1050,14 @@ namespace gbe GenRegister dst = sel.selReg(insn.getDst(0), type); // Immediates not supported - if (opcode == OP_DIV) { + if (opcode == OP_DIV || opcode == OP_POW) { GBE_ASSERT(type == TYPE_FLOAT); - GenRegister src0 = sel.selReg(insn.getSrc(0), type); - GenRegister src1 = sel.selReg(insn.getSrc(1), type); - sel.MATH(dst, GEN_MATH_FUNCTION_FDIV, src0, src1); + const GenRegister src0 = sel.selReg(insn.getSrc(0), type); + const GenRegister src1 = sel.selReg(insn.getSrc(1), type); + const uint32_t mathOp = opcode == OP_DIV ? + GEN_MATH_FUNCTION_FDIV : + GEN_MATH_FUNCTION_POW; + sel.MATH(dst, mathOp, src0, src1); this->markAllChildren(dag); return true; } diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx index 418af5e..8dcf2b5 100644 --- a/backend/src/backend/gen_insn_selection.hxx +++ b/backend/src/backend/gen_insn_selection.hxx @@ -5,6 +5,7 @@ DECL_SELECTION_IR(LZD, UnaryInstruction) DECL_SELECTION_IR(RNDZ, UnaryInstruction) DECL_SELECTION_IR(RNDE, UnaryInstruction) DECL_SELECTION_IR(RNDD, UnaryInstruction) +DECL_SELECTION_IR(RNDU, UnaryInstruction) DECL_SELECTION_IR(FRC, UnaryInstruction) DECL_SELECTION_IR(SEL, BinaryInstruction) DECL_SELECTION_IR(AND, BinaryInstruction) diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp index 42962b5..adf6314 100644 --- a/backend/src/backend/program.cpp +++ b/backend/src/backend/program.cpp @@ -106,7 +106,7 @@ namespace gbe { // Now compile the code to llvm using clang // XXX use popen and stuff instead of that - std::string compileCmd = "clang -emit-llvm -O3 -ccc-host-triple ptx32 -c "; + std::string compileCmd = "clang -x cl -emit-llvm -O3 -ccc-host-triple ptx32 -c "; compileCmd += clName; compileCmd += " -o "; compileCmd += llName; diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp index 1c58faf..e1c75e5 100644 --- a/backend/src/ir/instruction.cpp +++ b/backend/src/ir/instruction.cpp @@ -702,11 +702,17 @@ namespace ir { default: CHECK_TYPE(this->type, allButBool); break; + case OP_POW: case OP_COS: case OP_SIN: case OP_RCP: + case OP_ABS: case OP_RSQ: case OP_SQR: + case OP_RNDD: + case OP_RNDE: + case OP_RNDU: + case OP_RNDZ: const Type fp = TYPE_FLOAT; if (UNLIKELY(checkTypeFamily(TYPE_FLOAT, &fp, 1, whyNot)) == false) return false; @@ -1425,6 +1431,10 @@ DECL_MEM_FN(VoteInstruction, VotePredicate, getVotePredicate(void), getVotePredi DECL_EMIT_FUNCTION(LOG) DECL_EMIT_FUNCTION(SQR) DECL_EMIT_FUNCTION(RSQ) + DECL_EMIT_FUNCTION(RNDD) + DECL_EMIT_FUNCTION(RNDE) + DECL_EMIT_FUNCTION(RNDU) + DECL_EMIT_FUNCTION(RNDZ) #undef DECL_EMIT_FUNCTION @@ -1434,6 +1444,7 @@ DECL_MEM_FN(VoteInstruction, VotePredicate, getVotePredicate(void), getVotePredi return internal::BinaryInstruction(OP_##NAME, type, dst, src0, src1).convert(); \ } + DECL_EMIT_FUNCTION(POW) DECL_EMIT_FUNCTION(MUL) DECL_EMIT_FUNCTION(ADD) DECL_EMIT_FUNCTION(SUB) diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp index f17c0f0..c1b8323 100644 --- a/backend/src/ir/instruction.hpp +++ b/backend/src/ir/instruction.hpp @@ -488,12 +488,22 @@ namespace ir { Instruction SIN(Type type, Register dst, Register src); /*! tan.type dst src */ Instruction RCP(Type type, Register dst, Register src); + /*! abs.type dst src */ + Instruction ABS(Type type, Register dst, Register src); /*! log.type dst src */ Instruction LOG(Type type, Register dst, Register src); /*! sqr.type dst src */ Instruction SQR(Type type, Register dst, Register src); /*! rsq.type dst src */ Instruction RSQ(Type type, Register dst, Register src); + /*! rndd.type dst src */ + Instruction RNDD(Type type, Register dst, Register src); + /*! rnde.type dst src */ + Instruction RNDE(Type type, Register dst, Register src); + /*! rndu.type dst src */ + Instruction RNDU(Type type, Register dst, Register src); + /*! rndz.type dst src */ + Instruction RNDZ(Type type, Register dst, Register src); /*! pow.type dst src0 src1 */ Instruction POW(Type type, Register dst, Register src0, Register src1); /*! mul.type dst src0 src1 */ diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx index 6704acc..4f154b0 100644 --- a/backend/src/ir/instruction.hxx +++ b/backend/src/ir/instruction.hxx @@ -32,6 +32,11 @@ DECL_INSN(LOG, UnaryInstruction) DECL_INSN(SQR, UnaryInstruction) DECL_INSN(RSQ, UnaryInstruction) DECL_INSN(RCP, UnaryInstruction) +DECL_INSN(ABS, UnaryInstruction) +DECL_INSN(RNDD, UnaryInstruction) +DECL_INSN(RNDE, UnaryInstruction) +DECL_INSN(RNDU, UnaryInstruction) +DECL_INSN(RNDZ, UnaryInstruction) DECL_INSN(POW, BinaryInstruction) DECL_INSN(MUL, BinaryInstruction) DECL_INSN(ADD, BinaryInstruction) diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index ae2400b..c3c3ff4 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -666,7 +666,8 @@ namespace gbe uint32_t argID = 1; // Start at one actually for (; I != E; ++I, ++argID) { Type *type = I->getType(); - GBE_ASSERT(isScalarType(type) == true); + GBE_ASSERTM(isScalarType(type) == true, + "vector type in the function argument is not supported yet"); const ir::Register reg = regTranslator.newScalar(I); if (type->isPointerTy() == false) ctx.input(ir::FunctionArgument::VALUE, reg, getTypeByteSize(unit, type)); @@ -1431,6 +1432,11 @@ namespace gbe case GEN_OCL_LOG: case GEN_OCL_POW: case GEN_OCL_RCP: + case GEN_OCL_ABS: + case GEN_OCL_RNDZ: + case GEN_OCL_RNDE: + case GEN_OCL_RNDU: + case GEN_OCL_RNDD: // No structure can be returned GBE_ASSERT(I.hasStructRetAttr() == false); this->newRegister(&I); @@ -1613,16 +1619,28 @@ namespace gbe ctx.MAD(ir::TYPE_FLOAT, dst, src0, src1, src2); break; } + case GEN_OCL_POW: + { + const ir::Register src0 = this->getRegister(*AI); ++AI; + const ir::Register src1 = this->getRegister(*AI); + const ir::Register dst = this->getRegister(&I); + ctx.POW(ir::TYPE_FLOAT, dst, src0, src1); + break; + } case GEN_OCL_COS: this->emitUnaryCallInst(I,CS,ir::OP_COS); break; case GEN_OCL_SIN: this->emitUnaryCallInst(I,CS,ir::OP_SIN); break; case GEN_OCL_LOG: this->emitUnaryCallInst(I,CS,ir::OP_LOG); break; case GEN_OCL_SQR: this->emitUnaryCallInst(I,CS,ir::OP_SQR); break; case GEN_OCL_RSQ: this->emitUnaryCallInst(I,CS,ir::OP_RSQ); break; case GEN_OCL_RCP: this->emitUnaryCallInst(I,CS,ir::OP_RCP); break; + case GEN_OCL_ABS: this->emitUnaryCallInst(I,CS,ir::OP_ABS); break; + case GEN_OCL_RNDZ: this->emitUnaryCallInst(I,CS,ir::OP_RNDZ); break; + case GEN_OCL_RNDE: this->emitUnaryCallInst(I,CS,ir::OP_RNDE); break; + case GEN_OCL_RNDU: this->emitUnaryCallInst(I,CS,ir::OP_RNDU); break; + case GEN_OCL_RNDD: this->emitUnaryCallInst(I,CS,ir::OP_RNDD); break; case GEN_OCL_FORCE_SIMD8: ctx.setSimdWidth(8); break; case GEN_OCL_FORCE_SIMD16: ctx.setSimdWidth(16); break; - default: - break; + default: break; } } } diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx index f51bb63..3228d57 100644 --- a/backend/src/llvm/llvm_gen_ocl_function.hxx +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx @@ -39,6 +39,7 @@ DECL_LLVM_GEN_FUNCTION(RGATHER7, _Z17__gen_ocl_rgathertiiiiiii) DECL_LLVM_GEN_FUNCTION(RGATHER8, _Z17__gen_ocl_rgathertiiiiiiii) // Math function +DECL_LLVM_GEN_FUNCTION(ABS, __gen_ocl_fabs) DECL_LLVM_GEN_FUNCTION(COS, __gen_ocl_cos) DECL_LLVM_GEN_FUNCTION(SIN, __gen_ocl_sin) DECL_LLVM_GEN_FUNCTION(SQR, __gen_ocl_sqrt) @@ -46,6 +47,10 @@ DECL_LLVM_GEN_FUNCTION(RSQ, __gen_ocl_rsqrt) DECL_LLVM_GEN_FUNCTION(LOG, __gen_ocl_log) DECL_LLVM_GEN_FUNCTION(POW, __gen_ocl_pow) DECL_LLVM_GEN_FUNCTION(RCP, __gen_ocl_rcp) +DECL_LLVM_GEN_FUNCTION(RNDZ, __gen_ocl_rndz) +DECL_LLVM_GEN_FUNCTION(RNDE, __gen_ocl_rnde) +DECL_LLVM_GEN_FUNCTION(RNDU, __gen_ocl_rndu) +DECL_LLVM_GEN_FUNCTION(RNDD, __gen_ocl_rndd) // Uniform conditions DECL_LLVM_GEN_FUNCTION(ALL, _Z13__gen_ocl_allt) diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h index 9b3b454..eda7247 100644 --- a/backend/src/ocl_stdlib.h +++ b/backend/src/ocl_stdlib.h @@ -20,8 +20,10 @@ uint* Copyright © 2012 Intel Corporation #ifndef __GEN_OCL_STDLIB_H__ #define __GEN_OCL_STDLIB_H__ -#define INLINE_OVERLOADABLE __attribute__((overloadable,always_inline)) inline +#define INLINE __attribute__((always_inline)) inline #define OVERLOADABLE __attribute__((overloadable)) +#define PURE __attribute__((pure)) +#define CONST __attribute__((const)) ///////////////////////////////////////////////////////////////////////////// // OpenCL basic types @@ -62,12 +64,15 @@ typedef bool bool16 __attribute__((ext_vector_type(16))); #define private __private ///////////////////////////////////////////////////////////////////////////// -// Work groups and work items functions +// Work Items functions (see 6.11.1 of OCL 1.1 spec) ///////////////////////////////////////////////////////////////////////////// +// TODO get_global_offset +// TODO get_work_dim + #define DECL_INTERNAL_WORK_ITEM_FN(NAME) \ -__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##0(void); \ -__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##1(void); \ -__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##2(void); +PURE CONST unsigned int __gen_ocl_##NAME##0(void); \ +PURE CONST unsigned int __gen_ocl_##NAME##1(void); \ +PURE CONST unsigned int __gen_ocl_##NAME##2(void); DECL_INTERNAL_WORK_ITEM_FN(get_group_id) DECL_INTERNAL_WORK_ITEM_FN(get_local_id) DECL_INTERNAL_WORK_ITEM_FN(get_local_size) @@ -89,66 +94,64 @@ DECL_PUBLIC_WORK_ITEM_FN(get_global_size) DECL_PUBLIC_WORK_ITEM_FN(get_num_groups) #undef DECL_PUBLIC_WORK_ITEM_FN -///////////////////////////////////////////////////////////////////////////// -// Vector loads and stores -///////////////////////////////////////////////////////////////////////////// - -// These loads and stores will use untyped reads and writes, so we can just -// cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue. -// Well we do not care, we do not activate TBAA in the compiler -#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \ -__attribute__((always_inline, overloadable)) \ -inline TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \ - return *(SPACE TYPE##DIM *) (p + DIM * offset); \ -} \ -__attribute__((always_inline, overloadable)) \ -inline void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \ - *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \ +INLINE uint get_global_id(uint dim) { + return get_local_id(dim) + get_local_size(dim) * get_group_id(dim); } -#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \ - DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \ - DECL_UNTYPED_RW_SPACE_N(TYPE, 3, SPACE) \ - DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \ - DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \ - DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE) - -#define DECL_UNTYPED_RW_ALL(TYPE) \ - DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \ - DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \ - DECL_UNTYPED_RW_ALL_SPACE(TYPE, __constant) \ - DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private) - -DECL_UNTYPED_RW_ALL(float) -DECL_UNTYPED_RW_ALL(uint) -DECL_UNTYPED_RW_ALL(int) - -#undef DECL_UNTYPED_RW_ALL -#undef DECL_UNTYPED_RW_ALL_SPACE -#undef DECL_UNTYPED_RW_SPACE_N - ///////////////////////////////////////////////////////////////////////////// -// Arithmetic functions +// Math Functions (see 6.11.2 of OCL 1.1 spec) ///////////////////////////////////////////////////////////////////////////// -__attribute__((always_inline)) -inline uint get_global_id(uint dim) { - return get_local_id(dim) + get_local_size(dim) * get_group_id(dim); +PURE CONST float __gen_ocl_fabs(float x); +PURE CONST float __gen_ocl_sin(float x); +PURE CONST float __gen_ocl_cos(float x); +PURE CONST float __gen_ocl_sqrt(float x); +PURE CONST float __gen_ocl_rsqrt(float x); +PURE CONST float __gen_ocl_log(float x); +PURE CONST float __gen_ocl_pow(float x, float y); +PURE CONST float __gen_ocl_rcp(float x); +PURE CONST float __gen_ocl_rndz(float x); +PURE CONST float __gen_ocl_rnde(float x); +PURE CONST float __gen_ocl_rndu(float x); +PURE CONST float __gen_ocl_rndd(float x); +INLINE OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); } +INLINE OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); } +INLINE OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); } +INLINE OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); } +INLINE OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); } +INLINE OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); } +INLINE OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); } +INLINE OVERLOADABLE float native_tan(float x) { + return native_sin(x) / native_cos(x); } -__attribute__ ((pure, const, overloadable)) float mad(float a, float b, float c); -__attribute__((overloadable, always_inline)) -inline uint select(uint src0, uint src1, uint cond) { +// TODO make them actually compliant precision-wise +#define cos native_cos // XXX work-around ptx profile: cos already defined +#define sin native_sin // XXX work-around ptr profile: sin already defined +#define sqrt native_sqrt // XXX work-around ptr profile: sin already defined + +INLINE OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); } +INLINE OVERLOADABLE float fabs(float x) { return __gen_ocl_fabs(x); } +INLINE OVERLOADABLE float trunc(float x) { return __gen_ocl_rndz(x); } +INLINE OVERLOADABLE float round(float x) { return __gen_ocl_rnde(x); } +INLINE OVERLOADABLE float floor(float x) { return __gen_ocl_rndd(x); } +INLINE OVERLOADABLE float ceil(float x) { return __gen_ocl_rndu(x); } +INLINE OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); } +INLINE OVERLOADABLE float fmod(float x, float y) { return x-y*trunc(x/y); } + +// Hack pow is already a builtin +#define pow powr + +PURE CONST OVERLOADABLE float mad(float a, float b, float c); +OVERLOADABLE INLINE uint select(uint src0, uint src1, uint cond) { return cond ? src1 : src0; } -__attribute__((overloadable, always_inline)) -inline int select(int src0, int src1, int cond) { +OVERLOADABLE INLINE int select(int src0, int src1, int cond) { return cond ? src1 : src0; } // This will be optimized out by LLVM and will output LLVM select instructions #define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \ -__attribute__((overloadable)) \ -inline TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \ +OVERLOADABLE INLINE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \ TYPE4 dst; \ const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \ const TYPE x1 = src1.x; \ @@ -168,22 +171,27 @@ DECL_SELECT4(int4, int, int4, 0x80000000) DECL_SELECT4(float4, float, int4, 0x80000000) #undef DECL_SELECT4 -INLINE_OVERLOADABLE float2 mad(float2 a, float2 b, float2 c) { +#if 0 +INLINE OVERLOADABLE float2 mad(float2 a, float2 b, float2 c) { return (float2)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y)); } -INLINE_OVERLOADABLE float3 mad(float3 a, float3 b, float3 c) { +INLINE OVERLOADABLE float3 mad(float3 a, float3 b, float3 c) { return (float3)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), mad(a.z,b.z,c.z)); } -INLINE_OVERLOADABLE float4 mad(float4 a, float4 b, float4 c) { +INLINE OVERLOADABLE float4 mad(float4 a, float4 b, float4 c) { return (float4)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), mad(a.z,b.z,c.z), mad(a.w,b.w,c.w)); } +#endif +///////////////////////////////////////////////////////////////////////////// +// Common Functions (see 6.11.4 of OCL 1.1 spec) +///////////////////////////////////////////////////////////////////////////// #define DECL_MIN_MAX(TYPE) \ -INLINE_OVERLOADABLE TYPE max(TYPE a, TYPE b) { \ +INLINE OVERLOADABLE TYPE max(TYPE a, TYPE b) { \ return a > b ? a : b; \ } \ -INLINE_OVERLOADABLE TYPE min(TYPE a, TYPE b) { \ +INLINE OVERLOADABLE TYPE min(TYPE a, TYPE b) { \ return a < b ? a : b; \ } DECL_MIN_MAX(float) @@ -195,48 +203,133 @@ DECL_MIN_MAX(unsigned short) DECL_MIN_MAX(unsigned char) #undef DECL_MIN_MAX +INLINE OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;} + ///////////////////////////////////////////////////////////////////////////// -// Math intrinsic functions +// Geometric functions (see 6.11.5 of OCL 1.1 spec) ///////////////////////////////////////////////////////////////////////////// -__attribute__((pure,const)) float __gen_ocl_sin(float x); -__attribute__((pure,const)) float __gen_ocl_cos(float x); -__attribute__((pure,const)) float __gen_ocl_sqrt(float x); -__attribute__((pure,const)) float __gen_ocl_rsqrt(float x); -__attribute__((pure,const)) float __gen_ocl_log(float x); -__attribute__((pure,const)) float __gen_ocl_pow(float x, float y); -__attribute__((pure,const)) float __gen_ocl_rcp(float x); - -INLINE_OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); } -INLINE_OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); } -INLINE_OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); } -INLINE_OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); } -INLINE_OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); } -INLINE_OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); } -INLINE_OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); } -INLINE_OVERLOADABLE float native_tan(float x) { - return native_sin(x) / native_cos(x); +INLINE OVERLOADABLE float dot(float2 p0, float2 p1) { + return mad(p0.x,p1.x,p0.y*p1.y); +} +INLINE OVERLOADABLE float dot(float3 p0, float3 p1) { + return mad(p0.x,p1.x,mad(p0.z,p1.z,p0.y*p1.y)); +} +INLINE OVERLOADABLE float dot(float4 p0, float4 p1) { + return mad(p0.x,p1.x,mad(p0.w,p1.w,mad(p0.z,p1.z,p0.y*p1.y))); } +INLINE OVERLOADABLE float dot(float8 p0, float8 p1) { + return mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5, + mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y))))))); +} +INLINE OVERLOADABLE float dot(float16 p0, float16 p1) { + return mad(p0.sc,p1.sc,mad(p0.sd,p1.sd,mad(p0.se,p1.se,mad(p0.sf,p1.sf, + mad(p0.s8,p1.s8,mad(p0.s9,p1.s9,mad(p0.sa,p1.sa,mad(p0.sb,p1.sb, + mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5, + mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y))))))))))))))); +} + +INLINE OVERLOADABLE float length(float x) { return __gen_ocl_fabs(x); } +INLINE OVERLOADABLE float length(float2 x) { return sqrt(dot(x,x)); } +INLINE OVERLOADABLE float length(float3 x) { return sqrt(dot(x,x)); } +INLINE OVERLOADABLE float length(float4 x) { return sqrt(dot(x,x)); } +INLINE OVERLOADABLE float length(float8 x) { return sqrt(dot(x,x)); } +INLINE OVERLOADABLE float length(float16 x) { return sqrt(dot(x,x)); } +INLINE OVERLOADABLE float distance(float x, float y) { return length(x-y); } +INLINE OVERLOADABLE float distance(float2 x, float2 y) { return length(x-y); } +INLINE OVERLOADABLE float distance(float3 x, float3 y) { return length(x-y); } +INLINE OVERLOADABLE float distance(float4 x, float4 y) { return length(x-y); } +INLINE OVERLOADABLE float distance(float8 x, float8 y) { return length(x-y); } +INLINE OVERLOADABLE float distance(float16 x, float16 y) { return length(x-y); } +INLINE OVERLOADABLE float normalize(float x) { return 1.f; } +INLINE OVERLOADABLE float2 normalize(float2 x) { return x * rsqrt(dot(x, x)); } +INLINE OVERLOADABLE float3 normalize(float3 x) { return x * rsqrt(dot(x, x)); } +INLINE OVERLOADABLE float4 normalize(float4 x) { return x * rsqrt(dot(x, x)); } +INLINE OVERLOADABLE float8 normalize(float8 x) { return x * rsqrt(dot(x, x)); } +INLINE OVERLOADABLE float16 normalize(float16 x) { return x * rsqrt(dot(x, x)); } + +INLINE OVERLOADABLE float fast_length(float x) { return __gen_ocl_fabs(x); } +INLINE OVERLOADABLE float fast_length(float2 x) { return sqrt(dot(x,x)); } +INLINE OVERLOADABLE float fast_length(float3 x) { return sqrt(dot(x,x)); } +INLINE OVERLOADABLE float fast_length(float4 x) { return sqrt(dot(x,x)); } +INLINE OVERLOADABLE float fast_length(float8 x) { return sqrt(dot(x,x)); } +INLINE OVERLOADABLE float fast_length(float16 x) { return sqrt(dot(x,x)); } +INLINE OVERLOADABLE float fast_distance(float x, float y) { return length(x-y); } +INLINE OVERLOADABLE float fast_distance(float2 x, float2 y) { return length(x-y); } +INLINE OVERLOADABLE float fast_distance(float3 x, float3 y) { return length(x-y); } +INLINE OVERLOADABLE float fast_distance(float4 x, float4 y) { return length(x-y); } +INLINE OVERLOADABLE float fast_distance(float8 x, float8 y) { return length(x-y); } +INLINE OVERLOADABLE float fast_distance(float16 x, float16 y) { return length(x-y); } +INLINE OVERLOADABLE float fast_normalize(float x) { return 1.f; } +INLINE OVERLOADABLE float2 fast_normalize(float2 x) { return x * rsqrt(dot(x, x)); } +INLINE OVERLOADABLE float3 fast_normalize(float3 x) { return x * rsqrt(dot(x, x)); } +INLINE OVERLOADABLE float4 fast_normalize(float4 x) { return x * rsqrt(dot(x, x)); } +INLINE OVERLOADABLE float8 fast_normalize(float8 x) { return x * rsqrt(dot(x, x)); } +INLINE OVERLOADABLE float16 fast_normalize(float16 x) { return x * rsqrt(dot(x, x)); } + +INLINE OVERLOADABLE float3 cross(float3 v0, float3 v1) { + return v0.yzx*v1.zxy-v0.zxy*v1.yzx; +} +INLINE OVERLOADABLE float4 cross(float4 v0, float4 v1) { + return (float4)(v0.yzx*v1.zxy-v0.zxy*v1.yzx, 0.f); +} + +///////////////////////////////////////////////////////////////////////////// +// Vector loads and stores +///////////////////////////////////////////////////////////////////////////// + +// These loads and stores will use untyped reads and writes, so we can just +// cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue. +// Well we do not care, we do not activate TBAA in the compiler +#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \ +INLINE OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \ + return *(SPACE TYPE##DIM *) (p + DIM * offset); \ +} \ +INLINE OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \ + *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \ +} + +#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \ + DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \ + DECL_UNTYPED_RW_SPACE_N(TYPE, 3, SPACE) \ + DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \ + DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \ + DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE) + +#define DECL_UNTYPED_RW_ALL(TYPE) \ + DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \ + DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \ + DECL_UNTYPED_RW_ALL_SPACE(TYPE, __constant) \ + DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private) + +DECL_UNTYPED_RW_ALL(float) +DECL_UNTYPED_RW_ALL(uint) +DECL_UNTYPED_RW_ALL(int) + +#undef DECL_UNTYPED_RW_ALL +#undef DECL_UNTYPED_RW_ALL_SPACE +#undef DECL_UNTYPED_RW_SPACE_N + ///////////////////////////////////////////////////////////////////////////// // Declare functions for vector types which are derived from scalar ones ///////////////////////////////////////////////////////////////////////////// #define DECL_VECTOR_1OP(NAME, TYPE) \ - INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v) { \ + INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v) { \ return (TYPE##2)(NAME(v.x), NAME(v.y)); \ }\ - INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v) { \ + INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v) { \ return (TYPE##3)(NAME(v.x), NAME(v.y), NAME(v.z)); \ }\ - INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v) { \ + INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v) { \ return (TYPE##4)(NAME(v.x), NAME(v.y), NAME(v.z), NAME(v.w)); \ }\ - INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v) { \ + INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v) { \ TYPE##8 dst;\ dst.s0123 = NAME(v.s0123);\ dst.s4567 = NAME(v.s4567);\ return dst;\ }\ - INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v) { \ + INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v) { \ TYPE##16 dst;\ dst.s01234567 = NAME(v.s01234567);\ dst.s89abcdef = NAME(v.s89abcdef);\ @@ -249,31 +342,73 @@ DECL_VECTOR_1OP(native_sqrt, float); DECL_VECTOR_1OP(native_rsqrt, float); DECL_VECTOR_1OP(native_log2, float); DECL_VECTOR_1OP(native_recip, float); +DECL_VECTOR_1OP(fabs, float); +DECL_VECTOR_1OP(trunc, float); +DECL_VECTOR_1OP(round, float); +DECL_VECTOR_1OP(floor, float); +DECL_VECTOR_1OP(ceil, float); #undef DECL_VECTOR_1OP -///////////////////////////////////////////////////////////////////////////// -// Geometric functions -///////////////////////////////////////////////////////////////////////////// -INLINE_OVERLOADABLE float dot(float2 p0, float2 p1) { - return mad(p0.x,p1.x,p0.y*p1.y); -} -INLINE_OVERLOADABLE float dot(float3 p0, float3 p1) { - return mad(p0.x,p1.x,mad(p0.z,p1.z,p0.y*p1.y)); -} -INLINE_OVERLOADABLE float dot(float4 p0, float4 p1) { - return mad(p0.x,p1.x,mad(p0.w,p1.w,mad(p0.z,p1.z,p0.y*p1.y))); -} +#define DECL_VECTOR_2OP(NAME, TYPE) \ + INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1) { \ + return (TYPE##2)(NAME(v0.x, v1.x), NAME(v1.y, v1.y)); \ + }\ + INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1) { \ + return (TYPE##3)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z)); \ + }\ + INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1) { \ + return (TYPE##4)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z), NAME(v0.w, v1.w)); \ + }\ + INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1) { \ + TYPE##8 dst;\ + dst.s0123 = NAME(v0.s0123, v1.s0123);\ + dst.s4567 = NAME(v0.s4567, v1.s4567);\ + return dst;\ + }\ + INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1) { \ + TYPE##16 dst;\ + dst.s01234567 = NAME(v0.s01234567, v1.s01234567);\ + dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef);\ + return dst;\ + } +DECL_VECTOR_2OP(min, float); +DECL_VECTOR_2OP(max, float); +DECL_VECTOR_2OP(fmod, float); +DECL_VECTOR_2OP(powr, float); +#undef DECL_VECTOR_2OP + +#define DECL_VECTOR_3OP(NAME, TYPE) \ + INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1, TYPE##2 v2) { \ + return (TYPE##2)(NAME(v0.x, v1.x, v2.x), NAME(v1.y, v1.y, v2.y)); \ + }\ + INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1, TYPE##3 v2) { \ + return (TYPE##3)(NAME(v0.x, v1.x, v2.x), NAME(v0.y, v1.y, v2.y), NAME(v0.z, v1.z, v2.z)); \ + }\ + INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1, TYPE##4 v2) { \ + return (TYPE##4)(NAME(v0.x, v1.x, v2.x), NAME(v0.y, v1.y, v2.y), NAME(v0.z, v1.z, v2.z), NAME(v0.w, v1.w, v2.w)); \ + }\ + INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1, TYPE##8 v2) { \ + TYPE##8 dst;\ + dst.s0123 = NAME(v0.s0123, v1.s0123, v2.s0123);\ + dst.s4567 = NAME(v0.s4567, v1.s4567, v2.s4567);\ + return dst;\ + }\ + INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1, TYPE##16 v2) { \ + TYPE##16 dst;\ + dst.s01234567 = NAME(v0.s01234567, v1.s01234567, v2.s01234567);\ + dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef, v2.s89abcdef);\ + return dst;\ + } +DECL_VECTOR_3OP(mad, float); +DECL_VECTOR_3OP(mix, float); +#undef DECL_VECTOR_3OP -INLINE_OVERLOADABLE float dot(float8 p0, float8 p1) { - return mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5, - mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y))))))); -} -INLINE_OVERLOADABLE float dot(float16 p0, float16 p1) { - return mad(p0.sc,p1.sc,mad(p0.sd,p1.sd,mad(p0.se,p1.se,mad(p0.sf,p1.sf, - mad(p0.s8,p1.s8,mad(p0.s9,p1.s9,mad(p0.sa,p1.sa,mad(p0.sb,p1.sb, - mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5, - mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y))))))))))))))); -} +// mix requires more variants +INLINE OVERLOADABLE float2 mix(float2 x, float2 y, float a) { return mix(x,y,(float2)(a));} +INLINE OVERLOADABLE float3 mix(float3 x, float3 y, float a) { return mix(x,y,(float3)(a));} +INLINE OVERLOADABLE float4 mix(float4 x, float4 y, float a) { return mix(x,y,(float4)(a));} +INLINE OVERLOADABLE float8 mix(float8 x, float8 y, float a) { return mix(x,y,(float8)(a));} +INLINE OVERLOADABLE float16 mix(float16 x, float16 y, float a) { return mix(x,y,(float16)(a));} ///////////////////////////////////////////////////////////////////////////// // Extensions to manipulate the register file @@ -324,12 +459,10 @@ int __gen_ocl_force_simd8(void); int __gen_ocl_force_simd16(void); #define DECL_VOTE(TYPE) \ -__attribute__((overloadable,always_inline)) \ -TYPE __gen_ocl_any(TYPE cond) { \ +INLINE OVERLOADABLE TYPE __gen_ocl_any(TYPE cond) { \ return (TYPE) __gen_ocl_any((unsigned short) cond); \ } \ -__attribute__((overloadable,always_inline)) \ -TYPE __gen_ocl_all(TYPE cond) { \ +INLINE OVERLOADABLE TYPE __gen_ocl_all(TYPE cond) { \ return (TYPE) __gen_ocl_all((unsigned short) cond); \ } DECL_VOTE(unsigned int) @@ -341,6 +474,9 @@ DECL_VOTE(bool) #undef DECL_VOTE #define NULL ((void*)0) -#undef INLINE_OVERLOADABLE +#undef PURE +#undef CONST +#undef OVERLOADABLE +#undef INLINE #endif /* __GEN_OCL_STDLIB_H__ */ diff --git a/backend/src/ocl_stdlib_str.cpp b/backend/src/ocl_stdlib_str.cpp index bd4ec87..b47e7ef 100644 --- a/backend/src/ocl_stdlib_str.cpp +++ b/backend/src/ocl_stdlib_str.cpp @@ -23,8 +23,10 @@ std::string ocl_stdlib_str = "#ifndef __GEN_OCL_STDLIB_H__\n" "#define __GEN_OCL_STDLIB_H__\n" "\n" -"#define INLINE_OVERLOADABLE __attribute__((overloadable,always_inline)) inline\n" +"#define INLINE __attribute__((always_inline)) inline\n" "#define OVERLOADABLE __attribute__((overloadable))\n" +"#define PURE __attribute__((pure))\n" +"#define CONST __attribute__((const))\n" "\n" "/////////////////////////////////////////////////////////////////////////////\n" "// OpenCL basic types\n" @@ -65,12 +67,15 @@ std::string ocl_stdlib_str = "#define private __private\n" "\n" "/////////////////////////////////////////////////////////////////////////////\n" -"// Work groups and work items functions\n" +"// Work Items functions (see 6.11.1 of OCL 1.1 spec)\n" "/////////////////////////////////////////////////////////////////////////////\n" +"// TODO get_global_offset\n" +"// TODO get_work_dim\n" +"\n" "#define DECL_INTERNAL_WORK_ITEM_FN(NAME) \\\n" -"__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##0(void); \\\n" -"__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##1(void); \\\n" -"__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##2(void);\n" +"PURE CONST unsigned int __gen_ocl_##NAME##0(void); \\\n" +"PURE CONST unsigned int __gen_ocl_##NAME##1(void); \\\n" +"PURE CONST unsigned int __gen_ocl_##NAME##2(void);\n" "DECL_INTERNAL_WORK_ITEM_FN(get_group_id)\n" "DECL_INTERNAL_WORK_ITEM_FN(get_local_id)\n" "DECL_INTERNAL_WORK_ITEM_FN(get_local_size)\n" @@ -92,66 +97,64 @@ std::string ocl_stdlib_str = "DECL_PUBLIC_WORK_ITEM_FN(get_num_groups)\n" "#undef DECL_PUBLIC_WORK_ITEM_FN\n" "\n" -"/////////////////////////////////////////////////////////////////////////////\n" -"// Vector loads and stores\n" -"/////////////////////////////////////////////////////////////////////////////\n" -"\n" -"// These loads and stores will use untyped reads and writes, so we can just\n" -"// cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue.\n" -"// Well we do not care, we do not activate TBAA in the compiler\n" -"#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \\\n" -"__attribute__((always_inline, overloadable)) \\\n" -"inline TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \\\n" -" return *(SPACE TYPE##DIM *) (p + DIM * offset); \\\n" -"} \\\n" -"__attribute__((always_inline, overloadable)) \\\n" -"inline void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \\\n" -" *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \\\n" +"INLINE uint get_global_id(uint dim) {\n" +" return get_local_id(dim) + get_local_size(dim) * get_group_id(dim);\n" "}\n" "\n" -"#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \\\n" -" DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \\\n" -" DECL_UNTYPED_RW_SPACE_N(TYPE, 3, SPACE) \\\n" -" DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \\\n" -" DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \\\n" -" DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)\n" -"\n" -"#define DECL_UNTYPED_RW_ALL(TYPE) \\\n" -" DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \\\n" -" DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \\\n" -" DECL_UNTYPED_RW_ALL_SPACE(TYPE, __constant) \\\n" -" DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)\n" -"\n" -"DECL_UNTYPED_RW_ALL(float)\n" -"DECL_UNTYPED_RW_ALL(uint)\n" -"DECL_UNTYPED_RW_ALL(int)\n" -"\n" -"#undef DECL_UNTYPED_RW_ALL\n" -"#undef DECL_UNTYPED_RW_ALL_SPACE\n" -"#undef DECL_UNTYPED_RW_SPACE_N\n" -"\n" "/////////////////////////////////////////////////////////////////////////////\n" -"// Arithmetic functions\n" +"// Math Functions (see 6.11.2 of OCL 1.1 spec)\n" "/////////////////////////////////////////////////////////////////////////////\n" -"__attribute__((always_inline))\n" -"inline uint get_global_id(uint dim) {\n" -" return get_local_id(dim) + get_local_size(dim) * get_group_id(dim);\n" +"PURE CONST float __gen_ocl_fabs(float x);\n" +"PURE CONST float __gen_ocl_sin(float x);\n" +"PURE CONST float __gen_ocl_cos(float x);\n" +"PURE CONST float __gen_ocl_sqrt(float x);\n" +"PURE CONST float __gen_ocl_rsqrt(float x);\n" +"PURE CONST float __gen_ocl_log(float x);\n" +"PURE CONST float __gen_ocl_pow(float x, float y);\n" +"PURE CONST float __gen_ocl_rcp(float x);\n" +"PURE CONST float __gen_ocl_rndz(float x);\n" +"PURE CONST float __gen_ocl_rnde(float x);\n" +"PURE CONST float __gen_ocl_rndu(float x);\n" +"PURE CONST float __gen_ocl_rndd(float x);\n" +"INLINE OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }\n" +"INLINE OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }\n" +"INLINE OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }\n" +"INLINE OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }\n" +"INLINE OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }\n" +"INLINE OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }\n" +"INLINE OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }\n" +"INLINE OVERLOADABLE float native_tan(float x) {\n" +" return native_sin(x) / native_cos(x);\n" "}\n" "\n" -"__attribute__ ((pure, const, overloadable)) float mad(float a, float b, float c);\n" -"__attribute__((overloadable, always_inline))\n" -"inline uint select(uint src0, uint src1, uint cond) {\n" +"// TODO make them actually compliant precision-wise\n" +"#define cos native_cos // XXX work-around ptx profile: cos already defined\n" +"#define sin native_sin // XXX work-around ptr profile: sin already defined\n" +"#define sqrt native_sqrt // XXX work-around ptr profile: sin already defined\n" +"\n" +"INLINE OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }\n" +"INLINE OVERLOADABLE float fabs(float x) { return __gen_ocl_fabs(x); }\n" +"INLINE OVERLOADABLE float trunc(float x) { return __gen_ocl_rndz(x); }\n" +"INLINE OVERLOADABLE float round(float x) { return __gen_ocl_rnde(x); }\n" +"INLINE OVERLOADABLE float floor(float x) { return __gen_ocl_rndd(x); }\n" +"INLINE OVERLOADABLE float ceil(float x) { return __gen_ocl_rndu(x); }\n" +"INLINE OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }\n" +"INLINE OVERLOADABLE float fmod(float x, float y) { return x-y*trunc(x/y); }\n" +"\n" +"// Hack pow is already a builtin\n" +"#define pow powr\n" +"\n" +"PURE CONST OVERLOADABLE float mad(float a, float b, float c);\n" +"OVERLOADABLE INLINE uint select(uint src0, uint src1, uint cond) {\n" " return cond ? src1 : src0;\n" "}\n" -"__attribute__((overloadable, always_inline))\n" -"inline int select(int src0, int src1, int cond) {\n" +"OVERLOADABLE INLINE int select(int src0, int src1, int cond) {\n" " return cond ? src1 : src0;\n" "}\n" "\n" "// This will be optimized out by LLVM and will output LLVM select instructions\n" "#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \\\n" -"__attribute__((overloadable)) \\\n" -"inline TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \\\n" +"OVERLOADABLE INLINE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \\\n" " TYPE4 dst; \\\n" " const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \\\n" " const TYPE x1 = src1.x; \\\n" @@ -171,22 +174,27 @@ std::string ocl_stdlib_str = "DECL_SELECT4(float4, float, int4, 0x80000000)\n" "#undef DECL_SELECT4\n" "\n" -"INLINE_OVERLOADABLE float2 mad(float2 a, float2 b, float2 c) {\n" +"#if 0\n" +"INLINE OVERLOADABLE float2 mad(float2 a, float2 b, float2 c) {\n" " return (float2)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y));\n" "}\n" -"INLINE_OVERLOADABLE float3 mad(float3 a, float3 b, float3 c) {\n" +"INLINE OVERLOADABLE float3 mad(float3 a, float3 b, float3 c) {\n" " return (float3)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), mad(a.z,b.z,c.z));\n" "}\n" -"INLINE_OVERLOADABLE float4 mad(float4 a, float4 b, float4 c) {\n" +"INLINE OVERLOADABLE float4 mad(float4 a, float4 b, float4 c) {\n" " return (float4)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y),\n" " mad(a.z,b.z,c.z), mad(a.w,b.w,c.w));\n" "}\n" +"#endif\n" "\n" +"/////////////////////////////////////////////////////////////////////////////\n" +"// Common Functions (see 6.11.4 of OCL 1.1 spec)\n" +"/////////////////////////////////////////////////////////////////////////////\n" "#define DECL_MIN_MAX(TYPE) \\\n" -"INLINE_OVERLOADABLE TYPE max(TYPE a, TYPE b) { \\\n" +"INLINE OVERLOADABLE TYPE max(TYPE a, TYPE b) { \\\n" " return a > b ? a : b; \\\n" "} \\\n" -"INLINE_OVERLOADABLE TYPE min(TYPE a, TYPE b) { \\\n" +"INLINE OVERLOADABLE TYPE min(TYPE a, TYPE b) { \\\n" " return a < b ? a : b; \\\n" "}\n" "DECL_MIN_MAX(float)\n" @@ -198,48 +206,133 @@ std::string ocl_stdlib_str = "DECL_MIN_MAX(unsigned char)\n" "#undef DECL_MIN_MAX\n" "\n" +"INLINE OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}\n" +"\n" "/////////////////////////////////////////////////////////////////////////////\n" -"// Math intrinsic functions\n" +"// Geometric functions (see 6.11.5 of OCL 1.1 spec)\n" "/////////////////////////////////////////////////////////////////////////////\n" -"__attribute__((pure,const)) float __gen_ocl_sin(float x);\n" -"__attribute__((pure,const)) float __gen_ocl_cos(float x);\n" -"__attribute__((pure,const)) float __gen_ocl_sqrt(float x);\n" -"__attribute__((pure,const)) float __gen_ocl_rsqrt(float x);\n" -"__attribute__((pure,const)) float __gen_ocl_log(float x);\n" -"__attribute__((pure,const)) float __gen_ocl_pow(float x, float y);\n" -"__attribute__((pure,const)) float __gen_ocl_rcp(float x);\n" -"\n" -"INLINE_OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }\n" -"INLINE_OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }\n" -"INLINE_OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }\n" -"INLINE_OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }\n" -"INLINE_OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }\n" -"INLINE_OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }\n" -"INLINE_OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }\n" -"INLINE_OVERLOADABLE float native_tan(float x) {\n" -" return native_sin(x) / native_cos(x);\n" +"INLINE OVERLOADABLE float dot(float2 p0, float2 p1) {\n" +" return mad(p0.x,p1.x,p0.y*p1.y);\n" +"}\n" +"INLINE OVERLOADABLE float dot(float3 p0, float3 p1) {\n" +" return mad(p0.x,p1.x,mad(p0.z,p1.z,p0.y*p1.y));\n" +"}\n" +"INLINE OVERLOADABLE float dot(float4 p0, float4 p1) {\n" +" return mad(p0.x,p1.x,mad(p0.w,p1.w,mad(p0.z,p1.z,p0.y*p1.y)));\n" "}\n" "\n" +"INLINE OVERLOADABLE float dot(float8 p0, float8 p1) {\n" +" return mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,\n" +" mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))));\n" +"}\n" +"INLINE OVERLOADABLE float dot(float16 p0, float16 p1) {\n" +" return mad(p0.sc,p1.sc,mad(p0.sd,p1.sd,mad(p0.se,p1.se,mad(p0.sf,p1.sf,\n" +" mad(p0.s8,p1.s8,mad(p0.s9,p1.s9,mad(p0.sa,p1.sa,mad(p0.sb,p1.sb,\n" +" mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,\n" +" mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))))))))))));\n" +"}\n" +"\n" +"INLINE OVERLOADABLE float length(float x) { return __gen_ocl_fabs(x); }\n" +"INLINE OVERLOADABLE float length(float2 x) { return sqrt(dot(x,x)); }\n" +"INLINE OVERLOADABLE float length(float3 x) { return sqrt(dot(x,x)); }\n" +"INLINE OVERLOADABLE float length(float4 x) { return sqrt(dot(x,x)); }\n" +"INLINE OVERLOADABLE float length(float8 x) { return sqrt(dot(x,x)); }\n" +"INLINE OVERLOADABLE float length(float16 x) { return sqrt(dot(x,x)); }\n" +"INLINE OVERLOADABLE float distance(float x, float y) { return length(x-y); }\n" +"INLINE OVERLOADABLE float distance(float2 x, float2 y) { return length(x-y); }\n" +"INLINE OVERLOADABLE float distance(float3 x, float3 y) { return length(x-y); }\n" +"INLINE OVERLOADABLE float distance(float4 x, float4 y) { return length(x-y); }\n" +"INLINE OVERLOADABLE float distance(float8 x, float8 y) { return length(x-y); }\n" +"INLINE OVERLOADABLE float distance(float16 x, float16 y) { return length(x-y); }\n" +"INLINE OVERLOADABLE float normalize(float x) { return 1.f; }\n" +"INLINE OVERLOADABLE float2 normalize(float2 x) { return x * rsqrt(dot(x, x)); }\n" +"INLINE OVERLOADABLE float3 normalize(float3 x) { return x * rsqrt(dot(x, x)); }\n" +"INLINE OVERLOADABLE float4 normalize(float4 x) { return x * rsqrt(dot(x, x)); }\n" +"INLINE OVERLOADABLE float8 normalize(float8 x) { return x * rsqrt(dot(x, x)); }\n" +"INLINE OVERLOADABLE float16 normalize(float16 x) { return x * rsqrt(dot(x, x)); }\n" +"\n" +"INLINE OVERLOADABLE float fast_length(float x) { return __gen_ocl_fabs(x); }\n" +"INLINE OVERLOADABLE float fast_length(float2 x) { return sqrt(dot(x,x)); }\n" +"INLINE OVERLOADABLE float fast_length(float3 x) { return sqrt(dot(x,x)); }\n" +"INLINE OVERLOADABLE float fast_length(float4 x) { return sqrt(dot(x,x)); }\n" +"INLINE OVERLOADABLE float fast_length(float8 x) { return sqrt(dot(x,x)); }\n" +"INLINE OVERLOADABLE float fast_length(float16 x) { return sqrt(dot(x,x)); }\n" +"INLINE OVERLOADABLE float fast_distance(float x, float y) { return length(x-y); }\n" +"INLINE OVERLOADABLE float fast_distance(float2 x, float2 y) { return length(x-y); }\n" +"INLINE OVERLOADABLE float fast_distance(float3 x, float3 y) { return length(x-y); }\n" +"INLINE OVERLOADABLE float fast_distance(float4 x, float4 y) { return length(x-y); }\n" +"INLINE OVERLOADABLE float fast_distance(float8 x, float8 y) { return length(x-y); }\n" +"INLINE OVERLOADABLE float fast_distance(float16 x, float16 y) { return length(x-y); }\n" +"INLINE OVERLOADABLE float fast_normalize(float x) { return 1.f; }\n" +"INLINE OVERLOADABLE float2 fast_normalize(float2 x) { return x * rsqrt(dot(x, x)); }\n" +"INLINE OVERLOADABLE float3 fast_normalize(float3 x) { return x * rsqrt(dot(x, x)); }\n" +"INLINE OVERLOADABLE float4 fast_normalize(float4 x) { return x * rsqrt(dot(x, x)); }\n" +"INLINE OVERLOADABLE float8 fast_normalize(float8 x) { return x * rsqrt(dot(x, x)); }\n" +"INLINE OVERLOADABLE float16 fast_normalize(float16 x) { return x * rsqrt(dot(x, x)); }\n" +"\n" +"INLINE OVERLOADABLE float3 cross(float3 v0, float3 v1) {\n" +" return v0.yzx*v1.zxy-v0.zxy*v1.yzx;\n" +"}\n" +"INLINE OVERLOADABLE float4 cross(float4 v0, float4 v1) {\n" +" return (float4)(v0.yzx*v1.zxy-v0.zxy*v1.yzx, 0.f);\n" +"}\n" +"\n" +"/////////////////////////////////////////////////////////////////////////////\n" +"// Vector loads and stores\n" +"/////////////////////////////////////////////////////////////////////////////\n" +"\n" +"// These loads and stores will use untyped reads and writes, so we can just\n" +"// cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue.\n" +"// Well we do not care, we do not activate TBAA in the compiler\n" +"#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \\\n" +"INLINE OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \\\n" +" return *(SPACE TYPE##DIM *) (p + DIM * offset); \\\n" +"} \\\n" +"INLINE OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \\\n" +" *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \\\n" +"}\n" +"\n" +"#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \\\n" +" DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \\\n" +" DECL_UNTYPED_RW_SPACE_N(TYPE, 3, SPACE) \\\n" +" DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \\\n" +" DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \\\n" +" DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)\n" +"\n" +"#define DECL_UNTYPED_RW_ALL(TYPE) \\\n" +" DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \\\n" +" DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \\\n" +" DECL_UNTYPED_RW_ALL_SPACE(TYPE, __constant) \\\n" +" DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)\n" +"\n" +"DECL_UNTYPED_RW_ALL(float)\n" +"DECL_UNTYPED_RW_ALL(uint)\n" +"DECL_UNTYPED_RW_ALL(int)\n" +"\n" +"#undef DECL_UNTYPED_RW_ALL\n" +"#undef DECL_UNTYPED_RW_ALL_SPACE\n" +"#undef DECL_UNTYPED_RW_SPACE_N\n" +"\n" "/////////////////////////////////////////////////////////////////////////////\n" "// Declare functions for vector types which are derived from scalar ones\n" "/////////////////////////////////////////////////////////////////////////////\n" "#define DECL_VECTOR_1OP(NAME, TYPE) \\\n" -" INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v) { \\\n" +" INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v) { \\\n" " return (TYPE##2)(NAME(v.x), NAME(v.y)); \\\n" " }\\\n" -" INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v) { \\\n" +" INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v) { \\\n" " return (TYPE##3)(NAME(v.x), NAME(v.y), NAME(v.z)); \\\n" " }\\\n" -" INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v) { \\\n" +" INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v) { \\\n" " return (TYPE##4)(NAME(v.x), NAME(v.y), NAME(v.z), NAME(v.w)); \\\n" " }\\\n" -" INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v) { \\\n" +" INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v) { \\\n" " TYPE##8 dst;\\\n" " dst.s0123 = NAME(v.s0123);\\\n" " dst.s4567 = NAME(v.s4567);\\\n" " return dst;\\\n" " }\\\n" -" INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v) { \\\n" +" INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v) { \\\n" " TYPE##16 dst;\\\n" " dst.s01234567 = NAME(v.s01234567);\\\n" " dst.s89abcdef = NAME(v.s89abcdef);\\\n" @@ -252,31 +345,73 @@ std::string ocl_stdlib_str = "DECL_VECTOR_1OP(native_rsqrt, float);\n" "DECL_VECTOR_1OP(native_log2, float);\n" "DECL_VECTOR_1OP(native_recip, float);\n" +"DECL_VECTOR_1OP(fabs, float);\n" +"DECL_VECTOR_1OP(trunc, float);\n" +"DECL_VECTOR_1OP(round, float);\n" +"DECL_VECTOR_1OP(floor, float);\n" +"DECL_VECTOR_1OP(ceil, float);\n" "#undef DECL_VECTOR_1OP\n" "\n" -"/////////////////////////////////////////////////////////////////////////////\n" -"// Geometric functions\n" -"/////////////////////////////////////////////////////////////////////////////\n" -"INLINE_OVERLOADABLE float dot(float2 p0, float2 p1) {\n" -" return mad(p0.x,p1.x,p0.y*p1.y);\n" -"}\n" -"INLINE_OVERLOADABLE float dot(float3 p0, float3 p1) {\n" -" return mad(p0.x,p1.x,mad(p0.z,p1.z,p0.y*p1.y));\n" -"}\n" -"INLINE_OVERLOADABLE float dot(float4 p0, float4 p1) {\n" -" return mad(p0.x,p1.x,mad(p0.w,p1.w,mad(p0.z,p1.z,p0.y*p1.y)));\n" -"}\n" +"#define DECL_VECTOR_2OP(NAME, TYPE) \\\n" +" INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1) { \\\n" +" return (TYPE##2)(NAME(v0.x, v1.x), NAME(v1.y, v1.y)); \\\n" +" }\\\n" +" INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1) { \\\n" +" return (TYPE##3)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z)); \\\n" +" }\\\n" +" INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1) { \\\n" +" return (TYPE##4)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z), NAME(v0.w, v1.w)); \\\n" +" }\\\n" +" INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1) { \\\n" +" TYPE##8 dst;\\\n" +" dst.s0123 = NAME(v0.s0123, v1.s0123);\\\n" +" dst.s4567 = NAME(v0.s4567, v1.s4567);\\\n" +" return dst;\\\n" +" }\\\n" +" INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1) { \\\n" +" TYPE##16 dst;\\\n" +" dst.s01234567 = NAME(v0.s01234567, v1.s01234567);\\\n" +" dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef);\\\n" +" return dst;\\\n" +" }\n" +"DECL_VECTOR_2OP(min, float);\n" +"DECL_VECTOR_2OP(max, float);\n" +"DECL_VECTOR_2OP(fmod, float);\n" +"DECL_VECTOR_2OP(powr, float);\n" +"#undef DECL_VECTOR_2OP\n" +"\n" +"#define DECL_VECTOR_3OP(NAME, TYPE) \\\n" +" INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1, TYPE##2 v2) { \\\n" +" return (TYPE##2)(NAME(v0.x, v1.x, v2.x), NAME(v1.y, v1.y, v2.y)); \\\n" +" }\\\n" +" INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1, TYPE##3 v2) { \\\n" +" return (TYPE##3)(NAME(v0.x, v1.x, v2.x), NAME(v0.y, v1.y, v2.y), NAME(v0.z, v1.z, v2.z)); \\\n" +" }\\\n" +" INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1, TYPE##4 v2) { \\\n" +" return (TYPE##4)(NAME(v0.x, v1.x, v2.x), NAME(v0.y, v1.y, v2.y), NAME(v0.z, v1.z, v2.z), NAME(v0.w, v1.w, v2.w)); \\\n" +" }\\\n" +" INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1, TYPE##8 v2) { \\\n" +" TYPE##8 dst;\\\n" +" dst.s0123 = NAME(v0.s0123, v1.s0123, v2.s0123);\\\n" +" dst.s4567 = NAME(v0.s4567, v1.s4567, v2.s4567);\\\n" +" return dst;\\\n" +" }\\\n" +" INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1, TYPE##16 v2) { \\\n" +" TYPE##16 dst;\\\n" +" dst.s01234567 = NAME(v0.s01234567, v1.s01234567, v2.s01234567);\\\n" +" dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef, v2.s89abcdef);\\\n" +" return dst;\\\n" +" }\n" +"DECL_VECTOR_3OP(mad, float);\n" +"DECL_VECTOR_3OP(mix, float);\n" +"#undef DECL_VECTOR_3OP\n" "\n" -"INLINE_OVERLOADABLE float dot(float8 p0, float8 p1) {\n" -" return mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,\n" -" mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))));\n" -"}\n" -"INLINE_OVERLOADABLE float dot(float16 p0, float16 p1) {\n" -" return mad(p0.sc,p1.sc,mad(p0.sd,p1.sd,mad(p0.se,p1.se,mad(p0.sf,p1.sf,\n" -" mad(p0.s8,p1.s8,mad(p0.s9,p1.s9,mad(p0.sa,p1.sa,mad(p0.sb,p1.sb,\n" -" mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,\n" -" mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))))))))))));\n" -"}\n" +"// mix requires more variants\n" +"INLINE OVERLOADABLE float2 mix(float2 x, float2 y, float a) { return mix(x,y,(float2)(a));}\n" +"INLINE OVERLOADABLE float3 mix(float3 x, float3 y, float a) { return mix(x,y,(float3)(a));}\n" +"INLINE OVERLOADABLE float4 mix(float4 x, float4 y, float a) { return mix(x,y,(float4)(a));}\n" +"INLINE OVERLOADABLE float8 mix(float8 x, float8 y, float a) { return mix(x,y,(float8)(a));}\n" +"INLINE OVERLOADABLE float16 mix(float16 x, float16 y, float a) { return mix(x,y,(float16)(a));}\n" "\n" "/////////////////////////////////////////////////////////////////////////////\n" "// Extensions to manipulate the register file\n" @@ -327,12 +462,10 @@ std::string ocl_stdlib_str = "int __gen_ocl_force_simd16(void);\n" "\n" "#define DECL_VOTE(TYPE) \\\n" -"__attribute__((overloadable,always_inline)) \\\n" -"TYPE __gen_ocl_any(TYPE cond) { \\\n" +"INLINE OVERLOADABLE TYPE __gen_ocl_any(TYPE cond) { \\\n" " return (TYPE) __gen_ocl_any((unsigned short) cond); \\\n" "} \\\n" -"__attribute__((overloadable,always_inline)) \\\n" -"TYPE __gen_ocl_all(TYPE cond) { \\\n" +"INLINE OVERLOADABLE TYPE __gen_ocl_all(TYPE cond) { \\\n" " return (TYPE) __gen_ocl_all((unsigned short) cond); \\\n" "}\n" "DECL_VOTE(unsigned int)\n" @@ -344,7 +477,10 @@ std::string ocl_stdlib_str = "#undef DECL_VOTE\n" "\n" "#define NULL ((void*)0)\n" -"#undef INLINE_OVERLOADABLE\n" +"#undef PURE\n" +"#undef CONST\n" +"#undef OVERLOADABLE\n" +"#undef INLINE\n" "#endif /* __GEN_OCL_STDLIB_H__ */\n" "\n" ; diff --git a/kernels/compiler_clod.cl b/kernels/compiler_clod.cl index d7c945a..ec9d33e 100644 --- a/kernels/compiler_clod.cl +++ b/kernels/compiler_clod.cl @@ -31,12 +31,13 @@ inline uint pack_fp4(float4 u4) { float f(vec3 o) { - float a=(sin(o.x)+o.y*.25)*.35f; + float a=(sin(o.x)+o.y*.25f)*.35f; o=(vec3)(cos(a)*o.x-sin(a)*o.y,sin(a)*o.x+cos(a)*o.y,o.z); return dot(cos(o)*cos(o),(vec3)(1.f))-1.2f; } -vec3 s(vec3 o,vec3 d) +// XXX front end does not inline this function +__attribute((always_inline)) vec3 s(vec3 o,vec3 d) { float t=0.0f; float dt = 0.2f; @@ -45,27 +46,45 @@ vec3 s(vec3 o,vec3 d) for(int i=0;i<50;i++) { nh = f(o+d*t); - if(nh>0.0) { lh=nh; t+=dt; } + if(nh>0.0f) { lh=nh; t+=dt; } } - if( nh>0.0 ) return (vec3)(.93f,.94f,.85f); + if( nh>0.0f ) return (vec3)(.93f,.94f,.85f); t = t - dt*nh/(nh-lh); vec3 e=(vec3)(.1f,0.0f,0.0f); vec3 p=o+d*t; - vec3 n=-normalize((vec3)(f(p+e),f(p+e.yxy),f(p+e.yyx))+(vec3)((sin(p*75.)))*.01f); + vec3 n=-normalize((vec3)(f(p+e),f(p+e.yxy),f(p+e.yyx))+(vec3)((sin(p*75.f)))*.01f); return (vec3)(mix( ((max(-dot(n,(vec3)(.577f)),0.f) + 0.125f*max(-dot(n,(vec3)(-.707f,-.707f,0.f)),0.f)))*(mod - (length(p.xy)*20.f,2.f)<1.0?(vec3)(.71f,.85f,.25f):(vec3)(.79f,.93f,.4f)) + (length(p.xy)*20.f,2.f)<1.0f?(vec3)(.71f,.85f,.25f):(vec3)(.79f,.93f,.4f)) ,(vec3)(.93f,.94f,.85f), (vec3)(pow(t/9.f,5.f)) ) ); } +#if 0 +// XXX vector type in the function arguments not supported yet __kernel void compiler_clod(__global uint *dst, vec2 resolution, int w) { vec2 gl_FragCoord = (vec2)(get_global_id(0), get_global_id(1)); - vec2 p = -1.0f + 2.0f * gl_FragCoord.xy / resolution.xy; - vec4 gl_FragColor=(vec4)(s((vec3)(sin(time*1.5)*.5f,cos(time)*.5f,time), normalize((vec3)(p.xy,1.0f))),1.0f); + //vec2 p = -1.0f + 2.0f * gl_FragCoord.xy / resolution.xy; + vec2 p; + p.x = -1.0f + 2.0f * gl_FragCoord.x / resolution.x; + p.y = -1.0f + 2.0f * gl_FragCoord.y / resolution.y; + vec4 gl_FragColor=(vec4)(s((vec3)(sin(time*1.5f)*.5f,cos(time)*.5f,time), normalize((vec3)(p.xy,1.0f))),1.0f); OUTPUT; } +#else +__kernel void compiler_clod(__global uint *dst, float resx, float resy, int w) +{ + vec2 gl_FragCoord = (vec2)(get_global_id(0), get_global_id(1)); + //vec2 p = -1.0f + 2.0f * gl_FragCoord.xy / resolution.xy; + vec2 p; + p.x = -1.0f + 2.0f * gl_FragCoord.x / resx; + p.y = -1.0f + 2.0f * gl_FragCoord.y / resy; + vec4 gl_FragColor=(vec4)(s((vec3)(sin(time*1.5f)*.5f,cos(time)*.5f,time), normalize((vec3)(p.xy,1.0f))),1.0f); + OUTPUT; +} + +#endif diff --git a/kernels/compiler_ribbon.cl b/kernels/compiler_ribbon.cl new file mode 100644 index 0000000..f5729e5 --- /dev/null +++ b/kernels/compiler_ribbon.cl @@ -0,0 +1,87 @@ +typedef float2 vec2; +typedef float3 vec3; +typedef float4 vec4; +#define sin native_sin +#define cos native_cos +#define tan native_tan +#define normalize fast_normalize +#define length fast_length + +inline vec3 reflect(vec3 I, vec3 N) { + return I - 2.0f * dot(N, I) * N; +} + +#define time 1.f + +// Object A (tunnel) +inline float oa(vec3 q) { + return cos(q.x)+cos(q.y*1.5f)+cos(q.z)+cos(q.y*20.f)*.05f; +} + +// Object B (ribbon) +inline float ob(vec3 q) { + return length(max(fabs(q-(vec3)(cos(q.z*1.5f)*.3f,-.5f+cos(q.z)*.2f,.0f))-(vec3)(.125f,.02f,time+3.f),(vec3)(.0f))); +} + +// Scene +inline float o(vec3 q) { return min(oa(q),ob(q)); } + +// Get Normal XXX Not inline by LLVM +__attribute__((always_inline)) vec3 gn(vec3 q) { + const vec3 f = (vec3)(.01f, 0.f, 0.f); + return normalize((vec3)(o(q+f.xyy), + o(q+f.yxy), + o(q+f.yyx))); +} + +inline uint pack_fp4(float4 u4) { + uint u; + u = (((uint) u4.x)) | + (((uint) u4.y) << 8) | + (((uint) u4.z) << 16); + return u; +} + +// XXX vector not supported in function argument yet +__kernel void compiler_ribbon(__global uint *dst, float resx, float resy, int w) +{ + vec2 gl_FragCoord = (vec2)(get_global_id(0), get_global_id(1)); + vec2 p = -1.0f + 2.0f * gl_FragCoord.xy / (vec2)(resx, resy); + p.x *= resx/resy; + + vec4 c = (vec4)(1.0f); + const vec3 org = (vec3)(sin(time)*.5f, + cos(time*.5f)*.25f+.25f, + time); + vec3 dir=normalize((vec3)(p.x*1.6f,p.y,1.0f)); + vec3 q = org, pp; + float d=.0f; + + // First raymarching + for(int i=0;i<64;i++) { + d=o(q); + q+=d*dir; + } + pp=q; + const float f = length(q-org)*0.02f; + + // Second raymarching (reflection) + dir=reflect(dir,gn(q)); + q+=dir; + for(int i=0;i<64;i++) { + d=o(q); + q+=d*dir; + } + c = max(dot(gn(q), (vec3)(0.1f,0.1f,0.0f)), 0.0f) + + (vec4)(0.3f, cos(time*.5f)*.5f+.5f, sin(time*.5f)*.5f+.5f, 1.f) * min(length(q-org)*.04f,1.f); + + // Ribbon Color + if(oa(pp)>ob(pp)) + c = mix(c, (vec4)(cos(time*.3f)*0.5f + 0.5f,cos(time*.2f)*.5f+.5f,sin(time*.3f)*.5f+.5f,1.f),.3f); + + // Final Color + const vec4 color = ((c+(vec4)(f))+(1.f-min(pp.y+1.9f,1.f))*(vec4)(1.f,.8f,.7f,1.f))*min(time*.5f,1.f); + const vec4 final = 255.f * max(min(color, (vec4)(1.f)), (vec4)(0.f)); + dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final); +} + diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt index 0624786..e3daf84 100644 --- a/utests/CMakeLists.txt +++ b/utests/CMakeLists.txt @@ -6,6 +6,8 @@ ADD_LIBRARY(utests SHARED utest_error.c compiler_mandelbrot.cpp compiler_mandelbrot_alternate.cpp + compiler_clod.cpp +# compiler_ribbon.cpp compiler_box_blur_float.cpp compiler_box_blur.cpp compiler_argument_structure.cpp @@ -57,7 +59,7 @@ ADD_LIBRARY(utests SHARED TARGET_LINK_LIBRARIES(utests cl m) ADD_EXECUTABLE(utest_run utest_run.cpp) -TARGET_LINK_LIBRARIES(run utests) +TARGET_LINK_LIBRARIES(utest_run utests) ADD_EXECUTABLE(flat_address_space runtime_flat_address_space.cpp) TARGET_LINK_LIBRARIES(flat_address_space utests) diff --git a/utests/compiler_clod.cpp b/utests/compiler_clod.cpp new file mode 100644 index 0000000..360d197 --- /dev/null +++ b/utests/compiler_clod.cpp @@ -0,0 +1,49 @@ +/* + * Copyright © 2012 Intel Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . + * + * Author: Benjamin Segovia + */ + +#include "utest_helper.hpp" + +static int *dst = NULL; +static const int w = 1024; +static const int h = 1024; + +static void compiler_clod(void) +{ + const size_t global[2] = {size_t(w), size_t(h)}; + const size_t local[2] = {16, 1}; + const size_t sz = w * h * sizeof(char[4]); + const float fx = float(w); + const float fy = float(h); + OCL_CREATE_KERNEL("compiler_clod"); + + cl_mem cl_dst = clCreateBuffer(ctx, 0, sz, NULL, NULL); + OCL_CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &cl_dst); + OCL_CALL (clSetKernelArg, kernel, 1, sizeof(float), &fx); + OCL_CALL (clSetKernelArg, kernel, 2, sizeof(float), &fy); + OCL_CALL (clSetKernelArg, kernel, 3, sizeof(int), &w); + OCL_CALL (clEnqueueNDRangeKernel, queue, kernel, 2, NULL, global, local, 0, NULL, NULL); + dst = (int *) clIntelMapBuffer(cl_dst, NULL); + + cl_write_bmp(dst, w, h, "clod.bmp"); + OCL_CALL (clIntelUnmapBuffer, cl_dst); + OCL_CALL (clReleaseMemObject, cl_dst); +} + +MAKE_UTEST_FROM_FUNCTION(compiler_clod); + diff --git a/utests/compiler_ribbon.cpp b/utests/compiler_ribbon.cpp new file mode 100644 index 0000000..733b8ac --- /dev/null +++ b/utests/compiler_ribbon.cpp @@ -0,0 +1,50 @@ +/* + * Copyright © 2012 Intel Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . + * + * Author: Benjamin Segovia + */ + +#include "utest_helper.hpp" + +static int *dst = NULL; +static const int w = 1024; +static const int h = 1024; + +static void compiler_ribbon(void) +{ + const size_t global[2] = {size_t(w), size_t(h)}; + const size_t local[2] = {16, 1}; + const size_t sz = w * h * sizeof(char[4]); + const float fx = float(w); + const float fy = float(h); + OCL_CREATE_KERNEL("compiler_ribbon"); + + cl_mem cl_dst = clCreateBuffer(ctx, 0, sz, NULL, NULL); + OCL_CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &cl_dst); + OCL_CALL (clSetKernelArg, kernel, 1, sizeof(float), &fx); + OCL_CALL (clSetKernelArg, kernel, 2, sizeof(float), &fy); + OCL_CALL (clSetKernelArg, kernel, 3, sizeof(int), &w); + OCL_CALL (clEnqueueNDRangeKernel, queue, kernel, 2, NULL, global, local, 0, NULL, NULL); + dst = (int *) clIntelMapBuffer(cl_dst, NULL); + + cl_write_bmp(dst, w, h, "ribbon.bmp"); + OCL_CALL (clIntelUnmapBuffer, cl_dst); + OCL_CALL (clReleaseMemObject, cl_dst); +} + +MAKE_UTEST_FROM_FUNCTION(compiler_ribbon); + + -- 2.7.4