From 9487593e2ce5268b4b9bfc606fdcd4c6c88401e2 Mon Sep 17 00:00:00 2001 From: Benjamin Segovia Date: Sun, 13 May 2012 17:58:26 +0000 Subject: [PATCH] Added support for loads and stores of uint2/3/4 --- backend/src/backend/gen_context.cpp | 119 +++++++++++++++--------- backend/src/backend/gen_context.hpp | 4 +- backend/src/ir/function.cpp | 1 - backend/src/ocl_stdlib.h | 177 ++++++++++++++++++++++++------------ backend/src/ocl_stdlib_str.cpp | 177 ++++++++++++++++++++++++------------ 5 files changed, 320 insertions(+), 158 deletions(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 52732f7..5613b18 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -118,18 +118,6 @@ namespace gbe } } - uint32_t GenContext::createGenReg(ir::Register reg, uint32_t grfOffset) { - using namespace ir; - if (fn.isSpecialReg(reg) == true) return grfOffset; // already done - if (fn.getInput(reg) != NULL) return grfOffset; // already done - const RegisterData regData = fn.getRegisterData(reg); - const RegisterFamily family = regData.family; - const uint32_t typeSize = familySize[family]; - const uint32_t regSize = simdWidth*typeSize; - grfOffset = ALIGN(grfOffset, regSize); - if (grfOffset + regSize <= GEN_GRF_SIZE) { - const uint32_t nr = grfOffset / GEN_REG_SIZE; - const uint32_t subnr = (grfOffset % GEN_REG_SIZE) / typeSize; #define INSERT_REG(SIMD16, SIMD8, SIMD1) \ if (this->isScalarReg(reg) == true) { \ RA.insert(std::make_pair(reg, GenReg::SIMD1(nr, subnr))); \ @@ -142,6 +130,19 @@ namespace gbe grfOffset += simdWidth * typeSize; \ } else \ NOT_SUPPORTED; + + uint32_t GenContext::createGenReg(ir::Register reg, uint32_t grfOffset) { + using namespace ir; + if (fn.isSpecialReg(reg) == true) return grfOffset; // already done + if (fn.getInput(reg) != NULL) return grfOffset; // already done + const RegisterData regData = fn.getRegisterData(reg); + const RegisterFamily family = regData.family; + const uint32_t typeSize = familySize[family]; + const uint32_t regSize = simdWidth*typeSize; + grfOffset = ALIGN(grfOffset, regSize); + if (grfOffset + regSize <= GEN_GRF_SIZE) { + const uint32_t nr = grfOffset / GEN_REG_SIZE; + const uint32_t subnr = (grfOffset % GEN_REG_SIZE) / typeSize; switch (family) { case FAMILY_BOOL: case FAMILY_WORD: @@ -156,12 +157,13 @@ namespace gbe default: NOT_SUPPORTED; } -#undef INSERT_REG } else NOT_SUPPORTED; return grfOffset; } +#undef INSERT_REG + void GenContext::allocateRegister(void) { using namespace ir; GBE_ASSERT(fn.getProfile() == PROFILE_OCL); @@ -466,7 +468,8 @@ namespace gbe p->CMP(genCmp, src0, src1); p->pop(); - // We emit a very unoptimized code where we store the resulting mask in a GRF + // We emit a very unoptimized code where we store the resulting mask in a + // GRF p->push(); p->curr.flag = 0; p->curr.subFlag = 1; @@ -549,23 +552,46 @@ namespace gbe } } - void GenContext::emitUntypedRead(const ir::LoadInstruction &insn, - GenReg address, - GenReg value) + void GenContext::emitUntypedRead(const ir::LoadInstruction &insn, GenReg address) { using namespace ir; + const uint32_t valueNum = insn.getValueNum(); + GenReg src; GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL); - GBE_ASSERT(insn.getValueNum() == 1); + + // A scalar address register requires to be aligned if (isScalarReg(insn.getAddress()) == true) { + if (this->simdWidth == 8) + src = GenReg::f8grf(112, 0); + else + src = GenReg::f16grf(112, 0); + p->MOV(src, GenReg::retype(address, GEN_TYPE_F)); + } else + src = address; + + // Gather of integer is simpler since we may not need to move the + // destination + if (valueNum == 1) { + const GenReg value = this->genReg(insn.getValue(0)); + p->UNTYPED_READ(value, src, 0, 1); + } + // Right now, we just move everything to registers when dealing with + // int2/3/4 + else { + p->UNTYPED_READ(GenReg::f8grf(114, 0), src, 0, valueNum); if (this->simdWidth == 8) { - p->MOV(GenReg::f8grf(112, 0), GenReg::retype(address, GEN_TYPE_F)); - p->UNTYPED_READ(value, GenReg::f8grf(112, 0), 0, 1); + for (uint32_t value = 0; value < valueNum; ++value) { + const GenReg dst = this->genReg(insn.getValue(value), TYPE_FLOAT); + p->MOV(dst, GenReg::f8grf(114+value, 0)); + } } else if (this->simdWidth == 16) { - p->MOV(GenReg::f16grf(112, 0), GenReg::retype(address, GEN_TYPE_F)); - p->UNTYPED_READ(value, GenReg::f16grf(112, 0), 0, 1); - } - } else - p->UNTYPED_READ(value, address, 0, 1); + for (uint32_t value = 0; value < valueNum; ++value) { + const GenReg dst = this->genReg(insn.getValue(value), TYPE_FLOAT); + p->MOV(dst, GenReg::f16grf(114+2*value, 0)); + } + } else + NOT_SUPPORTED; + } } INLINE uint32_t getByteScatterGatherSize(ir::Type type) { @@ -629,31 +655,37 @@ namespace gbe void GenContext::emitLoadInstruction(const ir::LoadInstruction &insn) { using namespace ir; - const GenReg value = this->genReg(insn.getValue(0)); const GenReg address = this->genReg(insn.getAddress()); GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL); GBE_ASSERT(this->isScalarReg(insn.getValue(0)) == false); if (insn.isAligned() == true) - this->emitUntypedRead(insn, address, value); + this->emitUntypedRead(insn, address); else { + const GenReg value = this->genReg(insn.getValue(0)); this->emitByteGather(insn, address, value); } } - void GenContext::emitUntypedWrite(const ir::StoreInstruction &insn, - GenReg address, - GenReg value) + void GenContext::emitUntypedWrite(const ir::StoreInstruction &insn) { using namespace ir; - if (this->simdWidth == 8) { - p->MOV(GenReg::f8grf(112, 0), GenReg::retype(address, GEN_TYPE_F)); - p->MOV(GenReg::f8grf(113, 0), GenReg::retype(value, GEN_TYPE_F)); - } else if (this->simdWidth == 16) { - p->MOV(GenReg::f16grf(112, 0), GenReg::retype(address, GEN_TYPE_F)); - p->MOV(GenReg::f16grf(114, 0), GenReg::retype(value, GEN_TYPE_F)); - } else + const uint32_t srcNum = insn.getSrcNum(); + const uint32_t valueNum = insn.getValueNum(); + + // We do it stupidly right now. We just move everything to temporaries + if (this->simdWidth == 8) + for (uint32_t src = 0; src < srcNum; ++src) { + const GenReg reg = this->genReg(insn.getSrc(src), TYPE_FLOAT); + p->MOV(GenReg::f8grf(112+src, 0), reg); + } + else if (this->simdWidth == 16) + for (uint32_t src = 0; src < srcNum; ++src) { + const GenReg reg = this->genReg(insn.getSrc(src), TYPE_FLOAT); + p->MOV(GenReg::f16grf(112+2*src, 0), reg); + } + else NOT_IMPLEMENTED; - p->UNTYPED_WRITE(GenReg::f8grf(112, 0), 0, 1); + p->UNTYPED_WRITE(GenReg::f8grf(112, 0), 0, valueNum); } void GenContext::emitByteScatter(const ir::StoreInstruction &insn, @@ -664,6 +696,7 @@ namespace gbe const Type type = insn.getValueType(); const uint32_t elemSize = getByteScatterGatherSize(type); + GBE_ASSERT(insn.getValueNum() == 1); if (this->simdWidth == 8) { p->MOV(GenReg::f8grf(112, 0), GenReg::retype(address, GEN_TYPE_F)); if (elemSize == GEN_BYTE_SCATTER_DWORD) @@ -689,13 +722,13 @@ namespace gbe void GenContext::emitStoreInstruction(const ir::StoreInstruction &insn) { using namespace ir; GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL); - GBE_ASSERT(insn.getValueNum() == 1); - const GenReg address = this->genReg(insn.getAddress()); - const GenReg value = this->genReg(insn.getValue(0)); if (insn.isAligned() == true) - this->emitUntypedWrite(insn, address, value); - else + this->emitUntypedWrite(insn); + else { + const GenReg address = this->genReg(insn.getAddress()); + const GenReg value = this->genReg(insn.getValue(0)); this->emitByteScatter(insn, address, value); + } } void GenContext::emitFenceInstruction(const ir::FenceInstruction &insn) {} diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp index c5b29d6..578086f 100644 --- a/backend/src/backend/gen_context.hpp +++ b/backend/src/backend/gen_context.hpp @@ -82,8 +82,8 @@ namespace gbe /*! It is not natively suppored on Gen. We implement it here */ void emitIntMul32x32(const ir::Instruction &insn, GenReg dst, GenReg src0, GenReg src1); /*! Use untyped writes and reads for everything aligned on 4 bytes */ - void emitUntypedRead(const ir::LoadInstruction &insn, GenReg address, GenReg value); - void emitUntypedWrite(const ir::StoreInstruction &insn, GenReg address, GenReg value); + void emitUntypedRead(const ir::LoadInstruction &insn, GenReg address); + void emitUntypedWrite(const ir::StoreInstruction &insn); /*! Use byte scatters and gathers for everything not aligned on 4 bytes */ void emitByteGather(const ir::LoadInstruction &insn, GenReg address, GenReg value); void emitByteScatter(const ir::StoreInstruction &insn, GenReg address, GenReg value); diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp index a7dafde..bf27f6f 100644 --- a/backend/src/ir/function.cpp +++ b/backend/src/ir/function.cpp @@ -75,7 +75,6 @@ namespace ir { newBra->replace(&insn); }); - std::cout << "blockNum: " << this->blockNum() << std::endl; // Reset the label to block mapping this->labels.resize(last); foreachBlock([&](BasicBlock &bb) { diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h index 5576f30..6b93263 100644 --- a/backend/src/ocl_stdlib.h +++ b/backend/src/ocl_stdlib.h @@ -17,9 +17,50 @@ * Author: Benjamin Segovia */ -#define DECL_INTERNAL_WORK_ITEM_FN(NAME) \ -__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##0(void); \ -__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##1(void); \ +///////////////////////////////////////////////////////////////////////////// +// OpenCL basic types +///////////////////////////////////////////////////////////////////////////// +typedef unsigned int uint; +typedef unsigned int size_t; +typedef float float2 __attribute__((ext_vector_type(2))); +typedef float float3 __attribute__((ext_vector_type(3))); +typedef float float4 __attribute__((ext_vector_type(4))); +typedef float float8 __attribute__((ext_vector_type(8))); +typedef float float16 __attribute__((ext_vector_type(16))); +typedef int int2 __attribute__((ext_vector_type(2))); +typedef int int3 __attribute__((ext_vector_type(3))); +typedef int int4 __attribute__((ext_vector_type(4))); +typedef int int8 __attribute__((ext_vector_type(8))); +typedef int int16 __attribute__((ext_vector_type(16))); +typedef unsigned int uint2 __attribute__((ext_vector_type(2))); +typedef unsigned uint3 __attribute__((ext_vector_type(3))); +typedef unsigned uint4 __attribute__((ext_vector_type(4))); +typedef unsigned uint8 __attribute__((ext_vector_type(8))); +typedef unsigned uint16 __attribute__((ext_vector_type(16))); +typedef bool bool2 __attribute__((ext_vector_type(2))); +typedef bool bool3 __attribute__((ext_vector_type(3))); +typedef bool bool4 __attribute__((ext_vector_type(4))); +typedef bool bool8 __attribute__((ext_vector_type(8))); +typedef bool bool16 __attribute__((ext_vector_type(16))); + +///////////////////////////////////////////////////////////////////////////// +// OpenCL address space +///////////////////////////////////////////////////////////////////////////// +#define __private __attribute__((address_space(0))) +#define __global __attribute__((address_space(1))) +#define __constant __attribute__((address_space(2))) +//#define __local __attribute__((address_space(3))) +#define global __global +//#define local __local +#define constant __constant +#define private __private + +///////////////////////////////////////////////////////////////////////////// +// Work groups and work items functions +///////////////////////////////////////////////////////////////////////////// +#define DECL_INTERNAL_WORK_ITEM_FN(NAME) \ +__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##0(void); \ +__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##1(void); \ __attribute__((pure,const)) unsigned int __gen_ocl_##NAME##2(void); DECL_INTERNAL_WORK_ITEM_FN(get_group_id) DECL_INTERNAL_WORK_ITEM_FN(get_local_id) @@ -28,12 +69,12 @@ DECL_INTERNAL_WORK_ITEM_FN(get_global_size) DECL_INTERNAL_WORK_ITEM_FN(get_num_groups) #undef DECL_INTERNAL_WORK_ITEM_FN -#define DECL_PUBLIC_WORK_ITEM_FN(NAME) \ -inline unsigned NAME(unsigned int dim) { \ - if (dim == 0) return __gen_ocl_##NAME##0(); \ - else if (dim == 1) return __gen_ocl_##NAME##1(); \ - else if (dim == 2) return __gen_ocl_##NAME##2(); \ - else return 0; \ +#define DECL_PUBLIC_WORK_ITEM_FN(NAME) \ +inline unsigned NAME(unsigned int dim) { \ + if (dim == 0) return __gen_ocl_##NAME##0(); \ + else if (dim == 1) return __gen_ocl_##NAME##1(); \ + else if (dim == 2) return __gen_ocl_##NAME##2(); \ + else return 0; \ } DECL_PUBLIC_WORK_ITEM_FN(get_group_id) DECL_PUBLIC_WORK_ITEM_FN(get_local_id) @@ -42,74 +83,98 @@ DECL_PUBLIC_WORK_ITEM_FN(get_global_size) DECL_PUBLIC_WORK_ITEM_FN(get_num_groups) #undef DECL_PUBLIC_WORK_ITEM_FN -inline unsigned int get_global_id(unsigned int dim) { +///////////////////////////////////////////////////////////////////////////// +// Vector loads and stores +///////////////////////////////////////////////////////////////////////////// + +// These loads and stores will use untyped reads and writes, so we can just +// cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue. +// Well we do not care, we do not activate TBAA in the compiler +#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \ +__attribute__((always_inline, overloadable)) \ +inline TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \ + return *(SPACE TYPE##DIM *) (p + DIM * offset); \ +} \ +__attribute__((always_inline, overloadable)) \ +inline void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \ + *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \ +} + +#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \ + DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \ + DECL_UNTYPED_RW_SPACE_N(TYPE, 3, SPACE) \ + DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \ + DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \ + DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE) + +#define DECL_UNTYPED_RW_ALL(TYPE) \ + DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \ + DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \ + DECL_UNTYPED_RW_ALL_SPACE(TYPE, __constant) \ + DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private) + +DECL_UNTYPED_RW_ALL(float) +DECL_UNTYPED_RW_ALL(uint) +DECL_UNTYPED_RW_ALL(int) + +#undef DECL_UNTYPED_RW_ALL +#undef DECL_UNTYPED_RW_ALL_SPACE +#undef DECL_UNTYPED_RW_SPACE_N + +///////////////////////////////////////////////////////////////////////////// +// Arithmetic functions +///////////////////////////////////////////////////////////////////////////// +__attribute__((always_inline)) +inline uint get_global_id(uint dim) { return get_local_id(dim) + get_local_size(dim) * get_group_id(dim); } -__attribute__ ((pure,const,overloadable)) float mad(float a, float b, float c); -__attribute__((overloadable)) inline unsigned select(unsigned src0, unsigned src1, unsigned cond) { +__attribute__ ((pure, const, overloadable)) float mad(float a, float b, float c); +__attribute__((overloadable, always_inline)) +inline uint select(uint src0, uint src1, uint cond) { return cond ? src0 : src1; } -__attribute__((overloadable)) inline int select(int src0, int src1, int cond) { +__attribute__((overloadable, always_inline)) +inline int select(int src0, int src1, int cond) { return cond ? src0 : src1; } -typedef unsigned int uint; -typedef float float2 __attribute__((ext_vector_type(2))); -typedef float float3 __attribute__((ext_vector_type(3))); -typedef float float4 __attribute__((ext_vector_type(4))); -typedef int int2 __attribute__((ext_vector_type(2))); -typedef int int3 __attribute__((ext_vector_type(3))); -typedef int int4 __attribute__((ext_vector_type(4))); -typedef unsigned int uint2 __attribute__((ext_vector_type(2))); -typedef unsigned uint3 __attribute__((ext_vector_type(3))); -typedef unsigned uint4 __attribute__((ext_vector_type(4))); -typedef bool bool2 __attribute__((ext_vector_type(2))); -typedef bool bool3 __attribute__((ext_vector_type(3))); -typedef bool bool4 __attribute__((ext_vector_type(4))); - // This will be optimized out by LLVM and will output LLVM select instructions -#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \ -__attribute__((overloadable)) \ -inline TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \ - TYPE4 dst; \ - const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \ - const TYPE x1 = src1.x; \ - const TYPE y0 = src0.y; \ - const TYPE y1 = src1.y; \ - const TYPE z0 = src0.z; \ - const TYPE z1 = src1.z; \ - const TYPE w0 = src0.w; \ - const TYPE w1 = src1.w; \ - \ - dst.x = (cond.x & MASK) ? x1 : x0; \ - dst.y = (cond.y & MASK) ? y1 : y0; \ - dst.z = (cond.z & MASK) ? z1 : z0; \ - dst.w = (cond.w & MASK) ? w1 : w0; \ - return dst; \ +#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \ +__attribute__((overloadable)) \ +inline TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \ + TYPE4 dst; \ + const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \ + const TYPE x1 = src1.x; \ + const TYPE y0 = src0.y; \ + const TYPE y1 = src1.y; \ + const TYPE z0 = src0.z; \ + const TYPE z1 = src1.z; \ + const TYPE w0 = src0.w; \ + const TYPE w1 = src1.w; \ + dst.x = (cond.x & MASK) ? x1 : x0; \ + dst.y = (cond.y & MASK) ? y1 : y0; \ + dst.z = (cond.z & MASK) ? z1 : z0; \ + dst.w = (cond.w & MASK) ? w1 : w0; \ + return dst; \ } DECL_SELECT4(int4, int, int4, 0x80000000) DECL_SELECT4(float4, float, int4, 0x80000000) #undef DECL_SELECT4 -__attribute__((overloadable,always_inline)) inline float2 mad(float2 a, float2 b, float2 c) { +__attribute__((overloadable,always_inline)) +inline float2 mad(float2 a, float2 b, float2 c) { return (float2)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y)); } -__attribute__((overloadable,always_inline)) inline float3 mad(float3 a, float3 b, float3 c) { +__attribute__((overloadable,always_inline)) +inline float3 mad(float3 a, float3 b, float3 c) { return (float3)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), mad(a.z,b.z,c.z)); } -__attribute__((overloadable,always_inline)) inline float4 mad(float4 a, float4 b, float4 c) { +__attribute__((overloadable,always_inline)) +inline float4 mad(float4 a, float4 b, float4 c) { return (float4)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), mad(a.z,b.z,c.z), mad(a.w,b.w,c.w)); } -#define __private __attribute__((address_space(0))) -#define __global __attribute__((address_space(1))) -#define __constant __attribute__((address_space(2))) -//#define __local __attribute__((address_space(3))) -#define global __global -//#define local __local -#define constant __constant -#define private __private #define NULL ((void*)0) diff --git a/backend/src/ocl_stdlib_str.cpp b/backend/src/ocl_stdlib_str.cpp index 778aa1a..7b37955 100644 --- a/backend/src/ocl_stdlib_str.cpp +++ b/backend/src/ocl_stdlib_str.cpp @@ -20,9 +20,50 @@ #include "string" namespace gbe { std::string ocl_stdlib_str = -"#define DECL_INTERNAL_WORK_ITEM_FN(NAME) \\\n" -"__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##0(void); \\\n" -"__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##1(void); \\\n" +"/////////////////////////////////////////////////////////////////////////////\n" +"// OpenCL basic types\n" +"/////////////////////////////////////////////////////////////////////////////\n" +"typedef unsigned int uint;\n" +"typedef unsigned int size_t;\n" +"typedef float float2 __attribute__((ext_vector_type(2)));\n" +"typedef float float3 __attribute__((ext_vector_type(3)));\n" +"typedef float float4 __attribute__((ext_vector_type(4)));\n" +"typedef float float8 __attribute__((ext_vector_type(8)));\n" +"typedef float float16 __attribute__((ext_vector_type(16)));\n" +"typedef int int2 __attribute__((ext_vector_type(2)));\n" +"typedef int int3 __attribute__((ext_vector_type(3)));\n" +"typedef int int4 __attribute__((ext_vector_type(4)));\n" +"typedef int int8 __attribute__((ext_vector_type(8)));\n" +"typedef int int16 __attribute__((ext_vector_type(16)));\n" +"typedef unsigned int uint2 __attribute__((ext_vector_type(2)));\n" +"typedef unsigned uint3 __attribute__((ext_vector_type(3)));\n" +"typedef unsigned uint4 __attribute__((ext_vector_type(4)));\n" +"typedef unsigned uint8 __attribute__((ext_vector_type(8)));\n" +"typedef unsigned uint16 __attribute__((ext_vector_type(16)));\n" +"typedef bool bool2 __attribute__((ext_vector_type(2)));\n" +"typedef bool bool3 __attribute__((ext_vector_type(3)));\n" +"typedef bool bool4 __attribute__((ext_vector_type(4)));\n" +"typedef bool bool8 __attribute__((ext_vector_type(8)));\n" +"typedef bool bool16 __attribute__((ext_vector_type(16)));\n" +"\n" +"/////////////////////////////////////////////////////////////////////////////\n" +"// OpenCL address space\n" +"/////////////////////////////////////////////////////////////////////////////\n" +"#define __private __attribute__((address_space(0)))\n" +"#define __global __attribute__((address_space(1)))\n" +"#define __constant __attribute__((address_space(2)))\n" +"//#define __local __attribute__((address_space(3)))\n" +"#define global __global\n" +"//#define local __local\n" +"#define constant __constant\n" +"#define private __private\n" +"\n" +"/////////////////////////////////////////////////////////////////////////////\n" +"// Work groups and work items functions\n" +"/////////////////////////////////////////////////////////////////////////////\n" +"#define DECL_INTERNAL_WORK_ITEM_FN(NAME) \\\n" +"__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##0(void); \\\n" +"__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##1(void); \\\n" "__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##2(void);\n" "DECL_INTERNAL_WORK_ITEM_FN(get_group_id)\n" "DECL_INTERNAL_WORK_ITEM_FN(get_local_id)\n" @@ -31,12 +72,12 @@ std::string ocl_stdlib_str = "DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)\n" "#undef DECL_INTERNAL_WORK_ITEM_FN\n" "\n" -"#define DECL_PUBLIC_WORK_ITEM_FN(NAME) \\\n" -"inline unsigned NAME(unsigned int dim) { \\\n" -" if (dim == 0) return __gen_ocl_##NAME##0(); \\\n" -" else if (dim == 1) return __gen_ocl_##NAME##1(); \\\n" -" else if (dim == 2) return __gen_ocl_##NAME##2(); \\\n" -" else return 0; \\\n" +"#define DECL_PUBLIC_WORK_ITEM_FN(NAME) \\\n" +"inline unsigned NAME(unsigned int dim) { \\\n" +" if (dim == 0) return __gen_ocl_##NAME##0(); \\\n" +" else if (dim == 1) return __gen_ocl_##NAME##1(); \\\n" +" else if (dim == 2) return __gen_ocl_##NAME##2(); \\\n" +" else return 0; \\\n" "}\n" "DECL_PUBLIC_WORK_ITEM_FN(get_group_id)\n" "DECL_PUBLIC_WORK_ITEM_FN(get_local_id)\n" @@ -45,75 +86,99 @@ std::string ocl_stdlib_str = "DECL_PUBLIC_WORK_ITEM_FN(get_num_groups)\n" "#undef DECL_PUBLIC_WORK_ITEM_FN\n" "\n" -"inline unsigned int get_global_id(unsigned int dim) {\n" +"/////////////////////////////////////////////////////////////////////////////\n" +"// Vector loads and stores\n" +"/////////////////////////////////////////////////////////////////////////////\n" +"\n" +"// These loads and stores will use untyped reads and writes, so we can just\n" +"// cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue.\n" +"// Well we do not care, we do not activate TBAA in the compiler\n" +"#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \\\n" +"__attribute__((always_inline, overloadable)) \\\n" +"inline TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \\\n" +" return *(SPACE TYPE##DIM *) (p + DIM * offset); \\\n" +"} \\\n" +"__attribute__((always_inline, overloadable)) \\\n" +"inline void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \\\n" +" *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \\\n" +"}\n" +"\n" +"#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \\\n" +" DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \\\n" +" DECL_UNTYPED_RW_SPACE_N(TYPE, 3, SPACE) \\\n" +" DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \\\n" +" DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \\\n" +" DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)\n" +"\n" +"#define DECL_UNTYPED_RW_ALL(TYPE) \\\n" +"DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \\\n" +"DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \\\n" +"DECL_UNTYPED_RW_ALL_SPACE(TYPE, __constant) \\\n" +"DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)\n" +"\n" +"DECL_UNTYPED_RW_ALL(float)\n" +"DECL_UNTYPED_RW_ALL(uint)\n" +"DECL_UNTYPED_RW_ALL(int)\n" +"\n" +"#undef DECL_UNTYPED_RW_ALL\n" +"#undef DECL_UNTYPED_RW_ALL_SPACE\n" +"#undef DECL_UNTYPED_RW_SPACE_N\n" +"\n" +"/////////////////////////////////////////////////////////////////////////////\n" +"// Arithmetic functions\n" +"/////////////////////////////////////////////////////////////////////////////\n" +"__attribute__((always_inline))\n" +"inline uint get_global_id(uint dim) {\n" " return get_local_id(dim) + get_local_size(dim) * get_group_id(dim);\n" "}\n" "\n" -"__attribute__ ((pure,const,overloadable)) float mad(float a, float b, float c);\n" -"__attribute__((overloadable)) inline unsigned select(unsigned src0, unsigned src1, unsigned cond) {\n" +"__attribute__ ((pure, const, overloadable)) float mad(float a, float b, float c);\n" +"__attribute__((overloadable, always_inline))\n" +"inline uint select(uint src0, uint src1, uint cond) {\n" " return cond ? src0 : src1;\n" "}\n" -"__attribute__((overloadable)) inline int select(int src0, int src1, int cond) {\n" +"__attribute__((overloadable, always_inline))\n" +"inline int select(int src0, int src1, int cond) {\n" " return cond ? src0 : src1;\n" "}\n" "\n" -"typedef unsigned int uint;\n" -"typedef float float2 __attribute__((ext_vector_type(2)));\n" -"typedef float float3 __attribute__((ext_vector_type(3)));\n" -"typedef float float4 __attribute__((ext_vector_type(4)));\n" -"typedef int int2 __attribute__((ext_vector_type(2)));\n" -"typedef int int3 __attribute__((ext_vector_type(3)));\n" -"typedef int int4 __attribute__((ext_vector_type(4)));\n" -"typedef unsigned int uint2 __attribute__((ext_vector_type(2)));\n" -"typedef unsigned uint3 __attribute__((ext_vector_type(3)));\n" -"typedef unsigned uint4 __attribute__((ext_vector_type(4)));\n" -"typedef bool bool2 __attribute__((ext_vector_type(2)));\n" -"typedef bool bool3 __attribute__((ext_vector_type(3)));\n" -"typedef bool bool4 __attribute__((ext_vector_type(4)));\n" -"\n" "// This will be optimized out by LLVM and will output LLVM select instructions\n" -"#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \\\n" -"__attribute__((overloadable)) \\\n" -"inline TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \\\n" -" TYPE4 dst; \\\n" -" const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \\\n" -" const TYPE x1 = src1.x; \\\n" -" const TYPE y0 = src0.y; \\\n" -" const TYPE y1 = src1.y; \\\n" -" const TYPE z0 = src0.z; \\\n" -" const TYPE z1 = src1.z; \\\n" -" const TYPE w0 = src0.w; \\\n" -" const TYPE w1 = src1.w; \\\n" -" \\\n" -" dst.x = (cond.x & MASK) ? x1 : x0; \\\n" -" dst.y = (cond.y & MASK) ? y1 : y0; \\\n" -" dst.z = (cond.z & MASK) ? z1 : z0; \\\n" -" dst.w = (cond.w & MASK) ? w1 : w0; \\\n" -" return dst; \\\n" +"#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \\\n" +"__attribute__((overloadable)) \\\n" +"inline TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \\\n" +" TYPE4 dst; \\\n" +" const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \\\n" +" const TYPE x1 = src1.x; \\\n" +" const TYPE y0 = src0.y; \\\n" +" const TYPE y1 = src1.y; \\\n" +" const TYPE z0 = src0.z; \\\n" +" const TYPE z1 = src1.z; \\\n" +" const TYPE w0 = src0.w; \\\n" +" const TYPE w1 = src1.w; \\\n" +" dst.x = (cond.x & MASK) ? x1 : x0; \\\n" +" dst.y = (cond.y & MASK) ? y1 : y0; \\\n" +" dst.z = (cond.z & MASK) ? z1 : z0; \\\n" +" dst.w = (cond.w & MASK) ? w1 : w0; \\\n" +" return dst; \\\n" "}\n" "DECL_SELECT4(int4, int, int4, 0x80000000)\n" "DECL_SELECT4(float4, float, int4, 0x80000000)\n" "#undef DECL_SELECT4\n" "\n" -"__attribute__((overloadable,always_inline)) inline float2 mad(float2 a, float2 b, float2 c) {\n" +"__attribute__((overloadable,always_inline))\n" +"inline float2 mad(float2 a, float2 b, float2 c) {\n" " return (float2)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y));\n" "}\n" -"__attribute__((overloadable,always_inline)) inline float3 mad(float3 a, float3 b, float3 c) {\n" +"__attribute__((overloadable,always_inline))\n" +"inline float3 mad(float3 a, float3 b, float3 c) {\n" " return (float3)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), mad(a.z,b.z,c.z));\n" "}\n" -"__attribute__((overloadable,always_inline)) inline float4 mad(float4 a, float4 b, float4 c) {\n" +"__attribute__((overloadable,always_inline))\n" +"inline float4 mad(float4 a, float4 b, float4 c) {\n" " return (float4)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y),\n" " mad(a.z,b.z,c.z), mad(a.w,b.w,c.w));\n" "}\n" "\n" -"#define __private __attribute__((address_space(0)))\n" -"#define __global __attribute__((address_space(1)))\n" -"#define __constant __attribute__((address_space(2)))\n" -"//#define __local __attribute__((address_space(3)))\n" -"#define global __global\n" -"//#define local __local\n" -"#define constant __constant\n" -"#define private __private\n" "\n" "#define NULL ((void*)0)\n" ; -- 2.7.4