From d0c6d8fe26e150ef457b72249aec6a163c841e81 Mon Sep 17 00:00:00 2001 From: Ruiling Song Date: Thu, 10 Oct 2013 15:13:50 +0800 Subject: [PATCH] GBE: Support local variable inside kernel function. As Clang treat local variable in similar way like global constant, (they are treated as Global variable in each own address space) we refine the previous constant implementation in order to share same code between local variable and global constant. We will allocate an address register for each GlobalVariable (constant or local) through calling newRegister(). In later step, through getRegister() we will get a proper register derived from the allocated address register. Signed-off-by: Ruiling Song Reviewed-by: "Yang, Rong R" --- backend/src/backend/context.cpp | 1 + backend/src/backend/program.cpp | 10 ++- backend/src/backend/program.h | 3 + backend/src/backend/program.hpp | 3 + backend/src/ir/function.cpp | 2 +- backend/src/ir/function.hpp | 5 ++ backend/src/llvm/llvm_gen_backend.cpp | 157 +++++++++++++++++++++++----------- kernels/compiler_local_slm.cl | 28 ++++-- src/cl_command_queue_gen7.c | 3 +- utests/CMakeLists.txt | 1 + utests/compiler_local_slm.cpp | 30 ++++++- 11 files changed, 179 insertions(+), 64 deletions(-) diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp index cbd38f1..bc15761 100644 --- a/backend/src/backend/context.cpp +++ b/backend/src/backend/context.cpp @@ -632,6 +632,7 @@ namespace gbe void Context::handleSLM(void) { const bool useSLM = fn.getUseSLM(); kernel->useSLM = useSLM; + kernel->slmSize = fn.getSLMSize(); } bool Context::isScalarReg(const ir::Register ®) const { diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp index 6ba9593..83fc515 100644 --- a/backend/src/backend/program.cpp +++ b/backend/src/backend/program.cpp @@ -75,7 +75,7 @@ namespace gbe { Kernel::Kernel(const std::string &name) : - name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), useSLM(false), ctx(NULL), samplerSet(NULL), imageSet(NULL) + name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), useSLM(false), slmSize(0), ctx(NULL), samplerSet(NULL), imageSet(NULL) {} Kernel::~Kernel(void) { if(ctx) GBE_DELETE(ctx); @@ -714,6 +714,12 @@ namespace gbe { return kernel->getUseSLM() ? 1 : 0; } + static int32_t kernelGetSLMSize(gbe_kernel genKernel) { + if (genKernel == NULL) return 0; + const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel; + return kernel->getSLMSize(); + } + static int32_t kernelSetConstBufSize(gbe_kernel genKernel, uint32_t argID, size_t sz) { if (genKernel == NULL) return -1; gbe::Kernel *kernel = (gbe::Kernel*) genKernel; @@ -781,6 +787,7 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_scratch_size_cb *gbe_kernel_get_scratch_size = GBE_EXPORT_SYMBOL gbe_kernel_set_const_buffer_size_cb *gbe_kernel_set_const_buffer_size = NULL; GBE_EXPORT_SYMBOL gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_group_size = NULL; GBE_EXPORT_SYMBOL gbe_kernel_use_slm_cb *gbe_kernel_use_slm = NULL; +GBE_EXPORT_SYMBOL gbe_kernel_get_slm_size_cb *gbe_kernel_get_slm_size = NULL; GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size = NULL; GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data = NULL; GBE_EXPORT_SYMBOL gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size = NULL; @@ -815,6 +822,7 @@ namespace gbe gbe_kernel_set_const_buffer_size = gbe::kernelSetConstBufSize; gbe_kernel_get_required_work_group_size = gbe::kernelGetRequiredWorkGroupSize; gbe_kernel_use_slm = gbe::kernelUseSLM; + gbe_kernel_get_slm_size = gbe::kernelGetSLMSize; gbe_kernel_get_sampler_size = gbe::kernelGetSamplerSize; gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData; gbe_kernel_get_image_size = gbe::kernelGetImageSize; diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h index 8774344..10fcc49 100644 --- a/backend/src/backend/program.h +++ b/backend/src/backend/program.h @@ -218,6 +218,9 @@ extern gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_ /*! Says if SLM is used. Required to reconfigure the L3 complex */ typedef int32_t (gbe_kernel_use_slm_cb)(gbe_kernel); extern gbe_kernel_use_slm_cb *gbe_kernel_use_slm; +/*! Get slm size needed for kernel local variables */ +typedef int32_t (gbe_kernel_get_slm_size_cb)(gbe_kernel); +extern gbe_kernel_get_slm_size_cb *gbe_kernel_get_slm_size; #ifdef __cplusplus } diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp index 28a792d..895cd01 100644 --- a/backend/src/backend/program.hpp +++ b/backend/src/backend/program.hpp @@ -104,6 +104,8 @@ namespace gbe { INLINE uint32_t getSIMDWidth(void) const { return this->simdWidth; } /*! Says if SLM is needed for it */ INLINE bool getUseSLM(void) const { return this->useSLM; } + /*! get slm size for kernel local variable */ + INLINE uint32_t getSLMSize(void) const { return this->slmSize; } /*! set constant buffer size and return the cb curbe offset */ int32_t setConstBufSize(uint32_t argID, size_t sz) { if(argID >= argNum) return -1; @@ -169,6 +171,7 @@ namespace gbe { uint32_t stackSize; //!< Stack size (may be 0 if unused) uint32_t scratchSize; //!< Scratch memory size (may be 0 if unused) bool useSLM; //!< SLM requires a special HW config + uint32_t slmSize; //!< slm size for kernel variable Context *ctx; //!< Save context after compiler to alloc constant buffer curbe ir::SamplerSet *samplerSet;//!< Copy from the corresponding function. ir::ImageSet *imageSet; //!< Copy from the corresponding function. diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp index 88aae08..c15c292 100644 --- a/backend/src/ir/function.cpp +++ b/backend/src/ir/function.cpp @@ -43,7 +43,7 @@ namespace ir { /////////////////////////////////////////////////////////////////////////// Function::Function(const std::string &name, const Unit &unit, Profile profile) : - name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false) + name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false), slmSize(0) { initProfile(*this); samplerSet = GBE_NEW(SamplerSet); diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp index 6e712cd..3d4733d 100644 --- a/backend/src/ir/function.hpp +++ b/backend/src/ir/function.hpp @@ -301,6 +301,10 @@ namespace ir { INLINE bool getUseSLM(void) const { return this->useSLM; } /*! Change the SLM config for the function */ INLINE bool setUseSLM(bool useSLM) { return this->useSLM = useSLM; } + /*! get SLM size needed for local variable inside kernel function */ + INLINE uint32_t getSLMSize(void) const { return this->slmSize; } + /*! set slm size needed for local variable inside kernel function */ + INLINE void setSLMSize(uint32_t size) { this->slmSize = size; } /*! Get sampler set in this function */ SamplerSet* getSamplerSet(void) const {return samplerSet; } /*! Get image set in this function */ @@ -320,6 +324,7 @@ namespace ir { LocationMap locationMap; //!< Pushed function arguments (loc->reg) uint32_t simdWidth; //!< 8 or 16 if forced, 0 otherwise bool useSLM; //!< Is SLM required? + uint32_t slmSize; //!< local variable size inside kernel function SamplerSet *samplerSet; //!< samplers used in this function. ImageSet* imageSet; //!< Image set in this function's arguments.. GBE_CLASS(Function); //!< Use custom allocator diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index 1fb3fd6..5fb4f49 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -321,7 +321,9 @@ namespace gbe /*! Allocate a new scalar register */ ir::Register newScalar(Value *value, Value *key = NULL, uint32_t index = 0u) { - GBE_ASSERT(dyn_cast(value) == NULL); + // we don't allow normal constant, but GlobalValue is a special case, + // it needs a register to store its address + GBE_ASSERT(! (isa(value) && !isa(value))); Type *type = value->getType(); auto typeID = type->getTypeID(); switch (typeID) { @@ -477,7 +479,8 @@ namespace gbe } virtual bool doFinalization(Module &M) { return false; } - + /*! handle global variable register allocation (local, constant space) */ + void allocateGlobalVariableRegister(Function &F); /*! Emit the complete function code and declaration */ void emitFunction(Function &F); /*! Handle input and output function parameters */ @@ -488,6 +491,8 @@ namespace gbe void emitMovForPHI(BasicBlock *curr, BasicBlock *succ); /*! Alocate one or several registers (if vector) for the value */ INLINE void newRegister(Value *value, Value *key = NULL); + /*! get the register for a llvm::Constant */ + ir::Register getConstantRegister(Constant *c, uint32_t index = 0); /*! Return a valid register from an operand (can use LOADI to make one) */ INLINE ir::Register getRegister(Value *value, uint32_t index = 0); /*! Create a new immediate from a constant */ @@ -838,40 +843,46 @@ namespace gbe }; } - ir::Register GenWriter::getRegister(Value *value, uint32_t elemID) { - //the real value may be constant, so get real value before constant check - regTranslator.getRealValue(value, elemID); + ir::Register GenWriter::getConstantRegister(Constant *c, uint32_t elemID) { + GBE_ASSERT(c != NULL); - if (dyn_cast(value)) { - ConstantExpr *ce = dyn_cast(value); - if(ce->isCast()) { - GBE_ASSERT(ce->getOpcode() == Instruction::PtrToInt); - const Value *pointer = ce->getOperand(0); - GBE_ASSERT(pointer->hasName()); - auto name = pointer->getName().str(); - uint16_t reg = unit.getConstantSet().getConstant(name).getReg(); - return ir::Register(reg); - } + if(isa(c)) { + return regTranslator.getScalar(c, elemID); } - Constant *CPV = dyn_cast(value); - if (CPV) { - if (isa(CPV)) { - auto name = CPV->getName().str(); - uint16_t reg = unit.getConstantSet().getConstant(name).getReg(); - return ir::Register(reg); - } - if (isa(CPV)) { + + if(isa(c)) { + ConstantExpr * ce = dyn_cast(c); + + if(ce->isCast()) { + Value* op = ce->getOperand(0); + ir::Register pointer_reg; + if(isa(op)) { + // try to get the real pointer register, for case like: + // store i64 ptrtoint (i8 addrspace(3)* getelementptr inbounds ... + // in which ptrtoint and getelementptr are ConstantExpr. + pointer_reg = getConstantRegister(dyn_cast(op), elemID); + } else { + pointer_reg = regTranslator.getScalar(op, elemID); + } + // if ptrToInt request another type other than 32bit, convert as requested + ir::Type dstType = getType(ctx, ce->getType()); + if(ce->getOpcode() == Instruction::PtrToInt && ir::TYPE_S32 != dstType) { + ir::Register tmp = ctx.reg(getFamily(dstType)); + ctx.CVT(dstType, ir::TYPE_S32, tmp, pointer_reg); + return tmp; + } + return pointer_reg; + } else { uint32_t TypeIndex; uint32_t constantOffset = 0; uint32_t offset = 0; - ConstantExpr *CE = dyn_cast(CPV); // currently only GetElementPtr is handled - GBE_ASSERT(CE->getOpcode() == Instruction::GetElementPtr); - Value *pointer = CE->getOperand(0); + GBE_ASSERT(ce->getOpcode() == Instruction::GetElementPtr); + Value *pointer = ce->getOperand(0); CompositeType* CompTy = cast(pointer->getType()); - for(uint32_t op=1; opgetNumOperands(); ++op) { - ConstantInt* ConstOP = dyn_cast(CE->getOperand(op)); + for(uint32_t op=1; opgetNumOperands(); ++op) { + ConstantInt* ConstOP = dyn_cast(ce->getOperand(op)); GBE_ASSERT(ConstOP); TypeIndex = ConstOP->getZExtValue(); for(uint32_t ty_i=0; ty_i(CompTy->getTypeAtIndex(TypeIndex)); } - const std::string &pointer_name = pointer->getName().str(); - ir::Register pointer_reg = ir::Register(unit.getConstantSet().getConstant(pointer_name).getReg()); + ir::Register pointer_reg; + pointer_reg = regTranslator.getScalar(pointer, elemID); ir::Register offset_reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD); ctx.LOADI(ir::Type::TYPE_S32, offset_reg, ctx.newIntegerImmediate(constantOffset, ir::Type::TYPE_S32)); ir::Register reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD); ctx.ADD(ir::Type::TYPE_S32, reg, pointer_reg, offset_reg); return reg; } - const ir::ImmediateIndex immIndex = this->newImmediate(CPV, elemID); - const ir::Immediate imm = ctx.getImmediate(immIndex); - const ir::Register reg = ctx.reg(getFamily(imm.type)); - ctx.LOADI(imm.type, reg, immIndex); - return reg; } - else + + const ir::ImmediateIndex immIndex = this->newImmediate(c, elemID); + const ir::Immediate imm = ctx.getImmediate(immIndex); + const ir::Register reg = ctx.reg(getFamily(imm.type)); + ctx.LOADI(imm.type, reg, immIndex); + return reg; + } + + ir::Register GenWriter::getRegister(Value *value, uint32_t elemID) { + //the real value may be constant, so get real value before constant check + regTranslator.getRealValue(value, elemID); + if(isa(value)) { + Constant *c = dyn_cast(value); + return getConstantRegister(c, elemID); + } else return regTranslator.getScalar(value, elemID); } @@ -1273,6 +1293,55 @@ namespace gbe BVAR(OCL_OPTIMIZE_PHI_MOVES, true); BVAR(OCL_OPTIMIZE_LOADI, true); + void GenWriter::allocateGlobalVariableRegister(Function &F) + { + // Allocate a address register for each global variable + const Module::GlobalListType &globalList = TheModule->getGlobalList(); + size_t j = 0; + for(auto i = globalList.begin(); i != globalList.end(); i ++) { + const GlobalVariable &v = *i; + if(!v.isConstantUsed()) continue; + + ir::AddressSpace addrSpace = addressSpaceLLVMToGen(v.getType()->getAddressSpace()); + if(addrSpace == ir::MEM_LOCAL) { + ir::Function &f = ctx.getFunction(); + f.setUseSLM(true); + const Constant *c = v.getInitializer(); + Type *ty = c->getType(); + uint32_t oldSlm = f.getSLMSize(); + uint32_t align = 8 * getAlignmentByte(unit, ty); + uint32_t padding = getPadding(oldSlm*8, align); + + f.setSLMSize(oldSlm + padding/8 + getTypeByteSize(unit, ty)); + const Value * parent = cast(&v); + // local variable can only be used in one kernel function. so, don't need to check its all uses. + // loop through the Constant to find the instruction that use the global variable + do { + Value::const_use_iterator it = parent->use_begin(); + parent = cast(*it); + } while(isa(parent)); + + const Instruction * insn = cast(parent); + const BasicBlock * bb = insn->getParent(); + const Function * func = bb->getParent(); + if(func != &F) continue; + + this->newRegister(const_cast(&v)); + ir::Register reg = regTranslator.getScalar(const_cast(&v), 0); + ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(oldSlm + padding/8, ir::TYPE_S32)); + } else if(addrSpace == ir::MEM_CONSTANT) { + GBE_ASSERT(v.hasInitializer()); + this->newRegister(const_cast(&v)); + ir::Register reg = regTranslator.getScalar(const_cast(&v), 0); + ir::Constant &con = unit.getConstantSet().getConstant(j ++); + ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32)); + } else { + GBE_ASSERT(0); + } + } + + } + void GenWriter::emitFunction(Function &F) { switch (F.getCallingConv()) { @@ -1293,21 +1362,7 @@ namespace gbe this->labelMap.clear(); this->emitFunctionPrototype(F); - // Allocate a virtual register for each global constant array - const Module::GlobalListType &globalList = TheModule->getGlobalList(); - size_t j = 0; - for(auto i = globalList.begin(); i != globalList.end(); i ++) { - const GlobalVariable &v = *i; - unsigned addrSpace = v.getType()->getAddressSpace(); - if(addrSpace != ir::AddressSpace::MEM_CONSTANT) - continue; - GBE_ASSERT(v.hasInitializer()); - ir::Register reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD); - ir::Constant &con = unit.getConstantSet().getConstant(j ++); - con.setReg(reg.value()); - ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32)); - } - + this->allocateGlobalVariableRegister(F); // Visit all the instructions and emit the IR registers or the value to // value mapping when a new register is not needed pass = PASS_EMIT_REGISTERS; diff --git a/kernels/compiler_local_slm.cl b/kernels/compiler_local_slm.cl index 1a4b175..52c078c 100644 --- a/kernels/compiler_local_slm.cl +++ b/kernels/compiler_local_slm.cl @@ -1,10 +1,24 @@ -#if 0 -__kernel void compiler_local_slm(__global int *dst, __local int *hop) { -#else +struct Test{ + char t0; + int t1; +}; + +constant int two= 2; + __kernel void compiler_local_slm(__global int *dst) { - __local int hop[10]; -#endif - hop[get_global_id(0)] = get_local_id(1); - dst[get_global_id(0)] = hop[get_local_id(0)]; + __local int hop[16]; + __local char a; + __local struct Test c; + + c.t1 = get_group_id(0); + a = two;// seems clang currently has a bug if I write 'a=2;' so currently workaroud it. + hop[get_local_id(0)] = get_local_id(0); + barrier(CLK_LOCAL_MEM_FENCE); + dst[get_global_id(0)] = hop[get_local_id(0)] + (int)a + hop[1] + c.t1; } +__kernel void compiler_local_slm1(__global ulong *dst) { + __local int hop[16]; + dst[1] = (ulong)&hop[1]; + dst[0] = (ulong)&hop[0]; +} diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index b85c0cd..be7bcef 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -200,7 +200,8 @@ cl_curbe_fill(cl_kernel ker, } /* Handle the various offsets to SLM */ const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque); - int32_t arg, slm_offset = 0; + /* align so that we kernel argument get good alignment */ + int32_t arg, slm_offset = ALIGN(gbe_kernel_get_slm_size(ker->opaque), 32); for (arg = 0; arg < arg_n; ++arg) { const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg); if (type != GBE_ARG_LOCAL_PTR) diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt index a24c490..daa4d6f 100644 --- a/utests/CMakeLists.txt +++ b/utests/CMakeLists.txt @@ -96,6 +96,7 @@ set (utests_sources compiler_local_memory_barrier.cpp compiler_local_memory_barrier_wg64.cpp compiler_local_memory_barrier_2.cpp + compiler_local_slm.cpp compiler_movforphi_undef.cpp compiler_volatile.cpp compiler_copy_image1.cpp diff --git a/utests/compiler_local_slm.cpp b/utests/compiler_local_slm.cpp index aa9a2fe..48a072f 100644 --- a/utests/compiler_local_slm.cpp +++ b/utests/compiler_local_slm.cpp @@ -2,9 +2,33 @@ void compiler_local_slm(void) { - // Setup kernel and buffers - OCL_CREATE_KERNEL("compiler_local_slm"); + const size_t n = 32; + OCL_CREATE_KERNEL_FROM_FILE("compiler_local_slm", "compiler_local_slm"); + OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL); + OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); + globals[0] = n; + locals[0] = 16; + OCL_NDRANGE(1); + OCL_MAP_BUFFER(0); + for (uint32_t i = 0; i < n; ++i) +// std::cout << ((int32_t*)buf_data[0])[i] << std::endl; + OCL_ASSERT(((int32_t*)buf_data[0])[i] == (i%16 + 2 + 1+ i/16)); + OCL_UNMAP_BUFFER(0); } +void compiler_local_slm1(void) +{ + const size_t n = 2; + OCL_CREATE_KERNEL_FROM_FILE("compiler_local_slm", "compiler_local_slm1"); + OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint64_t), NULL); + OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); + globals[0] = 1; + locals[0] = 1; + OCL_NDRANGE(1); + OCL_MAP_BUFFER(0); + uint64_t * ptr = (uint64_t*)buf_data[0]; + OCL_ASSERT((ptr[1] -ptr[0]) == 4); + OCL_UNMAP_BUFFER(0); +} MAKE_UTEST_FROM_FUNCTION(compiler_local_slm); - +MAKE_UTEST_FROM_FUNCTION(compiler_local_slm1); -- 2.7.4