From: Zhigang Gong Date: Wed, 25 Sep 2013 10:26:49 +0000 (+0800) Subject: GBE/Runtime: implement workaround for IVB sampler bug X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b835433189a3dca202e7fc7d9ff0bfbc49676281;p=contrib%2Fbeignet.git GBE/Runtime: implement workaround for IVB sampler bug Per the IVB spec, if the surface format of the associated surface is UINT or SINT, the Surface Type cannot be SURFTYPE_3D or SURFTYPE_CUBE and the Address Control Mode cannot be CLAMP_BORDER or HALF_BORDER. Besides this bug, there is another undocumented issue: if a surface data type is IEEE float, then when we use the sampler to sample a pixel and the value is between -0x1p-20 and 0, the sampler will round it to zero. This also causes problems when we are using the clamp mode. This patch works around the above two hardware issues. It introduces a new intrinsic, get_sampler_info, to get a sampler's type at runtime. When read_image is called, it checks whether it hits either of the above two cases. If it hits case 1, then we force it to use clamp to edge for those pixels within the box, and for those pixels outside of the box, we manually set the border color. To achieve this solution, we have to prepare two sampler slots for each CL_ADDRESS_CLAMP sampler. The first occupies slot_1 and uses CL_ADDRESS_CLAMP; the second uses slot_1 + 8. Thus we can only use half of the 16 samplers. Fortunately, 8 samplers comply with OpenCL's minimum requirement. If it hits case 2, then we subtract an epsilon from the coordinate so that it does not round to zero. If possible, programmers should avoid using float coordinates and/or int/uint format images. Otherwise, they will hit the very slow path. With this workaround, the compiler_copy_image1 test can pass now. 
Signed-off-by: Zhigang Gong Reviewed-by: "Yang, Rong R" --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 09e910c..3d18f50 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,6 +51,9 @@ ELSE (EMULATE_IVB) ADD_DEFINITIONS(-DEMULATE_GEN=0) ENDIF (EMULATE_HSW) +# XXX now hard coded to enable the clamp to border workaround for IVB. +ADD_DEFINITIONS(-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND) + IF (USE_FULSIM) ADD_DEFINITIONS(-DUSE_FULSIM=1) ELSE (USE_FULSIM) diff --git a/backend/CMakeLists.txt b/backend/CMakeLists.txt index 8622f3e..476c6f2 100644 --- a/backend/CMakeLists.txt +++ b/backend/CMakeLists.txt @@ -34,7 +34,7 @@ else (GBE_DEBUG_MEMORY) endif (GBE_DEBUG_MEMORY) # Hide all symbols and allows the symbols declared as visible to be exported -set (CMAKE_C_CXX_FLAGS "-fvisibility=hidden") +set (CMAKE_C_CXX_FLAGS "-fvisibility=hidden ${CMAKE_C_CXX_FLAGS}") if (COMPILER STREQUAL "GCC") set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -funroll-loops -Wstrict-aliasing=2 -fstrict-aliasing -msse2 -msse3 -mssse3 -msse4.1 -fPIC -Wall") diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt index 5ef1234..36bf688 100644 --- a/backend/src/CMakeLists.txt +++ b/backend/src/CMakeLists.txt @@ -3,6 +3,7 @@ set (ocl_vector_file ${GBE_SOURCE_DIR}/src/ocl_vector.h) set (ocl_as_file ${GBE_SOURCE_DIR}/src/ocl_as.h) set (ocl_convert_file ${GBE_SOURCE_DIR}/src/ocl_convert.h) set (ocl_stdlib_tmpl_file ${GBE_SOURCE_DIR}/src/ocl_stdlib.tmpl.h) +set (ocl_common_header_file ${GBE_SOURCE_DIR}/src/ocl_common_defines.h) set (ocl_blob_file ${CMAKE_CURRENT_BINARY_DIR}/ocl_stdlib.h) set (ocl_blob_cpp_file ${GBE_SOURCE_DIR}/src/ocl_stdlib_str.cpp) set (ocl_gen_blob_cmd ${GBE_SOURCE_DIR}/src/update_blob_ocl_header.py) @@ -32,7 +33,7 @@ add_custom_command( add_custom_command( OUTPUT ${ocl_blob_file} COMMAND ${PYTHON_EXECUTABLE} ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_blob_file} - DEPENDS ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_vector_file} ${ocl_as_file} 
${ocl_convert_file}) + DEPENDS ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_common_header_file} ${ocl_vector_file} ${ocl_as_file} ${ocl_convert_file}) set (pch_object ${ocl_blob_file}.pch) @@ -46,7 +47,7 @@ else (LLVM_VERSION_NODOT VERSION_GREATER 32) set (clang_cmd -cc1 -x cl -triple ptx32 -emit-pch) endif (LLVM_VERSION_NODOT VERSION_GREATER 31) endif (LLVM_VERSION_NODOT VERSION_GREATER 32) -set (clang_cmd ${clang_cmd} -fno-builtin) +set (clang_cmd ${clang_cmd} -fno-builtin -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND) add_custom_command( OUTPUT ${pch_object} diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp index ac3a243..a55ef04 100644 --- a/backend/src/backend/context.cpp +++ b/backend/src/backend/context.cpp @@ -315,10 +315,12 @@ namespace gbe GBE_DELETE(this->kernel); this->kernel = NULL; } - if(this->kernel != NULL) + if(this->kernel != NULL) { + // Align it on 32 bytes properly + this->kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE); this->kernel->scratchSize = alignScratchSize(this->scratchOffset); - if(this->kernel != NULL) this->kernel->ctx = this; + } return this->kernel; } @@ -419,9 +421,11 @@ namespace gbe this->newCurbeEntry(GBE_CURBE_LOCAL_ID_X, 0, localIDSize); this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Y, 0, localIDSize); this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Z, 0, localIDSize); + this->newCurbeEntry(GBE_CURBE_SAMPLER_INFO, 0, 32); specialRegs.insert(ir::ocl::lid0); specialRegs.insert(ir::ocl::lid1); specialRegs.insert(ir::ocl::lid2); + specialRegs.insert(ir::ocl::samplerinfo); // Go over all the instructions and find the special register we need // to push @@ -470,7 +474,6 @@ namespace gbe // research faster std::sort(kernel->patches.begin(), kernel->patches.end()); - // Align it on 32 bytes properly kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE); } diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 3e539a2..d9ea7ff 100644 --- 
a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -2755,6 +2755,13 @@ namespace gbe using namespace ir; GenRegister msgPayloads[4]; GenRegister dst[insn.getDstNum()], src[insn.getSrcNum() - 2]; + uint32_t srcNum = insn.getSrcNum(); + uint32_t samplerOffset = 0; + if (srcNum == 6) { + /* We have the clamp border workaround. */ + samplerOffset = insn.getSrc(srcNum - 1).value() * 8; + srcNum--; + } for( int i = 0; i < 4; ++i) msgPayloads[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); @@ -2762,15 +2769,15 @@ namespace gbe for (uint32_t valueID = 0; valueID < insn.getDstNum(); ++valueID) dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType()); - for (uint32_t valueID = 0; valueID < insn.getSrcNum() - 2; ++valueID) + for (uint32_t valueID = 0; valueID < srcNum - 2; ++valueID) src[valueID] = sel.selReg(insn.getSrc(valueID + 2), insn.getSrcType()); uint32_t bti = sel.ctx.getFunction().getImageSet()->getIdx (insn.getSrc(SampleInstruction::SURFACE_BTI)); uint32_t sampler = sel.ctx.getFunction().getSamplerSet()->getIdx - (insn.getSrc(SampleInstruction::SAMPLER_BTI)); + (insn.getSrc(SampleInstruction::SAMPLER_BTI)) + samplerOffset; - sel.SAMPLE(dst, insn.getDstNum(), src, insn.getSrcNum() - 2, msgPayloads, 4, bti, sampler); + sel.SAMPLE(dst, insn.getDstNum(), src, srcNum - 2, msgPayloads, 4, bti, sampler); return true; } DECL_CTOR(SampleInstruction, 1, 1); @@ -2793,7 +2800,7 @@ namespace gbe msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); // u, v, w coords should use coord type. - for (; valueID < 1 + coordNum; ++valueID) + for (; valueID < coordNum; ++valueID) src[valueID] = sel.selReg(insn.getSrc(valueID + 1), insn.getCoordType()); for (; (valueID + 1) < insn.getSrcNum(); ++valueID) @@ -2826,6 +2833,22 @@ namespace gbe DECL_CTOR(GetImageInfoInstruction, 1, 1); }; + /*! get sampler info instruction pattern. 
*/ + DECL_PATTERN(GetSamplerInfoInstruction) + { + INLINE bool emitOne(Selection::Opaque &sel, const ir::GetSamplerInfoInstruction &insn) const + { + using namespace ir; + GenRegister dst, src; + dst = sel.selReg(insn.getDst(0), TYPE_U16); + src = GenRegister::offset(GenRegister::uw1grf(ocl::samplerinfo), 0, sel.ctx.getFunction().getSamplerSet()->getIdx(insn.getSrc(0)) * 2); + src.subphysical = 1; + sel.MOV(dst, src); + return true; + } + DECL_CTOR(GetSamplerInfoInstruction, 1, 1); + }; + /*! Branch instruction pattern */ DECL_PATTERN(BranchInstruction) { @@ -3000,6 +3023,7 @@ namespace gbe this->insert(); this->insert(); this->insert(); + this->insert(); // Sort all the patterns with the number of instructions they output for (uint32_t op = 0; op < ir::OP_INVALID; ++op) diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp index 2abfb12..a9132df 100644 --- a/backend/src/backend/gen_reg_allocation.cpp +++ b/backend/src/backend/gen_reg_allocation.cpp @@ -568,6 +568,7 @@ namespace gbe allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Y, ocl::goffset1); allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Z, ocl::goffset2); allocatePayloadReg(GBE_CURBE_WORK_DIM, ocl::workdim); + allocatePayloadReg(GBE_CURBE_SAMPLER_INFO, ocl::samplerinfo); allocatePayloadReg(GBE_CURBE_GROUP_NUM_X, ocl::numgroup0); allocatePayloadReg(GBE_CURBE_GROUP_NUM_Y, ocl::numgroup1); allocatePayloadReg(GBE_CURBE_GROUP_NUM_Z, ocl::numgroup2); @@ -753,7 +754,8 @@ namespace gbe } GBE_ASSERT(RA.contains(reg.reg()) != false); const uint32_t grfOffset = RA.find(reg.reg())->second; - const GenRegister dst = setGenReg(reg, grfOffset); + const uint32_t suboffset = reg.subphysical ? 
reg.subnr : 0; + const GenRegister dst = setGenReg(reg, grfOffset + suboffset); if (reg.quarter != 0) return GenRegister::Qn(dst, reg.quarter); else diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp index ddf53a2..538f16c 100644 --- a/backend/src/backend/gen_register.hpp +++ b/backend/src/backend/gen_register.hpp @@ -235,6 +235,7 @@ namespace gbe uint32_t nr:8; //!< Just for some physical registers (acc, null) uint32_t subnr:8; //!< Idem uint32_t physical:1; //!< 1 if physical, 0 otherwise + uint32_t subphysical:1;//!< 1 if subnr is physical, 0 otherwise uint32_t type:4; //!< Gen type uint32_t file:2; //!< Register file uint32_t negation:1; //!< For source diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp index 68bb17e..ffd31d9 100644 --- a/backend/src/backend/program.cpp +++ b/backend/src/backend/program.cpp @@ -468,7 +468,9 @@ namespace gbe { useless.push_back(str); args.push_back(str.c_str()); } - +#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND + args.push_back("-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND"); +#endif args.push_back("-emit-llvm"); // XXX we haven't implement those builtin functions, // so disable it currently. 
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h index ff4d157..8774344 100644 --- a/backend/src/backend/program.h +++ b/backend/src/backend/program.h @@ -70,6 +70,7 @@ enum gbe_curbe_type { GBE_CURBE_GROUP_NUM_Y, GBE_CURBE_GROUP_NUM_Z, GBE_CURBE_WORK_DIM, + GBE_CURBE_SAMPLER_INFO, GBE_CURBE_IMAGE_INFO, GBE_CURBE_STACK_POINTER, GBE_CURBE_KERNEL_ARGUMENT, diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp index b3b9e10..0278bc6 100644 --- a/backend/src/ir/instruction.cpp +++ b/backend/src/ir/instruction.cpp @@ -486,7 +486,7 @@ namespace ir { INLINE Type getSrcType(void) const { return this->srcType; } INLINE Type getDstType(void) const { return this->dstType; } - static const uint32_t srcNum = 5; + static const uint32_t srcNum = 6; static const uint32_t dstNum = 4; }; @@ -528,6 +528,32 @@ namespace ir { Register dst[0]; //!< No dest register }; + class ALIGNED_INSTRUCTION GetSamplerInfoInstruction : + public BasePolicy, + public NSrcPolicy, + public NDstPolicy + { + public: + GetSamplerInfoInstruction( Register dst, + Register src) + { + this->opcode = OP_GET_SAMPLER_INFO; + this->dst[0] = dst; + this->src[0] = src; + } + + INLINE bool wellFormed(const Function &fn, std::string &why) const; + INLINE void out(std::ostream &out, const Function &fn) const { + this->outOpcode(out); + out << " sampler id %" << this->getSrc(fn, 0) + << " %" << this->getDst(fn, 0); + } + + Register src[1]; //!< Surface to get info + Register dst[1]; //!< return value + static const uint32_t dstNum = 1; + }; + class ALIGNED_INSTRUCTION GetImageInfoInstruction : public BasePolicy, public NSrcPolicy, @@ -886,6 +912,9 @@ namespace ir { { return true; } INLINE bool GetImageInfoInstruction::wellFormed(const Function &fn, std::string &why) const { return true; } + INLINE bool GetSamplerInfoInstruction::wellFormed(const Function &fn, std::string &why) const + { return true; } + // Ensure that types and register family match INLINE bool 
LoadImmInstruction::wellFormed(const Function &fn, std::string &whyNot) const @@ -1144,6 +1173,10 @@ START_INTROSPECTION(GetImageInfoInstruction) #include "ir/instruction.hxx" END_INTROSPECTION(GetImageInfoInstruction) +START_INTROSPECTION(GetSamplerInfoInstruction) +#include "ir/instruction.hxx" +END_INTROSPECTION(GetSamplerInfoInstruction) + START_INTROSPECTION(LoadImmInstruction) #include "ir/instruction.hxx" END_INTROSPECTION(LoadImmInstruction) @@ -1499,6 +1532,10 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType()) return internal::GetImageInfoInstruction(infoType, dst, src).convert(); } + Instruction GET_SAMPLER_INFO(Register dst, Register src) { + return internal::GetSamplerInfoInstruction(dst, src).convert(); + } + std::ostream &operator<< (std::ostream &out, const Instruction &insn) { const Function &fn = insn.getFunction(); switch (insn.getOpcode()) { diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp index 0f7df58..3697c17 100644 --- a/backend/src/ir/instruction.hpp +++ b/backend/src/ir/instruction.hpp @@ -365,6 +365,7 @@ namespace ir { }; uint32_t data; } ImageInfoKey; + /*! Get image information */ class GetImageInfoInstruction : public Instruction { public: @@ -399,6 +400,14 @@ namespace ir { static bool isClassOf(const Instruction &insn); }; + /*! Get image information */ + class GetSamplerInfoInstruction : public Instruction { + public: + + /*! Return true if the given instruction is an instance of this class */ + static bool isClassOf(const Instruction &insn); + }; + /*! Branch instruction is the unified way to branch (with or without * predicate) */ @@ -636,6 +645,8 @@ namespace ir { Instruction SAMPLE(Tuple dst, Tuple src, Type dstType, Type srcType); /*! get image information , such as width/height/depth/... */ Instruction GET_IMAGE_INFO(int infoType, Tuple dst, Register src); + /*! get sampler information */ + Instruction GET_SAMPLER_INFO(Register dst, Register src); /*! 
label labelIndex */ Instruction LABEL(LabelIndex labelIndex); diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx index f3f2db6..1a9f867 100644 --- a/backend/src/ir/instruction.hxx +++ b/backend/src/ir/instruction.hxx @@ -72,6 +72,7 @@ DECL_INSN(SAMPLE, SampleInstruction) DECL_INSN(SYNC, SyncInstruction) DECL_INSN(LABEL, LabelInstruction) DECL_INSN(GET_IMAGE_INFO, GetImageInfoInstruction) +DECL_INSN(GET_SAMPLER_INFO, GetSamplerInfoInstruction) DECL_INSN(MUL_HI, BinaryInstruction) DECL_INSN(I64_MUL_HI, BinaryInstruction) DECL_INSN(FBH, UnaryInstruction) diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp index 927e43d..10e0c59 100644 --- a/backend/src/ir/profile.cpp +++ b/backend/src/ir/profile.cpp @@ -40,7 +40,7 @@ namespace ir { "stack_pointer", "block_ip", "barrier_id", "thread_number", - "work_dimension", + "work_dimension", "sampler_info" }; #if GBE_DEBUG @@ -76,6 +76,7 @@ namespace ir { DECL_NEW_REG(FAMILY_DWORD, barrierid); DECL_NEW_REG(FAMILY_DWORD, threadn); DECL_NEW_REG(FAMILY_DWORD, workdim); + DECL_NEW_REG(FAMILY_WORD, samplerinfo); } #undef DECL_NEW_REG diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp index c79bc3b..89dd69f 100644 --- a/backend/src/ir/profile.hpp +++ b/backend/src/ir/profile.hpp @@ -64,7 +64,8 @@ namespace ir { static const Register barrierid = Register(20);// barrierid static const Register threadn = Register(21); // number of threads static const Register workdim = Register(22); // work dimention. - static const uint32_t regNum = 23; // number of special registers + static const Register samplerinfo = Register(23); // store sampler info. + static const uint32_t regNum = 24; // number of special registers extern const char *specialRegMean[]; // special register name. 
} /* namespace ocl */ diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index 05b5874..27263f8 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -540,6 +540,8 @@ namespace gbe // Emit unary instructions from gen native function void emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode); + ir::Register appendSampler(CallSite::arg_iterator AI); + // These instructions are not supported at all void visitVAArgInst(VAArgInst &I) {NOT_SUPPORTED;} void visitSwitchInst(SwitchInst &I) {NOT_SUPPORTED;} @@ -1809,6 +1811,7 @@ namespace gbe case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE: case GEN_OCL_GET_IMAGE_CHANNEL_ORDER: case GEN_OCL_GET_IMAGE_DEPTH: + case GEN_OCL_GET_SAMPLER_INFO: case GEN_OCL_ATOMIC_ADD0: case GEN_OCL_ATOMIC_ADD1: case GEN_OCL_ATOMIC_SUB0: @@ -1952,6 +1955,25 @@ namespace gbe ctx.ATOMIC(opcode, dst, addrSpace, srcTuple); } + /* append a new sampler. should be called before any reference to + * a sampler_t value. */ + ir::Register GenWriter::appendSampler(CallSite::arg_iterator AI) { + Constant *CPV = dyn_cast(*AI); + ir::Register sampler; + if (CPV != NULL) + { + // This is not a kernel argument sampler, we need to append it to sampler set, + // and allocate a sampler slot for it. 
+ auto x = processConstant(CPV, InsertExtractFunctor(ctx)); + GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type"); + sampler = ctx.getFunction().getSamplerSet()->append(x.data.u32, &ctx); + } else { + sampler = this->getRegister(*AI); + ctx.getFunction().getSamplerSet()->append(sampler, &ctx); + } + return sampler; + } + void GenWriter::emitCallInst(CallInst &I) { if (Function *F = I.getCalledFunction()) { if (F->getIntrinsicID() != 0) { @@ -2092,6 +2114,14 @@ namespace gbe ctx.GET_IMAGE_INFO(infoType, dstTuple, surface_id); break; } + case GEN_OCL_GET_SAMPLER_INFO: + { + GBE_ASSERT(AI != AE); + const ir::Register sampler = this->appendSampler(AI); ++AI; + const ir::Register reg = this->getRegister(&I, 0); + ctx.GET_SAMPLER_INFO(reg, sampler); + break; + } case GEN_OCL_READ_IMAGE0: case GEN_OCL_READ_IMAGE1: case GEN_OCL_READ_IMAGE2: @@ -2107,19 +2137,7 @@ namespace gbe { GBE_ASSERT(AI != AE); const ir::Register surface_id = this->getRegister(*AI); ++AI; GBE_ASSERT(AI != AE); - Constant *CPV = dyn_cast(*AI); - ir::Register sampler; - if (CPV != NULL) - { - // This is not a kernel argument sampler, we need to append it to sampler set, - // and allocate a sampler slot for it. 
- auto x = processConstant(CPV, InsertExtractFunctor(ctx)); - GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type"); - sampler = ctx.getFunction().getSamplerSet()->append(x.data.u32, &ctx); - } else { - sampler = this->getRegister(*AI); - ctx.getFunction().getSamplerSet()->append(sampler, &ctx); - } + const ir::Register sampler = this->appendSampler(AI); ++AI; GBE_ASSERT(AI != AE); const ir::Register ucoord = this->getRegister(*AI); ++AI; @@ -2141,8 +2159,19 @@ namespace gbe srcTupleData.push_back(ucoord); srcTupleData.push_back(vcoord); srcTupleData.push_back(wcoord); +#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND + GBE_ASSERT(AI != AE); Constant *CPV = dyn_cast(*AI); + assert(CPV); + auto x = processConstant(CPV, InsertExtractFunctor(ctx)); + GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type"); + ir::Register offsetReg(x.data.u32); + srcTupleData.push_back(offsetReg); +#else + ir::Register offsetReg(0); +#endif + srcTupleData.push_back(offsetReg); const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], elemNum); - const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 5); + const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 6); ir::Type srcType = ir::TYPE_S32, dstType = ir::TYPE_U32; diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx index 5ea879c..321fc4e 100644 --- a/backend/src/llvm/llvm_gen_ocl_function.hxx +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx @@ -42,19 +42,19 @@ DECL_LLVM_GEN_FUNCTION(FORCE_SIMD8, __gen_ocl_force_simd8) DECL_LLVM_GEN_FUNCTION(FORCE_SIMD16, __gen_ocl_force_simd16) // To read_image functions. 
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE0, _Z21__gen_ocl_read_imageijjii) -DECL_LLVM_GEN_FUNCTION(READ_IMAGE1, _Z21__gen_ocl_read_imageijjff) -DECL_LLVM_GEN_FUNCTION(READ_IMAGE2, _Z22__gen_ocl_read_imageuijjii) -DECL_LLVM_GEN_FUNCTION(READ_IMAGE3, _Z22__gen_ocl_read_imageuijjff) -DECL_LLVM_GEN_FUNCTION(READ_IMAGE4, _Z21__gen_ocl_read_imagefjjii) -DECL_LLVM_GEN_FUNCTION(READ_IMAGE5, _Z21__gen_ocl_read_imagefjjff) +DECL_LLVM_GEN_FUNCTION(READ_IMAGE0, _Z21__gen_ocl_read_imageijjiij) +DECL_LLVM_GEN_FUNCTION(READ_IMAGE1, _Z21__gen_ocl_read_imageijjffj) +DECL_LLVM_GEN_FUNCTION(READ_IMAGE2, _Z22__gen_ocl_read_imageuijjiij) +DECL_LLVM_GEN_FUNCTION(READ_IMAGE3, _Z22__gen_ocl_read_imageuijjffj) +DECL_LLVM_GEN_FUNCTION(READ_IMAGE4, _Z21__gen_ocl_read_imagefjjiij) +DECL_LLVM_GEN_FUNCTION(READ_IMAGE5, _Z21__gen_ocl_read_imagefjjffj) -DECL_LLVM_GEN_FUNCTION(READ_IMAGE10, _Z21__gen_ocl_read_imageijjiii) -DECL_LLVM_GEN_FUNCTION(READ_IMAGE11, _Z21__gen_ocl_read_imageijjfff) -DECL_LLVM_GEN_FUNCTION(READ_IMAGE12, _Z22__gen_ocl_read_imageuijjiii) -DECL_LLVM_GEN_FUNCTION(READ_IMAGE13, _Z22__gen_ocl_read_imageuijjfff) -DECL_LLVM_GEN_FUNCTION(READ_IMAGE14, _Z21__gen_ocl_read_imagefjjiii) -DECL_LLVM_GEN_FUNCTION(READ_IMAGE15, _Z21__gen_ocl_read_imagefjjfff) +DECL_LLVM_GEN_FUNCTION(READ_IMAGE10, _Z21__gen_ocl_read_imageijjiiij) +DECL_LLVM_GEN_FUNCTION(READ_IMAGE11, _Z21__gen_ocl_read_imageijjfffj) +DECL_LLVM_GEN_FUNCTION(READ_IMAGE12, _Z22__gen_ocl_read_imageuijjiiij) +DECL_LLVM_GEN_FUNCTION(READ_IMAGE13, _Z22__gen_ocl_read_imageuijjfffj) +DECL_LLVM_GEN_FUNCTION(READ_IMAGE14, _Z21__gen_ocl_read_imagefjjiiij) +DECL_LLVM_GEN_FUNCTION(READ_IMAGE15, _Z21__gen_ocl_read_imagefjjfffj) // To write_image functions. 
DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE0, _Z22__gen_ocl_write_imageijiiDv4_i) @@ -143,3 +143,6 @@ DECL_LLVM_GEN_FUNCTION(I64RHADD, _Z15__gen_ocl_rhaddmm) DECL_LLVM_GEN_FUNCTION(UPSAMPLE_SHORT, _Z18__gen_ocl_upsampless) DECL_LLVM_GEN_FUNCTION(UPSAMPLE_INT, _Z18__gen_ocl_upsampleii) DECL_LLVM_GEN_FUNCTION(UPSAMPLE_LONG, _Z18__gen_ocl_upsamplell) + +// get sampler info +DECL_LLVM_GEN_FUNCTION(GET_SAMPLER_INFO, __gen_ocl_get_sampler_info) diff --git a/backend/src/ocl_common_defines.h b/backend/src/ocl_common_defines.h index 1ea150b..b736a88 100644 --- a/backend/src/ocl_common_defines.h +++ b/backend/src/ocl_common_defines.h @@ -4,6 +4,7 @@ // // Common defines for Image intrinsics // Channel order +#define CLK_HAS_ALPHA(color) (color == CLK_A || color == CLK_RA || color == CLK_RGBA || color == CLK_BGRA || color == CLK_ARGB) enum { CLK_R = 0x10B0, CLK_A = 0x10B1, @@ -66,54 +67,52 @@ typedef enum clk_channel_type { typedef enum clk_sampler_type { __CLK_ADDRESS_BASE = 0, - CLK_ADDRESS_NONE = 0 << __CLK_ADDRESS_BASE, - CLK_ADDRESS_CLAMP = 1 << __CLK_ADDRESS_BASE, - CLK_ADDRESS_CLAMP_TO_EDGE = 2 << __CLK_ADDRESS_BASE, - CLK_ADDRESS_REPEAT = 3 << __CLK_ADDRESS_BASE, - CLK_ADDRESS_MIRROR = 4 << __CLK_ADDRESS_BASE, + CLK_ADDRESS_NONE = (0 << __CLK_ADDRESS_BASE), + CLK_ADDRESS_CLAMP = (1 << __CLK_ADDRESS_BASE), + CLK_ADDRESS_CLAMP_TO_EDGE = (2 << __CLK_ADDRESS_BASE), + CLK_ADDRESS_REPEAT = (3 << __CLK_ADDRESS_BASE), + CLK_ADDRESS_MIRROR = (4 << __CLK_ADDRESS_BASE), #if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1) CLK_ADDRESS_MIRRORED_REPEAT = CLK_ADDRESS_MIRROR, #endif - __CLK_ADDRESS_MASK = CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP | + __CLK_ADDRESS_MASK = (CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP | CLK_ADDRESS_CLAMP_TO_EDGE | - CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR, + CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR), __CLK_ADDRESS_BITS = 3, // number of bits required to // represent address info __CLK_NORMALIZED_BASE = __CLK_ADDRESS_BITS, CLK_NORMALIZED_COORDS_FALSE = 0, - 
CLK_NORMALIZED_COORDS_TRUE = 1 << __CLK_NORMALIZED_BASE, - __CLK_NORMALIZED_MASK = CLK_NORMALIZED_COORDS_FALSE | - CLK_NORMALIZED_COORDS_TRUE, + CLK_NORMALIZED_COORDS_TRUE = (1 << __CLK_NORMALIZED_BASE), + __CLK_NORMALIZED_MASK = (CLK_NORMALIZED_COORDS_FALSE | + CLK_NORMALIZED_COORDS_TRUE), __CLK_NORMALIZED_BITS = 1, // number of bits required to // represent normalization - - __CLK_FILTER_BASE = __CLK_NORMALIZED_BASE + - __CLK_NORMALIZED_BITS, - CLK_FILTER_NEAREST = 0 << __CLK_FILTER_BASE, - CLK_FILTER_LINEAR = 1 << __CLK_FILTER_BASE, - CLK_FILTER_ANISOTROPIC = 2 << __CLK_FILTER_BASE, - __CLK_FILTER_MASK = CLK_FILTER_NEAREST | CLK_FILTER_LINEAR | - CLK_FILTER_ANISOTROPIC, + __CLK_FILTER_BASE = (__CLK_NORMALIZED_BASE + __CLK_NORMALIZED_BITS), + CLK_FILTER_NEAREST = (0 << __CLK_FILTER_BASE), + CLK_FILTER_LINEAR = (1 << __CLK_FILTER_BASE), + CLK_FILTER_ANISOTROPIC = (2 << __CLK_FILTER_BASE), + __CLK_FILTER_MASK = (CLK_FILTER_NEAREST | CLK_FILTER_LINEAR | + CLK_FILTER_ANISOTROPIC), __CLK_FILTER_BITS = 2, // number of bits required to // represent address info - __CLK_MIP_BASE = __CLK_FILTER_BASE + __CLK_FILTER_BITS, - CLK_MIP_NEAREST = 0 << __CLK_MIP_BASE, - CLK_MIP_LINEAR = 1 << __CLK_MIP_BASE, - CLK_MIP_ANISOTROPIC = 2 << __CLK_MIP_BASE, - __CLK_MIP_MASK = CLK_MIP_NEAREST | CLK_MIP_LINEAR | - CLK_MIP_ANISOTROPIC, + __CLK_MIP_BASE = (__CLK_FILTER_BASE + __CLK_FILTER_BITS), + CLK_MIP_NEAREST = (0 << __CLK_MIP_BASE), + CLK_MIP_LINEAR = (1 << __CLK_MIP_BASE), + CLK_MIP_ANISOTROPIC = (2 << __CLK_MIP_BASE), + __CLK_MIP_MASK = (CLK_MIP_NEAREST | CLK_MIP_LINEAR | + CLK_MIP_ANISOTROPIC), __CLK_MIP_BITS = 2, - __CLK_SAMPLER_BITS = __CLK_MIP_BASE + __CLK_MIP_BITS, - __CLK_SAMPLER_MASK = __CLK_MIP_MASK | __CLK_FILTER_MASK | - __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK, + __CLK_SAMPLER_BITS = (__CLK_MIP_BASE + __CLK_MIP_BITS), + __CLK_SAMPLER_MASK = (__CLK_MIP_MASK | __CLK_FILTER_MASK | + __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK), - __CLK_SAMPLER_ARG_BASE = __CLK_MIP_BASE + 
__CLK_SAMPLER_BITS, + __CLK_SAMPLER_ARG_BASE = (__CLK_MIP_BASE + __CLK_SAMPLER_BITS), __CLK_SAMPLER_ARG_BITS = 8, - __CLK_SAMPLER_ARG_MASK = ((1 << __CLK_SAMPLER_ARG_BITS) - 1) << __CLK_SAMPLER_ARG_BASE, + __CLK_SAMPLER_ARG_MASK = (((1 << __CLK_SAMPLER_ARG_BITS) - 1) << __CLK_SAMPLER_ARG_BASE), __CLK_SAMPLER_ARG_KEY_BIT = (1 << (__CLK_SAMPLER_ARG_BASE + __CLK_SAMPLER_ARG_BITS)), __CLK_SAMPLER_ARG_KEY_BITS = 1, diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h index ff6f251..26fa8b4 100644 --- a/backend/src/ocl_stdlib.tmpl.h +++ b/backend/src/ocl_stdlib.tmpl.h @@ -2255,19 +2255,19 @@ int __gen_ocl_force_simd16(void); // Image access functions ///////////////////////////////////////////////////////////////////////////// -OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v); -OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v); -OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v); -OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v); -OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v); -OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v); - -OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v, int w); -OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v, float w); -OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v, int w); -OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v, float w); -OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v, int w); -OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v, float w); +OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v, uint sampler_offset); 
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v, uint sampler_offset); +OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v, uint sampler_offset); +OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v, uint sampler_offset); +OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v, uint sampler_offset); +OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v, uint sampler_offset); + +OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v, int w, uint sampler_offset); +OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v, float w, uint sampler_offset); +OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v, int w, uint sampler_offset); +OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v, float w, uint sampler_offset); +OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v, int w, uint sampler_offset); +OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v, float w, uint sampler_offset); OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color); OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, int4 color); @@ -2287,22 +2287,63 @@ int __gen_ocl_get_image_height(uint surface_id); int __gen_ocl_get_image_channel_data_type(uint surface_id); int __gen_ocl_get_image_channel_order(uint surface_id); int __gen_ocl_get_image_depth(uint surface_id); +ushort __gen_ocl_get_sampler_info(uint sampler_id); #define GET_IMAGE(cl_image, surface_id) \ uint surface_id = (uint)cl_image -#define DECL_READ_IMAGE(image_type, type, suffix, coord_type) \ - INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image, sampler_t sampler, coord_type coord) \ - {\ - 
GET_IMAGE(cl_image, surface_id);\ - return __gen_ocl_read_image ##suffix(EXPEND_READ_COORD(surface_id, sampler, coord));\ +#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND +#define GEN_FIX_1 1 +#else +#define GEN_FIX_1 0 +#endif + +#define DECL_READ_IMAGE(float_coord_rounding_fix, int_clamping_fix, \ + image_type, type, suffix, coord_type) \ + INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image, \ + sampler_t sampler, \ + coord_type coord) \ + { \ + GET_IMAGE(cl_image, surface_id); \ + coord_type tmpCoord = coord; \ + ushort samplerValue; \ + if (float_coord_rounding_fix | int_clamping_fix) { \ + samplerValue = __gen_ocl_get_sampler_info(sampler); \ + if (((samplerValue & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP) \ + && ((samplerValue & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) { \ + if (float_coord_rounding_fix \ + && ((samplerValue & CLK_NORMALIZED_COORDS_TRUE) == 0)) { \ + FIXUP_FLOAT_COORD(tmpCoord); \ + } \ + if (int_clamping_fix) { \ + if (OUT_OF_BOX(tmpCoord, surface_id)) { \ + unsigned int border_alpha; \ + int order = __gen_ocl_get_image_channel_order(surface_id); \ + if (!CLK_HAS_ALPHA(order)) { \ + border_alpha = 1; \ + } else \ + border_alpha = 0; \ + return (type)(0, 0, 0, border_alpha); \ + } else \ + return __gen_ocl_read_image ##suffix( \ + EXPEND_READ_COORD(surface_id, sampler, tmpCoord), 1);\ + } \ + } \ + } \ + return __gen_ocl_read_image ##suffix(EXPEND_READ_COORD(surface_id, \ + sampler, tmpCoord), 0); \ } -#define DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, coord_type) \ - INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image, coord_type coord) \ - {\ - GET_IMAGE(cl_image, surface_id);\ - return __gen_ocl_read_image ##suffix(EXPEND_READ_COORD(surface_id, CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_NONE|CLK_FILTER_NEAREST, coord));\ +#define DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, coord_type) \ + INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image, \ + coord_type coord) \ + { \ + 
GET_IMAGE(cl_image, surface_id); \ + return __gen_ocl_read_image ##suffix( \ + EXPEND_READ_COORD(surface_id, \ + CLK_NORMALIZED_COORDS_FALSE \ + | CLK_ADDRESS_NONE \ + | CLK_FILTER_NEAREST, coord), 0); \ } #define DECL_WRITE_IMAGE(image_type, type, suffix, coord_type) \ @@ -2315,37 +2356,70 @@ int __gen_ocl_get_image_depth(uint surface_id); #define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1 #define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, color -#define DECL_IMAGE(image_type, type, suffix, n) \ - DECL_READ_IMAGE(image_type, type, suffix, int ##n) \ - DECL_READ_IMAGE(image_type, type, suffix, float ##n) \ - DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int ##n) \ - DECL_WRITE_IMAGE(image_type, type, suffix, int ## n) \ +#define OUT_OF_BOX(coord, surface) \ + (coord.s0 < 0 || coord.s1 < 0 \ + || coord.s0 >= __gen_ocl_get_image_width(surface) \ + || coord.s1 >= __gen_ocl_get_image_height(surface)) + +#define FIXUP_FLOAT_COORD(tmpCoord) \ + { \ + if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f) \ + tmpCoord.s0 += -0x1p-9; \ + if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f) \ + tmpCoord.s1 += -0x1p-9f; \ + } + +#define DECL_IMAGE(int_clamping_fix, image_type, type, suffix, n) \ + DECL_READ_IMAGE(0, int_clamping_fix, image_type, type, suffix, int ##n) \ + DECL_READ_IMAGE(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float ##n) \ + DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int ##n) \ + DECL_WRITE_IMAGE(image_type, type, suffix, int ## n) \ DECL_WRITE_IMAGE(image_type, type, suffix, float ## n) -DECL_IMAGE(image2d_t, int4, i, 2) -DECL_IMAGE(image2d_t, uint4, ui, 2) -DECL_IMAGE(image2d_t, float4, f, 2) +DECL_IMAGE(GEN_FIX_1, image2d_t, int4, i, 2) +DECL_IMAGE(GEN_FIX_1, image2d_t, uint4, ui, 2) +DECL_IMAGE(0, image2d_t, float4, f, 2) #undef EXPEND_READ_COORD #undef EXPEND_WRITE_COORD +#undef OUT_OF_BOX +#undef FIXUP_FLOAT_COORD #define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, 
coord.s1, coord.s2 #define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, coord.s2, color - -DECL_IMAGE(image3d_t, int4, i, 4) -DECL_IMAGE(image3d_t, uint4, ui, 4) -DECL_IMAGE(image3d_t, float4, f, 4) - -DECL_IMAGE(image3d_t, int4, i, 3) -DECL_IMAGE(image3d_t, uint4, ui, 3) -DECL_IMAGE(image3d_t, float4, f, 3) +#define OUT_OF_BOX(coord, surface) \ + (coord.s0 < 0 || coord.s1 < 0 || coord.s2 < 0 \ + || coord.s0 >= __gen_ocl_get_image_width(surface) \ + || coord.s1 >= __gen_ocl_get_image_height(surface) \ + || coord.s2 >= __gen_ocl_get_image_depth(surface)) + +#define FIXUP_FLOAT_COORD(tmpCoord) \ + { \ + if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20) \ + tmpCoord.s0 += -0x1p-9; \ + if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20) \ + tmpCoord.s1 += -0x1p-9; \ + if (tmpCoord.s2 < 0 && tmpCoord.s2 > -0x1p-20) \ + tmpCoord.s2 += -0x1p-9; \ + } + +DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 4) +DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 4) +DECL_IMAGE(0, image3d_t, float4, f, 4) + +DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 3) +DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 3) +DECL_IMAGE(0, image3d_t, float4, f, 3) #undef EXPEND_READ_COORD #undef EXPEND_WRITE_COORD +#undef OUT_OF_BOX +#undef FIXUP_FLOAT_COORD #undef DECL_IMAGE #undef DECL_READ_IMAGE #undef DECL_READ_IMAGE_NOSAMPLER #undef DECL_WRITE_IMAGE +#undef GEN_FIX_1 #define DECL_IMAGE_INFO(image_type) \ INLINE_OVERLOADABLE int get_image_width(image_type image) \ diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index 68630cf..f2c051b 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -180,6 +180,13 @@ cl_curbe_fill(cl_kernel ker, UPLOAD(GBE_CURBE_WORK_DIM, work_dim); #undef UPLOAD + /* Upload sampler information. 
*/ + offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_SAMPLER_INFO, 0); + uint32_t i; + for(i = 0; i < ker->sampler_sz; i++, offset += 2) { + *((uint16_t *) (ker->curbe + offset)) = ker->samplers[i] & 0xFF; + } + /* Write identity for the stack pointer. This is required by the stack pointer * computation in the kernel */ diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h index 1eb790f..6bfc453 100644 --- a/src/cl_gt_device.h +++ b/src/cl_gt_device.h @@ -45,7 +45,7 @@ .image3d_max_width = 8192, .image3d_max_height = 8192, .image3d_max_depth = 8192, -.max_samplers = 0, +.max_samplers = 8, .mem_base_addr_align = sizeof(cl_uint) * 8, .min_data_type_align_size = sizeof(cl_uint), .single_fp_config = 0, /* XXX */ diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 44f44ef..034ecba 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -832,9 +832,22 @@ static void intel_gpgpu_bind_sampler(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sampler_sz) { int index; +#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND + assert(sampler_sz <= GEN_MAX_SAMPLERS/2); +#else assert(sampler_sz <= GEN_MAX_SAMPLERS); - for(index = 0; index < sampler_sz; index++) - intel_gpgpu_insert_sampler(gpgpu, index, samplers[index] & __CLK_SAMPLER_MASK); +#endif + for(index = 0; index < sampler_sz; index++) { + intel_gpgpu_insert_sampler(gpgpu, index, samplers[index]); +#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND + /* Duplicate the sampler to 8 + index and fixup the address mode + * to repeat.*/ + if ((samplers[index] & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP) { + intel_gpgpu_insert_sampler(gpgpu, index + 8, + (samplers[index] & ~__CLK_ADDRESS_MASK) | CLK_ADDRESS_REPEAT); + } +#endif + } } static void diff --git a/utests/compiler_copy_image1.cpp b/utests/compiler_copy_image1.cpp index 39ff3f5..d469fbd 100644 --- a/utests/compiler_copy_image1.cpp +++ b/utests/compiler_copy_image1.cpp @@ -68,4 +68,4 @@ static void compiler_copy_image1(void) OCL_UNMAP_BUFFER(5); } 
-MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_copy_image1); +MAKE_UTEST_FROM_FUNCTION(compiler_copy_image1);