ADD_DEFINITIONS(-DEMULATE_GEN=0)
ENDIF (EMULATE_HSW)
+# XXX now hard coded to enable the clamp to border workaround for IVB.
+ADD_DEFINITIONS(-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
+
IF (USE_FULSIM)
ADD_DEFINITIONS(-DUSE_FULSIM=1)
ELSE (USE_FULSIM)
endif (GBE_DEBUG_MEMORY)
# Hide all symbols by default and allow only the symbols declared as visible to be exported
-set (CMAKE_C_CXX_FLAGS "-fvisibility=hidden")
+set (CMAKE_C_CXX_FLAGS "-fvisibility=hidden ${CMAKE_C_CXX_FLAGS}")
if (COMPILER STREQUAL "GCC")
set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -funroll-loops -Wstrict-aliasing=2 -fstrict-aliasing -msse2 -msse3 -mssse3 -msse4.1 -fPIC -Wall")
set (ocl_as_file ${GBE_SOURCE_DIR}/src/ocl_as.h)
set (ocl_convert_file ${GBE_SOURCE_DIR}/src/ocl_convert.h)
set (ocl_stdlib_tmpl_file ${GBE_SOURCE_DIR}/src/ocl_stdlib.tmpl.h)
+set (ocl_common_header_file ${GBE_SOURCE_DIR}/src/ocl_common_defines.h)
set (ocl_blob_file ${CMAKE_CURRENT_BINARY_DIR}/ocl_stdlib.h)
set (ocl_blob_cpp_file ${GBE_SOURCE_DIR}/src/ocl_stdlib_str.cpp)
set (ocl_gen_blob_cmd ${GBE_SOURCE_DIR}/src/update_blob_ocl_header.py)
add_custom_command(
OUTPUT ${ocl_blob_file}
COMMAND ${PYTHON_EXECUTABLE} ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_blob_file}
- DEPENDS ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_vector_file} ${ocl_as_file} ${ocl_convert_file})
+ DEPENDS ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_common_header_file} ${ocl_vector_file} ${ocl_as_file} ${ocl_convert_file})
set (pch_object ${ocl_blob_file}.pch)
set (clang_cmd -cc1 -x cl -triple ptx32 -emit-pch)
endif (LLVM_VERSION_NODOT VERSION_GREATER 31)
endif (LLVM_VERSION_NODOT VERSION_GREATER 32)
-set (clang_cmd ${clang_cmd} -fno-builtin)
+set (clang_cmd ${clang_cmd} -fno-builtin -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
add_custom_command(
OUTPUT ${pch_object}
GBE_DELETE(this->kernel);
this->kernel = NULL;
}
- if(this->kernel != NULL)
+ if(this->kernel != NULL) {
+ // Align it on 32 bytes properly
+ this->kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE);
this->kernel->scratchSize = alignScratchSize(this->scratchOffset);
- if(this->kernel != NULL)
this->kernel->ctx = this;
+ }
return this->kernel;
}
this->newCurbeEntry(GBE_CURBE_LOCAL_ID_X, 0, localIDSize);
this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Y, 0, localIDSize);
this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Z, 0, localIDSize);
+ this->newCurbeEntry(GBE_CURBE_SAMPLER_INFO, 0, 32);
specialRegs.insert(ir::ocl::lid0);
specialRegs.insert(ir::ocl::lid1);
specialRegs.insert(ir::ocl::lid2);
+ specialRegs.insert(ir::ocl::samplerinfo);
// Go over all the instructions and find the special register we need
// to push
// research faster
std::sort(kernel->patches.begin(), kernel->patches.end());
- // Align it on 32 bytes properly
kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE);
}
using namespace ir;
GenRegister msgPayloads[4];
GenRegister dst[insn.getDstNum()], src[insn.getSrcNum() - 2];
+ uint32_t srcNum = insn.getSrcNum();
+ uint32_t samplerOffset = 0;
+ if (srcNum == 6) {
+ /* We have the clamp border workaround. */
+ samplerOffset = insn.getSrc(srcNum - 1).value() * 8;
+ srcNum--;
+ }
for( int i = 0; i < 4; ++i)
msgPayloads[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
for (uint32_t valueID = 0; valueID < insn.getDstNum(); ++valueID)
dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
- for (uint32_t valueID = 0; valueID < insn.getSrcNum() - 2; ++valueID)
+ for (uint32_t valueID = 0; valueID < srcNum - 2; ++valueID)
src[valueID] = sel.selReg(insn.getSrc(valueID + 2), insn.getSrcType());
uint32_t bti = sel.ctx.getFunction().getImageSet()->getIdx
(insn.getSrc(SampleInstruction::SURFACE_BTI));
uint32_t sampler = sel.ctx.getFunction().getSamplerSet()->getIdx
- (insn.getSrc(SampleInstruction::SAMPLER_BTI));
+ (insn.getSrc(SampleInstruction::SAMPLER_BTI)) + samplerOffset;
- sel.SAMPLE(dst, insn.getDstNum(), src, insn.getSrcNum() - 2, msgPayloads, 4, bti, sampler);
+ sel.SAMPLE(dst, insn.getDstNum(), src, srcNum - 2, msgPayloads, 4, bti, sampler);
return true;
}
DECL_CTOR(SampleInstruction, 1, 1);
msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
// u, v, w coords should use coord type.
- for (; valueID < 1 + coordNum; ++valueID)
+ for (; valueID < coordNum; ++valueID)
src[valueID] = sel.selReg(insn.getSrc(valueID + 1), insn.getCoordType());
for (; (valueID + 1) < insn.getSrcNum(); ++valueID)
DECL_CTOR(GetImageInfoInstruction, 1, 1);
};
+  /*! Pattern lowering a GetSamplerInfoInstruction to a single 16-bit MOV that
+   *  reads the requested sampler's slot of the sampler-info special register. */
+  DECL_PATTERN(GetSamplerInfoInstruction)
+  {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::GetSamplerInfoInstruction &insn) const
+    {
+      using namespace ir;
+      GenRegister result = sel.selReg(insn.getDst(0), TYPE_U16);
+      // Two units per sampler index; presumably a byte offset matching one
+      // 16-bit word per sampler -- TODO confirm against GenRegister::offset.
+      const auto slotOffset = sel.ctx.getFunction().getSamplerSet()->getIdx(insn.getSrc(0)) * 2;
+      GenRegister info = GenRegister::offset(GenRegister::uw1grf(ocl::samplerinfo), 0, slotOffset);
+      // Mark the sub-register number as physical so it is kept when the
+      // register gets its final location.
+      info.subphysical = 1;
+      sel.MOV(result, info);
+      return true;
+    }
+    DECL_CTOR(GetSamplerInfoInstruction, 1, 1);
+  };
+
/*! Branch instruction pattern */
DECL_PATTERN(BranchInstruction)
{
this->insert<SelectModifierInstructionPattern>();
this->insert<SampleInstructionPattern>();
this->insert<GetImageInfoInstructionPattern>();
+ this->insert<GetSamplerInfoInstructionPattern>();
// Sort all the patterns with the number of instructions they output
for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Y, ocl::goffset1);
allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Z, ocl::goffset2);
allocatePayloadReg(GBE_CURBE_WORK_DIM, ocl::workdim);
+ allocatePayloadReg(GBE_CURBE_SAMPLER_INFO, ocl::samplerinfo);
allocatePayloadReg(GBE_CURBE_GROUP_NUM_X, ocl::numgroup0);
allocatePayloadReg(GBE_CURBE_GROUP_NUM_Y, ocl::numgroup1);
allocatePayloadReg(GBE_CURBE_GROUP_NUM_Z, ocl::numgroup2);
}
GBE_ASSERT(RA.contains(reg.reg()) != false);
const uint32_t grfOffset = RA.find(reg.reg())->second;
- const GenRegister dst = setGenReg(reg, grfOffset);
+ const uint32_t suboffset = reg.subphysical ? reg.subnr : 0;
+ const GenRegister dst = setGenReg(reg, grfOffset + suboffset);
if (reg.quarter != 0)
return GenRegister::Qn(dst, reg.quarter);
else
uint32_t nr:8; //!< Just for some physical registers (acc, null)
uint32_t subnr:8; //!< Idem
uint32_t physical:1; //!< 1 if physical, 0 otherwise
+ uint32_t subphysical:1;//!< 1 if subnr is physical, 0 otherwise
uint32_t type:4; //!< Gen type
uint32_t file:2; //!< Register file
uint32_t negation:1; //!< For source
useless.push_back(str);
args.push_back(str.c_str());
}
-
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+ args.push_back("-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND");
+#endif
args.push_back("-emit-llvm");
// XXX we haven't implemented those builtin functions yet,
// so disable them for now.
GBE_CURBE_GROUP_NUM_Y,
GBE_CURBE_GROUP_NUM_Z,
GBE_CURBE_WORK_DIM,
+ GBE_CURBE_SAMPLER_INFO,
GBE_CURBE_IMAGE_INFO,
GBE_CURBE_STACK_POINTER,
GBE_CURBE_KERNEL_ARGUMENT,
INLINE Type getSrcType(void) const { return this->srcType; }
INLINE Type getDstType(void) const { return this->dstType; }
- static const uint32_t srcNum = 5;
+ static const uint32_t srcNum = 6;
static const uint32_t dstNum = 4;
};
Register dst[0]; //!< No dest register
};
+  /*! Internal representation of the "get sampler info" instruction: one
+   *  source register (the sampler) and one destination register (the result). */
+  class ALIGNED_INSTRUCTION GetSamplerInfoInstruction :
+    public BasePolicy,
+    public NSrcPolicy<GetSamplerInfoInstruction, 1>,
+    public NDstPolicy<GetSamplerInfoInstruction, 1>
+  {
+  public:
+    GetSamplerInfoInstruction(Register dst, Register src) {
+      this->opcode = OP_GET_SAMPLER_INFO;
+      this->src[0] = src;
+      this->dst[0] = dst;
+    }
+
+    INLINE bool wellFormed(const Function &fn, std::string &why) const;
+    /*! Print the opcode followed by the sampler and destination registers */
+    INLINE void out(std::ostream &out, const Function &fn) const {
+      this->outOpcode(out);
+      out << " sampler id %" << this->getSrc(fn, 0);
+      out << " %" << this->getDst(fn, 0);
+    }
+
+    Register src[1];                  //!< Sampler to get info from
+    Register dst[1];                  //!< Return value
+    static const uint32_t dstNum = 1;
+  };
+
class ALIGNED_INSTRUCTION GetImageInfoInstruction :
public BasePolicy,
public NSrcPolicy<GetImageInfoInstruction, 1>,
{ return true; }
INLINE bool GetImageInfoInstruction::wellFormed(const Function &fn, std::string &why) const
{ return true; }
+  /*! No structural validation is performed: the instruction is always accepted */
+  INLINE bool GetSamplerInfoInstruction::wellFormed(const Function &fn, std::string &why) const
+  {
+    return true;
+  }
+
// Ensure that types and register family match
INLINE bool LoadImmInstruction::wellFormed(const Function &fn, std::string &whyNot) const
#include "ir/instruction.hxx"
END_INTROSPECTION(GetImageInfoInstruction)
+START_INTROSPECTION(GetSamplerInfoInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(GetSamplerInfoInstruction)
+
START_INTROSPECTION(LoadImmInstruction)
#include "ir/instruction.hxx"
END_INTROSPECTION(LoadImmInstruction)
return internal::GetImageInfoInstruction(infoType, dst, src).convert();
}
+  /*! Build the generic Instruction reading the info of sampler `src` into `dst` */
+  Instruction GET_SAMPLER_INFO(Register dst, Register src) {
+    internal::GetSamplerInfoInstruction insn(dst, src);
+    return insn.convert();
+  }
+
std::ostream &operator<< (std::ostream &out, const Instruction &insn) {
const Function &fn = insn.getFunction();
switch (insn.getOpcode()) {
};
uint32_t data;
} ImageInfoKey;
+
/*! Get image information */
class GetImageInfoInstruction : public Instruction {
public:
static bool isClassOf(const Instruction &insn);
};
+ /*! Get sampler information */
+ class GetSamplerInfoInstruction : public Instruction {
+ public:
+
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
/*! Branch instruction is the unified way to branch (with or without
* predicate)
*/
Instruction SAMPLE(Tuple dst, Tuple src, Type dstType, Type srcType);
/*! get image information , such as width/height/depth/... */
Instruction GET_IMAGE_INFO(int infoType, Tuple dst, Register src);
+ /*! get sampler information */
+ Instruction GET_SAMPLER_INFO(Register dst, Register src);
/*! label labelIndex */
Instruction LABEL(LabelIndex labelIndex);
DECL_INSN(SYNC, SyncInstruction)
DECL_INSN(LABEL, LabelInstruction)
DECL_INSN(GET_IMAGE_INFO, GetImageInfoInstruction)
+DECL_INSN(GET_SAMPLER_INFO, GetSamplerInfoInstruction)
DECL_INSN(MUL_HI, BinaryInstruction)
DECL_INSN(I64_MUL_HI, BinaryInstruction)
DECL_INSN(FBH, UnaryInstruction)
"stack_pointer",
"block_ip",
"barrier_id", "thread_number",
- "work_dimension",
+ "work_dimension", "sampler_info"
};
#if GBE_DEBUG
DECL_NEW_REG(FAMILY_DWORD, barrierid);
DECL_NEW_REG(FAMILY_DWORD, threadn);
DECL_NEW_REG(FAMILY_DWORD, workdim);
+ DECL_NEW_REG(FAMILY_WORD, samplerinfo);
}
#undef DECL_NEW_REG
static const Register barrierid = Register(20);// barrierid
static const Register threadn = Register(21); // number of threads
static const Register workdim = Register(22); // work dimention.
- static const uint32_t regNum = 23; // number of special registers
+ static const Register samplerinfo = Register(23); // store sampler info.
+ static const uint32_t regNum = 24; // number of special registers
extern const char *specialRegMean[]; // special register name.
} /* namespace ocl */
// Emit unary instructions from gen native function
void emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode);
+ ir::Register appendSampler(CallSite::arg_iterator AI);
+
// These instructions are not supported at all
void visitVAArgInst(VAArgInst &I) {NOT_SUPPORTED;}
void visitSwitchInst(SwitchInst &I) {NOT_SUPPORTED;}
case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
case GEN_OCL_GET_IMAGE_CHANNEL_ORDER:
case GEN_OCL_GET_IMAGE_DEPTH:
+ case GEN_OCL_GET_SAMPLER_INFO:
case GEN_OCL_ATOMIC_ADD0:
case GEN_OCL_ATOMIC_ADD1:
case GEN_OCL_ATOMIC_SUB0:
ctx.ATOMIC(opcode, dst, addrSpace, srcTuple);
}
+  /* Register the sampler referenced by a call argument in the function's
+   * sampler set and return the register that identifies it. Should be called
+   * before any reference to a sampler_t value. */
+  ir::Register GenWriter::appendSampler(CallSite::arg_iterator AI) {
+    Constant *CPV = dyn_cast<Constant>(*AI);
+    if (CPV == NULL) {
+      // Non-constant sampler: record the register already bound to the value.
+      ir::Register sampler = this->getRegister(*AI);
+      ctx.getFunction().getSamplerSet()->append(sampler, &ctx);
+      return sampler;
+    }
+    // This is not a kernel argument sampler, we need to append it to sampler set,
+    // and allocate a sampler slot for it.
+    auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
+    GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type");
+    return ctx.getFunction().getSamplerSet()->append(x.data.u32, &ctx);
+  }
+
void GenWriter::emitCallInst(CallInst &I) {
if (Function *F = I.getCalledFunction()) {
if (F->getIntrinsicID() != 0) {
ctx.GET_IMAGE_INFO(infoType, dstTuple, surface_id);
break;
}
+ case GEN_OCL_GET_SAMPLER_INFO:
+ {
+ GBE_ASSERT(AI != AE);
+ const ir::Register sampler = this->appendSampler(AI); ++AI;
+ const ir::Register reg = this->getRegister(&I, 0);
+ ctx.GET_SAMPLER_INFO(reg, sampler);
+ break;
+ }
case GEN_OCL_READ_IMAGE0:
case GEN_OCL_READ_IMAGE1:
case GEN_OCL_READ_IMAGE2:
{
GBE_ASSERT(AI != AE); const ir::Register surface_id = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE);
- Constant *CPV = dyn_cast<Constant>(*AI);
- ir::Register sampler;
- if (CPV != NULL)
- {
- // This is not a kernel argument sampler, we need to append it to sampler set,
- // and allocate a sampler slot for it.
- auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
- GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type");
- sampler = ctx.getFunction().getSamplerSet()->append(x.data.u32, &ctx);
- } else {
- sampler = this->getRegister(*AI);
- ctx.getFunction().getSamplerSet()->append(sampler, &ctx);
- }
+ const ir::Register sampler = this->appendSampler(AI);
++AI;
GBE_ASSERT(AI != AE); const ir::Register ucoord = this->getRegister(*AI); ++AI;
srcTupleData.push_back(ucoord);
srcTupleData.push_back(vcoord);
srcTupleData.push_back(wcoord);
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+          // With the clamp-to-border workaround the call carries one more
+          // argument: the constant sampler offset.
+          GBE_ASSERT(AI != AE); Constant *CPV = dyn_cast<Constant>(*AI);
+          assert(CPV);
+          auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
+          GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type");
+          // The immediate travels in the register slot of the source tuple; it
+          // is not a real virtual register.
+          ir::Register offsetReg(x.data.u32);
+#else
+          ir::Register offsetReg(0);
+#endif
+          // Append the offset exactly once in both configurations, so the
+          // source tuple always holds the 6 entries handed to arrayTuple().
+          srcTupleData.push_back(offsetReg);
const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], elemNum);
- const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 5);
+ const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 6);
ir::Type srcType = ir::TYPE_S32, dstType = ir::TYPE_U32;
DECL_LLVM_GEN_FUNCTION(FORCE_SIMD16, __gen_ocl_force_simd16)
// To read_image functions.
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE0, _Z21__gen_ocl_read_imageijjii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE1, _Z21__gen_ocl_read_imageijjff)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE2, _Z22__gen_ocl_read_imageuijjii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE3, _Z22__gen_ocl_read_imageuijjff)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE4, _Z21__gen_ocl_read_imagefjjii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE5, _Z21__gen_ocl_read_imagefjjff)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE0, _Z21__gen_ocl_read_imageijjiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE1, _Z21__gen_ocl_read_imageijjffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE2, _Z22__gen_ocl_read_imageuijjiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE3, _Z22__gen_ocl_read_imageuijjffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE4, _Z21__gen_ocl_read_imagefjjiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE5, _Z21__gen_ocl_read_imagefjjffj)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE10, _Z21__gen_ocl_read_imageijjiii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE11, _Z21__gen_ocl_read_imageijjfff)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE12, _Z22__gen_ocl_read_imageuijjiii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE13, _Z22__gen_ocl_read_imageuijjfff)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE14, _Z21__gen_ocl_read_imagefjjiii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE15, _Z21__gen_ocl_read_imagefjjfff)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE10, _Z21__gen_ocl_read_imageijjiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE11, _Z21__gen_ocl_read_imageijjfffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE12, _Z22__gen_ocl_read_imageuijjiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE13, _Z22__gen_ocl_read_imageuijjfffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE14, _Z21__gen_ocl_read_imagefjjiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE15, _Z21__gen_ocl_read_imagefjjfffj)
// To write_image functions.
DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE0, _Z22__gen_ocl_write_imageijiiDv4_i)
DECL_LLVM_GEN_FUNCTION(UPSAMPLE_SHORT, _Z18__gen_ocl_upsampless)
DECL_LLVM_GEN_FUNCTION(UPSAMPLE_INT, _Z18__gen_ocl_upsampleii)
DECL_LLVM_GEN_FUNCTION(UPSAMPLE_LONG, _Z18__gen_ocl_upsamplell)
+
+// get sampler info
+DECL_LLVM_GEN_FUNCTION(GET_SAMPLER_INFO, __gen_ocl_get_sampler_info)
//
// Common defines for Image intrinsics
// Channel order
+#define CLK_HAS_ALPHA(color) (color == CLK_A || color == CLK_RA || color == CLK_RGBA || color == CLK_BGRA || color == CLK_ARGB)
enum {
CLK_R = 0x10B0,
CLK_A = 0x10B1,
typedef enum clk_sampler_type {
__CLK_ADDRESS_BASE = 0,
- CLK_ADDRESS_NONE = 0 << __CLK_ADDRESS_BASE,
- CLK_ADDRESS_CLAMP = 1 << __CLK_ADDRESS_BASE,
- CLK_ADDRESS_CLAMP_TO_EDGE = 2 << __CLK_ADDRESS_BASE,
- CLK_ADDRESS_REPEAT = 3 << __CLK_ADDRESS_BASE,
- CLK_ADDRESS_MIRROR = 4 << __CLK_ADDRESS_BASE,
+ CLK_ADDRESS_NONE = (0 << __CLK_ADDRESS_BASE),
+ CLK_ADDRESS_CLAMP = (1 << __CLK_ADDRESS_BASE),
+ CLK_ADDRESS_CLAMP_TO_EDGE = (2 << __CLK_ADDRESS_BASE),
+ CLK_ADDRESS_REPEAT = (3 << __CLK_ADDRESS_BASE),
+ CLK_ADDRESS_MIRROR = (4 << __CLK_ADDRESS_BASE),
#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
CLK_ADDRESS_MIRRORED_REPEAT = CLK_ADDRESS_MIRROR,
#endif
- __CLK_ADDRESS_MASK = CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP |
+ __CLK_ADDRESS_MASK = (CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP |
CLK_ADDRESS_CLAMP_TO_EDGE |
- CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR,
+ CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR),
__CLK_ADDRESS_BITS = 3, // number of bits required to
// represent address info
__CLK_NORMALIZED_BASE = __CLK_ADDRESS_BITS,
CLK_NORMALIZED_COORDS_FALSE = 0,
- CLK_NORMALIZED_COORDS_TRUE = 1 << __CLK_NORMALIZED_BASE,
- __CLK_NORMALIZED_MASK = CLK_NORMALIZED_COORDS_FALSE |
- CLK_NORMALIZED_COORDS_TRUE,
+ CLK_NORMALIZED_COORDS_TRUE = (1 << __CLK_NORMALIZED_BASE),
+ __CLK_NORMALIZED_MASK = (CLK_NORMALIZED_COORDS_FALSE |
+ CLK_NORMALIZED_COORDS_TRUE),
__CLK_NORMALIZED_BITS = 1, // number of bits required to
// represent normalization
-
- __CLK_FILTER_BASE = __CLK_NORMALIZED_BASE +
- __CLK_NORMALIZED_BITS,
- CLK_FILTER_NEAREST = 0 << __CLK_FILTER_BASE,
- CLK_FILTER_LINEAR = 1 << __CLK_FILTER_BASE,
- CLK_FILTER_ANISOTROPIC = 2 << __CLK_FILTER_BASE,
- __CLK_FILTER_MASK = CLK_FILTER_NEAREST | CLK_FILTER_LINEAR |
- CLK_FILTER_ANISOTROPIC,
+ __CLK_FILTER_BASE = (__CLK_NORMALIZED_BASE + __CLK_NORMALIZED_BITS),
+ CLK_FILTER_NEAREST = (0 << __CLK_FILTER_BASE),
+ CLK_FILTER_LINEAR = (1 << __CLK_FILTER_BASE),
+ CLK_FILTER_ANISOTROPIC = (2 << __CLK_FILTER_BASE),
+ __CLK_FILTER_MASK = (CLK_FILTER_NEAREST | CLK_FILTER_LINEAR |
+ CLK_FILTER_ANISOTROPIC),
__CLK_FILTER_BITS = 2, // number of bits required to
// represent filter info
- __CLK_MIP_BASE = __CLK_FILTER_BASE + __CLK_FILTER_BITS,
- CLK_MIP_NEAREST = 0 << __CLK_MIP_BASE,
- CLK_MIP_LINEAR = 1 << __CLK_MIP_BASE,
- CLK_MIP_ANISOTROPIC = 2 << __CLK_MIP_BASE,
- __CLK_MIP_MASK = CLK_MIP_NEAREST | CLK_MIP_LINEAR |
- CLK_MIP_ANISOTROPIC,
+ __CLK_MIP_BASE = (__CLK_FILTER_BASE + __CLK_FILTER_BITS),
+ CLK_MIP_NEAREST = (0 << __CLK_MIP_BASE),
+ CLK_MIP_LINEAR = (1 << __CLK_MIP_BASE),
+ CLK_MIP_ANISOTROPIC = (2 << __CLK_MIP_BASE),
+ __CLK_MIP_MASK = (CLK_MIP_NEAREST | CLK_MIP_LINEAR |
+ CLK_MIP_ANISOTROPIC),
__CLK_MIP_BITS = 2,
- __CLK_SAMPLER_BITS = __CLK_MIP_BASE + __CLK_MIP_BITS,
- __CLK_SAMPLER_MASK = __CLK_MIP_MASK | __CLK_FILTER_MASK |
- __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK,
+ __CLK_SAMPLER_BITS = (__CLK_MIP_BASE + __CLK_MIP_BITS),
+ __CLK_SAMPLER_MASK = (__CLK_MIP_MASK | __CLK_FILTER_MASK |
+ __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK),
- __CLK_SAMPLER_ARG_BASE = __CLK_MIP_BASE + __CLK_SAMPLER_BITS,
+ __CLK_SAMPLER_ARG_BASE = (__CLK_MIP_BASE + __CLK_SAMPLER_BITS),
__CLK_SAMPLER_ARG_BITS = 8,
- __CLK_SAMPLER_ARG_MASK = ((1 << __CLK_SAMPLER_ARG_BITS) - 1) << __CLK_SAMPLER_ARG_BASE,
+ __CLK_SAMPLER_ARG_MASK = (((1 << __CLK_SAMPLER_ARG_BITS) - 1) << __CLK_SAMPLER_ARG_BASE),
__CLK_SAMPLER_ARG_KEY_BIT = (1 << (__CLK_SAMPLER_ARG_BASE + __CLK_SAMPLER_ARG_BITS)),
__CLK_SAMPLER_ARG_KEY_BITS = 1,
// Image access functions
/////////////////////////////////////////////////////////////////////////////
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v);
-
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v, int w);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v, float w);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v, int w);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v, float w);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v, int w);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v, float w);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v, uint sampler_offset);
+
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v, float w, uint sampler_offset);
OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color);
OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, int4 color);
int __gen_ocl_get_image_channel_data_type(uint surface_id);
int __gen_ocl_get_image_channel_order(uint surface_id);
int __gen_ocl_get_image_depth(uint surface_id);
+ushort __gen_ocl_get_sampler_info(uint sampler_id);
#define GET_IMAGE(cl_image, surface_id) \
uint surface_id = (uint)cl_image
-#define DECL_READ_IMAGE(image_type, type, suffix, coord_type) \
- INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image, sampler_t sampler, coord_type coord) \
- {\
- GET_IMAGE(cl_image, surface_id);\
- return __gen_ocl_read_image ##suffix(EXPEND_READ_COORD(surface_id, sampler, coord));\
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+#define GEN_FIX_1 1
+#else
+#define GEN_FIX_1 0
+#endif
+
+/* Declares read_image<suffix>(cl_image, sampler, coord) for one image type and
+ * one coordinate type. EXPEND_READ_COORD, OUT_OF_BOX and FIXUP_FLOAT_COORD
+ * must be defined at the point of expansion.
+ *
+ * When either fix flag is non-zero, the sampler value is fetched with
+ * __gen_ocl_get_sampler_info() and, only for CLK_ADDRESS_CLAMP combined with
+ * CLK_FILTER_NEAREST:
+ *  - float_coord_rounding_fix: FIXUP_FLOAT_COORD is applied to the coordinate
+ *    when CLK_NORMALIZED_COORDS_TRUE is not set;
+ *  - int_clamping_fix: a coordinate for which OUT_OF_BOX is true returns
+ *    (0, 0, 0, border_alpha), where border_alpha is 1 if the channel order has
+ *    no alpha channel and 0 otherwise; an in-box coordinate is read with a
+ *    trailing sampler_offset argument of 1.
+ * Every other path reads with a trailing sampler_offset argument of 0. */
+#define DECL_READ_IMAGE(float_coord_rounding_fix, int_clamping_fix, \
+ image_type, type, suffix, coord_type) \
+ INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image, \
+ sampler_t sampler, \
+ coord_type coord) \
+ { \
+ GET_IMAGE(cl_image, surface_id); \
+ coord_type tmpCoord = coord; \
+ ushort samplerValue; \
+ if (float_coord_rounding_fix | int_clamping_fix) { \
+ samplerValue = __gen_ocl_get_sampler_info(sampler); \
+ if (((samplerValue & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP) \
+ && ((samplerValue & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) { \
+ if (float_coord_rounding_fix \
+ && ((samplerValue & CLK_NORMALIZED_COORDS_TRUE) == 0)) { \
+ FIXUP_FLOAT_COORD(tmpCoord); \
+ } \
+ if (int_clamping_fix) { \
+ if (OUT_OF_BOX(tmpCoord, surface_id)) { \
+ unsigned int border_alpha; \
+ int order = __gen_ocl_get_image_channel_order(surface_id); \
+ if (!CLK_HAS_ALPHA(order)) { \
+ border_alpha = 1; \
+ } else \
+ border_alpha = 0; \
+ return (type)(0, 0, 0, border_alpha); \
+ } else \
+ return __gen_ocl_read_image ##suffix( \
+ EXPEND_READ_COORD(surface_id, sampler, tmpCoord), 1);\
+ } \
+ } \
+ } \
+ return __gen_ocl_read_image ##suffix(EXPEND_READ_COORD(surface_id, \
+ sampler, tmpCoord), 0); \
+ }
-#define DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, coord_type) \
- INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image, coord_type coord) \
- {\
- GET_IMAGE(cl_image, surface_id);\
- return __gen_ocl_read_image ##suffix(EXPEND_READ_COORD(surface_id, CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_NONE|CLK_FILTER_NEAREST, coord));\
+/* Declares the sampler-less read_image<suffix>(cl_image, coord) overload. It
+ * reads with the constant sampler
+ * CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST and a
+ * trailing sampler_offset argument of 0. EXPEND_READ_COORD must be defined at
+ * the point of expansion. */
+#define DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, coord_type) \
+ INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image, \
+ coord_type coord) \
+ { \
+ GET_IMAGE(cl_image, surface_id); \
+ return __gen_ocl_read_image ##suffix( \
+ EXPEND_READ_COORD(surface_id, \
+ CLK_NORMALIZED_COORDS_FALSE \
+ | CLK_ADDRESS_NONE \
+ | CLK_FILTER_NEAREST, coord), 0); \
+ }
#define DECL_WRITE_IMAGE(image_type, type, suffix, coord_type) \
#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1
#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, color
-#define DECL_IMAGE(image_type, type, suffix, n) \
- DECL_READ_IMAGE(image_type, type, suffix, int ##n) \
- DECL_READ_IMAGE(image_type, type, suffix, float ##n) \
- DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int ##n) \
- DECL_WRITE_IMAGE(image_type, type, suffix, int ## n) \
+/* True when a 2D coordinate is negative or not below the width/height of
+ * `surface`. Macro arguments are parenthesized so that any expression can be
+ * passed safely. */
+#define OUT_OF_BOX(coord, surface) \
+  ((coord).s0 < 0 || (coord).s1 < 0 \
+   || (coord).s0 >= __gen_ocl_get_image_width(surface) \
+   || (coord).s1 >= __gen_ocl_get_image_height(surface))
+
+/* Moves each 2D component lying in the open interval (-2^-20, 0) further
+ * below zero by 2^-9. All constants carry the `f` suffix so the arithmetic
+ * stays in single precision and does not depend on double support. */
+#define FIXUP_FLOAT_COORD(tmpCoord) \
+  { \
+    if ((tmpCoord).s0 < 0 && (tmpCoord).s0 > -0x1p-20f) \
+      (tmpCoord).s0 += -0x1p-9f; \
+    if ((tmpCoord).s1 < 0 && (tmpCoord).s1 > -0x1p-20f) \
+      (tmpCoord).s1 += -0x1p-9f; \
+  }
+
+/* Declares the full read/write overload set for one image type and one
+ * result type, for n-component coordinates: sampled reads with int and float
+ * coordinates, the sampler-less int read, and writes with int and float
+ * coordinates. int_clamping_fix is forwarded to both sampled reads; only the
+ * float-coordinate read gets the rounding fix (GEN_FIX_1). */
+#define DECL_IMAGE(int_clamping_fix, image_type, type, suffix, n) \
+ DECL_READ_IMAGE(0, int_clamping_fix, image_type, type, suffix, int ##n) \
+ DECL_READ_IMAGE(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float ##n) \
+ DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int ##n) \
+ DECL_WRITE_IMAGE(image_type, type, suffix, int ## n) \
DECL_WRITE_IMAGE(image_type, type, suffix, float ## n)
-DECL_IMAGE(image2d_t, int4, i, 2)
-DECL_IMAGE(image2d_t, uint4, ui, 2)
-DECL_IMAGE(image2d_t, float4, f, 2)
+DECL_IMAGE(GEN_FIX_1, image2d_t, int4, i, 2)
+DECL_IMAGE(GEN_FIX_1, image2d_t, uint4, ui, 2)
+DECL_IMAGE(0, image2d_t, float4, f, 2)
#undef EXPEND_READ_COORD
#undef EXPEND_WRITE_COORD
+#undef OUT_OF_BOX
+#undef FIXUP_FLOAT_COORD
#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, coord.s2
#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, coord.s2, color
-
-DECL_IMAGE(image3d_t, int4, i, 4)
-DECL_IMAGE(image3d_t, uint4, ui, 4)
-DECL_IMAGE(image3d_t, float4, f, 4)
-
-DECL_IMAGE(image3d_t, int4, i, 3)
-DECL_IMAGE(image3d_t, uint4, ui, 3)
-DECL_IMAGE(image3d_t, float4, f, 3)
+/* True when a 3D coordinate is negative or not below the
+ * width/height/depth of `surface`. Macro arguments are parenthesized so that
+ * any expression can be passed safely. */
+#define OUT_OF_BOX(coord, surface) \
+  ((coord).s0 < 0 || (coord).s1 < 0 || (coord).s2 < 0 \
+   || (coord).s0 >= __gen_ocl_get_image_width(surface) \
+   || (coord).s1 >= __gen_ocl_get_image_height(surface) \
+   || (coord).s2 >= __gen_ocl_get_image_depth(surface))
+
+/* Moves each 3D component lying in the open interval (-2^-20, 0) further
+ * below zero by 2^-9. All constants carry the `f` suffix so the arithmetic
+ * stays in single precision and does not depend on double support. */
+#define FIXUP_FLOAT_COORD(tmpCoord) \
+  { \
+    if ((tmpCoord).s0 < 0 && (tmpCoord).s0 > -0x1p-20f) \
+      (tmpCoord).s0 += -0x1p-9f; \
+    if ((tmpCoord).s1 < 0 && (tmpCoord).s1 > -0x1p-20f) \
+      (tmpCoord).s1 += -0x1p-9f; \
+    if ((tmpCoord).s2 < 0 && (tmpCoord).s2 > -0x1p-20f) \
+      (tmpCoord).s2 += -0x1p-9f; \
+  }
+
+DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 4)
+DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 4)
+DECL_IMAGE(0, image3d_t, float4, f, 4)
+
+DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 3)
+DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 3)
+DECL_IMAGE(0, image3d_t, float4, f, 3)
#undef EXPEND_READ_COORD
#undef EXPEND_WRITE_COORD
+#undef OUT_OF_BOX
+#undef FIXUP_FLOAT_COORD
#undef DECL_IMAGE
#undef DECL_READ_IMAGE
#undef DECL_READ_IMAGE_NOSAMPLER
#undef DECL_WRITE_IMAGE
+#undef GEN_FIX_1
#define DECL_IMAGE_INFO(image_type) \
INLINE_OVERLOADABLE int get_image_width(image_type image) \
UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
#undef UPLOAD
+ /* Upload sampler information. */
+ offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_SAMPLER_INFO, 0);
+ uint32_t i;
+ for(i = 0; i < ker->sampler_sz; i++, offset += 2) {
+ *((uint16_t *) (ker->curbe + offset)) = ker->samplers[i] & 0xFF;
+ }
+
/* Write identity for the stack pointer. This is required by the stack pointer
* computation in the kernel
*/
.image3d_max_width = 8192,
.image3d_max_height = 8192,
.image3d_max_depth = 8192,
-.max_samplers = 0,
+.max_samplers = 8,
.mem_base_addr_align = sizeof(cl_uint) * 8,
.min_data_type_align_size = sizeof(cl_uint),
.single_fp_config = 0, /* XXX */
intel_gpgpu_bind_sampler(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sampler_sz)
{
int index;
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+ assert(sampler_sz <= GEN_MAX_SAMPLERS/2);
+#else
assert(sampler_sz <= GEN_MAX_SAMPLERS);
- for(index = 0; index < sampler_sz; index++)
- intel_gpgpu_insert_sampler(gpgpu, index, samplers[index] & __CLK_SAMPLER_MASK);
+#endif
+ for(index = 0; index < sampler_sz; index++) {
+ intel_gpgpu_insert_sampler(gpgpu, index, samplers[index]);
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+ /* Duplicate the sampler to 8 + index and fixup the address mode
+ * to repeat.*/
+ if ((samplers[index] & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP) {
+ intel_gpgpu_insert_sampler(gpgpu, index + 8,
+ (samplers[index] & ~__CLK_ADDRESS_MASK) | CLK_ADDRESS_REPEAT);
+ }
+#endif
+ }
}
static void
OCL_UNMAP_BUFFER(5);
}
-MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_copy_image1);
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_image1);