From 0491433d93845c0a45427ed26b64869f2e381ed7 Mon Sep 17 00:00:00 2001 From: Zhigang Gong Date: Tue, 18 Feb 2014 14:40:59 +0800 Subject: [PATCH] GBE: optimize sample instruction. The U,V,W registers could be allocated to a selection vector directly. Then we can save some MOV instructions for the read_image functions. Signed-off-by: Zhigang Gong Reviewed-by: "Yang, Rong R" --- backend/src/backend/gen_context.cpp | 20 ++++---------------- backend/src/backend/gen_encoder.cpp | 4 ++-- backend/src/backend/gen_encoder.hpp | 2 +- backend/src/backend/gen_insn_selection.cpp | 26 ++++++++++++-------------- backend/src/backend/gen_insn_selection.hpp | 2 +- backend/src/ocl_stdlib.tmpl.h | 12 ++++++------ 6 files changed, 26 insertions(+), 40 deletions(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 44dbee2..5541cfb 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -1788,23 +1788,11 @@ namespace gbe const GenRegister msgPayload = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_F); const unsigned char bti = insn.extra.rdbti; const unsigned char sampler = insn.extra.sampler; - const GenRegister ucoord = ra->genReg(insn.src(4)); - const GenRegister vcoord = ra->genReg(insn.src(5)); + const unsigned int msgLen = insn.extra.rdmsglen; uint32_t simdWidth = p->curr.execWidth; - uint32_t coord_cnt = 2; - p->push(); - const uint32_t nr = msgPayload.nr; - // prepare mesg desc and move to a0.0. - // desc = bti | (sampler << 8) | (0 << 12) | (2 << 16) | (0 << 18) | (0 << 19) | (4 << 20) | (1 << 25) | (0 < 29) | (0 << 31) - /* Prepare message payload. */ - p->MOV(GenRegister::f8grf(nr , 0), ucoord); - p->MOV(GenRegister::f8grf(nr + (simdWidth/8), 0), vcoord); - if (insn.extra.is3DRead) { - p->MOV(GenRegister::f8grf(nr + (simdWidth/4), 0), ra->genReg(insn.src(6))); - coord_cnt++; - } - p->SAMPLE(dst, msgPayload, false, bti, sampler, coord_cnt, simdWidth, -1, 0); - p->pop(); + //p->push(); + p->SAMPLE(dst, msgPayload, msgLen, false, bti, sampler, simdWidth, -1, 0); + //p->pop(); } void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) { diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp index aaf7dce..0664d77 100644 --- a/backend/src/backend/gen_encoder.cpp +++ b/backend/src/backend/gen_encoder.cpp @@ -1207,10 +1207,10 @@ namespace gbe void GenEncoder::SAMPLE(GenRegister dest, GenRegister msg, + unsigned int msg_len, bool header_present, unsigned char bti, unsigned char sampler, - unsigned int coord_cnt, uint32_t simdWidth, uint32_t writemask, uint32_t return_format) @@ -1219,7 +1219,7 @@ namespace gbe uint32_t msg_type = (simdWidth == 16) ? GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE : GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE; uint32_t response_length = (4 * (simdWidth / 8)); - uint32_t msg_length = (coord_cnt * (simdWidth / 8)); + uint32_t msg_length = (msg_len * (simdWidth / 8)); if (header_present) msg_length++; uint32_t simd_mode = (simdWidth == 16) ? diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp index 13db6ae..094a5c2 100644 --- a/backend/src/backend/gen_encoder.hpp +++ b/backend/src/backend/gen_encoder.hpp @@ -167,10 +167,10 @@ namespace gbe /*! Send instruction for the sampler */ void SAMPLE(GenRegister dest, GenRegister msg, + unsigned int msg_len, bool header_present, unsigned char bti, unsigned char sampler, - unsigned int coord_cnt, unsigned int simdWidth, uint32_t writemask, uint32_t return_format); diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 75ee906..d76f580 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -543,7 +543,7 @@ namespace gbe /*! Encode ternary instructions */ void ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2); /*! Encode sample instructions */ - void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *src, uint32_t srcNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool is3D); + void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool is3D); /*! Encode typed write instructions */ void TYPED_WRITE(GenRegister *src, uint32_t srcNum, GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D); /*! Get image information */ @@ -1415,10 +1415,9 @@ namespace gbe } void Selection::Opaque::SAMPLE(GenRegister *dst, uint32_t dstNum, - GenRegister *src, uint32_t srcNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool is3D) { - SelectionInstruction *insn = this->appendInsn(SEL_OP_SAMPLE, dstNum, msgNum + srcNum); + SelectionInstruction *insn = this->appendInsn(SEL_OP_SAMPLE, dstNum, msgNum); SelectionVector *dstVector = this->appendVector(); SelectionVector *msgVector = this->appendVector(); @@ -1427,8 +1426,6 @@ namespace gbe insn->dst(elemID) = dst[elemID]; for (uint32_t elemID = 0; elemID < msgNum; ++elemID) insn->src(elemID) = msgPayloads[elemID]; - for (uint32_t elemID = 0; elemID < srcNum; ++elemID) - insn->src(msgNum + elemID) = src[elemID]; // Sends require contiguous allocation dstVector->regNum = dstNum; @@ -1442,7 +1439,7 @@ namespace gbe insn->extra.rdbti = bti; insn->extra.sampler = sampler; - insn->extra.is3DRead = is3D; + insn->extra.rdmsglen = msgNum; } /////////////////////////////////////////////////////////////////////////// @@ -3009,23 +3006,24 @@ namespace gbe { using namespace ir; GenRegister msgPayloads[4]; - GenRegister dst[insn.getDstNum()], src[insn.getSrcNum()]; + GenRegister dst[insn.getDstNum()]; uint32_t srcNum = insn.getSrcNum(); + uint32_t valueID = 0; - for( int i = 0; i < 4; ++i) - msgPayloads[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); - - for (uint32_t valueID = 0; valueID < insn.getDstNum(); ++valueID) + for (valueID = 0; valueID < insn.getDstNum(); ++valueID) dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType()); - for (uint32_t valueID = 0; valueID < srcNum; ++valueID) - src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType()); + if (!insn.is3D()) + srcNum--; + /* U, V, [W] */ + for (valueID = 0; valueID < srcNum; ++valueID) + msgPayloads[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType()); uint32_t bti = insn.getImageIndex(); /* We have the clamp border workaround. */ uint32_t sampler = insn.getSamplerIndex() + insn.getSamplerOffset() * 8; - sel.SAMPLE(dst, insn.getDstNum(), src, srcNum, msgPayloads, 4, bti, sampler, insn.is3D()); + sel.SAMPLE(dst, insn.getDstNum(), msgPayloads, srcNum, bti, sampler, insn.is3D()); return true; } DECL_CTOR(SampleInstruction, 1, 1); diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp index 09e6762..cb80d7c 100644 --- a/backend/src/backend/gen_insn_selection.hpp +++ b/backend/src/backend/gen_insn_selection.hpp @@ -120,7 +120,7 @@ namespace gbe struct { uint16_t rdbti:8; uint16_t sampler:5; - uint16_t is3DRead:1; + uint16_t rdmsglen:3; }; uint32_t barrierType; } extra; diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h index cea4700..153c1c8 100755 --- a/backend/src/ocl_stdlib.tmpl.h +++ b/backend/src/ocl_stdlib.tmpl.h @@ -4487,18 +4487,18 @@ int __gen_ocl_force_simd16(void); // Image access functions ///////////////////////////////////////////////////////////////////////////// -OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset); +//OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset); OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset); -OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset); +//OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset); OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset); -OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset); +//OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset); OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset); -OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset); +//OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset); OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset); -OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset); +//OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset); OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset); -OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset); +//OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset); OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset); OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color); -- 2.7.4