From 35f39cc83e2ccfbe6fd6795af9f3fe18458217a7 Mon Sep 17 00:00:00 2001 From: Zhigang Gong Date: Tue, 18 Feb 2014 17:19:41 +0800 Subject: [PATCH] GBE: Optimize write_image instruction for simd8 mode. In simd8 mode, we can put u,v,w,x,r,g,b,a into a selection vector directly, so we don't need to copy those values again. For example, the following code, which does a simple image copy, is generated without this patch: (26 ) (+f0) mov(8) g113<1>F g114<8,8,1>D { align1 WE_normal 1Q }; (28 ) (+f0) send(8) g108<1>UD g112<8,8,1>F sampler (3, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; (30 ) mov(8) g99<1>UD 0x0UD { align1 WE_all 1Q }; (32 ) mov(1) g99.7<1>UD 0xffffUD { align1 WE_all }; (34 ) mov(8) g103<1>UD 0x0UD { align1 WE_all 1Q }; (36 ) (+f0) mov(8) g100<1>UD g117<8,8,1>UD { align1 WE_normal 1Q }; (38 ) (+f0) mov(8) g101<1>UD g114<8,8,1>UD { align1 WE_normal 1Q }; (40 ) (+f0) mov(8) g104<1>UD g108<8,8,1>UD { align1 WE_normal 1Q }; (42 ) (+f0) mov(8) g105<1>UD g109<8,8,1>UD { align1 WE_normal 1Q }; (44 ) (+f0) mov(8) g106<1>UD g110<8,8,1>UD { align1 WE_normal 1Q }; (46 ) (+f0) mov(8) g107<1>UD g111<8,8,1>UD { align1 WE_normal 1Q }; (48 ) (+f0) send(8) null g99<8,8,1>UD renderunsupported target 5 mlen 9 rlen 0 { align1 WE_normal 1Q }; (50 ) (+f0) mov(8) g1<1>UW 0x1UW { align1 WE_normal 1Q }; L1: (52 ) mov(8) g112<1>UD g0<8,8,1>UD { align1 WE_all 1Q }; (54 ) send(8) null g112<8,8,1>UD thread_spawnerunsupported target 7 mlen 1 rlen 0 { align1 WE_normal 1Q EOT }; With this patch, we can optimize it as below: (26 ) (+f0) mov(8) g106<1>F g111<8,8,1>D { align1 WE_normal 1Q }; (28 ) (+f0) send(8) g114<1>UD g105<8,8,1>F sampler (3, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; (30 ) mov(8) g109<1>UD 0x0UD { align1 WE_all 1Q }; (32 ) mov(1) g109.7<1>UD 0xffffUD { align1 WE_all }; (34 ) mov(8) g113<1>UD 0x0UD { align1 WE_all 1Q }; (36 ) (+f0) send(8) null g109<8,8,1>UD renderunsupported target 5 mlen 9 rlen 0 { align1 WE_normal 1Q }; (38 ) (+f0) mov(8) g1<1>UW 
0x1UW { align1 WE_normal 1Q }; L1: (40 ) mov(8) g112<1>UD g0<8,8,1>UD { align1 WE_all 1Q }; (42 ) send(8) null g112<8,8,1>UD thread_spawnerunsupported target 7 mlen 1 rlen 0 { align1 WE_normal 1Q EOT }; This patch could save about 8 instructions per write_image. Signed-off-by: Zhigang Gong Reviewed-by: "Yang, Rong R" --- backend/src/backend/gen_context.cpp | 58 +------------------- backend/src/backend/gen_insn_selection.cpp | 88 +++++++++++++++++++++++------- backend/src/ocl_stdlib.tmpl.h | 12 ++-- 3 files changed, 77 insertions(+), 81 deletions(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 5541cfb..2a720c2 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -1790,9 +1790,7 @@ namespace gbe const unsigned char sampler = insn.extra.sampler; const unsigned int msgLen = insn.extra.rdmsglen; uint32_t simdWidth = p->curr.execWidth; - //p->push(); p->SAMPLE(dst, msgPayload, msgLen, false, bti, sampler, simdWidth, -1, 0); - //p->pop(); } void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) { @@ -1828,60 +1826,8 @@ namespace gbe void GenContext::emitTypedWriteInstruction(const SelectionInstruction &insn) { const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD); - const GenRegister ucoord = ra->genReg(insn.src(insn.extra.msglen)); - const GenRegister vcoord = ra->genReg(insn.src(1 + insn.extra.msglen)); - const GenRegister R = ra->genReg(insn.src(3 + insn.extra.msglen)); - const GenRegister G = ra->genReg(insn.src(4 + insn.extra.msglen)); - const GenRegister B = ra->genReg(insn.src(5 + insn.extra.msglen)); - const GenRegister A = ra->genReg(insn.src(6 + insn.extra.msglen)); - const unsigned char bti = insn.extra.bti; - - p->push(); - uint32_t simdWidth = p->curr.execWidth; - const uint32_t nr = header.nr; - p->curr.predicate = GEN_PREDICATE_NONE; - p->curr.noMask = 1; - 
p->MOV(header, GenRegister::immud(0x0)); - p->curr.execWidth = 1; - - // prepare mesg desc and move to a0.0. - // desc = bti | (msg_type << 14) | (header_present << 19)) - // prepare header, we need to enable all the 8 planes. - p->MOV(GenRegister::ud8grf(nr, 7), GenRegister::immud(0xffff)); - p->curr.execWidth = 8; - // Typed write only support SIMD8. - // Prepare message payload U + V + R(ignored) + LOD(0) + RGBA. - // Currently, we don't support non-zero lod, so we clear all lod to - // zero for both quarters thus save one instruction here. - // Thus we must put this instruction in noMask and no predication state. - p->MOV(GenRegister::ud8grf(nr + 4, 0), GenRegister::immud(0)); //LOD - p->pop(); - p->push(); - p->curr.execWidth = 8; - // TYPED WRITE send instruction only support SIMD8, if we are SIMD16, we - // need to call it twice. - uint32_t quarterNum = (simdWidth == 8) ? 1 : 2; - - for( uint32_t quarter = 0; quarter < quarterNum; quarter++) - { -#define QUARTER_MOV0(dst_nr, src) p->MOV(GenRegister::ud8grf(dst_nr, 0), \ - GenRegister::retype(GenRegister::QnPhysical(src, quarter), src.type)) -#define QUARTER_MOV1(dst_nr, src) p->MOV(GenRegister::retype(GenRegister::ud8grf(dst_nr, 0), src.type), \ - GenRegister::retype(GenRegister::QnPhysical(src,quarter), src.type)) - if (quarter == 1) - p->curr.quarterControl = GEN_COMPRESSION_Q2; - QUARTER_MOV0(nr + 1, ucoord); - QUARTER_MOV0(nr + 2, vcoord); - if (insn.extra.is3DWrite) - QUARTER_MOV0(nr + 3, ra->genReg(insn.src(2 + insn.extra.msglen))); - QUARTER_MOV1(nr + 5, R); - QUARTER_MOV1(nr + 6, G); - QUARTER_MOV1(nr + 7, B); - QUARTER_MOV1(nr + 8, A); -#undef QUARTER_MOV - p->TYPED_WRITE(header, true, bti); - } - p->pop(); + const uint32_t bti = insn.extra.bti; + p->TYPED_WRITE(header, true, bti); } BVAR(OCL_OUTPUT_REG_ALLOC, false); diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index d76f580..697ed1a 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ 
b/backend/src/backend/gen_insn_selection.cpp @@ -545,7 +545,7 @@ namespace gbe /*! Encode sample instructions */ void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool is3D); /*! Encode typed write instructions */ - void TYPED_WRITE(GenRegister *src, uint32_t srcNum, GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D); + void TYPED_WRITE(GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D); /*! Get image information */ void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, uint32_t bti); /*! Multiply 64-bit integers */ @@ -1451,18 +1451,15 @@ namespace gbe this->opaque = GBE_NEW(Selection::Opaque, ctx); } - void Selection::Opaque::TYPED_WRITE(GenRegister *src, uint32_t srcNum, - GenRegister *msgs, uint32_t msgNum, + void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D) { uint32_t elemID = 0; uint32_t i; - SelectionInstruction *insn = this->appendInsn(SEL_OP_TYPED_WRITE, 0, msgNum + srcNum); + SelectionInstruction *insn = this->appendInsn(SEL_OP_TYPED_WRITE, 0, msgNum); SelectionVector *msgVector = this->appendVector();; for( i = 0; i < msgNum; ++i, ++elemID) insn->src(elemID) = msgs[i]; - for (i = 0; i < srcNum; ++i, ++elemID) - insn->src(elemID) = src[i]; insn->extra.bti = bti; insn->extra.msglen = msgNum; @@ -3036,24 +3033,77 @@ namespace gbe { using namespace ir; const uint32_t simdWidth = sel.ctx.getSimdWidth(); - uint32_t valueID; GenRegister msgs[9]; // (header + U + V + R + LOD + 4) - GenRegister src[insn.getSrcNum()]; - uint32_t msgNum = (8 / (simdWidth / 8)) + 1; - uint32_t coordNum = 3; + const uint32_t msgNum = (8 / (simdWidth / 8)) + 1; + const uint32_t coordNum = 3; - for(uint32_t i = 0; i < msgNum; i++) - msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); - - // u, v, w coords should use coord type. 
- for (valueID = 0; valueID < coordNum; ++valueID) - src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getCoordType()); + if (simdWidth == 16) { + for(uint32_t i = 0; i < msgNum; i++) + msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); + } else { + uint32_t valueID = 0; + msgs[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); + for(uint32_t msgID = 1; msgID < 1 + coordNum; msgID++, valueID++) + msgs[msgID] = sel.selReg(insn.getSrc(msgID - 1), insn.getCoordType()); + // fake w. + if (!insn.is3D()) + msgs[3] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); + // LOD. + msgs[4] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); + for(uint32_t msgID = 5; valueID < insn.getSrcNum(); msgID++, valueID++) + msgs[msgID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType()); + } - for (; valueID < insn.getSrcNum(); ++valueID) - src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType()); + sel.push(); + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.noMask = 1; + sel.MOV(msgs[0], GenRegister::immud(0)); + sel.curr.execWidth = 1; + + GenRegister channelEn = GenRegister::offset(msgs[0], 0, 7*4); + channelEn.subphysical = 1; + // Enable all channels. + sel.MOV(channelEn, GenRegister::immud(0xffff)); + sel.curr.execWidth = 8; + // Set zero LOD. 
+ if (simdWidth == 8) + sel.MOV(msgs[4], GenRegister::immud(0)); + else + sel.MOV(GenRegister::Qn(msgs[2], 0), GenRegister::immud(0)); + sel.pop(); uint32_t bti = insn.getImageIndex(); - sel.TYPED_WRITE(src, insn.getSrcNum(), msgs, msgNum, bti, insn.is3D()); + if (simdWidth == 8) + sel.TYPED_WRITE(msgs, msgNum, bti, insn.is3D()); + else { + sel.push(); + sel.curr.execWidth = 8; + for( uint32_t quarter = 0; quarter < 2; quarter++) + { + #define QUARTER_MOV0(msgs, msgid, src) \ + sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], GEN_TYPE_UD), msgid % 2), \ + GenRegister::Qn(src, quarter)) + + #define QUARTER_MOV1(msgs, msgid, src) \ + sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], src.type), msgid % 2), \ + GenRegister::Qn(src, quarter)) + sel.curr.quarterControl = (quarter == 0) ? GEN_COMPRESSION_Q1 : GEN_COMPRESSION_Q2; + // Set U,V,W + QUARTER_MOV0(msgs, 1, sel.selReg(insn.getSrc(0), insn.getCoordType())); + QUARTER_MOV0(msgs, 2, sel.selReg(insn.getSrc(1), insn.getCoordType())); + if (insn.is3D()) + QUARTER_MOV0(msgs, 3, sel.selReg(insn.getSrc(2), insn.getCoordType())); + // Set R, G, B, A + QUARTER_MOV1(msgs, 5, sel.selReg(insn.getSrc(3), insn.getSrcType())); + QUARTER_MOV1(msgs, 6, sel.selReg(insn.getSrc(4), insn.getSrcType())); + QUARTER_MOV1(msgs, 7, sel.selReg(insn.getSrc(5), insn.getSrcType())); + QUARTER_MOV1(msgs, 8, sel.selReg(insn.getSrc(6), insn.getSrcType())); + sel.TYPED_WRITE(msgs, msgNum, bti, insn.is3D()); + #undef QUARTER_MOV0 + #undef QUARTER_MOV1 + } + sel.pop(); + } return true; } DECL_CTOR(TypedWriteInstruction, 1, 1); diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h index 153c1c8..9dec3a5 100755 --- a/backend/src/ocl_stdlib.tmpl.h +++ b/backend/src/ocl_stdlib.tmpl.h @@ -4502,18 +4502,18 @@ OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, fl OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset); 
OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color); -OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, int4 color); +//OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, int4 color); OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, uint4 color); -OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, uint4 color); +//OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, uint4 color); OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, float4 color); -OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float4 color); +//OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float4 color); OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int w, int4 color); -OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, float w, int4 color); +//OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, float w, int4 color); OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, int w, uint4 color); -OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, float w, uint4 color); +//OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, float w, uint4 color); OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, int w, float4 color); -OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float w, float4 color); +//OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float w, float4 color); int __gen_ocl_get_image_width(uint surface_id); int __gen_ocl_get_image_height(uint surface_id); int __gen_ocl_get_image_channel_data_type(uint surface_id); -- 2.7.4