From 35f39cc83e2ccfbe6fd6795af9f3fe18458217a7 Mon Sep 17 00:00:00 2001
From: Zhigang Gong <zhigang.gong@intel.com>
Date: Tue, 18 Feb 2014 17:19:41 +0800
Subject: [PATCH] GBE: Optimize write_image instruction for simd8 mode.

In simd8 mode, we can put the u,v,w,x,r,g,b,a values into
a selection vector directly, so we no longer need to
copy those values into the message payload again.

As an example, the following code, which does a simple image copy,
is what gets generated without this patch:

    (26      )  (+f0) mov(8)    g113<1>F        g114<8,8,1>D                    { align1 WE_normal 1Q };
    (28      )  (+f0) send(8)   g108<1>UD       g112<8,8,1>F
                sampler (3, 0, 0, 1) mlen 2 rlen 4              { align1 WE_normal 1Q };
    (30      )  mov(8)          g99<1>UD        0x0UD                           { align1 WE_all 1Q };
    (32      )  mov(1)          g99.7<1>UD      0xffffUD                        { align1 WE_all };
    (34      )  mov(8)          g103<1>UD       0x0UD                           { align1 WE_all 1Q };
    (36      )  (+f0) mov(8)    g100<1>UD       g117<8,8,1>UD                   { align1 WE_normal 1Q };
    (38      )  (+f0) mov(8)    g101<1>UD       g114<8,8,1>UD                   { align1 WE_normal 1Q };
    (40      )  (+f0) mov(8)    g104<1>UD       g108<8,8,1>UD                   { align1 WE_normal 1Q };
    (42      )  (+f0) mov(8)    g105<1>UD       g109<8,8,1>UD                   { align1 WE_normal 1Q };
    (44      )  (+f0) mov(8)    g106<1>UD       g110<8,8,1>UD                   { align1 WE_normal 1Q };
    (46      )  (+f0) mov(8)    g107<1>UD       g111<8,8,1>UD                   { align1 WE_normal 1Q };
    (48      )  (+f0) send(8)   null            g99<8,8,1>UD
                renderunsupported target 5 mlen 9 rlen 0        { align1 WE_normal 1Q };
    (50      )  (+f0) mov(8)    g1<1>UW         0x1UW                           { align1 WE_normal 1Q };
  L1:
    (52      )  mov(8)          g112<1>UD       g0<8,8,1>UD                     { align1 WE_all 1Q };
    (54      )  send(8)         null            g112<8,8,1>UD
                thread_spawnerunsupported target 7 mlen 1 rlen 0 { align1 WE_normal 1Q EOT };

With this patch, we can optimize it as below:

    (26      )  (+f0) mov(8)    g106<1>F        g111<8,8,1>D                    { align1 WE_normal 1Q };
    (28      )  (+f0) send(8)   g114<1>UD       g105<8,8,1>F
                sampler (3, 0, 0, 1) mlen 2 rlen 4              { align1 WE_normal 1Q };
    (30      )  mov(8)          g109<1>UD       0x0UD                           { align1 WE_all 1Q };
    (32      )  mov(1)          g109.7<1>UD     0xffffUD                        { align1 WE_all };
    (34      )  mov(8)          g113<1>UD       0x0UD                           { align1 WE_all 1Q };
    (36      )  (+f0) send(8)   null            g109<8,8,1>UD
                renderunsupported target 5 mlen 9 rlen 0        { align1 WE_normal 1Q };
    (38      )  (+f0) mov(8)    g1<1>UW         0x1UW                           { align1 WE_normal 1Q };
  L1:
    (40      )  mov(8)          g112<1>UD       g0<8,8,1>UD                     { align1 WE_all 1Q };
    (42      )  send(8)         null            g112<8,8,1>UD
                thread_spawnerunsupported target 7 mlen 1 rlen 0 { align1 WE_normal 1Q EOT };

This patch can save about 8 instructions per write_image in simd8 mode.

Signed-off-by: Zhigang Gong <zhigang.gong@intel.com>
Reviewed-by: "Yang, Rong R" <rong.r.yang@intel.com>
---
 backend/src/backend/gen_context.cpp        | 58 +-------------------
 backend/src/backend/gen_insn_selection.cpp | 88 +++++++++++++++++++++++-------
 backend/src/ocl_stdlib.tmpl.h              | 12 ++--
 3 files changed, 77 insertions(+), 81 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 5541cfb..2a720c2 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -1790,9 +1790,7 @@ namespace gbe
     const unsigned char sampler = insn.extra.sampler;
     const unsigned int msgLen = insn.extra.rdmsglen;
     uint32_t simdWidth = p->curr.execWidth;
-    //p->push();
     p->SAMPLE(dst, msgPayload, msgLen, false, bti, sampler, simdWidth, -1, 0);
-    //p->pop();
   }
 
   void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {
@@ -1828,60 +1826,8 @@ namespace gbe
 
   void GenContext::emitTypedWriteInstruction(const SelectionInstruction &insn) {
     const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
-    const GenRegister ucoord = ra->genReg(insn.src(insn.extra.msglen));
-    const GenRegister vcoord = ra->genReg(insn.src(1 + insn.extra.msglen));
-    const GenRegister R = ra->genReg(insn.src(3 + insn.extra.msglen));
-    const GenRegister G = ra->genReg(insn.src(4 + insn.extra.msglen));
-    const GenRegister B = ra->genReg(insn.src(5 + insn.extra.msglen));
-    const GenRegister A = ra->genReg(insn.src(6 + insn.extra.msglen));
-    const unsigned char bti = insn.extra.bti;
-
-    p->push();
-    uint32_t simdWidth = p->curr.execWidth;
-    const uint32_t nr = header.nr;
-    p->curr.predicate = GEN_PREDICATE_NONE;
-    p->curr.noMask = 1;
-    p->MOV(header, GenRegister::immud(0x0));
-    p->curr.execWidth = 1;
-
-    // prepare mesg desc and move to a0.0.
-    // desc = bti | (msg_type << 14) | (header_present << 19))
-    // prepare header, we need to enable all the 8 planes.
-    p->MOV(GenRegister::ud8grf(nr, 7), GenRegister::immud(0xffff));
-    p->curr.execWidth = 8;
-    // Typed write only support SIMD8.
-    // Prepare message payload U + V + R(ignored) + LOD(0) + RGBA.
-    // Currently, we don't support non-zero lod, so we clear all lod to
-    // zero for both quarters thus save one instruction here.
-    // Thus we must put this instruction in noMask and no predication state.
-    p->MOV(GenRegister::ud8grf(nr + 4, 0), GenRegister::immud(0)); //LOD
-    p->pop();
-    p->push();
-    p->curr.execWidth = 8;
-    // TYPED WRITE send instruction only support SIMD8, if we are SIMD16, we
-    // need to call it twice.
-    uint32_t quarterNum = (simdWidth == 8) ? 1 : 2;
-
-    for( uint32_t quarter = 0; quarter < quarterNum; quarter++)
-    {
-#define QUARTER_MOV0(dst_nr, src) p->MOV(GenRegister::ud8grf(dst_nr, 0), \
-                                        GenRegister::retype(GenRegister::QnPhysical(src, quarter), src.type))
-#define QUARTER_MOV1(dst_nr, src) p->MOV(GenRegister::retype(GenRegister::ud8grf(dst_nr, 0), src.type), \
-                                        GenRegister::retype(GenRegister::QnPhysical(src,quarter), src.type))
-      if (quarter == 1)
-        p->curr.quarterControl = GEN_COMPRESSION_Q2;
-      QUARTER_MOV0(nr + 1, ucoord);
-      QUARTER_MOV0(nr + 2, vcoord);
-      if (insn.extra.is3DWrite)
-        QUARTER_MOV0(nr + 3, ra->genReg(insn.src(2 + insn.extra.msglen)));
-      QUARTER_MOV1(nr + 5, R);
-      QUARTER_MOV1(nr + 6, G);
-      QUARTER_MOV1(nr + 7, B);
-      QUARTER_MOV1(nr + 8, A);
-#undef QUARTER_MOV
-      p->TYPED_WRITE(header, true, bti);
-    }
-    p->pop();
+    const uint32_t bti = insn.extra.bti;
+    p->TYPED_WRITE(header, true, bti);
   }
 
   BVAR(OCL_OUTPUT_REG_ALLOC, false);
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index d76f580..697ed1a 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -545,7 +545,7 @@ namespace gbe
     /*! Encode sample instructions */
     void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool is3D);
     /*! Encode typed write instructions */
-    void TYPED_WRITE(GenRegister *src, uint32_t srcNum, GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D);
+    void TYPED_WRITE(GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D);
     /*! Get image information */
     void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, uint32_t bti);
     /*! Multiply 64-bit integers */
@@ -1451,18 +1451,15 @@ namespace gbe
     this->opaque = GBE_NEW(Selection::Opaque, ctx);
   }
 
-  void Selection::Opaque::TYPED_WRITE(GenRegister *src, uint32_t srcNum,
-                                      GenRegister *msgs, uint32_t msgNum,
+  void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t msgNum,
                                       uint32_t bti, bool is3D) {
     uint32_t elemID = 0;
     uint32_t i;
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_TYPED_WRITE, 0, msgNum + srcNum);
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_TYPED_WRITE, 0, msgNum);
     SelectionVector *msgVector = this->appendVector();;
 
     for( i = 0; i < msgNum; ++i, ++elemID)
       insn->src(elemID) = msgs[i];
-    for (i = 0; i < srcNum; ++i, ++elemID)
-      insn->src(elemID) = src[i];
 
     insn->extra.bti = bti;
     insn->extra.msglen = msgNum;
@@ -3036,24 +3033,77 @@ namespace gbe
     {
       using namespace ir;
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
-      uint32_t valueID;
       GenRegister msgs[9]; // (header + U + V + R + LOD + 4)
-      GenRegister src[insn.getSrcNum()];
-      uint32_t msgNum = (8 / (simdWidth / 8)) + 1;
-      uint32_t coordNum = 3;
+      const uint32_t msgNum = (8 / (simdWidth / 8)) + 1;
+      const uint32_t coordNum = 3;
 
-      for(uint32_t i = 0; i < msgNum; i++)
-        msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-
-      // u, v, w coords should use coord type.
-      for (valueID = 0; valueID < coordNum; ++valueID)
-        src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getCoordType());
+      if (simdWidth == 16) {
+        for(uint32_t i = 0; i < msgNum; i++)
+          msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+      } else {
+        uint32_t valueID = 0;
+        msgs[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        for(uint32_t msgID = 1; msgID < 1 + coordNum; msgID++, valueID++)
+          msgs[msgID] = sel.selReg(insn.getSrc(msgID - 1), insn.getCoordType());
+        // fake w.
+        if (!insn.is3D())
+          msgs[3] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        // LOD.
+        msgs[4] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        for(uint32_t msgID = 5; valueID < insn.getSrcNum(); msgID++, valueID++)
+          msgs[msgID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+      }
 
-      for (; valueID < insn.getSrcNum(); ++valueID)
-        src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+      sel.push();
+      sel.curr.predicate = GEN_PREDICATE_NONE;
+      sel.curr.noMask = 1;
+      sel.MOV(msgs[0], GenRegister::immud(0));
+      sel.curr.execWidth = 1;
+
+      GenRegister channelEn = GenRegister::offset(msgs[0], 0, 7*4);
+      channelEn.subphysical = 1;
+      // Enable all channels.
+      sel.MOV(channelEn, GenRegister::immud(0xffff));
+      sel.curr.execWidth = 8;
+      // Set zero LOD.
+      if (simdWidth == 8)
+        sel.MOV(msgs[4], GenRegister::immud(0));
+      else
+        sel.MOV(GenRegister::Qn(msgs[2], 0), GenRegister::immud(0));
+      sel.pop();
 
       uint32_t bti = insn.getImageIndex();
-      sel.TYPED_WRITE(src, insn.getSrcNum(), msgs, msgNum, bti, insn.is3D());
+      if (simdWidth == 8)
+        sel.TYPED_WRITE(msgs, msgNum, bti, insn.is3D());
+      else {
+        sel.push();
+        sel.curr.execWidth = 8;
+        for( uint32_t quarter = 0; quarter < 2; quarter++)
+        {
+          #define QUARTER_MOV0(msgs, msgid, src) \
+                    sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], GEN_TYPE_UD), msgid % 2), \
+                            GenRegister::Qn(src, quarter))
+
+          #define QUARTER_MOV1(msgs, msgid, src) \
+                  sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], src.type), msgid % 2), \
+                          GenRegister::Qn(src, quarter))
+          sel.curr.quarterControl = (quarter == 0) ? GEN_COMPRESSION_Q1 : GEN_COMPRESSION_Q2;
+          // Set U,V,W
+          QUARTER_MOV0(msgs, 1, sel.selReg(insn.getSrc(0), insn.getCoordType()));
+          QUARTER_MOV0(msgs, 2, sel.selReg(insn.getSrc(1), insn.getCoordType()));
+          if (insn.is3D())
+            QUARTER_MOV0(msgs, 3, sel.selReg(insn.getSrc(2), insn.getCoordType()));
+          // Set R, G, B, A
+          QUARTER_MOV1(msgs, 5, sel.selReg(insn.getSrc(3), insn.getSrcType()));
+          QUARTER_MOV1(msgs, 6, sel.selReg(insn.getSrc(4), insn.getSrcType()));
+          QUARTER_MOV1(msgs, 7, sel.selReg(insn.getSrc(5), insn.getSrcType()));
+          QUARTER_MOV1(msgs, 8, sel.selReg(insn.getSrc(6), insn.getSrcType()));
+          sel.TYPED_WRITE(msgs, msgNum, bti, insn.is3D());
+          #undef QUARTER_MOV0
+          #undef QUARTER_MOV1
+        }
+        sel.pop();
+      }
       return true;
     }
     DECL_CTOR(TypedWriteInstruction, 1, 1);
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index 153c1c8..9dec3a5 100755
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -4502,18 +4502,18 @@ OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, fl
 OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
 
 OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color);
-OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, int4 color);
+//OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, int4 color);
 OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, uint4 color);
-OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, uint4 color);
+//OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, uint4 color);
 OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, float4 color);
-OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float4 color);
+//OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float4 color);
 
 OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int w, int4 color);
-OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, float w, int4 color);
+//OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, float w, int4 color);
 OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, int w, uint4 color);
-OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, float w, uint4 color);
+//OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, float w, uint4 color);
 OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, int w, float4 color);
-OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float w, float4 color);
+//OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float w, float4 color);
 int __gen_ocl_get_image_width(uint surface_id);
 int __gen_ocl_get_image_height(uint surface_id);
 int __gen_ocl_get_image_channel_data_type(uint surface_id);
-- 
2.7.4