From: Zhigang Gong
Date: Thu, 29 May 2014 01:26:24 +0000 (+0800)
Subject: GBE: fix uniform/scalar related bugs.
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=1b8042620e32b05334f6bbbd87bde7ee4b4a8a03;p=contrib%2Fbeignet.git

GBE: fix uniform/scalar related bugs.

One major fix is that even if a register is a scalar, when we move a
scalar Dword to a scalar Byte we have to set the hstride to 4;
otherwise, it breaks the following register restriction:

  B. When the Execution Data Type is wider than the destination data
  type, the destination must be aligned as required by the wider
  execution data type and specify a HorzStride equal to the ratio in
  sizes of the two data types. For example, a mov with a D source and
  B destination must use a 4-byte aligned destination and a
  Dst.HorzStride of 4.

The following instruction may not take effect:

  mov.sat(1) g127.4<1>:B g126<0,1,0>:D

We have to change it to:

  mov.sat(1) g127.4<4>:B g126<0,1,0>:D

v2: keep the instruction selection stage unchanged; we fix this
restriction in setDst only.

Signed-off-by: Zhigang Gong
Reviewed-by: "Song, Ruiling"
---
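To make the restriction concrete before the patch itself: a minimal
standalone C++ sketch of the HorzStride ratio rule that setDst now
enforces. This is illustrative only, not Beignet code; the type-size
constants and the requiredHorzStride helper are invented for the example.

// Illustration only (not Beignet code): the required destination
// HorzStride is the ratio of the execution type size to the
// destination type size, per the PRM restriction quoted above.
#include <cassert>
#include <cstdio>

enum TypeSize { SIZE_B = 1, SIZE_W = 2, SIZE_D = 4 }; // byte sizes of B/W/D

// Hypothetical helper: compute Dst.HorzStride for a scalar destination
// written with a wider execution data type.
static int requiredHorzStride(int execTypeSize, int dstTypeSize) {
  assert(execTypeSize % dstTypeSize == 0);
  return execTypeSize / dstTypeSize; // D -> B yields 4, D -> W yields 2
}

int main() {
  // Matches the fixed instruction: mov.sat(1) g127.4<4>:B g126<0,1,0>:D
  printf("D -> B: HorzStride %d\n", requiredHorzStride(SIZE_D, SIZE_B)); // 4
  printf("D -> W: HorzStride %d\n", requiredHorzStride(SIZE_D, SIZE_W)); // 2
  printf("D -> D: HorzStride %d\n", requiredHorzStride(SIZE_D, SIZE_D)); // 1
  return 0;
}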
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 0091e81..ed2fd32 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -68,14 +68,17 @@ namespace gbe
   }
 
   INLINE bool needToSplitAlu1(GenEncoder *p, GenRegister dst, GenRegister src) {
-    if (p->curr.execWidth != 16) return false;
+    if (p->curr.execWidth != 16 || src.hstride == GEN_HORIZONTAL_STRIDE_0) return false;
     if (isVectorOfBytes(dst) == true) return true;
     if (isVectorOfBytes(src) == true) return true;
     return false;
   }
 
   INLINE bool needToSplitAlu2(GenEncoder *p, GenRegister dst, GenRegister src0, GenRegister src1) {
-    if (p->curr.execWidth != 16) return false;
+    if (p->curr.execWidth != 16 ||
+        (src0.hstride == GEN_HORIZONTAL_STRIDE_0 &&
+         src1.hstride == GEN_HORIZONTAL_STRIDE_0))
+      return false;
     if (isVectorOfBytes(dst) == true) return true;
     if (isVectorOfBytes(src0) == true) return true;
     if (isVectorOfBytes(src1) == true) return true;
@@ -83,7 +86,10 @@ namespace gbe
   }
 
   INLINE bool needToSplitCmp(GenEncoder *p, GenRegister src0, GenRegister src1) {
-    if (p->curr.execWidth != 16) return false;
+    if (p->curr.execWidth != 16 ||
+        (src0.hstride == GEN_HORIZONTAL_STRIDE_0 &&
+         src1.hstride == GEN_HORIZONTAL_STRIDE_0))
+      return false;
     if (isVectorOfBytes(src0) == true) return true;
     if (isVectorOfBytes(src1) == true) return true;
     if (src0.type == GEN_TYPE_D || src0.type == GEN_TYPE_UD || src0.type == GEN_TYPE_F)
@@ -93,7 +99,6 @@ namespace gbe
     return false;
   }
 
-
   void GenEncoder::setMessageDescriptor(GenNativeInstruction *inst, enum GenMessageTarget sfid,
                                         unsigned msg_length, unsigned response_length,
                                         bool header_present, bool end_of_thread)
@@ -268,8 +273,14 @@ namespace gbe
     insn->bits1.da1.dest_address_mode = dest.address_mode;
     insn->bits1.da1.dest_reg_nr = dest.nr;
     insn->bits1.da1.dest_subreg_nr = dest.subnr;
-    if (dest.hstride == GEN_HORIZONTAL_STRIDE_0)
-      dest.hstride = GEN_HORIZONTAL_STRIDE_1;
+    if (dest.hstride == GEN_HORIZONTAL_STRIDE_0) {
+      if (dest.type == GEN_TYPE_UB || dest.type == GEN_TYPE_B)
+        dest.hstride = GEN_HORIZONTAL_STRIDE_4;
+      else if (dest.type == GEN_TYPE_UW || dest.type == GEN_TYPE_W)
+        dest.hstride = GEN_HORIZONTAL_STRIDE_2;
+      else
+        dest.hstride = GEN_HORIZONTAL_STRIDE_1;
+    }
     insn->bits1.da1.dest_horiz_stride = dest.hstride;
   }
 
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index cf0af9d..19921d4 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -3076,6 +3076,13 @@ namespace gbe
         narrowDst = 0;
       }
 
+      sel.push();
+      if (sel.isScalarReg(insn.getDst(0)) == true) {
+        sel.curr.execWidth = 1;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+      }
+
       for(int i = 0; i < narrowNum; i++, index++) {
         GenRegister narrowReg, wideReg;
         if(narrowDst) {
@@ -3120,6 +3127,7 @@ namespace gbe
         } else
           sel.MOV(xdst, xsrc);
       }
+      sel.pop();
       return true;
     }
 
@@ -3154,7 +3162,14 @@ namespace gbe
     } else if (opcode == OP_F32TO16) {
       GenRegister unpacked;
       unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
-      sel.F32TO16(unpacked, src);
+      sel.push();
+      if (sel.isScalarReg(insn.getSrc(0))) {
+        sel.curr.execWidth = 1;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+      }
+      sel.F32TO16(unpacked, src);
+      sel.pop();
       sel.MOV(dst, unpacked);
     } else if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && (srcFamily == FAMILY_DWORD || srcFamily == FAMILY_QWORD)) {
       GenRegister unpacked;
@@ -3172,8 +3187,16 @@ namespace gbe
         tmp.type = GEN_TYPE_D;
         sel.CONVI64_TO_I(tmp, src);
         sel.MOV(unpacked, tmp);
-      } else
-        sel.MOV(unpacked, src);
+      } else {
+        sel.push();
+        if (sel.isScalarReg(insn.getSrc(0))) {
+          sel.curr.execWidth = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+        }
+        sel.MOV(unpacked, src);
+        sel.pop();
+      }
       sel.MOV(dst, unpacked);
     } else if ((dstType == ir::TYPE_S32 || dstType == ir::TYPE_U32) && srcFamily == FAMILY_QWORD) {
       sel.CONVI64_TO_I(dst, src);
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index f642c2e..3d8b0b3 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -941,6 +941,10 @@ namespace gbe
           || ctx.reservedSpillRegs != 0)
         this->expireGRF(interval);
       tick++;
+      // A scalar byte register may be used as a destination register while
+      // the source is a scalar Dword. In that case, the byte register must
+      // get a 4-byte-aligned register offset.
+      alignment = (alignment + 3) & ~3;
       while ((grfOffset = ctx.allocate(size, alignment)) == 0) {
         const bool success = this->expireGRF(interval);
         if (success == false) {
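As a footnote to the gen_reg_allocation.cpp hunk: the added line rounds
every alignment up to the next multiple of 4 so a scalar byte destination
can satisfy the 4-byte alignment the PRM requires. A standalone sketch of
that rounding follows; the alignTo4 name is invented for illustration,
while the real code inlines the expression.

// Illustration only: round an alignment up to a multiple of 4, exactly
// the (alignment + 3) & ~3 expression added in gen_reg_allocation.cpp.
#include <cstdint>
#include <cstdio>
#include <initializer_list>

static uint32_t alignTo4(uint32_t alignment) {
  return (alignment + 3) & ~uint32_t(3); // no-op when already 4-byte aligned
}

int main() {
  for (uint32_t a : {1u, 2u, 3u, 4u, 5u})
    printf("alignment %u -> %u\n", a, alignTo4(a)); // 1,2,3,4 -> 4; 5 -> 8
  return 0;
}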