// We insert the block IP mask first
this->insertCurbeReg(ir::ocl::blockip, this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t)));
+ this->insertCurbeReg(ir::ocl::emask, this->newCurbeEntry(GBE_CURBE_EMASK, 0, sizeof(uint32_t)));
+ this->insertCurbeReg(ir::ocl::notemask, this->newCurbeEntry(GBE_CURBE_NOT_EMASK, 0, sizeof(uint32_t)));
// Go over the arguments and find the related patch locations
const uint32_t argNum = fn.argNum();
});
#undef INSERT_REG
- // Insert the number of threads
- insertCurbeReg(ir::ocl::threadn, this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 0, sizeof(uint32_t)));
-
// Insert the stack buffer if used
if (useStackPtr)
insertCurbeReg(ir::ocl::stackptr, this->newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER, ptrSize));
reg == ir::ocl::goffset0 ||
reg == ir::ocl::goffset1 ||
reg == ir::ocl::goffset2 ||
- reg == ir::ocl::workdim)
+ reg == ir::ocl::workdim ||
+ reg == ir::ocl::emask ||
+ reg == ir::ocl::notemask)
return true;
return false;
}
void GenContext::clearFlagRegister(void) {
// when group size not aligned to simdWidth, flag register need clear to
// make prediction(any8/16h) work correctly
+ const GenRegister emaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::emask));
+ const GenRegister notEmaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::notemask));
+ uint32_t execWidth = p->curr.execWidth;
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.noMask = 1;
+ /* Clear all the bits in f0.0. */
p->curr.execWidth = 1;
- p->MOV(GenRegister::retype(GenRegister::flag(0,0), GEN_TYPE_UD), GenRegister::immud(0x0));
- p->MOV(GenRegister::retype(GenRegister::flag(1,0), GEN_TYPE_UD), GenRegister::immud(0x0));
+ p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW), GenRegister::immud(0x0000));
+ p->curr.noMask = 0;
+ p->curr.useFlag(0, 0);
+ p->curr.execWidth = execWidth;
+ /* Set all the active lanes to 1. Inactive lanes remain 0. */
+ p->CMP(GEN_CONDITIONAL_EQ, GenRegister::ud16grf(126, 0), GenRegister::ud16grf(126, 0));
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+ p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW));
+ p->XOR(notEmaskReg, emaskReg, GenRegister::immud(0xFFFF));
p->pop();
}
if (sel.ctx.hasJIP(&insn)) {
const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
sel.push();
+
+ sel.curr.noMask = 1;
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
+ GenRegister flagReg = GenRegister::flag(0, 0);
+ sel.AND(flagReg, flagReg, emaskReg);
+
if (simdWidth == 8)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
else if (simdWidth == 16)
else
NOT_IMPLEMENTED;
sel.curr.inversePredicate = 1;
- sel.curr.execWidth = 1;
sel.curr.flag = 0;
sel.curr.subFlag = 0;
- sel.curr.noMask = 1;
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
}
// It is slightly more complicated than for backward jump. We check that
// all PcIPs are greater than the next block IP to be sure that we can
// jump
+ // We set all the inactive channels to 1 as the GEN_PREDICATE_ALIGN1_ALL8/16
+ // predicates will check those bits as well.
+
sel.push();
sel.curr.physicalFlag = 0;
sel.curr.flagIndex = uint16_t(pred);
// XXX TODO: For group size not aligned to simdWidth, ALL8/16h may not
// work correct, as flag register bits mapped to non-active lanes tend
// to be zero.
+
+ sel.curr.execWidth = 1;
+ sel.curr.noMask = 1;
+ GenRegister notEmaskReg = GenRegister::uw1grf(ocl::notemask);
+ sel.OR(sel.selReg(pred, TYPE_U16), sel.selReg(pred, TYPE_U16), notEmaskReg);
+
if (simdWidth == 8)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
else if (simdWidth == 16)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
else
NOT_SUPPORTED;
- sel.curr.execWidth = 1;
- sel.curr.noMask = 1;
+
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
if (insn.isPredicated() == true) {
const Register pred = insn.getPredicateIndex();
+
// Update the PcIPs for all the branches. Just put the IPs of the next
// block. Next instruction will properly reupdate the IPs of the lanes
// that actually take the branch
sel.curr.flagIndex = uint16_t(pred);
sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
+ // We clear all the inactive channels to 0 as the GEN_PREDICATE_ALIGN1_ANY8/16
+ // predicates will check those bits as well.
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.execWidth = 1;
+ sel.curr.noMask = 1;
+ GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
+ sel.AND(sel.selReg(pred, TYPE_U16), sel.selReg(pred, TYPE_U16), emaskReg);
+
// Branch to the jump target
if (simdWidth == 8)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
else
NOT_SUPPORTED;
- sel.curr.execWidth = 1;
- sel.curr.noMask = 1;
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, lastID);
}
+ this->intervals[ocl::emask].minID = 0;
+ this->intervals[ocl::emask].maxID = INT_MAX;
+ this->intervals[ocl::notemask].minID = 0;
+ this->intervals[ocl::notemask].maxID = INT_MAX;
+ this->intervals[ocl::retVal].minID = INT_MAX;
+ this->intervals[ocl::retVal].maxID = -INT_MAX;
+
// Sort both intervals in starting point and ending point increasing orders
const uint32_t regNum = ctx.sel->getRegNum();
this->starting.resize(regNum);
GBE_CURBE_KERNEL_ARGUMENT,
GBE_CURBE_EXTRA_ARGUMENT,
GBE_CURBE_BLOCK_IP,
- GBE_CURBE_THREAD_NUM
+ GBE_CURBE_THREAD_NUM,
+ GBE_CURBE_EMASK,
+ GBE_CURBE_NOT_EMASK,
};
/*! Extra arguments use the negative range of sub-values */
GBE_ASSERT(target != NULL);
target->predecessors.insert(&bb);
bb.successors.insert(target);
- if (insn.isPredicated() == true) jumpToNext = &bb;
+ if (insn.isPredicated() == true) jumpToNext = &bb;
}
});
}
"stack_pointer",
"block_ip",
"barrier_id", "thread_number",
- "work_dimension", "sampler_info", "retVal"
+ "work_dimension", "sampler_info", "emask", "notemask", "retVal"
};
#if GBE_DEBUG
DECL_NEW_REG(FAMILY_DWORD, threadn);
DECL_NEW_REG(FAMILY_DWORD, workdim);
DECL_NEW_REG(FAMILY_WORD, samplerinfo);
+ DECL_NEW_REG(FAMILY_WORD, emask);
+ DECL_NEW_REG(FAMILY_WORD, notemask);
DECL_NEW_REG(FAMILY_WORD, retVal);
}
#undef DECL_NEW_REG
static const Register threadn = Register(21); // number of threads
static const Register workdim = Register(22); // work dimention.
static const Register samplerinfo = Register(23); // store sampler info.
- static const Register retVal = Register(24); // helper register to do data flow analysis.
- static const uint32_t regNum = 25; // number of special registers
+ static const Register emask = Register(24); // store the emask bits for the branching fix.
+ static const Register notemask = Register(25); // store the !emask bits for the branching fix.
+ static const Register retVal = Register(26); // helper register to do data flow analysis.
+ static const uint32_t regNum = 27; // number of special registers
extern const char *specialRegMean[]; // special register name.
} /* namespace ocl */