[GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
[GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
- [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 1, .ndst = 0 },
- [GEN_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
- [GEN_OPCODE_BRC] = { .name = "brc", .nsrc = 1, .ndst = 0 },
- [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 },
- [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
- [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 },
- [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
+ [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_IF] = { .name = "if", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_BRC] = { .name = "brc", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 0, .ndst = 0 },
[GEN_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
[GEN_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
} else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_IF ||
inst->header.opcode == GEN_OPCODE_ELSE ||
inst->header.opcode == GEN_OPCODE_ENDIF ||
- inst->header.opcode == GEN_OPCODE_WHILE)) {
- // XXX format (file, " %d", inst->bits1.branch_gen6.jump_count);
- assert(0);
+ inst->header.opcode == GEN_OPCODE_WHILE ||
+ inst->header.opcode == GEN_OPCODE_BRD ||
+ inst->header.opcode == GEN_OPCODE_JMPI)) {
+ format(file, " %d", (int16_t)inst->bits3.gen7_branch.jip);
} else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_BREAK ||
inst->header.opcode == GEN_OPCODE_CONTINUE ||
- inst->header.opcode == GEN_OPCODE_HALT)) {
- // XXX format (file, " %d %d", inst->bits3.break_cont.uip, inst->bits3.break_cont.jip);
- assert(0);
- } else if (inst->header.opcode == GEN_OPCODE_JMPI) {
+ inst->header.opcode == GEN_OPCODE_HALT ||
+ inst->header.opcode == GEN_OPCODE_BRC)) {
+ format (file, " %d %d", inst->bits3.gen7_branch.jip, inst->bits3.gen7_branch.uip);
+ }/* else if (inst->header.opcode == GEN_OPCODE_JMPI) {
format (file, " %d", inst->bits3.d);
- }
+ }*/
if (opcode[inst->header.opcode].nsrc > 0) {
pad (file, 32);
const LabelIndex label = pair.first;
const int32_t insnID = pair.second;
const int32_t targetID = labelPos.find(label)->second;
- p->patchJMPI(insnID, (targetID-insnID-1) * 2);
+ p->patchJMPI(insnID, (targetID - insnID) * 2);
+ }
+ for (auto pair : branchPos3) {
+ const LabelPair labelPair = pair.first;
+ const int32_t insnID = pair.second;
+ const int32_t jip = labelPos.find(labelPair.l0)->second + labelPair.offset0;
+ const int32_t uip = labelPos.find(labelPair.l1)->second + labelPair.offset1;
+ assert((jip - insnID) * 2 < 32767 && (jip - insnID) * 2 > -32768);
+ assert((uip - insnID) * 2 < 32767 && (uip - insnID) * 2 > -32768);
+ p->patchJMPI(insnID, (((uip - insnID) * 2) << 16) | ((jip - insnID) * 2));
}
}
void GenContext::clearFlagRegister(void) {
// when group size not aligned to simdWidth, flag register need clear to
// make prediction(any8/16h) work correctly
- const GenRegister emaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::emask));
- const GenRegister notEmaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::notemask));
- uint32_t execWidth = p->curr.execWidth;
+ const GenRegister blockip = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
p->push();
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- /* clear all the bit in f0.0. */
- p->curr.execWidth = 1;
- p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW), GenRegister::immuw(0x0000));
- /* clear the barrier mask bits to all zero0*/
- p->curr.noMask = 0;
- p->curr.useFlag(0, 0);
- p->curr.execWidth = execWidth;
- /* set all the active lane to 1. Inactive lane remains 0. */
- p->CMP(GEN_CONDITIONAL_EQ, GenRegister::ud16grf(126, 0), GenRegister::ud16grf(126, 0));
- p->curr.noMask = 1;
- p->curr.execWidth = 1;
- p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW));
- p->XOR(notEmaskReg, emaskReg, GenRegister::immuw(0xFFFF));
- p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), notEmaskReg);
+ p->curr.noMask = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->MOV(blockip, GenRegister::immuw(GEN_MAX_LABEL));
+ p->curr.noMask = 0;
+ p->MOV(blockip, GenRegister::immuw(0));
p->pop();
}
// Check that everything is consistent in the kernel code
const uint32_t perLaneSize = kernel->getStackSize();
const uint32_t perThreadSize = perLaneSize * this->simdWidth;
- //const int32_t offset = GEN_REG_SIZE + kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
GBE_ASSERT(perLaneSize > 0);
GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
for (int i = 0; i < w / 8; i ++) {
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), src0, src1);
p->curr.accWrEnable = 1;
p->MACH(tmp, src0, src1);
int execWidth = p->curr.execWidth;
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.execWidth = 8;
for (int nib = 0; nib < execWidth / 4; nib ++) {
p->AND(dest, src.bottom_half(), GenRegister::immud(63));
void GenContext::I64ABS(GenRegister sign, GenRegister high, GenRegister low, GenRegister tmp, GenRegister flagReg) {
p->SHR(sign, high, GenRegister::immud(31));
p->push();
+ p->curr.noMask = 1;
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NZ, sign, GenRegister::immud(0));
I64FullMult(e, f, g, h, a, b, c, d);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NZ, i, GenRegister::immud(0));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->OR(a, e, f);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NZ, a, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
I64FullMult(e, f, g, h, a, b, c, d);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NZ, i, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->push();
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->CMP(GEN_CONDITIONAL_NZ, e, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(b, one);
case SEL_OP_I64SHL:
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
collectShifter(a, y);
loadBottomHalf(e, x);
loadTopHalf(f, x);
case SEL_OP_I64SHR:
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
collectShifter(a, y);
loadBottomHalf(e, x);
loadTopHalf(f, x);
f.type = GEN_TYPE_D;
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
collectShifter(a, y);
loadBottomHalf(e, x);
loadTopHalf(f, x);
p->push();
p->curr.useFlag(flag.flag_nr(), flag.flag_subnr());
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(32)); //high == 0
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(dst, low);
p->pop();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->CMP(GEN_CONDITIONAL_G, exp, GenRegister::immud(23));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->CMP(GEN_CONDITIONAL_L, exp, GenRegister::immud(32)); //exp>23 && high!=0
p->pop();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(23));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(dst_ud, GenRegister::immud(0)); //exp==9, SHR == 0
p->SHL(high, low, tmp);
p->MOV(low, GenRegister::immud(0));
- p->patchJMPI(jip1, (p->n_instruction() - (jip1 + 1)) * 2);
+ p->patchJMPI(jip1, (p->n_instruction() - jip1) * 2);
p->curr.predicate = GEN_PREDICATE_NONE;
p->CMP(GEN_CONDITIONAL_LE, exp, GenRegister::immud(31)); //update dst where high != 0
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->CMP(GEN_CONDITIONAL_EQ, high, GenRegister::immud(0x80000000));
p->CMP(GEN_CONDITIONAL_EQ, low, GenRegister::immud(0x0));
p->AND(dst_ud, dst_ud, GenRegister::immud(0xfffffffe));
- p->patchJMPI(jip0, (p->n_instruction() - (jip0 + 1)) * 2);
+ p->patchJMPI(jip0, (p->n_instruction() - jip0) * 2);
p->pop();
p->MOV(tmp_high, high);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(f0.flag_nr(), f0.flag_subnr());
p->CMP(GEN_CONDITIONAL_GE, tmp_high, GenRegister::immud(0x80000000));
p->curr.predicate = GEN_PREDICATE_NORMAL;
UnsignedI64ToFloat(dest, high, low, exp, mantissa, tmp, f0);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(f0.flag_nr(), f0.flag_subnr());
p->CMP(GEN_CONDITIONAL_GE, tmp_high, GenRegister::immud(0x80000000));
p->curr.predicate = GEN_PREDICATE_NORMAL;
if(dst.is_signed_int()) {
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flag0.flag_nr(), flag0.flag_subnr());
p->CMP(GEN_CONDITIONAL_L, src, GenRegister::immf(0x0));
p->curr.predicate = GEN_PREDICATE_NORMAL;
f1.width = GEN_WIDTH_1;
GenRegister f2 = GenRegister::suboffset(f1, 1);
GenRegister f3 = GenRegister::suboffset(f1, 2);
- GenRegister f4 = GenRegister::suboffset(f1, 3);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
- saveFlag(f4, flag, subFlag);
+ p->curr.noMask = 1;
loadTopHalf(tmp0, src0);
loadTopHalf(tmp1, src1);
switch(insn.extra.function) {
NOT_IMPLEMENTED;
}
p->curr.execWidth = 1;
- p->AND(f1, f1, f4);
p->MOV(GenRegister::flag(flag, subFlag), f1);
p->pop();
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->MOV(dst, GenRegister::immd(0));
+ p->curr.noMask = 0;
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(dst, GenRegister::immd(-1));
p->pop();
p->ADD(c, c, d);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
if(! dst.is_signed_int()) {
p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0));
p->MOV(a, GenRegister::immud(0x80000000u));
p->MOV(b, GenRegister::immud(0));
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(0));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->CMP(GEN_CONDITIONAL_GE, a, GenRegister::immud(0x80000000u));
p->ADD(c, c, d);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
if(! dst.is_signed_int()) {
p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0));
src = src.top_half();
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.execWidth = 8;
p->MOV(dest, src);
p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
int execWidth = p->curr.execWidth;
dest = dest.top_half();
p->push();
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
p->curr.execWidth = 8;
p->MOV(dest, src);
p->curr.nibControl = 1;
src = src.bottom_half();
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.execWidth = 8;
p->MOV(dest, src);
p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
dest = dest.bottom_half();
p->push();
p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(dest, src);
p->curr.nibControl = 1;
p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
loadBottomHalf(d, y);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), e, b, c);
I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), f, a, d);
p->ADD(e, e, f);
// condition <- (c,d)==0 && (a,b)>=(e,f)
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->MOV(l, zero);
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_EQ, a, e);
p->ADD(m, m, one);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_L, m, GenRegister::immud(64));
p->curr.noMask = 1;
p->AND(flagReg, flagReg, emaskReg);
- p->curr.predicate = GEN_PREDICATE_NORMAL;
// under condition, jump back to start point
if (simdWidth == 8)
p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
else
NOT_IMPLEMENTED;
int jip = -(int)(p->n_instruction() - loop_start + 1) * 2;
+ p->curr.noMask = 1;
p->JMPI(zero);
- p->patchJMPI(p->n_instruction()-2, jip);
+ p->patchJMPI(p->n_instruction() - 2, jip + 2);
p->pop();
// end of loop
}
if(x.is_signed_int()) {
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NEQ, k, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
}
void GenContext::emitNoOpInstruction(const SelectionInstruction &insn) {
- NOT_IMPLEMENTED;
+ p->NOP();
}
void GenContext::emitWaitInstruction(const SelectionInstruction &insn) {
const GenRegister fenceDst = ra->genReg(insn.dst(0));
uint32_t barrierType = insn.extra.barrierType;
const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
- GenRegister blockIP;
- uint32_t exeWidth = p->curr.execWidth;
- ir::LabelIndex label = insn.parent->bb->getNextBlock()->getLabelIndex();
-
- if (exeWidth == 16)
- blockIP = ra->genReg(GenRegister::uw16grf(ir::ocl::blockip));
- else if (exeWidth == 8)
- blockIP = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
- p->push();
- /* Set block IP to 0xFFFF and clear the flag0's all bits. to skip all the instructions
- after the barrier, If there is any lane still remains zero. */
- p->MOV(blockIP, GenRegister::immuw(0xFFFF));
- p->curr.noMask = 1;
- p->curr.execWidth = 1;
- this->branchPos2.push_back(std::make_pair(label, p->n_instruction()));
- if (exeWidth == 16)
- p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
- else if (exeWidth == 8)
- p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
- else
- NOT_IMPLEMENTED;
- p->curr.inversePredicate = 1;
- // If not all channel is set to 1, the barrier is still waiting for other lanes to complete,
- // jump to next basic block.
- p->JMPI(GenRegister::immud(0));
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->MOV(GenRegister::flag(0, 0), ra->genReg(GenRegister::uw1grf(ir::ocl::emask)));
- p->pop();
-
- p->push();
- p->curr.useFlag(0, 0);
- /* Restore the blockIP to current label. */
- p->MOV(blockIP, GenRegister::immuw(insn.parent->bb->getLabelIndex()));
if (barrierType == ir::syncGlobalBarrier) {
p->FENCE(fenceDst);
p->MOV(fenceDst, fenceDst);
}
- p->curr.predicate = GEN_PREDICATE_NONE;
- // As only the payload.2 is used and all the other regions are ignored
- // SIMD8 mode here is safe.
- p->curr.execWidth = 8;
- p->curr.physicalFlag = 0;
- p->curr.noMask = 1;
- // Copy barrier id from r0.
- p->AND(src, barrierId, GenRegister::immud(0x0f000000));
- // A barrier is OK to start the thread synchronization *and* SLM fence
- p->BARRIER(src);
- // Now we wait for the other threads
- p->curr.execWidth = 1;
- p->WAIT();
- // we executed the barrier then restore the barrier soft mask to initial value.
- p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), ra->genReg(GenRegister::uw1grf(ir::ocl::notemask)));
+ p->push();
+ // As only the payload.2 is used and all the other regions are ignored
+ // SIMD8 mode here is safe.
+ p->curr.execWidth = 8;
+ p->curr.physicalFlag = 0;
+ p->curr.noMask = 1;
+ // Copy barrier id from r0.
+ p->AND(src, barrierId, GenRegister::immud(0x0f000000));
+ // A barrier is OK to start the thread synchronization *and* SLM fence
+ p->BARRIER(src);
+ p->curr.execWidth = 1;
+ // Now we wait for the other threads
+ p->WAIT();
p->pop();
}
uint32_t end_of_thread:1;
} gen7_msg_gw;
+ struct {
+ uint32_t jip:16;
+ uint32_t uip:16;
+ } gen7_branch;
+
int d;
uint32_t ud;
float f;
GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD);
push();
curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = 1;
curr.execWidth = 1;
MOV(r, GenRegister::immud(u.u[1]));
MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[0]));
push();
curr.execWidth = 8;
curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = 1;
MOV(r0, src0);
MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 4));
curr.predicate = GEN_PREDICATE_NORMAL;
push();
curr.execWidth = 8;
curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = 1;
MOV(r0, GenRegister::suboffset(src0, 8));
MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 12));
curr.predicate = GEN_PREDICATE_NORMAL;
#define ALU2_BRA(OP) \
void GenEncoder::OP(GenRegister src) { \
- alu2(this, GEN_OPCODE_##OP, GenRegister::null(), GenRegister::null(), src); \
+ alu2(this, GEN_OPCODE_##OP, GenRegister::nullud(), GenRegister::nullud(), src); \
}
ALU2_BRA(IF)
GBE_ASSERT(insnID < this->store.size());
GBE_ASSERT(insn.header.opcode == GEN_OPCODE_JMPI ||
insn.header.opcode == GEN_OPCODE_BRD ||
- insn.header.opcode == GEN_OPCODE_ENDIF);
- if ( jumpDistance > -32769 && jumpDistance < 32768 ) {
- this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+ insn.header.opcode == GEN_OPCODE_ENDIF ||
+ insn.header.opcode == GEN_OPCODE_IF ||
+ insn.header.opcode == GEN_OPCODE_BRC);
+
+ if (insn.header.opcode != GEN_OPCODE_JMPI || (jumpDistance > -32769 && jumpDistance < 32768)) {
+ int offset = 0;
+ if (insn.header.opcode == GEN_OPCODE_IF) {
+ this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+ return;
+ }
+ else if (insn.header.opcode == GEN_OPCODE_JMPI) {
+ offset = -2;
+ /*assert(jumpDistance > -32769 && jumpDistance < 32768);*/
+ }
+ this->setSrc1(&insn, GenRegister::immd(jumpDistance + offset));
} else if ( insn.header.predicate_control == GEN_PREDICATE_NONE ) {
// For the conditional jump distance out of S15 range, we need to use an
// inverted jmp followed by a add ip, ip, distance to implement.
// for all the branching instruction. And need to adjust the distance
// for those branch instruction's start point and end point contains
// this instruction.
+ GenInstruction &insn2 = this->store[insnID+1];
+ GBE_ASSERT(insn2.header.opcode == GEN_OPCODE_NOP);
insn.header.opcode = GEN_OPCODE_ADD;
this->setDst(&insn, GenRegister::ip());
this->setSrc0(&insn, GenRegister::ip());
- this->setSrc1(&insn, GenRegister::immd((jumpDistance + 2) * 8));
+ this->setSrc1(&insn, GenRegister::immd(jumpDistance * 8));
} else {
insn.header.predicate_inverse ^= 1;
this->setSrc1(&insn, GenRegister::immd(2));
insn2.header.opcode = GEN_OPCODE_ADD;
this->setDst(&insn2, GenRegister::ip());
this->setSrc0(&insn2, GenRegister::ip());
- this->setSrc1(&insn2, GenRegister::immd(jumpDistance * 8));
+ this->setSrc1(&insn2, GenRegister::immd((jumpDistance - 2) * 8));
}
}
*
* Also, there is some extra kludge to handle the predicates for JMPI.
*
- * See TODO for a better idea for branching and masking
- *
* TODO:
* =====
*
* interesting approach which consists in traversing the dominator tree in post
* order
*
- * About masking and branching, a much better idea (that I found later unfortunately)
- * is to replace the use of the flag by uses of if/endif to enclose the basic
- * block. So, instead of using predication, we use auto-masking. The very cool
- * consequence is that we can reintegrate back the structured branches.
- * Basically, we will be able to identify branches that can be mapped to
- * structured branches and mix nicely unstructured branches (which will use
- * jpmi, if/endif to mask the blocks) and structured branches (which are pretty
- * fast)
+ * We already use if/endif to enclose each basic block. We will continue to
+ * identify the blocks that can be mapped to structured branches and handle
+ * them entirely with structured branch instructions.
*/
#include "backend/gen_insn_selection.hpp"
INLINE bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool);
/*! indicate whether a register is a scalar/uniform register. */
INLINE bool isScalarReg(const ir::Register ®) const {
-#if 0
- printf("reg %d ", reg.value());
- printf("uniform: %d ", getRegisterData(reg).isUniform());
- if (ctx.getFunction().getArg(reg) != NULL) { printf("true function arg\n"); return true; }
- if (ctx.getFunction().getPushLocation(reg) != NULL) { printf("true push location.\n"); return true; }
- if (reg == ir::ocl::groupid0 ||
- reg == ir::ocl::groupid1 ||
- reg == ir::ocl::groupid2 ||
- reg == ir::ocl::barrierid ||
- reg == ir::ocl::threadn ||
- reg == ir::ocl::numgroup0 ||
- reg == ir::ocl::numgroup1 ||
- reg == ir::ocl::numgroup2 ||
- reg == ir::ocl::lsize0 ||
- reg == ir::ocl::lsize1 ||
- reg == ir::ocl::lsize2 ||
- reg == ir::ocl::gsize0 ||
- reg == ir::ocl::gsize1 ||
- reg == ir::ocl::gsize2 ||
- reg == ir::ocl::goffset0 ||
- reg == ir::ocl::goffset1 ||
- reg == ir::ocl::goffset2 ||
- reg == ir::ocl::workdim ||
- reg == ir::ocl::emask ||
- reg == ir::ocl::notemask ||
- reg == ir::ocl::barriermask
- ) {
- printf("special reg.\n");
- return true;
- }
- return false;
-#endif
const ir::RegisterData ®Data = getRegisterData(reg);
return regData.isUniform();
}
}
void Selection::Opaque::ENDIF(Reg src, ir::LabelIndex jip) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_IF, 0, 1);
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_ENDIF, 0, 1);
insn->src(0) = src;
insn->index = uint16_t(jip);
}
for (uint32_t regID = 0; regID < this->regNum; ++regID)
this->regDAG[regID] = NULL;
+ this->block->hasBarrier = false;
+ this->block->hasBranch = bb.getLastInstruction()->getOpcode() == OP_BRA ||
+ bb.getLastInstruction()->getOpcode() == OP_RET;
+ if (!this->block->hasBranch)
+ this->block->endifOffset = -1;
+
// Build the DAG on the fly
uint32_t insnNum = 0;
const_cast<BasicBlock&>(bb).foreach([&](const Instruction &insn) {
+ if (insn.getOpcode() == OP_SYNC)
+ this->block->hasBarrier = true;
// Build a selectionDAG node for instruction
SelectionDAG *dag = this->newSelectionDAG(insn);
void Selection::Opaque::matchBasicBlock(uint32_t insnNum)
{
// Bottom up code generation
+ bool needEndif = this->block->hasBranch == false && !this->block->hasBarrier;
for (int32_t insnID = insnNum-1; insnID >= 0; --insnID) {
// Process all possible patterns for this instruction
SelectionDAG &dag = *insnDAG[insnID];
// Start a new code fragment
this->startBackwardGeneration();
+ // If there is no branch at the end of this block, we must still close it with an ENDIF.
// Try all the patterns from best to worst
+
do {
if ((*it)->emit(*this, dag))
break;
} while (it != end);
GBE_ASSERT(it != end);
+ if (needEndif) {
+ const ir::BasicBlock *curr = insn.getParent();
+ const ir::BasicBlock *next = curr->getNextBlock();
+ this->ENDIF(GenRegister::immd(0), next->getLabelIndex());
+ needEndif = false;
+ }
+
// Output the code in the current basic block
this->endBackwardGeneration();
}
const GenRegister src1 = sel.selReg(cmpInsn.getSrc(1), type);
sel.push();
+ sel.curr.noMask = 1;
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.curr.execWidth = simdWidth;
sel.SEL_CMP(genCmp, tmp, src0, src1);
const Type type = insn.getType();
const Immediate imm = insn.getImmediate();
const GenRegister dst = sel.selReg(insn.getDst(0), type);
- GenRegister flagReg;
sel.push();
if (sel.isScalarOrBool(insn.getDst(0)) == true) {
{
using namespace ir;
const ir::Register reg = sel.reg(FAMILY_DWORD);
- const GenRegister barrierMask = sel.selReg(ocl::barriermask, TYPE_BOOL);
const uint32_t params = insn.getParameters();
- sel.push();
- sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.curr.noMask = 1;
- sel.curr.execWidth = 1;
- sel.OR(barrierMask, GenRegister::flag(0, 0), barrierMask);
- sel.MOV(GenRegister::flag(1, 1), barrierMask);
- sel.pop();
-
// A barrier is OK to start the thread synchronization *and* SLM fence
- sel.push();
- //sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.curr.flag = 1;
- sel.curr.subFlag = 1;
- sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params);
- sel.pop();
+ sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params);
return true;
}
GenRegister tmpDst;
if (type == TYPE_BOOL || type == TYPE_U16 || type == TYPE_S16)
- tmpDst = sel.selReg(sel.reg(FAMILY_WORD), TYPE_BOOL);
+ tmpDst = sel.selReg(dst, TYPE_BOOL);
else
tmpDst = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_S32);
sel.push();
sel.curr.flag = 1;
sel.curr.subFlag = 1;
- sel.curr.predicate = GEN_PREDICATE_NONE;
if (type == TYPE_S64 || type == TYPE_U64) {
GenRegister tmp[3];
for(int i=0; i<3; i++)
tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
- sel.push();
- sel.curr.execWidth = 1;
- sel.curr.noMask = 1;
- sel.MOV(GenRegister::flag(1, 1), GenRegister::flag(0, 0));
- sel.pop();
- sel.curr.predicate = GEN_PREDICATE_NORMAL;
sel.I64CMP(getGenCompare(opcode), src0, src1, tmp, tmpDst);
} else if(opcode == OP_ORD) {
sel.push();
- sel.curr.execWidth = 1;
- sel.curr.noMask = 1;
- sel.MOV(GenRegister::flag(1, 1), GenRegister::flag(0, 0));
+ sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst);
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst);
sel.pop();
- sel.curr.predicate = GEN_PREDICATE_NORMAL;
-
- sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst);
- sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst);
} else
sel.CMP(getGenCompare(opcode), src0, src1, tmpDst);
sel.pop();
if (!(type == TYPE_BOOL || type == TYPE_U16 || type == TYPE_S16))
sel.MOV(sel.selReg(dst, TYPE_U16), GenRegister::unpacked_uw((ir::Register)tmpDst.value.reg));
- else
- sel.MOV(sel.selReg(dst, TYPE_U16), tmpDst);
return true;
}
};
markAllChildren(dag);
}
- // Since we cannot predicate the select instruction with our current mask,
- // we need to perform the selection in two steps (one to select, one to
- // update the destination register)
- const RegisterFamily family = getFamily(type);
- const GenRegister tmp = sel.selReg(sel.reg(family), type);
const uint32_t simdWidth = sel.ctx.getSimdWidth();
const Register pred = insn.getPredicate();
sel.push();
sel.curr.flag = 1;
sel.curr.subFlag = 1;
sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0));
- sel.curr.noMask = 0;
+ //sel.curr.noMask = 0;
sel.curr.predicate = GEN_PREDICATE_NORMAL;
if(type == ir::TYPE_S64 || type == ir::TYPE_U64)
- sel.SEL_INT64(tmp, src0, src1);
+ sel.SEL_INT64(dst, src0, src1);
else
- sel.SEL(tmp, src0, src1);
+ sel.SEL(dst, src0, src1);
sel.pop();
- // Update the destination register properly now
- sel.MOV(dst, tmp);
return true;
}
};
DECL_CTOR(TernaryInstruction, 1, 1);
};
+
/*! Label instruction pattern */
DECL_PATTERN(LabelInstruction)
{
const uint32_t simdWidth = sel.ctx.getSimdWidth();
sel.LABEL(label);
- // Do not emit any code for the "returning" block. There is no need for it
- if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock())
+ // Do not emit any code for the "returning" block. There is no need for it
+ if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock())
return true;
+ LabelIndex jip;
+ const LabelIndex nextLabel = insn.getParent()->getNextBlock()->getLabelIndex();
+ if (sel.ctx.hasJIP(&insn))
+ jip = sel.ctx.getLabelIndex(&insn);
+ else
+ jip = nextLabel;
+
// Emit the mask computation at the head of each basic block
sel.push();
+ sel.curr.noMask = 1;
sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.curr.flag = 0;
- sel.curr.subFlag = 0;
sel.CMP(GEN_CONDITIONAL_LE, GenRegister::retype(src0, GEN_TYPE_UW), src1);
sel.pop();
- // If it is required, insert a JUMP to bypass the block
- if (sel.ctx.hasJIP(&insn)) {
- const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
+ if (sel.block->hasBarrier) {
+ // If this block has barrier, we don't execute the block until all lanes
+ // are 1s. Set each reached lane to 1, then check all lanes. If there is any
+ // lane not reached, we jump to jip. And no need to issue if/endif for
+ // this block, as it will always execute with all lanes activated.
sel.push();
-
- sel.curr.noMask = 1;
- sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL));
sel.curr.predicate = GEN_PREDICATE_NONE;
- GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
- GenRegister flagReg = GenRegister::flag(0, 0);
- sel.AND(flagReg, flagReg, emaskReg);
-
+ sel.curr.noMask = 1;
+ sel.CMP(GEN_CONDITIONAL_EQ, GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL));
if (simdWidth == 8)
- sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
else if (simdWidth == 16)
- sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
else
NOT_IMPLEMENTED;
+ sel.curr.noMask = 1;
+ sel.curr.execWidth = 1;
sel.curr.inversePredicate = 1;
- sel.curr.flag = 0;
- sel.curr.subFlag = 0;
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
+ // FIXME: if the last BRA is an unconditional jump, we don't need to update the label here.
+ sel.push();
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw((uint16_t)label));
+ sel.pop();
+ }
+ else {
+ if (sel.ctx.hasJIP(&insn)) {
+ // If it is required, insert a JUMP to bypass the block
+ sel.push();
+ if (simdWidth == 8)
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+ else if (simdWidth == 16)
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+ else
+ NOT_IMPLEMENTED;
+ sel.curr.noMask = 1;
+ sel.curr.execWidth = 1;
+ sel.curr.inversePredicate = 1;
+ sel.JMPI(GenRegister::immd(0), jip);
+ sel.pop();
+ }
+ sel.push();
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ // It's easier to set the jip to a relative position over the next block.
+ sel.IF(GenRegister::immd(0), nextLabel, nextLabel, sel.block->endifOffset, sel.block->endifOffset);
+ sel.pop();
}
+
return true;
}
DECL_CTOR(LabelInstruction, 1, 1);
/*! Branch instruction pattern */
DECL_PATTERN(BranchInstruction)
{
-
void emitForwardBranch(Selection::Opaque &sel,
const ir::BranchInstruction &insn,
ir::LabelIndex dst,
{
using namespace ir;
const GenRegister ip = sel.selReg(ocl::blockip, TYPE_U16);
- const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
// We will not emit any jump if we must go the next block anyway
const BasicBlock *curr = insn.getParent();
const BasicBlock *next = curr->getNextBlock();
const LabelIndex nextLabel = next->getLabelIndex();
-
if (insn.isPredicated() == true) {
const Register pred = insn.getPredicateIndex();
-
sel.push();
// we don't need to set next label to the pcip
// as if there is no backward jump latter, then obviously everything will work fine.
sel.curr.flag = 0;
sel.curr.subFlag = 0;
sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0));
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
+ if (!sel.block->hasBarrier)
+ sel.ENDIF(GenRegister::immd(0), nextLabel);
+ sel.block->endifOffset = -1;
sel.pop();
-
- if (nextLabel == jip) return;
} else {
// Update the PcIPs
+ const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
-
- // Do not emit branch when we go to the next block anyway
+ if (!sel.block->hasBarrier)
+ sel.ENDIF(GenRegister::immd(0), nextLabel);
+ sel.block->endifOffset = -1;
if (nextLabel == jip) return;
+ // Branch to the jump target
sel.push();
sel.curr.execWidth = 1;
sel.curr.noMask = 1;
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
+ // FIXME: this adjustment only keeps the endif offset correct;
+ // a JMPI still occupies 2 instructions.
+ sel.block->endifOffset -= 2;
}
}
// that actually take the branch
const LabelIndex next = bb.getNextBlock()->getLabelIndex();
sel.MOV(ip, GenRegister::immuw(uint16_t(next)));
-
+ GBE_ASSERT(jip == dst);
sel.push();
sel.curr.flag = 0;
sel.curr.subFlag = 0;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0));
- // Re-update the PcIPs for the branches that takes the backward jump
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
-
- // We clear all the inactive channel to 0 as the GEN_PREDICATE_ALIGN1_ANY8/16
- // will check those bits as well.
sel.curr.predicate = GEN_PREDICATE_NONE;
+ if (!sel.block->hasBarrier)
+ sel.ENDIF(GenRegister::immd(0), next);
sel.curr.execWidth = 1;
- sel.curr.noMask = 1;
- GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
- sel.AND(GenRegister::flag(0, 1), GenRegister::flag(0, 1), emaskReg);
-
- // Branch to the jump target
- if (simdWidth == 8)
- sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
- else if (simdWidth == 16)
+ if (simdWidth == 16)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
else
- NOT_SUPPORTED;
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+ sel.curr.noMask = 1;
sel.JMPI(GenRegister::immd(0), jip);
+ sel.block->endifOffset = -3;
sel.pop();
-
} else {
-
+ const LabelIndex next = bb.getNextBlock()->getLabelIndex();
// Update the PcIPs
sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
-
+ if (!sel.block->hasBarrier)
+ sel.ENDIF(GenRegister::immd(0), next);
// Branch to the jump target
sel.push();
sel.curr.execWidth = 1;
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
+ sel.block->endifOffset = -3;
}
}
/*! Translate IR compare to Gen compare */
uint32_t getGenCompare(ir::Opcode opcode);
+ #define GEN_MAX_LABEL 0xFFFF
+
/*! Selection opcodes properly encoded from 0 to n for fast jump tables
* generations
*/
void append(SelectionInstruction *insn);
/*! Append a new selection instruction at the beginning of the block */
void prepend(SelectionInstruction *insn);
+ int endifOffset;
+ bool hasBarrier;
+ bool hasBranch;
};
/*! Owns the selection engine */
DECL_SELECTION_IR(CONVI64_TO_F, I64ToFloatInstruction)
DECL_SELECTION_IR(CONVF_TO_I64, FloatToI64Instruction)
DECL_SELECTION_IR(I64MADSAT, I64MADSATInstruction)
-DECL_SELECTION_IR(BRC, BinaryInstruction)
+DECL_SELECTION_IR(BRC, UnaryInstruction)
DECL_SELECTION_IR(BRD, UnaryInstruction)
DECL_SELECTION_IR(IF, UnaryInstruction)
DECL_SELECTION_IR(ENDIF, UnaryInstruction)
this->noMask = 0;
this->flag = 0;
this->subFlag = 0;
- this->predicate = GEN_PREDICATE_NORMAL;
+ this->predicate = GEN_PREDICATE_NONE;
this->inversePredicate = 0;
this->physicalFlag = 1;
this->flagIndex = 0;
GEN_HORIZONTAL_STRIDE_1);
}
+ static INLINE GenRegister nullud(void) {
+ return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+ GEN_ARF_NULL,
+ 0,
+ GEN_TYPE_UD,
+ GEN_VERTICAL_STRIDE_8,
+ GEN_WIDTH_8,
+ GEN_HORIZONTAL_STRIDE_1);
+ }
+
+
static INLINE bool isNull(GenRegister reg) {
return (reg.file == GEN_ARCHITECTURE_REGISTER_FILE
&& reg.nr == GEN_ARF_NULL);
{
driver->bufmgr = drm_intel_bufmgr_gem_init(driver->fd, BATCH_SIZE);
assert(driver->bufmgr);
+ //drm_intel_bufmgr_gem_set_aub_dump(driver->bufmgr, 1);
drm_intel_bufmgr_gem_enable_reuse(driver->bufmgr);
}
memset(desc, 0, sizeof(*desc));
ker_bo = (drm_intel_bo *) kernel->bo;
desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */
- desc->desc1.single_program_flow = 1;
+ desc->desc1.single_program_flow = 0;
desc->desc1.floating_point_mode = 0; /* use IEEE-754 rule */
desc->desc5.rounding_mode = 0; /* round to nearest even */