From 4850c558cbec9234250a6e81601dd70259e10f74 Mon Sep 17 00:00:00 2001 From: Zhigang Gong Date: Tue, 18 Mar 2014 15:28:44 +0800 Subject: [PATCH] GBE: Disable SPF and use JMPI + IF/ENDIF to handle each block. When SPF (single program flow) is enabled, we always need to use f0 as the predicate of almost every instruction. This brings some trouble when we want to get a two-level mask mechanism, for example with the SEL instruction and some BOOL operations. We have to use more than one instruction to do that, which simply introduces 100% overhead for those instructions. v2: fix the wrong assertion. Signed-off-by: Zhigang Gong Reviewed-by: "Yang, Rong R" Reviewed-by: "Song, Ruiling" --- backend/src/backend/gen/gen_mesa_disasm.c | 31 ++-- backend/src/backend/gen_context.cpp | 141 ++++++++--------- backend/src/backend/gen_defs.hpp | 5 + backend/src/backend/gen_encoder.cpp | 29 +++- backend/src/backend/gen_insn_selection.cpp | 237 +++++++++++++---------------- backend/src/backend/gen_insn_selection.hpp | 5 + backend/src/backend/gen_insn_selection.hxx | 2 +- backend/src/backend/gen_register.hpp | 13 +- src/intel/intel_driver.c | 1 + src/intel/intel_gpgpu.c | 2 +- 10 files changed, 236 insertions(+), 230 deletions(-) diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c index 84ef0c8..e58ef31 100644 --- a/backend/src/backend/gen/gen_mesa_disasm.c +++ b/backend/src/backend/gen/gen_mesa_disasm.c @@ -100,13 +100,13 @@ static const struct { [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 1, .ndst = 1 }, [GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 }, [GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 }, - [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 1, .ndst = 0 }, - [GEN_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 }, - [GEN_OPCODE_BRC] = { .name = "brc", .nsrc = 1, .ndst = 0 }, - [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 }, - [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 }, - [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 }, - [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 }, + [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 }, + [GEN_OPCODE_IF] = { .name = "if", .nsrc = 0, .ndst = 0 }, + [GEN_OPCODE_BRC] = { .name = "brc", .nsrc = 0, .ndst = 0 }, + [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 0, .ndst = 0 }, + [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 0, .ndst = 0 }, + [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 0, .ndst = 0 }, + [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 0, .ndst = 0 }, [GEN_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 }, [GEN_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 }, [GEN_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 }, @@ -1126,17 +1126,18 @@ int gen_disasm (FILE *file, const void *opaque_insn) } else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_IF || inst->header.opcode == GEN_OPCODE_ELSE || inst->header.opcode == GEN_OPCODE_ENDIF || - inst->header.opcode == GEN_OPCODE_WHILE)) { - // XXX format (file, " %d", inst->bits1.branch_gen6.jump_count); - assert(0); + inst->header.opcode == GEN_OPCODE_WHILE || + inst->header.opcode == GEN_OPCODE_BRD || + inst->header.opcode == GEN_OPCODE_JMPI)) { + format(file, " %d", (int16_t)inst->bits3.gen7_branch.jip); } else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_BREAK || inst->header.opcode == GEN_OPCODE_CONTINUE || - inst->header.opcode == GEN_OPCODE_HALT)) { - // XXX format (file, " %d %d", inst->bits3.break_cont.uip, inst->bits3.break_cont.jip);
- assert(0); - } else if (inst->header.opcode == GEN_OPCODE_JMPI) { + inst->header.opcode == GEN_OPCODE_HALT || + inst->header.opcode == GEN_OPCODE_BRC)) { + format (file, " %d %d", inst->bits3.gen7_branch.jip, inst->bits3.gen7_branch.uip); + }/* else if (inst->header.opcode == GEN_OPCODE_JMPI) { format (file, " %d", inst->bits3.d); - } + }*/ if (opcode[inst->header.opcode].nsrc > 0) { pad (file, 32); diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index c46127a..19bc9d2 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -87,33 +87,29 @@ namespace gbe const LabelIndex label = pair.first; const int32_t insnID = pair.second; const int32_t targetID = labelPos.find(label)->second; - p->patchJMPI(insnID, (targetID-insnID-1) * 2); + p->patchJMPI(insnID, (targetID - insnID) * 2); + } + for (auto pair : branchPos3) { + const LabelPair labelPair = pair.first; + const int32_t insnID = pair.second; + const int32_t jip = labelPos.find(labelPair.l0)->second + labelPair.offset0; + const int32_t uip = labelPos.find(labelPair.l1)->second + labelPair.offset1; + assert((jip - insnID) * 2 < 32767 && (jip - insnID) * 2 > -32768); + assert((uip - insnID) * 2 < 32767 && (uip - insnID) * 2 > -32768); + p->patchJMPI(insnID, (((uip - insnID) * 2) << 16) | ((jip - insnID) * 2)); } } void GenContext::clearFlagRegister(void) { // when group size not aligned to simdWidth, flag register need clear to // make prediction(any8/16h) work correctly - const GenRegister emaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::emask)); - const GenRegister notEmaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::notemask)); - uint32_t execWidth = p->curr.execWidth; + const GenRegister blockip = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip)); p->push(); - p->curr.predicate = GEN_PREDICATE_NONE; - p->curr.noMask = 1; - /* clear all the bit in f0.0. */ - p->curr.execWidth = 1; - p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW), GenRegister::immuw(0x0000)); - /* clear the barrier mask bits to all zero0*/ - p->curr.noMask = 0; - p->curr.useFlag(0, 0); - p->curr.execWidth = execWidth; - /* set all the active lane to 1. Inactive lane remains 0. 
*/ - p->CMP(GEN_CONDITIONAL_EQ, GenRegister::ud16grf(126, 0), GenRegister::ud16grf(126, 0)); - p->curr.noMask = 1; - p->curr.execWidth = 1; - p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW)); - p->XOR(notEmaskReg, emaskReg, GenRegister::immuw(0xFFFF)); - p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), notEmaskReg); + p->curr.noMask = 1; + p->curr.predicate = GEN_PREDICATE_NONE; + p->MOV(blockip, GenRegister::immuw(GEN_MAX_LABEL)); + p->curr.noMask = 0; + p->MOV(blockip, GenRegister::immuw(0)); p->pop(); } @@ -148,7 +144,6 @@ namespace gbe // Check that everything is consistent in the kernel code const uint32_t perLaneSize = kernel->getStackSize(); const uint32_t perThreadSize = perLaneSize * this->simdWidth; - //const int32_t offset = GEN_REG_SIZE + kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER); GBE_ASSERT(perLaneSize > 0); GBE_ASSERT(isPowerOf<2>(perLaneSize) == true); GBE_ASSERT(isPowerOf<2>(perThreadSize) == true); @@ -325,6 +320,7 @@ namespace gbe for (int i = 0; i < w / 8; i ++) { p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), src0, src1); p->curr.accWrEnable = 1; p->MACH(tmp, src0, src1); @@ -500,6 +496,7 @@ namespace gbe int execWidth = p->curr.execWidth; p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.execWidth = 8; for (int nib = 0; nib < execWidth / 4; nib ++) { p->AND(dest, src.bottom_half(), GenRegister::immud(63)); @@ -539,6 +536,7 @@ namespace gbe void GenContext::I64ABS(GenRegister sign, GenRegister high, GenRegister low, GenRegister tmp, GenRegister flagReg) { p->SHR(sign, high, GenRegister::immud(31)); p->push(); + p->curr.noMask = 1; p->curr.predicate = GEN_PREDICATE_NONE; p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); p->CMP(GEN_CONDITIONAL_NZ, sign, GenRegister::immud(0)); @@ -574,6 +572,7 @@ namespace gbe I64FullMult(e, f, g, h, a, b, c, d); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); p->CMP(GEN_CONDITIONAL_NZ, i, GenRegister::immud(0)); p->curr.predicate = GEN_PREDICATE_NORMAL; @@ -626,6 +625,7 @@ namespace gbe p->OR(a, e, f); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); p->CMP(GEN_CONDITIONAL_NZ, a, zero); p->curr.predicate = GEN_PREDICATE_NORMAL; @@ -639,6 +639,7 @@ namespace gbe I64FullMult(e, f, g, h, a, b, c, d); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); p->CMP(GEN_CONDITIONAL_NZ, i, zero); p->curr.predicate = GEN_PREDICATE_NORMAL; @@ -670,6 +671,7 @@ namespace gbe p->push(); p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->CMP(GEN_CONDITIONAL_NZ, e, zero); p->curr.predicate = GEN_PREDICATE_NORMAL; p->MOV(b, one); @@ -793,6 +795,7 @@ namespace gbe case SEL_OP_I64SHL: p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; collectShifter(a, y); loadBottomHalf(e, x); loadTopHalf(f, x); @@ -820,6 +823,7 @@ namespace gbe case SEL_OP_I64SHR: p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; collectShifter(a, y); loadBottomHalf(e, x); loadTopHalf(f, x); @@ -848,6 +852,7 @@ namespace gbe f.type = GEN_TYPE_D; p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; collectShifter(a, y); 
loadBottomHalf(e, x); loadTopHalf(f, x); @@ -894,6 +899,7 @@ namespace gbe p->push(); p->curr.useFlag(flag.flag_nr(), flag.flag_subnr()); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(32)); //high == 0 p->curr.predicate = GEN_PREDICATE_NORMAL; p->MOV(dst, low); @@ -911,6 +917,7 @@ namespace gbe p->pop(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->CMP(GEN_CONDITIONAL_G, exp, GenRegister::immud(23)); p->curr.predicate = GEN_PREDICATE_NORMAL; p->CMP(GEN_CONDITIONAL_L, exp, GenRegister::immud(32)); //exp>23 && high!=0 @@ -936,6 +943,7 @@ namespace gbe p->pop(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(23)); p->curr.predicate = GEN_PREDICATE_NORMAL; p->MOV(dst_ud, GenRegister::immud(0)); //exp==9, SHR == 0 @@ -956,7 +964,7 @@ namespace gbe p->SHL(high, low, tmp); p->MOV(low, GenRegister::immud(0)); - p->patchJMPI(jip1, (p->n_instruction() - (jip1 + 1)) * 2); + p->patchJMPI(jip1, (p->n_instruction() - jip1) * 2); p->curr.predicate = GEN_PREDICATE_NONE; p->CMP(GEN_CONDITIONAL_LE, exp, GenRegister::immud(31)); //update dst where high != 0 p->curr.predicate = GEN_PREDICATE_NORMAL; @@ -970,7 +978,7 @@ namespace gbe p->CMP(GEN_CONDITIONAL_EQ, high, GenRegister::immud(0x80000000)); p->CMP(GEN_CONDITIONAL_EQ, low, GenRegister::immud(0x0)); p->AND(dst_ud, dst_ud, GenRegister::immud(0xfffffffe)); - p->patchJMPI(jip0, (p->n_instruction() - (jip0 + 1)) * 2); + p->patchJMPI(jip0, (p->n_instruction() - jip0) * 2); p->pop(); @@ -994,6 +1002,7 @@ namespace gbe p->MOV(tmp_high, high); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(f0.flag_nr(), f0.flag_subnr()); p->CMP(GEN_CONDITIONAL_GE, tmp_high, GenRegister::immud(0x80000000)); p->curr.predicate = GEN_PREDICATE_NORMAL; @@ -1006,6 +1015,7 @@ namespace gbe UnsignedI64ToFloat(dest, high, low, exp, mantissa, tmp, f0); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(f0.flag_nr(), f0.flag_subnr()); p->CMP(GEN_CONDITIONAL_GE, tmp_high, GenRegister::immud(0x80000000)); p->curr.predicate = GEN_PREDICATE_NORMAL; @@ -1039,6 +1049,7 @@ namespace gbe if(dst.is_signed_int()) { p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(flag0.flag_nr(), flag0.flag_subnr()); p->CMP(GEN_CONDITIONAL_L, src, GenRegister::immf(0x0)); p->curr.predicate = GEN_PREDICATE_NORMAL; @@ -1066,11 +1077,10 @@ namespace gbe f1.width = GEN_WIDTH_1; GenRegister f2 = GenRegister::suboffset(f1, 1); GenRegister f3 = GenRegister::suboffset(f1, 2); - GenRegister f4 = GenRegister::suboffset(f1, 3); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; - saveFlag(f4, flag, subFlag); + p->curr.noMask = 1; loadTopHalf(tmp0, src0); loadTopHalf(tmp1, src1); switch(insn.extra.function) { @@ -1130,12 +1140,13 @@ namespace gbe NOT_IMPLEMENTED; } p->curr.execWidth = 1; - p->AND(f1, f1, f4); p->MOV(GenRegister::flag(flag, subFlag), f1); p->pop(); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->MOV(dst, GenRegister::immd(0)); + p->curr.noMask = 0; p->curr.predicate = GEN_PREDICATE_NORMAL; p->MOV(dst, GenRegister::immd(-1)); p->pop(); @@ -1163,6 +1174,7 @@ namespace gbe p->ADD(c, c, d); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); if(! 
dst.is_signed_int()) { p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0)); @@ -1176,6 +1188,7 @@ namespace gbe p->MOV(a, GenRegister::immud(0x80000000u)); p->MOV(b, GenRegister::immud(0)); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(0)); p->curr.predicate = GEN_PREDICATE_NORMAL; p->CMP(GEN_CONDITIONAL_GE, a, GenRegister::immud(0x80000000u)); @@ -1209,6 +1222,7 @@ namespace gbe p->ADD(c, c, d); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); if(! dst.is_signed_int()) { p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0)); @@ -1238,6 +1252,7 @@ namespace gbe src = src.top_half(); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.execWidth = 8; p->MOV(dest, src); p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4)); @@ -1252,6 +1267,7 @@ namespace gbe int execWidth = p->curr.execWidth; dest = dest.top_half(); p->push(); + p->curr.predicate = GEN_PREDICATE_NORMAL; p->curr.execWidth = 8; p->MOV(dest, src); p->curr.nibControl = 1; @@ -1271,6 +1287,7 @@ namespace gbe src = src.bottom_half(); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.execWidth = 8; p->MOV(dest, src); p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4)); @@ -1286,6 +1303,7 @@ namespace gbe dest = dest.bottom_half(); p->push(); p->curr.execWidth = 8; + p->curr.predicate = GEN_PREDICATE_NORMAL; p->MOV(dest, src); p->curr.nibControl = 1; p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4)); @@ -1369,6 +1387,7 @@ namespace gbe loadBottomHalf(d, y); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), e, b, c); I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), f, a, d); p->ADD(e, e, f); @@ -1443,6 +1462,7 @@ namespace gbe // condition <- (c,d)==0 && (a,b)>=(e,f) p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->MOV(l, zero); p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); p->CMP(GEN_CONDITIONAL_EQ, a, e); @@ -1477,6 +1497,7 @@ namespace gbe p->ADD(m, m, one); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); p->CMP(GEN_CONDITIONAL_L, m, GenRegister::immud(64)); @@ -1484,7 +1505,6 @@ namespace gbe p->curr.noMask = 1; p->AND(flagReg, flagReg, emaskReg); - p->curr.predicate = GEN_PREDICATE_NORMAL; // under condition, jump back to start point if (simdWidth == 8) p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; @@ -1493,8 +1513,9 @@ namespace gbe else NOT_IMPLEMENTED; int jip = -(int)(p->n_instruction() - loop_start + 1) * 2; + p->curr.noMask = 1; p->JMPI(zero); - p->patchJMPI(p->n_instruction()-2, jip); + p->patchJMPI(p->n_instruction() - 2, jip + 2); p->pop(); // end of loop } @@ -1502,6 +1523,7 @@ namespace gbe if(x.is_signed_int()) { p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); p->CMP(GEN_CONDITIONAL_NEQ, k, zero); p->curr.predicate = GEN_PREDICATE_NORMAL; @@ -1534,7 +1556,7 @@ namespace gbe } void GenContext::emitNoOpInstruction(const SelectionInstruction &insn) { - NOT_IMPLEMENTED; + p->NOP(); } void GenContext::emitWaitInstruction(const SelectionInstruction &insn) { @@ -1546,59 +1568,24 @@ namespace gbe const GenRegister fenceDst = ra->genReg(insn.dst(0)); 
uint32_t barrierType = insn.extra.barrierType; const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid)); - GenRegister blockIP; - uint32_t exeWidth = p->curr.execWidth; - ir::LabelIndex label = insn.parent->bb->getNextBlock()->getLabelIndex(); - - if (exeWidth == 16) - blockIP = ra->genReg(GenRegister::uw16grf(ir::ocl::blockip)); - else if (exeWidth == 8) - blockIP = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip)); - p->push(); - /* Set block IP to 0xFFFF and clear the flag0's all bits. to skip all the instructions - after the barrier, If there is any lane still remains zero. */ - p->MOV(blockIP, GenRegister::immuw(0xFFFF)); - p->curr.noMask = 1; - p->curr.execWidth = 1; - this->branchPos2.push_back(std::make_pair(label, p->n_instruction())); - if (exeWidth == 16) - p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H; - else if (exeWidth == 8) - p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H; - else - NOT_IMPLEMENTED; - p->curr.inversePredicate = 1; - // If not all channel is set to 1, the barrier is still waiting for other lanes to complete, - // jump to next basic block. - p->JMPI(GenRegister::immud(0)); - p->curr.predicate = GEN_PREDICATE_NONE; - p->MOV(GenRegister::flag(0, 0), ra->genReg(GenRegister::uw1grf(ir::ocl::emask))); - p->pop(); - - p->push(); - p->curr.useFlag(0, 0); - /* Restore the blockIP to current label. */ - p->MOV(blockIP, GenRegister::immuw(insn.parent->bb->getLabelIndex())); if (barrierType == ir::syncGlobalBarrier) { p->FENCE(fenceDst); p->MOV(fenceDst, fenceDst); } - p->curr.predicate = GEN_PREDICATE_NONE; - // As only the payload.2 is used and all the other regions are ignored - // SIMD8 mode here is safe. - p->curr.execWidth = 8; - p->curr.physicalFlag = 0; - p->curr.noMask = 1; - // Copy barrier id from r0. - p->AND(src, barrierId, GenRegister::immud(0x0f000000)); - // A barrier is OK to start the thread synchronization *and* SLM fence - p->BARRIER(src); - // Now we wait for the other threads - p->curr.execWidth = 1; - p->WAIT(); - // we executed the barrier then restore the barrier soft mask to initial value. - p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), ra->genReg(GenRegister::uw1grf(ir::ocl::notemask))); + p->push(); + // As only the payload.2 is used and all the other regions are ignored + // SIMD8 mode here is safe. + p->curr.execWidth = 8; + p->curr.physicalFlag = 0; + p->curr.noMask = 1; + // Copy barrier id from r0. 
+ p->AND(src, barrierId, GenRegister::immud(0x0f000000)); + // A barrier is OK to start the thread synchronization *and* SLM fence + p->BARRIER(src); + p->curr.execWidth = 1; + // Now we wait for the other threads + p->WAIT(); p->pop(); } diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp index 7c49497..e731174 100644 --- a/backend/src/backend/gen_defs.hpp +++ b/backend/src/backend/gen_defs.hpp @@ -896,6 +896,11 @@ struct GenInstruction uint32_t end_of_thread:1; } gen7_msg_gw; + struct { + uint32_t jip:16; + uint32_t uip:16; + } gen7_branch; + int d; uint32_t ud; float f; diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp index fc7e53d..06aa769 100644 --- a/backend/src/backend/gen_encoder.cpp +++ b/backend/src/backend/gen_encoder.cpp @@ -837,6 +837,7 @@ namespace gbe GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD); push(); curr.predicate = GEN_PREDICATE_NONE; + curr.noMask = 1; curr.execWidth = 1; MOV(r, GenRegister::immud(u.u[1])); MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[0])); @@ -907,6 +908,7 @@ namespace gbe push(); curr.execWidth = 8; curr.predicate = GEN_PREDICATE_NONE; + curr.noMask = 1; MOV(r0, src0); MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 4)); curr.predicate = GEN_PREDICATE_NORMAL; @@ -920,6 +922,7 @@ namespace gbe push(); curr.execWidth = 8; curr.predicate = GEN_PREDICATE_NONE; + curr.noMask = 1; MOV(r0, GenRegister::suboffset(src0, 8)); MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 12)); curr.predicate = GEN_PREDICATE_NORMAL; @@ -1058,7 +1061,7 @@ namespace gbe #define ALU2_BRA(OP) \ void GenEncoder::OP(GenRegister src) { \ - alu2(this, GEN_OPCODE_##OP, GenRegister::null(), GenRegister::null(), src); \ + alu2(this, GEN_OPCODE_##OP, GenRegister::nullud(), GenRegister::nullud(), src); \ } ALU2_BRA(IF) @@ -1071,9 +1074,21 @@ namespace gbe GBE_ASSERT(insnID < this->store.size()); GBE_ASSERT(insn.header.opcode == GEN_OPCODE_JMPI || insn.header.opcode == GEN_OPCODE_BRD || - insn.header.opcode == GEN_OPCODE_ENDIF); - if ( jumpDistance > -32769 && jumpDistance < 32768 ) { - this->setSrc1(&insn, GenRegister::immd(jumpDistance)); + insn.header.opcode == GEN_OPCODE_ENDIF || + insn.header.opcode == GEN_OPCODE_IF || + insn.header.opcode == GEN_OPCODE_BRC); + + if (insn.header.opcode != GEN_OPCODE_JMPI || (jumpDistance > -32769 && jumpDistance < 32768)) { + int offset = 0; + if (insn.header.opcode == GEN_OPCODE_IF) { + this->setSrc1(&insn, GenRegister::immd(jumpDistance)); + return; + } + else if (insn.header.opcode == GEN_OPCODE_JMPI) { + offset = -2; + /*assert(jumpDistance > -32769 && jumpDistance < 32768);*/ + } + this->setSrc1(&insn, GenRegister::immd(jumpDistance + offset)); } else if ( insn.header.predicate_control == GEN_PREDICATE_NONE ) { // For the conditional jump distance out of S15 range, we need to use an // inverted jmp followed by a add ip, ip, distance to implement. @@ -1085,10 +1100,12 @@ namespace gbe // for all the branching instruction. And need to adjust the distance // for those branch instruction's start point and end point contains // this instruction. 
+ GenInstruction &insn2 = this->store[insnID+1]; + GBE_ASSERT(insn2.header.opcode == GEN_OPCODE_NOP); insn.header.opcode = GEN_OPCODE_ADD; this->setDst(&insn, GenRegister::ip()); this->setSrc0(&insn, GenRegister::ip()); - this->setSrc1(&insn, GenRegister::immd((jumpDistance + 2) * 8)); + this->setSrc1(&insn, GenRegister::immd(jumpDistance * 8)); } else { insn.header.predicate_inverse ^= 1; this->setSrc1(&insn, GenRegister::immd(2)); @@ -1099,7 +1116,7 @@ namespace gbe insn2.header.opcode = GEN_OPCODE_ADD; this->setDst(&insn2, GenRegister::ip()); this->setSrc0(&insn2, GenRegister::ip()); - this->setSrc1(&insn2, GenRegister::immd(jumpDistance * 8)); + this->setSrc1(&insn2, GenRegister::immd((jumpDistance - 2) * 8)); } } diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index d86e04c..147c3e6 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -76,8 +76,6 @@ * * Also, there is some extra kludge to handle the predicates for JMPI. * - * See TODO for a better idea for branching and masking - * * TODO: * ===== * @@ -92,14 +90,9 @@ * interesting approach which consists in traversing the dominator tree in post * order * - * About masking and branching, a much better idea (that I found later unfortunately) - * is to replace the use of the flag by uses of if/endif to enclose the basic - * block. So, instead of using predication, we use auto-masking. The very cool - * consequence is that we can reintegrate back the structured branches. - * Basically, we will be able to identify branches that can be mapped to - * structured branches and mix nicely unstructured branches (which will use - * jpmi, if/endif to mask the blocks) and structured branches (which are pretty - * fast) + * We already use if/endif to enclose each basic block. We will continue to identify + * those blocks which could match to structured branching and use pure structured + * instruction to handle them completely. */ #include "backend/gen_insn_selection.hpp" @@ -320,38 +313,6 @@ namespace gbe INLINE bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool); /*! indicate whether a register is a scalar/uniform register. 
*/ INLINE bool isScalarReg(const ir::Register ®) const { -#if 0 - printf("reg %d ", reg.value()); - printf("uniform: %d ", getRegisterData(reg).isUniform()); - if (ctx.getFunction().getArg(reg) != NULL) { printf("true function arg\n"); return true; } - if (ctx.getFunction().getPushLocation(reg) != NULL) { printf("true push location.\n"); return true; } - if (reg == ir::ocl::groupid0 || - reg == ir::ocl::groupid1 || - reg == ir::ocl::groupid2 || - reg == ir::ocl::barrierid || - reg == ir::ocl::threadn || - reg == ir::ocl::numgroup0 || - reg == ir::ocl::numgroup1 || - reg == ir::ocl::numgroup2 || - reg == ir::ocl::lsize0 || - reg == ir::ocl::lsize1 || - reg == ir::ocl::lsize2 || - reg == ir::ocl::gsize0 || - reg == ir::ocl::gsize1 || - reg == ir::ocl::gsize2 || - reg == ir::ocl::goffset0 || - reg == ir::ocl::goffset1 || - reg == ir::ocl::goffset2 || - reg == ir::ocl::workdim || - reg == ir::ocl::emask || - reg == ir::ocl::notemask || - reg == ir::ocl::barriermask - ) { - printf("special reg.\n"); - return true; - } - return false; -#endif const ir::RegisterData ®Data = getRegisterData(reg); return regData.isUniform(); } @@ -992,7 +953,7 @@ namespace gbe } void Selection::Opaque::ENDIF(Reg src, ir::LabelIndex jip) { - SelectionInstruction *insn = this->appendInsn(SEL_OP_IF, 0, 1); + SelectionInstruction *insn = this->appendInsn(SEL_OP_ENDIF, 0, 1); insn->src(0) = src; insn->index = uint16_t(jip); } @@ -1412,9 +1373,17 @@ namespace gbe for (uint32_t regID = 0; regID < this->regNum; ++regID) this->regDAG[regID] = NULL; + this->block->hasBarrier = false; + this->block->hasBranch = bb.getLastInstruction()->getOpcode() == OP_BRA || + bb.getLastInstruction()->getOpcode() == OP_RET; + if (!this->block->hasBranch) + this->block->endifOffset = -1; + // Build the DAG on the fly uint32_t insnNum = 0; const_cast(bb).foreach([&](const Instruction &insn) { + if (insn.getOpcode() == OP_SYNC) + this->block->hasBarrier = true; // Build a selectionDAG node for instruction SelectionDAG *dag = this->newSelectionDAG(insn); @@ -1465,6 +1434,7 @@ namespace gbe void Selection::Opaque::matchBasicBlock(uint32_t insnNum) { // Bottom up code generation + bool needEndif = this->block->hasBranch == false && !this->block->hasBarrier; for (int32_t insnID = insnNum-1; insnID >= 0; --insnID) { // Process all possible patterns for this instruction SelectionDAG &dag = *insnDAG[insnID]; @@ -1476,8 +1446,10 @@ namespace gbe // Start a new code fragment this->startBackwardGeneration(); + // If there is no branch at the end of this block. 
// Try all the patterns from best to worst + do { if ((*it)->emit(*this, dag)) break; @@ -1485,6 +1457,13 @@ namespace gbe } while (it != end); GBE_ASSERT(it != end); + if (needEndif) { + const ir::BasicBlock *curr = insn.getParent(); + const ir::BasicBlock *next = curr->getNextBlock(); + this->ENDIF(GenRegister::immd(0), next->getLabelIndex()); + needEndif = false; + } + // Output the code in the current basic block this->endBackwardGeneration(); } @@ -2133,6 +2112,7 @@ namespace gbe const GenRegister src1 = sel.selReg(cmpInsn.getSrc(1), type); sel.push(); + sel.curr.noMask = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.execWidth = simdWidth; sel.SEL_CMP(genCmp, tmp, src0, src1); @@ -2329,7 +2309,6 @@ namespace gbe const Type type = insn.getType(); const Immediate imm = insn.getImmediate(); const GenRegister dst = sel.selReg(insn.getDst(0), type); - GenRegister flagReg; sel.push(); if (sel.isScalarOrBool(insn.getDst(0)) == true) { @@ -2371,24 +2350,10 @@ namespace gbe { using namespace ir; const ir::Register reg = sel.reg(FAMILY_DWORD); - const GenRegister barrierMask = sel.selReg(ocl::barriermask, TYPE_BOOL); const uint32_t params = insn.getParameters(); - sel.push(); - sel.curr.predicate = GEN_PREDICATE_NONE; - sel.curr.noMask = 1; - sel.curr.execWidth = 1; - sel.OR(barrierMask, GenRegister::flag(0, 0), barrierMask); - sel.MOV(GenRegister::flag(1, 1), barrierMask); - sel.pop(); - // A barrier is OK to start the thread synchronization *and* SLM fence - sel.push(); - //sel.curr.predicate = GEN_PREDICATE_NONE; - sel.curr.flag = 1; - sel.curr.subFlag = 1; - sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params); - sel.pop(); + sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params); return true; } @@ -2696,7 +2661,7 @@ namespace gbe GenRegister tmpDst; if (type == TYPE_BOOL || type == TYPE_U16 || type == TYPE_S16) - tmpDst = sel.selReg(sel.reg(FAMILY_WORD), TYPE_BOOL); + tmpDst = sel.selReg(dst, TYPE_BOOL); else tmpDst = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_S32); @@ -2724,36 +2689,23 @@ namespace gbe sel.push(); sel.curr.flag = 1; sel.curr.subFlag = 1; - sel.curr.predicate = GEN_PREDICATE_NONE; if (type == TYPE_S64 || type == TYPE_U64) { GenRegister tmp[3]; for(int i=0; i<3; i++) tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD)); - sel.push(); - sel.curr.execWidth = 1; - sel.curr.noMask = 1; - sel.MOV(GenRegister::flag(1, 1), GenRegister::flag(0, 0)); - sel.pop(); - sel.curr.predicate = GEN_PREDICATE_NORMAL; sel.I64CMP(getGenCompare(opcode), src0, src1, tmp, tmpDst); } else if(opcode == OP_ORD) { sel.push(); - sel.curr.execWidth = 1; - sel.curr.noMask = 1; - sel.MOV(GenRegister::flag(1, 1), GenRegister::flag(0, 0)); + sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst); + sel.curr.predicate = GEN_PREDICATE_NORMAL; + sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst); sel.pop(); - sel.curr.predicate = GEN_PREDICATE_NORMAL; - - sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst); - sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst); } else sel.CMP(getGenCompare(opcode), src0, src1, tmpDst); sel.pop(); if (!(type == TYPE_BOOL || type == TYPE_U16 || type == TYPE_S16)) sel.MOV(sel.selReg(dst, TYPE_U16), GenRegister::unpacked_uw((ir::Register)tmpDst.value.reg)); - else - sel.MOV(sel.selReg(dst, TYPE_U16), tmpDst); return true; } }; @@ -2979,11 +2931,6 @@ namespace gbe markAllChildren(dag); } - // Since we cannot predicate the select instruction with our current mask, - // we need to perform the selection in two steps (one to select, one to - // update the 
destination register) - const RegisterFamily family = getFamily(type); - const GenRegister tmp = sel.selReg(sel.reg(family), type); const uint32_t simdWidth = sel.ctx.getSimdWidth(); const Register pred = insn.getPredicate(); sel.push(); @@ -2992,16 +2939,14 @@ namespace gbe sel.curr.flag = 1; sel.curr.subFlag = 1; sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0)); - sel.curr.noMask = 0; + //sel.curr.noMask = 0; sel.curr.predicate = GEN_PREDICATE_NORMAL; if(type == ir::TYPE_S64 || type == ir::TYPE_U64) - sel.SEL_INT64(tmp, src0, src1); + sel.SEL_INT64(dst, src0, src1); else - sel.SEL(tmp, src0, src1); + sel.SEL(dst, src0, src1); sel.pop(); - // Update the destination register properly now - sel.MOV(dst, tmp); return true; } }; @@ -3041,6 +2986,7 @@ namespace gbe DECL_CTOR(TernaryInstruction, 1, 1); }; + /*! Label instruction pattern */ DECL_PATTERN(LabelInstruction) { @@ -3053,42 +2999,75 @@ namespace gbe const uint32_t simdWidth = sel.ctx.getSimdWidth(); sel.LABEL(label); - // Do not emit any code for the "returning" block. There is no need for it - if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock()) + // Do not emit any code for the "returning" block. There is no need for it + if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock()) return true; + LabelIndex jip; + const LabelIndex nextLabel = insn.getParent()->getNextBlock()->getLabelIndex(); + if (sel.ctx.hasJIP(&insn)) + jip = sel.ctx.getLabelIndex(&insn); + else + jip = nextLabel; + // Emit the mask computation at the head of each basic block sel.push(); + sel.curr.noMask = 1; sel.curr.predicate = GEN_PREDICATE_NONE; - sel.curr.flag = 0; - sel.curr.subFlag = 0; sel.CMP(GEN_CONDITIONAL_LE, GenRegister::retype(src0, GEN_TYPE_UW), src1); sel.pop(); - // If it is required, insert a JUMP to bypass the block - if (sel.ctx.hasJIP(&insn)) { - const LabelIndex jip = sel.ctx.getLabelIndex(&insn); + if (sel.block->hasBarrier) { + // If this block has barrier, we don't execute the block until all lanes + // are 1s. Set each reached lane to 1, then check all lanes. If there is any + // lane not reached, we jump to jip. And no need to issue if/endif for + // this block, as it will always excute with all lanes activated. sel.push(); - - sel.curr.noMask = 1; - sel.curr.execWidth = 1; + sel.curr.predicate = GEN_PREDICATE_NORMAL; + sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL)); sel.curr.predicate = GEN_PREDICATE_NONE; - GenRegister emaskReg = GenRegister::uw1grf(ocl::emask); - GenRegister flagReg = GenRegister::flag(0, 0); - sel.AND(flagReg, flagReg, emaskReg); - + sel.curr.noMask = 1; + sel.CMP(GEN_CONDITIONAL_EQ, GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL)); if (simdWidth == 8) - sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H; else if (simdWidth == 16) - sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H; + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H; else NOT_IMPLEMENTED; + sel.curr.noMask = 1; + sel.curr.execWidth = 1; sel.curr.inversePredicate = 1; - sel.curr.flag = 0; - sel.curr.subFlag = 0; sel.JMPI(GenRegister::immd(0), jip); sel.pop(); + // FIXME, if the last BRA is unconditional jump, we don't need to update the label here. 
+ sel.push(); + sel.curr.predicate = GEN_PREDICATE_NORMAL; + sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw((uint16_t)label)); + sel.pop(); + } + else { + if (sel.ctx.hasJIP(&insn)) { + // If it is required, insert a JUMP to bypass the block + sel.push(); + if (simdWidth == 8) + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; + else if (simdWidth == 16) + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H; + else + NOT_IMPLEMENTED; + sel.curr.noMask = 1; + sel.curr.execWidth = 1; + sel.curr.inversePredicate = 1; + sel.JMPI(GenRegister::immd(0), jip); + sel.pop(); + } + sel.push(); + sel.curr.predicate = GEN_PREDICATE_NORMAL; + // It's easier to set the jip to a relative position over next block. + sel.IF(GenRegister::immd(0), nextLabel, nextLabel, sel.block->endifOffset, sel.block->endifOffset); + sel.pop(); } + return true; } DECL_CTOR(LabelInstruction, 1, 1); @@ -3225,7 +3204,6 @@ namespace gbe /*! Branch instruction pattern */ DECL_PATTERN(BranchInstruction) { - void emitForwardBranch(Selection::Opaque &sel, const ir::BranchInstruction &insn, ir::LabelIndex dst, @@ -3233,16 +3211,13 @@ namespace gbe { using namespace ir; const GenRegister ip = sel.selReg(ocl::blockip, TYPE_U16); - const LabelIndex jip = sel.ctx.getLabelIndex(&insn); // We will not emit any jump if we must go the next block anyway const BasicBlock *curr = insn.getParent(); const BasicBlock *next = curr->getNextBlock(); const LabelIndex nextLabel = next->getLabelIndex(); - if (insn.isPredicated() == true) { const Register pred = insn.getPredicateIndex(); - sel.push(); // we don't need to set next label to the pcip // as if there is no backward jump latter, then obviously everything will work fine. @@ -3250,22 +3225,30 @@ namespace gbe sel.curr.flag = 0; sel.curr.subFlag = 0; sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0)); + sel.curr.predicate = GEN_PREDICATE_NORMAL; sel.MOV(ip, GenRegister::immuw(uint16_t(dst))); + if (!sel.block->hasBarrier) + sel.ENDIF(GenRegister::immd(0), nextLabel); + sel.block->endifOffset = -1; sel.pop(); - - if (nextLabel == jip) return; } else { // Update the PcIPs + const LabelIndex jip = sel.ctx.getLabelIndex(&insn); sel.MOV(ip, GenRegister::immuw(uint16_t(dst))); - - // Do not emit branch when we go to the next block anyway + if (!sel.block->hasBarrier) + sel.ENDIF(GenRegister::immd(0), nextLabel); + sel.block->endifOffset = -1; if (nextLabel == jip) return; + // Branch to the jump target sel.push(); sel.curr.execWidth = 1; sel.curr.noMask = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.JMPI(GenRegister::immd(0), jip); sel.pop(); + // FIXME just for the correct endif offset. + // JMPI still has 2 instruction. + sel.block->endifOffset -= 2; } } @@ -3290,37 +3273,32 @@ namespace gbe // that actually take the branch const LabelIndex next = bb.getNextBlock()->getLabelIndex(); sel.MOV(ip, GenRegister::immuw(uint16_t(next))); - + GBE_ASSERT(jip == dst); sel.push(); sel.curr.flag = 0; sel.curr.subFlag = 0; + sel.curr.predicate = GEN_PREDICATE_NONE; sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0)); - // Re-update the PcIPs for the branches that takes the backward jump + sel.curr.predicate = GEN_PREDICATE_NORMAL; sel.MOV(ip, GenRegister::immuw(uint16_t(dst))); - - // We clear all the inactive channel to 0 as the GEN_PREDICATE_ALIGN1_ANY8/16 - // will check those bits as well. 
sel.curr.predicate = GEN_PREDICATE_NONE; + if (!sel.block->hasBarrier) + sel.ENDIF(GenRegister::immd(0), next); sel.curr.execWidth = 1; - sel.curr.noMask = 1; - GenRegister emaskReg = GenRegister::uw1grf(ocl::emask); - sel.AND(GenRegister::flag(0, 1), GenRegister::flag(0, 1), emaskReg); - - // Branch to the jump target - if (simdWidth == 8) - sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; - else if (simdWidth == 16) + if (simdWidth == 16) sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H; else - NOT_SUPPORTED; + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; + sel.curr.noMask = 1; sel.JMPI(GenRegister::immd(0), jip); + sel.block->endifOffset = -3; sel.pop(); - } else { - + const LabelIndex next = bb.getNextBlock()->getLabelIndex(); // Update the PcIPs sel.MOV(ip, GenRegister::immuw(uint16_t(dst))); - + if (!sel.block->hasBarrier) + sel.ENDIF(GenRegister::immd(0), next); // Branch to the jump target sel.push(); sel.curr.execWidth = 1; @@ -3328,6 +3306,7 @@ namespace gbe sel.curr.predicate = GEN_PREDICATE_NONE; sel.JMPI(GenRegister::immd(0), jip); sel.pop(); + sel.block->endifOffset = -3; } } diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp index 04fbb9f..8557768 100644 --- a/backend/src/backend/gen_insn_selection.hpp +++ b/backend/src/backend/gen_insn_selection.hpp @@ -42,6 +42,8 @@ namespace gbe /*! Translate IR compare to Gen compare */ uint32_t getGenCompare(ir::Opcode opcode); + #define GEN_MAX_LABEL 0xFFFF + /*! Selection opcodes properly encoded from 0 to n for fast jump tables * generations */ @@ -190,6 +192,9 @@ namespace gbe void append(SelectionInstruction *insn); /*! Append a new selection instruction at the beginning of the block */ void prepend(SelectionInstruction *insn); + int endifOffset; + bool hasBarrier; + bool hasBranch; }; /*! 
Owns the selection engine */ diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx index d318f8e..ddc9d5e 100644 --- a/backend/src/backend/gen_insn_selection.hxx +++ b/backend/src/backend/gen_insn_selection.hxx @@ -80,7 +80,7 @@ DECL_SELECTION_IR(CONVI64_TO_I, UnaryInstruction) DECL_SELECTION_IR(CONVI64_TO_F, I64ToFloatInstruction) DECL_SELECTION_IR(CONVF_TO_I64, FloatToI64Instruction) DECL_SELECTION_IR(I64MADSAT, I64MADSATInstruction) -DECL_SELECTION_IR(BRC, BinaryInstruction) +DECL_SELECTION_IR(BRC, UnaryInstruction) DECL_SELECTION_IR(BRD, UnaryInstruction) DECL_SELECTION_IR(IF, UnaryInstruction) DECL_SELECTION_IR(ENDIF, UnaryInstruction) diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp index 25cb428..051f16d 100644 --- a/backend/src/backend/gen_register.hpp +++ b/backend/src/backend/gen_register.hpp @@ -118,7 +118,7 @@ namespace gbe this->noMask = 0; this->flag = 0; this->subFlag = 0; - this->predicate = GEN_PREDICATE_NORMAL; + this->predicate = GEN_PREDICATE_NONE; this->inversePredicate = 0; this->physicalFlag = 1; this->flagIndex = 0; @@ -657,6 +657,17 @@ namespace gbe GEN_HORIZONTAL_STRIDE_1); } + static INLINE GenRegister nullud(void) { + return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE, + GEN_ARF_NULL, + 0, + GEN_TYPE_UD, + GEN_VERTICAL_STRIDE_8, + GEN_WIDTH_8, + GEN_HORIZONTAL_STRIDE_1); + } + + static INLINE bool isNull(GenRegister reg) { return (reg.file == GEN_ARCHITECTURE_REGISTER_FILE && reg.nr == GEN_ARF_NULL); diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c index 5e474de..c2308da 100644 --- a/src/intel/intel_driver.c +++ b/src/intel/intel_driver.c @@ -135,6 +135,7 @@ intel_driver_memman_init(intel_driver_t *driver) { driver->bufmgr = drm_intel_bufmgr_gem_init(driver->fd, BATCH_SIZE); assert(driver->bufmgr); + //drm_intel_bufmgr_gem_set_aub_dump(driver->bufmgr, 1); drm_intel_bufmgr_gem_enable_reuse(driver->bufmgr); } diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index e95b050..4819e9e 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -695,7 +695,7 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel) memset(desc, 0, sizeof(*desc)); ker_bo = (drm_intel_bo *) kernel->bo; desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */ - desc->desc1.single_program_flow = 1; + desc->desc1.single_program_flow = 0; desc->desc1.floating_point_mode = 0; /* use IEEE-754 rule */ desc->desc5.rounding_mode = 0; /* round to nearest even */ -- 2.7.4
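
A note on the branch offset encoding used by the new branchPos3 patching loop in gen_context.cpp above: JIP and UIP are expressed as signed distances from the branch instruction, counted in half-instruction units (hence the "* 2"), and packed into one 32-bit word with JIP in the low 16 bits and UIP in the high 16 bits. The standalone C++ sketch below only models that arithmetic; the helper name packBranchOffsets and the sample instruction positions are illustrative and are not part of the patch (the low word is masked here so a negative JIP cannot spill into the UIP field).

    #include <cassert>
    #include <cstdint>

    // Minimal model of the branchPos3 patching arithmetic (gen_context.cpp).
    // Distances are relative to the branch instruction and counted in half
    // instructions (one Gen instruction == 2 units), matching the "* 2" above.
    static uint32_t packBranchOffsets(int32_t insnID, int32_t jipTarget, int32_t uipTarget) {
      const int32_t jip = (jipTarget - insnID) * 2;
      const int32_t uip = (uipTarget - insnID) * 2;
      assert(jip < 32767 && jip > -32768); // same range checks as the patch
      assert(uip < 32767 && uip > -32768);
      return (uint32_t(uip) << 16) | (uint32_t(jip) & 0xFFFF);
    }

    int main(void) {
      // Hypothetical example: a branch at instruction 10, with its JIP target
      // at instruction 14 and its UIP target at instruction 22.
      const uint32_t packed = packBranchOffsets(10, 14, 22);
      assert((packed & 0xFFFF) == 8); // JIP field: (14 - 10) * 2
      assert((packed >> 16) == 24);   // UIP field: (22 - 10) * 2
      return 0;
    }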