From 858414e497141b54c0df18510cb764d3770a2d9c Mon Sep 17 00:00:00 2001
From: Zhigang Gong
Date: Fri, 28 Mar 2014 00:38:29 +0800
Subject: [PATCH] GBE: Optimize the bool register allocation/processing.

Previously, we had a global flag allocation implementation. After some
analysis, I found that global flag allocation is not the best solution here.
For cross-block references of a bool value, we have to combine it with the
current emask, so there is no obvious advantage in allocating a dedicated
physical flag register for that cross-block usage. We just need to allocate
physical flags within each BB, handling the following cases:

1. The bool's liveness never extends beyond this BB, and the bool is only used
   as a dst register or a pred register. Such a bool can be kept in a physical
   flag, provided enough physical flags are available. We already identify
   those bools at the instruction selection stage and put them in the
   flagBooleans set.
2. The bool is defined in another BB and used in this BB; we then need to
   prepend an instruction at the position where we use it.
3. The bool is defined in this BB but is also used as some instruction's
   source register rather than as the pred register. We have to keep the
   normal GRF (UW8/UW16) register for this bool. For some CMP instructions,
   we need to append a SEL instruction to convert the flag into the GRF
   register.
4. Even for a spilled flag, if there is only one spilled flag, we still try to
   reuse the temporary flag register later. This requires that every
   instruction gets its flag at the instruction selection stage and does not
   use a physical flag number directly at the gen_context stage; otherwise the
   algorithm here may break.

We track all the validated bool values to avoid redundant validation of the
same flag. If there are not enough physical flags, we have to spill a
previously allocated physical flag, and the spilling policy is to spill the
allocated flag whose live interval ends last.
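For illustration only, the per-BB allocation and spilling policy above can be
summarized by the following minimal, standalone C++ sketch. The names
(BoolInterval, allocateBlockFlags) are made up for this sketch and are not the
real gen_reg_allocation.cpp types; only the policy follows the patch: the flag
sub-registers f1.0, f1.1 and f0.0 are handed out in liveness-start order,
f0.1 is reserved as the shared temporary, and when no flag is free we spill
the allocated interval that ends last back to a GRF.

  // Standalone sketch of the per-BB flag allocation policy (not the real code).
  #include <algorithm>
  #include <cstdint>
  #include <map>
  #include <vector>

  struct BoolInterval {      // liveness of one boolean within a single BB
    uint32_t reg;            // virtual register id
    int32_t minID, maxID;    // first/last instruction ID that uses the bool
  };

  static std::map<uint32_t, uint32_t>
  allocateBlockFlags(std::vector<BoolInterval> intervals) {
    std::map<uint32_t, uint32_t> allocated;         // reg -> flag index
    std::map<uint32_t, const BoolInterval *> live;  // flag index -> interval
    std::vector<uint32_t> freeFlags = {2, 3, 0};    // f1.0, f1.1, f0.0 (f0.1 stays the temporary)
    std::sort(intervals.begin(), intervals.end(),
              [](const BoolInterval &a, const BoolInterval &b) { return a.minID < b.minID; });
    for (const BoolInterval &cur : intervals) {
      // Expire every interval that ended before this one starts and recycle its flag.
      for (auto it = live.begin(); it != live.end();) {
        if (it->second->maxID < cur.minID) {
          freeFlags.push_back(it->first);
          it = live.erase(it);
        } else {
          ++it;
        }
      }
      if (freeFlags.empty()) {
        // Spilling policy: evict the allocated interval whose live range ends last.
        auto victim = std::max_element(
            live.begin(), live.end(),
            [](const auto &a, const auto &b) { return a.second->maxID < b.second->maxID; });
        allocated.erase(victim->second->reg);       // this bool falls back to a GRF
        freeFlags.push_back(victim->first);
        live.erase(victim);
      }
      const uint32_t flag = freeFlags.back();
      freeFlags.pop_back();
      allocated[cur.reg] = flag;
      live[flag] = &cur;
    }
    return allocated;  // flag register = index / 2, sub-flag = index & 1
  }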
Let's look at a real example of the improvement from this patch. Taking
compiler_vect_compare as the example, before this patch the instructions are
as below:

( 24) cmp.g.f1.1(8) null g110<8,8,1>D 0D { align1 WE_normal 1Q };
( 26) cmp.g.f1.1(8) null g111<8,8,1>D 0D { align1 WE_normal 2Q };
( 28) (+f1.1) sel(16) g109<1>UW g1.2<0,1,0>UW g1<0,1,0>UW { align1 WE_normal 1H };
( 30) cmp.g.f1.1(8) null g112<8,8,1>D 0D { align1 WE_normal 1Q };
( 32) cmp.g.f1.1(8) null g113<8,8,1>D 0D { align1 WE_normal 2Q };
( 34) (+f1.1) sel(16) g108<1>UW g1.2<0,1,0>UW g1<0,1,0>UW { align1 WE_normal 1H };
( 36) cmp.g.f1.1(8) null g114<8,8,1>D 0D { align1 WE_normal 1Q };
( 38) cmp.g.f1.1(8) null g115<8,8,1>D 0D { align1 WE_normal 2Q };
( 40) (+f1.1) sel(16) g107<1>UW g1.2<0,1,0>UW g1<0,1,0>UW { align1 WE_normal 1H };
( 42) cmp.g.f1.1(8) null g116<8,8,1>D 0D { align1 WE_normal 1Q };
( 44) cmp.g.f1.1(8) null g117<8,8,1>D 0D { align1 WE_normal 2Q };
( 46) (+f1.1) sel(16) g106<1>UW g1.2<0,1,0>UW g1<0,1,0>UW { align1 WE_normal 1H };
( 48) mov(16) g104<1>F -nanF { align1 WE_normal 1H };
( 50) cmp.ne.f1.1(16) null g109<8,8,1>UW 0x0UW { align1 WE_normal 1H switch };
( 52) (+f1.1) sel(16) g96<1>D g104<8,8,1>D 0D { align1 WE_normal 1H };
( 54) cmp.ne.f1.1(16) null g108<8,8,1>UW 0x0UW { align1 WE_normal 1H switch };
( 56) (+f1.1) sel(16) g98<1>D g104<8,8,1>D 0D { align1 WE_normal 1H };
( 58) cmp.ne.f1.1(16) null g107<8,8,1>UW 0x0UW { align1 WE_normal 1H switch };
( 60) (+f1.1) sel(16) g100<1>D g104<8,8,1>D 0D { align1 WE_normal 1H };
( 62) cmp.ne.f1.1(16) null g106<8,8,1>UW 0x0UW { align1 WE_normal 1H switch };
( 64) (+f1.1) sel(16) g102<1>D g104<8,8,1>D 0D { align1 WE_normal 1H };
( 66) add(16) g94<1>D g1.3<0,1,0>D g120<8,8,1>D { align1 WE_normal 1H };
( 68) send(16) null g94<8,8,1>UD data (bti: 1, rgba: 0, SIMD16, legacy, Untyped Surface Write) mlen 10 rlen 0 { align1 WE_normal 1H };
( 70) mov(16) g2<1>UW 0x1UW { align1 WE_normal 1H };
( 72) endif(16) 2 null { align1 WE_normal 1H };

After this patch, it becomes:

( 24) cmp.g(8) null g110<8,8,1>D 0D { align1 WE_normal 1Q };
( 26) cmp.g(8) null g111<8,8,1>D 0D { align1 WE_normal 2Q };
( 28) cmp.g.f1.1(8) null g112<8,8,1>D 0D { align1 WE_normal 1Q };
( 30) cmp.g.f1.1(8) null g113<8,8,1>D 0D { align1 WE_normal 2Q };
( 32) cmp.g.f0.1(8) null g114<8,8,1>D 0D { align1 WE_normal 1Q };
( 34) cmp.g.f0.1(8) null g115<8,8,1>D 0D { align1 WE_normal 2Q };
( 36) (+f0.1) sel(16) g109<1>UW g1.2<0,1,0>UW g1<0,1,0>UW { align1 WE_normal 1H };
( 38) cmp.g.f1.0(8) null g116<8,8,1>D 0D { align1 WE_normal 1Q };
( 40) cmp.g.f1.0(8) null g117<8,8,1>D 0D { align1 WE_normal 2Q };
( 42) mov(16) g106<1>F -nanF { align1 WE_normal 1H };
( 44) (+f0) sel(16) g98<1>D g106<8,8,1>D 0D { align1 WE_normal 1H };
( 46) (+f1.1) sel(16) g100<1>D g106<8,8,1>D 0D { align1 WE_normal 1H };
( 48) (+f0.1) sel(16) g102<1>D g106<8,8,1>D 0D { align1 WE_normal 1H };
( 50) (+f1) sel(16) g104<1>D g106<8,8,1>D 0D { align1 WE_normal 1H };
( 52) add(16) g96<1>D g1.3<0,1,0>D g120<8,8,1>D { align1 WE_normal 1H };
( 54) send(16) null g96<8,8,1>UD data (bti: 1, rgba: 0, SIMD16, legacy, Untyped Surface Write) mlen 10 rlen 0 { align1 WE_normal 1H };
( 56) mov(16) g2<1>UW 0x1UW { align1 WE_normal 1H };
( 58) endif(16) 2 null { align1 WE_normal 1H };

It reduces the instruction count from 25 to 18, saving about 28% of the
instructions.

v2: Fix some minor bugs.
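The validation step for booleans that do not own a physical flag in this BB
(cases 2 and 4 above) can be pictured with the simplified sketch below; SelInsn
and ValidationCmp are made-up stand-ins for the real selection IR, not its
actual classes. A predicated user of such a bool gets a cmp.ne against the
bool's GRF copy prepended to load f0.1, and the last validated register is
remembered so back-to-back uses of the same bool are not revalidated.

  // Illustrative model of the temporary-flag validation and reuse tracking.
  #include <cstdint>
  #include <vector>

  struct SelInsn {
    uint16_t flagIndex;  // virtual boolean register feeding the predicate
    bool predicated;     // reads the flag as a predicate
    bool modifiesFlag;   // also writes the flag (modFlag == 1)
  };

  // One "cmp.ne f0.1 null grf(boolReg) 0" that must be prepended before the user.
  struct ValidationCmp { uint16_t boolReg; };

  static std::vector<ValidationCmp> validateTempFlag(const std::vector<SelInsn> &insns) {
    std::vector<ValidationCmp> prepended;
    uint16_t validTempFlagReg = 0;  // 0 means nothing currently validated in f0.1
    for (const SelInsn &insn : insns) {
      if (insn.predicated && validTempFlagReg != insn.flagIndex)
        prepended.push_back({insn.flagIndex});  // reload f0.1 from the bool's GRF copy
      // A flag-writing instruction invalidates whatever f0.1 held before.
      validTempFlagReg = insn.modifiesFlag ? 0 : insn.flagIndex;
    }
    return prepended;
  }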
Signed-off-by: Zhigang Gong Reviewed-by: "Yang, Rong R" Reviewed-by: "Song, Ruiling" --- backend/src/backend/gen_context.cpp | 46 ++-- backend/src/backend/gen_encoder.cpp | 35 ++- backend/src/backend/gen_encoder.hpp | 10 +- backend/src/backend/gen_insn_selection.cpp | 136 ++++++++--- backend/src/backend/gen_reg_allocation.cpp | 375 ++++++++++++++++++----------- backend/src/backend/gen_register.hpp | 11 +- 6 files changed, 406 insertions(+), 207 deletions(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index c9e0835..3224c06 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -190,7 +190,7 @@ namespace gbe const GenRegister dst = ra->genReg(insn.dst(0)); const GenRegister src = ra->genReg(insn.src(0)); switch (insn.opcode) { - case SEL_OP_MOV: p->MOV(dst, src); break; + case SEL_OP_MOV: p->MOV(dst, src, insn.extra.function); break; case SEL_OP_FBH: p->FBH(dst, src); break; case SEL_OP_FBL: p->FBL(dst, src); break; case SEL_OP_NOT: p->NOT(dst, src); break; @@ -407,9 +407,9 @@ namespace gbe p->pop(); } break; - case SEL_OP_AND: p->AND(dst, src0, src1); break; - case SEL_OP_OR: p->OR (dst, src0, src1); break; - case SEL_OP_XOR: p->XOR(dst, src0, src1); break; + case SEL_OP_AND: p->AND(dst, src0, src1, insn.extra.function); break; + case SEL_OP_OR: p->OR (dst, src0, src1, insn.extra.function); break; + case SEL_OP_XOR: p->XOR(dst, src0, src1, insn.extra.function); break; case SEL_OP_I64AND: { GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL), @@ -566,7 +566,9 @@ namespace gbe GenRegister g = ra->genReg(insn.dst(7)); GenRegister h = ra->genReg(insn.dst(8)); GenRegister i = ra->genReg(insn.dst(9)); - GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(10))); + //GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(10))); + // We just simply use the temporary flag here. + GenRegister flagReg = GenRegister::flag(0, 1); loadTopHalf(a, x); loadBottomHalf(b, x); loadTopHalf(c, y); @@ -613,7 +615,9 @@ namespace gbe GenRegister g = ra->genReg(insn.dst(7)); GenRegister h = ra->genReg(insn.dst(8)); GenRegister i = ra->genReg(insn.dst(9)); - GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(10))); + //GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(10))); + // We just simply use the temporary flag here. + GenRegister flagReg = GenRegister::flag(0, 1); GenRegister zero = GenRegister::immud(0), one = GenRegister::immud(1); loadTopHalf(a, x); loadBottomHalf(b, x); @@ -797,7 +801,9 @@ namespace gbe GenRegister e = ra->genReg(insn.dst(5)); GenRegister f = ra->genReg(insn.dst(6)); a.type = b.type = c.type = d.type = e.type = f.type = GEN_TYPE_UD; - GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(7))); + //GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(7))); + // We just simply use the temporary flag here. + GenRegister flagReg = GenRegister::flag(0, 1); GenRegister zero = GenRegister::immud(0); switch(insn.opcode) { case SEL_OP_I64SHL: @@ -1001,7 +1007,9 @@ namespace gbe GenRegister mantissa = ra->genReg(insn.dst(4)); GenRegister tmp = ra->genReg(insn.dst(5)); GenRegister tmp_high = ra->genReg(insn.dst(6)); - GenRegister f0 = checkFlagRegister(ra->genReg(insn.dst(7))); + //GenRegister f0 = checkFlagRegister(ra->genReg(insn.dst(7))); + // We just simply use the temporary flag here. 
+ GenRegister f0 = GenRegister::flag(0, 1); loadTopHalf(high, src); loadBottomHalf(low, src); if(!src.is_signed_int()) { @@ -1039,7 +1047,9 @@ namespace gbe GenRegister dst = ra->genReg(insn.dst(0)); GenRegister high = ra->genReg(insn.dst(1)); GenRegister tmp = ra->genReg(insn.dst(2)); - GenRegister flag0 = checkFlagRegister(ra->genReg(insn.dst(3))); + //GenRegister flag0 = checkFlagRegister(ra->genReg(insn.dst(3))); + // We just simply use the temporary flag here. + GenRegister flag0 = GenRegister::flag(0, 1); if(dst.is_signed_int()) high = GenRegister::retype(high, GEN_TYPE_D); @@ -1160,7 +1170,9 @@ namespace gbe GenRegister c = ra->genReg(insn.dst(3)); GenRegister d = ra->genReg(insn.dst(4)); GenRegister e = ra->genReg(insn.dst(5)); - GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(6))); + //GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(6))); + // We just simply use the temporary flag here. + GenRegister flagReg = GenRegister::flag(0, 1); loadTopHalf(a, x); loadBottomHalf(b, x); loadTopHalf(c, y); @@ -1208,7 +1220,9 @@ namespace gbe GenRegister c = ra->genReg(insn.dst(3)); GenRegister d = ra->genReg(insn.dst(4)); GenRegister e = ra->genReg(insn.dst(5)); - GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(6))); + //GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(6))); + // We just simply use the temporary flag here. + GenRegister flagReg = GenRegister::flag(0, 1); loadTopHalf(a, x); loadBottomHalf(b, x); loadTopHalf(c, y); @@ -1266,7 +1280,7 @@ namespace gbe int execWidth = p->curr.execWidth; dest = dest.top_half(); p->push(); - p->curr.predicate = GEN_PREDICATE_NORMAL; + p->curr.noMask = 0; p->curr.execWidth = 8; p->MOV(dest, src); p->curr.nibControl = 1; @@ -1302,7 +1316,7 @@ namespace gbe dest = dest.bottom_half(); p->push(); p->curr.execWidth = 8; - p->curr.predicate = GEN_PREDICATE_NORMAL; + p->curr.noMask = 0; p->MOV(dest, src); p->curr.nibControl = 1; p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4)); @@ -1414,7 +1428,9 @@ namespace gbe GenRegister k = ra->genReg(insn.dst(11)); GenRegister l = ra->genReg(insn.dst(12)); GenRegister m = ra->genReg(insn.dst(13)); - GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(14))); + //GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(14))); + // We just simply use the temporary flag here. 
+ GenRegister flagReg = GenRegister::flag(0, 1); GenRegister zero = GenRegister::immud(0), one = GenRegister::immud(1), imm31 = GenRegister::immud(31); @@ -1511,7 +1527,7 @@ namespace gbe int jip = -(int)(p->n_instruction() - loop_start + 1) * 2; p->curr.noMask = 1; p->JMPI(zero); - p->patchJMPI(p->n_instruction() - 2, jip + 2); + p->patchJMPI(p->n_instruction() - 1, jip + 2); p->pop(); // end of loop } diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp index e8670b9..9df031e 100644 --- a/backend/src/backend/gen_encoder.cpp +++ b/backend/src/backend/gen_encoder.cpp @@ -661,7 +661,8 @@ namespace gbe } } - INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) { + INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, + GenRegister src, uint32_t condition = 0) { if (dst.isdf() && src.isdf()) { handleDouble(p, opcode, dst, src); } else if (dst.isint64() && src.isint64()) { // handle int64 @@ -678,6 +679,11 @@ namespace gbe p->pop(); } else if (needToSplitAlu1(p, dst, src) == false) { GenInstruction *insn = p->next(opcode); + if (condition != 0) { + GBE_ASSERT(opcode == GEN_OPCODE_MOV || + opcode == GEN_OPCODE_NOT); + insn->header.destreg_or_condmod = condition; + } p->setHeader(insn); p->setDst(insn, dst); p->setSrc0(insn, src); @@ -706,12 +712,19 @@ namespace gbe uint32_t opcode, GenRegister dst, GenRegister src0, - GenRegister src1) + GenRegister src1, + uint32_t condition = 0) { if (dst.isdf() && src0.isdf() && src1.isdf()) { handleDouble(p, opcode, dst, src0, src1); } else if (needToSplitAlu2(p, dst, src0, src1) == false) { GenInstruction *insn = p->next(opcode); + if (condition != 0) { + GBE_ASSERT(opcode == GEN_OPCODE_OR || + opcode == GEN_OPCODE_XOR || + opcode == GEN_OPCODE_AND); + insn->header.destreg_or_condmod = condition; + } p->setHeader(insn); p->setDst(insn, dst); p->setSrc0(insn, src0); @@ -817,15 +830,21 @@ namespace gbe #undef NO_SWIZZLE #define ALU1(OP) \ - void GenEncoder::OP(GenRegister dest, GenRegister src0) { \ - alu1(this, GEN_OPCODE_##OP, dest, src0); \ + void GenEncoder::OP(GenRegister dest, GenRegister src0, uint32_t condition) { \ + alu1(this, GEN_OPCODE_##OP, dest, src0, condition); \ } #define ALU2(OP) \ void GenEncoder::OP(GenRegister dest, GenRegister src0, GenRegister src1) { \ - alu2(this, GEN_OPCODE_##OP, dest, src0, src1); \ + alu2(this, GEN_OPCODE_##OP, dest, src0, src1, 0); \ } +#define ALU2_MOD(OP) \ + void GenEncoder::OP(GenRegister dest, GenRegister src0, GenRegister src1, uint32_t condition) { \ + alu2(this, GEN_OPCODE_##OP, dest, src0, src1, condition); \ + } + + #define ALU3(OP) \ void GenEncoder::OP(GenRegister dest, GenRegister src0, GenRegister src1, GenRegister src2) { \ alu3(this, GEN_OPCODE_##OP, dest, src0, src1, src2); \ @@ -947,9 +966,9 @@ namespace gbe ALU1(F32TO16) ALU2(SEL) ALU1(NOT) - ALU2(AND) - ALU2(OR) - ALU2(XOR) + ALU2_MOD(AND) + ALU2_MOD(OR) + ALU2_MOD(XOR) ALU2(SHR) ALU2(SHL) ALU2(RSR) diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp index f5e8548..50662fb 100644 --- a/backend/src/backend/gen_encoder.hpp +++ b/backend/src/backend/gen_encoder.hpp @@ -86,8 +86,9 @@ namespace gbe // Encoding functions //////////////////////////////////////////////////////////////////////// -#define ALU1(OP) void OP(GenRegister dest, GenRegister src0); +#define ALU1(OP) void OP(GenRegister dest, GenRegister src0, uint32_t condition = 0); #define ALU2(OP) void OP(GenRegister dest, GenRegister src0, GenRegister src1); +#define ALU2_MOD(OP) 
void OP(GenRegister dest, GenRegister src0, GenRegister src1, uint32_t condition = 0); #define ALU3(OP) void OP(GenRegister dest, GenRegister src0, GenRegister src1, GenRegister src2); ALU1(MOV) ALU1(FBH) @@ -103,9 +104,9 @@ namespace gbe ALU1(F32TO16) ALU2(SEL) ALU1(NOT) - ALU2(AND) - ALU2(OR) - ALU2(XOR) + ALU2_MOD(AND) + ALU2_MOD(OR) + ALU2_MOD(XOR) ALU2(SHR) ALU2(SHL) ALU2(RSR) @@ -126,6 +127,7 @@ namespace gbe ALU1(BRD) #undef ALU1 #undef ALU2 +#undef ALU2_MOD #undef ALU3 void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null()); void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value); diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index aec0459..975c71a 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -148,7 +148,9 @@ namespace gbe SelectionInstruction::SelectionInstruction(SelectionOpcode op, uint32_t dst, uint32_t src) : parent(NULL), opcode(op), dstNum(dst), srcNum(src) - {} + { + extra.function = 0; + } void SelectionInstruction::prepend(SelectionInstruction &other) { gbe::prepend(&other, this); @@ -225,6 +227,7 @@ namespace gbe GBE_ASSERT(insn.getSrcNum() < 127); for (uint32_t childID = 0; childID < childNum; ++childID) this->child[childID] = NULL; + computeBool = false; } /*! Mergeable are non-root instructions with valid sources */ INLINE void setAsMergeable(uint32_t which) { mergeable|=(1<setAsMergeable(srcID); dag->child[srcID] = child; + // Check whether this bool is used as a normal source + // oprand other than BRA/SEL. + if (getRegisterFamily(reg) == FAMILY_BOOL) { + if (insn.getOpcode() != OP_BRA && + (insn.getOpcode() != OP_SEL || + (insn.getOpcode() == OP_SEL && srcID != 0))) + child->computeBool = true; + } } else dag->child[srcID] = NULL; } @@ -1686,8 +1699,16 @@ namespace gbe if (dst.isdf()) { ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD); sel.MOV_DF(dst, src, sel.selReg(r)); - } else - sel.MOV(dst, src); + } else { + sel.push(); + if (sel.getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL) { + sel.curr.physicalFlag = 0; + sel.curr.flagIndex = (uint16_t)(insn.getDst(0)); + sel.curr.modFlag = 1; + } + sel.MOV(dst, src); + sel.pop(); + } break; case ir::OP_RNDD: sel.RNDD(dst, src); break; case ir::OP_RNDE: sel.RNDE(dst, src); break; @@ -1842,6 +1863,16 @@ namespace gbe } // Output the binary instruction + if (sel.curr.execWidth != 1 && + sel.getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL) { + GBE_ASSERT(insn.getOpcode() == OP_AND || + insn.getOpcode() == OP_OR || + insn.getOpcode() == OP_XOR); + sel.curr.physicalFlag = 0; + sel.curr.flagIndex = (uint16_t)(insn.getDst(0)); + sel.curr.modFlag = 1; + } + switch (opcode) { case OP_ADD: if (type == Type::TYPE_U64 || type == Type::TYPE_S64) { @@ -2316,6 +2347,11 @@ namespace gbe switch (type) { case TYPE_BOOL: + if (!sel.isScalarOrBool(insn.getDst(0))) { + sel.curr.modFlag = 1; + sel.curr.physicalFlag = 0; + sel.curr.flagIndex = (uint16_t) insn.getDst(0); + } sel.MOV(dst, imm.data.b ? 
GenRegister::immuw(0xffff) : GenRegister::immuw(0)); break; case TYPE_U32: @@ -2656,13 +2692,20 @@ namespace gbe const Type type = insn.getType(); const Register dst = insn.getDst(0); GenRegister tmpDst; + const BasicBlock *curr = insn.getParent(); + const ir::Liveness &liveness = sel.ctx.getLiveness(); + const ir::Liveness::LiveOut &liveOut = liveness.getLiveOut(curr); + bool needStoreBool = false; + if (liveOut.contains(dst) || dag.computeBool) + needStoreBool = true; + if(type == TYPE_S64 || type == TYPE_U64 || type == TYPE_DOUBLE || type == TYPE_FLOAT || - type == TYPE_U32 || type == TYPE_S32 ) + type == TYPE_U32 || type == TYPE_S32 /*|| + (!needStoreBool)*/) tmpDst = GenRegister::nullud(); else tmpDst = sel.selReg(dst, TYPE_BOOL); - // Look for immediate values for the right source GenRegister src0, src1; SelectionDAG *dag0 = dag.child[0]; @@ -2685,35 +2728,42 @@ namespace gbe } sel.push(); - sel.curr.flag = 1; - sel.curr.subFlag = 1; + sel.curr.physicalFlag = 0; + sel.curr.modFlag = 1; + sel.curr.flagIndex = (uint16_t)dst; + sel.curr.grfFlag = needStoreBool; // indicate whether we need to allocate grf to store this boolean. if (type == TYPE_S64 || type == TYPE_U64) { GenRegister tmp[3]; for(int i=0; i<3; i++) tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD)); + sel.curr.flagGen = 1; sel.I64CMP(getGenCompare(opcode), src0, src1, tmp); } else if(opcode == OP_ORD) { sel.push(); sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst); sel.curr.predicate = GEN_PREDICATE_NORMAL; + sel.curr.flagGen = 1; sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst); sel.pop(); - } else + } else { + if((type == TYPE_S64 || type == TYPE_U64 || + type == TYPE_DOUBLE || type == TYPE_FLOAT || + type == TYPE_U32 || type == TYPE_S32)) + sel.curr.flagGen = 1; sel.CMP(getGenCompare(opcode), src0, src1, tmpDst); + } +#if 0 + if((type == TYPE_S64 || type == TYPE_U64 || + type == TYPE_DOUBLE || type == TYPE_FLOAT || + type == TYPE_U32 || type == TYPE_S32) /*&& + needStoreBool*/) { + sel.curr.predicate = GEN_PREDICATE_NORMAL; + sel.SEL(sel.selReg(dst, TYPE_U16), + sel.selReg(ir::ocl::one, TYPE_U16), + sel.selReg(ir::ocl::zero, TYPE_U16)); + } +#endif sel.pop(); - - if(type == TYPE_S64 || type == TYPE_U64 || - type == TYPE_DOUBLE || type == TYPE_FLOAT || - type == TYPE_U32 || type == TYPE_S32 ) { - sel.push(); - sel.curr.flag = 1; - sel.curr.subFlag = 1; - sel.curr.predicate = GEN_PREDICATE_NORMAL; - sel.SEL(sel.selReg(dst, TYPE_U16), - sel.selReg(ir::ocl::one, TYPE_U16), - sel.selReg(ir::ocl::zero, TYPE_U16)); - sel.pop(); - } return true; } }; @@ -2941,13 +2991,11 @@ namespace gbe const Register pred = insn.getPredicate(); sel.push(); - sel.curr.predicate = GEN_PREDICATE_NONE; - sel.curr.execWidth = simdWidth; - sel.curr.flag = 1; - sel.curr.subFlag = 1; - sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0)); - //sel.curr.noMask = 0; + sel.curr.physicalFlag = 0; + sel.curr.flagIndex = (uint16_t) pred; sel.curr.predicate = GEN_PREDICATE_NORMAL; + if (!dag0) + sel.curr.externFlag = 1; if(type == ir::TYPE_S64 || type == ir::TYPE_U64) sel.SEL_INT64(dst, src0, src1); else @@ -3214,8 +3262,15 @@ namespace gbe }; /*! 
Branch instruction pattern */ - DECL_PATTERN(BranchInstruction) + class BranchInstructionPattern : public SelectionPattern { + public: + BranchInstructionPattern(void) : SelectionPattern(1,1) { + for (uint32_t op = 0; op < ir::OP_INVALID; ++op) + if (ir::isOpcodeFrom(ir::Opcode(op)) == true) + this->opcodes.push_back(ir::Opcode(op)); + } + void emitForwardBranch(Selection::Opaque &sel, const ir::BranchInstruction &insn, ir::LabelIndex dst, @@ -3234,11 +3289,11 @@ namespace gbe // we don't need to set next label to the pcip // as if there is no backward jump latter, then obviously everything will work fine. // If there is backward jump latter, then all the pcip will be updated correctly there. - sel.curr.flag = 0; - sel.curr.subFlag = 0; - sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0)); + sel.curr.physicalFlag = 0; + sel.curr.flagIndex = (uint16_t) pred; sel.curr.predicate = GEN_PREDICATE_NORMAL; sel.MOV(ip, GenRegister::immuw(uint16_t(dst))); + sel.curr.predicate = GEN_PREDICATE_NONE; if (!sel.block->hasBarrier) sel.ENDIF(GenRegister::immd(0), nextLabel); sel.block->endifOffset = -1; @@ -3285,10 +3340,8 @@ namespace gbe sel.MOV(ip, GenRegister::immuw(uint16_t(next))); GBE_ASSERT(jip == dst); sel.push(); - sel.curr.flag = 0; - sel.curr.subFlag = 0; - sel.curr.predicate = GEN_PREDICATE_NONE; - sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0)); + sel.curr.physicalFlag = 0; + sel.curr.flagIndex = (uint16_t) pred; sel.curr.predicate = GEN_PREDICATE_NORMAL; sel.MOV(ip, GenRegister::immuw(uint16_t(dst))); sel.block->endifOffset = -1; @@ -3320,8 +3373,9 @@ namespace gbe } } - INLINE bool emitOne(Selection::Opaque &sel, const ir::BranchInstruction &insn) const { + INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const { using namespace ir; + const ir::BranchInstruction &insn = cast(dag.insn); const Opcode opcode = insn.getOpcode(); if (opcode == OP_RET) sel.EOT(); @@ -3329,17 +3383,25 @@ namespace gbe const LabelIndex dst = insn.getLabelIndex(); const LabelIndex src = insn.getParent()->getLabelIndex(); + sel.push(); + if (insn.isPredicated() == true) { + if (dag.child[0] == NULL) + sel.curr.externFlag = 1; + } + // We handle foward and backward branches differently if (uint32_t(dst) <= uint32_t(src)) this->emitBackwardBranch(sel, insn, dst, src); else this->emitForwardBranch(sel, insn, dst, src); + sel.pop(); } else NOT_IMPLEMENTED; + + markAllChildren(dag); return true; } - DECL_CTOR(BranchInstruction, 1, 1); }; /*! Sort patterns */ diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp index 3f2e746..c6d7d58 100644 --- a/backend/src/backend/gen_reg_allocation.cpp +++ b/backend/src/backend/gen_reg_allocation.cpp @@ -153,8 +153,13 @@ namespace gbe vector vectors; /*! The set of booleans that will go to GRF (cannot be kept into flags) */ set grfBooleans; + /*! The set of booleans which be held in flags, don't need to allocate grf */ + set flagBooleans; /*! All the register intervals */ vector intervals; + /*! All the boolean register intervals on the corresponding BB*/ + typedef map RegIntervalMap; + map boolIntervalsMap; /*! Intervals sorting based on starting point positions */ vector starting; /*! 
Intervals sorting based on ending point positions */ @@ -365,154 +370,219 @@ namespace gbe return ret; } - void GenRegAllocator::Opaque::allocateFlags(Selection &selection) { - - // Store the registers allocated in the map - map allocatedFlags; - GenRegInterval spill = ir::Register(ir::RegisterFile::MAX_INDEX); - // we have two flags we use for booleans f1.0 and f1.1 - const uint32_t flagNum = 2; - uint32_t freeFlags[] = {0,1}; - uint32_t freeNum = flagNum; - - // Perform the linear scan allocator on the flag registers only. We only use - // two flags registers for the booleans right now: f1.0 and f1.1 - const uint32_t regNum = ctx.sel->getRegNum(); - uint32_t endID = 0; // interval to expire - for (uint32_t startID = 0; startID < regNum; ++startID) { - const GenRegInterval &interval = *this->starting[startID]; - const ir::Register reg = interval.reg; - if (ctx.sel->getRegisterFamily(reg) != ir::FAMILY_BOOL) - continue; // Not a flag. We don't care - if (grfBooleans.contains(reg)) - continue; // Cannot use a flag register - if (interval.maxID == -INT_MAX) - continue; // Unused register - if (freeNum != 0) { - spill = interval; - allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum])); + void GenRegAllocator::Opaque::allocateFlags(Selection &selection) { + // Previously, we have a global flag allocation implemntation. + // After some analysis, I found the global flag allocation is not + // the best solution here. + // As for the cross block reference of bool value, we have to + // combine it with current emask. There is no obvious advantage to + // allocate deadicate physical flag register for those cross block usage. + // We just need to allocate physical flag within each BB. We need to handle + // the following cases: + // + // 1. The bool's liveness never beyond this BB. And the bool is only used as + // a dst register or a pred register. This bool value could be + // allocated in physical flag only if there is enough physical flag. + // We already identified those bool at the instruction select stage, and + // put them in the flagBooleans set. + // 2. The bool is defined in another BB and used in this BB, then we need + // to prepend an instruction at the position where we use it. + // 3. The bool is defined in this BB but is also used as some instruction's + // source registers rather than the pred register. We have to keep the normal + // grf (UW8/UW16) register for this bool. For some CMP instruction, we need to + // append a SEL instruction convert the flag to the grf register. + // 4. Even for the spilling flag, if there is only one spilling flag, we will also + // try to reuse the temporary flag register latter. This requires all the + // instructions should got it flag at the instruction selection stage. And should + // not use the flag physical number directly at the gen_context stage. Otherwise, + // may break the algorithm here. + // We will track all the validated bool value and to avoid any redundant + // validation for the same flag. But if there is no enough physical flag, + // we have to spill the previous allocated physical flag. And the spilling + // policy is to spill the allocate flag which live to the last time end point. 
+ + // we have three flags we use for booleans f0.0 , f1.0 and f1.1 + for (auto &block : *selection.blockList) { + // Store the registers allocated in the map + map allocatedFlags; + map allocatedFlagIntervals; + + const uint32_t flagNum = 3; + uint32_t freeFlags[] = {2, 3, 0}; + uint32_t freeNum = flagNum; + if (boolIntervalsMap.find(&block) == boolIntervalsMap.end()) + continue; + const auto boolsMap = boolIntervalsMap[&block]; + vector flagStarting; + vector flagEnding; + GBE_ASSERT(boolsMap->size() > 0); + uint32_t regNum = boolsMap->size(); + flagStarting.resize(regNum); + flagEnding.resize(regNum); + uint32_t id = 0; + for (auto &interval : *boolsMap) { + flagStarting[id] = flagEnding[id] = &interval.second; + id++; } - else { + std::sort(flagStarting.begin(), flagStarting.end(), cmp); + std::sort(flagEnding.begin(), flagEnding.end(), cmp); + + uint32_t endID = 0; // interval to expire + for (uint32_t startID = 0; startID < regNum; ++startID) { + const GenRegInterval *interval = flagStarting[startID]; + const ir::Register reg = interval->reg; + GBE_ASSERT(ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL); + if (freeNum != 0) { + allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum])); + allocatedFlagIntervals.insert(std::make_pair(interval, freeFlags[freeNum])); + } else { // Try to expire one register - while (endID != ending.size()) { - const GenRegInterval *toExpire = this->ending[endID]; - const ir::Register reg = toExpire->reg; + while (endID != flagEnding.size()) { + const GenRegInterval *toExpire = flagEnding[endID]; // Dead code produced by the insn selection -> we skip it if (toExpire->minID > toExpire->maxID) { endID++; continue; } // We cannot expire this interval and the next ones - if (toExpire->maxID >= interval.minID) + if (toExpire->maxID >= interval->minID) break; - // Must be a boolean allocated with a flag register - if (ctx.sel->getRegisterFamily(reg) != ir::FAMILY_BOOL || grfBooleans.contains(reg)) { + // We reuse a flag from a previous interval (the oldest one) + auto it = allocatedFlags.find(toExpire->reg); + if (it == allocatedFlags.end()) { endID++; continue; } - // We reuse a flag from a previous interval (the oldest one) - auto it = allocatedFlags.find(toExpire->reg); - GBE_ASSERT(it != allocatedFlags.end()); freeFlags[freeNum++] = it->second; endID++; break; } - - // We need to spill one of the previous boolean values - if (freeNum == 0) { - GBE_ASSERT(uint16_t(spill.reg) != ir::RegisterFile::MAX_INDEX); - // We spill the last inserted boolean and use its flag instead for - // this one - if (spill.maxID > interval.maxID) { - auto it = allocatedFlags.find(spill.reg); - GBE_ASSERT(it != allocatedFlags.end()); - allocatedFlags.insert(std::make_pair(reg, it->second)); - allocatedFlags.erase(spill.reg); - grfBooleans.insert(spill.reg); - spill = interval; + if (freeNum != 0) { + allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum])); + allocatedFlagIntervals.insert(std::make_pair(interval, freeFlags[freeNum])); + } + else { + // FIXME we may sort the allocated flags before do the spilling in the furture. 
+ int32_t spill = -1; + const GenRegInterval *spillInterval = NULL; + int32_t maxID = 0; + for (auto &it : allocatedFlagIntervals) { + if (it.first->maxID <= interval->minID) + continue; + if (it.first->maxID > maxID && it.second != 0) { + maxID = it.first->maxID; + spill = it.second; + spillInterval = it.first; + } } - // We will use a grf for the current register - else { - grfBooleans.insert(reg); + if (spill != -1) { + allocatedFlags.insert(std::make_pair(reg, spill)); + allocatedFlagIntervals.insert(std::make_pair(interval, spill)); + allocatedFlags.erase(spillInterval->reg); + allocatedFlagIntervals.erase(spillInterval); + // We spill this flag booleans register, so erase it from the flag boolean set. + if (flagBooleans.contains(spillInterval->reg)) + flagBooleans.erase(spillInterval->reg); + } else { + GBE_ASSERT(0); } } - else - allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum])); - } - } - - // Now, we traverse all the selection instructions and we patch them to make - // them use flag registers - for (auto &block : *selection.blockList) - for (auto &insn : block.insnList) { - const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum; - - // Patch the source booleans - for (uint32_t srcID = 0; srcID < srcNum; ++srcID) { - const GenRegister selReg = insn.src(srcID); - const ir::Register reg = selReg.reg(); - if (selReg.physical || ctx.sel->getRegisterFamily(reg) != ir::FAMILY_BOOL) - continue; - auto it = allocatedFlags.find(reg); - if (it == allocatedFlags.end()) - continue; - // Use a flag register for it now - insn.src(srcID) = GenRegister::flag(1,it->second); - } - - // Patch the destination booleans - for (uint32_t dstID = 0; dstID < dstNum; ++dstID) { - const GenRegister selReg = insn.dst(dstID); - const ir::Register reg = selReg.reg(); - if (selReg.physical || ctx.sel->getRegisterFamily(reg) != ir::FAMILY_BOOL) - continue; - auto it = allocatedFlags.find(reg); - if (it == allocatedFlags.end()) - continue; - // Use a flag register for it now - insn.dst(dstID) = GenRegister::flag(1,it->second); + } } + delete boolsMap; - // Patch the predicate now. Note that only compares actually modify it (it - // is called a "conditional modifier"). 
The other instructions just read - // it - if (insn.state.physicalFlag == 0) { - auto it = allocatedFlags.find(ir::Register(insn.state.flagIndex)); - // Just patch it if we can use a flag directly - if (it != allocatedFlags.end()) { - insn.state.flag = 1; - insn.state.subFlag = it->second; - insn.state.physicalFlag = 1; - } - // When we let the boolean in a GRF, use f0.1 as a temporary - else { - // Mov the GRF to the flag such that the flag can be read - SelectionInstruction *mov0 = selection.create(SEL_OP_MOV,1,1); - mov0->state = GenInstructionState(1); - mov0->state.predicate = GEN_PREDICATE_NONE; - mov0->state.noMask = 1; - mov0->src(0) = GenRegister::uw1grf(ir::Register(insn.state.flagIndex)); - mov0->dst(0) = GenRegister::flag(0,1); - - // Do not prepend if the flag is not read (== used only as a - // conditional modifier) - if (insn.state.predicate != GEN_PREDICATE_NONE) - insn.prepend(*mov0); - - // We can use f0.1 (our "backdoor" flag) - insn.state.flag = 0; - insn.state.subFlag = 1; - insn.state.physicalFlag = 1; - - // Compare instructions update the flags so we must copy it back to - // the GRF - if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP) { - SelectionInstruction *mov1 = selection.create(SEL_OP_MOV,1,1); - mov1->state = mov0->state; - mov1->dst(0) = mov0->src(0); - mov1->src(0) = mov0->dst(0); - insn.append(*mov1); + // Now, we traverse all the selection instructions and we patch them to make + // them use flag registers + set validatedFlags; + uint16_t validTempFlagReg = 0; + for (auto &insn : block.insnList) { + // Patch the predicate now. Note that only compares actually modify it (it + // is called a "conditional modifier"). The other instructions just read + // it + if (insn.state.physicalFlag == 0) { + auto it = allocatedFlags.find(ir::Register(insn.state.flagIndex)); + if (it != allocatedFlags.end()) { + insn.state.flag = it->second / 2; + insn.state.subFlag = it->second & 1; + insn.state.physicalFlag = 1; + // modFlag is for the LOADI/MOV/AND/OR/XOR instructions which will modify a + // flag register. We set the condition for them to save one instruction if possible. + if (insn.state.modFlag == 1 && + (insn.opcode == SEL_OP_MOV || + insn.opcode == SEL_OP_AND || + insn.opcode == SEL_OP_OR || + insn.opcode == SEL_OP_XOR)) + insn.extra.function = GEN_CONDITIONAL_NEQ; + if ((insn.state.externFlag && + insn.state.predicate != GEN_PREDICATE_NONE && + validatedFlags.find(insn.state.flagIndex) == validatedFlags.end())) { + // This is an external bool, we need to validate it if it is not validated yet. + SelectionInstruction *cmp0 = selection.create(SEL_OP_CMP, 1, 2); + cmp0->state = GenInstructionState(insn.state.execWidth); + cmp0->state.flag = insn.state.flag; + cmp0->state.subFlag = insn.state.subFlag; + cmp0->src(0) = GenRegister::uw8grf(ir::Register(insn.state.flagIndex)); + cmp0->src(1) = GenRegister::immuw(0); + cmp0->dst(0) = GenRegister::null(); + cmp0->extra.function = GEN_CONDITIONAL_NEQ; + insn.prepend(*cmp0); + validatedFlags.insert(insn.state.flagIndex); + } + } else { + // This bool doesn't have a deadicated flag, we use temporary flag here. + // each time we need to validate it from the grf register. + // We track the last temporary validate register, if it's the same as + // current, we can avoid the revalidation. 
+ insn.state.flag = 0; + insn.state.subFlag = 1; + insn.state.physicalFlag = 1; + if ((insn.state.predicate != GEN_PREDICATE_NONE) + && validTempFlagReg != insn.state.flagIndex) { + SelectionInstruction *cmp0 = selection.create(SEL_OP_CMP, 1, 2); + cmp0->state = GenInstructionState(insn.state.execWidth); + cmp0->state.flag = insn.state.flag; + cmp0->state.subFlag = insn.state.subFlag; + cmp0->src(0) = GenRegister::uw8grf(ir::Register(insn.state.flagIndex)); + cmp0->src(1) = GenRegister::immuw(0); + cmp0->dst(0) = GenRegister::null(); + cmp0->extra.function = GEN_CONDITIONAL_NEQ; + insn.prepend(*cmp0); + } + if (insn.state.modFlag == 0) + validTempFlagReg = insn.state.flagIndex; + else + validTempFlagReg = 0; + } + if (insn.opcode == SEL_OP_CMP && + flagBooleans.contains((ir::Register)(insn.dst(0).value.reg))) { + // This is a CMP for a pure flag booleans, we don't need to write result to + // the grf. And latter, we will not allocate grf for it. + insn.dst(0) = GenRegister::null(); + } + // If the instruction requires to generate (CMP for long/int/float..) + // the flag value to the register, and it's not a pure flag boolean, + // we need to use SEL instruction to generate the flag value to the UW8 + // register. + if (insn.state.flagGen == 1 && + !flagBooleans.contains((ir::Register)(insn.state.flagIndex))) { + SelectionInstruction *sel0 = selection.create(SEL_OP_SEL, 1, 2); + sel0->state = GenInstructionState(ctx.getSimdWidth()); + sel0->state.flag = insn.state.flag; + sel0->state.subFlag = insn.state.subFlag; + sel0->state.predicate = GEN_PREDICATE_NORMAL; + sel0->src(0) = GenRegister::uw1grf(ir::ocl::one); + sel0->src(1) = GenRegister::uw1grf(ir::ocl::zero); + sel0->dst(0) = GenRegister::uw8grf((ir::Register)insn.state.flagIndex); + insn.append(*sel0); + // We use the zero one after the liveness analysis, we have to update + // the liveness data manually here. + GenRegInterval &interval0 = intervals[ir::ocl::zero]; + GenRegInterval &interval1 = intervals[ir::ocl::one]; + interval0.minID = std::min(interval0.minID, (int32_t)insn.ID); + interval0.maxID = std::max(interval0.maxID, (int32_t)insn.ID); + interval1.minID = std::min(interval1.minID, (int32_t)insn.ID); + interval1.maxID = std::max(interval1.maxID, (int32_t)insn.ID); } } } @@ -530,6 +600,9 @@ namespace gbe if (RA.contains(reg)) continue; // already allocated + if (flagBooleans.contains(reg)) + continue; + // Case 1: the register belongs to a vector, allocate all the registers in // one piece auto it = vectorMap.find(reg); @@ -621,6 +694,8 @@ namespace gbe INLINE bool GenRegAllocator::Opaque::expireReg(ir::Register reg) { auto it = RA.find(reg); + if (flagBooleans.contains(reg)) + return false; GBE_ASSERT(it != RA.end()); // offset less than 32 means it is not managed by our reg allocator. if (it->second < 32) @@ -803,6 +878,7 @@ namespace gbe int32_t firstID = insnID; // Update the intervals of each used register. 
Note that we do not // register allocate R0, so we skip all sub-registers in r0 + RegIntervalMap *boolsMap = new RegIntervalMap; for (auto &insn : block.insnList) { const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum; insn.ID = insnID; @@ -831,23 +907,33 @@ namespace gbe this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID); } - // Flag registers can only go to src[0] -#if 0 - const SelectionOpcode opcode = SelectionOpcode(insn.opcode); - if (opcode == SEL_OP_AND || opcode == SEL_OP_OR || opcode == SEL_OP_XOR - || opcode == SEL_OP_I64AND || opcode == SEL_OP_I64OR || opcode == SEL_OP_I64XOR) { - if (insn.src(1).physical == 0) { - const ir::Register reg = insn.src(1).reg(); - if (ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL) - grfBooleans.insert(reg); - } - } -#endif // OK, a flag is used as a predicate or a conditional modifier if (insn.state.physicalFlag == 0) { const ir::Register reg = ir::Register(insn.state.flagIndex); this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnID); this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID); + // Check whether this is a pure flag booleans candidate. + if (insn.state.grfFlag == 0) + flagBooleans.insert(reg); + GBE_ASSERT(ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL); + // update the bool register's per-BB's interval data + if (boolsMap->find(reg) == boolsMap->end()) { + GenRegInterval boolInterval(reg); + boolsMap->insert(std::make_pair(reg, boolInterval)); + } + boolsMap->find(reg)->second.minID = std::min(boolsMap->find(reg)->second.minID, insnID); + boolsMap->find(reg)->second.maxID = std::max(boolsMap->find(reg)->second.maxID, insnID); + if (&insn == block.insnList.back() && + insn.opcode == SEL_OP_JMPI && + insn.state.predicate != GEN_PREDICATE_NONE) { + // If this is the last instruction and is a predicated JMPI. + // We must extent its liveness before any other instrution. + // As we need to allocate f0 to it, and need to keep the f0 + // unchanged during the block. The root cause is this instruction + // is out-of the if/endif region, so we have to borrow the f0 + // to get correct bits for all channels. + boolsMap->find(reg)->second.minID = 0; + } } lastID = insnID; insnID++; @@ -856,12 +942,17 @@ namespace gbe // All registers alive at the begining of the block must update their intervals. 
const ir::BasicBlock *bb = block.bb; for (auto reg : ctx.getLiveIn(bb)) - this->intervals[reg].minID = std::min(this->intervals[reg].minID, firstID); + this->intervals[reg].minID = std::min(this->intervals[reg].minID, firstID); // All registers alive at the end of the block must have their intervals // updated as well for (auto reg : ctx.getLiveOut(bb)) this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, lastID); + + if (boolsMap->size() > 0) + boolIntervalsMap.insert(std::make_pair(&block, boolsMap)); + else + delete boolsMap; } this->intervals[ocl::retVal].minID = INT_MAX; @@ -870,6 +961,9 @@ namespace gbe // Allocate all the vectors first since they need to be contiguous this->allocateVector(selection); + // First we try to put all booleans registers into flags + this->allocateFlags(selection); + // Sort both intervals in starting point and ending point increasing orders const uint32_t regNum = ctx.sel->getRegNum(); this->starting.resize(regNum); @@ -889,9 +983,6 @@ namespace gbe break; } - // First we try to put all booleans registers into flags - //this->allocateFlags(selection); - // Allocate all the GRFs now (regular register and boolean that are not in // flag registers) return this->allocateGRFs(selection); diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp index 051f16d..0480dd8 100644 --- a/backend/src/backend/gen_register.hpp +++ b/backend/src/backend/gen_register.hpp @@ -118,6 +118,10 @@ namespace gbe this->noMask = 0; this->flag = 0; this->subFlag = 0; + this->grfFlag = 1; + this->externFlag = 0; + this->modFlag = 0; + this->flagGen = 0; this->predicate = GEN_PREDICATE_NONE; this->inversePredicate = 0; this->physicalFlag = 1; @@ -125,9 +129,14 @@ namespace gbe this->saturate = GEN_MATH_SATURATE_NONE; } uint32_t physicalFlag:1; //!< Physical or virtual flag register - uint32_t flag:1; //!< Only if physical flag + uint32_t flag:1; //!< Only if physical flag, uint32_t subFlag:1; //!< Only if physical flag uint32_t flagIndex:16; //!< Only if virtual flag (index of the register) + uint32_t grfFlag:1; //!< Only if virtual flag, 0 means we do not need to allocate GRF. + uint32_t externFlag:1; //!< Only if virtual flag, 1 means this flag is from external BB. + uint32_t modFlag:1; //!< Only if virtual flag, 1 means will modify flag. + uint32_t flagGen:1; //!< Only if virtual flag, 1 means the gen_context stage may need to + //!< generate the flag. uint32_t execWidth:5; uint32_t quarterControl:1; uint32_t nibControl:1; -- 2.7.4