// We insert the block IP mask first
this->insertCurbeReg(ir::ocl::blockip, this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t)));
+ this->insertCurbeReg(ir::ocl::emask, this->newCurbeEntry(GBE_CURBE_EMASK, 0, sizeof(uint32_t)));
+ this->insertCurbeReg(ir::ocl::notemask, this->newCurbeEntry(GBE_CURBE_NOT_EMASK, 0, sizeof(uint32_t)));
// Go over the arguments and find the related patch locations
const uint32_t argNum = fn.argNum();
});
#undef INSERT_REG
- // Insert the number of threads
- insertCurbeReg(ir::ocl::threadn, this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 0, sizeof(uint32_t)));
-
// Insert the stack buffer if used
if (useStackPtr)
insertCurbeReg(ir::ocl::stackptr, this->newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER, ptrSize));
reg == ir::ocl::goffset0 ||
reg == ir::ocl::goffset1 ||
reg == ir::ocl::goffset2 ||
- reg == ir::ocl::workdim)
+ reg == ir::ocl::workdim ||
+ reg == ir::ocl::emask ||
+ reg == ir::ocl::notemask)
return true;
return false;
}
void GenContext::clearFlagRegister(void) {
// when group size not aligned to simdWidth, flag register need clear to
// make prediction(any8/16h) work correctly
+ const GenRegister emaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::emask));
+ const GenRegister notEmaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::notemask));
+ uint32_t execWidth = p->curr.execWidth;
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.noMask = 1;
+ /* Clear all the bits in f0.0. */
p->curr.execWidth = 1;
- p->MOV(GenRegister::retype(GenRegister::flag(0,0), GEN_TYPE_UD), GenRegister::immud(0x0));
- p->MOV(GenRegister::retype(GenRegister::flag(1,0), GEN_TYPE_UD), GenRegister::immud(0x0));
+ p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW), GenRegister::immud(0x0000));
+ p->curr.noMask = 0;
+ p->curr.useFlag(0, 0);
+ p->curr.execWidth = execWidth;
+ /* Set all the active lanes to 1. Inactive lanes remain 0. */
+ p->CMP(GEN_CONDITIONAL_EQ, GenRegister::ud16grf(126, 0), GenRegister::ud16grf(126, 0));
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+ p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW));
+ p->XOR(notEmaskReg, emaskReg, GenRegister::immud(0xFFFF));
p->pop();
}
if (sel.ctx.hasJIP(&insn)) {
const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
sel.push();
+
+ sel.curr.noMask = 1;
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
+ GenRegister flagReg = GenRegister::flag(0, 0);
+ sel.AND(flagReg, flagReg, emaskReg);
+
if (simdWidth == 8)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
else if (simdWidth == 16)
else
NOT_IMPLEMENTED;
sel.curr.inversePredicate = 1;
- sel.curr.execWidth = 1;
sel.curr.flag = 0;
sel.curr.subFlag = 0;
- sel.curr.noMask = 1;
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
}
// It is slightly more complicated than for backward jump. We check that
// all PcIPs are greater than the next block IP to be sure that we can
// jump
+ // We set all the inactive channels to 1 as the GEN_PREDICATE_ALIGN1_ALL8/16
+ // predicates will check those bits as well.
+
sel.push();
sel.curr.physicalFlag = 0;
sel.curr.flagIndex = uint16_t(pred);
// XXX TODO: For group size not aligned to simdWidth, ALL8/16h may not
// work correct, as flag register bits mapped to non-active lanes tend
// to be zero.
+
+ sel.curr.execWidth = 1;
+ sel.curr.noMask = 1;
+ GenRegister notEmaskReg = GenRegister::uw1grf(ocl::notemask);
+ sel.OR(sel.selReg(pred, TYPE_U16), sel.selReg(pred, TYPE_U16), notEmaskReg);
+
if (simdWidth == 8)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
else if (simdWidth == 16)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
else
NOT_SUPPORTED;
- sel.curr.execWidth = 1;
- sel.curr.noMask = 1;
+
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
if (insn.isPredicated() == true) {
const Register pred = insn.getPredicateIndex();
+
// Update the PcIPs for all the branches. Just put the IPs of the next
// block. Next instruction will properly reupdate the IPs of the lanes
// that actually take the branch
sel.curr.flagIndex = uint16_t(pred);
sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
+ // We clear all the inactive channels to 0 as the GEN_PREDICATE_ALIGN1_ANY8/16
+ // predicates will check those bits as well.
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.execWidth = 1;
+ sel.curr.noMask = 1;
+ GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
+ sel.AND(sel.selReg(pred, TYPE_U16), sel.selReg(pred, TYPE_U16), emaskReg);
+
// Branch to the jump target
if (simdWidth == 8)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
else
NOT_SUPPORTED;
- sel.curr.execWidth = 1;
- sel.curr.noMask = 1;
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, lastID);
}
+ this->intervals[ocl::emask].minID = 0;
+ this->intervals[ocl::emask].maxID = INT_MAX;
+ this->intervals[ocl::notemask].minID = 0;
+ this->intervals[ocl::notemask].maxID = INT_MAX;
+ this->intervals[ocl::retVal].minID = INT_MAX;
+ this->intervals[ocl::retVal].maxID = -INT_MAX;
+
// Sort both intervals in starting point and ending point increasing orders
const uint32_t regNum = ctx.sel->getRegNum();
this->starting.resize(regNum);
GBE_CURBE_KERNEL_ARGUMENT,
GBE_CURBE_EXTRA_ARGUMENT,
GBE_CURBE_BLOCK_IP,
- GBE_CURBE_THREAD_NUM
+ GBE_CURBE_THREAD_NUM,
+ GBE_CURBE_EMASK,
+ GBE_CURBE_NOT_EMASK,
};
/*! Extra arguments use the negative range of sub-values */
GBE_ASSERT(target != NULL);
target->predecessors.insert(&bb);
bb.successors.insert(target);
- if (insn.isPredicated() == true) jumpToNext = &bb;
+ if (insn.isPredicated() == true) jumpToNext = &bb;
}
});
}
"stack_pointer",
"block_ip",
"barrier_id", "thread_number",
- "work_dimension", "sampler_info", "retVal"
+ "work_dimension", "sampler_info", "emask", "notemask", "retVal"
};
#if GBE_DEBUG
DECL_NEW_REG(FAMILY_DWORD, threadn);
DECL_NEW_REG(FAMILY_DWORD, workdim);
DECL_NEW_REG(FAMILY_WORD, samplerinfo);
+ DECL_NEW_REG(FAMILY_WORD, emask);
+ DECL_NEW_REG(FAMILY_WORD, notemask);
DECL_NEW_REG(FAMILY_WORD, retVal);
}
#undef DECL_NEW_REG
static const Register threadn = Register(21); // number of threads
static const Register workdim = Register(22); // work dimention.
static const Register samplerinfo = Register(23); // store sampler info.
- static const Register retVal = Register(24); // helper register to do data flow analysis.
- static const uint32_t regNum = 25; // number of special registers
+ static const Register emask = Register(24); // store the emask bits for the branching fix.
+ static const Register notemask = Register(25); // store the !emask bits for the branching fix.
+ static const Register retVal = Register(26); // helper register to do data flow analysis.
+ static const uint32_t regNum = 27; // number of special registers
extern const char *specialRegMean[]; // special register name.
} /* namespace ocl */