1. The thread_id is located in bits [0-8] of r0.5, so mask the register with 0x1ff to extract the correct bits instead of shifting it right by 10.
2. Also, we don't need so much stack space: max_compute_unit has already
been computed as #EU * max_thread_per_eu, so multiplying the stack size by
max_thread_per_unit again is redundant.
Signed-off-by: Ruiling Song <ruiling.song@intel.com>
Reviewed-by: Xing, Homer <homer.xing@intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
p->push();
p->curr.execWidth = 1;
p->curr.predicate = GEN_PREDICATE_NONE;
- p->SHR(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(10));
+ p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
p->curr.execWidth = this->simdWidth;
p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));
p->curr.execWidth = 1;
assert(offset >= 0);
stack_sz *= gbe_kernel_get_simd_width(ker->opaque);
stack_sz *= device->max_compute_unit;
- stack_sz *= device->max_thread_per_unit;
cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cc_llc_l3);
}