p->curr.predicate = GEN_PREDICATE_NONE;
//p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x7f));
- p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x80));
+ p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x180));
p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7));
p->curr.execWidth = this->simdWidth;
p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));
p->curr.execWidth = 1;
- p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(1));
+ p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(2));
p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4));
p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift));
p->curr.execWidth = this->simdWidth;
assert(offset >= 0);
stack_sz *= interp_kernel_get_simd_width(ker->opaque);
stack_sz *= device->max_compute_unit;
+ /* On HSW the per-thread stack offset is relative to the half slice, so
+ unbalanced thread scheduling across half slices can run out of bounds.
+ GT4 has at most 4 half slices, so multiply the stack size by 4 to be safe.
+ */
+ if(cl_driver_get_ver(ctx->drv) == 75)
+ stack_sz *= 4;
+
cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cl_gpgpu_get_cache_ctrl());
}
drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
drm_intel_bo* old = gpgpu->scratch_b.bo;
uint32_t total = per_thread_size * gpgpu->max_threads;
+ /* Per the Bspec, scratch space should be 2X the desired size; otherwise luxmark may hang. */
+ if (IS_HASWELL(gpgpu->drv->device_id))
+ total *= 2;
gpgpu->per_thread_scratch = per_thread_size;