*/
assert(offset >= 0);
stack_sz *= interp_kernel_get_simd_width(ker->opaque);
- stack_sz *= device->max_compute_unit;
+ stack_sz *= device->max_compute_unit * ctx->device->max_thread_per_unit;
/* On HSW the per-thread stack offset is computed relative to the half slice,
so when threads are scheduled unevenly across half slices the offset could go
out of bounds. GT4 has at most 4 half slices, so multiply the stack size by 4 to be safe.
/* Setup the kernel */
if (queue->props & CL_QUEUE_PROFILING_ENABLE)
- cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 1);
+ cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit * ctx->device->max_thread_per_unit, cst_sz / 32, 1);
else
- cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 0);
+ cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit * ctx->device->max_thread_per_unit, cst_sz / 32, 0);
printf_num = interp_get_printf_num(printf_info);
if (printf_num) {
static struct _cl_device_id intel_ivb_gt2_device = {
INIT_ICD(dispatch)
- .max_compute_unit = 128,
+ .max_compute_unit = 16,
.max_thread_per_unit = 8,
.max_work_item_sizes = {512, 512, 512},
.max_work_group_size = 1024,
static struct _cl_device_id intel_ivb_gt1_device = {
INIT_ICD(dispatch)
- .max_compute_unit = 64,
+ .max_compute_unit = 8,
.max_thread_per_unit = 8,
.max_work_item_sizes = {512, 512, 512},
.max_work_group_size = 512,
static struct _cl_device_id intel_baytrail_t_device = {
INIT_ICD(dispatch)
- .max_compute_unit = 32,
+ .max_compute_unit = 4,
.max_thread_per_unit = 8,
.max_work_item_sizes = {512, 512, 512},
.max_work_group_size = 256,
/* XXX we clone IVB for HSW now */
static struct _cl_device_id intel_hsw_gt1_device = {
INIT_ICD(dispatch)
- .max_compute_unit = 70,
+ .max_compute_unit = 10,
.max_thread_per_unit = 7,
.max_work_item_sizes = {512, 512, 512},
.max_work_group_size = 512,
static struct _cl_device_id intel_hsw_gt2_device = {
INIT_ICD(dispatch)
- .max_compute_unit = 140,
+ .max_compute_unit = 20,
.max_thread_per_unit = 7,
.max_work_item_sizes = {512, 512, 512},
.max_work_group_size = 1024,
static struct _cl_device_id intel_hsw_gt3_device = {
INIT_ICD(dispatch)
- .max_compute_unit = 280,
+ .max_compute_unit = 40,
.max_thread_per_unit = 7,
.max_work_item_sizes = {512, 512, 512},
.max_work_group_size = 1024,