INIT_ICD(dispatch)
.max_compute_unit = 16,
.max_thread_per_unit = 8,
+ .sub_slice_count = 2,
.max_work_item_sizes = {1024, 1024, 1024},
.max_work_group_size = 1024,
.max_clock_frequency = 1000,
INIT_ICD(dispatch)
.max_compute_unit = 6,
.max_thread_per_unit = 6,
+ .sub_slice_count = 1,
.max_work_item_sizes = {512, 512, 512},
.max_work_group_size = 512,
.max_clock_frequency = 1000,
INIT_ICD(dispatch)
.max_compute_unit = 4,
.max_thread_per_unit = 8,
+ .sub_slice_count = 1,
.max_work_item_sizes = {512, 512, 512},
.max_work_group_size = 512,
.max_clock_frequency = 1000,
INIT_ICD(dispatch)
.max_compute_unit = 10,
.max_thread_per_unit = 7,
+ .sub_slice_count = 1,
.max_work_item_sizes = {1024, 1024, 1024},
.max_work_group_size = 1024,
.max_clock_frequency = 1000,
INIT_ICD(dispatch)
.max_compute_unit = 20,
.max_thread_per_unit = 7,
+ .sub_slice_count = 2,
.max_work_item_sizes = {1024, 1024, 1024},
.max_work_group_size = 1024,
.max_clock_frequency = 1000,
INIT_ICD(dispatch)
.max_compute_unit = 40,
.max_thread_per_unit = 7,
+ .sub_slice_count = 4,
.max_work_item_sizes = {1024, 1024, 1024},
.max_work_group_size = 1024,
.max_clock_frequency = 1000,
INIT_ICD(dispatch)
.max_compute_unit = 12,
.max_thread_per_unit = 7,
+ .sub_slice_count = 2,
.max_work_item_sizes = {1024, 1024, 1024},
.max_work_group_size = 1024,
.max_clock_frequency = 1000,
INIT_ICD(dispatch)
.max_compute_unit = 24,
.max_thread_per_unit = 7,
+ .sub_slice_count = 3,
.max_work_item_sizes = {1024, 1024, 1024},
.max_work_group_size = 1024,
.max_clock_frequency = 1000,
INIT_ICD(dispatch)
.max_compute_unit = 48,
.max_thread_per_unit = 7,
+ .sub_slice_count = 6,
.max_work_item_sizes = {1024, 1024, 1024},
.max_work_group_size = 1024,
.max_clock_frequency = 1000,
work_group_size = kernel->program->ctx->device->max_compute_unit *
kernel->program->ctx->device->max_thread_per_unit * simd_width;
} else
- work_group_size = kernel->program->ctx->device->max_work_group_size /
- (16 / simd_width);
+ work_group_size = kernel->program->ctx->device->max_compute_unit * simd_width *
+ kernel->program->ctx->device->max_thread_per_unit / kernel->program->ctx->device->sub_slice_count;
return work_group_size;
}