From 0ff930bdb06a41e8cd5397e5fba8fb9234007eaa Mon Sep 17 00:00:00 2001 From: Yang Rong Date: Mon, 9 Jun 2014 23:29:48 +0800 Subject: [PATCH] HSW: Set correct max work group size for GT2 and GT3. v2: Return an error when the work group can't fit into a single half slice. Signed-off-by: Yang Rong Reviewed-by: Zhigang Gong --- src/cl_command_queue_gen7.c | 7 +++++++ src/cl_device_id.c | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index d875021..c9818e6 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -30,6 +30,7 @@ #include #include +#define MAX_GROUP_SIZE_IN_HALFSLICE 512 static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+32; } /* "Varing" payload is the part of the curbe that changes accross threads in the @@ -278,6 +279,12 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz; kernel.curbe_sz = cst_sz; + /* Barrier and SLM must fit into a single half slice */ + if(kernel.use_slm > 0 && simd_sz == 8 && local_sz > MAX_GROUP_SIZE_IN_HALFSLICE){ + fprintf(stderr, "Beignet: Work group CAN NOT large than %d when using barrier or local momery.\n", MAX_GROUP_SIZE_IN_HALFSLICE); + return CL_OUT_OF_RESOURCES; + } + if (scratch_sz > ker->program->ctx->device->scratch_mem_size) { fprintf(stderr, "Beignet: Out of scratch memory %d.\n", scratch_sz); return CL_OUT_OF_RESOURCES; diff --git a/src/cl_device_id.c b/src/cl_device_id.c index d2b3bed..c435307 100644 --- a/src/cl_device_id.c +++ b/src/cl_device_id.c @@ -86,7 +86,7 @@ static struct _cl_device_id intel_hsw_gt2_device = { .max_compute_unit = 140, .max_thread_per_unit = 7, .max_work_item_sizes = {512, 512, 512}, - .max_work_group_size = 512, + .max_work_group_size = 1024, .max_clock_frequency = 1000, .wg_sz = 1024, #include "cl_gen75_device.h" @@ -97,7 +97,7 @@ static struct _cl_device_id intel_hsw_gt3_device = { .max_compute_unit 
= 280, .max_thread_per_unit = 7, .max_work_item_sizes = {512, 512, 512}, - .max_work_group_size = 512, + .max_work_group_size = 1024, .max_clock_frequency = 1000, .wg_sz = 2048, #include "cl_gen75_device.h" -- 2.7.4