#include <stdio.h>
#include <string.h>
+#define MAX_GROUP_SIZE_IN_HALFSLICE 512 /* local-size ceiling checked for SIMD8 kernels that use SLM */
/* Batch size for kernel k: currently a fixed 288 (256 + 32), independent of k. */
static INLINE size_t
cl_kernel_compute_batch_sz(cl_kernel k)
{
  const size_t batch_sz = 256 + 32;

  (void) k; /* the argument does not influence the result */
  return batch_sz;
}
/* "Varying" payload is the part of the curbe that changes across threads in the
kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
kernel.curbe_sz = cst_sz;
+ /* Barrier and SLM must fit into a single half slice: reject a SIMD8 kernel
+  * that uses SLM when its local size exceeds MAX_GROUP_SIZE_IN_HALFSLICE. */
+ if(kernel.use_slm > 0 && simd_sz == 8 && local_sz > MAX_GROUP_SIZE_IN_HALFSLICE){
+ fprintf(stderr, "Beignet: Work group can not be larger than %d when using barrier or local memory.\n", MAX_GROUP_SIZE_IN_HALFSLICE);
+ return CL_OUT_OF_RESOURCES;
+ }
+
if (scratch_sz > ker->program->ctx->device->scratch_mem_size) {
fprintf(stderr, "Beignet: Out of scratch memory %d.\n", scratch_sz);
return CL_OUT_OF_RESOURCES;
.max_compute_unit = 140,
.max_thread_per_unit = 7,
.max_work_item_sizes = {512, 512, 512},
- .max_work_group_size = 512,
+ .max_work_group_size = 1024,
.max_clock_frequency = 1000,
.wg_sz = 1024,
#include "cl_gen75_device.h"
.max_compute_unit = 280,
.max_thread_per_unit = 7,
.max_work_item_sizes = {512, 512, 512},
- .max_work_group_size = 512,
+ .max_work_group_size = 1024,
.max_clock_frequency = 1000,
.wg_sz = 2048,
#include "cl_gen75_device.h"