add CL_KERNEL_GLOBAL_WORK_SIZE option for clGetKernelWorkGroupInfo.
v2: should return the max global work size instead of current work size.
This funtion need return CL_INVALID_VALUE if the device is not a custom
device or kernel is not a built-in kernel.
we have 3 kind of built-in kernels for 1d/2d/3d memories, the max global
work size are decided by the dimension and memory type.
the piglit fail is caused by calling NON built-in kernels, so need send
patch to piglit later.
Signed-off-by: Luo Xionghu <xionghu.luo@intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
#include "cl_kernel.h"
#include "cl_program.h"
+static int
+cl_check_builtin_kernel_dimension(cl_kernel kernel, cl_device_id device)
+{
+ const char * n = cl_kernel_get_name(kernel);
+ const char * builtin_kernels_2d = "__cl_copy_image_2d_to_2d;__cl_copy_image_2d_to_buffer;__cl_copy_buffer_to_image_2d;__cl_fill_image_2d;__cl_fill_image_2d_array;";
+ const char * builtin_kernels_3d = "__cl_copy_image_3d_to_2d;__cl_copy_image_2d_to_3d;__cl_copy_image_3d_to_3d;__cl_copy_image_3d_to_buffer;__cl_copy_buffer_to_image_3d;__cl_fill_image_3d";
+ if (!strstr(device->built_in_kernels, n)){
+ return 0;
+ }else if(strstr(builtin_kernels_2d, n)){
+ return 2;
+ }else if(strstr(builtin_kernels_3d, n)){
+ return 3;
+ }else
+ return 1;
+
+}
LOCAL size_t
cl_get_kernel_max_wg_sz(cl_kernel kernel)
size_t* param_value_size_ret)
{
int err = CL_SUCCESS;
+ int dimension = 0;
if (UNLIKELY(device != &intel_ivb_gt1_device &&
device != &intel_ivb_gt2_device &&
device != &intel_baytrail_t_device &&
}
DECL_FIELD(COMPILE_WORK_GROUP_SIZE, kernel->compile_wg_sz)
DECL_FIELD(PRIVATE_MEM_SIZE, kernel->stack_size)
+ case CL_KERNEL_GLOBAL_WORK_SIZE:
+ dimension = cl_check_builtin_kernel_dimension(kernel, device);
+ if ( !dimension ) return CL_INVALID_VALUE;
+ if (param_value_size_ret != NULL)
+ *param_value_size_ret = sizeof(device->max_1d_global_work_sizes);
+ if (param_value) {
+ if (dimension == 1) {
+ memcpy(param_value, device->max_1d_global_work_sizes, sizeof(device->max_1d_global_work_sizes));
+ }else if(dimension == 2){
+ memcpy(param_value, device->max_2d_global_work_sizes, sizeof(device->max_2d_global_work_sizes));
+ }else if(dimension == 3){
+ memcpy(param_value, device->max_3d_global_work_sizes, sizeof(device->max_3d_global_work_sizes));
+ }else
+ return CL_INVALID_VALUE;
+
+ return CL_SUCCESS;
+ }
default:
return CL_INVALID_VALUE;
};
cl_uint max_work_item_dimensions; // should be 3.
size_t max_work_item_sizes[3]; // equal to maximum work group size.
size_t max_work_group_size; // maximum work group size under simd16 mode.
+ size_t max_1d_global_work_sizes[3]; // maximum 1d global work size for builtin kernels.
+ size_t max_2d_global_work_sizes[3]; // maximum 2d global work size for builtin kernels.
+ size_t max_3d_global_work_sizes[3]; // maximum 3d global work size for builtin kernels.
cl_uint preferred_vector_width_char;
cl_uint preferred_vector_width_short;
cl_uint preferred_vector_width_int;
.device_type = CL_DEVICE_TYPE_GPU,
.vendor_id = 0, /* == device_id (set when requested) */
.max_work_item_dimensions = 3,
+.max_1d_global_work_sizes = {1024 * 1024 * 256, 1, 1},
+.max_2d_global_work_sizes = {8192, 8192, 1},
+.max_3d_global_work_sizes = {8192, 8192, 2048},
.preferred_vector_width_char = 8,
.preferred_vector_width_short = 8,
.preferred_vector_width_int = 4,
cl_ulong local_mem_sz; /* local memory size specified in kernel args. */
size_t compile_wg_sz[3]; /* Required workgroup size by __attribute__((reqd_work_gro
up_size(X, Y, Z))) qualifier.*/
+ size_t global_work_sz[3]; /* maximum global size that can be used to execute a kernel
+ (i.e. global_work_size argument to clEnqueueNDRangeKernel.)*/
size_t stack_size; /* stack size per work item. */
cl_argument *args; /* To track argument setting */
uint32_t arg_n:31; /* Number of arguments */
test_printf.cpp
enqueue_fill_buf.cpp
enqueue_built_in_kernels.cpp
+ builtin_kernel_max_global_size.cpp
image_1D_buffer.cpp
compare_image_2d_and_1d_array.cpp
compiler_constant_expr.cpp
--- /dev/null
+#include "utest_helper.hpp"
+
+void builtin_kernel_max_global_size(void)
+{
+ char* built_in_kernel_names;
+ size_t built_in_kernels_size;
+ cl_int err = CL_SUCCESS;
+ size_t ret_sz;
+
+
+ OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, 0, 0, &built_in_kernels_size);
+ built_in_kernel_names = (char* )malloc(built_in_kernels_size * sizeof(char) );
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, built_in_kernels_size, (void*)built_in_kernel_names, &ret_sz);
+ OCL_ASSERT(ret_sz == built_in_kernels_size);
+ cl_program built_in_prog = clCreateProgramWithBuiltInKernels(ctx, 1, &device, built_in_kernel_names, &err);
+ OCL_ASSERT(built_in_prog != NULL);
+ cl_kernel builtin_kernel_1d = clCreateKernel(built_in_prog, "__cl_copy_region_unalign_src_offset", &err);
+ OCL_ASSERT(builtin_kernel_1d != NULL);
+ size_t param_value_size;
+ void* param_value;
+ clGetKernelWorkGroupInfo(builtin_kernel_1d, device, CL_KERNEL_GLOBAL_WORK_SIZE, 0, NULL, ¶m_value_size);
+ param_value = malloc(param_value_size);
+ clGetKernelWorkGroupInfo(builtin_kernel_1d, device, CL_KERNEL_GLOBAL_WORK_SIZE, param_value_size, param_value, 0);
+ OCL_ASSERT(*(size_t*)param_value == 256 * 1024 *1024);
+ clReleaseKernel(builtin_kernel_1d);
+ clReleaseProgram(built_in_prog);
+ free(param_value);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_kernel_max_global_size);