2 ///VectorAdd sample, from the NVidia JumpStart Guide
3 ///http://developer.download.nvidia.com/OpenCL/NVIDIA_OpenCL_JumpStart_Guide.pdf
5 ///Instead of #include <CL/cl.h> we include <MiniCL/cl.h>
6 ///Apart from this include file, all other code should compile and work on OpenCL compliant implementation
9 //#define LOAD_FROM_FILE
12 #include "MiniCL/cl.h"
15 #include <OpenCL/OpenCL.h>
25 #include "LinearMath/btMinMax.h"
26 #define GRID3DOCL_CHECKERROR(a, b) if((a)!=(b)) { printf("3D GRID OCL Error : %d\n", (a)); btAssert((a) == (b)); }
31 #define MSTRINGIFY(A) #A
32 const char* stringifiedSourceCL =
33 #include "VectorAddKernels.cl"
35 const char* stringifiedSourceCL = "";
41 char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
44 FILE* pFileStream = NULL;
45 size_t szSourceLength;
47 // open the OpenCL source code file
48 pFileStream = fopen(cFilename, "rb");
54 size_t szPreambleLength = strlen(cPreamble);
56 // get the length of the source code
57 fseek(pFileStream, 0, SEEK_END);
58 szSourceLength = ftell(pFileStream);
59 fseek(pFileStream, 0, SEEK_SET);
61 // allocate a buffer for the source code string and read it in
62 char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
63 memcpy(cSourceString, cPreamble, szPreambleLength);
64 fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream);
66 // close the file and return the total length of the combined (preamble + source) string
68 if(szFinalLength != 0)
70 *szFinalLength = szSourceLength + szPreambleLength;
72 cSourceString[szSourceLength + szPreambleLength] = '\0';
77 size_t workitem_size[3];
79 void printDevInfo(cl_device_id device)
81 char device_string[1024];
83 clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
84 printf( " Device %s:\n", device_string);
88 clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
89 if( type & CL_DEVICE_TYPE_CPU )
90 printf(" CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_CPU");
91 if( type & CL_DEVICE_TYPE_GPU )
92 printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_GPU");
93 if( type & CL_DEVICE_TYPE_ACCELERATOR )
94 printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
95 if( type & CL_DEVICE_TYPE_DEFAULT )
96 printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
98 // CL_DEVICE_MAX_COMPUTE_UNITS
99 cl_uint compute_units;
100 clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
101 printf( " CL_DEVICE_MAX_COMPUTE_UNITS:\t%d\n", compute_units);
103 // CL_DEVICE_MAX_WORK_GROUP_SIZE
105 clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
106 printf( " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%zu / %zu / %zu \n", workitem_size[0], workitem_size[1], workitem_size[2]);
114 // *********************************************************************
115 int main(int argc, char **argv)
117 void *srcA, *srcB, *dst; // Host buffers for OpenCL test
118 cl_context cxGPUContext; // OpenCL context
119 cl_command_queue cqCommandQue; // OpenCL command que
120 cl_device_id* cdDevices; // OpenCL device list
121 cl_program cpProgram; // OpenCL program
122 cl_kernel ckKernel; // OpenCL kernel
123 cl_mem cmMemObjs[3]; // OpenCL memory buffer objects: 3 for device
124 size_t szGlobalWorkSize[1]; // 1D var for Total # of work items
125 size_t szLocalWorkSize[1]; // 1D var for # of work items in the work group
126 size_t szParmDataBytes; // Byte size of context information
127 cl_int ciErr1, ciErr2; // Error code var
128 int iTestN = 100000 * 8; // Size of Vectors to process
130 int actualGlobalSize = iTestN>>3;
132 // set Global and Local work size dimensions
133 szGlobalWorkSize[0] = iTestN >> 3; // do 8 computations per work item
134 szLocalWorkSize[0]= iTestN>>3;
137 // Allocate and initialize host arrays
138 srcA = (void *)malloc (sizeof(cl_float) * iTestN);
139 srcB = (void *)malloc (sizeof(cl_float) * iTestN);
140 dst = (void *)malloc (sizeof(cl_float) * iTestN);
144 // Initialize arrays with some values
145 for (i=0;i<iTestN;i++)
147 ((cl_float*)srcA)[i] = cl_float(i);
148 ((cl_float*)srcB)[i] = 2;
149 ((cl_float*)dst)[i]=-1;
153 cl_uint numPlatforms;
154 cl_platform_id platform = NULL;
155 cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
157 if (0 < numPlatforms)
159 cl_platform_id* platforms = new cl_platform_id[numPlatforms];
160 status = clGetPlatformIDs(numPlatforms, platforms, NULL);
162 for (unsigned i = 0; i < numPlatforms; ++i)
165 status = clGetPlatformInfo(platforms[i],
171 platform = platforms[i];
172 if (!strcmp(pbuf, "Advanced Micro Devices, Inc."))
180 cl_context_properties cps[3] =
183 (cl_context_properties)platform,
187 // Create OpenCL context & context
188 cxGPUContext = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU
190 // Query all devices available to the context
191 ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
192 cdDevices = (cl_device_id*)malloc(szParmDataBytes);
193 ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
196 printDevInfo(cdDevices[0]);
199 // Create a command queue for first device the context reported
200 cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr2);
203 // Allocate the OpenCL source and result buffer memory objects on the device GMEM
204 cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcA, &ciErr2);
206 cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcB, &ciErr2);
208 cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float8) * szGlobalWorkSize[0], NULL, &ciErr2);
211 ///create kernels from binary
213 ::size_t* lengths = (::size_t*) malloc(numDevices * sizeof(::size_t));
214 const unsigned char** images = (const unsigned char**) malloc(numDevices * sizeof(const void*));
216 for (i = 0; i < numDevices; ++i) {
222 // Read the OpenCL kernel in from source file
223 const char* cSourceFile = "VectorAddKernels.cl";
225 printf("loadProgSource (%s)...\n", cSourceFile);
226 const char* cPathAndName = cSourceFile;
227 #ifdef LOAD_FROM_FILE
228 size_t szKernelLength;
229 const char* cSourceCL = loadProgSource(cPathAndName, "", &szKernelLength);
231 const char* cSourceCL = stringifiedSourceCL;
232 size_t szKernelLength = strlen(stringifiedSourceCL);
233 #endif //LOAD_FROM_FILE
237 // Create the program
238 cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
239 printf("clCreateProgramWithSource...\n");
240 if (ciErr1 != CL_SUCCESS)
242 printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
246 // Build the program with 'mad' Optimization option
248 char* flags = "-cl-mad-enable -DMAC -DGUID_ARG";
250 const char* flags = "-DGUID_ARG=";
252 ciErr1 = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL);
253 printf("clBuildProgram...\n");
254 if (ciErr1 != CL_SUCCESS)
256 printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
261 ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1);
262 printf("clCreateKernel (VectorAdd)...\n");
263 if (ciErr1 != CL_SUCCESS)
265 printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
272 ciErrNum = clGetKernelWorkGroupInfo(ckKernel, cdDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
273 if (ciErrNum != CL_SUCCESS)
275 printf("cannot get workgroup size\n");
282 // Set the Argument values
283 ciErr1 |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmMemObjs[0]);
284 ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmMemObjs[1]);
285 ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmMemObjs[2]);
289 int workgroupSize = wgSize;
290 if(workgroupSize <= 0)
291 { // let OpenCL library calculate workgroup size
292 size_t globalWorkSize[2];
293 globalWorkSize[0] = actualGlobalSize;
294 globalWorkSize[1] = 1;
296 // Copy input data from host to GPU and launch kernel
297 ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalWorkSize, NULL, 0,0,0 );
302 size_t localWorkSize[2], globalWorkSize[2];
303 workgroupSize = btMin(workgroupSize, actualGlobalSize);
304 int num_t = actualGlobalSize / workgroupSize;
305 int num_g = num_t * workgroupSize;
306 if(num_g < actualGlobalSize)
309 //this can cause problems -> processing outside of the buffer
310 //make sure to check kernel
313 size_t globalThreads[] = {num_t * workgroupSize};
314 size_t localThreads[] = {workgroupSize};
317 localWorkSize[0] = workgroupSize;
318 globalWorkSize[0] = num_t * workgroupSize;
319 localWorkSize[1] = 1;
320 globalWorkSize[1] = 1;
322 // Copy input data from host to GPU and launch kernel
323 ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL);
327 if (ciErrNum != CL_SUCCESS)
329 printf("cannot clEnqueueNDRangeKernel\n");
333 clFinish(cqCommandQue);
334 // Read back results and check accumulated errors
335 ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL);
337 // Release kernel, program, and memory objects
338 // NOTE: Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity.
340 clReleaseKernel(ckKernel);
341 clReleaseProgram(cpProgram);
342 clReleaseCommandQueue(cqCommandQue);
343 clReleaseContext(cxGPUContext);
348 for (i = 0; i < iTestN; i++)
350 if (((float*)dst)[i] != ((float*)srcA)[i]+((float*)srcB)[i])
356 printf("MiniCL validation FAILED\n");
359 printf("MiniCL validation SUCCESSFULL\n");
361 // Free host memory, close log and return success
362 for (i = 0; i < 3; i++)
364 clReleaseMemObject(cmMemObjs[i]);
370 printf("Press ENTER to quit\n");
377 #include "MiniCL/cl_MiniCL_Defs.h"
381 ///GUID_ARG is only used by MiniCL to pass in the guid used by its get_global_id implementation
384 #define MSTRINGIFY(A) A
385 #include "VectorAddKernels.cl"
388 MINICL_REGISTER(VectorAdd)