Demos/VectorAdd_OpenCL/MiniCL_VectorAdd.cpp

   1
   2 ///VectorAdd sample, from the NVidia JumpStart Guide
   3 ///http://developer.download.nvidia.com/OpenCL/NVIDIA_OpenCL_JumpStart_Guide.pdf
   4
   5 ///Instead of #include <CL/cl.h> we include <MiniCL/cl.h>
   6 ///Apart from this include file, all other code should compile and work on OpenCL compliant implementation
   7
   8
   9 //#define LOAD_FROM_FILE
  10
  11 #ifdef USE_MINICL
  12         #include "MiniCL/cl.h"
  13 #else //USE_MINICL
  14         #ifdef __APPLE__
  15                 #include <OpenCL/OpenCL.h>
  16         #else
  17                 #include <CL/cl.h>
  18         #endif //__APPLE__
  19 #endif//USE_MINICL
  20
  21 #include <stdio.h>
  22 #include <math.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include "LinearMath/btMinMax.h"
  26 #define GRID3DOCL_CHECKERROR(a, b) if((a)!=(b)) { printf("3D GRID OCL Error : %d\n", (a)); btAssert((a) == (b)); }
  27 size_t wgSize;
  28
  29
  30 #ifndef USE_MINICL
  31 #define MSTRINGIFY(A) #A
  32 const char* stringifiedSourceCL =
  33 #include "VectorAddKernels.cl"
  34 #else
  35 const char* stringifiedSourceCL = "";
  36 #endif
  37
  38
  39
  40
  41 char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
  42 {
  43     // locals
  44     FILE* pFileStream = NULL;
  45     size_t szSourceLength;
  46
  47     // open the OpenCL source code file
  48         pFileStream = fopen(cFilename, "rb");
  49         if(pFileStream == 0)
  50         {
  51                 return NULL;
  52         }
  53
  54     size_t szPreambleLength = strlen(cPreamble);
  55
  56     // get the length of the source code
  57     fseek(pFileStream, 0, SEEK_END);
  58     szSourceLength = ftell(pFileStream);
  59     fseek(pFileStream, 0, SEEK_SET);
  60
  61     // allocate a buffer for the source code string and read it in
  62     char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
  63     memcpy(cSourceString, cPreamble, szPreambleLength);
  64     fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream);
  65
  66     // close the file and return the total length of the combined (preamble + source) string
  67     fclose(pFileStream);
  68     if(szFinalLength != 0)
  69     {
  70         *szFinalLength = szSourceLength + szPreambleLength;
  71     }
  72     cSourceString[szSourceLength + szPreambleLength] = '\0';
  73
  74     return cSourceString;
  75 }
  76
  77 size_t workitem_size[3];
  78
  79 void printDevInfo(cl_device_id device)
  80 {
  81     char device_string[1024];
  82
  83     clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
  84     printf(  " Device %s:\n", device_string);
  85
  86     // CL_DEVICE_INFO
  87     cl_device_type type;
  88     clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
  89     if( type & CL_DEVICE_TYPE_CPU )
  90         printf(" CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_CPU");
  91     if( type & CL_DEVICE_TYPE_GPU )
  92         printf(  " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_GPU");
  93     if( type & CL_DEVICE_TYPE_ACCELERATOR )
  94         printf(  " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
  95     if( type & CL_DEVICE_TYPE_DEFAULT )
  96         printf(  " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
  97
  98     // CL_DEVICE_MAX_COMPUTE_UNITS
  99     cl_uint compute_units;
 100     clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
 101     printf(  " CL_DEVICE_MAX_COMPUTE_UNITS:\t%d\n", compute_units);
 102
 103     // CL_DEVICE_MAX_WORK_GROUP_SIZE
 104
 105     clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
 106     printf(  " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%zu / %zu / %zu \n", workitem_size[0], workitem_size[1], workitem_size[2]);
 107
 108 }
 109
 110
 111
 112
 113 // Main function
 114 // *********************************************************************
 115 int main(int argc, char **argv)
 116 {
 117         void *srcA, *srcB, *dst;        // Host buffers for OpenCL test
 118     cl_context cxGPUContext;       // OpenCL context
 119     cl_command_queue cqCommandQue;  // OpenCL command que
 120     cl_device_id* cdDevices;        // OpenCL device list
 121     cl_program cpProgram;           // OpenCL program
 122     cl_kernel ckKernel;             // OpenCL kernel
 123     cl_mem cmMemObjs[3];            // OpenCL memory buffer objects:  3 for device
 124     size_t szGlobalWorkSize[1];     // 1D var for Total # of work items
 125     size_t szLocalWorkSize[1];          // 1D var for # of work items in the work group
 126     size_t szParmDataBytes;                     // Byte size of context information
 127     cl_int ciErr1, ciErr2;                      // Error code var
 128     int iTestN = 100000 * 8;            // Size of Vectors to process
 129
 130         int actualGlobalSize = iTestN>>3;
 131
 132     // set Global and Local work size dimensions
 133     szGlobalWorkSize[0] = iTestN >> 3;  // do 8 computations per work item
 134     szLocalWorkSize[0]= iTestN>>3;
 135
 136
 137     // Allocate and initialize host arrays
 138     srcA = (void *)malloc (sizeof(cl_float) * iTestN);
 139     srcB = (void *)malloc (sizeof(cl_float) * iTestN);
 140     dst = (void *)malloc (sizeof(cl_float) * iTestN);
 141
 142         int i;
 143
 144         // Initialize arrays with some values
 145         for (i=0;i<iTestN;i++)
 146         {
 147                 ((cl_float*)srcA)[i] = cl_float(i);
 148                 ((cl_float*)srcB)[i] = 2;
 149                 ((cl_float*)dst)[i]=-1;
 150         }
 151
 152
 153          cl_uint numPlatforms;
 154     cl_platform_id platform = NULL;
 155     cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
 156
 157     if (0 < numPlatforms)
 158     {
 159         cl_platform_id* platforms = new cl_platform_id[numPlatforms];
 160         status = clGetPlatformIDs(numPlatforms, platforms, NULL);
 161
 162         for (unsigned i = 0; i < numPlatforms; ++i)
 163         {
 164             char pbuf[100];
 165             status = clGetPlatformInfo(platforms[i],
 166                                        CL_PLATFORM_VENDOR,
 167                                        sizeof(pbuf),
 168                                        pbuf,
 169                                        NULL);
 170
 171             platform = platforms[i];
 172             if (!strcmp(pbuf, "Advanced Micro Devices, Inc."))
 173             {
 174                 break;
 175             }
 176         }
 177         delete[] platforms;
 178     }
 179
 180         cl_context_properties cps[3] =
 181     {
 182         CL_CONTEXT_PLATFORM,
 183         (cl_context_properties)platform,
 184         0
 185     };
 186
 187     // Create OpenCL context & context
 188     cxGPUContext = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU
 189
 190     // Query all devices available to the context
 191     ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
 192     cdDevices = (cl_device_id*)malloc(szParmDataBytes);
 193     ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
 194         if (cdDevices)
 195         {
 196                 printDevInfo(cdDevices[0]);
 197         }
 198
 199     // Create a command queue for first device the context reported
 200     cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr2);
 201     ciErr1 |= ciErr2;
 202
 203     // Allocate the OpenCL source and result buffer memory objects on the device GMEM
 204     cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcA, &ciErr2);
 205     ciErr1 |= ciErr2;
 206     cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcB, &ciErr2);
 207     ciErr1 |= ciErr2;
 208     cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float8) * szGlobalWorkSize[0], NULL, &ciErr2);
 209     ciErr1 |= ciErr2;
 210
 211 ///create kernels from binary
 212         int numDevices = 1;
 213         ::size_t* lengths = (::size_t*) malloc(numDevices * sizeof(::size_t));
 214         const unsigned char** images = (const unsigned char**) malloc(numDevices * sizeof(const void*));
 215
 216         for (i = 0; i < numDevices; ++i) {
 217                 images[i] = 0;
 218                 lengths[i] = 0;
 219         }
 220
 221
 222         // Read the OpenCL kernel in from source file
 223         const char* cSourceFile = "VectorAddKernels.cl";
 224
 225     printf("loadProgSource (%s)...\n", cSourceFile);
 226 #ifdef LOAD_FROM_FILE
 227     const char* cPathAndName = cSourceFile;
 228         size_t szKernelLength;
 229     const char* cSourceCL = loadProgSource(cPathAndName, "", &szKernelLength);
 230 #else
 231         const char* cSourceCL = stringifiedSourceCL;
 232         size_t szKernelLength = strlen(stringifiedSourceCL);
 233 #endif //LOAD_FROM_FILE
 234
 235
 236
 237     // Create the program
 238     cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
 239     printf("clCreateProgramWithSource...\n");
 240     if (ciErr1 != CL_SUCCESS)
 241     {
 242         printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
 243         exit(0);
 244     }
 245
 246     // Build the program with 'mad' Optimization option
 247 #ifdef MAC
 248         char* flags = "-cl-mad-enable -DMAC -DGUID_ARG";
 249 #else
 250         const char* flags = "-DGUID_ARG=";
 251 #endif
 252     ciErr1 = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL);
 253     printf("clBuildProgram...\n");
 254     if (ciErr1 != CL_SUCCESS)
 255     {
 256         printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
 257         exit(0);
 258     }
 259
 260     // Create the kernel
 261     ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1);
 262     printf("clCreateKernel (VectorAdd)...\n");
 263     if (ciErr1 != CL_SUCCESS)
 264     {
 265         printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
 266                 exit(0);
 267     }
 268
 269
 270         cl_int ciErrNum;
 271
 272         ciErrNum = clGetKernelWorkGroupInfo(ckKernel, cdDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
 273         if (ciErrNum != CL_SUCCESS)
 274         {
 275                 printf("cannot get workgroup size\n");
 276                 exit(0);
 277         }
 278
 279
 280
 281
 282     // Set the Argument values
 283     ciErr1 |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmMemObjs[0]);
 284     ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmMemObjs[1]);
 285     ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmMemObjs[2]);
 286
 287
 288
 289         int workgroupSize = wgSize;
 290         if(workgroupSize <= 0)
 291         { // let OpenCL library calculate workgroup size
 292                 size_t globalWorkSize[2];
 293                 globalWorkSize[0] = actualGlobalSize;
 294                 globalWorkSize[1] = 1;
 295
 296                 // Copy input data from host to GPU and launch kernel
 297                 ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalWorkSize, NULL, 0,0,0 );
 298
 299         }
 300         else
 301         {
 302                 size_t localWorkSize[2], globalWorkSize[2];
 303                 workgroupSize = btMin(workgroupSize, actualGlobalSize);
 304                 int num_t = actualGlobalSize / workgroupSize;
 305                 int num_g = num_t * workgroupSize;
 306                 if(num_g < actualGlobalSize)
 307                 {
 308                         num_t++;
 309                         //this can cause problems -> processing outside of the buffer
 310                         //make sure to check kernel
 311                 }
 312
 313                 size_t globalThreads[] = {num_t * workgroupSize};
 314                 size_t localThreads[] = {workgroupSize};
 315
 316
 317                 localWorkSize[0]  = workgroupSize;
 318                 globalWorkSize[0] = num_t * workgroupSize;
 319                 localWorkSize[1] = 1;
 320                 globalWorkSize[1] = 1;
 321
 322                 // Copy input data from host to GPU and launch kernel
 323                 ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL);
 324
 325         }
 326
 327         if (ciErrNum != CL_SUCCESS)
 328         {
 329                 printf("cannot clEnqueueNDRangeKernel\n");
 330                 exit(0);
 331         }
 332
 333         clFinish(cqCommandQue);
 334     // Read back results and check accumulated errors
 335     ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL);
 336
 337     // Release kernel, program, and memory objects
 338         // NOTE:  Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity.
 339     free(cdDevices);
 340         clReleaseKernel(ckKernel);
 341     clReleaseProgram(cpProgram);
 342     clReleaseCommandQueue(cqCommandQue);
 343     clReleaseContext(cxGPUContext);
 344
 345
 346     // print the results
 347     int iErrorCount = 0;
 348     for (i = 0; i < iTestN; i++)
 349     {
 350                 if (((float*)dst)[i] != ((float*)srcA)[i]+((float*)srcB)[i])
 351                         iErrorCount++;
 352     }
 353
 354         if (iErrorCount)
 355         {
 356                 printf("MiniCL validation FAILED\n");
 357         } else
 358         {
 359                 printf("MiniCL validation SUCCESSFULL\n");
 360         }
 361     // Free host memory, close log and return success
 362         for (i = 0; i < 3; i++)
 363     {
 364         clReleaseMemObject(cmMemObjs[i]);
 365     }
 366
 367     free(srcA);
 368     free(srcB);
 369     free (dst);
 370         printf("Press ENTER to quit\n");
 371         getchar();
 372 }
 373
 374
 375 #ifdef USE_MINICL
 376
 377 #include "MiniCL/cl_MiniCL_Defs.h"
 378
 379 extern "C"
 380 {
 381         ///GUID_ARG is only used by MiniCL to pass in the guid used by its get_global_id implementation
 382
 383
 384         #define MSTRINGIFY(A) A
 385         #include "VectorAddKernels.cl"
 386         #undef MSTRINGIFY
 387 }
 388 MINICL_REGISTER(VectorAdd)
 389 #endif//USE_MINICL