modules/ocl/src/cl_operations.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  16 // Third party copyrights are property of their respective owners.
  17 //
  18 // @Authors
  19 //    Guoping Long, longguoping@gmail.com
  20 //    Niko Li, newlife20080214@gmail.com
  21 //    Yao Wang, bitwangyaoyao@gmail.com
  22 // Redistribution and use in source and binary forms, with or without modification,
  23 // are permitted provided that the following conditions are met:
  24 //
  25 //   * Redistribution's of source code must retain the above copyright notice,
  26 //     this list of conditions and the following disclaimer.
  27 //
  28 //   * Redistribution's in binary form must reproduce the above copyright notice,
  29 //     this list of conditions and the following disclaimer in the documentation
  30 //     and/or other oclMaterials provided with the distribution.
  31 //
  32 //   * The name of the copyright holders may not be used to endorse or promote products
  33 //     derived from this software without specific prior written permission.
  34 //
  35 // This software is provided by the copyright holders and contributors "as is" and
  36 // any express or implied warranties, including, but not limited to, the implied
  37 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  38 // In no event shall the Intel Corporation or contributors be liable for any direct,
  39 // indirect, incidental, special, exemplary, or consequential damages
  40 // (including, but not limited to, procurement of substitute goods or services;
  41 // loss of use, data, or profits; or business interruption) however caused
  42 // and on any theory of liability, whether in contract, strict liability,
  43 // or tort (including negligence or otherwise) arising in any way out of
  44 // the use of this software, even if advised of the possibility of such damage.
  45 //
  46 //M*/
  47
  48 #include "precomp.hpp"
  49 #include <iomanip>
  50 #include <fstream>
  51 #include "cl_programcache.hpp"
  52
  53 //#define PRINT_KERNEL_RUN_TIME
  54 #define RUN_TIMES 100
  55 #ifndef CL_MEM_USE_PERSISTENT_MEM_AMD
  56 #define CL_MEM_USE_PERSISTENT_MEM_AMD 0
  57 #endif
  58 //#define AMD_DOUBLE_DIFFER
  59
  60 namespace cv {
  61 namespace ocl {
  62
  63 DevMemType gDeviceMemType = DEVICE_MEM_DEFAULT;
  64 DevMemRW gDeviceMemRW = DEVICE_MEM_R_W;
  65 int gDevMemTypeValueMap[5] = {0,
  66                               CL_MEM_ALLOC_HOST_PTR,
  67                               CL_MEM_USE_HOST_PTR,
  68                               CL_MEM_COPY_HOST_PTR,
  69                               CL_MEM_USE_PERSISTENT_MEM_AMD};
  70 int gDevMemRWValueMap[3] = {CL_MEM_READ_WRITE, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY};
  71
  72 void finish()
  73 {
  74     clFinish(getClCommandQueue(Context::getContext()));
  75 }
  76
  77 bool isCpuDevice()
  78 {
  79     const DeviceInfo& info = Context::getContext()->getDeviceInfo();
  80     return (info.deviceType == CVCL_DEVICE_TYPE_CPU);
  81 }
  82
  83 size_t queryWaveFrontSize(cl_kernel kernel)
  84 {
  85     const DeviceInfo& info = Context::getContext()->getDeviceInfo();
  86     if (info.deviceType == CVCL_DEVICE_TYPE_CPU)
  87         return 1;
  88     size_t wavefront = 0;
  89     CV_Assert(kernel != NULL);
  90     openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(Context::getContext()),
  91             CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &wavefront, NULL));
  92     return wavefront;
  93 }
  94
  95
  96 void openCLReadBuffer(Context *ctx, cl_mem dst_buffer, void *host_buffer, size_t size)
  97 {
  98     cl_int status;
  99     status = clEnqueueReadBuffer(getClCommandQueue(ctx), dst_buffer, CL_TRUE, 0,
 100                                  size, host_buffer, 0, NULL, NULL);
 101     openCLVerifyCall(status);
 102 }
 103
 104 cl_mem openCLCreateBuffer(Context *ctx, size_t flag , size_t size)
 105 {
 106     cl_int status;
 107     cl_mem buffer = clCreateBuffer(getClContext(ctx), (cl_mem_flags)flag, size, NULL, &status);
 108     openCLVerifyCall(status);
 109     return buffer;
 110 }
 111
 112 void openCLMallocPitch(Context *ctx, void **dev_ptr, size_t *pitch,
 113                        size_t widthInBytes, size_t height)
 114 {
 115     openCLMallocPitchEx(ctx, dev_ptr, pitch, widthInBytes, height, gDeviceMemRW, gDeviceMemType);
 116 }
 117
 118 void openCLMallocPitchEx(Context *ctx, void **dev_ptr, size_t *pitch,
 119                        size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type)
 120 {
 121     cl_int status;
 122     *dev_ptr = clCreateBuffer(getClContext(ctx), gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
 123                               widthInBytes * height, 0, &status);
 124     openCLVerifyCall(status);
 125     *pitch = widthInBytes;
 126 }
 127
 128 void openCLMemcpy2D(Context *ctx, void *dst, size_t dpitch,
 129                     const void *src, size_t spitch,
 130                     size_t width, size_t height, openCLMemcpyKind kind, int channels)
 131 {
 132     size_t buffer_origin[3] = {0, 0, 0};
 133     size_t host_origin[3] = {0, 0, 0};
 134     size_t region[3] = {width, height, 1};
 135     if(kind == clMemcpyHostToDevice)
 136     {
 137         if(dpitch == width || channels == 3 || height == 1)
 138         {
 139             openCLSafeCall(clEnqueueWriteBuffer(getClCommandQueue(ctx), (cl_mem)dst, CL_TRUE,
 140                                                 0, width * height, src, 0, NULL, NULL));
 141         }
 142         else
 143         {
 144             openCLSafeCall(clEnqueueWriteBufferRect(getClCommandQueue(ctx), (cl_mem)dst, CL_TRUE,
 145                                                     buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0));
 146         }
 147     }
 148     else if(kind == clMemcpyDeviceToHost)
 149     {
 150         if(spitch == width || channels == 3 || height == 1)
 151         {
 152             openCLSafeCall(clEnqueueReadBuffer(getClCommandQueue(ctx), (cl_mem)src, CL_TRUE,
 153                                                0, width * height, dst, 0, NULL, NULL));
 154         }
 155         else
 156         {
 157             openCLSafeCall(clEnqueueReadBufferRect(getClCommandQueue(ctx), (cl_mem)src, CL_TRUE,
 158                                                    buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0));
 159         }
 160     }
 161 }
 162
 163 void openCLCopyBuffer2D(Context *ctx, void *dst, size_t dpitch, int dst_offset,
 164                         const void *src, size_t spitch,
 165                         size_t width, size_t height, int src_offset)
 166 {
 167     size_t src_origin[3] = {src_offset % spitch, src_offset / spitch, 0};
 168     size_t dst_origin[3] = {dst_offset % dpitch, dst_offset / dpitch, 0};
 169     size_t region[3] = {width, height, 1};
 170
 171     openCLSafeCall(clEnqueueCopyBufferRect(getClCommandQueue(ctx), (cl_mem)src, (cl_mem)dst, src_origin, dst_origin,
 172                                            region, spitch, 0, dpitch, 0, 0, 0, 0));
 173 }
 174
 175 void openCLFree(void *devPtr)
 176 {
 177     openCLSafeCall(clReleaseMemObject((cl_mem)devPtr));
 178 }
 179
 180 cl_kernel openCLGetKernelFromSource(const Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName)
 181 {
 182     return openCLGetKernelFromSource(ctx, source, kernelName, NULL);
 183 }
 184
 185 cl_kernel openCLGetKernelFromSource(const Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName,
 186                                     const char *build_options)
 187 {
 188     cl_kernel kernel;
 189     cl_int status = 0;
 190     CV_Assert(ProgramCache::getProgramCache() != NULL);
 191     cl_program program = ProgramCache::getProgramCache()->getProgram(ctx, source, build_options);
 192     CV_Assert(program != NULL);
 193     kernel = clCreateKernel(program, kernelName.c_str(), &status);
 194     openCLVerifyCall(status);
 195     openCLVerifyCall(clReleaseProgram(program));
 196     return kernel;
 197 }
 198
 199 void openCLVerifyKernel(const Context *ctx, cl_kernel kernel, size_t *localThreads)
 200 {
 201     size_t kernelWorkGroupSize;
 202     openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(ctx),
 203                                             CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
 204     CV_Assert( localThreads[0] <= ctx->getDeviceInfo().maxWorkItemSizes[0] );
 205     CV_Assert( localThreads[1] <= ctx->getDeviceInfo().maxWorkItemSizes[1] );
 206     CV_Assert( localThreads[2] <= ctx->getDeviceInfo().maxWorkItemSizes[2] );
 207     CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= kernelWorkGroupSize );
 208     CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= ctx->getDeviceInfo().maxWorkGroupSize );
 209 }
 210
 211 #ifdef PRINT_KERNEL_RUN_TIME
 212 static double total_execute_time = 0;
 213 static double total_kernel_time = 0;
 214 #endif
 215
 216 static std::string removeDuplicatedWhiteSpaces(const char * buildOptions)
 217 {
 218     if (buildOptions == NULL)
 219         return "";
 220
 221     size_t length = strlen(buildOptions), didx = 0, sidx = 0;
 222     while (sidx < length && buildOptions[sidx] == 0)
 223         ++sidx;
 224
 225     std::string opt;
 226     opt.resize(length);
 227
 228     for ( ; sidx < length; ++sidx)
 229         if (buildOptions[sidx] != ' ')
 230             opt[didx++] = buildOptions[sidx];
 231         else if ( !(didx > 0 && opt[didx - 1] == ' ') )
 232             opt[didx++] = buildOptions[sidx];
 233
 234     return opt;
 235 }
 236
 237 void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3],
 238                           size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
 239                           int depth, const char *build_options)
 240 {
 241     //construct kernel name
 242     //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
 243     //for example split_C2_D3, represent the split kernel with channels = 2 and dataType Depth = 3(Data type is short)
 244     std::stringstream idxStr;
 245     if(channels != -1)
 246         idxStr << "_C" << channels;
 247     if(depth != -1)
 248         idxStr << "_D" << depth;
 249     kernelName = kernelName + idxStr.str();
 250
 251     cl_kernel kernel;
 252     std::string fixedOptions = removeDuplicatedWhiteSpaces(build_options);
 253     kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str());
 254
 255     if ( localThreads != NULL)
 256     {
 257         globalThreads[0] = roundUp(globalThreads[0], localThreads[0]);
 258         globalThreads[1] = roundUp(globalThreads[1], localThreads[1]);
 259         globalThreads[2] = roundUp(globalThreads[2], localThreads[2]);
 260
 261         cv::ocl::openCLVerifyKernel(ctx, kernel, localThreads);
 262     }
 263     for(size_t i = 0; i < args.size(); i ++)
 264         openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
 265
 266 #ifndef PRINT_KERNEL_RUN_TIME
 267     openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads,
 268                                           localThreads, 0, NULL, NULL));
 269 #else
 270     cl_event event = NULL;
 271     openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads,
 272                                           localThreads, 0, NULL, &event));
 273
 274     cl_ulong start_time, end_time, queue_time;
 275     double execute_time = 0;
 276     double total_time   = 0;
 277
 278     openCLSafeCall(clWaitForEvents(1, &event));
 279     openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
 280                                            sizeof(cl_ulong), &start_time, 0));
 281
 282     openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
 283                                            sizeof(cl_ulong), &end_time, 0));
 284
 285     openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
 286                                            sizeof(cl_ulong), &queue_time, 0));
 287
 288     execute_time = (double)(end_time - start_time) / (1000 * 1000);
 289     total_time = (double)(end_time - queue_time) / (1000 * 1000);
 290
 291     total_execute_time += execute_time;
 292     total_kernel_time += total_time;
 293     clReleaseEvent(event);
 294 #endif
 295
 296     clFlush(getClCommandQueue(ctx));
 297     openCLSafeCall(clReleaseKernel(kernel));
 298 }
 299
 300 void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName,
 301                          size_t globalThreads[3], size_t localThreads[3],
 302                          std::vector< std::pair<size_t, const void *> > &args, int channels, int depth)
 303 {
 304     openCLExecuteKernel(ctx, source, kernelName, globalThreads, localThreads, args,
 305                         channels, depth, NULL);
 306 }
 307 void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName,
 308                          size_t globalThreads[3], size_t localThreads[3],
 309                          std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, const char *build_options)
 310
 311 {
 312 #ifndef PRINT_KERNEL_RUN_TIME
 313     openCLExecuteKernel_(ctx, source, kernelName, globalThreads, localThreads, args, channels, depth,
 314                          build_options);
 315 #else
 316     string data_type[] = { "uchar", "char", "ushort", "short", "int", "float", "double"};
 317     cout << endl;
 318     cout << "Function Name: " << kernelName;
 319     if(depth >= 0)
 320         cout << " |data type: " << data_type[depth];
 321     cout << " |channels: " << channels;
 322     cout << " |Time Unit: " << "ms" << endl;
 323
 324     total_execute_time = 0;
 325     total_kernel_time = 0;
 326     cout << "-------------------------------------" << endl;
 327
 328     cout << setiosflags(ios::left) << setw(15) << "execute time";
 329     cout << setiosflags(ios::left) << setw(15) << "launch time";
 330     cout << setiosflags(ios::left) << setw(15) << "kernel time" << endl;
 331     int i = 0;
 332     for(i = 0; i < RUN_TIMES; i++)
 333         openCLExecuteKernel_(ctx, source, kernelName, globalThreads, localThreads, args, channels, depth,
 334                              build_options);
 335
 336     cout << "average kernel execute time: " << total_execute_time / RUN_TIMES << endl; // "ms" << endl;
 337     cout << "average kernel total time:  " << total_kernel_time / RUN_TIMES << endl; // "ms" << endl;
 338 #endif
 339 }
 340
 341 void openCLExecuteKernelInterop(Context *ctx, const cv::ocl::ProgramSource& source, String kernelName,
 342                          size_t globalThreads[3], size_t localThreads[3],
 343                          std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, const char *build_options)
 344
 345 {
 346     //construct kernel name
 347     //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
 348     //for example split_C2_D2, represent the split kernel with channels = 2 and dataType Depth = 2 (Data type is char)
 349     std::stringstream idxStr;
 350     if(channels != -1)
 351         idxStr << "_C" << channels;
 352     if(depth != -1)
 353         idxStr << "_D" << depth;
 354     kernelName = kernelName + idxStr.str();
 355
 356     std::string name = std::string("custom_") + source.name;
 357     ProgramEntry program = { name.c_str(), source.programStr, source.programHash };
 358     cl_kernel kernel = openCLGetKernelFromSource(ctx, &program, kernelName, build_options);
 359
 360     CV_Assert(globalThreads != NULL);
 361     if ( localThreads != NULL)
 362     {
 363         globalThreads[0] = roundUp(globalThreads[0], localThreads[0]);
 364         globalThreads[1] = roundUp(globalThreads[1], localThreads[1]);
 365         globalThreads[2] = roundUp(globalThreads[2], localThreads[2]);
 366
 367         cv::ocl::openCLVerifyKernel(ctx, kernel, localThreads);
 368     }
 369     for(size_t i = 0; i < args.size(); i ++)
 370         openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
 371
 372     openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads,
 373                     localThreads, 0, NULL, NULL));
 374
 375     clFinish(getClCommandQueue(ctx));
 376     openCLSafeCall(clReleaseKernel(kernel));
 377 }
 378
 379 cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value,
 380                      const size_t size)
 381 {
 382     int status;
 383     cl_mem con_struct;
 384
 385     con_struct = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, &status);
 386     openCLSafeCall(status);
 387
 388     openCLSafeCall(clEnqueueWriteBuffer(command_queue, con_struct, 1, 0, size,
 389                                         value, 0, 0, 0));
 390
 391     return con_struct;
 392 }
 393
 394 }//namespace ocl
 395 }//namespace cv