Merge pull request #1663 from vpisarev:ocl_experiments3
[profile/ivi/opencv.git] / modules / ocl / src / cl_operations.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // @Authors
19 //    Guoping Long, longguoping@gmail.com
20 //    Niko Li, newlife20080214@gmail.com
21 //    Yao Wang, bitwangyaoyao@gmail.com
22 // Redistribution and use in source and binary forms, with or without modification,
23 // are permitted provided that the following conditions are met:
24 //
25 //   * Redistribution's of source code must retain the above copyright notice,
26 //     this list of conditions and the following disclaimer.
27 //
28 //   * Redistribution's in binary form must reproduce the above copyright notice,
29 //     this list of conditions and the following disclaimer in the documentation
30 //     and/or other oclMaterials provided with the distribution.
31 //
32 //   * The name of the copyright holders may not be used to endorse or promote products
33 //     derived from this software without specific prior written permission.
34 //
35 // This software is provided by the copyright holders and contributors "as is" and
36 // any express or implied warranties, including, but not limited to, the implied
37 // warranties of merchantability and fitness for a particular purpose are disclaimed.
38 // In no event shall the Intel Corporation or contributors be liable for any direct,
39 // indirect, incidental, special, exemplary, or consequential damages
40 // (including, but not limited to, procurement of substitute goods or services;
41 // loss of use, data, or profits; or business interruption) however caused
42 // and on any theory of liability, whether in contract, strict liability,
43 // or tort (including negligence or otherwise) arising in any way out of
44 // the use of this software, even if advised of the possibility of such damage.
45 //
46 //M*/
47
48 #include "precomp.hpp"
49 #include <iomanip>
50 #include <fstream>
51 #include "cl_programcache.hpp"
52
53 //#define PRINT_KERNEL_RUN_TIME
54 #define RUN_TIMES 100
55 #ifndef CL_MEM_USE_PERSISTENT_MEM_AMD
56 #define CL_MEM_USE_PERSISTENT_MEM_AMD 0
57 #endif
58 //#define AMD_DOUBLE_DIFFER
59
60 namespace cv {
61 namespace ocl {
62
63 DevMemType gDeviceMemType = DEVICE_MEM_DEFAULT;
64 DevMemRW gDeviceMemRW = DEVICE_MEM_R_W;
65 int gDevMemTypeValueMap[5] = {0,
66                               CL_MEM_ALLOC_HOST_PTR,
67                               CL_MEM_USE_HOST_PTR,
68                               CL_MEM_COPY_HOST_PTR,
69                               CL_MEM_USE_PERSISTENT_MEM_AMD};
70 int gDevMemRWValueMap[3] = {CL_MEM_READ_WRITE, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY};
71
72 void finish()
73 {
74     clFinish(getClCommandQueue(Context::getContext()));
75 }
76
77 bool isCpuDevice()
78 {
79     const DeviceInfo& info = Context::getContext()->getDeviceInfo();
80     return (info.deviceType == CVCL_DEVICE_TYPE_CPU);
81 }
82
83 size_t queryWaveFrontSize(cl_kernel kernel)
84 {
85     const DeviceInfo& info = Context::getContext()->getDeviceInfo();
86     if (info.deviceType == CVCL_DEVICE_TYPE_CPU)
87         return 1;
88     size_t wavefront = 0;
89     CV_Assert(kernel != NULL);
90     openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(Context::getContext()),
91             CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &wavefront, NULL));
92     return wavefront;
93 }
94
95
96 void openCLReadBuffer(Context *ctx, cl_mem dst_buffer, void *host_buffer, size_t size)
97 {
98     cl_int status;
99     status = clEnqueueReadBuffer(getClCommandQueue(ctx), dst_buffer, CL_TRUE, 0,
100                                  size, host_buffer, 0, NULL, NULL);
101     openCLVerifyCall(status);
102 }
103
104 cl_mem openCLCreateBuffer(Context *ctx, size_t flag , size_t size)
105 {
106     cl_int status;
107     cl_mem buffer = clCreateBuffer(getClContext(ctx), (cl_mem_flags)flag, size, NULL, &status);
108     openCLVerifyCall(status);
109     return buffer;
110 }
111
112 void openCLMallocPitch(Context *ctx, void **dev_ptr, size_t *pitch,
113                        size_t widthInBytes, size_t height)
114 {
115     openCLMallocPitchEx(ctx, dev_ptr, pitch, widthInBytes, height, gDeviceMemRW, gDeviceMemType);
116 }
117
118 void openCLMallocPitchEx(Context *ctx, void **dev_ptr, size_t *pitch,
119                        size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type)
120 {
121     cl_int status;
122     *dev_ptr = clCreateBuffer(getClContext(ctx), gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
123                               widthInBytes * height, 0, &status);
124     openCLVerifyCall(status);
125     *pitch = widthInBytes;
126 }
127
128 void openCLMemcpy2D(Context *ctx, void *dst, size_t dpitch,
129                     const void *src, size_t spitch,
130                     size_t width, size_t height, openCLMemcpyKind kind, int channels)
131 {
132     size_t buffer_origin[3] = {0, 0, 0};
133     size_t host_origin[3] = {0, 0, 0};
134     size_t region[3] = {width, height, 1};
135     if(kind == clMemcpyHostToDevice)
136     {
137         if(dpitch == width || channels == 3 || height == 1)
138         {
139             openCLSafeCall(clEnqueueWriteBuffer(getClCommandQueue(ctx), (cl_mem)dst, CL_TRUE,
140                                                 0, width * height, src, 0, NULL, NULL));
141         }
142         else
143         {
144             openCLSafeCall(clEnqueueWriteBufferRect(getClCommandQueue(ctx), (cl_mem)dst, CL_TRUE,
145                                                     buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0));
146         }
147     }
148     else if(kind == clMemcpyDeviceToHost)
149     {
150         if(spitch == width || channels == 3 || height == 1)
151         {
152             openCLSafeCall(clEnqueueReadBuffer(getClCommandQueue(ctx), (cl_mem)src, CL_TRUE,
153                                                0, width * height, dst, 0, NULL, NULL));
154         }
155         else
156         {
157             openCLSafeCall(clEnqueueReadBufferRect(getClCommandQueue(ctx), (cl_mem)src, CL_TRUE,
158                                                    buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0));
159         }
160     }
161 }
162
163 void openCLCopyBuffer2D(Context *ctx, void *dst, size_t dpitch, int dst_offset,
164                         const void *src, size_t spitch,
165                         size_t width, size_t height, int src_offset)
166 {
167     size_t src_origin[3] = {src_offset % spitch, src_offset / spitch, 0};
168     size_t dst_origin[3] = {dst_offset % dpitch, dst_offset / dpitch, 0};
169     size_t region[3] = {width, height, 1};
170
171     openCLSafeCall(clEnqueueCopyBufferRect(getClCommandQueue(ctx), (cl_mem)src, (cl_mem)dst, src_origin, dst_origin,
172                                            region, spitch, 0, dpitch, 0, 0, 0, 0));
173 }
174
175 void openCLFree(void *devPtr)
176 {
177     openCLSafeCall(clReleaseMemObject((cl_mem)devPtr));
178 }
179
180 cl_kernel openCLGetKernelFromSource(const Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName)
181 {
182     return openCLGetKernelFromSource(ctx, source, kernelName, NULL);
183 }
184
185 cl_kernel openCLGetKernelFromSource(const Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName,
186                                     const char *build_options)
187 {
188     cl_kernel kernel;
189     cl_int status = 0;
190     CV_Assert(ProgramCache::getProgramCache() != NULL);
191     cl_program program = ProgramCache::getProgramCache()->getProgram(ctx, source, build_options);
192     CV_Assert(program != NULL);
193     kernel = clCreateKernel(program, kernelName.c_str(), &status);
194     openCLVerifyCall(status);
195     openCLVerifyCall(clReleaseProgram(program));
196     return kernel;
197 }
198
199 void openCLVerifyKernel(const Context *ctx, cl_kernel kernel, size_t *localThreads)
200 {
201     size_t kernelWorkGroupSize;
202     openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(ctx),
203                                             CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
204     CV_Assert( localThreads[0] <= ctx->getDeviceInfo().maxWorkItemSizes[0] );
205     CV_Assert( localThreads[1] <= ctx->getDeviceInfo().maxWorkItemSizes[1] );
206     CV_Assert( localThreads[2] <= ctx->getDeviceInfo().maxWorkItemSizes[2] );
207     CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= kernelWorkGroupSize );
208     CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= ctx->getDeviceInfo().maxWorkGroupSize );
209 }
210
211 #ifdef PRINT_KERNEL_RUN_TIME
212 static double total_execute_time = 0;
213 static double total_kernel_time = 0;
214 #endif
215
216 static std::string removeDuplicatedWhiteSpaces(const char * buildOptions)
217 {
218     if (buildOptions == NULL)
219         return "";
220
221     size_t length = strlen(buildOptions), didx = 0, sidx = 0;
222     while (sidx < length && buildOptions[sidx] == 0)
223         ++sidx;
224
225     std::string opt;
226     opt.resize(length);
227
228     for ( ; sidx < length; ++sidx)
229         if (buildOptions[sidx] != ' ')
230             opt[didx++] = buildOptions[sidx];
231         else if ( !(didx > 0 && opt[didx - 1] == ' ') )
232             opt[didx++] = buildOptions[sidx];
233
234     return opt;
235 }
236
237 void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3],
238                           size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
239                           int depth, const char *build_options)
240 {
241     //construct kernel name
242     //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
243     //for example split_C2_D3, represent the split kernel with channels = 2 and dataType Depth = 3(Data type is short)
244     std::stringstream idxStr;
245     if(channels != -1)
246         idxStr << "_C" << channels;
247     if(depth != -1)
248         idxStr << "_D" << depth;
249     kernelName = kernelName + idxStr.str();
250
251     cl_kernel kernel;
252     std::string fixedOptions = removeDuplicatedWhiteSpaces(build_options);
253     kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str());
254
255     if ( localThreads != NULL)
256     {
257         globalThreads[0] = roundUp(globalThreads[0], localThreads[0]);
258         globalThreads[1] = roundUp(globalThreads[1], localThreads[1]);
259         globalThreads[2] = roundUp(globalThreads[2], localThreads[2]);
260
261         cv::ocl::openCLVerifyKernel(ctx, kernel, localThreads);
262     }
263     for(size_t i = 0; i < args.size(); i ++)
264         openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
265
266 #ifndef PRINT_KERNEL_RUN_TIME
267     openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads,
268                                           localThreads, 0, NULL, NULL));
269 #else
270     cl_event event = NULL;
271     openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads,
272                                           localThreads, 0, NULL, &event));
273
274     cl_ulong start_time, end_time, queue_time;
275     double execute_time = 0;
276     double total_time   = 0;
277
278     openCLSafeCall(clWaitForEvents(1, &event));
279     openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
280                                            sizeof(cl_ulong), &start_time, 0));
281
282     openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
283                                            sizeof(cl_ulong), &end_time, 0));
284
285     openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
286                                            sizeof(cl_ulong), &queue_time, 0));
287
288     execute_time = (double)(end_time - start_time) / (1000 * 1000);
289     total_time = (double)(end_time - queue_time) / (1000 * 1000);
290
291     total_execute_time += execute_time;
292     total_kernel_time += total_time;
293     clReleaseEvent(event);
294 #endif
295
296     clFlush(getClCommandQueue(ctx));
297     openCLSafeCall(clReleaseKernel(kernel));
298 }
299
300 void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName,
301                          size_t globalThreads[3], size_t localThreads[3],
302                          std::vector< std::pair<size_t, const void *> > &args, int channels, int depth)
303 {
304     openCLExecuteKernel(ctx, source, kernelName, globalThreads, localThreads, args,
305                         channels, depth, NULL);
306 }
307 void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName,
308                          size_t globalThreads[3], size_t localThreads[3],
309                          std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, const char *build_options)
310
311 {
312 #ifndef PRINT_KERNEL_RUN_TIME
313     openCLExecuteKernel_(ctx, source, kernelName, globalThreads, localThreads, args, channels, depth,
314                          build_options);
315 #else
316     string data_type[] = { "uchar", "char", "ushort", "short", "int", "float", "double"};
317     cout << endl;
318     cout << "Function Name: " << kernelName;
319     if(depth >= 0)
320         cout << " |data type: " << data_type[depth];
321     cout << " |channels: " << channels;
322     cout << " |Time Unit: " << "ms" << endl;
323
324     total_execute_time = 0;
325     total_kernel_time = 0;
326     cout << "-------------------------------------" << endl;
327
328     cout << setiosflags(ios::left) << setw(15) << "execute time";
329     cout << setiosflags(ios::left) << setw(15) << "launch time";
330     cout << setiosflags(ios::left) << setw(15) << "kernel time" << endl;
331     int i = 0;
332     for(i = 0; i < RUN_TIMES; i++)
333         openCLExecuteKernel_(ctx, source, kernelName, globalThreads, localThreads, args, channels, depth,
334                              build_options);
335
336     cout << "average kernel execute time: " << total_execute_time / RUN_TIMES << endl; // "ms" << endl;
337     cout << "average kernel total time:  " << total_kernel_time / RUN_TIMES << endl; // "ms" << endl;
338 #endif
339 }
340
341 void openCLExecuteKernelInterop(Context *ctx, const cv::ocl::ProgramSource& source, String kernelName,
342                          size_t globalThreads[3], size_t localThreads[3],
343                          std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, const char *build_options)
344
345 {
346     //construct kernel name
347     //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
348     //for example split_C2_D2, represent the split kernel with channels = 2 and dataType Depth = 2 (Data type is char)
349     std::stringstream idxStr;
350     if(channels != -1)
351         idxStr << "_C" << channels;
352     if(depth != -1)
353         idxStr << "_D" << depth;
354     kernelName = kernelName + idxStr.str();
355
356     std::string name = std::string("custom_") + source.name;
357     ProgramEntry program = { name.c_str(), source.programStr, source.programHash };
358     cl_kernel kernel = openCLGetKernelFromSource(ctx, &program, kernelName, build_options);
359
360     CV_Assert(globalThreads != NULL);
361     if ( localThreads != NULL)
362     {
363         globalThreads[0] = roundUp(globalThreads[0], localThreads[0]);
364         globalThreads[1] = roundUp(globalThreads[1], localThreads[1]);
365         globalThreads[2] = roundUp(globalThreads[2], localThreads[2]);
366
367         cv::ocl::openCLVerifyKernel(ctx, kernel, localThreads);
368     }
369     for(size_t i = 0; i < args.size(); i ++)
370         openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
371
372     openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads,
373                     localThreads, 0, NULL, NULL));
374
375     clFinish(getClCommandQueue(ctx));
376     openCLSafeCall(clReleaseKernel(kernel));
377 }
378
379 cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value,
380                      const size_t size)
381 {
382     int status;
383     cl_mem con_struct;
384
385     con_struct = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, &status);
386     openCLSafeCall(status);
387
388     openCLSafeCall(clEnqueueWriteBuffer(command_queue, con_struct, 1, 0, size,
389                                         value, 0, 0, 0));
390
391     return con_struct;
392 }
393
394 }//namespace ocl
395 }//namespace cv