Merge pull request #1663 from vpisarev:ocl_experiments3
[profile/ivi/opencv.git] / modules / nonfree / src / surf.ocl.cpp
1 /*M/////////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // @Authors
18 //    Peng Xiao, pengxiao@multicorewareinc.com
19 //
20 // Redistribution and use in source and binary forms, with or without modification,
21 // are permitted provided that the following conditions are met:
22 //
23 //   * Redistribution's of source code must retain the above copyright notice,
24 //     this list of conditions and the following disclaimer.
25 //
26 //   * Redistribution's in binary form must reproduce the above copyright notice,
27 //     this list of conditions and the following disclaimer in the documentation
28 //     and/or other oclMaterials provided with the distribution.
29 //
30 //   * The name of the copyright holders may not be used to endorse or promote products
31 //     derived from this software without specific prior written permission.
32 //
33 // This software is provided by the copyright holders and contributors as is and
34 // any express or implied warranties, including, but not limited to, the implied
35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
36 // In no event shall the Intel Corporation or contributors be liable for any direct,
37 // indirect, incidental, special, exemplary, or consequential damages
38 // (including, but not limited to, procurement of substitute goods or services;
39 // loss of use, data, or profits; or business interruption) however caused
40 // and on any theory of liability, whether in contract, strict liability,
41 // or tort (including negligence or otherwise) arising in any way out of
42 // the use of this software, even if advised of the possibility of such damage.
43 //
44 //M*/
45 #include "precomp.hpp"
46
47 #ifdef HAVE_OPENCV_OCL
48 #include <cstdio>
49 #include "opencl_kernels.hpp"
50
51 using namespace cv;
52 using namespace cv::ocl;
53
54 namespace cv
55 {
56     namespace ocl
57     {
58         static void openCLExecuteKernelSURF(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3],
59             size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth)
60         {
61             char optBuf [100] = {0};
62             char * optBufPtr = optBuf;
63             cl_kernel kernel;
64             kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optBufPtr);
65             size_t wave_size = queryWaveFrontSize(kernel);
66             CV_Assert(clReleaseKernel(kernel) == CL_SUCCESS);
67             sprintf(optBufPtr, "-D WAVE_SIZE=%d", static_cast<int>(wave_size));
68             openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optBufPtr);
69         }
70     }
71 }
72
/* Returns the Haar wavelet size used for `layer` of `octave`:
   (9 + 6 * layer) scaled by 2^octave. */
static inline int calcSize(int octave, int layer)
{
    /* Wavelet size at first layer of first octave. */
    const int firstLayerSize = 9;

    /* Wavelet size increment between layers. This should be an even number,
    such that the wavelet sizes in an octave are either all even or all odd.
    This ensures that when looking for the neighbors of a sample, the layers
    above and below are aligned correctly. */
    const int sizeStepPerLayer = 6;

    const int sizeInFirstOctave = firstLayerSize + sizeStepPerLayer * layer;
    return sizeInFirstOctave << octave;
}
87
88
89 class SURF_OCL_Invoker
90 {
91 public:
92     // facilities
93     void bindImgTex(const oclMat &img, cl_mem &texture);
94
95     //void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
96     //void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
97
98     // kernel callers declarations
99     void icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int layer_rows);
100
101     void icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
102                                   int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols);
103
104     void icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter,
105                                     oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures);
106
107     void icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures);
108
109     void icvSetUpright_gpu(const oclMat &keypoints, int nFeatures);
110
111     void compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures);
112     // end of kernel callers declarations
113
114     SURF_OCL_Invoker(SURF_OCL &surf, const oclMat &img, const oclMat &mask) :
115         surf_(surf),
116         img_cols(img.cols), img_rows(img.rows),
117         use_mask(!mask.empty()), counters(oclMat()),
118         imgTex(NULL), sumTex(NULL), maskSumTex(NULL), _img(img)
119     {
120         CV_Assert(!img.empty() && img.type() == CV_8UC1);
121         CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
122         CV_Assert(surf_.nOctaves > 0 && surf_.nOctaveLayers > 0);
123
124         const int min_size = calcSize(surf_.nOctaves - 1, 0);
125         CV_Assert(img_rows - min_size >= 0);
126         CV_Assert(img_cols - min_size >= 0);
127
128         const int layer_rows = img_rows >> (surf_.nOctaves - 1);
129         const int layer_cols = img_cols >> (surf_.nOctaves - 1);
130         const int min_margin = ((calcSize((surf_.nOctaves - 1), 2) >> 1) >> (surf_.nOctaves - 1)) + 1;
131         CV_Assert(layer_rows - 2 * min_margin > 0);
132         CV_Assert(layer_cols - 2 * min_margin > 0);
133
134         maxFeatures   = std::min(static_cast<int>(img.size().area() * surf.keypointsRatio), 65535);
135         maxCandidates = std::min(static_cast<int>(1.5 * maxFeatures), 65535);
136
137         CV_Assert(maxFeatures > 0);
138
139         counters.create(1, surf_.nOctaves + 1, CV_32SC1);
140         counters.setTo(Scalar::all(0));
141
142         integral(img, surf_.sum);
143
144         bindImgTex(img, imgTex);
145         bindImgTex(surf_.sum, sumTex);
146         finish();
147
148         maskSumTex = 0;
149
150         if (use_mask)
151         {
152             CV_Error(Error::StsBadFunc, "Masked SURF detector is not implemented yet");
153             //!FIXME
154             // temp fix for missing min overload
155             //oclMat temp(mask.size(), mask.type());
156             //temp.setTo(Scalar::all(1.0));
157             ////cv::ocl::min(mask, temp, surf_.mask1);           ///////// disable this
158             //integral(surf_.mask1, surf_.maskSum);
159             //bindImgTex(surf_.maskSum, maskSumTex);
160         }
161     }
162
163     void detectKeypoints(oclMat &keypoints)
164     {
165         // create image pyramid buffers
166         // different layers have same sized buffers, but they are sampled from Gaussian kernel.
167         ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.det);
168         ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.trace);
169
170         ensureSizeIsEnough(1, maxCandidates, CV_32SC4, surf_.maxPosBuffer);
171         ensureSizeIsEnough(SURF_OCL::ROWS_COUNT, maxFeatures, CV_32FC1, keypoints);
172         keypoints.setTo(Scalar::all(0));
173
174         for (int octave = 0; octave < surf_.nOctaves; ++octave)
175         {
176             const int layer_rows = img_rows >> octave;
177             const int layer_cols = img_cols >> octave;
178
179             //loadOctaveConstants(octave, layer_rows, layer_cols);
180
181             icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, octave, surf_.nOctaveLayers, layer_rows);
182
183             icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer, counters, 1 + octave,
184                                      octave, use_mask, surf_.nOctaveLayers, layer_rows, layer_cols);
185
186             int maxCounter = ((Mat)counters).at<int>(1 + octave);
187             maxCounter = std::min(maxCounter, static_cast<int>(maxCandidates));
188
189             if (maxCounter > 0)
190             {
191                 icvInterpolateKeypoint_gpu(surf_.det, surf_.maxPosBuffer, maxCounter,
192                                            keypoints, counters, octave, layer_rows, maxFeatures);
193             }
194         }
195         int featureCounter = Mat(counters).at<int>(0);
196         featureCounter = std::min(featureCounter, static_cast<int>(maxFeatures));
197
198         keypoints.cols = featureCounter;
199
200         if (surf_.upright)
201         {
202             //keypoints.row(SURF_OCL::ANGLE_ROW).setTo(Scalar::all(90.0));
203             setUpright(keypoints);
204         }
205         else
206         {
207             findOrientation(keypoints);
208         }
209     }
210
211     void setUpright(oclMat &keypoints)
212     {
213         const int nFeatures = keypoints.cols;
214         if(nFeatures > 0)
215         {
216             icvSetUpright_gpu(keypoints, keypoints.cols);
217         }
218     }
219
220     void findOrientation(oclMat &keypoints)
221     {
222         const int nFeatures = keypoints.cols;
223         if (nFeatures > 0)
224         {
225             icvCalcOrientation_gpu(keypoints, nFeatures);
226         }
227     }
228
229     void computeDescriptors(const oclMat &keypoints, oclMat &descriptors, int descriptorSize)
230     {
231         const int nFeatures = keypoints.cols;
232         if (nFeatures > 0)
233         {
234             ensureSizeIsEnough(nFeatures, descriptorSize, CV_32F, descriptors);
235             compute_descriptors_gpu(descriptors, keypoints, nFeatures);
236         }
237     }
238
239     ~SURF_OCL_Invoker()
240     {
241         if(imgTex)
242             openCLFree(imgTex);
243         if(sumTex)
244             openCLFree(sumTex);
245         if(maskSumTex)
246             openCLFree(maskSumTex);
247     }
248
249 private:
250     SURF_OCL &surf_;
251
252     int img_cols, img_rows;
253
254     bool use_mask;
255
256     int maxCandidates;
257     int maxFeatures;
258
259     oclMat counters;
260
261     // texture buffers
262     cl_mem imgTex;
263     cl_mem sumTex;
264     cl_mem maskSumTex;
265
266     const oclMat _img; // make a copy for non-image2d_t supported platform
267
268     SURF_OCL_Invoker &operator= (const SURF_OCL_Invoker &right)
269     {
270         (*this) = right;
271         return *this;
272     } // remove warning C4512
273 };
274
275 cv::ocl::SURF_OCL::SURF_OCL()
276 {
277     hessianThreshold = 100.0f;
278     extended = true;
279     nOctaves = 4;
280     nOctaveLayers = 2;
281     keypointsRatio = 0.01f;
282     upright = false;
283 }
284
285 cv::ocl::SURF_OCL::SURF_OCL(double _threshold, int _nOctaves, int _nOctaveLayers, bool _extended, float _keypointsRatio, bool _upright)
286 {
287     hessianThreshold = saturate_cast<float>(_threshold);
288     extended = _extended;
289     nOctaves = _nOctaves;
290     nOctaveLayers = _nOctaveLayers;
291     keypointsRatio = _keypointsRatio;
292     upright = _upright;
293 }
294
295 int cv::ocl::SURF_OCL::descriptorSize() const
296 {
297     return extended ? 128 : 64;
298 }
299
300 void cv::ocl::SURF_OCL::uploadKeypoints(const std::vector<KeyPoint> &keypoints, oclMat &keypointsGPU)
301 {
302     if (keypoints.empty())
303         keypointsGPU.release();
304     else
305     {
306         Mat keypointsCPU(SURF_OCL::ROWS_COUNT, static_cast<int>(keypoints.size()), CV_32FC1);
307
308         float *kp_x = keypointsCPU.ptr<float>(SURF_OCL::X_ROW);
309         float *kp_y = keypointsCPU.ptr<float>(SURF_OCL::Y_ROW);
310         int *kp_laplacian = keypointsCPU.ptr<int>(SURF_OCL::LAPLACIAN_ROW);
311         int *kp_octave = keypointsCPU.ptr<int>(SURF_OCL::OCTAVE_ROW);
312         float *kp_size = keypointsCPU.ptr<float>(SURF_OCL::SIZE_ROW);
313         float *kp_dir = keypointsCPU.ptr<float>(SURF_OCL::ANGLE_ROW);
314         float *kp_hessian = keypointsCPU.ptr<float>(SURF_OCL::HESSIAN_ROW);
315
316         for (size_t i = 0, size = keypoints.size(); i < size; ++i)
317         {
318             const KeyPoint &kp = keypoints[i];
319             kp_x[i] = kp.pt.x;
320             kp_y[i] = kp.pt.y;
321             kp_octave[i] = kp.octave;
322             kp_size[i] = kp.size;
323             kp_dir[i] = kp.angle;
324             kp_hessian[i] = kp.response;
325             kp_laplacian[i] = 1;
326         }
327
328         keypointsGPU.upload(keypointsCPU);
329     }
330 }
331
332 void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat &keypointsGPU, std::vector<KeyPoint> &keypoints)
333 {
334     const int nFeatures = keypointsGPU.cols;
335
336     if (nFeatures == 0)
337         keypoints.clear();
338     else
339     {
340         CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == ROWS_COUNT);
341
342         Mat keypointsCPU(keypointsGPU);
343
344         keypoints.resize(nFeatures);
345
346         float *kp_x = keypointsCPU.ptr<float>(SURF_OCL::X_ROW);
347         float *kp_y = keypointsCPU.ptr<float>(SURF_OCL::Y_ROW);
348         int *kp_laplacian = keypointsCPU.ptr<int>(SURF_OCL::LAPLACIAN_ROW);
349         int *kp_octave = keypointsCPU.ptr<int>(SURF_OCL::OCTAVE_ROW);
350         float *kp_size = keypointsCPU.ptr<float>(SURF_OCL::SIZE_ROW);
351         float *kp_dir = keypointsCPU.ptr<float>(SURF_OCL::ANGLE_ROW);
352         float *kp_hessian = keypointsCPU.ptr<float>(SURF_OCL::HESSIAN_ROW);
353
354         for (int i = 0; i < nFeatures; ++i)
355         {
356             KeyPoint &kp = keypoints[i];
357             kp.pt.x = kp_x[i];
358             kp.pt.y = kp_y[i];
359             kp.class_id = kp_laplacian[i];
360             kp.octave = kp_octave[i];
361             kp.size = kp_size[i];
362             kp.angle = kp_dir[i];
363             kp.response = kp_hessian[i];
364         }
365     }
366 }
367
368 void cv::ocl::SURF_OCL::downloadDescriptors(const oclMat &descriptorsGPU, std::vector<float> &descriptors)
369 {
370     if (descriptorsGPU.empty())
371         descriptors.clear();
372     else
373     {
374         CV_Assert(descriptorsGPU.type() == CV_32F);
375
376         descriptors.resize(descriptorsGPU.rows * descriptorsGPU.cols);
377         Mat descriptorsCPU(descriptorsGPU.size(), CV_32F, &descriptors[0]);
378         descriptorsGPU.download(descriptorsCPU);
379     }
380 }
381
382 void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints)
383 {
384     if (!img.empty())
385     {
386         SURF_OCL_Invoker surf(*this, img, mask);
387
388         surf.detectKeypoints(keypoints);
389     }
390 }
391
392 void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints, oclMat &descriptors,
393                                    bool useProvidedKeypoints)
394 {
395     if (!img.empty())
396     {
397         SURF_OCL_Invoker surf(*this, img, mask);
398
399         if (!useProvidedKeypoints)
400             surf.detectKeypoints(keypoints);
401         else if (!upright)
402         {
403             surf.findOrientation(keypoints);
404         }
405
406         surf.computeDescriptors(keypoints, descriptors, descriptorSize());
407     }
408 }
409
410 void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints)
411 {
412     oclMat keypointsGPU;
413
414     (*this)(img, mask, keypointsGPU);
415
416     downloadKeypoints(keypointsGPU, keypoints);
417 }
418
419 void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints,
420                                    oclMat &descriptors, bool useProvidedKeypoints)
421 {
422     oclMat keypointsGPU;
423
424     if (useProvidedKeypoints)
425         uploadKeypoints(keypoints, keypointsGPU);
426
427     (*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints);
428
429     downloadKeypoints(keypointsGPU, keypoints);
430 }
431
432 void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints,
433                                    std::vector<float> &descriptors, bool useProvidedKeypoints)
434 {
435     oclMat descriptorsGPU;
436
437     (*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints);
438
439     downloadDescriptors(descriptorsGPU, descriptors);
440 }
441
442 void cv::ocl::SURF_OCL::releaseMemory()
443 {
444     sum.release();
445     mask1.release();
446     maskSum.release();
447     intBuffer.release();
448     det.release();
449     trace.release();
450     maxPosBuffer.release();
451 }
452
453
454 // bind source buffer to image oject.
455 void SURF_OCL_Invoker::bindImgTex(const oclMat &img, cl_mem &texture)
456 {
457     if(texture)
458     {
459         openCLFree(texture);
460     }
461     texture = bindTexture(img);
462 }
463
464 ////////////////////////////
465 // kernel caller definitions
466 void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int c_layer_rows)
467 {
468     const int min_size = calcSize(octave, 0);
469     const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
470     const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
471
472     Context *clCxt = det.clCxt;
473     String kernelName = "icvCalcLayerDetAndTrace";
474     std::vector< std::pair<size_t, const void *> > args;
475
476     if(sumTex)
477     {
478         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex));
479     }
480     else
481     {
482         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
483     }
484     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data));
485     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trace.data));
486     args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step));
487     args.push_back( std::make_pair( sizeof(cl_int), (void *)&trace.step));
488     args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
489     args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
490     args.push_back( std::make_pair( sizeof(cl_int), (void *)&nOctaveLayers));
491     args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave));
492     args.push_back( std::make_pair( sizeof(cl_int), (void *)&c_layer_rows));
493     args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
494
495     size_t localThreads[3]  = {16, 16, 1};
496     size_t globalThreads[3] =
497     {
498         divUp(max_samples_j, localThreads[0]) *localThreads[0],
499         divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2),
500         1
501     };
502     openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
503 }
504
505 void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
506         int octave, bool useMask, int nLayers, int layer_rows, int layer_cols)
507 {
508     const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
509
510     Context *clCxt = det.clCxt;
511     String kernelName = use_mask ? "icvFindMaximaInLayer_withmask" : "icvFindMaximaInLayer";
512     std::vector< std::pair<size_t, const void *> > args;
513
514     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data));
515     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trace.data));
516     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
517     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maxCounter.data));
518     args.push_back( std::make_pair( sizeof(cl_int), (void *)&counterOffset));
519     args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step));
520     args.push_back( std::make_pair( sizeof(cl_int), (void *)&trace.step));
521     args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
522     args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
523     args.push_back( std::make_pair( sizeof(cl_int), (void *)&nLayers));
524     args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave));
525     args.push_back( std::make_pair( sizeof(cl_int), (void *)&layer_rows));
526     args.push_back( std::make_pair( sizeof(cl_int), (void *)&layer_cols));
527     args.push_back( std::make_pair( sizeof(cl_int), (void *)&maxCandidates));
528     args.push_back( std::make_pair( sizeof(cl_float), (void *)&surf_.hessianThreshold));
529
530     if(useMask)
531     {
532         if(maskSumTex)
533         {
534             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maskSumTex));
535         }
536         else
537         {
538             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.data));
539         }
540         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.step));
541     }
542     size_t localThreads[3]  = {16, 16, 1};
543     size_t globalThreads[3] = {divUp(layer_cols - 2 * min_margin, localThreads[0] - 2) *localThreads[0],
544                                divUp(layer_rows - 2 * min_margin, localThreads[1] - 2) *nLayers *localThreads[1],
545                                1
546                               };
547
548     openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
549 }
550
551 void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter,
552         oclMat &keypoints, oclMat &counters_, int octave, int layer_rows, int max_features)
553 {
554     Context *clCxt = det.clCxt;
555     String kernelName = "icvInterpolateKeypoint";
556     std::vector< std::pair<size_t, const void *> > args;
557
558     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data));
559     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
560     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
561     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counters_.data));
562     args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step));
563     args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
564     args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
565     args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
566     args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave));
567     args.push_back( std::make_pair( sizeof(cl_int), (void *)&layer_rows));
568     args.push_back( std::make_pair( sizeof(cl_int), (void *)&max_features));
569
570     size_t localThreads[3]  = {3, 3, 3};
571     size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1};
572
573     openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
574 }
575
576 void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures)
577 {
578     Context *clCxt = counters.clCxt;
579     String kernelName = "icvCalcOrientation";
580
581     std::vector< std::pair<size_t, const void *> > args;
582
583     if(sumTex)
584     {
585         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex));
586     }
587     else
588     {
589         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
590     }
591     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
592     args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
593     args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
594     args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
595     args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
596
597     size_t localThreads[3]  = {32, 4, 1};
598     size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1};
599
600     openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
601 }
602
603 void SURF_OCL_Invoker::icvSetUpright_gpu(const oclMat &keypoints, int nFeatures)
604 {
605     Context *clCxt = counters.clCxt;
606     String kernelName = "icvSetUpright";
607
608     std::vector< std::pair<size_t, const void *> > args;
609
610     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
611     args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
612     args.push_back( std::make_pair( sizeof(cl_int), (void *)&nFeatures));
613
614     size_t localThreads[3]  = {256, 1, 1};
615     size_t globalThreads[3] = {saturate_cast<size_t>(nFeatures), 1, 1};
616
617     openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
618 }
619
620
621 void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures)
622 {
623     // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
624     Context *clCxt = descriptors.clCxt;
625     String kernelName;
626     std::vector< std::pair<size_t, const void *> > args;
627     size_t localThreads[3]  = {1, 1, 1};
628     size_t globalThreads[3] = {1, 1, 1};
629
630     if(descriptors.cols == 64)
631     {
632         kernelName = "compute_descriptors64";
633
634         localThreads[0] = 6;
635         localThreads[1] = 6;
636
637         globalThreads[0] = nFeatures * localThreads[0];
638         globalThreads[1] = 16 * localThreads[1];
639
640         args.clear();
641         if(imgTex)
642         {
643             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
644         }
645         else
646         {
647             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&_img.data));
648         }
649         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
650         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
651         args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
652         args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
653         args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
654         args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
655         args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));
656
657         openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
658
659         kernelName = "normalize_descriptors64";
660
661         localThreads[0] = 64;
662         localThreads[1] = 1;
663
664         globalThreads[0] = nFeatures * localThreads[0];
665         globalThreads[1] = localThreads[1];
666
667         args.clear();
668         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
669         args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
670
671         openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
672     }
673     else
674     {
675         kernelName = "compute_descriptors128";
676
677         localThreads[0] = 6;
678         localThreads[1] = 6;
679
680         globalThreads[0] = nFeatures * localThreads[0];
681         globalThreads[1] = 16 * localThreads[1];
682
683         args.clear();
684         if(imgTex)
685         {
686             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
687         }
688         else
689         {
690             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&_img.data));
691         }
692         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
693         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
694         args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
695         args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
696         args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
697         args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
698         args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));
699
700         openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
701
702         kernelName = "normalize_descriptors128";
703
704         localThreads[0] = 128;
705         localThreads[1] = 1;
706
707         globalThreads[0] = nFeatures * localThreads[0];
708         globalThreads[1] = localThreads[1];
709
710         args.clear();
711         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
712         args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
713
714         openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
715     }
716 }
717
718 #endif //HAVE_OPENCV_OCL