1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
18 // Peng Xiao, pengxiao@outlook.com
20 // Redistribution and use in source and binary forms, with or without modification,
21 // are permitted provided that the following conditions are met:
23 // * Redistribution's of source code must retain the above copyright notice,
24 // this list of conditions and the following disclaimer.
26 // * Redistribution's in binary form must reproduce the above copyright notice,
27 // this list of conditions and the following disclaimer in the documentation
28 // and/or other materials provided with the distribution.
30 // * The name of the copyright holders may not be used to endorse or promote products
31 // derived from this software without specific prior written permission.
33 // This software is provided by the copyright holders and contributors as is and
34 // any express or implied warranties, including, but not limited to, the implied
35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
36 // In no event shall the Intel Corporation or contributors be liable for any direct,
37 // indirect, incidental, special, exemplary, or consequential damages
38 // (including, but not limited to, procurement of substitute goods or services;
39 // loss of use, data, or profits; or business interruption) however caused
40 // and on any theory of liability, whether in contract, strict liability,
41 // or tort (including negligence or otherwise) arising in any way out of
42 // the use of this software, even if advised of the possibility of such damage.
45 #include "precomp.hpp"
46 #include "opencl_kernels.hpp"
49 using namespace cv::ocl;
51 // currently sort procedure on the host is more efficient
52 static bool use_cpu_sorter = true;
54 // compact structure for corners
57 float eig; //eigenvalue of corner
58 short x; //x coordinate of corner point
59 short y; //y coordinate of corner point
62 // compare procedure for corner
63 //it is used for sort on the host side
64 struct DefCornerCompare
66 bool operator()(const DefCorner a, const DefCorner b) const
72 // sort corner point using opencl bitonicosrt implementation
73 static void sortCorners_caller(oclMat& corners, const int count)
75 Context * cxt = Context::getContext();
78 size_t globalThreads[3] = {GS, 1, 1};
79 size_t localThreads[3] = {LS, 1, 1};
81 // 2^numStages should be equal to count or the output is invalid
83 for(int i = count; i > 1; i >>= 1)
88 std::vector< std::pair<size_t, const void *> > args(argc);
89 std::string kernelname = "sortCorners_bitonicSort";
90 args[0] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
91 args[1] = std::make_pair(sizeof(cl_int), (void *)&count);
92 for(int stage = 0; stage < numStages; ++stage)
94 args[2] = std::make_pair(sizeof(cl_int), (void *)&stage);
95 for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
97 args[3] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
98 openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
103 // find corners on matrix and put it into array
104 static void findCorners_caller(
105 const oclMat& eig_mat, //input matrix worth eigenvalues
106 oclMat& eigMinMax, //input with min and max values of eigenvalues
107 const float qualityLevel,
109 oclMat& corners, //output array with detected corners
110 oclMat& counter) //output value with number of detected corners, have to be 0 before call
114 Context * cxt = Context::getContext();
116 std::vector< std::pair<size_t, const void*> > args;
118 const int mask_strip = mask.step / mask.elemSize1();
120 args.push_back(std::make_pair( sizeof(cl_mem), (void*)&(eig_mat.data)));
122 int src_pitch = (int)eig_mat.step;
123 args.push_back(std::make_pair( sizeof(cl_int), (void*)&src_pitch ));
124 args.push_back(std::make_pair( sizeof(cl_mem), (void*)&mask.data ));
125 args.push_back(std::make_pair( sizeof(cl_mem), (void*)&corners.data ));
126 args.push_back(std::make_pair( sizeof(cl_int), (void*)&mask_strip));
127 args.push_back(std::make_pair( sizeof(cl_mem), (void*)&eigMinMax.data ));
128 args.push_back(std::make_pair( sizeof(cl_float), (void*)&qualityLevel ));
129 args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig_mat.rows ));
130 args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig_mat.cols ));
131 args.push_back(std::make_pair( sizeof(cl_int), (void*)&corners.cols ));
132 args.push_back(std::make_pair( sizeof(cl_mem), (void*)&counter.data ));
134 size_t globalThreads[3] = {eig_mat.cols, eig_mat.rows, 1};
135 size_t localThreads[3] = {16, 16, 1};
137 opt += " -D WITH_MASK=1";
139 openCLExecuteKernel(cxt, &imgproc_gftt, "findCorners", globalThreads, localThreads, args, -1, -1, opt.c_str());
143 static void minMaxEig_caller(const oclMat &src, oclMat &dst, oclMat & tozero)
145 size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
146 CV_Assert(groupnum != 0);
148 int dbsize = groupnum * 2 * src.elemSize();
150 ensureSizeIsEnough(1, dbsize, CV_8UC1, dst);
152 cl_mem dst_data = reinterpret_cast<cl_mem>(dst.data);
154 int all_cols = src.step / src.elemSize();
155 int pre_cols = (src.offset % src.step) / src.elemSize();
156 int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1;
157 int invalid_cols = pre_cols + sec_cols;
158 int cols = all_cols - invalid_cols , elemnum = cols * src.rows;
159 int offset = src.offset / src.elemSize();
161 {// first parallel pass
162 std::vector<std::pair<size_t , const void *> > args;
163 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
164 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_data ));
165 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols ));
166 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
167 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset));
168 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&elemnum));
169 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum));
170 size_t globalThreads[3] = {groupnum * 256, 1, 1};
171 size_t localThreads[3] = {256, 1, 1};
172 openCLExecuteKernel(src.clCxt, &arithm_minMax, "arithm_op_minMax", globalThreads, localThreads,
173 args, -1, -1, "-D T=float -D DEPTH_5");
176 {// run final "serial" kernel to find accumulate results from threads and reset corner counter
177 std::vector<std::pair<size_t , const void *> > args;
178 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_data ));
179 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum ));
180 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&tozero.data ));
181 size_t globalThreads[3] = {1, 1, 1};
182 size_t localThreads[3] = {1, 1, 1};
183 openCLExecuteKernel(src.clCxt, &imgproc_gftt, "arithm_op_minMax_final", globalThreads, localThreads,
188 void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask)
190 CV_Assert(qualityLevel > 0 && minDistance >= 0 && maxCorners >= 0);
191 CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));
193 ensureSizeIsEnough(image.size(), CV_32F, eig_);
195 if (useHarrisDetector)
196 cornerHarris_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
198 cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3);
200 ensureSizeIsEnough(1,1, CV_32SC1, counter_);
202 // find max eigenvalue and reset detected counters
203 minMaxEig_caller(eig_,eig_minmax_,counter_);
205 // allocate buffer for kernels
206 int corner_array_size = std::max(1024, static_cast<int>(image.size().area() * 0.05));
211 for(n=1;n<(unsigned int)corner_array_size;n<<=1) ;
212 corner_array_size = (int)n;
214 ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_);
216 // set to 0 to be able use bitonic sort on whole 2^n array
217 tmpCorners_.setTo(0);
221 ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_);
224 int total = tmpCorners_.cols; // by default the number of corner is full array
225 std::vector<DefCorner> tmp(tmpCorners_.cols); // input buffer with corner for HOST part of algorithm
227 //find points with high eigenvalue and put it into the output array
231 static_cast<float>(qualityLevel),
237 {// sort detected corners on deivce side
238 sortCorners_caller(tmpCorners_, corner_array_size);
241 {// send non-blocking request to read real non-zero number of corners to sort it on the HOST side
242 openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(counter_.clCxt), (cl_mem)counter_.data, CL_FALSE, 0,sizeof(int), &total, 0, NULL, NULL));
245 //blocking read whole corners array (sorted or not sorted)
246 openCLReadBuffer(tmpCorners_.clCxt,(cl_mem)tmpCorners_.data,&tmp[0],tmpCorners_.cols*sizeof(DefCorner));
249 {// check for trivial case
255 {// sort detected corners on cpu side.
257 std::sort(tmp.begin(), tmp.end(), DefCornerCompare());
260 //estimate maximal size of final output array
261 int total_max = maxCorners > 0 ? std::min(maxCorners, total) : total;
262 int D2 = (int)ceil(minDistance * minDistance);
263 // allocate output buffer
264 std::vector<Point2f> tmp2;
265 tmp2.reserve(total_max);
269 {// we have not distance restriction. then just copy with conversion maximal allowed points into output array
270 for(int i=0;i<total_max && tmp[i].eig>0.0f;++i)
272 tmp2.push_back(Point2f(tmp[i].x,tmp[i].y));
276 {// we have distance restriction. then start coping to output array from the first element and check distance for each next one
277 const int cell_size = cvRound(minDistance);
278 const int grid_width = (image.cols + cell_size - 1) / cell_size;
279 const int grid_height = (image.rows + cell_size - 1) / cell_size;
281 std::vector< std::vector<Point2i> > grid(grid_width * grid_height);
283 for (int i = 0; i < total ; ++i)
285 DefCorner p = tmp[i];
288 break; // condition to stop that is needed for GPU bitonic sort usage.
292 int x_cell = static_cast<int>(p.x / cell_size);
293 int y_cell = static_cast<int>(p.y / cell_size);
301 x1 = std::max(0, x1);
302 y1 = std::max(0, y1);
303 x2 = std::min(grid_width - 1, x2);
304 y2 = std::min(grid_height - 1, y2);
306 for (int yy = y1; yy <= y2; yy++)
308 for (int xx = x1; xx <= x2; xx++)
310 std::vector<Point2i>& m = grid[yy * grid_width + xx];
313 for(size_t j = 0; j < m.size(); j++)
315 int dx = p.x - m[j].x;
316 int dy = p.y - m[j].y;
318 if (dx * dx + dy * dy < D2)
331 grid[y_cell * grid_width + x_cell].push_back(Point2i(p.x,p.y));
333 tmp2.push_back(Point2f(p.x,p.y));
335 if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
341 int final_size = static_cast<int>(tmp2.size());
343 corners.upload(Mat(1, final_size, CV_32FC2, &tmp2[0]));
347 void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, std::vector<Point2f> &points_v)
349 CV_DbgAssert(points.type() == CV_32FC2);
350 points_v.resize(points.cols);
351 openCLSafeCall(clEnqueueReadBuffer(
352 *(cl_command_queue*)getClCommandQueuePtr(),
353 reinterpret_cast<cl_mem>(points.data),
356 points.cols * sizeof(Point2f),