From 8d5e95226322268381be212533eaabe80c950414 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Tue, 4 Feb 2014 03:21:03 +0400 Subject: [PATCH] very preliminary port of SURF to T-API (compiles but certainly does not work) --- modules/features2d/include/opencv2/features2d.hpp | 2 +- .../nonfree/include/opencv2/nonfree/features2d.hpp | 1 - modules/nonfree/include/opencv2/nonfree/ocl.hpp | 126 ---- modules/nonfree/src/opencl/surf.cl | 66 +- modules/nonfree/src/precomp.hpp | 5 - modules/nonfree/src/surf.cpp | 38 +- modules/nonfree/src/surf.hpp | 123 ++++ modules/nonfree/src/surf.ocl.cpp | 800 ++++++++------------- 8 files changed, 486 insertions(+), 675 deletions(-) delete mode 100644 modules/nonfree/include/opencv2/nonfree/ocl.hpp create mode 100644 modules/nonfree/src/surf.hpp diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp index 8f2d056..39e02dc 100644 --- a/modules/features2d/include/opencv2/features2d.hpp +++ b/modules/features2d/include/opencv2/features2d.hpp @@ -235,7 +235,7 @@ public: // Compute the BRISK features and descriptors on an image void operator()( InputArray image, InputArray mask, std::vector& keypoints, - OutputArray descriptors, bool useProvidedKeypoints=false ) const; + OutputArray descriptors, bool useProvidedKeypoints=false ) const; AlgorithmInfo* info() const; diff --git a/modules/nonfree/include/opencv2/nonfree/features2d.hpp b/modules/nonfree/include/opencv2/nonfree/features2d.hpp index 353d1bf..0c4e0c7 100644 --- a/modules/nonfree/include/opencv2/nonfree/features2d.hpp +++ b/modules/nonfree/include/opencv2/nonfree/features2d.hpp @@ -142,7 +142,6 @@ public: CV_PROP_RW bool upright; protected: - void detectImpl( InputArray image, std::vector& keypoints, InputArray mask = noArray() ) const; void computeImpl( const Mat& image, std::vector& keypoints, Mat& descriptors ) const; }; diff --git a/modules/nonfree/include/opencv2/nonfree/ocl.hpp b/modules/nonfree/include/opencv2/nonfree/ocl.hpp deleted file mode 100644 index b06fa39..0000000 --- a/modules/nonfree/include/opencv2/nonfree/ocl.hpp +++ /dev/null @@ -1,126 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_NONFREE_OCL_HPP__ -#define __OPENCV_NONFREE_OCL_HPP__ - -#include "opencv2/ocl.hpp" - -namespace cv -{ - namespace ocl - { - //! Speeded up robust features, port from CUDA module. - ////////////////////////////////// SURF ////////////////////////////////////////// - - class CV_EXPORTS SURF_OCL - { - public: - enum KeypointLayout - { - X_ROW = 0, - Y_ROW, - LAPLACIAN_ROW, - OCTAVE_ROW, - SIZE_ROW, - ANGLE_ROW, - HESSIAN_ROW, - ROWS_COUNT - }; - - //! the default constructor - SURF_OCL(); - //! the full constructor taking all the necessary parameters - explicit SURF_OCL(double _hessianThreshold, int _nOctaves = 4, - int _nOctaveLayers = 2, bool _extended = false, float _keypointsRatio = 0.01f, bool _upright = false); - - //! returns the descriptor size in float's (64 or 128) - int descriptorSize() const; - //! returns the default norm type - int defaultNorm() const; - //! upload host keypoints to device memory - void uploadKeypoints(const std::vector &keypoints, oclMat &keypointsocl); - //! download keypoints from device to host memory - void downloadKeypoints(const oclMat &keypointsocl, std::vector &keypoints); - //! download descriptors from device to host memory - void downloadDescriptors(const oclMat &descriptorsocl, std::vector &descriptors); - //! finds the keypoints using fast hessian detector used in SURF - //! supports CV_8UC1 images - //! keypoints will have nFeature cols and 6 rows - //! keypoints.ptr(X_ROW)[i] will contain x coordinate of i'th feature - //! keypoints.ptr(Y_ROW)[i] will contain y coordinate of i'th feature - //! keypoints.ptr(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature - //! keypoints.ptr(OCTAVE_ROW)[i] will contain octave of i'th feature - //! keypoints.ptr(SIZE_ROW)[i] will contain size of i'th feature - //! keypoints.ptr(ANGLE_ROW)[i] will contain orientation of i'th feature - //! keypoints.ptr(HESSIAN_ROW)[i] will contain response of i'th feature - void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints); - //! finds the keypoints and computes their descriptors. - //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction - void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints, oclMat &descriptors, - bool useProvidedKeypoints = false); - void operator()(const oclMat &img, const oclMat &mask, std::vector &keypoints); - void operator()(const oclMat &img, const oclMat &mask, std::vector &keypoints, oclMat &descriptors, - bool useProvidedKeypoints = false); - void operator()(const oclMat &img, const oclMat &mask, std::vector &keypoints, std::vector &descriptors, - bool useProvidedKeypoints = false); - - void releaseMemory(); - - // SURF parameters - float hessianThreshold; - int nOctaves; - int nOctaveLayers; - bool extended; - bool upright; - //! max keypoints = min(keypointsRatio * img.size().area(), 65535) - float keypointsRatio; - oclMat sum, mask1, maskSum, intBuffer; - oclMat det, trace; - oclMat maxPosBuffer; - - }; - } -} - -#endif //__OPENCV_NONFREE_OCL_HPP__ diff --git a/modules/nonfree/src/opencl/surf.cl b/modules/nonfree/src/opencl/surf.cl index 405e48f..c7c4c7d 100644 --- a/modules/nonfree/src/opencl/surf.cl +++ b/modules/nonfree/src/opencl/surf.cl @@ -45,6 +45,12 @@ // //M*/ +// The number of degrees between orientation samples in calcOrientation +#define ORI_SEARCH_INC 5 + +// The local size of the calcOrientation kernel +#define ORI_LOCAL_SIZE (360 / ORI_SEARCH_INC) + // specialized for non-image2d_t supported platform, intel HD4000, for example #ifdef DISABLE_IMAGE2D #define IMAGE_INT32 __global uint * @@ -175,7 +181,7 @@ F calcAxisAlignedDerivative( } //calculate targeted layer per-pixel determinant and trace with an integral image -__kernel void icvCalcLayerDetAndTrace( +__kernel void SURF_calcLayerDetAndTrace( IMAGE_INT32 sumTex, // input integral image __global float * det, // output Determinant __global float * trace, // output trace @@ -338,7 +344,7 @@ bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int ro // Non-maximal suppression to further filtering the candidates from previous step __kernel -void icvFindMaximaInLayer_withmask( +void SURF_findMaximaInLayerWithMask( __global const float * det, __global const float * trace, __global int4 * maxPosBuffer, @@ -466,7 +472,7 @@ void icvFindMaximaInLayer_withmask( } __kernel -void icvFindMaximaInLayer( +void SURF_findMaximaInLayer( __global float * det, __global float * trace, __global int4 * maxPosBuffer, @@ -624,7 +630,7 @@ inline bool solve3x3_float(const float4 *A, const float *b, float *x) //////////////////////////////////////////////////////////////////////// // INTERPOLATION __kernel -void icvInterpolateKeypoint( +void SURF_interpolateKeypoint( __global const float * det, __global const int4 * maxPosBuffer, __global float * keypoints, @@ -829,7 +835,7 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc } __kernel -void icvCalcOrientation( +void SURF_calcOrientation( IMAGE_INT32 sumTex, __global float * keypoints, int keypoints_step, @@ -995,18 +1001,17 @@ void icvCalcOrientation( } __kernel -void icvSetUpright( +void SURF_setUpright( __global float * keypoints, - int keypoints_step, - int nFeatures -) + int keypoints_step, int keypoints_offset, + int rows, int cols ) { + int i = get_global_id(0); keypoints_step /= sizeof(*keypoints); - __global float* featureDir = keypoints + ANGLE_ROW * keypoints_step; - if(get_global_id(0) <= nFeatures) + if(i < cols) { - featureDir[get_global_id(0)] = 270.0f; + keypoints[mad24(keypoints_step, ANGLE_ROW, i)] = 270.f; } } @@ -1162,6 +1167,7 @@ void calc_dx_dy( s_dy_bin[tid] = vy; } } + void reduce_sum25( volatile __local float* sdata1, volatile __local float* sdata2, @@ -1225,16 +1231,14 @@ void reduce_sum25( } __kernel -void compute_descriptors64( +void SURF_computeDescriptors64( IMAGE_INT8 imgTex, + int img_step, int img_offset, + int rows, int cols, + __global const float* keypoints, + int keypoints_step, int keypoints_offset, __global float * descriptors, - __global const float * keypoints, - int descriptors_step, - int keypoints_step, - int rows, - int cols, - int img_step -) + int descriptors_step, int descriptors_offset) { descriptors_step /= sizeof(float); keypoints_step /= sizeof(float); @@ -1279,17 +1283,16 @@ void compute_descriptors64( } } } + __kernel -void compute_descriptors128( +void SURF_computeDescriptors128( IMAGE_INT8 imgTex, - __global float * descriptors, - __global float * keypoints, - int descriptors_step, - int keypoints_step, - int rows, - int cols, - int img_step -) + int img_step, int img_offset, + int rows, int cols, + __global const float* keypoints, + int keypoints_step, int keypoints_offset, + __global float* descriptors, + int descriptors_step, int descriptors_offset) { descriptors_step /= sizeof(*descriptors); keypoints_step /= sizeof(*keypoints); @@ -1483,7 +1486,7 @@ void reduce_sum64(volatile __local float* smem, int tid) } __kernel -void normalize_descriptors128(__global float * descriptors, int descriptors_step) +void SURF_normalizeDescriptors128(__global float * descriptors, int descriptors_step) { descriptors_step /= sizeof(*descriptors); // no need for thread ID @@ -1509,8 +1512,9 @@ void normalize_descriptors128(__global float * descriptors, int descriptors_step // normalize and store in output descriptor_base[get_local_id(0)] = lookup / len; } + __kernel -void normalize_descriptors64(__global float * descriptors, int descriptors_step) +void SURF_normalizeDescriptors64(__global float * descriptors, int descriptors_step) { descriptors_step /= sizeof(*descriptors); // no need for thread ID diff --git a/modules/nonfree/src/precomp.hpp b/modules/nonfree/src/precomp.hpp index 204feaf..001b500 100644 --- a/modules/nonfree/src/precomp.hpp +++ b/modules/nonfree/src/precomp.hpp @@ -60,11 +60,6 @@ # include "opencv2/cudaarithm.hpp" #endif -#ifdef HAVE_OPENCV_OCL -# include "opencv2/nonfree/ocl.hpp" -# include "opencv2/ocl/private/util.hpp" -#endif - #include "opencv2/core/private.hpp" #endif diff --git a/modules/nonfree/src/surf.cpp b/modules/nonfree/src/surf.cpp index db846bb..9182916 100644 --- a/modules/nonfree/src/surf.cpp +++ b/modules/nonfree/src/surf.cpp @@ -108,6 +108,7 @@ Modifications by Ian Mahon */ #include "precomp.hpp" +#include "surf.hpp" namespace cv { @@ -897,11 +898,42 @@ void SURF::operator()(InputArray _img, InputArray _mask, OutputArray _descriptors, bool useProvidedKeypoints) const { - Mat img = _img.getMat(), mask = _mask.getMat(), mask1, sum, msum; + int imgtype = _img.type(), imgcn = CV_MAT_CN(imgtype); bool doDescriptors = _descriptors.needed(); - CV_Assert(!img.empty() && img.depth() == CV_8U); - if( img.channels() > 1 ) + CV_Assert(!_img.empty() && CV_MAT_DEPTH(imgtype) == CV_8U && (imgcn == 1 || imgcn == 3 || imgcn == 4)); + CV_Assert(_descriptors.needed() && !useProvidedKeypoints); + + if( ocl::useOpenCL() ) + { + SURF_OCL ocl_surf; + UMat gpu_kpt; + bool ok = ocl_surf.init(this); + + if( ok ) + { + if( !_descriptors.needed() ) + { + ok = ocl_surf.detect(_img, _mask, gpu_kpt); + } + else + { + if(useProvidedKeypoints) + ocl_surf.uploadKeypoints(keypoints, gpu_kpt); + ok = ocl_surf.detectAndCompute(_img, _mask, gpu_kpt, _descriptors, useProvidedKeypoints); + } + } + if( ok ) + { + if(!useProvidedKeypoints) + ocl_surf.downloadKeypoints(gpu_kpt, keypoints); + return; + } + } + + Mat img = _img.getMat(), mask = _mask.getMat(), mask1, sum, msum; + + if( imgcn > 1 ) cvtColor(img, img, COLOR_BGR2GRAY); CV_Assert(mask.empty() || (mask.type() == CV_8U && mask.size() == img.size())); diff --git a/modules/nonfree/src/surf.hpp b/modules/nonfree/src/surf.hpp new file mode 100644 index 0000000..b589210 --- /dev/null +++ b/modules/nonfree/src/surf.hpp @@ -0,0 +1,123 @@ +///////////// see LICENSE.txt in the OpenCV root directory ////////////// + +#ifndef __OPENCV_NONFREE_SURF_HPP__ +#define __OPENCV_NONFREE_SURF_HPP__ + +namespace cv +{ +//! Speeded up robust features, port from CUDA module. +////////////////////////////////// SURF ////////////////////////////////////////// + +class SURF_OCL +{ +public: + enum KeypointLayout + { + X_ROW = 0, + Y_ROW, + LAPLACIAN_ROW, + OCTAVE_ROW, + SIZE_ROW, + ANGLE_ROW, + HESSIAN_ROW, + ROWS_COUNT + }; + + //! the full constructor taking all the necessary parameters + SURF_OCL(); + + bool init(const SURF* params); + + //! returns the descriptor size in float's (64 or 128) + int descriptorSize() const { return params->extended ? 128 : 64; } + + void uploadKeypoints(const std::vector &keypoints, UMat &keypointsGPU); + void downloadKeypoints(const UMat &keypointsGPU, std::vector &keypoints); + + //! finds the keypoints using fast hessian detector used in SURF + //! supports CV_8UC1 images + //! keypoints will have nFeature cols and 6 rows + //! keypoints.ptr(X_ROW)[i] will contain x coordinate of i'th feature + //! keypoints.ptr(Y_ROW)[i] will contain y coordinate of i'th feature + //! keypoints.ptr(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature + //! keypoints.ptr(OCTAVE_ROW)[i] will contain octave of i'th feature + //! keypoints.ptr(SIZE_ROW)[i] will contain size of i'th feature + //! keypoints.ptr(ANGLE_ROW)[i] will contain orientation of i'th feature + //! keypoints.ptr(HESSIAN_ROW)[i] will contain response of i'th feature + bool detect(InputArray img, InputArray mask, UMat& keypoints); + //! finds the keypoints and computes their descriptors. + //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction + bool detectAndCompute(InputArray img, InputArray mask, UMat& keypoints, + OutputArray descriptors, bool useProvidedKeypoints = false); + +protected: + bool setImage(InputArray img, InputArray mask); + + // kernel callers declarations + bool calcLayerDetAndTrace(UMat &det, UMat &trace, int octave, int layer_rows); + + bool findMaximaInLayer(const UMat &det, const UMat &trace, UMat &maxPosBuffer, + UMat &maxCounter, int counterOffset, + int octave, int layer_rows, int layer_cols); + + bool interpolateKeypoint(const UMat &det, const UMat &maxPosBuffer, int maxCounter, + UMat &keypoints, UMat &counters, int octave, int layer_rows, int maxFeatures); + + bool calcOrientation(UMat &keypoints); + + bool setUpRight(UMat &keypoints); + + bool computeDescriptors(const UMat &keypoints, OutputArray descriptors); + + bool detectKeypoints(UMat &keypoints); + + const SURF* params; + int refcount; + + //! max keypoints = min(keypointsRatio * img.size().area(), 65535) + UMat sum, mask1, maskSum, intBuffer; + UMat det, trace; + UMat maxPosBuffer; + + int img_cols, img_rows; + + int maxCandidates; + int maxFeatures; + + UMat img, counters; + + // texture buffers + ocl::Image2D imgTex, sumTex, maskSumTex; + bool haveImageSupport; + + int status; + ocl::Kernel kerCalcDetTrace, kerFindMaxima, kerFindMaximaMask, kerInterp; + ocl::Kernel kerUpRight, kerOri, kerCalcDesc64, kerCalcDesc128, kerNormDesc64, kerNormDesc128; +}; + +/* +template void copyVectorToUMat(const std::vector<_Tp>& v, UMat& um) +{ + if(v.empty()) + um.release(); + else + Mat(1, (int)(v.size()*sizeof(v[0])), CV_8U, (void*)&v[0]).copyTo(um); +} + +template void copyUMatToVector(const UMat& um, std::vector<_Tp>& v) +{ + if(um.empty()) + v.clear(); + else + { + size_t sz = um.total()*um.elemSize(); + CV_Assert(um.isContinuous() && (sz % sizeof(_Tp) == 0)); + v.resize(sz/sizeof(_Tp)); + Mat m(um.size(), um.type(), &v[0]); + um.copyTo(m); + } +}*/ + +} + +#endif diff --git a/modules/nonfree/src/surf.ocl.cpp b/modules/nonfree/src/surf.ocl.cpp index 8fd717c..70b4be5 100644 --- a/modules/nonfree/src/surf.ocl.cpp +++ b/modules/nonfree/src/surf.ocl.cpp @@ -43,42 +43,30 @@ // //M*/ #include "precomp.hpp" +#include "surf.hpp" -#ifdef HAVE_OPENCV_OCL #include #include #include "opencl_kernels.hpp" -using namespace cv; -using namespace cv::ocl; - -static ProgramEntry surfprog = cv::ocl::nonfree::surf; - namespace cv { - namespace ocl - { - // The number of degrees between orientation samples in calcOrientation - const static int ORI_SEARCH_INC = 5; - // The local size of the calcOrientation kernel - const static int ORI_LOCAL_SIZE = (360 / ORI_SEARCH_INC); - static void openCLExecuteKernelSURF(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3], - size_t localThreads[3], std::vector< std::pair > &args, int channels, int depth) - { - std::stringstream optsStr; - optsStr << "-D ORI_LOCAL_SIZE=" << ORI_LOCAL_SIZE << " "; - optsStr << "-D ORI_SEARCH_INC=" << ORI_SEARCH_INC << " "; - cl_kernel kernel; - kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optsStr.str().c_str()); - size_t wave_size = queryWaveFrontSize(kernel); - CV_Assert(clReleaseKernel(kernel) == CL_SUCCESS); - optsStr << "-D WAVE_SIZE=" << wave_size; - openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optsStr.str().c_str()); - } +enum { ORI_SEARCH_INC=5, ORI_LOCAL_SIZE=(360 / ORI_SEARCH_INC) }; - } -} +/*static void openCLExecuteKernelSURF(Context2 *clCxt, const ProgramEntry* source, String kernelName, size_t globalThreads[3], + size_t localThreads[3], std::vector< std::pair > &args, int channels, int depth) +{ + std::stringstream optsStr; + optsStr << "-D ORI_LOCAL_SIZE=" << ORI_LOCAL_SIZE << " "; + optsStr << "-D ORI_SEARCH_INC=" << ORI_SEARCH_INC << " "; + cl_kernel kernel; + kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optsStr.str().c_str()); + size_t wave_size = queryWaveFrontSize(kernel); + CV_Assert(clReleaseKernel(kernel) == CL_SUCCESS); + optsStr << "-D WAVE_SIZE=" << wave_size; + openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optsStr.str().c_str()); +}*/ static inline int calcSize(int octave, int layer) { @@ -96,223 +84,220 @@ static inline int calcSize(int octave, int layer) } -class SURF_OCL_Invoker +SURF_OCL::SURF_OCL() { -public: - // facilities - void bindImgTex(const oclMat &img, cl_mem &texture); - - //void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold); - //void loadOctaveConstants(int octave, int layer_rows, int layer_cols); + img_cols = img_rows = maxCandidates = maxFeatures = 0; + haveImageSupport = false; + status = -1; +} - // kernel callers declarations - void icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int layer_rows); +bool SURF_OCL::init(const SURF* p) +{ + params = p; + if(status < 0) + { + status = 0; + if(ocl::haveOpenCL()) + { + const ocl::Device& dev = ocl::Device::getDefault(); + if( dev.type() == ocl::Device::TYPE_CPU ) + return false; + haveImageSupport = dev.imageSupport(); + String opts = haveImageSupport ? "-D DISABLE_IMAGE2D" : ""; + + if( kerCalcDetTrace.create("SURF_calcLayerDetAndTrace", ocl::nonfree::surf_oclsrc, opts) && + kerFindMaxima.create("SURF_findMaximaInLayer", ocl::nonfree::surf_oclsrc, opts) && + kerFindMaximaMask.create("SURF_findMaximaInLayerWithMask", ocl::nonfree::surf_oclsrc, opts) && + kerInterp.create("SURF_interpolateKeypoint", ocl::nonfree::surf_oclsrc, opts) && + kerUpRight.create("SURF_setUpRight", ocl::nonfree::surf_oclsrc, opts) && + kerOri.create("SURF_calcOrientation", ocl::nonfree::surf_oclsrc, opts) && + kerCalcDesc64.create("SURF_computeDescriptors64", ocl::nonfree::surf_oclsrc, opts) && + kerCalcDesc128.create("SURF_computeDescriptors128", ocl::nonfree::surf_oclsrc, opts) && + kerNormDesc64.create("SURF_normalizeDescriptors64", ocl::nonfree::surf_oclsrc, opts) && + kerNormDesc128.create("SURF_normalizeDescriptors128", ocl::nonfree::surf_oclsrc, opts)) + status = 1; + } + } + return status > 0; +} - void icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset, - int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols); - void icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter, - oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures); +bool SURF_OCL::setImage(InputArray _img, InputArray _mask) +{ + if( status <= 0 ) + return false; + CV_Assert(!_img.empty() && _img.type() == CV_8UC1); + CV_Assert(_mask.empty() || (_mask.size() == _img.size() && _mask.type() == CV_8UC1)); + CV_Assert(params && params->nOctaves > 0 && params->nOctaveLayers > 0); + + int min_size = calcSize(params->nOctaves - 1, 0); + Size sz = _img.size(); + img_cols = sz.width; + img_rows = sz.height; + CV_Assert(img_rows >= min_size && img_cols >= min_size); + + const int layer_rows = img_rows >> (params->nOctaves - 1); + const int layer_cols = img_cols >> (params->nOctaves - 1); + const int min_margin = ((calcSize((params->nOctaves - 1), 2) >> 1) >> (params->nOctaves - 1)) + 1; + CV_Assert(layer_rows - 2 * min_margin > 0); + CV_Assert(layer_cols - 2 * min_margin > 0); + + maxFeatures = std::min(static_cast(img_cols*img_rows * 0.01f), 65535); + maxCandidates = std::min(static_cast(1.5 * maxFeatures), 65535); + + CV_Assert(maxFeatures > 0); + + counters.create(1, params->nOctaves + 1, CV_32SC1); + counters.setTo(Scalar::all(0)); + + img.release(); + if(_img.isUMat()) + img = _img.getUMat(); + else + _img.copyTo(img); - void icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures); + integral(img, sum); - void icvSetUpright_gpu(const oclMat &keypoints, int nFeatures); + if(haveImageSupport) + { + imgTex = ocl::Image2D(img); + sumTex = ocl::Image2D(sum); + } - void compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures); - // end of kernel callers declarations + maskSumTex = ocl::Image2D(); - SURF_OCL_Invoker(SURF_OCL &surf, const oclMat &img, const oclMat &mask) : - surf_(surf), - img_cols(img.cols), img_rows(img.rows), - use_mask(!mask.empty()), counters(oclMat()), - imgTex(NULL), sumTex(NULL), maskSumTex(NULL), _img(img) + if(!_mask.empty()) { - CV_Assert(!img.empty() && img.type() == CV_8UC1); - CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1)); - CV_Assert(surf_.nOctaves > 0 && surf_.nOctaveLayers > 0); - - const int min_size = calcSize(surf_.nOctaves - 1, 0); - CV_Assert(img_rows - min_size >= 0); - CV_Assert(img_cols - min_size >= 0); + CV_Error(Error::StsBadFunc, "Masked SURF detector is not implemented yet"); + } + return true; +} - const int layer_rows = img_rows >> (surf_.nOctaves - 1); - const int layer_cols = img_cols >> (surf_.nOctaves - 1); - const int min_margin = ((calcSize((surf_.nOctaves - 1), 2) >> 1) >> (surf_.nOctaves - 1)) + 1; - CV_Assert(layer_rows - 2 * min_margin > 0); - CV_Assert(layer_cols - 2 * min_margin > 0); - maxFeatures = std::min(static_cast(img.size().area() * surf.keypointsRatio), 65535); - maxCandidates = std::min(static_cast(1.5 * maxFeatures), 65535); +bool SURF_OCL::detectKeypoints(UMat &keypoints) +{ + // create image pyramid buffers + // different layers have same sized buffers, but they are sampled from Gaussian kernel. + det.create(img_rows * (params->nOctaveLayers + 2), img_cols, CV_32F); + trace.create(img_rows * (params->nOctaveLayers + 2), img_cols, CV_32FC1); - CV_Assert(maxFeatures > 0); + maxPosBuffer.create(1, maxCandidates, CV_32SC4); + keypoints.create(SURF_OCL::ROWS_COUNT, maxFeatures, CV_32F); + keypoints.setTo(Scalar::all(0)); + Mat cpuCounters; - counters.create(1, surf_.nOctaves + 1, CV_32SC1); - counters.setTo(Scalar::all(0)); + for (int octave = 0; octave < params->nOctaves; ++octave) + { + const int layer_rows = img_rows >> octave; + const int layer_cols = img_cols >> octave; - integral(img, surf_.sum); + if(!calcLayerDetAndTrace(det, trace, octave, layer_rows)) + return false; - bindImgTex(img, imgTex); - bindImgTex(surf_.sum, sumTex); - finish(); + if(!findMaximaInLayer(det, trace, maxPosBuffer, counters, 1 + octave, octave, + layer_rows, layer_cols)) + return false; - maskSumTex = 0; + cpuCounters = counters.getMat(ACCESS_READ); + int maxCounter = cpuCounters.at(1 + octave); + maxCounter = std::min(maxCounter, maxCandidates); + cpuCounters.release(); - if (use_mask) + if (maxCounter > 0) { - CV_Error(Error::StsBadFunc, "Masked SURF detector is not implemented yet"); - //!FIXME - // temp fix for missing min overload - //oclMat temp(mask.size(), mask.type()); - //temp.setTo(Scalar::all(1.0)); - ////cv::ocl::min(mask, temp, surf_.mask1); ///////// disable this - //integral(surf_.mask1, surf_.maskSum); - //bindImgTex(surf_.maskSum, maskSumTex); + if(!interpolateKeypoint(det, maxPosBuffer, maxCounter, keypoints, + counters, octave, layer_rows, maxFeatures)) + return false; } } - void detectKeypoints(oclMat &keypoints) - { - // create image pyramid buffers - // different layers have same sized buffers, but they are sampled from Gaussian kernel. - ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.det); - ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.trace); - - ensureSizeIsEnough(1, maxCandidates, CV_32SC4, surf_.maxPosBuffer); - ensureSizeIsEnough(SURF_OCL::ROWS_COUNT, maxFeatures, CV_32FC1, keypoints); - keypoints.setTo(Scalar::all(0)); - - for (int octave = 0; octave < surf_.nOctaves; ++octave) - { - const int layer_rows = img_rows >> octave; - const int layer_cols = img_cols >> octave; - - //loadOctaveConstants(octave, layer_rows, layer_cols); + cpuCounters = counters.getMat(ACCESS_READ); + int featureCounter = cpuCounters.at(0); + featureCounter = std::min(featureCounter, maxFeatures); + cpuCounters.release(); - icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, octave, surf_.nOctaveLayers, layer_rows); + keypoints = UMat(keypoints, Rect(0, 0, featureCounter, 1)); - icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer, counters, 1 + octave, - octave, use_mask, surf_.nOctaveLayers, layer_rows, layer_cols); - - int maxCounter = ((Mat)counters).at(1 + octave); - maxCounter = std::min(maxCounter, static_cast(maxCandidates)); + if (params->upright) + return setUpRight(keypoints); + else + return calcOrientation(keypoints); +} - if (maxCounter > 0) - { - icvInterpolateKeypoint_gpu(surf_.det, surf_.maxPosBuffer, maxCounter, - keypoints, counters, octave, layer_rows, maxFeatures); - } - } - int featureCounter = Mat(counters).at(0); - featureCounter = std::min(featureCounter, static_cast(maxFeatures)); - keypoints.cols = featureCounter; +bool SURF_OCL::setUpRight(UMat &keypoints) +{ + int nFeatures = keypoints.cols; + if( nFeatures == 0 ) + return true; - if (surf_.upright) - { - //keypoints.row(SURF_OCL::ANGLE_ROW).setTo(Scalar::all(90.0)); - setUpright(keypoints); - } - else - { - findOrientation(keypoints); - } - } + size_t globalThreads[3] = {nFeatures, 1}; + return kerUpRight.args(ocl::KernelArg::ReadWrite(keypoints)).run(2, globalThreads, 0, false); +} - void setUpright(oclMat &keypoints) +bool SURF_OCL::computeDescriptors(const UMat &keypoints, OutputArray _descriptors) +{ + int descriptorSize = params->descriptorSize(); + int nFeatures = keypoints.cols; + if (nFeatures == 0) { - const int nFeatures = keypoints.cols; - if(nFeatures > 0) - { - icvSetUpright_gpu(keypoints, keypoints.cols); - } + _descriptors.release(); + return true; } + _descriptors.create(nFeatures, descriptorSize, CV_32F); + UMat descriptors; + if( _descriptors.isUMat() ) + descriptors = _descriptors.getUMat(); + else + descriptors.create(nFeatures, descriptorSize, CV_32F); - void findOrientation(oclMat &keypoints) - { - const int nFeatures = keypoints.cols; - if (nFeatures > 0) - { - icvCalcOrientation_gpu(keypoints, nFeatures); - } - } + ocl::Kernel kerCalcDesc, kerNormDesc; - void computeDescriptors(const oclMat &keypoints, oclMat &descriptors, int descriptorSize) + if( descriptorSize == 64 ) { - const int nFeatures = keypoints.cols; - if (nFeatures > 0) - { - ensureSizeIsEnough(nFeatures, descriptorSize, CV_32F, descriptors); - compute_descriptors_gpu(descriptors, keypoints, nFeatures); - } + kerCalcDesc = kerCalcDesc64; + kerNormDesc = kerNormDesc64; } - - ~SURF_OCL_Invoker() + else { - if(imgTex) - openCLFree(imgTex); - if(sumTex) - openCLFree(sumTex); - if(maskSumTex) - openCLFree(maskSumTex); + CV_Assert(descriptorSize == 128); + kerCalcDesc = kerCalcDesc128; + kerNormDesc = kerNormDesc128; } -private: - SURF_OCL &surf_; - - int img_cols, img_rows; - - bool use_mask; - - int maxCandidates; - int maxFeatures; - - oclMat counters; + size_t localThreads[] = {6, 6}; + size_t globalThreads[] = {nFeatures*localThreads[0], localThreads[1]}; - // texture buffers - cl_mem imgTex; - cl_mem sumTex; - cl_mem maskSumTex; - - const oclMat _img; // make a copy for non-image2d_t supported platform - - SURF_OCL_Invoker &operator= (const SURF_OCL_Invoker &right) + if(haveImageSupport) + { + kerCalcDesc.args(imgTex, + ocl::KernelArg::ReadOnlyNoSize(keypoints), + ocl::KernelArg::WriteOnlyNoSize(descriptors)); + } + else { - (*this) = right; - return *this; - } // remove warning C4512 -}; + kerCalcDesc.args(ocl::KernelArg::ReadOnly(img), + ocl::KernelArg::ReadOnlyNoSize(keypoints), + ocl::KernelArg::WriteOnlyNoSize(descriptors)); + } -cv::ocl::SURF_OCL::SURF_OCL() -{ - hessianThreshold = 100.0f; - extended = true; - nOctaves = 4; - nOctaveLayers = 2; - keypointsRatio = 0.01f; - upright = false; -} + if(!kerCalcDesc.run(2, globalThreads, localThreads, false)) + return false; -cv::ocl::SURF_OCL::SURF_OCL(double _threshold, int _nOctaves, int _nOctaveLayers, bool _extended, float _keypointsRatio, bool _upright) -{ - hessianThreshold = saturate_cast(_threshold); - extended = _extended; - nOctaves = _nOctaves; - nOctaveLayers = _nOctaveLayers; - keypointsRatio = _keypointsRatio; - upright = _upright; -} + size_t localThreads_n[] = {descriptorSize, 1}; + size_t globalThreads_n[] = {nFeatures*localThreads_n[0], localThreads_n[1]}; -int cv::ocl::SURF_OCL::descriptorSize() const -{ - return extended ? 128 : 64; + globalThreads[0] = nFeatures * localThreads[0]; + globalThreads[1] = localThreads[1]; + bool ok = kerNormDesc.args(ocl::KernelArg::ReadWriteNoSize(descriptors)). + run(2, globalThreads_n, localThreads_n, false); + if(ok && !_descriptors.isUMat()) + descriptors.copyTo(_descriptors); + return ok; } -int cv::ocl::SURF_OCL::defaultNorm() const -{ - return NORM_L2; -} -void cv::ocl::SURF_OCL::uploadKeypoints(const std::vector &keypoints, oclMat &keypointsGPU) +void SURF_OCL::uploadKeypoints(const std::vector &keypoints, UMat &keypointsGPU) { if (keypoints.empty()) keypointsGPU.release(); @@ -340,11 +325,11 @@ void cv::ocl::SURF_OCL::uploadKeypoints(const std::vector &keypoints, kp_laplacian[i] = 1; } - keypointsGPU.upload(keypointsCPU); + keypointsCPU.copyTo(keypointsGPU); } } -void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat &keypointsGPU, std::vector &keypoints) +void SURF_OCL::downloadKeypoints(const UMat &keypointsGPU, std::vector &keypoints) { const int nFeatures = keypointsGPU.cols; @@ -354,8 +339,7 @@ void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat &keypointsGPU, std::vecto { CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == ROWS_COUNT); - Mat keypointsCPU(keypointsGPU); - + Mat keypointsCPU = keypointsGPU.getMat(ACCESS_READ); keypoints.resize(nFeatures); float *kp_x = keypointsCPU.ptr(SURF_OCL::X_ROW); @@ -380,354 +364,154 @@ void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat &keypointsGPU, std::vecto } } -void cv::ocl::SURF_OCL::downloadDescriptors(const oclMat &descriptorsGPU, std::vector &descriptors) +bool SURF_OCL::detect(InputArray img, InputArray mask, UMat& keypoints) { - if (descriptorsGPU.empty()) - descriptors.clear(); - else - { - CV_Assert(descriptorsGPU.type() == CV_32F); + if( !setImage(img, mask) ) + return false; - descriptors.resize(descriptorsGPU.rows * descriptorsGPU.cols); - Mat descriptorsCPU(descriptorsGPU.size(), CV_32F, &descriptors[0]); - descriptorsGPU.download(descriptorsCPU); - } + return detectKeypoints(keypoints); } -void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints) -{ - if (!img.empty()) - { - SURF_OCL_Invoker surf(*this, img, mask); - - surf.detectKeypoints(keypoints); - } -} -void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints, oclMat &descriptors, - bool useProvidedKeypoints) +bool SURF_OCL::detectAndCompute(InputArray img, InputArray mask, UMat& keypoints, + OutputArray _descriptors, bool useProvidedKeypoints ) { - if (!img.empty()) - { - SURF_OCL_Invoker surf(*this, img, mask); + if( !setImage(img, mask) ) + return false; - if (!useProvidedKeypoints) - surf.detectKeypoints(keypoints); - else if (!upright) - { - surf.findOrientation(keypoints); - } + if( !useProvidedKeypoints && !detectKeypoints(keypoints) ) + return false; - surf.computeDescriptors(keypoints, descriptors, descriptorSize()); - } + return computeDescriptors(keypoints, _descriptors); } -void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, std::vector &keypoints) -{ - oclMat keypointsGPU; - - (*this)(img, mask, keypointsGPU); - - downloadKeypoints(keypointsGPU, keypoints); -} - -void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, std::vector &keypoints, - oclMat &descriptors, bool useProvidedKeypoints) -{ - oclMat keypointsGPU; - - if (useProvidedKeypoints) - uploadKeypoints(keypoints, keypointsGPU); - - (*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints); - - downloadKeypoints(keypointsGPU, keypoints); -} - -void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, std::vector &keypoints, - std::vector &descriptors, bool useProvidedKeypoints) -{ - oclMat descriptorsGPU; - - (*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints); - - downloadDescriptors(descriptorsGPU, descriptors); -} - -void cv::ocl::SURF_OCL::releaseMemory() -{ - sum.release(); - mask1.release(); - maskSum.release(); - intBuffer.release(); - det.release(); - trace.release(); - maxPosBuffer.release(); -} - - -// bind source buffer to image oject. -void SURF_OCL_Invoker::bindImgTex(const oclMat &img, cl_mem &texture) -{ - if(texture) - { - openCLFree(texture); - } - texture = bindTexture(img); -} +inline int divUp(int a, int b) { return (a + b-1)/b; } //////////////////////////// // kernel caller definitions -void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int c_layer_rows) +bool SURF_OCL::calcLayerDetAndTrace(UMat &det, UMat &trace, int octave, int c_layer_rows) { + int nOctaveLayers = params->nOctaveLayers; const int min_size = calcSize(octave, 0); const int max_samples_i = 1 + ((img_rows - min_size) >> octave); const int max_samples_j = 1 + ((img_cols - min_size) >> octave); - Context *clCxt = det.clCxt; - String kernelName = "icvCalcLayerDetAndTrace"; + String kernelName = "SURF_calcLayerDetAndTrace"; std::vector< std::pair > args; - if(sumTex) + size_t localThreads[3] = {16, 16}; + size_t globalThreads[3] = { - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex)); + divUp(max_samples_j, localThreads[0]) *localThreads[0], + divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2) + }; + if(haveImageSupport) + { + kerCalcDetTrace.args(sumTex, + img_rows, img_cols, nOctaveLayers, + octave, c_layer_rows, + ocl::KernelArg::WriteOnlyNoSize(det), + ocl::KernelArg::WriteOnlyNoSize(trace)); } else { - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported + kerCalcDetTrace.args(ocl::KernelArg::ReadOnlyNoSize(sum), + img_rows, img_cols, nOctaveLayers, + octave, c_layer_rows, + ocl::KernelArg::WriteOnlyNoSize(det), + ocl::KernelArg::WriteOnlyNoSize(trace)); } - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data)); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trace.data)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&trace.step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&nOctaveLayers)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&c_layer_rows)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step)); - - size_t localThreads[3] = {16, 16, 1}; - size_t globalThreads[3] = - { - divUp(max_samples_j, localThreads[0]) *localThreads[0], - divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2), - 1 - }; - openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1); + return kerCalcDetTrace.run(2, globalThreads, localThreads, false); } -void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset, - int octave, bool useMask, int nLayers, int layer_rows, int layer_cols) +bool SURF_OCL::findMaximaInLayer(const UMat &det, const UMat &trace, + UMat &maxPosBuffer, UMat &maxCounter, + int counterOffset, int octave, + int layer_rows, int layer_cols) { const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1; + bool haveMask = !maskSum.empty() || (maskSumTex.ptr() != 0); + int nOctaveLayers = params->nOctaveLayers; - Context *clCxt = det.clCxt; - String kernelName = use_mask ? "icvFindMaximaInLayer_withmask" : "icvFindMaximaInLayer"; - std::vector< std::pair > args; - - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data)); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trace.data)); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data)); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maxCounter.data)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&counterOffset)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&trace.step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&nLayers)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&layer_rows)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&layer_cols)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&maxCandidates)); - args.push_back( std::make_pair( sizeof(cl_float), (void *)&surf_.hessianThreshold)); - - if(useMask) + ocl::Kernel ker; + if( haveMask ) { - if(maskSumTex) - { - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maskSumTex)); - } + if( haveImageSupport ) + ker = kerFindMaximaMask.args(maskSumTex, + ocl::KernelArg::ReadOnlyNoSize(det), + ocl::KernelArg::ReadOnlyNoSize(trace), + ocl::KernelArg::PtrReadWrite(maxPosBuffer), + ocl::KernelArg::PtrReadWrite(maxCounter), + counterOffset, img_rows, img_cols, + octave, nOctaveLayers, + layer_rows, layer_cols, + maxCandidates, + (float)params->hessianThreshold); else - { - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.data)); - } - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.step)); - } - size_t localThreads[3] = {16, 16, 1}; - size_t globalThreads[3] = {divUp(layer_cols - 2 * min_margin, localThreads[0] - 2) *localThreads[0], - divUp(layer_rows - 2 * min_margin, localThreads[1] - 2) *nLayers *localThreads[1], - 1 - }; - - openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1); -} - -void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter, - oclMat &keypoints, oclMat &counters_, int octave, int layer_rows, int max_features) -{ - Context *clCxt = det.clCxt; - String kernelName = "icvInterpolateKeypoint"; - std::vector< std::pair > args; - - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data)); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data)); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data)); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counters_.data)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&layer_rows)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&max_features)); - - size_t localThreads[3] = {3, 3, 3}; - size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1}; - - openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1); -} - -void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures) -{ - Context *clCxt = counters.clCxt; - String kernelName = "icvCalcOrientation"; - - std::vector< std::pair > args; - - if(sumTex) - { - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex)); + ker = kerFindMaximaMask.args(ocl::KernelArg::ReadOnlyNoSize(maskSum), + ocl::KernelArg::ReadOnlyNoSize(det), + ocl::KernelArg::ReadOnlyNoSize(trace), + ocl::KernelArg::PtrReadWrite(maxPosBuffer), + ocl::KernelArg::PtrReadWrite(maxCounter), + counterOffset, img_rows, img_cols, + octave, nOctaveLayers, + layer_rows, layer_cols, + maxCandidates, + (float)params->hessianThreshold); } else { - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported + ker = kerFindMaxima.args(ocl::KernelArg::ReadOnlyNoSize(det), + ocl::KernelArg::ReadOnlyNoSize(trace), + ocl::KernelArg::PtrReadWrite(maxPosBuffer), + ocl::KernelArg::PtrReadWrite(maxCounter), + counterOffset, img_rows, img_cols, + octave, nOctaveLayers, + layer_rows, layer_cols, + maxCandidates, + (float)params->hessianThreshold); } - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step)); - - size_t localThreads[3] = {ORI_LOCAL_SIZE, 1, 1}; - size_t globalThreads[3] = {nFeatures * localThreads[0], 1, 1}; + size_t localThreads[3] = {16, 16}; + size_t globalThreads[3] = + { + divUp(layer_cols - 2 * min_margin, localThreads[0] - 2) *localThreads[0], + divUp(layer_rows - 2 * min_margin, localThreads[1] - 2) *nOctaveLayers *localThreads[1] + }; - openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1); + return ker.run(2, globalThreads, localThreads, false); } -void SURF_OCL_Invoker::icvSetUpright_gpu(const oclMat &keypoints, int nFeatures) +bool SURF_OCL::interpolateKeypoint(const UMat &det, const UMat &maxPosBuffer, int maxCounter, + UMat &keypoints, UMat &counters_, int octave, int layer_rows, int max_features) { - Context *clCxt = counters.clCxt; - String kernelName = "icvSetUpright"; - - std::vector< std::pair > args; - - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&nFeatures)); - - size_t localThreads[3] = {256, 1, 1}; - size_t globalThreads[3] = {saturate_cast(nFeatures), 1, 1}; - - openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1); + size_t localThreads[3] = {3, 3, 3}; + size_t globalThreads[3] = {maxCounter*localThreads[0], localThreads[1], 3}; + + return kerInterp.args(ocl::KernelArg::ReadOnlyNoSize(det), + ocl::KernelArg::PtrReadOnly(maxPosBuffer), + ocl::KernelArg::ReadWriteNoSize(keypoints), + ocl::KernelArg::PtrReadWrite(counters_), + img_rows, img_cols, octave, layer_rows, max_features). + run(3, globalThreads, localThreads, false); } - -void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures) +bool SURF_OCL::calcOrientation(UMat &keypoints) { - // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D - Context *clCxt = descriptors.clCxt; - String kernelName; - std::vector< std::pair > args; - size_t localThreads[3] = {1, 1, 1}; - size_t globalThreads[3] = {1, 1, 1}; - - if(descriptors.cols == 64) - { - kernelName = "compute_descriptors64"; - - localThreads[0] = 6; - localThreads[1] = 6; - - globalThreads[0] = nFeatures * localThreads[0]; - globalThreads[1] = 16 * localThreads[1]; - - args.clear(); - if(imgTex) - { - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex)); - } - else - { - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&_img.data)); - } - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data)); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step)); - - openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1); - - kernelName = "normalize_descriptors64"; - - localThreads[0] = 64; - localThreads[1] = 1; - - globalThreads[0] = nFeatures * localThreads[0]; - globalThreads[1] = localThreads[1]; - - args.clear(); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step)); - - openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1); - } + int nFeatures = keypoints.cols; + if( nFeatures == 0 ) + return true; + if( haveImageSupport ) + kerOri.args(sumTex, + ocl::KernelArg::ReadWriteNoSize(keypoints), + img_rows, img_cols); else - { - kernelName = "compute_descriptors128"; - - localThreads[0] = 6; - localThreads[1] = 6; + kerOri.args(ocl::KernelArg::ReadOnlyNoSize(sum), + ocl::KernelArg::ReadWriteNoSize(keypoints), + img_rows, img_cols); - globalThreads[0] = nFeatures * localThreads[0]; - globalThreads[1] = 16 * localThreads[1]; - - args.clear(); - if(imgTex) - { - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex)); - } - else - { - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&_img.data)); - } - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data)); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step)); - - openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1); - - kernelName = "normalize_descriptors128"; - - localThreads[0] = 128; - localThreads[1] = 1; - - globalThreads[0] = nFeatures * localThreads[0]; - globalThreads[1] = localThreads[1]; - - args.clear(); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step)); - - openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1); - } + size_t localThreads[3] = {ORI_LOCAL_SIZE, 1}; + size_t globalThreads[3] = {nFeatures * localThreads[0], 1}; + return kerOri.run(2, globalThreads, localThreads, false); } -#endif //HAVE_OPENCV_OCL +} -- 2.7.4