endif()
set(the_description "Functionality with possible limitations on the use")
-ocv_add_module(nonfree opencv_imgproc opencv_features2d opencv_calib3d OPTIONAL opencv_gpu)
+ocv_add_module(nonfree opencv_imgproc opencv_features2d opencv_calib3d OPTIONAL opencv_gpu opencv_ocl)
ocv_module_include_directories()
if(HAVE_CUDA AND HAVE_opencv_gpu)
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_NONFREE_OCL_HPP__
+#define __OPENCV_NONFREE_OCL_HPP__
+
+#include "opencv2/ocl/ocl.hpp"
+
+namespace cv
+{
+ namespace ocl
+ {
+ //! Speeded up robust features, port from GPU module.
+ ////////////////////////////////// SURF //////////////////////////////////////////
+
+ class CV_EXPORTS SURF_OCL
+ {
+ public:
+ enum KeypointLayout
+ {
+ X_ROW = 0,
+ Y_ROW,
+ LAPLACIAN_ROW,
+ OCTAVE_ROW,
+ SIZE_ROW,
+ ANGLE_ROW,
+ HESSIAN_ROW,
+ ROWS_COUNT
+ };
+
+ //! the default constructor
+ SURF_OCL();
+ //! the full constructor taking all the necessary parameters
+ explicit SURF_OCL(double _hessianThreshold, int _nOctaves = 4,
+ int _nOctaveLayers = 2, bool _extended = false, float _keypointsRatio = 0.01f, bool _upright = false);
+
+ //! returns the descriptor size in float's (64 or 128)
+ int descriptorSize() const;
+ //! upload host keypoints to device memory
+ void uploadKeypoints(const vector<cv::KeyPoint> &keypoints, oclMat &keypointsocl);
+ //! download keypoints from device to host memory
+ void downloadKeypoints(const oclMat &keypointsocl, vector<KeyPoint> &keypoints);
+ //! download descriptors from device to host memory
+ void downloadDescriptors(const oclMat &descriptorsocl, vector<float> &descriptors);
+ //! finds the keypoints using fast hessian detector used in SURF
+ //! supports CV_8UC1 images
+ //! keypoints will have nFeature cols and 6 rows
+ //! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature
+ //! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature
+ //! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature
+ //! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature
+ //! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature
+ //! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature
+ //! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature
+ void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints);
+ //! finds the keypoints and computes their descriptors.
+ //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
+ void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints, oclMat &descriptors,
+ bool useProvidedKeypoints = false);
+ void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints);
+ void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints, oclMat &descriptors,
+ bool useProvidedKeypoints = false);
+ void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints, std::vector<float> &descriptors,
+ bool useProvidedKeypoints = false);
+
+ void releaseMemory();
+
+ // SURF parameters
+ float hessianThreshold;
+ int nOctaves;
+ int nOctaveLayers;
+ bool extended;
+ bool upright;
+ //! max keypoints = min(keypointsRatio * img.size().area(), 65535)
+ float keypointsRatio;
+ oclMat sum, mask1, maskSum, intBuffer;
+ oclMat det, trace;
+ oclMat maxPosBuffer;
+
+ };
+ }
+}
+
+#endif __OPENCV_NONFREE_OCL_HPP__
\ No newline at end of file
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+// Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other oclMaterials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+
+// specialized for non-image2d_t supported platform, intel HD4000, for example
+#ifdef DISABLE_IMAGE2D
+#define IMAGE_INT32 __global uint *
+#define IMAGE_INT8 __global uchar *
+#else
+#define IMAGE_INT32 image2d_t
+#define IMAGE_INT8 image2d_t
+#endif
+
+uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols, int elemPerRow)
+{
+#ifdef DISABLE_IMAGE2D
+ int x = clamp(coord.x, 0, cols);
+ int y = clamp(coord.y, 0, rows);
+ return img[elemPerRow * y + x];
+#else
+ return read_imageui(img, sam, coord).x;
+#endif
+}
+uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow)
+{
+#ifdef DISABLE_IMAGE2D
+ int x = clamp(convert_int_rte(coord.x), 0, cols - 1);
+ int y = clamp(convert_int_rte(coord.y), 0, rows - 1);
+ return img[elemPerRow * y + x];
+#else
+ return (uchar)read_imageui(img, sam, coord).x;
+#endif
+}
+
+// dynamically change the precision used for floating type
+
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#define F double
+#else
+#define F float
+#endif
+
+// Image read mode
+__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
+
+#ifndef FLT_EPSILON
+#define FLT_EPSILON (1e-15)
+#endif
+
+#ifndef CV_PI_F
+#define CV_PI_F 3.14159265f
+#endif
+
+// Use integral image to calculate haar wavelets.
+// N = 2
+// for simple haar paatern
+float icvCalcHaarPatternSum_2(
+ IMAGE_INT32 sumTex,
+ __constant float src[2][5],
+ int oldSize,
+ int newSize,
+ int y, int x,
+ int rows, int cols, int elemPerRow)
+{
+
+ float ratio = (float)newSize / oldSize;
+
+ F d = 0;
+
+#pragma unroll
+ for (int k = 0; k < 2; ++k)
+ {
+ int dx1 = convert_int_rte(ratio * src[k][0]);
+ int dy1 = convert_int_rte(ratio * src[k][1]);
+ int dx2 = convert_int_rte(ratio * src[k][2]);
+ int dy2 = convert_int_rte(ratio * src[k][3]);
+
+ F t = 0;
+ t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
+ t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
+ t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
+ t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
+ d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
+ }
+
+ return (float)d;
+}
+
+// N = 3
+float icvCalcHaarPatternSum_3(
+ IMAGE_INT32 sumTex,
+ __constant float src[2][5],
+ int oldSize,
+ int newSize,
+ int y, int x,
+ int rows, int cols, int elemPerRow)
+{
+
+ float ratio = (float)newSize / oldSize;
+
+ F d = 0;
+
+#pragma unroll
+ for (int k = 0; k < 3; ++k)
+ {
+ int dx1 = convert_int_rte(ratio * src[k][0]);
+ int dy1 = convert_int_rte(ratio * src[k][1]);
+ int dx2 = convert_int_rte(ratio * src[k][2]);
+ int dy2 = convert_int_rte(ratio * src[k][3]);
+
+ F t = 0;
+ t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
+ t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
+ t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
+ t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
+ d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
+ }
+
+ return (float)d;
+}
+
+// N = 4
+float icvCalcHaarPatternSum_4(
+ IMAGE_INT32 sumTex,
+ __constant float src[2][5],
+ int oldSize,
+ int newSize,
+ int y, int x,
+ int rows, int cols, int elemPerRow)
+{
+
+ float ratio = (float)newSize / oldSize;
+
+ F d = 0;
+
+#pragma unroll
+ for (int k = 0; k < 4; ++k)
+ {
+ int dx1 = convert_int_rte(ratio * src[k][0]);
+ int dy1 = convert_int_rte(ratio * src[k][1]);
+ int dx2 = convert_int_rte(ratio * src[k][2]);
+ int dy2 = convert_int_rte(ratio * src[k][3]);
+
+ F t = 0;
+ t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
+ t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
+ t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
+ t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
+ d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
+ }
+
+ return (float)d;
+}
+
+////////////////////////////////////////////////////////////////////////
+// Hessian
+
+__constant float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };
+__constant float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };
+__constant float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };
+
+__inline int calcSize(int octave, int layer)
+{
+ /* Wavelet size at first layer of first octave. */
+ const int HAAR_SIZE0 = 9;
+
+ /* Wavelet size increment between layers. This should be an even number,
+ such that the wavelet sizes in an octave are either all even or all odd.
+ This ensures that when looking for the neighbours of a sample, the layers
+ above and below are aligned correctly. */
+ const int HAAR_SIZE_INC = 6;
+
+ return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
+}
+
+
+//calculate targeted layer per-pixel determinant and trace with an integral image
+__kernel void icvCalcLayerDetAndTrace(
+ IMAGE_INT32 sumTex, // input integral image
+ __global float * det, // output Determinant
+ __global float * trace, // output trace
+ int det_step, // the step of det in bytes
+ int trace_step, // the step of trace in bytes
+ int c_img_rows,
+ int c_img_cols,
+ int c_nOctaveLayers,
+ int c_octave,
+ int c_layer_rows,
+ int sumTex_step
+ )
+{
+ det_step /= sizeof(*det);
+ trace_step /= sizeof(*trace);
+ sumTex_step/= sizeof(uint);
+ // Determine the indices
+ const int gridDim_y = get_num_groups(1) / (c_nOctaveLayers + 2);
+ const int blockIdx_y = get_group_id(1) % gridDim_y;
+ const int blockIdx_z = get_group_id(1) / gridDim_y;
+
+ const int j = get_local_id(0) + get_group_id(0) * get_local_size(0);
+ const int i = get_local_id(1) + blockIdx_y * get_local_size(1);
+ const int layer = blockIdx_z;
+
+ const int size = calcSize(c_octave, layer);
+
+ const int samples_i = 1 + ((c_img_rows - size) >> c_octave);
+ const int samples_j = 1 + ((c_img_cols - size) >> c_octave);
+
+ // Ignore pixels where some of the kernel is outside the image
+ const int margin = (size >> 1) >> c_octave;
+
+ if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
+ {
+ const float dx = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
+ const float dy = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
+ const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
+
+ det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
+ trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
+ }
+}
+
+
+////////////////////////////////////////////////////////////////////////
+// NONMAX
+
+__constant float c_DM[5] = {0, 0, 9, 9, 1};
+
+bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int rows, int cols, int step)
+{
+ float ratio = (float)size / 9.0f;
+
+ float d = 0;
+
+ int dx1 = convert_int_rte(ratio * c_DM[0]);
+ int dy1 = convert_int_rte(ratio * c_DM[1]);
+ int dx2 = convert_int_rte(ratio * c_DM[2]);
+ int dy2 = convert_int_rte(ratio * c_DM[3]);
+
+ float t = 0;
+
+ t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1), rows, cols, step);
+ t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2), rows, cols, step);
+ t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1), rows, cols, step);
+ t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2), rows, cols, step);
+
+ d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
+
+ return (d >= 0.5f);
+}
+
+// Non-maximal suppression to further filtering the candidates from previous step
+__kernel
+ void icvFindMaximaInLayer_withmask(
+ __global const float * det,
+ __global const float * trace,
+ __global int4 * maxPosBuffer,
+ volatile __global int* maxCounter,
+ int counter_offset,
+ int det_step, // the step of det in bytes
+ int trace_step, // the step of trace in bytes
+ int c_img_rows,
+ int c_img_cols,
+ int c_nOctaveLayers,
+ int c_octave,
+ int c_layer_rows,
+ int c_layer_cols,
+ int c_max_candidates,
+ float c_hessianThreshold,
+ IMAGE_INT32 maskSumTex,
+ int mask_step
+ )
+{
+ volatile __local float N9[768]; // threads.x * threads.y * 3
+
+ det_step /= sizeof(*det);
+ trace_step /= sizeof(*trace);
+ maxCounter += counter_offset;
+ mask_step /= sizeof(uint);
+
+ // Determine the indices
+ const int gridDim_y = get_num_groups(1) / c_nOctaveLayers;
+ const int blockIdx_y = get_group_id(1) % gridDim_y;
+ const int blockIdx_z = get_group_id(1) / gridDim_y;
+
+ const int layer = blockIdx_z + 1;
+
+ const int size = calcSize(c_octave, layer);
+
+ // Ignore pixels without a 3x3x3 neighbourhood in the layer above
+ const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
+
+ const int j = get_local_id(0) + get_group_id(0) * (get_local_size(0) - 2) + margin - 1;
+ const int i = get_local_id(1) + blockIdx_y * (get_local_size(1) - 2) + margin - 1;
+
+ // Is this thread within the hessian buffer?
+ const int zoff = get_local_size(0) * get_local_size(1);
+ const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
+ N9[localLin - zoff] =
+ det[det_step *
+ (c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y
+ + min(max(j, 0), c_img_cols - 1)]; // x
+ N9[localLin ] =
+ det[det_step *
+ (c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y
+ + min(max(j, 0), c_img_cols - 1)]; // x
+ N9[localLin + zoff] =
+ det[det_step *
+ (c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y
+ + min(max(j, 0), c_img_cols - 1)]; // x
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (i < c_layer_rows - margin
+ && j < c_layer_cols - margin
+ && get_local_id(0) > 0
+ && get_local_id(0) < get_local_size(0) - 1
+ && get_local_id(1) > 0
+ && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
+ )
+ {
+ float val0 = N9[localLin];
+
+ if (val0 > c_hessianThreshold)
+ {
+ // Coordinates for the start of the wavelet in the sum image. There
+ // is some integer division involved, so don't try to simplify this
+ // (cancel out sampleStep) without checking the result is the same
+ const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
+ const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
+
+ if (within_check(maskSumTex, sum_i, sum_j, size, c_img_rows, c_img_cols, mask_step))
+ {
+ // Check to see if we have a max (in its 26 neighbours)
+ const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff]
+ && val0 > N9[localLin - get_local_size(0) - zoff]
+ && val0 > N9[localLin + 1 - get_local_size(0) - zoff]
+ && val0 > N9[localLin - 1 - zoff]
+ && val0 > N9[localLin - zoff]
+ && val0 > N9[localLin + 1 - zoff]
+ && val0 > N9[localLin - 1 + get_local_size(0) - zoff]
+ && val0 > N9[localLin + get_local_size(0) - zoff]
+ && val0 > N9[localLin + 1 + get_local_size(0) - zoff]
+
+ && val0 > N9[localLin - 1 - get_local_size(0)]
+ && val0 > N9[localLin - get_local_size(0)]
+ && val0 > N9[localLin + 1 - get_local_size(0)]
+ && val0 > N9[localLin - 1 ]
+ && val0 > N9[localLin + 1 ]
+ && val0 > N9[localLin - 1 + get_local_size(0)]
+ && val0 > N9[localLin + get_local_size(0)]
+ && val0 > N9[localLin + 1 + get_local_size(0)]
+
+ && val0 > N9[localLin - 1 - get_local_size(0) + zoff]
+ && val0 > N9[localLin - get_local_size(0) + zoff]
+ && val0 > N9[localLin + 1 - get_local_size(0) + zoff]
+ && val0 > N9[localLin - 1 + zoff]
+ && val0 > N9[localLin + zoff]
+ && val0 > N9[localLin + 1 + zoff]
+ && val0 > N9[localLin - 1 + get_local_size(0) + zoff]
+ && val0 > N9[localLin + get_local_size(0) + zoff]
+ && val0 > N9[localLin + 1 + get_local_size(0) + zoff]
+ ;
+
+ if(condmax)
+ {
+ int ind = atomic_inc(maxCounter);
+
+ if (ind < c_max_candidates)
+ {
+ const int laplacian = (int) copysign(1.0f, trace[trace_step* (layer * c_layer_rows + i) + j]);
+
+ maxPosBuffer[ind] = (int4)(j, i, layer, laplacian);
+ }
+ }
+ }
+ }
+ }
+}
+
+__kernel
+ void icvFindMaximaInLayer(
+ __global float * det,
+ __global float * trace,
+ __global int4 * maxPosBuffer,
+ volatile __global int* maxCounter,
+ int counter_offset,
+ int det_step, // the step of det in bytes
+ int trace_step, // the step of trace in bytes
+ int c_img_rows,
+ int c_img_cols,
+ int c_nOctaveLayers,
+ int c_octave,
+ int c_layer_rows,
+ int c_layer_cols,
+ int c_max_candidates,
+ float c_hessianThreshold
+ )
+{
+ volatile __local float N9[768]; // threads.x * threads.y * 3
+
+ det_step /= sizeof(float);
+ trace_step /= sizeof(float);
+ maxCounter += counter_offset;
+
+ // Determine the indices
+ const int gridDim_y = get_num_groups(1) / c_nOctaveLayers;
+ const int blockIdx_y = get_group_id(1) % gridDim_y;
+ const int blockIdx_z = get_group_id(1) / gridDim_y;
+
+ const int layer = blockIdx_z + 1;
+
+ const int size = calcSize(c_octave, layer);
+
+ // Ignore pixels without a 3x3x3 neighbourhood in the layer above
+ const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
+
+ const int j = get_local_id(0) + get_group_id(0) * (get_local_size(0) - 2) + margin - 1;
+ const int i = get_local_id(1) + blockIdx_y * (get_local_size(1) - 2) + margin - 1;
+
+ // Is this thread within the hessian buffer?
+ const int zoff = get_local_size(0) * get_local_size(1);
+ const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
+
+ int l_x = min(max(j, 0), c_img_cols - 1);
+ int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1);
+
+ N9[localLin - zoff] =
+ det[det_step * (l_y - c_layer_rows) + l_x];
+ N9[localLin ] =
+ det[det_step * (l_y ) + l_x];
+ N9[localLin + zoff] =
+ det[det_step * (l_y + c_layer_rows) + l_x];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (i < c_layer_rows - margin
+ && j < c_layer_cols - margin
+ && get_local_id(0) > 0
+ && get_local_id(0) < get_local_size(0) - 1
+ && get_local_id(1) > 0
+ && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
+ )
+ {
+ float val0 = N9[localLin];
+ if (val0 > c_hessianThreshold)
+ {
+ // Coordinates for the start of the wavelet in the sum image. There
+ // is some integer division involved, so don't try to simplify this
+ // (cancel out sampleStep) without checking the result is the same
+
+ // Check to see if we have a max (in its 26 neighbours)
+ const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff]
+ && val0 > N9[localLin - get_local_size(0) - zoff]
+ && val0 > N9[localLin + 1 - get_local_size(0) - zoff]
+ && val0 > N9[localLin - 1 - zoff]
+ && val0 > N9[localLin - zoff]
+ && val0 > N9[localLin + 1 - zoff]
+ && val0 > N9[localLin - 1 + get_local_size(0) - zoff]
+ && val0 > N9[localLin + get_local_size(0) - zoff]
+ && val0 > N9[localLin + 1 + get_local_size(0) - zoff]
+
+ && val0 > N9[localLin - 1 - get_local_size(0)]
+ && val0 > N9[localLin - get_local_size(0)]
+ && val0 > N9[localLin + 1 - get_local_size(0)]
+ && val0 > N9[localLin - 1 ]
+ && val0 > N9[localLin + 1 ]
+ && val0 > N9[localLin - 1 + get_local_size(0)]
+ && val0 > N9[localLin + get_local_size(0)]
+ && val0 > N9[localLin + 1 + get_local_size(0)]
+
+ && val0 > N9[localLin - 1 - get_local_size(0) + zoff]
+ && val0 > N9[localLin - get_local_size(0) + zoff]
+ && val0 > N9[localLin + 1 - get_local_size(0) + zoff]
+ && val0 > N9[localLin - 1 + zoff]
+ && val0 > N9[localLin + zoff]
+ && val0 > N9[localLin + 1 + zoff]
+ && val0 > N9[localLin - 1 + get_local_size(0) + zoff]
+ && val0 > N9[localLin + get_local_size(0) + zoff]
+ && val0 > N9[localLin + 1 + get_local_size(0) + zoff]
+ ;
+
+ if(condmax)
+ {
+ int ind = atomic_inc(maxCounter);
+
+ if (ind < c_max_candidates)
+ {
+ const int laplacian = (int) copysign(1.0f, trace[trace_step* (layer * c_layer_rows + i) + j]);
+
+ maxPosBuffer[ind] = (int4)(j, i, layer, laplacian);
+ }
+ }
+ }
+ }
+}
+
+// solve 3x3 linear system Ax=b for floating point input
+inline bool solve3x3_float(volatile __local const float A[3][3], volatile __local const float b[3], volatile __local float x[3])
+{
+ float det = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1])
+ - A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0])
+ + A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]);
+
+ if (det != 0)
+ {
+ F invdet = 1.0 / det;
+
+ x[0] = invdet *
+ (b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
+ A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +
+ A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] ));
+
+ x[1] = invdet *
+ (A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -
+ b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
+ A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0]));
+
+ x[2] = invdet *
+ (A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -
+ A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +
+ b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]));
+
+ return true;
+ }
+ return false;
+}
+
+#define X_ROW 0
+#define Y_ROW 1
+#define LAPLACIAN_ROW 2
+#define OCTAVE_ROW 3
+#define SIZE_ROW 4
+#define ANGLE_ROW 5
+#define HESSIAN_ROW 6
+#define ROWS_COUNT 7
+
+////////////////////////////////////////////////////////////////////////
+// INTERPOLATION
+__kernel
+ void icvInterpolateKeypoint(
+ __global const float * det,
+ __global const int4 * maxPosBuffer,
+ __global float * keypoints,
+ volatile __global int * featureCounter,
+ int det_step,
+ int keypoints_step,
+ int c_img_rows,
+ int c_img_cols,
+ int c_octave,
+ int c_layer_rows,
+ int c_max_features
+ )
+{
+ det_step /= sizeof(*det);
+ keypoints_step /= sizeof(*keypoints);
+ __global float * featureX = keypoints + X_ROW * keypoints_step;
+ __global float * featureY = keypoints + Y_ROW * keypoints_step;
+ __global int * featureLaplacian = (__global int *)keypoints + LAPLACIAN_ROW * keypoints_step;
+ __global int * featureOctave = (__global int *)keypoints + OCTAVE_ROW * keypoints_step;
+ __global float * featureSize = keypoints + SIZE_ROW * keypoints_step;
+ __global float * featureHessian = keypoints + HESSIAN_ROW * keypoints_step;
+
+ const int4 maxPos = maxPosBuffer[get_group_id(0)];
+
+ const int j = maxPos.x - 1 + get_local_id(0);
+ const int i = maxPos.y - 1 + get_local_id(1);
+ const int layer = maxPos.z - 1 + get_local_id(2);
+
+ volatile __local float N9[3][3][3];
+
+ N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
+ det[det_step * (c_layer_rows * layer + i) + j];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (get_local_id(0) == 0 && get_local_id(1) == 0 && get_local_id(2) == 0)
+ {
+ volatile __local float dD[3];
+
+ //dx
+ dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);
+ //dy
+ dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);
+ //ds
+ dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);
+
+ volatile __local float H[3][3];
+
+ //dxx
+ H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
+ //dxy
+ H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);
+ //dxs
+ H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);
+ //dyx = dxy
+ H[1][0] = H[0][1];
+ //dyy
+ H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];
+ //dys
+ H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);
+ //dsx = dxs
+ H[2][0] = H[0][2];
+ //dsy = dys
+ H[2][1] = H[1][2];
+ //dss
+ H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
+
+ volatile __local float x[3];
+
+ if (solve3x3_float(H, dD, x))
+ {
+ if (fabs(x[0]) <= 1.f && fabs(x[1]) <= 1.f && fabs(x[2]) <= 1.f)
+ {
+ // if the step is within the interpolation region, perform it
+
+ const int size = calcSize(c_octave, maxPos.z);
+
+ const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;
+ const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;
+
+ const float center_i = sum_i + (float)(size - 1) / 2;
+ const float center_j = sum_j + (float)(size - 1) / 2;
+
+ const float px = center_j + x[0] * (1 << c_octave);
+ const float py = center_i + x[1] * (1 << c_octave);
+
+ const int ds = size - calcSize(c_octave, maxPos.z - 1);
+ const float psize = round(size + x[2] * ds);
+
+ /* The sampling intervals and wavelet sized for selecting an orientation
+ and building the keypoint descriptor are defined relative to 's' */
+ const float s = psize * 1.2f / 9.0f;
+
+ /* To find the dominant orientation, the gradients in x and y are
+ sampled in a circle of radius 6s using wavelets of size 4s.
+ We ensure the gradient wavelet size is even to ensure the
+ wavelet pattern is balanced and symmetric around its center */
+ const int grad_wav_size = 2 * convert_int_rte(2.0f * s);
+
+ // check when grad_wav_size is too big
+ if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
+ {
+ // Get a new feature index.
+ int ind = atomic_inc(featureCounter);
+
+ if (ind < c_max_features)
+ {
+ featureX[ind] = px;
+ featureY[ind] = py;
+ featureLaplacian[ind] = maxPos.w;
+ featureOctave[ind] = c_octave;
+ featureSize[ind] = psize;
+ featureHessian[ind] = N9[1][1][1];
+ }
+ } // grad_wav_size check
+ } // If the subpixel interpolation worked
+ }
+ } // If this is thread 0.
+}
+
+////////////////////////////////////////////////////////////////////////
+// Orientation
+
+#define ORI_SEARCH_INC 5
+#define ORI_WIN 60
+#define ORI_SAMPLES 113
+
+__constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
+__constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
+__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
+ 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
+ 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
+ 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
+ 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
+ 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
+ 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
+ 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
+ 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
+ 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
+ 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
+ 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
+ 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
+ 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
+ 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f,
+ 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
+ 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
+ 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
+ 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f,
+ 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
+ 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
+ 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f,
+ 0.001707611023448408f, 0.001455130288377404f};
+
+__constant float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
+__constant float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
+
+void reduce_32_sum(volatile __local float * data, volatile float* partial_reduction, int tid)
+{
+#define op(A, B) (*A)+(B)
+ data[tid] = *partial_reduction;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (tid < 16)
+ {
+ data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]);
+ data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]);
+ data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]);
+ data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]);
+ data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]);
+ }
+#undef op
+}
+
+__kernel
+ void icvCalcOrientation(
+ IMAGE_INT32 sumTex,
+ __global float * keypoints,
+ int keypoints_step,
+ int c_img_rows,
+ int c_img_cols,
+ int sum_step
+ )
+{
+ keypoints_step /= sizeof(*keypoints);
+ sum_step /= sizeof(uint);
+ __global float* featureX = keypoints + X_ROW * keypoints_step;
+ __global float* featureY = keypoints + Y_ROW * keypoints_step;
+ __global float* featureSize = keypoints + SIZE_ROW * keypoints_step;
+ __global float* featureDir = keypoints + ANGLE_ROW * keypoints_step;
+
+
+ volatile __local float s_X[128];
+ volatile __local float s_Y[128];
+ volatile __local float s_angle[128];
+
+ volatile __local float s_sumx[32 * 4];
+ volatile __local float s_sumy[32 * 4];
+
+ /* The sampling intervals and wavelet sized for selecting an orientation
+ and building the keypoint descriptor are defined relative to 's' */
+ const float s = featureSize[get_group_id(0)] * 1.2f / 9.0f;
+
+
+ /* To find the dominant orientation, the gradients in x and y are
+ sampled in a circle of radius 6s using wavelets of size 4s.
+ We ensure the gradient wavelet size is even to ensure the
+ wavelet pattern is balanced and symmetric around its center */
+ const int grad_wav_size = 2 * convert_int_rte(2.0f * s);
+
+ // check when grad_wav_size is too big
+ if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size)
+ return;
+
+ // Calc X, Y, angle and store it to shared memory
+ const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+
+ float X = 0.0f, Y = 0.0f, angle = 0.0f;
+
+ if (tid < ORI_SAMPLES)
+ {
+ const float margin = (float)(grad_wav_size - 1) / 2.0f;
+ const int x = convert_int_rte(featureX[get_group_id(0)] + c_aptX[tid] * s - margin);
+ const int y = convert_int_rte(featureY[get_group_id(0)] + c_aptY[tid] * s - margin);
+
+ if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
+ x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
+ {
+ X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
+ Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
+
+ angle = atan2(Y, X);
+
+ if (angle < 0)
+ angle += 2.0f * CV_PI_F;
+ angle *= 180.0f / CV_PI_F;
+
+ }
+ }
+ s_X[tid] = X;
+ s_Y[tid] = Y;
+ s_angle[tid] = angle;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ float bestx = 0, besty = 0, best_mod = 0;
+
+#pragma unroll
+ for (int i = 0; i < 18; ++i)
+ {
+ const int dir = (i * 4 + get_local_id(1)) * ORI_SEARCH_INC;
+
+ volatile float sumx = 0.0f, sumy = 0.0f;
+ int d = abs(convert_int_rte(s_angle[get_local_id(0)]) - dir);
+ if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+ {
+ sumx = s_X[get_local_id(0)];
+ sumy = s_Y[get_local_id(0)];
+ }
+ d = abs(convert_int_rte(s_angle[get_local_id(0) + 32]) - dir);
+ if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+ {
+ sumx += s_X[get_local_id(0) + 32];
+ sumy += s_Y[get_local_id(0) + 32];
+ }
+ d = abs(convert_int_rte(s_angle[get_local_id(0) + 64]) - dir);
+ if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+ {
+ sumx += s_X[get_local_id(0) + 64];
+ sumy += s_Y[get_local_id(0) + 64];
+ }
+ d = abs(convert_int_rte(s_angle[get_local_id(0) + 96]) - dir);
+ if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+ {
+ sumx += s_X[get_local_id(0) + 96];
+ sumy += s_Y[get_local_id(0) + 96];
+ }
+ reduce_32_sum(s_sumx + get_local_id(1) * 32, &sumx, get_local_id(0));
+ reduce_32_sum(s_sumy + get_local_id(1) * 32, &sumy, get_local_id(0));
+
+ const float temp_mod = sumx * sumx + sumy * sumy;
+ if (temp_mod > best_mod)
+ {
+ best_mod = temp_mod;
+ bestx = sumx;
+ besty = sumy;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ if (get_local_id(0) == 0)
+ {
+ s_X[get_local_id(1)] = bestx;
+ s_Y[get_local_id(1)] = besty;
+ s_angle[get_local_id(1)] = best_mod;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (get_local_id(1) == 0 && get_local_id(0) == 0)
+ {
+ int bestIdx = 0;
+
+ if (s_angle[1] > s_angle[bestIdx])
+ bestIdx = 1;
+ if (s_angle[2] > s_angle[bestIdx])
+ bestIdx = 2;
+ if (s_angle[3] > s_angle[bestIdx])
+ bestIdx = 3;
+
+ float kp_dir = atan2(s_Y[bestIdx], s_X[bestIdx]);
+ if (kp_dir < 0)
+ kp_dir += 2.0f * CV_PI_F;
+ kp_dir *= 180.0f / CV_PI_F;
+
+ kp_dir = 360.0f - kp_dir;
+ if (fabs(kp_dir - 360.f) < FLT_EPSILON)
+ kp_dir = 0.f;
+
+ featureDir[get_group_id(0)] = kp_dir;
+ }
+}
+
+
+__kernel
+ void icvSetUpright(
+ __global float * keypoints,
+ int keypoints_step,
+ int nFeatures
+ )
+{
+ keypoints_step /= sizeof(*keypoints);
+ __global float* featureDir = keypoints + ANGLE_ROW * keypoints_step;
+
+ if(get_global_id(0) <= nFeatures)
+ {
+ featureDir[get_global_id(0)] = 270.0f;
+ }
+}
+
+
+#undef ORI_SEARCH_INC
+#undef ORI_WIN
+#undef ORI_SAMPLES
+
+////////////////////////////////////////////////////////////////////////
+// Descriptors
+
+#define PATCH_SZ 20
+
+__constant float c_DW[PATCH_SZ * PATCH_SZ] =
+{
+ 3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f,
+ 8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
+ 1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
+ 3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
+ 5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
+ 9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
+ 0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
+ 0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
+ 0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
+ 0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
+ 0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
+ 0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
+ 0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
+ 0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
+ 9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
+ 5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
+ 3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
+ 1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
+ 8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
+ 3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
+};
+
+// utility for linear filter
+inline uchar readerGet(
+ IMAGE_INT8 src,
+ const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
+ int i, int j, int rows, int cols, int elemPerRow
+ )
+{
+ float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
+ float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
+ return read_imgTex(src, sampler, (float2)(pixel_x, pixel_y), rows, cols, elemPerRow);
+}
+
+inline float linearFilter(
+ IMAGE_INT8 src,
+ const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
+ float y, float x, int rows, int cols, int elemPerRow
+ )
+{
+ x -= 0.5f;
+ y -= 0.5f;
+
+ float out = 0.0f;
+
+ const int x1 = convert_int_rtn(x);
+ const int y1 = convert_int_rtn(y);
+ const int x2 = x1 + 1;
+ const int y2 = y1 + 1;
+
+ uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1, rows, cols, elemPerRow);
+ out = out + src_reg * ((x2 - x) * (y2 - y));
+
+ src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2, rows, cols, elemPerRow);
+ out = out + src_reg * ((x - x1) * (y2 - y));
+
+ src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1, rows, cols, elemPerRow);
+ out = out + src_reg * ((x2 - x) * (y - y1));
+
+ src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2, rows, cols, elemPerRow);
+ out = out + src_reg * ((x - x1) * (y - y1));
+
+ return out;
+}
+
+void calc_dx_dy(
+ IMAGE_INT8 imgTex,
+ volatile __local float s_dx_bin[25],
+ volatile __local float s_dy_bin[25],
+ volatile __local float s_PATCH[6][6],
+ __global const float* featureX,
+ __global const float* featureY,
+ __global const float* featureSize,
+ __global const float* featureDir,
+ int rows,
+ int cols,
+ int elemPerRow
+ )
+{
+ const float centerX = featureX[get_group_id(0)];
+ const float centerY = featureY[get_group_id(0)];
+ const float size = featureSize[get_group_id(0)];
+ float descriptor_dir = 360.0f - featureDir[get_group_id(0)];
+ if(fabs(descriptor_dir - 360.0f) < FLT_EPSILON)
+ {
+ descriptor_dir = 0.0f;
+ }
+ descriptor_dir *= (float)(CV_PI_F / 180.0f);
+
+ /* The sampling intervals and wavelet sized for selecting an orientation
+ and building the keypoint descriptor are defined relative to 's' */
+ const float s = size * 1.2f / 9.0f;
+
+ /* Extract a window of pixels around the keypoint of size 20s */
+ const int win_size = (int)((PATCH_SZ + 1) * s);
+
+ float sin_dir;
+ float cos_dir;
+ sin_dir = sincos(descriptor_dir, &cos_dir);
+
+ /* Nearest neighbour version (faster) */
+ const float win_offset = -(float)(win_size - 1) / 2;
+
+ // Compute sampling points
+ // since grids are 2D, need to compute xBlock and yBlock indices
+ const int xBlock = (get_group_id(1) & 3); // get_group_id(1) % 4
+ const int yBlock = (get_group_id(1) >> 2); // floor(get_group_id(1)/4)
+ const int xIndex = xBlock * 5 + get_local_id(0);
+ const int yIndex = yBlock * 5 + get_local_id(1);
+
+ const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
+ const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
+
+ s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (get_local_id(0) < 5 && get_local_id(1) < 5)
+ {
+ const int tid = get_local_id(1) * 5 + get_local_id(0);
+
+ const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
+
+ const float vx = (
+ s_PATCH[get_local_id(1) ][get_local_id(0) + 1] -
+ s_PATCH[get_local_id(1) ][get_local_id(0) ] +
+ s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
+ s_PATCH[get_local_id(1) + 1][get_local_id(0) ])
+ * dw;
+ const float vy = (
+ s_PATCH[get_local_id(1) + 1][get_local_id(0) ] -
+ s_PATCH[get_local_id(1) ][get_local_id(0) ] +
+ s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
+ s_PATCH[get_local_id(1) ][get_local_id(0) + 1])
+ * dw;
+ s_dx_bin[tid] = vx;
+ s_dy_bin[tid] = vy;
+ }
+}
+void reduce_sum25(
+ volatile __local float* sdata1,
+ volatile __local float* sdata2,
+ volatile __local float* sdata3,
+ volatile __local float* sdata4,
+ int tid
+ )
+{
+ // first step is to reduce from 25 to 16
+ if (tid < 9) // use 9 threads
+ {
+ sdata1[tid] += sdata1[tid + 16];
+ sdata2[tid] += sdata2[tid + 16];
+ sdata3[tid] += sdata3[tid + 16];
+ sdata4[tid] += sdata4[tid + 16];
+ }
+
+ // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
+ if (tid < 8)
+ {
+ sdata1[tid] += sdata1[tid + 8];
+ sdata1[tid] += sdata1[tid + 4];
+ sdata1[tid] += sdata1[tid + 2];
+ sdata1[tid] += sdata1[tid + 1];
+
+ sdata2[tid] += sdata2[tid + 8];
+ sdata2[tid] += sdata2[tid + 4];
+ sdata2[tid] += sdata2[tid + 2];
+ sdata2[tid] += sdata2[tid + 1];
+
+ sdata3[tid] += sdata3[tid + 8];
+ sdata3[tid] += sdata3[tid + 4];
+ sdata3[tid] += sdata3[tid + 2];
+ sdata3[tid] += sdata3[tid + 1];
+
+ sdata4[tid] += sdata4[tid + 8];
+ sdata4[tid] += sdata4[tid + 4];
+ sdata4[tid] += sdata4[tid + 2];
+ sdata4[tid] += sdata4[tid + 1];
+ }
+}
+
+__kernel
+ void compute_descriptors64(
+ IMAGE_INT8 imgTex,
+ volatile __global float * descriptors,
+ __global const float * keypoints,
+ int descriptors_step,
+ int keypoints_step,
+ int rows,
+ int cols,
+ int img_step
+ )
+{
+ descriptors_step /= sizeof(float);
+ keypoints_step /= sizeof(float);
+ __global const float * featureX = keypoints + X_ROW * keypoints_step;
+ __global const float * featureY = keypoints + Y_ROW * keypoints_step;
+ __global const float * featureSize = keypoints + SIZE_ROW * keypoints_step;
+ __global const float * featureDir = keypoints + ANGLE_ROW * keypoints_step;
+
+ // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
+ volatile __local float sdx[25];
+ volatile __local float sdy[25];
+ volatile __local float sdxabs[25];
+ volatile __local float sdyabs[25];
+ volatile __local float s_PATCH[6][6];
+
+ calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+
+ if (tid < 25)
+ {
+ sdxabs[tid] = fabs(sdx[tid]); // |dx| array
+ sdyabs[tid] = fabs(sdy[tid]); // |dy| array
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 25)
+ {
+ reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 25)
+ {
+ volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
+
+ // write dx, dy, |dx|, |dy|
+ if (tid == 0)
+ {
+ descriptors_block[0] = sdx[0];
+ descriptors_block[1] = sdy[0];
+ descriptors_block[2] = sdxabs[0];
+ descriptors_block[3] = sdyabs[0];
+ }
+ }
+}
+__kernel
+ void compute_descriptors128(
+ IMAGE_INT8 imgTex,
+ __global volatile float * descriptors,
+ __global float * keypoints,
+ int descriptors_step,
+ int keypoints_step,
+ int rows,
+ int cols,
+ int img_step
+ )
+{
+ descriptors_step /= sizeof(*descriptors);
+ keypoints_step /= sizeof(*keypoints);
+
+ __global float * featureX = keypoints + X_ROW * keypoints_step;
+ __global float * featureY = keypoints + Y_ROW * keypoints_step;
+ __global float* featureSize = keypoints + SIZE_ROW * keypoints_step;
+ __global float* featureDir = keypoints + ANGLE_ROW * keypoints_step;
+
+ // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
+ volatile __local float sdx[25];
+ volatile __local float sdy[25];
+
+ // sum (reduce) 5x5 area response
+ volatile __local float sd1[25];
+ volatile __local float sd2[25];
+ volatile __local float sdabs1[25];
+ volatile __local float sdabs2[25];
+ volatile __local float s_PATCH[6][6];
+
+ calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+
+ if (tid < 25)
+ {
+ if (sdy[tid] >= 0)
+ {
+ sd1[tid] = sdx[tid];
+ sdabs1[tid] = fabs(sdx[tid]);
+ sd2[tid] = 0;
+ sdabs2[tid] = 0;
+ }
+ else
+ {
+ sd1[tid] = 0;
+ sdabs1[tid] = 0;
+ sd2[tid] = sdx[tid];
+ sdabs2[tid] = fabs(sdx[tid]);
+ }
+ //barrier(CLK_LOCAL_MEM_FENCE);
+
+ reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
+ //barrier(CLK_LOCAL_MEM_FENCE);
+
+ volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
+
+ // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
+ if (tid == 0)
+ {
+ descriptors_block[0] = sd1[0];
+ descriptors_block[1] = sdabs1[0];
+ descriptors_block[2] = sd2[0];
+ descriptors_block[3] = sdabs2[0];
+ }
+
+ if (sdx[tid] >= 0)
+ {
+ sd1[tid] = sdy[tid];
+ sdabs1[tid] = fabs(sdy[tid]);
+ sd2[tid] = 0;
+ sdabs2[tid] = 0;
+ }
+ else
+ {
+ sd1[tid] = 0;
+ sdabs1[tid] = 0;
+ sd2[tid] = sdy[tid];
+ sdabs2[tid] = fabs(sdy[tid]);
+ }
+ //barrier(CLK_LOCAL_MEM_FENCE);
+
+ reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
+ //barrier(CLK_LOCAL_MEM_FENCE);
+
+ // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
+ if (tid == 0)
+ {
+ descriptors_block[4] = sd1[0];
+ descriptors_block[5] = sdabs1[0];
+ descriptors_block[6] = sd2[0];
+ descriptors_block[7] = sdabs2[0];
+ }
+ }
+}
+
+__kernel
+ void normalize_descriptors128(__global float * descriptors, int descriptors_step)
+{
+ descriptors_step /= sizeof(*descriptors);
+ // no need for thread ID
+ __global float* descriptor_base = descriptors + descriptors_step * get_group_id(0);
+
+ // read in the unnormalized descriptor values (squared)
+ volatile __local float sqDesc[128];
+ const float lookup = descriptor_base[get_local_id(0)];
+ sqDesc[get_local_id(0)] = lookup * lookup;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (get_local_id(0) < 64)
+ sqDesc[get_local_id(0)] += sqDesc[get_local_id(0) + 64];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // reduction to get total
+ if (get_local_id(0) < 32)
+ {
+ volatile __local float* smem = sqDesc;
+
+ smem[get_local_id(0)] += smem[get_local_id(0) + 32];
+ smem[get_local_id(0)] += smem[get_local_id(0) + 16];
+ smem[get_local_id(0)] += smem[get_local_id(0) + 8];
+ smem[get_local_id(0)] += smem[get_local_id(0) + 4];
+ smem[get_local_id(0)] += smem[get_local_id(0) + 2];
+ smem[get_local_id(0)] += smem[get_local_id(0) + 1];
+ }
+
+ // compute length (square root)
+ volatile __local float len;
+ if (get_local_id(0) == 0)
+ {
+ len = sqrt(sqDesc[0]);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // normalize and store in output
+ descriptor_base[get_local_id(0)] = lookup / len;
+}
+__kernel
+ void normalize_descriptors64(__global float * descriptors, int descriptors_step)
+{
+ descriptors_step /= sizeof(*descriptors);
+ // no need for thread ID
+ __global float* descriptor_base = descriptors + descriptors_step * get_group_id(0);
+
+ // read in the unnormalized descriptor values (squared)
+ volatile __local float sqDesc[64];
+ const float lookup = descriptor_base[get_local_id(0)];
+ sqDesc[get_local_id(0)] = lookup * lookup;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // reduction to get total
+ if (get_local_id(0) < 32)
+ {
+ volatile __local float* smem = sqDesc;
+
+ smem[get_local_id(0)] += smem[get_local_id(0) + 32];
+ smem[get_local_id(0)] += smem[get_local_id(0) + 16];
+ smem[get_local_id(0)] += smem[get_local_id(0) + 8];
+ smem[get_local_id(0)] += smem[get_local_id(0) + 4];
+ smem[get_local_id(0)] += smem[get_local_id(0) + 2];
+ smem[get_local_id(0)] += smem[get_local_id(0) + 1];
+ }
+
+ // compute length (square root)
+ volatile __local float len;
+ if (get_local_id(0) == 0)
+ {
+ len = sqrt(sqDesc[0]);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // normalize and store in output
+ descriptor_base[get_local_id(0)] = lookup / len;
+}
#endif
#endif
+#ifdef HAVE_OPENCV_OCL
+# include "opencv2/nonfree/ocl.hpp"
+# include "opencv2/ocl/private/util.hpp"
+#endif
+
#endif
--- /dev/null
+/*M/////////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+// Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other oclMaterials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+
+#ifdef HAVE_OPENCV_OCL
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace std;
+
+namespace cv
+{
+ namespace ocl
+ {
+ ///////////////////////////OpenCL kernel strings///////////////////////////
+ extern const char *surf;
+
+ const char* noImage2dOption = "-D DISABLE_IMAGE2D";
+
+ static void openCLExecuteKernelSURF(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
+ size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels, int depth)
+ {
+ if(support_image2d())
+ {
+ openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth);
+ }
+ else
+ {
+ openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, noImage2dOption);
+ }
+ }
+ }
+}
+
+
+static inline int divUp(int total, int grain)
+{
+ return (total + grain - 1) / grain;
+}
+static inline int calcSize(int octave, int layer)
+{
+ /* Wavelet size at first layer of first octave. */
+ const int HAAR_SIZE0 = 9;
+
+ /* Wavelet size increment between layers. This should be an even number,
+ such that the wavelet sizes in an octave are either all even or all odd.
+ This ensures that when looking for the neighbors of a sample, the layers
+
+ above and below are aligned correctly. */
+ const int HAAR_SIZE_INC = 6;
+
+ return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
+}
+
+
+class SURF_OCL_Invoker
+{
+public:
+ // facilities
+ void bindImgTex(const oclMat &img, cl_mem &texture);
+
+ //void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
+ //void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
+
+ // kernel callers declarations
+ void icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int layer_rows);
+
+ void icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
+ int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols);
+
+ void icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter,
+ oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures);
+
+ void icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures);
+
+ void icvSetUpright_gpu(const oclMat &keypoints, int nFeatures);
+
+ void compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures);
+ // end of kernel callers declarations
+
+ SURF_OCL_Invoker(SURF_OCL &surf, const oclMat &img, const oclMat &mask) :
+ surf_(surf),
+ img_cols(img.cols), img_rows(img.rows),
+ use_mask(!mask.empty()), counters(oclMat()),
+ imgTex(NULL), sumTex(NULL), maskSumTex(NULL), _img(img)
+ {
+ CV_Assert(!img.empty() && img.type() == CV_8UC1);
+ CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
+ CV_Assert(surf_.nOctaves > 0 && surf_.nOctaveLayers > 0);
+
+ const int min_size = calcSize(surf_.nOctaves - 1, 0);
+ CV_Assert(img_rows - min_size >= 0);
+ CV_Assert(img_cols - min_size >= 0);
+
+ const int layer_rows = img_rows >> (surf_.nOctaves - 1);
+ const int layer_cols = img_cols >> (surf_.nOctaves - 1);
+ const int min_margin = ((calcSize((surf_.nOctaves - 1), 2) >> 1) >> (surf_.nOctaves - 1)) + 1;
+ CV_Assert(layer_rows - 2 * min_margin > 0);
+ CV_Assert(layer_cols - 2 * min_margin > 0);
+
+ maxFeatures = std::min(static_cast<int>(img.size().area() * surf.keypointsRatio), 65535);
+ maxCandidates = std::min(static_cast<int>(1.5 * maxFeatures), 65535);
+
+ CV_Assert(maxFeatures > 0);
+
+ counters.create(1, surf_.nOctaves + 1, CV_32SC1);
+ counters.setTo(Scalar::all(0));
+
+ integral(img, surf_.sum);
+ if(support_image2d())
+ {
+ bindImgTex(img, imgTex);
+ bindImgTex(surf_.sum, sumTex);
+ }
+
+ maskSumTex = 0;
+
+ if (use_mask)
+ {
+ CV_Error(CV_StsBadFunc, "Masked SURF detector is not implemented yet");
+ //!FIXME
+ // temp fix for missing min overload
+ //oclMat temp(mask.size(), mask.type());
+ //temp.setTo(Scalar::all(1.0));
+ ////cv::ocl::min(mask, temp, surf_.mask1); ///////// disable this
+ //integral(surf_.mask1, surf_.maskSum);
+ //bindImgTex(surf_.maskSum, maskSumTex);
+ }
+ }
+
+ void detectKeypoints(oclMat &keypoints)
+ {
+ // create image pyramid buffers
+ // different layers have same sized buffers, but they are sampled from Gaussian kernel.
+ ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.det);
+ ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.trace);
+
+ ensureSizeIsEnough(1, maxCandidates, CV_32SC4, surf_.maxPosBuffer);
+ ensureSizeIsEnough(SURF_OCL::ROWS_COUNT, maxFeatures, CV_32FC1, keypoints);
+ keypoints.setTo(Scalar::all(0));
+
+ for (int octave = 0; octave < surf_.nOctaves; ++octave)
+ {
+ const int layer_rows = img_rows >> octave;
+ const int layer_cols = img_cols >> octave;
+
+ //loadOctaveConstants(octave, layer_rows, layer_cols);
+
+ icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, octave, surf_.nOctaveLayers, layer_rows);
+
+ icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer, counters, 1 + octave,
+ octave, use_mask, surf_.nOctaveLayers, layer_rows, layer_cols);
+
+ int maxCounter = ((Mat)counters).at<int>(1 + octave);
+ maxCounter = std::min(maxCounter, static_cast<int>(maxCandidates));
+
+ if (maxCounter > 0)
+ {
+ icvInterpolateKeypoint_gpu(surf_.det, surf_.maxPosBuffer, maxCounter,
+ keypoints, counters, octave, layer_rows, maxFeatures);
+ }
+ }
+ int featureCounter = Mat(counters).at<int>(0);
+ featureCounter = std::min(featureCounter, static_cast<int>(maxFeatures));
+
+ keypoints.cols = featureCounter;
+
+ if (surf_.upright)
+ {
+ //keypoints.row(SURF_OCL::ANGLE_ROW).setTo(Scalar::all(90.0));
+ setUpright(keypoints);
+ }
+ else
+ {
+ findOrientation(keypoints);
+ }
+ }
+
+ void setUpright(oclMat &keypoints)
+ {
+ const int nFeatures = keypoints.cols;
+ if(nFeatures > 0)
+ {
+ icvSetUpright_gpu(keypoints, keypoints.cols);
+ }
+ }
+
+ void findOrientation(oclMat &keypoints)
+ {
+ const int nFeatures = keypoints.cols;
+ if (nFeatures > 0)
+ {
+ icvCalcOrientation_gpu(keypoints, nFeatures);
+ }
+ }
+
+ void computeDescriptors(const oclMat &keypoints, oclMat &descriptors, int descriptorSize)
+ {
+ const int nFeatures = keypoints.cols;
+ if (nFeatures > 0)
+ {
+ ensureSizeIsEnough(nFeatures, descriptorSize, CV_32F, descriptors);
+ compute_descriptors_gpu(descriptors, keypoints, nFeatures);
+ }
+ }
+
+ ~SURF_OCL_Invoker()
+ {
+ if(imgTex)
+ openCLFree(imgTex);
+ if(sumTex)
+ openCLFree(sumTex);
+ if(maskSumTex)
+ openCLFree(maskSumTex);
+ }
+
+private:
+ SURF_OCL &surf_;
+
+ int img_cols, img_rows;
+
+ bool use_mask;
+
+ int maxCandidates;
+ int maxFeatures;
+
+ oclMat counters;
+
+ // texture buffers
+ cl_mem imgTex;
+ cl_mem sumTex;
+ cl_mem maskSumTex;
+
+ const oclMat _img; // make a copy for non-image2d_t supported platform
+
+ SURF_OCL_Invoker &operator= (const SURF_OCL_Invoker &right)
+ {
+ (*this) = right;
+ return *this;
+ } // remove warning C4512
+};
+
+cv::ocl::SURF_OCL::SURF_OCL()
+{
+ hessianThreshold = 100.0f;
+ extended = true;
+ nOctaves = 4;
+ nOctaveLayers = 2;
+ keypointsRatio = 0.01f;
+ upright = false;
+}
+
+cv::ocl::SURF_OCL::SURF_OCL(double _threshold, int _nOctaves, int _nOctaveLayers, bool _extended, float _keypointsRatio, bool _upright)
+{
+ hessianThreshold = saturate_cast<float>(_threshold);
+ extended = _extended;
+ nOctaves = _nOctaves;
+ nOctaveLayers = _nOctaveLayers;
+ keypointsRatio = _keypointsRatio;
+ upright = _upright;
+}
+
+int cv::ocl::SURF_OCL::descriptorSize() const
+{
+ return extended ? 128 : 64;
+}
+
+void cv::ocl::SURF_OCL::uploadKeypoints(const vector<KeyPoint> &keypoints, oclMat &keypointsGPU)
+{
+ if (keypoints.empty())
+ keypointsGPU.release();
+ else
+ {
+ Mat keypointsCPU(SURF_OCL::ROWS_COUNT, static_cast<int>(keypoints.size()), CV_32FC1);
+
+ float *kp_x = keypointsCPU.ptr<float>(SURF_OCL::X_ROW);
+ float *kp_y = keypointsCPU.ptr<float>(SURF_OCL::Y_ROW);
+ int *kp_laplacian = keypointsCPU.ptr<int>(SURF_OCL::LAPLACIAN_ROW);
+ int *kp_octave = keypointsCPU.ptr<int>(SURF_OCL::OCTAVE_ROW);
+ float *kp_size = keypointsCPU.ptr<float>(SURF_OCL::SIZE_ROW);
+ float *kp_dir = keypointsCPU.ptr<float>(SURF_OCL::ANGLE_ROW);
+ float *kp_hessian = keypointsCPU.ptr<float>(SURF_OCL::HESSIAN_ROW);
+
+ for (size_t i = 0, size = keypoints.size(); i < size; ++i)
+ {
+ const KeyPoint &kp = keypoints[i];
+ kp_x[i] = kp.pt.x;
+ kp_y[i] = kp.pt.y;
+ kp_octave[i] = kp.octave;
+ kp_size[i] = kp.size;
+ kp_dir[i] = kp.angle;
+ kp_hessian[i] = kp.response;
+ kp_laplacian[i] = 1;
+ }
+
+ keypointsGPU.upload(keypointsCPU);
+ }
+}
+
+void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat &keypointsGPU, vector<KeyPoint> &keypoints)
+{
+ const int nFeatures = keypointsGPU.cols;
+
+ if (nFeatures == 0)
+ keypoints.clear();
+ else
+ {
+ CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == ROWS_COUNT);
+
+ Mat keypointsCPU(keypointsGPU);
+
+ keypoints.resize(nFeatures);
+
+ float *kp_x = keypointsCPU.ptr<float>(SURF_OCL::X_ROW);
+ float *kp_y = keypointsCPU.ptr<float>(SURF_OCL::Y_ROW);
+ int *kp_laplacian = keypointsCPU.ptr<int>(SURF_OCL::LAPLACIAN_ROW);
+ int *kp_octave = keypointsCPU.ptr<int>(SURF_OCL::OCTAVE_ROW);
+ float *kp_size = keypointsCPU.ptr<float>(SURF_OCL::SIZE_ROW);
+ float *kp_dir = keypointsCPU.ptr<float>(SURF_OCL::ANGLE_ROW);
+ float *kp_hessian = keypointsCPU.ptr<float>(SURF_OCL::HESSIAN_ROW);
+
+ for (int i = 0; i < nFeatures; ++i)
+ {
+ KeyPoint &kp = keypoints[i];
+ kp.pt.x = kp_x[i];
+ kp.pt.y = kp_y[i];
+ kp.class_id = kp_laplacian[i];
+ kp.octave = kp_octave[i];
+ kp.size = kp_size[i];
+ kp.angle = kp_dir[i];
+ kp.response = kp_hessian[i];
+ }
+ }
+}
+
+void cv::ocl::SURF_OCL::downloadDescriptors(const oclMat &descriptorsGPU, vector<float> &descriptors)
+{
+ if (descriptorsGPU.empty())
+ descriptors.clear();
+ else
+ {
+ CV_Assert(descriptorsGPU.type() == CV_32F);
+
+ descriptors.resize(descriptorsGPU.rows * descriptorsGPU.cols);
+ Mat descriptorsCPU(descriptorsGPU.size(), CV_32F, &descriptors[0]);
+ descriptorsGPU.download(descriptorsCPU);
+ }
+}
+
+void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints)
+{
+ if (!img.empty())
+ {
+ SURF_OCL_Invoker surf(*this, img, mask);
+
+ surf.detectKeypoints(keypoints);
+ }
+}
+
+void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints, oclMat &descriptors,
+ bool useProvidedKeypoints)
+{
+ if (!img.empty())
+ {
+ SURF_OCL_Invoker surf(*this, img, mask);
+
+ if (!useProvidedKeypoints)
+ surf.detectKeypoints(keypoints);
+ else if (!upright)
+ {
+ surf.findOrientation(keypoints);
+ }
+
+ surf.computeDescriptors(keypoints, descriptors, descriptorSize());
+ }
+}
+
+void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, vector<KeyPoint> &keypoints)
+{
+ oclMat keypointsGPU;
+
+ (*this)(img, mask, keypointsGPU);
+
+ downloadKeypoints(keypointsGPU, keypoints);
+}
+
+void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, vector<KeyPoint> &keypoints,
+ oclMat &descriptors, bool useProvidedKeypoints)
+{
+ oclMat keypointsGPU;
+
+ if (useProvidedKeypoints)
+ uploadKeypoints(keypoints, keypointsGPU);
+
+ (*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints);
+
+ downloadKeypoints(keypointsGPU, keypoints);
+}
+
+void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, vector<KeyPoint> &keypoints,
+ vector<float> &descriptors, bool useProvidedKeypoints)
+{
+ oclMat descriptorsGPU;
+
+ (*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints);
+
+ downloadDescriptors(descriptorsGPU, descriptors);
+}
+
+void cv::ocl::SURF_OCL::releaseMemory()
+{
+ sum.release();
+ mask1.release();
+ maskSum.release();
+ intBuffer.release();
+ det.release();
+ trace.release();
+ maxPosBuffer.release();
+}
+
+
+// bind source buffer to image oject.
+void SURF_OCL_Invoker::bindImgTex(const oclMat &img, cl_mem &texture)
+{
+ if(texture)
+ {
+ openCLFree(texture);
+ }
+ texture = bindTexture(img);
+}
+
+////////////////////////////
+// kernel caller definitions
+void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int c_layer_rows)
+{
+ const int min_size = calcSize(octave, 0);
+ const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
+ const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
+
+ Context *clCxt = det.clCxt;
+ string kernelName = "icvCalcLayerDetAndTrace";
+ vector< pair<size_t, const void *> > args;
+
+ if(sumTex)
+ {
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex));
+ }
+ else
+ {
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
+ }
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&trace.data));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&trace.step));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&nOctaveLayers));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&c_layer_rows));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
+
+ size_t localThreads[3] = {16, 16, 1};
+ size_t globalThreads[3] =
+ {
+ divUp(max_samples_j, localThreads[0]) *localThreads[0],
+ divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2),
+ 1
+ };
+ openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+
+void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
+ int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols)
+{
+ const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
+
+ Context *clCxt = det.clCxt;
+ string kernelName = use_mask ? "icvFindMaximaInLayer_withmask" : "icvFindMaximaInLayer";
+ vector< pair<size_t, const void *> > args;
+
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&trace.data));
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&maxCounter.data));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&counterOffset));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&trace.step));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&nLayers));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&layer_rows));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&layer_cols));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&maxCandidates));
+ args.push_back( make_pair( sizeof(cl_float), (void *)&surf_.hessianThreshold));
+
+ if(use_mask)
+ {
+ if(maskSumTex)
+ {
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&maskSumTex));
+ }
+ else
+ {
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.data));
+ }
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.step));
+ }
+ size_t localThreads[3] = {16, 16, 1};
+ size_t globalThreads[3] = {divUp(layer_cols - 2 * min_margin, localThreads[0] - 2) *localThreads[0],
+ divUp(layer_rows - 2 * min_margin, localThreads[1] - 2) *nLayers *localThreads[1],
+ 1
+ };
+
+ openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+
+void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter,
+ oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures)
+{
+ Context *clCxt = det.clCxt;
+ string kernelName = "icvInterpolateKeypoint";
+ vector< pair<size_t, const void *> > args;
+
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&counters.data));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&layer_rows));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&maxFeatures));
+
+ size_t localThreads[3] = {3, 3, 3};
+ size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1};
+
+ openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+
+void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures)
+{
+ Context *clCxt = counters.clCxt;
+ string kernelName = "icvCalcOrientation";
+
+ vector< pair<size_t, const void *> > args;
+
+ if(sumTex)
+ {
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex));
+ }
+ else
+ {
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
+ }
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
+
+ size_t localThreads[3] = {32, 4, 1};
+ size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1};
+
+ openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+
+void SURF_OCL_Invoker::icvSetUpright_gpu(const oclMat &keypoints, int nFeatures)
+{
+ Context *clCxt = counters.clCxt;
+ string kernelName = "icvSetUpright";
+
+ vector< pair<size_t, const void *> > args;
+
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&nFeatures));
+
+ size_t localThreads[3] = {256, 1, 1};
+ size_t globalThreads[3] = {saturate_cast<size_t>(nFeatures), 1, 1};
+
+ openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+
+
+void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures)
+{
+ // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
+ Context *clCxt = descriptors.clCxt;
+ string kernelName = "";
+ vector< pair<size_t, const void *> > args;
+ size_t localThreads[3] = {1, 1, 1};
+ size_t globalThreads[3] = {1, 1, 1};
+
+ if(descriptors.cols == 64)
+ {
+ kernelName = "compute_descriptors64";
+
+ localThreads[0] = 6;
+ localThreads[1] = 6;
+
+ globalThreads[0] = nFeatures * localThreads[0];
+ globalThreads[1] = 16 * localThreads[1];
+
+ args.clear();
+ if(imgTex)
+ {
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex));
+ }
+ else
+ {
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&_img.data));
+ }
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step));
+
+ openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
+
+ kernelName = "normalize_descriptors64";
+
+ localThreads[0] = 64;
+ localThreads[1] = 1;
+
+ globalThreads[0] = nFeatures * localThreads[0];
+ globalThreads[1] = localThreads[1];
+
+ args.clear();
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
+
+ openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
+ }
+ else
+ {
+ kernelName = "compute_descriptors128";
+
+ localThreads[0] = 6;
+ localThreads[1] = 6;
+
+ globalThreads[0] = nFeatures * localThreads[0];
+ globalThreads[1] = 16 * localThreads[1];
+
+ args.clear();
+ if(imgTex)
+ {
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex));
+ }
+ else
+ {
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&_img.data));
+ }
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step));
+
+ openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
+
+ kernelName = "normalize_descriptors128";
+
+ localThreads[0] = 128;
+ localThreads[1] = 1;
+
+ globalThreads[0] = nFeatures * localThreads[0];
+ globalThreads[1] = localThreads[1];
+
+ args.clear();
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
+
+ openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
+ }
+}
+
+#endif //HAVE_OPENCV_OCL
endif()
set(the_description "OpenCL-accelerated Computer Vision")
-ocv_define_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video opencv_nonfree)
+ocv_define_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow)
enum DevMemRW
{
- DEVICE_MEM_R_W = 0,
- DEVICE_MEM_R_ONLY,
+ DEVICE_MEM_R_W = 0,
+ DEVICE_MEM_R_ONLY,
DEVICE_MEM_W_ONLY
};
-
+
enum DevMemType
- {
- DEVICE_MEM_DEFAULT = 0,
+ {
+ DEVICE_MEM_DEFAULT = 0,
DEVICE_MEM_AHP, //alloc host pointer
DEVICE_MEM_UHP, //use host pointer
DEVICE_MEM_CHP, //copy host pointer
DEVICE_MEM_PM //persistent memory
};
- //Get the global device memory and read/write type
+ //Get the global device memory and read/write type
//return 1 if unified memory system supported, otherwise return 0
CV_EXPORTS int getDevMemType(DevMemRW& rw_type, DevMemType& mem_type);
- //Set the global device memory and read/write type,
+ //Set the global device memory and read/write type,
//the newly generated oclMat will all use this type
//return -1 if the target type is unsupported, otherwise return 0
- CV_EXPORTS int setDevMemType(DevMemRW rw_type = DEVICE_MEM_R_W, DevMemType mem_type = DEVICE_MEM_DEFAULT);
+ CV_EXPORTS int setDevMemType(DevMemRW rw_type = DEVICE_MEM_R_W, DevMemType mem_type = DEVICE_MEM_DEFAULT);
//this class contains ocl runtime information
class CV_EXPORTS Info
//////////////////////////////// OpenCL context ////////////////////////
//This is a global singleton class used to represent a OpenCL context.
- class Context
+ class CV_EXPORTS Context
{
protected:
Context();
};
-
- //! Speeded up robust features, port from GPU module.
- ////////////////////////////////// SURF //////////////////////////////////////////
-
- class CV_EXPORTS SURF_OCL
-
- {
-
- public:
-
- enum KeypointLayout
-
- {
-
- X_ROW = 0,
-
- Y_ROW,
-
- LAPLACIAN_ROW,
-
- OCTAVE_ROW,
-
- SIZE_ROW,
-
- ANGLE_ROW,
-
- HESSIAN_ROW,
-
- ROWS_COUNT
-
- };
-
-
-
- //! the default constructor
-
- SURF_OCL();
-
- //! the full constructor taking all the necessary parameters
-
- explicit SURF_OCL(double _hessianThreshold, int _nOctaves = 4,
-
- int _nOctaveLayers = 2, bool _extended = false, float _keypointsRatio = 0.01f, bool _upright = false);
-
-
-
- //! returns the descriptor size in float's (64 or 128)
-
- int descriptorSize() const;
-
-
-
- //! upload host keypoints to device memory
-
- void uploadKeypoints(const vector<cv::KeyPoint> &keypoints, oclMat &keypointsocl);
-
- //! download keypoints from device to host memory
-
- void downloadKeypoints(const oclMat &keypointsocl, vector<KeyPoint> &keypoints);
-
-
-
- //! download descriptors from device to host memory
-
- void downloadDescriptors(const oclMat &descriptorsocl, vector<float> &descriptors);
-
-
-
- //! finds the keypoints using fast hessian detector used in SURF
-
- //! supports CV_8UC1 images
-
- //! keypoints will have nFeature cols and 6 rows
-
- //! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature
-
- //! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature
-
- //! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature
-
- //! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature
-
- //! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature
-
- //! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature
-
- //! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature
-
- void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints);
-
- //! finds the keypoints and computes their descriptors.
-
- //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
-
- void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints, oclMat &descriptors,
-
- bool useProvidedKeypoints = false);
-
-
-
- void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints);
-
- void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints, oclMat &descriptors,
-
- bool useProvidedKeypoints = false);
-
-
-
- void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints, std::vector<float> &descriptors,
-
- bool useProvidedKeypoints = false);
-
-
-
- void releaseMemory();
-
-
-
- // SURF parameters
-
- float hessianThreshold;
-
- int nOctaves;
-
- int nOctaveLayers;
-
- bool extended;
-
- bool upright;
-
-
-
- //! max keypoints = min(keypointsRatio * img.size().area(), 65535)
-
- float keypointsRatio;
-
-
-
- oclMat sum, mask1, maskSum, intBuffer;
-
-
-
- oclMat det, trace;
-
-
-
- oclMat maxPosBuffer;
-
- };
-
////////////////////////feature2d_ocl/////////////////
/****************************************************************************************\
* Distance *
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+// Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other oclMaterials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_OCL_PRIVATE_UTIL__
+#define __OPENCV_OCL_PRIVATE_UTIL__
+
+#include "opencv2/ocl/ocl.hpp"
+
+#if defined __APPLE__
+#include <OpenCL/OpenCL.h>
+#else
+#include <CL/opencl.h>
+#endif
+
+namespace cv
+{
+ namespace ocl
+ {
+ ///////////////////////////OpenCL call wrappers////////////////////////////
+ void CV_EXPORTS openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
+ size_t widthInBytes, size_t height);
+ void CV_EXPORTS openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
+ size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type);
+ void CV_EXPORTS openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
+ const void *src, size_t spitch,
+ size_t width, size_t height, enum openCLMemcpyKind kind, int channels = -1);
+ void CV_EXPORTS openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
+ const void *src, size_t spitch,
+ size_t width, size_t height, int src_offset);
+ void CV_EXPORTS openCLFree(void *devPtr);
+ cl_mem CV_EXPORTS openCLCreateBuffer(Context *clCxt, size_t flag, size_t size);
+ void CV_EXPORTS openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size);
+ cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt,
+ const char **source, std::string kernelName);
+ cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt,
+ const char **source, std::string kernelName, const char *build_options);
+ void CV_EXPORTS openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads);
+ void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, std::vector< std::pair<size_t, const void *> > &args,
+ int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1);
+ void CV_EXPORTS openCLExecuteKernel_(Context *clCxt , const char **source, std::string kernelName,
+ size_t globalThreads[3], size_t localThreads[3],
+ std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, const char *build_options);
+ void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
+ size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels, int depth);
+ void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
+ size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels,
+ int depth, const char *build_options);
+
+ cl_mem CV_EXPORTS load_constant(cl_context context, cl_command_queue command_queue, const void *value,
+ const size_t size);
+
+ cl_mem CV_EXPORTS openCLMalloc(cl_context clCxt, size_t size, cl_mem_flags flags, void *host_ptr);
+
+ int CV_EXPORTS savetofile(const Context *clcxt, cl_program &program, const char *fileName);
+
+ enum FLUSH_MODE
+ {
+ CLFINISH = 0,
+ CLFLUSH,
+ DISABLE
+ };
+
+ void CV_EXPORTS openCLExecuteKernel2(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
+ size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode = DISABLE);
+ void CV_EXPORTS openCLExecuteKernel2(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
+ size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels,
+ int depth, char *build_options, FLUSH_MODE finish_mode = DISABLE);
+ // bind oclMat to OpenCL image textures
+ // note:
+ // 1. there is no memory management. User need to explicitly release the resource
+ // 2. for faster clamping, there is no buffer padding for the constructed texture
+ cl_mem CV_EXPORTS bindTexture(const oclMat &mat);
+ void CV_EXPORTS releaseTexture(cl_mem& texture);
+
+ // returns whether the current context supports image2d_t format or not
+ bool CV_EXPORTS support_image2d(Context *clCxt = Context::getContext());
+
+ }//namespace ocl
+
+}//namespace cv
+
+#endif //__OPENCV_OCL_PRIVATE_UTIL__
//
//M*/
-#include <iomanip>
#include "precomp.hpp"
-#include "mcwutil.hpp"
using namespace cv;
using namespace cv::ocl;
//M*/
#include "precomp.hpp"
-#include "mcwutil.hpp"
-#include <iostream>
+
using namespace std;
using namespace cv;
using namespace cv::ocl;
//M*/
#include "precomp.hpp"
-#include "mcwutil.hpp"
+
using namespace cv;
using namespace cv::ocl;
using namespace std;
//
//M*/
-#include <iomanip>
#include "precomp.hpp"
-#include "mcwutil.hpp"
using namespace std;
using namespace cv;
//
//M*/
-#include "mcwutil.hpp"
+#include "opencv2/ocl/private/util.hpp"
#if defined (HAVE_OPENCL)
#ifndef CL_VERSION_1_2
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other oclMaterials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef _OPENCV_MCWUTIL_
-#define _OPENCV_MCWUTIL_
-
-#include "precomp.hpp"
-using namespace std;
-
-namespace cv
-{
- namespace ocl
- {
- enum FLUSH_MODE
- {
- CLFINISH = 0,
- CLFLUSH,
- DISABLE
- };
- void openCLExecuteKernel2(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
- size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode = DISABLE);
- void openCLExecuteKernel2(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
- size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels,
- int depth, char *build_options, FLUSH_MODE finish_mode = DISABLE);
- // bind oclMat to OpenCL image textures
- // note:
- // 1. there is no memory management. User need to explicitly release the resource
- // 2. for faster clamping, there is no buffer padding for the constructed texture
- cl_mem bindTexture(const oclMat &mat);
- void releaseTexture(cl_mem& texture);
-
- // returns whether the current context supports image2d_t format or not
- bool support_image2d(Context *clCxt = Context::getContext());
-
- }//namespace ocl
-
-}//namespace cv
-
-#endif //_OPENCV_MCWUTIL_
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other oclMaterials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
-
-// specialized for non-image2d_t supported platform, intel HD4000, for example
-#ifdef DISABLE_IMAGE2D
-#define IMAGE_INT32 __global uint *
-#define IMAGE_INT8 __global uchar *
-#else
-#define IMAGE_INT32 image2d_t
-#define IMAGE_INT8 image2d_t
-#endif
-
-uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols, int elemPerRow)
-{
-#ifdef DISABLE_IMAGE2D
- int x = clamp(coord.x, 0, cols);
- int y = clamp(coord.y, 0, rows);
- return img[elemPerRow * y + x];
-#else
- return read_imageui(img, sam, coord).x;
-#endif
-}
-uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow)
-{
-#ifdef DISABLE_IMAGE2D
- int x = clamp(convert_int_rte(coord.x), 0, cols - 1);
- int y = clamp(convert_int_rte(coord.y), 0, rows - 1);
- return img[elemPerRow * y + x];
-#else
- return (uchar)read_imageui(img, sam, coord).x;
-#endif
-}
-
-// dynamically change the precision used for floating type
-
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-#define F double
-#else
-#define F float
-#endif
-
-// Image read mode
-__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
-
-#ifndef FLT_EPSILON
-#define FLT_EPSILON (1e-15)
-#endif
-
-#ifndef CV_PI_F
-#define CV_PI_F 3.14159265f
-#endif
-
-// Use integral image to calculate haar wavelets.
-// N = 2
-// for simple haar paatern
-float icvCalcHaarPatternSum_2(
- IMAGE_INT32 sumTex,
- __constant float src[2][5],
- int oldSize,
- int newSize,
- int y, int x,
- int rows, int cols, int elemPerRow)
-{
-
- float ratio = (float)newSize / oldSize;
-
- F d = 0;
-
-#pragma unroll
- for (int k = 0; k < 2; ++k)
- {
- int dx1 = convert_int_rte(ratio * src[k][0]);
- int dy1 = convert_int_rte(ratio * src[k][1]);
- int dx2 = convert_int_rte(ratio * src[k][2]);
- int dy2 = convert_int_rte(ratio * src[k][3]);
-
- F t = 0;
- t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
- t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
- t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
- t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
- d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
- }
-
- return (float)d;
-}
-
-// N = 3
-float icvCalcHaarPatternSum_3(
- IMAGE_INT32 sumTex,
- __constant float src[2][5],
- int oldSize,
- int newSize,
- int y, int x,
- int rows, int cols, int elemPerRow)
-{
-
- float ratio = (float)newSize / oldSize;
-
- F d = 0;
-
-#pragma unroll
- for (int k = 0; k < 3; ++k)
- {
- int dx1 = convert_int_rte(ratio * src[k][0]);
- int dy1 = convert_int_rte(ratio * src[k][1]);
- int dx2 = convert_int_rte(ratio * src[k][2]);
- int dy2 = convert_int_rte(ratio * src[k][3]);
-
- F t = 0;
- t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
- t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
- t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
- t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
- d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
- }
-
- return (float)d;
-}
-
-// N = 4
-float icvCalcHaarPatternSum_4(
- IMAGE_INT32 sumTex,
- __constant float src[2][5],
- int oldSize,
- int newSize,
- int y, int x,
- int rows, int cols, int elemPerRow)
-{
-
- float ratio = (float)newSize / oldSize;
-
- F d = 0;
-
-#pragma unroll
- for (int k = 0; k < 4; ++k)
- {
- int dx1 = convert_int_rte(ratio * src[k][0]);
- int dy1 = convert_int_rte(ratio * src[k][1]);
- int dx2 = convert_int_rte(ratio * src[k][2]);
- int dy2 = convert_int_rte(ratio * src[k][3]);
-
- F t = 0;
- t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
- t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
- t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
- t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
- d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
- }
-
- return (float)d;
-}
-
-////////////////////////////////////////////////////////////////////////
-// Hessian
-
-__constant float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };
-__constant float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };
-__constant float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };
-
-__inline int calcSize(int octave, int layer)
-{
- /* Wavelet size at first layer of first octave. */
- const int HAAR_SIZE0 = 9;
-
- /* Wavelet size increment between layers. This should be an even number,
- such that the wavelet sizes in an octave are either all even or all odd.
- This ensures that when looking for the neighbours of a sample, the layers
- above and below are aligned correctly. */
- const int HAAR_SIZE_INC = 6;
-
- return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
-}
-
-
-//calculate targeted layer per-pixel determinant and trace with an integral image
-__kernel void icvCalcLayerDetAndTrace(
- IMAGE_INT32 sumTex, // input integral image
- __global float * det, // output Determinant
- __global float * trace, // output trace
- int det_step, // the step of det in bytes
- int trace_step, // the step of trace in bytes
- int c_img_rows,
- int c_img_cols,
- int c_nOctaveLayers,
- int c_octave,
- int c_layer_rows,
- int sumTex_step
- )
-{
- det_step /= sizeof(*det);
- trace_step /= sizeof(*trace);
- sumTex_step/= sizeof(uint);
- // Determine the indices
- const int gridDim_y = get_num_groups(1) / (c_nOctaveLayers + 2);
- const int blockIdx_y = get_group_id(1) % gridDim_y;
- const int blockIdx_z = get_group_id(1) / gridDim_y;
-
- const int j = get_local_id(0) + get_group_id(0) * get_local_size(0);
- const int i = get_local_id(1) + blockIdx_y * get_local_size(1);
- const int layer = blockIdx_z;
-
- const int size = calcSize(c_octave, layer);
-
- const int samples_i = 1 + ((c_img_rows - size) >> c_octave);
- const int samples_j = 1 + ((c_img_cols - size) >> c_octave);
-
- // Ignore pixels where some of the kernel is outside the image
- const int margin = (size >> 1) >> c_octave;
-
- if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
- {
- const float dx = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
- const float dy = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
- const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
-
- det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
- trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
- }
-}
-
-
-////////////////////////////////////////////////////////////////////////
-// NONMAX
-
-__constant float c_DM[5] = {0, 0, 9, 9, 1};
-
-bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int rows, int cols, int step)
-{
- float ratio = (float)size / 9.0f;
-
- float d = 0;
-
- int dx1 = convert_int_rte(ratio * c_DM[0]);
- int dy1 = convert_int_rte(ratio * c_DM[1]);
- int dx2 = convert_int_rte(ratio * c_DM[2]);
- int dy2 = convert_int_rte(ratio * c_DM[3]);
-
- float t = 0;
-
- t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1), rows, cols, step);
- t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2), rows, cols, step);
- t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1), rows, cols, step);
- t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2), rows, cols, step);
-
- d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
-
- return (d >= 0.5f);
-}
-
-// Non-maximal suppression to further filtering the candidates from previous step
-__kernel
- void icvFindMaximaInLayer_withmask(
- __global const float * det,
- __global const float * trace,
- __global int4 * maxPosBuffer,
- volatile __global int* maxCounter,
- int counter_offset,
- int det_step, // the step of det in bytes
- int trace_step, // the step of trace in bytes
- int c_img_rows,
- int c_img_cols,
- int c_nOctaveLayers,
- int c_octave,
- int c_layer_rows,
- int c_layer_cols,
- int c_max_candidates,
- float c_hessianThreshold,
- IMAGE_INT32 maskSumTex,
- int mask_step
- )
-{
- volatile __local float N9[768]; // threads.x * threads.y * 3
-
- det_step /= sizeof(*det);
- trace_step /= sizeof(*trace);
- maxCounter += counter_offset;
- mask_step /= sizeof(uint);
-
- // Determine the indices
- const int gridDim_y = get_num_groups(1) / c_nOctaveLayers;
- const int blockIdx_y = get_group_id(1) % gridDim_y;
- const int blockIdx_z = get_group_id(1) / gridDim_y;
-
- const int layer = blockIdx_z + 1;
-
- const int size = calcSize(c_octave, layer);
-
- // Ignore pixels without a 3x3x3 neighbourhood in the layer above
- const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
-
- const int j = get_local_id(0) + get_group_id(0) * (get_local_size(0) - 2) + margin - 1;
- const int i = get_local_id(1) + blockIdx_y * (get_local_size(1) - 2) + margin - 1;
-
- // Is this thread within the hessian buffer?
- const int zoff = get_local_size(0) * get_local_size(1);
- const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
- N9[localLin - zoff] =
- det[det_step *
- (c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y
- + min(max(j, 0), c_img_cols - 1)]; // x
- N9[localLin ] =
- det[det_step *
- (c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y
- + min(max(j, 0), c_img_cols - 1)]; // x
- N9[localLin + zoff] =
- det[det_step *
- (c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y
- + min(max(j, 0), c_img_cols - 1)]; // x
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (i < c_layer_rows - margin
- && j < c_layer_cols - margin
- && get_local_id(0) > 0
- && get_local_id(0) < get_local_size(0) - 1
- && get_local_id(1) > 0
- && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
- )
- {
- float val0 = N9[localLin];
-
- if (val0 > c_hessianThreshold)
- {
- // Coordinates for the start of the wavelet in the sum image. There
- // is some integer division involved, so don't try to simplify this
- // (cancel out sampleStep) without checking the result is the same
- const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
- const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
-
- if (within_check(maskSumTex, sum_i, sum_j, size, c_img_rows, c_img_cols, mask_step))
- {
- // Check to see if we have a max (in its 26 neighbours)
- const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff]
- && val0 > N9[localLin - get_local_size(0) - zoff]
- && val0 > N9[localLin + 1 - get_local_size(0) - zoff]
- && val0 > N9[localLin - 1 - zoff]
- && val0 > N9[localLin - zoff]
- && val0 > N9[localLin + 1 - zoff]
- && val0 > N9[localLin - 1 + get_local_size(0) - zoff]
- && val0 > N9[localLin + get_local_size(0) - zoff]
- && val0 > N9[localLin + 1 + get_local_size(0) - zoff]
-
- && val0 > N9[localLin - 1 - get_local_size(0)]
- && val0 > N9[localLin - get_local_size(0)]
- && val0 > N9[localLin + 1 - get_local_size(0)]
- && val0 > N9[localLin - 1 ]
- && val0 > N9[localLin + 1 ]
- && val0 > N9[localLin - 1 + get_local_size(0)]
- && val0 > N9[localLin + get_local_size(0)]
- && val0 > N9[localLin + 1 + get_local_size(0)]
-
- && val0 > N9[localLin - 1 - get_local_size(0) + zoff]
- && val0 > N9[localLin - get_local_size(0) + zoff]
- && val0 > N9[localLin + 1 - get_local_size(0) + zoff]
- && val0 > N9[localLin - 1 + zoff]
- && val0 > N9[localLin + zoff]
- && val0 > N9[localLin + 1 + zoff]
- && val0 > N9[localLin - 1 + get_local_size(0) + zoff]
- && val0 > N9[localLin + get_local_size(0) + zoff]
- && val0 > N9[localLin + 1 + get_local_size(0) + zoff]
- ;
-
- if(condmax)
- {
- int ind = atomic_inc(maxCounter);
-
- if (ind < c_max_candidates)
- {
- const int laplacian = (int) copysign(1.0f, trace[trace_step* (layer * c_layer_rows + i) + j]);
-
- maxPosBuffer[ind] = (int4)(j, i, layer, laplacian);
- }
- }
- }
- }
- }
-}
-
-__kernel
- void icvFindMaximaInLayer(
- __global float * det,
- __global float * trace,
- __global int4 * maxPosBuffer,
- volatile __global int* maxCounter,
- int counter_offset,
- int det_step, // the step of det in bytes
- int trace_step, // the step of trace in bytes
- int c_img_rows,
- int c_img_cols,
- int c_nOctaveLayers,
- int c_octave,
- int c_layer_rows,
- int c_layer_cols,
- int c_max_candidates,
- float c_hessianThreshold
- )
-{
- volatile __local float N9[768]; // threads.x * threads.y * 3
-
- det_step /= sizeof(float);
- trace_step /= sizeof(float);
- maxCounter += counter_offset;
-
- // Determine the indices
- const int gridDim_y = get_num_groups(1) / c_nOctaveLayers;
- const int blockIdx_y = get_group_id(1) % gridDim_y;
- const int blockIdx_z = get_group_id(1) / gridDim_y;
-
- const int layer = blockIdx_z + 1;
-
- const int size = calcSize(c_octave, layer);
-
- // Ignore pixels without a 3x3x3 neighbourhood in the layer above
- const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
-
- const int j = get_local_id(0) + get_group_id(0) * (get_local_size(0) - 2) + margin - 1;
- const int i = get_local_id(1) + blockIdx_y * (get_local_size(1) - 2) + margin - 1;
-
- // Is this thread within the hessian buffer?
- const int zoff = get_local_size(0) * get_local_size(1);
- const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
-
- int l_x = min(max(j, 0), c_img_cols - 1);
- int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1);
-
- N9[localLin - zoff] =
- det[det_step * (l_y - c_layer_rows) + l_x];
- N9[localLin ] =
- det[det_step * (l_y ) + l_x];
- N9[localLin + zoff] =
- det[det_step * (l_y + c_layer_rows) + l_x];
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (i < c_layer_rows - margin
- && j < c_layer_cols - margin
- && get_local_id(0) > 0
- && get_local_id(0) < get_local_size(0) - 1
- && get_local_id(1) > 0
- && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
- )
- {
- float val0 = N9[localLin];
- if (val0 > c_hessianThreshold)
- {
- // Coordinates for the start of the wavelet in the sum image. There
- // is some integer division involved, so don't try to simplify this
- // (cancel out sampleStep) without checking the result is the same
-
- // Check to see if we have a max (in its 26 neighbours)
- const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff]
- && val0 > N9[localLin - get_local_size(0) - zoff]
- && val0 > N9[localLin + 1 - get_local_size(0) - zoff]
- && val0 > N9[localLin - 1 - zoff]
- && val0 > N9[localLin - zoff]
- && val0 > N9[localLin + 1 - zoff]
- && val0 > N9[localLin - 1 + get_local_size(0) - zoff]
- && val0 > N9[localLin + get_local_size(0) - zoff]
- && val0 > N9[localLin + 1 + get_local_size(0) - zoff]
-
- && val0 > N9[localLin - 1 - get_local_size(0)]
- && val0 > N9[localLin - get_local_size(0)]
- && val0 > N9[localLin + 1 - get_local_size(0)]
- && val0 > N9[localLin - 1 ]
- && val0 > N9[localLin + 1 ]
- && val0 > N9[localLin - 1 + get_local_size(0)]
- && val0 > N9[localLin + get_local_size(0)]
- && val0 > N9[localLin + 1 + get_local_size(0)]
-
- && val0 > N9[localLin - 1 - get_local_size(0) + zoff]
- && val0 > N9[localLin - get_local_size(0) + zoff]
- && val0 > N9[localLin + 1 - get_local_size(0) + zoff]
- && val0 > N9[localLin - 1 + zoff]
- && val0 > N9[localLin + zoff]
- && val0 > N9[localLin + 1 + zoff]
- && val0 > N9[localLin - 1 + get_local_size(0) + zoff]
- && val0 > N9[localLin + get_local_size(0) + zoff]
- && val0 > N9[localLin + 1 + get_local_size(0) + zoff]
- ;
-
- if(condmax)
- {
- int ind = atomic_inc(maxCounter);
-
- if (ind < c_max_candidates)
- {
- const int laplacian = (int) copysign(1.0f, trace[trace_step* (layer * c_layer_rows + i) + j]);
-
- maxPosBuffer[ind] = (int4)(j, i, layer, laplacian);
- }
- }
- }
- }
-}
-
-// solve 3x3 linear system Ax=b for floating point input
-inline bool solve3x3_float(volatile __local const float A[3][3], volatile __local const float b[3], volatile __local float x[3])
-{
- float det = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1])
- - A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0])
- + A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]);
-
- if (det != 0)
- {
- F invdet = 1.0 / det;
-
- x[0] = invdet *
- (b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
- A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +
- A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] ));
-
- x[1] = invdet *
- (A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -
- b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
- A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0]));
-
- x[2] = invdet *
- (A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -
- A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +
- b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]));
-
- return true;
- }
- return false;
-}
-
-#define X_ROW 0
-#define Y_ROW 1
-#define LAPLACIAN_ROW 2
-#define OCTAVE_ROW 3
-#define SIZE_ROW 4
-#define ANGLE_ROW 5
-#define HESSIAN_ROW 6
-#define ROWS_COUNT 7
-
-////////////////////////////////////////////////////////////////////////
-// INTERPOLATION
-__kernel
- void icvInterpolateKeypoint(
- __global const float * det,
- __global const int4 * maxPosBuffer,
- __global float * keypoints,
- volatile __global int * featureCounter,
- int det_step,
- int keypoints_step,
- int c_img_rows,
- int c_img_cols,
- int c_octave,
- int c_layer_rows,
- int c_max_features
- )
-{
- det_step /= sizeof(*det);
- keypoints_step /= sizeof(*keypoints);
- __global float * featureX = keypoints + X_ROW * keypoints_step;
- __global float * featureY = keypoints + Y_ROW * keypoints_step;
- __global int * featureLaplacian = (__global int *)keypoints + LAPLACIAN_ROW * keypoints_step;
- __global int * featureOctave = (__global int *)keypoints + OCTAVE_ROW * keypoints_step;
- __global float * featureSize = keypoints + SIZE_ROW * keypoints_step;
- __global float * featureHessian = keypoints + HESSIAN_ROW * keypoints_step;
-
- const int4 maxPos = maxPosBuffer[get_group_id(0)];
-
- const int j = maxPos.x - 1 + get_local_id(0);
- const int i = maxPos.y - 1 + get_local_id(1);
- const int layer = maxPos.z - 1 + get_local_id(2);
-
- volatile __local float N9[3][3][3];
-
- N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
- det[det_step * (c_layer_rows * layer + i) + j];
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (get_local_id(0) == 0 && get_local_id(1) == 0 && get_local_id(2) == 0)
- {
- volatile __local float dD[3];
-
- //dx
- dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);
- //dy
- dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);
- //ds
- dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);
-
- volatile __local float H[3][3];
-
- //dxx
- H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
- //dxy
- H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);
- //dxs
- H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);
- //dyx = dxy
- H[1][0] = H[0][1];
- //dyy
- H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];
- //dys
- H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);
- //dsx = dxs
- H[2][0] = H[0][2];
- //dsy = dys
- H[2][1] = H[1][2];
- //dss
- H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
-
- volatile __local float x[3];
-
- if (solve3x3_float(H, dD, x))
- {
- if (fabs(x[0]) <= 1.f && fabs(x[1]) <= 1.f && fabs(x[2]) <= 1.f)
- {
- // if the step is within the interpolation region, perform it
-
- const int size = calcSize(c_octave, maxPos.z);
-
- const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;
- const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;
-
- const float center_i = sum_i + (float)(size - 1) / 2;
- const float center_j = sum_j + (float)(size - 1) / 2;
-
- const float px = center_j + x[0] * (1 << c_octave);
- const float py = center_i + x[1] * (1 << c_octave);
-
- const int ds = size - calcSize(c_octave, maxPos.z - 1);
- const float psize = round(size + x[2] * ds);
-
- /* The sampling intervals and wavelet sized for selecting an orientation
- and building the keypoint descriptor are defined relative to 's' */
- const float s = psize * 1.2f / 9.0f;
-
- /* To find the dominant orientation, the gradients in x and y are
- sampled in a circle of radius 6s using wavelets of size 4s.
- We ensure the gradient wavelet size is even to ensure the
- wavelet pattern is balanced and symmetric around its center */
- const int grad_wav_size = 2 * convert_int_rte(2.0f * s);
-
- // check when grad_wav_size is too big
- if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
- {
- // Get a new feature index.
- int ind = atomic_inc(featureCounter);
-
- if (ind < c_max_features)
- {
- featureX[ind] = px;
- featureY[ind] = py;
- featureLaplacian[ind] = maxPos.w;
- featureOctave[ind] = c_octave;
- featureSize[ind] = psize;
- featureHessian[ind] = N9[1][1][1];
- }
- } // grad_wav_size check
- } // If the subpixel interpolation worked
- }
- } // If this is thread 0.
-}
-
-////////////////////////////////////////////////////////////////////////
-// Orientation
-
-#define ORI_SEARCH_INC 5
-#define ORI_WIN 60
-#define ORI_SAMPLES 113
-
-__constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
-__constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
-__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
- 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
- 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
- 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
- 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
- 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
- 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
- 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
- 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
- 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
- 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
- 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
- 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
- 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
- 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f,
- 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
- 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
- 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
- 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f,
- 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
- 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
- 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f,
- 0.001707611023448408f, 0.001455130288377404f};
-
-__constant float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
-__constant float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
-
-void reduce_32_sum(volatile __local float * data, volatile float* partial_reduction, int tid)
-{
-#define op(A, B) (*A)+(B)
- data[tid] = *partial_reduction;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (tid < 16)
- {
- data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]);
- data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]);
- data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]);
- data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]);
- data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]);
- }
-#undef op
-}
-
-__kernel
- void icvCalcOrientation(
- IMAGE_INT32 sumTex,
- __global float * keypoints,
- int keypoints_step,
- int c_img_rows,
- int c_img_cols,
- int sum_step
- )
-{
- keypoints_step /= sizeof(*keypoints);
- sum_step /= sizeof(uint);
- __global float* featureX = keypoints + X_ROW * keypoints_step;
- __global float* featureY = keypoints + Y_ROW * keypoints_step;
- __global float* featureSize = keypoints + SIZE_ROW * keypoints_step;
- __global float* featureDir = keypoints + ANGLE_ROW * keypoints_step;
-
-
- volatile __local float s_X[128];
- volatile __local float s_Y[128];
- volatile __local float s_angle[128];
-
- volatile __local float s_sumx[32 * 4];
- volatile __local float s_sumy[32 * 4];
-
- /* The sampling intervals and wavelet sized for selecting an orientation
- and building the keypoint descriptor are defined relative to 's' */
- const float s = featureSize[get_group_id(0)] * 1.2f / 9.0f;
-
-
- /* To find the dominant orientation, the gradients in x and y are
- sampled in a circle of radius 6s using wavelets of size 4s.
- We ensure the gradient wavelet size is even to ensure the
- wavelet pattern is balanced and symmetric around its center */
- const int grad_wav_size = 2 * convert_int_rte(2.0f * s);
-
- // check when grad_wav_size is too big
- if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size)
- return;
-
- // Calc X, Y, angle and store it to shared memory
- const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
-
- float X = 0.0f, Y = 0.0f, angle = 0.0f;
-
- if (tid < ORI_SAMPLES)
- {
- const float margin = (float)(grad_wav_size - 1) / 2.0f;
- const int x = convert_int_rte(featureX[get_group_id(0)] + c_aptX[tid] * s - margin);
- const int y = convert_int_rte(featureY[get_group_id(0)] + c_aptY[tid] * s - margin);
-
- if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
- x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
- {
- X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
- Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
-
- angle = atan2(Y, X);
-
- if (angle < 0)
- angle += 2.0f * CV_PI_F;
- angle *= 180.0f / CV_PI_F;
-
- }
- }
- s_X[tid] = X;
- s_Y[tid] = Y;
- s_angle[tid] = angle;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- float bestx = 0, besty = 0, best_mod = 0;
-
-#pragma unroll
- for (int i = 0; i < 18; ++i)
- {
- const int dir = (i * 4 + get_local_id(1)) * ORI_SEARCH_INC;
-
- volatile float sumx = 0.0f, sumy = 0.0f;
- int d = abs(convert_int_rte(s_angle[get_local_id(0)]) - dir);
- if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
- {
- sumx = s_X[get_local_id(0)];
- sumy = s_Y[get_local_id(0)];
- }
- d = abs(convert_int_rte(s_angle[get_local_id(0) + 32]) - dir);
- if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
- {
- sumx += s_X[get_local_id(0) + 32];
- sumy += s_Y[get_local_id(0) + 32];
- }
- d = abs(convert_int_rte(s_angle[get_local_id(0) + 64]) - dir);
- if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
- {
- sumx += s_X[get_local_id(0) + 64];
- sumy += s_Y[get_local_id(0) + 64];
- }
- d = abs(convert_int_rte(s_angle[get_local_id(0) + 96]) - dir);
- if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
- {
- sumx += s_X[get_local_id(0) + 96];
- sumy += s_Y[get_local_id(0) + 96];
- }
- reduce_32_sum(s_sumx + get_local_id(1) * 32, &sumx, get_local_id(0));
- reduce_32_sum(s_sumy + get_local_id(1) * 32, &sumy, get_local_id(0));
-
- const float temp_mod = sumx * sumx + sumy * sumy;
- if (temp_mod > best_mod)
- {
- best_mod = temp_mod;
- bestx = sumx;
- besty = sumy;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if (get_local_id(0) == 0)
- {
- s_X[get_local_id(1)] = bestx;
- s_Y[get_local_id(1)] = besty;
- s_angle[get_local_id(1)] = best_mod;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (get_local_id(1) == 0 && get_local_id(0) == 0)
- {
- int bestIdx = 0;
-
- if (s_angle[1] > s_angle[bestIdx])
- bestIdx = 1;
- if (s_angle[2] > s_angle[bestIdx])
- bestIdx = 2;
- if (s_angle[3] > s_angle[bestIdx])
- bestIdx = 3;
-
- float kp_dir = atan2(s_Y[bestIdx], s_X[bestIdx]);
- if (kp_dir < 0)
- kp_dir += 2.0f * CV_PI_F;
- kp_dir *= 180.0f / CV_PI_F;
-
- kp_dir = 360.0f - kp_dir;
- if (fabs(kp_dir - 360.f) < FLT_EPSILON)
- kp_dir = 0.f;
-
- featureDir[get_group_id(0)] = kp_dir;
- }
-}
-
-
-__kernel
- void icvSetUpright(
- __global float * keypoints,
- int keypoints_step,
- int nFeatures
- )
-{
- keypoints_step /= sizeof(*keypoints);
- __global float* featureDir = keypoints + ANGLE_ROW * keypoints_step;
-
- if(get_global_id(0) <= nFeatures)
- {
- featureDir[get_global_id(0)] = 270.0f;
- }
-}
-
-
-#undef ORI_SEARCH_INC
-#undef ORI_WIN
-#undef ORI_SAMPLES
-
-////////////////////////////////////////////////////////////////////////
-// Descriptors
-
-#define PATCH_SZ 20
-
-__constant float c_DW[PATCH_SZ * PATCH_SZ] =
-{
- 3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f,
- 8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
- 1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
- 3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
- 5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
- 9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
- 0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
- 0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
- 0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
- 0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
- 0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
- 0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
- 0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
- 0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
- 9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
- 5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
- 3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
- 1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
- 8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
- 3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
-};
-
-// utility for linear filter
-inline uchar readerGet(
- IMAGE_INT8 src,
- const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
- int i, int j, int rows, int cols, int elemPerRow
- )
-{
- float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
- float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
- return read_imgTex(src, sampler, (float2)(pixel_x, pixel_y), rows, cols, elemPerRow);
-}
-
-inline float linearFilter(
- IMAGE_INT8 src,
- const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
- float y, float x, int rows, int cols, int elemPerRow
- )
-{
- x -= 0.5f;
- y -= 0.5f;
-
- float out = 0.0f;
-
- const int x1 = convert_int_rtn(x);
- const int y1 = convert_int_rtn(y);
- const int x2 = x1 + 1;
- const int y2 = y1 + 1;
-
- uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1, rows, cols, elemPerRow);
- out = out + src_reg * ((x2 - x) * (y2 - y));
-
- src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2, rows, cols, elemPerRow);
- out = out + src_reg * ((x - x1) * (y2 - y));
-
- src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1, rows, cols, elemPerRow);
- out = out + src_reg * ((x2 - x) * (y - y1));
-
- src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2, rows, cols, elemPerRow);
- out = out + src_reg * ((x - x1) * (y - y1));
-
- return out;
-}
-
-void calc_dx_dy(
- IMAGE_INT8 imgTex,
- volatile __local float s_dx_bin[25],
- volatile __local float s_dy_bin[25],
- volatile __local float s_PATCH[6][6],
- __global const float* featureX,
- __global const float* featureY,
- __global const float* featureSize,
- __global const float* featureDir,
- int rows,
- int cols,
- int elemPerRow
- )
-{
- const float centerX = featureX[get_group_id(0)];
- const float centerY = featureY[get_group_id(0)];
- const float size = featureSize[get_group_id(0)];
- float descriptor_dir = 360.0f - featureDir[get_group_id(0)];
- if(fabs(descriptor_dir - 360.0f) < FLT_EPSILON)
- {
- descriptor_dir = 0.0f;
- }
- descriptor_dir *= (float)(CV_PI_F / 180.0f);
-
- /* The sampling intervals and wavelet sized for selecting an orientation
- and building the keypoint descriptor are defined relative to 's' */
- const float s = size * 1.2f / 9.0f;
-
- /* Extract a window of pixels around the keypoint of size 20s */
- const int win_size = (int)((PATCH_SZ + 1) * s);
-
- float sin_dir;
- float cos_dir;
- sin_dir = sincos(descriptor_dir, &cos_dir);
-
- /* Nearest neighbour version (faster) */
- const float win_offset = -(float)(win_size - 1) / 2;
-
- // Compute sampling points
- // since grids are 2D, need to compute xBlock and yBlock indices
- const int xBlock = (get_group_id(1) & 3); // get_group_id(1) % 4
- const int yBlock = (get_group_id(1) >> 2); // floor(get_group_id(1)/4)
- const int xIndex = xBlock * 5 + get_local_id(0);
- const int yIndex = yBlock * 5 + get_local_id(1);
-
- const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
- const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
-
- s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow);
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (get_local_id(0) < 5 && get_local_id(1) < 5)
- {
- const int tid = get_local_id(1) * 5 + get_local_id(0);
-
- const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
-
- const float vx = (
- s_PATCH[get_local_id(1) ][get_local_id(0) + 1] -
- s_PATCH[get_local_id(1) ][get_local_id(0) ] +
- s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
- s_PATCH[get_local_id(1) + 1][get_local_id(0) ])
- * dw;
- const float vy = (
- s_PATCH[get_local_id(1) + 1][get_local_id(0) ] -
- s_PATCH[get_local_id(1) ][get_local_id(0) ] +
- s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
- s_PATCH[get_local_id(1) ][get_local_id(0) + 1])
- * dw;
- s_dx_bin[tid] = vx;
- s_dy_bin[tid] = vy;
- }
-}
-void reduce_sum25(
- volatile __local float* sdata1,
- volatile __local float* sdata2,
- volatile __local float* sdata3,
- volatile __local float* sdata4,
- int tid
- )
-{
- // first step is to reduce from 25 to 16
- if (tid < 9) // use 9 threads
- {
- sdata1[tid] += sdata1[tid + 16];
- sdata2[tid] += sdata2[tid + 16];
- sdata3[tid] += sdata3[tid + 16];
- sdata4[tid] += sdata4[tid + 16];
- }
-
- // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
- if (tid < 8)
- {
- sdata1[tid] += sdata1[tid + 8];
- sdata1[tid] += sdata1[tid + 4];
- sdata1[tid] += sdata1[tid + 2];
- sdata1[tid] += sdata1[tid + 1];
-
- sdata2[tid] += sdata2[tid + 8];
- sdata2[tid] += sdata2[tid + 4];
- sdata2[tid] += sdata2[tid + 2];
- sdata2[tid] += sdata2[tid + 1];
-
- sdata3[tid] += sdata3[tid + 8];
- sdata3[tid] += sdata3[tid + 4];
- sdata3[tid] += sdata3[tid + 2];
- sdata3[tid] += sdata3[tid + 1];
-
- sdata4[tid] += sdata4[tid + 8];
- sdata4[tid] += sdata4[tid + 4];
- sdata4[tid] += sdata4[tid + 2];
- sdata4[tid] += sdata4[tid + 1];
- }
-}
-
-__kernel
- void compute_descriptors64(
- IMAGE_INT8 imgTex,
- volatile __global float * descriptors,
- __global const float * keypoints,
- int descriptors_step,
- int keypoints_step,
- int rows,
- int cols,
- int img_step
- )
-{
- descriptors_step /= sizeof(float);
- keypoints_step /= sizeof(float);
- __global const float * featureX = keypoints + X_ROW * keypoints_step;
- __global const float * featureY = keypoints + Y_ROW * keypoints_step;
- __global const float * featureSize = keypoints + SIZE_ROW * keypoints_step;
- __global const float * featureDir = keypoints + ANGLE_ROW * keypoints_step;
-
- // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
- volatile __local float sdx[25];
- volatile __local float sdy[25];
- volatile __local float sdxabs[25];
- volatile __local float sdyabs[25];
- volatile __local float s_PATCH[6][6];
-
- calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
- barrier(CLK_LOCAL_MEM_FENCE);
-
- const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
-
- if (tid < 25)
- {
- sdxabs[tid] = fabs(sdx[tid]); // |dx| array
- sdyabs[tid] = fabs(sdy[tid]); // |dy| array
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if (tid < 25)
- {
- reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if (tid < 25)
- {
- volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
-
- // write dx, dy, |dx|, |dy|
- if (tid == 0)
- {
- descriptors_block[0] = sdx[0];
- descriptors_block[1] = sdy[0];
- descriptors_block[2] = sdxabs[0];
- descriptors_block[3] = sdyabs[0];
- }
- }
-}
-__kernel
- void compute_descriptors128(
- IMAGE_INT8 imgTex,
- __global volatile float * descriptors,
- __global float * keypoints,
- int descriptors_step,
- int keypoints_step,
- int rows,
- int cols,
- int img_step
- )
-{
- descriptors_step /= sizeof(*descriptors);
- keypoints_step /= sizeof(*keypoints);
-
- __global float * featureX = keypoints + X_ROW * keypoints_step;
- __global float * featureY = keypoints + Y_ROW * keypoints_step;
- __global float* featureSize = keypoints + SIZE_ROW * keypoints_step;
- __global float* featureDir = keypoints + ANGLE_ROW * keypoints_step;
-
- // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
- volatile __local float sdx[25];
- volatile __local float sdy[25];
-
- // sum (reduce) 5x5 area response
- volatile __local float sd1[25];
- volatile __local float sd2[25];
- volatile __local float sdabs1[25];
- volatile __local float sdabs2[25];
- volatile __local float s_PATCH[6][6];
-
- calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
- barrier(CLK_LOCAL_MEM_FENCE);
-
- const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
-
- if (tid < 25)
- {
- if (sdy[tid] >= 0)
- {
- sd1[tid] = sdx[tid];
- sdabs1[tid] = fabs(sdx[tid]);
- sd2[tid] = 0;
- sdabs2[tid] = 0;
- }
- else
- {
- sd1[tid] = 0;
- sdabs1[tid] = 0;
- sd2[tid] = sdx[tid];
- sdabs2[tid] = fabs(sdx[tid]);
- }
- //barrier(CLK_LOCAL_MEM_FENCE);
-
- reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
- //barrier(CLK_LOCAL_MEM_FENCE);
-
- volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
-
- // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
- if (tid == 0)
- {
- descriptors_block[0] = sd1[0];
- descriptors_block[1] = sdabs1[0];
- descriptors_block[2] = sd2[0];
- descriptors_block[3] = sdabs2[0];
- }
-
- if (sdx[tid] >= 0)
- {
- sd1[tid] = sdy[tid];
- sdabs1[tid] = fabs(sdy[tid]);
- sd2[tid] = 0;
- sdabs2[tid] = 0;
- }
- else
- {
- sd1[tid] = 0;
- sdabs1[tid] = 0;
- sd2[tid] = sdy[tid];
- sdabs2[tid] = fabs(sdy[tid]);
- }
- //barrier(CLK_LOCAL_MEM_FENCE);
-
- reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
- //barrier(CLK_LOCAL_MEM_FENCE);
-
- // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
- if (tid == 0)
- {
- descriptors_block[4] = sd1[0];
- descriptors_block[5] = sdabs1[0];
- descriptors_block[6] = sd2[0];
- descriptors_block[7] = sdabs2[0];
- }
- }
-}
-
-__kernel
- void normalize_descriptors128(__global float * descriptors, int descriptors_step)
-{
- descriptors_step /= sizeof(*descriptors);
- // no need for thread ID
- __global float* descriptor_base = descriptors + descriptors_step * get_group_id(0);
-
- // read in the unnormalized descriptor values (squared)
- volatile __local float sqDesc[128];
- const float lookup = descriptor_base[get_local_id(0)];
- sqDesc[get_local_id(0)] = lookup * lookup;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (get_local_id(0) < 64)
- sqDesc[get_local_id(0)] += sqDesc[get_local_id(0) + 64];
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // reduction to get total
- if (get_local_id(0) < 32)
- {
- volatile __local float* smem = sqDesc;
-
- smem[get_local_id(0)] += smem[get_local_id(0) + 32];
- smem[get_local_id(0)] += smem[get_local_id(0) + 16];
- smem[get_local_id(0)] += smem[get_local_id(0) + 8];
- smem[get_local_id(0)] += smem[get_local_id(0) + 4];
- smem[get_local_id(0)] += smem[get_local_id(0) + 2];
- smem[get_local_id(0)] += smem[get_local_id(0) + 1];
- }
-
- // compute length (square root)
- volatile __local float len;
- if (get_local_id(0) == 0)
- {
- len = sqrt(sqDesc[0]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // normalize and store in output
- descriptor_base[get_local_id(0)] = lookup / len;
-}
-__kernel
- void normalize_descriptors64(__global float * descriptors, int descriptors_step)
-{
- descriptors_step /= sizeof(*descriptors);
- // no need for thread ID
- __global float* descriptor_base = descriptors + descriptors_step * get_group_id(0);
-
- // read in the unnormalized descriptor values (squared)
- volatile __local float sqDesc[64];
- const float lookup = descriptor_base[get_local_id(0)];
- sqDesc[get_local_id(0)] = lookup * lookup;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // reduction to get total
- if (get_local_id(0) < 32)
- {
- volatile __local float* smem = sqDesc;
-
- smem[get_local_id(0)] += smem[get_local_id(0) + 32];
- smem[get_local_id(0)] += smem[get_local_id(0) + 16];
- smem[get_local_id(0)] += smem[get_local_id(0) + 8];
- smem[get_local_id(0)] += smem[get_local_id(0) + 4];
- smem[get_local_id(0)] += smem[get_local_id(0) + 2];
- smem[get_local_id(0)] += smem[get_local_id(0) + 1];
- }
-
- // compute length (square root)
- volatile __local float len;
- if (get_local_id(0) == 0)
- {
- len = sqrt(sqDesc[0]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // normalize and store in output
- descriptor_base[get_local_id(0)] = lookup / len;
-}
#if defined (HAVE_OPENCL)
-#if defined __APPLE__
-#include <OpenCL/OpenCL.h>
-#else
-#include <CL/opencl.h>
-#endif
-
+#include "opencv2/ocl/private/util.hpp"
#include "safe_call.hpp"
using namespace std;
{
namespace ocl
{
- ///////////////////////////OpenCL call wrappers////////////////////////////
- void openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
- size_t widthInBytes, size_t height);
- void openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
- size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type);
- void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
- const void *src, size_t spitch,
- size_t width, size_t height, enum openCLMemcpyKind kind, int channels = -1);
- void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
- const void *src, size_t spitch,
- size_t width, size_t height, int src_offset);
- void openCLFree(void *devPtr);
- cl_mem openCLCreateBuffer(Context *clCxt, size_t flag, size_t size);
- void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size);
- cl_kernel openCLGetKernelFromSource(const Context *clCxt,
- const char **source, string kernelName);
- cl_kernel openCLGetKernelFromSource(const Context *clCxt,
- const char **source, string kernelName, const char *build_options);
- void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads);
- void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, vector< std::pair<size_t, const void *> > &args,
- int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1);
- void openCLExecuteKernel_(Context *clCxt , const char **source, string kernelName,
- size_t globalThreads[3], size_t localThreads[3],
- vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options);
- void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
- size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels, int depth);
- void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
- size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels,
- int depth, const char *build_options);
-
- cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value,
- const size_t size);
-
- cl_mem openCLMalloc(cl_context clCxt, size_t size, cl_mem_flags flags, void *host_ptr);
-
- //void openCLMemcpy2DWithNoPadding(cl_command_queue command_queue, cl_mem buffer, size_t size, size_t offset, void *ptr,
- // enum openCLMemcpyKind kind, cl_bool blocking_write);
- int savetofile(const Context *clcxt, cl_program &program, const char *fileName);
struct Context::Impl
{
//Information of the OpenCL context
#include "precomp.hpp"
-#include "mcwutil.hpp"
using namespace std;
using namespace cv;
using namespace cv::ocl;
+++ /dev/null
-/*M/////////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other oclMaterials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include <iomanip>
-#include "precomp.hpp"
-#include "mcwutil.hpp"
-//#include "opencv2/highgui/highgui.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-using namespace std;
-
-namespace cv
-{
- namespace ocl
- {
- ///////////////////////////OpenCL kernel strings///////////////////////////
- extern const char *nonfree_surf;
-
- const char* noImage2dOption = "-D DISABLE_IMAGE2D";
-
- static void openCLExecuteKernelSURF(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
- size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels, int depth)
- {
- if(support_image2d())
- {
- openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth);
- }
- else
- {
- openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, noImage2dOption);
- }
- }
- }
-}
-
-
-static inline int divUp(int total, int grain)
-{
- return (total + grain - 1) / grain;
-}
-static inline int calcSize(int octave, int layer)
-{
- /* Wavelet size at first layer of first octave. */
- const int HAAR_SIZE0 = 9;
-
- /* Wavelet size increment between layers. This should be an even number,
- such that the wavelet sizes in an octave are either all even or all odd.
- This ensures that when looking for the neighbors of a sample, the layers
-
- above and below are aligned correctly. */
- const int HAAR_SIZE_INC = 6;
-
- return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
-}
-
-
-class SURF_OCL_Invoker
-{
-public:
- // facilities
- void bindImgTex(const oclMat &img, cl_mem &texture);
-
- //void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
- //void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
-
- // kernel callers declarations
- void icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int layer_rows);
-
- void icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
- int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols);
-
- void icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter,
- oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures);
-
- void icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures);
-
- void icvSetUpright_gpu(const oclMat &keypoints, int nFeatures);
-
- void compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures);
- // end of kernel callers declarations
-
- SURF_OCL_Invoker(SURF_OCL &surf, const oclMat &img, const oclMat &mask) :
- surf_(surf),
- img_cols(img.cols), img_rows(img.rows),
- use_mask(!mask.empty()), counters(oclMat()),
- imgTex(NULL), sumTex(NULL), maskSumTex(NULL), _img(img)
- {
- CV_Assert(!img.empty() && img.type() == CV_8UC1);
- CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
- CV_Assert(surf_.nOctaves > 0 && surf_.nOctaveLayers > 0);
-
- const int min_size = calcSize(surf_.nOctaves - 1, 0);
- CV_Assert(img_rows - min_size >= 0);
- CV_Assert(img_cols - min_size >= 0);
-
- const int layer_rows = img_rows >> (surf_.nOctaves - 1);
- const int layer_cols = img_cols >> (surf_.nOctaves - 1);
- const int min_margin = ((calcSize((surf_.nOctaves - 1), 2) >> 1) >> (surf_.nOctaves - 1)) + 1;
- CV_Assert(layer_rows - 2 * min_margin > 0);
- CV_Assert(layer_cols - 2 * min_margin > 0);
-
- maxFeatures = std::min(static_cast<int>(img.size().area() * surf.keypointsRatio), 65535);
- maxCandidates = std::min(static_cast<int>(1.5 * maxFeatures), 65535);
-
- CV_Assert(maxFeatures > 0);
-
- counters.create(1, surf_.nOctaves + 1, CV_32SC1);
- counters.setTo(Scalar::all(0));
-
- integral(img, surf_.sum);
- if(support_image2d())
- {
- bindImgTex(img, imgTex);
- bindImgTex(surf_.sum, sumTex);
- }
-
- maskSumTex = 0;
-
- if (use_mask)
- {
- CV_Error(CV_StsBadFunc, "Masked SURF detector is not implemented yet");
- //!FIXME
- // temp fix for missing min overload
- //oclMat temp(mask.size(), mask.type());
- //temp.setTo(Scalar::all(1.0));
- ////cv::ocl::min(mask, temp, surf_.mask1); ///////// disable this
- //integral(surf_.mask1, surf_.maskSum);
- //bindImgTex(surf_.maskSum, maskSumTex);
- }
- }
-
- void detectKeypoints(oclMat &keypoints)
- {
- // create image pyramid buffers
- // different layers have same sized buffers, but they are sampled from Gaussian kernel.
- ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.det);
- ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.trace);
-
- ensureSizeIsEnough(1, maxCandidates, CV_32SC4, surf_.maxPosBuffer);
- ensureSizeIsEnough(SURF_OCL::ROWS_COUNT, maxFeatures, CV_32FC1, keypoints);
- keypoints.setTo(Scalar::all(0));
-
- for (int octave = 0; octave < surf_.nOctaves; ++octave)
- {
- const int layer_rows = img_rows >> octave;
- const int layer_cols = img_cols >> octave;
-
- //loadOctaveConstants(octave, layer_rows, layer_cols);
-
- icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, octave, surf_.nOctaveLayers, layer_rows);
-
- icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer, counters, 1 + octave,
- octave, use_mask, surf_.nOctaveLayers, layer_rows, layer_cols);
-
- int maxCounter = ((Mat)counters).at<int>(1 + octave);
- maxCounter = std::min(maxCounter, static_cast<int>(maxCandidates));
-
- if (maxCounter > 0)
- {
- icvInterpolateKeypoint_gpu(surf_.det, surf_.maxPosBuffer, maxCounter,
- keypoints, counters, octave, layer_rows, maxFeatures);
- }
- }
- int featureCounter = Mat(counters).at<int>(0);
- featureCounter = std::min(featureCounter, static_cast<int>(maxFeatures));
-
- keypoints.cols = featureCounter;
-
- if (surf_.upright)
- {
- //keypoints.row(SURF_OCL::ANGLE_ROW).setTo(Scalar::all(90.0));
- setUpright(keypoints);
- }
- else
- {
- findOrientation(keypoints);
- }
- }
-
- void setUpright(oclMat &keypoints)
- {
- const int nFeatures = keypoints.cols;
- if(nFeatures > 0)
- {
- icvSetUpright_gpu(keypoints, keypoints.cols);
- }
- }
-
- void findOrientation(oclMat &keypoints)
- {
- const int nFeatures = keypoints.cols;
- if (nFeatures > 0)
- {
- icvCalcOrientation_gpu(keypoints, nFeatures);
- }
- }
-
- void computeDescriptors(const oclMat &keypoints, oclMat &descriptors, int descriptorSize)
- {
- const int nFeatures = keypoints.cols;
- if (nFeatures > 0)
- {
- ensureSizeIsEnough(nFeatures, descriptorSize, CV_32F, descriptors);
- compute_descriptors_gpu(descriptors, keypoints, nFeatures);
- }
- }
-
- ~SURF_OCL_Invoker()
- {
- if(imgTex)
- openCLFree(imgTex);
- if(sumTex)
- openCLFree(sumTex);
- if(maskSumTex)
- openCLFree(maskSumTex);
- }
-
-private:
- SURF_OCL &surf_;
-
- int img_cols, img_rows;
-
- bool use_mask;
-
- int maxCandidates;
- int maxFeatures;
-
- oclMat counters;
-
- // texture buffers
- cl_mem imgTex;
- cl_mem sumTex;
- cl_mem maskSumTex;
-
- const oclMat _img; // make a copy for non-image2d_t supported platform
-
- SURF_OCL_Invoker &operator= (const SURF_OCL_Invoker &right)
- {
- (*this) = right;
- return *this;
- } // remove warning C4512
-};
-
-cv::ocl::SURF_OCL::SURF_OCL()
-{
- hessianThreshold = 100.0f;
- extended = true;
- nOctaves = 4;
- nOctaveLayers = 2;
- keypointsRatio = 0.01f;
- upright = false;
-}
-
-cv::ocl::SURF_OCL::SURF_OCL(double _threshold, int _nOctaves, int _nOctaveLayers, bool _extended, float _keypointsRatio, bool _upright)
-{
- hessianThreshold = saturate_cast<float>(_threshold);
- extended = _extended;
- nOctaves = _nOctaves;
- nOctaveLayers = _nOctaveLayers;
- keypointsRatio = _keypointsRatio;
- upright = _upright;
-}
-
-int cv::ocl::SURF_OCL::descriptorSize() const
-{
- return extended ? 128 : 64;
-}
-
-void cv::ocl::SURF_OCL::uploadKeypoints(const vector<KeyPoint> &keypoints, oclMat &keypointsGPU)
-{
- if (keypoints.empty())
- keypointsGPU.release();
- else
- {
- Mat keypointsCPU(SURF_OCL::ROWS_COUNT, static_cast<int>(keypoints.size()), CV_32FC1);
-
- float *kp_x = keypointsCPU.ptr<float>(SURF_OCL::X_ROW);
- float *kp_y = keypointsCPU.ptr<float>(SURF_OCL::Y_ROW);
- int *kp_laplacian = keypointsCPU.ptr<int>(SURF_OCL::LAPLACIAN_ROW);
- int *kp_octave = keypointsCPU.ptr<int>(SURF_OCL::OCTAVE_ROW);
- float *kp_size = keypointsCPU.ptr<float>(SURF_OCL::SIZE_ROW);
- float *kp_dir = keypointsCPU.ptr<float>(SURF_OCL::ANGLE_ROW);
- float *kp_hessian = keypointsCPU.ptr<float>(SURF_OCL::HESSIAN_ROW);
-
- for (size_t i = 0, size = keypoints.size(); i < size; ++i)
- {
- const KeyPoint &kp = keypoints[i];
- kp_x[i] = kp.pt.x;
- kp_y[i] = kp.pt.y;
- kp_octave[i] = kp.octave;
- kp_size[i] = kp.size;
- kp_dir[i] = kp.angle;
- kp_hessian[i] = kp.response;
- kp_laplacian[i] = 1;
- }
-
- keypointsGPU.upload(keypointsCPU);
- }
-}
-
-void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat &keypointsGPU, vector<KeyPoint> &keypoints)
-{
- const int nFeatures = keypointsGPU.cols;
-
- if (nFeatures == 0)
- keypoints.clear();
- else
- {
- CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == ROWS_COUNT);
-
- Mat keypointsCPU(keypointsGPU);
-
- keypoints.resize(nFeatures);
-
- float *kp_x = keypointsCPU.ptr<float>(SURF_OCL::X_ROW);
- float *kp_y = keypointsCPU.ptr<float>(SURF_OCL::Y_ROW);
- int *kp_laplacian = keypointsCPU.ptr<int>(SURF_OCL::LAPLACIAN_ROW);
- int *kp_octave = keypointsCPU.ptr<int>(SURF_OCL::OCTAVE_ROW);
- float *kp_size = keypointsCPU.ptr<float>(SURF_OCL::SIZE_ROW);
- float *kp_dir = keypointsCPU.ptr<float>(SURF_OCL::ANGLE_ROW);
- float *kp_hessian = keypointsCPU.ptr<float>(SURF_OCL::HESSIAN_ROW);
-
- for (int i = 0; i < nFeatures; ++i)
- {
- KeyPoint &kp = keypoints[i];
- kp.pt.x = kp_x[i];
- kp.pt.y = kp_y[i];
- kp.class_id = kp_laplacian[i];
- kp.octave = kp_octave[i];
- kp.size = kp_size[i];
- kp.angle = kp_dir[i];
- kp.response = kp_hessian[i];
- }
- }
-}
-
-void cv::ocl::SURF_OCL::downloadDescriptors(const oclMat &descriptorsGPU, vector<float> &descriptors)
-{
- if (descriptorsGPU.empty())
- descriptors.clear();
- else
- {
- CV_Assert(descriptorsGPU.type() == CV_32F);
-
- descriptors.resize(descriptorsGPU.rows * descriptorsGPU.cols);
- Mat descriptorsCPU(descriptorsGPU.size(), CV_32F, &descriptors[0]);
- descriptorsGPU.download(descriptorsCPU);
- }
-}
-
-void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints)
-{
- if (!img.empty())
- {
- SURF_OCL_Invoker surf(*this, img, mask);
-
- surf.detectKeypoints(keypoints);
- }
-}
-
-void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints, oclMat &descriptors,
- bool useProvidedKeypoints)
-{
- if (!img.empty())
- {
- SURF_OCL_Invoker surf(*this, img, mask);
-
- if (!useProvidedKeypoints)
- surf.detectKeypoints(keypoints);
- else if (!upright)
- {
- surf.findOrientation(keypoints);
- }
-
- surf.computeDescriptors(keypoints, descriptors, descriptorSize());
- }
-}
-
-void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, vector<KeyPoint> &keypoints)
-{
- oclMat keypointsGPU;
-
- (*this)(img, mask, keypointsGPU);
-
- downloadKeypoints(keypointsGPU, keypoints);
-}
-
-void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, vector<KeyPoint> &keypoints,
- oclMat &descriptors, bool useProvidedKeypoints)
-{
- oclMat keypointsGPU;
-
- if (useProvidedKeypoints)
- uploadKeypoints(keypoints, keypointsGPU);
-
- (*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints);
-
- downloadKeypoints(keypointsGPU, keypoints);
-}
-
-void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, vector<KeyPoint> &keypoints,
- vector<float> &descriptors, bool useProvidedKeypoints)
-{
- oclMat descriptorsGPU;
-
- (*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints);
-
- downloadDescriptors(descriptorsGPU, descriptors);
-}
-
-void cv::ocl::SURF_OCL::releaseMemory()
-{
- sum.release();
- mask1.release();
- maskSum.release();
- intBuffer.release();
- det.release();
- trace.release();
- maxPosBuffer.release();
-}
-
-
-// bind source buffer to image oject.
-void SURF_OCL_Invoker::bindImgTex(const oclMat &img, cl_mem &texture)
-{
- if(texture)
- {
- openCLFree(texture);
- }
- texture = bindTexture(img);
-}
-
-////////////////////////////
-// kernel caller definitions
-void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int c_layer_rows)
-{
- const int min_size = calcSize(octave, 0);
- const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
- const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
-
- Context *clCxt = det.clCxt;
- string kernelName = "icvCalcLayerDetAndTrace";
- vector< pair<size_t, const void *> > args;
-
- if(sumTex)
- {
- args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex));
- }
- else
- {
- args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
- }
- args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
- args.push_back( make_pair( sizeof(cl_mem), (void *)&trace.data));
- args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
- args.push_back( make_pair( sizeof(cl_int), (void *)&trace.step));
- args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
- args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
- args.push_back( make_pair( sizeof(cl_int), (void *)&nOctaveLayers));
- args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
- args.push_back( make_pair( sizeof(cl_int), (void *)&c_layer_rows));
- args.push_back( make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
-
- size_t localThreads[3] = {16, 16, 1};
- size_t globalThreads[3] =
- {
- divUp(max_samples_j, localThreads[0]) *localThreads[0],
- divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2),
- 1
- };
- openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-
-void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
- int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols)
-{
- const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
-
- Context *clCxt = det.clCxt;
- string kernelName = use_mask ? "icvFindMaximaInLayer_withmask" : "icvFindMaximaInLayer";
- vector< pair<size_t, const void *> > args;
-
- args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
- args.push_back( make_pair( sizeof(cl_mem), (void *)&trace.data));
- args.push_back( make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
- args.push_back( make_pair( sizeof(cl_mem), (void *)&maxCounter.data));
- args.push_back( make_pair( sizeof(cl_int), (void *)&counterOffset));
- args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
- args.push_back( make_pair( sizeof(cl_int), (void *)&trace.step));
- args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
- args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
- args.push_back( make_pair( sizeof(cl_int), (void *)&nLayers));
- args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
- args.push_back( make_pair( sizeof(cl_int), (void *)&layer_rows));
- args.push_back( make_pair( sizeof(cl_int), (void *)&layer_cols));
- args.push_back( make_pair( sizeof(cl_int), (void *)&maxCandidates));
- args.push_back( make_pair( sizeof(cl_float), (void *)&surf_.hessianThreshold));
-
- if(use_mask)
- {
- if(maskSumTex)
- {
- args.push_back( make_pair( sizeof(cl_mem), (void *)&maskSumTex));
- }
- else
- {
- args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.data));
- }
- args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.step));
- }
- size_t localThreads[3] = {16, 16, 1};
- size_t globalThreads[3] = {divUp(layer_cols - 2 * min_margin, localThreads[0] - 2) *localThreads[0],
- divUp(layer_rows - 2 * min_margin, localThreads[1] - 2) *nLayers *localThreads[1],
- 1
- };
-
- openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-
-void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter,
- oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures)
-{
- Context *clCxt = det.clCxt;
- string kernelName = "icvInterpolateKeypoint";
- vector< pair<size_t, const void *> > args;
-
- args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
- args.push_back( make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
- args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
- args.push_back( make_pair( sizeof(cl_mem), (void *)&counters.data));
- args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
- args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
- args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
- args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
- args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
- args.push_back( make_pair( sizeof(cl_int), (void *)&layer_rows));
- args.push_back( make_pair( sizeof(cl_int), (void *)&maxFeatures));
-
- size_t localThreads[3] = {3, 3, 3};
- size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1};
-
- openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-
-void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures)
-{
- Context *clCxt = counters.clCxt;
- string kernelName = "icvCalcOrientation";
-
- vector< pair<size_t, const void *> > args;
-
- if(sumTex)
- {
- args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex));
- }
- else
- {
- args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
- }
- args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
- args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
- args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
- args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
- args.push_back( make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
-
- size_t localThreads[3] = {32, 4, 1};
- size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1};
-
- openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-
-void SURF_OCL_Invoker::icvSetUpright_gpu(const oclMat &keypoints, int nFeatures)
-{
- Context *clCxt = counters.clCxt;
- string kernelName = "icvSetUpright";
-
- vector< pair<size_t, const void *> > args;
-
- args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
- args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
- args.push_back( make_pair( sizeof(cl_int), (void *)&nFeatures));
-
- size_t localThreads[3] = {256, 1, 1};
- size_t globalThreads[3] = {saturate_cast<size_t>(nFeatures), 1, 1};
-
- openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-
-
-void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures)
-{
- // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
- Context *clCxt = descriptors.clCxt;
- string kernelName = "";
- vector< pair<size_t, const void *> > args;
- size_t localThreads[3] = {1, 1, 1};
- size_t globalThreads[3] = {1, 1, 1};
-
- if(descriptors.cols == 64)
- {
- kernelName = "compute_descriptors64";
-
- localThreads[0] = 6;
- localThreads[1] = 6;
-
- globalThreads[0] = nFeatures * localThreads[0];
- globalThreads[1] = 16 * localThreads[1];
-
- args.clear();
- if(imgTex)
- {
- args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex));
- }
- else
- {
- args.push_back( make_pair( sizeof(cl_mem), (void *)&_img.data));
- }
- args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
- args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
- args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
- args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
- args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows));
- args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols));
- args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step));
-
- openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
-
- kernelName = "normalize_descriptors64";
-
- localThreads[0] = 64;
- localThreads[1] = 1;
-
- globalThreads[0] = nFeatures * localThreads[0];
- globalThreads[1] = localThreads[1];
-
- args.clear();
- args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
- args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
-
- openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
- }
- else
- {
- kernelName = "compute_descriptors128";
-
- localThreads[0] = 6;
- localThreads[1] = 6;
-
- globalThreads[0] = nFeatures * localThreads[0];
- globalThreads[1] = 16 * localThreads[1];
-
- args.clear();
- if(imgTex)
- {
- args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex));
- }
- else
- {
- args.push_back( make_pair( sizeof(cl_mem), (void *)&_img.data));
- }
- args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
- args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
- args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
- args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
- args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows));
- args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols));
- args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step));
-
- openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
-
- kernelName = "normalize_descriptors128";
-
- localThreads[0] = 128;
- localThreads[1] = 1;
-
- globalThreads[0] = nFeatures * localThreads[0];
- globalThreads[1] = localThreads[1];
-
- args.clear();
- args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
- args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
-
- openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
- }
-}