added mask support to SURF_GPU; added a single-sum cv::gpu::integral overload and reimplemented transpose via the NPP staging (NPPST) primitives, removing the custom CUDA transpose kernel.\r
////////////////////////////// Arithmetics ///////////////////////////////////\r
\r
//! transposes the matrix\r
- //! supports CV_8UC1, CV_8SC1, CV_8UC4, CV_8SC4, CV_16UC2, CV_16SC2, CV_32SC1, CV_32FC1 type\r
+ //! supports matrices with an element size of 1, 4 or 8 bytes (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc.)\r
CV_EXPORTS void transpose(const GpuMat& src, GpuMat& dst);\r
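\r
A minimal usage sketch (editor's illustration, not part of this patch); since dispatch\r
is by element size, any type/channel combination with elemSize 1, 4 or 8 works:\r
\r
    GpuMat a(480, 640, CV_32FC1), b;\r
    cv::gpu::transpose(a, b);        // b becomes 640x480, CV_32FC1 (elemSize = 4)\r
    GpuMat c(480, 640, CV_8UC4), d;  // CV_8UC4 also has elemSize = 4\r
    cv::gpu::transpose(c, d);\r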
\r
//! copies a 2D array to a larger destination array and pads the borders with a user-specifiable constant\r
//! supports CV_8UC1, CV_8UC4, CV_32SC1 and CV_32FC1 types\r
CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, const Scalar& value = Scalar());\r
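\r
A border-padding sketch (editor's illustration): pad 8 pixels on every side with zeros:\r
\r
    GpuMat src(100, 100, CV_8UC1), dst;\r
    cv::gpu::copyMakeBorder(src, dst, 8, 8, 8, 8, Scalar::all(0));  // dst is 116 x 116\r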
\r
+ //! computes the integral image\r
+ //! sum will have CV_32S type, but will contain unsigned int values\r
+ //! supports only CV_8UC1 source type\r
+ CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum);\r
+\r
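A hedged usage sketch for the new overload (editor's illustration):\r
\r
    GpuMat img(480, 640, CV_8UC1), sum;\r
    cv::gpu::integral(img, sum);   // sum is 481 x 641, CV_32S, holding unsigned int values\r
\r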
//! computes the integral image and integral for the squared image\r
//! sum will have CV_32S type, sqsum - CV_32F type\r
//! supports only CV_8UC1 source type\r
CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum);\r
static void downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors);\r
\r
//! finds the keypoints using fast hessian detector used in SURF\r
- //! supports CV_8UC1 (0..255) and CV_32FC1 (0..1) images\r
+ //! supports CV_8UC1 images\r
//! keypoints will have 1 row and type CV_32FC(6)\r
- //! keypoints.at<float6>(1, i) contains i'th keypoint\r
+ //! keypoints.at<float[6]>(0, i) contains the i'th keypoint\r
//! format: (x, y, size, response, angle, octave)\r
- void operator()(const GpuMat& img, GpuMat& keypoints);\r
+ void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints);\r
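\r
For reference, a host-side sketch of reading the packed CV_32FC(6) row back (the\r
download step and the index i are the editor's assumption, not code from this patch):\r
\r
    Mat h;\r
    keypointsGPU.download(h);                    // h: 1 x N, CV_32FC(6)\r
    const float* kp = h.ptr<float>(0) + 6 * i;   // fields of the i'th keypoint\r
    float x = kp[0], y = kp[1], size = kp[2];\r
    float response = kp[3], angle = kp[4], octave = kp[5];\r
\r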
//! finds the keypoints and computes their descriptors.\r
//! Optionally it can compute descriptors for user-provided keypoints and recompute the keypoints' orientation\r
- void operator()(const GpuMat& img, GpuMat& keypoints, GpuMat& descriptors, \r
+ void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors, \r
bool useProvidedKeypoints = false, bool calcOrientation = true);\r
\r
- void operator()(const GpuMat& img, std::vector<KeyPoint>& keypoints);\r
- void operator()(const GpuMat& img, std::vector<KeyPoint>& keypoints, GpuMat& descriptors, \r
+ void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints);\r
+ void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors, \r
bool useProvidedKeypoints = false, bool calcOrientation = true);\r
\r
- void operator()(const GpuMat& img, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors, \r
+ void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors, \r
bool useProvidedKeypoints = false, bool calcOrientation = true);\r
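\r
A sketch of the two output flavours (editor's illustration; img and mask are assumed\r
to be CV_8UC1 GpuMat's of the same size):\r
\r
    SURF_GPU surf;\r
    GpuMat kpGPU, descGPU;\r
    surf(img, mask, kpGPU, descGPU);   // keep results on the GPU, e.g. for GPU-side matching\r
    vector<KeyPoint> kp;\r
    vector<float> desc;\r
    surf(img, mask, kp, desc);         // or download straight into host containers\r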
\r
- GpuMat img_float;\r
- GpuMat img_float_tr;\r
-\r
GpuMat sum;\r
+ GpuMat sumf;\r
+\r
+ GpuMat mask1;\r
+ GpuMat maskSum;\r
\r
GpuMat hessianBuffer;\r
GpuMat maxPosBuffer;\r
////////////////////////////////////////////////////////////////////////\r
// transpose\r
\r
-namespace cv { namespace gpu { namespace mathfunc\r
-{\r
- void transpose_gpu(const DevMem2Di& src, const DevMem2Di& dst);\r
-}}}\r
-\r
void cv::gpu::transpose(const GpuMat& src, GpuMat& dst)\r
{\r
- CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8SC1 || src.type() == CV_8UC4 || src.type() == CV_8SC4 \r
- || src.type() == CV_16UC2 || src.type() == CV_16SC2 || src.type() == CV_32SC1 || src.type() == CV_32FC1);\r
+ CV_Assert(src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8);\r
\r
dst.create( src.cols, src.rows, src.type() );\r
\r
- if (src.type() == CV_8UC1 || src.type() == CV_8SC1)\r
+ if (src.elemSize() == 1)\r
{\r
NppiSize sz;\r
sz.width = src.cols;\r
sz.height = src.rows;\r
\r
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz) );\r
}\r
- else\r
+ else if (src.elemSize() == 4)\r
{\r
- mathfunc::transpose_gpu(src, dst);\r
+ NppStSize32u sz;\r
+ sz.width = src.cols;\r
+ sz.height = src.rows;\r
+\r
+ nppSafeCall( nppiStTranspose_32u_C1R(const_cast<NppSt32u*>(src.ptr<NppSt32u>()), src.step, \r
+ dst.ptr<NppSt32u>(), dst.step, sz) );\r
+ }\r
+ else // if (src.elemSize() == 8)\r
+ {\r
+ NppStSize32u sz;\r
+ sz.width = src.cols;\r
+ sz.height = src.rows;\r
+\r
+ nppSafeCall( nppiStTranspose_64u_C1R(const_cast<NppSt64u*>(src.ptr<NppSt64u>()), src.step, \r
+ dst.ptr<NppSt64u>(), dst.step, sz) );\r
}\r
}\r
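\r
Because dispatch keys on elemSize() alone, unlisted types fall through to the matching\r
NPPST primitive as well (editor's sketch):\r
\r
    GpuMat a(100, 200, CV_16UC2), b;   // elemSize() == 4 -> nppiStTranspose_32u_C1R\r
    cv::gpu::transpose(a, b);\r
    GpuMat c(100, 200, CV_16UC4), d;   // elemSize() == 8 -> nppiStTranspose_64u_C1R\r
    cv::gpu::transpose(c, d);\r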
\r
\r
callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);\r
}\r
-\r
-\r
-//////////////////////////////////////////////////////////////////////////////////////////////////////////\r
-// transpose\r
-\r
- __global__ void transpose(const DevMem2Di src, PtrStepi dst)\r
- {\r
- __shared__ int s_mem[16 * 17];\r
-\r
- int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- int y = blockIdx.y * blockDim.y + threadIdx.y;\r
- int smem_idx = threadIdx.y * blockDim.x + threadIdx.x + threadIdx.y;\r
-\r
- if (y < src.rows && x < src.cols)\r
- {\r
- s_mem[smem_idx] = src.ptr(y)[x];\r
- }\r
- __syncthreads();\r
-\r
- smem_idx = threadIdx.x * blockDim.x + threadIdx.y + threadIdx.x;\r
-\r
- x = blockIdx.y * blockDim.x + threadIdx.x;\r
- y = blockIdx.x * blockDim.y + threadIdx.y;\r
-\r
- if (y < src.cols && x < src.rows)\r
- {\r
- dst.ptr(y)[x] = s_mem[smem_idx];\r
- }\r
- }\r
-\r
- void transpose_gpu(const DevMem2Di& src, const DevMem2Di& dst)\r
- {\r
- dim3 threads(16, 16, 1);\r
- dim3 grid(divUp(src.cols, 16), divUp(src.rows, 16), 1);\r
-\r
- transpose<<<grid, threads>>>(src, dst);\r
- cudaSafeCall( cudaThreadSynchronize() );\r
- }\r
}}}\r
\r
\r
\r
////////////////////////////////////////////////////////////////////////\r
// NONMAX\r
+ \r
+ texture<int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
+\r
+ struct WithOutMask\r
+ {\r
+ static __device__ bool check(float, float, float)\r
+ {\r
+ return true;\r
+ }\r
+ };\r
+ struct WithMask\r
+ {\r
+ static __device__ bool check(float x, float y, float fscale)\r
+ {\r
+ float half_width = fscale / 2;\r
+ \r
+ float result = 0.f;\r
+\r
+ result += tex2D(maskSumTex, x - half_width, y - half_width);\r
+ result -= tex2D(maskSumTex, x + half_width, y - half_width);\r
+ result -= tex2D(maskSumTex, x - half_width, y + half_width);\r
+ result += tex2D(maskSumTex, x + half_width, y + half_width);\r
+ \r
+ result /= (fscale * fscale);\r
+\r
+ return (result >= 0.5f);\r
+ }\r
+ };\r
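\r
WithMask::check is the standard integral-image box sum: with S the integral of the\r
binarized mask, the sum over the half-open box [x0, x1) x [y0, y1) is\r
S(y1, x1) - S(y0, x1) - S(y1, x0) + S(y0, x0); dividing by the box area gives the\r
fraction of masked-in pixels, thresholded at 50%. A CPU analogue (editor's sketch;\r
border clamping, which cudaAddressModeClamp provides on the GPU, is omitted):\r
\r
    // S: CV_32S integral of a 0/1 mask, as produced by cv::integral\r
    inline float maskCoverage(const cv::Mat& S, float x, float y, float w)\r
    {\r
        int x0 = cvRound(x - w / 2), y0 = cvRound(y - w / 2);\r
        int x1 = x0 + cvRound(w),    y1 = y0 + cvRound(w);\r
        int sum = S.at<int>(y1, x1) - S.at<int>(y0, x1)\r
                - S.at<int>(y1, x0) + S.at<int>(y0, x0);\r
        return sum / (w * w);   // a candidate survives when this is >= 0.5f\r
    }\r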
+\r
+ template <typename Mask>\r
__global__ void nonmaxonly(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int* maxCounter)\r
{ \r
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
\r
float val = fh_vals[localLin];\r
\r
- if (inBounds2 && val >= c_threshold)\r
+ // Compute the lookup location of the mask center\r
+ float x = hidx_x * c_step + c_border;\r
+ float y = hidx_y * c_step + c_border;\r
+ float fscale = calcScale(hidx_z);\r
+\r
+ if (inBounds2 && val >= c_threshold && Mask::check(x, y, fscale))\r
{\r
// Check to see if we have a max (in its 26 neighbours)\r
int zoff = blockDim.x * blockDim.y;\r
}\r
\r
void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, \r
- int nIntervals, int x_size, int y_size)\r
+ int nIntervals, int x_size, int y_size, bool use_mask)\r
{\r
dim3 threads;\r
threads.x = 16;\r
\r
DeviceReference<unsigned int> maxCounterWrapper(maxCounter);\r
\r
- nonmaxonly<<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);\r
+ if (use_mask)\r
+ nonmaxonly<WithMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);\r
+ else\r
+ nonmaxonly<WithOutMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);\r
\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
void cv::gpu::warpAffine(const GpuMat&, GpuMat&, const Mat&, Size, int) { throw_nogpu(); }\r
void cv::gpu::warpPerspective(const GpuMat&, GpuMat&, const Mat&, Size, int) { throw_nogpu(); }\r
void cv::gpu::rotate(const GpuMat&, GpuMat&, Size, double, double, double, int) { throw_nogpu(); }\r
+void cv::gpu::integral(const GpuMat&, GpuMat&) { throw_nogpu(); }\r
void cv::gpu::integral(const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }\r
void cv::gpu::columnSum(const GpuMat&, GpuMat&) { throw_nogpu(); }\r
void cv::gpu::rectStdDev(const GpuMat&, const GpuMat&, GpuMat&, const Rect&) { throw_nogpu(); }\r
////////////////////////////////////////////////////////////////////////\r
// integral\r
\r
+void cv::gpu::integral(const GpuMat& src, GpuMat& sum)\r
+{\r
+ CV_Assert(src.type() == CV_8UC1);\r
+\r
+ sum.create(src.rows + 1, src.cols + 1, CV_32S);\r
+ \r
+ NppStSize32u roiSize;\r
+ roiSize.width = src.cols;\r
+ roiSize.height = src.rows;\r
+\r
+ NppSt32u bufSize;\r
+\r
+ nppSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize) );\r
+\r
+ GpuMat buffer(1, bufSize, CV_8UC1);\r
+\r
+ nppSafeCall( nppiStIntegral_8u32u_C1R(const_cast<NppSt8u*>(src.ptr<NppSt8u>()), src.step, \r
+ sum.ptr<NppSt32u>(), sum.step, roiSize, buffer.ptr<NppSt8u>(), bufSize) );\r
+}\r
+\r
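A host round-trip check against the CPU implementation (editor's sketch; the input\r
path is hypothetical):\r
\r
    cv::Mat h = cv::imread("input.png", 0);     // 8-bit, single channel\r
    cv::gpu::GpuMat d(h), dsum;\r
    cv::gpu::integral(d, dsum);\r
    cv::Mat hsum, ref;\r
    dsum.download(hsum);\r
    cv::integral(h, ref, CV_32S);               // CPU reference\r
    CV_Assert(cv::norm(hsum, ref, cv::NORM_INF) == 0);\r
\r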
void cv::gpu::integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum)\r
{\r
CV_Assert(src.type() == CV_8UC1);\r
void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>&, GpuMat&) { throw_nogpu(); }\r
void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }\r
void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat&, vector<float>&) { throw_nogpu(); }\r
-void cv::gpu::SURF_GPU::operator()(const GpuMat&, GpuMat&) { throw_nogpu(); }\r
-void cv::gpu::SURF_GPU::operator()(const GpuMat&, GpuMat&, GpuMat&, bool, bool) { throw_nogpu(); }\r
-void cv::gpu::SURF_GPU::operator()(const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }\r
-void cv::gpu::SURF_GPU::operator()(const GpuMat&, vector<KeyPoint>&, GpuMat&, bool, bool) { throw_nogpu(); }\r
-void cv::gpu::SURF_GPU::operator()(const GpuMat&, vector<KeyPoint>&, vector<float>&, bool, bool) { throw_nogpu(); }\r
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }\r
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, bool) { throw_nogpu(); }\r
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }\r
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, GpuMat&, bool, bool) { throw_nogpu(); }\r
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, vector<float>&, bool, bool) { throw_nogpu(); }\r
\r
#else /* !defined (HAVE_CUDA) */\r
\r
void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size);\r
\r
void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, \r
- int nIntervals, int x_size, int y_size);\r
+ int nIntervals, int x_size, int y_size, bool use_mask);\r
\r
void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter, \r
KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter);\r
class SURF_GPU_Invoker : private SURFParams_GPU\r
{\r
public:\r
- SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img) : \r
+ SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img, const GpuMat& mask) : \r
SURFParams_GPU(surf),\r
\r
- img_float(surf.img_float), img_float_tr(surf.img_float_tr),\r
+ sum(surf.sum), sumf(surf.sumf),\r
\r
- sum(surf.sum), \r
+ mask1(surf.mask1), maskSum(surf.maskSum),\r
\r
hessianBuffer(surf.hessianBuffer), \r
maxPosBuffer(surf.maxPosBuffer), \r
\r
img_cols(img.cols), img_rows(img.rows),\r
\r
+ use_mask(!mask.empty()),\r
+\r
mask_width(0), mask_height(0),\r
\r
featureCounter(0), maxCounter(0)\r
{\r
- CV_Assert((img.type() == CV_8UC1 || img.type() == CV_32FC1) && nOctaves > 0 && nIntervals > 2);\r
+ CV_Assert(img.type() == CV_8UC1);\r
+ CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));\r
+ CV_Assert(nOctaves > 0 && nIntervals > 2);\r
CV_Assert(hasAtomicsSupport(getDevice()));\r
\r
max_features = static_cast<int>(img.size().area() * featuresRatio);\r
\r
hessianBuffer.create(height0 * nIntervals, width0, CV_32F);\r
\r
- if (img.type() == CV_32FC1)\r
- img_float = img;\r
- else\r
- img.convertTo(img_float, CV_32F, 1.0 / 255.0);\r
+ integral(img, sum);\r
+ sum.convertTo(sumf, CV_32F, 1.0 / 255.0);\r
+ \r
+ bindTexture("cv::gpu::surf::sumTex", (DevMem2Df)sumf);\r
\r
- transpose(img_float, img_float_tr);\r
- columnSum(img_float_tr, img_float_tr);\r
- transpose(img_float_tr, sum);\r
- columnSum(sum, sum);\r
+ if (!mask.empty())\r
+ {\r
+ min(mask, 1.0, mask1);\r
+ integral(mask1, maskSum);\r
\r
- bindTexture("cv::gpu::surf::sumTex", (DevMem2Df)sum);\r
+ bindTexture("cv::gpu::surf::maskSumTex", (DevMem2Di)maskSum);\r
+ }\r
}\r
\r
~SURF_GPU_Invoker()\r
{\r
unbindTexture("cv::gpu::surf::sumTex");\r
+ if (use_mask)\r
+ unbindTexture("cv::gpu::surf::maskSumTex");\r
}\r
\r
void detectKeypoints(GpuMat& keypoints)\r
// Reset the candidate count.\r
maxCounter = 0;\r
\r
- nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, nIntervals, x_size, y_size); \r
+ nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, nIntervals, x_size, y_size, use_mask); \r
\r
maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));\r
\r
}\r
\r
private:\r
- GpuMat& img_float;\r
- GpuMat& img_float_tr;\r
-\r
GpuMat& sum;\r
+ GpuMat& sumf;\r
+\r
+ GpuMat& mask1;\r
+ GpuMat& maskSum;\r
\r
GpuMat& hessianBuffer;\r
GpuMat& maxPosBuffer;\r
GpuMat& featuresBuffer;\r
\r
int img_cols, img_rows;\r
+\r
+ bool use_mask;\r
\r
float mask_width, mask_height;\r
\r
descriptorsGPU.download(descriptorsCPU);\r
}\r
\r
-void cv::gpu::SURF_GPU::operator()(const GpuMat& img, GpuMat& keypoints)\r
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints)\r
{\r
- SURF_GPU_Invoker surf(*this, img);\r
+ SURF_GPU_Invoker surf(*this, img, mask);\r
\r
surf.detectKeypoints(keypoints);\r
\r
surf.findOrientation(keypoints);\r
}\r
\r
-void cv::gpu::SURF_GPU::operator()(const GpuMat& img, GpuMat& keypoints, GpuMat& descriptors, \r
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors, \r
bool useProvidedKeypoints, bool calcOrientation)\r
{\r
- SURF_GPU_Invoker surf(*this, img);\r
+ SURF_GPU_Invoker surf(*this, img, mask);\r
\r
if (!useProvidedKeypoints)\r
surf.detectKeypoints(keypoints);\r
surf.computeDescriptors(keypoints, descriptors, descriptorSize());\r
}\r
\r
-void cv::gpu::SURF_GPU::operator()(const GpuMat& img, vector<KeyPoint>& keypoints)\r
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints)\r
{\r
GpuMat keypointsGPU;\r
\r
- (*this)(img, keypointsGPU);\r
+ (*this)(img, mask, keypointsGPU);\r
\r
downloadKeypoints(keypointsGPU, keypoints);\r
}\r
\r
-void cv::gpu::SURF_GPU::operator()(const GpuMat& img, vector<KeyPoint>& keypoints, GpuMat& descriptors, \r
- bool useProvidedKeypoints, bool calcOrientation)\r
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints, \r
+ GpuMat& descriptors, bool useProvidedKeypoints, bool calcOrientation)\r
{\r
GpuMat keypointsGPU;\r
\r
if (useProvidedKeypoints)\r
uploadKeypoints(keypoints, keypointsGPU); \r
\r
- (*this)(img, keypointsGPU, descriptors, useProvidedKeypoints, calcOrientation);\r
+ (*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints, calcOrientation);\r
\r
downloadKeypoints(keypointsGPU, keypoints);\r
}\r
\r
-void cv::gpu::SURF_GPU::operator()(const GpuMat& img, vector<KeyPoint>& keypoints, vector<float>& descriptors, \r
- bool useProvidedKeypoints, bool calcOrientation)\r
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints, \r
+ vector<float>& descriptors, bool useProvidedKeypoints, bool calcOrientation)\r
{\r
GpuMat descriptorsGPU;\r
\r
- (*this)(img, keypoints, descriptorsGPU, useProvidedKeypoints, calcOrientation);\r
+ (*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints, calcOrientation);\r
\r
downloadDescriptors(descriptorsGPU, descriptors);\r
}\r
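\r
Putting the new parameter together, end to end (editor's sketch; host_img is a\r
hypothetical CV_8UC1 cv::Mat):\r
\r
    SURF_GPU surf;\r
    GpuMat img(host_img);\r
    GpuMat mask(img.size(), CV_8UC1, Scalar::all(0));\r
    mask(Rect(0, 0, mask.cols / 2, mask.rows)).setTo(Scalar::all(255));  // left half only\r
\r
    vector<KeyPoint> keypoints;\r
    vector<float> descriptors;\r
    surf(img, mask, keypoints, descriptors);   // detection restricted to the mask\r
    surf(img, GpuMat(), keypoints);            // pass an empty GpuMat when no mask is needed\r
\r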
return CvTS::OK;\r
}\r
\r
- Mat cpusum, cpusqsum;\r
- cv::integral(img, cpusum, cpusqsum, CV_32S);\r
+ Mat cpusum;\r
+ cv::integral(img, cpusum, CV_32S);\r
\r
GpuMat gpu1(img);\r
- GpuMat gpusum, gpusqsum;\r
- cv::gpu::integral(gpu1, gpusum, gpusqsum);\r
+ GpuMat gpusum;\r
+ cv::gpu::integral(gpu1, gpusum);\r
\r
- gpusqsum.convertTo(gpusqsum, CV_64F);\r
-\r
- int test_res = CvTS::OK;\r
-\r
- if (CheckNorm(cpusum, gpusum) != CvTS::OK)\r
- {\r
- ts->printf(CvTS::LOG, "\nSum failed\n");\r
- test_res = CvTS::FAIL_GENERIC;\r
- }\r
- if (CheckNorm(cpusqsum, gpusqsum) != CvTS::OK)\r
- {\r
- ts->printf(CvTS::LOG, "\nSquared sum failed\n");\r
- test_res = CvTS::FAIL_GENERIC;\r
- }\r
-\r
- return test_res;\r
+ return CheckNorm(cpusum, gpusum) == CvTS::OK ? CvTS::OK : CvTS::FAIL_GENERIC;\r
}\r
};\r
\r