\r
#else /* !defined (HAVE_CUDA) */\r
\r
-namespace cv { namespace gpu { namespace bfmatcher\r
+namespace cv { namespace gpu { namespace bf_match\r
{\r
    template <typename T> void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream);
template <typename T> void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, \r
const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,\r
int cc, cudaStream_t stream);\r
+}}}\r
\r
+namespace cv { namespace gpu { namespace bf_knnmatch\r
+{\r
template <typename T> void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, \r
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, \r
int cc, cudaStream_t stream);\r
template <typename T> void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, \r
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, \r
int cc, cudaStream_t stream);\r
+}}}\r
\r
+namespace cv { namespace gpu { namespace bf_radius_match \r
+{\r
    template <typename T> void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
        cudaStream_t stream);
+}}}

if (queryDescs.empty() || trainDescs.empty())\r
return;\r
\r
- using namespace cv::gpu::bfmatcher;\r
+ using namespace cv::gpu::bf_match;\r
\r
    typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream);
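
The elided body dispatches on descriptor depth through a table of these typed
entry points. A minimal sketch of that pattern, assuming a match_callers table
and that cc and stream are already in scope (illustrative, not the verbatim
body):

    static const match_caller_t match_callers[] =
    {
        matchSingleL1_gpu<unsigned char>, matchSingleL1_gpu<signed char>,
        matchSingleL1_gpu<unsigned short>, matchSingleL1_gpu<short>,
        matchSingleL1_gpu<int>, matchSingleL1_gpu<float>, 0
    };

    const match_caller_t func = match_callers[queryDescs.depth()];
    CV_Assert(func != 0);
    func(queryDescs, trainDescs, mask, trainIdx, distance, cc, stream);
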
if (queryDescs.empty() || trainCollection.empty())\r
return;\r
\r
- using namespace cv::gpu::bfmatcher;\r
+ using namespace cv::gpu::bf_match;\r
\r
    typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream);
if (queryDescs.empty() || trainDescs.empty())\r
return;\r
\r
- using namespace cv::gpu::bfmatcher;\r
+ using namespace cv::gpu::bf_knnmatch;\r
\r
    typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
        int cc, cudaStream_t stream);
if (queryDescs.empty() || trainDescs.empty())\r
return;\r
\r
- using namespace cv::gpu::bfmatcher;\r
+ using namespace cv::gpu::bf_radius_match;\r
\r
    typedef void (*radiusMatch_caller_t)(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
        cudaStream_t stream);
}\r
};\r
\r
-    CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS));
+    // The rewritten kernel stacks matches in shared memory with atomicInc, so
+    // the device must support shared atomics (compute capability >= 1.2).
+    CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
\r
const int nQuery = queryDescs.rows;\r
const int nTrain = trainDescs.rows;\r
using namespace cv::gpu;\r
using namespace cv::gpu::device;\r
\r
-namespace cv { namespace gpu { namespace bfmatcher\r
+namespace cv { namespace gpu { namespace bf_radius_match\r
{\r
- template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>\r
- __global__ void radiusMatch(const PtrStep_<T> query, const DevMem2D_<T> train, float maxDistance, const Mask mask, \r
- DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)\r
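+    // Flush the scount matches stacked in shared memory (sidx/sdist) to the
+    // query's output row; sglob_ind is the row's running write position.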
+ __device__ __forceinline__ void store(const int* sidx, const float* sdist, const unsigned int scount, int* trainIdx, float* distance, int& sglob_ind, const int tid)\r
{\r
- #if __CUDA_ARCH__ >= 110\r
+ if (tid < scount)\r
+ {\r
+ trainIdx[sglob_ind + tid] = sidx[tid];\r
+ distance[sglob_ind + tid] = sdist[tid];\r
+        }
+
+        // Every thread must have read sglob_ind before thread 0 advances it;
+        // without this barrier a warp still writing could pick up the updated
+        // index and write past its slot.
+        __syncthreads();

-        __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];
+        if (tid == 0)
+            sglob_ind += scount;
+ }\r
\r
- typename Dist::result_type* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;\r
- \r
- const int queryIdx = blockIdx.x;\r
- const T* queryDescs = query.ptr(queryIdx);\r
+ template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_STACK, typename VecDiff, typename Dist, typename T, typename Mask>\r
+ __global__ void radiusMatch(const PtrStep_<T> query, const DevMem2D_<T> train, const float maxDistance, const Mask mask, \r
+ DevMem2Di trainIdx_, PtrStepf distance, unsigned int* nMatches)\r
+ {\r
+ #if __CUDA_ARCH__ >= 120\r
+\r
+ typedef typename Dist::result_type result_type;\r
+ typedef typename Dist::value_type value_type;\r
+\r
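+        // Shared memory: a reduction buffer for the distance computation plus a
+        // small stack of pending matches (sidx/sdist, top at scount); sglob_ind
+        // is the write position within this query's global output row.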
+ __shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];\r
+ __shared__ int sidx[BLOCK_STACK];\r
+ __shared__ float sdist[BLOCK_STACK];\r
+ __shared__ unsigned int scount;\r
+ __shared__ int sglob_ind;\r
\r
- const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;\r
+ const int queryIdx = blockIdx.x;\r
+ const int tid = threadIdx.y * BLOCK_DIM_X + threadIdx.x;\r
\r
- if (trainIdx < train.rows)\r
+ if (tid == 0)\r
{\r
- const T* trainDescs = train.ptr(trainIdx);\r
+ scount = 0;\r
+ sglob_ind = 0;\r
+ }\r
+ __syncthreads();\r
+\r
+ int* trainIdx_row = trainIdx_.ptr(queryIdx);\r
+ float* distance_row = distance.ptr(queryIdx);\r
+\r
+        const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (value_type*)smem, tid, threadIdx.x);
+
+        result_type* sdiffRow = smem + BLOCK_DIM_X * threadIdx.y;
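+        // Each row of threads walks the train descriptors below with stride
+        // BLOCK_DIM_Y; the BLOCK_DIM_X threads of a row cooperate on one distance.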
\r
+ for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += BLOCK_DIM_Y)\r
+ {\r
if (mask(queryIdx, trainIdx))\r
{\r
Dist dist;\r
\r
- calcVecDiffGlobal<BLOCK_DIM_X>(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x);\r
+ const T* trainRow = train.ptr(trainIdx);\r
+ \r
+ vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x);\r
\r
- if (threadIdx.x == 0)\r
+                const result_type val = dist;
+\r
+ if (threadIdx.x == 0 && val < maxDistance)\r
{\r
- if (dist < maxDistance)\r
- {\r
- unsigned int i = atomicInc(nMatches + queryIdx, (unsigned int) -1);\r
- if (i < trainIdx_.cols)\r
- {\r
- distance.ptr(queryIdx)[i] = dist;\r
- trainIdx_.ptr(queryIdx)[i] = trainIdx;\r
- }\r
- }\r
+ unsigned int i = atomicInc(&scount, (unsigned int) -1);\r
+ sidx[i] = trainIdx;\r
+ sdist[i] = val;\r
}\r
}\r
+ __syncthreads();\r
+\r
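+            // At most BLOCK_DIM_Y matches can be pushed per iteration (one per
+            // row of threads), so flush before fewer than that many slots remain.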
+ if (scount > BLOCK_STACK - BLOCK_DIM_Y)\r
+ {\r
+ store(sidx, sdist, scount, trainIdx_row, distance_row, sglob_ind, tid);\r
+ if (tid == 0)\r
+ scount = 0;\r
+ }\r
+ __syncthreads();\r
}\r
\r
+ store(sidx, sdist, scount, trainIdx_row, distance_row, sglob_ind, tid);\r
+\r
+ if (tid == 0)\r
+ nMatches[queryIdx] = sglob_ind;\r
+\r
#endif\r
}\r
- \r
+\r
///////////////////////////////////////////////////////////////////////////////\r
// Radius Match kernel caller\r
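+    // Two callers: the "simple" one streams the query descriptor from global
+    // memory each pass (VecDiffGlobal); the "cached" one preloads it into
+    // registers (VecDiffCachedRegister) when its length fits within MAX_LEN.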
\r
- template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>\r
- void radiusMatch_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, \r
- const DevMem2Di& trainIdx, const DevMem2D_<unsigned int>& nMatches, const DevMem2Df& distance, \r
+ template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_STACK, typename Dist, typename T, typename Mask>\r
+ void radiusMatchSimple_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, unsigned int* nMatches,\r
cudaStream_t stream)\r
{\r
+ StaticAssert<BLOCK_STACK >= BLOCK_DIM_Y>::check();\r
+ StaticAssert<BLOCK_STACK <= BLOCK_DIM_X * BLOCK_DIM_Y>::check();\r
+\r
+ const dim3 grid(query.rows, 1, 1);\r
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
- const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1);\r
\r
- radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(query, train, maxDistance, mask, trainIdx, nMatches.data, distance);\r
+ radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_STACK, VecDiffGlobal<BLOCK_DIM_X, T>, Dist, T>\r
+ <<<grid, threads, 0, stream>>>(query, train, maxDistance, mask, trainIdx, distance, nMatches);\r
cudaSafeCall( cudaGetLastError() );\r
\r
if (stream == 0)\r
cudaSafeCall( cudaDeviceSynchronize() );\r
}\r
- \r
+\r
+ template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_STACK, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T, typename Mask>\r
+ void radiusMatchCached_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, \r
+ const DevMem2Di& trainIdx, const DevMem2Df& distance, unsigned int* nMatches, \r
+ cudaStream_t stream)\r
+ {\r
+ StaticAssert<BLOCK_STACK >= BLOCK_DIM_Y>::check();\r
+ StaticAssert<BLOCK_STACK <= BLOCK_DIM_X * BLOCK_DIM_Y>::check();\r
+ StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_LEN>::check();\r
+ StaticAssert<MAX_LEN % BLOCK_DIM_X == 0>::check();\r
+\r
+ const dim3 grid(query.rows, 1, 1);\r
+ const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
+\r
+ radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_STACK, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T>\r
+ <<<grid, threads, 0, stream>>>(query, train, maxDistance, mask, trainIdx, distance, nMatches);\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
+\r
///////////////////////////////////////////////////////////////////////////////\r
// Radius Match Dispatcher\r
 
template <typename Dist, typename T, typename Mask>\r
void radiusMatchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, \r
- const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, \r
+ const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, \r
cudaStream_t stream)\r
{\r
- radiusMatch_caller<16, 16, Dist>(query, train, maxDistance, mask, \r
- static_cast<DevMem2Di>(trainIdx), static_cast< const DevMem2D_<unsigned int> >(nMatches), static_cast<DevMem2Df>(distance), \r
- stream);\r
- }\r
+ if (query.cols < 64)\r
+ {\r
+ radiusMatchCached_caller<16, 16, 64, 64, false, Dist>(\r
+ query, train, maxDistance, mask, \r
+ static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), (unsigned int*)nMatches.data,\r
+ stream);\r
+ }\r
+ else if (query.cols == 64)\r
+ {\r
+ radiusMatchCached_caller<16, 16, 64, 64, true, Dist>(\r
+ query, train, maxDistance, mask, \r
+ static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), (unsigned int*)nMatches.data,\r
+ stream);\r
+ }\r
+ else if (query.cols < 128)\r
+ {\r
+ radiusMatchCached_caller<16, 16, 64, 128, false, Dist>(\r
+ query, train, maxDistance, mask, \r
+ static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), (unsigned int*)nMatches.data,\r
+ stream);\r
+ }\r
+ else if (query.cols == 128)\r
+ {\r
+ radiusMatchCached_caller<16, 16, 64, 128, true, Dist>(\r
+ query, train, maxDistance, mask, \r
+ static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), (unsigned int*)nMatches.data,\r
+ stream);\r
+ }\r
+ else if (query.cols < 256)\r
+ {\r
+ radiusMatchCached_caller<16, 16, 64, 256, false, Dist>(\r
+ query, train, maxDistance, mask, \r
+ static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), (unsigned int*)nMatches.data,\r
+ stream);\r
+ }\r
+ else if (query.cols == 256)\r
+ {\r
+ radiusMatchCached_caller<16, 16, 64, 256, true, Dist>(\r
+ query, train, maxDistance, mask, \r
+ static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), (unsigned int*)nMatches.data, \r
+ stream);\r
+ }\r
+ else\r
+ {\r
+ radiusMatchSimple_caller<16, 16, 64, Dist>(\r
+ query, train, maxDistance, mask, \r
+ static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), (unsigned int*)nMatches.data,\r
+ stream);\r
+ }\r
+    }
\r
///////////////////////////////////////////////////////////////////////////////\r
// Radius Match caller\r
if (mask.data)\r
{\r
radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), \r
- trainIdx, nMatches, distance, \r
+ trainIdx, distance, nMatches, \r
stream);\r
}\r
else\r
{\r
radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), \r
- trainIdx, nMatches, distance, \r
+ trainIdx, distance, nMatches, \r
stream);\r
}\r
}\r
if (mask.data)\r
{\r
radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), \r
- trainIdx, nMatches, distance, \r
+ trainIdx, distance, nMatches, \r
stream);\r
}\r
else\r
{\r
radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), \r
- trainIdx, nMatches, distance, \r
+ trainIdx, distance, nMatches, \r
stream);\r
}\r
}\r
if (mask.data)\r
{\r
radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), \r
- trainIdx, nMatches, distance, \r
+ trainIdx, distance, nMatches, \r
stream);\r
}\r
else\r
{\r
radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), \r
- trainIdx, nMatches, distance, \r
+ trainIdx, distance, nMatches, \r
stream);\r
}\r
}\r
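
For context, a hedged sketch of how this radius-match path is reached through
the public matcher of the OpenCV 2.3-era gpu module; the API names follow that
release, and the descriptor mats and search radius are placeholders:

    #include <opencv2/gpu/gpu.hpp>
    #include <vector>

    void radiusMatchExample(const cv::gpu::GpuMat& queryDescs, const cv::gpu::GpuMat& trainDescs)
    {
        // L2<float> routes to the radiusMatchDispatcher<L2Dist> calls above.
        cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;
        std::vector< std::vector<cv::DMatch> > matches;
        matcher.radiusMatch(queryDescs, trainDescs, matches, /*maxDistance=*/0.2f);
    }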